diff --git a/.travis.yml b/.travis.yml index 039ae95208b74..58d6786aab16a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -51,12 +51,14 @@ matrix: os: linux group: deprecated before_script: - - export CC="gcc-4.9" - - export CXX="g++-4.9" - export ARROW_TRAVIS_USE_TOOLCHAIN=1 - export ARROW_TRAVIS_VALGRIND=1 - export ARROW_TRAVIS_PLASMA=1 + - export ARROW_TRAVIS_ORC=1 - export ARROW_TRAVIS_CLANG_FORMAT=1 + - export ARROW_BUILD_WARNING_LEVEL=CHECKIN + - export CC="clang-4.0" + - export CXX="clang++-4.0" - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh - $TRAVIS_BUILD_DIR/ci/travis_lint.sh - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh @@ -74,6 +76,8 @@ matrix: before_script: - export ARROW_TRAVIS_USE_TOOLCHAIN=1 - export ARROW_TRAVIS_PLASMA=1 + - export ARROW_TRAVIS_ORC=1 + - export ARROW_BUILD_WARNING_LEVEL=CHECKIN - travis_wait 50 $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh script: - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh @@ -87,7 +91,7 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_script_manylinux.sh - language: java os: linux - jdk: openjdk8 + jdk: openjdk7 script: - $TRAVIS_BUILD_DIR/ci/travis_script_java.sh - language: java @@ -103,12 +107,14 @@ matrix: - language: java os: linux env: ARROW_TEST_GROUP=integration - jdk: openjdk7 + jdk: openjdk8 before_script: - source $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh - export CC="clang-4.0" - export CXX="clang++-4.0" + - nvm install node - $TRAVIS_BUILD_DIR/ci/travis_lint.sh + - $TRAVIS_BUILD_DIR/ci/travis_before_script_js.sh - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh script: - $TRAVIS_BUILD_DIR/ci/travis_script_integration.sh @@ -151,6 +157,7 @@ matrix: rvm: 2.2 env: BUILD_SYSTEM=autotools before_script: + - brew update && brew bundle --file=$TRAVIS_BUILD_DIR/c_glib/Brewfile - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh --only-library - $TRAVIS_BUILD_DIR/ci/travis_before_script_c_glib.sh script: diff --git a/CHANGELOG.md b/CHANGELOG.md index 485afc1497ebe..153159cb779e4 100644 --- 
a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,295 @@ under the License. --> +# Apache Arrow 0.8.0 (12 December 2017) + +## Bug + +* ARROW-1282 - Large memory reallocation by Arrow causes hang in jemalloc +* ARROW-1341 - [C++] Deprecate arrow::MakeTable in favor of new ctor from ARROW-1334 +* ARROW-1347 - [JAVA] List null type should use consistent name for inner field +* ARROW-1398 - [Python] No support reading columns of type decimal(19,4) +* ARROW-1409 - [Format] Use for "page" attribute in Buffer in metadata +* ARROW-1540 - [C++] Fix valgrind warnings in cuda-test if possible +* ARROW-1541 - [C++] Race condition with arrow\_gpu +* ARROW-1543 - [C++] row\_wise\_conversion example doesn't correspond to ListBuilder constructor arguments +* ARROW-1555 - [Python] write\_to\_dataset on s3 +* ARROW-1584 - [PYTHON] serialize\_pandas on empty dataframe +* ARROW-1585 - serialize\_pandas round trip fails on integer columns +* ARROW-1586 - [PYTHON] serialize\_pandas roundtrip loses columns name +* ARROW-1609 - Plasma: Build fails with Xcode 9.0 +* ARROW-1615 - CXX flags for development more permissive than Travis CI builds +* ARROW-1617 - [Python] Do not use symlinks in python/cmake\_modules +* ARROW-1620 - Python: Download Boost in manylinux1 build from bintray +* ARROW-1624 - [C++] Follow up fixes / tweaks to compiler warnings for Plasma / LLVM 4.0, add to readme +* ARROW-1625 - [Serialization] Support OrderedDict properly +* ARROW-1629 - [C++] Fix problematic code paths identified by infer tool +* ARROW-1633 - [Python] numpy "unicode" arrays not understood +* ARROW-1640 - Resolve OpenSSL issues in Travis CI +* ARROW-1647 - [Plasma] Potential bug when reading/writing messages. +* ARROW-1653 - [Plasma] Use static cast to avoid compiler warning. 
+* ARROW-1656 - [C++] Endianness Macro is Incorrect on Windows And Mac +* ARROW-1657 - [C++] Multithreaded Read Test Failing on Arch Linux +* ARROW-1658 - [Python] Out of bounds dictionary indices causes segfault after converting to pandas +* ARROW-1663 - [Java] Follow up on ARROW-1347 and make schema backward compatible +* ARROW-1670 - [Python] Speed up deserialization code path +* ARROW-1672 - [Python] Failure to write Feather bytes column +* ARROW-1673 - [Python] NumPy boolean arrays get converted to uint8 arrays on NdarrayToTensor roundtrip +* ARROW-1676 - [C++] Correctly truncate oversized validity bitmaps when writing Feather format +* ARROW-1678 - [Python] Incorrect serialization of numpy.float16 +* ARROW-1680 - [Python] Timestamp unit change not done in from\_pandas() conversion +* ARROW-1686 - Documentation generation script creates "apidocs" directory under site/java +* ARROW-1693 - [JS] Error reading dictionary-encoded integration test files +* ARROW-1695 - [Serialization] Fix reference counting of numpy arrays created in custom serializer +* ARROW-1698 - [JS] File reader attempts to load the same dictionary batch more than once +* ARROW-1704 - [GLib] Go example in test suite is broken +* ARROW-1708 - [JS] Linter problem breaks master build +* ARROW-1709 - [C++] Decimal.ToString is incorrect for negative scale +* ARROW-1711 - [Python] flake8 checks still not failing builds +* ARROW-1714 - [Python] No named pd.Series name serialized as u'None' +* ARROW-1720 - [Python] Segmentation fault while trying to access an out-of-bound chunk +* ARROW-1723 - Windows: \_\_declspec(dllexport) specified when building arrow static library +* ARROW-1730 - [Python] Incorrect result from pyarrow.array when passing timestamp type +* ARROW-1732 - [Python] RecordBatch.from\_pandas fails on DataFrame with no columns when preserve\_index=False +* ARROW-1735 - [C++] Cast kernels cannot write into sliced output array +* ARROW-1738 - [Python] Wrong datetime conversion when
pa.array with unit +* ARROW-1739 - [Python] Fix usages of assertRaises causing broken build +* ARROW-1742 - C++: clang-format is not detected correct on OSX anymore +* ARROW-1743 - [Python] Table to\_pandas fails when index contains categorical column +* ARROW-1745 - Compilation failure on Mac OS in plasma tests +* ARROW-1749 - [C++] Handle range of Decimal128 values that require 39 digits to be displayed +* ARROW-1751 - [Python] Pandas 0.21.0 introduces a breaking API change for MultiIndex construction +* ARROW-1754 - [Python] Fix buggy Parquet roundtrip when an index name is the same as a column name +* ARROW-1756 - [Python] Observed int32 overflow in Feather write/read path +* ARROW-1762 - [C++] unittest failure for language environment +* ARROW-1764 - [Python] Add -c conda-forge for Windows dev installation instructions +* ARROW-1766 - [GLib] Fix failing builds on OSX +* ARROW-1768 - [Python] Fix suppressed exception in ParquetWriter.\_\_del\_\_ +* ARROW-1770 - [GLib] Fix GLib compiler warning +* ARROW-1771 - [C++] ARROW-1749 Breaks Public API test in parquet-cpp +* ARROW-1776 - [C++] arrow::gpu::CudaContext::bytes\_allocated() isn't defined +* ARROW-1778 - [Python] Link parquet-cpp statically, privately in manylinux1 wheels +* ARROW-1781 - [CI] OSX Builds on Travis-CI time out often +* ARROW-1788 - Plasma store crashes when trying to abort objects for disconnected client +* ARROW-1791 - Integration tests generate date[DAY] values outside of reasonable range +* ARROW-1793 - [Integration] fix a typo for README.md +* ARROW-1800 - [C++] Fix and simplify random\_decimals +* ARROW-1805 - [Python] ignore non-parquet files when exploring dataset +* ARROW-1811 - [C++/Python] Rename all Decimal based APIs to Decimal128 +* ARROW-1812 - Plasma store modifies hash table while iterating during client disconnect +* ARROW-1829 - [Plasma] Clean up eviction policy bookkeeping +* ARROW-1830 - [Python] Error when loading all the files in a dictionary +* ARROW-1836 - [C++] Fix
C4996 warning from arrow/util/variant.h on MSVC builds +* ARROW-1840 - [Website] The installation command failed on Windows10 anaconda environment. +* ARROW-1845 - [Python] Expose Decimal128Type +* ARROW-1852 - [Plasma] Make retrieving manager file descriptor const +* ARROW-1853 - [Plasma] Fix off-by-one error in retry processing +* ARROW-1863 - [Python] PyObjectStringify could render bytes-like output for more types of objects +* ARROW-1865 - [C++] Adding a column to an empty Table fails +* ARROW-1869 - Fix typo in LowCostIdentityHashMap +* ARROW-1871 - [Python/C++] Appending Python Decimals with different scales requires rescaling +* ARROW-1873 - [Python] Segmentation fault when loading total 2GB of parquet files +* ARROW-1877 - Incorrect comparison in JsonStringArrayList.equals +* ARROW-1879 - [Python] Dask integration tests are not skipped if dask is not installed +* ARROW-1881 - [Python] setuptools\_scm picks up JS version tags +* ARROW-1882 - [C++] Reintroduce DictionaryBuilder +* ARROW-1883 - [Python] BUG: Table.to\_pandas metadata checking fails if columns are not present +* ARROW-1889 - [Python] --exclude is not available in older git versions +* ARROW-1890 - [Python] Masking for date32 arrays not working +* ARROW-1891 - [Python] NaT date32 values are only converted to nulls if from\_pandas is used +* ARROW-1892 - [Python] Unknown list item type: binary +* ARROW-1893 - [Python] test\_primitive\_serialization fails on Python 2.7.3 +* ARROW-1895 - [Python] Add field\_name to pandas index metadata +* ARROW-1897 - [Python] Incorrect numpy\_type for pandas metadata of Categoricals +* ARROW-1904 - [C++] Deprecate PrimitiveArray::raw\_values +* ARROW-1906 - [Python] Creating a pyarrow.Array with timestamp of different unit is not casted +* ARROW-1908 - [Python] Construction of arrow table from pandas DataFrame with duplicate column names crashes +* ARROW-1910 - CPP README Brewfile link incorrect +* ARROW-1914 - [C++] make -j may fail to build with -DARROW\_GPU=on 
+* ARROW-1915 - [Python] Parquet tests should be optional +* ARROW-1916 - [Java] Do not exclude java/dev/checkstyle from source releases +* ARROW-1917 - [GLib] Must set GI\_TYPELIB\_PATH in verify-release-candidate.sh +* ARROW-226 - [C++] libhdfs: feedback to help determining cause of failure in opening file path +* ARROW-641 - [C++] Do not build/run io-hdfs-test if ARROW\_HDFS=off + +## Improvement + +* ARROW-1087 - [Python] add get\_include to expose directory containing header files +* ARROW-1134 - [C++] Allow C++/CLI projects to build with Arrow​ +* ARROW-1178 - [Python] Create alternative to Table.from\_pandas that yields a list of RecordBatch objects with a given chunk size +* ARROW-1226 - [C++] Improve / correct doxygen function documentation in arrow::ipc +* ARROW-1371 - [Website] Add "Powered By" page to the website +* ARROW-1455 - [Python] Add Dockerfile for validating Dask integration outside of usual CI +* ARROW-1488 - [C++] Implement ArrayBuilder::Finish in terms of internal::ArrayData +* ARROW-1498 - [GitHub] Add CONTRIBUTING.md and ISSUE\_TEMPLATE.md +* ARROW-1503 - [Python] Add serialization callbacks for pandas objects in pyarrow.serialize +* ARROW-1522 - [C++] Support pyarrow.Buffer as built-in type in pyarrow.serialize +* ARROW-1523 - [C++] Add helper data struct with methods for reading a validity bitmap possibly having a non-zero offset +* ARROW-1524 - [C++] More graceful solution for handling non-zero offsets on inputs and outputs in compute library +* ARROW-1525 - [C++] Change functions in arrow/compare.h to not return Status +* ARROW-1526 - [Python] Unit tests to exercise code path in PARQUET-1100 +* ARROW-1535 - [Python] Enable sdist source tarballs to build assuming that Arrow C++ libraries are available on the host system +* ARROW-1538 - [C++] Support Ubuntu 14.04 in .deb packaging automation +* ARROW-1539 - [C++] Remove functions deprecated as of 0.7.0 and prior releases +* ARROW-1556 - [C++] Incorporate AssertArraysEqual function from 
PARQUET-1100 patch +* ARROW-1588 - [C++/Format] Harden Decimal Format +* ARROW-1593 - [PYTHON] serialize\_pandas should pass through the preserve\_index keyword +* ARROW-1594 - [Python] Enable multi-threaded conversions in Table.from\_pandas +* ARROW-1600 - [C++] Zero-copy Buffer constructor from std::string +* ARROW-1602 - [C++] Add IsValid/IsNotNull method to arrow::Array +* ARROW-1603 - [C++] Add BinaryArray method to get a value as a std::string +* ARROW-1604 - [Python] Support common type aliases in cast(...) and various type= arguments +* ARROW-1605 - [Python] pyarrow.array should be able to yield smaller integer types without an explicit cast +* ARROW-1607 - [C++] Implement DictionaryBuilder for Decimals +* ARROW-1613 - [Java] ArrowReader should not close the input ReadChannel +* ARROW-1616 - [Python] Add "write" method to RecordBatchStreamWriter that dispatches to write\_table/write\_back as appropriate +* ARROW-1626 - Add make targets to run the inter-procedural static analysis tool called "infer". 
+* ARROW-1627 - [JAVA] Reduce heap usage(Phase 2) - memory footprint in AllocationManager.BufferLedger +* ARROW-1630 - [Serialization] Support Python datetime objects +* ARROW-1635 - Add release management guide for PMCs +* ARROW-1641 - [C++] Do not include \<mutex\> in public headers +* ARROW-1651 - [JS] Lazy row accessor in Table +* ARROW-1652 - [JS] Separate Vector into BatchVector and CompositeVector +* ARROW-1654 - [Python] pa.DataType cannot be pickled +* ARROW-1662 - Move OSX Dependency management into brew bundle Brewfiles +* ARROW-1665 - [Serialization] Support more custom datatypes in the default serialization context +* ARROW-1666 - [GLib] Enable gtk-doc on Travis CI Mac environment +* ARROW-1671 - [C++] Change arrow::MakeArray to not return Status +* ARROW-1675 - [Python] Use RecordBatch.from\_pandas in FeatherWriter.write +* ARROW-1677 - [Blog] Add blog post on Ray and Arrow Python serialization +* ARROW-1679 - [GLib] Add garrow\_record\_batch\_reader\_read\_next() +* ARROW-1683 - [Python] Restore "TimestampType" to pyarrow namespace +* ARROW-1684 - [Python] Simplify user API for reading nested Parquet columns +* ARROW-1689 - [Python] Categorical Indices Should Be Zero-Copy +* ARROW-1691 - [Java] Conform Java Decimal type implementation to format decisions in ARROW-1588 +* ARROW-1701 - [Serialization] Support zero copy PyTorch Tensor serialization +* ARROW-1702 - Update jemalloc in manylinux1 build +* ARROW-1703 - [C++] Vendor exact version of jemalloc we depend on +* ARROW-1707 - Update dev README after movement to GitBox +* ARROW-1716 - [Format/JSON] Use string integer value for Decimals in JSON +* ARROW-1721 - [Python] Support null mask in places where it isn't supported in numpy\_to\_arrow.cc +* ARROW-1724 - [Packaging] Support Ubuntu 17.10 +* ARROW-1725 - [Packaging] Upload .deb for Ubuntu 17.10 +* ARROW-1726 - [GLib] Add setup description to verify C GLib build +* ARROW-1727 - [Format] Expand Arrow streaming format to permit new dictionaries and deltas /
additions to existing dictionaries +* ARROW-1728 - [C++] Run clang-format checks in Travis CI +* ARROW-1737 - [GLib] Use G\_DECLARE\_DERIVABLE\_TYPE +* ARROW-1746 - [Python] Add build dependencies for Arch Linux +* ARROW-1747 - [C++] Don't export symbols of statically linked libraries +* ARROW-1750 - [C++] Remove the need for arrow/util/random.h +* ARROW-1753 - [Python] Provide for matching subclasses with register\_type in serialization context +* ARROW-1755 - [C++] Add build options for MSVC to use static runtime libraries +* ARROW-1758 - [Python] Remove pickle=True option for object serialization +* ARROW-1763 - [Python] DataType should be hashable +* ARROW-1765 - [Doc] Use dependencies from conda in C++ docker build +* ARROW-1785 - [Format/C++/Java] Remove VectorLayout metadata from Flatbuffers metadata +* ARROW-1787 - [Python] Support reading parquet files into DataFrames in a backward compatible way +* ARROW-1794 - [C++/Python] Rename DecimalArray to Decimal128Array +* ARROW-1801 - [Docs] Update install instructions to use red-data-tools repos +* ARROW-1808 - [C++] Make RecordBatch interface virtual to permit record batches that lazy-materialize columns +* ARROW-1809 - [GLib] Use .xml instead of .sgml for GTK-Doc main file +* ARROW-1810 - [Plasma] Remove test shell scripts +* ARROW-1817 - Configure JsonFileReader to read NaN for floats +* ARROW-1826 - [JAVA] Avoid branching at cell level (copyFrom) +* ARROW-1828 - [C++] Implement hash kernel specialization for BooleanType +* ARROW-1834 - [Doc] Build documentation in separate build folders +* ARROW-1838 - [C++] Use compute::Datum uniformly for input argument to kernels +* ARROW-1841 - [JS] Update text-encoding-utf-8 and tslib for node ESModules support +* ARROW-1849 - [GLib] Add input checks to GArrowRecordBatch +* ARROW-1850 - [C++] Use const void* in Writable::Write instead of const uint8\_t* +* ARROW-1854 - [Python] Improve performance of serializing object dtype ndarrays +* ARROW-1855 - [GLib] Add 
workaround for build failure on macOS +* ARROW-1864 - [Java] Upgrade Netty to 4.1.x +* ARROW-1884 - [C++] Make JsonReader/JsonWriter classes internal APIs +* ARROW-1901 - [Python] Support recursive mkdir for DaskFilesystem +* ARROW-1902 - [Python] Remove mkdir race condition from write\_to\_dataset +* ARROW-1905 - [Python] Add more functions for checking exact types in pyarrow.types +* ARROW-1911 - Add Graphistry to Arrow JS proof points +* ARROW-905 - [Docs] Add Dockerfile for reproducible documentation generation +* ARROW-942 - Support integration testing on Python 2.7 +* ARROW-950 - [Site] Add Google Analytics tag + +## New Feature + +* ARROW-1032 - [JS] Support custom\_metadata +* ARROW-1047 - [Java] Add generalized stream writer and reader interfaces that are decoupled from IO / message framing +* ARROW-1114 - [C++] Create Record Batch Builder class as a reusable and efficient way to transpose row-by-row data to columns +* ARROW-1250 - [Python] Define API for user type checking of array types +* ARROW-1482 - [C++] Implement casts between date32 and date64 +* ARROW-1483 - [C++] Implement casts between time32 and time64 +* ARROW-1484 - [C++] Implement (safe and unsafe) casts between timestamps and times of different units +* ARROW-1486 - [C++] Decide if arrow::RecordBatch needs to be copyable +* ARROW-1487 - [C++] Implement casts from List to List, where a cast function is defined from any A to B +* ARROW-1559 - [C++] Kernel implementations for "unique" (compute distinct elements of array) +* ARROW-1573 - [C++] Implement stateful kernel function that uses DictionaryBuilder to compute dictionary indices +* ARROW-1575 - [Python] Add pyarrow.column factory function +* ARROW-1577 - [JS] Package release script for NPM modules +* ARROW-1631 - [C++] Add GRPC to ThirdpartyToolchain.cmake +* ARROW-1637 - [C++] IPC round-trip for null type +* ARROW-1648 - C++: Add cast from Dictionary[NullType] to NullType +* ARROW-1649 - C++: Print number of nulls in PrettyPrint for 
NullArray +* ARROW-1667 - [GLib] Support Meson +* ARROW-1685 - [GLib] Add GArrowTableReader +* ARROW-1690 - [GLib] Add garrow\_array\_is\_valid() +* ARROW-1697 - [GitHub] Add ISSUE\_TEMPLATE.md +* ARROW-1718 - [Python] Implement casts from timestamp to date32/date64 and support in Array.from\_pandas +* ARROW-1734 - C++/Python: Add cast function on Column-level +* ARROW-1736 - [GLib] Add GArrowCastOptions:allow-time-truncate +* ARROW-1748 - [GLib] Add GArrowRecordBatchBuilder +* ARROW-1752 - [Packaging] Add GPU packages for Debian and Ubuntu +* ARROW-1767 - [C++] Support file reads and writes over 2GB on Windows +* ARROW-1772 - [C++] Add public-api-test module in style of parquet-cpp +* ARROW-1773 - [C++] Add casts from date/time types to compatible signed integers +* ARROW-1775 - Ability to abort created but unsealed Plasma objects +* ARROW-1777 - [C++] Add static ctor ArrayData::Make for nicer syntax in places +* ARROW-1782 - [Python] Expose compressors as pyarrow.compress, pyarrow.decompress +* ARROW-1783 - [Python] Convert SerializedPyObject to/from sequence of component buffers with minimal memory allocation / copying +* ARROW-1784 - [Python] Read and write pandas.DataFrame in pyarrow.serialize by decomposing the BlockManager rather than coercing to Arrow format +* ARROW-1802 - [GLib] Add Arrow GPU support +* ARROW-1806 - [GLib] Add garrow\_record\_batch\_writer\_write\_table() +* ARROW-1844 - [C++] Basic benchmark suite for hash kernels +* ARROW-1857 - [Python] Add switch for boost linkage with static parquet in wheels +* ARROW-1859 - [GLib] Add GArrowDictionaryDataType +* ARROW-1862 - [GLib] Add GArrowDictionaryArray +* ARROW-1874 - [GLib] Add garrow\_array\_unique() +* ARROW-1878 - [GLib] Add garrow\_array\_dictionary\_encode() +* ARROW-480 - [Python] Add accessors for Parquet column statistics +* ARROW-504 - [Python] Add adapter to write pandas.DataFrame in user-selected chunk size to streaming format +* ARROW-507 - [C++/Python] Construct List container 
from offsets and values subarrays +* ARROW-541 - [JS] Implement JavaScript-compatible implementation +* ARROW-571 - [Python] Add APIs to build Parquet files incrementally from Arrow tables +* ARROW-587 - Add JIRA fix version to merge tool +* ARROW-609 - [C++] Function for casting from days since UNIX epoch to int64 date +* ARROW-838 - [Python] Efficient construction of arrays from non-pandas 1D NumPy arrays +* ARROW-972 - [Python] Add test cases and basic APIs for UnionArray + +## Sub-task + +* ARROW-1471 - [JAVA] Document requirements and non/requirements for ValueVector updates +* ARROW-1472 - [JAVA] Design updated ValueVector Object Hierarchy +* ARROW-1473 - [JAVA] Create Prototype Code Hierarchy (Implementation Phase 1) +* ARROW-1474 - [JAVA] ValueVector hierarchy (Implementation Phase 2) +* ARROW-1476 - [JAVA] Implement final ValueVector updates +* ARROW-1710 - [Java] Remove non-nullable vectors in new vector class hierarchy +* ARROW-1717 - [Java] Remove public static helper method in vector classes for JSONReader/Writer +* ARROW-1719 - [Java] Remove accessor/mutator +* ARROW-1779 - [Java] Integration test breaks without zeroing out validity vectors +* ARROW-1819 - [Java] Remove legacy vector classes +* ARROW-1867 - [Java] Add BitVector APIs from old vector class +* ARROW-1885 - [Java] Restore previous MapVector class names + +## Task + +* ARROW-1369 - Support boolean types in the javascript arrow reader library +* ARROW-1818 - Examine Java Dependencies +* ARROW-1827 - [Java] Add checkstyle config file and header file + +## Test + +* ARROW-1549 - [JS] Integrate auto-generated Arrow test files +* ARROW-1821 - Add integration test case to explicitly check for optional validity buffer +* ARROW-1839 - [C++/Python] Add Decimal Parquet Read/Write Tests + # Apache Arrow 0.7.1 (27 September 2017) ## Bug @@ -1286,3 +1575,4 @@ * ARROW-83 - Add basic test infrastructure for DecimalType + diff --git a/LICENSE.txt b/LICENSE.txt index 00cb9ece232b0..30966d36f37a1 100644 --- 
a/LICENSE.txt +++ b/LICENSE.txt @@ -398,3 +398,188 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +This project includes code from the Boost project + +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+ +-------------------------------------------------------------------------------- + +This project includes code from the mapbox/variant project, BSD 3-clause +license + +Copyright (c) MapBox +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +- Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +- Neither the name "MapBox" nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +-------------------------------------------------------------------------------- + +This project includes code from the Boost project + +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- + +This project includes code from the mapbox/variant project, BSD 3-clause +license + +Copyright (c) MapBox +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +- Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. 
+- Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +- Neither the name "MapBox" nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +This project includes code from the FlatBuffers project + +Copyright 2014 Google Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +-------------------------------------------------------------------------------- + +This project includes code from the tslib project + +Copyright 2015 Microsoft Corporation. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +-------------------------------------------------------------------------------- + +This project includes code from the jemalloc project + +https://github.com/jemalloc/jemalloc + +Copyright (C) 2002-2017 Jason Evans <jasone@canonware.com>. +All rights reserved. +Copyright (C) 2007-2012 Mozilla Foundation. All rights reserved. +Copyright (C) 2009-2017 Facebook, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice(s), + this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice(s), + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY EXPRESS +OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO +EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +-------------------------------------------------------------------------------- diff --git a/README.md b/README.md index 9dda25de91178..15a9ffea2a7a3 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,9 @@ Arrow is a set of technologies that enable big-data systems to process and move Initial implementations include: - [The Arrow Format](https://github.com/apache/arrow/tree/master/format) - - [Java implementation](https://github.com/apache/arrow/tree/master/java) - [C++ implementation](https://github.com/apache/arrow/tree/master/cpp) + - [Java implementation](https://github.com/apache/arrow/tree/master/java) + - [JavaScript implementation](https://github.com/apache/arrow/tree/master/js) - [Python interface to C++ libraries](https://github.com/apache/arrow/tree/master/python) Arrow is an [Apache Software Foundation](www.apache.org) project. Learn more at @@ -110,4 +111,4 @@ Thank you in advance for your contributions! 
[1]: mailto:dev-subscribe@arrow.apache.org [2]: https://github.com/apache/arrow/tree/master/format [3]: https://issues.apache.org/jira/browse/ARROW -[4]: https://github.com/apache/arrow \ No newline at end of file +[4]: https://github.com/apache/arrow diff --git a/appveyor.yml b/appveyor.yml index 55c58d0bf664d..ea7922bf658ef 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -20,16 +20,13 @@ os: Visual Studio 2015 environment: matrix: - - JOB: "Cmake_Script_Tests" - GENERATOR: NMake Makefiles - PYTHON: "3.5" - ARCH: "64" - CONFIGURATION: "Release" - JOB: "Build" - GENERATOR: NMake Makefiles + GENERATOR: Visual Studio 15 2017 Win64 PYTHON: "3.5" ARCH: "64" CONFIGURATION: "Release" + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 + BOOST_ROOT: C:\Libraries\boost_1_64_0 - JOB: "Build_Debug" GENERATOR: Visual Studio 14 2015 Win64 PYTHON: "3.5" @@ -40,18 +37,25 @@ environment: PYTHON: "3.5" ARCH: "64" CONFIGURATION: "Release" + - JOB: "Static_Crt_Build" + GENERATOR: Visual Studio 14 2015 Win64 + PYTHON: "3.5" + ARCH: "64" - JOB: "Toolchain" GENERATOR: Visual Studio 14 2015 Win64 PYTHON: "3.5" ARCH: "64" CONFIGURATION: "Release" + - JOB: "Cmake_Script_Tests" + GENERATOR: NMake Makefiles + PYTHON: "3.5" + ARCH: "64" + CONFIGURATION: "Release" - JOB: "Build" - GENERATOR: Visual Studio 15 2017 Win64 + GENERATOR: NMake Makefiles PYTHON: "3.5" ARCH: "64" CONFIGURATION: "Release" - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 - BOOST_ROOT: C:\Libraries\boost_1_64_0 MSVC_DEFAULT_OPTIONS: ON BOOST_ROOT: C:\Libraries\boost_1_63_0 diff --git a/c_glib/.gitignore b/c_glib/.gitignore index 03bb0fe61892e..2719147405f71 100644 --- a/c_glib/.gitignore +++ b/c_glib/.gitignore @@ -41,6 +41,7 @@ Makefile.in /arrow-glib/enums.h /arrow-glib/stamp-* /arrow-glib/*.pc +/arrow-gpu-glib/*.pc /example/build /example/read-batch /example/read-stream diff --git a/c_glib/Brewfile b/c_glib/Brewfile index 80d3c81dd6f82..9fe5c3b616317 100644 --- a/c_glib/Brewfile +++ b/c_glib/Brewfile @@ -15,11 +15,14 
@@ # specific language governing permissions and limitations # under the License. -brew "gtk-doc" brew "autoconf-archive" -brew "gobject-introspection" -brew "git" +brew "boost" +brew "ccache" brew "cmake" -brew "wget" +brew "git" +brew "gobject-introspection" +brew "gtk-doc" +brew "jemalloc" brew "libtool" brew "lua" +brew "wget" diff --git a/c_glib/Makefile.am b/c_glib/Makefile.am index 577b749fb38bc..4cc70e5a08870 100644 --- a/c_glib/Makefile.am +++ b/c_glib/Makefile.am @@ -19,6 +19,7 @@ ACLOCAL_AMFLAGS = -I m4 ${ACLOCAL_FLAGS} SUBDIRS = \ arrow-glib \ + arrow-gpu-glib \ doc \ example \ tool diff --git a/c_glib/arrow-glib/Makefile.am b/c_glib/arrow-glib/Makefile.am index bf68ec4910e77..16d0703142798 100644 --- a/c_glib/arrow-glib/Makefile.am +++ b/c_glib/arrow-glib/Makefile.am @@ -45,10 +45,12 @@ libarrow_glib_la_headers = \ array.h \ array-builder.h \ arrow-glib.h \ + basic-array.h \ basic-data-type.h \ buffer.h \ chunked-array.h \ column.h \ + composite-array.h \ composite-data-type.h \ data-type.h \ error.h \ @@ -86,12 +88,13 @@ libarrow_glib_la_generated_sources = \ $(libarrow_glib_la_generated_headers) libarrow_glib_la_sources = \ - array.cpp \ array-builder.cpp \ + basic-array.cpp \ basic-data-type.cpp \ buffer.cpp \ chunked-array.cpp \ column.cpp \ + composite-array.cpp \ composite-data-type.cpp \ error.cpp \ field.cpp \ @@ -125,6 +128,7 @@ libarrow_glib_la_cpp_headers = \ array.hpp \ array-builder.hpp \ arrow-glib.hpp \ + basic-array.hpp \ basic-data-type.hpp \ buffer.hpp \ chunked-array.hpp \ @@ -203,20 +207,27 @@ pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = \ arrow-glib.pc -# GObject Introspection +if HAVE_INTROSPECTION -include $(INTROSPECTION_MAKEFILE) INTROSPECTION_GIRS = INTROSPECTION_SCANNER_ARGS = +INTROSPECTION_SCANNER_ENV = +if USE_ARROW_BUILD_DIR +INTROSPECTION_SCANNER_ENV += \ + LD_LIBRARY_PATH=$(ARROW_LIB_DIR):$${PKG_CONFIG_PATH} +endif +if OS_MACOS +INTROSPECTION_SCANNER_ENV += \ + ARCHFLAGS= +endif INTROSPECTION_COMPILER_ARGS = -if 
HAVE_INTROSPECTION Arrow-1.0.gir: libarrow-glib.la Arrow_1_0_gir_PACKAGES = \ - gobject-2.0 \ gio-2.0 -Arrow_1_0_gir_EXPORT_PACKAGES = arrow +Arrow_1_0_gir_EXPORT_PACKAGES = \ + arrow-glib Arrow_1_0_gir_INCLUDES = \ - GObject-2.0 \ Gio-2.0 Arrow_1_0_gir_CFLAGS = \ $(AM_CPPFLAGS) diff --git a/c_glib/arrow-glib/array-builder.cpp b/c_glib/arrow-glib/array-builder.cpp index 86e7f985be4a4..7625bcd51ee0f 100644 --- a/c_glib/arrow-glib/array-builder.cpp +++ b/c_glib/arrow-glib/array-builder.cpp @@ -24,6 +24,7 @@ #include #include #include +#include template gboolean @@ -316,7 +317,7 @@ garrow_array_builder_new(const std::shared_ptr &type, * * Release ownership of `arrow::ArrayBuilder` in `builder`. * - * Since: 0.8.8 + * Since: 0.8.0 */ void garrow_array_builder_release_ownership(GArrowArrayBuilder *builder) @@ -327,6 +328,39 @@ garrow_array_builder_release_ownership(GArrowArrayBuilder *builder) priv->have_ownership = FALSE; } +/** + * garrow_array_builder_get_value_data_type: + * @builder: A #GArrowArrayBuilder. + * + * Returns: (transfer full): The #GArrowDataType of the value of + * the array builder. + * + * Since: 0.9.0 + */ +GArrowDataType * +garrow_array_builder_get_value_data_type(GArrowArrayBuilder *builder) +{ + auto arrow_builder = garrow_array_builder_get_raw(builder); + auto arrow_type = arrow_builder->type(); + return garrow_data_type_new_raw(&arrow_type); +} + +/** + * garrow_array_builder_get_value_type: + * @builder: A #GArrowArrayBuilder. + * + * Returns: The #GArrowType of the value of the array builder. + * + * Since: 0.9.0 + */ +GArrowType +garrow_array_builder_get_value_type(GArrowArrayBuilder *builder) +{ + auto arrow_builder = garrow_array_builder_get_raw(builder); + auto arrow_type = arrow_builder->type(); + return garrow_type_from_raw(arrow_type->id()); +} + /** * garrow_array_builder_finish: * @builder: A #GArrowArrayBuilder. 
diff --git a/c_glib/arrow-glib/array-builder.h b/c_glib/arrow-glib/array-builder.h index 19dadb30999bd..ea95f31e8fae1 100644 --- a/c_glib/arrow-glib/array-builder.h +++ b/c_glib/arrow-glib/array-builder.h @@ -37,6 +37,10 @@ struct _GArrowArrayBuilderClass void garrow_array_builder_release_ownership(GArrowArrayBuilder *builder); +GArrowDataType * +garrow_array_builder_get_value_data_type(GArrowArrayBuilder *builder); +GArrowType garrow_array_builder_get_value_type(GArrowArrayBuilder *builder); + GArrowArray *garrow_array_builder_finish (GArrowArrayBuilder *builder, GError **error); diff --git a/c_glib/arrow-glib/array.h b/c_glib/arrow-glib/array.h index e988a8aae931d..9a845597d4f0f 100644 --- a/c_glib/arrow-glib/array.h +++ b/c_glib/arrow-glib/array.h @@ -19,1226 +19,5 @@ #pragma once -#include -#include -#include - -G_BEGIN_DECLS - -#define GARROW_TYPE_ARRAY \ - (garrow_array_get_type()) -#define GARROW_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), GARROW_TYPE_ARRAY, GArrowArray)) -#define GARROW_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), GARROW_TYPE_ARRAY, GArrowArrayClass)) -#define GARROW_IS_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), GARROW_TYPE_ARRAY)) -#define GARROW_IS_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), GARROW_TYPE_ARRAY)) -#define GARROW_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), GARROW_TYPE_ARRAY, GArrowArrayClass)) - -typedef struct _GArrowArray GArrowArray; -typedef struct _GArrowArrayClass GArrowArrayClass; - -/** - * GArrowArray: - * - * It wraps `arrow::Array`. 
- */ -struct _GArrowArray -{ - /*< private >*/ - GObject parent_instance; -}; - -struct _GArrowArrayClass -{ - GObjectClass parent_class; -}; - -GType garrow_array_get_type (void) G_GNUC_CONST; - -gboolean garrow_array_equal (GArrowArray *array, - GArrowArray *other_array); -gboolean garrow_array_equal_approx(GArrowArray *array, - GArrowArray *other_array); -gboolean garrow_array_equal_range (GArrowArray *array, - gint64 start_index, - GArrowArray *other_array, - gint64 other_start_index, - gint64 end_index); - -gboolean garrow_array_is_null (GArrowArray *array, - gint64 i); -gboolean garrow_array_is_valid (GArrowArray *array, - gint64 i); -gint64 garrow_array_get_length (GArrowArray *array); -gint64 garrow_array_get_offset (GArrowArray *array); -gint64 garrow_array_get_n_nulls (GArrowArray *array); -GArrowBuffer *garrow_array_get_null_bitmap(GArrowArray *array); -GArrowDataType *garrow_array_get_value_data_type(GArrowArray *array); -GArrowType garrow_array_get_value_type(GArrowArray *array); -GArrowArray *garrow_array_slice (GArrowArray *array, - gint64 offset, - gint64 length); -gchar *garrow_array_to_string (GArrowArray *array, - GError **error); - -GArrowArray *garrow_array_cast (GArrowArray *array, - GArrowDataType *target_data_type, - GArrowCastOptions *options, - GError **error); - -#define GARROW_TYPE_NULL_ARRAY \ - (garrow_null_array_get_type()) -#define GARROW_NULL_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_NULL_ARRAY, \ - GArrowNullArray)) -#define GARROW_NULL_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_NULL_ARRAY, \ - GArrowNullArrayClass)) -#define GARROW_IS_NULL_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_NULL_ARRAY)) -#define GARROW_IS_NULL_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_NULL_ARRAY)) -#define GARROW_NULL_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_NULL_ARRAY, \ - GArrowNullArrayClass)) - -typedef struct 
_GArrowNullArray GArrowNullArray; -typedef struct _GArrowNullArrayClass GArrowNullArrayClass; - -/** - * GArrowNullArray: - * - * It wraps `arrow::NullArray`. - */ -struct _GArrowNullArray -{ - /*< private >*/ - GArrowArray parent_instance; -}; - -struct _GArrowNullArrayClass -{ - GArrowArrayClass parent_class; -}; - -GType garrow_null_array_get_type(void) G_GNUC_CONST; - -GArrowNullArray *garrow_null_array_new(gint64 length); - - -#define GARROW_TYPE_PRIMITIVE_ARRAY \ - (garrow_primitive_array_get_type()) -#define GARROW_PRIMITIVE_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_PRIMITIVE_ARRAY, \ - GArrowPrimitiveArray)) -#define GARROW_PRIMITIVE_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_PRIMITIVE_ARRAY, \ - GArrowPrimitiveArrayClass)) -#define GARROW_IS_PRIMITIVE_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_PRIMITIVE_ARRAY)) -#define GARROW_IS_PRIMITIVE_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_PRIMITIVE_ARRAY)) -#define GARROW_PRIMITIVE_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_PRIMITIVE_ARRAY, \ - GArrowPrimitiveArrayClass)) - -typedef struct _GArrowPrimitiveArray GArrowPrimitiveArray; -typedef struct _GArrowPrimitiveArrayClass GArrowPrimitiveArrayClass; - -/** - * GArrowPrimitiveArray: - * - * It wraps `arrow::PrimitiveArray`. 
- */ -struct _GArrowPrimitiveArray -{ - /*< private >*/ - GArrowArray parent_instance; -}; - -struct _GArrowPrimitiveArrayClass -{ - GArrowArrayClass parent_class; -}; - -GType garrow_primitive_array_get_type(void) G_GNUC_CONST; - -GArrowBuffer *garrow_primitive_array_get_buffer(GArrowPrimitiveArray *array); - - -#define GARROW_TYPE_BOOLEAN_ARRAY \ - (garrow_boolean_array_get_type()) -#define GARROW_BOOLEAN_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_BOOLEAN_ARRAY, \ - GArrowBooleanArray)) -#define GARROW_BOOLEAN_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_BOOLEAN_ARRAY, \ - GArrowBooleanArrayClass)) -#define GARROW_IS_BOOLEAN_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_BOOLEAN_ARRAY)) -#define GARROW_IS_BOOLEAN_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_BOOLEAN_ARRAY)) -#define GARROW_BOOLEAN_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_BOOLEAN_ARRAY, \ - GArrowBooleanArrayClass)) - -typedef struct _GArrowBooleanArray GArrowBooleanArray; -typedef struct _GArrowBooleanArrayClass GArrowBooleanArrayClass; - -/** - * GArrowBooleanArray: - * - * It wraps `arrow::BooleanArray`. 
- */ -struct _GArrowBooleanArray -{ - /*< private >*/ - GArrowPrimitiveArray parent_instance; -}; - -struct _GArrowBooleanArrayClass -{ - GArrowPrimitiveArrayClass parent_class; -}; - -GType garrow_boolean_array_get_type (void) G_GNUC_CONST; - -GArrowBooleanArray *garrow_boolean_array_new(gint64 length, - GArrowBuffer *data, - GArrowBuffer *null_bitmap, - gint64 n_nulls); - -gboolean garrow_boolean_array_get_value (GArrowBooleanArray *array, - gint64 i); -gboolean *garrow_boolean_array_get_values(GArrowBooleanArray *array, - gint64 *length); - - -#define GARROW_TYPE_INT8_ARRAY \ - (garrow_int8_array_get_type()) -#define GARROW_INT8_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_INT8_ARRAY, \ - GArrowInt8Array)) -#define GARROW_INT8_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_INT8_ARRAY, \ - GArrowInt8ArrayClass)) -#define GARROW_IS_INT8_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_INT8_ARRAY)) -#define GARROW_IS_INT8_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_INT8_ARRAY)) -#define GARROW_INT8_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_INT8_ARRAY, \ - GArrowInt8ArrayClass)) - -typedef struct _GArrowInt8Array GArrowInt8Array; -typedef struct _GArrowInt8ArrayClass GArrowInt8ArrayClass; - -/** - * GArrowInt8Array: - * - * It wraps `arrow::Int8Array`. 
- */ -struct _GArrowInt8Array -{ - /*< private >*/ - GArrowPrimitiveArray parent_instance; -}; - -struct _GArrowInt8ArrayClass -{ - GArrowPrimitiveArrayClass parent_class; -}; - -GType garrow_int8_array_get_type(void) G_GNUC_CONST; - -GArrowInt8Array *garrow_int8_array_new(gint64 length, - GArrowBuffer *data, - GArrowBuffer *null_bitmap, - gint64 n_nulls); - -gint8 garrow_int8_array_get_value(GArrowInt8Array *array, - gint64 i); -const gint8 *garrow_int8_array_get_values(GArrowInt8Array *array, - gint64 *length); - - -#define GARROW_TYPE_UINT8_ARRAY \ - (garrow_uint8_array_get_type()) -#define GARROW_UINT8_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_UINT8_ARRAY, \ - GArrowUInt8Array)) -#define GARROW_UINT8_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_UINT8_ARRAY, \ - GArrowUInt8ArrayClass)) -#define GARROW_IS_UINT8_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_UINT8_ARRAY)) -#define GARROW_IS_UINT8_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_UINT8_ARRAY)) -#define GARROW_UINT8_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_UINT8_ARRAY, \ - GArrowUInt8ArrayClass)) - -typedef struct _GArrowUInt8Array GArrowUInt8Array; -typedef struct _GArrowUInt8ArrayClass GArrowUInt8ArrayClass; - -/** - * GArrowUInt8Array: - * - * It wraps `arrow::UInt8Array`. 
- */ -struct _GArrowUInt8Array -{ - /*< private >*/ - GArrowPrimitiveArray parent_instance; -}; - -struct _GArrowUInt8ArrayClass -{ - GArrowPrimitiveArrayClass parent_class; -}; - -GType garrow_uint8_array_get_type(void) G_GNUC_CONST; - -GArrowUInt8Array *garrow_uint8_array_new(gint64 length, - GArrowBuffer *data, - GArrowBuffer *null_bitmap, - gint64 n_nulls); - -guint8 garrow_uint8_array_get_value(GArrowUInt8Array *array, - gint64 i); -const guint8 *garrow_uint8_array_get_values(GArrowUInt8Array *array, - gint64 *length); - - -#define GARROW_TYPE_INT16_ARRAY \ - (garrow_int16_array_get_type()) -#define GARROW_INT16_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_INT16_ARRAY, \ - GArrowInt16Array)) -#define GARROW_INT16_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_INT16_ARRAY, \ - GArrowInt16ArrayClass)) -#define GARROW_IS_INT16_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_INT16_ARRAY)) -#define GARROW_IS_INT16_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_INT16_ARRAY)) -#define GARROW_INT16_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_INT16_ARRAY, \ - GArrowInt16ArrayClass)) - -typedef struct _GArrowInt16Array GArrowInt16Array; -typedef struct _GArrowInt16ArrayClass GArrowInt16ArrayClass; - -/** - * GArrowInt16Array: - * - * It wraps `arrow::Int16Array`. 
- */ -struct _GArrowInt16Array -{ - /*< private >*/ - GArrowPrimitiveArray parent_instance; -}; - -struct _GArrowInt16ArrayClass -{ - GArrowPrimitiveArrayClass parent_class; -}; - -GType garrow_int16_array_get_type(void) G_GNUC_CONST; - -GArrowInt16Array *garrow_int16_array_new(gint64 length, - GArrowBuffer *data, - GArrowBuffer *null_bitmap, - gint64 n_nulls); - -gint16 garrow_int16_array_get_value(GArrowInt16Array *array, - gint64 i); -const gint16 *garrow_int16_array_get_values(GArrowInt16Array *array, - gint64 *length); - - -#define GARROW_TYPE_UINT16_ARRAY \ - (garrow_uint16_array_get_type()) -#define GARROW_UINT16_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_UINT16_ARRAY, \ - GArrowUInt16Array)) -#define GARROW_UINT16_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_UINT16_ARRAY, \ - GArrowUInt16ArrayClass)) -#define GARROW_IS_UINT16_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_UINT16_ARRAY)) -#define GARROW_IS_UINT16_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_UINT16_ARRAY)) -#define GARROW_UINT16_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_UINT16_ARRAY, \ - GArrowUInt16ArrayClass)) - -typedef struct _GArrowUInt16Array GArrowUInt16Array; -typedef struct _GArrowUInt16ArrayClass GArrowUInt16ArrayClass; - -/** - * GArrowUInt16Array: - * - * It wraps `arrow::UInt16Array`. 
- */ -struct _GArrowUInt16Array -{ - /*< private >*/ - GArrowPrimitiveArray parent_instance; -}; - -struct _GArrowUInt16ArrayClass -{ - GArrowPrimitiveArrayClass parent_class; -}; - -GType garrow_uint16_array_get_type(void) G_GNUC_CONST; - -GArrowUInt16Array *garrow_uint16_array_new(gint64 length, - GArrowBuffer *data, - GArrowBuffer *null_bitmap, - gint64 n_nulls); - -guint16 garrow_uint16_array_get_value(GArrowUInt16Array *array, - gint64 i); -const guint16 *garrow_uint16_array_get_values(GArrowUInt16Array *array, - gint64 *length); - - -#define GARROW_TYPE_INT32_ARRAY \ - (garrow_int32_array_get_type()) -#define GARROW_INT32_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_INT32_ARRAY, \ - GArrowInt32Array)) -#define GARROW_INT32_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_INT32_ARRAY, \ - GArrowInt32ArrayClass)) -#define GARROW_IS_INT32_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_INT32_ARRAY)) -#define GARROW_IS_INT32_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_INT32_ARRAY)) -#define GARROW_INT32_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_INT32_ARRAY, \ - GArrowInt32ArrayClass)) - -typedef struct _GArrowInt32Array GArrowInt32Array; -typedef struct _GArrowInt32ArrayClass GArrowInt32ArrayClass; - -/** - * GArrowInt32Array: - * - * It wraps `arrow::Int32Array`. 
- */ -struct _GArrowInt32Array -{ - /*< private >*/ - GArrowPrimitiveArray parent_instance; -}; - -struct _GArrowInt32ArrayClass -{ - GArrowPrimitiveArrayClass parent_class; -}; - -GType garrow_int32_array_get_type(void) G_GNUC_CONST; - -GArrowInt32Array *garrow_int32_array_new(gint64 length, - GArrowBuffer *data, - GArrowBuffer *null_bitmap, - gint64 n_nulls); - -gint32 garrow_int32_array_get_value(GArrowInt32Array *array, - gint64 i); -const gint32 *garrow_int32_array_get_values(GArrowInt32Array *array, - gint64 *length); - - -#define GARROW_TYPE_UINT32_ARRAY \ - (garrow_uint32_array_get_type()) -#define GARROW_UINT32_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_UINT32_ARRAY, \ - GArrowUInt32Array)) -#define GARROW_UINT32_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_UINT32_ARRAY, \ - GArrowUInt32ArrayClass)) -#define GARROW_IS_UINT32_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_UINT32_ARRAY)) -#define GARROW_IS_UINT32_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_UINT32_ARRAY)) -#define GARROW_UINT32_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_UINT32_ARRAY, \ - GArrowUInt32ArrayClass)) - -typedef struct _GArrowUInt32Array GArrowUInt32Array; -typedef struct _GArrowUInt32ArrayClass GArrowUInt32ArrayClass; - -/** - * GArrowUInt32Array: - * - * It wraps `arrow::UInt32Array`. 
- */ -struct _GArrowUInt32Array -{ - /*< private >*/ - GArrowPrimitiveArray parent_instance; -}; - -struct _GArrowUInt32ArrayClass -{ - GArrowPrimitiveArrayClass parent_class; -}; - -GType garrow_uint32_array_get_type(void) G_GNUC_CONST; - -GArrowUInt32Array *garrow_uint32_array_new(gint64 length, - GArrowBuffer *data, - GArrowBuffer *null_bitmap, - gint64 n_nulls); - -guint32 garrow_uint32_array_get_value(GArrowUInt32Array *array, - gint64 i); -const guint32 *garrow_uint32_array_get_values(GArrowUInt32Array *array, - gint64 *length); - - -#define GARROW_TYPE_INT64_ARRAY \ - (garrow_int64_array_get_type()) -#define GARROW_INT64_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_INT64_ARRAY, \ - GArrowInt64Array)) -#define GARROW_INT64_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_INT64_ARRAY, \ - GArrowInt64ArrayClass)) -#define GARROW_IS_INT64_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_INT64_ARRAY)) -#define GARROW_IS_INT64_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_INT64_ARRAY)) -#define GARROW_INT64_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_INT64_ARRAY, \ - GArrowInt64ArrayClass)) - -typedef struct _GArrowInt64Array GArrowInt64Array; -typedef struct _GArrowInt64ArrayClass GArrowInt64ArrayClass; - -/** - * GArrowInt64Array: - * - * It wraps `arrow::Int64Array`. 
- */ -struct _GArrowInt64Array -{ - /*< private >*/ - GArrowPrimitiveArray parent_instance; -}; - -struct _GArrowInt64ArrayClass -{ - GArrowPrimitiveArrayClass parent_class; -}; - -GType garrow_int64_array_get_type(void) G_GNUC_CONST; - -GArrowInt64Array *garrow_int64_array_new(gint64 length, - GArrowBuffer *data, - GArrowBuffer *null_bitmap, - gint64 n_nulls); - -gint64 garrow_int64_array_get_value(GArrowInt64Array *array, - gint64 i); -const gint64 *garrow_int64_array_get_values(GArrowInt64Array *array, - gint64 *length); - - -#define GARROW_TYPE_UINT64_ARRAY \ - (garrow_uint64_array_get_type()) -#define GARROW_UINT64_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_UINT64_ARRAY, \ - GArrowUInt64Array)) -#define GARROW_UINT64_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_UINT64_ARRAY, \ - GArrowUInt64ArrayClass)) -#define GARROW_IS_UINT64_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_UINT64_ARRAY)) -#define GARROW_IS_UINT64_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_UINT64_ARRAY)) -#define GARROW_UINT64_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_UINT64_ARRAY, \ - GArrowUInt64ArrayClass)) - -typedef struct _GArrowUInt64Array GArrowUInt64Array; -typedef struct _GArrowUInt64ArrayClass GArrowUInt64ArrayClass; - -/** - * GArrowUInt64Array: - * - * It wraps `arrow::UInt64Array`. 
- */ -struct _GArrowUInt64Array -{ - /*< private >*/ - GArrowPrimitiveArray parent_instance; -}; - -struct _GArrowUInt64ArrayClass -{ - GArrowPrimitiveArrayClass parent_class; -}; - -GType garrow_uint64_array_get_type(void) G_GNUC_CONST; - -GArrowUInt64Array *garrow_uint64_array_new(gint64 length, - GArrowBuffer *data, - GArrowBuffer *null_bitmap, - gint64 n_nulls); - -guint64 garrow_uint64_array_get_value(GArrowUInt64Array *array, - gint64 i); -const guint64 *garrow_uint64_array_get_values(GArrowUInt64Array *array, - gint64 *length); - - -#define GARROW_TYPE_FLOAT_ARRAY \ - (garrow_float_array_get_type()) -#define GARROW_FLOAT_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_FLOAT_ARRAY, \ - GArrowFloatArray)) -#define GARROW_FLOAT_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_FLOAT_ARRAY, \ - GArrowFloatArrayClass)) -#define GARROW_IS_FLOAT_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_FLOAT_ARRAY)) -#define GARROW_IS_FLOAT_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_FLOAT_ARRAY)) -#define GARROW_FLOAT_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_FLOAT_ARRAY, \ - GArrowFloatArrayClass)) - -typedef struct _GArrowFloatArray GArrowFloatArray; -typedef struct _GArrowFloatArrayClass GArrowFloatArrayClass; - -/** - * GArrowFloatArray: - * - * It wraps `arrow::FloatArray`. 
- */ -struct _GArrowFloatArray -{ - /*< private >*/ - GArrowPrimitiveArray parent_instance; -}; - -struct _GArrowFloatArrayClass -{ - GArrowPrimitiveArrayClass parent_class; -}; - -GType garrow_float_array_get_type(void) G_GNUC_CONST; - -GArrowFloatArray *garrow_float_array_new(gint64 length, - GArrowBuffer *data, - GArrowBuffer *null_bitmap, - gint64 n_nulls); - -gfloat garrow_float_array_get_value(GArrowFloatArray *array, - gint64 i); -const gfloat *garrow_float_array_get_values(GArrowFloatArray *array, - gint64 *length); - - -#define GARROW_TYPE_DOUBLE_ARRAY \ - (garrow_double_array_get_type()) -#define GARROW_DOUBLE_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_DOUBLE_ARRAY, \ - GArrowDoubleArray)) -#define GARROW_DOUBLE_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_DOUBLE_ARRAY, \ - GArrowDoubleArrayClass)) -#define GARROW_IS_DOUBLE_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_DOUBLE_ARRAY)) -#define GARROW_IS_DOUBLE_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_DOUBLE_ARRAY)) -#define GARROW_DOUBLE_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_DOUBLE_ARRAY, \ - GArrowDoubleArrayClass)) - -typedef struct _GArrowDoubleArray GArrowDoubleArray; -typedef struct _GArrowDoubleArrayClass GArrowDoubleArrayClass; - -/** - * GArrowDoubleArray: - * - * It wraps `arrow::DoubleArray`. 
- */ -struct _GArrowDoubleArray -{ - /*< private >*/ - GArrowPrimitiveArray parent_instance; -}; - -struct _GArrowDoubleArrayClass -{ - GArrowPrimitiveArrayClass parent_class; -}; - -GType garrow_double_array_get_type(void) G_GNUC_CONST; - -GArrowDoubleArray *garrow_double_array_new(gint64 length, - GArrowBuffer *data, - GArrowBuffer *null_bitmap, - gint64 n_nulls); - -gdouble garrow_double_array_get_value(GArrowDoubleArray *array, - gint64 i); -const gdouble *garrow_double_array_get_values(GArrowDoubleArray *array, - gint64 *length); - - -#define GARROW_TYPE_BINARY_ARRAY \ - (garrow_binary_array_get_type()) -#define GARROW_BINARY_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_BINARY_ARRAY, \ - GArrowBinaryArray)) -#define GARROW_BINARY_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_BINARY_ARRAY, \ - GArrowBinaryArrayClass)) -#define GARROW_IS_BINARY_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_BINARY_ARRAY)) -#define GARROW_IS_BINARY_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_BINARY_ARRAY)) -#define GARROW_BINARY_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_BINARY_ARRAY, \ - GArrowBinaryArrayClass)) - -typedef struct _GArrowBinaryArray GArrowBinaryArray; -typedef struct _GArrowBinaryArrayClass GArrowBinaryArrayClass; - -/** - * GArrowBinaryArray: - * - * It wraps `arrow::BinaryArray`. 
- */ -struct _GArrowBinaryArray -{ - /*< private >*/ - GArrowArray parent_instance; -}; - -struct _GArrowBinaryArrayClass -{ - GArrowArrayClass parent_class; -}; - -GType garrow_binary_array_get_type(void) G_GNUC_CONST; - -GArrowBinaryArray *garrow_binary_array_new(gint64 length, - GArrowBuffer *value_offsets, - GArrowBuffer *data, - GArrowBuffer *null_bitmap, - gint64 n_nulls); - -GBytes *garrow_binary_array_get_value(GArrowBinaryArray *array, - gint64 i); -GArrowBuffer *garrow_binary_array_get_buffer(GArrowBinaryArray *array); -GArrowBuffer *garrow_binary_array_get_offsets_buffer(GArrowBinaryArray *array); - -#define GARROW_TYPE_STRING_ARRAY \ - (garrow_string_array_get_type()) -#define GARROW_STRING_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_STRING_ARRAY, \ - GArrowStringArray)) -#define GARROW_STRING_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_STRING_ARRAY, \ - GArrowStringArrayClass)) -#define GARROW_IS_STRING_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_STRING_ARRAY)) -#define GARROW_IS_STRING_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_STRING_ARRAY)) -#define GARROW_STRING_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_STRING_ARRAY, \ - GArrowStringArrayClass)) - -typedef struct _GArrowStringArray GArrowStringArray; -typedef struct _GArrowStringArrayClass GArrowStringArrayClass; - -/** - * GArrowStringArray: - * - * It wraps `arrow::StringArray`. 
- */ -struct _GArrowStringArray -{ - /*< private >*/ - GArrowBinaryArray parent_instance; -}; - -struct _GArrowStringArrayClass -{ - GArrowBinaryArrayClass parent_class; -}; - -GType garrow_string_array_get_type(void) G_GNUC_CONST; - -GArrowStringArray *garrow_string_array_new(gint64 length, - GArrowBuffer *value_offsets, - GArrowBuffer *data, - GArrowBuffer *null_bitmap, - gint64 n_nulls); - -gchar *garrow_string_array_get_string(GArrowStringArray *array, - gint64 i); - - -#define GARROW_TYPE_DATE32_ARRAY \ - (garrow_date32_array_get_type()) -#define GARROW_DATE32_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_DATE32_ARRAY, \ - GArrowDate32Array)) -#define GARROW_DATE32_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_DATE32_ARRAY, \ - GArrowDate32ArrayClass)) -#define GARROW_IS_DATE32_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_DATE32_ARRAY)) -#define GARROW_IS_DATE32_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_DATE32_ARRAY)) -#define GARROW_DATE32_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_DATE32_ARRAY, \ - GArrowDate32ArrayClass)) - -typedef struct _GArrowDate32Array GArrowDate32Array; -typedef struct _GArrowDate32ArrayClass GArrowDate32ArrayClass; - -/** - * GArrowDate32Array: - * - * It wraps `arrow::Date32Array`. 
- */ -struct _GArrowDate32Array -{ - /*< private >*/ - GArrowPrimitiveArray parent_instance; -}; - -struct _GArrowDate32ArrayClass -{ - GArrowPrimitiveArrayClass parent_class; -}; - -GType garrow_date32_array_get_type(void) G_GNUC_CONST; - -GArrowDate32Array *garrow_date32_array_new(gint64 length, - GArrowBuffer *data, - GArrowBuffer *null_bitmap, - gint64 n_nulls); - -gint32 garrow_date32_array_get_value(GArrowDate32Array *array, - gint64 i); -const gint32 *garrow_date32_array_get_values(GArrowDate32Array *array, - gint64 *length); - - -#define GARROW_TYPE_DATE64_ARRAY \ - (garrow_date64_array_get_type()) -#define GARROW_DATE64_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_DATE64_ARRAY, \ - GArrowDate64Array)) -#define GARROW_DATE64_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_DATE64_ARRAY, \ - GArrowDate64ArrayClass)) -#define GARROW_IS_DATE64_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_DATE64_ARRAY)) -#define GARROW_IS_DATE64_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_DATE64_ARRAY)) -#define GARROW_DATE64_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_DATE64_ARRAY, \ - GArrowDate64ArrayClass)) - -typedef struct _GArrowDate64Array GArrowDate64Array; -typedef struct _GArrowDate64ArrayClass GArrowDate64ArrayClass; - -/** - * GArrowDate64Array: - * - * It wraps `arrow::Date64Array`. 
- */ -struct _GArrowDate64Array -{ - /*< private >*/ - GArrowPrimitiveArray parent_instance; -}; - -struct _GArrowDate64ArrayClass -{ - GArrowPrimitiveArrayClass parent_class; -}; - -GType garrow_date64_array_get_type(void) G_GNUC_CONST; - -GArrowDate64Array *garrow_date64_array_new(gint64 length, - GArrowBuffer *data, - GArrowBuffer *null_bitmap, - gint64 n_nulls); - -gint64 garrow_date64_array_get_value(GArrowDate64Array *array, - gint64 i); -const gint64 *garrow_date64_array_get_values(GArrowDate64Array *array, - gint64 *length); - - -#define GARROW_TYPE_TIMESTAMP_ARRAY \ - (garrow_timestamp_array_get_type()) -#define GARROW_TIMESTAMP_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_TIMESTAMP_ARRAY, \ - GArrowTimestampArray)) -#define GARROW_TIMESTAMP_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_TIMESTAMP_ARRAY, \ - GArrowTimestampArrayClass)) -#define GARROW_IS_TIMESTAMP_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_TIMESTAMP_ARRAY)) -#define GARROW_IS_TIMESTAMP_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_TIMESTAMP_ARRAY)) -#define GARROW_TIMESTAMP_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_TIMESTAMP_ARRAY, \ - GArrowTimestampArrayClass)) - -typedef struct _GArrowTimestampArray GArrowTimestampArray; -typedef struct _GArrowTimestampArrayClass GArrowTimestampArrayClass; - -/** - * GArrowTimestampArray: - * - * It wraps `arrow::TimestampArray`. 
- */ -struct _GArrowTimestampArray -{ - /*< private >*/ - GArrowPrimitiveArray parent_instance; -}; - -struct _GArrowTimestampArrayClass -{ - GArrowPrimitiveArrayClass parent_class; -}; - -GType garrow_timestamp_array_get_type(void) G_GNUC_CONST; - -GArrowTimestampArray *garrow_timestamp_array_new(GArrowTimestampDataType *data_type, - gint64 length, - GArrowBuffer *data, - GArrowBuffer *null_bitmap, - gint64 n_nulls); - -gint64 garrow_timestamp_array_get_value(GArrowTimestampArray *array, - gint64 i); -const gint64 *garrow_timestamp_array_get_values(GArrowTimestampArray *array, - gint64 *length); - - -#define GARROW_TYPE_TIME32_ARRAY \ - (garrow_time32_array_get_type()) -#define GARROW_TIME32_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_TIME32_ARRAY, \ - GArrowTime32Array)) -#define GARROW_TIME32_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_TIME32_ARRAY, \ - GArrowTime32ArrayClass)) -#define GARROW_IS_TIME32_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_TIME32_ARRAY)) -#define GARROW_IS_TIME32_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_TIME32_ARRAY)) -#define GARROW_TIME32_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_TIME32_ARRAY, \ - GArrowTime32ArrayClass)) - -typedef struct _GArrowTime32Array GArrowTime32Array; -typedef struct _GArrowTime32ArrayClass GArrowTime32ArrayClass; - -/** - * GArrowTime32Array: - * - * It wraps `arrow::Time32Array`. 
- */ -struct _GArrowTime32Array -{ - /*< private >*/ - GArrowPrimitiveArray parent_instance; -}; - -struct _GArrowTime32ArrayClass -{ - GArrowPrimitiveArrayClass parent_class; -}; - -GType garrow_time32_array_get_type(void) G_GNUC_CONST; - -GArrowTime32Array *garrow_time32_array_new(GArrowTime32DataType *data_type, - gint64 length, - GArrowBuffer *data, - GArrowBuffer *null_bitmap, - gint64 n_nulls); - -gint32 garrow_time32_array_get_value(GArrowTime32Array *array, - gint64 i); -const gint32 *garrow_time32_array_get_values(GArrowTime32Array *array, - gint64 *length); - - -#define GARROW_TYPE_TIME64_ARRAY \ - (garrow_time64_array_get_type()) -#define GARROW_TIME64_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_TIME64_ARRAY, \ - GArrowTime64Array)) -#define GARROW_TIME64_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_TIME64_ARRAY, \ - GArrowTime64ArrayClass)) -#define GARROW_IS_TIME64_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_TIME64_ARRAY)) -#define GARROW_IS_TIME64_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_TIME64_ARRAY)) -#define GARROW_TIME64_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_TIME64_ARRAY, \ - GArrowTime64ArrayClass)) - -typedef struct _GArrowTime64Array GArrowTime64Array; -typedef struct _GArrowTime64ArrayClass GArrowTime64ArrayClass; - -/** - * GArrowTime64Array: - * - * It wraps `arrow::Time64Array`. 
- */ -struct _GArrowTime64Array -{ - /*< private >*/ - GArrowPrimitiveArray parent_instance; -}; - -struct _GArrowTime64ArrayClass -{ - GArrowPrimitiveArrayClass parent_class; -}; - -GType garrow_time64_array_get_type(void) G_GNUC_CONST; - -GArrowTime64Array *garrow_time64_array_new(GArrowTime64DataType *data_type, - gint64 length, - GArrowBuffer *data, - GArrowBuffer *null_bitmap, - gint64 n_nulls); - -gint64 garrow_time64_array_get_value(GArrowTime64Array *array, - gint64 i); -const gint64 *garrow_time64_array_get_values(GArrowTime64Array *array, - gint64 *length); - - -#define GARROW_TYPE_LIST_ARRAY \ - (garrow_list_array_get_type()) -#define GARROW_LIST_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_LIST_ARRAY, \ - GArrowListArray)) -#define GARROW_LIST_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_LIST_ARRAY, \ - GArrowListArrayClass)) -#define GARROW_IS_LIST_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_LIST_ARRAY)) -#define GARROW_IS_LIST_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_LIST_ARRAY)) -#define GARROW_LIST_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_LIST_ARRAY, \ - GArrowListArrayClass)) - -typedef struct _GArrowListArray GArrowListArray; -typedef struct _GArrowListArrayClass GArrowListArrayClass; - -/** - * GArrowListArray: - * - * It wraps `arrow::ListArray`. 
- */ -struct _GArrowListArray -{ - /*< private >*/ - GArrowArray parent_instance; -}; - -struct _GArrowListArrayClass -{ - GArrowArrayClass parent_class; -}; - -GType garrow_list_array_get_type(void) G_GNUC_CONST; - -GArrowListArray *garrow_list_array_new(gint64 length, - GArrowBuffer *value_offsets, - GArrowArray *values, - GArrowBuffer *null_bitmap, - gint64 n_nulls); - -GArrowDataType *garrow_list_array_get_value_type(GArrowListArray *array); -GArrowArray *garrow_list_array_get_value(GArrowListArray *array, - gint64 i); - - -#define GARROW_TYPE_STRUCT_ARRAY \ - (garrow_struct_array_get_type()) -#define GARROW_STRUCT_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_STRUCT_ARRAY, \ - GArrowStructArray)) -#define GARROW_STRUCT_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_STRUCT_ARRAY, \ - GArrowStructArrayClass)) -#define GARROW_IS_STRUCT_ARRAY(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_STRUCT_ARRAY)) -#define GARROW_IS_STRUCT_ARRAY_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_STRUCT_ARRAY)) -#define GARROW_STRUCT_ARRAY_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_STRUCT_ARRAY, \ - GArrowStructArrayClass)) - -typedef struct _GArrowStructArray GArrowStructArray; -typedef struct _GArrowStructArrayClass GArrowStructArrayClass; - -/** - * GArrowStructArray: - * - * It wraps `arrow::StructArray`. 
- */ -struct _GArrowStructArray -{ - /*< private >*/ - GArrowArray parent_instance; -}; - -struct _GArrowStructArrayClass -{ - GArrowArrayClass parent_class; -}; - -GType garrow_struct_array_get_type(void) G_GNUC_CONST; - -GArrowStructArray *garrow_struct_array_new(GArrowDataType *data_type, - gint64 length, - GList *children, - GArrowBuffer *null_bitmap, - gint64 n_nulls); - -GArrowArray *garrow_struct_array_get_field(GArrowStructArray *array, - gint i); -GList *garrow_struct_array_get_fields(GArrowStructArray *array); - -G_END_DECLS +#include +#include diff --git a/c_glib/arrow-glib/array.hpp b/c_glib/arrow-glib/array.hpp index d2dff22c48cf9..e575c425380f2 100644 --- a/c_glib/arrow-glib/array.hpp +++ b/c_glib/arrow-glib/array.hpp @@ -19,9 +19,5 @@ #pragma once -#include - -#include - -GArrowArray *garrow_array_new_raw(std::shared_ptr *arrow_array); -std::shared_ptr garrow_array_get_raw(GArrowArray *array); +#include +#include diff --git a/c_glib/arrow-glib/array.cpp b/c_glib/arrow-glib/basic-array.cpp similarity index 90% rename from c_glib/arrow-glib/array.cpp rename to c_glib/arrow-glib/basic-array.cpp index 4945ca3cfe903..3c027c0e11240 100644 --- a/c_glib/arrow-glib/array.cpp +++ b/c_glib/arrow-glib/basic-array.cpp @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include @@ -86,9 +86,9 @@ garrow_primitive_array_new(GArrowDataType *data_type, G_BEGIN_DECLS /** - * SECTION: array - * @section_id: array-classes - * @title: Array classes + * SECTION: basic-array + * @section_id: basic-array-classes + * @title: Basic array classes * @include: arrow-glib/arrow-glib.h * * #GArrowArray is a base class for all array classes such as @@ -194,15 +194,6 @@ G_BEGIN_DECLS * nanoseconds since midnight in 64-bit signed integer array. It can * store zero or more time data. If you don't have Arrow format data, * you need to use #GArrowTime64ArrayBuilder to create a new array. - * - * #GArrowListArray is a class for list array. 
It can store zero or - * more list data. If you don't have Arrow format data, you need to - * use #GArrowListArrayBuilder to create a new array. - * - * #GArrowStructArray is a class for struct array. It can store zero - * or more structs. One struct has zero or more fields. If you don't - * have Arrow format data, you need to use #GArrowStructArrayBuilder - * to create a new array. */ typedef struct GArrowArrayPrivate_ { @@ -572,6 +563,77 @@ garrow_array_cast(GArrowArray *array, return garrow_array_new_raw(&arrow_casted_array); } +/** + * garrow_array_unique: + * @array: A #GArrowArray. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable) (transfer full): + * A newly created unique elements array on success, %NULL on error. + * + * Since: 0.8.0 + */ +GArrowArray * +garrow_array_unique(GArrowArray *array, + GError **error) +{ + auto arrow_array = garrow_array_get_raw(array); + auto memory_pool = arrow::default_memory_pool(); + arrow::compute::FunctionContext context(memory_pool); + std::shared_ptr arrow_unique_array; + auto status = arrow::compute::Unique(&context, + arrow::compute::Datum(arrow_array), + &arrow_unique_array); + if (!status.ok()) { + std::stringstream message; + message << "[array][unique] <"; + message << arrow_array->type()->ToString(); + message << ">"; + garrow_error_check(error, status, message.str().c_str()); + return NULL; + } + + return garrow_array_new_raw(&arrow_unique_array); +} + +/** + * garrow_array_dictionary_encode: + * @array: A #GArrowArray. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable) (transfer full): + * A newly created #GArrowDictionaryArray for the @array on success, + * %NULL on error. 
+ * + * Since: 0.8.0 + */ +GArrowArray * +garrow_array_dictionary_encode(GArrowArray *array, + GError **error) +{ + auto arrow_array = garrow_array_get_raw(array); + auto memory_pool = arrow::default_memory_pool(); + arrow::compute::FunctionContext context(memory_pool); + arrow::compute::Datum dictionary_encoded_datum; + auto status = + arrow::compute::DictionaryEncode(&context, + arrow::compute::Datum(arrow_array), + &dictionary_encoded_datum); + if (!status.ok()) { + std::stringstream message; + message << "[array][dictionary-encode] <"; + message << arrow_array->type()->ToString(); + message << ">"; + garrow_error_check(error, status, message.str().c_str()); + return NULL; + } + + auto arrow_dictionary_encoded_array = + arrow::MakeArray(dictionary_encoded_datum.array()); + + return garrow_array_new_raw(&arrow_dictionary_encoded_array); +} + G_DEFINE_TYPE(GArrowNullArray, \ garrow_null_array, \ @@ -2028,195 +2090,6 @@ garrow_time64_array_get_values(GArrowTime64Array *array, return reinterpret_cast(values); } - -G_DEFINE_TYPE(GArrowListArray, \ - garrow_list_array, \ - GARROW_TYPE_ARRAY) - -static void -garrow_list_array_init(GArrowListArray *object) -{ -} - -static void -garrow_list_array_class_init(GArrowListArrayClass *klass) -{ -} - -/** - * garrow_list_array_new: - * @length: The number of elements. - * @value_offsets: The offsets of @values in Arrow format. - * @values: The values as #GArrowArray. - * @null_bitmap: (nullable): The bitmap that shows null elements. The - * N-th element is null when the N-th bit is 0, not null otherwise. - * If the array has no null elements, the bitmap must be %NULL and - * @n_nulls is 0. - * @n_nulls: The number of null elements. If -1 is specified, the - * number of nulls are computed from @null_bitmap. - * - * Returns: A newly created #GArrowListArray. 
- * - * Since: 0.4.0 - */ -GArrowListArray * -garrow_list_array_new(gint64 length, - GArrowBuffer *value_offsets, - GArrowArray *values, - GArrowBuffer *null_bitmap, - gint64 n_nulls) -{ - const auto arrow_value_offsets = garrow_buffer_get_raw(value_offsets); - const auto arrow_values = garrow_array_get_raw(values); - const auto arrow_bitmap = garrow_buffer_get_raw(null_bitmap); - auto arrow_data_type = arrow::list(arrow_values->type()); - auto arrow_list_array = - std::make_shared(arrow_data_type, - length, - arrow_value_offsets, - arrow_values, - arrow_bitmap, - n_nulls); - auto arrow_array = - std::static_pointer_cast(arrow_list_array); - return GARROW_LIST_ARRAY(garrow_array_new_raw(&arrow_array)); -} - -/** - * garrow_list_array_get_value_type: - * @array: A #GArrowListArray. - * - * Returns: (transfer full): The data type of value in each list. - */ -GArrowDataType * -garrow_list_array_get_value_type(GArrowListArray *array) -{ - auto arrow_array = garrow_array_get_raw(GARROW_ARRAY(array)); - auto arrow_list_array = - static_cast(arrow_array.get()); - auto arrow_value_type = arrow_list_array->value_type(); - return garrow_data_type_new_raw(&arrow_value_type); -} - -/** - * garrow_list_array_get_value: - * @array: A #GArrowListArray. - * @i: The index of the target value. - * - * Returns: (transfer full): The i-th list. 
- */ -GArrowArray * -garrow_list_array_get_value(GArrowListArray *array, - gint64 i) -{ - auto arrow_array = garrow_array_get_raw(GARROW_ARRAY(array)); - auto arrow_list_array = - static_cast(arrow_array.get()); - auto arrow_list = - arrow_list_array->values()->Slice(arrow_list_array->value_offset(i), - arrow_list_array->value_length(i)); - return garrow_array_new_raw(&arrow_list); -} - - -G_DEFINE_TYPE(GArrowStructArray, \ - garrow_struct_array, \ - GARROW_TYPE_ARRAY) - -static void -garrow_struct_array_init(GArrowStructArray *object) -{ -} - -static void -garrow_struct_array_class_init(GArrowStructArrayClass *klass) -{ -} - -/** - * garrow_struct_array_new: - * @data_type: The data type of the struct. - * @length: The number of elements. - * @children: (element-type GArrowArray): The arrays for each field - * as #GList of #GArrowArray. - * @null_bitmap: (nullable): The bitmap that shows null elements. The - * N-th element is null when the N-th bit is 0, not null otherwise. - * If the array has no null elements, the bitmap must be %NULL and - * @n_nulls is 0. - * @n_nulls: The number of null elements. If -1 is specified, the - * number of nulls are computed from @null_bitmap. - * - * Returns: A newly created #GArrowStructArray. 
- * - * Since: 0.4.0 - */ -GArrowStructArray * -garrow_struct_array_new(GArrowDataType *data_type, - gint64 length, - GList *children, - GArrowBuffer *null_bitmap, - gint64 n_nulls) -{ - const auto arrow_data_type = garrow_data_type_get_raw(data_type); - std::vector> arrow_children; - for (GList *node = children; node; node = node->next) { - GArrowArray *child = GARROW_ARRAY(node->data); - arrow_children.push_back(garrow_array_get_raw(child)); - } - const auto arrow_bitmap = garrow_buffer_get_raw(null_bitmap); - auto arrow_struct_array = - std::make_shared(arrow_data_type, - length, - arrow_children, - arrow_bitmap, - n_nulls); - auto arrow_array = - std::static_pointer_cast(arrow_struct_array); - return GARROW_STRUCT_ARRAY(garrow_array_new_raw(&arrow_array)); -} - -/** - * garrow_struct_array_get_field - * @array: A #GArrowStructArray. - * @i: The index of the field in the struct. - * - * Returns: (transfer full): The i-th field. - */ -GArrowArray * -garrow_struct_array_get_field(GArrowStructArray *array, - gint i) -{ - auto arrow_array = garrow_array_get_raw(GARROW_ARRAY(array)); - auto arrow_struct_array = - static_cast(arrow_array.get()); - auto arrow_field = arrow_struct_array->field(i); - return garrow_array_new_raw(&arrow_field); -} - -/** - * garrow_struct_array_get_fields - * @array: A #GArrowStructArray. - * - * Returns: (element-type GArrowArray) (transfer full): - * The fields in the struct. 
- */ -GList * -garrow_struct_array_get_fields(GArrowStructArray *array) -{ - const auto arrow_array = garrow_array_get_raw(GARROW_ARRAY(array)); - const auto arrow_struct_array = - static_cast(arrow_array.get()); - - GList *fields = NULL; - for (int i = 0; i < arrow_struct_array->num_fields(); ++i) { - auto arrow_field = arrow_struct_array->field(i); - GArrowArray *field = garrow_array_new_raw(&arrow_field); - fields = g_list_prepend(fields, field); - } - - return g_list_reverse(fields); -} - G_END_DECLS GArrowArray * @@ -2289,6 +2162,9 @@ garrow_array_new_raw(std::shared_ptr *arrow_array) case arrow::Type::type::STRUCT: type = GARROW_TYPE_STRUCT_ARRAY; break; + case arrow::Type::type::DICTIONARY: + type = GARROW_TYPE_DICTIONARY_ARRAY; + break; default: type = GARROW_TYPE_ARRAY; break; diff --git a/c_glib/arrow-glib/basic-array.h b/c_glib/arrow-glib/basic-array.h new file mode 100644 index 0000000000000..420c30ecc7161 --- /dev/null +++ b/c_glib/arrow-glib/basic-array.h @@ -0,0 +1,1119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include +#include +#include + +G_BEGIN_DECLS + +#define GARROW_TYPE_ARRAY (garrow_array_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowArray, + garrow_array, + GARROW, + ARRAY, + GObject) +struct _GArrowArrayClass +{ + GObjectClass parent_class; +}; + +gboolean garrow_array_equal (GArrowArray *array, + GArrowArray *other_array); +gboolean garrow_array_equal_approx(GArrowArray *array, + GArrowArray *other_array); +gboolean garrow_array_equal_range (GArrowArray *array, + gint64 start_index, + GArrowArray *other_array, + gint64 other_start_index, + gint64 end_index); + +gboolean garrow_array_is_null (GArrowArray *array, + gint64 i); +gboolean garrow_array_is_valid (GArrowArray *array, + gint64 i); +gint64 garrow_array_get_length (GArrowArray *array); +gint64 garrow_array_get_offset (GArrowArray *array); +gint64 garrow_array_get_n_nulls (GArrowArray *array); +GArrowBuffer *garrow_array_get_null_bitmap(GArrowArray *array); +GArrowDataType *garrow_array_get_value_data_type(GArrowArray *array); +GArrowType garrow_array_get_value_type(GArrowArray *array); +GArrowArray *garrow_array_slice (GArrowArray *array, + gint64 offset, + gint64 length); +gchar *garrow_array_to_string (GArrowArray *array, + GError **error); + +GArrowArray *garrow_array_cast (GArrowArray *array, + GArrowDataType *target_data_type, + GArrowCastOptions *options, + GError **error); +GArrowArray *garrow_array_unique (GArrowArray *array, + GError **error); +GArrowArray *garrow_array_dictionary_encode(GArrowArray *array, + GError **error); + +#define GARROW_TYPE_NULL_ARRAY \ + (garrow_null_array_get_type()) +#define GARROW_NULL_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_NULL_ARRAY, \ + GArrowNullArray)) +#define GARROW_NULL_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_NULL_ARRAY, \ + GArrowNullArrayClass)) +#define GARROW_IS_NULL_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_NULL_ARRAY)) +#define 
GARROW_IS_NULL_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_NULL_ARRAY)) +#define GARROW_NULL_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_NULL_ARRAY, \ + GArrowNullArrayClass)) + +typedef struct _GArrowNullArray GArrowNullArray; +typedef struct _GArrowNullArrayClass GArrowNullArrayClass; + +/** + * GArrowNullArray: + * + * It wraps `arrow::NullArray`. + */ +struct _GArrowNullArray +{ + /*< private >*/ + GArrowArray parent_instance; +}; + +struct _GArrowNullArrayClass +{ + GArrowArrayClass parent_class; +}; + +GType garrow_null_array_get_type(void) G_GNUC_CONST; + +GArrowNullArray *garrow_null_array_new(gint64 length); + + +#define GARROW_TYPE_PRIMITIVE_ARRAY \ + (garrow_primitive_array_get_type()) +#define GARROW_PRIMITIVE_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_PRIMITIVE_ARRAY, \ + GArrowPrimitiveArray)) +#define GARROW_PRIMITIVE_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_PRIMITIVE_ARRAY, \ + GArrowPrimitiveArrayClass)) +#define GARROW_IS_PRIMITIVE_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_PRIMITIVE_ARRAY)) +#define GARROW_IS_PRIMITIVE_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_PRIMITIVE_ARRAY)) +#define GARROW_PRIMITIVE_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_PRIMITIVE_ARRAY, \ + GArrowPrimitiveArrayClass)) + +typedef struct _GArrowPrimitiveArray GArrowPrimitiveArray; +typedef struct _GArrowPrimitiveArrayClass GArrowPrimitiveArrayClass; + +/** + * GArrowPrimitiveArray: + * + * It wraps `arrow::PrimitiveArray`. 
+ */ +struct _GArrowPrimitiveArray +{ + /*< private >*/ + GArrowArray parent_instance; +}; + +struct _GArrowPrimitiveArrayClass +{ + GArrowArrayClass parent_class; +}; + +GType garrow_primitive_array_get_type(void) G_GNUC_CONST; + +GArrowBuffer *garrow_primitive_array_get_buffer(GArrowPrimitiveArray *array); + + +#define GARROW_TYPE_BOOLEAN_ARRAY \ + (garrow_boolean_array_get_type()) +#define GARROW_BOOLEAN_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_BOOLEAN_ARRAY, \ + GArrowBooleanArray)) +#define GARROW_BOOLEAN_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_BOOLEAN_ARRAY, \ + GArrowBooleanArrayClass)) +#define GARROW_IS_BOOLEAN_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_BOOLEAN_ARRAY)) +#define GARROW_IS_BOOLEAN_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_BOOLEAN_ARRAY)) +#define GARROW_BOOLEAN_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_BOOLEAN_ARRAY, \ + GArrowBooleanArrayClass)) + +typedef struct _GArrowBooleanArray GArrowBooleanArray; +typedef struct _GArrowBooleanArrayClass GArrowBooleanArrayClass; + +/** + * GArrowBooleanArray: + * + * It wraps `arrow::BooleanArray`. 
+ */ +struct _GArrowBooleanArray +{ + /*< private >*/ + GArrowPrimitiveArray parent_instance; +}; + +struct _GArrowBooleanArrayClass +{ + GArrowPrimitiveArrayClass parent_class; +}; + +GType garrow_boolean_array_get_type (void) G_GNUC_CONST; + +GArrowBooleanArray *garrow_boolean_array_new(gint64 length, + GArrowBuffer *data, + GArrowBuffer *null_bitmap, + gint64 n_nulls); + +gboolean garrow_boolean_array_get_value (GArrowBooleanArray *array, + gint64 i); +gboolean *garrow_boolean_array_get_values(GArrowBooleanArray *array, + gint64 *length); + + +#define GARROW_TYPE_INT8_ARRAY \ + (garrow_int8_array_get_type()) +#define GARROW_INT8_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_INT8_ARRAY, \ + GArrowInt8Array)) +#define GARROW_INT8_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_INT8_ARRAY, \ + GArrowInt8ArrayClass)) +#define GARROW_IS_INT8_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_INT8_ARRAY)) +#define GARROW_IS_INT8_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_INT8_ARRAY)) +#define GARROW_INT8_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_INT8_ARRAY, \ + GArrowInt8ArrayClass)) + +typedef struct _GArrowInt8Array GArrowInt8Array; +typedef struct _GArrowInt8ArrayClass GArrowInt8ArrayClass; + +/** + * GArrowInt8Array: + * + * It wraps `arrow::Int8Array`. 
+ */ +struct _GArrowInt8Array +{ + /*< private >*/ + GArrowPrimitiveArray parent_instance; +}; + +struct _GArrowInt8ArrayClass +{ + GArrowPrimitiveArrayClass parent_class; +}; + +GType garrow_int8_array_get_type(void) G_GNUC_CONST; + +GArrowInt8Array *garrow_int8_array_new(gint64 length, + GArrowBuffer *data, + GArrowBuffer *null_bitmap, + gint64 n_nulls); + +gint8 garrow_int8_array_get_value(GArrowInt8Array *array, + gint64 i); +const gint8 *garrow_int8_array_get_values(GArrowInt8Array *array, + gint64 *length); + + +#define GARROW_TYPE_UINT8_ARRAY \ + (garrow_uint8_array_get_type()) +#define GARROW_UINT8_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_UINT8_ARRAY, \ + GArrowUInt8Array)) +#define GARROW_UINT8_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_UINT8_ARRAY, \ + GArrowUInt8ArrayClass)) +#define GARROW_IS_UINT8_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_UINT8_ARRAY)) +#define GARROW_IS_UINT8_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_UINT8_ARRAY)) +#define GARROW_UINT8_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_UINT8_ARRAY, \ + GArrowUInt8ArrayClass)) + +typedef struct _GArrowUInt8Array GArrowUInt8Array; +typedef struct _GArrowUInt8ArrayClass GArrowUInt8ArrayClass; + +/** + * GArrowUInt8Array: + * + * It wraps `arrow::UInt8Array`. 
+ */ +struct _GArrowUInt8Array +{ + /*< private >*/ + GArrowPrimitiveArray parent_instance; +}; + +struct _GArrowUInt8ArrayClass +{ + GArrowPrimitiveArrayClass parent_class; +}; + +GType garrow_uint8_array_get_type(void) G_GNUC_CONST; + +GArrowUInt8Array *garrow_uint8_array_new(gint64 length, + GArrowBuffer *data, + GArrowBuffer *null_bitmap, + gint64 n_nulls); + +guint8 garrow_uint8_array_get_value(GArrowUInt8Array *array, + gint64 i); +const guint8 *garrow_uint8_array_get_values(GArrowUInt8Array *array, + gint64 *length); + + +#define GARROW_TYPE_INT16_ARRAY \ + (garrow_int16_array_get_type()) +#define GARROW_INT16_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_INT16_ARRAY, \ + GArrowInt16Array)) +#define GARROW_INT16_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_INT16_ARRAY, \ + GArrowInt16ArrayClass)) +#define GARROW_IS_INT16_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_INT16_ARRAY)) +#define GARROW_IS_INT16_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_INT16_ARRAY)) +#define GARROW_INT16_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_INT16_ARRAY, \ + GArrowInt16ArrayClass)) + +typedef struct _GArrowInt16Array GArrowInt16Array; +typedef struct _GArrowInt16ArrayClass GArrowInt16ArrayClass; + +/** + * GArrowInt16Array: + * + * It wraps `arrow::Int16Array`. 
+ */ +struct _GArrowInt16Array +{ + /*< private >*/ + GArrowPrimitiveArray parent_instance; +}; + +struct _GArrowInt16ArrayClass +{ + GArrowPrimitiveArrayClass parent_class; +}; + +GType garrow_int16_array_get_type(void) G_GNUC_CONST; + +GArrowInt16Array *garrow_int16_array_new(gint64 length, + GArrowBuffer *data, + GArrowBuffer *null_bitmap, + gint64 n_nulls); + +gint16 garrow_int16_array_get_value(GArrowInt16Array *array, + gint64 i); +const gint16 *garrow_int16_array_get_values(GArrowInt16Array *array, + gint64 *length); + + +#define GARROW_TYPE_UINT16_ARRAY \ + (garrow_uint16_array_get_type()) +#define GARROW_UINT16_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_UINT16_ARRAY, \ + GArrowUInt16Array)) +#define GARROW_UINT16_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_UINT16_ARRAY, \ + GArrowUInt16ArrayClass)) +#define GARROW_IS_UINT16_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_UINT16_ARRAY)) +#define GARROW_IS_UINT16_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_UINT16_ARRAY)) +#define GARROW_UINT16_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_UINT16_ARRAY, \ + GArrowUInt16ArrayClass)) + +typedef struct _GArrowUInt16Array GArrowUInt16Array; +typedef struct _GArrowUInt16ArrayClass GArrowUInt16ArrayClass; + +/** + * GArrowUInt16Array: + * + * It wraps `arrow::UInt16Array`. 
+ */ +struct _GArrowUInt16Array +{ + /*< private >*/ + GArrowPrimitiveArray parent_instance; +}; + +struct _GArrowUInt16ArrayClass +{ + GArrowPrimitiveArrayClass parent_class; +}; + +GType garrow_uint16_array_get_type(void) G_GNUC_CONST; + +GArrowUInt16Array *garrow_uint16_array_new(gint64 length, + GArrowBuffer *data, + GArrowBuffer *null_bitmap, + gint64 n_nulls); + +guint16 garrow_uint16_array_get_value(GArrowUInt16Array *array, + gint64 i); +const guint16 *garrow_uint16_array_get_values(GArrowUInt16Array *array, + gint64 *length); + + +#define GARROW_TYPE_INT32_ARRAY \ + (garrow_int32_array_get_type()) +#define GARROW_INT32_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_INT32_ARRAY, \ + GArrowInt32Array)) +#define GARROW_INT32_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_INT32_ARRAY, \ + GArrowInt32ArrayClass)) +#define GARROW_IS_INT32_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_INT32_ARRAY)) +#define GARROW_IS_INT32_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_INT32_ARRAY)) +#define GARROW_INT32_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_INT32_ARRAY, \ + GArrowInt32ArrayClass)) + +typedef struct _GArrowInt32Array GArrowInt32Array; +typedef struct _GArrowInt32ArrayClass GArrowInt32ArrayClass; + +/** + * GArrowInt32Array: + * + * It wraps `arrow::Int32Array`. 
+ */ +struct _GArrowInt32Array +{ + /*< private >*/ + GArrowPrimitiveArray parent_instance; +}; + +struct _GArrowInt32ArrayClass +{ + GArrowPrimitiveArrayClass parent_class; +}; + +GType garrow_int32_array_get_type(void) G_GNUC_CONST; + +GArrowInt32Array *garrow_int32_array_new(gint64 length, + GArrowBuffer *data, + GArrowBuffer *null_bitmap, + gint64 n_nulls); + +gint32 garrow_int32_array_get_value(GArrowInt32Array *array, + gint64 i); +const gint32 *garrow_int32_array_get_values(GArrowInt32Array *array, + gint64 *length); + + +#define GARROW_TYPE_UINT32_ARRAY \ + (garrow_uint32_array_get_type()) +#define GARROW_UINT32_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_UINT32_ARRAY, \ + GArrowUInt32Array)) +#define GARROW_UINT32_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_UINT32_ARRAY, \ + GArrowUInt32ArrayClass)) +#define GARROW_IS_UINT32_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_UINT32_ARRAY)) +#define GARROW_IS_UINT32_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_UINT32_ARRAY)) +#define GARROW_UINT32_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_UINT32_ARRAY, \ + GArrowUInt32ArrayClass)) + +typedef struct _GArrowUInt32Array GArrowUInt32Array; +typedef struct _GArrowUInt32ArrayClass GArrowUInt32ArrayClass; + +/** + * GArrowUInt32Array: + * + * It wraps `arrow::UInt32Array`. 
+ */ +struct _GArrowUInt32Array +{ + /*< private >*/ + GArrowPrimitiveArray parent_instance; +}; + +struct _GArrowUInt32ArrayClass +{ + GArrowPrimitiveArrayClass parent_class; +}; + +GType garrow_uint32_array_get_type(void) G_GNUC_CONST; + +GArrowUInt32Array *garrow_uint32_array_new(gint64 length, + GArrowBuffer *data, + GArrowBuffer *null_bitmap, + gint64 n_nulls); + +guint32 garrow_uint32_array_get_value(GArrowUInt32Array *array, + gint64 i); +const guint32 *garrow_uint32_array_get_values(GArrowUInt32Array *array, + gint64 *length); + + +#define GARROW_TYPE_INT64_ARRAY \ + (garrow_int64_array_get_type()) +#define GARROW_INT64_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_INT64_ARRAY, \ + GArrowInt64Array)) +#define GARROW_INT64_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_INT64_ARRAY, \ + GArrowInt64ArrayClass)) +#define GARROW_IS_INT64_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_INT64_ARRAY)) +#define GARROW_IS_INT64_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_INT64_ARRAY)) +#define GARROW_INT64_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_INT64_ARRAY, \ + GArrowInt64ArrayClass)) + +typedef struct _GArrowInt64Array GArrowInt64Array; +typedef struct _GArrowInt64ArrayClass GArrowInt64ArrayClass; + +/** + * GArrowInt64Array: + * + * It wraps `arrow::Int64Array`. 
+ */ +struct _GArrowInt64Array +{ + /*< private >*/ + GArrowPrimitiveArray parent_instance; +}; + +struct _GArrowInt64ArrayClass +{ + GArrowPrimitiveArrayClass parent_class; +}; + +GType garrow_int64_array_get_type(void) G_GNUC_CONST; + +GArrowInt64Array *garrow_int64_array_new(gint64 length, + GArrowBuffer *data, + GArrowBuffer *null_bitmap, + gint64 n_nulls); + +gint64 garrow_int64_array_get_value(GArrowInt64Array *array, + gint64 i); +const gint64 *garrow_int64_array_get_values(GArrowInt64Array *array, + gint64 *length); + + +#define GARROW_TYPE_UINT64_ARRAY \ + (garrow_uint64_array_get_type()) +#define GARROW_UINT64_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_UINT64_ARRAY, \ + GArrowUInt64Array)) +#define GARROW_UINT64_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_UINT64_ARRAY, \ + GArrowUInt64ArrayClass)) +#define GARROW_IS_UINT64_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_UINT64_ARRAY)) +#define GARROW_IS_UINT64_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_UINT64_ARRAY)) +#define GARROW_UINT64_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_UINT64_ARRAY, \ + GArrowUInt64ArrayClass)) + +typedef struct _GArrowUInt64Array GArrowUInt64Array; +typedef struct _GArrowUInt64ArrayClass GArrowUInt64ArrayClass; + +/** + * GArrowUInt64Array: + * + * It wraps `arrow::UInt64Array`. 
+ */ +struct _GArrowUInt64Array +{ + /*< private >*/ + GArrowPrimitiveArray parent_instance; +}; + +struct _GArrowUInt64ArrayClass +{ + GArrowPrimitiveArrayClass parent_class; +}; + +GType garrow_uint64_array_get_type(void) G_GNUC_CONST; + +GArrowUInt64Array *garrow_uint64_array_new(gint64 length, + GArrowBuffer *data, + GArrowBuffer *null_bitmap, + gint64 n_nulls); + +guint64 garrow_uint64_array_get_value(GArrowUInt64Array *array, + gint64 i); +const guint64 *garrow_uint64_array_get_values(GArrowUInt64Array *array, + gint64 *length); + + +#define GARROW_TYPE_FLOAT_ARRAY \ + (garrow_float_array_get_type()) +#define GARROW_FLOAT_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_FLOAT_ARRAY, \ + GArrowFloatArray)) +#define GARROW_FLOAT_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_FLOAT_ARRAY, \ + GArrowFloatArrayClass)) +#define GARROW_IS_FLOAT_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_FLOAT_ARRAY)) +#define GARROW_IS_FLOAT_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_FLOAT_ARRAY)) +#define GARROW_FLOAT_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_FLOAT_ARRAY, \ + GArrowFloatArrayClass)) + +typedef struct _GArrowFloatArray GArrowFloatArray; +typedef struct _GArrowFloatArrayClass GArrowFloatArrayClass; + +/** + * GArrowFloatArray: + * + * It wraps `arrow::FloatArray`. 
+ */ +struct _GArrowFloatArray +{ + /*< private >*/ + GArrowPrimitiveArray parent_instance; +}; + +struct _GArrowFloatArrayClass +{ + GArrowPrimitiveArrayClass parent_class; +}; + +GType garrow_float_array_get_type(void) G_GNUC_CONST; + +GArrowFloatArray *garrow_float_array_new(gint64 length, + GArrowBuffer *data, + GArrowBuffer *null_bitmap, + gint64 n_nulls); + +gfloat garrow_float_array_get_value(GArrowFloatArray *array, + gint64 i); +const gfloat *garrow_float_array_get_values(GArrowFloatArray *array, + gint64 *length); + + +#define GARROW_TYPE_DOUBLE_ARRAY \ + (garrow_double_array_get_type()) +#define GARROW_DOUBLE_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_DOUBLE_ARRAY, \ + GArrowDoubleArray)) +#define GARROW_DOUBLE_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_DOUBLE_ARRAY, \ + GArrowDoubleArrayClass)) +#define GARROW_IS_DOUBLE_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_DOUBLE_ARRAY)) +#define GARROW_IS_DOUBLE_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_DOUBLE_ARRAY)) +#define GARROW_DOUBLE_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_DOUBLE_ARRAY, \ + GArrowDoubleArrayClass)) + +typedef struct _GArrowDoubleArray GArrowDoubleArray; +typedef struct _GArrowDoubleArrayClass GArrowDoubleArrayClass; + +/** + * GArrowDoubleArray: + * + * It wraps `arrow::DoubleArray`. 
+ */ +struct _GArrowDoubleArray +{ + /*< private >*/ + GArrowPrimitiveArray parent_instance; +}; + +struct _GArrowDoubleArrayClass +{ + GArrowPrimitiveArrayClass parent_class; +}; + +GType garrow_double_array_get_type(void) G_GNUC_CONST; + +GArrowDoubleArray *garrow_double_array_new(gint64 length, + GArrowBuffer *data, + GArrowBuffer *null_bitmap, + gint64 n_nulls); + +gdouble garrow_double_array_get_value(GArrowDoubleArray *array, + gint64 i); +const gdouble *garrow_double_array_get_values(GArrowDoubleArray *array, + gint64 *length); + + +#define GARROW_TYPE_BINARY_ARRAY \ + (garrow_binary_array_get_type()) +#define GARROW_BINARY_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_BINARY_ARRAY, \ + GArrowBinaryArray)) +#define GARROW_BINARY_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_BINARY_ARRAY, \ + GArrowBinaryArrayClass)) +#define GARROW_IS_BINARY_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_BINARY_ARRAY)) +#define GARROW_IS_BINARY_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_BINARY_ARRAY)) +#define GARROW_BINARY_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_BINARY_ARRAY, \ + GArrowBinaryArrayClass)) + +typedef struct _GArrowBinaryArray GArrowBinaryArray; +typedef struct _GArrowBinaryArrayClass GArrowBinaryArrayClass; + +/** + * GArrowBinaryArray: + * + * It wraps `arrow::BinaryArray`. 
+ */ +struct _GArrowBinaryArray +{ + /*< private >*/ + GArrowArray parent_instance; +}; + +struct _GArrowBinaryArrayClass +{ + GArrowArrayClass parent_class; +}; + +GType garrow_binary_array_get_type(void) G_GNUC_CONST; + +GArrowBinaryArray *garrow_binary_array_new(gint64 length, + GArrowBuffer *value_offsets, + GArrowBuffer *data, + GArrowBuffer *null_bitmap, + gint64 n_nulls); + +GBytes *garrow_binary_array_get_value(GArrowBinaryArray *array, + gint64 i); +GArrowBuffer *garrow_binary_array_get_buffer(GArrowBinaryArray *array); +GArrowBuffer *garrow_binary_array_get_offsets_buffer(GArrowBinaryArray *array); + +#define GARROW_TYPE_STRING_ARRAY \ + (garrow_string_array_get_type()) +#define GARROW_STRING_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_STRING_ARRAY, \ + GArrowStringArray)) +#define GARROW_STRING_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_STRING_ARRAY, \ + GArrowStringArrayClass)) +#define GARROW_IS_STRING_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_STRING_ARRAY)) +#define GARROW_IS_STRING_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_STRING_ARRAY)) +#define GARROW_STRING_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_STRING_ARRAY, \ + GArrowStringArrayClass)) + +typedef struct _GArrowStringArray GArrowStringArray; +typedef struct _GArrowStringArrayClass GArrowStringArrayClass; + +/** + * GArrowStringArray: + * + * It wraps `arrow::StringArray`. 
+ */ +struct _GArrowStringArray +{ + /*< private >*/ + GArrowBinaryArray parent_instance; +}; + +struct _GArrowStringArrayClass +{ + GArrowBinaryArrayClass parent_class; +}; + +GType garrow_string_array_get_type(void) G_GNUC_CONST; + +GArrowStringArray *garrow_string_array_new(gint64 length, + GArrowBuffer *value_offsets, + GArrowBuffer *data, + GArrowBuffer *null_bitmap, + gint64 n_nulls); + +gchar *garrow_string_array_get_string(GArrowStringArray *array, + gint64 i); + + +#define GARROW_TYPE_DATE32_ARRAY \ + (garrow_date32_array_get_type()) +#define GARROW_DATE32_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_DATE32_ARRAY, \ + GArrowDate32Array)) +#define GARROW_DATE32_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_DATE32_ARRAY, \ + GArrowDate32ArrayClass)) +#define GARROW_IS_DATE32_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_DATE32_ARRAY)) +#define GARROW_IS_DATE32_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_DATE32_ARRAY)) +#define GARROW_DATE32_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_DATE32_ARRAY, \ + GArrowDate32ArrayClass)) + +typedef struct _GArrowDate32Array GArrowDate32Array; +typedef struct _GArrowDate32ArrayClass GArrowDate32ArrayClass; + +/** + * GArrowDate32Array: + * + * It wraps `arrow::Date32Array`. 
+ */ +struct _GArrowDate32Array +{ + /*< private >*/ + GArrowPrimitiveArray parent_instance; +}; + +struct _GArrowDate32ArrayClass +{ + GArrowPrimitiveArrayClass parent_class; +}; + +GType garrow_date32_array_get_type(void) G_GNUC_CONST; + +GArrowDate32Array *garrow_date32_array_new(gint64 length, + GArrowBuffer *data, + GArrowBuffer *null_bitmap, + gint64 n_nulls); + +gint32 garrow_date32_array_get_value(GArrowDate32Array *array, + gint64 i); +const gint32 *garrow_date32_array_get_values(GArrowDate32Array *array, + gint64 *length); + + +#define GARROW_TYPE_DATE64_ARRAY \ + (garrow_date64_array_get_type()) +#define GARROW_DATE64_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_DATE64_ARRAY, \ + GArrowDate64Array)) +#define GARROW_DATE64_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_DATE64_ARRAY, \ + GArrowDate64ArrayClass)) +#define GARROW_IS_DATE64_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_DATE64_ARRAY)) +#define GARROW_IS_DATE64_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_DATE64_ARRAY)) +#define GARROW_DATE64_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_DATE64_ARRAY, \ + GArrowDate64ArrayClass)) + +typedef struct _GArrowDate64Array GArrowDate64Array; +typedef struct _GArrowDate64ArrayClass GArrowDate64ArrayClass; + +/** + * GArrowDate64Array: + * + * It wraps `arrow::Date64Array`. 
+ */ +struct _GArrowDate64Array +{ + /*< private >*/ + GArrowPrimitiveArray parent_instance; +}; + +struct _GArrowDate64ArrayClass +{ + GArrowPrimitiveArrayClass parent_class; +}; + +GType garrow_date64_array_get_type(void) G_GNUC_CONST; + +GArrowDate64Array *garrow_date64_array_new(gint64 length, + GArrowBuffer *data, + GArrowBuffer *null_bitmap, + gint64 n_nulls); + +gint64 garrow_date64_array_get_value(GArrowDate64Array *array, + gint64 i); +const gint64 *garrow_date64_array_get_values(GArrowDate64Array *array, + gint64 *length); + + +#define GARROW_TYPE_TIMESTAMP_ARRAY \ + (garrow_timestamp_array_get_type()) +#define GARROW_TIMESTAMP_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_TIMESTAMP_ARRAY, \ + GArrowTimestampArray)) +#define GARROW_TIMESTAMP_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_TIMESTAMP_ARRAY, \ + GArrowTimestampArrayClass)) +#define GARROW_IS_TIMESTAMP_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_TIMESTAMP_ARRAY)) +#define GARROW_IS_TIMESTAMP_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_TIMESTAMP_ARRAY)) +#define GARROW_TIMESTAMP_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_TIMESTAMP_ARRAY, \ + GArrowTimestampArrayClass)) + +typedef struct _GArrowTimestampArray GArrowTimestampArray; +typedef struct _GArrowTimestampArrayClass GArrowTimestampArrayClass; + +/** + * GArrowTimestampArray: + * + * It wraps `arrow::TimestampArray`. 
+ */ +struct _GArrowTimestampArray +{ + /*< private >*/ + GArrowPrimitiveArray parent_instance; +}; + +struct _GArrowTimestampArrayClass +{ + GArrowPrimitiveArrayClass parent_class; +}; + +GType garrow_timestamp_array_get_type(void) G_GNUC_CONST; + +GArrowTimestampArray *garrow_timestamp_array_new(GArrowTimestampDataType *data_type, + gint64 length, + GArrowBuffer *data, + GArrowBuffer *null_bitmap, + gint64 n_nulls); + +gint64 garrow_timestamp_array_get_value(GArrowTimestampArray *array, + gint64 i); +const gint64 *garrow_timestamp_array_get_values(GArrowTimestampArray *array, + gint64 *length); + + +#define GARROW_TYPE_TIME32_ARRAY \ + (garrow_time32_array_get_type()) +#define GARROW_TIME32_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_TIME32_ARRAY, \ + GArrowTime32Array)) +#define GARROW_TIME32_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_TIME32_ARRAY, \ + GArrowTime32ArrayClass)) +#define GARROW_IS_TIME32_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_TIME32_ARRAY)) +#define GARROW_IS_TIME32_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_TIME32_ARRAY)) +#define GARROW_TIME32_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_TIME32_ARRAY, \ + GArrowTime32ArrayClass)) + +typedef struct _GArrowTime32Array GArrowTime32Array; +typedef struct _GArrowTime32ArrayClass GArrowTime32ArrayClass; + +/** + * GArrowTime32Array: + * + * It wraps `arrow::Time32Array`. 
+ */ +struct _GArrowTime32Array +{ + /*< private >*/ + GArrowPrimitiveArray parent_instance; +}; + +struct _GArrowTime32ArrayClass +{ + GArrowPrimitiveArrayClass parent_class; +}; + +GType garrow_time32_array_get_type(void) G_GNUC_CONST; + +GArrowTime32Array *garrow_time32_array_new(GArrowTime32DataType *data_type, + gint64 length, + GArrowBuffer *data, + GArrowBuffer *null_bitmap, + gint64 n_nulls); + +gint32 garrow_time32_array_get_value(GArrowTime32Array *array, + gint64 i); +const gint32 *garrow_time32_array_get_values(GArrowTime32Array *array, + gint64 *length); + + +#define GARROW_TYPE_TIME64_ARRAY \ + (garrow_time64_array_get_type()) +#define GARROW_TIME64_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_TIME64_ARRAY, \ + GArrowTime64Array)) +#define GARROW_TIME64_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_TIME64_ARRAY, \ + GArrowTime64ArrayClass)) +#define GARROW_IS_TIME64_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_TIME64_ARRAY)) +#define GARROW_IS_TIME64_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_TIME64_ARRAY)) +#define GARROW_TIME64_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_TIME64_ARRAY, \ + GArrowTime64ArrayClass)) + +typedef struct _GArrowTime64Array GArrowTime64Array; +typedef struct _GArrowTime64ArrayClass GArrowTime64ArrayClass; + +/** + * GArrowTime64Array: + * + * It wraps `arrow::Time64Array`. 
+ */ +struct _GArrowTime64Array +{ + /*< private >*/ + GArrowPrimitiveArray parent_instance; +}; + +struct _GArrowTime64ArrayClass +{ + GArrowPrimitiveArrayClass parent_class; +}; + +GType garrow_time64_array_get_type(void) G_GNUC_CONST; + +GArrowTime64Array *garrow_time64_array_new(GArrowTime64DataType *data_type, + gint64 length, + GArrowBuffer *data, + GArrowBuffer *null_bitmap, + gint64 n_nulls); + +gint64 garrow_time64_array_get_value(GArrowTime64Array *array, + gint64 i); +const gint64 *garrow_time64_array_get_values(GArrowTime64Array *array, + gint64 *length); + +G_END_DECLS diff --git a/c_glib/arrow-glib/basic-array.hpp b/c_glib/arrow-glib/basic-array.hpp new file mode 100644 index 0000000000000..52b94a51f3dbf --- /dev/null +++ b/c_glib/arrow-glib/basic-array.hpp @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include + +#include + +GArrowArray *garrow_array_new_raw(std::shared_ptr *arrow_array); +std::shared_ptr garrow_array_get_raw(GArrowArray *array); diff --git a/c_glib/arrow-glib/basic-data-type.cpp b/c_glib/arrow-glib/basic-data-type.cpp index 62f8a7e4d2fbe..a5f7aed1b8f76 100644 --- a/c_glib/arrow-glib/basic-data-type.cpp +++ b/c_glib/arrow-glib/basic-data-type.cpp @@ -220,6 +220,37 @@ garrow_data_type_get_id(GArrowDataType *data_type) } +G_DEFINE_ABSTRACT_TYPE(GArrowFixedWidthDataType, \ + garrow_fixed_width_data_type, \ + GARROW_TYPE_DATA_TYPE) + +static void +garrow_fixed_width_data_type_init(GArrowFixedWidthDataType *object) +{ +} + +static void +garrow_fixed_width_data_type_class_init(GArrowFixedWidthDataTypeClass *klass) +{ +} + +/** + * garrow_fixed_width_data_type_get_bit_width: + * @data_type: A #GArrowFixedWidthDataType. + * + * Returns: The number of bits for one value. + */ +gint +garrow_fixed_width_data_type_get_bit_width(GArrowFixedWidthDataType *data_type) +{ + const auto arrow_data_type = + garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + const auto arrow_fixed_width_type = + std::static_pointer_cast(arrow_data_type); + return arrow_fixed_width_type->bit_width(); +} + + G_DEFINE_TYPE(GArrowNullDataType, \ garrow_null_data_type, \ GARROW_TYPE_DATA_TYPE) @@ -254,7 +285,7 @@ garrow_null_data_type_new(void) G_DEFINE_TYPE(GArrowBooleanDataType, \ garrow_boolean_data_type, \ - GARROW_TYPE_DATA_TYPE) + GARROW_TYPE_FIXED_WIDTH_DATA_TYPE) static void garrow_boolean_data_type_init(GArrowBooleanDataType *object) @@ -774,6 +805,24 @@ garrow_timestamp_data_type_new(GArrowTimeUnit unit) return data_type; } +/** + * garrow_timestamp_data_type_get_unit: + * @timestamp_data_type: The #GArrowTimestampDataType. + * + * Returns: The unit of the timestamp data type. 
+ * + * Since: 0.8.0 + */ +GArrowTimeUnit +garrow_timestamp_data_type_get_unit(GArrowTimestampDataType *timestamp_data_type) +{ + const auto arrow_data_type = + garrow_data_type_get_raw(GARROW_DATA_TYPE(timestamp_data_type)); + const auto arrow_timestamp_data_type = + std::static_pointer_cast(arrow_data_type); + return garrow_time_unit_from_raw(arrow_timestamp_data_type->unit()); +} + G_DEFINE_TYPE(GArrowTimeDataType, \ garrow_time_data_type, \ @@ -1015,12 +1064,15 @@ garrow_data_type_new_raw(std::shared_ptr *arrow_data_type) case arrow::Type::type::STRUCT: type = GARROW_TYPE_STRUCT_DATA_TYPE; break; + case arrow::Type::type::DICTIONARY: + type = GARROW_TYPE_DICTIONARY_DATA_TYPE; + break; default: type = GARROW_TYPE_DATA_TYPE; break; } data_type = GARROW_DATA_TYPE(g_object_new(type, - "data_type", arrow_data_type, + "data-type", arrow_data_type, NULL)); return data_type; } diff --git a/c_glib/arrow-glib/basic-data-type.h b/c_glib/arrow-glib/basic-data-type.h index d3d52c6120de7..469590191d4e1 100644 --- a/c_glib/arrow-glib/basic-data-type.h +++ b/c_glib/arrow-glib/basic-data-type.h @@ -19,57 +19,45 @@ #pragma once +#include #include G_BEGIN_DECLS -#define GARROW_TYPE_DATA_TYPE \ - (garrow_data_type_get_type()) -#define GARROW_DATA_TYPE(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_DATA_TYPE, \ - GArrowDataType)) -#define GARROW_DATA_TYPE_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_DATA_TYPE, \ - GArrowDataTypeClass)) -#define GARROW_IS_DATA_TYPE(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_DATA_TYPE)) -#define GARROW_IS_DATA_TYPE_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_DATA_TYPE)) -#define GARROW_DATA_TYPE_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_DATA_TYPE, \ - GArrowDataTypeClass)) - -typedef struct _GArrowDataType GArrowDataType; -typedef struct _GArrowDataTypeClass GArrowDataTypeClass; - -/** - * GArrowDataType: - * - * It wraps `arrow::DataType`. 
- */ -struct _GArrowDataType -{ - /*< private >*/ - GObject parent_instance; -}; - +#define GARROW_TYPE_DATA_TYPE (garrow_data_type_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowDataType, + garrow_data_type, + GARROW, + DATA_TYPE, + GObject) struct _GArrowDataTypeClass { GObjectClass parent_class; }; -GType garrow_data_type_get_type (void) G_GNUC_CONST; gboolean garrow_data_type_equal (GArrowDataType *data_type, GArrowDataType *other_data_type); gchar *garrow_data_type_to_string (GArrowDataType *data_type); GArrowType garrow_data_type_get_id (GArrowDataType *data_type); +#define GARROW_TYPE_FIXED_WIDTH_DATA_TYPE (garrow_fixed_width_data_type_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowFixedWidthDataType, + garrow_fixed_width_data_type, + GARROW, + FIXED_WIDTH_DATA_TYPE, + GArrowDataType) +struct _GArrowFixedWidthDataTypeClass +{ + GArrowDataTypeClass parent_class; +}; + +gint garrow_fixed_width_data_type_get_bit_width(GArrowFixedWidthDataType *data_type); +/* TODO: +GList *garrow_fixed_width_data_type_get_buffer_layout(GArrowFixedWidthDataType *data_type); +*/ + + #define GARROW_TYPE_NULL_DATA_TYPE \ (garrow_null_data_type_get_type()) #define GARROW_NULL_DATA_TYPE(obj) \ @@ -114,47 +102,17 @@ GType garrow_null_data_type_get_type (void) G_GNUC_CONST; GArrowNullDataType *garrow_null_data_type_new (void); -#define GARROW_TYPE_BOOLEAN_DATA_TYPE \ - (garrow_boolean_data_type_get_type()) -#define GARROW_BOOLEAN_DATA_TYPE(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_BOOLEAN_DATA_TYPE, \ - GArrowBooleanDataType)) -#define GARROW_BOOLEAN_DATA_TYPE_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_BOOLEAN_DATA_TYPE, \ - GArrowBooleanDataTypeClass)) -#define GARROW_IS_BOOLEAN_DATA_TYPE(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_BOOLEAN_DATA_TYPE)) -#define GARROW_IS_BOOLEAN_DATA_TYPE_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_BOOLEAN_DATA_TYPE)) -#define GARROW_BOOLEAN_DATA_TYPE_GET_CLASS(obj) \ - 
(G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_BOOLEAN_DATA_TYPE, \ - GArrowBooleanDataTypeClass)) - -typedef struct _GArrowBooleanDataType GArrowBooleanDataType; -typedef struct _GArrowBooleanDataTypeClass GArrowBooleanDataTypeClass; - -/** - * GArrowBooleanDataType: - * - * It wraps `arrow::BooleanType`. - */ -struct _GArrowBooleanDataType -{ - /*< private >*/ - GArrowDataType parent_instance; -}; - +#define GARROW_TYPE_BOOLEAN_DATA_TYPE (garrow_boolean_data_type_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowBooleanDataType, + garrow_boolean_data_type, + GARROW, + BOOLEAN_DATA_TYPE, + GArrowFixedWidthDataType) struct _GArrowBooleanDataTypeClass { - GArrowDataTypeClass parent_class; + GArrowFixedWidthDataTypeClass parent_class; }; -GType garrow_boolean_data_type_get_type (void) G_GNUC_CONST; GArrowBooleanDataType *garrow_boolean_data_type_new (void); @@ -816,6 +774,8 @@ struct _GArrowTimestampDataTypeClass GType garrow_timestamp_data_type_get_type (void) G_GNUC_CONST; GArrowTimestampDataType *garrow_timestamp_data_type_new (GArrowTimeUnit unit); +GArrowTimeUnit +garrow_timestamp_data_type_get_unit (GArrowTimestampDataType *timestamp_data_type); #define GARROW_TYPE_TIME_DATA_TYPE \ diff --git a/c_glib/arrow-glib/buffer.h b/c_glib/arrow-glib/buffer.h index b3f3a2cdc5e9b..300bb4f4ea3ca 100644 --- a/c_glib/arrow-glib/buffer.h +++ b/c_glib/arrow-glib/buffer.h @@ -19,44 +19,21 @@ #pragma once -#include +#include G_BEGIN_DECLS -#define GARROW_TYPE_BUFFER \ - (garrow_buffer_get_type()) -#define GARROW_BUFFER(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), GARROW_TYPE_BUFFER, GArrowBuffer)) -#define GARROW_BUFFER_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), GARROW_TYPE_BUFFER, GArrowBufferClass)) -#define GARROW_IS_BUFFER(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), GARROW_TYPE_BUFFER)) -#define GARROW_IS_BUFFER_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), GARROW_TYPE_BUFFER)) -#define GARROW_BUFFER_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), 
GARROW_TYPE_BUFFER, GArrowBufferClass)) - -typedef struct _GArrowBuffer GArrowBuffer; -typedef struct _GArrowBufferClass GArrowBufferClass; - -/** - * GArrowBuffer: - * - * It wraps `arrow::Buffer`. - */ -struct _GArrowBuffer -{ - /*< private >*/ - GObject parent_instance; -}; - +#define GARROW_TYPE_BUFFER (garrow_buffer_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowBuffer, + garrow_buffer, + GARROW, + BUFFER, + GObject) struct _GArrowBufferClass { GObjectClass parent_class; }; -GType garrow_buffer_get_type (void) G_GNUC_CONST; - GArrowBuffer *garrow_buffer_new (const guint8 *data, gint64 size); gboolean garrow_buffer_equal (GArrowBuffer *buffer, @@ -80,49 +57,16 @@ GArrowBuffer *garrow_buffer_slice (GArrowBuffer *buffer, gint64 size); -#define GARROW_TYPE_MUTABLE_BUFFER \ - (garrow_mutable_buffer_get_type()) -#define GARROW_MUTABLE_BUFFER(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_MUTABLE_BUFFER, \ - GArrowMutableBuffer)) -#define GARROW_MUTABLE_BUFFER_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_MUTABLE_BUFFER, \ - GArrowMutableBufferClass)) -#define GARROW_IS_MUTABLE_BUFFER(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), GARROW_TYPE_MUTABLE_BUFFER)) -#define GARROW_IS_MUTABLE_BUFFER_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), GARROW_TYPE_MUTABLE_BUFFER)) -#define GARROW_MUTABLE_BUFFER_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_MUTABLE_BUFFER, \ - GArrowMutableBufferClass)) - -typedef struct _GArrowMutableBuffer GArrowMutableBuffer; -#ifndef __GTK_DOC_IGNORE__ -typedef struct _GArrowMutableBufferClass GArrowMutableBufferClass; -#endif - -/** - * GArrowMutableBuffer: - * - * It wraps `arrow::MutableBuffer`. 
- */ -struct _GArrowMutableBuffer -{ - /*< private >*/ - GArrowBuffer parent_instance; -}; - -#ifndef __GTK_DOC_IGNORE__ +#define GARROW_TYPE_MUTABLE_BUFFER (garrow_mutable_buffer_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowMutableBuffer, + garrow_mutable_buffer, + GARROW, + MUTABLE_BUFFER, + GArrowBuffer) struct _GArrowMutableBufferClass { GArrowBufferClass parent_class; }; -#endif - -GType garrow_mutable_buffer_get_type(void) G_GNUC_CONST; GArrowMutableBuffer *garrow_mutable_buffer_new (guint8 *data, gint64 size); diff --git a/c_glib/arrow-glib/chunked-array.cpp b/c_glib/arrow-glib/chunked-array.cpp index 62d666fbcaaba..69195c57b1768 100644 --- a/c_glib/arrow-glib/chunked-array.cpp +++ b/c_glib/arrow-glib/chunked-array.cpp @@ -23,6 +23,8 @@ #include #include +#include +#include G_BEGIN_DECLS @@ -163,6 +165,39 @@ garrow_chunked_array_equal(GArrowChunkedArray *chunked_array, return arrow_chunked_array->Equals(arrow_other_chunked_array); } +/** + * garrow_chunked_array_get_value_data_type: + * @chunked_array: A #GArrowChunkedArray. + * + * Returns: (transfer full): The #GArrowDataType of the value of + * the chunked array. + * + * Since: 0.9.0 + */ +GArrowDataType * +garrow_chunked_array_get_value_data_type(GArrowChunkedArray *chunked_array) +{ + auto arrow_chunked_array = garrow_chunked_array_get_raw(chunked_array); + auto arrow_type = arrow_chunked_array->type(); + return garrow_data_type_new_raw(&arrow_type); +} + +/** + * garrow_chunked_array_get_value_type: + * @chunked_array: A #GArrowChunkedArray. + * + * Returns: The #GArrowType of the value of the chunked array. + * + * Since: 0.9.0 + */ +GArrowType +garrow_chunked_array_get_value_type(GArrowChunkedArray *chunked_array) +{ + auto arrow_chunked_array = garrow_chunked_array_get_raw(chunked_array); + auto arrow_type = arrow_chunked_array->type(); + return garrow_type_from_raw(arrow_type->id()); +} + /** * garrow_chunked_array_get_length: * @chunked_array: A #GArrowChunkedArray. 
diff --git a/c_glib/arrow-glib/chunked-array.h b/c_glib/arrow-glib/chunked-array.h index c5f986a631835..0c3c81a744ceb 100644 --- a/c_glib/arrow-glib/chunked-array.h +++ b/c_glib/arrow-glib/chunked-array.h @@ -70,6 +70,11 @@ GArrowChunkedArray *garrow_chunked_array_new(GList *chunks); gboolean garrow_chunked_array_equal(GArrowChunkedArray *chunked_array, GArrowChunkedArray *other_chunked_array); +GArrowDataType * +garrow_chunked_array_get_value_data_type(GArrowChunkedArray *chunked_array); +GArrowType +garrow_chunked_array_get_value_type(GArrowChunkedArray *chunked_array); + guint64 garrow_chunked_array_get_length (GArrowChunkedArray *chunked_array); guint64 garrow_chunked_array_get_n_nulls(GArrowChunkedArray *chunked_array); guint garrow_chunked_array_get_n_chunks (GArrowChunkedArray *chunked_array); diff --git a/c_glib/arrow-glib/composite-array.cpp b/c_glib/arrow-glib/composite-array.cpp new file mode 100644 index 0000000000000..14cc46d733ee8 --- /dev/null +++ b/c_glib/arrow-glib/composite-array.cpp @@ -0,0 +1,344 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include +#include +#include + +G_BEGIN_DECLS + +/** + * SECTION: composite-array + * @section_id: composite-array-classes + * @title: Composite array classes + * @include: arrow-glib/arrow-glib.h + * + * #GArrowListArray is a class for list array. It can store zero or + * more list data. If you don't have Arrow format data, you need to + * use #GArrowListArrayBuilder to create a new array. + * + * #GArrowStructArray is a class for struct array. It can store zero + * or more structs. One struct has zero or more fields. If you don't + * have Arrow format data, you need to use #GArrowStructArrayBuilder + * to create a new array. + * + * #GArrowDictionaryArray is a class for dictionary array. It can + * store data with dictionary and indices. It's more space efficient than a + * normal array when the array has many same values. You can convert a + * normal array to dictionary array by garrow_array_dictionary_encode(). + */ + +G_DEFINE_TYPE(GArrowListArray, \ + garrow_list_array, \ + GARROW_TYPE_ARRAY) + +static void +garrow_list_array_init(GArrowListArray *object) +{ +} + +static void +garrow_list_array_class_init(GArrowListArrayClass *klass) +{ +} + +/** + * garrow_list_array_new: + * @length: The number of elements. + * @value_offsets: The offsets of @values in Arrow format. + * @values: The values as #GArrowArray. + * @null_bitmap: (nullable): The bitmap that shows null elements. The + * N-th element is null when the N-th bit is 0, not null otherwise. + * If the array has no null elements, the bitmap must be %NULL and + * @n_nulls is 0. + * @n_nulls: The number of null elements. If -1 is specified, the + * number of nulls is computed from @null_bitmap. + * + * Returns: A newly created #GArrowListArray. 
+ * + * Since: 0.4.0 + */ +GArrowListArray * +garrow_list_array_new(gint64 length, + GArrowBuffer *value_offsets, + GArrowArray *values, + GArrowBuffer *null_bitmap, + gint64 n_nulls) +{ + const auto arrow_value_offsets = garrow_buffer_get_raw(value_offsets); + const auto arrow_values = garrow_array_get_raw(values); + const auto arrow_bitmap = garrow_buffer_get_raw(null_bitmap); + auto arrow_data_type = arrow::list(arrow_values->type()); + auto arrow_list_array = + std::make_shared(arrow_data_type, + length, + arrow_value_offsets, + arrow_values, + arrow_bitmap, + n_nulls); + auto arrow_array = + std::static_pointer_cast(arrow_list_array); + return GARROW_LIST_ARRAY(garrow_array_new_raw(&arrow_array)); +} + +/** + * garrow_list_array_get_value_type: + * @array: A #GArrowListArray. + * + * Returns: (transfer full): The data type of value in each list. + */ +GArrowDataType * +garrow_list_array_get_value_type(GArrowListArray *array) +{ + auto arrow_array = garrow_array_get_raw(GARROW_ARRAY(array)); + auto arrow_list_array = + static_cast(arrow_array.get()); + auto arrow_value_type = arrow_list_array->value_type(); + return garrow_data_type_new_raw(&arrow_value_type); +} + +/** + * garrow_list_array_get_value: + * @array: A #GArrowListArray. + * @i: The index of the target value. + * + * Returns: (transfer full): The i-th list. 
+ */ +GArrowArray * +garrow_list_array_get_value(GArrowListArray *array, + gint64 i) +{ + auto arrow_array = garrow_array_get_raw(GARROW_ARRAY(array)); + auto arrow_list_array = + static_cast(arrow_array.get()); + auto arrow_list = + arrow_list_array->values()->Slice(arrow_list_array->value_offset(i), + arrow_list_array->value_length(i)); + return garrow_array_new_raw(&arrow_list); +} + + +G_DEFINE_TYPE(GArrowStructArray, \ + garrow_struct_array, \ + GARROW_TYPE_ARRAY) + +static void +garrow_struct_array_init(GArrowStructArray *object) +{ +} + +static void +garrow_struct_array_class_init(GArrowStructArrayClass *klass) +{ +} + +/** + * garrow_struct_array_new: + * @data_type: The data type of the struct. + * @length: The number of elements. + * @children: (element-type GArrowArray): The arrays for each field + * as #GList of #GArrowArray. + * @null_bitmap: (nullable): The bitmap that shows null elements. The + * N-th element is null when the N-th bit is 0, not null otherwise. + * If the array has no null elements, the bitmap must be %NULL and + * @n_nulls is 0. + * @n_nulls: The number of null elements. If -1 is specified, the + * number of nulls is computed from @null_bitmap. + * + * Returns: A newly created #GArrowStructArray. 
+ * + * Since: 0.4.0 + */ +GArrowStructArray * +garrow_struct_array_new(GArrowDataType *data_type, + gint64 length, + GList *children, + GArrowBuffer *null_bitmap, + gint64 n_nulls) +{ + const auto arrow_data_type = garrow_data_type_get_raw(data_type); + std::vector> arrow_children; + for (GList *node = children; node; node = node->next) { + GArrowArray *child = GARROW_ARRAY(node->data); + arrow_children.push_back(garrow_array_get_raw(child)); + } + const auto arrow_bitmap = garrow_buffer_get_raw(null_bitmap); + auto arrow_struct_array = + std::make_shared(arrow_data_type, + length, + arrow_children, + arrow_bitmap, + n_nulls); + auto arrow_array = + std::static_pointer_cast(arrow_struct_array); + return GARROW_STRUCT_ARRAY(garrow_array_new_raw(&arrow_array)); +} + +/** + * garrow_struct_array_get_field: + * @array: A #GArrowStructArray. + * @i: The index of the field in the struct. + * + * Returns: (transfer full): The i-th field. + */ +GArrowArray * +garrow_struct_array_get_field(GArrowStructArray *array, + gint i) +{ + auto arrow_array = garrow_array_get_raw(GARROW_ARRAY(array)); + auto arrow_struct_array = + static_cast(arrow_array.get()); + auto arrow_field = arrow_struct_array->field(i); + return garrow_array_new_raw(&arrow_field); +} + +/** + * garrow_struct_array_get_fields: + * @array: A #GArrowStructArray. + * + * Returns: (element-type GArrowArray) (transfer full): + * The fields in the struct. 
+ */ +GList * +garrow_struct_array_get_fields(GArrowStructArray *array) +{ + const auto arrow_array = garrow_array_get_raw(GARROW_ARRAY(array)); + const auto arrow_struct_array = + static_cast(arrow_array.get()); + + GList *fields = NULL; + for (int i = 0; i < arrow_struct_array->num_fields(); ++i) { + auto arrow_field = arrow_struct_array->field(i); + GArrowArray *field = garrow_array_new_raw(&arrow_field); + fields = g_list_prepend(fields, field); + } + + return g_list_reverse(fields); +} + + +G_DEFINE_TYPE(GArrowDictionaryArray, \ + garrow_dictionary_array, \ + GARROW_TYPE_ARRAY) + +static void +garrow_dictionary_array_init(GArrowDictionaryArray *object) +{ +} + +static void +garrow_dictionary_array_class_init(GArrowDictionaryArrayClass *klass) +{ +} + +/** + * garrow_dictionary_array_new: + * @data_type: The data type of dictionary. + * @indices: The indices of values in dictionary. + * + * Returns: A newly created #GArrowDictionaryArray. + * + * Since: 0.8.0 + */ +GArrowDictionaryArray * +garrow_dictionary_array_new(GArrowDataType *data_type, + GArrowArray *indices) +{ + const auto arrow_data_type = garrow_data_type_get_raw(data_type); + const auto arrow_indices = garrow_array_get_raw(indices); + auto arrow_dictionary_array = + std::make_shared(arrow_data_type, + arrow_indices); + auto arrow_array = + std::static_pointer_cast(arrow_dictionary_array); + return GARROW_DICTIONARY_ARRAY(garrow_array_new_raw(&arrow_array)); +} + +/** + * garrow_dictionary_array_get_indices: + * @array: A #GArrowDictionaryArray. + * + * Returns: (transfer full): The indices of values in dictionary. 
+ * + * Since: 0.8.0 + */ +GArrowArray * +garrow_dictionary_array_get_indices(GArrowDictionaryArray *array) +{ + auto arrow_array = garrow_array_get_raw(GARROW_ARRAY(array)); + auto arrow_dictionary_array = + std::static_pointer_cast(arrow_array); + auto arrow_indices = arrow_dictionary_array->indices(); + return garrow_array_new_raw(&arrow_indices); +} + +/** + * garrow_dictionary_array_get_dictionary: + * @array: A #GArrowDictionaryArray. + * + * Returns: (transfer full): The dictionary of this array. + * + * Since: 0.8.0 + */ +GArrowArray * +garrow_dictionary_array_get_dictionary(GArrowDictionaryArray *array) +{ + auto arrow_array = garrow_array_get_raw(GARROW_ARRAY(array)); + auto arrow_dictionary_array = + std::static_pointer_cast(arrow_array); + auto arrow_dictionary = arrow_dictionary_array->dictionary(); + return garrow_array_new_raw(&arrow_dictionary); +} + +/** + * garrow_dictionary_array_get_dictionary_data_type: + * @array: A #GArrowDictionaryArray. + * + * Returns: (transfer full): The dictionary data type of this array. 
+ * + * Since: 0.8.0 + */ +GArrowDictionaryDataType * +garrow_dictionary_array_get_dictionary_data_type(GArrowDictionaryArray *array) +{ + auto arrow_array = garrow_array_get_raw(GARROW_ARRAY(array)); + auto arrow_dictionary_array = + std::static_pointer_cast(arrow_array); + auto arrow_dictionary_data_type = arrow_dictionary_array->dict_type(); + auto const_arrow_data_type = + static_cast(arrow_dictionary_data_type); + auto arrow_data_type = const_cast(const_arrow_data_type); + struct NullDeleter { + void operator()(arrow::DataType *data_type) { + } + }; + std::shared_ptr + shared_arrow_data_type(arrow_data_type, NullDeleter()); + auto data_type = garrow_data_type_new_raw(&shared_arrow_data_type); + return GARROW_DICTIONARY_DATA_TYPE(data_type); +} + +G_END_DECLS diff --git a/c_glib/arrow-glib/composite-array.h b/c_glib/arrow-glib/composite-array.h new file mode 100644 index 0000000000000..c59a61681bf42 --- /dev/null +++ b/c_glib/arrow-glib/composite-array.h @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include +#include + +G_BEGIN_DECLS + +#define GARROW_TYPE_LIST_ARRAY \ + (garrow_list_array_get_type()) +#define GARROW_LIST_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_LIST_ARRAY, \ + GArrowListArray)) +#define GARROW_LIST_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_LIST_ARRAY, \ + GArrowListArrayClass)) +#define GARROW_IS_LIST_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_LIST_ARRAY)) +#define GARROW_IS_LIST_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_LIST_ARRAY)) +#define GARROW_LIST_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_LIST_ARRAY, \ + GArrowListArrayClass)) + +typedef struct _GArrowListArray GArrowListArray; +typedef struct _GArrowListArrayClass GArrowListArrayClass; + +/** + * GArrowListArray: + * + * It wraps `arrow::ListArray`. + */ +struct _GArrowListArray +{ + /*< private >*/ + GArrowArray parent_instance; +}; + +struct _GArrowListArrayClass +{ + GArrowArrayClass parent_class; +}; + +GType garrow_list_array_get_type(void) G_GNUC_CONST; + +GArrowListArray *garrow_list_array_new(gint64 length, + GArrowBuffer *value_offsets, + GArrowArray *values, + GArrowBuffer *null_bitmap, + gint64 n_nulls); + +GArrowDataType *garrow_list_array_get_value_type(GArrowListArray *array); +GArrowArray *garrow_list_array_get_value(GArrowListArray *array, + gint64 i); + + +#define GARROW_TYPE_STRUCT_ARRAY \ + (garrow_struct_array_get_type()) +#define GARROW_STRUCT_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_CAST((obj), \ + GARROW_TYPE_STRUCT_ARRAY, \ + GArrowStructArray)) +#define GARROW_STRUCT_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_CAST((klass), \ + GARROW_TYPE_STRUCT_ARRAY, \ + GArrowStructArrayClass)) +#define GARROW_IS_STRUCT_ARRAY(obj) \ + (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ + GARROW_TYPE_STRUCT_ARRAY)) +#define GARROW_IS_STRUCT_ARRAY_CLASS(klass) \ + (G_TYPE_CHECK_CLASS_TYPE((klass), \ + GARROW_TYPE_STRUCT_ARRAY)) 
+#define GARROW_STRUCT_ARRAY_GET_CLASS(obj) \ + (G_TYPE_INSTANCE_GET_CLASS((obj), \ + GARROW_TYPE_STRUCT_ARRAY, \ + GArrowStructArrayClass)) + +typedef struct _GArrowStructArray GArrowStructArray; +typedef struct _GArrowStructArrayClass GArrowStructArrayClass; + +/** + * GArrowStructArray: + * + * It wraps `arrow::StructArray`. + */ +struct _GArrowStructArray +{ + /*< private >*/ + GArrowArray parent_instance; +}; + +struct _GArrowStructArrayClass +{ + GArrowArrayClass parent_class; +}; + +GType garrow_struct_array_get_type(void) G_GNUC_CONST; + +GArrowStructArray *garrow_struct_array_new(GArrowDataType *data_type, + gint64 length, + GList *children, + GArrowBuffer *null_bitmap, + gint64 n_nulls); + +GArrowArray *garrow_struct_array_get_field(GArrowStructArray *array, + gint i); +GList *garrow_struct_array_get_fields(GArrowStructArray *array); + + +#define GARROW_TYPE_DICTIONARY_ARRAY (garrow_dictionary_array_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowDictionaryArray, + garrow_dictionary_array, + GARROW, + DICTIONARY_ARRAY, + GArrowArray) +struct _GArrowDictionaryArrayClass +{ + GArrowArrayClass parent_class; +}; + +GArrowDictionaryArray * +garrow_dictionary_array_new(GArrowDataType *data_type, GArrowArray *indices); +GArrowArray * +garrow_dictionary_array_get_indices(GArrowDictionaryArray *array); +GArrowArray * +garrow_dictionary_array_get_dictionary(GArrowDictionaryArray *array); +GArrowDictionaryDataType * +garrow_dictionary_array_get_dictionary_data_type(GArrowDictionaryArray *array); + +G_END_DECLS diff --git a/c_glib/arrow-glib/composite-data-type.cpp b/c_glib/arrow-glib/composite-data-type.cpp index ce3d78ca08332..5f742e50e4028 100644 --- a/c_glib/arrow-glib/composite-data-type.cpp +++ b/c_glib/arrow-glib/composite-data-type.cpp @@ -21,6 +21,7 @@ # include #endif +#include #include #include #include @@ -38,6 +39,8 @@ G_BEGIN_DECLS * #GArrowListDataType is a class for list data type. * * #GArrowStructDataType is a class for struct data type. 
+ * + * #GArrowDictionaryDataType is a class for dictionary data type. */ G_DEFINE_TYPE(GArrowListDataType, \ @@ -133,4 +136,95 @@ garrow_struct_data_type_new(GList *fields) return data_type; } + +G_DEFINE_TYPE(GArrowDictionaryDataType, \ + garrow_dictionary_data_type, \ + GARROW_TYPE_FIXED_WIDTH_DATA_TYPE) + +static void +garrow_dictionary_data_type_init(GArrowDictionaryDataType *object) +{ +} + +static void +garrow_dictionary_data_type_class_init(GArrowDictionaryDataTypeClass *klass) +{ +} + +/** + * garrow_dictionary_data_type_new: + * @index_data_type: The data type of index. + * @dictionary: The dictionary. + * @ordered: Whether dictionary contents are ordered or not. + * + * Returns: The newly created dictionary data type. + * + * Since: 0.8.0 + */ +GArrowDictionaryDataType * +garrow_dictionary_data_type_new(GArrowDataType *index_data_type, + GArrowArray *dictionary, + gboolean ordered) +{ + auto arrow_index_data_type = garrow_data_type_get_raw(index_data_type); + auto arrow_dictionary = garrow_array_get_raw(dictionary); + auto arrow_data_type = arrow::dictionary(arrow_index_data_type, + arrow_dictionary, + ordered); + return GARROW_DICTIONARY_DATA_TYPE(garrow_data_type_new_raw(&arrow_data_type)); +} + +/** + * garrow_dictionary_data_type_get_index_data_type: + * @data_type: The #GArrowDictionaryDataType. + * + * Returns: (transfer full): The #GArrowDataType of index. + * + * Since: 0.8.0 + */ +GArrowDataType * +garrow_dictionary_data_type_get_index_data_type(GArrowDictionaryDataType *data_type) +{ + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_dictionary_data_type = + std::static_pointer_cast(arrow_data_type); + auto arrow_index_data_type = arrow_dictionary_data_type->index_type(); + return garrow_data_type_new_raw(&arrow_index_data_type); +} + +/** + * garrow_dictionary_data_type_get_dictionary: + * @data_type: The #GArrowDictionaryDataType. + * + * Returns: (transfer full): The dictionary as #GArrowArray. 
+ * + * Since: 0.8.0 + */ +GArrowArray * +garrow_dictionary_data_type_get_dictionary(GArrowDictionaryDataType *data_type) +{ + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_dictionary_data_type = + std::static_pointer_cast(arrow_data_type); + auto arrow_dictionary = arrow_dictionary_data_type->dictionary(); + return garrow_array_new_raw(&arrow_dictionary); +} + +/** + * garrow_dictionary_data_type_is_ordered: + * @data_type: The #GArrowDictionaryDataType. + * + * Returns: Whether dictionary contents are ordered or not. + * + * Since: 0.8.0 + */ +gboolean +garrow_dictionary_data_type_is_ordered(GArrowDictionaryDataType *data_type) +{ + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_dictionary_data_type = + std::static_pointer_cast(arrow_data_type); + return arrow_dictionary_data_type->ordered(); +} + G_END_DECLS diff --git a/c_glib/arrow-glib/composite-data-type.h b/c_glib/arrow-glib/composite-data-type.h index 9dac5bd84bed2..11b83014ff830 100644 --- a/c_glib/arrow-glib/composite-data-type.h +++ b/c_glib/arrow-glib/composite-data-type.h @@ -19,6 +19,7 @@ #pragma once +#include #include #include @@ -112,4 +113,28 @@ struct _GArrowStructDataTypeClass GType garrow_struct_data_type_get_type (void) G_GNUC_CONST; GArrowStructDataType *garrow_struct_data_type_new (GList *fields); + +#define GARROW_TYPE_DICTIONARY_DATA_TYPE (garrow_dictionary_data_type_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowDictionaryDataType, + garrow_dictionary_data_type, + GARROW, + DICTIONARY_DATA_TYPE, + GArrowFixedWidthDataType) +struct _GArrowDictionaryDataTypeClass +{ + GArrowFixedWidthDataTypeClass parent_class; +}; + +GArrowDictionaryDataType * +garrow_dictionary_data_type_new(GArrowDataType *index_data_type, + GArrowArray *dictionary, + gboolean ordered); +GArrowDataType * +garrow_dictionary_data_type_get_index_data_type(GArrowDictionaryDataType *data_type); +GArrowArray * 
+garrow_dictionary_data_type_get_dictionary(GArrowDictionaryDataType *data_type); +gboolean +garrow_dictionary_data_type_is_ordered(GArrowDictionaryDataType *data_type); + + G_END_DECLS diff --git a/c_glib/arrow-glib/input-stream.cpp b/c_glib/arrow-glib/input-stream.cpp index d628baeeeae5b..94422241b9cfa 100644 --- a/c_glib/arrow-glib/input-stream.cpp +++ b/c_glib/arrow-glib/input-stream.cpp @@ -420,7 +420,7 @@ namespace garrow { arrow::Status Read(int64_t n_bytes, int64_t *n_read_bytes, - uint8_t *out) override { + void *out) override { GError *error = NULL; *n_read_bytes = g_input_stream_read(input_stream_, out, @@ -437,13 +437,13 @@ namespace garrow { } arrow::Status ReadAt(int64_t position, int64_t n_bytes, - int64_t *n_read_bytes, uint8_t* out) { + int64_t *n_read_bytes, void* out) override { return arrow::io::RandomAccessFile::ReadAt( position, n_bytes, n_read_bytes, out); } arrow::Status ReadAt(int64_t position, int64_t n_bytes, - std::shared_ptr* out) { + std::shared_ptr* out) override { return arrow::io::RandomAccessFile::ReadAt(position, n_bytes, out); } diff --git a/c_glib/arrow-glib/input-stream.h b/c_glib/arrow-glib/input-stream.h index 12c7ae700f79d..c2068d6ac0e41 100644 --- a/c_glib/arrow-glib/input-stream.h +++ b/c_glib/arrow-glib/input-stream.h @@ -26,98 +26,28 @@ G_BEGIN_DECLS -#define GARROW_TYPE_INPUT_STREAM \ - (garrow_input_stream_get_type()) -#define GARROW_INPUT_STREAM(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_INPUT_STREAM, \ - GArrowInputStream)) -#define GARROW_INPUT_STREAM_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_INPUT_STREAM, \ - GArrowInputStreamClass)) -#define GARROW_IS_INPUT_STREAM(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_INPUT_STREAM)) -#define GARROW_IS_INPUT_STREAM_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_INPUT_STREAM)) -#define GARROW_INPUT_STREAM_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_INPUT_STREAM, \ - 
GArrowInputStreamClass)) - -typedef struct _GArrowInputStream GArrowInputStream; -#ifndef __GTK_DOC_IGNORE__ -typedef struct _GArrowInputStreamClass GArrowInputStreamClass; -#endif - -/** - * GArrowInputStream: - * - * It wraps `arrow::io::InputStream`. - */ -struct _GArrowInputStream -{ - /*< private >*/ - GObject parent_instance; -}; - -#ifndef __GTK_DOC_IGNORE__ +#define GARROW_TYPE_INPUT_STREAM (garrow_input_stream_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowInputStream, + garrow_input_stream, + GARROW, + INPUT_STREAM, + GObject) struct _GArrowInputStreamClass { GObjectClass parent_class; }; -#endif - -GType garrow_input_stream_get_type(void) G_GNUC_CONST; - #define GARROW_TYPE_SEEKABLE_INPUT_STREAM \ (garrow_seekable_input_stream_get_type()) -#define GARROW_SEEKABLE_INPUT_STREAM(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_SEEKABLE_INPUT_STREAM, \ - GArrowSeekableInputStream)) -#define GARROW_SEEKABLE_INPUT_STREAM_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_SEEKABLE_INPUT_STREAM, \ - GArrowSeekableInputStreamClass)) -#define GARROW_IS_SEEKABLE_INPUT_STREAM(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_SEEKABLE_INPUT_STREAM)) -#define GARROW_IS_SEEKABLE_INPUT_STREAM_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_SEEKABLE_INPUT_STREAM)) -#define GARROW_SEEKABLE_INPUT_STREAM_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_SEEKABLE_INPUT_STREAM, \ - GArrowSeekableInputStreamClass)) - -typedef struct _GArrowSeekableInputStream GArrowSeekableInputStream; -#ifndef __GTK_DOC_IGNORE__ -typedef struct _GArrowSeekableInputStreamClass GArrowSeekableInputStreamClass; -#endif - -/** - * GArrowSeekableInputStream: - * - * It wraps `arrow::io::RandomAccessFile`. 
- */ -struct _GArrowSeekableInputStream -{ - /*< private >*/ - GArrowInputStream parent_instance; -}; - -#ifndef __GTK_DOC_IGNORE__ +G_DECLARE_DERIVABLE_TYPE(GArrowSeekableInputStream, + garrow_seekable_input_stream, + GARROW, + SEEKABLE_INPUT_STREAM, + GArrowInputStream) struct _GArrowSeekableInputStreamClass { GArrowInputStreamClass parent_class; }; -#endif - -GType garrow_seekable_input_stream_get_type(void) G_GNUC_CONST; guint64 garrow_seekable_input_stream_get_size(GArrowSeekableInputStream *input_stream, GError **error); @@ -133,49 +63,15 @@ GArrowTensor *garrow_seekable_input_stream_read_tensor(GArrowSeekableInputStream #define GARROW_TYPE_BUFFER_INPUT_STREAM \ (garrow_buffer_input_stream_get_type()) -#define GARROW_BUFFER_INPUT_STREAM(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_BUFFER_INPUT_STREAM, \ - GArrowBufferInputStream)) -#define GARROW_BUFFER_INPUT_STREAM_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_BUFFER_INPUT_STREAM, \ - GArrowBufferInputStreamClass)) -#define GARROW_IS_BUFFER_INPUT_STREAM(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_BUFFER_INPUT_STREAM)) -#define GARROW_IS_BUFFER_INPUT_STREAM_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_BUFFER_INPUT_STREAM)) -#define GARROW_BUFFER_INPUT_STREAM_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_BUFFER_INPUT_STREAM, \ - GArrowBufferInputStreamClass)) - -typedef struct _GArrowBufferInputStream GArrowBufferInputStream; -#ifndef __GTK_DOC_IGNORE__ -typedef struct _GArrowBufferInputStreamClass GArrowBufferInputStreamClass; -#endif - -/** - * GArrowBufferInputStream: - * - * It wraps `arrow::io::BufferReader`. 
- */ -struct _GArrowBufferInputStream -{ - /*< private >*/ - GArrowSeekableInputStream parent_instance; -}; - -#ifndef __GTK_DOC_IGNORE__ +G_DECLARE_DERIVABLE_TYPE(GArrowBufferInputStream, + garrow_buffer_input_stream, + GARROW, + BUFFER_INPUT_STREAM, + GArrowSeekableInputStream) struct _GArrowBufferInputStreamClass { GArrowSeekableInputStreamClass parent_class; }; -#endif - -GType garrow_buffer_input_stream_get_type(void) G_GNUC_CONST; GArrowBufferInputStream *garrow_buffer_input_stream_new(GArrowBuffer *buffer); diff --git a/c_glib/arrow-glib/meson.build b/c_glib/arrow-glib/meson.build index 464a002e78b0c..25968e69c1047 100644 --- a/c_glib/arrow-glib/meson.build +++ b/c_glib/arrow-glib/meson.build @@ -18,12 +18,13 @@ # under the License. sources = files( - 'array.cpp', 'array-builder.cpp', + 'basic-array.cpp', 'basic-data-type.cpp', 'buffer.cpp', 'chunked-array.cpp', 'column.cpp', + 'composite-array.cpp', 'composite-data-type.cpp', 'error.cpp', 'field.cpp', @@ -59,10 +60,12 @@ c_headers = files( 'array.h', 'array-builder.h', 'arrow-glib.h', + 'basic-array.h', 'basic-data-type.h', 'buffer.h', 'chunked-array.h', 'column.h', + 'composite-array.h', 'composite-data-type.h', 'data-type.h', 'error.h', @@ -102,6 +105,7 @@ cpp_headers = files( 'array.hpp', 'array-builder.hpp', 'arrow-glib.hpp', + 'basic-array.hpp', 'basic-data-type.hpp', 'buffer.hpp', 'chunked-array.hpp', @@ -179,22 +183,23 @@ pkgconfig.generate(filebase: meson.project_name(), name: 'Apache Arrow GLib', description: 'C API for Apache Arrow based on GLib', version: version, - requires: ['gobject-2.0', 'arrow'], + requires: ['gio-2.0', 'arrow'], libraries: [libarrow_glib], subdirs: ['arrow-glib']) -gnome.generate_gir(libarrow_glib, - sources: sources + c_headers + enums, - namespace: 'Arrow', - nsversion: api_version, - identifier_prefix: 'GArrow', - symbol_prefix: 'garrow', - export_packages: 'arrow-glib', - includes: [ - 'GObject-2.0', - 'Gio-2.0', - ], - install: true, - extra_args: [ - '--warn-all', - 
]) +arrow_glib_gir = gnome.generate_gir(libarrow_glib, + sources: sources + c_headers + enums, + namespace: 'Arrow', + nsversion: api_version, + identifier_prefix: 'GArrow', + symbol_prefix: 'garrow', + export_packages: 'arrow-glib', + includes: [ + 'GObject-2.0', + 'Gio-2.0', + ], + install: true, + extra_args: [ + '--warn-all', + ]) +arrow_glib_gir_dependency = declare_dependency(sources: arrow_glib_gir) diff --git a/c_glib/arrow-glib/output-stream.cpp b/c_glib/arrow-glib/output-stream.cpp index 739992fb62b0e..9939f4f086f8e 100644 --- a/c_glib/arrow-glib/output-stream.cpp +++ b/c_glib/arrow-glib/output-stream.cpp @@ -76,7 +76,7 @@ garrow_output_stream_file_interface_init(GArrowFileInterface *iface) iface->get_raw = garrow_output_stream_get_raw_file_interface; } -static std::shared_ptr +static std::shared_ptr garrow_output_stream_get_raw_writeable_interface(GArrowWriteable *writeable) { auto output_stream = GARROW_OUTPUT_STREAM(writeable); @@ -325,7 +325,7 @@ namespace garrow { return arrow::Status::OK(); } - arrow::Status Write(const uint8_t *data, + arrow::Status Write(const void *data, int64_t n_bytes) override { GError *error = NULL; gsize n_written_bytes; diff --git a/c_glib/arrow-glib/output-stream.h b/c_glib/arrow-glib/output-stream.h index e42ebcde47d6b..195a97ac9f053 100644 --- a/c_glib/arrow-glib/output-stream.h +++ b/c_glib/arrow-glib/output-stream.h @@ -26,51 +26,16 @@ G_BEGIN_DECLS -#define GARROW_TYPE_OUTPUT_STREAM \ - (garrow_output_stream_get_type()) -#define GARROW_OUTPUT_STREAM(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_OUTPUT_STREAM, \ - GArrowOutputStream)) -#define GARROW_OUTPUT_STREAM_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_OUTPUT_STREAM, \ - GArrowOutputStreamClass)) -#define GARROW_IS_OUTPUT_STREAM(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_OUTPUT_STREAM)) -#define GARROW_IS_OUTPUT_STREAM_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_OUTPUT_STREAM)) -#define 
GARROW_OUTPUT_STREAM_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_OUTPUT_STREAM, \ - GArrowOutputStreamClass)) - -typedef struct _GArrowOutputStream GArrowOutputStream; -#ifndef __GTK_DOC_IGNORE__ -typedef struct _GArrowOutputStreamClass GArrowOutputStreamClass; -#endif - -/** - * GArrowOutputStream: - * - * It wraps `arrow::io::OutputStream`. - */ -struct _GArrowOutputStream -{ - /*< private >*/ - GObject parent_instance; -}; - -#ifndef __GTK_DOC_IGNORE__ +#define GARROW_TYPE_OUTPUT_STREAM (garrow_output_stream_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowOutputStream, + garrow_output_stream, + GARROW, + OUTPUT_STREAM, + GObject) struct _GArrowOutputStreamClass { GObjectClass parent_class; }; -#endif - -GType garrow_output_stream_get_type(void) G_GNUC_CONST; gint64 garrow_output_stream_write_tensor(GArrowOutputStream *stream, GArrowTensor *tensor, diff --git a/c_glib/arrow-glib/readable.cpp b/c_glib/arrow-glib/readable.cpp index 6a9023e6cddf0..33f98d98c88a4 100644 --- a/c_glib/arrow-glib/readable.cpp +++ b/c_glib/arrow-glib/readable.cpp @@ -45,6 +45,7 @@ G_DEFINE_INTERFACE(GArrowReadable, static void garrow_readable_default_init (GArrowReadableInterface *iface) { + iface->new_raw = garrow_buffer_new_raw; } /** @@ -66,7 +67,8 @@ garrow_readable_read(GArrowReadable *readable, std::shared_ptr arrow_buffer; auto status = arrow_readable->Read(n_bytes, &arrow_buffer); if (garrow_error_check(error, status, "[io][readable][read]")) { - return garrow_buffer_new_raw(&arrow_buffer); + auto *iface = GARROW_READABLE_GET_IFACE(readable); + return iface->new_raw(&arrow_buffer); } else { return NULL; } diff --git a/c_glib/arrow-glib/readable.hpp b/c_glib/arrow-glib/readable.hpp index c241c77aa0329..ce7770103aa1a 100644 --- a/c_glib/arrow-glib/readable.hpp +++ b/c_glib/arrow-glib/readable.hpp @@ -32,6 +32,7 @@ struct _GArrowReadableInterface { GTypeInterface parent_iface; + GArrowBuffer *(*new_raw)(std::shared_ptr *arrow_buffer); std::shared_ptr 
(*get_raw)(GArrowReadable *file); }; diff --git a/c_glib/arrow-glib/record-batch.cpp b/c_glib/arrow-glib/record-batch.cpp index f381af0a2c2c7..64f2020ad3efc 100644 --- a/c_glib/arrow-glib/record-batch.cpp +++ b/c_glib/arrow-glib/record-batch.cpp @@ -28,6 +28,23 @@ #include +static inline bool +garrow_record_batch_adjust_index(const std::shared_ptr arrow_record_batch, + gint &i) +{ + auto n_columns = arrow_record_batch->num_columns(); + if (i < 0) { + i += n_columns; + if (i < 0) { + return false; + } + } + if (i >= n_columns) { + return false; + } + return true; +} + G_BEGIN_DECLS /** @@ -135,13 +152,15 @@ garrow_record_batch_class_init(GArrowRecordBatchClass *klass) * @schema: The schema of the record batch. * @n_rows: The number of the rows in the record batch. * @columns: (element-type GArrowArray): The columns in the record batch. + * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: A newly created #GArrowRecordBatch. + * Returns: (nullable): A newly created #GArrowRecordBatch or %NULL on error. */ GArrowRecordBatch * garrow_record_batch_new(GArrowSchema *schema, guint32 n_rows, - GList *columns) + GList *columns, + GError **error) { std::vector> arrow_columns; for (GList *node = columns; node; node = node->next) { @@ -150,10 +169,14 @@ garrow_record_batch_new(GArrowSchema *schema, } auto arrow_record_batch = - std::make_shared(garrow_schema_get_raw(schema), - n_rows, - arrow_columns); - return garrow_record_batch_new_raw(&arrow_record_batch); + arrow::RecordBatch::Make(garrow_schema_get_raw(schema), + n_rows, arrow_columns); + auto status = arrow_record_batch->Validate(); + if (garrow_error_check(error, status, "[record-batch][new]")) { + return garrow_record_batch_new_raw(&arrow_record_batch); + } else { + return NULL; + } } /** @@ -193,15 +216,21 @@ garrow_record_batch_get_schema(GArrowRecordBatch *record_batch) /** * garrow_record_batch_get_column: * @record_batch: A #GArrowRecordBatch. - * @i: The index of the target column. 
+ * @i: The index of the target column. If it's negative, index is + * counted backward from the end of the columns. `-1` means the last + * column. * - * Returns: (transfer full): The i-th column in the record batch. + * Returns: (transfer full) (nullable): The i-th column in the record batch + * on success, %NULL on out of index. */ GArrowArray * garrow_record_batch_get_column(GArrowRecordBatch *record_batch, - guint i) + gint i) { const auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); + if (!garrow_record_batch_adjust_index(arrow_record_batch, i)) { + return NULL; + } auto arrow_column = arrow_record_batch->column(i); return garrow_array_new_raw(&arrow_column); } @@ -231,15 +260,21 @@ garrow_record_batch_get_columns(GArrowRecordBatch *record_batch) /** * garrow_record_batch_get_column_name: * @record_batch: A #GArrowRecordBatch. - * @i: The index of the target column. + * @i: The index of the target column. If it's negative, index is + * counted backward from the end of the columns. `-1` means the last + * column. * - * Returns: The name of the i-th column in the record batch. 
+ * Returns: (nullable): The name of the i-th column in the record batch + * on success, %NULL on out of index */ const gchar * garrow_record_batch_get_column_name(GArrowRecordBatch *record_batch, - guint i) + gint i) { const auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); + if (!garrow_record_batch_adjust_index(arrow_record_batch, i)) { + return NULL; + } return arrow_record_batch->column_name(i).c_str(); } diff --git a/c_glib/arrow-glib/record-batch.h b/c_glib/arrow-glib/record-batch.h index 021f894f3f5a3..d31edf43973d0 100644 --- a/c_glib/arrow-glib/record-batch.h +++ b/c_glib/arrow-glib/record-batch.h @@ -68,17 +68,18 @@ GType garrow_record_batch_get_type(void) G_GNUC_CONST; GArrowRecordBatch *garrow_record_batch_new(GArrowSchema *schema, guint32 n_rows, - GList *columns); + GList *columns, + GError **error); gboolean garrow_record_batch_equal(GArrowRecordBatch *record_batch, GArrowRecordBatch *other_record_batch); GArrowSchema *garrow_record_batch_get_schema (GArrowRecordBatch *record_batch); GArrowArray *garrow_record_batch_get_column (GArrowRecordBatch *record_batch, - guint i); + gint i); GList *garrow_record_batch_get_columns (GArrowRecordBatch *record_batch); const gchar *garrow_record_batch_get_column_name(GArrowRecordBatch *record_batch, - guint i); + gint i); guint garrow_record_batch_get_n_columns (GArrowRecordBatch *record_batch); gint64 garrow_record_batch_get_n_rows (GArrowRecordBatch *record_batch); GArrowRecordBatch *garrow_record_batch_slice (GArrowRecordBatch *record_batch, diff --git a/c_glib/arrow-glib/table.cpp b/c_glib/arrow-glib/table.cpp index 779f2ef62b8f5..e086396f8f9e0 100644 --- a/c_glib/arrow-glib/table.cpp +++ b/c_glib/arrow-glib/table.cpp @@ -143,8 +143,7 @@ garrow_table_new(GArrowSchema *schema, } auto arrow_table = - std::make_shared(garrow_schema_get_raw(schema), - arrow_columns); + arrow::Table::Make(garrow_schema_get_raw(schema), arrow_columns); return garrow_table_new_raw(&arrow_table); } diff --git 
a/c_glib/arrow-glib/writeable.cpp b/c_glib/arrow-glib/writeable.cpp index eb6adfee8c985..a16e43ab17ae9 100644 --- a/c_glib/arrow-glib/writeable.cpp +++ b/c_glib/arrow-glib/writeable.cpp @@ -88,7 +88,7 @@ garrow_writeable_flush(GArrowWriteable *writeable, G_END_DECLS -std::shared_ptr +std::shared_ptr garrow_writeable_get_raw(GArrowWriteable *writeable) { auto *iface = GARROW_WRITEABLE_GET_IFACE(writeable); diff --git a/c_glib/arrow-glib/writeable.hpp b/c_glib/arrow-glib/writeable.hpp index 2b398f8b507c1..806d36fc07957 100644 --- a/c_glib/arrow-glib/writeable.hpp +++ b/c_glib/arrow-glib/writeable.hpp @@ -26,13 +26,13 @@ /** * GArrowWriteableInterface: * - * It wraps `arrow::io::Writeable`. + * It wraps `arrow::io::Writable`. */ struct _GArrowWriteableInterface { GTypeInterface parent_iface; - std::shared_ptr (*get_raw)(GArrowWriteable *file); + std::shared_ptr (*get_raw)(GArrowWriteable *file); }; -std::shared_ptr garrow_writeable_get_raw(GArrowWriteable *writeable); +std::shared_ptr garrow_writeable_get_raw(GArrowWriteable *writeable); diff --git a/c_glib/arrow-glib/writer.cpp b/c_glib/arrow-glib/writer.cpp index 7d3b59457fd33..9bcda2dbbfd2c 100644 --- a/c_glib/arrow-glib/writer.cpp +++ b/c_glib/arrow-glib/writer.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -165,6 +166,30 @@ garrow_record_batch_writer_write_record_batch(GArrowRecordBatchWriter *writer, "[record-batch-writer][write-record-batch]"); } +/** + * garrow_record_batch_writer_write_table: + * @writer: A #GArrowRecordBatchWriter. + * @table: The table to be written. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. 
+ * + * Since: 0.8.0 + */ +gboolean +garrow_record_batch_writer_write_table(GArrowRecordBatchWriter *writer, + GArrowTable *table, + GError **error) +{ + auto arrow_writer = garrow_record_batch_writer_get_raw(writer); + auto arrow_table = garrow_table_get_raw(table); + + auto status = arrow_writer->WriteTable(*arrow_table); + return garrow_error_check(error, + status, + "[record-batch-writer][write-table]"); +} + /** * garrow_record_batch_writer_close: * @writer: A #GArrowRecordBatchWriter. diff --git a/c_glib/arrow-glib/writer.h b/c_glib/arrow-glib/writer.h index 3853c2bd72c50..41b5f723d10eb 100644 --- a/c_glib/arrow-glib/writer.h +++ b/c_glib/arrow-glib/writer.h @@ -77,6 +77,10 @@ gboolean garrow_record_batch_writer_write_record_batch( GArrowRecordBatchWriter *writer, GArrowRecordBatch *record_batch, GError **error); +gboolean garrow_record_batch_writer_write_table( + GArrowRecordBatchWriter *writer, + GArrowTable *table, + GError **error); gboolean garrow_record_batch_writer_close( GArrowRecordBatchWriter *writer, GError **error); diff --git a/c_glib/arrow-gpu-glib/Makefile.am b/c_glib/arrow-gpu-glib/Makefile.am new file mode 100644 index 0000000000000..1e1c02ac4a5c0 --- /dev/null +++ b/c_glib/arrow-gpu-glib/Makefile.am @@ -0,0 +1,124 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +CLEANFILES = + +EXTRA_DIST = \ + meson.build + +AM_CPPFLAGS = \ + -I$(top_builddir) \ + -I$(top_srcdir) + +if HAVE_ARROW_GPU +lib_LTLIBRARIES = \ + libarrow-gpu-glib.la + +libarrow_gpu_glib_la_CXXFLAGS = \ + $(GLIB_CFLAGS) \ + $(ARROW_CFLAGS) \ + $(ARROW_GPU_CFLAGS) \ + $(GARROW_CXXFLAGS) + +libarrow_gpu_glib_la_LIBADD = \ + $(GLIB_LIBS) \ + $(ARROW_LIBS) \ + $(ARROW_GPU_LIBS) \ + ../arrow-glib/libarrow-glib.la + +libarrow_gpu_glib_la_headers = \ + arrow-gpu-glib.h \ + cuda.h + +libarrow_gpu_glib_la_sources = \ + cuda.cpp \ + $(libarrow_gpu_glib_la_headers) + +libarrow_gpu_glib_la_cpp_headers = \ + arrow-gpu-glib.hpp \ + cuda.hpp + +libarrow_gpu_glib_la_SOURCES = \ + $(libarrow_gpu_glib_la_sources) \ + $(libarrow_gpu_glib_la_cpp_headers) + +arrow_gpu_glib_includedir = \ + $(includedir)/arrow-gpu-glib +arrow_gpu_glib_include_HEADERS = \ + $(libarrow_gpu_glib_la_headers) \ + $(libarrow_gpu_glib_la_cpp_headers) + +pkgconfigdir = $(libdir)/pkgconfig +pkgconfig_DATA = \ + arrow-gpu-glib.pc + +if HAVE_INTROSPECTION +-include $(INTROSPECTION_MAKEFILE) +INTROSPECTION_GIRS = +INTROSPECTION_SCANNER_ARGS = +INTROSPECTION_SCANNER_ENV = +if USE_ARROW_BUILD_DIR +INTROSPECTION_SCANNER_ENV += \ + PKG_CONFIG_PATH=${abs_builddir}/../arrow-glib:$(ARROW_BUILD_DIR)/src/arrow:$${PKG_CONFIG_PATH} +else +INTROSPECTION_SCANNER_ENV += \ + PKG_CONFIG_PATH=${abs_builddir}/../arrow-glib:$${PKG_CONFIG_PATH} +endif +if OS_MACOS +INTROSPECTION_SCANNER_ENV += \ + ARCHFLAGS= +endif +INTROSPECTION_COMPILER_ARGS = \ + --includedir=$(abs_builddir)/../arrow-glib + +ArrowGPU-1.0.gir: libarrow-gpu-glib.la +ArrowGPU_1_0_gir_PACKAGES = \ + arrow-glib +ArrowGPU_1_0_gir_EXPORT_PACKAGES = \ + arrow-gpu-glib +ArrowGPU_1_0_gir_INCLUDES = \ + Arrow-1.0 +ArrowGPU_1_0_gir_CFLAGS = \ + $(AM_CPPFLAGS) +ArrowGPU_1_0_gir_LDFLAGS = +if USE_ARROW_BUILD_DIR +ArrowGPU_1_0_gir_LDFLAGS += \ + -L$(ARROW_LIB_DIR) +endif 
+ArrowGPU_1_0_gir_LIBS = \ + $(abs_builddir)/../arrow-glib/libarrow-glib.la \ + libarrow-gpu-glib.la +ArrowGPU_1_0_gir_FILES = \ + $(libarrow_gpu_glib_la_sources) +ArrowGPU_1_0_gir_SCANNERFLAGS = \ + --warn-all \ + --add-include-path=$(abs_builddir)/../arrow-glib \ + --identifier-prefix=GArrowGPU \ + --symbol-prefix=garrow_gpu +INTROSPECTION_GIRS += ArrowGPU-1.0.gir + +girdir = $(datadir)/gir-1.0 +gir_DATA = $(INTROSPECTION_GIRS) + +typelibdir = $(libdir)/girepository-1.0 +typelib_DATA = $(INTROSPECTION_GIRS:.gir=.typelib) + +CLEANFILES += \ + $(gir_DATA) \ + $(typelib_DATA) +endif +endif diff --git a/c_glib/arrow-gpu-glib/arrow-gpu-glib.h b/c_glib/arrow-gpu-glib/arrow-gpu-glib.h new file mode 100644 index 0000000000000..1538c9a1865ac --- /dev/null +++ b/c_glib/arrow-gpu-glib/arrow-gpu-glib.h @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include + +#include diff --git a/c_glib/arrow-gpu-glib/arrow-gpu-glib.hpp b/c_glib/arrow-gpu-glib/arrow-gpu-glib.hpp new file mode 100644 index 0000000000000..92017d8b67aab --- /dev/null +++ b/c_glib/arrow-gpu-glib/arrow-gpu-glib.hpp @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +#include diff --git a/c_glib/arrow-gpu-glib/arrow-gpu-glib.pc.in b/c_glib/arrow-gpu-glib/arrow-gpu-glib.pc.in new file mode 100644 index 0000000000000..38a6bae1a1298 --- /dev/null +++ b/c_glib/arrow-gpu-glib/arrow-gpu-glib.pc.in @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ + +Name: Apache Arrow GPU GLib +Description: C API for Apache Arrow GPU based on GLib +Version: @VERSION@ +Libs: -L${libdir} -larrow-gpu-glib +Cflags: -I${includedir} +Requires: arrow-glib diff --git a/c_glib/arrow-gpu-glib/cuda.cpp b/c_glib/arrow-gpu-glib/cuda.cpp new file mode 100644 index 0000000000000..c2a9af54dda94 --- /dev/null +++ b/c_glib/arrow-gpu-glib/cuda.cpp @@ -0,0 +1,941 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include + +G_BEGIN_DECLS + +/** + * SECTION: cuda + * @section_id: cuda-classes + * @title: CUDA related classes + * @include: arrow-gpu-glib/arrow-gpu-glib.h + * + * The following classes provide CUDA support for Apache Arrow data. + * + * #GArrowGPUCUDADeviceManager is the starting point. You need at + * least one #GArrowGPUCUDAContext to process Apache Arrow data on + * NVIDIA GPU. + * + * #GArrowGPUCUDAContext is a class to keep context for one GPU. You + * need to create #GArrowGPUCUDAContext for each GPU that you want to + * use. You can create #GArrowGPUCUDAContext by + * garrow_gpu_cuda_device_manager_get_context(). + * + * #GArrowGPUCUDABuffer is a class for data on GPU. You can copy data + * on GPU to/from CPU by garrow_gpu_cuda_buffer_copy_to_host() and + * garrow_gpu_cuda_buffer_copy_from_host(). You can share data on GPU + * with other processes by garrow_gpu_cuda_buffer_export() and + * garrow_gpu_cuda_buffer_new_ipc(). + * + * #GArrowGPUCUDAHostBuffer is a class for data on CPU that is + * directly accessible from GPU. + * + * #GArrowGPUCUDAIPCMemoryHandle is a class to share data on GPU with + * other processes. You can export your data on GPU to other processes + * by garrow_gpu_cuda_buffer_export() and + * garrow_gpu_cuda_ipc_memory_handle_new(). You can import other + * process data on GPU by garrow_gpu_cuda_ipc_memory_handle_new() and + * garrow_gpu_cuda_buffer_new_ipc(). + * + * #GArrowGPUCUDABufferInputStream is a class to read data in + * #GArrowGPUCUDABuffer. + * + * #GArrowGPUCUDABufferOutputStream is a class to write data into + * #GArrowGPUCUDABuffer. 
+ */ + +G_DEFINE_TYPE(GArrowGPUCUDADeviceManager, + garrow_gpu_cuda_device_manager, + G_TYPE_OBJECT) + +static void +garrow_gpu_cuda_device_manager_init(GArrowGPUCUDADeviceManager *object) +{ +} + +static void +garrow_gpu_cuda_device_manager_class_init(GArrowGPUCUDADeviceManagerClass *klass) +{ +} + +/** + * garrow_gpu_cuda_device_manager_new: + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: A newly created #GArrowGPUCUDADeviceManager on success, + * %NULL on error. + * + * Since: 0.8.0 + */ +GArrowGPUCUDADeviceManager * +garrow_gpu_cuda_device_manager_new(GError **error) +{ + arrow::gpu::CudaDeviceManager *manager; + auto status = arrow::gpu::CudaDeviceManager::GetInstance(&manager); + if (garrow_error_check(error, status, "[gpu][cuda][device-manager][new]")) { + auto manager = g_object_new(GARROW_GPU_TYPE_CUDA_DEVICE_MANAGER, + NULL); + return GARROW_GPU_CUDA_DEVICE_MANAGER(manager); + } else { + return NULL; + } +} + +/** + * garrow_gpu_cuda_device_manager_get_context: + * @manager: A #GArrowGPUCUDADeviceManager. + * @gpu_number: A GPU device number for the target context. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): A newly created #GArrowGPUCUDAContext on + * success, %NULL on error. Contexts for the same GPU device number + * share the same data internally. 
+ * + * Since: 0.8.0 + */ +GArrowGPUCUDAContext * +garrow_gpu_cuda_device_manager_get_context(GArrowGPUCUDADeviceManager *manager, + gint gpu_number, + GError **error) +{ + arrow::gpu::CudaDeviceManager *arrow_manager; + arrow::gpu::CudaDeviceManager::GetInstance(&arrow_manager); + std::shared_ptr context; + auto status = arrow_manager->GetContext(gpu_number, &context); + if (garrow_error_check(error, status, + "[gpu][cuda][device-manager][get-context]]")) { + return garrow_gpu_cuda_context_new_raw(&context); + } else { + return NULL; + } +} + +/** + * garrow_gpu_cuda_device_manager_get_n_devices: + * @manager: A #GArrowGPUCUDADeviceManager. + * + * Returns: The number of GPU devices. + * + * Since: 0.8.0 + */ +gsize +garrow_gpu_cuda_device_manager_get_n_devices(GArrowGPUCUDADeviceManager *manager) +{ + arrow::gpu::CudaDeviceManager *arrow_manager; + arrow::gpu::CudaDeviceManager::GetInstance(&arrow_manager); + return arrow_manager->num_devices(); +} + + +typedef struct GArrowGPUCUDAContextPrivate_ { + std::shared_ptr context; +} GArrowGPUCUDAContextPrivate; + +enum { + PROP_CONTEXT = 1 +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowGPUCUDAContext, + garrow_gpu_cuda_context, + G_TYPE_OBJECT) + +#define GARROW_GPU_CUDA_CONTEXT_GET_PRIVATE(object) \ + static_cast( \ + garrow_gpu_cuda_context_get_instance_private( \ + GARROW_GPU_CUDA_CONTEXT(object))) + +static void +garrow_gpu_cuda_context_finalize(GObject *object) +{ + auto priv = GARROW_GPU_CUDA_CONTEXT_GET_PRIVATE(object); + + priv->context = nullptr; + + G_OBJECT_CLASS(garrow_gpu_cuda_context_parent_class)->finalize(object); +} + +static void +garrow_gpu_cuda_context_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_GPU_CUDA_CONTEXT_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_CONTEXT: + priv->context = + *static_cast *>(g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} 
+ +static void +garrow_gpu_cuda_context_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + switch (prop_id) { + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_gpu_cuda_context_init(GArrowGPUCUDAContext *object) +{ +} + +static void +garrow_gpu_cuda_context_class_init(GArrowGPUCUDAContextClass *klass) +{ + GParamSpec *spec; + + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = garrow_gpu_cuda_context_finalize; + gobject_class->set_property = garrow_gpu_cuda_context_set_property; + gobject_class->get_property = garrow_gpu_cuda_context_get_property; + + /** + * GArrowGPUCUDAContext:context: + * + * Since: 0.8.0 + */ + spec = g_param_spec_pointer("context", + "Context", + "The raw std::shared_ptr *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_CONTEXT, spec); +} + +/** + * garrow_gpu_cuda_context_get_allocated_size: + * @context: A #GArrowGPUCUDAContext. + * + * Returns: The allocated memory by this context in bytes. + * + * Since: 0.8.0 + */ +gint64 +garrow_gpu_cuda_context_get_allocated_size(GArrowGPUCUDAContext *context) +{ + auto arrow_context = garrow_gpu_cuda_context_get_raw(context); + return arrow_context->bytes_allocated(); +} + + +G_DEFINE_TYPE(GArrowGPUCUDABuffer, + garrow_gpu_cuda_buffer, + GARROW_TYPE_BUFFER) + +static void +garrow_gpu_cuda_buffer_init(GArrowGPUCUDABuffer *object) +{ +} + +static void +garrow_gpu_cuda_buffer_class_init(GArrowGPUCUDABufferClass *klass) +{ +} + +/** + * garrow_gpu_cuda_buffer_new: + * @context: A #GArrowGPUCUDAContext. + * @size: The number of bytes to be allocated on GPU device for this context. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): A newly created #GArrowGPUCUDABuffer on + * success, %NULL on error. 
+ * + * Since: 0.8.0 + */ +GArrowGPUCUDABuffer * +garrow_gpu_cuda_buffer_new(GArrowGPUCUDAContext *context, + gint64 size, + GError **error) +{ + auto arrow_context = garrow_gpu_cuda_context_get_raw(context); + std::shared_ptr arrow_buffer; + auto status = arrow_context->Allocate(size, &arrow_buffer); + if (garrow_error_check(error, status, "[gpu][cuda][buffer][new]")) { + return garrow_gpu_cuda_buffer_new_raw(&arrow_buffer); + } else { + return NULL; + } +} + +/** + * garrow_gpu_cuda_buffer_new_ipc: + * @context: A #GArrowGPUCUDAContext. + * @handle: A #GArrowGPUCUDAIPCMemoryHandle to be communicated. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): A newly created #GArrowGPUCUDABuffer on + * success, %NULL on error. The buffer has data from the IPC target. + * + * Since: 0.8.0 + */ +GArrowGPUCUDABuffer * +garrow_gpu_cuda_buffer_new_ipc(GArrowGPUCUDAContext *context, + GArrowGPUCUDAIPCMemoryHandle *handle, + GError **error) +{ + auto arrow_context = garrow_gpu_cuda_context_get_raw(context); + auto arrow_handle = garrow_gpu_cuda_ipc_memory_handle_get_raw(handle); + std::shared_ptr arrow_buffer; + auto status = arrow_context->OpenIpcBuffer(*arrow_handle, &arrow_buffer); + if (garrow_error_check(error, status, + "[gpu][cuda][buffer][new-ipc]")) { + return garrow_gpu_cuda_buffer_new_raw(&arrow_buffer); + } else { + return NULL; + } +} + +/** + * garrow_gpu_cuda_buffer_new_record_batch: + * @context: A #GArrowGPUCUDAContext. + * @record_batch: A #GArrowRecordBatch to be serialized. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): A newly created #GArrowGPUCUDABuffer on + * success, %NULL on error. The buffer has serialized record batch + * data. 
+ * + * Since: 0.8.0 + */ +GArrowGPUCUDABuffer * +garrow_gpu_cuda_buffer_new_record_batch(GArrowGPUCUDAContext *context, + GArrowRecordBatch *record_batch, + GError **error) +{ + auto arrow_context = garrow_gpu_cuda_context_get_raw(context); + auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); + std::shared_ptr arrow_buffer; + auto status = arrow::gpu::SerializeRecordBatch(*arrow_record_batch, + arrow_context.get(), + &arrow_buffer); + if (garrow_error_check(error, status, + "[gpu][cuda][buffer][new-record-batch]")) { + return garrow_gpu_cuda_buffer_new_raw(&arrow_buffer); + } else { + return NULL; + } +} + +/** + * garrow_gpu_cuda_buffer_copy_to_host: + * @buffer: A #GArrowGPUCUDABuffer. + * @position: The offset of memory on GPU device to be copied. + * @size: The size of memory on GPU device to be copied in bytes. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): A #GBytes that have copied memory on CPU + * host on success, %NULL on error. + * + * Since: 0.8.0 + */ +GBytes * +garrow_gpu_cuda_buffer_copy_to_host(GArrowGPUCUDABuffer *buffer, + gint64 position, + gint64 size, + GError **error) +{ + auto arrow_buffer = garrow_gpu_cuda_buffer_get_raw(buffer); + auto data = static_cast(g_malloc(size)); + auto status = arrow_buffer->CopyToHost(position, size, data); + if (garrow_error_check(error, status, "[gpu][cuda][buffer][copy-to-host]")) { + return g_bytes_new_take(data, size); + } else { + g_free(data); + return NULL; + } +} + +/** + * garrow_gpu_cuda_buffer_copy_from_host: + * @buffer: A #GArrowGPUCUDABuffer. + * @data: (array length=size): Data on CPU host to be copied. + * @size: The size of data on CPU host to be copied in bytes. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. 
+ * + * Since: 0.8.0 + */ +gboolean +garrow_gpu_cuda_buffer_copy_from_host(GArrowGPUCUDABuffer *buffer, + const guint8 *data, + gint64 size, + GError **error) +{ + auto arrow_buffer = garrow_gpu_cuda_buffer_get_raw(buffer); + auto status = arrow_buffer->CopyFromHost(0, data, size); + return garrow_error_check(error, + status, + "[gpu][cuda][buffer][copy-from-host]"); +} + +/** + * garrow_gpu_cuda_buffer_export: + * @buffer: A #GArrowGPUCUDABuffer. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): A newly created + * #GArrowGPUCUDAIPCMemoryHandle to handle the exported buffer on + * success, %NULL on error + * + * Since: 0.8.0 + */ +GArrowGPUCUDAIPCMemoryHandle * +garrow_gpu_cuda_buffer_export(GArrowGPUCUDABuffer *buffer, GError **error) +{ + auto arrow_buffer = garrow_gpu_cuda_buffer_get_raw(buffer); + std::unique_ptr arrow_handle; + auto status = arrow_buffer->ExportForIpc(&arrow_handle); + if (garrow_error_check(error, status, "[gpu][cuda][buffer][export-for-ipc]")) { + return garrow_gpu_cuda_ipc_memory_handle_new_raw(arrow_handle.release()); + } else { + return NULL; + } +} + +/** + * garrow_gpu_cuda_buffer_get_context: + * @buffer: A #GArrowGPUCUDABuffer. + * + * Returns: (transfer full): A newly created #GArrowGPUCUDAContext for the + * buffer. Contexts for the same buffer share the same data internally. + * + * Since: 0.8.0 + */ +GArrowGPUCUDAContext * +garrow_gpu_cuda_buffer_get_context(GArrowGPUCUDABuffer *buffer) +{ + auto arrow_buffer = garrow_gpu_cuda_buffer_get_raw(buffer); + auto arrow_context = arrow_buffer->context(); + return garrow_gpu_cuda_context_new_raw(&arrow_context); +} + +/** + * garrow_gpu_cuda_buffer_read_record_batch: + * @buffer: A #GArrowGPUCUDABuffer. + * @schema: A #GArrowSchema for record batch. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): A newly created #GArrowRecordBatch on + * success, %NULL on error. 
The record batch data is located on GPU. + * + * Since: 0.8.0 + */ +GArrowRecordBatch * +garrow_gpu_cuda_buffer_read_record_batch(GArrowGPUCUDABuffer *buffer, + GArrowSchema *schema, + GError **error) +{ + auto arrow_buffer = garrow_gpu_cuda_buffer_get_raw(buffer); + auto arrow_schema = garrow_schema_get_raw(schema); + auto pool = arrow::default_memory_pool(); + std::shared_ptr arrow_record_batch; + auto status = arrow::gpu::ReadRecordBatch(arrow_schema, + arrow_buffer, + pool, + &arrow_record_batch); + if (garrow_error_check(error, status, + "[gpu][cuda][buffer][read-record-batch]")) { + return garrow_record_batch_new_raw(&arrow_record_batch); + } else { + return NULL; + } +} + + +G_DEFINE_TYPE(GArrowGPUCUDAHostBuffer, + garrow_gpu_cuda_host_buffer, + GARROW_TYPE_MUTABLE_BUFFER) + +static void +garrow_gpu_cuda_host_buffer_init(GArrowGPUCUDAHostBuffer *object) +{ +} + +static void +garrow_gpu_cuda_host_buffer_class_init(GArrowGPUCUDAHostBufferClass *klass) +{ +} + +/** + * garrow_gpu_cuda_host_buffer_new: + * @size: The number of bytes to be allocated on CPU host. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: A newly created #GArrowGPUCUDAHostBuffer on success, + * %NULL on error. The allocated memory is accessible from GPU + * device for the @context. 
+ * + * Since: 0.8.0 + */ +GArrowGPUCUDAHostBuffer * +garrow_gpu_cuda_host_buffer_new(gint64 size, GError **error) +{ + arrow::gpu::CudaDeviceManager *manager; + auto status = arrow::gpu::CudaDeviceManager::GetInstance(&manager); + std::shared_ptr arrow_buffer; + status = manager->AllocateHost(size, &arrow_buffer); + if (garrow_error_check(error, status, "[gpu][cuda][host-buffer][new]")) { + return garrow_gpu_cuda_host_buffer_new_raw(&arrow_buffer); + } else { + return NULL; + } +} + + +typedef struct GArrowGPUCUDAIPCMemoryHandlePrivate_ { + arrow::gpu::CudaIpcMemHandle *ipc_memory_handle; +} GArrowGPUCUDAIPCMemoryHandlePrivate; + +enum { + PROP_IPC_MEMORY_HANDLE = 1 +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowGPUCUDAIPCMemoryHandle, + garrow_gpu_cuda_ipc_memory_handle, + G_TYPE_OBJECT) + +#define GARROW_GPU_CUDA_IPC_MEMORY_HANDLE_GET_PRIVATE(object) \ + static_cast( \ + garrow_gpu_cuda_ipc_memory_handle_get_instance_private( \ + GARROW_GPU_CUDA_IPC_MEMORY_HANDLE(object))) + +static void +garrow_gpu_cuda_ipc_memory_handle_finalize(GObject *object) +{ + auto priv = GARROW_GPU_CUDA_IPC_MEMORY_HANDLE_GET_PRIVATE(object); + + delete priv->ipc_memory_handle; + + G_OBJECT_CLASS(garrow_gpu_cuda_ipc_memory_handle_parent_class)->finalize(object); +} + +static void +garrow_gpu_cuda_ipc_memory_handle_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_GPU_CUDA_IPC_MEMORY_HANDLE_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_IPC_MEMORY_HANDLE: + priv->ipc_memory_handle = + static_cast(g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_gpu_cuda_ipc_memory_handle_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + switch (prop_id) { + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void 
+garrow_gpu_cuda_ipc_memory_handle_init(GArrowGPUCUDAIPCMemoryHandle *object) +{ +} + +static void +garrow_gpu_cuda_ipc_memory_handle_class_init(GArrowGPUCUDAIPCMemoryHandleClass *klass) +{ + GParamSpec *spec; + + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = garrow_gpu_cuda_ipc_memory_handle_finalize; + gobject_class->set_property = garrow_gpu_cuda_ipc_memory_handle_set_property; + gobject_class->get_property = garrow_gpu_cuda_ipc_memory_handle_get_property; + + /** + * GArrowGPUCUDAIPCMemoryHandle:ipc-memory-handle: + * + * Since: 0.8.0 + */ + spec = g_param_spec_pointer("ipc-memory-handle", + "IPC Memory Handle", + "The raw arrow::gpu::CudaIpcMemHandle *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_IPC_MEMORY_HANDLE, spec); +} + +/** + * garrow_gpu_cuda_ipc_memory_handle_new: + * @data: (array length=size): A serialized #GArrowGPUCUDAIPCMemoryHandle. + * @size: The size of data. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): A newly created #GArrowGPUCUDAIPCMemoryHandle + * on success, %NULL on error. + * + * Since: 0.8.0 + */ +GArrowGPUCUDAIPCMemoryHandle * +garrow_gpu_cuda_ipc_memory_handle_new(const guint8 *data, + gsize size, + GError **error) +{ + std::unique_ptr arrow_handle; + auto status = arrow::gpu::CudaIpcMemHandle::FromBuffer(data, &arrow_handle); + if (garrow_error_check(error, status, + "[gpu][cuda][ipc-memory-handle][new]")) { + return garrow_gpu_cuda_ipc_memory_handle_new_raw(arrow_handle.release()); + } else { + return NULL; + } +} + +/** + * garrow_gpu_cuda_ipc_memory_handle_serialize: + * @handle: A #GArrowGPUCUDAIPCMemoryHandle. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): A newly created #GArrowBuffer on success, + * %NULL on error. The buffer has serialized @handle. 
The serialized + * @handle can be deserialized by garrow_gpu_cuda_ipc_memory_handle_new() + * in other process. + * + * Since: 0.8.0 + */ +GArrowBuffer * +garrow_gpu_cuda_ipc_memory_handle_serialize(GArrowGPUCUDAIPCMemoryHandle *handle, + GError **error) +{ + auto arrow_handle = garrow_gpu_cuda_ipc_memory_handle_get_raw(handle); + std::shared_ptr arrow_buffer; + auto status = arrow_handle->Serialize(arrow::default_memory_pool(), + &arrow_buffer); + if (garrow_error_check(error, status, + "[gpu][cuda][ipc-memory-handle][serialize]")) { + return garrow_buffer_new_raw(&arrow_buffer); + } else { + return NULL; + } +} + +GArrowBuffer * +garrow_gpu_cuda_buffer_input_stream_new_raw_readable_interface(std::shared_ptr *arrow_buffer) +{ + auto buffer = GARROW_BUFFER(g_object_new(GARROW_GPU_TYPE_CUDA_BUFFER, + "buffer", arrow_buffer, + NULL)); + return buffer; +} + +static std::shared_ptr +garrow_gpu_cuda_buffer_input_stream_get_raw_readable_interface(GArrowReadable *readable) +{ + auto input_stream = GARROW_INPUT_STREAM(readable); + auto arrow_input_stream = garrow_input_stream_get_raw(input_stream); + return arrow_input_stream; +} + +static void +garrow_gpu_cuda_buffer_input_stream_readable_interface_init(GArrowReadableInterface *iface) +{ + iface->new_raw = + garrow_gpu_cuda_buffer_input_stream_new_raw_readable_interface; + iface->get_raw = + garrow_gpu_cuda_buffer_input_stream_get_raw_readable_interface; +} + +G_DEFINE_TYPE_WITH_CODE( + GArrowGPUCUDABufferInputStream, + garrow_gpu_cuda_buffer_input_stream, + GARROW_TYPE_BUFFER_INPUT_STREAM, + G_IMPLEMENT_INTERFACE( + GARROW_TYPE_READABLE, + garrow_gpu_cuda_buffer_input_stream_readable_interface_init)) + +static void +garrow_gpu_cuda_buffer_input_stream_init(GArrowGPUCUDABufferInputStream *object) +{ +} + +static void +garrow_gpu_cuda_buffer_input_stream_class_init(GArrowGPUCUDABufferInputStreamClass *klass) +{ +} + +/** + * garrow_gpu_cuda_buffer_input_stream_new: + * @buffer: A #GArrowGPUCUDABuffer. 
+ * + * Returns: (transfer full): A newly created + * #GArrowGPUCUDABufferInputStream. + * + * Since: 0.8.0 + */ +GArrowGPUCUDABufferInputStream * +garrow_gpu_cuda_buffer_input_stream_new(GArrowGPUCUDABuffer *buffer) +{ + auto arrow_buffer = garrow_gpu_cuda_buffer_get_raw(buffer); + auto arrow_reader = + std::make_shared(arrow_buffer); + return garrow_gpu_cuda_buffer_input_stream_new_raw(&arrow_reader); +} + + +G_DEFINE_TYPE(GArrowGPUCUDABufferOutputStream, + garrow_gpu_cuda_buffer_output_stream, + GARROW_TYPE_OUTPUT_STREAM) + +static void +garrow_gpu_cuda_buffer_output_stream_init(GArrowGPUCUDABufferOutputStream *object) +{ +} + +static void +garrow_gpu_cuda_buffer_output_stream_class_init(GArrowGPUCUDABufferOutputStreamClass *klass) +{ +} + +/** + * garrow_gpu_cuda_buffer_output_stream_new: + * @buffer: A #GArrowGPUCUDABuffer. + * + * Returns: (transfer full): A newly created + * #GArrowGPUCUDABufferOutputStream. + * + * Since: 0.8.0 + */ +GArrowGPUCUDABufferOutputStream * +garrow_gpu_cuda_buffer_output_stream_new(GArrowGPUCUDABuffer *buffer) +{ + auto arrow_buffer = garrow_gpu_cuda_buffer_get_raw(buffer); + auto arrow_writer = + std::make_shared(arrow_buffer); + return garrow_gpu_cuda_buffer_output_stream_new_raw(&arrow_writer); +} + +/** + * garrow_gpu_cuda_buffer_output_stream_set_buffer_size: + * @stream: A #GArrowGPUCUDABufferOutputStream. + * @size: A size of CPU buffer in bytes. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Sets CPU buffer size to limit `cudaMemcpy()` calls. If CPU buffer + * size is `0`, buffering is disabled. + * + * The default is `0`.
+ * + * Since: 0.8.0 + */ +gboolean +garrow_gpu_cuda_buffer_output_stream_set_buffer_size(GArrowGPUCUDABufferOutputStream *stream, + gint64 size, + GError **error) +{ + auto arrow_stream = garrow_gpu_cuda_buffer_output_stream_get_raw(stream); + auto status = arrow_stream->SetBufferSize(size); + return garrow_error_check(error, + status, + "[gpu][cuda][buffer-output-stream][set-buffer-size]"); +} + +/** + * garrow_gpu_cuda_buffer_output_stream_get_buffer_size: + * @stream: A #GArrowGPUCUDABufferOutputStream. + * + * Returns: The CPU buffer size in bytes. + * + * See garrow_gpu_cuda_buffer_output_stream_set_buffer_size() for CPU + * buffer size details. + * + * Since: 0.8.0 + */ +gint64 +garrow_gpu_cuda_buffer_output_stream_get_buffer_size(GArrowGPUCUDABufferOutputStream *stream) +{ + auto arrow_stream = garrow_gpu_cuda_buffer_output_stream_get_raw(stream); + return arrow_stream->buffer_size(); +} + +/** + * garrow_gpu_cuda_buffer_output_stream_get_buffered_size: + * @stream: A #GArrowGPUCUDABufferOutputStream. + * + * Returns: The size of buffered data in bytes. 
+ * + * Since: 0.8.0 + */ +gint64 +garrow_gpu_cuda_buffer_output_stream_get_buffered_size(GArrowGPUCUDABufferOutputStream *stream) +{ + auto arrow_stream = garrow_gpu_cuda_buffer_output_stream_get_raw(stream); + return arrow_stream->num_bytes_buffered(); +} + + +G_END_DECLS + +GArrowGPUCUDAContext * +garrow_gpu_cuda_context_new_raw(std::shared_ptr *arrow_context) +{ + return GARROW_GPU_CUDA_CONTEXT(g_object_new(GARROW_GPU_TYPE_CUDA_CONTEXT, + "context", arrow_context, + NULL)); +} + +std::shared_ptr +garrow_gpu_cuda_context_get_raw(GArrowGPUCUDAContext *context) +{ + if (!context) + return nullptr; + + auto priv = GARROW_GPU_CUDA_CONTEXT_GET_PRIVATE(context); + return priv->context; +} + +GArrowGPUCUDAIPCMemoryHandle * +garrow_gpu_cuda_ipc_memory_handle_new_raw(arrow::gpu::CudaIpcMemHandle *arrow_handle) +{ + auto handle = g_object_new(GARROW_GPU_TYPE_CUDA_IPC_MEMORY_HANDLE, + "ipc-memory-handle", arrow_handle, + NULL); + return GARROW_GPU_CUDA_IPC_MEMORY_HANDLE(handle); +} + +arrow::gpu::CudaIpcMemHandle * +garrow_gpu_cuda_ipc_memory_handle_get_raw(GArrowGPUCUDAIPCMemoryHandle *handle) +{ + if (!handle) + return nullptr; + + auto priv = GARROW_GPU_CUDA_IPC_MEMORY_HANDLE_GET_PRIVATE(handle); + return priv->ipc_memory_handle; +} + +GArrowGPUCUDABuffer * +garrow_gpu_cuda_buffer_new_raw(std::shared_ptr *arrow_buffer) +{ + return GARROW_GPU_CUDA_BUFFER(g_object_new(GARROW_GPU_TYPE_CUDA_BUFFER, + "buffer", arrow_buffer, + NULL)); +} + +std::shared_ptr +garrow_gpu_cuda_buffer_get_raw(GArrowGPUCUDABuffer *buffer) +{ + if (!buffer) + return nullptr; + + auto arrow_buffer = garrow_buffer_get_raw(GARROW_BUFFER(buffer)); + return std::static_pointer_cast(arrow_buffer); +} + +GArrowGPUCUDAHostBuffer * +garrow_gpu_cuda_host_buffer_new_raw(std::shared_ptr *arrow_buffer) +{ + auto buffer = g_object_new(GARROW_GPU_TYPE_CUDA_HOST_BUFFER, + "buffer", arrow_buffer, + NULL); + return GARROW_GPU_CUDA_HOST_BUFFER(buffer); +} + +std::shared_ptr 
+garrow_gpu_cuda_host_buffer_get_raw(GArrowGPUCUDAHostBuffer *buffer) +{ + if (!buffer) + return nullptr; + + auto arrow_buffer = garrow_buffer_get_raw(GARROW_BUFFER(buffer)); + return std::static_pointer_cast(arrow_buffer); +} + +GArrowGPUCUDABufferInputStream * +garrow_gpu_cuda_buffer_input_stream_new_raw(std::shared_ptr *arrow_reader) +{ + auto input_stream = g_object_new(GARROW_GPU_TYPE_CUDA_BUFFER_INPUT_STREAM, + "input-stream", arrow_reader, + NULL); + return GARROW_GPU_CUDA_BUFFER_INPUT_STREAM(input_stream); +} + +std::shared_ptr +garrow_gpu_cuda_buffer_input_stream_get_raw(GArrowGPUCUDABufferInputStream *input_stream) +{ + if (!input_stream) + return nullptr; + + auto arrow_reader = + garrow_input_stream_get_raw(GARROW_INPUT_STREAM(input_stream)); + return std::static_pointer_cast(arrow_reader); +} + +GArrowGPUCUDABufferOutputStream * +garrow_gpu_cuda_buffer_output_stream_new_raw(std::shared_ptr *arrow_writer) +{ + auto output_stream = g_object_new(GARROW_GPU_TYPE_CUDA_BUFFER_OUTPUT_STREAM, + "output-stream", arrow_writer, + NULL); + return GARROW_GPU_CUDA_BUFFER_OUTPUT_STREAM(output_stream); +} + +std::shared_ptr +garrow_gpu_cuda_buffer_output_stream_get_raw(GArrowGPUCUDABufferOutputStream *output_stream) +{ + if (!output_stream) + return nullptr; + + auto arrow_writer = + garrow_output_stream_get_raw(GARROW_OUTPUT_STREAM(output_stream)); + return std::static_pointer_cast(arrow_writer); +} diff --git a/c_glib/arrow-gpu-glib/cuda.h b/c_glib/arrow-gpu-glib/cuda.h new file mode 100644 index 0000000000000..7c615a144e739 --- /dev/null +++ b/c_glib/arrow-gpu-glib/cuda.h @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +G_BEGIN_DECLS + +#define GARROW_GPU_TYPE_CUDA_DEVICE_MANAGER \ + (garrow_gpu_cuda_device_manager_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowGPUCUDADeviceManager, + garrow_gpu_cuda_device_manager, + GARROW_GPU, + CUDA_DEVICE_MANAGER, + GObject) +struct _GArrowGPUCUDADeviceManagerClass +{ + GObjectClass parent_class; +}; + +#define GARROW_GPU_TYPE_CUDA_CONTEXT (garrow_gpu_cuda_context_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowGPUCUDAContext, + garrow_gpu_cuda_context, + GARROW_GPU, + CUDA_CONTEXT, + GObject) +struct _GArrowGPUCUDAContextClass +{ + GObjectClass parent_class; +}; + +#define GARROW_GPU_TYPE_CUDA_BUFFER (garrow_gpu_cuda_buffer_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowGPUCUDABuffer, + garrow_gpu_cuda_buffer, + GARROW_GPU, + CUDA_BUFFER, + GArrowBuffer) +struct _GArrowGPUCUDABufferClass +{ + GArrowBufferClass parent_class; +}; + +#define GARROW_GPU_TYPE_CUDA_HOST_BUFFER (garrow_gpu_cuda_host_buffer_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowGPUCUDAHostBuffer, + garrow_gpu_cuda_host_buffer, + GARROW_GPU, + CUDA_HOST_BUFFER, + GArrowMutableBuffer) +struct _GArrowGPUCUDAHostBufferClass +{ + GArrowMutableBufferClass parent_class; +}; + +#define GARROW_GPU_TYPE_CUDA_IPC_MEMORY_HANDLE \ + (garrow_gpu_cuda_ipc_memory_handle_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowGPUCUDAIPCMemoryHandle, + garrow_gpu_cuda_ipc_memory_handle, + GARROW_GPU, 
+ CUDA_IPC_MEMORY_HANDLE, + GObject) +struct _GArrowGPUCUDAIPCMemoryHandleClass +{ + GObjectClass parent_class; +}; + +#define GARROW_GPU_TYPE_CUDA_BUFFER_INPUT_STREAM \ + (garrow_gpu_cuda_buffer_input_stream_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowGPUCUDABufferInputStream, + garrow_gpu_cuda_buffer_input_stream, + GARROW_GPU, + CUDA_BUFFER_INPUT_STREAM, + GArrowBufferInputStream) +struct _GArrowGPUCUDABufferInputStreamClass +{ + GArrowBufferInputStreamClass parent_class; +}; + +#define GARROW_GPU_TYPE_CUDA_BUFFER_OUTPUT_STREAM \ + (garrow_gpu_cuda_buffer_output_stream_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowGPUCUDABufferOutputStream, + garrow_gpu_cuda_buffer_output_stream, + GARROW_GPU, + CUDA_BUFFER_OUTPUT_STREAM, + GArrowOutputStream) +struct _GArrowGPUCUDABufferOutputStreamClass +{ + GArrowOutputStreamClass parent_class; +}; + +GArrowGPUCUDADeviceManager * +garrow_gpu_cuda_device_manager_new(GError **error); + +GArrowGPUCUDAContext * +garrow_gpu_cuda_device_manager_get_context(GArrowGPUCUDADeviceManager *manager, + gint gpu_number, + GError **error); +gsize +garrow_gpu_cuda_device_manager_get_n_devices(GArrowGPUCUDADeviceManager *manager); + +gint64 +garrow_gpu_cuda_context_get_allocated_size(GArrowGPUCUDAContext *context); + + +GArrowGPUCUDABuffer * +garrow_gpu_cuda_buffer_new(GArrowGPUCUDAContext *context, + gint64 size, + GError **error); +GArrowGPUCUDABuffer * +garrow_gpu_cuda_buffer_new_ipc(GArrowGPUCUDAContext *context, + GArrowGPUCUDAIPCMemoryHandle *handle, + GError **error); +GArrowGPUCUDABuffer * +garrow_gpu_cuda_buffer_new_record_batch(GArrowGPUCUDAContext *context, + GArrowRecordBatch *record_batch, + GError **error); +GBytes * +garrow_gpu_cuda_buffer_copy_to_host(GArrowGPUCUDABuffer *buffer, + gint64 position, + gint64 size, + GError **error); +gboolean +garrow_gpu_cuda_buffer_copy_from_host(GArrowGPUCUDABuffer *buffer, + const guint8 *data, + gint64 size, + GError **error); +GArrowGPUCUDAIPCMemoryHandle * 
+garrow_gpu_cuda_buffer_export(GArrowGPUCUDABuffer *buffer, + GError **error); +GArrowGPUCUDAContext * +garrow_gpu_cuda_buffer_get_context(GArrowGPUCUDABuffer *buffer); +GArrowRecordBatch * +garrow_gpu_cuda_buffer_read_record_batch(GArrowGPUCUDABuffer *buffer, + GArrowSchema *schema, + GError **error); + + +GArrowGPUCUDAHostBuffer * +garrow_gpu_cuda_host_buffer_new(gint64 size, GError **error); + +GArrowGPUCUDAIPCMemoryHandle * +garrow_gpu_cuda_ipc_memory_handle_new(const guint8 *data, + gsize size, + GError **error); + +GArrowBuffer * +garrow_gpu_cuda_ipc_memory_handle_serialize(GArrowGPUCUDAIPCMemoryHandle *handle, + GError **error); + +GArrowGPUCUDABufferInputStream * +garrow_gpu_cuda_buffer_input_stream_new(GArrowGPUCUDABuffer *buffer); + +GArrowGPUCUDABufferOutputStream * +garrow_gpu_cuda_buffer_output_stream_new(GArrowGPUCUDABuffer *buffer); + +gboolean +garrow_gpu_cuda_buffer_output_stream_set_buffer_size(GArrowGPUCUDABufferOutputStream *stream, + gint64 size, + GError **error); +gint64 +garrow_gpu_cuda_buffer_output_stream_get_buffer_size(GArrowGPUCUDABufferOutputStream *stream); +gint64 +garrow_gpu_cuda_buffer_output_stream_get_buffered_size(GArrowGPUCUDABufferOutputStream *stream); + +G_END_DECLS diff --git a/c_glib/arrow-gpu-glib/cuda.hpp b/c_glib/arrow-gpu-glib/cuda.hpp new file mode 100644 index 0000000000000..3eeff8b6f18ed --- /dev/null +++ b/c_glib/arrow-gpu-glib/cuda.hpp @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +#include + +GArrowGPUCUDAContext * +garrow_gpu_cuda_context_new_raw(std::shared_ptr *arrow_context); +std::shared_ptr +garrow_gpu_cuda_context_get_raw(GArrowGPUCUDAContext *context); + +GArrowGPUCUDAIPCMemoryHandle * +garrow_gpu_cuda_ipc_memory_handle_new_raw(arrow::gpu::CudaIpcMemHandle *arrow_handle); +arrow::gpu::CudaIpcMemHandle * +garrow_gpu_cuda_ipc_memory_handle_get_raw(GArrowGPUCUDAIPCMemoryHandle *handle); + +GArrowGPUCUDABuffer * +garrow_gpu_cuda_buffer_new_raw(std::shared_ptr *arrow_buffer); +std::shared_ptr +garrow_gpu_cuda_buffer_get_raw(GArrowGPUCUDABuffer *buffer); + +GArrowGPUCUDAHostBuffer * +garrow_gpu_cuda_host_buffer_new_raw(std::shared_ptr *arrow_buffer); +std::shared_ptr +garrow_gpu_cuda_host_buffer_get_raw(GArrowGPUCUDAHostBuffer *buffer); + +GArrowGPUCUDABufferInputStream * +garrow_gpu_cuda_buffer_input_stream_new_raw(std::shared_ptr *arrow_reader); +std::shared_ptr +garrow_gpu_cuda_buffer_input_stream_get_raw(GArrowGPUCUDABufferInputStream *input_stream); + +GArrowGPUCUDABufferOutputStream * +garrow_gpu_cuda_buffer_output_stream_new_raw(std::shared_ptr *arrow_writer); +std::shared_ptr +garrow_gpu_cuda_buffer_output_stream_get_raw(GArrowGPUCUDABufferOutputStream *output_stream); diff --git a/c_glib/arrow-gpu-glib/meson.build b/c_glib/arrow-gpu-glib/meson.build new file mode 100644 index 0000000000000..00c7f079d6485 --- /dev/null +++ b/c_glib/arrow-gpu-glib/meson.build @@ -0,0 +1,80 @@ +# -*- indent-tabs-mode: nil -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or 
more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +sources = files( + 'cuda.cpp', +) + +c_headers = files( + 'arrow-gpu-glib.h', + 'cuda.h', +) + +cpp_headers = files( + 'arrow-gpu-glib.hpp', + 'cuda.hpp', +) + +headers = c_headers + cpp_headers +install_headers(headers, subdir: 'arrow-gpu-glib') + + +dependencies = [ + arrow_gpu_dependency, + libarrow_glib_dependency, +] +libarrow_gpu_glib = library('arrow-gpu-glib', + sources: sources, + install: true, + dependencies: dependencies, + include_directories: [ + root_inc, + ], + soversion: so_version, + version: library_version) +libarrow_gpu_glib_dependency = declare_dependency(link_with: libarrow_gpu_glib, + include_directories: [ + root_inc, + ], + dependencies: dependencies) + +pkgconfig.generate(filebase: 'arrow-gpu-glib', + name: 'Apache Arrow GPU GLib', + description: 'C API for Apache Arrow GPU based on GLib', + version: version, + requires: ['arrow-glib', 'arrow-gpu'], + libraries: [libarrow_gpu_glib], + subdirs: ['arrow-gpu-glib']) + +gnome.generate_gir(libarrow_gpu_glib, + dependencies: arrow_glib_gir_dependency, + sources: sources + c_headers, + namespace: 'ArrowGPU', + nsversion: api_version, + identifier_prefix: 'GArrowGPU', + symbol_prefix: 'garrow_gpu', + export_packages: 'arrow-gpu-glib', + includes: [ + 'Arrow-1.0', + ], + install: 
true, + extra_args: [ + '--warn-all', + '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', + ]) diff --git a/c_glib/configure.ac b/c_glib/configure.ac index 5db435275a300..eabe7bad51227 100644 --- a/c_glib/configure.ac +++ b/c_glib/configure.ac @@ -34,6 +34,19 @@ AC_CONFIG_HEADERS([config.h]) AM_INIT_AUTOMAKE([1.13 foreign]) AM_SILENT_RULES([yes]) +AC_CANONICAL_HOST +AC_MSG_CHECKING([for macOS]) +case "$host_os" in +darwin*) + os_macos=yes + ;; +*) + os_macos=no + ;; +esac +AC_MSG_RESULT([$os_macos]) +AM_CONDITIONAL(OS_MACOS, test "$os_macos" = "yes") + AC_PROG_CC AC_PROG_CXX AX_CXX_COMPILE_STDCXX_11([ext], [mandatory]) @@ -76,19 +89,47 @@ AC_ARG_WITH(arrow-cpp-build-dir, [GARROW_ARROW_CPP_BUILD_DIR="$withval"], [GARROW_ARROW_CPP_BUILD_DIR=""]) if test "x$GARROW_ARROW_CPP_BUILD_DIR" = "x"; then + USE_ARROW_BUILD_DIR=no + PKG_CHECK_MODULES([ARROW], [arrow arrow-compute]) + PKG_CHECK_MODULES([ARROW_GPU], + [arrow-gpu], + [HAVE_ARROW_GPU=yes], + [HAVE_ARROW_GPU=no]) else - ARROW_INCLUDE_DIR="\$(abs_top_srcdir)/../cpp/src" - ARROW_LIB_DIR="${GARROW_ARROW_CPP_BUILD_DIR}/${GARROW_ARROW_CPP_BUILD_TYPE}" + USE_ARROW_BUILD_DIR=yes - ARROW_CFLAGS="-I${ARROW_INCLUDE_DIR}" - - ARROW_LIBS="-L${ARROW_LIB_DIR} -larrow" + ARROW_BUILD_DIR="${GARROW_ARROW_CPP_BUILD_DIR}" + AC_SUBST(ARROW_BUILD_DIR) + ARROW_SOURCE_INCLUDE_DIR="\$(abs_top_srcdir)/../cpp/src" + ARROW_BUILD_INCLUDE_DIR="${GARROW_ARROW_CPP_BUILD_DIR}/src" + ARROW_LIB_DIR="${GARROW_ARROW_CPP_BUILD_DIR}/${GARROW_ARROW_CPP_BUILD_TYPE}" AC_SUBST(ARROW_LIB_DIR) + ARROW_CFLAGS="-I${ARROW_BUILD_INCLUDE_DIR} -I${ARROW_SOURCE_INCLUDE_DIR}" + ARROW_LIBS="-L${ARROW_LIB_DIR} -larrow" AC_SUBST(ARROW_CFLAGS) AC_SUBST(ARROW_LIBS) + + ARROW_GPU_CFLAGS="" + if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/arrow/gpu/arrow-gpu.pc"; then + HAVE_ARROW_GPU=yes + ARROW_GPU_LIBS="-larrow_gpu" + else + HAVE_ARROW_GPU=no + ARROW_GPU_LIBS="" + fi + AC_SUBST(ARROW_GPU_CFLAGS) + AC_SUBST(ARROW_GPU_LIBS) +fi + +AM_CONDITIONAL([USE_ARROW_BUILD_DIR], + 
[test "$USE_ARROW_BUILD_DIR" = "yes"]) + +AM_CONDITIONAL([HAVE_ARROW_GPU], [test "$HAVE_ARROW_GPU" = "yes"]) +if test "$HAVE_ARROW_GPU" = "yes"; then + AC_DEFINE(HAVE_ARROW_GPU, [1], [Define to 1 if Apache Arrow supports GPU.]) fi exampledir="\$(datadir)/arrow-glib/example" @@ -98,6 +139,8 @@ AC_CONFIG_FILES([ Makefile arrow-glib/Makefile arrow-glib/arrow-glib.pc + arrow-gpu-glib/Makefile + arrow-gpu-glib/arrow-gpu-glib.pc doc/Makefile doc/reference/Makefile doc/reference/xml/Makefile diff --git a/c_glib/doc/Makefile.am b/c_glib/doc/Makefile.am index 85c1d5126097c..1d491ab09110e 100644 --- a/c_glib/doc/Makefile.am +++ b/c_glib/doc/Makefile.am @@ -16,4 +16,4 @@ # under the License. SUBDIRS = \ - reference + reference diff --git a/c_glib/doc/reference/Makefile.am b/c_glib/doc/reference/Makefile.am index 45b11f035183e..4c005c237b300 100644 --- a/c_glib/doc/reference/Makefile.am +++ b/c_glib/doc/reference/Makefile.am @@ -20,7 +20,7 @@ SUBDIRS = \ DOC_MODULE = arrow-glib -DOC_MAIN_SGML_FILE = $(DOC_MODULE)-docs.sgml +DOC_MAIN_SGML_FILE = $(DOC_MODULE)-docs.xml DOC_SOURCE_DIR = \ $(top_srcdir)/arrow-glib @@ -51,6 +51,17 @@ AM_CFLAGS = \ GTKDOC_LIBS = \ $(top_builddir)/arrow-glib/libarrow-glib.la +if HAVE_ARROW_GPU +DOC_SOURCE_DIR += \ + $(top_srcdir)/arrow-gpu-glib +HFILE_GLOB += \ + $(top_srcdir)/arrow-gpu-glib/*.h +CFILE_GLOB += \ + $(top_srcdir)/arrow-gpu-glib/*.cpp +GTKDOC_LIBS += \ + $(top_builddir)/arrow-gpu-glib/libarrow-gpu-glib.la +endif + include $(srcdir)/gtk-doc.make CLEANFILES += \ diff --git a/c_glib/doc/reference/arrow-glib-docs.sgml b/c_glib/doc/reference/arrow-glib-docs.xml similarity index 95% rename from c_glib/doc/reference/arrow-glib-docs.sgml rename to c_glib/doc/reference/arrow-glib-docs.xml index a504ef1148383..51e7b2a6a6cf5 100644 --- a/c_glib/doc/reference/arrow-glib-docs.sgml +++ b/c_glib/doc/reference/arrow-glib-docs.xml @@ -40,7 +40,8 @@ Data Array - + + Array builder @@ -125,6 +126,16 @@ + + Object Hierarchy diff --git 
a/c_glib/doc/reference/meson.build b/c_glib/doc/reference/meson.build index 08936daf87288..3374fbde5b9ed 100644 --- a/c_glib/doc/reference/meson.build +++ b/c_glib/doc/reference/meson.build @@ -32,13 +32,26 @@ glib_prefix = dependency('glib-2.0').get_pkgconfig_variable('prefix') glib_doc_path = join_paths(glib_prefix, 'share', 'gtk-doc', 'html') doc_path = join_paths(data_dir, meson.project_name(), 'gtk-doc', 'html') +source_directories = [ + join_paths(meson.source_root(), 'arrow-glib'), + join_paths(meson.build_root(), 'arrow-glib'), +] +dependencies = [ + libarrow_glib_dependency, +] +if arrow_gpu_dependency.found() + source_directories += [ + join_paths(meson.source_root(), 'arrow-gpu-glib'), + join_paths(meson.build_root(), 'arrow-gpu-glib'), + ] + dependencies += [ + libarrow_gpu_glib_dependency, + ] +endif gnome.gtkdoc(meson.project_name(), - main_xml: meson.project_name() + '-docs.sgml', - src_dir: [ - join_paths(meson.source_root(), 'arrow-glib'), - join_paths(meson.build_root(), 'arrow-glib'), - ], - dependencies: libarrow_glib_dependency, + main_xml: meson.project_name() + '-docs.xml', + src_dir: source_directories, + dependencies: dependencies, gobject_typesfile: meson.project_name() + '.types', scan_args: [ '--rebuild-types', diff --git a/c_glib/example/Makefile.am b/c_glib/example/Makefile.am index 3eaf808bc5aa6..f4aca70c657f2 100644 --- a/c_glib/example/Makefile.am +++ b/c_glib/example/Makefile.am @@ -33,6 +33,10 @@ AM_CFLAGS = \ AM_LDFLAGS = \ $(GLIB_LIBS) \ $(builddir)/../arrow-glib/libarrow-glib.la +if USE_ARROW_BUILD_DIR +AM_LDFLAGS += \ + $(ARROW_LIBS) +endif noinst_PROGRAMS = \ build \ diff --git a/c_glib/example/go/read-batch.go b/c_glib/example/go/read-batch.go index ef1a7fb5a69f9..1472939cd9284 100644 --- a/c_glib/example/go/read-batch.go +++ b/c_glib/example/go/read-batch.go @@ -57,8 +57,8 @@ func PrintColumnValue(column *arrow.Array, i int64) { func PrintRecordBatch(recordBatch *arrow.RecordBatch) { nColumns := recordBatch.GetNColumns() 
for i := uint32(0); i < nColumns; i++ { - column := recordBatch.GetColumn(i) - columnName := recordBatch.GetColumnName(i) + column := recordBatch.GetColumn(int32(i)) + columnName := recordBatch.GetColumnName(int32(i)) fmt.Printf(" %s: [", columnName) nRows := recordBatch.GetNRows() for j := int64(0); j < nRows; j++ { diff --git a/c_glib/example/go/read-stream.go b/c_glib/example/go/read-stream.go index 7bd076473f667..ed75a96c95c39 100644 --- a/c_glib/example/go/read-stream.go +++ b/c_glib/example/go/read-stream.go @@ -57,8 +57,8 @@ func PrintColumnValue(column *arrow.Array, i int64) { func PrintRecordBatch(recordBatch *arrow.RecordBatch) { nColumns := recordBatch.GetNColumns() for i := uint32(0); i < nColumns; i++ { - column := recordBatch.GetColumn(i) - columnName := recordBatch.GetColumnName(i) + column := recordBatch.GetColumn(int32(i)) + columnName := recordBatch.GetColumnName(int32(i)) fmt.Printf(" %s: [", columnName) nRows := recordBatch.GetNRows() for j := int64(0); j < nRows; j++ { diff --git a/c_glib/example/go/write-batch.go b/c_glib/example/go/write-batch.go index 9dbc3c00acc50..f4d03ed922eb5 100644 --- a/c_glib/example/go/write-batch.go +++ b/c_glib/example/go/write-batch.go @@ -188,7 +188,10 @@ func main() { BuildDoubleArray(), } - recordBatch := arrow.NewRecordBatch(schema, 4, columns) + recordBatch, err := arrow.NewRecordBatch(schema, 4, columns) + if err != nil { + log.Fatalf("Failed to create record batch #1: %v", err) + } _, err = writer.WriteRecordBatch(recordBatch) if err != nil { log.Fatalf("Failed to write record batch #1: %v", err) @@ -198,7 +201,10 @@ func main() { for i, column := range columns { slicedColumns[i] = column.Slice(1, 3) } - recordBatch = arrow.NewRecordBatch(schema, 3, slicedColumns) + recordBatch, err = arrow.NewRecordBatch(schema, 3, slicedColumns) + if err != nil { + log.Fatalf("Failed to create record batch #2: %v", err) + } _, err = writer.WriteRecordBatch(recordBatch) if err != nil { log.Fatalf("Failed to write record 
batch #2: %v", err) diff --git a/c_glib/example/go/write-stream.go b/c_glib/example/go/write-stream.go index 244741e8cfeb0..7225156a7be84 100644 --- a/c_glib/example/go/write-stream.go +++ b/c_glib/example/go/write-stream.go @@ -188,7 +188,10 @@ func main() { BuildDoubleArray(), } - recordBatch := arrow.NewRecordBatch(schema, 4, columns) + recordBatch, err := arrow.NewRecordBatch(schema, 4, columns) + if err != nil { + log.Fatalf("Failed to create record batch #1: %v", err) + } _, err = writer.WriteRecordBatch(recordBatch) if err != nil { log.Fatalf("Failed to write record batch #1: %v", err) @@ -198,7 +201,10 @@ func main() { for i, column := range columns { slicedColumns[i] = column.Slice(1, 3) } - recordBatch = arrow.NewRecordBatch(schema, 3, slicedColumns) + recordBatch, err = arrow.NewRecordBatch(schema, 3, slicedColumns) + if err != nil { + log.Fatalf("Failed to create record batch #2: %v", err) + } writer.WriteRecordBatch(recordBatch) _, err = writer.WriteRecordBatch(recordBatch) if err != nil { diff --git a/c_glib/meson.build b/c_glib/meson.build index 1fa64ba19c406..9fe1b8cbd7179 100644 --- a/c_glib/meson.build +++ b/c_glib/meson.build @@ -49,6 +49,10 @@ pkgconfig = import('pkgconfig') root_inc = include_directories('.') subdir('arrow-glib') +arrow_gpu_dependency = dependency('arrow-gpu', required: false) +if arrow_gpu_dependency.found() + subdir('arrow-gpu-glib') +endif subdir('example') if get_option('enable_gtk_doc') @@ -58,4 +62,7 @@ endif run_test = find_program('test/run-test.sh') test('unit test', run_test, - env: ['ARROW_GLIB_TYPELIB_DIR=@0@/arrow-glib'.format(meson.build_root())]) + env: [ + 'ARROW_GLIB_TYPELIB_DIR=@0@/arrow-glib'.format(meson.build_root()), + 'ARROW_GPU_GLIB_TYPELIB_DIR=@0@/arrow-gpu-glib'.format(meson.build_root()), + ]) diff --git a/c_glib/test/run-test.rb b/c_glib/test/run-test.rb index 3451bd29fde1b..392c56f33ae51 100755 --- a/c_glib/test/run-test.rb +++ b/c_glib/test/run-test.rb @@ -37,6 +37,12 @@ def initialize(data) end 
end +begin + ArrowGPU = GI.load("ArrowGPU") +rescue GObjectIntrospection::RepositoryError::TypelibNotFound +end + +require "rbconfig" require "tempfile" require_relative "helper/buildable" require_relative "helper/omittable" diff --git a/c_glib/test/run-test.sh b/c_glib/test/run-test.sh index 19ccf077833a5..d563e8586ce59 100755 --- a/c_glib/test/run-test.sh +++ b/c_glib/test/run-test.sh @@ -20,27 +20,34 @@ test_dir="$(cd $(dirname $0); pwd)" build_dir="$(cd .; pwd)" -arrow_glib_build_dir="${build_dir}/arrow-glib/" -libtool_dir="${arrow_glib_build_dir}/.libs" -if [ -d "${libtool_dir}" ]; then - LD_LIBRARY_PATH="${libtool_dir}:${LD_LIBRARY_PATH}" -else - if [ -d "${arrow_glib_build_dir}" ]; then - LD_LIBRARY_PATH="${arrow_glib_build_dir}:${LD_LIBRARY_PATH}" +modules="arrow-glib arrow-gpu-glib" + +for module in ${modules}; do + module_build_dir="${build_dir}/${module}" + libtool_dir="${module_build_dir}/.libs" + if [ -d "${libtool_dir}" ]; then + LD_LIBRARY_PATH="${libtool_dir}:${LD_LIBRARY_PATH}" + else + if [ -d "${module_build_dir}" ]; then + LD_LIBRARY_PATH="${module_build_dir}:${LD_LIBRARY_PATH}" + fi fi -fi +done if [ -f "Makefile" -a "${NO_MAKE}" != "yes" ]; then make -j8 > /dev/null || exit $? 
fi -arrow_glib_typelib_dir="${ARROW_GLIB_TYPELIB_DIR}" -if [ -z "${arrow_glib_typelib_dir}" ]; then - arrow_glib_typelib_dir="${build_dir}/arrow-glib" -fi +for module in ${modules}; do + MODULE_TYPELIB_DIR_VAR_NAME="$(echo ${module} | tr a-z- A-Z_)_TYPELIB_DIR" + module_typelib_dir=$(eval "echo \${${MODULE_TYPELIB_DIR_VAR_NAME}}") + if [ -z "${module_typelib_dir}" ]; then + module_typelib_dir="${build_dir}/${module}" + fi -if [ -d "${arrow_glib_typelib_dir}" ]; then - GI_TYPELIB_PATH="${arrow_glib_typelib_dir}:${GI_TYPELIB_PATH}" -fi + if [ -d "${module_typelib_dir}" ]; then + GI_TYPELIB_PATH="${module_typelib_dir}:${GI_TYPELIB_PATH}" + fi +done ${GDB} ruby ${test_dir}/run-test.rb "$@" diff --git a/c_glib/test/test-array-builder.rb b/c_glib/test/test-array-builder.rb index 92976a424ccad..a773131e15b5d 100644 --- a/c_glib/test/test-array-builder.rb +++ b/c_glib/test/test-array-builder.rb @@ -76,6 +76,18 @@ def test_negative end end +module ArrayBuilderValueTypeTests + def test_value_data_type + assert_equal(value_data_type, + build_array(sample_values).value_data_type) + end + + def test_value_type + assert_equal(value_data_type.id, + build_array(sample_values).value_type) + end +end + class TestArrayBuilder < Test::Unit::TestCase include Helper::Buildable include Helper::Omittable @@ -93,6 +105,10 @@ def create_builder Arrow::BooleanArrayBuilder.new end + def value_data_type + Arrow::BooleanDataType.new + end + def builder_class_name "boolean-array-builder" end @@ -101,6 +117,10 @@ def sample_values [true, false, true] end + sub_test_case("value type") do + include ArrayBuilderValueTypeTests + end + sub_test_case("#append_values") do include ArrayBuilderAppendValuesTests end @@ -115,6 +135,10 @@ def create_builder Arrow::IntArrayBuilder.new end + def value_data_type + Arrow::Int8DataType.new + end + def builder_class_name "int-array-builder" end @@ -123,6 +147,10 @@ def sample_values [1, -2, 3] end + sub_test_case("value type") do + include 
ArrayBuilderValueTypeTests + end + sub_test_case("#append_values") do include ArrayBuilderAppendValuesTests end @@ -137,6 +165,10 @@ def create_builder Arrow::UIntArrayBuilder.new end + def value_data_type + Arrow::UInt8DataType.new + end + def builder_class_name "uint-array-builder" end @@ -145,6 +177,10 @@ def sample_values [1, 2, 3] end + sub_test_case("value type") do + include ArrayBuilderValueTypeTests + end + sub_test_case("#append_values") do include ArrayBuilderAppendValuesTests end @@ -159,6 +195,10 @@ def create_builder Arrow::Int8ArrayBuilder.new end + def value_data_type + Arrow::Int8DataType.new + end + def builder_class_name "int8-array-builder" end @@ -167,6 +207,10 @@ def sample_values [1, -2, 3] end + sub_test_case("value type") do + include ArrayBuilderValueTypeTests + end + sub_test_case("#append_values") do include ArrayBuilderAppendValuesTests end @@ -181,6 +225,10 @@ def create_builder Arrow::UInt8ArrayBuilder.new end + def value_data_type + Arrow::UInt8DataType.new + end + def builder_class_name "uint8-array-builder" end @@ -189,6 +237,10 @@ def sample_values [1, 2, 3] end + sub_test_case("value type") do + include ArrayBuilderValueTypeTests + end + sub_test_case("#append_values") do include ArrayBuilderAppendValuesTests end @@ -203,6 +255,10 @@ def create_builder Arrow::Int16ArrayBuilder.new end + def value_data_type + Arrow::Int16DataType.new + end + def builder_class_name "int16-array-builder" end @@ -211,6 +267,10 @@ def sample_values [1, -2, 3] end + sub_test_case("value type") do + include ArrayBuilderValueTypeTests + end + sub_test_case("#append_values") do include ArrayBuilderAppendValuesTests end @@ -225,6 +285,10 @@ def create_builder Arrow::UInt16ArrayBuilder.new end + def value_data_type + Arrow::UInt16DataType.new + end + def builder_class_name "uint16-array-builder" end @@ -233,6 +297,10 @@ def sample_values [1, 2, 3] end + sub_test_case("value type") do + include ArrayBuilderValueTypeTests + end + 
sub_test_case("#append_values") do include ArrayBuilderAppendValuesTests end @@ -247,6 +315,10 @@ def create_builder Arrow::Int32ArrayBuilder.new end + def value_data_type + Arrow::Int32DataType.new + end + def builder_class_name "int32-array-builder" end @@ -255,6 +327,10 @@ def sample_values [1, -2, 3] end + sub_test_case("value type") do + include ArrayBuilderValueTypeTests + end + sub_test_case("#append_values") do include ArrayBuilderAppendValuesTests end @@ -269,6 +345,10 @@ def create_builder Arrow::UInt32ArrayBuilder.new end + def value_data_type + Arrow::UInt32DataType.new + end + def builder_class_name "uint32-array-builder" end @@ -277,6 +357,10 @@ def sample_values [1, 2, 3] end + sub_test_case("value type") do + include ArrayBuilderValueTypeTests + end + sub_test_case("#append_values") do include ArrayBuilderAppendValuesTests end @@ -291,6 +375,10 @@ def create_builder Arrow::Int64ArrayBuilder.new end + def value_data_type + Arrow::Int64DataType.new + end + def builder_class_name "int64-array-builder" end @@ -299,6 +387,10 @@ def sample_values [1, -2, 3] end + sub_test_case("value type") do + include ArrayBuilderValueTypeTests + end + sub_test_case("#append_values") do include ArrayBuilderAppendValuesTests end @@ -313,6 +405,10 @@ def create_builder Arrow::UInt64ArrayBuilder.new end + def value_data_type + Arrow::UInt64DataType.new + end + def builder_class_name "uint64-array-builder" end @@ -321,6 +417,10 @@ def sample_values [1, 2, 3] end + sub_test_case("value type") do + include ArrayBuilderValueTypeTests + end + sub_test_case("#append_values") do include ArrayBuilderAppendValuesTests end @@ -335,6 +435,10 @@ def create_builder Arrow::FloatArrayBuilder.new end + def value_data_type + Arrow::FloatDataType.new + end + def builder_class_name "float-array-builder" end @@ -343,6 +447,10 @@ def sample_values [1.1, -2.2, 3.3] end + sub_test_case("value type") do + include ArrayBuilderValueTypeTests + end + sub_test_case("#append_values") do include 
ArrayBuilderAppendValuesTests end @@ -357,6 +465,10 @@ def create_builder Arrow::DoubleArrayBuilder.new end + def value_data_type + Arrow::DoubleDataType.new + end + def builder_class_name "double-array-builder" end @@ -365,6 +477,10 @@ def sample_values [1.1, -2.2, 3.3] end + sub_test_case("value type") do + include ArrayBuilderValueTypeTests + end + sub_test_case("#append_values") do include ArrayBuilderAppendValuesTests end @@ -379,6 +495,10 @@ def create_builder Arrow::Date32ArrayBuilder.new end + def value_data_type + Arrow::Date32DataType.new + end + def builder_class_name "date32-array-builder" end @@ -391,6 +511,10 @@ def sample_values ] end + sub_test_case("value type") do + include ArrayBuilderValueTypeTests + end + sub_test_case("#append_values") do include ArrayBuilderAppendValuesTests end @@ -405,6 +529,10 @@ def create_builder Arrow::Date64ArrayBuilder.new end + def value_data_type + Arrow::Date64DataType.new + end + def builder_class_name "date64-array-builder" end @@ -417,6 +545,10 @@ def sample_values ] end + sub_test_case("value type") do + include ArrayBuilderValueTypeTests + end + sub_test_case("#append_values") do include ArrayBuilderAppendValuesTests end @@ -432,6 +564,10 @@ def create_builder Arrow::TimestampArrayBuilder.new(data_type) end + def value_data_type + Arrow::TimestampDataType.new(:milli) + end + def builder_class_name "timestamp-array-builder" end @@ -444,6 +580,10 @@ def sample_values ] end + sub_test_case("value type") do + include ArrayBuilderValueTypeTests + end + sub_test_case("#append_values") do include ArrayBuilderAppendValuesTests end @@ -459,6 +599,10 @@ def create_builder Arrow::Time32ArrayBuilder.new(data_type) end + def value_data_type + Arrow::Time32DataType.new(:second) + end + def builder_class_name "time32-array-builder" end @@ -471,6 +615,10 @@ def sample_values ] end + sub_test_case("value type") do + include ArrayBuilderValueTypeTests + end + sub_test_case("#append_values") do include 
ArrayBuilderAppendValuesTests end @@ -486,6 +634,10 @@ def create_builder Arrow::Time64ArrayBuilder.new(data_type) end + def value_data_type + Arrow::Time64DataType.new(:micro) + end + def builder_class_name "time64-array-builder" end @@ -498,6 +650,10 @@ def sample_values ] end + sub_test_case("value type") do + include ArrayBuilderValueTypeTests + end + sub_test_case("#append_values") do include ArrayBuilderAppendValuesTests end diff --git a/c_glib/test/test-chunked-array.rb b/c_glib/test/test-chunked-array.rb index cde7a8b0c61f1..9287058e1abc6 100644 --- a/c_glib/test/test-chunked-array.rb +++ b/c_glib/test/test-chunked-array.rb @@ -31,6 +31,24 @@ def test_equal Arrow::ChunkedArray.new(chunks2)) end + def test_value_data_type + chunks = [ + build_boolean_array([true, false]), + build_boolean_array([true]), + ] + assert_equal(Arrow::BooleanDataType.new, + Arrow::ChunkedArray.new(chunks).value_data_type) + end + + def test_value_type + chunks = [ + build_boolean_array([true, false]), + build_boolean_array([true]), + ] + assert_equal(Arrow::Type::BOOL, + Arrow::ChunkedArray.new(chunks).value_type) + end + def test_length chunks = [ build_boolean_array([true, false]), diff --git a/c_glib/test/test-dictionary-array.rb b/c_glib/test/test-dictionary-array.rb new file mode 100644 index 0000000000000..d4f4b3443057d --- /dev/null +++ b/c_glib/test/test-dictionary-array.rb @@ -0,0 +1,63 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDictionaryArray < Test::Unit::TestCase + include Helper::Buildable + + def setup + @index_data_type = Arrow::Int32DataType.new + @dictionary = build_string_array(["C", "C++", "Ruby"]) + @ordered = false + @data_type = Arrow::DictionaryDataType.new(@index_data_type, + @dictionary, + @ordered) + end + + sub_test_case(".new") do + def test_new + indices = build_int32_array([0, 2, 2, 1, 0]) + dictionary_array = Arrow::DictionaryArray.new(@data_type, indices) + assert_equal(<<-STRING.chomp, dictionary_array.to_s) + +-- is_valid: all not null +-- dictionary: ["C", "C++", "Ruby"] +-- indices: [0, 2, 2, 1, 0] + STRING + end + end + + sub_test_case("instance methods") do + def setup + super + @indices = build_int32_array([0, 2, 2, 1, 0]) + @dictionary_array = Arrow::DictionaryArray.new(@data_type, @indices) + end + + def test_indices + assert_equal(@indices, @dictionary_array.indices) + end + + def test_dictionary + assert_equal(@dictionary, @dictionary_array.dictionary) + end + + def test_dictionary_data_type + assert_equal(@data_type, + @dictionary_array.dictionary_data_type) + end + end +end diff --git a/c_glib/test/test-dictionary-data-type.rb b/c_glib/test/test-dictionary-data-type.rb new file mode 100644 index 0000000000000..5530a0415cb28 --- /dev/null +++ b/c_glib/test/test-dictionary-data-type.rb @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDictionaryDataType < Test::Unit::TestCase + include Helper::Buildable + + def setup + @index_data_type = Arrow::Int32DataType.new + @dictionary = build_string_array(["C", "C++", "Ruby"]) + @ordered = true + @data_type = Arrow::DictionaryDataType.new(@index_data_type, + @dictionary, + @ordered) + end + + def test_type + assert_equal(Arrow::Type::DICTIONARY, @data_type.id) + end + + def test_to_s + assert_equal("dictionary", + @data_type.to_s) + end + + def test_bit_width + assert_equal(32, @data_type.bit_width) + end + + def test_index_data_type + assert_equal(@index_data_type, @data_type.index_data_type) + end + + def test_dictionary + assert_equal(@dictionary, @data_type.dictionary) + end + + def test_ordered? + assert do + @data_type.ordered? + end + end +end diff --git a/c_glib/test/test-dictionary-encode.rb b/c_glib/test/test-dictionary-encode.rb new file mode 100644 index 0000000000000..ea77be64ae41d --- /dev/null +++ b/c_glib/test/test-dictionary-encode.rb @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDictionaryEncode < Test::Unit::TestCase + include Helper::Buildable + include Helper::Omittable + + def test_int32 + array = build_int32_array([1, 3, 1, -1, -3, -1]) + assert_equal(<<-STRING.chomp, array.dictionary_encode.to_s) + +-- is_valid: all not null +-- dictionary: [1, 3, -1, -3] +-- indices: [0, 1, 0, 2, 3, 2] + STRING + end + + def test_string + array = build_string_array(["Ruby", "Python", "Ruby"]) + assert_equal(<<-STRING.chomp, array.dictionary_encode.to_s) + +-- is_valid: all not null +-- dictionary: ["Ruby", "Python"] +-- indices: [0, 1, 0] + STRING + end +end diff --git a/c_glib/test/test-file-writer.rb b/c_glib/test/test-file-writer.rb index 6ba5c7aebca44..67aed85f73b48 100644 --- a/c_glib/test/test-file-writer.rb +++ b/c_glib/test/test-file-writer.rb @@ -16,15 +16,21 @@ # under the License. 
class TestFileWriter < Test::Unit::TestCase + include Helper::Buildable + def test_write_record_batch + data = [true] + field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new) + schema = Arrow::Schema.new([field]) + tempfile = Tempfile.open("arrow-ipc-file-writer") output = Arrow::FileOutputStream.new(tempfile.path, false) begin - field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new) - schema = Arrow::Schema.new([field]) file_writer = Arrow::RecordBatchFileWriter.new(output, schema) begin - record_batch = Arrow::RecordBatch.new(schema, 0, []) + record_batch = Arrow::RecordBatch.new(schema, + data.size, + [build_boolean_array(data)]) file_writer.write_record_batch(record_batch) ensure file_writer.close @@ -36,8 +42,43 @@ def test_write_record_batch input = Arrow::MemoryMappedInputStream.new(tempfile.path) begin file_reader = Arrow::RecordBatchFileReader.new(input) - assert_equal(["enabled"], + assert_equal([field.name], file_reader.schema.fields.collect(&:name)) + assert_equal(Arrow::RecordBatch.new(schema, + data.size, + [build_boolean_array(data)]), + file_reader.read_record_batch(0)) + ensure + input.close + end + end + + def test_write_table + tempfile = Tempfile.open("arrow-ipc-file-writer") + output = Arrow::FileOutputStream.new(tempfile.path, false) + + array = build_boolean_array([true, false, true]) + field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new) + schema = Arrow::Schema.new([field]) + column = Arrow::Column.new(field, array) + + begin + file_writer = Arrow::RecordBatchFileWriter.new(output, schema) + begin + table = Arrow::Table.new(schema, [column]) + file_writer.write_table(table) + ensure + file_writer.close + end + ensure + output.close + end + + input = Arrow::MemoryMappedInputStream.new(tempfile.path) + begin + file_reader = Arrow::RecordBatchFileReader.new(input) + assert_equal(Arrow::RecordBatch.new(schema, array.length, [array]), + file_reader.read_record_batch(0)) ensure input.close end diff --git 
a/c_glib/test/test-gio-input-stream.rb b/c_glib/test/test-gio-input-stream.rb index a71a370430e6d..2adf25b3af51f 100644 --- a/c_glib/test/test-gio-input-stream.rb +++ b/c_glib/test/test-gio-input-stream.rb @@ -16,15 +16,21 @@ # under the License. class TestGIOInputStream < Test::Unit::TestCase + include Helper::Buildable + def test_reader_backend + data = [true] + field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new) + schema = Arrow::Schema.new([field]) + tempfile = Tempfile.open("arrow-gio-input-stream") output = Arrow::FileOutputStream.new(tempfile.path, false) begin - field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new) - schema = Arrow::Schema.new([field]) file_writer = Arrow::RecordBatchFileWriter.new(output, schema) begin - record_batch = Arrow::RecordBatch.new(schema, 0, []) + record_batch = Arrow::RecordBatch.new(schema, + data.size, + [build_boolean_array(data)]) file_writer.write_record_batch(record_batch) ensure file_writer.close @@ -38,8 +44,12 @@ def test_reader_backend input = Arrow::GIOInputStream.new(input_stream) begin file_reader = Arrow::RecordBatchFileReader.new(input) - assert_equal(["enabled"], + assert_equal([field.name], file_reader.schema.fields.collect(&:name)) + assert_equal(Arrow::RecordBatch.new(schema, + data.size, + [build_boolean_array(data)]), + file_reader.read_record_batch(0)) ensure input.close end diff --git a/c_glib/test/test-gio-output-stream.rb b/c_glib/test/test-gio-output-stream.rb index adaa8c1b7b2b0..c77598ed110d5 100644 --- a/c_glib/test/test-gio-output-stream.rb +++ b/c_glib/test/test-gio-output-stream.rb @@ -16,17 +16,23 @@ # under the License. 
class TestGIOOutputStream < Test::Unit::TestCase + include Helper::Buildable + def test_writer_backend + data = [true] + field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new) + schema = Arrow::Schema.new([field]) + tempfile = Tempfile.open("arrow-gio-output-stream") file = Gio::File.new_for_path(tempfile.path) output_stream = file.append_to(:none) output = Arrow::GIOOutputStream.new(output_stream) begin - field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new) - schema = Arrow::Schema.new([field]) file_writer = Arrow::RecordBatchFileWriter.new(output, schema) begin - record_batch = Arrow::RecordBatch.new(schema, 0, []) + record_batch = Arrow::RecordBatch.new(schema, + data.size, + [build_boolean_array(data)]) file_writer.write_record_batch(record_batch) ensure file_writer.close @@ -38,8 +44,12 @@ def test_writer_backend input = Arrow::MemoryMappedInputStream.new(tempfile.path) begin file_reader = Arrow::RecordBatchFileReader.new(input) - assert_equal(["enabled"], + assert_equal([field.name], file_reader.schema.fields.collect(&:name)) + assert_equal(Arrow::RecordBatch.new(schema, + data.size, + [build_boolean_array(data)]), + file_reader.read_record_batch(0)) ensure input.close end diff --git a/c_glib/test/test-gpu-cuda.rb b/c_glib/test/test-gpu-cuda.rb new file mode 100644 index 0000000000000..c710ef2264976 --- /dev/null +++ b/c_glib/test/test-gpu-cuda.rb @@ -0,0 +1,144 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGPUCUDA < Test::Unit::TestCase + include Helper::Buildable + + def setup + omit("Arrow GPU is required") unless defined?(::ArrowGPU) + @manager = ArrowGPU::CUDADeviceManager.new + omit("At least one GPU is required") if @manager.n_devices.zero? + @context = @manager.get_context(0) + end + + sub_test_case("Context") do + def test_allocated_size + allocated_size_before = @context.allocated_size + size = 128 + buffer = ArrowGPU::CUDABuffer.new(@context, size) + assert_equal(size, + @context.allocated_size - allocated_size_before) + end + end + + sub_test_case("Buffer") do + def setup + super + @buffer = ArrowGPU::CUDABuffer.new(@context, 128) + end + + def test_copy + @buffer.copy_from_host("Hello World") + assert_equal("llo W", @buffer.copy_to_host(2, 5).to_s) + end + + def test_export + @buffer.copy_from_host("Hello World") + handle = @buffer.export + serialized_handle = handle.serialize.data + Tempfile.open("arrow-gpu-cuda-export") do |output| + pid = spawn(RbConfig.ruby, "-e", <<-SCRIPT) +require "gi" + +Gio = GI.load("Gio") +Arrow = GI.load("Arrow") +ArrowGPU = GI.load("ArrowGPU") + +manager = ArrowGPU::CUDADeviceManager.new +context = manager.get_context(0) +serialized_handle = #{serialized_handle.to_s.dump} +handle = ArrowGPU::CUDAIPCMemoryHandle.new(serialized_handle) +buffer = ArrowGPU::CUDABuffer.new(context, handle) +File.open(#{output.path.dump}, "w") do |output| + output.print(buffer.copy_to_host(0, 6).to_s) +end + SCRIPT + Process.waitpid(pid) + assert_equal("Hello ", output.read) + end + end + + def test_context + 
assert_equal(@context.allocated_size, + @buffer.context.allocated_size) + end + + def test_record_batch + field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new) + schema = Arrow::Schema.new([field]) + columns = [ + build_boolean_array([true]), + ] + cpu_record_batch = Arrow::RecordBatch.new(schema, 1, columns) + + buffer = ArrowGPU::CUDABuffer.new(@context, cpu_record_batch) + gpu_record_batch = buffer.read_record_batch(schema) + assert_equal(cpu_record_batch.n_rows, + gpu_record_batch.n_rows) + end + end + + sub_test_case("HostBuffer") do + def test_new + buffer = ArrowGPU::CUDAHostBuffer.new(128) + assert_equal(128, buffer.size) + end + end + + sub_test_case("BufferInputStream") do + def test_new + buffer = ArrowGPU::CUDABuffer.new(@context, 128) + buffer.copy_from_host("Hello World") + stream = ArrowGPU::CUDABufferInputStream.new(buffer) + begin + assert_equal("Hello Worl", stream.read(5).copy_to_host(0, 10).to_s) + ensure + stream.close + end + end + end + + sub_test_case("BufferOutputStream") do + def setup + super + @buffer = ArrowGPU::CUDABuffer.new(@context, 128) + @buffer.copy_from_host("\x00" * @buffer.size) + @stream = ArrowGPU::CUDABufferOutputStream.new(@buffer) + end + + def cleanup + super + @stream.close + end + + def test_new + @stream.write("Hello World") + assert_equal("Hello World", @buffer.copy_to_host(0, 11).to_s) + end + + def test_buffer + assert_equal(0, @stream.buffer_size) + @stream.buffer_size = 5 + assert_equal(5, @stream.buffer_size) + @stream.write("Hell") + assert_equal(4, @stream.buffered_size) + assert_equal("\x00" * 5, @buffer.copy_to_host(0, 5).to_s) + @stream.write("o") + assert_equal("Hello", @buffer.copy_to_host(0, 5).to_s) + end + end +end diff --git a/c_glib/test/test-record-batch.rb b/c_glib/test/test-record-batch.rb index 9fd34b7d45cff..365922f496fe2 100644 --- a/c_glib/test/test-record-batch.rb +++ b/c_glib/test/test-record-batch.rb @@ -18,32 +18,53 @@ class TestTable < Test::Unit::TestCase include 
Helper::Buildable - def test_new - fields = [ - Arrow::Field.new("visible", Arrow::BooleanDataType.new), - Arrow::Field.new("valid", Arrow::BooleanDataType.new), - ] - schema = Arrow::Schema.new(fields) - columns = [ - build_boolean_array([true]), - build_boolean_array([false]), - ] - record_batch = Arrow::RecordBatch.new(schema, 1, columns) - assert_equal(1, record_batch.n_rows) + sub_test_case(".new") do + def test_valid + fields = [ + Arrow::Field.new("visible", Arrow::BooleanDataType.new), + Arrow::Field.new("valid", Arrow::BooleanDataType.new), + ] + schema = Arrow::Schema.new(fields) + columns = [ + build_boolean_array([true]), + build_boolean_array([false]), + ] + record_batch = Arrow::RecordBatch.new(schema, 1, columns) + assert_equal(1, record_batch.n_rows) + end + + def test_no_columns + fields = [ + Arrow::Field.new("visible", Arrow::BooleanDataType.new), + ] + schema = Arrow::Schema.new(fields) + message = "[record-batch][new]: " + + "Invalid: Number of columns did not match schema" + assert_raise(Arrow::Error::Invalid.new(message)) do + Arrow::RecordBatch.new(schema, 0, []) + end + end end sub_test_case("instance methods") do def setup + @visible_field = Arrow::Field.new("visible", Arrow::BooleanDataType.new) + @visible_values = [true, false, true, false, true] + @valid_field = Arrow::Field.new("valid", Arrow::BooleanDataType.new) + @valid_values = [false, true, false, true, false] + fields = [ - Arrow::Field.new("visible", Arrow::BooleanDataType.new), - Arrow::Field.new("valid", Arrow::BooleanDataType.new), + @visible_field, + @valid_field, ] schema = Arrow::Schema.new(fields) columns = [ - build_boolean_array([true, false, true, false, true, false]), - build_boolean_array([false, true, false, true, false]), + build_boolean_array(@visible_values), + build_boolean_array(@valid_values), ] - @record_batch = Arrow::RecordBatch.new(schema, 5, columns) + @record_batch = Arrow::RecordBatch.new(schema, + @visible_values.size, + columns) end def test_equal @@ 
-53,7 +74,7 @@ def test_equal ] schema = Arrow::Schema.new(fields) columns = [ - build_boolean_array([true, false, true, false, true, false]), + build_boolean_array([true, false, true, false, true]), build_boolean_array([false, true, false, true, false]), ] other_record_batch = Arrow::RecordBatch.new(schema, 5, columns) @@ -66,12 +87,28 @@ def test_schema @record_batch.schema.fields.collect(&:name)) end - def test_column - assert_equal(5, @record_batch.get_column(1).length) + sub_test_case("#column") do + def test_positive + assert_equal(build_boolean_array(@valid_values), + @record_batch.get_column(1)) + end + + def test_negative + assert_equal(build_boolean_array(@visible_values), + @record_batch.get_column(-2)) + end + + def test_positive_out_of_index + assert_nil(@record_batch.get_column(2)) + end + + def test_negative_out_of_index + assert_nil(@record_batch.get_column(-3)) + end end def test_columns - assert_equal([6, 5], + assert_equal([5, 5], @record_batch.columns.collect(&:length)) end @@ -94,7 +131,7 @@ def test_slice def test_to_s assert_equal(<<-PRETTY_PRINT, @record_batch.to_s) -visible: [true, false, true, false, true, false] +visible: [true, false, true, false, true] valid: [false, true, false, true, false] PRETTY_PRINT end diff --git a/c_glib/test/test-stream-writer.rb b/c_glib/test/test-stream-writer.rb index c3d0e1490cef6..32754e20838b4 100644 --- a/c_glib/test/test-stream-writer.rb +++ b/c_glib/test/test-stream-writer.rb @@ -19,17 +19,19 @@ class TestStreamWriter < Test::Unit::TestCase include Helper::Buildable def test_write_record_batch + data = [true] + field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new) + schema = Arrow::Schema.new([field]) + tempfile = Tempfile.open("arrow-ipc-stream-writer") output = Arrow::FileOutputStream.new(tempfile.path, false) begin - field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new) - schema = Arrow::Schema.new([field]) stream_writer = Arrow::RecordBatchStreamWriter.new(output, schema) begin 
columns = [ - build_boolean_array([true]), + build_boolean_array(data), ] - record_batch = Arrow::RecordBatch.new(schema, 1, columns) + record_batch = Arrow::RecordBatch.new(schema, data.size, columns) stream_writer.write_record_batch(record_batch) ensure stream_writer.close @@ -41,10 +43,12 @@ def test_write_record_batch input = Arrow::MemoryMappedInputStream.new(tempfile.path) begin stream_reader = Arrow::RecordBatchStreamReader.new(input) - assert_equal(["enabled"], + assert_equal([field.name], stream_reader.schema.fields.collect(&:name)) - assert_equal(true, - stream_reader.read_next.get_column(0).get_value(0)) + assert_equal(Arrow::RecordBatch.new(schema, + data.size, + [build_boolean_array(data)]), + stream_reader.read_next) assert_nil(stream_reader.read_next) ensure input.close diff --git a/c_glib/test/test-timestamp-data-type.rb b/c_glib/test/test-timestamp-data-type.rb index 83038876926f0..bbc597c160ebc 100644 --- a/c_glib/test/test-timestamp-data-type.rb +++ b/c_glib/test/test-timestamp-data-type.rb @@ -29,6 +29,10 @@ def setup def test_to_s assert_equal("timestamp[s]", @data_type.to_s) end + + def test_unit + assert_equal(Arrow::TimeUnit::SECOND, @data_type.unit) + end end sub_test_case("millisecond") do @@ -39,6 +43,10 @@ def setup def test_to_s assert_equal("timestamp[ms]", @data_type.to_s) end + + def test_unit + assert_equal(Arrow::TimeUnit::MILLI, @data_type.unit) + end end sub_test_case("micro") do @@ -49,6 +57,10 @@ def setup def test_to_s assert_equal("timestamp[us]", @data_type.to_s) end + + def test_unit + assert_equal(Arrow::TimeUnit::MICRO, @data_type.unit) + end end sub_test_case("nano") do @@ -59,5 +71,9 @@ def setup def test_to_s assert_equal("timestamp[ns]", @data_type.to_s) end + + def test_unit + assert_equal(Arrow::TimeUnit::NANO, @data_type.unit) + end end end diff --git a/c_glib/test/test-unique.rb b/c_glib/test/test-unique.rb new file mode 100644 index 0000000000000..b94ff462b4600 --- /dev/null +++ b/c_glib/test/test-unique.rb @@ 
-0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestUnique < Test::Unit::TestCase + include Helper::Buildable + include Helper::Omittable + + def test_int32 + assert_equal(build_int32_array([1, 3, -1, -3]), + build_int32_array([1, 3, 1, -1, -3, -1]).unique) + end + + def test_string + assert_equal(build_string_array(["Ruby", "Python"]), + build_string_array(["Ruby", "Python", "Ruby"]).unique) + end +end diff --git a/ci/msvc-build.bat b/ci/msvc-build.bat index e8eb0945e516e..62ebcf364e77b 100644 --- a/ci/msvc-build.bat +++ b/ci/msvc-build.bat @@ -17,6 +17,38 @@ @echo on +if "%JOB%" == "Static_Crt_Build" ( + mkdir cpp\build-debug + pushd cpp\build-debug + + cmake -G "%GENERATOR%" ^ + -DARROW_USE_STATIC_CRT=ON ^ + -DARROW_BOOST_USE_SHARED=OFF ^ + -DCMAKE_BUILD_TYPE=Debug ^ + -DARROW_CXXFLAGS="/MP" ^ + .. || exit /B + + cmake --build . --config Debug || exit /B + popd + + mkdir cpp\build-release + pushd cpp\build-release + + cmake -G "%GENERATOR%" ^ + -DARROW_USE_STATIC_CRT=ON ^ + -DARROW_BOOST_USE_SHARED=OFF ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DARROW_CXXFLAGS="/WX /MP" ^ + .. || exit /B + + cmake --build . 
--config Release || exit /B + ctest -VV || exit /B + popd + + @rem Finish Static_Crt_Build build successfully + exit /B 0 +) + if "%JOB%" == "Build_Debug" ( mkdir cpp\build-debug pushd cpp\build-debug diff --git a/ci/travis_before_script_c_glib.sh b/ci/travis_before_script_c_glib.sh index a63945e1745d0..9d07e02a84302 100755 --- a/ci/travis_before_script_c_glib.sh +++ b/ci/travis_before_script_c_glib.sh @@ -22,8 +22,6 @@ set -ex source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh if [ $TRAVIS_OS_NAME = "osx" ]; then - brew update && brew bundle --file=c_glib/Brewfile - export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:/usr/local/opt/libffi/lib/pkgconfig export XML_CATALOG_FILES=/usr/local/etc/xml/catalog fi @@ -99,7 +97,6 @@ if [ $BUILD_SYSTEM = "autotools" ]; then CONFIGURE_OPTIONS="$CONFIGURE_OPTIONS CXXFLAGS=-DARROW_NO_DEPRECATED_API" ./configure $CONFIGURE_OPTIONS - make -j4 make install else diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index dbdcd33ed0d5b..fd2c1644638c4 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -38,8 +38,10 @@ if [ "$ARROW_TRAVIS_USE_TOOLCHAIN" == "1" ]; then rapidjson \ flatbuffers \ gflags \ + gtest \ lz4-c \ snappy \ + ccache \ zstd \ brotli \ zlib \ @@ -53,9 +55,6 @@ if [ "$ARROW_TRAVIS_USE_TOOLCHAIN" == "1" ]; then conda update -y -p $CPP_TOOLCHAIN ca-certificates -c defaults fi -if [ $TRAVIS_OS_NAME == "osx" ]; then - brew update && brew bundle --file=cpp/Brewfile -fi mkdir $ARROW_CPP_BUILD_DIR pushd $ARROW_CPP_BUILD_DIR @@ -85,6 +84,10 @@ if [ $ARROW_TRAVIS_PLASMA == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_PLASMA=ON" fi +if [ $ARROW_TRAVIS_ORC == "1" ]; then + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_ORC=ON" +fi + if [ $ARROW_TRAVIS_VALGRIND == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_TEST_MEMCHECK=ON" fi @@ -92,12 +95,14 @@ fi if [ $TRAVIS_OS_NAME == "linux" ]; then cmake $CMAKE_COMMON_FLAGS \ $CMAKE_LINUX_FLAGS \ - 
-DBUILD_WARNING_LEVEL=CHECKIN \ + -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ + -DBUILD_WARNING_LEVEL=$ARROW_BUILD_WARNING_LEVEL \ $ARROW_CPP_DIR else cmake $CMAKE_COMMON_FLAGS \ $CMAKE_OSX_FLAGS \ - -DBUILD_WARNING_LEVEL=CHECKIN \ + -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ + -DBUILD_WARNING_LEVEL=$ARROW_BUILD_WARNING_LEVEL \ $ARROW_CPP_DIR fi diff --git a/ci/travis_env_common.sh b/ci/travis_env_common.sh index 52c7da4e0178e..21b6e266ea678 100755 --- a/ci/travis_env_common.sh +++ b/ci/travis_env_common.sh @@ -38,6 +38,9 @@ export ARROW_PYTHON_PARQUET_HOME=$TRAVIS_BUILD_DIR/parquet-env export CMAKE_EXPORT_COMPILE_COMMANDS=1 +export ARROW_BUILD_TYPE=${ARROW_BUILD_TYPE:=debug} +export ARROW_BUILD_WARNING_LEVEL=${ARROW_BUILD_WARNING_LEVEL:=Production} + if [ "$ARROW_TRAVIS_USE_TOOLCHAIN" == "1" ]; then # C++ toolchain export CPP_TOOLCHAIN=$TRAVIS_BUILD_DIR/cpp-toolchain diff --git a/ci/travis_script_integration.sh b/ci/travis_script_integration.sh index be025512f0b88..0c415dc4865cd 100755 --- a/ci/travis_script_integration.sh +++ b/ci/travis_script_integration.sh @@ -20,6 +20,7 @@ set -e source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh +export ARROW_CPP_EXE_PATH=$ARROW_CPP_BUILD_DIR/debug pushd $ARROW_JAVA_DIR @@ -28,8 +29,15 @@ mvn -B clean package 2>&1 > mvn_package.log || (cat mvn_package.log && false) popd +pushd $ARROW_JS_DIR + +# lint and compile JS source +npm run lint +npm run build + +popd + pushd $ARROW_INTEGRATION_DIR -export ARROW_CPP_EXE_PATH=$ARROW_CPP_BUILD_DIR/debug CONDA_ENV_NAME=arrow-integration-test conda create -y -q -n $CONDA_ENV_NAME python=3.5 @@ -44,3 +52,12 @@ conda install -y pip numpy six python integration_test.py --debug popd + +# pushd $ARROW_JS_DIR + +# run tests against source to generate coverage data +# npm run test:coverage +# Uncomment to upload to coveralls +# cat ./coverage/lcov.info | ./node_modules/coveralls/bin/coveralls.js; + +# popd \ No newline at end of file diff --git a/ci/travis_script_js.sh b/ci/travis_script_js.sh index 
9f77dec8c53a7..1871b4265cd01 100755 --- a/ci/travis_script_js.sh +++ b/ci/travis_script_js.sh @@ -17,16 +17,15 @@ # specific language governing permissions and limitations # under the License. -set -e +set -ex -JS_DIR=${TRAVIS_BUILD_DIR}/js +source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh -pushd $JS_DIR +pushd $ARROW_JS_DIR -npm run validate - -# Uncomment to use coveralls -# npm run test:coverage -# cat ./coverage/lcov.info | ./node_modules/coveralls/bin/coveralls.js; +npm run lint +npm run build +# run the non-snapshot unit tests +npm test popd diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 603201bcc166b..9e74906d03739 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -63,6 +63,8 @@ cmake -GNinja \ -DARROW_BUILD_UTILITIES=off \ -DARROW_PLASMA=on \ -DARROW_PYTHON=on \ + -DARROW_ORC=on \ + -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ $ARROW_CPP_DIR @@ -78,13 +80,16 @@ if [ "$PYTHON_VERSION" == "2.7" ]; then pip install futures fi +export PYARROW_BUILD_TYPE=$ARROW_BUILD_TYPE + pip install -r requirements.txt -python setup.py build_ext --with-parquet --with-plasma \ +python setup.py build_ext --with-parquet --with-plasma --with-orc\ install --single-version-externally-managed --record=record.text popd python -c "import pyarrow.parquet" python -c "import pyarrow.plasma" +python -c "import pyarrow.orc" if [ $TRAVIS_OS_NAME == "linux" ]; then export PLASMA_VALGRIND=1 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d8dc5df88b4a4..0558cba495a83 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -48,6 +48,7 @@ set(ARROW_ABI_VERSION "${ARROW_SO_VERSION}.0.0") set(BUILD_SUPPORT_DIR "${CMAKE_SOURCE_DIR}/build-support") +set(CLANG_FORMAT_VERSION "4.0") find_package(ClangTools) if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1" OR CLANG_TIDY_FOUND) # Generate a Clang compile_commands.json "compilation database" file for use @@ -118,6 +119,10 @@ if("${CMAKE_SOURCE_DIR}" 
STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Build the Arrow GPU extensions (requires CUDA installation)" OFF) + option(ARROW_ORC + "Build the Arrow ORC adapter" + OFF) + option(ARROW_JEMALLOC "Build the Arrow jemalloc-based allocator" OFF) @@ -209,6 +214,10 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Lz4 static lib suffix used on Windows with MSVC (default _static)") set(ZSTD_MSVC_STATIC_LIB_SUFFIX "_static" CACHE STRING "ZStd static lib suffix used on Windows with MSVC (default _static)") + + option(ARROW_USE_STATIC_CRT + "Build Arrow with statically linked CRT" + OFF) endif() endif() @@ -221,6 +230,17 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) set(ARROW_WITH_ZSTD ON) endif() +if (MSVC) + # ORC doesn't build on windows + set(ARROW_ORC OFF) +endif() + +if(ARROW_ORC) + set(ARROW_WITH_LZ4 ON) + set(ARROW_WITH_SNAPPY ON) + set(ARROW_WITH_ZLIB ON) +endif() + if(NOT ARROW_BUILD_TESTS) set(NO_TESTS 1) endif() @@ -413,6 +433,7 @@ if (UNIX) (item MATCHES "xxhash.h") OR (item MATCHES "xxhash.cc") OR (item MATCHES "config.h") OR + (item MATCHES "util/variant") OR (item MATCHES "zmalloc.h") OR (item MATCHES "ae.h"))) LIST(APPEND FILTERED_LINT_FILES ${item}) @@ -435,11 +456,9 @@ endif (UNIX) # "make format" and "make check-format" targets ############################################################ -set(CLANG_FORMAT_VERSION 4.0) - # runs clang format and updates files in place. 
add_custom_target(format ${BUILD_SUPPORT_DIR}/run_clang_format.py - ${CLANG_FORMAT_VERSION} + ${CLANG_FORMAT_BIN} ${BUILD_SUPPORT_DIR}/clang_format_exclusions.txt ${CMAKE_CURRENT_SOURCE_DIR}/src) @@ -447,7 +466,7 @@ add_custom_target(format ${BUILD_SUPPORT_DIR}/run_clang_format.py # TODO(wesm): Make this work in run_clang_format.py add_custom_target(check-format ${BUILD_SUPPORT_DIR}/run_clang_format.py - ${CLANG_FORMAT_VERSION} + ${CLANG_FORMAT_BIN} ${BUILD_SUPPORT_DIR}/clang_format_exclusions.txt ${CMAKE_CURRENT_SOURCE_DIR}/src 1) @@ -522,6 +541,13 @@ if (ARROW_WITH_GRPC) ${ARROW_STATIC_LINK_LIBS}) endif() +if (ARROW_ORC) + SET(ARROW_STATIC_LINK_LIBS + orc + protobuf + ${ARROW_STATIC_LINK_LIBS}) +endif() + if (ARROW_STATIC_LINK_LIBS) add_dependencies(arrow_dependencies ${ARROW_STATIC_LINK_LIBS}) endif() diff --git a/cpp/README.md b/cpp/README.md index 60383535b1596..39a1ccac64818 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -42,7 +42,7 @@ sudo apt-get install cmake \ On OS X, you can use [Homebrew][1]: ```shell -brew update && brew bundle --file=cpp/Brewfile +brew update && brew bundle --file=c_glib/Brewfile ``` If you are developing on Windows, see the [Windows developer guide][2]. @@ -69,6 +69,14 @@ Simple release build: Detailed unit test logs will be placed in the build directory under `build/test-logs`. +On some Linux distributions, running the test suite might require setting an +explicit locale. If you see any locale-related errors, try setting the +environment variable (which requires the `locales` package or equivalent): + +``` +export LC_ALL="en_US.UTF-8" +``` + ### Statically linking to Arrow on Windows The Arrow headers on Windows static library builds (enabled by the CMake @@ -137,6 +145,13 @@ The CUDA toolchain used to build the library can be customized by using the This library is still in Alpha stages, and subject to API changes without deprecation warnings. 
+### Building Apache ORC integration (optional) + +The optional arrow reader for the Apache ORC format (found in the +`arrow::adapters::orc` namespace) can be built by passing `-DARROW_ORC=on`. +This is currently not supported on windows. Note that this functionality is +still in Alpha stages, and subject to API changes without deprecation warnings. + ### API documentation To generate the (html) API documentation, run the following command in the apidoc diff --git a/cpp/apidoc/Windows.md b/cpp/apidoc/Windows.md index 774482ea1c4f3..aa3d31f1f7b90 100644 --- a/cpp/apidoc/Windows.md +++ b/cpp/apidoc/Windows.md @@ -41,9 +41,13 @@ conda config --add channels conda-forge Now, you can bootstrap a build environment ```shell -conda create -n arrow-dev cmake git boost-cpp flatbuffers rapidjson cmake thrift-cpp snappy zlib brotli gflags lz4-c zstd +conda create -n arrow-dev cmake git boost-cpp flatbuffers rapidjson cmake thrift-cpp snappy zlib brotli gflags lz4-c zstd -c conda-forge ``` +***Note:*** +> *Make sure to get the `conda-forge` build of `gflags` as the + naming of the library differs from that in the `defaults` channel* + Activate just created conda environment with pre-installed packages from previous step: @@ -51,20 +55,16 @@ previous step: activate arrow-dev ``` -We are using [cmake][4] tool to support Windows builds. +We are using the [cmake][4] tool to support Windows builds. To allow cmake to pick up 3rd party dependencies, you should set `ARROW_BUILD_TOOLCHAIN` environment variable to contain `Library` folder path of new created on previous step `arrow-dev` conda environment. 
-For instance, if `Miniconda` was installed to default destination, `Library` -folder path for `arrow-dev` conda environment will be as following: - -```shell -C:\Users\YOUR_USER_NAME\Miniconda3\envs\arrow-dev\Library -``` -To set `ARROW_BUILD_TOOLCHAIN` environment variable visible only for current terminal session you can run following: +To set `ARROW_BUILD_TOOLCHAIN` environment variable visible only for current terminal +session you can run following. `%CONDA_PREFIX` is set by conda to the current environment +root by the `activate` script. ```shell -set ARROW_BUILD_TOOLCHAIN=C:\Users\YOUR_USER_NAME\Miniconda3\envs\arrow-dev\Library +set ARROW_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library ``` To validate value of `ARROW_BUILD_TOOLCHAIN` environment variable you can run following terminal command: diff --git a/cpp/build-support/clang_format_exclusions.txt b/cpp/build-support/clang_format_exclusions.txt index 2d5d86d4e4cde..d31d8a00d2ab7 100644 --- a/cpp/build-support/clang_format_exclusions.txt +++ b/cpp/build-support/clang_format_exclusions.txt @@ -3,6 +3,8 @@ *pyarrow_api.h *python/config.h *python/platform.h +*util/variant.h +*util/variant/* *thirdparty/ae/* *xxhash.cc *xxhash.h diff --git a/cpp/build-support/lz4_msbuild_gl_runtimelibrary_params.patch b/cpp/build-support/lz4_msbuild_gl_runtimelibrary_params.patch new file mode 100644 index 0000000000000..c79898d9a786c --- /dev/null +++ b/cpp/build-support/lz4_msbuild_gl_runtimelibrary_params.patch @@ -0,0 +1,601 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +diff --git a/visual/VS2010/datagen/datagen.vcxproj b/visual/VS2010/datagen/datagen.vcxproj +index aaf81ad..096741a 100644 +--- a/visual/VS2010/datagen/datagen.vcxproj ++++ b/visual/VS2010/datagen/datagen.vcxproj +@@ -39,15 +39,19 @@ + + Application + false +- true + Unicode + + + Application + false +- true + Unicode + ++ ++ true ++ ++ ++ true ++ + + + +@@ -113,6 +117,46 @@ + true + + ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ + + + Level4 +diff --git a/visual/VS2010/frametest/frametest.vcxproj b/visual/VS2010/frametest/frametest.vcxproj +index 76d12c9..34ca686 100644 +--- a/visual/VS2010/frametest/frametest.vcxproj ++++ b/visual/VS2010/frametest/frametest.vcxproj +@@ -39,15 +39,19 @@ + + Application + false +- true + Unicode + + + Application + false +- true + Unicode + ++ ++ true ++ ++ ++ true ++ + + + +@@ -113,6 +117,46 @@ + true + + ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ + + + Level4 +diff --git a/visual/VS2010/fullbench-dll/fullbench-dll.vcxproj b/visual/VS2010/fullbench-dll/fullbench-dll.vcxproj +index c10552a..c1ca231 100644 +--- a/visual/VS2010/fullbench-dll/fullbench-dll.vcxproj ++++ 
b/visual/VS2010/fullbench-dll/fullbench-dll.vcxproj +@@ -39,15 +39,19 @@ + + Application + false +- true + Unicode + + + Application + false +- true + Unicode + ++ ++ true ++ ++ ++ true ++ + + + +@@ -117,6 +121,46 @@ + liblz4.lib;%(AdditionalDependencies) + + ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ + + + Level4 +diff --git a/visual/VS2010/fullbench/fullbench.vcxproj b/visual/VS2010/fullbench/fullbench.vcxproj +index e2d95c9..613ff3c 100644 +--- a/visual/VS2010/fullbench/fullbench.vcxproj ++++ b/visual/VS2010/fullbench/fullbench.vcxproj +@@ -39,15 +39,19 @@ + + Application + false +- true + Unicode + + + Application + false +- true + Unicode + ++ ++ true ++ ++ ++ true ++ + + + +@@ -113,6 +117,46 @@ + true + + ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ + + + Level4 +diff --git a/visual/VS2010/fuzzer/fuzzer.vcxproj b/visual/VS2010/fuzzer/fuzzer.vcxproj +index 85d6c9b..be8b9a1 100644 +--- a/visual/VS2010/fuzzer/fuzzer.vcxproj ++++ b/visual/VS2010/fuzzer/fuzzer.vcxproj +@@ -39,15 +39,19 @@ + + Application + false +- true + Unicode + + + Application + false +- true + Unicode + ++ ++ true ++ ++ ++ true ++ + + + +@@ -113,6 +117,46 @@ + true + + ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ + + + Level4 +diff --git a/visual/VS2010/liblz4-dll/liblz4-dll.vcxproj b/visual/VS2010/liblz4-dll/liblz4-dll.vcxproj +index 
389f13c..e30e667 100644 +--- a/visual/VS2010/liblz4-dll/liblz4-dll.vcxproj ++++ b/visual/VS2010/liblz4-dll/liblz4-dll.vcxproj +@@ -40,15 +40,19 @@ + + DynamicLibrary + false +- true + Unicode + + + DynamicLibrary + false +- true + Unicode + ++ ++ true ++ ++ ++ true ++ + + + +@@ -116,6 +120,46 @@ + true + + ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ + + + Level4 +diff --git a/visual/VS2010/liblz4/liblz4.vcxproj b/visual/VS2010/liblz4/liblz4.vcxproj +index a0b8000..38d2ce2 100644 +--- a/visual/VS2010/liblz4/liblz4.vcxproj ++++ b/visual/VS2010/liblz4/liblz4.vcxproj +@@ -39,15 +39,19 @@ + + StaticLibrary + false +- true + Unicode + + + StaticLibrary + false +- true + Unicode + ++ ++ true ++ ++ ++ true ++ + + + +@@ -115,6 +119,46 @@ + true + + ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ + + + Level4 +diff --git a/visual/VS2010/lz4/lz4.vcxproj b/visual/VS2010/lz4/lz4.vcxproj +index 693e121..9eb005b 100644 +--- a/visual/VS2010/lz4/lz4.vcxproj ++++ b/visual/VS2010/lz4/lz4.vcxproj +@@ -39,15 +39,19 @@ + + Application + false +- true + Unicode + + + Application + false +- true + Unicode + ++ ++ true ++ ++ ++ true ++ + + + +@@ -115,6 +119,46 @@ + setargv.obj;%(AdditionalDependencies) + + ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ + + + Level4 diff --git a/cpp/build-support/lz4_msbuild_wholeprogramoptimization_param.patch 
b/cpp/build-support/lz4_msbuild_wholeprogramoptimization_param.patch deleted file mode 100644 index ee0f8a12054bc..0000000000000 --- a/cpp/build-support/lz4_msbuild_wholeprogramoptimization_param.patch +++ /dev/null @@ -1,225 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -diff --git a/visual/VS2010/datagen/datagen.vcxproj b/visual/VS2010/datagen/datagen.vcxproj -index aaf81ad..67b716f 100644 ---- a/visual/VS2010/datagen/datagen.vcxproj -+++ b/visual/VS2010/datagen/datagen.vcxproj -@@ -39,15 +39,19 @@ - - Application - false -- true - Unicode - - - Application - false -- true - Unicode - -+ -+ true -+ -+ -+ true -+ - - - -diff --git a/visual/VS2010/frametest/frametest.vcxproj b/visual/VS2010/frametest/frametest.vcxproj -index 76d12c9..723571d 100644 ---- a/visual/VS2010/frametest/frametest.vcxproj -+++ b/visual/VS2010/frametest/frametest.vcxproj -@@ -39,15 +39,19 @@ - - Application - false -- true - Unicode - - - Application - false -- true - Unicode - -+ -+ true -+ -+ -+ true -+ - - - -diff --git a/visual/VS2010/fullbench-dll/fullbench-dll.vcxproj b/visual/VS2010/fullbench-dll/fullbench-dll.vcxproj -index c10552a..0c8f293 100644 ---- a/visual/VS2010/fullbench-dll/fullbench-dll.vcxproj -+++ b/visual/VS2010/fullbench-dll/fullbench-dll.vcxproj -@@ -39,15 +39,19 @@ - - Application - false -- true - Unicode - - - Application - false -- true - Unicode - -+ -+ true -+ -+ -+ true -+ - - - -diff --git a/visual/VS2010/fullbench/fullbench.vcxproj b/visual/VS2010/fullbench/fullbench.vcxproj -index e2d95c9..4cd88d0 100644 ---- a/visual/VS2010/fullbench/fullbench.vcxproj -+++ b/visual/VS2010/fullbench/fullbench.vcxproj -@@ -39,15 +39,19 @@ - - Application - false -- true - Unicode - - - Application - false -- true - Unicode - -+ -+ true -+ -+ -+ true -+ - - - -diff --git a/visual/VS2010/fuzzer/fuzzer.vcxproj b/visual/VS2010/fuzzer/fuzzer.vcxproj -index 85d6c9b..3ddc77d 100644 ---- a/visual/VS2010/fuzzer/fuzzer.vcxproj -+++ b/visual/VS2010/fuzzer/fuzzer.vcxproj -@@ -39,15 +39,19 @@ - - Application - false -- true - Unicode - - - Application - false -- true - Unicode - -+ -+ true -+ -+ -+ true -+ - - - -diff --git a/visual/VS2010/liblz4-dll/liblz4-dll.vcxproj b/visual/VS2010/liblz4-dll/liblz4-dll.vcxproj -index 389f13c..038a4d2 100644 ---- 
a/visual/VS2010/liblz4-dll/liblz4-dll.vcxproj -+++ b/visual/VS2010/liblz4-dll/liblz4-dll.vcxproj -@@ -40,15 +40,19 @@ - - DynamicLibrary - false -- true - Unicode - - - DynamicLibrary - false -- true - Unicode - -+ -+ true -+ -+ -+ true -+ - - - -diff --git a/visual/VS2010/liblz4/liblz4.vcxproj b/visual/VS2010/liblz4/liblz4.vcxproj -index a0b8000..9aad8c2 100644 ---- a/visual/VS2010/liblz4/liblz4.vcxproj -+++ b/visual/VS2010/liblz4/liblz4.vcxproj -@@ -39,15 +39,19 @@ - - StaticLibrary - false -- true - Unicode - - - StaticLibrary - false -- true - Unicode - -+ -+ true -+ -+ -+ true -+ - - - -diff --git a/visual/VS2010/lz4/lz4.vcxproj b/visual/VS2010/lz4/lz4.vcxproj -index 693e121..7e63f1e 100644 ---- a/visual/VS2010/lz4/lz4.vcxproj -+++ b/visual/VS2010/lz4/lz4.vcxproj -@@ -39,15 +39,19 @@ - - Application - false -- true - Unicode - - - Application - false -- true - Unicode - -+ -+ true -+ -+ -+ true -+ - - - diff --git a/cpp/build-support/run_clang_format.py b/cpp/build-support/run_clang_format.py index fcf39ecc6a5f9..6dec34bd09afe 100755 --- a/cpp/build-support/run_clang_format.py +++ b/cpp/build-support/run_clang_format.py @@ -27,7 +27,7 @@ sys.argv[0]) sys.exit(1) -CLANG_FORMAT = 'clang-format-{0}'.format(sys.argv[1]) +CLANG_FORMAT = sys.argv[1] EXCLUDE_GLOBS_FILENAME = sys.argv[2] SOURCE_DIR = sys.argv[3] diff --git a/cpp/build-support/zstd_msbuild_gl_runtimelibrary_params.patch b/cpp/build-support/zstd_msbuild_gl_runtimelibrary_params.patch new file mode 100644 index 0000000000000..a168a814950e6 --- /dev/null +++ b/cpp/build-support/zstd_msbuild_gl_runtimelibrary_params.patch @@ -0,0 +1,528 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +diff --git a/build/VS2010/datagen/datagen.vcxproj b/build/VS2010/datagen/datagen.vcxproj +index bd8a213d..691d39fe 100644 +--- a/build/VS2010/datagen/datagen.vcxproj ++++ b/build/VS2010/datagen/datagen.vcxproj +@@ -39,15 +39,19 @@ + + Application + false +- true + MultiByte + + + Application + false +- true + MultiByte + ++ ++ true ++ ++ ++ true ++ + + + +@@ -84,6 +88,46 @@ + false + $(IncludePath);$(SolutionDir)..\..\programs;$(SolutionDir)..\..\lib\legacy;$(SolutionDir)..\..\lib\common;$(UniversalCRT_IncludePath); + ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ + + + +diff --git a/build/VS2010/fullbench-dll/fullbench-dll.vcxproj b/build/VS2010/fullbench-dll/fullbench-dll.vcxproj +index e697318e..a5720189 100644 +--- a/build/VS2010/fullbench-dll/fullbench-dll.vcxproj ++++ b/build/VS2010/fullbench-dll/fullbench-dll.vcxproj +@@ -39,15 +39,19 @@ + + Application + false +- true + MultiByte + + + Application + false +- true + MultiByte + ++ ++ true ++ ++ ++ true ++ + + + +@@ -84,6 +88,46 @@ + $(IncludePath);$(SolutionDir)..\..\lib;$(SolutionDir)..\..\programs;$(SolutionDir)..\..\lib\legacy;$(SolutionDir)..\..\lib\common;$(UniversalCRT_IncludePath); + false + ++ ++ ++ MultiThreaded ++ 
++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ + + + +diff --git a/build/VS2010/fullbench/fullbench.vcxproj b/build/VS2010/fullbench/fullbench.vcxproj +index 2bff4ca4..d64fac81 100644 +--- a/build/VS2010/fullbench/fullbench.vcxproj ++++ b/build/VS2010/fullbench/fullbench.vcxproj +@@ -39,15 +39,19 @@ + + Application + false +- true + MultiByte + + + Application + false +- true + MultiByte + ++ ++ true ++ ++ ++ true ++ + + + +@@ -84,6 +88,46 @@ + $(IncludePath);$(SolutionDir)..\..\lib;$(SolutionDir)..\..\programs;$(SolutionDir)..\..\lib\legacy;$(SolutionDir)..\..\lib\common;$(UniversalCRT_IncludePath); + false + ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ + + + +diff --git a/build/VS2010/fuzzer/fuzzer.vcxproj b/build/VS2010/fuzzer/fuzzer.vcxproj +index 12a4b931..1ffc5941 100644 +--- a/build/VS2010/fuzzer/fuzzer.vcxproj ++++ b/build/VS2010/fuzzer/fuzzer.vcxproj +@@ -39,15 +39,19 @@ + + Application + false +- true + MultiByte + + + Application + false +- true + MultiByte + ++ ++ true ++ ++ ++ true ++ + + + +@@ -84,6 +88,46 @@ + false + $(IncludePath);$(SolutionDir)..\..\lib;$(SolutionDir)..\..\programs;$(SolutionDir)..\..\lib\legacy;$(SolutionDir)..\..\lib\common;$(SolutionDir)..\..\lib\dictBuilder;$(SolutionDir)..\..\lib\compress;$(UniversalCRT_IncludePath); + ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ + + + +diff --git 
a/build/VS2010/libzstd-dll/libzstd-dll.vcxproj b/build/VS2010/libzstd-dll/libzstd-dll.vcxproj +index 364b3bea..1f34e956 100644 +--- a/build/VS2010/libzstd-dll/libzstd-dll.vcxproj ++++ b/build/VS2010/libzstd-dll/libzstd-dll.vcxproj +@@ -94,15 +94,19 @@ + + DynamicLibrary + false +- true + MultiByte + + + DynamicLibrary + false +- true + MultiByte + ++ ++ true ++ ++ ++ true ++ + + + +@@ -143,6 +147,46 @@ + $(IncludePath);$(SolutionDir)..\..\lib;$(SolutionDir)..\..\programs\legacy;$(SolutionDir)..\..\lib\legacy;$(SolutionDir)..\..\lib\common;$(SolutionDir)..\..\lib\dictBuilder;$(UniversalCRT_IncludePath); + false + ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ + + + +diff --git a/build/VS2010/libzstd/libzstd.vcxproj b/build/VS2010/libzstd/libzstd.vcxproj +index 6087d737..0d2a6fe0 100644 +--- a/build/VS2010/libzstd/libzstd.vcxproj ++++ b/build/VS2010/libzstd/libzstd.vcxproj +@@ -91,15 +91,19 @@ + + StaticLibrary + false +- true + MultiByte + + + StaticLibrary + false +- true + MultiByte + ++ ++ true ++ ++ ++ true ++ + + + +@@ -140,6 +144,46 @@ + $(IncludePath);$(SolutionDir)..\..\lib;$(SolutionDir)..\..\programs\legacy;$(SolutionDir)..\..\lib\legacy;$(SolutionDir)..\..\lib\common;$(SolutionDir)..\..\lib\dictBuilder;$(UniversalCRT_IncludePath); + false + ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ + + + +diff --git a/build/VS2010/zstd/zstd.vcxproj b/build/VS2010/zstd/zstd.vcxproj +index 438dc617..3fb5bba8 100644 +--- a/build/VS2010/zstd/zstd.vcxproj ++++ b/build/VS2010/zstd/zstd.vcxproj +@@ -100,15 +100,19 @@ + + Application + false +- true 
+ MultiByte + + + Application + false +- true + MultiByte + ++ ++ true ++ ++ ++ true ++ + + + +@@ -149,6 +153,46 @@ + false + $(LibraryPath); + ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ ++ ++ ++ MultiThreaded ++ ++ ++ ++ ++ MultiThreadedDebug ++ ++ ++ ++ ++ MultiThreadedDLL ++ ++ ++ ++ ++ MultiThreadedDebugDLL ++ ++ + + + diff --git a/cpp/build-support/zstd_msbuild_wholeprogramoptimization_param.patch b/cpp/build-support/zstd_msbuild_wholeprogramoptimization_param.patch deleted file mode 100644 index 8bfb928947e06..0000000000000 --- a/cpp/build-support/zstd_msbuild_wholeprogramoptimization_param.patch +++ /dev/null @@ -1,199 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -diff --git a/build/VS2010/datagen/datagen.vcxproj b/build/VS2010/datagen/datagen.vcxproj -index bd8a213..8e4dc89 100644 ---- a/build/VS2010/datagen/datagen.vcxproj -+++ b/build/VS2010/datagen/datagen.vcxproj -@@ -39,15 +39,19 @@ - - Application - false -- true - MultiByte - - - Application - false -- true - MultiByte - -+ -+ true -+ -+ -+ true -+ - - - -diff --git a/build/VS2010/fullbench-dll/fullbench-dll.vcxproj b/build/VS2010/fullbench-dll/fullbench-dll.vcxproj -index e697318..82cd4ab 100644 ---- a/build/VS2010/fullbench-dll/fullbench-dll.vcxproj -+++ b/build/VS2010/fullbench-dll/fullbench-dll.vcxproj -@@ -39,15 +39,19 @@ - - Application - false -- true - MultiByte - - - Application - false -- true - MultiByte - -+ -+ true -+ -+ -+ true -+ - - - -diff --git a/build/VS2010/fullbench/fullbench.vcxproj b/build/VS2010/fullbench/fullbench.vcxproj -index 2bff4ca..ced4047 100644 ---- a/build/VS2010/fullbench/fullbench.vcxproj -+++ b/build/VS2010/fullbench/fullbench.vcxproj -@@ -39,15 +39,19 @@ - - Application - false -- true - MultiByte - - - Application - false -- true - MultiByte - -+ -+ true -+ -+ -+ true -+ - - - -diff --git a/build/VS2010/fuzzer/fuzzer.vcxproj b/build/VS2010/fuzzer/fuzzer.vcxproj -index 12a4b93..227efd1 100644 ---- a/build/VS2010/fuzzer/fuzzer.vcxproj -+++ b/build/VS2010/fuzzer/fuzzer.vcxproj -@@ -39,15 +39,19 @@ - - Application - false -- true - MultiByte - - - Application - false -- true - MultiByte - -+ -+ true -+ -+ -+ true -+ - - - -diff --git a/build/VS2010/libzstd-dll/libzstd-dll.vcxproj b/build/VS2010/libzstd-dll/libzstd-dll.vcxproj -index 364b3be..b227320 100644 ---- a/build/VS2010/libzstd-dll/libzstd-dll.vcxproj -+++ b/build/VS2010/libzstd-dll/libzstd-dll.vcxproj -@@ -94,15 +94,19 @@ - - DynamicLibrary - false -- true - MultiByte - - - DynamicLibrary - false -- true - MultiByte - -+ -+ true -+ -+ -+ true -+ - - - -diff --git a/build/VS2010/libzstd/libzstd.vcxproj b/build/VS2010/libzstd/libzstd.vcxproj -index 6087d73..51a0572 100644 
---- a/build/VS2010/libzstd/libzstd.vcxproj -+++ b/build/VS2010/libzstd/libzstd.vcxproj -@@ -91,15 +91,19 @@ - - StaticLibrary - false -- true - MultiByte - - - StaticLibrary - false -- true - MultiByte - -+ -+ true -+ -+ -+ true -+ - - - -diff --git a/build/VS2010/zstd/zstd.vcxproj b/build/VS2010/zstd/zstd.vcxproj -index 438dc61..834ae01 100644 ---- a/build/VS2010/zstd/zstd.vcxproj -+++ b/build/VS2010/zstd/zstd.vcxproj -@@ -100,15 +100,19 @@ - - Application - false -- true - MultiByte - - - Application - false -- true - MultiByte - -+ -+ true -+ -+ -+ true -+ - - - diff --git a/cpp/cmake_modules/FindArrow.cmake b/cpp/cmake_modules/FindArrow.cmake index 12f76b6c2b3e8..bce4404a47809 100644 --- a/cpp/cmake_modules/FindArrow.cmake +++ b/cpp/cmake_modules/FindArrow.cmake @@ -97,8 +97,8 @@ if (ARROW_INCLUDE_DIR AND ARROW_LIBS) set(ARROW_SHARED_IMP_LIB ${ARROW_LIBS}/${ARROW_LIB_NAME}.lib) set(ARROW_PYTHON_SHARED_IMP_LIB ${ARROW_PYTHON_LIBS}/${ARROW_PYTHON_LIB_NAME}.lib) else() - set(ARROW_STATIC_LIB ${ARROW_PYTHON_LIB_PATH}/lib${ARROW_LIB_NAME}.a) - set(ARROW_PYTHON_STATIC_LIB ${ARROW_PYTHON_LIB_PATH}/lib${ARROW_PYTHON_LIB_NAME}.a) + set(ARROW_STATIC_LIB ${ARROW_LIBS}/lib${ARROW_LIB_NAME}.a) + set(ARROW_PYTHON_STATIC_LIB ${ARROW_LIBS}/lib${ARROW_PYTHON_LIB_NAME}.a) set(ARROW_SHARED_LIB ${ARROW_LIBS}/lib${ARROW_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) set(ARROW_PYTHON_SHARED_LIB ${ARROW_LIBS}/lib${ARROW_PYTHON_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) diff --git a/cpp/cmake_modules/FindClangTools.cmake b/cpp/cmake_modules/FindClangTools.cmake index 0e9430ba29195..e9221ff22dc1f 100644 --- a/cpp/cmake_modules/FindClangTools.cmake +++ b/cpp/cmake_modules/FindClangTools.cmake @@ -49,16 +49,47 @@ else() message("clang-tidy found at ${CLANG_TIDY_BIN}") endif() -find_program(CLANG_FORMAT_BIN - NAMES clang-format-4.0 - clang-format-3.9 - clang-format-3.8 - clang-format-3.7 - clang-format-3.6 - clang-format - PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin - 
NO_DEFAULT_PATH -) +if (CLANG_FORMAT_VERSION) + find_program(CLANG_FORMAT_BIN + NAMES clang-format-${CLANG_FORMAT_VERSION} + PATHS + ${ClangTools_PATH} + $ENV{CLANG_TOOLS_PATH} + /usr/local/bin /usr/bin + NO_DEFAULT_PATH + ) + + # If not found yet, search alternative locations + if (("${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND") AND APPLE) + # Homebrew ships older LLVM versions in /usr/local/opt/llvm@version/ + STRING(REGEX REPLACE "^([0-9]+)\\.[0-9]+" "\\1" CLANG_FORMAT_MAJOR_VERSION "${CLANG_FORMAT_VERSION}") + STRING(REGEX REPLACE "^[0-9]+\\.([0-9]+)" "\\1" CLANG_FORMAT_MINOR_VERSION "${CLANG_FORMAT_VERSION}") + if ("${CLANG_FORMAT_MINOR_VERSION}" STREQUAL "0") + find_program(CLANG_FORMAT_BIN + NAMES clang-format + PATHS /usr/local/opt/llvm@${CLANG_FORMAT_MAJOR_VERSION}/bin + NO_DEFAULT_PATH + ) + else() + find_program(CLANG_FORMAT_BIN + NAMES clang-format + PATHS /usr/local/opt/llvm@${CLANG_FORMAT_VERSION}/bin + NO_DEFAULT_PATH + ) + endif() + endif() +else() + find_program(CLANG_FORMAT_BIN + NAMES clang-format-4.0 + clang-format-3.9 + clang-format-3.8 + clang-format-3.7 + clang-format-3.6 + clang-format + PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin + NO_DEFAULT_PATH + ) +endif() if ( "${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND" ) set(CLANG_FORMAT_FOUND 0) diff --git a/cpp/cmake_modules/FindProtobuf.cmake b/cpp/cmake_modules/FindProtobuf.cmake new file mode 100644 index 0000000000000..a42f4493af494 --- /dev/null +++ b/cpp/cmake_modules/FindProtobuf.cmake @@ -0,0 +1,89 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# PROTOBUF_HOME environmental variable is used to check for Protobuf headers and static library + +# PROTOBUF_INCLUDE_DIR: directory containing headers +# PROTOBUF_LIBS: directory containing Protobuf libraries +# PROTOBUF_STATIC_LIB: location of protobuf.a +# PROTOC_STATIC_LIB: location of protoc.a +# PROTOBUF_EXECUTABLE: location of protoc +# PROTOBUF_FOUND is set if Protobuf is found + + +if( NOT "${PROTOBUF_HOME}" STREQUAL "") + file (TO_CMAKE_PATH "${PROTOBUF_HOME}" _protobuf_path) +endif() + +message (STATUS "PROTOBUF_HOME: ${PROTOBUF_HOME}") + +find_path (PROTOBUF_INCLUDE_DIR google/protobuf/io/zero_copy_stream.h HINTS + ${_protobuf_path} + NO_DEFAULT_PATH + PATH_SUFFIXES "include") + +find_path (PROTOBUF_INCLUDE_DIR google/protobuf/io/coded_stream.h HINTS + ${_protobuf_path} + NO_DEFAULT_PATH + PATH_SUFFIXES "include") + +find_library (PROTOBUF_LIBRARY NAMES protobuf PATHS + ${_protobuf_path} + NO_DEFAULT_PATH + PATH_SUFFIXES "lib") + +find_library (PROTOC_LIBRARY NAMES protoc PATHS + ${_protobuf_path} + NO_DEFAULT_PATH + PATH_SUFFIXES "lib") + +find_program(PROTOBUF_EXECUTABLE protoc HINTS + ${_protobuf_path} + NO_DEFAULT_PATH + PATH_SUFFIXES "bin") + +if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOC_LIBRARY AND PROTOBUF_EXECUTABLE) + set (PROTOBUF_FOUND TRUE) + get_filename_component (PROTOBUF_LIBS ${PROTOBUF_LIBRARY} PATH) + set (PROTOBUF_LIB_NAME protobuf) + set (PROTOC_LIB_NAME protoc) + set (PROTOBUF_STATIC_LIB ${PROTOBUF_LIBS}/${CMAKE_STATIC_LIBRARY_PREFIX}${PROTOBUF_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}) + set (PROTOC_STATIC_LIB 
${PROTOBUF_LIBS}/${CMAKE_STATIC_LIBRARY_PREFIX}${PROTOC_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}) +else () + set (PROTOBUF_FOUND FALSE) +endif () + +if (PROTOBUF_FOUND) + message (STATUS "Found the Protobuf headers: ${PROTOBUF_INCLUDE_DIR}") + message (STATUS "Found the Protobuf library: ${PROTOBUF_STATIC_LIB}") + message (STATUS "Found the Protoc library: ${PROTOC_STATIC_LIB}") + message (STATUS "Found the Protoc executable: ${PROTOBUF_EXECUTABLE}") +else() + if (_protobuf_path) + set (PROTOBUF_ERR_MSG "Could not find Protobuf. Looked in ${_protobuf_path}.") + else () + set (PROTOBUF_ERR_MSG "Could not find Protobuf in system search paths.") + endif() + + if (Protobuf_FIND_REQUIRED) + message (FATAL_ERROR "${PROTOBUF_ERR_MSG}") + else () + message (STATUS "${PROTOBUF_ERR_MSG}") + endif () +endif() + +mark_as_advanced ( + PROTOBUF_INCLUDE_DIR + PROTOBUF_LIBS + PROTOBUF_STATIC_LIB + PROTOC_STATIC_LIB +) diff --git a/cpp/cmake_modules/FindThrift.cmake b/cpp/cmake_modules/FindThrift.cmake new file mode 100644 index 0000000000000..25f50825c3735 --- /dev/null +++ b/cpp/cmake_modules/FindThrift.cmake @@ -0,0 +1,102 @@ +# Copyright 2012 Cloudera Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# - Find Thrift (a cross platform RPC lib/tool) +# +# Variables used by this module, they can change the default behaviour and need +# to be set before calling find_package: +# +# Thrift_HOME - When set, this path is inspected instead of standard library +# locations as the root of the Thrift installation. +# The environment variable THRIFT_HOME overrides this variable. +# +# This module defines +# THRIFT_VERSION, version string of ant if found +# THRIFT_INCLUDE_DIR, where to find THRIFT headers +# THRIFT_CONTRIB_DIR, where contrib thrift files (e.g. fb303.thrift) are installed +# THRIFT_STATIC_LIB, THRIFT static library +# THRIFT_FOUND, If false, do not try to use ant + +# prefer the thrift version supplied in THRIFT_HOME +if( NOT "${THRIFT_HOME}" STREQUAL "") + file( TO_CMAKE_PATH "${THRIFT_HOME}" _native_path ) + list( APPEND _thrift_roots ${_native_path} ) +elseif ( Thrift_HOME ) + list( APPEND _thrift_roots ${Thrift_HOME} ) +endif() + +message(STATUS "THRIFT_HOME: ${THRIFT_HOME}") +find_path(THRIFT_INCLUDE_DIR thrift/Thrift.h HINTS + ${_thrift_roots} + NO_DEFAULT_PATH + PATH_SUFFIXES "include" +) + +find_path(THRIFT_CONTRIB_DIR share/fb303/if/fb303.thrift HINTS + ${_thrift_roots} + NO_DEFAULT_PATH +) + +if (MSVC AND NOT THRIFT_MSVC_STATIC_LIB_SUFFIX) + set(THRIFT_MSVC_STATIC_LIB_SUFFIX md) +endif() + +find_library(THRIFT_STATIC_LIB NAMES + ${CMAKE_STATIC_LIBRARY_PREFIX}thrift${THRIFT_MSVC_STATIC_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX} + HINTS ${_thrift_roots} + NO_DEFAULT_PATH + PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" +) + +find_program(THRIFT_COMPILER thrift HINTS + ${_thrift_roots} + NO_DEFAULT_PATH + PATH_SUFFIXES "bin" +) + +if (THRIFT_STATIC_LIB) + set(THRIFT_FOUND TRUE) + exec_program(${THRIFT_COMPILER} + ARGS -version OUTPUT_VARIABLE THRIFT_VERSION RETURN_VALUE THRIFT_RETURN) +else () + set(THRIFT_FOUND FALSE) +endif () + +if (THRIFT_FOUND) + if (NOT Thrift_FIND_QUIETLY) + message(STATUS "Thrift version: ${THRIFT_VERSION}") + endif 
() +else () + if (NOT Thrift_FIND_QUIETLY) + set(THRIFT_ERR_MSG "Thrift compiler/libraries NOT found: ${THRIFT_RETURN}") + set(THRIFT_ERR_MSG "${THRIFT_ERR_MSG} (${THRIFT_INCLUDE_DIR}, ${THRIFT_STATIC_LIB}).") + if ( _thrift_roots ) + set(THRIFT_ERR_MSG "${THRIFT_ERR_MSG} Looked in ${_thrift_roots}.") + else () + set(THRIFT_ERR_MSG "${THRIFT_ERR_MSG} Looked in system search paths.") + endif () + if ( Thrift_FIND_REQUIRED ) + message(FATAL_ERROR "${THRIFT_ERR_MSG}") + else () + message(STATUS "${THRIFT_ERR_MSG}") + endif () + endif () +endif () + + +mark_as_advanced( + THRIFT_STATIC_LIB + THRIFT_COMPILER + THRIFT_INCLUDE_DIR +) diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 4b1950f7a17bd..97aed6b274976 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -34,6 +34,14 @@ if (MSVC) # headers will see dllimport add_definitions(-DARROW_EXPORTING) + # ARROW-1931 See https://github.com/google/googletest/issues/1318 + # + # This is added to CMAKE_CXX_FLAGS instead of CXX_COMMON_FLAGS since only the + # former is passed into the external projects + if (MSVC_VERSION VERSION_GREATER 1900) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_SILENCE_TR1_NAMESPACE_DEPRECATION_WARNING") + endif() + if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") # clang-cl set(CXX_COMMON_FLAGS "-EHsc") @@ -47,6 +55,18 @@ if (MSVC) # Set desired warning level (e.g. 
set /W4 for more warnings) set(CXX_COMMON_FLAGS "/W3") endif() + + if (ARROW_USE_STATIC_CRT) + foreach (c_flag CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_DEBUG + CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) + string(REPLACE "/MD" "-MT" ${c_flag} "${${c_flag}}") + endforeach() + endif() + + # Support large object code + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /bigobj") else() # Common flags set below with warning level set(CXX_COMMON_FLAGS "") diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 42d7eddc9c9d9..4f64434170655 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -24,12 +24,14 @@ set(GFLAGS_VERSION "2.2.0") set(GTEST_VERSION "1.8.0") set(GBENCHMARK_VERSION "1.1.0") set(FLATBUFFERS_VERSION "1.7.1") -set(JEMALLOC_VERSION "4.4.0") +set(JEMALLOC_VERSION "17c897976c60b0e6e4f4a365c751027244dada7a") set(SNAPPY_VERSION "1.1.3") set(BROTLI_VERSION "v0.6.0") set(LZ4_VERSION "1.7.5") set(ZSTD_VERSION "1.2.0") +set(PROTOBUF_VERSION "2.6.0") set(GRPC_VERSION "94582910ad7f82ad447ecc72e6548cb669e4f7a9") # v1.6.5 +set(ORC_VERSION "cf00b67795717ab3eb04e950780ed6d104109017") string(TOUPPER ${CMAKE_BUILD_TYPE} UPPERCASE_BUILD_TYPE) @@ -130,6 +132,9 @@ endif() set(Boost_DEBUG TRUE) set(Boost_USE_MULTITHREADED ON) +if (MSVC AND ARROW_USE_STATIC_CRT) + set(Boost_USE_STATIC_RUNTIME ON) +endif() set(Boost_ADDITIONAL_VERSIONS "1.65.0" "1.65" "1.64.0" "1.64" @@ -204,11 +209,11 @@ else() else() find_package(Boost COMPONENTS system filesystem REQUIRED) if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") - set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) - set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) + set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) + 
set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) else() - set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) - set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) + set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) + set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) endif() set(BOOST_SYSTEM_LIBRARY boost_system_shared) set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_shared) @@ -222,11 +227,11 @@ else() else() find_package(Boost COMPONENTS system filesystem REQUIRED) if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") - set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) - set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) + set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) + set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) else() - set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) - set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) + set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) + set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) endif() set(BOOST_SYSTEM_LIBRARY boost_system_static) set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static) @@ -272,8 +277,10 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) set(GTEST_VENDORED 1) set(GTEST_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${GTEST_PREFIX} - -Dgtest_force_shared_crt=ON -DCMAKE_CXX_FLAGS=${GTEST_CMAKE_CXX_FLAGS}) + if (MSVC AND NOT ARROW_USE_STATIC_CRT) + set(GTEST_CMAKE_ARGS ${GTEST_CMAKE_ARGS} -Dgtest_force_shared_crt=ON) + endif() ExternalProject_Add(googletest_ep URL "https://github.com/google/googletest/archive/release-${GTEST_VERSION}.tar.gz" @@ -320,6 +327,8 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) -DBUILD_TESTING=OFF -BUILD_CONFIG_TESTS=OFF -DINSTALL_HEADERS=ON + -DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_CXX_FLAGS} + 
-DCMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_C_FLAGS} -DCMAKE_CXX_FLAGS=${GFLAGS_CMAKE_CXX_FLAGS}) ExternalProject_Add(gflags_ep @@ -434,6 +443,8 @@ if (ARROW_IPC) "-DCMAKE_CXX_FLAGS=${FLATBUFFERS_CMAKE_CXX_FLAGS}" "-DCMAKE_INSTALL_PREFIX:PATH=${FLATBUFFERS_PREFIX}" "-DFLATBUFFERS_BUILD_TESTS=OFF" + "-DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_CXX_FLAGS}" + "-DCMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_C_FLAGS}" ${EP_LOG_OPTIONS}) set(FLATBUFFERS_INCLUDE_DIR "${FLATBUFFERS_PREFIX}/include") @@ -471,8 +482,8 @@ if (ARROW_JEMALLOC) set(JEMALLOC_STATIC_LIB "${JEMALLOC_PREFIX}/lib/libjemalloc_pic${CMAKE_STATIC_LIBRARY_SUFFIX}") set(JEMALLOC_VENDORED 1) ExternalProject_Add(jemalloc_ep - URL https://github.com/jemalloc/jemalloc/releases/download/${JEMALLOC_VERSION}/jemalloc-${JEMALLOC_VERSION}.tar.bz2 - CONFIGURE_COMMAND ./configure "--prefix=${JEMALLOC_PREFIX}" "--with-jemalloc-prefix=je_arrow_" "--with-private-namespace=je_arrow_private_" + URL ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/jemalloc/${JEMALLOC_VERSION}.tar.gz + CONFIGURE_COMMAND ./autogen.sh "--prefix=${JEMALLOC_PREFIX}" "--with-jemalloc-prefix=je_arrow_" "--with-private-namespace=je_arrow_private_" && touch doc/jemalloc.html && touch doc/jemalloc.3 ${EP_LOG_OPTIONS} BUILD_IN_SOURCE 1 BUILD_COMMAND ${MAKE} @@ -550,6 +561,8 @@ if (ARROW_WITH_ZLIB) set(ZLIB_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${ZLIB_PREFIX} -DCMAKE_C_FLAGS=${EP_C_FLAGS} + -DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_CXX_FLAGS} + -DCMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_C_FLAGS} -DBUILD_SHARED_LIBS=OFF) ExternalProject_Add(zlib_ep @@ -599,7 +612,9 @@ if (ARROW_WITH_SNAPPY) if (MSVC) set(SNAPPY_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} "-DCMAKE_CXX_FLAGS=${EP_CXX_FLAGS}" - "-DCMAKE_C_FLAGS=${EX_C_FLAGS}" + "-DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_CXX_FLAGS}" + "-DCMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_C_FLAGS}" + "-DCMAKE_C_FLAGS=${EP_C_FLAGS}" 
"-DCMAKE_INSTALL_PREFIX=${SNAPPY_PREFIX}") set(SNAPPY_UPDATE_COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/cmake_modules/SnappyCMakeLists.txt @@ -659,7 +674,9 @@ if (ARROW_WITH_BROTLI) set(BROTLI_STATIC_LIBRARY_COMMON "${BROTLI_PREFIX}/${BROTLI_LIB_DIR}/${CMAKE_LIBRARY_ARCHITECTURE}/${CMAKE_STATIC_LIBRARY_PREFIX}brotlicommon${CMAKE_STATIC_LIBRARY_SUFFIX}") set(BROTLI_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} "-DCMAKE_CXX_FLAGS=${EP_CXX_FLAGS}" - "-DCMAKE_C_FLAGS=${EX_C_FLAGS}" + "-DCMAKE_C_FLAGS=${EP_C_FLAGS}" + "-DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_CXX_FLAGS}" + "-DCMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_C_FLAGS}" -DCMAKE_INSTALL_PREFIX=${BROTLI_PREFIX} -DCMAKE_INSTALL_LIBDIR=lib/${CMAKE_LIBRARY_ARCHITECTURE} -DBUILD_SHARED_LIBS=OFF) @@ -706,12 +723,21 @@ if (ARROW_WITH_LZ4) if("${LZ4_HOME}" STREQUAL "") set(LZ4_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/lz4_ep-prefix/src/lz4_ep") + set(LZ4_HOME "${LZ4_BUILD_DIR}") set(LZ4_INCLUDE_DIR "${LZ4_BUILD_DIR}/lib") if (MSVC) + if (ARROW_USE_STATIC_CRT) + if (${UPPERCASE_BUILD_TYPE} STREQUAL "DEBUG") + set(LZ4_RUNTIME_LIBRARY_LINKAGE "/p:RuntimeLibrary=MultiThreadedDebug") + else() + set(LZ4_RUNTIME_LIBRARY_LINKAGE "/p:RuntimeLibrary=MultiThreaded") + endif() + endif() set(LZ4_STATIC_LIB "${LZ4_BUILD_DIR}/visual/VS2010/bin/x64_${CMAKE_BUILD_TYPE}/liblz4_static.lib") - set(LZ4_BUILD_COMMAND BUILD_COMMAND msbuild.exe /m /p:Configuration=${CMAKE_BUILD_TYPE} /p:Platform=x64 /p:PlatformToolset=v140 /t:Build ${LZ4_BUILD_DIR}/visual/VS2010/lz4.sln) - set(LZ4_PATCH_COMMAND PATCH_COMMAND git --git-dir=. apply --verbose --whitespace=fix ${CMAKE_SOURCE_DIR}/build-support/lz4_msbuild_wholeprogramoptimization_param.patch) + set(LZ4_BUILD_COMMAND BUILD_COMMAND msbuild.exe /m /p:Configuration=${CMAKE_BUILD_TYPE} /p:Platform=x64 /p:PlatformToolset=v140 + ${LZ4_RUNTIME_LIBRARY_LINKAGE} /t:Build ${LZ4_BUILD_DIR}/visual/VS2010/lz4.sln) + set(LZ4_PATCH_COMMAND PATCH_COMMAND git --git-dir=. 
apply --verbose --whitespace=fix ${CMAKE_SOURCE_DIR}/build-support/lz4_msbuild_gl_runtimelibrary_params.patch) else() set(LZ4_STATIC_LIB "${LZ4_BUILD_DIR}/lib/liblz4.a") set(LZ4_BUILD_COMMAND BUILD_COMMAND ${CMAKE_SOURCE_DIR}/build-support/build-lz4-lib.sh) @@ -753,9 +779,18 @@ if (ARROW_WITH_ZSTD) set(ZSTD_INCLUDE_DIR "${ZSTD_BUILD_DIR}/lib") if (MSVC) + if (ARROW_USE_STATIC_CRT) + if (${UPPERCASE_BUILD_TYPE} STREQUAL "DEBUG") + set(ZSTD_RUNTIME_LIBRARY_LINKAGE "/p:RuntimeLibrary=MultiThreadedDebug") + else() + set(ZSTD_RUNTIME_LIBRARY_LINKAGE "/p:RuntimeLibrary=MultiThreaded") + endif() + endif() set(ZSTD_STATIC_LIB "${ZSTD_BUILD_DIR}/build/VS2010/bin/x64_${CMAKE_BUILD_TYPE}/libzstd_static.lib") - set(ZSTD_BUILD_COMMAND BUILD_COMMAND msbuild ${ZSTD_BUILD_DIR}/build/VS2010/zstd.sln /t:Build /v:minimal /p:Configuration=${CMAKE_BUILD_TYPE} /p:Platform=x64 /p:PlatformToolset=v140 /p:OutDir=${ZSTD_BUILD_DIR}/build/VS2010/bin/x64_${CMAKE_BUILD_TYPE}/ /p:SolutionDir=${ZSTD_BUILD_DIR}/build/VS2010/ ) - set(ZSTD_PATCH_COMMAND PATCH_COMMAND git --git-dir=. apply --verbose --whitespace=fix ${CMAKE_SOURCE_DIR}/build-support/zstd_msbuild_wholeprogramoptimization_param.patch) + set(ZSTD_BUILD_COMMAND BUILD_COMMAND msbuild ${ZSTD_BUILD_DIR}/build/VS2010/zstd.sln /t:Build /v:minimal /p:Configuration=${CMAKE_BUILD_TYPE} + ${ZSTD_RUNTIME_LIBRARY_LINKAGE} /p:Platform=x64 /p:PlatformToolset=v140 + /p:OutDir=${ZSTD_BUILD_DIR}/build/VS2010/bin/x64_${CMAKE_BUILD_TYPE}/ /p:SolutionDir=${ZSTD_BUILD_DIR}/build/VS2010/ ) + set(ZSTD_PATCH_COMMAND PATCH_COMMAND git --git-dir=. 
apply --verbose --whitespace=fix ${CMAKE_SOURCE_DIR}/build-support/zstd_msbuild_gl_runtimelibrary_params.patch) else() set(ZSTD_STATIC_LIB "${ZSTD_BUILD_DIR}/lib/libzstd.a") set(ZSTD_BUILD_COMMAND BUILD_COMMAND ${CMAKE_SOURCE_DIR}/build-support/build-zstd-lib.sh) @@ -802,7 +837,7 @@ if (ARROW_WITH_GRPC) set(GRPC_STATIC_LIBRARY_GRPCPP "${GRPC_BUILD_DIR}/${CMAKE_CFG_INTDIR}/${CMAKE_STATIC_LIBRARY_PREFIX}grpc++${CMAKE_STATIC_LIBRARY_SUFFIX}") set(GRPC_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} "-DCMAKE_CXX_FLAGS=${EP_CXX_FLAGS}" - "-DCMAKE_C_FLAGS=${EX_C_FLAGS}" + "-DCMAKE_C_FLAGS=${EP_C_FLAGS}" -DCMAKE_INSTALL_PREFIX=${GRPC_PREFIX} -DBUILD_SHARED_LIBS=OFF) @@ -833,3 +868,73 @@ if (ARROW_WITH_GRPC) endif() endif() + +if (ARROW_ORC) + # protobuf + if ("${PROTOBUF_HOME}" STREQUAL "") + set (PROTOBUF_PREFIX "${THIRDPARTY_DIR}/protobuf_ep-install") + set (PROTOBUF_HOME "${PROTOBUF_PREFIX}") + set (PROTOBUF_INCLUDE_DIR "${PROTOBUF_PREFIX}/include") + set (PROTOBUF_STATIC_LIB "${PROTOBUF_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}protobuf${CMAKE_STATIC_LIBRARY_SUFFIX}") + set (PROTOBUF_SRC_URL "https://github.com/google/protobuf/releases/download/v${PROTOBUF_VERSION}/protobuf-${PROTOBUF_VERSION}.tar.gz") + + ExternalProject_Add(protobuf_ep + CONFIGURE_COMMAND "./configure" "--disable-shared" "--prefix=${PROTOBUF_PREFIX}" "CXXFLAGS=${EP_CXX_FLAGS}" + BUILD_IN_SOURCE 1 + URL ${PROTOBUF_SRC_URL} + LOG_DOWNLOAD 1 + LOG_CONFIGURE 1 + LOG_BUILD 1 + LOG_INSTALL 1 + BUILD_BYPRODUCTS "${PROTOBUF_STATIC_LIB}") + + set (PROTOBUF_VENDORED 1) + else () + find_package (Protobuf REQUIRED) + set (PROTOBUF_VENDORED 0) + endif () + + include_directories (SYSTEM ${PROTOBUF_INCLUDE_DIR}) + ADD_THIRDPARTY_LIB(protobuf + STATIC_LIB ${PROTOBUF_STATIC_LIB}) + + if (PROTOBUF_VENDORED) + add_dependencies (protobuf protobuf_ep) + endif () + + # orc + set(ORC_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/orc_ep-install") + set(ORC_HOME "${ORC_PREFIX}") + set(ORC_INCLUDE_DIR "${ORC_PREFIX}/include") + 
set(ORC_STATIC_LIB "${ORC_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}orc${CMAKE_STATIC_LIBRARY_SUFFIX}") + + # Since LZ4 isn't installed, the header file is in ${LZ4_HOME}/lib instead of + # ${LZ4_HOME}/include, which forces us to specify the include directory + # manually as well. + set (ORC_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_INSTALL_PREFIX=${ORC_PREFIX} + -DCMAKE_CXX_FLAGS=${EP_CXX_FLAGS} + -DBUILD_LIBHDFSPP=OFF + -DBUILD_JAVA=OFF + -DBUILD_TOOLS=OFF + -DBUILD_CPP_TESTS=OFF + -DINSTALL_VENDORED_LIBS=OFF + -DPROTOBUF_HOME=${PROTOBUF_HOME} + -DLZ4_HOME=${LZ4_HOME} + -DLZ4_INCLUDE_DIR=${LZ4_INCLUDE_DIR} + -DSNAPPY_HOME=${SNAPPY_HOME} + -DZLIB_HOME=${ZLIB_HOME}) + + ExternalProject_Add(orc_ep + GIT_REPOSITORY "https://github.com/apache/orc" + GIT_TAG ${ORC_VERSION} + BUILD_BYPRODUCTS ${ORC_STATIC_LIB} + CMAKE_ARGS ${ORC_CMAKE_ARGS}) + + include_directories(SYSTEM ${ORC_INCLUDE_DIR}) + ADD_THIRDPARTY_LIB(orc + STATIC_LIB ${ORC_STATIC_LIB}) + + add_dependencies(orc_ep protobuf lz4_static snappy zlib) + add_dependencies(orc orc_ep) +endif() diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 22b475146da7c..ad86256e0be34 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -22,6 +22,7 @@ set(ARROW_SRCS compare.cc memory_pool.cc pretty_print.cc + record_batch.cc status.cc table.cc table_builder.cc @@ -37,6 +38,7 @@ set(ARROW_SRCS util/compression.cc util/cpu-info.cc util/decimal.cc + util/hash.cc util/key_value_metadata.cc ) @@ -50,8 +52,10 @@ endif() if (ARROW_COMPUTE) add_subdirectory(compute) set(ARROW_SRCS ${ARROW_SRCS} - compute/cast.cc compute/context.cc + compute/kernels/cast.cc + compute/kernels/hash.cc + compute/kernels/util-internal.cc ) endif() @@ -86,6 +90,11 @@ if (ARROW_WITH_ZSTD) SET(ARROW_SRCS util/compression_zstd.cc ${ARROW_SRCS}) endif() +if (ARROW_ORC) + add_subdirectory(adapters/orc) + SET(ARROW_SRCS adapters/orc/adapter.cc ${ARROW_SRCS}) +endif() + if (NOT 
ARROW_BOOST_HEADER_ONLY) set(ARROW_SRCS ${ARROW_SRCS} io/hdfs.cc @@ -142,6 +151,7 @@ install(FILES compare.h memory_pool.h pretty_print.h + record_batch.h status.h table.h table_builder.h @@ -171,6 +181,7 @@ ADD_ARROW_TEST(array-test) ADD_ARROW_TEST(buffer-test) ADD_ARROW_TEST(memory_pool-test) ADD_ARROW_TEST(pretty_print-test) +ADD_ARROW_TEST(public-api-test) ADD_ARROW_TEST(status-test) ADD_ARROW_TEST(type-test) ADD_ARROW_TEST(table-test) diff --git a/cpp/src/arrow/adapters/orc/CMakeLists.txt b/cpp/src/arrow/adapters/orc/CMakeLists.txt new file mode 100644 index 0000000000000..eb7194cd42113 --- /dev/null +++ b/cpp/src/arrow/adapters/orc/CMakeLists.txt @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +####################################### +# arrow_orc +####################################### + +# Headers: top level +install(FILES + adapter.h + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/adapters/orc") diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc new file mode 100644 index 0000000000000..473c90f925124 --- /dev/null +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -0,0 +1,697 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/adapters/orc/adapter.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/builder.h" +#include "arrow/io/interfaces.h" +#include "arrow/memory_pool.h" +#include "arrow/record_batch.h" +#include "arrow/status.h" +#include "arrow/table.h" +#include "arrow/table_builder.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/decimal.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +#include "orc/OrcFile.hh" + +// alias to not interfere with nested orc namespace +namespace liborc = orc; + +namespace arrow { +namespace adapters { +namespace orc { + +#define ORC_THROW_NOT_OK(s) \ + do { \ + Status _s = (s); \ + if (!_s.ok()) { \ + std::stringstream ss; \ + ss << "Arrow error: " << _s.ToString(); \ + throw liborc::ParseError(ss.str()); \ + } \ + } while (0) + +class ArrowInputFile : public liborc::InputStream { + public: + explicit ArrowInputFile(const std::shared_ptr& file) + : file_(file) {} + + uint64_t getLength() const override { + int64_t size; + ORC_THROW_NOT_OK(file_->GetSize(&size)); + return static_cast(size); + } + + uint64_t getNaturalReadSize() const override { return 128 * 1024; } + + void read(void* buf, uint64_t length, uint64_t offset) override { + int64_t bytes_read; + + ORC_THROW_NOT_OK(file_->ReadAt(offset, length, &bytes_read, buf)); + + if (static_cast(bytes_read) != length) { + throw liborc::ParseError("Short read from arrow input file"); + } + } + + const std::string& getName() const override { + static const std::string filename("ArrowInputFile"); + return filename; + } + + private: + std::shared_ptr file_; +}; + +struct StripeInformation { + uint64_t offset; + uint64_t length; + uint64_t num_rows; +}; + +Status GetArrowType(const liborc::Type* type, std::shared_ptr* out) { + // When subselecting fields on read, liborc will set some nodes to nullptr, + // so we need to check for 
nullptr before progressing + if (type == nullptr) { + *out = null(); + return Status::OK(); + } + liborc::TypeKind kind = type->getKind(); + switch (kind) { + case liborc::BOOLEAN: + *out = boolean(); + break; + case liborc::BYTE: + *out = int8(); + break; + case liborc::SHORT: + *out = int16(); + break; + case liborc::INT: + *out = int32(); + break; + case liborc::LONG: + *out = int64(); + break; + case liborc::FLOAT: + *out = float32(); + break; + case liborc::DOUBLE: + *out = float64(); + break; + case liborc::VARCHAR: + case liborc::STRING: + *out = utf8(); + break; + case liborc::BINARY: + *out = binary(); + break; + case liborc::CHAR: + *out = fixed_size_binary(type->getMaximumLength()); + break; + case liborc::TIMESTAMP: + *out = timestamp(TimeUnit::NANO); + break; + case liborc::DATE: + *out = date32(); + break; + case liborc::DECIMAL: { + if (type->getPrecision() == 0) { + // In HIVE 0.11/0.12 precision is set as 0, but means max precision + *out = decimal(38, 6); + } else { + *out = decimal(type->getPrecision(), type->getScale()); + } + break; + } + case liborc::LIST: { + if (type->getSubtypeCount() != 1) { + return Status::Invalid("Invalid Orc List type"); + } + std::shared_ptr elemtype; + RETURN_NOT_OK(GetArrowType(type->getSubtype(0), &elemtype)); + *out = list(elemtype); + break; + } + case liborc::MAP: { + if (type->getSubtypeCount() != 2) { + return Status::Invalid("Invalid Orc Map type"); + } + std::shared_ptr keytype; + std::shared_ptr valtype; + RETURN_NOT_OK(GetArrowType(type->getSubtype(0), &keytype)); + RETURN_NOT_OK(GetArrowType(type->getSubtype(1), &valtype)); + *out = list(struct_({field("key", keytype), field("value", valtype)})); + break; + } + case liborc::STRUCT: { + int size = type->getSubtypeCount(); + std::vector> fields; + for (int child = 0; child < size; ++child) { + std::shared_ptr elemtype; + RETURN_NOT_OK(GetArrowType(type->getSubtype(child), &elemtype)); + std::string name = type->getFieldName(child); + 
fields.push_back(field(name, elemtype)); + } + *out = struct_(fields); + break; + } + case liborc::UNION: { + int size = type->getSubtypeCount(); + std::vector> fields; + std::vector type_codes; + for (int child = 0; child < size; ++child) { + std::shared_ptr elemtype; + RETURN_NOT_OK(GetArrowType(type->getSubtype(child), &elemtype)); + fields.push_back(field("_union_" + std::to_string(child), elemtype)); + type_codes.push_back(static_cast(child)); + } + *out = union_(fields, type_codes); + break; + } + default: { + std::stringstream ss; + ss << "Unknown Orc type kind: " << kind; + return Status::Invalid(ss.str()); + } + } + return Status::OK(); +} + +// The number of rows to read in a ColumnVectorBatch +constexpr int64_t kReadRowsBatch = 1000; + +// The number of nanoseconds in a second +constexpr int64_t kOneSecondNanos = 1000000000LL; + +class ORCFileReader::Impl { + public: + Impl() {} + ~Impl() {} + + Status Open(const std::shared_ptr& file, MemoryPool* pool) { + std::unique_ptr io_wrapper(new ArrowInputFile(file)); + liborc::ReaderOptions options; + std::unique_ptr liborc_reader; + try { + liborc_reader = createReader(std::move(io_wrapper), options); + } catch (const liborc::ParseError& e) { + return Status::IOError(e.what()); + } + pool_ = pool; + reader_ = std::move(liborc_reader); + + return Init(); + } + + Status Init() { + int64_t nstripes = reader_->getNumberOfStripes(); + stripes_.resize(nstripes); + std::unique_ptr stripe; + for (int i = 0; i < nstripes; ++i) { + stripe = reader_->getStripe(i); + stripes_[i] = StripeInformation( + {stripe->getOffset(), stripe->getLength(), stripe->getNumberOfRows()}); + } + return Status::OK(); + } + + int64_t NumberOfStripes() { return stripes_.size(); } + + int64_t NumberOfRows() { return reader_->getNumberOfRows(); } + + Status ReadSchema(std::shared_ptr* out) { + const liborc::Type& type = reader_->getType(); + return GetArrowSchema(type, out); + } + + Status GetArrowSchema(const liborc::Type& type, 
std::shared_ptr* out) { + if (type.getKind() != liborc::STRUCT) { + return Status::NotImplemented( + "Only ORC files with a top-level struct " + "can be handled"); + } + int size = type.getSubtypeCount(); + std::vector> fields; + for (int child = 0; child < size; ++child) { + std::shared_ptr elemtype; + RETURN_NOT_OK(GetArrowType(type.getSubtype(child), &elemtype)); + std::string name = type.getFieldName(child); + fields.push_back(field(name, elemtype)); + } + std::list keys = reader_->getMetadataKeys(); + std::shared_ptr metadata; + if (!keys.empty()) { + metadata = std::make_shared(); + for (auto it = keys.begin(); it != keys.end(); ++it) { + metadata->Append(*it, reader_->getMetadataValue(*it)); + } + } + + *out = std::make_shared(fields, metadata); + return Status::OK(); + } + + Status Read(std::shared_ptr* out) { + liborc::RowReaderOptions opts; + return ReadTable(opts, out); + } + + Status Read(const std::vector& include_indices, std::shared_ptr
* out) { + liborc::RowReaderOptions opts; + RETURN_NOT_OK(SelectIndices(&opts, include_indices)); + return ReadTable(opts, out); + } + + Status ReadStripe(int64_t stripe, std::shared_ptr* out) { + liborc::RowReaderOptions opts; + RETURN_NOT_OK(SelectStripe(&opts, stripe)); + return ReadBatch(opts, stripes_[stripe].num_rows, out); + } + + Status ReadStripe(int64_t stripe, const std::vector& include_indices, + std::shared_ptr* out) { + liborc::RowReaderOptions opts; + RETURN_NOT_OK(SelectIndices(&opts, include_indices)); + RETURN_NOT_OK(SelectStripe(&opts, stripe)); + return ReadBatch(opts, stripes_[stripe].num_rows, out); + } + + Status SelectStripe(liborc::RowReaderOptions* opts, int64_t stripe) { + if (stripe < 0 || stripe >= NumberOfStripes()) { + std::stringstream ss; + ss << "Out of bounds stripe: " << stripe; + return Status::Invalid(ss.str()); + } + opts->range(stripes_[stripe].offset, stripes_[stripe].length); + return Status::OK(); + } + + Status SelectIndices(liborc::RowReaderOptions* opts, + const std::vector& include_indices) { + std::list include_indices_list; + for (auto it = include_indices.begin(); it != include_indices.end(); ++it) { + if (*it < 0) { + return Status::Invalid("Negative field index"); + } + include_indices_list.push_back(*it); + } + opts->includeTypes(include_indices_list); + return Status::OK(); + } + + Status ReadTable(const liborc::RowReaderOptions& row_opts, + std::shared_ptr
* out) { + liborc::RowReaderOptions opts(row_opts); + std::vector> batches(stripes_.size()); + for (size_t stripe = 0; stripe < stripes_.size(); stripe++) { + opts.range(stripes_[stripe].offset, stripes_[stripe].length); + RETURN_NOT_OK(ReadBatch(opts, stripes_[stripe].num_rows, &batches[stripe])); + } + return Table::FromRecordBatches(batches, out); + } + + Status ReadBatch(const liborc::RowReaderOptions& opts, int64_t nrows, + std::shared_ptr* out) { + std::unique_ptr rowreader; + std::unique_ptr batch; + try { + rowreader = reader_->createRowReader(opts); + batch = rowreader->createRowBatch(std::min(nrows, kReadRowsBatch)); + } catch (const liborc::ParseError& e) { + return Status::Invalid(e.what()); + } + const liborc::Type& type = rowreader->getSelectedType(); + std::shared_ptr schema; + RETURN_NOT_OK(GetArrowSchema(type, &schema)); + + std::unique_ptr builder; + RETURN_NOT_OK(RecordBatchBuilder::Make(schema, pool_, nrows, &builder)); + + // The top-level type must be a struct to read into an arrow table + const auto& struct_batch = static_cast(*batch); + + while (rowreader->next(*batch)) { + for (int i = 0; i < builder->num_fields(); i++) { + RETURN_NOT_OK(AppendBatch(type.getSubtype(i), struct_batch.fields[i], 0, + batch->numElements, builder->GetField(i))); + } + } + RETURN_NOT_OK(builder->Flush(out)); + return Status::OK(); + } + + Status AppendBatch(const liborc::Type* type, liborc::ColumnVectorBatch* batch, + int64_t offset, int64_t length, ArrayBuilder* builder) { + if (type == nullptr) { + return Status::OK(); + } + liborc::TypeKind kind = type->getKind(); + switch (kind) { + case liborc::STRUCT: + return AppendStructBatch(type, batch, offset, length, builder); + case liborc::LIST: + return AppendListBatch(type, batch, offset, length, builder); + case liborc::MAP: + return AppendMapBatch(type, batch, offset, length, builder); + case liborc::LONG: + return AppendNumericBatch( + batch, offset, length, builder); + case liborc::INT: + return 
AppendNumericBatchCast(batch, offset, length, builder); + case liborc::SHORT: + return AppendNumericBatchCast(batch, offset, length, builder); + case liborc::BYTE: + return AppendNumericBatchCast(batch, offset, length, builder); + case liborc::DOUBLE: + return AppendNumericBatch( + batch, offset, length, builder); + case liborc::FLOAT: + return AppendNumericBatchCast(batch, offset, length, builder); + case liborc::BOOLEAN: + return AppendBoolBatch(batch, offset, length, builder); + case liborc::VARCHAR: + case liborc::STRING: + return AppendBinaryBatch(batch, offset, length, builder); + case liborc::BINARY: + return AppendBinaryBatch(batch, offset, length, builder); + case liborc::CHAR: + return AppendFixedBinaryBatch(batch, offset, length, builder); + case liborc::DATE: + return AppendNumericBatchCast(batch, offset, length, builder); + case liborc::TIMESTAMP: + return AppendTimestampBatch(batch, offset, length, builder); + case liborc::DECIMAL: + return AppendDecimalBatch(type, batch, offset, length, builder); + default: + std::stringstream ss; + ss << "Not implemented type kind: " << kind; + return Status::NotImplemented(ss.str()); + } + } + + Status AppendStructBatch(const liborc::Type* type, liborc::ColumnVectorBatch* cbatch, + int64_t offset, int64_t length, ArrayBuilder* abuilder) { + auto builder = static_cast(abuilder); + auto batch = static_cast(cbatch); + + const uint8_t* valid_bytes = nullptr; + if (batch->hasNulls) { + valid_bytes = reinterpret_cast(batch->notNull.data()) + offset; + } + RETURN_NOT_OK(builder->Append(length, valid_bytes)); + + for (int i = 0; i < builder->num_fields(); i++) { + RETURN_NOT_OK(AppendBatch(type->getSubtype(i), batch->fields[i], offset, length, + builder->field_builder(i))); + } + return Status::OK(); + } + + Status AppendListBatch(const liborc::Type* type, liborc::ColumnVectorBatch* cbatch, + int64_t offset, int64_t length, ArrayBuilder* abuilder) { + auto builder = static_cast(abuilder); + auto batch = 
static_cast(cbatch); + liborc::ColumnVectorBatch* elements = batch->elements.get(); + const liborc::Type* elemtype = type->getSubtype(0); + + const bool has_nulls = batch->hasNulls; + for (int i = offset; i < length + offset; i++) { + if (!has_nulls || batch->notNull[i]) { + int64_t start = batch->offsets[i]; + int64_t end = batch->offsets[i + 1]; + RETURN_NOT_OK(builder->Append()); + RETURN_NOT_OK(AppendBatch(elemtype, elements, start, end - start, + builder->value_builder())); + } else { + RETURN_NOT_OK(builder->AppendNull()); + } + } + return Status::OK(); + } + + Status AppendMapBatch(const liborc::Type* type, liborc::ColumnVectorBatch* cbatch, + int64_t offset, int64_t length, ArrayBuilder* abuilder) { + auto list_builder = static_cast(abuilder); + auto struct_builder = static_cast(list_builder->value_builder()); + auto batch = static_cast(cbatch); + liborc::ColumnVectorBatch* keys = batch->keys.get(); + liborc::ColumnVectorBatch* vals = batch->elements.get(); + const liborc::Type* keytype = type->getSubtype(0); + const liborc::Type* valtype = type->getSubtype(1); + + const bool has_nulls = batch->hasNulls; + for (int i = offset; i < length + offset; i++) { + RETURN_NOT_OK(list_builder->Append()); + int64_t start = batch->offsets[i]; + int64_t list_length = batch->offsets[i + 1] - start; + if (list_length && (!has_nulls || batch->notNull[i])) { + RETURN_NOT_OK(struct_builder->Append(list_length, nullptr)); + RETURN_NOT_OK(AppendBatch(keytype, keys, start, list_length, + struct_builder->field_builder(0))); + RETURN_NOT_OK(AppendBatch(valtype, vals, start, list_length, + struct_builder->field_builder(1))); + } + } + return Status::OK(); + } + + template + Status AppendNumericBatch(liborc::ColumnVectorBatch* cbatch, int64_t offset, + int64_t length, ArrayBuilder* abuilder) { + auto builder = static_cast(abuilder); + auto batch = static_cast(cbatch); + + if (length == 0) { + return Status::OK(); + } + const uint8_t* valid_bytes = nullptr; + if (batch->hasNulls) { 
+ valid_bytes = reinterpret_cast(batch->notNull.data()) + offset; + } + const elem_type* source = batch->data.data() + offset; + RETURN_NOT_OK(builder->Append(source, length, valid_bytes)); + return Status::OK(); + } + + template + Status AppendNumericBatchCast(liborc::ColumnVectorBatch* cbatch, int64_t offset, + int64_t length, ArrayBuilder* abuilder) { + auto builder = static_cast(abuilder); + auto batch = static_cast(cbatch); + + if (length == 0) { + return Status::OK(); + } + int start = builder->length(); + + const uint8_t* valid_bytes = nullptr; + if (batch->hasNulls) { + valid_bytes = reinterpret_cast(batch->notNull.data()) + offset; + } + RETURN_NOT_OK(builder->AppendNulls(valid_bytes, length)); + + const source_type* source = batch->data.data() + offset; + target_type* target = reinterpret_cast(builder->data()->mutable_data()); + + std::copy(source, source + length, target + start); + + return Status::OK(); + } + + Status AppendBoolBatch(liborc::ColumnVectorBatch* cbatch, int64_t offset, + int64_t length, ArrayBuilder* abuilder) { + auto builder = static_cast(abuilder); + auto batch = static_cast(cbatch); + + if (length == 0) { + return Status::OK(); + } + int start = builder->length(); + + const uint8_t* valid_bytes = nullptr; + if (batch->hasNulls) { + valid_bytes = reinterpret_cast(batch->notNull.data()) + offset; + } + RETURN_NOT_OK(builder->AppendNulls(valid_bytes, length)); + + const int64_t* source = batch->data.data() + offset; + uint8_t* target = reinterpret_cast(builder->data()->mutable_data()); + + for (int i = 0; i < length; i++) { + if (source[i]) { + BitUtil::SetBit(target, start + i); + } else { + BitUtil::ClearBit(target, start + i); + } + } + return Status::OK(); + } + + Status AppendTimestampBatch(liborc::ColumnVectorBatch* cbatch, int64_t offset, + int64_t length, ArrayBuilder* abuilder) { + auto builder = static_cast(abuilder); + auto batch = static_cast(cbatch); + + if (length == 0) { + return Status::OK(); + } + int start = 
builder->length(); + + const uint8_t* valid_bytes = nullptr; + if (batch->hasNulls) { + valid_bytes = reinterpret_cast(batch->notNull.data()) + offset; + } + RETURN_NOT_OK(builder->AppendNulls(valid_bytes, length)); + + const int64_t* seconds = batch->data.data() + offset; + const int64_t* nanos = batch->nanoseconds.data() + offset; + int64_t* target = reinterpret_cast(builder->data()->mutable_data()); + + for (int i = 0; i < length; i++) { + // TODO: boundscheck this, as ORC supports higher resolution timestamps + // than arrow for nanosecond resolution + target[start + i] = seconds[i] * kOneSecondNanos + nanos[i]; + } + return Status::OK(); + } + + template + Status AppendBinaryBatch(liborc::ColumnVectorBatch* cbatch, int64_t offset, + int64_t length, ArrayBuilder* abuilder) { + auto builder = static_cast(abuilder); + auto batch = static_cast(cbatch); + + const bool has_nulls = batch->hasNulls; + for (int i = offset; i < length + offset; i++) { + if (!has_nulls || batch->notNull[i]) { + RETURN_NOT_OK(builder->Append(batch->data[i], batch->length[i])); + } else { + RETURN_NOT_OK(builder->AppendNull()); + } + } + return Status::OK(); + } + + Status AppendFixedBinaryBatch(liborc::ColumnVectorBatch* cbatch, int64_t offset, + int64_t length, ArrayBuilder* abuilder) { + auto builder = static_cast(abuilder); + auto batch = static_cast(cbatch); + + const bool has_nulls = batch->hasNulls; + for (int i = offset; i < length + offset; i++) { + if (!has_nulls || batch->notNull[i]) { + RETURN_NOT_OK(builder->Append(batch->data[i])); + } else { + RETURN_NOT_OK(builder->AppendNull()); + } + } + return Status::OK(); + } + + Status AppendDecimalBatch(const liborc::Type* type, liborc::ColumnVectorBatch* cbatch, + int64_t offset, int64_t length, ArrayBuilder* abuilder) { + auto builder = static_cast(abuilder); + + const bool has_nulls = cbatch->hasNulls; + if (type->getPrecision() == 0 || type->getPrecision() > 18) { + auto batch = static_cast(cbatch); + for (int i = offset; i < 
length + offset; i++) { + if (!has_nulls || batch->notNull[i]) { + RETURN_NOT_OK(builder->Append( + Decimal128(batch->values[i].getHighBits(), batch->values[i].getLowBits()))); + } else { + RETURN_NOT_OK(builder->AppendNull()); + } + } + } else { + auto batch = static_cast(cbatch); + for (int i = offset; i < length + offset; i++) { + if (!has_nulls || batch->notNull[i]) { + RETURN_NOT_OK(builder->Append(Decimal128(batch->values[i]))); + } else { + RETURN_NOT_OK(builder->AppendNull()); + } + } + } + return Status::OK(); + } + + private: + MemoryPool* pool_; + std::unique_ptr reader_; + std::vector stripes_; +}; + +ORCFileReader::ORCFileReader() { impl_.reset(new ORCFileReader::Impl()); } + +ORCFileReader::~ORCFileReader() {} + +Status ORCFileReader::Open(const std::shared_ptr& file, + MemoryPool* pool, std::unique_ptr* reader) { + auto result = std::unique_ptr(new ORCFileReader()); + RETURN_NOT_OK(result->impl_->Open(file, pool)); + *reader = std::move(result); + return Status::OK(); +} + +Status ORCFileReader::ReadSchema(std::shared_ptr* out) { + return impl_->ReadSchema(out); +} + +Status ORCFileReader::Read(std::shared_ptr
* out) { return impl_->Read(out); } + +Status ORCFileReader::Read(const std::vector& include_indices, + std::shared_ptr
* out) { + return impl_->Read(include_indices, out); +} + +Status ORCFileReader::ReadStripe(int64_t stripe, std::shared_ptr* out) { + return impl_->ReadStripe(stripe, out); +} + +Status ORCFileReader::ReadStripe(int64_t stripe, const std::vector& include_indices, + std::shared_ptr* out) { + return impl_->ReadStripe(stripe, include_indices, out); +} + +int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); } + +int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); } + +} // namespace orc +} // namespace adapters +} // namespace arrow diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h new file mode 100644 index 0000000000000..6438658fd45bb --- /dev/null +++ b/cpp/src/arrow/adapters/orc/adapter.h @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_ORC_CONVERTER_H +#define ARROW_ORC_CONVERTER_H + +#include +#include +#include + +#include "arrow/io/interfaces.h" +#include "arrow/memory_pool.h" +#include "arrow/record_batch.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +namespace adapters { + +namespace orc { + +/// \class ORCFileReader +/// \brief Read an Arrow Table or RecordBatch from an ORC file. +class ARROW_EXPORT ORCFileReader { + public: + ~ORCFileReader(); + + /// \brief Create a new ORC reader + /// + /// \param[in] file the data source + /// \param[in] pool a MemoryPool to use for buffer allocations + /// \param[out] reader the returned reader object + /// \return Status + static Status Open(const std::shared_ptr& file, + MemoryPool* pool, std::unique_ptr* reader); + + /// \brief Return the schema read from the ORC file + /// + /// \param[out] out the returned Schema object + Status ReadSchema(std::shared_ptr* out); + + /// \brief Read the file as a Table + /// + /// The table will be composed of one record batch per stripe. + /// + /// \param[out] out the returned RecordBatch + Status Read(std::shared_ptr
* out); + + /// \brief Read the file as a Table + /// + /// The table will be composed of one record batch per stripe. + /// + /// \param[in] include_indices the selected field indices to read + /// \param[out] out the returned RecordBatch + Status Read(const std::vector& include_indices, std::shared_ptr
* out); + + /// \brief Read a single stripe as a RecordBatch + /// + /// \param[in] stripe the stripe index + /// \param[out] out the returned RecordBatch + Status ReadStripe(int64_t stripe, std::shared_ptr* out); + + /// \brief Read a single stripe as a RecordBatch + /// + /// \param[in] stripe the stripe index + /// \param[in] include_indices the selected field indices to read + /// \param[out] out the returned RecordBatch + Status ReadStripe(int64_t stripe, const std::vector& include_indices, + std::shared_ptr* out); + + /// \brief The number of stripes in the file + int64_t NumberOfStripes(); + + /// \brief The number of rows in the file + int64_t NumberOfRows(); + + private: + class Impl; + std::unique_ptr impl_; + ORCFileReader(); +}; + +} // namespace orc + +} // namespace adapters + +} // namespace arrow + +#endif // ARROW_ORC_CONVERTER_H diff --git a/cpp/src/arrow/api.h b/cpp/src/arrow/api.h index 5d2e859f3a4be..7cae8414a774b 100644 --- a/cpp/src/arrow/api.h +++ b/cpp/src/arrow/api.h @@ -26,6 +26,7 @@ #include "arrow/compare.h" #include "arrow/memory_pool.h" #include "arrow/pretty_print.h" +#include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/table_builder.h" diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 168ef10573e77..7ff3261ecba5e 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -263,6 +263,8 @@ class TestPrimitiveBuilder : public TestBuilder { ASSERT_TRUE(result->Equals(*expected)); } + int64_t FlipValue(int64_t value) const { return ~value; } + protected: std::shared_ptr type_; std::unique_ptr builder_; @@ -272,44 +274,64 @@ class TestPrimitiveBuilder : public TestBuilder { vector valid_bytes_; }; -#define PTYPE_DECL(CapType, c_type) \ - typedef CapType##Array ArrayType; \ - typedef CapType##Builder BuilderType; \ - typedef CapType##Type Type; \ - typedef c_type T; \ - \ - static std::shared_ptr type() { \ - return std::shared_ptr(new Type()); \ - 
} +/// \brief uint8_t isn't a valid template parameter to uniform_int_distribution, so +/// we use SampleType to determine which kind of integer to use to sample. +template ::value, T>::type> +struct UniformIntSampleType { + using type = T; +}; + +template <> +struct UniformIntSampleType { + using type = uint16_t; +}; + +template <> +struct UniformIntSampleType { + using type = int16_t; +}; -#define PINT_DECL(CapType, c_type, LOWER, UPPER) \ +#define PTYPE_DECL(CapType, c_type) \ + typedef CapType##Array ArrayType; \ + typedef CapType##Builder BuilderType; \ + typedef CapType##Type Type; \ + typedef c_type T; \ + \ + static std::shared_ptr type() { return std::make_shared(); } + +#define PINT_DECL(CapType, c_type) \ + struct P##CapType { \ + PTYPE_DECL(CapType, c_type) \ + static void draw(int64_t N, vector* draws) { \ + using sample_type = typename UniformIntSampleType::type; \ + const T lower = std::numeric_limits::min(); \ + const T upper = std::numeric_limits::max(); \ + test::randint(N, static_cast(lower), static_cast(upper), \ + draws); \ + } \ + } + +#define PFLOAT_DECL(CapType, c_type, LOWER, UPPER) \ struct P##CapType { \ PTYPE_DECL(CapType, c_type) \ static void draw(int64_t N, vector* draws) { \ - test::randint(N, LOWER, UPPER, draws); \ + test::random_real(N, 0, LOWER, UPPER, draws); \ } \ } -#define PFLOAT_DECL(CapType, c_type, LOWER, UPPER) \ - struct P##CapType { \ - PTYPE_DECL(CapType, c_type) \ - static void draw(int64_t N, vector* draws) { \ - test::random_real(N, 0, LOWER, UPPER, draws); \ - } \ - } - -PINT_DECL(UInt8, uint8_t, 0, UINT8_MAX); -PINT_DECL(UInt16, uint16_t, 0, UINT16_MAX); -PINT_DECL(UInt32, uint32_t, 0, UINT32_MAX); -PINT_DECL(UInt64, uint64_t, 0, UINT64_MAX); +PINT_DECL(UInt8, uint8_t); +PINT_DECL(UInt16, uint16_t); +PINT_DECL(UInt32, uint32_t); +PINT_DECL(UInt64, uint64_t); -PINT_DECL(Int8, int8_t, INT8_MIN, INT8_MAX); -PINT_DECL(Int16, int16_t, INT16_MIN, INT16_MAX); -PINT_DECL(Int32, int32_t, INT32_MIN, INT32_MAX); 
-PINT_DECL(Int64, int64_t, INT64_MIN, INT64_MAX); +PINT_DECL(Int8, int8_t); +PINT_DECL(Int16, int16_t); +PINT_DECL(Int32, int32_t); +PINT_DECL(Int64, int64_t); -PFLOAT_DECL(Float, float, -1000, 1000); -PFLOAT_DECL(Double, double, -1000, 1000); +PFLOAT_DECL(Float, float, -1000.0f, 1000.0f); +PFLOAT_DECL(Double, double, -1000.0, 1000.0); struct PBoolean { PTYPE_DECL(Boolean, uint8_t) @@ -324,6 +346,11 @@ void TestPrimitiveBuilder::RandomData(int64_t N, double pct_null) { test::random_null_bytes(N, pct_null, valid_bytes_.data()); } +template <> +int64_t TestPrimitiveBuilder::FlipValue(int64_t value) const { + return !value; +} + template <> void TestPrimitiveBuilder::Check(const std::unique_ptr& builder, bool nullable) { @@ -375,10 +402,6 @@ typedef ::testing::Types(~*reinterpret_cast(&draws[first_valid_idx])); + draws[first_valid_idx] = static_cast( + this->FlipValue(*reinterpret_cast(&draws[first_valid_idx]))); ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &unequal_array)); // test normal equality @@ -724,8 +747,8 @@ void CheckSliceApproxEquals() { vector draws2; const uint32_t kSeed = 0; - test::random_real(kSize, kSeed, 0, 100, &draws1); - test::random_real(kSize, kSeed + 1, 0, 100, &draws2); + test::random_real(kSize, kSeed, 0.0, 100.0, &draws1); + test::random_real(kSize, kSeed + 1, 0.0, 100.0, &draws2); // Make the draws equal in the sliced segment, but unequal elsewhere (to // catch not using the slice offset) @@ -2739,9 +2762,8 @@ class DecimalTest : public ::testing::TestWithParam { template void TestCreate(int32_t precision, const DecimalVector& draw, const std::vector& valid_bytes, int64_t offset) const { - auto type = std::make_shared(precision, 4); - - auto builder = std::make_shared(type); + auto type = std::make_shared(precision, 4); + auto builder = std::make_shared(type); size_t null_count = 0; @@ -2772,7 +2794,7 @@ class DecimalTest : public ::testing::TestWithParam { BitUtil::BytesToBits(valid_bytes, default_memory_pool(), 
&expected_null_bitmap)); int64_t expected_null_count = test::null_count(valid_bytes); - auto expected = std::make_shared( + auto expected = std::make_shared( type, size, expected_data, expected_null_bitmap, expected_null_count); std::shared_ptr lhs = out->Slice(offset); diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index fc4b96e1b2bec..144fbcd05c205 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -36,6 +36,20 @@ namespace arrow { +std::shared_ptr ArrayData::Make(const std::shared_ptr& type, + int64_t length, + std::vector>&& buffers, + int64_t null_count, int64_t offset) { + return std::make_shared(type, length, std::move(buffers), null_count, + offset); +} + +std::shared_ptr ArrayData::Make(const std::shared_ptr& type, + int64_t length, int64_t null_count, + int64_t offset) { + return std::make_shared(type, length, null_count, offset); +} + // ---------------------------------------------------------------------- // Base array class @@ -89,10 +103,10 @@ static inline std::shared_ptr SliceData(const ArrayData& data, int64_ length = std::min(data.length - offset, length); offset += data.offset; - auto new_data = data.ShallowCopy(); + auto new_data = data.Copy(); new_data->length = length; new_data->offset = offset; - new_data->null_count = kUnknownNullCount; + new_data->null_count = data.null_count != 0 ? 
kUnknownNullCount : 0; return new_data; } @@ -112,8 +126,7 @@ std::string Array::ToString() const { } NullArray::NullArray(int64_t length) { - BufferVector buffers = {nullptr}; - SetData(std::make_shared(null(), length, std::move(buffers), length)); + SetData(ArrayData::Make(null(), length, {nullptr}, length)); } // ---------------------------------------------------------------------- @@ -123,16 +136,18 @@ PrimitiveArray::PrimitiveArray(const std::shared_ptr& type, int64_t le const std::shared_ptr& data, const std::shared_ptr& null_bitmap, int64_t null_count, int64_t offset) { - BufferVector buffers = {null_bitmap, data}; - SetData( - std::make_shared(type, length, std::move(buffers), null_count, offset)); + SetData(ArrayData::Make(type, length, {null_bitmap, data}, null_count, offset)); } +#ifndef ARROW_NO_DEPRECATED_API + const uint8_t* PrimitiveArray::raw_values() const { return raw_values_ + offset() * static_cast(*type()).bit_width() / CHAR_BIT; } +#endif + template NumericArray::NumericArray(const std::shared_ptr& data) : PrimitiveArray(data) { @@ -165,9 +180,8 @@ ListArray::ListArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& values, const std::shared_ptr& null_bitmap, int64_t null_count, int64_t offset) { - BufferVector buffers = {null_bitmap, value_offsets}; auto internal_data = - std::make_shared(type, length, std::move(buffers), null_count, offset); + ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset); internal_data->child_data.emplace_back(values->data()); SetData(internal_data); } @@ -219,9 +233,8 @@ Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPo } auto list_type = list(values.type()); - auto internal_data = - std::make_shared(list_type, num_offsets - 1, std::move(buffers), - offsets.null_count(), offsets.offset()); + auto internal_data = ArrayData::Make(list_type, num_offsets - 1, std::move(buffers), + offsets.null_count(), offsets.offset()); 
internal_data->child_data.push_back(values.data()); *out = std::make_shared(internal_data); @@ -276,9 +289,8 @@ BinaryArray::BinaryArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, const std::shared_ptr& null_bitmap, int64_t null_count, int64_t offset) { - BufferVector buffers = {null_bitmap, value_offsets, data}; - SetData( - std::make_shared(type, length, std::move(buffers), null_count, offset)); + SetData(ArrayData::Make(type, length, {null_bitmap, value_offsets, data}, null_count, + offset)); } StringArray::StringArray(const std::shared_ptr& data) { @@ -314,15 +326,15 @@ const uint8_t* FixedSizeBinaryArray::GetValue(int64_t i) const { // ---------------------------------------------------------------------- // Decimal -DecimalArray::DecimalArray(const std::shared_ptr& data) +Decimal128Array::Decimal128Array(const std::shared_ptr& data) : FixedSizeBinaryArray(data) { DCHECK_EQ(data->type->id(), Type::DECIMAL); } -std::string DecimalArray::FormatValue(int64_t i) const { - const auto& type_ = static_cast(*type()); - Decimal128 value(GetValue(i)); - return value.ToString(type_.precision(), type_.scale()); +std::string Decimal128Array::FormatValue(int64_t i) const { + const auto& type_ = static_cast(*type()); + const Decimal128 value(GetValue(i)); + return value.ToString(type_.scale()); } // ---------------------------------------------------------------------- @@ -338,9 +350,7 @@ StructArray::StructArray(const std::shared_ptr& type, int64_t length, const std::vector>& children, std::shared_ptr null_bitmap, int64_t null_count, int64_t offset) { - BufferVector buffers = {null_bitmap}; - SetData( - std::make_shared(type, length, std::move(buffers), null_count, offset)); + SetData(ArrayData::Make(type, length, {null_bitmap}, null_count, offset)); for (const auto& child : children) { data_->child_data.push_back(child->data()); } @@ -384,15 +394,68 @@ UnionArray::UnionArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& 
value_offsets, const std::shared_ptr& null_bitmap, int64_t null_count, int64_t offset) { - BufferVector buffers = {null_bitmap, type_ids, value_offsets}; - auto internal_data = - std::make_shared(type, length, std::move(buffers), null_count, offset); + auto internal_data = ArrayData::Make( + type, length, {null_bitmap, type_ids, value_offsets}, null_count, offset); for (const auto& child : children) { internal_data->child_data.push_back(child->data()); } SetData(internal_data); } +Status UnionArray::MakeDense(const Array& type_ids, const Array& value_offsets, + const std::vector>& children, + std::shared_ptr* out) { + if (value_offsets.length() == 0) { + return Status::Invalid("UnionArray offsets must have non-zero length"); + } + + if (value_offsets.type_id() != Type::INT32) { + return Status::Invalid("UnionArray offsets must be signed int32"); + } + + if (type_ids.type_id() != Type::INT8) { + return Status::Invalid("UnionArray type_ids must be signed int8"); + } + + if (value_offsets.null_count() != 0) { + return Status::Invalid("MakeDense does not allow NAs in value_offsets"); + } + + BufferVector buffers = {type_ids.null_bitmap(), + static_cast(type_ids).values(), + static_cast(value_offsets).values()}; + auto union_type = union_(children, UnionMode::DENSE); + auto internal_data = ArrayData::Make(union_type, type_ids.length(), std::move(buffers), + type_ids.null_count(), type_ids.offset()); + for (const auto& child : children) { + internal_data->child_data.push_back(child->data()); + } + *out = std::make_shared(internal_data); + return Status::OK(); +} + +Status UnionArray::MakeSparse(const Array& type_ids, + const std::vector>& children, + std::shared_ptr* out) { + if (type_ids.type_id() != Type::INT8) { + return Status::Invalid("UnionArray type_ids must be signed int8"); + } + BufferVector buffers = {type_ids.null_bitmap(), + static_cast(type_ids).values(), nullptr}; + auto union_type = union_(children, UnionMode::SPARSE); + auto internal_data = 
ArrayData::Make(union_type, type_ids.length(), std::move(buffers), + type_ids.null_count(), type_ids.offset()); + for (const auto& child : children) { + internal_data->child_data.push_back(child->data()); + if (child->length() != type_ids.length()) { + return Status::Invalid( + "Sparse UnionArray must have len(child) == len(type_ids) for all children"); + } + } + *out = std::make_shared(internal_data); + return Status::OK(); +} + std::shared_ptr UnionArray::child(int i) const { if (!boxed_fields_[i]) { boxed_fields_[i] = MakeArray(data_->child_data[i]); @@ -423,14 +486,14 @@ DictionaryArray::DictionaryArray(const std::shared_ptr& type, : dict_type_(static_cast(type.get())) { DCHECK_EQ(type->id(), Type::DICTIONARY); DCHECK_EQ(indices->type_id(), dict_type_->index_type()->id()); - auto data = indices->data()->ShallowCopy(); + auto data = indices->data()->Copy(); data->type = type; SetData(data); } void DictionaryArray::SetData(const std::shared_ptr& data) { this->Array::SetData(data); - auto indices_data = data_->ShallowCopy(); + auto indices_data = data_->Copy(); indices_data->type = dict_type_->index_type(); std::shared_ptr result; indices_ = MakeArray(indices_data); @@ -459,7 +522,7 @@ struct ValidateVisitor { Status Visit(const PrimitiveArray&) { return Status::OK(); } - Status Visit(const DecimalArray&) { return Status::OK(); } + Status Visit(const Decimal128Array&) { return Status::OK(); } Status Visit(const BinaryArray&) { // TODO(wesm): what to do here? 
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index afbd780dd3ad5..0ae1ddd8ea221 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -65,9 +65,9 @@ struct Decimal; /// could cast from int64 to float64 like so: /// /// Int64Array arr = GetMyData(); -/// auto new_data = arr->data()->ShallowCopy(); +/// auto new_data = arr.data()->ShallowCopy(); /// new_data->type = arrow::float64(); -/// Float64Array double_arr(new_data); +/// DoubleArray double_arr(new_data); /// /// This object is also useful in an analytics setting where memory may be /// reused. For example, if we had a group of operations all returning doubles, @@ -104,6 +104,17 @@ struct ARROW_EXPORT ArrayData { this->buffers = std::move(buffers); } + static std::shared_ptr Make(const std::shared_ptr& type, + int64_t length, + std::vector>&& buffers, + int64_t null_count = kUnknownNullCount, + int64_t offset = 0); + + static std::shared_ptr Make(const std::shared_ptr& type, + int64_t length, + int64_t null_count = kUnknownNullCount, + int64_t offset = 0); + // Move constructor ArrayData(ArrayData&& other) noexcept : type(std::move(other.type)), @@ -132,9 +143,14 @@ struct ARROW_EXPORT ArrayData { return *this; } - std::shared_ptr ShallowCopy() const { - return std::make_shared(*this); - } + std::shared_ptr Copy() const { return std::make_shared(*this); } + +#ifndef ARROW_NO_DEPRECATED_API + + // Deprecated since 0.8.0 + std::shared_ptr ShallowCopy() const { return Copy(); } + +#endif std::shared_ptr type; int64_t length; @@ -279,6 +295,8 @@ class ARROW_EXPORT Array { ARROW_DISALLOW_COPY_AND_ASSIGN(Array); }; +using ArrayVector = std::vector>; + static inline std::ostream& operator<<(std::ostream& os, const Array& x) { os << x.ToString(); return os; @@ -316,9 +334,15 @@ class ARROW_EXPORT PrimitiveArray : public FlatArray { /// Does not account for any slice offset std::shared_ptr values() const { return data_->buffers[1]; } +#ifndef ARROW_NO_DEPRECATED_API + /// \brief Return pointer 
to start of raw data + /// + /// \note Deprecated since 0.8.0 const uint8_t* raw_values() const; +#endif + protected: PrimitiveArray() {} @@ -549,6 +573,8 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { int32_t byte_width() const { return byte_width_; } + const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; } + protected: inline void SetData(const std::shared_ptr& data) { this->PrimitiveArray::SetData(data); @@ -559,19 +585,22 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { }; // ---------------------------------------------------------------------- -// DecimalArray -class ARROW_EXPORT DecimalArray : public FixedSizeBinaryArray { +// Decimal128Array +class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray { public: - using TypeClass = DecimalType; + using TypeClass = Decimal128Type; using FixedSizeBinaryArray::FixedSizeBinaryArray; - /// \brief Construct DecimalArray from ArrayData instance - explicit DecimalArray(const std::shared_ptr& data); + /// \brief Construct Decimal128Array from ArrayData instance + explicit Decimal128Array(const std::shared_ptr& data); std::string FormatValue(int64_t i) const; }; +// Backward compatibility +using DecimalArray = Decimal128Array; + // ---------------------------------------------------------------------- // Struct @@ -612,16 +641,47 @@ class ARROW_EXPORT UnionArray : public Array { const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = 0, int64_t offset = 0); + /// \brief Construct Dense UnionArray from types_ids, value_offsets and children + /// + /// This function does the bare minimum of validation of the offsets and + /// input types. The value_offsets are assumed to be well-formed. + /// + /// \param[in] type_ids An array of 8-bit signed integers, enumerated from + /// 0 corresponding to each type. 
+ /// \param[in] value_offsets An array of signed int32 values indicating the + /// relative offset into the respective child array for the type in a given slot. + /// The respective offsets for each child value array must be in order / increasing. + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[out] out Will have length equal to value_offsets.length() + static Status MakeDense(const Array& type_ids, const Array& value_offsets, + const std::vector>& children, + std::shared_ptr* out); + + /// \brief Construct Sparse UnionArray from type_ids and children + /// + /// This function does the bare minimum of validation of the offsets and + /// input types. + /// + /// \param[in] type_ids An array of 8-bit signed integers, enumerated from + /// 0 corresponding to each type. + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[out] out Will have length equal to type_ids.length() + static Status MakeSparse(const Array& type_ids, + const std::vector>& children, + std::shared_ptr* out); + /// Note that this buffer does not account for any slice offset std::shared_ptr type_ids() const { return data_->buffers[1]; } /// Note that this buffer does not account for any slice offset std::shared_ptr value_offsets() const { return data_->buffers[2]; } + int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } + const type_id_t* raw_type_ids() const { return raw_type_ids_ + data_->offset; } const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } - UnionMode mode() const { return static_cast(*type()).mode(); } + UnionMode::type mode() const { return static_cast(*type()).mode(); } std::shared_ptr child(int pos) const; diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 8e989064be4e1..450a4c78b5bbb 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -340,12 +340,13 @@ Status 
AllocateResizableBuffer(MemoryPool* pool, const int64_t size, #ifndef ARROW_NO_DEPRECATED_API /// \brief Create Buffer referencing std::string memory -/// \deprecated Since 0.8.0 /// /// Warning: string instance must stay alive /// /// \param str std::string instance /// \return std::shared_ptr +/// +/// \note Deprecated Since 0.8.0 static inline std::shared_ptr GetBufferFromString(const std::string& str) { return std::make_shared(str); } diff --git a/cpp/src/arrow/builder-benchmark.cc b/cpp/src/arrow/builder-benchmark.cc index 7ac7fe3bed533..12dfbe8170846 100644 --- a/cpp/src/arrow/builder-benchmark.cc +++ b/cpp/src/arrow/builder-benchmark.cc @@ -115,47 +115,6 @@ static void BM_BuildAdaptiveUIntNoNulls( state.SetBytesProcessed(state.iterations() * data.size() * sizeof(int64_t)); } -static void BM_BuildDictionary(benchmark::State& state) { // NOLINT non-const reference - const int64_t iterations = 1024; - while (state.KeepRunning()) { - DictionaryBuilder builder(default_memory_pool()); - for (int64_t i = 0; i < iterations; i++) { - for (int64_t j = 0; j < i; j++) { - ABORT_NOT_OK(builder.Append(j)); - } - } - std::shared_ptr out; - ABORT_NOT_OK(builder.Finish(&out)); - } - state.SetBytesProcessed(state.iterations() * iterations * (iterations + 1) / 2 * - sizeof(int64_t)); -} - -static void BM_BuildStringDictionary( - benchmark::State& state) { // NOLINT non-const reference - const int64_t iterations = 1024; - // Pre-render strings - std::vector data; - for (int64_t i = 0; i < iterations; i++) { - std::stringstream ss; - ss << i; - data.push_back(ss.str()); - } - while (state.KeepRunning()) { - StringDictionaryBuilder builder(default_memory_pool()); - for (int64_t i = 0; i < iterations; i++) { - for (int64_t j = 0; j < i; j++) { - ABORT_NOT_OK(builder.Append(data[j])); - } - } - std::shared_ptr out; - ABORT_NOT_OK(builder.Finish(&out)); - } - // Assuming a string here needs on average 2 bytes - state.SetBytesProcessed(state.iterations() * iterations * (iterations + 
1) / 2 * - sizeof(int32_t)); -} - static void BM_BuildBinaryArray(benchmark::State& state) { // NOLINT non-const reference const int64_t iterations = 1 << 20; @@ -179,8 +138,6 @@ BENCHMARK(BM_BuildAdaptiveIntNoNullsScalarAppend) ->Repetitions(3) ->Unit(benchmark::kMicrosecond); BENCHMARK(BM_BuildAdaptiveUIntNoNulls)->Repetitions(3)->Unit(benchmark::kMicrosecond); -BENCHMARK(BM_BuildDictionary)->Repetitions(3)->Unit(benchmark::kMicrosecond); -BENCHMARK(BM_BuildStringDictionary)->Repetitions(3)->Unit(benchmark::kMicrosecond); BENCHMARK(BM_BuildBinaryArray)->Repetitions(3)->Unit(benchmark::kMicrosecond); diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index c910170dd5c87..de132b5f6a0d1 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -28,19 +28,18 @@ #include "arrow/buffer.h" #include "arrow/compare.h" #include "arrow/status.h" -#include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" #include "arrow/util/cpu-info.h" #include "arrow/util/decimal.h" #include "arrow/util/hash-util.h" +#include "arrow/util/hash.h" #include "arrow/util/logging.h" namespace arrow { using internal::AdaptiveIntBuilderBase; -using internal::WrappedBinary; Status ArrayBuilder::AppendToBitmap(bool is_valid) { if (length_ == capacity_) { @@ -221,8 +220,7 @@ void ArrayBuilder::UnsafeSetNotNull(int64_t length) { // Null builder Status NullBuilder::FinishInternal(std::shared_ptr* out) { - BufferVector buffers = {nullptr}; - *out = std::make_shared(null(), length_, std::move(buffers), length_); + *out = ArrayData::Make(null(), length_, {nullptr}, length_); length_ = null_count_ = 0; return Status::OK(); } @@ -316,8 +314,7 @@ Status PrimitiveBuilder::FinishInternal(std::shared_ptr* out) { // Trim buffers RETURN_NOT_OK(data_->Resize(bytes_required)); } - BufferVector buffers = {null_bitmap_, data_}; - *out = std::make_shared(type_, length_, std::move(buffers), null_count_); + *out = ArrayData::Make(type_, 
length_, {null_bitmap_, data_}, null_count_); data_ = null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; @@ -406,9 +403,7 @@ Status AdaptiveIntBuilder::FinishInternal(std::shared_ptr* out) { return Status::NotImplemented("Only ints of size 1,2,4,8 are supported"); } - BufferVector buffers = {null_bitmap_, data_}; - *out = - std::make_shared(output_type, length_, std::move(buffers), null_count_); + *out = ArrayData::Make(output_type, length_, {null_bitmap_, data_}, null_count_); data_ = null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; @@ -564,9 +559,7 @@ Status AdaptiveUIntBuilder::FinishInternal(std::shared_ptr* out) { return Status::NotImplemented("Only ints of size 1,2,4,8 are supported"); } - BufferVector buffers = {null_bitmap_, data_}; - *out = - std::make_shared(output_type, length_, std::move(buffers), null_count_); + *out = ArrayData::Make(output_type, length_, {null_bitmap_, data_}, null_count_); data_ = null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; @@ -743,8 +736,7 @@ Status BooleanBuilder::FinishInternal(std::shared_ptr* out) { // Trim buffers RETURN_NOT_OK(data_->Resize(bytes_required)); } - BufferVector buffers = {null_bitmap_, data_}; - *out = std::make_shared(boolean(), length_, std::move(buffers), null_count_); + *out = ArrayData::Make(boolean(), length_, {null_bitmap_, data_}, null_count_); data_ = null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; @@ -817,11 +809,12 @@ Status BooleanBuilder::Append(const std::vector& values) { // ---------------------------------------------------------------------- // DictionaryBuilder +using internal::WrappedBinary; + template DictionaryBuilder::DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool) : ArrayBuilder(type, pool), - hash_table_(new PoolBuffer(pool)), hash_slots_(nullptr), dict_builder_(type, pool), values_builder_(pool), @@ -845,7 +838,6 @@ template <> DictionaryBuilder::DictionaryBuilder( const std::shared_ptr& type, MemoryPool* pool) 
: ArrayBuilder(type, pool), - hash_table_(new PoolBuffer(pool)), hash_slots_(nullptr), dict_builder_(type, pool), values_builder_(pool), @@ -860,11 +852,12 @@ Status DictionaryBuilder::Init(int64_t elements) { RETURN_NOT_OK(ArrayBuilder::Init(elements)); // Fill the initial hash table - RETURN_NOT_OK(hash_table_->Resize(sizeof(hash_slot_t) * kInitialHashTableSize)); + RETURN_NOT_OK(internal::NewHashTable(kInitialHashTableSize, pool_, &hash_table_)); hash_slots_ = reinterpret_cast(hash_table_->mutable_data()); - std::fill(hash_slots_, hash_slots_ + kInitialHashTableSize, kHashSlotEmpty); hash_table_size_ = kInitialHashTableSize; mod_bitmask_ = kInitialHashTableSize - 1; + hash_table_load_threshold_ = + static_cast(static_cast(elements) * kMaxHashTableLoad); return values_builder_.Init(elements); } @@ -921,7 +914,7 @@ template Status DictionaryBuilder::Append(const Scalar& value) { RETURN_NOT_OK(Reserve(1)); // Based on DictEncoder::Put - int j = HashValue(value) & mod_bitmask_; + int64_t j = HashValue(value) & mod_bitmask_; hash_slot_t index = hash_slots_[j]; // Find an empty slot @@ -941,7 +934,7 @@ Status DictionaryBuilder::Append(const Scalar& value) { RETURN_NOT_OK(AppendDictionary(value)); if (ARROW_PREDICT_FALSE(static_cast(dict_builder_.length()) > - hash_table_size_ * kMaxHashTableLoad)) { + hash_table_load_threshold_)) { RETURN_NOT_OK(DoubleTableSize()); } } @@ -997,45 +990,11 @@ Status DictionaryBuilder::AppendNull() { return values_builder_.Append template Status DictionaryBuilder::DoubleTableSize() { - int new_size = hash_table_size_ * 2; - auto new_hash_table = std::make_shared(pool_); - - RETURN_NOT_OK(new_hash_table->Resize(sizeof(hash_slot_t) * new_size)); - int32_t* new_hash_slots = reinterpret_cast(new_hash_table->mutable_data()); - std::fill(new_hash_slots, new_hash_slots + new_size, kHashSlotEmpty); - int new_mod_bitmask = new_size - 1; - - for (int i = 0; i < hash_table_size_; ++i) { - hash_slot_t index = hash_slots_[i]; - - if (index == 
kHashSlotEmpty) { - continue; - } - - // Compute the hash value mod the new table size to start looking for an - // empty slot - Scalar value = GetDictionaryValue(static_cast(index)); - - // Find an empty slot in the new hash table - int j = HashValue(value) & new_mod_bitmask; - hash_slot_t slot = new_hash_slots[j]; - - while (kHashSlotEmpty != slot && SlotDifferent(slot, value)) { - ++j; - if (j == new_size) { - j = 0; - } - slot = new_hash_slots[j]; - } +#define INNER_LOOP \ + Scalar value = GetDictionaryValue(static_cast(index)); \ + int64_t j = HashValue(value) & new_mod_bitmask; - // Copy the old slot index to the new hash table - new_hash_slots[j] = index; - } - - hash_table_ = new_hash_table; - hash_slots_ = reinterpret_cast(hash_table_->mutable_data()); - hash_table_size_ = new_size; - mod_bitmask_ = new_size - 1; + DOUBLE_TABLE_SIZE(, INNER_LOOP); return Status::OK(); } @@ -1053,12 +1012,12 @@ const uint8_t* DictionaryBuilder::GetDictionaryValue(int64_ } template -int DictionaryBuilder::HashValue(const Scalar& value) { +int64_t DictionaryBuilder::HashValue(const Scalar& value) { return HashUtil::Hash(&value, sizeof(Scalar), 0); } template <> -int DictionaryBuilder::HashValue(const Scalar& value) { +int64_t DictionaryBuilder::HashValue(const Scalar& value) { return HashUtil::Hash(value, byte_width_, 0); } @@ -1110,7 +1069,7 @@ Status DictionaryBuilder::AppendDictionary(const Scalar& value) { } \ \ template <> \ - int DictionaryBuilder::HashValue(const WrappedBinary& value) { \ + int64_t DictionaryBuilder::HashValue(const WrappedBinary& value) { \ return HashUtil::Hash(value.ptr_, value.length_, 0); \ } \ \ @@ -1147,22 +1106,22 @@ template class DictionaryBuilder; template class DictionaryBuilder; // ---------------------------------------------------------------------- -// DecimalBuilder +// Decimal128Builder -DecimalBuilder::DecimalBuilder(const std::shared_ptr& type, MemoryPool* pool) +Decimal128Builder::Decimal128Builder(const std::shared_ptr& type, + 
MemoryPool* pool) : FixedSizeBinaryBuilder(type, pool) {} -Status DecimalBuilder::Append(const Decimal128& value) { +Status Decimal128Builder::Append(const Decimal128& value) { RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1)); return FixedSizeBinaryBuilder::Append(value.ToBytes()); } -Status DecimalBuilder::FinishInternal(std::shared_ptr* out) { +Status Decimal128Builder::FinishInternal(std::shared_ptr* out) { std::shared_ptr data; RETURN_NOT_OK(byte_builder_.Finish(&data)); - BufferVector buffers = {null_bitmap_, data}; - *out = std::make_shared(type_, length_, std::move(buffers), null_count_); + *out = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_); return Status::OK(); } @@ -1229,8 +1188,7 @@ Status ListBuilder::FinishInternal(std::shared_ptr* out) { RETURN_NOT_OK(value_builder_->FinishInternal(&items)); } - BufferVector buffers = {null_bitmap_, offsets}; - *out = std::make_shared(type_, length_, std::move(buffers), null_count_); + *out = ArrayData::Make(type_, length_, {null_bitmap_, offsets}, null_count_); (*out)->child_data.emplace_back(std::move(items)); Reset(); return Status::OK(); @@ -1302,8 +1260,8 @@ Status BinaryBuilder::FinishInternal(std::shared_ptr* out) { RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); RETURN_NOT_OK(value_data_builder_.Finish(&value_data)); - BufferVector buffers = {null_bitmap_, offsets, value_data}; - *out = std::make_shared(type_, length_, std::move(buffers), null_count_, 0); + *out = ArrayData::Make(type_, length_, {null_bitmap_, offsets, value_data}, null_count_, + 0); Reset(); return Status::OK(); } @@ -1373,8 +1331,7 @@ Status FixedSizeBinaryBuilder::FinishInternal(std::shared_ptr* out) { std::shared_ptr data; RETURN_NOT_OK(byte_builder_.Finish(&data)); - BufferVector buffers = {null_bitmap_, data}; - *out = std::make_shared(type_, length_, std::move(buffers), null_count_); + *out = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_); return Status::OK(); } @@ -1393,8 +1350,7 @@ 
StructBuilder::StructBuilder(const std::shared_ptr& type, MemoryPool* } Status StructBuilder::FinishInternal(std::shared_ptr* out) { - BufferVector buffers = {null_bitmap_}; - *out = std::make_shared(type_, length_, std::move(buffers), null_count_); + *out = ArrayData::Make(type_, length_, {null_bitmap_}, null_count_); (*out)->child_data.resize(field_builders_.size()); for (size_t i = 0; i < field_builders_.size(); ++i) { @@ -1445,7 +1401,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, BUILDER_CASE(STRING, StringBuilder); BUILDER_CASE(BINARY, BinaryBuilder); BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder); - BUILDER_CASE(DECIMAL, DecimalBuilder); + BUILDER_CASE(DECIMAL, Decimal128Builder); case Type::LIST: { std::unique_ptr value_builder; std::shared_ptr value_type = @@ -1476,125 +1432,4 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, } } -#define DICTIONARY_BUILDER_CASE(ENUM, BuilderType) \ - case Type::ENUM: \ - out->reset(new BuilderType(type, pool)); \ - return Status::OK(); - -Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr& type, - std::shared_ptr* out) { - switch (type->id()) { - DICTIONARY_BUILDER_CASE(NA, DictionaryBuilder); - DICTIONARY_BUILDER_CASE(UINT8, DictionaryBuilder); - DICTIONARY_BUILDER_CASE(INT8, DictionaryBuilder); - DICTIONARY_BUILDER_CASE(UINT16, DictionaryBuilder); - DICTIONARY_BUILDER_CASE(INT16, DictionaryBuilder); - DICTIONARY_BUILDER_CASE(UINT32, DictionaryBuilder); - DICTIONARY_BUILDER_CASE(INT32, DictionaryBuilder); - DICTIONARY_BUILDER_CASE(UINT64, DictionaryBuilder); - DICTIONARY_BUILDER_CASE(INT64, DictionaryBuilder); - DICTIONARY_BUILDER_CASE(DATE32, DictionaryBuilder); - DICTIONARY_BUILDER_CASE(DATE64, DictionaryBuilder); - DICTIONARY_BUILDER_CASE(TIME32, DictionaryBuilder); - DICTIONARY_BUILDER_CASE(TIME64, DictionaryBuilder); - DICTIONARY_BUILDER_CASE(TIMESTAMP, DictionaryBuilder); - DICTIONARY_BUILDER_CASE(FLOAT, DictionaryBuilder); - 
DICTIONARY_BUILDER_CASE(DOUBLE, DictionaryBuilder); - DICTIONARY_BUILDER_CASE(STRING, StringDictionaryBuilder); - DICTIONARY_BUILDER_CASE(BINARY, BinaryDictionaryBuilder); - DICTIONARY_BUILDER_CASE(FIXED_SIZE_BINARY, DictionaryBuilder); - DICTIONARY_BUILDER_CASE(DECIMAL, DictionaryBuilder); - default: - return Status::NotImplemented(type->ToString()); - } -} - -#define DICTIONARY_ARRAY_CASE(ENUM, BuilderType) \ - case Type::ENUM: \ - builder = std::make_shared(type, pool); \ - RETURN_NOT_OK(static_cast(*builder).AppendArray(input)); \ - RETURN_NOT_OK(builder->Finish(out)); \ - return Status::OK(); - -Status EncodeArrayToDictionary(const Array& input, MemoryPool* pool, - std::shared_ptr* out) { - const std::shared_ptr& type = input.data()->type; - std::shared_ptr builder; - switch (type->id()) { - DICTIONARY_ARRAY_CASE(NA, DictionaryBuilder); - DICTIONARY_ARRAY_CASE(UINT8, DictionaryBuilder); - DICTIONARY_ARRAY_CASE(INT8, DictionaryBuilder); - DICTIONARY_ARRAY_CASE(UINT16, DictionaryBuilder); - DICTIONARY_ARRAY_CASE(INT16, DictionaryBuilder); - DICTIONARY_ARRAY_CASE(UINT32, DictionaryBuilder); - DICTIONARY_ARRAY_CASE(INT32, DictionaryBuilder); - DICTIONARY_ARRAY_CASE(UINT64, DictionaryBuilder); - DICTIONARY_ARRAY_CASE(INT64, DictionaryBuilder); - DICTIONARY_ARRAY_CASE(DATE32, DictionaryBuilder); - DICTIONARY_ARRAY_CASE(DATE64, DictionaryBuilder); - DICTIONARY_ARRAY_CASE(TIME32, DictionaryBuilder); - DICTIONARY_ARRAY_CASE(TIME64, DictionaryBuilder); - DICTIONARY_ARRAY_CASE(TIMESTAMP, DictionaryBuilder); - DICTIONARY_ARRAY_CASE(FLOAT, DictionaryBuilder); - DICTIONARY_ARRAY_CASE(DOUBLE, DictionaryBuilder); - DICTIONARY_ARRAY_CASE(STRING, StringDictionaryBuilder); - DICTIONARY_ARRAY_CASE(BINARY, BinaryDictionaryBuilder); - DICTIONARY_ARRAY_CASE(FIXED_SIZE_BINARY, DictionaryBuilder); - DICTIONARY_ARRAY_CASE(DECIMAL, DictionaryBuilder); - default: - std::stringstream ss; - ss << "Cannot encode array of type " << type->ToString(); - ss << " to dictionary"; - return 
Status::NotImplemented(ss.str()); - } -} -#define DICTIONARY_COLUMN_CASE(ENUM, BuilderType) \ - case Type::ENUM: \ - builder = std::make_shared(type, pool); \ - chunks = input.data(); \ - for (auto chunk : chunks->chunks()) { \ - RETURN_NOT_OK(static_cast(*builder).AppendArray(*chunk)); \ - } \ - RETURN_NOT_OK(builder->Finish(&arr)); \ - *out = std::make_shared(input.name(), arr); \ - return Status::OK(); - -/// \brief Encodes a column to a suitable dictionary type -/// \param input Column to be encoded -/// \param pool MemoryPool to allocate the dictionary -/// \param out The new column -/// \return Status -Status EncodeColumnToDictionary(const Column& input, MemoryPool* pool, - std::shared_ptr* out) { - const std::shared_ptr& type = input.type(); - std::shared_ptr builder; - std::shared_ptr arr; - std::shared_ptr chunks; - switch (type->id()) { - DICTIONARY_COLUMN_CASE(UINT8, DictionaryBuilder); - DICTIONARY_COLUMN_CASE(INT8, DictionaryBuilder); - DICTIONARY_COLUMN_CASE(UINT16, DictionaryBuilder); - DICTIONARY_COLUMN_CASE(INT16, DictionaryBuilder); - DICTIONARY_COLUMN_CASE(UINT32, DictionaryBuilder); - DICTIONARY_COLUMN_CASE(INT32, DictionaryBuilder); - DICTIONARY_COLUMN_CASE(UINT64, DictionaryBuilder); - DICTIONARY_COLUMN_CASE(INT64, DictionaryBuilder); - DICTIONARY_COLUMN_CASE(DATE32, DictionaryBuilder); - DICTIONARY_COLUMN_CASE(DATE64, DictionaryBuilder); - DICTIONARY_COLUMN_CASE(TIME32, DictionaryBuilder); - DICTIONARY_COLUMN_CASE(TIME64, DictionaryBuilder); - DICTIONARY_COLUMN_CASE(TIMESTAMP, DictionaryBuilder); - DICTIONARY_COLUMN_CASE(FLOAT, DictionaryBuilder); - DICTIONARY_COLUMN_CASE(DOUBLE, DictionaryBuilder); - DICTIONARY_COLUMN_CASE(STRING, StringDictionaryBuilder); - DICTIONARY_COLUMN_CASE(BINARY, BinaryDictionaryBuilder); - DICTIONARY_COLUMN_CASE(FIXED_SIZE_BINARY, DictionaryBuilder); - default: - std::stringstream ss; - ss << "Cannot encode column of type " << type->ToString(); - ss << " to dictionary"; - return Status::NotImplemented(ss.str()); - 
} -} - } // namespace arrow diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index c580eeb3b35fb..ce7b8cd197da3 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -29,10 +29,10 @@ #include "arrow/buffer.h" #include "arrow/memory_pool.h" #include "arrow/status.h" -#include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" +#include "arrow/util/hash.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" @@ -123,6 +123,18 @@ class ARROW_EXPORT ArrayBuilder { std::shared_ptr type() const { return type_; } + // Unsafe operations (don't check capacity/don't resize) + + // Append to null bitmap. + void UnsafeAppendToBitmap(bool is_valid) { + if (is_valid) { + BitUtil::SetBit(null_bitmap_data_, length_); + } else { + ++null_count_; + } + ++length_; + } + protected: ArrayBuilder() {} @@ -143,18 +155,6 @@ class ARROW_EXPORT ArrayBuilder { void Reset(); - // Unsafe operations (don't check capacity/don't resize) - - // Append to null bitmap. - void UnsafeAppendToBitmap(bool is_valid) { - if (is_valid) { - BitUtil::SetBit(null_bitmap_data_, length_); - } else { - ++null_count_; - } - ++length_; - } - // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null // assume all of length bits are valid. 
void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length); @@ -753,10 +753,10 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { BufferBuilder byte_builder_; }; -class ARROW_EXPORT DecimalBuilder : public FixedSizeBinaryBuilder { +class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder { public: - explicit DecimalBuilder(const std::shared_ptr& type, - MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + explicit Decimal128Builder(const std::shared_ptr& type, + MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); using FixedSizeBinaryBuilder::Append; @@ -765,6 +765,8 @@ class ARROW_EXPORT DecimalBuilder : public FixedSizeBinaryBuilder { Status FinishInternal(std::shared_ptr* out) override; }; +using DecimalBuilder = Decimal128Builder; + // ---------------------------------------------------------------------- // Struct @@ -811,17 +813,6 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { // ---------------------------------------------------------------------- // Dictionary builder -// Based on Apache Parquet-cpp's DictEncoder - -// Initially 1024 elements -static constexpr int kInitialHashTableSize = 1 << 10; - -typedef int32_t hash_slot_t; -static constexpr hash_slot_t kHashSlotEmpty = std::numeric_limits::max(); - -// The maximum load factor for the hash table before resizing. -static constexpr double kMaxHashTableLoad = 0.7; - namespace internal { // TODO(ARROW-1176): Use Tensorflow's StringPiece instead of this here. @@ -886,23 +877,26 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { protected: Status DoubleTableSize(); Scalar GetDictionaryValue(int64_t index); - int HashValue(const Scalar& value); + int64_t HashValue(const Scalar& value); bool SlotDifferent(hash_slot_t slot, const Scalar& value); Status AppendDictionary(const Scalar& value); - std::shared_ptr hash_table_; + std::shared_ptr hash_table_; int32_t* hash_slots_; /// Size of the table. Must be a power of 2. 
- int hash_table_size_; + int64_t hash_table_size_; // Store hash_table_size_ - 1, so that j & mod_bitmask_ is equivalent to j % // hash_table_size_, but uses far fewer CPU cycles - int mod_bitmask_; + int64_t mod_bitmask_; typename TypeTraits::BuilderType dict_builder_; AdaptiveIntBuilder values_builder_; int32_t byte_width_; + + /// Size at which we decide to resize + int64_t hash_table_load_threshold_; }; template <> @@ -974,25 +968,6 @@ class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder& type, std::unique_ptr* out); -Status ARROW_EXPORT MakeDictionaryBuilder(MemoryPool* pool, - const std::shared_ptr& type, - std::shared_ptr* out); - -/// \brief Convert Array to encoded DictionaryArray form -/// -/// \param[in] input The Array to be encoded -/// \param[in] pool MemoryPool to allocate memory for the hash table -/// \param[out] out Array encoded to DictionaryArray -Status ARROW_EXPORT EncodeArrayToDictionary(const Array& input, MemoryPool* pool, - std::shared_ptr* out); - -/// \brief Convert a Column's data internally to DictionaryArray -/// -/// \param[in] input The ChunkedArray to be encoded -/// \param[in] pool MemoryPool to allocate memory for the hash table -/// \param[out] out Column with data converted to DictionaryArray -Status ARROW_EXPORT EncodeColumnToDictionary(const Column& input, MemoryPool* pool, - std::shared_ptr* out); } // namespace arrow #endif // ARROW_BUILDER_H_ diff --git a/cpp/src/arrow/column-benchmark.cc b/cpp/src/arrow/column-benchmark.cc index e50ddf6d7032d..af2c368c32975 100644 --- a/cpp/src/arrow/column-benchmark.cc +++ b/cpp/src/arrow/column-benchmark.cc @@ -19,6 +19,7 @@ #include "arrow/array.h" #include "arrow/memory_pool.h" +#include "arrow/table.h" #include "arrow/test-util.h" namespace arrow { diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 2ec86c3695aa5..9f07fa7eff826 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -152,7 +152,7 @@ class RangeEqualsVisitor { bool 
CompareUnions(const UnionArray& left) const { const auto& right = static_cast(right_); - const UnionMode union_mode = left.mode(); + const UnionMode::type union_mode = left.mode(); if (union_mode != right.mode()) { return false; } @@ -255,7 +255,7 @@ class RangeEqualsVisitor { return Status::OK(); } - Status Visit(const DecimalArray& left) { + Status Visit(const Decimal128Array& left) { return Visit(static_cast(left)); } @@ -312,8 +312,16 @@ static bool IsEqualPrimitive(const PrimitiveArray& left, const PrimitiveArray& r const auto& size_meta = dynamic_cast(*left.type()); const int byte_width = size_meta.bit_width() / CHAR_BIT; - const uint8_t* left_data = left.values() ? left.raw_values() : nullptr; - const uint8_t* right_data = right.values() ? right.raw_values() : nullptr; + const uint8_t* left_data = nullptr; + const uint8_t* right_data = nullptr; + + if (left.values()) { + left_data = left.values()->data() + left.offset() * byte_width; + } + + if (right.values()) { + right_data = right.values()->data() + right.offset() * byte_width; + } if (left.null_count() > 0) { for (int64_t i = 0; i < left.length(); ++i) { @@ -615,8 +623,8 @@ class TypeEqualsVisitor { return Status::OK(); } - Status Visit(const DecimalType& left) { - const auto& right = static_cast(right_); + Status Visit(const Decimal128Type& left) { + const auto& right = static_cast(right_); result_ = left.precision() == right.precision() && left.scale() == right.scale(); return Status::OK(); } diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h index 27176ed864cdd..df3386e4bfc19 100644 --- a/cpp/src/arrow/compare.h +++ b/cpp/src/arrow/compare.h @@ -33,27 +33,27 @@ class Tensor; #ifndef ARROW_NO_DEPRECATED_API /// Returns true if the arrays are exactly equal -/// \deprecated Since 0.8.0 +/// \note Deprecated since 0.8.0 Status ARROW_EXPORT ArrayEquals(const Array& left, const Array& right, bool* are_equal); -/// \deprecated Since 0.8.0 +/// \note Deprecated since 0.8.0 Status ARROW_EXPORT 
TensorEquals(const Tensor& left, const Tensor& right, bool* are_equal); /// Returns true if the arrays are approximately equal. For non-floating point /// types, this is equivalent to ArrayEquals(left, right) -/// \deprecated Since 0.8.0 +/// \note Deprecated since 0.8.0 Status ARROW_EXPORT ArrayApproxEquals(const Array& left, const Array& right, bool* are_equal); /// Returns true if indicated equal-length segment of arrays is exactly equal -/// \deprecated Since 0.8.0 +/// \note Deprecated since 0.8.0 Status ARROW_EXPORT ArrayRangeEquals(const Array& left, const Array& right, int64_t start_idx, int64_t end_idx, int64_t other_start_idx, bool* are_equal); /// Returns true if the type metadata are exactly equal -/// \deprecated Since 0.8.0 +/// \note Deprecated since 0.8.0 Status ARROW_EXPORT TypeEquals(const DataType& left, const DataType& right, bool* are_equal); #endif diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index 4589afb9574d3..d4369ed27b7c4 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -18,7 +18,6 @@ # Headers: top level install(FILES api.h - cast.h context.h kernel.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/compute") @@ -36,3 +35,6 @@ install( ####################################### ADD_ARROW_TEST(compute-test) +ADD_ARROW_BENCHMARK(compute-benchmark) + +add_subdirectory(kernels) diff --git a/cpp/src/arrow/compute/api.h b/cpp/src/arrow/compute/api.h index da7df1cbbfafc..b3700b4c5813c 100644 --- a/cpp/src/arrow/compute/api.h +++ b/cpp/src/arrow/compute/api.h @@ -18,8 +18,10 @@ #ifndef ARROW_COMPUTE_API_H #define ARROW_COMPUTE_API_H -#include "arrow/compute/cast.h" #include "arrow/compute/context.h" #include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/cast.h" +#include "arrow/compute/kernels/hash.h" + #endif // ARROW_COMPUTE_API_H diff --git a/cpp/src/arrow/compute/compute-benchmark.cc b/cpp/src/arrow/compute/compute-benchmark.cc new file mode 
100644 index 0000000000000..44df441394ad4 --- /dev/null +++ b/cpp/src/arrow/compute/compute-benchmark.cc @@ -0,0 +1,211 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "benchmark/benchmark.h" + +#include + +#include "arrow/builder.h" +#include "arrow/memory_pool.h" +#include "arrow/test-util.h" + +#include "arrow/compute/context.h" +#include "arrow/compute/kernels/hash.h" + +namespace arrow { +namespace compute { + +static void BM_BuildDictionary(benchmark::State& state) { // NOLINT non-const reference + const int64_t iterations = 1024; + + std::vector values; + std::vector is_valid; + for (int64_t i = 0; i < iterations; i++) { + for (int64_t j = 0; j < i; j++) { + is_valid.push_back((i + j) % 9 == 0); + values.push_back(j); + } + } + + std::shared_ptr arr; + ArrayFromVector(is_valid, values, &arr); + + FunctionContext ctx; + + while (state.KeepRunning()) { + Datum out; + ABORT_NOT_OK(DictionaryEncode(&ctx, Datum(arr), &out)); + } + state.SetBytesProcessed(state.iterations() * values.size() * sizeof(int64_t)); +} + +static void BM_BuildStringDictionary( + benchmark::State& state) { // NOLINT non-const reference + const int64_t iterations = 1024 * 64; + // Pre-render strings + std::vector data; + + 
int64_t total_bytes = 0; + for (int64_t i = 0; i < iterations; i++) { + std::stringstream ss; + ss << i; + auto val = ss.str(); + data.push_back(val); + total_bytes += static_cast(val.size()); + } + + std::shared_ptr arr; + ArrayFromVector(data, &arr); + + FunctionContext ctx; + + while (state.KeepRunning()) { + Datum out; + ABORT_NOT_OK(DictionaryEncode(&ctx, Datum(arr), &out)); + } + // Assuming a string here needs on average 2 bytes + state.SetBytesProcessed(state.iterations() * total_bytes); +} + +template +struct HashParams { + using T = typename Type::c_type; + + double null_percent; + + void GenerateTestData(const int64_t length, const int64_t num_unique, + std::shared_ptr* arr) const { + std::vector draws; + std::vector values; + std::vector is_valid; + test::randint(length, 0, num_unique, &draws); + for (int64_t draw : draws) { + values.push_back(draw); + } + + if (this->null_percent > 0) { + test::random_is_valid(length, this->null_percent, &is_valid); + ArrayFromVector(is_valid, values, arr); + } else { + ArrayFromVector(values, arr); + } + } + + int64_t GetBytesProcessed(int64_t length) const { return length * sizeof(T); } +}; + +template <> +struct HashParams { + double null_percent; + int32_t byte_width; + void GenerateTestData(const int64_t length, const int64_t num_unique, + std::shared_ptr* arr) const { + std::vector draws; + test::randint(length, 0, num_unique, &draws); + + const int64_t total_bytes = this->byte_width * num_unique; + std::vector uniques(total_bytes); + const uint32_t seed = 0; + test::random_bytes(total_bytes, seed, uniques.data()); + + std::vector is_valid; + if (this->null_percent > 0) { + test::random_is_valid(length, this->null_percent, &is_valid); + } + + StringBuilder builder; + for (int64_t i = 0; i < length; ++i) { + if (this->null_percent == 0 || is_valid[i]) { + ABORT_NOT_OK(builder.Append(uniques.data() + this->byte_width * draws[i], + this->byte_width)); + } else { + ABORT_NOT_OK(builder.AppendNull()); + } + } + 
ABORT_NOT_OK(builder.Finish(arr)); + } + + int64_t GetBytesProcessed(int64_t length) const { return length * byte_width; } +}; + +template +void BenchUnique(benchmark::State& state, const ParamType& params, int64_t length, + int64_t num_unique) { + std::shared_ptr arr; + params.GenerateTestData(length, num_unique, &arr); + + FunctionContext ctx; + while (state.KeepRunning()) { + std::shared_ptr out; + ABORT_NOT_OK(Unique(&ctx, Datum(arr), &out)); + } + state.SetBytesProcessed(state.iterations() * params.GetBytesProcessed(length)); +} + +template +void BenchDictionaryEncode(benchmark::State& state, const ParamType& params, + int64_t length, int64_t num_unique) { + std::shared_ptr arr; + params.GenerateTestData(length, num_unique, &arr); + + FunctionContext ctx; + while (state.KeepRunning()) { + Datum out; + ABORT_NOT_OK(DictionaryEncode(&ctx, Datum(arr), &out)); + } + state.SetBytesProcessed(state.iterations() * params.GetBytesProcessed(length)); +} + +static void BM_UniqueInt64NoNulls(benchmark::State& state) { + BenchUnique(state, HashParams{0}, state.range(0), state.range(1)); +} + +static void BM_UniqueInt64WithNulls(benchmark::State& state) { + BenchUnique(state, HashParams{0.05}, state.range(0), state.range(1)); +} + +static void BM_UniqueString10bytes(benchmark::State& state) { + // Byte strings with 10 bytes each + BenchUnique(state, HashParams{0.05, 10}, state.range(0), state.range(1)); +} + +static void BM_UniqueString100bytes(benchmark::State& state) { + // Byte strings with 100 bytes each + BenchUnique(state, HashParams{0.05, 100}, state.range(0), state.range(1)); +} + +BENCHMARK(BM_BuildDictionary)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_BuildStringDictionary)->MinTime(1.0)->Unit(benchmark::kMicrosecond); + +constexpr int kHashBenchmarkLength = 1 << 24; + +#define ADD_HASH_ARGS(WHAT) \ + WHAT->Args({kHashBenchmarkLength, 50}) \ + ->Args({kHashBenchmarkLength, 1 << 10}) \ + ->Args({kHashBenchmarkLength, 10 * 1 << 10}) \ + 
->Args({kHashBenchmarkLength, 1 << 20}) \ + ->MinTime(1.0) \ + ->Unit(benchmark::kMicrosecond) \ + ->UseRealTime() + +ADD_HASH_ARGS(BENCHMARK(BM_UniqueInt64NoNulls)); +ADD_HASH_ARGS(BENCHMARK(BM_UniqueInt64WithNulls)); +ADD_HASH_ARGS(BENCHMARK(BM_UniqueString10bytes)); +ADD_HASH_ARGS(BENCHMARK(BM_UniqueString100bytes)); + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc index 8a7ef923b4719..3fc15018630da 100644 --- a/cpp/src/arrow/compute/compute-test.cc +++ b/cpp/src/arrow/compute/compute-test.cc @@ -37,10 +37,12 @@ #include "arrow/type.h" #include "arrow/type_traits.h" -#include "arrow/compute/cast.h" #include "arrow/compute/context.h" #include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/cast.h" +#include "arrow/compute/kernels/hash.h" +using std::shared_ptr; using std::vector; namespace arrow { @@ -54,6 +56,18 @@ class ComputeFixture { FunctionContext ctx_; }; +template +shared_ptr _MakeArray(const shared_ptr& type, const vector& values, + const vector& is_valid) { + shared_ptr result; + if (is_valid.size() > 0) { + ArrayFromVector(type, is_valid, values, &result); + } else { + ArrayFromVector(type, values, &result); + } + return result; +} + // ---------------------------------------------------------------------- // Cast @@ -65,17 +79,17 @@ static void AssertBufferSame(const Array& left, const Array& right, int buffer_i class TestCast : public ComputeFixture, public TestBase { public: void CheckPass(const Array& input, const Array& expected, - const std::shared_ptr& out_type, const CastOptions& options) { - std::shared_ptr result; + const shared_ptr& out_type, const CastOptions& options) { + shared_ptr result; ASSERT_OK(Cast(&ctx_, input, out_type, options, &result)); ASSERT_ARRAYS_EQUAL(expected, *result); } template - void CheckFails(const std::shared_ptr& in_type, - const std::vector& in_values, const std::vector& is_valid, - const std::shared_ptr& 
out_type, const CastOptions& options) { - std::shared_ptr input, result; + void CheckFails(const shared_ptr& in_type, const vector& in_values, + const vector& is_valid, const shared_ptr& out_type, + const CastOptions& options) { + shared_ptr input, result; if (is_valid.size() > 0) { ArrayFromVector(in_type, is_valid, in_values, &input); } else { @@ -84,19 +98,18 @@ class TestCast : public ComputeFixture, public TestBase { ASSERT_RAISES(Invalid, Cast(&ctx_, *input, out_type, options, &result)); } - void CheckZeroCopy(const Array& input, const std::shared_ptr& out_type) { - std::shared_ptr result; + void CheckZeroCopy(const Array& input, const shared_ptr& out_type) { + shared_ptr result; ASSERT_OK(Cast(&ctx_, input, out_type, {}, &result)); AssertBufferSame(input, *result, 0); AssertBufferSame(input, *result, 1); } template - void CheckCase(const std::shared_ptr& in_type, - const std::vector& in_values, const std::vector& is_valid, - const std::shared_ptr& out_type, - const std::vector& out_values, const CastOptions& options) { - std::shared_ptr input, expected; + void CheckCase(const shared_ptr& in_type, const vector& in_values, + const vector& is_valid, const shared_ptr& out_type, + const vector& out_values, const CastOptions& options) { + shared_ptr input, expected; if (is_valid.size() > 0) { ArrayFromVector(in_type, is_valid, in_values, &input); ArrayFromVector(out_type, is_valid, out_values, &expected); @@ -117,10 +130,10 @@ TEST_F(TestCast, SameTypeZeroCopy) { vector is_valid = {true, false, true, true, true}; vector v1 = {0, 1, 2, 3, 4}; - std::shared_ptr arr; + shared_ptr arr; ArrayFromVector(int32(), is_valid, v1, &arr); - std::shared_ptr result; + shared_ptr result; ASSERT_OK(Cast(&this->ctx_, *arr, int32(), {}, &result)); AssertBufferSame(*arr, *result, 0); @@ -185,7 +198,7 @@ TEST_F(TestCast, OverflowInNullSlot) { vector v11 = {0, 70000, 2000, 1000, 0}; vector e11 = {0, 0, 2000, 1000, 0}; - std::shared_ptr expected; + shared_ptr expected; 
ArrayFromVector(int16(), is_valid, e11, &expected); auto buf = std::make_shared(reinterpret_cast(v11.data()), @@ -280,8 +293,8 @@ TEST_F(TestCast, TimestampToTimestamp) { auto CheckTimestampCast = [this]( const CastOptions& options, TimeUnit::type from_unit, TimeUnit::type to_unit, - const std::vector& from_values, const std::vector& to_values, - const std::vector& is_valid) { + const vector& from_values, const vector& to_values, + const vector& is_valid) { CheckCase( timestamp(from_unit), from_values, is_valid, timestamp(to_unit), to_values, options); @@ -315,12 +328,15 @@ TEST_F(TestCast, TimestampToTimestamp) { CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::NANO, v6, e6, is_valid); // Zero copy - std::shared_ptr arr; vector v7 = {0, 70000, 2000, 1000, 0}; + shared_ptr arr; ArrayFromVector(timestamp(TimeUnit::SECOND), is_valid, v7, &arr); CheckZeroCopy(*arr, timestamp(TimeUnit::SECOND)); + // ARROW-1773, cast to integer + CheckZeroCopy(*arr, int64()); + // Divide, truncate vector v8 = {0, 100123, 200456, 1123, 2456}; vector e8 = {0, 100, 200, 1, 2}; @@ -355,7 +371,71 @@ TEST_F(TestCast, TimestampToTimestamp) { timestamp(TimeUnit::SECOND), options); } -TEST_F(TestCast, TimeToTime) { +TEST_F(TestCast, TimestampToDate32_Date64) { + CastOptions options; + + vector is_valid = {true, true, false}; + + // 2000-01-01, 2000-01-02, null + vector v_nano = {946684800000000000, 946771200000000000, 0}; + vector v_micro = {946684800000000, 946771200000000, 0}; + vector v_milli = {946684800000, 946771200000, 0}; + vector v_second = {946684800, 946771200, 0}; + vector v_day = {10957, 10958, 0}; + + // Simple conversions + CheckCase( + timestamp(TimeUnit::NANO), v_nano, is_valid, date64(), v_milli, options); + CheckCase( + timestamp(TimeUnit::MICRO), v_micro, is_valid, date64(), v_milli, options); + CheckCase( + timestamp(TimeUnit::MILLI), v_milli, is_valid, date64(), v_milli, options); + CheckCase( + timestamp(TimeUnit::SECOND), v_second, is_valid, date64(), v_milli, 
options); + + CheckCase( + timestamp(TimeUnit::NANO), v_nano, is_valid, date32(), v_day, options); + CheckCase( + timestamp(TimeUnit::MICRO), v_micro, is_valid, date32(), v_day, options); + CheckCase( + timestamp(TimeUnit::MILLI), v_milli, is_valid, date32(), v_day, options); + CheckCase( + timestamp(TimeUnit::SECOND), v_second, is_valid, date32(), v_day, options); + + // Disallow truncate, failures + vector v_nano_fail = {946684800000000001, 946771200000000001, 0}; + vector v_micro_fail = {946684800000001, 946771200000001, 0}; + vector v_milli_fail = {946684800001, 946771200001, 0}; + vector v_second_fail = {946684801, 946771201, 0}; + + options.allow_time_truncate = false; + CheckFails(timestamp(TimeUnit::NANO), v_nano_fail, is_valid, date64(), + options); + CheckFails(timestamp(TimeUnit::MICRO), v_micro_fail, is_valid, date64(), + options); + CheckFails(timestamp(TimeUnit::MILLI), v_milli_fail, is_valid, date64(), + options); + CheckFails(timestamp(TimeUnit::SECOND), v_second_fail, is_valid, + date64(), options); + + CheckFails(timestamp(TimeUnit::NANO), v_nano_fail, is_valid, date32(), + options); + CheckFails(timestamp(TimeUnit::MICRO), v_micro_fail, is_valid, date32(), + options); + CheckFails(timestamp(TimeUnit::MILLI), v_milli_fail, is_valid, date32(), + options); + CheckFails(timestamp(TimeUnit::SECOND), v_second_fail, is_valid, + date32(), options); + + // Make sure that nulls are excluded from the truncation checks + vector v_second_nofail = {946684800, 946771200, 1}; + CheckCase( + timestamp(TimeUnit::SECOND), v_second_nofail, is_valid, date64(), v_milli, options); + CheckCase( + timestamp(TimeUnit::SECOND), v_second_nofail, is_valid, date32(), v_day, options); +} + +TEST_F(TestCast, TimeToCompatible) { CastOptions options; vector is_valid = {true, false, true, true, true}; @@ -392,11 +472,21 @@ TEST_F(TestCast, TimeToTime) { time64(TimeUnit::MICRO), v6, is_valid, time64(TimeUnit::NANO), e6, options); // Zero copy - std::shared_ptr arr; vector v7 = {0, 
70000, 2000, 1000, 0}; + shared_ptr arr; ArrayFromVector(time64(TimeUnit::MICRO), is_valid, v7, &arr); CheckZeroCopy(*arr, time64(TimeUnit::MICRO)); + // ARROW-1773: cast to int64 + CheckZeroCopy(*arr, int64()); + + vector v7_2 = {0, 70000, 2000, 1000, 0}; + ArrayFromVector(time32(TimeUnit::SECOND), is_valid, v7_2, &arr); + CheckZeroCopy(*arr, time32(TimeUnit::SECOND)); + + // ARROW-1773: cast to int64 + CheckZeroCopy(*arr, int32()); + // Divide, truncate vector v8 = {0, 100123, 200456, 1123, 2456}; vector e8 = {0, 100, 200, 1, 2}; @@ -438,7 +528,7 @@ TEST_F(TestCast, TimeToTime) { options); } -TEST_F(TestCast, DateToDate) { +TEST_F(TestCast, DateToCompatible) { CastOptions options; vector is_valid = {true, false, true, true, true}; @@ -452,15 +542,21 @@ TEST_F(TestCast, DateToDate) { e1, options); // Zero copy - std::shared_ptr arr; vector v2 = {0, 70000, 2000, 1000, 0}; vector v3 = {0, 70000, 2000, 1000, 0}; + shared_ptr arr; ArrayFromVector(date32(), is_valid, v2, &arr); CheckZeroCopy(*arr, date32()); + // ARROW-1773: zero copy cast to integer + CheckZeroCopy(*arr, int32()); + ArrayFromVector(date64(), is_valid, v3, &arr); CheckZeroCopy(*arr, date64()); + // ARROW-1773: zero copy cast to integer + CheckZeroCopy(*arr, int64()); + // Divide, truncate vector v8 = {0, 100 * F + 123, 200 * F + 456, F + 123, 2 * F + 456}; vector e8 = {0, 100, 200, 1, 2}; @@ -497,22 +593,54 @@ TEST_F(TestCast, ToDouble) { options); } +TEST_F(TestCast, ChunkedArray) { + vector values1 = {0, 1, 2}; + vector values2 = {3, 4, 5}; + + auto type = int16(); + auto out_type = int64(); + + auto a1 = _MakeArray(type, values1, {}); + auto a2 = _MakeArray(type, values2, {}); + + ArrayVector arrays = {a1, a2}; + auto carr = std::make_shared(arrays); + + CastOptions options; + + Datum out; + ASSERT_OK(Cast(&this->ctx_, Datum(carr), out_type, options, &out)); + ASSERT_EQ(Datum::CHUNKED_ARRAY, out.kind()); + + auto out_carr = out.chunked_array(); + + vector ex_values1 = {0, 1, 2}; + vector ex_values2 
= {3, 4, 5}; + auto a3 = _MakeArray(out_type, ex_values1, {}); + auto a4 = _MakeArray(out_type, ex_values2, {}); + + ArrayVector ex_arrays = {a3, a4}; + auto ex_carr = std::make_shared(ex_arrays); + + ASSERT_TRUE(out.chunked_array()->Equals(*ex_carr)); +} + TEST_F(TestCast, UnsupportedTarget) { vector is_valid = {true, false, true, true, true}; vector v1 = {0, 1, 2, 3, 4}; - std::shared_ptr arr; + shared_ptr arr; ArrayFromVector(int32(), is_valid, v1, &arr); - std::shared_ptr result; + shared_ptr result; ASSERT_RAISES(NotImplemented, Cast(&this->ctx_, *arr, utf8(), {}, &result)); } TEST_F(TestCast, DateTimeZeroCopy) { vector is_valid = {true, false, true, true, true}; - std::shared_ptr arr; vector v1 = {0, 70000, 2000, 1000, 0}; + shared_ptr arr; ArrayFromVector(int32(), is_valid, v1, &arr); CheckZeroCopy(*arr, time32(TimeUnit::SECOND)); @@ -532,7 +660,7 @@ TEST_F(TestCast, FromNull) { NullArray arr(length); - std::shared_ptr result; + shared_ptr result; ASSERT_OK(Cast(&ctx_, arr, int32(), {}, &result)); ASSERT_EQ(length, result->length()); @@ -550,7 +678,7 @@ TEST_F(TestCast, PreallocatedMemory) { const int64_t length = 5; - std::shared_ptr arr; + shared_ptr arr; vector v1 = {0, 70000, 2000, 1000, 0}; vector e1 = {0, 70000, 2000, 1000, 0}; ArrayFromVector(int32(), is_valid, v1, &arr); @@ -560,26 +688,87 @@ TEST_F(TestCast, PreallocatedMemory) { std::unique_ptr kernel; ASSERT_OK(GetCastFunction(*int32(), out_type, options, &kernel)); - auto out_data = std::make_shared(out_type, length); + auto out_data = ArrayData::Make(out_type, length); - std::shared_ptr out_values; + shared_ptr out_values; ASSERT_OK(this->ctx_.Allocate(length * sizeof(int64_t), &out_values)); out_data->buffers.push_back(nullptr); out_data->buffers.push_back(out_values); - ASSERT_OK(kernel->Call(&this->ctx_, *arr, out_data.get())); + Datum out(out_data); + ASSERT_OK(kernel->Call(&this->ctx_, Datum(arr), &out)); // Buffer address unchanged ASSERT_EQ(out_values.get(), out_data->buffers[1].get()); - 
std::shared_ptr result = MakeArray(out_data); - std::shared_ptr expected; + shared_ptr result = MakeArray(out_data); + shared_ptr expected; ArrayFromVector(int64(), is_valid, e1, &expected); ASSERT_ARRAYS_EQUAL(*expected, *result); } +template +void CheckOffsetOutputCase(FunctionContext* ctx, const std::shared_ptr& in_type, + const vector& in_values, + const std::shared_ptr& out_type, + const vector& out_values) { + using OutTraits = TypeTraits; + + CastOptions options; + + const int64_t length = static_cast(in_values.size()); + + shared_ptr arr, expected; + ArrayFromVector(in_type, in_values, &arr); + ArrayFromVector(out_type, out_values, &expected); + + shared_ptr out_buffer; + ASSERT_OK(ctx->Allocate(OutTraits::bytes_required(length), &out_buffer)); + + std::unique_ptr kernel; + ASSERT_OK(GetCastFunction(*in_type, out_type, options, &kernel)); + + const int64_t first_half = length / 2; + + auto out_data = ArrayData::Make(out_type, length, {nullptr, out_buffer}); + auto out_second_data = out_data->Copy(); + out_second_data->offset = first_half; + + Datum out_first(out_data); + Datum out_second(out_second_data); + + // Cast each bit + ASSERT_OK(kernel->Call(ctx, Datum(arr->Slice(0, first_half)), &out_first)); + ASSERT_OK(kernel->Call(ctx, Datum(arr->Slice(first_half)), &out_second)); + + shared_ptr result = MakeArray(out_data); + + ASSERT_ARRAYS_EQUAL(*expected, *result); +} + +TEST_F(TestCast, OffsetOutputBuffer) { + // ARROW-1735 + vector v1 = {0, 10000, 2000, 1000, 0}; + vector e1 = {0, 10000, 2000, 1000, 0}; + + auto in_type = int32(); + auto out_type = int64(); + CheckOffsetOutputCase(&this->ctx_, in_type, v1, + out_type, e1); + + vector e2 = {false, true, true, true, false}; + + out_type = boolean(); + CheckOffsetOutputCase(&this->ctx_, in_type, v1, + boolean(), e2); + + vector e3 = {0, 10000, 2000, 1000, 0}; + CheckOffsetOutputCase(&this->ctx_, in_type, v1, + int16(), e3); +} + template class TestDictionaryCast : public TestCast {}; @@ -592,13 +781,343 @@ 
TYPED_TEST_CASE(TestDictionaryCast, TestTypes); TYPED_TEST(TestDictionaryCast, Basic) { CastOptions options; - std::shared_ptr plain_array = + shared_ptr plain_array = TestBase::MakeRandomArray::ArrayType>(10, 2); - std::shared_ptr dict_array; + Datum out; + ASSERT_OK(DictionaryEncode(&this->ctx_, Datum(plain_array->data()), &out)); + + this->CheckPass(*MakeArray(out.array()), *plain_array, plain_array->type(), options); +} + +/*TYPED_TEST(TestDictionaryCast, Reverse) { + CastOptions options; + shared_ptr plain_array = + TestBase::MakeRandomArray::ArrayType>(10, 2); + + shared_ptr dict_array; ASSERT_OK(EncodeArrayToDictionary(*plain_array, this->pool_, &dict_array)); - this->CheckPass(*dict_array, *plain_array, plain_array->type(), options); + this->CheckPass(*plain_array, *dict_array, dict_array->type(), options); +}*/ + +TEST_F(TestCast, ListToList) { + CastOptions options; + std::shared_ptr offsets; + + vector offsets_values = {0, 1, 2, 5, 7, 7, 8, 10}; + std::vector offsets_is_valid = {true, true, true, true, false, true, true, true}; + ArrayFromVector(offsets_is_valid, offsets_values, &offsets); + + shared_ptr int32_plain_array = + TestBase::MakeRandomArray::ArrayType>(10, 2); + std::shared_ptr int32_list_array; + ASSERT_OK( + ListArray::FromArrays(*offsets, *int32_plain_array, pool_, &int32_list_array)); + + std::shared_ptr int64_plain_array; + ASSERT_OK(Cast(&this->ctx_, *int32_plain_array, int64(), options, &int64_plain_array)); + std::shared_ptr int64_list_array; + ASSERT_OK( + ListArray::FromArrays(*offsets, *int64_plain_array, pool_, &int64_list_array)); + + std::shared_ptr float64_plain_array; + ASSERT_OK( + Cast(&this->ctx_, *int32_plain_array, float64(), options, &float64_plain_array)); + std::shared_ptr float64_list_array; + ASSERT_OK( + ListArray::FromArrays(*offsets, *float64_plain_array, pool_, &float64_list_array)); + + this->CheckPass(*int32_list_array, *int64_list_array, int64_list_array->type(), + options); + this->CheckPass(*int32_list_array, 
*float64_list_array, float64_list_array->type(), + options); + this->CheckPass(*int64_list_array, *int32_list_array, int32_list_array->type(), + options); + this->CheckPass(*int64_list_array, *float64_list_array, float64_list_array->type(), + options); + this->CheckPass(*float64_list_array, *int32_list_array, int32_list_array->type(), + options); + this->CheckPass(*float64_list_array, *int64_list_array, int64_list_array->type(), + options); +} + +// ---------------------------------------------------------------------- +// Dictionary tests + +template +void CheckUnique(FunctionContext* ctx, const shared_ptr& type, + const vector& in_values, const vector& in_is_valid, + const vector& out_values, const vector& out_is_valid) { + shared_ptr input = _MakeArray(type, in_values, in_is_valid); + shared_ptr expected = _MakeArray(type, out_values, out_is_valid); + + shared_ptr result; + ASSERT_OK(Unique(ctx, Datum(input), &result)); + ASSERT_ARRAYS_EQUAL(*expected, *result); +} + +template +void CheckDictEncode(FunctionContext* ctx, const shared_ptr& type, + const vector& in_values, const vector& in_is_valid, + const vector& out_values, const vector& out_is_valid, + const vector& out_indices) { + shared_ptr input = _MakeArray(type, in_values, in_is_valid); + shared_ptr ex_dict = _MakeArray(type, out_values, out_is_valid); + shared_ptr ex_indices = + _MakeArray(int32(), out_indices, in_is_valid); + + DictionaryArray expected(dictionary(int32(), ex_dict), ex_indices); + + Datum datum_out; + ASSERT_OK(DictionaryEncode(ctx, Datum(input), &datum_out)); + shared_ptr result = MakeArray(datum_out.array()); + + ASSERT_ARRAYS_EQUAL(expected, *result); +} + +class TestHashKernel : public ComputeFixture, public TestBase {}; + +template +class TestHashKernelPrimitive : public ComputeFixture, public TestBase {}; + +typedef ::testing::Types + PrimitiveDictionaries; + +TYPED_TEST_CASE(TestHashKernelPrimitive, PrimitiveDictionaries); + +TYPED_TEST(TestHashKernelPrimitive, Unique) { + using T 
= typename TypeParam::c_type; + auto type = TypeTraits::type_singleton(); + CheckUnique(&this->ctx_, type, {2, 1, 2, 1}, {true, false, true, true}, + {2, 1}, {}); +} + +TYPED_TEST(TestHashKernelPrimitive, DictEncode) { + using T = typename TypeParam::c_type; + auto type = TypeTraits::type_singleton(); + CheckDictEncode(&this->ctx_, type, {2, 1, 2, 1, 2, 3}, + {true, false, true, true, true, true}, {2, 1, 3}, {}, + {0, 0, 0, 1, 0, 2}); +} + +TYPED_TEST(TestHashKernelPrimitive, PrimitiveResizeTable) { + using T = typename TypeParam::c_type; + // Skip this test for (u)int8 + if (sizeof(Scalar) == 1) { + return; + } + + const int64_t kTotalValues = 1000000; + const int64_t kRepeats = 5; + + vector values; + vector uniques; + vector indices; + for (int64_t i = 0; i < kTotalValues * kRepeats; i++) { + const auto val = static_cast(i % kTotalValues); + values.push_back(val); + + if (i < kTotalValues) { + uniques.push_back(val); + } + indices.push_back(static_cast(i % kTotalValues)); + } + + auto type = TypeTraits::type_singleton(); + CheckUnique(&this->ctx_, type, values, {}, uniques, {}); + + CheckDictEncode(&this->ctx_, type, values, {}, uniques, {}, indices); +} + +TEST_F(TestHashKernel, UniqueTimeTimestamp) { + CheckUnique(&this->ctx_, time32(TimeUnit::SECOND), {2, 1, 2, 1}, + {true, false, true, true}, {2, 1}, {}); + + CheckUnique(&this->ctx_, time64(TimeUnit::NANO), {2, 1, 2, 1}, + {true, false, true, true}, {2, 1}, {}); + + CheckUnique(&this->ctx_, timestamp(TimeUnit::NANO), + {2, 1, 2, 1}, {true, false, true, true}, {2, 1}, + {}); +} + +TEST_F(TestHashKernel, UniqueBoolean) { + CheckUnique(&this->ctx_, boolean(), {true, true, false, true}, + {true, false, true, true}, {true, false}, {}); + + CheckUnique(&this->ctx_, boolean(), {false, true, false, true}, + {true, false, true, true}, {false, true}, {}); + + // No nulls + CheckUnique(&this->ctx_, boolean(), {true, true, false, true}, {}, + {true, false}, {}); + + CheckUnique(&this->ctx_, boolean(), {false, true, 
false, true}, {}, + {false, true}, {}); +} + +TEST_F(TestHashKernel, DictEncodeBoolean) { + CheckDictEncode( + &this->ctx_, boolean(), {true, true, false, true, false}, + {true, false, true, true, true}, {true, false}, {}, {0, 0, 1, 0, 1}); + + CheckDictEncode( + &this->ctx_, boolean(), {false, true, false, true, false}, + {true, false, true, true, true}, {false, true}, {}, {0, 0, 0, 1, 0}); + + // No nulls + CheckDictEncode(&this->ctx_, boolean(), + {true, true, false, true, false}, {}, {true, false}, + {}, {0, 0, 1, 0, 1}); + + CheckDictEncode(&this->ctx_, boolean(), + {false, true, false, true, false}, {}, {false, true}, + {}, {0, 1, 0, 1, 0}); +} + +TEST_F(TestHashKernel, UniqueBinary) { + CheckUnique(&this->ctx_, binary(), + {"test", "", "test2", "test"}, + {true, false, true, true}, {"test", "test2"}, {}); + + CheckUnique(&this->ctx_, utf8(), {"test", "", "test2", "test"}, + {true, false, true, true}, {"test", "test2"}, {}); +} + +TEST_F(TestHashKernel, DictEncodeBinary) { + CheckDictEncode( + &this->ctx_, binary(), {"test", "", "test2", "test", "baz"}, + {true, false, true, true, true}, {"test", "test2", "baz"}, {}, {0, 0, 1, 0, 2}); + + CheckDictEncode( + &this->ctx_, utf8(), {"test", "", "test2", "test", "baz"}, + {true, false, true, true, true}, {"test", "test2", "baz"}, {}, {0, 0, 1, 0, 2}); +} + +TEST_F(TestHashKernel, BinaryResizeTable) { + const int64_t kTotalValues = 10000; + const int64_t kRepeats = 10; + + vector values; + vector uniques; + vector indices; + for (int64_t i = 0; i < kTotalValues * kRepeats; i++) { + int64_t index = i % kTotalValues; + std::stringstream ss; + ss << "test" << index; + std::string val = ss.str(); + + values.push_back(val); + + if (i < kTotalValues) { + uniques.push_back(val); + } + indices.push_back(static_cast(i % kTotalValues)); + } + + CheckUnique(&this->ctx_, binary(), values, {}, uniques, {}); + CheckDictEncode(&this->ctx_, binary(), values, {}, uniques, {}, + indices); + + CheckUnique(&this->ctx_, utf8(), values, 
{}, uniques, {}); + CheckDictEncode(&this->ctx_, utf8(), values, {}, uniques, {}, + indices); +} + +TEST_F(TestHashKernel, UniqueFixedSizeBinary) { + CheckUnique( + &this->ctx_, fixed_size_binary(5), {"aaaaa", "", "bbbbb", "aaaaa"}, + {true, false, true, true}, {"aaaaa", "bbbbb"}, {}); +} + +TEST_F(TestHashKernel, DictEncodeFixedSizeBinary) { + CheckDictEncode( + &this->ctx_, fixed_size_binary(5), {"bbbbb", "", "bbbbb", "aaaaa", "ccccc"}, + {true, false, true, true, true}, {"bbbbb", "aaaaa", "ccccc"}, {}, {0, 0, 0, 1, 2}); +} + +TEST_F(TestHashKernel, FixedSizeBinaryResizeTable) { + const int64_t kTotalValues = 10000; + const int64_t kRepeats = 10; + + vector values; + vector uniques; + vector indices; + for (int64_t i = 0; i < kTotalValues * kRepeats; i++) { + int64_t index = i % kTotalValues; + std::stringstream ss; + ss << "test" << static_cast(index / 128) << static_cast(index % 128); + std::string val = ss.str(); + + values.push_back(val); + + if (i < kTotalValues) { + uniques.push_back(val); + } + indices.push_back(static_cast(i % kTotalValues)); + } + + auto type = fixed_size_binary(6); + CheckUnique(&this->ctx_, type, values, {}, uniques, + {}); + CheckDictEncode(&this->ctx_, type, values, {}, + uniques, {}, indices); +} + +TEST_F(TestHashKernel, UniqueDecimal) { + vector values{12, 12, 11, 12}; + vector expected{12, 11}; + + CheckUnique(&this->ctx_, decimal(2, 0), values, + {true, false, true, true}, expected, {}); +} + +TEST_F(TestHashKernel, DictEncodeDecimal) { + vector values{12, 12, 11, 12, 13}; + vector expected{12, 11, 13}; + + CheckDictEncode(&this->ctx_, decimal(2, 0), values, + {true, false, true, true, true}, expected, + {}, {0, 0, 1, 0, 2}); +} + +TEST_F(TestHashKernel, ChunkedArrayInvoke) { + vector values1 = {"foo", "bar", "foo"}; + vector values2 = {"bar", "baz", "quuux", "foo"}; + + auto type = utf8(); + auto a1 = _MakeArray(type, values1, {}); + auto a2 = _MakeArray(type, values2, {}); + + vector dict_values = {"foo", "bar", "baz", 
"quuux"}; + auto ex_dict = _MakeArray(type, dict_values, {}); + + ArrayVector arrays = {a1, a2}; + auto carr = std::make_shared(arrays); + + // Unique + shared_ptr result; + ASSERT_OK(Unique(&this->ctx_, Datum(carr), &result)); + ASSERT_ARRAYS_EQUAL(*ex_dict, *result); + + // Dictionary encode + auto dict_type = dictionary(int32(), ex_dict); + + auto i1 = _MakeArray(int32(), {0, 1, 0}, {}); + auto i2 = _MakeArray(int32(), {1, 2, 3, 0}, {}); + + ArrayVector dict_arrays = {std::make_shared(dict_type, i1), + std::make_shared(dict_type, i2)}; + auto dict_carr = std::make_shared(dict_arrays); + + Datum encoded_out; + ASSERT_OK(DictionaryEncode(&this->ctx_, Datum(carr), &encoded_out)); + ASSERT_EQ(Datum::CHUNKED_ARRAY, encoded_out.kind()); + + ASSERT_TRUE(encoded_out.chunked_array()->Equals(*dict_carr)); } } // namespace compute diff --git a/cpp/src/arrow/compute/context.cc b/cpp/src/arrow/compute/context.cc index 792dc4f386939..63aa341a3d718 100644 --- a/cpp/src/arrow/compute/context.cc +++ b/cpp/src/arrow/compute/context.cc @@ -20,11 +20,16 @@ #include #include "arrow/buffer.h" +#include "arrow/util/cpu-info.h" namespace arrow { namespace compute { -FunctionContext::FunctionContext(MemoryPool* pool) : pool_(pool) {} +FunctionContext::FunctionContext(MemoryPool* pool) : pool_(pool) { + if (!::arrow::CpuInfo::initialized()) { + ::arrow::CpuInfo::Init(); + } +} MemoryPool* FunctionContext::memory_pool() const { return pool_; } diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index 4e072a7c143ed..0bfa55cfee198 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -18,7 +18,15 @@ #ifndef ARROW_COMPUTE_KERNEL_H #define ARROW_COMPUTE_KERNEL_H +#include +#include + #include "arrow/array.h" +#include "arrow/record_batch.h" +#include "arrow/table.h" +#include "arrow/util/macros.h" +#include "arrow/util/variant.h" +#include "arrow/util/visibility.h" namespace arrow { namespace compute { @@ -32,11 +40,99 @@ class 
ARROW_EXPORT OpKernel { virtual ~OpKernel() = default; }; +/// \brief Placeholder for Scalar values until we implement these +struct ARROW_EXPORT Scalar { + ~Scalar() {} + + ARROW_DISALLOW_COPY_AND_ASSIGN(Scalar); +}; + +/// \class Datum +/// \brief Variant type for various Arrow C++ data structures +struct ARROW_EXPORT Datum { + enum type { NONE, SCALAR, ARRAY, CHUNKED_ARRAY, RECORD_BATCH, TABLE, COLLECTION }; + + util::variant, std::shared_ptr, + std::shared_ptr, std::shared_ptr, + std::shared_ptr
, std::vector> + value; + + /// \brief Empty datum, to be populated elsewhere + Datum() : value(nullptr) {} + + explicit Datum(const std::shared_ptr& value) : value(value) {} + + explicit Datum(const std::shared_ptr& value) : value(value) {} + + explicit Datum(const std::shared_ptr& value) : Datum(value->data()) {} + + explicit Datum(const std::shared_ptr& value) : value(value) {} + + explicit Datum(const std::shared_ptr& value) : value(value) {} + + explicit Datum(const std::shared_ptr
& value) : value(value) {} + + explicit Datum(const std::vector& value) : value(value) {} + + ~Datum() {} + + Datum(const Datum& other) noexcept { this->value = other.value; } + + Datum::type kind() const { + switch (this->value.which()) { + case 0: + return Datum::NONE; + case 1: + return Datum::SCALAR; + case 2: + return Datum::ARRAY; + case 3: + return Datum::CHUNKED_ARRAY; + case 4: + return Datum::RECORD_BATCH; + case 5: + return Datum::TABLE; + case 6: + return Datum::COLLECTION; + default: + return Datum::NONE; + } + } + + std::shared_ptr array() const { + return util::get>(this->value); + } + + std::shared_ptr chunked_array() const { + return util::get>(this->value); + } + + const std::vector collection() const { + return util::get>(this->value); + } + + bool is_arraylike() const { + return this->kind() == Datum::ARRAY || this->kind() == Datum::CHUNKED_ARRAY; + } + + /// \brief The value type of the variant, if any + /// + /// \return nullptr if no type + std::shared_ptr type() const { + if (this->kind() == Datum::ARRAY) { + return util::get>(this->value)->type; + } else if (this->kind() == Datum::CHUNKED_ARRAY) { + return util::get>(this->value)->type(); + } + return nullptr; + } +}; + /// \class UnaryKernel /// \brief An array-valued function of a single input argument class ARROW_EXPORT UnaryKernel : public OpKernel { public: - virtual Status Call(FunctionContext* ctx, const Array& input, ArrayData* out) = 0; + virtual Status Call(FunctionContext* ctx, const Datum& input, Datum* out) = 0; }; } // namespace compute diff --git a/cpp/Brewfile b/cpp/src/arrow/compute/kernels/CMakeLists.txt similarity index 88% rename from cpp/Brewfile rename to cpp/src/arrow/compute/kernels/CMakeLists.txt index 5f82cacc55991..715e6c6612fe4 100644 --- a/cpp/Brewfile +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-brew "jemalloc" -brew "ccache" -brew "boost" -brew "cmake" +install(FILES + cast.h + hash.h + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/compute/kernels") diff --git a/cpp/src/arrow/compute/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc similarity index 68% rename from cpp/src/arrow/compute/cast.cc rename to cpp/src/arrow/compute/kernels/cast.cc index 68a2b12379e34..afa05485f65b4 100644 --- a/cpp/src/arrow/compute/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/compute/cast.h" +#include "arrow/compute/kernels/cast.h" #include #include @@ -26,6 +26,7 @@ #include #include #include +#include #include "arrow/array.h" #include "arrow/buffer.h" @@ -39,6 +40,7 @@ #include "arrow/compute/context.h" #include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/util-internal.h" #ifdef ARROW_EXTRA_ERROR_CONTEXT @@ -69,23 +71,7 @@ namespace arrow { namespace compute { -template -inline const T* GetValuesAs(const ArrayData& data, int i) { - return reinterpret_cast(data.buffers[i]->data()) + data.offset; -} - -namespace { - -void CopyData(const Array& input, ArrayData* output) { - auto in_data = input.data(); - output->length = in_data->length; - output->null_count = input.null_count(); - output->buffers = in_data->buffers; - output->offset = in_data->offset; - output->child_data = in_data->child_data; -} - -} // namespace +constexpr int64_t kMillisecondsInDay = 86400000; // ---------------------------------------------------------------------- // Zero copy casts @@ -105,10 +91,14 @@ struct is_zero_copy_cast< // From integers to date/time types with zero copy template struct is_zero_copy_cast< - O, I, typename std::enable_if::value && - (std::is_base_of::value || - std::is_base_of::value || - std::is_base_of::value)>::type> { + O, I, + typename std::enable_if< + (std::is_base_of::value && + (std::is_base_of::value || std::is_base_of::value || + 
std::is_base_of::value)) || + (std::is_base_of::value && + (std::is_base_of::value || std::is_base_of::value || + std::is_base_of::value))>::type> { using O_T = typename O::c_type; using I_T = typename I::c_type; @@ -121,8 +111,8 @@ struct CastFunctor {}; // Indicated no computation required template struct CastFunctor::value>::type> { - void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input, - ArrayData* output) { + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, ArrayData* output) { CopyData(input, output); } }; @@ -133,19 +123,14 @@ struct CastFunctor::value> template struct CastFunctor::value>::type> { - void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input, - ArrayData* output) { - // Simply initialize data to 0 - auto buf = output->buffers[1]; - DCHECK_EQ(output->offset, 0); - memset(buf->mutable_data(), 0, buf->size()); - } + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, ArrayData* output) {} }; template <> struct CastFunctor { - void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input, - ArrayData* output) {} + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, ArrayData* output) {} }; // ---------------------------------------------------------------------- @@ -153,19 +138,17 @@ struct CastFunctor { // Cast from Boolean to other numbers template -struct CastFunctor::value>::type> { - void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input, - ArrayData* output) { +struct CastFunctor> { + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, ArrayData* output) { using c_type = typename T::c_type; constexpr auto kOne = static_cast(1); constexpr auto kZero = static_cast(0); - auto in_data = input.data(); - internal::BitmapReader bit_reader(in_data->buffers[1]->data(), 
in_data->offset, - in_data->length); - auto out = reinterpret_cast(output->buffers[1]->mutable_data()); - for (int64_t i = 0; i < input.length(); ++i) { + internal::BitmapReader bit_reader(input.buffers[1]->data(), input.offset, + input.length); + auto out = GetMutableValues(output, 1); + for (int64_t i = 0; i < input.length; ++i) { *out++ = bit_reader.IsSet() ? kOne : kZero; bit_reader.Next(); } @@ -209,41 +192,46 @@ template struct CastFunctor::value && std::is_base_of::value && !std::is_same::value>::type> { - void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input, - ArrayData* output) { - using in_type = typename I::c_type; - DCHECK_EQ(output->offset, 0); - - const in_type* in_data = GetValuesAs(*input.data(), 1); - uint8_t* out_data = reinterpret_cast(output->buffers[1]->mutable_data()); - for (int64_t i = 0; i < input.length(); ++i) { - BitUtil::SetBitTo(out_data, i, (*in_data++) != 0); + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, ArrayData* output) { + auto in_data = GetValues(input, 1); + internal::BitmapWriter writer(output->buffers[1]->mutable_data(), output->offset, + input.length); + + for (int64_t i = 0; i < input.length; ++i) { + if (*in_data++ != 0) { + writer.Set(); + } else { + writer.Clear(); + } + writer.Next(); } + writer.Finish(); } }; template struct CastFunctor::value>::type> { - void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input, - ArrayData* output) { + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, ArrayData* output) { using in_type = typename I::c_type; using out_type = typename O::c_type; - DCHECK_EQ(output->offset, 0); - auto in_offset = input.offset(); + auto in_offset = input.offset; - const in_type* in_data = GetValuesAs(*input.data(), 1); - auto out_data = reinterpret_cast(output->buffers[1]->mutable_data()); + const in_type* in_data = GetValues(input, 1); + auto out_data = 
GetMutableValues(output, 1); if (!options.allow_int_overflow) { constexpr in_type kMax = static_cast(std::numeric_limits::max()); constexpr in_type kMin = static_cast(std::numeric_limits::min()); - if (input.null_count() > 0) { - internal::BitmapReader is_valid_reader(input.data()->buffers[0]->data(), - in_offset, input.length()); - for (int64_t i = 0; i < input.length(); ++i) { + // Null count may be -1 if the input array had been sliced + if (input.null_count != 0) { + internal::BitmapReader is_valid_reader(input.buffers[0]->data(), in_offset, + input.length); + for (int64_t i = 0; i < input.length; ++i) { if (ARROW_PREDICT_FALSE(is_valid_reader.IsSet() && (*in_data > kMax || *in_data < kMin))) { ctx->SetStatus(Status::Invalid("Integer value out of bounds")); @@ -252,7 +240,7 @@ struct CastFunctor kMax || *in_data < kMin)) { ctx->SetStatus(Status::Invalid("Integer value out of bounds")); } @@ -260,7 +248,7 @@ struct CastFunctor(*in_data++); } } @@ -271,14 +259,14 @@ template struct CastFunctor::value && !is_integer_downcast::value>::type> { - void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input, - ArrayData* output) { + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, ArrayData* output) { using in_type = typename I::c_type; using out_type = typename O::c_type; - const in_type* in_data = GetValuesAs(*input.data(), 1); - auto out_data = reinterpret_cast(output->buffers[1]->mutable_data()); - for (int64_t i = 0; i < input.length(); ++i) { + const in_type* in_data = GetValues(input, 1); + auto out_data = GetMutableValues(output, 1); + for (int64_t i = 0; i < input.length; ++i) { *out_data++ = static_cast(*in_data++); } } @@ -288,32 +276,53 @@ struct CastFunctor -inline void ShiftTime(FunctionContext* ctx, const CastOptions& options, - const bool is_multiply, const int64_t factor, const Array& input, - ArrayData* output) { - const in_type* in_data = GetValuesAs(*input.data(), 1); - auto 
out_data = reinterpret_cast(output->buffers[1]->mutable_data()); - - if (is_multiply) { - for (int64_t i = 0; i < input.length(); i++) { +void ShiftTime(FunctionContext* ctx, const CastOptions& options, const bool is_multiply, + const int64_t factor, const ArrayData& input, ArrayData* output) { + const in_type* in_data = GetValues(input, 1); + auto out_data = GetMutableValues(output, 1); + + if (factor == 1) { + for (int64_t i = 0; i < input.length; i++) { + out_data[i] = static_cast(in_data[i]); + } + } else if (is_multiply) { + for (int64_t i = 0; i < input.length; i++) { out_data[i] = static_cast(in_data[i] * factor); } } else { if (options.allow_time_truncate) { - for (int64_t i = 0; i < input.length(); i++) { + for (int64_t i = 0; i < input.length; i++) { out_data[i] = static_cast(in_data[i] / factor); } } else { - for (int64_t i = 0; i < input.length(); i++) { - out_data[i] = static_cast(in_data[i] / factor); - if (input.IsValid(i) && (out_data[i] * factor != in_data[i])) { - std::stringstream ss; - ss << "Casting from " << input.type()->ToString() << " to " - << output->type->ToString() << " would lose data: " << in_data[i]; - ctx->SetStatus(Status::Invalid(ss.str())); - break; +#define RAISE_INVALID_CAST(VAL) \ + std::stringstream ss; \ + ss << "Casting from " << input.type->ToString() << " to " << output->type->ToString() \ + << " would lose data: " << VAL; \ + ctx->SetStatus(Status::Invalid(ss.str())); + + if (input.null_count != 0) { + internal::BitmapReader bit_reader(input.buffers[0]->data(), input.offset, + input.length); + for (int64_t i = 0; i < input.length; i++) { + out_data[i] = static_cast(in_data[i] / factor); + if (bit_reader.IsSet() && (out_data[i] * factor != in_data[i])) { + RAISE_INVALID_CAST(in_data[i]); + break; + } + bit_reader.Next(); + } + } else { + for (int64_t i = 0; i < input.length; i++) { + out_data[i] = static_cast(in_data[i] / factor); + if (out_data[i] * factor != in_data[i]) { + RAISE_INVALID_CAST(in_data[i]); + break; + } } 
} + +#undef RAISE_INVALID_CAST } } } @@ -332,10 +341,10 @@ const std::pair kTimeConversionTable[4][4] = { template <> struct CastFunctor { - void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input, - ArrayData* output) { + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, ArrayData* output) { // If units are the same, zero copy, otherwise convert - const auto& in_type = static_cast(*input.type()); + const auto& in_type = static_cast(*input.type); const auto& out_type = static_cast(*output->type); if (in_type.unit() == out_type.unit()) { @@ -352,6 +361,56 @@ struct CastFunctor { } }; +template <> +struct CastFunctor { + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, ArrayData* output) { + const auto& in_type = static_cast(*input.type); + + static const int64_t kTimestampToDateFactors[4] = { + 86400LL, // SECOND + 86400LL * 1000LL, // MILLI + 86400LL * 1000LL * 1000LL, // MICRO + 86400LL * 1000LL * 1000LL * 1000LL, // NANO + }; + + const int64_t factor = kTimestampToDateFactors[static_cast(in_type.unit())]; + ShiftTime(ctx, options, false, factor, input, output); + } +}; + +template <> +struct CastFunctor { + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, ArrayData* output) { + const auto& in_type = static_cast(*input.type); + + std::pair conversion = + kTimeConversionTable[static_cast(in_type.unit())] + [static_cast(TimeUnit::MILLI)]; + + ShiftTime(ctx, options, conversion.first, conversion.second, input, + output); + + internal::BitmapReader bit_reader(input.buffers[0]->data(), input.offset, + input.length); + + // Ensure that intraday milliseconds have been zeroed out + auto out_data = GetMutableValues(output, 1); + for (int64_t i = 0; i < input.length; ++i) { + const int64_t remainder = out_data[i] % kMillisecondsInDay; + if (ARROW_PREDICT_FALSE(!options.allow_time_truncate && bit_reader.IsSet() && + 
remainder > 0)) { + ctx->SetStatus( + Status::Invalid("Timestamp value had non-zero intraday milliseconds")); + break; + } + out_data[i] -= remainder; + bit_reader.Next(); + } + } +}; + // ---------------------------------------------------------------------- // From one time32 or time64 to another @@ -359,13 +418,13 @@ template struct CastFunctor::value && std::is_base_of::value>::type> { - void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input, - ArrayData* output) { + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, ArrayData* output) { using in_t = typename I::c_type; using out_t = typename O::c_type; // If units are the same, zero copy, otherwise convert - const auto& in_type = static_cast(*input.type()); + const auto& in_type = static_cast(*input.type); const auto& out_type = static_cast(*output->type); if (in_type.unit() == out_type.unit()) { @@ -385,24 +444,65 @@ struct CastFunctor struct CastFunctor { - void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input, - ArrayData* output) { + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, ArrayData* output) { ShiftTime(ctx, options, true, kMillisecondsInDay, input, output); } }; template <> struct CastFunctor { - void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input, - ArrayData* output) { + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, ArrayData* output) { ShiftTime(ctx, options, false, kMillisecondsInDay, input, output); } }; +// ---------------------------------------------------------------------- +// List to List + +class ListCastKernel : public UnaryKernel { + public: + ListCastKernel(std::unique_ptr child_caster, + const std::shared_ptr& out_type) + : child_caster_(std::move(child_caster)), out_type_(out_type) {} + + Status Call(FunctionContext* ctx, const Datum& input, Datum* out) 
override { + DCHECK_EQ(Datum::ARRAY, input.kind()); + + const ArrayData& in_data = *input.array(); + DCHECK_EQ(Type::LIST, in_data.type->id()); + ArrayData* result; + + if (in_data.offset != 0) { + return Status::NotImplemented( + "Casting sliced lists (non-zero offset) not yet implemented"); + } + + if (out->kind() == Datum::NONE) { + out->value = ArrayData::Make(out_type_, in_data.length); + } + + result = out->array().get(); + + // Copy buffers from parent + result->buffers = in_data.buffers; + + Datum casted_child; + RETURN_NOT_OK(child_caster_->Call(ctx, Datum(in_data.child_data[0]), &casted_child)); + result->child_data.push_back(casted_child.array()); + + RETURN_IF_ERROR(ctx); + return Status::OK(); + } + + private: + std::unique_ptr child_caster_; + std::shared_ptr out_type_; +}; + // ---------------------------------------------------------------------- // Dictionary to other things @@ -415,11 +515,12 @@ void UnpackFixedSizeBinaryDictionary(FunctionContext* ctx, const Array& indices, internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(), indices.length()); - const index_c_type* in = GetValuesAs(*indices.data(), 1); + const index_c_type* in = GetValues(*indices.data(), 1); - uint8_t* out = output->buffers[1]->mutable_data(); int32_t byte_width = static_cast(*output->type).byte_width(); + + uint8_t* out = output->buffers[1]->mutable_data() + byte_width * output->offset; for (int64_t i = 0; i < indices.length(); ++i) { if (valid_bits_reader.IsSet()) { const uint8_t* value = dictionary.Value(in[i]); @@ -433,10 +534,11 @@ template struct CastFunctor< T, DictionaryType, typename std::enable_if::value>::type> { - void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input, - ArrayData* output) { - const DictionaryArray& dict_array = static_cast(input); - const DictionaryType& type = static_cast(*input.type()); + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, 
ArrayData* output) { + DictionaryArray dict_array(input.Copy()); + + const DictionaryType& type = static_cast(*input.type); const DataType& values_type = *type.dictionary()->type(); const FixedSizeBinaryArray& dictionary = static_cast(*type.dictionary()); @@ -479,7 +581,7 @@ Status UnpackBinaryDictionary(FunctionContext* ctx, const Array& indices, internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(), indices.length()); - const index_c_type* in = GetValuesAs(*indices.data(), 1); + const index_c_type* in = GetValues(*indices.data(), 1); for (int64_t i = 0; i < indices.length(); ++i) { if (valid_bits_reader.IsSet()) { int32_t length; @@ -504,10 +606,11 @@ Status UnpackBinaryDictionary(FunctionContext* ctx, const Array& indices, template struct CastFunctor::value>::type> { - void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input, - ArrayData* output) { - const DictionaryArray& dict_array = static_cast(input); - const DictionaryType& type = static_cast(*input.type()); + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, ArrayData* output) { + DictionaryArray dict_array(input.Copy()); + + const DictionaryType& type = static_cast(*input.type); const DataType& values_type = *type.dictionary()->type(); const BinaryArray& dictionary = static_cast(*type.dictionary()); @@ -545,12 +648,10 @@ struct CastFunctor void UnpackPrimitiveDictionary(const Array& indices, const c_type* dictionary, c_type* out) { - using index_c_type = typename IndexType::c_type; - internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(), indices.length()); - const index_c_type* in = GetValuesAs(*indices.data(), 1); + auto in = GetValues(*indices.data(), 1); for (int64_t i = 0; i < indices.length(); ++i) { if (valid_bits_reader.IsSet()) { out[i] = dictionary[in[i]]; @@ -563,21 +664,22 @@ void UnpackPrimitiveDictionary(const Array& indices, const c_type* dictionary, template 
struct CastFunctor::value>::type> { - void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input, - ArrayData* output) { + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, ArrayData* output) { using c_type = typename T::c_type; - const DictionaryArray& dict_array = static_cast(input); - const DictionaryType& type = static_cast(*input.type()); + DictionaryArray dict_array(input.Copy()); + + const DictionaryType& type = static_cast(*input.type); const DataType& values_type = *type.dictionary()->type(); // Check if values and output type match DCHECK(values_type.Equals(*output->type)) << "Dictionary type: " << values_type << " target type: " << (*output->type); - const c_type* dictionary = GetValuesAs(*type.dictionary()->data(), 1); + const c_type* dictionary = GetValues(*type.dictionary()->data(), 1); - auto out = reinterpret_cast(output->buffers[1]->mutable_data()); + auto out = GetMutableValues(output, 1); const Array& indices = *dict_array.indices(); switch (indices.type()->id()) { case Type::INT8: @@ -603,24 +705,23 @@ struct CastFunctor CastFunction; -static Status AllocateIfNotPreallocated(FunctionContext* ctx, const Array& input, +static Status AllocateIfNotPreallocated(FunctionContext* ctx, const ArrayData& input, bool can_pre_allocate_values, ArrayData* out) { - const int64_t length = input.length(); - - out->null_count = input.null_count(); + const int64_t length = input.length; + out->null_count = input.null_count; // Propagate bitmap unless we are null type - std::shared_ptr validity_bitmap = input.data()->buffers[0]; - if (input.type_id() == Type::NA) { + std::shared_ptr validity_bitmap = input.buffers[0]; + if (input.type->id() == Type::NA) { int64_t bitmap_size = BitUtil::BytesForBits(length); RETURN_NOT_OK(ctx->Allocate(bitmap_size, &validity_bitmap)); memset(validity_bitmap->mutable_data(), 0, bitmap_size); - } else if (input.offset() != 0) { - 
RETURN_NOT_OK(CopyBitmap(ctx->memory_pool(), validity_bitmap->data(), input.offset(), + } else if (input.offset != 0) { + RETURN_NOT_OK(CopyBitmap(ctx->memory_pool(), validity_bitmap->data(), input.offset, length, &validity_bitmap)); } @@ -673,17 +774,31 @@ static Status AllocateIfNotPreallocated(FunctionContext* ctx, const Array& input class CastKernel : public UnaryKernel { public: CastKernel(const CastOptions& options, const CastFunction& func, bool is_zero_copy, - bool can_pre_allocate_values) + bool can_pre_allocate_values, const std::shared_ptr& out_type) : options_(options), func_(func), is_zero_copy_(is_zero_copy), - can_pre_allocate_values_(can_pre_allocate_values) {} + can_pre_allocate_values_(can_pre_allocate_values), + out_type_(out_type) {} + + Status Call(FunctionContext* ctx, const Datum& input, Datum* out) override { + DCHECK_EQ(Datum::ARRAY, input.kind()); + + const ArrayData& in_data = *input.array(); + ArrayData* result; + + if (out->kind() == Datum::NONE) { + out->value = ArrayData::Make(out_type_, in_data.length); + } + + result = out->array().get(); - Status Call(FunctionContext* ctx, const Array& input, ArrayData* out) override { if (!is_zero_copy_) { - RETURN_NOT_OK(AllocateIfNotPreallocated(ctx, input, can_pre_allocate_values_, out)); + RETURN_NOT_OK( + AllocateIfNotPreallocated(ctx, in_data, can_pre_allocate_values_, result)); } - func_(ctx, options_, input, out); + func_(ctx, options_, in_data, result); + RETURN_IF_ERROR(ctx); return Status::OK(); } @@ -693,18 +808,19 @@ class CastKernel : public UnaryKernel { CastFunction func_; bool is_zero_copy_; bool can_pre_allocate_values_; + std::shared_ptr out_type_; }; -#define CAST_CASE(InType, OutType) \ - case OutType::type_id: \ - is_zero_copy = is_zero_copy_cast::value; \ - can_pre_allocate_values = \ - !(!is_binary_like(InType::type_id) && is_binary_like(OutType::type_id)); \ - func = [](FunctionContext* ctx, const CastOptions& options, const Array& input, \ - ArrayData* out) { \ - 
CastFunctor func; \ - func(ctx, options, input, out); \ - }; \ +#define CAST_CASE(InType, OutType) \ + case OutType::type_id: \ + is_zero_copy = is_zero_copy_cast::value; \ + can_pre_allocate_values = \ + !(!is_binary_like(InType::type_id) && is_binary_like(OutType::type_id)); \ + func = [](FunctionContext* ctx, const CastOptions& options, const ArrayData& input, \ + ArrayData* out) { \ + CastFunctor func; \ + func(ctx, options, input, out); \ + }; \ break; #define NUMERIC_CASES(FN, IN_TYPE) \ @@ -741,21 +857,29 @@ class CastKernel : public UnaryKernel { #define DATE32_CASES(FN, IN_TYPE) \ FN(Date32Type, Date32Type); \ - FN(Date32Type, Date64Type); + FN(Date32Type, Date64Type); \ + FN(Date32Type, Int32Type); #define DATE64_CASES(FN, IN_TYPE) \ FN(Date64Type, Date64Type); \ - FN(Date64Type, Date32Type); + FN(Date64Type, Date32Type); \ + FN(Date64Type, Int64Type); #define TIME32_CASES(FN, IN_TYPE) \ FN(Time32Type, Time32Type); \ - FN(Time32Type, Time64Type); + FN(Time32Type, Time64Type); \ + FN(Time32Type, Int32Type); #define TIME64_CASES(FN, IN_TYPE) \ FN(Time64Type, Time32Type); \ - FN(Time64Type, Time64Type); + FN(Time64Type, Time64Type); \ + FN(Time64Type, Int64Type); -#define TIMESTAMP_CASES(FN, IN_TYPE) FN(TimestampType, TimestampType); +#define TIMESTAMP_CASES(FN, IN_TYPE) \ + FN(TimestampType, TimestampType); \ + FN(TimestampType, Date32Type); \ + FN(TimestampType, Date64Type); \ + FN(TimestampType, Int64Type); #define DICTIONARY_CASES(FN, IN_TYPE) \ FN(IN_TYPE, NullType); \ @@ -775,26 +899,26 @@ class CastKernel : public UnaryKernel { FN(IN_TYPE, FloatType); \ FN(IN_TYPE, DoubleType); \ FN(IN_TYPE, FixedSizeBinaryType); \ - FN(IN_TYPE, DecimalType); \ + FN(IN_TYPE, Decimal128Type); \ FN(IN_TYPE, BinaryType); \ FN(IN_TYPE, StringType); -#define GET_CAST_FUNCTION(CASE_GENERATOR, InType) \ - static std::unique_ptr Get##InType##CastFunc( \ - const std::shared_ptr& out_type, const CastOptions& options) { \ - CastFunction func; \ - bool is_zero_copy = false; \ - 
bool can_pre_allocate_values = true; \ - switch (out_type->id()) { \ - CASE_GENERATOR(CAST_CASE, InType); \ - default: \ - break; \ - } \ - if (func != nullptr) { \ - return std::unique_ptr( \ - new CastKernel(options, func, is_zero_copy, can_pre_allocate_values)); \ - } \ - return nullptr; \ +#define GET_CAST_FUNCTION(CASE_GENERATOR, InType) \ + static std::unique_ptr Get##InType##CastFunc( \ + const std::shared_ptr& out_type, const CastOptions& options) { \ + CastFunction func; \ + bool is_zero_copy = false; \ + bool can_pre_allocate_values = true; \ + switch (out_type->id()) { \ + CASE_GENERATOR(CAST_CASE, InType); \ + default: \ + break; \ + } \ + if (func != nullptr) { \ + return std::unique_ptr(new CastKernel( \ + options, func, is_zero_copy, can_pre_allocate_values, out_type)); \ + } \ + return nullptr; \ } GET_CAST_FUNCTION(NULL_CASES, NullType); @@ -814,7 +938,6 @@ GET_CAST_FUNCTION(DATE64_CASES, Date64Type); GET_CAST_FUNCTION(TIME32_CASES, Time32Type); GET_CAST_FUNCTION(TIME64_CASES, Time64Type); GET_CAST_FUNCTION(TIMESTAMP_CASES, TimestampType); - GET_CAST_FUNCTION(DICTIONARY_CASES, DictionaryType); #define CAST_FUNCTION_CASE(InType) \ @@ -822,6 +945,26 @@ GET_CAST_FUNCTION(DICTIONARY_CASES, DictionaryType); *kernel = Get##InType##CastFunc(out_type, options); \ break +namespace { + +Status GetListCastFunc(const DataType& in_type, const std::shared_ptr& out_type, + const CastOptions& options, std::unique_ptr* kernel) { + if (out_type->id() != Type::LIST) { + // Kernel will be null + return Status::OK(); + } + const DataType& in_value_type = *static_cast(in_type).value_type(); + std::shared_ptr out_value_type = + static_cast(*out_type).value_type(); + std::unique_ptr child_caster; + RETURN_NOT_OK(GetCastFunction(in_value_type, out_value_type, options, &child_caster)); + *kernel = + std::unique_ptr(new ListCastKernel(std::move(child_caster), out_type)); + return Status::OK(); +} + +} // namespace + Status GetCastFunction(const DataType& in_type, const 
std::shared_ptr& out_type, const CastOptions& options, std::unique_ptr* kernel) { switch (in_type.id()) { @@ -843,6 +986,9 @@ Status GetCastFunction(const DataType& in_type, const std::shared_ptr& CAST_FUNCTION_CASE(Time64Type); CAST_FUNCTION_CASE(TimestampType); CAST_FUNCTION_CASE(DictionaryType); + case Type::LIST: + RETURN_NOT_OK(GetListCastFunc(in_type, out_type, options, kernel)); + break; default: break; } @@ -855,18 +1001,27 @@ Status GetCastFunction(const DataType& in_type, const std::shared_ptr& return Status::OK(); } -Status Cast(FunctionContext* ctx, const Array& array, +Status Cast(FunctionContext* ctx, const Datum& value, const std::shared_ptr& out_type, const CastOptions& options, - std::shared_ptr* out) { + Datum* out) { // Dynamic dispatch to obtain right cast function std::unique_ptr func; - RETURN_NOT_OK(GetCastFunction(*array.type(), out_type, options, &func)); + RETURN_NOT_OK(GetCastFunction(*value.type(), out_type, options, &func)); - // Data structure for output - auto out_data = std::make_shared(out_type, array.length()); + std::vector result; + RETURN_NOT_OK(detail::InvokeUnaryArrayKernel(ctx, func.get(), value, &result)); - RETURN_NOT_OK(func->Call(ctx, array, out_data.get())); - *out = MakeArray(out_data); + *out = detail::WrapDatumsLike(value, result); + return Status::OK(); +} + +Status Cast(FunctionContext* ctx, const Array& array, + const std::shared_ptr& out_type, const CastOptions& options, + std::shared_ptr* out) { + Datum datum_out; + RETURN_NOT_OK(Cast(ctx, Datum(array.data()), out_type, options, &datum_out)); + DCHECK_EQ(Datum::ARRAY, datum_out.kind()); + *out = MakeArray(datum_out.array()); return Status::OK(); } diff --git a/cpp/src/arrow/compute/cast.h b/cpp/src/arrow/compute/kernels/cast.h similarity index 69% rename from cpp/src/arrow/compute/cast.h rename to cpp/src/arrow/compute/kernels/cast.h index d7bde20d607db..b75bb7b6c15f4 100644 --- a/cpp/src/arrow/compute/cast.h +++ b/cpp/src/arrow/compute/kernels/cast.h @@ -15,25 
+15,26 @@ // specific language governing permissions and limitations // under the License. -#ifndef ARROW_COMPUTE_CAST_H -#define ARROW_COMPUTE_CAST_H +#ifndef ARROW_COMPUTE_KERNELS_CAST_H +#define ARROW_COMPUTE_KERNELS_CAST_H #include #include "arrow/status.h" #include "arrow/util/visibility.h" +#include "arrow/compute/kernel.h" + namespace arrow { class Array; +class ChunkedArray; +class Column; class DataType; namespace compute { -class FunctionContext; -class UnaryKernel; - -struct CastOptions { +struct ARROW_EXPORT CastOptions { CastOptions() : allow_int_overflow(false), allow_time_truncate(false) {} bool allow_int_overflow; @@ -48,7 +49,7 @@ Status GetCastFunction(const DataType& in_type, const std::shared_ptr& /// \brief Cast from one array type to another /// \param[in] context the FunctionContext -/// \param[in] array array to cast +/// \param[in] value array to cast /// \param[in] to_type type to cast to /// \param[in] options casting options /// \param[out] out resulting array @@ -56,11 +57,25 @@ Status GetCastFunction(const DataType& in_type, const std::shared_ptr& /// \since 0.7.0 /// \note API not yet finalized ARROW_EXPORT -Status Cast(FunctionContext* context, const Array& array, +Status Cast(FunctionContext* context, const Array& value, const std::shared_ptr& to_type, const CastOptions& options, std::shared_ptr* out); +/// \brief Cast from one value to another +/// \param[in] context the FunctionContext +/// \param[in] value datum to cast +/// \param[in] to_type type to cast to +/// \param[in] options casting options +/// \param[out] out resulting datum +/// +/// \since 0.8.0 +/// \note API not yet finalized +ARROW_EXPORT +Status Cast(FunctionContext* context, const Datum& value, + const std::shared_ptr& to_type, const CastOptions& options, + Datum* out); + } // namespace compute } // namespace arrow -#endif // ARROW_COMPUTE_CAST_H +#endif // ARROW_COMPUTE_KERNELS_CAST_H diff --git a/cpp/src/arrow/compute/kernels/hash.cc 
b/cpp/src/arrow/compute/kernels/hash.cc new file mode 100644 index 0000000000000..1face78bdebfb --- /dev/null +++ b/cpp/src/arrow/compute/kernels/hash.cc @@ -0,0 +1,838 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/kernels/hash.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/builder.h" +#include "arrow/compute/context.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/util-internal.h" +#include "arrow/util/hash-util.h" +#include "arrow/util/hash.h" + +namespace arrow { +namespace compute { + +namespace { + +enum class SIMDMode : char { NOSIMD, SSE4, AVX2 }; + +#define CHECK_IMPLEMENTED(KERNEL, FUNCNAME, TYPE) \ + if (!KERNEL) { \ + std::stringstream ss; \ + ss << FUNCNAME << " not implemented for " << type->ToString(); \ + return Status::NotImplemented(ss.str()); \ + } + +// This is a slight design concession -- some hash actions have the possibility +// of failure. 
Rather than introduce extra error checking into all actions, we +// will raise an internal exception so that only the actions where errors can +// occur will experience the extra overhead +class HashException : public std::exception { + public: + explicit HashException(const std::string& msg, StatusCode code = StatusCode::Invalid) + : msg_(msg), code_(code) {} + + ~HashException() throw() {} + + const char* what() const throw() override; + + StatusCode code() const { return code_; } + + private: + std::string msg_; + StatusCode code_; +}; + +const char* HashException::what() const throw() { return msg_.c_str(); } + +class HashTable { + public: + HashTable(const std::shared_ptr& type, MemoryPool* pool) + : type_(type), + pool_(pool), + initialized_(false), + hash_table_(nullptr), + hash_slots_(nullptr), + hash_table_size_(0), + mod_bitmask_(0) {} + + virtual ~HashTable() {} + + virtual Status Append(const ArrayData& input) = 0; + virtual Status Flush(Datum* out) = 0; + virtual Status GetDictionary(std::shared_ptr* out) = 0; + + protected: + Status Init(int64_t elements); + + std::shared_ptr type_; + MemoryPool* pool_; + bool initialized_; + + // The hash table contains integer indices that reference the set of observed + // distinct values + std::shared_ptr hash_table_; + hash_slot_t* hash_slots_; + + /// Size of the table. Must be a power of 2. 
+ int64_t hash_table_size_; + + /// Size at which we decide to resize + int64_t hash_table_load_threshold_; + + // Store hash_table_size_ - 1, so that j & mod_bitmask_ is equivalent to j % + // hash_table_size_, but uses far fewer CPU cycles + int64_t mod_bitmask_; +}; + +Status HashTable::Init(int64_t elements) { + DCHECK_EQ(elements, BitUtil::NextPower2(elements)); + RETURN_NOT_OK(internal::NewHashTable(elements, pool_, &hash_table_)); + hash_slots_ = reinterpret_cast(hash_table_->mutable_data()); + hash_table_size_ = elements; + hash_table_load_threshold_ = + static_cast(static_cast(elements) * kMaxHashTableLoad); + mod_bitmask_ = elements - 1; + initialized_ = true; + return Status::OK(); +} + +template +class HashTableKernel : public HashTable {}; + +// Types of hash actions +// +// unique: append to dictionary when not found, no-op with slot +// dictionary-encode: append to dictionary when not found, append slot # +// match: raise or set null when not found, otherwise append slot # +// isin: set false when not found, otherwise true +// value counts: append to dictionary when not found, increment count for slot + +template +class HashDictionary {}; + +// ---------------------------------------------------------------------- +// Hash table pass for nulls + +template +class HashTableKernel> : public HashTable { + public: + using HashTable::HashTable; + + Status Init() { + // No-op, do not even need to initialize hash table + return Status::OK(); + } + + Status Append(const ArrayData& arr) override { + if (!initialized_) { + RETURN_NOT_OK(Init()); + } + auto action = static_cast(this); + RETURN_NOT_OK(action->Reserve(arr.length)); + for (int64_t i = 0; i < arr.length; ++i) { + action->ObserveNull(); + } + return Status::OK(); + } + + Status GetDictionary(std::shared_ptr* out) override { + // TODO(wesm): handle null being a valid dictionary value + auto null_array = std::make_shared(0); + *out = null_array->data(); + return Status::OK(); + } +}; + +// 
---------------------------------------------------------------------- +// Hash table pass for primitive types + +template +struct HashDictionary> { + using T = typename Type::c_type; + + explicit HashDictionary(MemoryPool* pool) + : pool(pool), buffer(std::make_shared(pool)), size(0), capacity(0) {} + + Status Init() { + this->size = 0; + return Resize(kInitialHashTableSize); + } + + Status DoubleSize() { return Resize(this->size * 2); } + + Status Resize(const int64_t elements) { + RETURN_NOT_OK(this->buffer->Resize(elements * sizeof(T))); + + this->capacity = elements; + this->values = reinterpret_cast(this->buffer->mutable_data()); + return Status::OK(); + } + + MemoryPool* pool; + std::shared_ptr buffer; + T* values; + int64_t size; + int64_t capacity; +}; + +#define GENERIC_HASH_PASS(HASH_INNER_LOOP) \ + if (arr.null_count != 0) { \ + internal::BitmapReader valid_reader(arr.buffers[0]->data(), arr.offset, arr.length); \ + for (int64_t i = 0; i < arr.length; ++i) { \ + const bool is_null = valid_reader.IsNotSet(); \ + valid_reader.Next(); \ + \ + if (is_null) { \ + action->ObserveNull(); \ + continue; \ + } \ + \ + HASH_INNER_LOOP(); \ + } \ + } else { \ + for (int64_t i = 0; i < arr.length; ++i) { \ + HASH_INNER_LOOP(); \ + } \ + } + +template +class HashTableKernel> : public HashTable { + public: + using T = typename Type::c_type; + + HashTableKernel(const std::shared_ptr& type, MemoryPool* pool) + : HashTable(type, pool), dict_(pool) {} + + Status Init() { + RETURN_NOT_OK(dict_.Init()); + return HashTable::Init(kInitialHashTableSize); + } + + Status Append(const ArrayData& arr) override { + if (!initialized_) { + RETURN_NOT_OK(Init()); + } + + const T* values = GetValues(arr, 1); + auto action = static_cast(this); + + RETURN_NOT_OK(action->Reserve(arr.length)); + +#define HASH_INNER_LOOP() \ + const T value = values[i]; \ + int64_t j = HashValue(value) & mod_bitmask_; \ + hash_slot_t slot = hash_slots_[j]; \ + \ + while (kHashSlotEmpty != slot && 
dict_.values[slot] != value) { \ + ++j; \ + if (ARROW_PREDICT_FALSE(j == hash_table_size_)) { \ + j = 0; \ + } \ + slot = hash_slots_[j]; \ + } \ + \ + if (slot == kHashSlotEmpty) { \ + if (!Action::allow_expand) { \ + throw HashException("Encountered new dictionary value"); \ + } \ + \ + slot = static_cast(dict_.size); \ + hash_slots_[j] = slot; \ + dict_.values[dict_.size++] = value; \ + \ + action->ObserveNotFound(slot); \ + \ + if (ARROW_PREDICT_FALSE(dict_.size > hash_table_load_threshold_)) { \ + RETURN_NOT_OK(action->DoubleSize()); \ + } \ + } else { \ + action->ObserveFound(slot); \ + } + + GENERIC_HASH_PASS(HASH_INNER_LOOP); + +#undef HASH_INNER_LOOP + + return Status::OK(); + } + + Status GetDictionary(std::shared_ptr* out) override { + // TODO(wesm): handle null being in the dictionary + auto dict_data = dict_.buffer; + RETURN_NOT_OK(dict_data->Resize(dict_.size * sizeof(T), false)); + + *out = ArrayData::Make(type_, dict_.size, {nullptr, dict_data}, 0); + return Status::OK(); + } + + protected: + int64_t HashValue(const T& value) const { + // TODO(wesm): Use faster hash function for C types + return HashUtil::Hash(&value, sizeof(T), 0); + } + + Status DoubleTableSize() { +#define PRIMITIVE_INNER_LOOP \ + const T value = dict_.values[index]; \ + int64_t j = HashValue(value) & new_mod_bitmask; + + DOUBLE_TABLE_SIZE(, PRIMITIVE_INNER_LOOP); + +#undef PRIMITIVE_INNER_LOOP + + return dict_.Resize(hash_table_size_); + } + + HashDictionary dict_; +}; + +// ---------------------------------------------------------------------- +// Hash table for boolean types + +template +class HashTableKernel> : public HashTable { + public: + HashTableKernel(const std::shared_ptr& type, MemoryPool* pool) + : HashTable(type, pool) { + std::fill(table_, table_ + 2, kHashSlotEmpty); + } + + Status Append(const ArrayData& arr) override { + auto action = static_cast(this); + + RETURN_NOT_OK(action->Reserve(arr.length)); + + internal::BitmapReader 
value_reader(arr.buffers[1]->data(), arr.offset, arr.length); + +#define HASH_INNER_LOOP() \ + if (slot == kHashSlotEmpty) { \ + if (!Action::allow_expand) { \ + throw HashException("Encountered new dictionary value"); \ + } \ + table_[j] = slot = static_cast(dict_.size()); \ + dict_.push_back(value); \ + action->ObserveNotFound(slot); \ + } else { \ + action->ObserveFound(slot); \ + } + + if (arr.null_count != 0) { + internal::BitmapReader valid_reader(arr.buffers[0]->data(), arr.offset, arr.length); + for (int64_t i = 0; i < arr.length; ++i) { + const bool is_null = valid_reader.IsNotSet(); + const bool value = value_reader.IsSet(); + const int j = value ? 1 : 0; + hash_slot_t slot = table_[j]; + valid_reader.Next(); + value_reader.Next(); + if (is_null) { + action->ObserveNull(); + continue; + } + HASH_INNER_LOOP(); + } + } else { + for (int64_t i = 0; i < arr.length; ++i) { + const bool value = value_reader.IsSet(); + const int j = value ? 1 : 0; + hash_slot_t slot = table_[j]; + value_reader.Next(); + HASH_INNER_LOOP(); + } + } + +#undef HASH_INNER_LOOP + + return Status::OK(); + } + + Status GetDictionary(std::shared_ptr* out) override { + BooleanBuilder builder(pool_); + for (const bool value : dict_) { + RETURN_NOT_OK(builder.Append(value)); + } + return builder.FinishInternal(out); + } + + private: + hash_slot_t table_[2]; + std::vector dict_; +}; + +// ---------------------------------------------------------------------- +// Hash table pass for variable-length binary types + +template +class HashTableKernel> : public HashTable { + public: + HashTableKernel(const std::shared_ptr& type, MemoryPool* pool) + : HashTable(type, pool), dict_offsets_(pool), dict_data_(pool), dict_size_(0) {} + + Status Init() { + RETURN_NOT_OK(dict_offsets_.Resize(kInitialHashTableSize)); + + // We append the end offset after each append to the dictionary, so this + // sets the initial condition for the length-0 case + // + // initial offsets (dict size == 0): 0 + // after 1st 
dict entry of length 3: 0 3 + // after 2nd dict entry of length 4: 0 3 7 + RETURN_NOT_OK(dict_offsets_.Append(0)); + return HashTable::Init(kInitialHashTableSize); + } + + Status Append(const ArrayData& arr) override { + if (!initialized_) { + RETURN_NOT_OK(Init()); + } + + const int32_t* offsets = GetValues(arr, 1); + const uint8_t* data = GetValues(arr, 2); + + auto action = static_cast(this); + RETURN_NOT_OK(action->Reserve(arr.length)); + +#define HASH_INNER_LOOP() \ + const int32_t position = offsets[i]; \ + const int32_t length = offsets[i + 1] - position; \ + const uint8_t* value = data + position; \ + \ + int64_t j = HashValue(value, length) & mod_bitmask_; \ + hash_slot_t slot = hash_slots_[j]; \ + \ + const int32_t* dict_offsets = dict_offsets_.data(); \ + const uint8_t* dict_data = dict_data_.data(); \ + while (kHashSlotEmpty != slot && \ + !((dict_offsets[slot + 1] - dict_offsets[slot]) == length && \ + 0 == memcmp(value, dict_data + dict_offsets[slot], length))) { \ + ++j; \ + if (ARROW_PREDICT_FALSE(j == hash_table_size_)) { \ + j = 0; \ + } \ + slot = hash_slots_[j]; \ + } \ + \ + if (slot == kHashSlotEmpty) { \ + if (!Action::allow_expand) { \ + throw HashException("Encountered new dictionary value"); \ + } \ + \ + slot = dict_size_++; \ + hash_slots_[j] = slot; \ + \ + RETURN_NOT_OK(dict_data_.Append(value, length)); \ + RETURN_NOT_OK(dict_offsets_.Append(static_cast(dict_data_.length()))); \ + \ + action->ObserveNotFound(slot); \ + \ + if (ARROW_PREDICT_FALSE(dict_size_ > hash_table_load_threshold_)) { \ + RETURN_NOT_OK(action->DoubleSize()); \ + } \ + } else { \ + action->ObserveFound(slot); \ + } + + GENERIC_HASH_PASS(HASH_INNER_LOOP); + +#undef HASH_INNER_LOOP + + return Status::OK(); + } + + Status GetDictionary(std::shared_ptr* out) override { + // TODO(wesm): handle null being in the dictionary + BufferVector buffers = {nullptr, nullptr, nullptr}; + + RETURN_NOT_OK(dict_offsets_.Finish(&buffers[1])); + 
RETURN_NOT_OK(dict_data_.Finish(&buffers[2])); + + *out = ArrayData::Make(type_, dict_size_, std::move(buffers), 0); + return Status::OK(); + } + + protected: + int64_t HashValue(const uint8_t* data, int32_t length) const { + return HashUtil::Hash(data, length, 0); + } + + Status DoubleTableSize() { +#define VARBYTES_SETUP \ + const int32_t* dict_offsets = dict_offsets_.data(); \ + const uint8_t* dict_data = dict_data_.data() + +#define VARBYTES_COMPUTE_HASH \ + const int32_t length = dict_offsets[index + 1] - dict_offsets[index]; \ + const uint8_t* value = dict_data + dict_offsets[index]; \ + int64_t j = HashValue(value, length) & new_mod_bitmask + + DOUBLE_TABLE_SIZE(VARBYTES_SETUP, VARBYTES_COMPUTE_HASH); + +#undef VARBYTES_SETUP +#undef VARBYTES_COMPUTE_HASH + + return Status::OK(); + } + + TypedBufferBuilder dict_offsets_; + TypedBufferBuilder dict_data_; + int32_t dict_size_; +}; + +// ---------------------------------------------------------------------- +// Hash table pass for fixed size binary types + +template +class HashTableKernel> + : public HashTable { + public: + HashTableKernel(const std::shared_ptr& type, MemoryPool* pool) + : HashTable(type, pool), dict_data_(pool), dict_size_(0) { + const auto& fw_type = static_cast(*type); + byte_width_ = fw_type.bit_width() / 8; + } + + Status Init() { + RETURN_NOT_OK(dict_data_.Resize(kInitialHashTableSize * byte_width_)); + return HashTable::Init(kInitialHashTableSize); + } + + Status Append(const ArrayData& arr) override { + if (!initialized_) { + RETURN_NOT_OK(Init()); + } + + const uint8_t* data = GetValues(arr, 1); + + auto action = static_cast(this); + RETURN_NOT_OK(action->Reserve(arr.length)); + +#define HASH_INNER_LOOP() \ + const uint8_t* value = data + i * byte_width_; \ + int64_t j = HashValue(value) & mod_bitmask_; \ + hash_slot_t slot = hash_slots_[j]; \ + \ + const uint8_t* dict_data = dict_data_.data(); \ + while (kHashSlotEmpty != slot && \ + !(0 == memcmp(value, dict_data + slot * 
byte_width_, byte_width_))) { \ + ++j; \ + if (ARROW_PREDICT_FALSE(j == hash_table_size_)) { \ + j = 0; \ + } \ + slot = hash_slots_[j]; \ + } \ + \ + if (slot == kHashSlotEmpty) { \ + if (!Action::allow_expand) { \ + throw HashException("Encountered new dictionary value"); \ + } \ + \ + slot = dict_size_++; \ + hash_slots_[j] = slot; \ + \ + RETURN_NOT_OK(dict_data_.Append(value, byte_width_)); \ + \ + action->ObserveNotFound(slot); \ + \ + if (ARROW_PREDICT_FALSE(dict_size_ > hash_table_load_threshold_)) { \ + RETURN_NOT_OK(action->DoubleSize()); \ + } \ + } else { \ + action->ObserveFound(slot); \ + } + + GENERIC_HASH_PASS(HASH_INNER_LOOP); + +#undef HASH_INNER_LOOP + + return Status::OK(); + } + + Status GetDictionary(std::shared_ptr* out) override { + // TODO(wesm): handle null being in the dictionary + BufferVector buffers = {nullptr, nullptr}; + RETURN_NOT_OK(dict_data_.Finish(&buffers[1])); + + *out = ArrayData::Make(type_, dict_size_, std::move(buffers), 0); + return Status::OK(); + } + + protected: + int64_t HashValue(const uint8_t* data) const { + return HashUtil::Hash(data, byte_width_, 0); + } + + Status DoubleTableSize() { +#define FIXED_BYTES_SETUP const uint8_t* dict_data = dict_data_.data() + +#define FIXED_BYTES_COMPUTE_HASH \ + int64_t j = HashValue(dict_data + index * byte_width_) & new_mod_bitmask + + DOUBLE_TABLE_SIZE(FIXED_BYTES_SETUP, FIXED_BYTES_COMPUTE_HASH); + +#undef FIXED_BYTES_SETUP +#undef FIXED_BYTES_COMPUTE_HASH + + return Status::OK(); + } + + int32_t byte_width_; + TypedBufferBuilder dict_data_; + int32_t dict_size_; +}; + +// ---------------------------------------------------------------------- +// Unique implementation + +template +class UniqueImpl : public HashTableKernel> { + public: + static constexpr bool allow_expand = true; + using Base = HashTableKernel>; + using Base::Base; + + Status Reserve(const int64_t length) { return Status::OK(); } + + void ObserveFound(const hash_slot_t slot) {} + void ObserveNull() {} + void 
ObserveNotFound(const hash_slot_t slot) {} + + Status DoubleSize() { return Base::DoubleTableSize(); } + + Status Append(const ArrayData& input) override { return Base::Append(input); } + + Status Flush(Datum* out) override { + // No-op + return Status::OK(); + } +}; + +// ---------------------------------------------------------------------- +// Dictionary encode implementation + +template +class DictEncodeImpl : public HashTableKernel> { + public: + static constexpr bool allow_expand = true; + using Base = HashTableKernel; + + DictEncodeImpl(const std::shared_ptr& type, MemoryPool* pool) + : Base(type, pool), indices_builder_(pool) {} + + Status Reserve(const int64_t length) { return indices_builder_.Reserve(length); } + + void ObserveNull() { indices_builder_.UnsafeAppendToBitmap(false); } + + void ObserveFound(const hash_slot_t slot) { indices_builder_.UnsafeAppend(slot); } + + void ObserveNotFound(const hash_slot_t slot) { return ObserveFound(slot); } + + Status DoubleSize() { return Base::DoubleTableSize(); } + + Status Flush(Datum* out) override { + std::shared_ptr result; + RETURN_NOT_OK(indices_builder_.FinishInternal(&result)); + out->value = std::move(result); + return Status::OK(); + } + + using Base::Append; + + private: + Int32Builder indices_builder_; +}; + +// ---------------------------------------------------------------------- +// Kernel wrapper for generic hash table kernels + +class HashKernelImpl : public HashKernel { + public: + explicit HashKernelImpl(std::unique_ptr hasher) + : hasher_(std::move(hasher)) {} + + Status Call(FunctionContext* ctx, const Datum& input, Datum* out) override { + DCHECK_EQ(Datum::ARRAY, input.kind()); + RETURN_NOT_OK(Append(ctx, *input.array())); + return Flush(out); + } + + Status Append(FunctionContext* ctx, const ArrayData& input) override { + std::lock_guard guard(lock_); + try { + RETURN_NOT_OK(hasher_->Append(input)); + } catch (const HashException& e) { + return Status(e.code(), e.what()); + } + return 
Status::OK(); + } + + Status Flush(Datum* out) override { return hasher_->Flush(out); } + + Status GetDictionary(std::shared_ptr* out) override { + return hasher_->GetDictionary(out); + } + + private: + std::mutex lock_; + std::unique_ptr hasher_; +}; + +} // namespace + +Status GetUniqueKernel(FunctionContext* ctx, const std::shared_ptr& type, + std::unique_ptr* out) { + std::unique_ptr hasher; + +#define UNIQUE_CASE(InType) \ + case InType::type_id: \ + hasher.reset(new UniqueImpl(type, ctx->memory_pool())); \ + break + + switch (type->id()) { + UNIQUE_CASE(NullType); + UNIQUE_CASE(BooleanType); + UNIQUE_CASE(UInt8Type); + UNIQUE_CASE(Int8Type); + UNIQUE_CASE(UInt16Type); + UNIQUE_CASE(Int16Type); + UNIQUE_CASE(UInt32Type); + UNIQUE_CASE(Int32Type); + UNIQUE_CASE(UInt64Type); + UNIQUE_CASE(Int64Type); + UNIQUE_CASE(FloatType); + UNIQUE_CASE(DoubleType); + UNIQUE_CASE(Date32Type); + UNIQUE_CASE(Date64Type); + UNIQUE_CASE(Time32Type); + UNIQUE_CASE(Time64Type); + UNIQUE_CASE(TimestampType); + UNIQUE_CASE(BinaryType); + UNIQUE_CASE(StringType); + UNIQUE_CASE(FixedSizeBinaryType); + UNIQUE_CASE(Decimal128Type); + default: + break; + } + +#undef UNIQUE_CASE + + CHECK_IMPLEMENTED(hasher, "unique", type); + out->reset(new HashKernelImpl(std::move(hasher))); + return Status::OK(); +} + +Status GetDictionaryEncodeKernel(FunctionContext* ctx, + const std::shared_ptr& type, + std::unique_ptr* out) { + std::unique_ptr hasher; + +#define DICTIONARY_ENCODE_CASE(InType) \ + case InType::type_id: \ + hasher.reset(new DictEncodeImpl(type, ctx->memory_pool())); \ + break + + switch (type->id()) { + DICTIONARY_ENCODE_CASE(NullType); + DICTIONARY_ENCODE_CASE(BooleanType); + DICTIONARY_ENCODE_CASE(UInt8Type); + DICTIONARY_ENCODE_CASE(Int8Type); + DICTIONARY_ENCODE_CASE(UInt16Type); + DICTIONARY_ENCODE_CASE(Int16Type); + DICTIONARY_ENCODE_CASE(UInt32Type); + DICTIONARY_ENCODE_CASE(Int32Type); + DICTIONARY_ENCODE_CASE(UInt64Type); + DICTIONARY_ENCODE_CASE(Int64Type); + 
DICTIONARY_ENCODE_CASE(FloatType); + DICTIONARY_ENCODE_CASE(DoubleType); + DICTIONARY_ENCODE_CASE(Date32Type); + DICTIONARY_ENCODE_CASE(Date64Type); + DICTIONARY_ENCODE_CASE(Time32Type); + DICTIONARY_ENCODE_CASE(Time64Type); + DICTIONARY_ENCODE_CASE(TimestampType); + DICTIONARY_ENCODE_CASE(BinaryType); + DICTIONARY_ENCODE_CASE(StringType); + DICTIONARY_ENCODE_CASE(FixedSizeBinaryType); + DICTIONARY_ENCODE_CASE(Decimal128Type); + default: + break; + } + +#undef DICTIONARY_ENCODE_CASE + + CHECK_IMPLEMENTED(hasher, "dictionary-encode", type); + out->reset(new HashKernelImpl(std::move(hasher))); + return Status::OK(); +} + +namespace { + +Status InvokeHash(FunctionContext* ctx, HashKernel* func, const Datum& value, + std::vector* kernel_outputs, + std::shared_ptr* dictionary) { + RETURN_NOT_OK(detail::InvokeUnaryArrayKernel(ctx, func, value, kernel_outputs)); + + std::shared_ptr dict_data; + RETURN_NOT_OK(func->GetDictionary(&dict_data)); + *dictionary = MakeArray(dict_data); + return Status::OK(); +} + +} // namespace + +Status Unique(FunctionContext* ctx, const Datum& value, std::shared_ptr* out) { + std::unique_ptr func; + RETURN_NOT_OK(GetUniqueKernel(ctx, value.type(), &func)); + + std::vector dummy_outputs; + return InvokeHash(ctx, func.get(), value, &dummy_outputs, out); +} + +Status DictionaryEncode(FunctionContext* ctx, const Datum& value, Datum* out) { + std::unique_ptr func; + RETURN_NOT_OK(GetDictionaryEncodeKernel(ctx, value.type(), &func)); + + std::shared_ptr dictionary; + std::vector indices_outputs; + RETURN_NOT_OK(InvokeHash(ctx, func.get(), value, &indices_outputs, &dictionary)); + + // Create the dictionary type + DCHECK_EQ(indices_outputs[0].kind(), Datum::ARRAY); + std::shared_ptr dict_type = + ::arrow::dictionary(indices_outputs[0].array()->type, dictionary); + + // Create DictionaryArray for each piece yielded by the kernel invocations + std::vector> dict_chunks; + for (const Datum& datum : indices_outputs) { + dict_chunks.emplace_back( + 
std::make_shared(dict_type, MakeArray(datum.array()))); + } + + *out = detail::WrapArraysLike(value, dict_chunks); + return Status::OK(); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/hash.h b/cpp/src/arrow/compute/kernels/hash.h new file mode 100644 index 0000000000000..05f242949893f --- /dev/null +++ b/cpp/src/arrow/compute/kernels/hash.h @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_COMPUTE_KERNELS_HASH_H +#define ARROW_COMPUTE_KERNELS_HASH_H + +#include +#include + +#include "arrow/compute/kernel.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace compute { + +class FunctionContext; + +/// \brief Invoke hash table kernel on input array, returning any output +/// values. 
Implementations should be thread-safe +class ARROW_EXPORT HashKernel : public UnaryKernel { + public: + virtual Status Append(FunctionContext* ctx, const ArrayData& input) = 0; + virtual Status Flush(Datum* out) = 0; + virtual Status GetDictionary(std::shared_ptr* out) = 0; +}; + +/// \since 0.8.0 +/// \note API not yet finalized +ARROW_EXPORT +Status GetUniqueKernel(FunctionContext* ctx, const std::shared_ptr& type, + std::unique_ptr* kernel); + +ARROW_EXPORT +Status GetDictionaryEncodeKernel(FunctionContext* ctx, + const std::shared_ptr& type, + std::unique_ptr* kernel); + +/// \brief Compute unique elements from an array-like object +/// \param[in] context the FunctionContext +/// \param[in] datum array-like input +/// \param[out] out result as Array +/// +/// \since 0.8.0 +/// \note API not yet finalized +ARROW_EXPORT +Status Unique(FunctionContext* context, const Datum& datum, std::shared_ptr* out); + +/// \brief Dictionary-encode values in an array-like object +/// \param[in] context the FunctionContext +/// \param[in] data array-like input +/// \param[out] out result with same shape and type as input +/// +/// \since 0.8.0 +/// \note API not yet finalized +ARROW_EXPORT +Status DictionaryEncode(FunctionContext* context, const Datum& data, Datum* out); + +// TODO(wesm): Define API for incremental dictionary encoding + +// TODO(wesm): Define API for regularizing DictionaryArray objects with +// different dictionaries + +// class DictionaryEncoder { +// public: +// virtual Encode(const Datum& data, Datum* out) = 0; +// }; + +// +// ARROW_EXPORT +// Status DictionaryEncode(FunctionContext* context, const Datum& data, +// const Array& prior_dictionary, Datum* out); + +// TODO(wesm): Implement these next +// ARROW_EXPORT +// Status Match(FunctionContext* context, const Datum& values, const Datum& member_set, +// Datum* out); + +// ARROW_EXPORT +// Status IsIn(FunctionContext* context, const Datum& values, const Datum& member_set, +// Datum* out); + +// ARROW_EXPORT 
+// Status CountValues(FunctionContext* context, const Datum& values, +// std::shared_ptr* out_uniques, +// std::shared_ptr* out_counts); + +} // namespace compute +} // namespace arrow + +#endif // ARROW_COMPUTE_KERNELS_HASH_H diff --git a/cpp/src/arrow/compute/kernels/util-internal.cc b/cpp/src/arrow/compute/kernels/util-internal.cc new file mode 100644 index 0000000000000..28428bfcba6c6 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/util-internal.cc @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/compute/kernels/util-internal.h" + +#include + +#include "arrow/array.h" +#include "arrow/status.h" +#include "arrow/util/logging.h" + +#include "arrow/compute/context.h" +#include "arrow/compute/kernel.h" + +namespace arrow { +namespace compute { +namespace detail { + +Status InvokeUnaryArrayKernel(FunctionContext* ctx, UnaryKernel* kernel, + const Datum& value, std::vector* outputs) { + if (value.kind() == Datum::ARRAY) { + Datum output; + RETURN_NOT_OK(kernel->Call(ctx, value, &output)); + outputs->push_back(output); + } else if (value.kind() == Datum::CHUNKED_ARRAY) { + const ChunkedArray& array = *value.chunked_array(); + for (int i = 0; i < array.num_chunks(); i++) { + Datum output; + RETURN_NOT_OK(kernel->Call(ctx, Datum(array.chunk(i)), &output)); + outputs->push_back(output); + } + } else { + return Status::Invalid("Input Datum was not array-like"); + } + return Status::OK(); +} + +Datum WrapArraysLike(const Datum& value, + const std::vector>& arrays) { + // Create right kind of datum + if (value.kind() == Datum::ARRAY) { + return Datum(arrays[0]->data()); + } else if (value.kind() == Datum::CHUNKED_ARRAY) { + return Datum(std::make_shared(arrays)); + } else { + DCHECK(false) << "unhandled datum kind"; + return Datum(); + } +} + +Datum WrapDatumsLike(const Datum& value, const std::vector& datums) { + // Create right kind of datum + if (value.kind() == Datum::ARRAY) { + DCHECK_EQ(1, datums.size()); + return Datum(datums[0].array()); + } else if (value.kind() == Datum::CHUNKED_ARRAY) { + std::vector> arrays; + for (const Datum& datum : datums) { + DCHECK_EQ(Datum::ARRAY, datum.kind()); + arrays.emplace_back(MakeArray(datum.array())); + } + return Datum(std::make_shared(arrays)); + } else { + DCHECK(false) << "unhandled datum kind"; + return Datum(); + } +} + +} // namespace detail +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/util-internal.h b/cpp/src/arrow/compute/kernels/util-internal.h new file 
mode 100644 index 0000000000000..7633fed4a8fe7 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/util-internal.h @@ -0,0 +1,109 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_COMPUTE_KERNELS_UTIL_INTERNAL_H +#define ARROW_COMPUTE_KERNELS_UTIL_INTERNAL_H + +#include + +#include "arrow/compute/kernel.h" +#include "arrow/type_fwd.h" + +namespace arrow { +namespace compute { + +class FunctionContext; + +template +using is_number = std::is_base_of; + +template +using enable_if_primitive_ctype = + typename std::enable_if::value>::type; + +template +using enable_if_date = typename std::enable_if::value>::type; + +template +using enable_if_time = typename std::enable_if::value>::type; + +template +using enable_if_timestamp = + typename std::enable_if::value>::type; + +template +using enable_if_has_c_type = + typename std::enable_if::value || + std::is_base_of::value || + std::is_base_of::value || + std::is_base_of::value>::type; + +template +using enable_if_null = typename std::enable_if::value>::type; + +template +using enable_if_binary = + typename std::enable_if::value>::type; + +template +using enable_if_boolean = + typename std::enable_if::value>::type; + +template +using enable_if_fixed_size_binary 
= + typename std::enable_if::value>::type; + +template +using enable_if_list = typename std::enable_if::value>::type; + +template +using enable_if_number = typename std::enable_if::value>::type; + +template +inline const T* GetValues(const ArrayData& data, int i) { + return reinterpret_cast(data.buffers[i]->data()) + data.offset; +} + +template +inline T* GetMutableValues(const ArrayData* data, int i) { + return reinterpret_cast(data->buffers[i]->mutable_data()) + data->offset; +} + +static inline void CopyData(const ArrayData& input, ArrayData* output) { + output->length = input.length; + output->null_count = input.null_count; + output->buffers = input.buffers; + output->offset = input.offset; + output->child_data = input.child_data; +} + +namespace detail { + +Status InvokeUnaryArrayKernel(FunctionContext* ctx, UnaryKernel* kernel, + const Datum& value, std::vector* outputs); + +Datum WrapArraysLike(const Datum& value, + const std::vector>& arrays); + +Datum WrapDatumsLike(const Datum& value, const std::vector& datums); + +} // namespace detail + +} // namespace compute +} // namespace arrow + +#endif // ARROW_COMPUTE_KERNELS_UTIL_INTERNAL_H diff --git a/cpp/src/arrow/gpu/CMakeLists.txt b/cpp/src/arrow/gpu/CMakeLists.txt index 3f3069b91974b..be474131e3c72 100644 --- a/cpp/src/arrow/gpu/CMakeLists.txt +++ b/cpp/src/arrow/gpu/CMakeLists.txt @@ -32,9 +32,6 @@ set(ARROW_GPU_SRCS cuda_memory.cc ) -add_custom_target(arrow_gpu_sources DEPENDS ${ARROW_GPU_SRCS}) -add_dependencies(arrow_gpu_sources metadata_fbs) - set(ARROW_GPU_SHARED_LINK_LIBS arrow_shared ${CUDA_LIBRARIES} @@ -43,6 +40,7 @@ set(ARROW_GPU_SHARED_LINK_LIBS ADD_ARROW_LIB(arrow_gpu SOURCES ${ARROW_GPU_SRCS} + DEPENDENCIES metadata_fbs SHARED_LINK_FLAGS "" SHARED_LINK_LIBS ${ARROW_GPU_SHARED_LINK_LIBS} STATIC_LINK_LIBS "" @@ -54,7 +52,7 @@ configure_file(cuda_version.h.in @ONLY) install(FILES - "${CMAKE_CURRENT_SOURCE_DIR}/cuda_version.h" + "${CMAKE_CURRENT_BINARY_DIR}/cuda_version.h" DESTINATION 
"${CMAKE_INSTALL_INCLUDEDIR}/arrow/gpu") install(FILES diff --git a/cpp/src/arrow/gpu/cuda-test.cc b/cpp/src/arrow/gpu/cuda-test.cc index afdc3020e8581..7595f8bec7912 100644 --- a/cpp/src/arrow/gpu/cuda-test.cc +++ b/cpp/src/arrow/gpu/cuda-test.cc @@ -55,6 +55,7 @@ TEST_F(TestCudaBuffer, Allocate) { std::shared_ptr buffer; ASSERT_OK(context_->Allocate(kSize, &buffer)); ASSERT_EQ(kSize, buffer->size()); + ASSERT_EQ(kSize, context_->bytes_allocated()); } void AssertCudaBufferEquals(const CudaBuffer& buffer, const uint8_t* host_data, diff --git a/cpp/src/arrow/gpu/cuda_arrow_ipc.cc b/cpp/src/arrow/gpu/cuda_arrow_ipc.cc index 022268e034758..a7262c8b4d4ba 100644 --- a/cpp/src/arrow/gpu/cuda_arrow_ipc.cc +++ b/cpp/src/arrow/gpu/cuda_arrow_ipc.cc @@ -27,8 +27,8 @@ #include "arrow/ipc/message.h" #include "arrow/ipc/reader.h" #include "arrow/ipc/writer.h" +#include "arrow/record_batch.h" #include "arrow/status.h" -#include "arrow/table.h" #include "arrow/util/visibility.h" #include "arrow/gpu/cuda_context.h" diff --git a/cpp/src/arrow/gpu/cuda_context.cc b/cpp/src/arrow/gpu/cuda_context.cc index 42315cd509c3c..2f5ccb0a95063 100644 --- a/cpp/src/arrow/gpu/cuda_context.cc +++ b/cpp/src/arrow/gpu/cuda_context.cc @@ -64,31 +64,32 @@ class CudaContext::CudaContextImpl { CUdeviceptr data; CU_RETURN_NOT_OK(cuMemAlloc(&data, static_cast(nbytes))); + bytes_allocated_ += nbytes; *out = reinterpret_cast(data); return Status::OK(); } - Status CopyHostToDevice(uint8_t* dst, const uint8_t* src, int64_t nbytes) { + Status CopyHostToDevice(void* dst, const void* src, int64_t nbytes) { CU_RETURN_NOT_OK(cuCtxSetCurrent(context_)); - CU_RETURN_NOT_OK(cuMemcpyHtoD(reinterpret_cast(dst), - reinterpret_cast(src), + CU_RETURN_NOT_OK(cuMemcpyHtoD(reinterpret_cast(dst), src, static_cast(nbytes))); return Status::OK(); } - Status CopyDeviceToHost(uint8_t* dst, const uint8_t* src, int64_t nbytes) { + Status CopyDeviceToHost(void* dst, const void* src, int64_t nbytes) { 
CU_RETURN_NOT_OK(cuCtxSetCurrent(context_)); CU_RETURN_NOT_OK(cuMemcpyDtoH(dst, reinterpret_cast(src), static_cast(nbytes))); return Status::OK(); } - Status Free(uint8_t* device_ptr, int64_t nbytes) { + Status Free(void* device_ptr, int64_t nbytes) { CU_RETURN_NOT_OK(cuMemFree(reinterpret_cast(device_ptr))); + bytes_allocated_ -= nbytes; return Status::OK(); } - Status ExportIpcBuffer(uint8_t* data, std::unique_ptr* handle) { + Status ExportIpcBuffer(void* data, std::unique_ptr* handle) { CU_RETURN_NOT_OK(cuCtxSetCurrent(context_)); CUipcMemHandle cu_handle; CU_RETURN_NOT_OK(cuIpcGetMemHandle(&cu_handle, reinterpret_cast(data))); @@ -143,7 +144,7 @@ class CudaDeviceManager::CudaDeviceManagerImpl { return Status::OK(); } - Status FreeHost(uint8_t* data, int64_t nbytes) { + Status FreeHost(void* data, int64_t nbytes) { CU_RETURN_NOT_OK(cuMemFreeHost(data)); host_bytes_allocated_ -= nbytes; return Status::OK(); @@ -219,7 +220,7 @@ Status CudaDeviceManager::AllocateHost(int64_t nbytes, return Status::OK(); } -Status CudaDeviceManager::FreeHost(uint8_t* data, int64_t nbytes) { +Status CudaDeviceManager::FreeHost(void* data, int64_t nbytes) { return impl_->FreeHost(data, nbytes); } @@ -239,22 +240,22 @@ Status CudaContext::Allocate(int64_t nbytes, std::shared_ptr* out) { return Status::OK(); } -Status CudaContext::ExportIpcBuffer(uint8_t* data, +Status CudaContext::ExportIpcBuffer(void* data, std::unique_ptr* handle) { return impl_->ExportIpcBuffer(data, handle); } -Status CudaContext::CopyHostToDevice(uint8_t* dst, const uint8_t* src, int64_t nbytes) { +Status CudaContext::CopyHostToDevice(void* dst, const void* src, int64_t nbytes) { return impl_->CopyHostToDevice(dst, src, nbytes); } -Status CudaContext::CopyDeviceToHost(uint8_t* dst, const uint8_t* src, int64_t nbytes) { +Status CudaContext::CopyDeviceToHost(void* dst, const void* src, int64_t nbytes) { return impl_->CopyDeviceToHost(dst, src, nbytes); } Status CudaContext::Close() { return impl_->Close(); } -Status 
CudaContext::Free(uint8_t* device_ptr, int64_t nbytes) { +Status CudaContext::Free(void* device_ptr, int64_t nbytes) { return impl_->Free(device_ptr, nbytes); } @@ -273,5 +274,7 @@ Status CudaContext::OpenIpcBuffer(const CudaIpcMemHandle& ipc_handle, return Status::OK(); } +int64_t CudaContext::bytes_allocated() const { return impl_->bytes_allocated(); } + } // namespace gpu } // namespace arrow diff --git a/cpp/src/arrow/gpu/cuda_context.h b/cpp/src/arrow/gpu/cuda_context.h index 6471059612349..6fc2e0d08abc9 100644 --- a/cpp/src/arrow/gpu/cuda_context.h +++ b/cpp/src/arrow/gpu/cuda_context.h @@ -46,7 +46,7 @@ class ARROW_EXPORT CudaDeviceManager { Status AllocateHost(int64_t nbytes, std::shared_ptr* buffer); - Status FreeHost(uint8_t* data, int64_t nbytes); + Status FreeHost(void* data, int64_t nbytes); int num_devices() const; @@ -88,10 +88,10 @@ class ARROW_EXPORT CudaContext : public std::enable_shared_from_this* handle); - Status CopyHostToDevice(uint8_t* dst, const uint8_t* src, int64_t nbytes); - Status CopyDeviceToHost(uint8_t* dst, const uint8_t* src, int64_t nbytes); - Status Free(uint8_t* device_ptr, int64_t nbytes); + Status ExportIpcBuffer(void* data, std::unique_ptr* handle); + Status CopyHostToDevice(void* dst, const void* src, int64_t nbytes); + Status CopyDeviceToHost(void* dst, const void* src, int64_t nbytes); + Status Free(void* device_ptr, int64_t nbytes); class CudaContextImpl; std::unique_ptr impl_; diff --git a/cpp/src/arrow/gpu/cuda_memory.cc b/cpp/src/arrow/gpu/cuda_memory.cc index 949c1d7a4dd41..cbf044121a644 100644 --- a/cpp/src/arrow/gpu/cuda_memory.cc +++ b/cpp/src/arrow/gpu/cuda_memory.cc @@ -101,11 +101,11 @@ CudaBuffer::CudaBuffer(const std::shared_ptr& parent, const int64_t is_ipc_(false) {} Status CudaBuffer::CopyToHost(const int64_t position, const int64_t nbytes, - uint8_t* out) const { + void* out) const { return context_->CopyDeviceToHost(out, data_ + position, nbytes); } -Status CudaBuffer::CopyFromHost(const int64_t 
position, const uint8_t* data, +Status CudaBuffer::CopyFromHost(const int64_t position, const void* data, int64_t nbytes) { DCHECK_LE(nbytes, size_ - position) << "Copy would overflow buffer"; return context_->CopyHostToDevice(mutable_data_ + position, data, nbytes); @@ -134,7 +134,7 @@ CudaBufferReader::CudaBufferReader(const std::shared_ptr& buffer) CudaBufferReader::~CudaBufferReader() {} -Status CudaBufferReader::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) { +Status CudaBufferReader::Read(int64_t nbytes, int64_t* bytes_read, void* buffer) { nbytes = std::min(nbytes, size_ - position_); *bytes_read = nbytes; RETURN_NOT_OK(context_->CopyDeviceToHost(buffer, data_ + position_, nbytes)); @@ -190,7 +190,7 @@ class CudaBufferWriter::CudaBufferWriterImpl { return Status::OK(); } - Status Write(const uint8_t* data, int64_t nbytes) { + Status Write(const void* data, int64_t nbytes) { if (nbytes == 0) { return Status::OK(); } @@ -214,7 +214,7 @@ class CudaBufferWriter::CudaBufferWriterImpl { return Status::OK(); } - Status WriteAt(int64_t position, const uint8_t* data, int64_t nbytes) { + Status WriteAt(int64_t position, const void* data, int64_t nbytes) { std::lock_guard guard(lock_); RETURN_NOT_OK(Seek(position)); return Write(data, nbytes); @@ -269,11 +269,11 @@ Status CudaBufferWriter::Seek(int64_t position) { Status CudaBufferWriter::Tell(int64_t* position) const { return impl_->Tell(position); } -Status CudaBufferWriter::Write(const uint8_t* data, int64_t nbytes) { +Status CudaBufferWriter::Write(const void* data, int64_t nbytes) { return impl_->Write(data, nbytes); } -Status CudaBufferWriter::WriteAt(int64_t position, const uint8_t* data, int64_t nbytes) { +Status CudaBufferWriter::WriteAt(int64_t position, const void* data, int64_t nbytes) { return impl_->WriteAt(position, data, nbytes); } diff --git a/cpp/src/arrow/gpu/cuda_memory.h b/cpp/src/arrow/gpu/cuda_memory.h index 9ebd2ccf0c77e..9376b4b3ffcb9 100644 --- a/cpp/src/arrow/gpu/cuda_memory.h 
+++ b/cpp/src/arrow/gpu/cuda_memory.h @@ -49,14 +49,14 @@ class ARROW_EXPORT CudaBuffer : public Buffer { /// \brief Copy memory from GPU device to CPU host /// \param[out] out a pre-allocated output buffer /// \return Status - Status CopyToHost(const int64_t position, const int64_t nbytes, uint8_t* out) const; + Status CopyToHost(const int64_t position, const int64_t nbytes, void* out) const; /// \brief Copy memory to device at position /// \param[in] position start position to copy bytes /// \param[in] data the host data to copy /// \param[in] nbytes number of bytes to copy /// \return Status - Status CopyFromHost(const int64_t position, const uint8_t* data, int64_t nbytes); + Status CopyFromHost(const int64_t position, const void* data, int64_t nbytes); /// \brief Expose this device buffer as IPC memory which can be used in other processes /// \param[out] handle the exported IPC handle @@ -130,7 +130,7 @@ class ARROW_EXPORT CudaBufferReader : public io::BufferReader { /// \param[in] nbytes number of bytes to read /// \param[out] bytes_read actual number of bytes read /// \param[out] buffer pre-allocated memory to write into - Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override; + Status Read(int64_t nbytes, int64_t* bytes_read, void* buffer) override; /// \brief Zero-copy read from device memory /// \param[in] nbytes number of bytes to read @@ -158,9 +158,9 @@ class ARROW_EXPORT CudaBufferWriter : public io::WriteableFile { Status Seek(int64_t position) override; - Status Write(const uint8_t* data, int64_t nbytes) override; + Status Write(const void* data, int64_t nbytes) override; - Status WriteAt(int64_t position, const uint8_t* data, int64_t nbytes) override; + Status WriteAt(int64_t position, const void* data, int64_t nbytes) override; Status Tell(int64_t* position) const override; diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc index 74c6c09e6219b..65a302c8beac2 100644 --- a/cpp/src/arrow/io/file.cc +++ 
b/cpp/src/arrow/io/file.cc @@ -22,6 +22,21 @@ #define _FILE_OFFSET_BITS 64 +// define max read/write count +#if defined(_MSC_VER) +#define ARROW_MAX_IO_CHUNKSIZE INT32_MAX +#else + +#ifdef __APPLE__ +// due to macOS bug, we need to set read/write max +#define ARROW_MAX_IO_CHUNKSIZE INT32_MAX +#else +// see notes on Linux read/write manpage +#define ARROW_MAX_IO_CHUNKSIZE 0x7ffff000 +#endif + +#endif + #include "arrow/io/file.h" #if _WIN32 || _WIN64 @@ -238,39 +253,64 @@ static inline Status FileSeek(int fd, int64_t pos) { return Status::OK(); } -static inline Status FileRead(int fd, uint8_t* buffer, int64_t nbytes, +static inline Status FileRead(const int fd, uint8_t* buffer, const int64_t nbytes, int64_t* bytes_read) { + *bytes_read = 0; + + while (*bytes_read != -1 && *bytes_read < nbytes) { + int64_t chunksize = + std::min(static_cast(ARROW_MAX_IO_CHUNKSIZE), nbytes - *bytes_read); #if defined(_MSC_VER) - if (nbytes > INT32_MAX) { - return Status::IOError("Unable to read > 2GB blocks yet"); - } - *bytes_read = static_cast(_read(fd, buffer, static_cast(nbytes))); + int64_t ret = static_cast( + _read(fd, buffer + *bytes_read, static_cast(chunksize))); #else - *bytes_read = static_cast(read(fd, buffer, static_cast(nbytes))); + int64_t ret = static_cast( + read(fd, buffer + *bytes_read, static_cast(chunksize))); #endif + if (ret != -1) { + *bytes_read += ret; + if (ret < chunksize) { + // EOF + break; + } + } else { + *bytes_read = ret; + } + } + if (*bytes_read == -1) { - // TODO(wesm): errno to string - return Status::IOError("Error reading bytes from file"); + return Status::IOError(std::string("Error reading bytes from file: ") + + std::string(strerror(errno))); } return Status::OK(); } -static inline Status FileWrite(int fd, const uint8_t* buffer, int64_t nbytes) { - int ret; +static inline Status FileWrite(const int fd, const uint8_t* buffer, + const int64_t nbytes) { + int ret = 0; + int64_t bytes_written = 0; + + while (ret != -1 && bytes_written < nbytes) { 
+ int64_t chunksize = + std::min(static_cast(ARROW_MAX_IO_CHUNKSIZE), nbytes - bytes_written); #if defined(_MSC_VER) - if (nbytes > INT32_MAX) { - return Status::IOError("Unable to write > 2GB blocks to file yet"); - } - ret = static_cast(_write(fd, buffer, static_cast(nbytes))); + ret = static_cast( + _write(fd, buffer + bytes_written, static_cast(chunksize))); #else - ret = static_cast(write(fd, buffer, static_cast(nbytes))); + ret = static_cast( + write(fd, buffer + bytes_written, static_cast(chunksize))); #endif + if (ret != -1) { + bytes_written += ret; + } + } + if (ret == -1) { - // TODO(wesm): errno to string - return Status::IOError("Error writing bytes to file"); + return Status::IOError(std::string("Error writing bytes from file: ") + + std::string(strerror(errno))); } return Status::OK(); } @@ -354,11 +394,11 @@ class OSFile { return Status::OK(); } - Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) { - return FileRead(fd_, out, nbytes, bytes_read); + Status Read(int64_t nbytes, int64_t* bytes_read, void* out) { + return FileRead(fd_, reinterpret_cast(out), nbytes, bytes_read); } - Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* out) { + Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, void* out) { std::lock_guard guard(lock_); RETURN_NOT_OK(Seek(position)); return Read(nbytes, bytes_read, out); @@ -373,12 +413,12 @@ class OSFile { Status Tell(int64_t* pos) const { return FileTell(fd_, pos); } - Status Write(const uint8_t* data, int64_t length) { + Status Write(const void* data, int64_t length) { std::lock_guard guard(lock_); if (length < 0) { return Status::IOError("Length must be non-negative"); } - return FileWrite(fd_, data, length); + return FileWrite(fd_, reinterpret_cast(data), length); } int fd() const { return fd_; } @@ -464,13 +504,13 @@ Status ReadableFile::Close() { return impl_->Close(); } Status ReadableFile::Tell(int64_t* pos) const { return impl_->Tell(pos); } -Status 
ReadableFile::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) { +Status ReadableFile::Read(int64_t nbytes, int64_t* bytes_read, void* out) { std::lock_guard guard(impl_->lock()); return impl_->Read(nbytes, bytes_read, out); } Status ReadableFile::ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, - uint8_t* out) { + void* out) { return impl_->ReadAt(position, nbytes, bytes_read, out); } @@ -530,7 +570,7 @@ Status FileOutputStream::Close() { return impl_->Close(); } Status FileOutputStream::Tell(int64_t* pos) const { return impl_->Tell(pos); } -Status FileOutputStream::Write(const uint8_t* data, int64_t length) { +Status FileOutputStream::Write(const void* data, int64_t length) { return impl_->Write(data, length); } @@ -670,7 +710,7 @@ Status MemoryMappedFile::Close() { return Status::OK(); } -Status MemoryMappedFile::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) { +Status MemoryMappedFile::Read(int64_t nbytes, int64_t* bytes_read, void* out) { nbytes = std::max( 0, std::min(nbytes, memory_map_->size() - memory_map_->position())); if (nbytes > 0) { @@ -695,7 +735,7 @@ Status MemoryMappedFile::Read(int64_t nbytes, std::shared_ptr* out) { } Status MemoryMappedFile::ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, - uint8_t* out) { + void* out) { std::lock_guard guard(memory_map_->lock()); RETURN_NOT_OK(Seek(position)); return Read(nbytes, bytes_read, out); @@ -710,7 +750,7 @@ Status MemoryMappedFile::ReadAt(int64_t position, int64_t nbytes, bool MemoryMappedFile::supports_zero_copy() const { return true; } -Status MemoryMappedFile::WriteAt(int64_t position, const uint8_t* data, int64_t nbytes) { +Status MemoryMappedFile::WriteAt(int64_t position, const void* data, int64_t nbytes) { std::lock_guard guard(memory_map_->lock()); if (!memory_map_->opened() || !memory_map_->writable()) { @@ -721,7 +761,7 @@ Status MemoryMappedFile::WriteAt(int64_t position, const uint8_t* data, int64_t return WriteInternal(data, nbytes); } -Status 
MemoryMappedFile::Write(const uint8_t* data, int64_t nbytes) { +Status MemoryMappedFile::Write(const void* data, int64_t nbytes) { std::lock_guard guard(memory_map_->lock()); if (!memory_map_->opened() || !memory_map_->writable()) { @@ -733,7 +773,7 @@ Status MemoryMappedFile::Write(const uint8_t* data, int64_t nbytes) { return WriteInternal(data, nbytes); } -Status MemoryMappedFile::WriteInternal(const uint8_t* data, int64_t nbytes) { +Status MemoryMappedFile::WriteInternal(const void* data, int64_t nbytes) { memcpy(memory_map_->head(), data, static_cast(nbytes)); memory_map_->advance(nbytes); return Status::OK(); diff --git a/cpp/src/arrow/io/file.h b/cpp/src/arrow/io/file.h index 7937fea74997d..265df4d65225d 100644 --- a/cpp/src/arrow/io/file.h +++ b/cpp/src/arrow/io/file.h @@ -59,7 +59,7 @@ class ARROW_EXPORT FileOutputStream : public OutputStream { Status Tell(int64_t* position) const override; // Write bytes to the stream. Thread-safe - Status Write(const uint8_t* data, int64_t nbytes) override; + Status Write(const void* data, int64_t nbytes) override; int file_descriptor() const; @@ -93,12 +93,12 @@ class ARROW_EXPORT ReadableFile : public RandomAccessFile { Status Tell(int64_t* position) const override; // Read bytes from the file. Thread-safe - Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override; + Status Read(int64_t nbytes, int64_t* bytes_read, void* buffer) override; Status Read(int64_t nbytes, std::shared_ptr* out) override; /// \brief Thread-safe implementation of ReadAt Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, - uint8_t* out) override; + void* out) override; /// \brief Thread-safe implementation of ReadAt Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; @@ -141,13 +141,13 @@ class ARROW_EXPORT MemoryMappedFile : public ReadWriteFileInterface { Status Seek(int64_t position) override; // Required by RandomAccessFile, copies memory into out. 
Not thread-safe - Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) override; + Status Read(int64_t nbytes, int64_t* bytes_read, void* out) override; // Zero copy read. Not thread-safe Status Read(int64_t nbytes, std::shared_ptr* out) override; Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, - uint8_t* out) override; + void* out) override; /// Default implementation is thread-safe Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; @@ -155,10 +155,10 @@ class ARROW_EXPORT MemoryMappedFile : public ReadWriteFileInterface { bool supports_zero_copy() const override; /// Write data at the current position in the file. Thread-safe - Status Write(const uint8_t* data, int64_t nbytes) override; + Status Write(const void* data, int64_t nbytes) override; /// Write data at a particular position in the file. Thread-safe - Status WriteAt(int64_t position, const uint8_t* data, int64_t nbytes) override; + Status WriteAt(int64_t position, const void* data, int64_t nbytes) override; // @return: the size in bytes of the memory source Status GetSize(int64_t* size) override; @@ -168,7 +168,7 @@ class ARROW_EXPORT MemoryMappedFile : public ReadWriteFileInterface { private: MemoryMappedFile(); - Status WriteInternal(const uint8_t* data, int64_t nbytes); + Status WriteInternal(const void* data, int64_t nbytes); class ARROW_NO_EXPORT MemoryMap; std::shared_ptr memory_map_; diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index 77d1f524aae85..6e3e4a7a1c7e7 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -119,7 +119,7 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { return Status::OK(); } - Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) { + Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, void* buffer) { tSize ret; if (driver_->HasPread()) { ret = driver_->Pread(fs_, file_, static_cast(position), @@ -149,11 
+149,11 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { return Status::OK(); } - Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) { + Status Read(int64_t nbytes, int64_t* bytes_read, void* buffer) { int64_t total_bytes = 0; while (total_bytes < nbytes) { tSize ret = driver_->Read( - fs_, file_, reinterpret_cast(buffer + total_bytes), + fs_, file_, reinterpret_cast(buffer) + total_bytes, static_cast(std::min(buffer_size_, nbytes - total_bytes))); RETURN_NOT_OK(CheckReadResult(ret)); total_bytes += ret; @@ -212,7 +212,7 @@ HdfsReadableFile::~HdfsReadableFile() { DCHECK(impl_->Close().ok()); } Status HdfsReadableFile::Close() { return impl_->Close(); } Status HdfsReadableFile::ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, - uint8_t* buffer) { + void* buffer) { return impl_->ReadAt(position, nbytes, bytes_read, buffer); } @@ -223,7 +223,7 @@ Status HdfsReadableFile::ReadAt(int64_t position, int64_t nbytes, bool HdfsReadableFile::supports_zero_copy() const { return false; } -Status HdfsReadableFile::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) { +Status HdfsReadableFile::Read(int64_t nbytes, int64_t* bytes_read, void* buffer) { return impl_->Read(nbytes, bytes_read, buffer); } @@ -261,7 +261,7 @@ class HdfsOutputStream::HdfsOutputStreamImpl : public HdfsAnyFileImpl { return Status::OK(); } - Status Write(const uint8_t* buffer, int64_t nbytes, int64_t* bytes_written) { + Status Write(const void* buffer, int64_t nbytes, int64_t* bytes_written) { std::lock_guard guard(lock_); tSize ret = driver_->Write(fs_, file_, reinterpret_cast(buffer), static_cast(nbytes)); @@ -277,12 +277,11 @@ HdfsOutputStream::~HdfsOutputStream() { DCHECK(impl_->Close().ok()); } Status HdfsOutputStream::Close() { return impl_->Close(); } -Status HdfsOutputStream::Write(const uint8_t* buffer, int64_t nbytes, - int64_t* bytes_read) { +Status HdfsOutputStream::Write(const void* buffer, int64_t nbytes, int64_t* bytes_read) { 
return impl_->Write(buffer, nbytes, bytes_read); } -Status HdfsOutputStream::Write(const uint8_t* buffer, int64_t nbytes) { +Status HdfsOutputStream::Write(const void* buffer, int64_t nbytes) { int64_t bytes_written_dummy = 0; return Write(buffer, nbytes, &bytes_written_dummy); } diff --git a/cpp/src/arrow/io/hdfs.h b/cpp/src/arrow/io/hdfs.h index 0708b11cca550..062473b20104d 100644 --- a/cpp/src/arrow/io/hdfs.h +++ b/cpp/src/arrow/io/hdfs.h @@ -182,12 +182,12 @@ class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile { // NOTE: If you wish to read a particular range of a file in a multithreaded // context, you may prefer to use ReadAt to avoid locking issues - Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override; + Status Read(int64_t nbytes, int64_t* bytes_read, void* buffer) override; Status Read(int64_t nbytes, std::shared_ptr* out) override; Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, - uint8_t* buffer) override; + void* buffer) override; Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; @@ -217,9 +217,9 @@ class ARROW_EXPORT HdfsOutputStream : public OutputStream { Status Close() override; - Status Write(const uint8_t* buffer, int64_t nbytes) override; + Status Write(const void* buffer, int64_t nbytes) override; - Status Write(const uint8_t* buffer, int64_t nbytes, int64_t* bytes_written); + Status Write(const void* buffer, int64_t nbytes, int64_t* bytes_written); Status Flush() override; diff --git a/cpp/src/arrow/io/interfaces.cc b/cpp/src/arrow/io/interfaces.cc index 582cc2026d86d..04560209a62c0 100644 --- a/cpp/src/arrow/io/interfaces.cc +++ b/cpp/src/arrow/io/interfaces.cc @@ -38,7 +38,7 @@ RandomAccessFile::RandomAccessFile() : impl_(new RandomAccessFile::RandomAccessFileImpl()) {} Status RandomAccessFile::ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, - uint8_t* out) { + void* out) { std::lock_guard lock(impl_->lock_); RETURN_NOT_OK(Seek(position)); return 
Read(nbytes, bytes_read, out); @@ -51,12 +51,11 @@ Status RandomAccessFile::ReadAt(int64_t position, int64_t nbytes, return Read(nbytes, out); } -Status Writeable::Write(const std::string& data) { - return Write(reinterpret_cast(data.c_str()), - static_cast(data.size())); +Status Writable::Write(const std::string& data) { + return Write(data.c_str(), static_cast(data.size())); } -Status Writeable::Flush() { return Status::OK(); } +Status Writable::Flush() { return Status::OK(); } } // namespace io } // namespace arrow diff --git a/cpp/src/arrow/io/interfaces.h b/cpp/src/arrow/io/interfaces.h index 82af875e7c07e..09536a44ef003 100644 --- a/cpp/src/arrow/io/interfaces.h +++ b/cpp/src/arrow/io/interfaces.h @@ -86,11 +86,11 @@ class ARROW_EXPORT Seekable { virtual Status Seek(int64_t position) = 0; }; -class ARROW_EXPORT Writeable { +class ARROW_EXPORT Writable { public: - virtual ~Writeable() = default; + virtual ~Writable() = default; - virtual Status Write(const uint8_t* data, int64_t nbytes) = 0; + virtual Status Write(const void* data, int64_t nbytes) = 0; /// \brief Flush buffered bytes, if any virtual Status Flush(); @@ -102,13 +102,13 @@ class ARROW_EXPORT Readable { public: virtual ~Readable() = default; - virtual Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) = 0; + virtual Status Read(int64_t nbytes, int64_t* bytes_read, void* out) = 0; // Does not copy if not necessary virtual Status Read(int64_t nbytes, std::shared_ptr* out) = 0; }; -class ARROW_EXPORT OutputStream : virtual public FileInterface, public Writeable { +class ARROW_EXPORT OutputStream : virtual public FileInterface, public Writable { protected: OutputStream() = default; }; @@ -138,7 +138,7 @@ class ARROW_EXPORT RandomAccessFile : public InputStream, public Seekable { /// \param[out] out The buffer to read bytes into /// \return Status virtual Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, - uint8_t* out) = 0; + void* out) = 0; /// \brief Read nbytes at 
position, provide default implementations using Read(...), but /// can be overridden. Default implementation is thread-safe. @@ -162,7 +162,7 @@ class ARROW_EXPORT RandomAccessFile : public InputStream, public Seekable { class ARROW_EXPORT WriteableFile : public OutputStream, public Seekable { public: - virtual Status WriteAt(int64_t position, const uint8_t* data, int64_t nbytes) = 0; + virtual Status WriteAt(int64_t position, const void* data, int64_t nbytes) = 0; protected: WriteableFile() = default; diff --git a/cpp/src/arrow/io/io-file-test.cc b/cpp/src/arrow/io/io-file-test.cc index ee3beabd9a80e..e70431e69580a 100644 --- a/cpp/src/arrow/io/io-file-test.cc +++ b/cpp/src/arrow/io/io-file-test.cc @@ -135,7 +135,7 @@ TEST_F(TestFileOutputStream, Close) { OpenFile(); const char* data = "testdata"; - ASSERT_OK(file_->Write(reinterpret_cast(data), strlen(data))); + ASSERT_OK(file_->Write(data, strlen(data))); int fd = file_->file_descriptor(); ASSERT_OK(file_->Close()); @@ -158,7 +158,7 @@ TEST_F(TestFileOutputStream, InvalidWrites) { const char* data = ""; - ASSERT_RAISES(IOError, file_->Write(reinterpret_cast(data), -1)); + ASSERT_RAISES(IOError, file_->Write(data, -1)); } TEST_F(TestFileOutputStream, Tell) { @@ -170,7 +170,7 @@ TEST_F(TestFileOutputStream, Tell) { ASSERT_EQ(0, position); const char* data = "testdata"; - ASSERT_OK(file_->Write(reinterpret_cast(data), 8)); + ASSERT_OK(file_->Write(data, 8)); ASSERT_OK(file_->Tell(&position)); ASSERT_EQ(8, position); } @@ -179,7 +179,7 @@ TEST_F(TestFileOutputStream, TruncatesNewFile) { ASSERT_OK(FileOutputStream::Open(path_, &file_)); const char* data = "testdata"; - ASSERT_OK(file_->Write(reinterpret_cast(data), strlen(data))); + ASSERT_OK(file_->Write(data, strlen(data))); ASSERT_OK(file_->Close()); ASSERT_OK(FileOutputStream::Open(path_, &file_)); @@ -583,8 +583,7 @@ TEST_F(TestMemoryMappedFile, ThreadSafety) { std::shared_ptr file; ASSERT_OK(MemoryMappedFile::Open(path, FileMode::READWRITE, &file)); - 
ASSERT_OK(file->Write(reinterpret_cast(data.c_str()), - static_cast(data.size()))); + ASSERT_OK(file->Write(data.c_str(), static_cast(data.size()))); std::atomic correct_count(0); constexpr int niter = 10000; diff --git a/cpp/src/arrow/io/io-memory-test.cc b/cpp/src/arrow/io/io-memory-test.cc index 117972f1cf06e..8c2e8c3b0beba 100644 --- a/cpp/src/arrow/io/io-memory-test.cc +++ b/cpp/src/arrow/io/io-memory-test.cc @@ -93,7 +93,7 @@ TEST(TestFixedSizeBufferWriter, Basics) { std::string data = "data123456"; auto nbytes = static_cast(data.size()); - ASSERT_OK(writer.Write(reinterpret_cast(data.c_str()), nbytes)); + ASSERT_OK(writer.Write(data.c_str(), nbytes)); ASSERT_OK(writer.Tell(&position)); ASSERT_EQ(nbytes, position); diff --git a/cpp/src/arrow/io/memory.cc b/cpp/src/arrow/io/memory.cc index d9c84b495d21a..ecdf26f0a991b 100644 --- a/cpp/src/arrow/io/memory.cc +++ b/cpp/src/arrow/io/memory.cc @@ -79,7 +79,7 @@ Status BufferOutputStream::Tell(int64_t* position) const { return Status::OK(); } -Status BufferOutputStream::Write(const uint8_t* data, int64_t nbytes) { +Status BufferOutputStream::Write(const void* data, int64_t nbytes) { if (ARROW_PREDICT_FALSE(!is_open_)) { return Status::IOError("OutputStream is closed"); } @@ -116,7 +116,7 @@ Status MockOutputStream::Tell(int64_t* position) const { return Status::OK(); } -Status MockOutputStream::Write(const uint8_t* data, int64_t nbytes) { +Status MockOutputStream::Write(const void* data, int64_t nbytes) { extent_bytes_written_ += nbytes; return Status::OK(); } @@ -162,9 +162,10 @@ class FixedSizeBufferWriter::FixedSizeBufferWriterImpl { return Status::OK(); } - Status Write(const uint8_t* data, int64_t nbytes) { + Status Write(const void* data, int64_t nbytes) { if (nbytes > memcopy_threshold_ && memcopy_num_threads_ > 1) { - internal::parallel_memcopy(mutable_data_ + position_, data, nbytes, + internal::parallel_memcopy(mutable_data_ + position_, + reinterpret_cast(data), nbytes, memcopy_blocksize_, 
memcopy_num_threads_); } else { memcpy(mutable_data_ + position_, data, nbytes); @@ -173,7 +174,7 @@ class FixedSizeBufferWriter::FixedSizeBufferWriterImpl { return Status::OK(); } - Status WriteAt(int64_t position, const uint8_t* data, int64_t nbytes) { + Status WriteAt(int64_t position, const void* data, int64_t nbytes) { std::lock_guard guard(lock_); RETURN_NOT_OK(Seek(position)); return Write(data, nbytes); @@ -210,11 +211,11 @@ Status FixedSizeBufferWriter::Tell(int64_t* position) const { return impl_->Tell(position); } -Status FixedSizeBufferWriter::Write(const uint8_t* data, int64_t nbytes) { +Status FixedSizeBufferWriter::Write(const void* data, int64_t nbytes) { return impl_->Write(data, nbytes); } -Status FixedSizeBufferWriter::WriteAt(int64_t position, const uint8_t* data, +Status FixedSizeBufferWriter::WriteAt(int64_t position, const void* data, int64_t nbytes) { return impl_->WriteAt(position, data, nbytes); } @@ -240,6 +241,9 @@ BufferReader::BufferReader(const std::shared_ptr& buffer) BufferReader::BufferReader(const uint8_t* data, int64_t size) : buffer_(nullptr), data_(data), size_(size), position_(0) {} +BufferReader::BufferReader(const Buffer& buffer) + : BufferReader(buffer.data(), buffer.size()) {} + Status BufferReader::Close() { // no-op return Status::OK(); @@ -252,7 +256,7 @@ Status BufferReader::Tell(int64_t* position) const { bool BufferReader::supports_zero_copy() const { return true; } -Status BufferReader::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) { +Status BufferReader::Read(int64_t nbytes, int64_t* bytes_read, void* buffer) { memcpy(buffer, data_ + position_, nbytes); *bytes_read = std::min(nbytes, size_ - position_); position_ += *bytes_read; @@ -273,7 +277,7 @@ Status BufferReader::Read(int64_t nbytes, std::shared_ptr* out) { } Status BufferReader::ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, - uint8_t* out) { + void* out) { return RandomAccessFile::ReadAt(position, nbytes, bytes_read, out); } diff 
--git a/cpp/src/arrow/io/memory.h b/cpp/src/arrow/io/memory.h index 3aec91f7237e1..cf370b3b6388b 100644 --- a/cpp/src/arrow/io/memory.h +++ b/cpp/src/arrow/io/memory.h @@ -48,7 +48,7 @@ class ARROW_EXPORT BufferOutputStream : public OutputStream { // Implement the OutputStream interface Status Close() override; Status Tell(int64_t* position) const override; - Status Write(const uint8_t* data, int64_t nbytes) override; + Status Write(const void* data, int64_t nbytes) override; /// Close the stream and return the buffer Status Finish(std::shared_ptr* result); @@ -72,7 +72,7 @@ class ARROW_EXPORT MockOutputStream : public OutputStream { // Implement the OutputStream interface Status Close() override; Status Tell(int64_t* position) const override; - Status Write(const uint8_t* data, int64_t nbytes) override; + Status Write(const void* data, int64_t nbytes) override; int64_t GetExtentBytesWritten() const { return extent_bytes_written_; } @@ -90,8 +90,8 @@ class ARROW_EXPORT FixedSizeBufferWriter : public WriteableFile { Status Close() override; Status Seek(int64_t position) override; Status Tell(int64_t* position) const override; - Status Write(const uint8_t* data, int64_t nbytes) override; - Status WriteAt(int64_t position, const uint8_t* data, int64_t nbytes) override; + Status Write(const void* data, int64_t nbytes) override; + Status WriteAt(int64_t position, const void* data, int64_t nbytes) override; void set_memcopy_threads(int num_threads); void set_memcopy_blocksize(int64_t blocksize); @@ -107,16 +107,17 @@ class ARROW_EXPORT FixedSizeBufferWriter : public WriteableFile { class ARROW_EXPORT BufferReader : public RandomAccessFile { public: explicit BufferReader(const std::shared_ptr& buffer); + explicit BufferReader(const Buffer& buffer); BufferReader(const uint8_t* data, int64_t size); Status Close() override; Status Tell(int64_t* position) const override; - Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override; + Status Read(int64_t 
nbytes, int64_t* bytes_read, void* buffer) override; // Zero copy read Status Read(int64_t nbytes, std::shared_ptr* out) override; Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, - uint8_t* out) override; + void* out) override; /// Default implementation is thread-safe Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; diff --git a/cpp/src/arrow/ipc/feather-test.cc b/cpp/src/arrow/ipc/feather-test.cc index 6bd16462df94d..8ec3b0e4ae8b2 100644 --- a/cpp/src/arrow/ipc/feather-test.cc +++ b/cpp/src/arrow/ipc/feather-test.cc @@ -29,6 +29,7 @@ #include "arrow/ipc/feather.h" #include "arrow/ipc/test-common.h" #include "arrow/pretty_print.h" +#include "arrow/table.h" #include "arrow/test-util.h" namespace arrow { @@ -365,19 +366,18 @@ TEST_F(TestTableWriter, TimeTypes) { ArrayFromVector(is_valid, date_values_vec, &date_array); const auto& prim_values = static_cast(*values); - std::vector> buffers = {prim_values.null_bitmap(), - prim_values.values()}; + BufferVector buffers = {prim_values.null_bitmap(), prim_values.values()}; std::vector> arrays; arrays.push_back(date_array->data()); for (int i = 1; i < schema->num_fields(); ++i) { - arrays.emplace_back(std::make_shared( - schema->field(i)->type(), values->length(), buffers, values->null_count(), 0)); + arrays.emplace_back(ArrayData::Make(schema->field(i)->type(), values->length(), + BufferVector(buffers), values->null_count(), 0)); } - RecordBatch batch(schema, values->length(), std::move(arrays)); - CheckBatch(batch); + auto batch = RecordBatch::Make(schema, values->length(), std::move(arrays)); + CheckBatch(*batch); } TEST_F(TestTableWriter, VLenPrimitiveRoundTrip) { diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index cea720bd01bc7..d3872503edf19 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -32,6 +32,7 @@ #include "arrow/ipc/feather-internal.h" #include "arrow/ipc/feather_generated.h" #include 
"arrow/ipc/util.h" // IWYU pragma: keep +#include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/type.h" @@ -370,7 +371,7 @@ class TableReader::TableReaderImpl { buffers.push_back(SliceBuffer(buffer, offset, buffer->size() - offset)); auto arr_data = - std::make_shared(type, meta->length(), buffers, meta->null_count()); + ArrayData::Make(type, meta->length(), std::move(buffers), meta->null_count()); *out = MakeArray(arr_data); return Status::OK(); } @@ -522,10 +523,8 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { uint32_t buffer_size = static_cast(bytes_written); // Footer: metadata length, magic bytes - RETURN_NOT_OK( - stream_->Write(reinterpret_cast(&buffer_size), sizeof(uint32_t))); - return stream_->Write(reinterpret_cast(kFeatherMagicBytes), - strlen(kFeatherMagicBytes)); + RETURN_NOT_OK(stream_->Write(&buffer_size, sizeof(uint32_t))); + return stream_->Write(kFeatherMagicBytes, strlen(kFeatherMagicBytes)); } Status LoadArrayMetadata(const Array& values, ArrayMetadata* meta) { diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc index f2dd9e74e335d..12fa4bf3ed7af 100644 --- a/cpp/src/arrow/ipc/ipc-json-test.cc +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -31,14 +31,15 @@ #include "arrow/ipc/json.h" #include "arrow/ipc/test-common.h" #include "arrow/memory_pool.h" +#include "arrow/record_batch.h" #include "arrow/status.h" -#include "arrow/table.h" #include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/type_traits.h" namespace arrow { namespace ipc { +namespace internal { namespace json { void TestSchemaRoundTrip(const Schema& schema) { @@ -46,7 +47,7 @@ void TestSchemaRoundTrip(const Schema& schema) { rj::Writer writer(sb); writer.StartObject(); - ASSERT_OK(internal::WriteSchema(schema, &writer)); + ASSERT_OK(WriteSchema(schema, &writer)); writer.EndObject(); std::string json_schema = sb.GetString(); @@ -55,7 +56,7 @@ void TestSchemaRoundTrip(const Schema& 
schema) { d.Parse(json_schema); std::shared_ptr out; - if (!internal::ReadSchema(d, default_memory_pool(), &out).ok()) { + if (!ReadSchema(d, default_memory_pool(), &out).ok()) { FAIL() << "Unable to read JSON schema: " << json_schema; } @@ -70,7 +71,7 @@ void TestArrayRoundTrip(const Array& array) { rj::StringBuffer sb; rj::Writer writer(sb); - ASSERT_OK(internal::WriteArray(name, array, &writer)); + ASSERT_OK(WriteArray(name, array, &writer)); std::string array_as_json = sb.GetString(); @@ -82,7 +83,7 @@ void TestArrayRoundTrip(const Array& array) { } std::shared_ptr out; - ASSERT_OK(internal::ReadArray(default_memory_pool(), d, array.type(), &out)); + ASSERT_OK(ReadArray(default_memory_pool(), d, array.type(), &out)); // std::cout << array_as_json << std::endl; CompareArraysDetailed(0, *out, array); @@ -222,8 +223,8 @@ void MakeBatchArrays(const std::shared_ptr& schema, const int num_rows, std::vector v1_values; std::vector v2_values; - test::randint(num_rows, 0, 100, &v1_values); - test::randint(num_rows, 0, 100, &v2_values); + test::randint(num_rows, 0, 100, &v1_values); + test::randint(num_rows, 0, 100, &v2_values); std::shared_ptr v1; ArrayFromVector(is_valid, v1_values, &v1); @@ -269,7 +270,7 @@ TEST(TestJsonFileReadWrite, BasicRoundTrip) { std::vector> arrays; MakeBatchArrays(schema, num_rows, &arrays); - auto batch = std::make_shared(schema, num_rows, arrays); + auto batch = RecordBatch::Make(schema, num_rows, arrays); batches.push_back(batch); ASSERT_OK(writer->WriteRecordBatch(*batch)); } @@ -415,5 +416,6 @@ TEST_P(TestJsonRoundTrip, RoundTrip) { INSTANTIATE_TEST_CASE_P(TestJsonRoundTrip, TestJsonRoundTrip, BATCH_CASES()); } // namespace json +} // namespace internal } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc b/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc index 9ed0abde651f6..8561fb8603707 100644 --- a/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc +++ 
b/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc @@ -63,7 +63,7 @@ std::shared_ptr MakeRecordBatch(int64_t total_size, int64_t num_fie } auto schema = std::make_shared(fields); - return std::make_shared(schema, length, arrays); + return RecordBatch::Make(schema, length, arrays); } static void BM_WriteRecordBatch(benchmark::State& state) { // NOLINT non-const reference diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/ipc-read-write-test.cc index adf34a9eb5422..1fcbdac5ebc73 100644 --- a/cpp/src/arrow/ipc/ipc-read-write-test.cc +++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc @@ -197,8 +197,8 @@ class IpcTestFixture : public io::MemoryMapFixture { std::vector> fields = {f0}; auto schema = std::make_shared(fields); - RecordBatch batch(schema, 0, {array}); - CheckRoundtrip(batch, buffer_size); + auto batch = RecordBatch::Make(schema, 0, {array}); + CheckRoundtrip(*batch, buffer_size); } protected: @@ -243,7 +243,7 @@ TEST_F(TestIpcRoundTrip, MetadataVersion) { std::unique_ptr message; ASSERT_OK(ReadMessage(0, metadata_length, mmap_.get(), &message)); - ASSERT_EQ(MetadataVersion::V3, message->metadata_version()); + ASSERT_EQ(MetadataVersion::V4, message->metadata_version()); } TEST_P(TestIpcRoundTrip, SliceRoundTrip) { @@ -292,13 +292,13 @@ TEST_F(TestWriteRecordBatch, SliceTruncatesBuffers) { auto CheckArray = [this](const std::shared_ptr& array) { auto f0 = field("f0", array->type()); auto schema = ::arrow::schema({f0}); - RecordBatch batch(schema, array->length(), {array}); - auto sliced_batch = batch.Slice(0, 5); + auto batch = RecordBatch::Make(schema, array->length(), {array}); + auto sliced_batch = batch->Slice(0, 5); int64_t full_size; int64_t sliced_size; - ASSERT_OK(GetRecordBatchSize(batch, &full_size)); + ASSERT_OK(GetRecordBatchSize(*batch, &full_size)); ASSERT_OK(GetRecordBatchSize(*sliced_batch, &sliced_size)); ASSERT_TRUE(sliced_size < full_size) << sliced_size << " " << full_size; @@ -411,8 +411,7 @@ class RecursionLimits : public 
::testing::Test, public io::MemoryMapFixture { *schema = ::arrow::schema({f0}); - std::vector> arrays = {array}; - *batch = std::make_shared(*schema, batch_length, arrays); + *batch = RecordBatch::Make(*schema, batch_length, {array}); std::stringstream ss; ss << "test-write-past-max-recursion-" << g_file_number++; @@ -632,7 +631,7 @@ TEST_F(TestIpcRoundTrip, LargeRecordBatch) { std::vector> fields = {f0}; auto schema = std::make_shared(fields); - RecordBatch batch(schema, length, {array}); + auto batch = RecordBatch::Make(schema, length, {array}); std::string path = "test-write-large-record_batch"; @@ -641,8 +640,8 @@ TEST_F(TestIpcRoundTrip, LargeRecordBatch) { ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &mmap_)); std::shared_ptr result; - ASSERT_OK(DoLargeRoundTrip(batch, false, &result)); - CheckReadResult(*result, batch); + ASSERT_OK(DoLargeRoundTrip(*batch, false, &result)); + CheckReadResult(*result, *batch); ASSERT_EQ(length, result->num_rows()); } @@ -727,7 +726,7 @@ TEST_F(TestTensorRoundTrip, BasicRoundtrip) { int64_t size = 24; std::vector values; - test::randint(size, 0, 100, &values); + test::randint(size, 0, 100, &values); auto data = test::GetBufferFromVector(values); @@ -748,7 +747,7 @@ TEST_F(TestTensorRoundTrip, NonContiguous) { ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &mmap_)); std::vector values; - test::randint(24, 0, 100, &values); + test::randint(24, 0, 100, &values); auto data = test::GetBufferFromVector(values); Tensor tensor(int64(), data, {4, 3}, {48, 16}); diff --git a/cpp/src/arrow/ipc/json-integration-test.cc b/cpp/src/arrow/ipc/json-integration-test.cc index c7530a467b398..37778fa25166d 100644 --- a/cpp/src/arrow/ipc/json-integration-test.cc +++ b/cpp/src/arrow/ipc/json-integration-test.cc @@ -34,8 +34,8 @@ #include "arrow/ipc/reader.h" #include "arrow/ipc/writer.h" #include "arrow/pretty_print.h" +#include "arrow/record_batch.h" #include "arrow/status.h" -#include "arrow/table.h" #include 
"arrow/test-util.h" #include "arrow/type.h" @@ -50,8 +50,7 @@ DEFINE_bool(verbose, true, "Verbose output"); namespace fs = boost::filesystem; namespace arrow { - -class Buffer; +namespace ipc { bool file_exists(const char* path) { std::ifstream handle(path); @@ -73,16 +72,15 @@ static Status ConvertJsonToArrow(const std::string& json_path, std::shared_ptr json_buffer; RETURN_NOT_OK(in_file->Read(file_size, &json_buffer)); - std::unique_ptr reader; - RETURN_NOT_OK(ipc::JsonReader::Open(json_buffer, &reader)); + std::unique_ptr reader; + RETURN_NOT_OK(internal::json::JsonReader::Open(json_buffer, &reader)); if (FLAGS_verbose) { std::cout << "Found schema: " << reader->schema()->ToString() << std::endl; } - std::shared_ptr writer; - RETURN_NOT_OK( - ipc::RecordBatchFileWriter::Open(out_file.get(), reader->schema(), &writer)); + std::shared_ptr writer; + RETURN_NOT_OK(RecordBatchFileWriter::Open(out_file.get(), reader->schema(), &writer)); for (int i = 0; i < reader->num_record_batches(); ++i) { std::shared_ptr batch; @@ -101,15 +99,15 @@ static Status ConvertArrowToJson(const std::string& arrow_path, RETURN_NOT_OK(io::ReadableFile::Open(arrow_path, &in_file)); RETURN_NOT_OK(io::FileOutputStream::Open(json_path, &out_file)); - std::shared_ptr reader; - RETURN_NOT_OK(ipc::RecordBatchFileReader::Open(in_file.get(), &reader)); + std::shared_ptr reader; + RETURN_NOT_OK(RecordBatchFileReader::Open(in_file.get(), &reader)); if (FLAGS_verbose) { std::cout << "Found schema: " << reader->schema()->ToString() << std::endl; } - std::unique_ptr writer; - RETURN_NOT_OK(ipc::JsonWriter::Open(reader->schema(), &writer)); + std::unique_ptr writer; + RETURN_NOT_OK(internal::json::JsonWriter::Open(reader->schema(), &writer)); for (int i = 0; i < reader->num_record_batches(); ++i) { std::shared_ptr batch; @@ -119,8 +117,7 @@ static Status ConvertArrowToJson(const std::string& arrow_path, std::string result; RETURN_NOT_OK(writer->Finish(&result)); - return 
out_file->Write(reinterpret_cast(result.c_str()), - static_cast(result.size())); + return out_file->Write(result.c_str(), static_cast(result.size())); } static Status ValidateArrowVsJson(const std::string& arrow_path, @@ -135,15 +132,15 @@ static Status ValidateArrowVsJson(const std::string& arrow_path, std::shared_ptr json_buffer; RETURN_NOT_OK(json_file->Read(file_size, &json_buffer)); - std::unique_ptr json_reader; - RETURN_NOT_OK(ipc::JsonReader::Open(json_buffer, &json_reader)); + std::unique_ptr json_reader; + RETURN_NOT_OK(internal::json::JsonReader::Open(json_buffer, &json_reader)); // Construct Arrow reader std::shared_ptr arrow_file; RETURN_NOT_OK(io::ReadableFile::Open(arrow_path, &arrow_file)); - std::shared_ptr arrow_reader; - RETURN_NOT_OK(ipc::RecordBatchFileReader::Open(arrow_file.get(), &arrow_reader)); + std::shared_ptr arrow_reader; + RETURN_NOT_OK(RecordBatchFileReader::Open(arrow_file.get(), &arrow_reader)); auto json_schema = json_reader->schema(); auto arrow_schema = arrow_reader->schema(); @@ -250,8 +247,7 @@ class TestJSONIntegration : public ::testing::Test { do { std::shared_ptr out; RETURN_NOT_OK(io::FileOutputStream::Open(path, &out)); - RETURN_NOT_OK(out->Write(reinterpret_cast(data), - static_cast(strlen(data)))); + RETURN_NOT_OK(out->Write(data, static_cast(strlen(data)))); } while (0); return Status::OK(); } @@ -401,6 +397,7 @@ TEST_F(TestJSONIntegration, ErrorStates) { ASSERT_RAISES(Invalid, RunCommand(json_path, "", "VALIDATE")); } +} // namespace ipc } // namespace arrow int main(int argc, char** argv) { @@ -409,7 +406,7 @@ int main(int argc, char** argv) { int ret = 0; if (FLAGS_integration) { - arrow::Status result = arrow::RunCommand(FLAGS_json, FLAGS_arrow, FLAGS_mode); + arrow::Status result = arrow::ipc::RunCommand(FLAGS_json, FLAGS_arrow, FLAGS_mode); if (!result.ok()) { std::cout << "Error message: " << result.ToString() << std::endl; ret = 1; diff --git a/cpp/src/arrow/ipc/json-internal.cc 
b/cpp/src/arrow/ipc/json-internal.cc index 025f6c276541e..4088a8f20e6a0 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -28,35 +28,23 @@ #include "arrow/array.h" #include "arrow/builder.h" #include "arrow/ipc/dictionary.h" +#include "arrow/record_batch.h" #include "arrow/status.h" -#include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" +#include "arrow/util/decimal.h" #include "arrow/util/logging.h" #include "arrow/util/string.h" #include "arrow/visitor_inline.h" namespace arrow { namespace ipc { -namespace json { namespace internal { +namespace json { -static std::string GetBufferTypeName(BufferType type) { - switch (type) { - case BufferType::DATA: - return "DATA"; - case BufferType::OFFSET: - return "OFFSET"; - case BufferType::TYPE: - return "TYPE"; - case BufferType::VALIDITY: - return "VALIDITY"; - default: - break; - } - return "UNKNOWN"; -} +using ::arrow::ipc::DictionaryMemo; +using ::arrow::ipc::DictionaryTypeMap; static std::string GetFloatingPrecisionName(FloatingPoint::Precision precision) { switch (precision) { @@ -124,8 +112,8 @@ class SchemaWriter { // Make a dummy record batch. 
A bit tedious as we have to make a schema auto schema = ::arrow::schema({arrow::field("dictionary", dictionary->type())}); - RecordBatch batch(schema, dictionary->length(), {dictionary}); - RETURN_NOT_OK(WriteRecordBatch(batch, writer_)); + auto batch = RecordBatch::Make(schema, dictionary->length(), {dictionary}); + RETURN_NOT_OK(WriteRecordBatch(*batch, writer_)); writer_->EndObject(); return Status::OK(); } @@ -173,12 +161,9 @@ class SchemaWriter { RETURN_NOT_OK(WriteDictionaryMetadata(dict_type)); const DataType& dictionary_type = *dict_type.dictionary()->type(); - const DataType& index_type = *dict_type.index_type(); RETURN_NOT_OK(WriteChildren(dictionary_type.children())); - WriteBufferLayout(index_type.GetBufferLayout()); } else { RETURN_NOT_OK(WriteChildren(type.children())); - WriteBufferLayout(type.GetBufferLayout()); } writer_->EndObject(); @@ -252,7 +237,7 @@ class SchemaWriter { writer_->Int(type.byte_width()); } - void WriteTypeMetadata(const DecimalType& type) { + void WriteTypeMetadata(const Decimal128Type& type) { writer_->Key("precision"); writer_->Int(type.precision()); writer_->Key("scale"); @@ -300,26 +285,6 @@ class SchemaWriter { return Status::OK(); } - void WriteBufferLayout(const std::vector& buffer_layout) { - writer_->Key("typeLayout"); - writer_->StartObject(); - writer_->Key("vectors"); - writer_->StartArray(); - - for (const BufferDescr& buffer : buffer_layout) { - writer_->StartObject(); - writer_->Key("type"); - writer_->String(GetBufferTypeName(buffer.type())); - - writer_->Key("typeBitWidth"); - writer_->Int(buffer.bit_width()); - - writer_->EndObject(); - } - writer_->EndArray(); - writer_->EndObject(); - } - Status WriteChildren(const std::vector>& children) { writer_->Key("children"); writer_->StartArray(); @@ -346,7 +311,7 @@ class SchemaWriter { return WritePrimitive("fixedsizebinary", type); } - Status Visit(const DecimalType& type) { return WritePrimitive("decimal", type); } + Status Visit(const Decimal128Type& type) { 
return WritePrimitive("decimal", type); } Status Visit(const TimestampType& type) { return WritePrimitive("timestamp", type); } Status Visit(const IntervalType& type) { return WritePrimitive("interval", type); } @@ -448,7 +413,8 @@ class ArrayWriter { } void WriteDataValues(const FixedSizeBinaryArray& arr) { - int32_t width = arr.byte_width(); + const int32_t width = arr.byte_width(); + for (int64_t i = 0; i < arr.length(); ++i) { const uint8_t* buf = arr.GetValue(i); std::string encoded = HexEncode(buf, width); @@ -456,6 +422,13 @@ class ArrayWriter { } } + void WriteDataValues(const Decimal128Array& arr) { + for (int64_t i = 0; i < arr.length(); ++i) { + const Decimal128 value(arr.GetValue(i)); + writer_->String(value.ToIntegerString()); + } + } + void WriteDataValues(const BooleanArray& arr) { for (int i = 0; i < arr.length(); ++i) { writer_->Bool(arr.Value(i)); @@ -765,7 +738,7 @@ static Status GetUnion(const RjObject& json_type, RETURN_NOT_STRING("mode", it_mode, json_type); std::string mode_str = it_mode->value.GetString(); - UnionMode mode; + UnionMode::type mode; if (mode_str == "SPARSE") { mode = UnionMode::SPARSE; @@ -1053,7 +1026,9 @@ class ArrayReader { } template - typename std::enable_if::value, Status>::type + typename std::enable_if::value && + !std::is_base_of::value, + Status>::type Visit(const T& type) { typename TypeTraits::BuilderType builder(type_, pool_); @@ -1073,22 +1048,52 @@ class ArrayReader { for (int i = 0; i < length_; ++i) { if (!is_valid_[i]) { RETURN_NOT_OK(builder.AppendNull()); - continue; - } + } else { + const rj::Value& val = json_data_arr[i]; + DCHECK(val.IsString()) + << "Found non-string JSON value when parsing FixedSizeBinary value"; + std::string hex_string = val.GetString(); + if (static_cast(hex_string.size()) != byte_width * 2) { + DCHECK(false) << "Expected size: " << byte_width * 2 + << " got: " << hex_string.size(); + } + const char* hex_data = hex_string.c_str(); - const rj::Value& val = json_data_arr[i]; - 
DCHECK(val.IsString()); - std::string hex_string = val.GetString(); - if (static_cast(hex_string.size()) != byte_width * 2) { - DCHECK(false) << "Expected size: " << byte_width * 2 - << " got: " << hex_string.size(); + for (int32_t j = 0; j < byte_width; ++j) { + RETURN_NOT_OK(ParseHexValue(hex_data + j * 2, &byte_buffer_data[j])); + } + RETURN_NOT_OK(builder.Append(byte_buffer_data)); } - const char* hex_data = hex_string.c_str(); + } + return builder.Finish(&result_); + } + + template + typename std::enable_if::value, Status>::type Visit( + const T& type) { + typename TypeTraits::BuilderType builder(type_, pool_); - for (int32_t j = 0; j < byte_width; ++j) { - RETURN_NOT_OK(ParseHexValue(hex_data + j * 2, &byte_buffer_data[j])); + const auto& json_data = obj_->FindMember("DATA"); + RETURN_NOT_ARRAY("DATA", json_data, *obj_); + + const auto& json_data_arr = json_data->value.GetArray(); + + DCHECK_EQ(static_cast(json_data_arr.Size()), length_); + + for (int i = 0; i < length_; ++i) { + if (!is_valid_[i]) { + RETURN_NOT_OK(builder.AppendNull()); + } else { + const rj::Value& val = json_data_arr[i]; + DCHECK(val.IsString()) + << "Found non-string JSON value when parsing Decimal128 value"; + DCHECK_GT(val.GetStringLength(), 0) + << "Empty string found when parsing Decimal128 value"; + + Decimal128 value; + RETURN_NOT_OK(Decimal128::FromString(val.GetString(), &value)); + RETURN_NOT_OK(builder.Append(value)); } - RETURN_NOT_OK(builder.Append(byte_buffer_data)); } return builder.Finish(&result_); } @@ -1394,7 +1399,7 @@ Status ReadRecordBatch(const rj::Value& json_obj, const std::shared_ptr& RETURN_NOT_OK(ReadArray(pool, json_columns[i], type, &columns[i])); } - *batch = std::make_shared(schema, num_rows, columns); + *batch = RecordBatch::Make(schema, num_rows, columns); return Status::OK(); } @@ -1461,7 +1466,7 @@ Status ReadArray(MemoryPool* pool, const rj::Value& json_array, const Schema& sc return ReadArray(pool, json_array, result->type(), array); } -} // namespace 
internal } // namespace json +} // namespace internal } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/json-internal.h b/cpp/src/arrow/ipc/json-internal.h index 506fe682949e6..92afc14447dc7 100644 --- a/cpp/src/arrow/ipc/json-internal.h +++ b/cpp/src/arrow/ipc/json-internal.h @@ -92,8 +92,8 @@ using RjObject = rj::Value::ConstObject; namespace arrow { namespace ipc { -namespace json { namespace internal { +namespace json { Status WriteSchema(const Schema& schema, RjWriter* writer); Status WriteRecordBatch(const RecordBatch& batch, RjWriter* writer); @@ -111,8 +111,8 @@ Status ReadArray(MemoryPool* pool, const rj::Value& json_obj, Status ReadArray(MemoryPool* pool, const rj::Value& json_obj, const Schema& schema, std::shared_ptr* array); -} // namespace internal } // namespace json +} // namespace internal } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/json.cc b/cpp/src/arrow/ipc/json.cc index 30a1bb81e1a1e..394563c53c09d 100644 --- a/cpp/src/arrow/ipc/json.cc +++ b/cpp/src/arrow/ipc/json.cc @@ -24,8 +24,8 @@ #include "arrow/buffer.h" #include "arrow/ipc/json-internal.h" #include "arrow/memory_pool.h" +#include "arrow/record_batch.h" #include "arrow/status.h" -#include "arrow/table.h" #include "arrow/type.h" #include "arrow/util/logging.h" @@ -33,6 +33,8 @@ using std::size_t; namespace arrow { namespace ipc { +namespace internal { +namespace json { // ---------------------------------------------------------------------- // Writer implementation @@ -45,7 +47,7 @@ class JsonWriter::JsonWriterImpl { Status Start() { writer_->StartObject(); - RETURN_NOT_OK(json::internal::WriteSchema(*schema_, writer_.get())); + RETURN_NOT_OK(json::WriteSchema(*schema_, writer_.get())); // Record batches writer_->Key("batches"); @@ -63,7 +65,7 @@ class JsonWriter::JsonWriterImpl { Status WriteRecordBatch(const RecordBatch& batch) { DCHECK_EQ(batch.num_columns(), schema_->num_fields()); - return json::internal::WriteRecordBatch(batch, 
writer_.get()); + return json::WriteRecordBatch(batch, writer_.get()); } private: @@ -106,7 +108,7 @@ class JsonReader::JsonReaderImpl { return Status::IOError("JSON parsing failed"); } - RETURN_NOT_OK(json::internal::ReadSchema(doc_, pool_, &schema_)); + RETURN_NOT_OK(json::ReadSchema(doc_, pool_, &schema_)); auto it = doc_.FindMember("batches"); RETURN_NOT_ARRAY("batches", it, doc_); @@ -120,8 +122,7 @@ class JsonReader::JsonReaderImpl { DCHECK_LT(i, static_cast(record_batches_->GetArray().Size())) << "i out of bounds"; - return json::internal::ReadRecordBatch(record_batches_->GetArray()[i], schema_, pool_, - batch); + return json::ReadRecordBatch(record_batches_->GetArray()[i], schema_, pool_, batch); } std::shared_ptr schema() const { return schema_; } @@ -164,5 +165,7 @@ Status JsonReader::ReadRecordBatch(int i, std::shared_ptr* batch) c return impl_->ReadRecordBatch(i, batch); } +} // namespace json +} // namespace internal } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/json.h b/cpp/src/arrow/ipc/json.h index 51f30f0c109f3..674c3745ed413 100644 --- a/cpp/src/arrow/ipc/json.h +++ b/cpp/src/arrow/ipc/json.h @@ -34,12 +34,14 @@ class RecordBatch; class Schema; namespace ipc { +namespace internal { +namespace json { /// \class JsonWriter /// \brief Write the JSON representation of an Arrow record batch file or stream /// /// This is used for integration testing -class ARROW_EXPORT JsonWriter { +class JsonWriter { public: ~JsonWriter(); @@ -72,7 +74,7 @@ class ARROW_EXPORT JsonWriter { /// \brief Read the JSON representation of an Arrow record batch file or stream /// /// This is used for integration testing -class ARROW_EXPORT JsonReader { +class JsonReader { public: ~JsonReader(); @@ -113,6 +115,8 @@ class ARROW_EXPORT JsonReader { std::unique_ptr impl_; }; +} // namespace json +} // namespace internal } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 
0dd5c72e51980..1835cefde09ee 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -67,20 +67,7 @@ class Message::MessageImpl { } MetadataVersion version() const { - switch (message_->version()) { - case flatbuf::MetadataVersion_V1: - // Arrow 0.1 - return MetadataVersion::V1; - case flatbuf::MetadataVersion_V2: - // Arrow 0.2 - return MetadataVersion::V2; - case flatbuf::MetadataVersion_V3: - // Arrow >= 0.3 - return MetadataVersion::V3; - // Add cases as other versions become available - default: - return MetadataVersion::V3; - } + return internal::GetMetadataVersion(message_->version()); } const void* header() const { return message_->header(); } @@ -249,11 +236,35 @@ Status ReadMessage(io::InputStream* file, std::unique_ptr* message) { // ---------------------------------------------------------------------- // Implement InputStream message reader -Status InputStreamMessageReader::ReadNextMessage(std::unique_ptr* message) { - return ReadMessage(stream_, message); +/// \brief Implementation of MessageReader that reads from InputStream +class InputStreamMessageReader : public MessageReader { + public: + explicit InputStreamMessageReader(io::InputStream* stream) : stream_(stream) {} + + explicit InputStreamMessageReader(const std::shared_ptr& owned_stream) + : InputStreamMessageReader(owned_stream.get()) { + owned_stream_ = owned_stream; + } + + ~InputStreamMessageReader() {} + + Status ReadNextMessage(std::unique_ptr* message) { + return ReadMessage(stream_, message); + } + + private: + io::InputStream* stream_; + std::shared_ptr owned_stream_; +}; + +std::unique_ptr MessageReader::Open(io::InputStream* stream) { + return std::unique_ptr(new InputStreamMessageReader(stream)); } -InputStreamMessageReader::~InputStreamMessageReader() {} +std::unique_ptr MessageReader::Open( + const std::shared_ptr& owned_stream) { + return std::unique_ptr(new InputStreamMessageReader(owned_stream)); +} } // namespace ipc } // namespace arrow diff --git 
a/cpp/src/arrow/ipc/message.h b/cpp/src/arrow/ipc/message.h index a1b6c07a43d0e..159b39a81f95d 100644 --- a/cpp/src/arrow/ipc/message.h +++ b/cpp/src/arrow/ipc/message.h @@ -42,7 +42,19 @@ class RandomAccessFile; namespace ipc { -enum class MetadataVersion : char { V1, V2, V3 }; +enum class MetadataVersion : char { + /// 0.1.0 + V1, + + /// 0.2.0 + V2, + + /// 0.3.0 to 0.7.1 + V3, + + /// >= 0.8.0 + V4 +}; // ARROW-109: We set this number arbitrarily to help catch user mistakes. For // deeply nested schemas, it is expected the user will indicate explicitly the @@ -132,6 +144,13 @@ class ARROW_EXPORT MessageReader { public: virtual ~MessageReader() = default; + /// \brief Create MessageReader that reads from InputStream + static std::unique_ptr Open(io::InputStream* stream); + + /// \brief Create MessageReader that reads from owned InputStream + static std::unique_ptr Open( + const std::shared_ptr& owned_stream); + /// \brief Read next Message from the interface /// /// \param[out] message an arrow::ipc::Message instance @@ -139,26 +158,6 @@ class ARROW_EXPORT MessageReader { virtual Status ReadNextMessage(std::unique_ptr* message) = 0; }; -/// \brief Implementation of MessageReader that reads from InputStream -/// \since 0.5.0 -class ARROW_EXPORT InputStreamMessageReader : public MessageReader { - public: - explicit InputStreamMessageReader(io::InputStream* stream) : stream_(stream) {} - - explicit InputStreamMessageReader(const std::shared_ptr& owned_stream) - : InputStreamMessageReader(owned_stream.get()) { - owned_stream_ = owned_stream; - } - - ~InputStreamMessageReader(); - - Status ReadNextMessage(std::unique_ptr* message) override; - - private: - io::InputStream* stream_; - std::shared_ptr owned_stream_; -}; - /// \brief Read encapulated RPC message from position in file /// /// Read a length-prefixed message flatbuffer starting at the indicated file diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 
ad00cfb6c09be..af1d6c851582d 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -33,6 +33,7 @@ #include "arrow/ipc/Message_generated.h" #include "arrow/ipc/Tensor_generated.h" #include "arrow/ipc/dictionary.h" +#include "arrow/ipc/message.h" #include "arrow/ipc/util.h" #include "arrow/status.h" #include "arrow/tensor.h" @@ -53,10 +54,29 @@ using DictionaryOffset = flatbuffers::Offset; using FieldOffset = flatbuffers::Offset; using KeyValueOffset = flatbuffers::Offset; using RecordBatchOffset = flatbuffers::Offset; -using VectorLayoutOffset = flatbuffers::Offset; using Offset = flatbuffers::Offset; using FBString = flatbuffers::Offset; +MetadataVersion GetMetadataVersion(flatbuf::MetadataVersion version) { + switch (version) { + case flatbuf::MetadataVersion_V1: + // Arrow 0.1 + return MetadataVersion::V1; + case flatbuf::MetadataVersion_V2: + // Arrow 0.2 + return MetadataVersion::V2; + case flatbuf::MetadataVersion_V3: + // Arrow 0.3 to 0.7.1 + return MetadataVersion::V4; + case flatbuf::MetadataVersion_V4: + // Arrow >= 0.8 + return MetadataVersion::V4; + // Add cases as other versions become available + default: + return MetadataVersion::V4; + } +} + static Status IntFromFlatbuffer(const flatbuf::Int* int_data, std::shared_ptr* out) { if (int_data->bitWidth() > 64) { @@ -142,8 +162,9 @@ static Status StructToFlatbuffer(FBB& fbb, const DataType& type, static Status UnionFromFlatbuffer(const flatbuf::Union* union_data, const std::vector>& children, std::shared_ptr* out) { - UnionMode mode = union_data->mode() == flatbuf::UnionMode_Sparse ? UnionMode::SPARSE - : UnionMode::DENSE; + UnionMode::type mode = + (union_data->mode() == flatbuf::UnionMode_Sparse ? 
UnionMode::SPARSE + : UnionMode::DENSE); std::vector type_codes; @@ -319,34 +340,8 @@ static Status TypeFromFlatbuffer(flatbuf::Type type, const void* type_data, // TODO(wesm): Convert this to visitor pattern static Status TypeToFlatbuffer(FBB& fbb, const DataType& type, std::vector* children, - std::vector* layout, flatbuf::Type* out_type, DictionaryMemo* dictionary_memo, Offset* offset) { - std::vector buffer_layout = type.GetBufferLayout(); - for (const BufferDescr& descr : buffer_layout) { - flatbuf::VectorType vector_type; - switch (descr.type()) { - case BufferType::OFFSET: - vector_type = flatbuf::VectorType_OFFSET; - break; - case BufferType::DATA: - vector_type = flatbuf::VectorType_DATA; - break; - case BufferType::VALIDITY: - vector_type = flatbuf::VectorType_VALIDITY; - break; - case BufferType::TYPE: - vector_type = flatbuf::VectorType_TYPE; - break; - default: - vector_type = flatbuf::VectorType_DATA; - break; - } - auto offset = flatbuf::CreateVectorLayout( - fbb, static_cast(descr.bit_width()), vector_type); - layout->push_back(offset); - } - const DataType* value_type = &type; if (type.id() == Type::DICTIONARY) { @@ -436,7 +431,7 @@ static Status TypeToFlatbuffer(FBB& fbb, const DataType& type, *offset = flatbuf::CreateTimestamp(fbb, fb_unit, fb_timezone).Union(); } break; case Type::DECIMAL: { - const auto& dec_type = static_cast(*value_type); + const auto& dec_type = static_cast(*value_type); *out_type = flatbuf::Type_Decimal; *offset = flatbuf::CreateDecimal(fbb, dec_type.precision(), dec_type.scale()).Union(); @@ -521,14 +516,11 @@ static Status FieldToFlatbuffer(FBB& fbb, const Field& field, flatbuf::Type type_enum; Offset type_offset; - Offset type_layout; std::vector children; - std::vector layout; - RETURN_NOT_OK(TypeToFlatbuffer(fbb, *field.type(), &children, &layout, &type_enum, + RETURN_NOT_OK(TypeToFlatbuffer(fbb, *field.type(), &children, &type_enum, dictionary_memo, &type_offset)); auto fb_children = fbb.CreateVector(children); - auto 
fb_layout = fbb.CreateVector(layout); DictionaryOffset dictionary = 0; if (field.type()->id() == Type::DICTIONARY) { @@ -538,7 +530,7 @@ static Status FieldToFlatbuffer(FBB& fbb, const Field& field, // TODO: produce the list of VectorTypes *offset = flatbuf::CreateField(fbb, fb_name, field.nullable(), type_enum, type_offset, - dictionary, fb_children, fb_layout); + dictionary, fb_children); return Status::OK(); } @@ -700,7 +692,7 @@ static Status WriteBuffers(FBB& fbb, const std::vector& buffers, for (size_t i = 0; i < buffers.size(); ++i) { const BufferMetadata& buffer = buffers[i]; - fb_buffers.emplace_back(buffer.page, buffer.offset, buffer.length); + fb_buffers.emplace_back(buffer.offset, buffer.length); } *out = fbb.CreateVectorOfStructs(fb_buffers); return Status::OK(); @@ -751,7 +743,7 @@ Status WriteTensorMessage(const Tensor& tensor, int64_t buffer_start_offset, auto fb_shape = fbb.CreateVector(dims); auto fb_strides = fbb.CreateVector(tensor.strides()); int64_t body_length = tensor.data()->size(); - flatbuf::Buffer buffer(-1, buffer_start_offset, body_length); + flatbuf::Buffer buffer(buffer_start_offset, body_length); TensorOffset fb_tensor = flatbuf::CreateTensor(fbb, fb_type_type, fb_type, fb_shape, fb_strides, &buffer); @@ -923,8 +915,7 @@ Status WriteMessage(const Buffer& message, io::OutputStream* file, // Write the flatbuffer size prefix including padding int32_t flatbuffer_size = padded_message_length - 4; - RETURN_NOT_OK( - file->Write(reinterpret_cast(&flatbuffer_size), sizeof(int32_t))); + RETURN_NOT_OK(file->Write(&flatbuffer_size, sizeof(int32_t))); // Write the flatbuffer RETURN_NOT_OK(file->Write(message.data(), message.size())); diff --git a/cpp/src/arrow/ipc/metadata-internal.h b/cpp/src/arrow/ipc/metadata-internal.h index 309e7587a754c..380f3c9eb1013 100644 --- a/cpp/src/arrow/ipc/metadata-internal.h +++ b/cpp/src/arrow/ipc/metadata-internal.h @@ -27,6 +27,7 @@ #include "arrow/ipc/Schema_generated.h" #include "arrow/ipc/dictionary.h" 
+#include "arrow/ipc/message.h" namespace arrow { @@ -48,10 +49,12 @@ namespace ipc { namespace internal { static constexpr flatbuf::MetadataVersion kCurrentMetadataVersion = - flatbuf::MetadataVersion_V3; + flatbuf::MetadataVersion_V4; static constexpr flatbuf::MetadataVersion kMinMetadataVersion = - flatbuf::MetadataVersion_V3; + flatbuf::MetadataVersion_V4; + +MetadataVersion GetMetadataVersion(flatbuf::MetadataVersion version); static constexpr const char* kArrowMagicBytes = "ARROW1"; @@ -62,9 +65,6 @@ struct FieldMetadata { }; struct BufferMetadata { - /// The shared memory page id where to find this. Set to -1 if unused - int32_t page; - /// The relative offset into the memory page to the starting byte of the buffer int64_t offset; diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 50eb9039c6ab6..ae0f8f39806b7 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -37,8 +37,8 @@ #include "arrow/ipc/message.h" #include "arrow/ipc/metadata-internal.h" #include "arrow/ipc/util.h" +#include "arrow/record_batch.h" #include "arrow/status.h" -#include "arrow/table.h" #include "arrow/tensor.h" #include "arrow/type.h" #include "arrow/util/bit-util.h" @@ -307,7 +307,7 @@ static Status LoadRecordBatchFromSource(const std::shared_ptr& schema, arrays[i] = std::move(arr); } - *out = std::make_shared(schema, num_rows, std::move(arrays)); + *out = RecordBatch::Make(schema, num_rows, std::move(arrays)); return Status::OK(); } @@ -480,14 +480,12 @@ Status RecordBatchStreamReader::Open(std::unique_ptr message_read Status RecordBatchStreamReader::Open(io::InputStream* stream, std::shared_ptr* out) { - std::unique_ptr message_reader(new InputStreamMessageReader(stream)); - return Open(std::move(message_reader), out); + return Open(MessageReader::Open(stream), out); } Status RecordBatchStreamReader::Open(const std::shared_ptr& stream, std::shared_ptr* out) { - std::unique_ptr message_reader(new InputStreamMessageReader(stream)); - 
return Open(std::move(message_reader), out); + return Open(MessageReader::Open(stream), out); } std::shared_ptr RecordBatchStreamReader::schema() const { @@ -550,20 +548,7 @@ class RecordBatchFileReader::RecordBatchFileReaderImpl { int num_record_batches() const { return footer_->recordBatches()->size(); } MetadataVersion version() const { - switch (footer_->version()) { - case flatbuf::MetadataVersion_V1: - // Arrow 0.1 - return MetadataVersion::V1; - case flatbuf::MetadataVersion_V2: - // Arrow 0.2 - return MetadataVersion::V2; - case flatbuf::MetadataVersion_V3: - // Arrow 0.3 - return MetadataVersion::V3; - // Add cases as other versions become available - default: - return MetadataVersion::V3; - } + return internal::GetMetadataVersion(footer_->version()); } FileBlock record_batch(int i) const { @@ -730,14 +715,17 @@ Status ReadTensor(int64_t offset, io::RandomAccessFile* file, std::unique_ptr message; RETURN_NOT_OK(ReadContiguousPayload(file, &message)); + return ReadTensor(*message, out); +} +Status ReadTensor(const Message& message, std::shared_ptr* out) { std::shared_ptr type; std::vector shape; std::vector strides; std::vector dim_names; - RETURN_NOT_OK(internal::GetTensorMetadata(*message->metadata(), &type, &shape, &strides, + RETURN_NOT_OK(internal::GetTensorMetadata(*message.metadata(), &type, &shape, &strides, &dim_names)); - *out = std::make_shared(type, message->body(), shape, strides, dim_names); + *out = std::make_shared(type, message.body(), shape, strides, dim_names); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/reader.h b/cpp/src/arrow/ipc/reader.h index 7581fbda5b140..019c9bc1f32d8 100644 --- a/cpp/src/arrow/ipc/reader.h +++ b/cpp/src/arrow/ipc/reader.h @@ -24,13 +24,12 @@ #include #include "arrow/ipc/message.h" -#include "arrow/table.h" +#include "arrow/record_batch.h" #include "arrow/util/visibility.h" namespace arrow { class Buffer; -class RecordBatch; class Schema; class Status; class Tensor; @@ -220,7 +219,7 @@ Status 
ReadRecordBatch(const Buffer& metadata, const std::shared_ptr& sc int max_recursion_depth, io::RandomAccessFile* file, std::shared_ptr* out); -/// EXPERIMENTAL: Read arrow::Tensor as encapsulated IPC message in file +/// \brief EXPERIMENTAL: Read arrow::Tensor as encapsulated IPC message in file /// /// \param[in] offset the file location of the start of the message /// \param[in] file the file where the batch is located @@ -230,6 +229,14 @@ ARROW_EXPORT Status ReadTensor(int64_t offset, io::RandomAccessFile* file, std::shared_ptr* out); +/// \brief EXPERIMENTAL: Read arrow::Tensor from IPC message +/// +/// \param[in] message a Message containing the tensor metadata and body +/// \param[out] out the read tensor +/// \return Status +ARROW_EXPORT +Status ReadTensor(const Message& message, std::shared_ptr* out); + } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index b2137b7dbef6a..6f8a0dcc61fbc 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -30,8 +30,8 @@ #include "arrow/builder.h" #include "arrow/memory_pool.h" #include "arrow/pretty_print.h" +#include "arrow/record_batch.h" #include "arrow/status.h" -#include "arrow/table.h" #include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/util/bit-util.h" @@ -184,7 +184,7 @@ Status MakeBooleanBatchSized(const int length, std::shared_ptr* out std::shared_ptr a0, a1; RETURN_NOT_OK(MakeRandomBooleanArray(length, true, &a0)); RETURN_NOT_OK(MakeRandomBooleanArray(length, false, &a1)); - out->reset(new RecordBatch(schema, length, {a0, a1})); + *out = RecordBatch::Make(schema, length, {a0, a1}); return Status::OK(); } @@ -203,7 +203,7 @@ Status MakeIntBatchSized(int length, std::shared_ptr* out) { MemoryPool* pool = default_memory_pool(); RETURN_NOT_OK(MakeRandomInt32Array(length, false, pool, &a0)); RETURN_NOT_OK(MakeRandomInt32Array(length, true, pool, &a1)); - out->reset(new RecordBatch(schema, length, 
{a0, a1})); + *out = RecordBatch::Make(schema, length, {a0, a1}); return Status::OK(); } @@ -252,7 +252,7 @@ Status MakeStringTypesRecordBatch(std::shared_ptr* out) { auto s = MakeRandomBinaryArray(length, true, pool, &a1); RETURN_NOT_OK(s); } - out->reset(new RecordBatch(schema, length, {a0, a1})); + *out = RecordBatch::Make(schema, length, {a0, a1}); return Status::OK(); } @@ -261,7 +261,7 @@ Status MakeNullRecordBatch(std::shared_ptr* out) { auto f0 = field("f0", null()); auto schema = ::arrow::schema({f0}); std::shared_ptr a0 = std::make_shared(length); - out->reset(new RecordBatch(schema, length, {a0})); + *out = RecordBatch::Make(schema, length, {a0}); return Status::OK(); } @@ -284,7 +284,7 @@ Status MakeListRecordBatch(std::shared_ptr* out) { RETURN_NOT_OK( MakeRandomListArray(list_array, length, include_nulls, pool, &list_list_array)); RETURN_NOT_OK(MakeRandomInt32Array(length, include_nulls, pool, &flat_array)); - out->reset(new RecordBatch(schema, length, {list_array, list_list_array, flat_array})); + *out = RecordBatch::Make(schema, length, {list_array, list_list_array, flat_array}); return Status::OK(); } @@ -304,7 +304,7 @@ Status MakeZeroLengthRecordBatch(std::shared_ptr* out) { RETURN_NOT_OK( MakeRandomListArray(list_array, 0, include_nulls, pool, &list_list_array)); RETURN_NOT_OK(MakeRandomInt32Array(0, include_nulls, pool, &flat_array)); - out->reset(new RecordBatch(schema, 0, {list_array, list_list_array, flat_array})); + *out = RecordBatch::Make(schema, 0, {list_array, list_list_array, flat_array}); return Status::OK(); } @@ -327,7 +327,7 @@ Status MakeNonNullRecordBatch(std::shared_ptr* out) { RETURN_NOT_OK( MakeRandomListArray(list_array, length, include_nulls, pool, &list_list_array)); RETURN_NOT_OK(MakeRandomInt32Array(length, include_nulls, pool, &flat_array)); - out->reset(new RecordBatch(schema, length, {list_array, list_list_array, flat_array})); + *out = RecordBatch::Make(schema, length, {list_array, list_list_array, flat_array}); 
return Status::OK(); } @@ -347,7 +347,7 @@ Status MakeDeeplyNestedList(std::shared_ptr* out) { auto f0 = field("f0", type); auto schema = ::arrow::schema({f0}); std::vector> arrays = {array}; - out->reset(new RecordBatch(schema, batch_length, arrays)); + *out = RecordBatch::Make(schema, batch_length, arrays); return Status::OK(); } @@ -377,7 +377,7 @@ Status MakeStruct(std::shared_ptr* out) { // construct batch std::vector> arrays = {no_nulls, with_nulls}; - out->reset(new RecordBatch(schema, list_batch->num_rows(), arrays)); + *out = RecordBatch::Make(schema, list_batch->num_rows(), arrays); return Status::OK(); } @@ -445,7 +445,7 @@ Status MakeUnion(std::shared_ptr* out) { // construct batch std::vector> arrays = {sparse_no_nulls, sparse, dense}; - out->reset(new RecordBatch(schema, length, arrays)); + *out = RecordBatch::Make(schema, length, arrays); return Status::OK(); } @@ -526,7 +526,7 @@ Status MakeDictionary(std::shared_ptr* out) { std::vector> arrays = {a0, a1, a2, a3, a4}; - out->reset(new RecordBatch(schema, length, arrays)); + *out = RecordBatch::Make(schema, length, arrays); return Status::OK(); } @@ -564,7 +564,7 @@ Status MakeDictionaryFlat(std::shared_ptr* out) { {field("dict1", f0_type), field("sparse", f1_type), field("dense", f2_type)}); std::vector> arrays = {a0, a1, a2}; - out->reset(new RecordBatch(schema, length, arrays)); + *out = RecordBatch::Make(schema, length, arrays); return Status::OK(); } @@ -584,8 +584,7 @@ Status MakeDates(std::shared_ptr* out) { std::shared_ptr date64_array; ArrayFromVector(is_valid, date64_values, &date64_array); - std::vector> arrays = {date32_array, date64_array}; - *out = std::make_shared(schema, date32_array->length(), arrays); + *out = RecordBatch::Make(schema, date32_array->length(), {date32_array, date64_array}); return Status::OK(); } @@ -604,8 +603,7 @@ Status MakeTimestamps(std::shared_ptr* out) { ArrayFromVector(f1->type(), is_valid, ts_values, &a1); ArrayFromVector(f2->type(), is_valid, ts_values, 
&a2); - ArrayVector arrays = {a0, a1, a2}; - *out = std::make_shared(schema, a0->length(), arrays); + *out = RecordBatch::Make(schema, a0->length(), {a0, a1, a2}); return Status::OK(); } @@ -628,8 +626,7 @@ Status MakeTimes(std::shared_ptr* out) { ArrayFromVector(f2->type(), is_valid, t32_values, &a2); ArrayFromVector(f3->type(), is_valid, t64_values, &a3); - ArrayVector arrays = {a0, a1, a2, a3}; - *out = std::make_shared(schema, a0->length(), arrays); + *out = RecordBatch::Make(schema, a0->length(), {a0, a1, a2, a3}); return Status::OK(); } @@ -665,14 +662,16 @@ Status MakeFWBinary(std::shared_ptr* out) { RETURN_NOT_OK(b1.Finish(&a1)); RETURN_NOT_OK(b2.Finish(&a2)); - ArrayVector arrays = {a1, a2}; - *out = std::make_shared(schema, a1->length(), arrays); + *out = RecordBatch::Make(schema, a1->length(), {a1, a2}); return Status::OK(); } Status MakeDecimal(std::shared_ptr* out) { - auto f0 = field("f0", decimal(19, 4)); - auto schema = ::arrow::schema({f0, f0}); + constexpr int kDecimalPrecision = 38; + auto type = decimal(kDecimalPrecision, 4); + auto f0 = field("f0", type); + auto f1 = field("f1", type); + auto schema = ::arrow::schema({f0, f1}); constexpr int kDecimalSize = 16; constexpr int length = 10; @@ -682,18 +681,17 @@ Status MakeDecimal(std::shared_ptr* out) { RETURN_NOT_OK(AllocateBuffer(default_memory_pool(), kDecimalSize * length, &data)); - test::random_bytes(kDecimalSize * length, 0, data->mutable_data()); + test::random_decimals(length, 1, kDecimalPrecision, data->mutable_data()); test::random_null_bytes(length, 0.1, is_valid_bytes.data()); RETURN_NOT_OK(BitUtil::BytesToBits(is_valid_bytes, default_memory_pool(), &is_valid)); - auto a1 = std::make_shared(f0->type(), length, data, is_valid, - kUnknownNullCount); + auto a1 = std::make_shared(f0->type(), length, data, is_valid, + kUnknownNullCount); - auto a2 = std::make_shared(f0->type(), length, data); + auto a2 = std::make_shared(f1->type(), length, data); - ArrayVector arrays = {a1, a2}; - *out = 
std::make_shared(schema, a1->length(), arrays); + *out = RecordBatch::Make(schema, length, {a1, a2}); return Status::OK(); } @@ -713,8 +711,7 @@ Status MakeNull(std::shared_ptr* out) { std::shared_ptr a2; ArrayFromVector(f1->type(), is_valid, int_values, &a2); - ArrayVector arrays = {a1, a2}; - *out = std::make_shared(schema, a1->length(), arrays); + *out = RecordBatch::Make(schema, a1->length(), {a1, a2}); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 279a69544faf2..c6aa770127c88 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -32,6 +32,7 @@ #include "arrow/ipc/metadata-internal.h" #include "arrow/ipc/util.h" #include "arrow/memory_pool.h" +#include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/tensor.h" @@ -149,8 +150,6 @@ class RecordBatchSerializer : public ArrayVisitor { buffer_meta_.reserve(buffers_.size()); - const int32_t kNoPageId = -1; - // Construct the buffer metadata for the record batch header for (size_t i = 0; i < buffers_.size(); ++i) { const Buffer* buffer = buffers_[i].get(); @@ -163,15 +162,7 @@ class RecordBatchSerializer : public ArrayVisitor { padding = BitUtil::RoundUpToMultipleOf8(size) - size; } - // TODO(wesm): We currently have no notion of shared memory page id's, - // but we've included it in the metadata IDL for when we have it in the - // future. Use page = -1 for now - // - // Note that page ids are a bespoke notion for Arrow and not a feature we - // are using from any OS-level shared memory. 
The thought is that systems - // may (in the future) associate integer page id's with physical memory - // pages (according to whatever is the desired shared memory mechanism) - buffer_meta_.push_back({kNoPageId, offset, size + padding}); + buffer_meta_.push_back({offset, size + padding}); offset += size + padding; } @@ -349,7 +340,7 @@ class RecordBatchSerializer : public ArrayVisitor { VISIT_FIXED_WIDTH(Time32Array) VISIT_FIXED_WIDTH(Time64Array) VISIT_FIXED_WIDTH(FixedSizeBinaryArray) - VISIT_FIXED_WIDTH(DecimalArray) + VISIT_FIXED_WIDTH(Decimal128Array) #undef VISIT_FIXED_WIDTH @@ -518,12 +509,9 @@ class DictionaryWriter : public RecordBatchSerializer { dictionary_id_ = dictionary_id; // Make a dummy record batch. A bit tedious as we have to make a schema - std::vector> fields = { - arrow::field("dictionary", dictionary->type())}; - auto schema = std::make_shared(fields); - RecordBatch batch(schema, dictionary->length(), {dictionary}); - - return RecordBatchSerializer::Write(batch, dst, metadata_length, body_length); + auto schema = arrow::schema({arrow::field("dictionary", dictionary->type())}); + auto batch = RecordBatch::Make(schema, dictionary->length(), {dictionary}); + return RecordBatchSerializer::Write(*batch, dst, metadata_length, body_length); } private: @@ -572,9 +560,18 @@ Status WriteLargeRecordBatch(const RecordBatch& batch, int64_t buffer_start_offs pool, kMaxNestingDepth, true); } -static Status WriteStridedTensorData(int dim_index, int64_t offset, int elem_size, - const Tensor& tensor, uint8_t* scratch_space, - io::OutputStream* dst) { +namespace { + +Status WriteTensorHeader(const Tensor& tensor, io::OutputStream* dst, + int32_t* metadata_length, int64_t* body_length) { + std::shared_ptr metadata; + RETURN_NOT_OK(internal::WriteTensorMessage(tensor, 0, &metadata)); + return internal::WriteMessage(*metadata, dst, metadata_length); +} + +Status WriteStridedTensorData(int dim_index, int64_t offset, int elem_size, + const Tensor& tensor, uint8_t* 
scratch_space, + io::OutputStream* dst) { if (dim_index == tensor.ndim() - 1) { const uint8_t* data_ptr = tensor.raw_data() + offset; const int64_t stride = tensor.strides()[dim_index]; @@ -592,16 +589,37 @@ static Status WriteStridedTensorData(int dim_index, int64_t offset, int elem_siz return Status::OK(); } -Status WriteTensorHeader(const Tensor& tensor, io::OutputStream* dst, - int32_t* metadata_length, int64_t* body_length) { - RETURN_NOT_OK(AlignStreamPosition(dst)); - std::shared_ptr metadata; - RETURN_NOT_OK(internal::WriteTensorMessage(tensor, 0, &metadata)); - return internal::WriteMessage(*metadata, dst, metadata_length); +Status GetContiguousTensor(const Tensor& tensor, MemoryPool* pool, + std::unique_ptr* out) { + const auto& type = static_cast(*tensor.type()); + const int elem_size = type.bit_width() / 8; + + // TODO(wesm): Do we care enough about this temporary allocation to pass in + // a MemoryPool to this function? + std::shared_ptr scratch_space; + RETURN_NOT_OK(AllocateBuffer(default_memory_pool(), + tensor.shape()[tensor.ndim() - 1] * elem_size, + &scratch_space)); + + std::shared_ptr contiguous_data; + RETURN_NOT_OK( + AllocateResizableBuffer(pool, tensor.size() * elem_size, &contiguous_data)); + + io::BufferOutputStream stream(contiguous_data); + RETURN_NOT_OK(WriteStridedTensorData(0, 0, elem_size, tensor, + scratch_space->mutable_data(), &stream)); + + out->reset(new Tensor(tensor.type(), contiguous_data, tensor.shape())); + + return Status::OK(); } +} // namespace + Status WriteTensor(const Tensor& tensor, io::OutputStream* dst, int32_t* metadata_length, int64_t* body_length) { + RETURN_NOT_OK(AlignStreamPosition(dst)); + if (tensor.is_contiguous()) { RETURN_NOT_OK(WriteTensorHeader(tensor, dst, metadata_length, body_length)); auto data = tensor.data(); @@ -631,6 +649,22 @@ Status WriteTensor(const Tensor& tensor, io::OutputStream* dst, int32_t* metadat } } +Status GetTensorMessage(const Tensor& tensor, MemoryPool* pool, + std::unique_ptr* 
out) { + const Tensor* tensor_to_write = &tensor; + std::unique_ptr temp_tensor; + + if (!tensor.is_contiguous()) { + RETURN_NOT_OK(GetContiguousTensor(tensor, pool, &temp_tensor)); + tensor_to_write = temp_tensor.get(); + } + + std::shared_ptr metadata; + RETURN_NOT_OK(internal::WriteTensorMessage(*tensor_to_write, 0, &metadata)); + out->reset(new Message(metadata, tensor_to_write->data())); + return Status::OK(); +} + Status WriteDictionary(int64_t dictionary_id, const std::shared_ptr& dictionary, int64_t buffer_start_offset, io::OutputStream* dst, int32_t* metadata_length, int64_t* body_length, MemoryPool* pool) { @@ -663,9 +697,13 @@ Status GetTensorSize(const Tensor& tensor, int64_t* size) { RecordBatchWriter::~RecordBatchWriter() {} -Status RecordBatchWriter::WriteTable(const Table& table) { +Status RecordBatchWriter::WriteTable(const Table& table, int64_t max_chunksize) { TableBatchReader reader(table); + if (max_chunksize > 0) { + reader.set_chunksize(max_chunksize); + } + std::shared_ptr batch; while (true) { RETURN_NOT_OK(reader.ReadNext(&batch)); @@ -678,6 +716,8 @@ Status RecordBatchWriter::WriteTable(const Table& table) { return Status::OK(); } +Status RecordBatchWriter::WriteTable(const Table& table) { return WriteTable(table, -1); } + // ---------------------------------------------------------------------- // Stream writer implementation @@ -699,7 +739,7 @@ class StreamBookKeeper { } // Write data and update position - Status Write(const uint8_t* data, int64_t nbytes) { + Status Write(const void* data, int64_t nbytes) { RETURN_NOT_OK(sink_->Write(data, nbytes)); position_ += nbytes; return Status::OK(); @@ -788,7 +828,7 @@ class RecordBatchStreamWriter::RecordBatchStreamWriterImpl : public StreamBookKe // Write 0 EOS message const int32_t kEos = 0; - return Write(reinterpret_cast(&kEos), sizeof(int32_t)); + return Write(&kEos, sizeof(int32_t)); } Status CheckStarted() { @@ -876,8 +916,7 @@ class RecordBatchFileWriter::RecordBatchFileWriterImpl 
Status Start() override { // It is only necessary to align to 8-byte boundary at the start of the file - RETURN_NOT_OK(Write(reinterpret_cast(kArrowMagicBytes), - strlen(kArrowMagicBytes))); + RETURN_NOT_OK(Write(kArrowMagicBytes, strlen(kArrowMagicBytes))); RETURN_NOT_OK(Align()); // We write the schema at the start of the file (and the end). This also @@ -901,12 +940,10 @@ class RecordBatchFileWriter::RecordBatchFileWriterImpl return Status::Invalid("Invalid file footer"); } - RETURN_NOT_OK( - Write(reinterpret_cast(&footer_length), sizeof(int32_t))); + RETURN_NOT_OK(Write(&footer_length, sizeof(int32_t))); // Write magic bytes to end file - return Write(reinterpret_cast(kArrowMagicBytes), - strlen(kArrowMagicBytes)); + return Write(kArrowMagicBytes, strlen(kArrowMagicBytes)); } }; diff --git a/cpp/src/arrow/ipc/writer.h b/cpp/src/arrow/ipc/writer.h index cedac45e712d6..013783ee0a224 100644 --- a/cpp/src/arrow/ipc/writer.h +++ b/cpp/src/arrow/ipc/writer.h @@ -65,6 +65,12 @@ class ARROW_EXPORT RecordBatchWriter { /// \return Status Status WriteTable(const Table& table); + /// \brief Write Table with a particular chunksize + /// \param[in] table table to write + /// \param[in] max_chunksize maximum chunk size for table chunks + /// \return Status + Status WriteTable(const Table& table, int64_t max_chunksize); + /// \brief Perform any logic necessary to finish the stream /// /// \return Status @@ -239,6 +245,17 @@ Status GetRecordBatchSize(const RecordBatch& batch, int64_t* size); ARROW_EXPORT Status GetTensorSize(const Tensor& tensor, int64_t* size); +/// \brief EXPERIMENTAL: Convert arrow::Tensor to a Message with minimal memory +/// allocation +/// +/// \param[in] tensor the Tensor to write +/// \param[in] pool MemoryPool to allocate space for metadata +/// \param[out] out the resulting Message +/// \return Status +ARROW_EXPORT +Status GetTensorMessage(const Tensor& tensor, MemoryPool* pool, + std::unique_ptr* out); + /// \brief EXPERIMENTAL: Write arrow::Tensor 
as a contiguous message /// /// \param[in] tensor the Tensor to write diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc index 8b9a24fecd332..bf29d6a03cd93 100644 --- a/cpp/src/arrow/pretty_print-test.cc +++ b/cpp/src/arrow/pretty_print-test.cc @@ -107,14 +107,13 @@ TEST_F(TestPrettyPrint, FixedSizeBinaryType) { CheckArray(*array, 0, ex); } -TEST_F(TestPrettyPrint, DecimalType) { +TEST_F(TestPrettyPrint, Decimal128Type) { int32_t p = 19; int32_t s = 4; auto type = decimal(p, s); - DecimalBuilder builder(type); - + Decimal128Builder builder(type); Decimal128 val; ASSERT_OK(Decimal128::FromString("123.4567", &val)); diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index aaea34a51388c..bd5f8ce10ea68 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -22,8 +22,8 @@ #include "arrow/array.h" #include "arrow/pretty_print.h" +#include "arrow/record_batch.h" #include "arrow/status.h" -#include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/logging.h" @@ -170,7 +170,7 @@ class ArrayPrinter : public PrettyPrinter { } template - inline typename std::enable_if::value, void>::type + inline typename std::enable_if::value, void>::type WriteDataValues(const T& array) { for (int i = 0; i < array.length(); ++i) { if (i > 0) { diff --git a/cpp/src/arrow/public-api-test.cc b/cpp/src/arrow/public-api-test.cc new file mode 100644 index 0000000000000..8298d748fe8fd --- /dev/null +++ b/cpp/src/arrow/public-api-test.cc @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/api.h" + +#ifdef DCHECK +#error "DCHECK should not be visible from Arrow public headers." +#endif + +#include + +TEST(_, _) {} diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 7f1591213cec6..e21bbda055953 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -42,6 +42,8 @@ #include "arrow/util/parallel.h" #include "arrow/visitor_inline.h" +#include "arrow/compute/api.h" + #include "arrow/python/builtin_convert.h" #include "arrow/python/common.h" #include "arrow/python/config.h" @@ -57,6 +59,8 @@ namespace py { using internal::kPandasTimestampNull; using internal::kNanosecondsInDay; +using compute::Datum; + // ---------------------------------------------------------------------- // Utility code @@ -86,6 +90,7 @@ struct WrapBytes { static inline bool ListTypeSupported(const DataType& type) { switch (type.id()) { + case Type::NA: case Type::UINT8: case Type::INT8: case Type::UINT16: @@ -96,6 +101,7 @@ static inline bool ListTypeSupported(const DataType& type) { case Type::UINT64: case Type::FLOAT: case Type::DOUBLE: + case Type::BINARY: case Type::STRING: case Type::TIMESTAMP: // The above types are all supported. 
@@ -235,19 +241,14 @@ class PandasBlock { block_arr = PyArray_SimpleNewFromDescr(1, block_dims, descr); } - if (block_arr == NULL) { - // TODO(wesm): propagating Python exception - return Status::OK(); - } + RETURN_IF_PYERROR(); PyArray_ENABLEFLAGS(reinterpret_cast(block_arr), NPY_ARRAY_OWNDATA); npy_intp placement_dims[1] = {num_columns_}; PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64); - if (placement_arr == NULL) { - // TODO(wesm): propagating Python exception - return Status::OK(); - } + + RETURN_IF_PYERROR(); block_arr_.reset(block_arr); placement_arr_.reset(placement_arr); @@ -277,12 +278,19 @@ class PandasBlock { ARROW_DISALLOW_COPY_AND_ASSIGN(PandasBlock); }; +template +inline const T* GetPrimitiveValues(const Array& arr) { + const auto& prim_arr = static_cast(arr); + const T* raw_values = reinterpret_cast(prim_arr.values()->data()); + return raw_values + arr.offset(); +} + template inline void ConvertIntegerWithNulls(PandasOptions options, const ChunkedArray& data, double* out_values) { for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = static_cast(*data.chunk(c)); - auto in_values = reinterpret_cast(arr.raw_values()); + const auto& arr = *data.chunk(c); + const T* in_values = GetPrimitiveValues(arr); // Upcast to double, set NaN as appropriate for (int i = 0; i < arr.length(); ++i) { @@ -295,8 +303,8 @@ template inline void ConvertIntegerNoNullsSameType(PandasOptions options, const ChunkedArray& data, T* out_values) { for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = static_cast(*data.chunk(c)); - auto in_values = reinterpret_cast(arr.raw_values()); + const auto& arr = *data.chunk(c); + const T* in_values = GetPrimitiveValues(arr); memcpy(out_values, in_values, sizeof(T) * arr.length()); out_values += arr.length(); } @@ -306,8 +314,8 @@ template inline void ConvertIntegerNoNullsCast(PandasOptions options, const ChunkedArray& data, OutType* out_values) { for (int c = 0; c < data.num_chunks(); c++) { 
- const auto& arr = static_cast(*data.chunk(c)); - auto in_values = reinterpret_cast(arr.raw_values()); + const auto& arr = *data.chunk(c); + const InType* in_values = GetPrimitiveValues(arr); for (int64_t i = 0; i < arr.length(); ++i) { *out_values = in_values[i]; } @@ -318,14 +326,13 @@ static Status ConvertBooleanWithNulls(PandasOptions options, const ChunkedArray& PyObject** out_values) { PyAcquireGIL lock; for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto bool_arr = static_cast(arr.get()); + const auto& arr = static_cast(*data.chunk(c)); - for (int64_t i = 0; i < arr->length(); ++i) { - if (bool_arr->IsNull(i)) { + for (int64_t i = 0; i < arr.length(); ++i) { + if (arr.IsNull(i)) { Py_INCREF(Py_None); *out_values++ = Py_None; - } else if (bool_arr->Value(i)) { + } else if (arr.Value(i)) { // True Py_INCREF(Py_True); *out_values++ = Py_True; @@ -342,10 +349,9 @@ static Status ConvertBooleanWithNulls(PandasOptions options, const ChunkedArray& static void ConvertBooleanNoNulls(PandasOptions options, const ChunkedArray& data, uint8_t* out_values) { for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); - auto bool_arr = static_cast(arr.get()); - for (int64_t i = 0; i < arr->length(); ++i) { - *out_values++ = static_cast(bool_arr->Value(i)); + const auto& arr = static_cast(*data.chunk(c)); + for (int64_t i = 0; i < arr.length(); ++i) { + *out_values++ = static_cast(arr.Value(i)); } } } @@ -356,17 +362,17 @@ inline Status ConvertBinaryLike(PandasOptions options, const ChunkedArray& data, using ArrayType = typename TypeTraits::ArrayType; PyAcquireGIL lock; for (int c = 0; c < data.num_chunks(); c++) { - auto arr = static_cast(data.chunk(c).get()); + const auto& arr = static_cast(*data.chunk(c)); const uint8_t* data_ptr; int32_t length; const bool has_nulls = data.null_count() > 0; - for (int64_t i = 0; i < arr->length(); ++i) { - if (has_nulls && arr->IsNull(i)) { + for (int64_t i = 
0; i < arr.length(); ++i) { + if (has_nulls && arr.IsNull(i)) { Py_INCREF(Py_None); *out_values = Py_None; } else { - data_ptr = arr->GetValue(i, &length); + data_ptr = arr.GetValue(i, &length); *out_values = WrapBytes::Wrap(data_ptr, length); if (*out_values == nullptr) { PyErr_Clear(); @@ -476,7 +482,7 @@ inline Status ConvertStruct(PandasOptions options, const ChunkedArray& data, Py_INCREF(Py_None); field_value.reset(Py_None); } - // PyDict_SetItemString does not steal the value reference + // PyDict_SetItemString increments reference count auto setitem_result = PyDict_SetItemString(dict_item.obj(), name.c_str(), field_value.obj()); RETURN_IF_PYERROR(); @@ -525,13 +531,25 @@ inline Status ConvertListsLike(PandasOptions options, const std::shared_ptrvalue_offset(i) + chunk_offset); PyObject* end = PyLong_FromLongLong(arr->value_offset(i + 1) + chunk_offset); PyObject* slice = PySlice_New(start, end, NULL); + Py_XDECREF(start); + Py_XDECREF(end); + + if (ARROW_PREDICT_FALSE(slice == nullptr)) { + // Fall out of loop, will return from RETURN_IF_PYERROR + break; + } *out_values = PyObject_GetItem(numpy_array, slice); - Py_DECREF(start); - Py_DECREF(end); - Py_DECREF(slice); + + if (*out_values == nullptr) { + // Fall out of loop, will return from RETURN_IF_PYERROR + break; + } + + Py_XDECREF(slice); } ++out_values; } + RETURN_IF_PYERROR(); chunk_offset += arr->values()->length(); } @@ -543,14 +561,12 @@ inline Status ConvertListsLike(PandasOptions options, const std::shared_ptr inline void ConvertNumericNullable(const ChunkedArray& data, T na_value, T* out_values) { for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = static_cast(*data.chunk(c)); - auto in_values = reinterpret_cast(arr.raw_values()); - - const uint8_t* valid_bits = arr.null_bitmap_data(); + const auto& arr = *data.chunk(c); + const T* in_values = GetPrimitiveValues(arr); if (arr.null_count() > 0) { for (int64_t i = 0; i < arr.length(); ++i) { - *out_values++ = 
BitUtil::BitNotSet(valid_bits, i) ? na_value : in_values[i]; + *out_values++ = arr.IsNull(i) ? na_value : in_values[i]; } } else { memcpy(out_values, in_values, sizeof(T) * arr.length()); @@ -563,8 +579,8 @@ template inline void ConvertNumericNullableCast(const ChunkedArray& data, OutType na_value, OutType* out_values) { for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = static_cast(*data.chunk(c)); - auto in_values = reinterpret_cast(arr.raw_values()); + const auto& arr = *data.chunk(c); + const InType* in_values = GetPrimitiveValues(arr); for (int64_t i = 0; i < arr.length(); ++i) { *out_values++ = arr.IsNull(i) ? na_value : static_cast(in_values[i]); @@ -572,11 +588,11 @@ inline void ConvertNumericNullableCast(const ChunkedArray& data, OutType na_valu } } -template +template inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* out_values) { for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = static_cast(*data.chunk(c)); - auto in_values = reinterpret_cast(arr.raw_values()); + const auto& arr = *data.chunk(c); + const T* in_values = GetPrimitiveValues(arr); for (int64_t i = 0; i < arr.length(); ++i) { *out_values++ = arr.IsNull(i) ? 
kPandasTimestampNull @@ -616,14 +632,6 @@ static Status ConvertTimes(PandasOptions options, const ChunkedArray& data, return Status::OK(); } -static Status RawDecimalToString(const uint8_t* bytes, int precision, int scale, - std::string* result) { - DCHECK_NE(result, nullptr); - Decimal128 decimal(bytes); - *result = decimal.ToString(precision, scale); - return Status::OK(); -} - static Status ConvertDecimals(PandasOptions options, const ChunkedArray& data, PyObject** out_values) { PyAcquireGIL lock; @@ -634,20 +642,14 @@ static Status ConvertDecimals(PandasOptions options, const ChunkedArray& data, PyObject* Decimal = Decimal_ref.obj(); for (int c = 0; c < data.num_chunks(); c++) { - auto* arr(static_cast(data.chunk(c).get())); - auto type(std::dynamic_pointer_cast(arr->type())); - const int precision = type->precision(); - const int scale = type->scale(); + const auto& arr = static_cast(*data.chunk(c)); - for (int64_t i = 0; i < arr->length(); ++i) { - if (arr->IsNull(i)) { + for (int64_t i = 0; i < arr.length(); ++i) { + if (arr.IsNull(i)) { Py_INCREF(Py_None); *out_values++ = Py_None; } else { - const uint8_t* raw_value = arr->GetValue(i); - std::string decimal_string; - RETURN_NOT_OK(RawDecimalToString(raw_value, precision, scale, &decimal_string)); - *out_values++ = internal::DecimalFromString(Decimal, decimal_string); + *out_values++ = internal::DecimalFromString(Decimal, arr.FormatValue(i)); RETURN_IF_PYERROR(); } } @@ -694,6 +696,7 @@ class ObjectBlock : public PandasBlock { } else if (type == Type::LIST) { auto list_type = std::static_pointer_cast(col->type()); switch (list_type->value_type()->id()) { + CONVERTLISTSLIKE_CASE(FloatType, NA) CONVERTLISTSLIKE_CASE(UInt8Type, UINT8) CONVERTLISTSLIKE_CASE(Int8Type, INT8) CONVERTLISTSLIKE_CASE(UInt16Type, UINT16) @@ -705,6 +708,7 @@ class ObjectBlock : public PandasBlock { CONVERTLISTSLIKE_CASE(TimestampType, TIMESTAMP) CONVERTLISTSLIKE_CASE(FloatType, FLOAT) CONVERTLISTSLIKE_CASE(DoubleType, DOUBLE) + 
CONVERTLISTSLIKE_CASE(BinaryType, BINARY) CONVERTLISTSLIKE_CASE(StringType, STRING) CONVERTLISTSLIKE_CASE(ListType, LIST) default: { @@ -864,7 +868,7 @@ class BoolBlock : public PandasBlock { uint8_t* out_buffer = reinterpret_cast(block_data_) + rel_placement * num_rows_; - ConvertBooleanNoNulls(options_, *col->data().get(), out_buffer); + ConvertBooleanNoNulls(options_, *col->data(), out_buffer); placement_data_[rel_placement] = abs_placement; return Status::OK(); } @@ -892,7 +896,7 @@ class DatetimeBlock : public PandasBlock { int64_t* out_buffer = reinterpret_cast(block_data_) + rel_placement * num_rows_; - const ChunkedArray& data = *col.get()->data(); + const ChunkedArray& data = *col->data(); if (type == Type::DATE32) { // Convert from days since epoch to datetime64[ns] @@ -966,9 +970,10 @@ class CategoricalBlock : public PandasBlock { "CategoricalBlock allocation happens when calling Write"); } - template + template Status WriteIndices(const std::shared_ptr& col) { - using TRAITS = internal::arrow_traits; + using ArrayType = typename TypeTraits::ArrayType; + using TRAITS = internal::arrow_traits; using T = typename TRAITS::T; constexpr int npy_type = TRAITS::npy_type; @@ -977,10 +982,22 @@ class CategoricalBlock : public PandasBlock { // Sniff the first chunk const std::shared_ptr arr_first = data.chunk(0); const auto& dict_arr_first = static_cast(*arr_first); - const auto& indices_first = - static_cast(*dict_arr_first.indices()); + const auto& indices_first = static_cast(*dict_arr_first.indices()); + + auto CheckIndices = [](const ArrayType& arr, int64_t dict_length) { + const T* values = arr.raw_values(); + for (int64_t i = 0; i < arr.length(); ++i) { + if (arr.IsValid(i) && (values[i] < 0 || values[i] >= dict_length)) { + std::stringstream ss; + ss << "Out of bounds dictionary index: " << static_cast(values[i]); + return Status::Invalid(ss.str()); + } + } + return Status::OK(); + }; if (data.num_chunks() == 1 && indices_first.null_count() == 0) { + 
RETURN_NOT_OK(CheckIndices(indices_first, dict_arr_first.dictionary()->length())); RETURN_NOT_OK(AllocateNDArrayFromIndices(npy_type, indices_first)); } else { if (options_.zero_copy_only) { @@ -998,9 +1015,10 @@ class CategoricalBlock : public PandasBlock { const std::shared_ptr arr = data.chunk(c); const auto& dict_arr = static_cast(*arr); - const auto& indices = static_cast(*dict_arr.indices()); + const auto& indices = static_cast(*dict_arr.indices()); auto in_values = reinterpret_cast(indices.raw_values()); + RETURN_NOT_OK(CheckIndices(indices, dict_arr.dictionary()->length())); // Null is -1 in CategoricalBlock for (int i = 0; i < arr->length(); ++i) { *out_values++ = indices.IsNull(i) ? -1 : in_values[i]; @@ -1016,8 +1034,13 @@ class CategoricalBlock : public PandasBlock { std::shared_ptr converted_col; if (options_.strings_to_categorical && (col->type()->id() == Type::STRING || col->type()->id() == Type::BINARY)) { - RETURN_NOT_OK(EncodeColumnToDictionary(static_cast(*col), pool_, - &converted_col)); + compute::FunctionContext ctx(pool_); + + Datum out; + RETURN_NOT_OK(compute::DictionaryEncode(&ctx, Datum(col->data()), &out)); + DCHECK_EQ(out.kind(), Datum::CHUNKED_ARRAY); + converted_col = + std::make_shared(field(col->name(), out.type()), out.chunked_array()); } else { converted_col = col; } @@ -1026,16 +1049,16 @@ class CategoricalBlock : public PandasBlock { switch (dict_type.index_type()->id()) { case Type::INT8: - RETURN_NOT_OK(WriteIndices(converted_col)); + RETURN_NOT_OK(WriteIndices(converted_col)); break; case Type::INT16: - RETURN_NOT_OK(WriteIndices(converted_col)); + RETURN_NOT_OK(WriteIndices(converted_col)); break; case Type::INT32: - RETURN_NOT_OK(WriteIndices(converted_col)); + RETURN_NOT_OK(WriteIndices(converted_col)); break; case Type::INT64: - RETURN_NOT_OK(WriteIndices(converted_col)); + RETURN_NOT_OK(WriteIndices(converted_col)); break; default: { std::stringstream ss; @@ -1078,7 +1101,7 @@ class CategoricalBlock : public PandasBlock 
{ Status AllocateNDArrayFromIndices(int npy_type, const PrimitiveArray& indices) { npy_intp block_dims[1] = {num_rows_}; - auto in_values = reinterpret_cast(indices.raw_values()); + const T* in_values = GetPrimitiveValues(indices); void* data = const_cast(in_values); PyAcquireGIL lock; @@ -1091,13 +1114,11 @@ class CategoricalBlock : public PandasBlock { PyObject* block_arr = PyArray_NewFromDescr(&PyArray_Type, descr, 1, block_dims, nullptr, data, NPY_ARRAY_CARRAY, nullptr); + RETURN_IF_PYERROR(); npy_intp placement_dims[1] = {num_columns_}; PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64); - if (placement_arr == NULL) { - // TODO(wesm): propagating Python exception - return Status::OK(); - } + RETURN_IF_PYERROR(); block_arr_.reset(block_arr); placement_arr_.reset(placement_arr); @@ -1411,6 +1432,7 @@ class ArrowDeserializer { PyAcquireGIL lock; result_ = NewArray1DFromType(col_->type().get(), type, col_->length(), nullptr); + RETURN_IF_PYERROR(); arr_ = reinterpret_cast(result_); return Status::OK(); } @@ -1420,8 +1442,7 @@ class ArrowDeserializer { std::shared_ptr arr) { typedef typename internal::arrow_traits::T T; - const auto& prim_arr = static_cast(*arr); - auto in_values = reinterpret_cast(prim_arr.raw_values()); + const T* in_values = GetPrimitiveValues(*arr); // Zero-Copy. We can pass the data pointer directly to NumPy. void* data = const_cast(in_values); @@ -1519,8 +1540,8 @@ class ArrowDeserializer { constexpr int64_t kShift = traits::npy_shift; for (int c = 0; c < data_.num_chunks(); c++) { - const auto& arr = static_cast(*data_.chunk(c)); - auto in_values = reinterpret_cast(arr.raw_values()); + const auto& arr = *data_.chunk(c); + const c_type* in_values = GetPrimitiveValues(arr); for (int64_t i = 0; i < arr.length(); ++i) { *out_values++ = arr.IsNull(i) ? 
na_value : static_cast(in_values[i]) / kShift; @@ -1590,7 +1611,7 @@ class ArrowDeserializer { return VisitObjects(ConvertFixedSizeBinary); } - Status Visit(const DecimalType& type) { return VisitObjects(ConvertDecimals); } + Status Visit(const Decimal128Type& type) { return VisitObjects(ConvertDecimals); } Status Visit(const Time32Type& type) { return VisitObjects(ConvertTimes); } @@ -1635,8 +1656,9 @@ class ArrowDeserializer { CONVERTVALUES_LISTSLIKE_CASE(TimestampType, TIMESTAMP) CONVERTVALUES_LISTSLIKE_CASE(FloatType, FLOAT) CONVERTVALUES_LISTSLIKE_CASE(DoubleType, DOUBLE) + CONVERTVALUES_LISTSLIKE_CASE(BinaryType, BINARY) CONVERTVALUES_LISTSLIKE_CASE(StringType, STRING) - CONVERTVALUES_LISTSLIKE_CASE(DecimalType, DECIMAL) + CONVERTVALUES_LISTSLIKE_CASE(Decimal128Type, DECIMAL) CONVERTVALUES_LISTSLIKE_CASE(ListType, LIST) default: { std::stringstream ss; diff --git a/cpp/src/arrow/python/arrow_to_python.cc b/cpp/src/arrow/python/arrow_to_python.cc index 9686050b9676f..c060ab8bfd6db 100644 --- a/cpp/src/arrow/python/arrow_to_python.cc +++ b/cpp/src/arrow/python/arrow_to_python.cc @@ -29,15 +29,17 @@ #include "arrow/array.h" #include "arrow/io/interfaces.h" +#include "arrow/io/memory.h" #include "arrow/ipc/reader.h" +#include "arrow/table.h" +#include "arrow/util/logging.h" + #include "arrow/python/common.h" #include "arrow/python/helpers.h" #include "arrow/python/numpy_convert.h" #include "arrow/python/pyarrow.h" #include "arrow/python/python_to_arrow.h" #include "arrow/python/util/datetime.h" -#include "arrow/table.h" -#include "arrow/util/logging.h" namespace arrow { namespace py { @@ -282,9 +284,64 @@ Status DeserializeObject(PyObject* context, const SerializedPyObject& obj, PyObj PyObject** out) { PyAcquireGIL lock; PyDateTime_IMPORT; + import_pyarrow(); return DeserializeList(context, *obj.batch->column(0), 0, obj.batch->num_rows(), base, obj, out); } +Status GetSerializedFromComponents(int num_tensors, int num_buffers, PyObject* data, + SerializedPyObject* 
out) { + PyAcquireGIL gil; + const Py_ssize_t data_length = PyList_Size(data); + RETURN_IF_PYERROR(); + + const Py_ssize_t expected_data_length = 1 + num_tensors * 2 + num_buffers; + if (data_length != expected_data_length) { + return Status::Invalid("Invalid number of buffers in data"); + } + + auto GetBuffer = [&data](Py_ssize_t index, std::shared_ptr* out) { + PyObject* py_buf = PyList_GET_ITEM(data, index); + return unwrap_buffer(py_buf, out); + }; + + Py_ssize_t buffer_index = 0; + + // Read the union batch describing object structure + { + std::shared_ptr data_buffer; + RETURN_NOT_OK(GetBuffer(buffer_index++, &data_buffer)); + gil.release(); + io::BufferReader buf_reader(data_buffer); + std::shared_ptr reader; + RETURN_NOT_OK(ipc::RecordBatchStreamReader::Open(&buf_reader, &reader)); + RETURN_NOT_OK(reader->ReadNext(&out->batch)); + gil.acquire(); + } + + // Zero-copy reconstruct tensors + for (int i = 0; i < num_tensors; ++i) { + std::shared_ptr metadata; + std::shared_ptr body; + std::shared_ptr tensor; + RETURN_NOT_OK(GetBuffer(buffer_index++, &metadata)); + RETURN_NOT_OK(GetBuffer(buffer_index++, &body)); + + ipc::Message message(metadata, body); + + RETURN_NOT_OK(ReadTensor(message, &tensor)); + out->tensors.emplace_back(std::move(tensor)); + } + + // Unwrap and append buffers + for (int i = 0; i < num_buffers; ++i) { + std::shared_ptr buffer; + RETURN_NOT_OK(GetBuffer(buffer_index++, &buffer)); + out->buffers.emplace_back(std::move(buffer)); + } + + return Status::OK(); +} + } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/arrow_to_python.h b/cpp/src/arrow/python/arrow_to_python.h index 7509f30eb4e90..02a22f07d7e78 100644 --- a/cpp/src/arrow/python/arrow_to_python.h +++ b/cpp/src/arrow/python/arrow_to_python.h @@ -48,6 +48,19 @@ namespace py { ARROW_EXPORT Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out); +/// \brief Reconstruct SerializedPyObject from representation produced by +/// 
SerializedPyObject::GetComponents. +/// +/// \param[in] num_tensors number of tensors in the object +/// \param[in] num_buffers number of buffers in the object +/// \param[in] data a list containing pyarrow.Buffer instances. Must be 1 + +/// num_tensors * 2 + num_buffers in length +/// \param[out] out the reconstructed object +/// \return Status +ARROW_EXPORT +Status GetSerializedFromComponents(int num_tensors, int num_buffers, PyObject* data, + SerializedPyObject* out); + /// \brief Reconstruct Python object from Arrow-serialized representation /// \param[in] context Serialization context which contains custom serialization /// and deserialization callbacks. Can be any Python object with a diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index d52627ebfee12..cd88d557d4830 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -37,14 +37,6 @@ namespace arrow { namespace py { -static inline bool IsPyInteger(PyObject* obj) { -#if PYARROW_IS_PY2 - return PyLong_Check(obj) || PyInt_Check(obj); -#else - return PyLong_Check(obj); -#endif -} - Status InvalidConversion(PyObject* obj, const std::string& expected_types, std::ostream* out) { OwnedRef type(PyObject_Type(obj)); @@ -91,7 +83,7 @@ class ScalarVisitor { ++bool_count_; } else if (PyFloat_Check(obj)) { ++float_count_; - } else if (IsPyInteger(obj)) { + } else if (internal::IsPyInteger(obj)) { ++int_count_; } else if (PyDate_CheckExact(obj)) { ++date_count_; @@ -156,15 +148,14 @@ static constexpr int MAX_NESTING_LEVELS = 32; // SeqVisitor is used to infer the type. 
class SeqVisitor { public: - SeqVisitor() : max_nesting_level_(0), max_observed_level_(0) { - memset(nesting_histogram_, 0, MAX_NESTING_LEVELS * sizeof(int)); + SeqVisitor() : max_nesting_level_(0), max_observed_level_(0), nesting_histogram_() { + std::fill(nesting_histogram_, nesting_histogram_ + MAX_NESTING_LEVELS, 0); } // co-recursive with VisitElem Status Visit(PyObject* obj, int level = 0) { - if (level > max_nesting_level_) { - max_nesting_level_ = level; - } + max_nesting_level_ = std::max(max_nesting_level_, level); + // Loop through either a sequence or an iterator. if (PySequence_Check(obj)) { Py_ssize_t size = PySequence_Size(obj); @@ -173,18 +164,26 @@ class SeqVisitor { if (PyArray_Check(obj)) { auto array = reinterpret_cast(obj); auto ptr = reinterpret_cast(PyArray_GETPTR1(array, i)); + ref.reset(PyArray_GETITEM(array, ptr)); + RETURN_IF_PYERROR(); + RETURN_NOT_OK(VisitElem(ref, level)); } else { ref.reset(PySequence_GetItem(obj, i)); + RETURN_IF_PYERROR(); RETURN_NOT_OK(VisitElem(ref, level)); } } } else if (PyObject_HasAttrString(obj, "__iter__")) { - OwnedRef iter = OwnedRef(PyObject_GetIter(obj)); - PyObject* item; + OwnedRef iter(PyObject_GetIter(obj)); + RETURN_IF_PYERROR(); + + PyObject* item = NULLPTR; while ((item = PyIter_Next(iter.obj()))) { - OwnedRef ref = OwnedRef(item); + RETURN_IF_PYERROR(); + + OwnedRef ref(item); RETURN_NOT_OK(VisitElem(ref, level)); } } else { @@ -250,6 +249,7 @@ class SeqVisitor { // Visits a specific element (inner part of the loop). 
Status VisitElem(const OwnedRef& item_ref, int level) { + DCHECK_NE(item_ref.obj(), NULLPTR); if (PyList_Check(item_ref.obj())) { RETURN_NOT_OK(Visit(item_ref.obj(), level + 1)); } else if (PyDict_Check(item_ref.obj())) { @@ -331,7 +331,7 @@ class SeqConverter { virtual Status AppendData(PyObject* seq, int64_t size) = 0; - virtual ~SeqConverter() {} + virtual ~SeqConverter() = default; protected: ArrayBuilder* builder_; @@ -390,28 +390,26 @@ class TypedConverterVisitor : public TypedConverter { } return Status::OK(); } - - virtual Status AppendItem(const OwnedRef& item) = 0; }; class NullConverter : public TypedConverterVisitor { public: - inline Status AppendItem(const OwnedRef& item) { + Status AppendItem(const OwnedRef& item) { return Status::Invalid("NullConverter: passed non-None value"); } }; class BoolConverter : public TypedConverterVisitor { public: - inline Status AppendItem(const OwnedRef& item) { + Status AppendItem(const OwnedRef& item) { return typed_builder_->Append(item.obj() == Py_True); } }; class Int8Converter : public TypedConverterVisitor { public: - inline Status AppendItem(const OwnedRef& item) { - int64_t val = static_cast(PyLong_AsLongLong(item.obj())); + Status AppendItem(const OwnedRef& item) { + const auto val = static_cast(PyLong_AsLongLong(item.obj())); if (ARROW_PREDICT_FALSE(val > std::numeric_limits::max() || val < std::numeric_limits::min())) { @@ -426,8 +424,8 @@ class Int8Converter : public TypedConverterVisitor { class Int16Converter : public TypedConverterVisitor { public: - inline Status AppendItem(const OwnedRef& item) { - int64_t val = static_cast(PyLong_AsLongLong(item.obj())); + Status AppendItem(const OwnedRef& item) { + const auto val = static_cast(PyLong_AsLongLong(item.obj())); if (ARROW_PREDICT_FALSE(val > std::numeric_limits::max() || val < std::numeric_limits::min())) { @@ -442,8 +440,8 @@ class Int16Converter : public TypedConverterVisitor { public: - inline Status AppendItem(const OwnedRef& item) { - int64_t val = 
static_cast(PyLong_AsLongLong(item.obj())); + Status AppendItem(const OwnedRef& item) { + const auto val = static_cast(PyLong_AsLongLong(item.obj())); if (ARROW_PREDICT_FALSE(val > std::numeric_limits::max() || val < std::numeric_limits::min())) { @@ -458,8 +456,8 @@ class Int32Converter : public TypedConverterVisitor { public: - inline Status AppendItem(const OwnedRef& item) { - int64_t val = static_cast(PyLong_AsLongLong(item.obj())); + Status AppendItem(const OwnedRef& item) { + const auto val = static_cast(PyLong_AsLongLong(item.obj())); RETURN_IF_PYERROR(); return typed_builder_->Append(val); } @@ -467,61 +465,80 @@ class Int64Converter : public TypedConverterVisitor { public: - inline Status AppendItem(const OwnedRef& item) { - uint64_t val = static_cast(PyLong_AsLongLong(item.obj())); + Status AppendItem(const OwnedRef& item) { + const auto val = static_cast(PyLong_AsLongLong(item.obj())); + RETURN_IF_PYERROR(); if (ARROW_PREDICT_FALSE(val > std::numeric_limits::max())) { return Status::Invalid( "Cannot coerce values to array type that would " "lose data"); } - RETURN_IF_PYERROR(); return typed_builder_->Append(static_cast(val)); } }; class UInt16Converter : public TypedConverterVisitor { public: - inline Status AppendItem(const OwnedRef& item) { - uint64_t val = static_cast(PyLong_AsLongLong(item.obj())); + Status AppendItem(const OwnedRef& item) { + const auto val = static_cast(PyLong_AsLongLong(item.obj())); + RETURN_IF_PYERROR(); if (ARROW_PREDICT_FALSE(val > std::numeric_limits::max())) { return Status::Invalid( "Cannot coerce values to array type that would " "lose data"); } - RETURN_IF_PYERROR(); return typed_builder_->Append(static_cast(val)); } }; class UInt32Converter : public TypedConverterVisitor { public: - inline Status AppendItem(const OwnedRef& item) { - uint64_t val = static_cast(PyLong_AsLongLong(item.obj())); + Status AppendItem(const OwnedRef& item) { + const auto val = static_cast(PyLong_AsLongLong(item.obj())); + RETURN_IF_PYERROR(); if 
(ARROW_PREDICT_FALSE(val > std::numeric_limits::max())) { return Status::Invalid( "Cannot coerce values to array type that would " "lose data"); } - RETURN_IF_PYERROR(); return typed_builder_->Append(static_cast(val)); } }; class UInt64Converter : public TypedConverterVisitor { public: - inline Status AppendItem(const OwnedRef& item) { - int64_t val = static_cast(PyLong_AsLongLong(item.obj())); + Status AppendItem(const OwnedRef& item) { + const auto val = static_cast(PyLong_AsLongLong(item.obj())); RETURN_IF_PYERROR(); return typed_builder_->Append(val); } }; -class DateConverter : public TypedConverterVisitor { +class Date32Converter : public TypedConverterVisitor { public: - inline Status AppendItem(const OwnedRef& item) { + Status AppendItem(const OwnedRef& item) { + int32_t t; + if (PyDate_Check(item.obj())) { + auto pydate = reinterpret_cast(item.obj()); + t = static_cast(PyDate_to_s(pydate)); + } else { + const auto casted_val = static_cast(PyLong_AsLongLong(item.obj())); + RETURN_IF_PYERROR(); + if (casted_val > std::numeric_limits::max()) { + return Status::Invalid("Integer as date32 larger than INT32_MAX"); + } + t = static_cast(casted_val); + } + return typed_builder_->Append(t); + } +}; + +class Date64Converter : public TypedConverterVisitor { + public: + Status AppendItem(const OwnedRef& item) { int64_t t; if (PyDate_Check(item.obj())) { auto pydate = reinterpret_cast(item.obj()); @@ -535,11 +552,11 @@ class DateConverter : public TypedConverterVisitor }; class TimestampConverter - : public TypedConverterVisitor { + : public TypedConverterVisitor { public: explicit TimestampConverter(TimeUnit::type unit) : unit_(unit) {} - inline Status AppendItem(const OwnedRef& item) { + Status AppendItem(const OwnedRef& item) { int64_t t; if (PyDateTime_Check(item.obj())) { auto pydatetime = reinterpret_cast(item.obj()); @@ -571,7 +588,7 @@ class TimestampConverter class DoubleConverter : public TypedConverterVisitor { public: - inline Status AppendItem(const 
OwnedRef& item) { + Status AppendItem(const OwnedRef& item) { double val = PyFloat_AsDouble(item.obj()); RETURN_IF_PYERROR(); return typed_builder_->Append(val); @@ -580,7 +597,7 @@ class DoubleConverter : public TypedConverterVisitor { public: - inline Status AppendItem(const OwnedRef& item) { + Status AppendItem(const OwnedRef& item) { PyObject* bytes_obj; const char* bytes; Py_ssize_t length; @@ -608,7 +625,7 @@ class BytesConverter : public TypedConverterVisitor { public: - inline Status AppendItem(const OwnedRef& item) { + Status AppendItem(const OwnedRef& item) { PyObject* bytes_obj; OwnedRef tmp; Py_ssize_t expected_length = @@ -635,7 +652,7 @@ class FixedWidthBytesConverter class UTF8Converter : public TypedConverterVisitor { public: - inline Status AppendItem(const OwnedRef& item) { + Status AppendItem(const OwnedRef& item) { PyObject* bytes_obj; OwnedRef tmp; const char* bytes; @@ -648,7 +665,8 @@ class UTF8Converter : public TypedConverterVisitor RETURN_IF_PYERROR(); bytes_obj = obj; } else if (!PyUnicode_Check(obj)) { - PyObjectStringify stringified(obj); + OwnedRef repr(PyObject_Repr(obj)); + PyObjectStringify stringified(repr.obj()); std::stringstream ss; ss << "Non bytes/unicode value encountered: " << stringified.bytes; return Status::Invalid(ss.str()); @@ -669,10 +687,10 @@ class ListConverter : public TypedConverterVisitor { public: Status Init(ArrayBuilder* builder) override; - inline Status AppendItem(const OwnedRef& item) override { + Status AppendItem(const OwnedRef& item) { RETURN_NOT_OK(typed_builder_->Append()); PyObject* item_obj = item.obj(); - int64_t list_size = static_cast(PySequence_Size(item_obj)); + const auto list_size = static_cast(PySequence_Size(item_obj)); return value_converter_->AppendData(item_obj, list_size); } @@ -681,15 +699,13 @@ class ListConverter : public TypedConverterVisitor { }; class DecimalConverter - : public TypedConverterVisitor { + : public TypedConverterVisitor { public: - inline Status AppendItem(const 
OwnedRef& item) { + Status AppendItem(const OwnedRef& item) { /// TODO(phillipc): Check for nan? - std::string string; - RETURN_NOT_OK(internal::PythonDecimalToString(item.obj(), &string)); - Decimal128 value; - RETURN_NOT_OK(Decimal128::FromString(string, &value)); + const auto& type = static_cast(*typed_builder_->type()); + RETURN_NOT_OK(internal::DecimalFromPythonDecimal(item.obj(), type, &value)); return typed_builder_->Append(value); } }; @@ -717,8 +733,10 @@ std::shared_ptr GetConverter(const std::shared_ptr& type return std::make_shared(); case Type::UINT64: return std::make_shared(); + case Type::DATE32: + return std::make_shared(); case Type::DATE64: - return std::make_shared(); + return std::make_shared(); case Type::TIMESTAMP: return std::make_shared( static_cast(*type).unit()); diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc index 708d9916f46d3..494f929004ae9 100644 --- a/cpp/src/arrow/python/helpers.cc +++ b/cpp/src/arrow/python/helpers.cc @@ -15,8 +15,10 @@ // specific language governing permissions and limitations // under the License. 
-#include "arrow/python/helpers.h" +#include + #include "arrow/python/common.h" +#include "arrow/python/helpers.h" #include "arrow/util/decimal.h" #include "arrow/util/logging.h" @@ -91,20 +93,33 @@ Status PythonDecimalToString(PyObject* python_decimal, std::string* out) { return Status::OK(); } -Status InferDecimalPrecisionAndScale(PyObject* python_decimal, int* precision, - int* scale) { - // Call Python's str(decimal_object) - OwnedRef str_obj(PyObject_Str(python_decimal)); +Status InferDecimalPrecisionAndScale(PyObject* python_decimal, int32_t* precision, + int32_t* scale) { + DCHECK_NE(python_decimal, NULLPTR); + DCHECK_NE(precision, NULLPTR); + DCHECK_NE(scale, NULLPTR); + + OwnedRef as_tuple(PyObject_CallMethod(python_decimal, "as_tuple", "()")); RETURN_IF_PYERROR(); - PyObjectStringify str(str_obj.obj()); + DCHECK(PyTuple_Check(as_tuple.obj())); - const char* bytes = str.bytes; - DCHECK_NE(bytes, nullptr); + OwnedRef digits(PyObject_GetAttrString(as_tuple.obj(), "digits")); + RETURN_IF_PYERROR(); + DCHECK(PyTuple_Check(digits.obj())); - auto size = str.size; + const auto num_digits = static_cast(PyTuple_Size(digits.obj())); + RETURN_IF_PYERROR(); - std::string c_string(bytes, size); - return Decimal128::FromString(c_string, nullptr, precision, scale); + OwnedRef py_exponent(PyObject_GetAttrString(as_tuple.obj(), "exponent")); + RETURN_IF_PYERROR(); + DCHECK(IsPyInteger(py_exponent.obj())); + + const auto exponent = static_cast(PyLong_AsLong(py_exponent.obj())); + RETURN_IF_PYERROR(); + + *precision = num_digits; + *scale = -exponent; + return Status::OK(); } PyObject* DecimalFromString(PyObject* decimal_constructor, @@ -121,6 +136,46 @@ PyObject* DecimalFromString(PyObject* decimal_constructor, string_size); } +Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, + Decimal128* out) { + DCHECK_NE(python_decimal, NULLPTR); + DCHECK_NE(out, NULLPTR); + + std::string string; + 
RETURN_NOT_OK(PythonDecimalToString(python_decimal, &string)); + + int32_t inferred_precision; + int32_t inferred_scale; + + RETURN_NOT_OK( + Decimal128::FromString(string, out, &inferred_precision, &inferred_scale)); + + const int32_t precision = arrow_type.precision(); + const int32_t scale = arrow_type.scale(); + + if (ARROW_PREDICT_FALSE(inferred_precision > precision)) { + std::stringstream buf; + buf << "Decimal type with precision " << inferred_precision + << " does not fit into precision inferred from first array element: " + << precision; + return Status::Invalid(buf.str()); + } + + if (scale != inferred_scale) { + DCHECK_NE(out, NULLPTR); + RETURN_NOT_OK(out->Rescale(inferred_scale, scale, out)); + } + return Status::OK(); +} + +bool IsPyInteger(PyObject* obj) { +#if PYARROW_IS_PY2 + return PyLong_Check(obj) || PyInt_Check(obj); +#else + return PyLong_Check(obj); +#endif +} + } // namespace internal } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h index 719ed796e08a1..c82bdabc47614 100644 --- a/cpp/src/arrow/python/helpers.h +++ b/cpp/src/arrow/python/helpers.h @@ -29,6 +29,9 @@ #include "arrow/util/visibility.h" namespace arrow { + +class Decimal128; + namespace py { class OwnedRef; @@ -44,11 +47,15 @@ Status ImportFromModule(const OwnedRef& module, const std::string& module_name, Status PythonDecimalToString(PyObject* python_decimal, std::string* out); -Status InferDecimalPrecisionAndScale(PyObject* python_decimal, int* precision = NULLPTR, - int* scale = NULLPTR); +Status InferDecimalPrecisionAndScale(PyObject* python_decimal, + int32_t* precision = NULLPTR, + int32_t* scale = NULLPTR); PyObject* DecimalFromString(PyObject* decimal_constructor, const std::string& decimal_string); +Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, + Decimal128* out); +bool IsPyInteger(PyObject* obj); } // namespace internal } // namespace py diff --git 
a/cpp/src/arrow/python/io.cc b/cpp/src/arrow/python/io.cc index b01358ab00b4b..cc3892928c455 100644 --- a/cpp/src/arrow/python/io.cc +++ b/cpp/src/arrow/python/io.cc @@ -76,7 +76,7 @@ class PythonFile { return Status::OK(); } - Status Write(const uint8_t* data, int64_t nbytes) { + Status Write(const void* data, int64_t nbytes) { PyObject* py_data = PyBytes_FromStringAndSize(reinterpret_cast(data), nbytes); PY_RETURN_IF_ERROR(StatusCode::IOError); @@ -130,7 +130,7 @@ Status PyReadableFile::Tell(int64_t* position) const { return file_->Tell(position); } -Status PyReadableFile::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) { +Status PyReadableFile::Read(int64_t nbytes, int64_t* bytes_read, void* out) { PyAcquireGIL lock; PyObject* bytes_obj; ARROW_RETURN_NOT_OK(file_->Read(nbytes, &bytes_obj)); @@ -155,7 +155,7 @@ Status PyReadableFile::Read(int64_t nbytes, std::shared_ptr* out) { } Status PyReadableFile::ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, - uint8_t* out) { + void* out) { std::lock_guard guard(file_->lock()); RETURN_NOT_OK(Seek(position)); return Read(nbytes, bytes_read, out); @@ -208,7 +208,7 @@ Status PyOutputStream::Tell(int64_t* position) const { return Status::OK(); } -Status PyOutputStream::Write(const uint8_t* data, int64_t nbytes) { +Status PyOutputStream::Write(const void* data, int64_t nbytes) { PyAcquireGIL lock; position_ += nbytes; return file_->Write(data, nbytes); diff --git a/cpp/src/arrow/python/io.h b/cpp/src/arrow/python/io.h index bf5db5313a9db..f550de7b2848c 100644 --- a/cpp/src/arrow/python/io.h +++ b/cpp/src/arrow/python/io.h @@ -41,12 +41,12 @@ class ARROW_EXPORT PyReadableFile : public io::RandomAccessFile { Status Close() override; - Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) override; + Status Read(int64_t nbytes, int64_t* bytes_read, void* out) override; Status Read(int64_t nbytes, std::shared_ptr* out) override; // Thread-safe version Status ReadAt(int64_t position, int64_t nbytes, 
int64_t* bytes_read, - uint8_t* out) override; + void* out) override; // Thread-safe version Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; @@ -70,7 +70,7 @@ class ARROW_EXPORT PyOutputStream : public io::OutputStream { Status Close() override; Status Tell(int64_t* position) const override; - Status Write(const uint8_t* data, int64_t nbytes) override; + Status Write(const void* data, int64_t nbytes) override; private: std::unique_ptr file_; diff --git a/cpp/src/arrow/python/numpy-internal.h b/cpp/src/arrow/python/numpy-internal.h index db34d24d99da5..6c9c871a100a2 100644 --- a/cpp/src/arrow/python/numpy-internal.h +++ b/cpp/src/arrow/python/numpy-internal.h @@ -56,8 +56,8 @@ class Ndarray1DIndexer { bool is_strided() const { return stride_ == 1; } - T& operator[](size_type index) { return *(data_ + index * stride_); } - T& operator[](size_type index) const { return *(data_ + index * stride_); } + T& operator[](size_type index) { return data_[index * stride_]; } + T& operator[](size_type index) const { return data_[index * stride_]; } private: PyArrayObject* arr_; diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index ead3a04810121..f21b40ed3c246 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -42,8 +42,8 @@ #include "arrow/util/macros.h" #include "arrow/visitor_inline.h" -#include "arrow/compute/cast.h" #include "arrow/compute/context.h" +#include "arrow/compute/kernels/cast.h" #include "arrow/python/builtin_convert.h" #include "arrow/python/common.h" @@ -120,6 +120,7 @@ int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) { for (int i = 0; i < length; ++i) { if (mask_values[i]) { ++null_count; + BitUtil::ClearBit(bitmap, i); } else { BitUtil::SetBit(bitmap, i); } @@ -145,6 +146,52 @@ Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) { } // namespace +/// Append as many string objects from NumPy 
arrays to a `StringBuilder` as we +/// can fit +/// +/// \param[in] offset starting offset for appending +/// \param[out] end_offset ending offset where we stopped appending. Will +/// be length of arr if fully consumed +/// \param[out] have_bytes true if we encountered any PyBytes object +static Status AppendObjectBinaries(PyArrayObject* arr, PyArrayObject* mask, + int64_t offset, BinaryBuilder* builder, + int64_t* end_offset, bool* have_bytes) { + PyObject* obj; + + Ndarray1DIndexer objects(arr); + Ndarray1DIndexer mask_values; + + bool have_mask = false; + if (mask != nullptr) { + mask_values.Init(mask); + have_mask = true; + } + + for (; offset < objects.size(); ++offset) { + OwnedRef tmp_obj; + obj = objects[offset]; + if ((have_mask && mask_values[offset]) || PandasObjectIsNull(obj)) { + RETURN_NOT_OK(builder->AppendNull()); + continue; + } else if (!PyBytes_Check(obj)) { + std::stringstream ss; + ss << "Error converting to Python objects to bytes: "; + RETURN_NOT_OK(InvalidConversion(obj, "str, bytes", &ss)); + return Status::Invalid(ss.str()); + } + + const int32_t length = static_cast(PyBytes_GET_SIZE(obj)); + if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > kBinaryMemoryLimit)) { + break; + } + RETURN_NOT_OK(builder->Append(PyBytes_AS_STRING(obj), length)); + } + + // If we consumed the whole array, this will be the length of arr + *end_offset = offset; + return Status::OK(); +} + /// Append as many string objects from NumPy arrays to a `StringBuilder` as we /// can fit /// @@ -260,6 +307,7 @@ class NumPyConverter { : pool_(pool), type_(type), arr_(reinterpret_cast(ao)), + dtype_(PyArray_DESCR(arr_)), mask_(nullptr), use_pandas_null_sentinels_(use_pandas_null_sentinels) { if (mo != nullptr && mo != Py_None) { @@ -304,7 +352,7 @@ class NumPyConverter { return TypeNotImplemented(type.ToString()); } - Status Visit(const DecimalType& type) { return TypeNotImplemented(type.ToString()); } + Status Visit(const Decimal128Type& type) { return 
TypeNotImplemented(type.ToString()); } Status Visit(const DictionaryType& type) { return TypeNotImplemented(type.ToString()); } @@ -373,7 +421,11 @@ class NumPyConverter { using traits = internal::arrow_traits; const bool null_sentinels_possible = - (use_pandas_null_sentinels_ && traits::supports_nulls); + // NumPy has a NaT type + (ArrowType::type_id == Type::TIMESTAMP || ArrowType::type_id == Type::DATE32) || + + // Observing pandas's null sentinels + ((use_pandas_null_sentinels_ && traits::supports_nulls)); if (mask_ != nullptr || null_sentinels_possible) { RETURN_NOT_OK(InitNullBitmap()); @@ -391,9 +443,7 @@ class NumPyConverter { null_count = ValuesToBitmap(arr_, null_bitmap_data_); } - BufferVector buffers = {null_bitmap_, data}; - auto arr_data = - std::make_shared(type_, length_, std::move(buffers), null_count, 0); + auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count, 0); return PushArray(arr_data); } @@ -431,6 +481,7 @@ class NumPyConverter { MemoryPool* pool_; std::shared_ptr type_; PyArrayObject* arr_; + PyArray_Descr* dtype_; PyArrayObject* mask_; int64_t length_; int64_t stride_; @@ -450,7 +501,7 @@ Status NumPyConverter::Convert() { return Status::Invalid("only handle 1-dimensional arrays"); } - if (PyArray_DESCR(arr_)->type_num == NPY_OBJECT) { + if (dtype_->type_num == NPY_OBJECT) { return ConvertObjects(); } @@ -462,36 +513,15 @@ Status NumPyConverter::Convert() { return VisitTypeInline(*type_, this); } -template -void CopyStrided(T* input_data, int64_t length, int64_t stride, T2* output_data) { - // Passing input_data as non-const is a concession to PyObject* - int64_t j = 0; - for (int64_t i = 0; i < length; ++i) { - output_data[i] = static_cast(input_data[j]); - j += stride; - } -} - -template <> -void CopyStrided(PyObject** input_data, int64_t length, - int64_t stride, PyObject** output_data) { - int64_t j = 0; - for (int64_t i = 0; i < length; ++i) { - output_data[i] = input_data[j]; - if (output_data[i] != 
nullptr) { - Py_INCREF(output_data[i]); - } - j += stride; - } -} +namespace { -static Status CastBuffer(const std::shared_ptr& input, const int64_t length, - const std::shared_ptr& in_type, - const std::shared_ptr& out_type, MemoryPool* pool, - std::shared_ptr* out) { +Status CastBuffer(const std::shared_ptr& in_type, + const std::shared_ptr& input, const int64_t length, + const std::shared_ptr& valid_bitmap, const int64_t null_count, + const std::shared_ptr& out_type, MemoryPool* pool, + std::shared_ptr* out) { // Must cast - std::vector> buffers = {nullptr, input}; - auto tmp_data = std::make_shared(in_type, length, buffers, 0); + auto tmp_data = ArrayData::Make(in_type, length, {valid_bitmap, input}, null_count); std::shared_ptr tmp_array = MakeArray(tmp_data); std::shared_ptr casted_array; @@ -499,6 +529,7 @@ static Status CastBuffer(const std::shared_ptr& input, const int64_t len compute::FunctionContext context(pool); compute::CastOptions cast_options; cast_options.allow_int_overflow = false; + cast_options.allow_time_truncate = false; RETURN_NOT_OK( compute::Cast(&context, *tmp_array, out_type, cast_options, &casted_array)); @@ -506,71 +537,65 @@ static Status CastBuffer(const std::shared_ptr& input, const int64_t len return Status::OK(); } +template +Status StaticCastBuffer(const Buffer& input, const int64_t length, MemoryPool* pool, + std::shared_ptr* out) { + auto result = std::make_shared(pool); + RETURN_NOT_OK(result->Resize(sizeof(ToType) * length)); + + auto in_values = reinterpret_cast(input.data()); + auto out_values = reinterpret_cast(result->mutable_data()); + for (int64_t i = 0; i < length; ++i) { + *out_values++ = static_cast(*in_values++); + } + *out = result; + return Status::OK(); +} + +template +void CopyStrided(T* input_data, int64_t length, int64_t stride, T2* output_data) { + // Passing input_data as non-const is a concession to PyObject* + int64_t j = 0; + for (int64_t i = 0; i < length; ++i) { + output_data[i] = 
static_cast(input_data[j]); + j += stride; + } +} + template -inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { +Status CopyStridedArray(PyArrayObject* arr, const int64_t length, MemoryPool* pool, + std::shared_ptr* out) { using traits = internal::arrow_traits; using T = typename traits::T; + // Strided, must copy into new contiguous memory + const int64_t stride = PyArray_STRIDES(arr)[0]; + const int64_t stride_elements = stride / sizeof(T); + + auto new_buffer = std::make_shared(pool); + RETURN_NOT_OK(new_buffer->Resize(sizeof(T) * length)); + CopyStrided(reinterpret_cast(PyArray_DATA(arr)), length, stride_elements, + reinterpret_cast(new_buffer->mutable_data())); + *out = new_buffer; + return Status::OK(); +} + +} // namespace + +template +inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { if (is_strided()) { - // Strided, must copy into new contiguous memory - const int64_t stride = PyArray_STRIDES(arr_)[0]; - const int64_t stride_elements = stride / sizeof(T); - - auto new_buffer = std::make_shared(pool_); - RETURN_NOT_OK(new_buffer->Resize(sizeof(T) * length_)); - CopyStrided(reinterpret_cast(PyArray_DATA(arr_)), length_, stride_elements, - reinterpret_cast(new_buffer->mutable_data())); - *data = new_buffer; + RETURN_NOT_OK(CopyStridedArray(arr_, length_, pool_, data)); } else { // Can zero-copy *data = std::make_shared(reinterpret_cast(arr_)); } std::shared_ptr input_type; - RETURN_NOT_OK( - NumPyDtypeToArrow(reinterpret_cast(PyArray_DESCR(arr_)), &input_type)); + RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); if (!input_type->Equals(*type_)) { - RETURN_NOT_OK(CastBuffer(*data, length_, input_type, type_, pool_, data)); - } - - return Status::OK(); -} - -template <> -inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { - // Handle LONGLONG->INT64 and other fun things - int type_num_compat = cast_npy_type_compat(PyArray_DESCR(arr_)->type_num); - int type_size = 
NumPyTypeSize(type_num_compat); - - if (type_size == 4) { - // Source and target are INT32, so can refer to the main implementation. - return ConvertData(data); - } else if (type_size == 8) { - // We need to scale down from int64 to int32 - auto new_buffer = std::make_shared(pool_); - RETURN_NOT_OK(new_buffer->Resize(sizeof(int32_t) * length_)); - - auto input = reinterpret_cast(PyArray_DATA(arr_)); - auto output = reinterpret_cast(new_buffer->mutable_data()); - - if (is_strided()) { - // Strided, must copy into new contiguous memory - const int64_t stride = PyArray_STRIDES(arr_)[0]; - const int64_t stride_elements = stride / sizeof(int64_t); - CopyStrided(input, length_, stride_elements, output); - } else { - // TODO(wesm): int32 overflow checks - for (int64_t i = 0; i < length_; ++i) { - *output++ = static_cast(*input++); - } - } - *data = new_buffer; - } else { - std::stringstream ss; - ss << "Cannot convert NumPy array of element size "; - ss << type_size << " to a Date32 array"; - return Status::NotImplemented(ss.str()); + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, pool_, data)); } return Status::OK(); @@ -597,6 +622,48 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* return Status::OK(); } +template <> +inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { + if (is_strided()) { + RETURN_NOT_OK(CopyStridedArray(arr_, length_, pool_, data)); + } else { + // Can zero-copy + *data = std::make_shared(reinterpret_cast(arr_)); + } + + std::shared_ptr input_type; + + auto date_dtype = reinterpret_cast(dtype_->c_metadata); + if (dtype_->type_num == NPY_DATETIME) { + // If we have inbound datetime64[D] data, this needs to be downcasted + // separately here from int64_t to int32_t, because this data is not + // supported in compute::Cast + if (date_dtype->meta.base == NPY_FR_D) { + // TODO(wesm): How pedantic do we really want to be about checking for int32 + // overflow here? 
+ Status s = StaticCastBuffer(**data, length_, pool_, data); + RETURN_NOT_OK(s); + } else { + // TODO(wesm): This is redundant, and recomputed in VisitNative() + const int64_t null_count = ValuesToBitmap(arr_, null_bitmap_data_); + + RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); + if (!input_type->Equals(*type_)) { + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count, + type_, pool_, data)); + } + } + } else { + RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); + if (!input_type->Equals(*type_)) { + RETURN_NOT_OK( + CastBuffer(input_type, *data, length_, nullptr, 0, type_, pool_, data)); + } + } + + return Status::OK(); +} + template struct UnboxDate {}; @@ -667,24 +734,41 @@ Status NumPyConverter::ConvertDecimals() { Ndarray1DIndexer objects(arr_); PyObject* object = objects[0]; - int precision; - int scale; + if (type_ == NULLPTR) { + int32_t precision; + int32_t desired_scale; + + int32_t tmp_precision; + int32_t tmp_scale; - RETURN_NOT_OK(internal::InferDecimalPrecisionAndScale(object, &precision, &scale)); + RETURN_NOT_OK( + internal::InferDecimalPrecisionAndScale(objects[0], &precision, &desired_scale)); + + for (int64_t i = 1; i < length_; ++i) { + RETURN_NOT_OK(internal::InferDecimalPrecisionAndScale(objects[i], &tmp_precision, + &tmp_scale)); + precision = std::max(precision, tmp_precision); + + if (std::abs(desired_scale) < std::abs(tmp_scale)) { + desired_scale = tmp_scale; + } + } - type_ = std::make_shared(precision, scale); + type_ = ::arrow::decimal(precision, desired_scale); + } - DecimalBuilder builder(type_, pool_); + Decimal128Builder builder(type_, pool_); RETURN_NOT_OK(builder.Resize(length_)); + const auto& decimal_type = static_cast(*type_); + PyObject* Decimal_type_object = Decimal.obj(); + for (int64_t i = 0; i < length_; ++i) { object = objects[i]; - if (PyObject_IsInstance(object, Decimal.obj())) { - std::string string; - 
RETURN_NOT_OK(internal::PythonDecimalToString(object, &string)); + if (PyObject_IsInstance(object, Decimal_type_object)) { Decimal128 value; - RETURN_NOT_OK(Decimal128::FromString(string, &value)); + RETURN_NOT_OK(internal::DecimalFromPythonDecimal(object, decimal_type, &value)); RETURN_NOT_OK(builder.Append(value)); } else if (PandasObjectIsNull(object)) { RETURN_NOT_OK(builder.AppendNull()); @@ -709,7 +793,7 @@ Status NumPyConverter::ConvertTimes() { Time64Builder builder(::arrow::time64(TimeUnit::MICRO), pool_); RETURN_NOT_OK(builder.Resize(length_)); - PyObject* obj; + PyObject* obj = NULLPTR; for (int64_t i = 0; i < length_; ++i) { obj = objects[i]; if (PyTime_Check(obj)) { @@ -750,7 +834,7 @@ Status NumPyConverter::ConvertObjectStrings() { // If we saw PyBytes, convert everything to BinaryArray if (global_have_bytes) { for (size_t i = 0; i < out_arrays_.size(); ++i) { - auto binary_data = out_arrays_[i]->data()->ShallowCopy(); + auto binary_data = out_arrays_[i]->data()->Copy(); binary_data->type = ::arrow::binary(); out_arrays_[i] = std::make_shared(binary_data); } @@ -1160,6 +1244,59 @@ inline Status NumPyConverter::ConvertTypedLists( return LoopPySequenceWithMasks(list, mask_values, have_mask, foreach_item); } +template <> +inline Status NumPyConverter::ConvertTypedLists( + const std::shared_ptr& type, ListBuilder* builder, PyObject* list) { + PyAcquireGIL lock; + // TODO: If there are bytes involed, convert to Binary representation + bool have_bytes = false; + + Ndarray1DIndexer mask_values; + + bool have_mask = false; + if (mask_ != nullptr) { + mask_values.Init(mask_); + have_mask = true; + } + + auto value_builder = static_cast(builder->value_builder()); + + auto foreach_item = [&](PyObject* object, bool mask) { + if (mask || PandasObjectIsNull(object)) { + return builder->AppendNull(); + } else if (PyArray_Check(object)) { + auto numpy_array = reinterpret_cast(object); + RETURN_NOT_OK(builder->Append(true)); + + // TODO(uwe): Support more complex 
numpy array structures + RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT)); + + int64_t offset = 0; + RETURN_NOT_OK(AppendObjectBinaries(numpy_array, nullptr, 0, value_builder, &offset, + &have_bytes)); + if (offset < PyArray_SIZE(numpy_array)) { + return Status::Invalid("Array cell value exceeded 2GB"); + } + return Status::OK(); + } else if (PyList_Check(object)) { + int64_t size; + std::shared_ptr inferred_type; + RETURN_NOT_OK(builder->Append(true)); + RETURN_NOT_OK(InferArrowTypeAndSize(object, &size, &inferred_type)); + if (inferred_type->id() != Type::NA && inferred_type->id() != Type::BINARY) { + std::stringstream ss; + ss << inferred_type->ToString() << " cannot be converted to BINARY."; + return Status::TypeError(ss.str()); + } + return AppendPySequence(object, size, inferred_type, value_builder); + } else { + return Status::TypeError("Unsupported Python type for list items"); + } + }; + + return LoopPySequenceWithMasks(list, mask_values, have_mask, foreach_item); +} + template <> inline Status NumPyConverter::ConvertTypedLists( const std::shared_ptr& type, ListBuilder* builder, PyObject* list) { @@ -1234,6 +1371,7 @@ Status NumPyConverter::ConvertLists(const std::shared_ptr& type, LIST_CASE(HALF_FLOAT, NPY_FLOAT16, HalfFloatType) LIST_CASE(FLOAT, NPY_FLOAT, FloatType) LIST_CASE(DOUBLE, NPY_DOUBLE, DoubleType) + LIST_CASE(BINARY, NPY_OBJECT, BinaryType) LIST_CASE(STRING, NPY_OBJECT, StringType) case Type::LIST: { const auto& list_type = static_cast(*type); diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc index 86391a18598fd..d9919ee499188 100644 --- a/cpp/src/arrow/python/python-test.cc +++ b/cpp/src/arrow/python/python-test.cc @@ -23,6 +23,7 @@ #include "arrow/array.h" #include "arrow/builder.h" +#include "arrow/table.h" #include "arrow/test-util.h" #include "arrow/python/arrow_to_pandas.h" @@ -34,34 +35,73 @@ namespace py { TEST(PyBuffer, InvalidInputObject) { PyBuffer buffer(Py_None); } -TEST(DecimalTest, 
TestPythonDecimalToString) { - PyAcquireGIL lock; +class DecimalTest : public ::testing::Test { + public: + DecimalTest() : lock_(), decimal_module_(), decimal_constructor_() { + auto s = internal::ImportModule("decimal", &decimal_module_); + DCHECK(s.ok()) << s.message(); + DCHECK_NE(decimal_module_.obj(), NULLPTR); - OwnedRef decimal; - OwnedRef Decimal; - ASSERT_OK(internal::ImportModule("decimal", &decimal)); - ASSERT_NE(decimal.obj(), nullptr); + s = internal::ImportFromModule(decimal_module_, "Decimal", &decimal_constructor_); + DCHECK(s.ok()) << s.message(); - ASSERT_OK(internal::ImportFromModule(decimal, "Decimal", &Decimal)); - ASSERT_NE(Decimal.obj(), nullptr); + DCHECK_NE(decimal_constructor_.obj(), NULLPTR); + } - std::string decimal_string("-39402950693754869342983"); - const char* format = "s#"; - auto c_string = decimal_string.c_str(); - ASSERT_NE(c_string, nullptr); + OwnedRef CreatePythonDecimal(const std::string& string_value) { + OwnedRef ref(internal::DecimalFromString(decimal_constructor_.obj(), string_value)); + return ref; + } - auto c_string_size = decimal_string.size(); - ASSERT_GT(c_string_size, 0); - OwnedRef pydecimal(PyObject_CallFunction(Decimal.obj(), const_cast(format), - c_string, c_string_size)); - ASSERT_NE(pydecimal.obj(), nullptr); - ASSERT_EQ(PyErr_Occurred(), nullptr); + private: + PyAcquireGIL lock_; + OwnedRef decimal_module_; + OwnedRef decimal_constructor_; +}; - PyObject* python_object = pydecimal.obj(); - ASSERT_NE(python_object, nullptr); +TEST_F(DecimalTest, TestPythonDecimalToString) { + std::string decimal_string("-39402950693754869342983"); + + OwnedRef python_object = this->CreatePythonDecimal(decimal_string); + ASSERT_NE(python_object.obj(), nullptr); std::string string_result; - ASSERT_OK(internal::PythonDecimalToString(python_object, &string_result)); + ASSERT_OK(internal::PythonDecimalToString(python_object.obj(), &string_result)); +} + +TEST_F(DecimalTest, TestInferPrecisionAndScale) { + std::string 
decimal_string("-394029506937548693.42983"); + OwnedRef python_decimal(this->CreatePythonDecimal(decimal_string)); + + int32_t precision; + int32_t scale; + + ASSERT_OK( + internal::InferDecimalPrecisionAndScale(python_decimal.obj(), &precision, &scale)); + + const auto expected_precision = + static_cast(decimal_string.size() - 2); // 1 for -, 1 for . + const int32_t expected_scale = 5; + + ASSERT_EQ(expected_precision, precision); + ASSERT_EQ(expected_scale, scale); +} + +TEST_F(DecimalTest, TestInferPrecisionAndNegativeScale) { + std::string decimal_string("-3.94042983E+10"); + OwnedRef python_decimal(this->CreatePythonDecimal(decimal_string)); + + int32_t precision; + int32_t scale; + + ASSERT_OK( + internal::InferDecimalPrecisionAndScale(python_decimal.obj(), &precision, &scale)); + + const auto expected_precision = 9; + const int32_t expected_scale = -2; + + ASSERT_EQ(expected_precision, precision); + ASSERT_EQ(expected_scale, scale); } TEST(PandasConversionTest, TestObjectBlockWriteFails) { @@ -81,8 +121,8 @@ TEST(PandasConversionTest, TestObjectBlockWriteFails) { std::vector> fields = {f1, f2, f3}; std::vector> cols = {arr, arr, arr}; - auto schema = std::make_shared(fields); - auto table = std::make_shared
(schema, cols); + auto schema = ::arrow::schema(fields); + auto table = Table::Make(schema, cols); PyObject* out; Py_BEGIN_ALLOW_THREADS; diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index b0c6287f088a1..253e9d9a7da30 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -31,14 +31,18 @@ #include "arrow/array.h" #include "arrow/builder.h" #include "arrow/io/interfaces.h" +#include "arrow/io/memory.h" #include "arrow/ipc/writer.h" +#include "arrow/memory_pool.h" +#include "arrow/record_batch.h" +#include "arrow/tensor.h" +#include "arrow/util/logging.h" + #include "arrow/python/common.h" #include "arrow/python/helpers.h" #include "arrow/python/numpy_convert.h" #include "arrow/python/platform.h" #include "arrow/python/util/datetime.h" -#include "arrow/tensor.h" -#include "arrow/util/logging.h" constexpr int32_t kMaxRecursionDepth = 100; @@ -694,7 +698,7 @@ Status SerializeDict(PyObject* context, std::vector dicts, std::shared_ptr MakeBatch(std::shared_ptr data) { auto field = std::make_shared("list", data->type()); auto schema = ::arrow::schema({field}); - return std::shared_ptr(new RecordBatch(schema, data->length(), {data})); + return RecordBatch::Make(schema, data->length(), {data}); } Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject* out) { @@ -708,27 +712,89 @@ Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject return Status::OK(); } -Status WriteSerializedObject(const SerializedPyObject& obj, io::OutputStream* dst) { - int32_t num_tensors = static_cast(obj.tensors.size()); - int32_t num_buffers = static_cast(obj.buffers.size()); - RETURN_NOT_OK(dst->Write(reinterpret_cast(&num_tensors), sizeof(int32_t))); - RETURN_NOT_OK(dst->Write(reinterpret_cast(&num_buffers), sizeof(int32_t))); - RETURN_NOT_OK(ipc::WriteRecordBatchStream({obj.batch}, dst)); +Status SerializedPyObject::WriteTo(io::OutputStream* dst) { 
+ int32_t num_tensors = static_cast(this->tensors.size()); + int32_t num_buffers = static_cast(this->buffers.size()); + RETURN_NOT_OK( + dst->Write(reinterpret_cast(&num_tensors), sizeof(int32_t))); + RETURN_NOT_OK( + dst->Write(reinterpret_cast(&num_buffers), sizeof(int32_t))); + RETURN_NOT_OK(ipc::WriteRecordBatchStream({this->batch}, dst)); int32_t metadata_length; int64_t body_length; - for (const auto& tensor : obj.tensors) { + for (const auto& tensor : this->tensors) { RETURN_NOT_OK(ipc::WriteTensor(*tensor, dst, &metadata_length, &body_length)); } - for (const auto& buffer : obj.buffers) { + for (const auto& buffer : this->buffers) { int64_t size = buffer->size(); - RETURN_NOT_OK(dst->Write(reinterpret_cast(&size), sizeof(int64_t))); + RETURN_NOT_OK(dst->Write(reinterpret_cast(&size), sizeof(int64_t))); RETURN_NOT_OK(dst->Write(buffer->data(), size)); } return Status::OK(); } +Status SerializedPyObject::GetComponents(MemoryPool* memory_pool, PyObject** out) { + PyAcquireGIL py_gil; + + ScopedRef result(PyDict_New()); + PyObject* buffers = PyList_New(0); + + // TODO(wesm): Not sure how pedantic we need to be about checking the return + // values of these functions. 
There are other places where we do not check + // PyDict_SetItem/SetItemString return value, but these failures would be + // quite esoteric + PyDict_SetItemString(result.get(), "num_tensors", + PyLong_FromSize_t(this->tensors.size())); + PyDict_SetItemString(result.get(), "num_buffers", + PyLong_FromSize_t(this->buffers.size())); + PyDict_SetItemString(result.get(), "data", buffers); + RETURN_IF_PYERROR(); + + Py_DECREF(buffers); + + auto PushBuffer = [&buffers](const std::shared_ptr& buffer) { + PyObject* wrapped_buffer = wrap_buffer(buffer); + RETURN_IF_PYERROR(); + if (PyList_Append(buffers, wrapped_buffer) < 0) { + Py_DECREF(wrapped_buffer); + RETURN_IF_PYERROR(); + } + Py_DECREF(wrapped_buffer); + return Status::OK(); + }; + + constexpr int64_t kInitialCapacity = 1024; + + // Write the record batch describing the object structure + std::shared_ptr stream; + std::shared_ptr buffer; + + py_gil.release(); + RETURN_NOT_OK(io::BufferOutputStream::Create(kInitialCapacity, memory_pool, &stream)); + RETURN_NOT_OK(ipc::WriteRecordBatchStream({this->batch}, stream.get())); + RETURN_NOT_OK(stream->Finish(&buffer)); + py_gil.acquire(); + + RETURN_NOT_OK(PushBuffer(buffer)); + + // For each tensor, get a metadata buffer and a buffer for the body + for (const auto& tensor : this->tensors) { + std::unique_ptr message; + RETURN_NOT_OK(ipc::GetTensorMessage(*tensor, memory_pool, &message)); + RETURN_NOT_OK(PushBuffer(message->metadata())); + RETURN_NOT_OK(PushBuffer(message->body())); + } + + for (const auto& buf : this->buffers) { + RETURN_NOT_OK(PushBuffer(buf)); + } + + *out = result.release(); + return Status::OK(); +} + } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/python_to_arrow.h b/cpp/src/arrow/python/python_to_arrow.h index c5b6396145b7f..ce7aefa0e2409 100644 --- a/cpp/src/arrow/python/python_to_arrow.h +++ b/cpp/src/arrow/python/python_to_arrow.h @@ -30,6 +30,7 @@ namespace arrow { +class MemoryPool; class RecordBatch; class Tensor; @@ 
-45,6 +46,26 @@ struct ARROW_EXPORT SerializedPyObject { std::shared_ptr batch; std::vector> tensors; std::vector> buffers; + + /// \brief Write serialized Python object to OutputStream + /// \param[in,out] dst an OutputStream + /// \return Status + Status WriteTo(io::OutputStream* dst); + + /// \brief Convert SerializedPyObject to a dict containing the message + /// components as Buffer instances with minimal memory allocation + /// + /// { + /// 'num_tensors': N, + /// 'num_buffers': K, + /// 'data': [Buffer] + /// } + /// + /// Each tensor is written as two buffers, one for the metadata and one for + /// the body. Therefore, the number of buffers in 'data' is 2 * N + K + 1, + /// with the first buffer containing the serialized record batch containing + /// the UnionArray that describes the whole object + Status GetComponents(MemoryPool* pool, PyObject** out); }; /// \brief Serialize Python sequence as a RecordBatch plus @@ -62,13 +83,6 @@ struct ARROW_EXPORT SerializedPyObject { ARROW_EXPORT Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject* out); -/// \brief Write serialized Python object to OutputStream -/// \param[in] object a serialized Python object to write out -/// \param[out] dst an OutputStream -/// \return Status -ARROW_EXPORT -Status WriteSerializedObject(const SerializedPyObject& object, io::OutputStream* dst); - } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/util/datetime.h b/cpp/src/arrow/python/util/datetime.h index c110bc64a2a2f..e76c2e0db4aea 100644 --- a/cpp/src/arrow/python/util/datetime.h +++ b/cpp/src/arrow/python/util/datetime.h @@ -235,6 +235,11 @@ static inline Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, return Status::OK(); } +static inline int64_t PyDate_to_s(PyDateTime_Date* pydate) { + return get_days_from_date(PyDateTime_GET_YEAR(pydate), PyDateTime_GET_MONTH(pydate), + PyDateTime_GET_DAY(pydate)); +} + static inline int64_t PyDate_to_ms(PyDateTime_Date* 
pydate) { int64_t total_seconds = 0; total_seconds += PyDateTime_DATE_GET_SECOND(pydate); diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc new file mode 100644 index 0000000000000..60932bdf3e4bb --- /dev/null +++ b/cpp/src/arrow/record_batch.cc @@ -0,0 +1,206 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/record_batch.h" + +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/logging.h" + +namespace arrow { + +/// \class SimpleRecordBatch +/// \brief A basic, non-lazy in-memory record batch +class SimpleRecordBatch : public RecordBatch { + public: + SimpleRecordBatch(const std::shared_ptr& schema, int64_t num_rows, + const std::vector>& columns) + : RecordBatch(schema, num_rows) { + columns_.resize(columns.size()); + boxed_columns_.resize(schema->num_fields()); + for (size_t i = 0; i < columns.size(); ++i) { + columns_[i] = columns[i]->data(); + } + } + + SimpleRecordBatch(const std::shared_ptr& schema, int64_t num_rows, + std::vector>&& columns) + : RecordBatch(schema, num_rows) { + columns_.resize(columns.size()); + boxed_columns_.resize(schema->num_fields()); + for (size_t i = 0; i < columns.size(); ++i) { + columns_[i] = columns[i]->data(); + } + } + + SimpleRecordBatch(const std::shared_ptr& schema, int64_t num_rows, + std::vector>&& columns) + : RecordBatch(schema, num_rows) { + columns_ = std::move(columns); + boxed_columns_.resize(schema->num_fields()); + } + + SimpleRecordBatch(const std::shared_ptr& schema, int64_t num_rows, + const std::vector>& columns) + : RecordBatch(schema, num_rows) { + columns_ = columns; + boxed_columns_.resize(schema->num_fields()); + } + + std::shared_ptr column(int i) const override { + if (!boxed_columns_[i]) { + boxed_columns_[i] = MakeArray(columns_[i]); + } + DCHECK(boxed_columns_[i]); + return boxed_columns_[i]; + } + + std::shared_ptr column_data(int i) const override { return columns_[i]; } + + std::shared_ptr ReplaceSchemaMetadata( + const std::shared_ptr& metadata) const override { + auto new_schema = schema_->AddMetadata(metadata); + return RecordBatch::Make(new_schema, num_rows_, columns_); + } + + std::shared_ptr Slice(int64_t offset, int64_t length) const override { + std::vector> arrays; + 
arrays.reserve(num_columns()); + for (const auto& field : columns_) { + int64_t col_length = std::min(field->length - offset, length); + int64_t col_offset = field->offset + offset; + + auto new_data = std::make_shared(*field); + new_data->length = col_length; + new_data->offset = col_offset; + new_data->null_count = kUnknownNullCount; + arrays.emplace_back(new_data); + } + int64_t num_rows = std::min(num_rows_ - offset, length); + return std::make_shared(schema_, num_rows, std::move(arrays)); + } + + Status Validate() const override { + if (static_cast(columns_.size()) != schema_->num_fields()) { + return Status::Invalid("Number of columns did not match schema"); + } + return RecordBatch::Validate(); + } + + private: + std::vector> columns_; + + // Caching boxed array data + mutable std::vector> boxed_columns_; +}; + +RecordBatch::RecordBatch(const std::shared_ptr& schema, int64_t num_rows) + : schema_(schema), num_rows_(num_rows) {} + +std::shared_ptr RecordBatch::Make( + const std::shared_ptr& schema, int64_t num_rows, + const std::vector>& columns) { + return std::make_shared(schema, num_rows, columns); +} + +std::shared_ptr RecordBatch::Make( + const std::shared_ptr& schema, int64_t num_rows, + std::vector>&& columns) { + return std::make_shared(schema, num_rows, std::move(columns)); +} + +std::shared_ptr RecordBatch::Make( + const std::shared_ptr& schema, int64_t num_rows, + std::vector>&& columns) { + return std::make_shared(schema, num_rows, std::move(columns)); +} + +std::shared_ptr RecordBatch::Make( + const std::shared_ptr& schema, int64_t num_rows, + const std::vector>& columns) { + return std::make_shared(schema, num_rows, columns); +} + +const std::string& RecordBatch::column_name(int i) const { + return schema_->field(i)->name(); +} + +bool RecordBatch::Equals(const RecordBatch& other) const { + if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) { + return false; + } + + for (int i = 0; i < num_columns(); ++i) { + if 
(!column(i)->Equals(other.column(i))) { + return false; + } + } + + return true; +} + +bool RecordBatch::ApproxEquals(const RecordBatch& other) const { + if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) { + return false; + } + + for (int i = 0; i < num_columns(); ++i) { + if (!column(i)->ApproxEquals(other.column(i))) { + return false; + } + } + + return true; +} + +std::shared_ptr RecordBatch::Slice(int64_t offset) const { + return Slice(offset, this->num_rows() - offset); +} + +Status RecordBatch::Validate() const { + for (int i = 0; i < num_columns(); ++i) { + auto arr_shared = this->column_data(i); + const ArrayData& arr = *arr_shared; + if (arr.length != num_rows_) { + std::stringstream ss; + ss << "Number of rows in column " << i << " did not match batch: " << arr.length + << " vs " << num_rows_; + return Status::Invalid(ss.str()); + } + const auto& schema_type = *schema_->field(i)->type(); + if (!arr.type->Equals(schema_type)) { + std::stringstream ss; + ss << "Column " << i << " type not match schema: " << arr.type->ToString() << " vs " + << schema_type.ToString(); + return Status::Invalid(ss.str()); + } + } + return Status::OK(); +} + +// ---------------------------------------------------------------------- +// Base record batch reader + +RecordBatchReader::~RecordBatchReader() {} + +} // namespace arrow diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h new file mode 100644 index 0000000000000..b2c4c76b3f2d3 --- /dev/null +++ b/cpp/src/arrow/record_batch.h @@ -0,0 +1,154 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_RECORD_BATCH_H +#define ARROW_RECORD_BATCH_H + +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/type.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class KeyValueMetadata; +class Status; + +/// \class RecordBatch +/// \brief Collection of equal-length arrays matching a particular Schema +/// +/// A record batch is table-like data structure that is semantically a sequence +/// of fields, each a contiguous Arrow array +class ARROW_EXPORT RecordBatch { + public: + virtual ~RecordBatch() = default; + + /// \param[in] schema The record batch schema + /// \param[in] num_rows length of fields in the record batch. Each array + /// should have the same length as num_rows + /// \param[in] columns the record batch fields as vector of arrays + static std::shared_ptr Make( + const std::shared_ptr& schema, int64_t num_rows, + const std::vector>& columns); + + /// \brief Move-based constructor for a vector of Array instances + static std::shared_ptr Make(const std::shared_ptr& schema, + int64_t num_rows, + std::vector>&& columns); + + /// \brief Construct record batch from vector of internal data structures + /// \since 0.5.0 + /// + /// This class is only provided with an rvalue-reference for the input data, + /// and is intended for internal use, or advanced users. + /// + /// \param schema the record batch schema + /// \param num_rows the number of semantic rows in the record batch. 
This + /// should be equal to the length of each field + /// \param columns the data for the batch's columns + static std::shared_ptr Make( + const std::shared_ptr& schema, int64_t num_rows, + std::vector>&& columns); + + /// \brief Construct record batch by copying vector of array data + /// \since 0.5.0 + static std::shared_ptr Make( + const std::shared_ptr& schema, int64_t num_rows, + const std::vector>& columns); + + /// \brief Determine if two record batches are exactly equal + /// \return true if batches are equal + bool Equals(const RecordBatch& other) const; + + /// \brief Determine if two record batches are approximately equal + bool ApproxEquals(const RecordBatch& other) const; + + // \return the table's schema + /// \return true if batches are equal + std::shared_ptr schema() const { return schema_; } + + /// \brief Retrieve an array from the record batch + /// \param[in] i field index, does not boundscheck + /// \return an Array object + virtual std::shared_ptr column(int i) const = 0; + + /// \brief Retrieve an array's internaldata from the record batch + /// \param[in] i field index, does not boundscheck + /// \return an internal ArrayData object + virtual std::shared_ptr column_data(int i) const = 0; + + virtual std::shared_ptr ReplaceSchemaMetadata( + const std::shared_ptr& metadata) const = 0; + + /// \brief Name in i-th column + const std::string& column_name(int i) const; + + /// \return the number of columns in the table + int num_columns() const { return schema_->num_fields(); } + + /// \return the number of rows (the corresponding length of each column) + int64_t num_rows() const { return num_rows_; } + + /// \brief Slice each of the arrays in the record batch + /// \param[in] offset the starting offset to slice, through end of batch + /// \return new record batch + virtual std::shared_ptr Slice(int64_t offset) const; + + /// \brief Slice each of the arrays in the record batch + /// \param[in] offset the starting offset to slice + /// 
\param[in] length the number of elements to slice from offset + /// \return new record batch + virtual std::shared_ptr Slice(int64_t offset, int64_t length) const = 0; + + /// \brief Check for schema or length inconsistencies + /// \return Status + virtual Status Validate() const; + + protected: + RecordBatch(const std::shared_ptr& schema, int64_t num_rows); + + std::shared_ptr schema_; + int64_t num_rows_; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(RecordBatch); +}; + +/// \brief Abstract interface for reading stream of record batches +class ARROW_EXPORT RecordBatchReader { + public: + virtual ~RecordBatchReader(); + + /// \return the shared schema of the record batches in the stream + virtual std::shared_ptr schema() const = 0; + + /// Read the next record batch in the stream. Return null for batch when + /// reaching end of stream + /// + /// \param[out] batch the next loaded batch, null at end of stream + /// \return Status + virtual Status ReadNext(std::shared_ptr* batch) = 0; +}; + +} // namespace arrow + +#endif // ARROW_RECORD_BATCH_H diff --git a/cpp/src/arrow/symbols.map b/cpp/src/arrow/symbols.map index f216d865001d2..c5d23793ccbae 100644 --- a/cpp/src/arrow/symbols.map +++ b/cpp/src/arrow/symbols.map @@ -55,6 +55,8 @@ ERR_getErrorString; # jemalloc je_arrow_*; + # ORC destructors + _ZThn8_N3orc*; extern "C++" { # devtoolset or -static-libstdc++ - the Red Hat devtoolset statically @@ -65,6 +67,8 @@ # Statically linked C++ dependencies boost::*; + google::*; + orc::*; snappy::*; }; }; diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc index b490310c26ae6..3f1c6be3a87f6 100644 --- a/cpp/src/arrow/table-test.cc +++ b/cpp/src/arrow/table-test.cc @@ -22,6 +22,7 @@ #include "gtest/gtest.h" #include "arrow/array.h" +#include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/test-common.h" @@ -216,8 +217,8 @@ class TestTable : public TestBase { TEST_F(TestTable, EmptySchema) { auto empty_schema = 
::arrow::schema({}); - table_.reset(new Table(empty_schema, columns_)); - ASSERT_OK(table_->ValidateColumns()); + table_ = Table::Make(empty_schema, columns_); + ASSERT_OK(table_->Validate()); ASSERT_EQ(0, table_->num_rows()); ASSERT_EQ(0, table_->num_columns()); } @@ -226,20 +227,20 @@ TEST_F(TestTable, Ctors) { const int length = 100; MakeExample1(length); - table_.reset(new Table(schema_, columns_)); - ASSERT_OK(table_->ValidateColumns()); + table_ = Table::Make(schema_, columns_); + ASSERT_OK(table_->Validate()); ASSERT_EQ(length, table_->num_rows()); ASSERT_EQ(3, table_->num_columns()); - auto array_ctor = std::make_shared
(schema_, arrays_); + auto array_ctor = Table::Make(schema_, arrays_); ASSERT_TRUE(table_->Equals(*array_ctor)); - table_.reset(new Table(schema_, columns_, length)); - ASSERT_OK(table_->ValidateColumns()); + table_ = Table::Make(schema_, columns_, length); + ASSERT_OK(table_->Validate()); ASSERT_EQ(length, table_->num_rows()); - ASSERT_OK(MakeTable(schema_, arrays_, &table_)); - ASSERT_OK(table_->ValidateColumns()); + table_ = Table::Make(schema_, arrays_); + ASSERT_OK(table_->Validate()); ASSERT_EQ(length, table_->num_rows()); ASSERT_EQ(3, table_->num_columns()); } @@ -248,7 +249,7 @@ TEST_F(TestTable, Metadata) { const int length = 100; MakeExample1(length); - table_.reset(new Table(schema_, columns_)); + table_ = Table::Make(schema_, columns_); ASSERT_TRUE(table_->schema()->Equals(*schema_)); @@ -262,14 +263,14 @@ TEST_F(TestTable, InvalidColumns) { const int length = 100; MakeExample1(length); - table_.reset(new Table(schema_, columns_, length - 1)); - ASSERT_RAISES(Invalid, table_->ValidateColumns()); + table_ = Table::Make(schema_, columns_, length - 1); + ASSERT_RAISES(Invalid, table_->Validate()); columns_.clear(); // Wrong number of columns - table_.reset(new Table(schema_, columns_, length)); - ASSERT_RAISES(Invalid, table_->ValidateColumns()); + table_ = Table::Make(schema_, columns_, length); + ASSERT_RAISES(Invalid, table_->Validate()); columns_ = { std::make_shared(schema_->field(0), MakeRandomArray(length)), @@ -277,15 +278,15 @@ TEST_F(TestTable, InvalidColumns) { std::make_shared(schema_->field(2), MakeRandomArray(length - 1))}; - table_.reset(new Table(schema_, columns_, length)); - ASSERT_RAISES(Invalid, table_->ValidateColumns()); + table_ = Table::Make(schema_, columns_, length); + ASSERT_RAISES(Invalid, table_->Validate()); } TEST_F(TestTable, Equals) { const int length = 100; MakeExample1(length); - table_.reset(new Table(schema_, columns_)); + table_ = Table::Make(schema_, columns_); ASSERT_TRUE(table_->Equals(*table_)); // Differing schema 
@@ -294,7 +295,8 @@ TEST_F(TestTable, Equals) { auto f2 = field("f5", int16()); vector> fields = {f0, f1, f2}; auto other_schema = std::make_shared(fields); - ASSERT_FALSE(table_->Equals(Table(other_schema, columns_))); + auto other = Table::Make(other_schema, columns_); + ASSERT_FALSE(table_->Equals(*other)); // Differing columns std::vector> other_columns = { std::make_shared(schema_->field(0), @@ -303,19 +305,21 @@ TEST_F(TestTable, Equals) { MakeRandomArray(length, 10)), std::make_shared(schema_->field(2), MakeRandomArray(length, 10))}; - ASSERT_FALSE(table_->Equals(Table(schema_, other_columns))); + + other = Table::Make(schema_, other_columns); + ASSERT_FALSE(table_->Equals(*other)); } TEST_F(TestTable, FromRecordBatches) { const int64_t length = 10; MakeExample1(length); - auto batch1 = std::make_shared(schema_, length, arrays_); + auto batch1 = RecordBatch::Make(schema_, length, arrays_); std::shared_ptr
result, expected; ASSERT_OK(Table::FromRecordBatches({batch1}, &result)); - expected = std::make_shared
(schema_, columns_); + expected = Table::Make(schema_, columns_); ASSERT_TRUE(result->Equals(*expected)); std::vector> other_columns; @@ -325,18 +329,17 @@ TEST_F(TestTable, FromRecordBatches) { } ASSERT_OK(Table::FromRecordBatches({batch1, batch1}, &result)); - expected = std::make_shared
(schema_, other_columns); + expected = Table::Make(schema_, other_columns); ASSERT_TRUE(result->Equals(*expected)); // Error states std::vector> empty_batches; ASSERT_RAISES(Invalid, Table::FromRecordBatches(empty_batches, &result)); - std::vector> fields = {schema_->field(0), schema_->field(1)}; - auto other_schema = std::make_shared(fields); + auto other_schema = ::arrow::schema({schema_->field(0), schema_->field(1)}); std::vector> other_arrays = {arrays_[0], arrays_[1]}; - auto batch2 = std::make_shared(other_schema, length, other_arrays); + auto batch2 = RecordBatch::Make(other_schema, length, other_arrays); ASSERT_RAISES(Invalid, Table::FromRecordBatches({batch1, batch2}, &result)); } @@ -344,11 +347,11 @@ TEST_F(TestTable, ConcatenateTables) { const int64_t length = 10; MakeExample1(length); - auto batch1 = std::make_shared(schema_, length, arrays_); + auto batch1 = RecordBatch::Make(schema_, length, arrays_); // generate different data MakeExample1(length); - auto batch2 = std::make_shared(schema_, length, arrays_); + auto batch2 = RecordBatch::Make(schema_, length, arrays_); std::shared_ptr
t1, t2, t3, result, expected; ASSERT_OK(Table::FromRecordBatches({batch1}, &t1)); @@ -362,11 +365,10 @@ TEST_F(TestTable, ConcatenateTables) { std::vector> empty_tables; ASSERT_RAISES(Invalid, ConcatenateTables(empty_tables, &result)); - std::vector> fields = {schema_->field(0), schema_->field(1)}; - auto other_schema = std::make_shared(fields); + auto other_schema = ::arrow::schema({schema_->field(0), schema_->field(1)}); std::vector> other_arrays = {arrays_[0], arrays_[1]}; - auto batch3 = std::make_shared(other_schema, length, other_arrays); + auto batch3 = RecordBatch::Make(other_schema, length, other_arrays); ASSERT_OK(Table::FromRecordBatches({batch3}, &t3)); ASSERT_RAISES(Invalid, ConcatenateTables({t1, t3}, &result)); @@ -376,31 +378,58 @@ TEST_F(TestTable, RemoveColumn) { const int64_t length = 10; MakeExample1(length); - Table table(schema_, columns_); + auto table_sp = Table::Make(schema_, columns_); + const Table& table = *table_sp; std::shared_ptr
result; ASSERT_OK(table.RemoveColumn(0, &result)); auto ex_schema = ::arrow::schema({schema_->field(1), schema_->field(2)}); std::vector> ex_columns = {table.column(1), table.column(2)}; - ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); + + auto expected = Table::Make(ex_schema, ex_columns); + ASSERT_TRUE(result->Equals(*expected)); ASSERT_OK(table.RemoveColumn(1, &result)); ex_schema = ::arrow::schema({schema_->field(0), schema_->field(2)}); ex_columns = {table.column(0), table.column(2)}; - ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); + + expected = Table::Make(ex_schema, ex_columns); + ASSERT_TRUE(result->Equals(*expected)); ASSERT_OK(table.RemoveColumn(2, &result)); ex_schema = ::arrow::schema({schema_->field(0), schema_->field(1)}); ex_columns = {table.column(0), table.column(1)}; - ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); + expected = Table::Make(ex_schema, ex_columns); + ASSERT_TRUE(result->Equals(*expected)); +} + +TEST_F(TestTable, RemoveColumnEmpty) { + // ARROW-1865 + const int64_t length = 10; + + auto f0 = field("f0", int32()); + auto schema = ::arrow::schema({f0}); + auto a0 = MakeRandomArray(length); + + auto table = Table::Make(schema, {std::make_shared(f0, a0)}); + + std::shared_ptr
empty; + ASSERT_OK(table->RemoveColumn(0, &empty)); + + ASSERT_EQ(table->num_rows(), empty->num_rows()); + + std::shared_ptr
added; + ASSERT_OK(empty->AddColumn(0, table->column(0), &added)); + ASSERT_EQ(table->num_rows(), added->num_rows()); } TEST_F(TestTable, AddColumn) { const int64_t length = 10; MakeExample1(length); - Table table(schema_, columns_); + auto table_sp = Table::Make(schema_, columns_); + const Table& table = *table_sp; std::shared_ptr
result; // Some negative tests with invalid index @@ -419,50 +448,32 @@ TEST_F(TestTable, AddColumn) { ASSERT_OK(table.AddColumn(0, columns_[0], &result)); auto ex_schema = ::arrow::schema( {schema_->field(0), schema_->field(0), schema_->field(1), schema_->field(2)}); - std::vector> ex_columns = {table.column(0), table.column(0), - table.column(1), table.column(2)}; - ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); + + auto expected = Table::Make( + ex_schema, {table.column(0), table.column(0), table.column(1), table.column(2)}); + ASSERT_TRUE(result->Equals(*expected)); ASSERT_OK(table.AddColumn(1, columns_[0], &result)); ex_schema = ::arrow::schema( {schema_->field(0), schema_->field(0), schema_->field(1), schema_->field(2)}); - ex_columns = {table.column(0), table.column(0), table.column(1), table.column(2)}; - ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); + + expected = Table::Make( + ex_schema, {table.column(0), table.column(0), table.column(1), table.column(2)}); + ASSERT_TRUE(result->Equals(*expected)); ASSERT_OK(table.AddColumn(2, columns_[0], &result)); ex_schema = ::arrow::schema( {schema_->field(0), schema_->field(1), schema_->field(0), schema_->field(2)}); - ex_columns = {table.column(0), table.column(1), table.column(0), table.column(2)}; - ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); + expected = Table::Make( + ex_schema, {table.column(0), table.column(1), table.column(0), table.column(2)}); + ASSERT_TRUE(result->Equals(*expected)); ASSERT_OK(table.AddColumn(3, columns_[0], &result)); ex_schema = ::arrow::schema( {schema_->field(0), schema_->field(1), schema_->field(2), schema_->field(0)}); - ex_columns = {table.column(0), table.column(1), table.column(2), table.column(0)}; - ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns))); -} - -TEST_F(TestTable, IsChunked) { - ArrayVector c1, c2; - - auto a1 = MakeRandomArray(10); - auto a2 = MakeRandomArray(20); - - auto sch1 = arrow::schema({field("f1", int32()), 
field("f2", int32())}); - - std::vector> columns; - - std::shared_ptr batch; - - columns = {column(sch1->field(0), {a1}), column(sch1->field(1), {a1})}; - auto t1 = std::make_shared
(sch1, columns); - - ASSERT_FALSE(t1->IsChunked()); - - columns = {column(sch1->field(0), {a2}), column(sch1->field(1), {a1, a1})}; - auto t2 = std::make_shared
(sch1, columns); - - ASSERT_TRUE(t2->IsChunked()); + expected = Table::Make( + ex_schema, {table.column(0), table.column(1), table.column(2), table.column(0)}); + ASSERT_TRUE(result->Equals(*expected)); } class TestRecordBatch : public TestBase {}; @@ -475,24 +486,22 @@ TEST_F(TestRecordBatch, Equals) { auto f2 = field("f2", int16()); vector> fields = {f0, f1, f2}; - auto schema = std::make_shared(fields); + auto schema = ::arrow::schema({f0, f1, f2}); + auto schema2 = ::arrow::schema({f0, f1}); auto a0 = MakeRandomArray(length); auto a1 = MakeRandomArray(length); auto a2 = MakeRandomArray(length); - RecordBatch b1(schema, length, {a0, a1, a2}); - RecordBatch b3(schema, length, {a0, a1}); - RecordBatch b4(schema, length, {a0, a1, a1}); + auto b1 = RecordBatch::Make(schema, length, {a0, a1, a2}); + auto b3 = RecordBatch::Make(schema2, length, {a0, a1}); + auto b4 = RecordBatch::Make(schema, length, {a0, a1, a1}); - ASSERT_TRUE(b1.Equals(b1)); - ASSERT_FALSE(b1.Equals(b3)); - ASSERT_FALSE(b1.Equals(b4)); + ASSERT_TRUE(b1->Equals(*b1)); + ASSERT_FALSE(b1->Equals(*b3)); + ASSERT_FALSE(b1->Equals(*b4)); } -#ifdef NDEBUG -// In debug builds, RecordBatch ctor aborts if you construct an invalid one - TEST_F(TestRecordBatch, Validate) { const int length = 10; @@ -507,21 +516,19 @@ TEST_F(TestRecordBatch, Validate) { auto a2 = MakeRandomArray(length); auto a3 = MakeRandomArray(5); - RecordBatch b1(schema, length, {a0, a1, a2}); + auto b1 = RecordBatch::Make(schema, length, {a0, a1, a2}); - ASSERT_OK(b1.Validate()); + ASSERT_OK(b1->Validate()); // Length mismatch - RecordBatch b2(schema, length, {a0, a1, a3}); - ASSERT_RAISES(Invalid, b2.Validate()); + auto b2 = RecordBatch::Make(schema, length, {a0, a1, a3}); + ASSERT_RAISES(Invalid, b2->Validate()); // Type mismatch - RecordBatch b3(schema, length, {a0, a1, a0}); - ASSERT_RAISES(Invalid, b3.Validate()); + auto b3 = RecordBatch::Make(schema, length, {a0, a1, a0}); + ASSERT_RAISES(Invalid, b3->Validate()); } -#endif - 
TEST_F(TestRecordBatch, Slice) { const int length = 10; @@ -529,19 +536,19 @@ TEST_F(TestRecordBatch, Slice) { auto f1 = field("f1", uint8()); vector> fields = {f0, f1}; - auto schema = std::make_shared(fields); + auto schema = ::arrow::schema(fields); auto a0 = MakeRandomArray(length); auto a1 = MakeRandomArray(length); - RecordBatch batch(schema, length, {a0, a1}); + auto batch = RecordBatch::Make(schema, length, {a0, a1}); - auto batch_slice = batch.Slice(2); - auto batch_slice2 = batch.Slice(1, 5); + auto batch_slice = batch->Slice(2); + auto batch_slice2 = batch->Slice(1, 5); - ASSERT_EQ(batch_slice->num_rows(), batch.num_rows() - 2); + ASSERT_EQ(batch_slice->num_rows(), batch->num_rows() - 2); - for (int i = 0; i < batch.num_columns(); ++i) { + for (int i = 0; i < batch->num_columns(); ++i) { ASSERT_EQ(2, batch_slice->column(i)->offset()); ASSERT_EQ(length - 2, batch_slice->column(i)->length()); @@ -567,9 +574,9 @@ TEST_F(TestTableBatchReader, ReadNext) { std::shared_ptr batch; columns = {column(sch1->field(0), {a1, a4, a2}), column(sch1->field(1), {a2, a2})}; - Table t1(sch1, columns); + auto t1 = Table::Make(sch1, columns); - TableBatchReader i1(t1); + TableBatchReader i1(*t1); ASSERT_OK(i1.ReadNext(&batch)); ASSERT_EQ(10, batch->num_rows()); @@ -584,9 +591,9 @@ TEST_F(TestTableBatchReader, ReadNext) { ASSERT_EQ(nullptr, batch); columns = {column(sch1->field(0), {a1}), column(sch1->field(1), {a4})}; - Table t2(sch1, columns); + auto t2 = Table::Make(sch1, columns); - TableBatchReader i2(t2); + TableBatchReader i2(*t2); ASSERT_OK(i2.ReadNext(&batch)); ASSERT_EQ(10, batch->num_rows()); @@ -599,4 +606,37 @@ TEST_F(TestTableBatchReader, ReadNext) { ASSERT_EQ(nullptr, batch); } +TEST_F(TestTableBatchReader, Chunksize) { + auto a1 = MakeRandomArray(10); + auto a2 = MakeRandomArray(20); + auto a3 = MakeRandomArray(10); + + auto sch1 = arrow::schema({field("f1", int32())}); + auto t1 = Table::Make(sch1, {column(sch1->field(0), {a1, a2, a3})}); + + TableBatchReader 
i1(*t1); + + i1.set_chunksize(15); + + std::shared_ptr batch; + ASSERT_OK(i1.ReadNext(&batch)); + ASSERT_OK(batch->Validate()); + ASSERT_EQ(10, batch->num_rows()); + + ASSERT_OK(i1.ReadNext(&batch)); + ASSERT_OK(batch->Validate()); + ASSERT_EQ(15, batch->num_rows()); + + ASSERT_OK(i1.ReadNext(&batch)); + ASSERT_OK(batch->Validate()); + ASSERT_EQ(5, batch->num_rows()); + + ASSERT_OK(i1.ReadNext(&batch)); + ASSERT_OK(batch->Validate()); + ASSERT_EQ(10, batch->num_rows()); + + ASSERT_OK(i1.ReadNext(&batch)); + ASSERT_EQ(nullptr, batch); +} + } // namespace arrow diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index fe19bf4ce0b3f..2cf6c26523965 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -19,10 +19,12 @@ #include #include +#include #include #include #include "arrow/array.h" +#include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/logging.h" @@ -153,171 +155,127 @@ Status Column::ValidateData() { } // ---------------------------------------------------------------------- -// RecordBatch methods - -RecordBatch::RecordBatch(const std::shared_ptr& schema, int64_t num_rows) - : schema_(schema), num_rows_(num_rows) { - boxed_columns_.resize(schema->num_fields()); -} - -RecordBatch::RecordBatch(const std::shared_ptr& schema, int64_t num_rows, - const std::vector>& columns) - : RecordBatch(schema, num_rows) { - columns_.resize(columns.size()); - for (size_t i = 0; i < columns.size(); ++i) { - columns_[i] = columns[i]->data(); - } -} - -RecordBatch::RecordBatch(const std::shared_ptr& schema, int64_t num_rows, - std::vector>&& columns) - : RecordBatch(schema, num_rows) { - columns_.resize(columns.size()); - for (size_t i = 0; i < columns.size(); ++i) { - columns_[i] = columns[i]->data(); - } -} - -RecordBatch::RecordBatch(const std::shared_ptr& schema, int64_t num_rows, - std::vector>&& columns) - : RecordBatch(schema, num_rows) { - columns_ = std::move(columns); -} - 
-RecordBatch::RecordBatch(const std::shared_ptr& schema, int64_t num_rows, - const std::vector>& columns) - : RecordBatch(schema, num_rows) { - columns_ = columns; -} - -std::shared_ptr RecordBatch::column(int i) const { - if (!boxed_columns_[i]) { - boxed_columns_[i] = MakeArray(columns_[i]); - } - DCHECK(boxed_columns_[i]); - return boxed_columns_[i]; -} - -const std::string& RecordBatch::column_name(int i) const { - return schema_->field(i)->name(); -} - -bool RecordBatch::Equals(const RecordBatch& other) const { - if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) { - return false; - } +// Table methods - for (int i = 0; i < num_columns(); ++i) { - if (!column(i)->Equals(other.column(i))) { - return false; +/// \class SimpleTable +/// \brief A basic, non-lazy in-memory table, like SimpleRecordBatch +class SimpleTable : public Table { + public: + SimpleTable(const std::shared_ptr& schema, + const std::vector>& columns, int64_t num_rows = -1) + : columns_(columns) { + schema_ = schema; + if (num_rows < 0) { + if (columns.size() == 0) { + num_rows_ = 0; + } else { + num_rows_ = columns[0]->length(); + } + } else { + num_rows_ = num_rows; } } - return true; -} - -bool RecordBatch::ApproxEquals(const RecordBatch& other) const { - if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) { - return false; - } + SimpleTable(const std::shared_ptr& schema, + const std::vector>& columns, int64_t num_rows = -1) { + schema_ = schema; + if (num_rows < 0) { + if (columns.size() == 0) { + num_rows_ = 0; + } else { + num_rows_ = columns[0]->length(); + } + } else { + num_rows_ = num_rows; + } - for (int i = 0; i < num_columns(); ++i) { - if (!column(i)->ApproxEquals(other.column(i))) { - return false; + columns_.resize(columns.size()); + for (size_t i = 0; i < columns.size(); ++i) { + columns_[i] = + std::make_shared(schema->field(static_cast(i)), columns[i]); } } - return true; -} + std::shared_ptr column(int i) const override { return 
columns_[i]; } -std::shared_ptr RecordBatch::ReplaceSchemaMetadata( - const std::shared_ptr& metadata) const { - auto new_schema = schema_->AddMetadata(metadata); - return std::make_shared(new_schema, num_rows_, columns_); -} - -std::shared_ptr RecordBatch::Slice(int64_t offset) const { - return Slice(offset, this->num_rows() - offset); -} + Status RemoveColumn(int i, std::shared_ptr
* out) const override { + std::shared_ptr new_schema; + RETURN_NOT_OK(schema_->RemoveField(i, &new_schema)); -std::shared_ptr RecordBatch::Slice(int64_t offset, int64_t length) const { - std::vector> arrays; - arrays.reserve(num_columns()); - for (const auto& field : columns_) { - int64_t col_length = std::min(field->length - offset, length); - int64_t col_offset = field->offset + offset; - - auto new_data = std::make_shared(*field); - new_data->length = col_length; - new_data->offset = col_offset; - new_data->null_count = kUnknownNullCount; - arrays.emplace_back(new_data); + *out = Table::Make(new_schema, internal::DeleteVectorElement(columns_, i), + this->num_rows()); + return Status::OK(); } - int64_t num_rows = std::min(num_rows_ - offset, length); - return std::make_shared(schema_, num_rows, std::move(arrays)); -} -Status RecordBatch::Validate() const { - for (int i = 0; i < num_columns(); ++i) { - const ArrayData& arr = *columns_[i]; - if (arr.length != num_rows_) { + Status AddColumn(int i, const std::shared_ptr& col, + std::shared_ptr
* out) const override { + if (i < 0 || i > num_columns() + 1) { + return Status::Invalid("Invalid column index."); + } + if (col == nullptr) { std::stringstream ss; - ss << "Number of rows in column " << i << " did not match batch: " << arr.length - << " vs " << num_rows_; + ss << "Column " << i << " was null"; return Status::Invalid(ss.str()); } - const auto& schema_type = *schema_->field(i)->type(); - if (!arr.type->Equals(schema_type)) { + if (col->length() != num_rows_) { std::stringstream ss; - ss << "Column " << i << " type not match schema: " << arr.type->ToString() << " vs " - << schema_type.ToString(); + ss << "Added column's length must match table's length. Expected length " + << num_rows_ << " but got length " << col->length(); return Status::Invalid(ss.str()); } + + std::shared_ptr new_schema; + RETURN_NOT_OK(schema_->AddField(i, col->field(), &new_schema)); + + *out = Table::Make(new_schema, internal::AddVectorElement(columns_, i, col)); + return Status::OK(); } - return Status::OK(); -} -// ---------------------------------------------------------------------- -// Table methods + std::shared_ptr
ReplaceSchemaMetadata( + const std::shared_ptr& metadata) const override { + auto new_schema = schema_->AddMetadata(metadata); + return Table::Make(new_schema, columns_); + } -Table::Table(const std::shared_ptr& schema, - const std::vector>& columns, int64_t num_rows) - : schema_(schema), columns_(columns) { - if (num_rows < 0) { - if (columns.size() == 0) { - num_rows_ = 0; - } else { - num_rows_ = columns[0]->length(); + Status Validate() const override { + if (static_cast(columns_.size()) != schema_->num_fields()) { + return Status::Invalid("Number of columns did not match schema"); } - } else { - num_rows_ = num_rows; - } -} -Table::Table(const std::shared_ptr& schema, - const std::vector>& columns, int64_t num_rows) - : schema_(schema) { - if (num_rows < 0) { - if (columns.size() == 0) { - num_rows_ = 0; - } else { - num_rows_ = columns[0]->length(); + // Make sure columns are all the same length + for (int i = 0; i < num_columns(); ++i) { + const Column* col = columns_[i].get(); + if (col == nullptr) { + std::stringstream ss; + ss << "Column " << i << " was null"; + return Status::Invalid(ss.str()); + } + if (col->length() != num_rows_) { + std::stringstream ss; + ss << "Column " << i << " named " << col->name() << " expected length " + << num_rows_ << " but got length " << col->length(); + return Status::Invalid(ss.str()); + } } - } else { - num_rows_ = num_rows; + return Status::OK(); } - columns_.resize(columns.size()); - for (size_t i = 0; i < columns.size(); ++i) { - columns_[i] = - std::make_shared(schema->field(static_cast(i)), columns[i]); - } + private: + std::vector> columns_; +}; + +Table::Table() {} + +std::shared_ptr
Table::Make(const std::shared_ptr& schema, + const std::vector>& columns, + int64_t num_rows) { + return std::make_shared(schema, columns, num_rows); } -std::shared_ptr
Table::ReplaceSchemaMetadata( - const std::shared_ptr& metadata) const { - auto new_schema = schema_->AddMetadata(metadata); - return std::make_shared
(new_schema, columns_); +std::shared_ptr
Table::Make(const std::shared_ptr& schema, + const std::vector>& arrays, + int64_t num_rows) { + return std::make_shared(schema, arrays, num_rows); } Status Table::FromRecordBatches(const std::vector>& batches, @@ -351,7 +309,7 @@ Status Table::FromRecordBatches(const std::vector>& columns[i] = std::make_shared(schema->field(i), column_arrays); } - *table = std::make_shared
(schema, columns); + *table = Table::Make(schema, columns); return Status::OK(); } @@ -388,7 +346,7 @@ Status ConcatenateTables(const std::vector>& tables, } columns[i] = std::make_shared(schema->field(i), column_arrays); } - *table = std::make_shared
(schema, columns); + *table = Table::Make(schema, columns); return Status::OK(); } @@ -399,82 +357,19 @@ bool Table::Equals(const Table& other) const { if (!schema_->Equals(*other.schema())) { return false; } - if (static_cast(columns_.size()) != other.num_columns()) { + if (this->num_columns() != other.num_columns()) { return false; } - for (int i = 0; i < static_cast(columns_.size()); i++) { - if (!columns_[i]->Equals(other.column(i))) { + for (int i = 0; i < this->num_columns(); i++) { + if (!this->column(i)->Equals(other.column(i))) { return false; } } return true; } -Status Table::RemoveColumn(int i, std::shared_ptr
* out) const { - std::shared_ptr new_schema; - RETURN_NOT_OK(schema_->RemoveField(i, &new_schema)); - - *out = std::make_shared
(new_schema, internal::DeleteVectorElement(columns_, i)); - return Status::OK(); -} - -Status Table::AddColumn(int i, const std::shared_ptr& col, - std::shared_ptr
* out) const { - if (i < 0 || i > num_columns() + 1) { - return Status::Invalid("Invalid column index."); - } - if (col == nullptr) { - std::stringstream ss; - ss << "Column " << i << " was null"; - return Status::Invalid(ss.str()); - } - if (col->length() != num_rows_) { - std::stringstream ss; - ss << "Added column's length must match table's length. Expected length " << num_rows_ - << " but got length " << col->length(); - return Status::Invalid(ss.str()); - } - - std::shared_ptr new_schema; - RETURN_NOT_OK(schema_->AddField(i, col->field(), &new_schema)); - - *out = - std::make_shared
(new_schema, internal::AddVectorElement(columns_, i, col)); - return Status::OK(); -} - -Status Table::ValidateColumns() const { - if (num_columns() != schema_->num_fields()) { - return Status::Invalid("Number of columns did not match schema"); - } - - // Make sure columns are all the same length - for (size_t i = 0; i < columns_.size(); ++i) { - const Column* col = columns_[i].get(); - if (col == nullptr) { - std::stringstream ss; - ss << "Column " << i << " was null"; - return Status::Invalid(ss.str()); - } - if (col->length() != num_rows_) { - std::stringstream ss; - ss << "Column " << i << " named " << col->name() << " expected length " << num_rows_ - << " but got length " << col->length(); - return Status::Invalid(ss.str()); - } - } - return Status::OK(); -} - -bool Table::IsChunked() const { - for (size_t i = 0; i < columns_.size(); ++i) { - if (columns_[i]->data()->num_chunks() > 1) { - return true; - } - } - return false; -} +#ifndef ARROW_NO_DEPRECATED_API Status MakeTable(const std::shared_ptr& schema, const std::vector>& arrays, @@ -493,15 +388,12 @@ Status MakeTable(const std::shared_ptr& schema, columns.emplace_back(std::make_shared(schema->field(i), arrays[i])); } - *table = std::make_shared
(schema, columns); + *table = Table::Make(schema, columns); return Status::OK(); } -// ---------------------------------------------------------------------- -// Base record batch reader - -RecordBatchReader::~RecordBatchReader() {} +#endif // ARROW_NO_DEPRECATED_API // ---------------------------------------------------------------------- // Convert a table to a sequence of record batches @@ -513,7 +405,8 @@ class TableBatchReader::TableBatchReaderImpl { column_data_(table.num_columns()), chunk_numbers_(table.num_columns(), 0), chunk_offsets_(table.num_columns(), 0), - absolute_row_position_(0) { + absolute_row_position_(0), + max_chunksize_(std::numeric_limits::max()) { for (int i = 0; i < table.num_columns(); ++i) { column_data_[i] = table.column(i)->data().get(); } @@ -526,7 +419,7 @@ class TableBatchReader::TableBatchReaderImpl { } // Determine the minimum contiguous slice across all columns - int64_t chunksize = table_.num_rows(); + int64_t chunksize = std::min(table_.num_rows(), max_chunksize_); std::vector chunks(table_.num_columns()); for (int i = 0; i < table_.num_columns(); ++i) { auto chunk = column_data_[i]->chunk(chunk_numbers_[i]).get(); @@ -540,8 +433,7 @@ class TableBatchReader::TableBatchReaderImpl { } // Slice chunks and advance chunk index as appropriate - std::vector> batch_data; - batch_data.reserve(table_.num_columns()); + std::vector> batch_data(table_.num_columns()); for (int i = 0; i < table_.num_columns(); ++i) { // Exhausted chunk @@ -551,7 +443,7 @@ class TableBatchReader::TableBatchReaderImpl { if ((chunk->length() - offset) == chunksize) { ++chunk_numbers_[i]; chunk_offsets_[i] = 0; - if (chunk_offsets_[i] > 0) { + if (offset > 0) { // Need to slice slice_data = chunk->Slice(offset, chunksize)->data(); } else { @@ -559,26 +451,29 @@ class TableBatchReader::TableBatchReaderImpl { slice_data = chunk->data(); } } else { + chunk_offsets_[i] += chunksize; slice_data = chunk->Slice(offset, chunksize)->data(); } - 
batch_data.emplace_back(std::move(slice_data)); + batch_data[i] = std::move(slice_data); } absolute_row_position_ += chunksize; - *out = - std::make_shared(table_.schema(), chunksize, std::move(batch_data)); + *out = RecordBatch::Make(table_.schema(), chunksize, std::move(batch_data)); return Status::OK(); } std::shared_ptr schema() const { return table_.schema(); } + void set_chunksize(int64_t chunksize) { max_chunksize_ = chunksize; } + private: const Table& table_; std::vector column_data_; std::vector chunk_numbers_; std::vector chunk_offsets_; int64_t absolute_row_position_; + int64_t max_chunksize_; }; TableBatchReader::TableBatchReader(const Table& table) { @@ -589,6 +484,10 @@ TableBatchReader::~TableBatchReader() {} std::shared_ptr TableBatchReader::schema() const { return impl_->schema(); } +void TableBatchReader::set_chunksize(int64_t chunksize) { + impl_->set_chunksize(chunksize); +} + Status TableBatchReader::ReadNext(std::shared_ptr* out) { return impl_->ReadNext(out); } diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index d3145ff107ae0..c813b32ad36dc 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -24,6 +24,7 @@ #include #include "arrow/array.h" +#include "arrow/record_batch.h" #include "arrow/type.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" @@ -33,8 +34,6 @@ namespace arrow { class KeyValueMetadata; class Status; -using ArrayVector = std::vector>; - /// \class ChunkedArray /// \brief A data structure managing a list of primitive Arrow arrays logically /// as one large array @@ -63,6 +62,9 @@ class ARROW_EXPORT ChunkedArray { ArrayVector chunks_; int64_t length_; int64_t null_count_; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray); }; /// \brief An immutable column data structure consisting of a field (type @@ -110,123 +112,28 @@ class ARROW_EXPORT Column { ARROW_DISALLOW_COPY_AND_ASSIGN(Column); }; -/// \class RecordBatch -/// \brief Collection of equal-length arrays matching a 
particular Schema -/// -/// A record batch is table-like data structure consisting of an internal -/// sequence of fields, each a contiguous Arrow array -class ARROW_EXPORT RecordBatch { - public: - /// \param[in] schema The record batch schema - /// \param[in] num_rows length of fields in the record batch. Each array - /// should have the same length as num_rows - /// \param[in] columns the record batch fields as vector of arrays - RecordBatch(const std::shared_ptr& schema, int64_t num_rows, - const std::vector>& columns); - - /// \brief Move-based constructor for a vector of Array instances - RecordBatch(const std::shared_ptr& schema, int64_t num_rows, - std::vector>&& columns); - - /// \brief Construct record batch from vector of internal data structures - /// \since 0.5.0 - /// - /// This class is only provided with an rvalue-reference for the input data, - /// and is intended for internal use, or advanced users. - /// - /// \param schema the record batch schema - /// \param num_rows the number of semantic rows in the record batch. 
This - /// should be equal to the length of each field - /// \param columns the data for the batch's columns - RecordBatch(const std::shared_ptr& schema, int64_t num_rows, - std::vector>&& columns); - - /// \brief Construct record batch by copying vector of array data - /// \since 0.5.0 - RecordBatch(const std::shared_ptr& schema, int64_t num_rows, - const std::vector>& columns); - - /// \brief Determine if two record batches are exactly equal - /// \return true if batches are equal - bool Equals(const RecordBatch& other) const; - - /// \brief Determine if two record batches are approximately equal - bool ApproxEquals(const RecordBatch& other) const; - - // \return the table's schema - /// \return true if batches are equal - std::shared_ptr schema() const { return schema_; } - - /// \brief Retrieve an array from the record batch - /// \param[in] i field index, does not boundscheck - /// \return an Array object - std::shared_ptr column(int i) const; - - std::shared_ptr column_data(int i) const { return columns_[i]; } - - /// \brief Name in i-th column - const std::string& column_name(int i) const; - - /// \return the number of columns in the table - int num_columns() const { return static_cast(columns_.size()); } - - /// \return the number of rows (the corresponding length of each column) - int64_t num_rows() const { return num_rows_; } - - /// \brief Replace schema key-value metadata with new metadata (EXPERIMENTAL) - /// \since 0.5.0 - /// - /// \param[in] metadata new KeyValueMetadata - /// \return new RecordBatch - std::shared_ptr ReplaceSchemaMetadata( - const std::shared_ptr& metadata) const; - - /// \brief Slice each of the arrays in the record batch - /// \param[in] offset the starting offset to slice, through end of batch - /// \return new record batch - std::shared_ptr Slice(int64_t offset) const; - - /// \brief Slice each of the arrays in the record batch - /// \param[in] offset the starting offset to slice - /// \param[in] length the number of elements 
to slice from offset - /// \return new record batch - std::shared_ptr Slice(int64_t offset, int64_t length) const; - - /// \brief Check for schema or length inconsistencies - /// \return Status - Status Validate() const; - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(RecordBatch); - - RecordBatch(const std::shared_ptr& schema, int64_t num_rows); - - std::shared_ptr schema_; - int64_t num_rows_; - std::vector> columns_; - - // Caching boxed array data - mutable std::vector> boxed_columns_; -}; - /// \class Table /// \brief Logical table as sequence of chunked arrays class ARROW_EXPORT Table { public: + virtual ~Table() = default; + /// \brief Construct Table from schema and columns /// If columns is zero-length, the table's number of rows is zero /// \param schema The table schema (column types) /// \param columns The table's columns /// \param num_rows number of rows in table, -1 (default) to infer from columns - Table(const std::shared_ptr& schema, - const std::vector>& columns, int64_t num_rows = -1); + static std::shared_ptr
Make(const std::shared_ptr& schema, + const std::vector>& columns, + int64_t num_rows = -1); /// \brief Construct Table from schema and arrays /// \param schema The table schema (column types) /// \param arrays The table's columns as arrays /// \param num_rows number of rows in table, -1 (default) to infer from columns - Table(const std::shared_ptr& schema, - const std::vector>& arrays, int64_t num_rows = -1); + static std::shared_ptr
Make(const std::shared_ptr& schema, + const std::vector>& arrays, + int64_t num_rows = -1); // Construct table from RecordBatch, but only if all of the batch schemas are // equal. Returns Status::Invalid if there is some problem @@ -239,25 +146,28 @@ class ARROW_EXPORT Table { /// \param[in] i column index, does not boundscheck /// \return the i-th column - std::shared_ptr column(int i) const { return columns_[i]; } + virtual std::shared_ptr column(int i) const = 0; /// \brief Remove column from the table, producing a new Table - Status RemoveColumn(int i, std::shared_ptr
* out) const; + virtual Status RemoveColumn(int i, std::shared_ptr
* out) const = 0; /// \brief Add column to the table, producing a new Table - Status AddColumn(int i, const std::shared_ptr& column, - std::shared_ptr
* out) const; + virtual Status AddColumn(int i, const std::shared_ptr& column, + std::shared_ptr
* out) const = 0; /// \brief Replace schema key-value metadata with new metadata (EXPERIMENTAL) /// \since 0.5.0 /// /// \param[in] metadata new KeyValueMetadata /// \return new Table - std::shared_ptr
ReplaceSchemaMetadata( - const std::shared_ptr& metadata) const; + virtual std::shared_ptr
ReplaceSchemaMetadata( + const std::shared_ptr& metadata) const = 0; + + /// \brief Perform any checks to validate the input arguments + virtual Status Validate() const = 0; /// \return the number of columns in the table - int num_columns() const { return static_cast(columns_.size()); } + int num_columns() const { return schema_->num_fields(); } /// \return the number of rows (the corresponding length of each column) int64_t num_rows() const { return num_rows_; } @@ -265,35 +175,14 @@ class ARROW_EXPORT Table { /// \brief Determine if semantic contents of tables are exactly equal bool Equals(const Table& other) const; - /// \brief Perform any checks to validate the input arguments - Status ValidateColumns() const; - - /// \brief Return true if any column has multiple chunks - bool IsChunked() const; - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(Table); + protected: + Table(); std::shared_ptr schema_; - std::vector> columns_; - int64_t num_rows_; -}; - -/// \brief Abstract interface for reading stream of record batches -class ARROW_EXPORT RecordBatchReader { - public: - virtual ~RecordBatchReader(); - /// \return the shared schema of the record batches in the stream - virtual std::shared_ptr schema() const = 0; - - /// Read the next record batch in the stream. Return null for batch when - /// reaching end of stream - /// - /// \param[out] batch the next loaded batch, null at end of stream - /// \return Status - virtual Status ReadNext(std::shared_ptr* batch) = 0; + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(Table); }; /// \brief Compute a sequence of record batches from a (possibly chunked) Table @@ -308,6 +197,8 @@ class ARROW_EXPORT TableBatchReader : public RecordBatchReader { Status ReadNext(std::shared_ptr* out) override; + void set_chunksize(int64_t chunksize); + private: class TableBatchReaderImpl; std::unique_ptr impl_; @@ -319,13 +210,18 @@ ARROW_EXPORT Status ConcatenateTables(const std::vector>& tables, std::shared_ptr
* table); +#ifndef ARROW_NO_DEPRECATED_API + /// \brief Construct table from multiple input tables. /// \return Status, fails if any schemas are different +/// \note Deprecated since 0.8.0 ARROW_EXPORT Status MakeTable(const std::shared_ptr& schema, const std::vector>& arrays, std::shared_ptr
* table); +#endif + } // namespace arrow #endif // ARROW_TABLE_H diff --git a/cpp/src/arrow/table_builder-test.cc b/cpp/src/arrow/table_builder-test.cc index 07d9b6b2d6568..8167577e9064e 100644 --- a/cpp/src/arrow/table_builder-test.cc +++ b/cpp/src/arrow/table_builder-test.cc @@ -22,6 +22,7 @@ #include "gtest/gtest.h" #include "arrow/array.h" +#include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/table_builder.h" @@ -98,7 +99,7 @@ TEST_F(TestRecordBatchBuilder, Basics) { ASSERT_OK(ex_b1.Finish(&a1)); ASSERT_OK(ex_b2.Finish(&a2)); - RecordBatch expected(schema, 4, {a0, a1, a2}); + auto expected = RecordBatch::Make(schema, 4, {a0, a1, a2}); // Builder attributes ASSERT_EQ(3, builder->num_fields()); @@ -119,7 +120,7 @@ TEST_F(TestRecordBatchBuilder, Basics) { ASSERT_OK(builder->Flush(&batch)); } - ASSERT_BATCHES_EQUAL(expected, *batch); + ASSERT_BATCHES_EQUAL(*expected, *batch); } // Test setting initial capacity diff --git a/cpp/src/arrow/table_builder.cc b/cpp/src/arrow/table_builder.cc index a1bd95940a6db..379d886deacba 100644 --- a/cpp/src/arrow/table_builder.cc +++ b/cpp/src/arrow/table_builder.cc @@ -24,6 +24,7 @@ #include "arrow/array.h" #include "arrow/builder.h" +#include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/type.h" @@ -64,7 +65,7 @@ Status RecordBatchBuilder::Flush(bool reset_builders, } length = fields[i]->length(); } - *batch = std::make_shared(schema_, length, std::move(fields)); + *batch = RecordBatch::Make(schema_, length, std::move(fields)); if (reset_builders) { return InitBuilders(); } else { diff --git a/cpp/src/arrow/test-common.h b/cpp/src/arrow/test-common.h index a4c4fddff7348..911adf7b6057a 100644 --- a/cpp/src/arrow/test-common.h +++ b/cpp/src/arrow/test-common.h @@ -30,7 +30,6 @@ #include "arrow/buffer.h" #include "arrow/builder.h" #include "arrow/memory_pool.h" -#include "arrow/table.h" #include "arrow/test-util.h" namespace arrow { diff --git 
a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 044fb9476ca73..1a34808488a83 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -18,6 +18,7 @@ #ifndef ARROW_TEST_UTIL_H_ #define ARROW_TEST_UTIL_H_ +#include #include #include #include @@ -34,12 +35,11 @@ #include "arrow/memory_pool.h" #include "arrow/pretty_print.h" #include "arrow/status.h" -#include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" +#include "arrow/util/decimal.h" #include "arrow/util/logging.h" -#include "arrow/util/random.h" #define ASSERT_RAISES(ENUM, expr) \ do { \ @@ -47,7 +47,7 @@ if (!s.Is##ENUM()) { \ FAIL() << s.ToString(); \ } \ - } while (0) + } while (false) #define ASSERT_OK(expr) \ do { \ @@ -55,7 +55,7 @@ if (!s.ok()) { \ FAIL() << s.ToString(); \ } \ - } while (0) + } while (false) #define ASSERT_OK_NO_THROW(expr) ASSERT_NO_THROW(ASSERT_OK(expr)) @@ -63,15 +63,15 @@ do { \ ::arrow::Status s = (expr); \ EXPECT_TRUE(s.ok()); \ - } while (0) + } while (false) #define ABORT_NOT_OK(s) \ do { \ ::arrow::Status _s = (s); \ if (ARROW_PREDICT_FALSE(!_s.ok())) { \ - exit(-1); \ + exit(EXIT_FAILURE); \ } \ - } while (0); + } while (false); namespace arrow { @@ -79,27 +79,22 @@ using ArrayVector = std::vector>; namespace test { -template -void randint(int64_t N, T lower, T upper, std::vector* out) { - Random rng(random_seed()); - uint64_t draw; - uint64_t span = upper - lower; - T val; - for (int64_t i = 0; i < N; ++i) { - draw = rng.Uniform64(span); - val = static_cast(draw + lower); - out->push_back(val); - } +template +void randint(int64_t N, T lower, T upper, std::vector* out) { + const int random_seed = 0; + std::mt19937 gen(random_seed); + std::uniform_int_distribution d(lower, upper); + out->resize(N, static_cast(0)); + std::generate(out->begin(), out->end(), [&d, &gen] { return static_cast(d(gen)); }); } -template +template void random_real(int64_t n, uint32_t seed, T min_value, T max_value, - 
std::vector* out) { + std::vector* out) { std::mt19937 gen(seed); std::uniform_real_distribution d(min_value, max_value); - for (int64_t i = 0; i < n; ++i) { - out->push_back(d(gen)); - } + out->resize(n, static_cast(0)); + std::generate(out->begin(), out->end(), [&d, &gen] { return static_cast(d(gen)); }); } template @@ -115,7 +110,8 @@ inline Status CopyBufferFromVector(const std::vector& values, MemoryPool* poo auto buffer = std::make_shared(pool); RETURN_NOT_OK(buffer->Resize(nbytes)); - memcpy(buffer->mutable_data(), values.data(), nbytes); + auto immutable_data = reinterpret_cast(values.data()); + std::copy(immutable_data, immutable_data + nbytes, buffer->mutable_data()); *result = buffer; return Status::OK(); @@ -143,56 +139,131 @@ static inline Status GetBitmapFromVector(const std::vector& is_valid, // Sets approximately pct_null of the first n bytes in null_bytes to zero // and the rest to non-zero (true) values. static inline void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes) { - Random rng(random_seed()); - for (int64_t i = 0; i < n; ++i) { - null_bytes[i] = rng.NextDoubleFraction() > pct_null; - } + const int random_seed = 0; + std::mt19937 gen(random_seed); + std::uniform_real_distribution d(0.0, 1.0); + std::generate(null_bytes, null_bytes + n, + [&d, &gen, &pct_null] { return d(gen) > pct_null; }); } static inline void random_is_valid(int64_t n, double pct_null, std::vector* is_valid) { - Random rng(random_seed()); - for (int64_t i = 0; i < n; ++i) { - is_valid->push_back(rng.NextDoubleFraction() > pct_null); - } + const int random_seed = 0; + std::mt19937 gen(random_seed); + std::uniform_real_distribution d(0.0, 1.0); + is_valid->resize(n, false); + std::generate(is_valid->begin(), is_valid->end(), + [&d, &gen, &pct_null] { return d(gen) > pct_null; }); } static inline void random_bytes(int64_t n, uint32_t seed, uint8_t* out) { std::mt19937 gen(seed); - std::uniform_int_distribution d(0, 255); + std::uniform_int_distribution 
d(0, std::numeric_limits::max()); + std::generate(out, out + n, [&d, &gen] { return static_cast(d(gen)); }); +} - for (int64_t i = 0; i < n; ++i) { - out[i] = static_cast(d(gen) & 0xFF); +static int32_t DecimalSize(int32_t precision) { + DCHECK_GE(precision, 1) << "decimal precision must be greater than or equal to 1, got " + << precision; + DCHECK_LE(precision, 38) << "decimal precision must be less than or equal to 38, got " + << precision; + + switch (precision) { + case 1: + case 2: + return 1; // 127 + case 3: + case 4: + return 2; // 32,767 + case 5: + case 6: + return 3; // 8,388,607 + case 7: + case 8: + case 9: + return 4; // 2,147,483,427 + case 10: + case 11: + return 5; // 549,755,813,887 + case 12: + case 13: + case 14: + return 6; // 140,737,488,355,327 + case 15: + case 16: + return 7; // 36,028,797,018,963,967 + case 17: + case 18: + return 8; // 9,223,372,036,854,775,807 + case 19: + case 20: + case 21: + return 9; // 2,361,183,241,434,822,606,847 + case 22: + case 23: + return 10; // 604,462,909,807,314,587,353,087 + case 24: + case 25: + case 26: + return 11; // 154,742,504,910,672,534,362,390,527 + case 27: + case 28: + return 12; // 39,614,081,257,132,168,796,771,975,167 + case 29: + case 30: + case 31: + return 13; // 10,141,204,801,825,835,211,973,625,643,007 + case 32: + case 33: + return 14; // 2,596,148,429,267,413,814,265,248,164,610,047 + case 34: + case 35: + return 15; // 664,613,997,892,457,936,451,903,530,140,172,287 + case 36: + case 37: + case 38: + return 16; // 170,141,183,460,469,231,731,687,303,715,884,105,727 + default: + DCHECK(false); + break; } + return -1; } -static inline void random_ascii(int64_t n, uint32_t seed, uint8_t* out) { +static inline void random_decimals(int64_t n, uint32_t seed, int32_t precision, + uint8_t* out) { std::mt19937 gen(seed); - std::uniform_int_distribution d(65, 122); - - for (int64_t i = 0; i < n; ++i) { - out[i] = static_cast(d(gen) & 0xFF); + std::uniform_int_distribution d(0, 
std::numeric_limits::max()); + const int32_t required_bytes = DecimalSize(precision); + constexpr int32_t byte_width = 16; + std::fill(out, out + byte_width * n, '\0'); + + for (int64_t i = 0; i < n; ++i, out += byte_width) { + std::generate(out, out + required_bytes, + [&d, &gen] { return static_cast(d(gen)); }); + + // sign extend if the sign bit is set for the last byte generated + // 0b10000000 == 0x80 == 128 + if ((out[required_bytes - 1] & '\x80') != 0) { + std::fill(out + required_bytes, out + byte_width, '\xFF'); + } } } -template -void rand_uniform_int(int64_t n, uint32_t seed, T min_value, T max_value, T* out) { +template +void rand_uniform_int(int64_t n, uint32_t seed, T min_value, T max_value, U* out) { DCHECK(out || (n == 0)); std::mt19937 gen(seed); std::uniform_int_distribution d(min_value, max_value); - for (int64_t i = 0; i < n; ++i) { - out[i] = static_cast(d(gen)); - } + std::generate(out, out + n, [&d, &gen] { return static_cast(d(gen)); }); +} + +static inline void random_ascii(int64_t n, uint32_t seed, uint8_t* out) { + rand_uniform_int(n, seed, static_cast('A'), static_cast('z'), out); } static inline int64_t null_count(const std::vector& valid_bytes) { - int64_t result = 0; - for (size_t i = 0; i < valid_bytes.size(); ++i) { - if (valid_bytes[i] == 0) { - ++result; - } - } - return result; + return static_cast(std::count(valid_bytes.cbegin(), valid_bytes.cend(), '\0')); } Status MakeRandomInt32PoolBuffer(int64_t length, MemoryPool* pool, @@ -293,13 +364,17 @@ Status MakeArray(const std::vector& valid_bytes, const std::vector& } \ } while (false) +#define DECL_T() typedef typename TestFixture::T T; + +#define DECL_TYPE() typedef typename TestFixture::Type Type; + void AssertArraysEqual(const Array& expected, const Array& actual) { ASSERT_ARRAYS_EQUAL(expected, actual); } #define ASSERT_BATCHES_EQUAL(LEFT, RIGHT) \ do { \ - if (!LEFT.ApproxEquals(RIGHT)) { \ + if (!(LEFT).ApproxEquals(RIGHT)) { \ std::stringstream ss; \ ss << "Left:\n"; \ 
ASSERT_OK(PrettyPrint(LEFT, 0, &ss)); \ diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc index 3242fadd50dd6..48982cad42435 100644 --- a/cpp/src/arrow/type-test.cc +++ b/cpp/src/arrow/type-test.cc @@ -400,7 +400,7 @@ TEST(TestStructType, Basics) { } TEST(TypesTest, TestDecimal128Small) { - DecimalType t1(8, 4); + Decimal128Type t1(8, 4); ASSERT_EQ(t1.id(), Type::DECIMAL); ASSERT_EQ(t1.precision(), 8); @@ -414,7 +414,7 @@ TEST(TypesTest, TestDecimal128Small) { } TEST(TypesTest, TestDecimal128Medium) { - DecimalType t1(12, 5); + Decimal128Type t1(12, 5); ASSERT_EQ(t1.id(), Type::DECIMAL); ASSERT_EQ(t1.precision(), 12); @@ -428,7 +428,7 @@ TEST(TypesTest, TestDecimal128Medium) { } TEST(TypesTest, TestDecimal128Large) { - DecimalType t1(27, 7); + Decimal128Type t1(27, 7); ASSERT_EQ(t1.id(), Type::DECIMAL); ASSERT_EQ(t1.precision(), 27); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index a9bf591918558..31ad53458112c 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -190,7 +190,7 @@ std::string TimestampType::ToString() const { // Union type UnionType::UnionType(const std::vector>& fields, - const std::vector& type_codes, UnionMode mode) + const std::vector& type_codes, UnionMode::type mode) : NestedType(Type::UNION), mode_(mode), type_codes_(type_codes) { children_ = fields; } @@ -373,7 +373,7 @@ ACCEPT_VISITOR(FixedSizeBinaryType); ACCEPT_VISITOR(StringType); ACCEPT_VISITOR(ListType); ACCEPT_VISITOR(StructType); -ACCEPT_VISITOR(DecimalType); +ACCEPT_VISITOR(Decimal128Type); ACCEPT_VISITOR(UnionType); ACCEPT_VISITOR(Date32Type); ACCEPT_VISITOR(Date64Type); @@ -440,10 +440,24 @@ std::shared_ptr struct_(const std::vector>& fie } std::shared_ptr union_(const std::vector>& child_fields, - const std::vector& type_codes, UnionMode mode) { + const std::vector& type_codes, + UnionMode::type mode) { return std::make_shared(child_fields, type_codes, mode); } +std::shared_ptr union_(const std::vector>& children, + UnionMode::type 
mode) { + std::vector> types; + std::vector type_codes; + uint8_t counter = 0; + for (const auto& child : children) { + types.push_back(field(std::to_string(counter), child->type())); + type_codes.push_back(counter); + counter++; + } + return union_(types, type_codes, mode); +} + std::shared_ptr dictionary(const std::shared_ptr& index_type, const std::shared_ptr& dict_values, bool ordered) { @@ -457,47 +471,10 @@ std::shared_ptr field(const std::string& name, } std::shared_ptr decimal(int32_t precision, int32_t scale) { - return std::make_shared(precision, scale); -} - -static const BufferDescr kValidityBuffer(BufferType::VALIDITY, 1); -static const BufferDescr kOffsetBuffer(BufferType::OFFSET, 32); -static const BufferDescr kTypeBuffer(BufferType::TYPE, 32); -static const BufferDescr kBooleanBuffer(BufferType::DATA, 1); -static const BufferDescr kValues64(BufferType::DATA, 64); -static const BufferDescr kValues32(BufferType::DATA, 32); -static const BufferDescr kValues16(BufferType::DATA, 16); -static const BufferDescr kValues8(BufferType::DATA, 8); - -std::vector FixedWidthType::GetBufferLayout() const { - return {kValidityBuffer, BufferDescr(BufferType::DATA, bit_width())}; -} - -std::vector NullType::GetBufferLayout() const { return {}; } - -std::vector BinaryType::GetBufferLayout() const { - return {kValidityBuffer, kOffsetBuffer, kValues8}; -} - -std::vector FixedSizeBinaryType::GetBufferLayout() const { - return {kValidityBuffer, BufferDescr(BufferType::DATA, bit_width())}; -} - -std::vector ListType::GetBufferLayout() const { - return {kValidityBuffer, kOffsetBuffer}; -} - -std::vector StructType::GetBufferLayout() const { return {kValidityBuffer}; } - -std::vector UnionType::GetBufferLayout() const { - if (mode_ == UnionMode::SPARSE) { - return {kValidityBuffer, kTypeBuffer}; - } else { - return {kValidityBuffer, kTypeBuffer, kOffsetBuffer}; - } + return std::make_shared(precision, scale); } -std::string DecimalType::ToString() const { +std::string 
Decimal128Type::ToString() const { std::stringstream s; s << "decimal(" << precision_ << ", " << scale_ << ")"; return s.str(); diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 446f4d3a0b33f..009e07db07744 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -133,20 +133,6 @@ struct Type { }; }; -enum class BufferType : char { DATA, OFFSET, TYPE, VALIDITY }; - -class BufferDescr { - public: - BufferDescr(BufferType type, int bit_width) : type_(type), bit_width_(bit_width) {} - - BufferType type() const { return type_; } - int bit_width() const { return bit_width_; } - - private: - BufferType type_; - int bit_width_; -}; - class ARROW_EXPORT DataType { public: explicit DataType(Type::type id) : id_(id) {} @@ -176,8 +162,6 @@ class ARROW_EXPORT DataType { /// \since 0.7.0 virtual std::string name() const = 0; - virtual std::vector GetBufferLayout() const = 0; - Type::type id() const { return id_; } protected: @@ -201,8 +185,6 @@ class ARROW_EXPORT FixedWidthType : public DataType { using DataType::DataType; virtual int bit_width() const = 0; - - std::vector GetBufferLayout() const override; }; class ARROW_EXPORT PrimitiveCType : public FixedWidthType { @@ -319,8 +301,6 @@ class ARROW_EXPORT NullType : public DataType, public NoExtraMeta { std::string ToString() const override; std::string name() const override { return "null"; } - - std::vector GetBufferLayout() const override; }; class ARROW_EXPORT BooleanType : public FixedWidthType, public NoExtraMeta { @@ -425,8 +405,6 @@ class ARROW_EXPORT ListType : public NestedType { std::string ToString() const override; std::string name() const override { return "list"; } - - std::vector GetBufferLayout() const override; }; // BinaryType type is represents lists of 1-byte values. 
@@ -440,8 +418,6 @@ class ARROW_EXPORT BinaryType : public DataType, public NoExtraMeta { std::string ToString() const override; std::string name() const override { return "binary"; } - std::vector GetBufferLayout() const override; - protected: // Allow subclasses to change the logical type. explicit BinaryType(Type::type logical_type) : DataType(logical_type) {} @@ -461,8 +437,6 @@ class ARROW_EXPORT FixedSizeBinaryType : public FixedWidthType, public Parametri std::string ToString() const override; std::string name() const override { return "fixed_size_binary"; } - std::vector GetBufferLayout() const override; - int32_t byte_width() const { return byte_width_; } int bit_width() const override; @@ -494,50 +468,57 @@ class ARROW_EXPORT StructType : public NestedType { Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override; std::string name() const override { return "struct"; } - - std::vector GetBufferLayout() const override; }; class ARROW_EXPORT DecimalType : public FixedSizeBinaryType { public: - static constexpr Type::type type_id = Type::DECIMAL; - - explicit DecimalType(int32_t precision, int32_t scale) - : FixedSizeBinaryType(16, Type::DECIMAL), precision_(precision), scale_(scale) {} - - Status Accept(TypeVisitor* visitor) const override; - std::string ToString() const override; - std::string name() const override { return "decimal"; } + explicit DecimalType(int32_t byte_width, int32_t precision, int32_t scale) + : FixedSizeBinaryType(byte_width, Type::DECIMAL), + precision_(precision), + scale_(scale) {} int32_t precision() const { return precision_; } int32_t scale() const { return scale_; } - private: + protected: int32_t precision_; int32_t scale_; }; -enum class UnionMode : char { SPARSE, DENSE }; +class ARROW_EXPORT Decimal128Type : public DecimalType { + public: + static constexpr Type::type type_id = Type::DECIMAL; + + explicit Decimal128Type(int32_t precision, int32_t scale) + : DecimalType(16, precision, scale) 
{} + + Status Accept(TypeVisitor* visitor) const override; + std::string ToString() const override; + std::string name() const override { return "decimal"; } +}; + +struct UnionMode { + enum type { SPARSE, DENSE }; +}; class ARROW_EXPORT UnionType : public NestedType { public: static constexpr Type::type type_id = Type::UNION; UnionType(const std::vector>& fields, - const std::vector& type_codes, UnionMode mode = UnionMode::SPARSE); + const std::vector& type_codes, + UnionMode::type mode = UnionMode::SPARSE); std::string ToString() const override; std::string name() const override { return "union"; } Status Accept(TypeVisitor* visitor) const override; - std::vector GetBufferLayout() const override; - const std::vector& type_codes() const { return type_codes_; } - UnionMode mode() const { return mode_; } + UnionMode::type mode() const { return mode_; } private: - UnionMode mode_; + UnionMode::type mode_; // The type id used in the data to indicate each data type in the union. For // example, the first type in the union might be denoted by the id 5 (instead @@ -842,7 +823,12 @@ struct_(const std::vector>& fields); /// \brief Create an instance of Union type std::shared_ptr ARROW_EXPORT union_(const std::vector>& child_fields, - const std::vector& type_codes, UnionMode mode = UnionMode::SPARSE); + const std::vector& type_codes, UnionMode::type mode = UnionMode::SPARSE); + +/// \brief Create and instance of Union type +std::shared_ptr ARROW_EXPORT +union_(const std::vector>& children, + UnionMode::type mode = UnionMode::SPARSE); /// \brief Create an instance of Dictionary type std::shared_ptr ARROW_EXPORT diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 0d06b6f6cb86e..9d8a23ce0714f 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -28,10 +28,16 @@ class Status; class DataType; class Array; +struct ArrayData; class ArrayBuilder; class Field; class Tensor; +class ChunkedArray; +class Column; +class RecordBatch; +class Table; + 
class Buffer; class MemoryPool; class RecordBatch; @@ -68,9 +74,9 @@ class StructType; class StructArray; class StructBuilder; -class DecimalType; -class DecimalArray; -class DecimalBuilder; +class Decimal128Type; +class Decimal128Array; +class Decimal128Builder; class UnionType; class UnionArray; diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index fbd78398f4579..4bfce9b5f0c53 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -230,9 +230,9 @@ struct TypeTraits { }; template <> -struct TypeTraits { - using ArrayType = DecimalArray; - using BuilderType = DecimalBuilder; +struct TypeTraits { + using ArrayType = Decimal128Array; + using BuilderType = Decimal128Builder; constexpr static bool is_parameter_free = false; }; @@ -430,6 +430,10 @@ static inline bool is_binary_like(Type::type type_id) { return false; } +static inline bool is_dictionary(Type::type type_id) { + return type_id == Type::DICTIONARY; +} + } // namespace arrow #endif // ARROW_TYPE_TRAITS_H diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 5df5e748f39e5..8b61a3acfe709 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -34,15 +34,16 @@ install(FILES cpu-info.h decimal.h hash-util.h + hash.h key_value_metadata.h logging.h macros.h parallel.h - random.h rle-encoding.h sse-util.h stl.h type_traits.h + variant.h visibility.h DESTINATION include/arrow/util) @@ -56,11 +57,16 @@ if (ARROW_BUILD_BENCHMARKS) target_link_libraries(arrow_benchmark_main benchmark ) + elseif(MSVC) + target_link_libraries(arrow_benchmark_main + benchmark + Shlwapi.lib + ) else() - target_link_libraries(arrow_benchmark_main + target_link_libraries(arrow_benchmark_main benchmark pthread - ) + ) endif() # TODO(wesm): Some benchmarks include gtest.h @@ -73,3 +79,7 @@ ADD_ARROW_TEST(decimal-test) ADD_ARROW_TEST(key-value-metadata-test) ADD_ARROW_TEST(rle-encoding-test) ADD_ARROW_TEST(stl-util-test) + 
+ADD_ARROW_BENCHMARK(bit-util-benchmark) + +add_subdirectory(variant) diff --git a/cpp/src/arrow/util/bit-util-benchmark.cc b/cpp/src/arrow/util/bit-util-benchmark.cc new file mode 100644 index 0000000000000..8969dd80b157e --- /dev/null +++ b/cpp/src/arrow/util/bit-util-benchmark.cc @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "benchmark/benchmark.h" + +#include + +#include "arrow/buffer.h" +#include "arrow/builder.h" +#include "arrow/memory_pool.h" +#include "arrow/test-util.h" +#include "arrow/util/bit-util.h" + +namespace arrow { +namespace BitUtil { + +static void BM_CopyBitmap(benchmark::State& state) { // NOLINT non-const reference + const int kBufferSize = state.range(0); + + std::shared_ptr buffer; + ASSERT_OK(AllocateBuffer(default_memory_pool(), kBufferSize, &buffer)); + memset(buffer->mutable_data(), 0, kBufferSize); + test::random_bytes(kBufferSize, 0, buffer->mutable_data()); + + const int num_bits = kBufferSize * 8; + const uint8_t* src = buffer->data(); + + std::shared_ptr copy; + while (state.KeepRunning()) { + ABORT_NOT_OK(CopyBitmap(default_memory_pool(), src, state.range(1), num_bits, ©)); + } + state.SetBytesProcessed(state.iterations() * kBufferSize * sizeof(int8_t)); +} + +BENCHMARK(BM_CopyBitmap) + ->Args({100000, 0}) + ->Args({1000000, 0}) + ->Args({100000, 4}) + ->Args({1000000, 4}) + ->MinTime(1.0) + ->Unit(benchmark::kMicrosecond); + +} // namespace BitUtil +} // namespace arrow diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc index 92bdcb5fc0846..4c64dea374d01 100644 --- a/cpp/src/arrow/util/bit-util-test.cc +++ b/cpp/src/arrow/util/bit-util-test.cc @@ -165,19 +165,20 @@ TEST(BitUtilTests, TestCopyBitmap) { memset(buffer->mutable_data(), 0, kBufferSize); test::random_bytes(kBufferSize, 0, buffer->mutable_data()); - const int num_bits = kBufferSize * 8; - const uint8_t* src = buffer->data(); + std::vector lengths = {kBufferSize * 8 - 4, kBufferSize * 8}; std::vector offsets = {0, 12, 16, 32, 37, 63, 64, 128}; - for (int64_t offset : offsets) { - const int64_t copy_length = num_bits - offset; + for (int64_t num_bits : lengths) { + for (int64_t offset : offsets) { + const int64_t copy_length = num_bits - offset; - std::shared_ptr copy; - ASSERT_OK(CopyBitmap(default_memory_pool(), src, offset, copy_length, ©)); + 
std::shared_ptr copy; + ASSERT_OK(CopyBitmap(default_memory_pool(), src, offset, copy_length, ©)); - for (int64_t i = 0; i < copy_length; ++i) { - ASSERT_EQ(BitUtil::GetBit(src, i + offset), BitUtil::GetBit(copy->data(), i)); + for (int64_t i = 0; i < copy_length; ++i) { + ASSERT_EQ(BitUtil::GetBit(src, i + offset), BitUtil::GetBit(copy->data(), i)); + } } } } diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc index 4dd91e99ad9da..c77f0d008b502 100644 --- a/cpp/src/arrow/util/bit-util.cc +++ b/cpp/src/arrow/util/bit-util.cc @@ -109,9 +109,37 @@ Status CopyBitmap(MemoryPool* pool, const uint8_t* data, int64_t offset, int64_t std::shared_ptr buffer; RETURN_NOT_OK(GetEmptyBitmap(pool, length, &buffer)); uint8_t* dest = buffer->mutable_data(); - for (int64_t i = 0; i < length; ++i) { - BitUtil::SetBitTo(dest, i, BitUtil::GetBit(data, i + offset)); + + int64_t byte_offset = offset / 8; + int64_t bit_offset = offset % 8; + int64_t num_bytes = BitUtil::BytesForBits(length); + int64_t bits_to_zero = num_bytes * 8 - length; + + if (bit_offset > 0) { + uint32_t carry_mask = BitUtil::kBitmask[bit_offset] - 1U; + uint32_t carry_shift = 8U - static_cast(bit_offset); + + uint32_t carry = 0U; + if (BitUtil::BytesForBits(length + bit_offset) > num_bytes) { + carry = (data[byte_offset + num_bytes] & carry_mask) << carry_shift; + } + + int64_t i = num_bytes - 1; + while (i + 1 > 0) { + uint8_t cur_byte = data[byte_offset + i]; + dest[i] = static_cast((cur_byte >> bit_offset) | carry); + carry = (cur_byte & carry_mask) << carry_shift; + --i; + } + } else { + std::memcpy(dest, data + byte_offset, static_cast(num_bytes)); + } + + for (int64_t i = length; i < length + bits_to_zero; ++i) { + // Both branches may copy extra bits - unsetting to match specification. 
+ BitUtil::SetBitTo(dest, i, false); } + *out = buffer; return Status::OK(); } diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 8043f90ccdf6a..86c17d16801b9 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -43,6 +43,8 @@ #endif #if defined(_MSC_VER) +#include +#pragma intrinsic(_BitScanReverse) #define ARROW_BYTE_SWAP64 _byteswap_uint64 #define ARROW_BYTE_SWAP32 _byteswap_ulong #else @@ -137,13 +139,10 @@ static inline void SetArrayBit(uint8_t* bits, int i, bool is_set) { } static inline void SetBitTo(uint8_t* bits, int64_t i, bool bit_is_set) { - // TODO: speed up. See https://graphics.stanford.edu/~seander/bithacks.html + // https://graphics.stanford.edu/~seander/bithacks.html // "Conditionally set or clear bits without branching" - if (bit_is_set) { - SetBit(bits, i); - } else { - ClearBit(bits, i); - } + bits[i / 8] ^= static_cast(-static_cast(bit_is_set) ^ bits[i / 8]) & + kBitmask[i % 8]; } // Returns the minimum number of bits needed to represent the value of 'x' @@ -296,6 +295,25 @@ static inline int Log2(uint64_t x) { return result; } +/// \brief Count the number of leading zeros in a 32 bit integer. +static inline int64_t CountLeadingZeros(uint32_t value) { +// DCHECK_NE(value, 0); +#if defined(__clang__) || defined(__GNUC__) + return static_cast(__builtin_clz(value)); +#elif defined(_MSC_VER) + unsigned long index; // NOLINT + _BitScanReverse(&index, static_cast(value)); // NOLINT + return 31LL - static_cast(index); +#else + int64_t bitpos = 0; + while (value != 0) { + value >>= 1; + ++bitpos; + } + return 32LL - bitpos; +#endif +} + /// Swaps the byte order (i.e. 
endianess) static inline int64_t ByteSwap(int64_t value) { return ARROW_BYTE_SWAP64(value); } static inline uint64_t ByteSwap(uint64_t value) { diff --git a/cpp/src/arrow/util/compression.h b/cpp/src/arrow/util/compression.h index ae187a7fcdf1c..de3837ec727b1 100644 --- a/cpp/src/arrow/util/compression.h +++ b/cpp/src/arrow/util/compression.h @@ -27,7 +27,7 @@ namespace arrow { struct Compression { - enum type { UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI, ZSTD, LZ4 }; + enum type { UNCOMPRESSED, SNAPPY, GZIP, BROTLI, ZSTD, LZ4, LZO }; }; class ARROW_EXPORT Codec { diff --git a/cpp/src/arrow/util/decimal-test.cc b/cpp/src/arrow/util/decimal-test.cc index 565a1bbb939b6..e4406747d557c 100644 --- a/cpp/src/arrow/util/decimal-test.cc +++ b/cpp/src/arrow/util/decimal-test.cc @@ -18,6 +18,7 @@ #include #include +#include #include @@ -36,9 +37,8 @@ class DecimalTestFixture : public ::testing::Test { TEST_F(DecimalTestFixture, TestToString) { Decimal128 decimal(this->integer_value_); - int precision = 8; int scale = 5; - std::string result = decimal.ToString(precision, scale); + std::string result = decimal.ToString(scale); ASSERT_EQ(result, this->string_value_); } @@ -256,4 +256,123 @@ TEST(Decimal128TestFalse, ConstructibleFromBool) { ASSERT_EQ(0, value.low_bits()); } +TEST(Decimal128Test, Division) { + const std::string expected_string_value("-23923094039234029"); + const Decimal128 value(expected_string_value); + const Decimal128 result(value / 3); + const Decimal128 expected_value("-7974364679744676"); + ASSERT_EQ(expected_value, result); +} + +TEST(Decimal128Test, PrintLargePositiveValue) { + const std::string string_value("99999999999999999999999999999999999999"); + const Decimal128 value(string_value); + const std::string printed_value = value.ToIntegerString(); + ASSERT_EQ(string_value, printed_value); +} + +TEST(Decimal128Test, PrintLargeNegativeValue) { + const std::string string_value("-99999999999999999999999999999999999999"); + const Decimal128 
value(string_value); + const std::string printed_value = value.ToIntegerString(); + ASSERT_EQ(string_value, printed_value); +} + +TEST(Decimal128Test, PrintMaxValue) { + const std::string string_value("170141183460469231731687303715884105727"); + const Decimal128 value(string_value); + const std::string printed_value = value.ToIntegerString(); + ASSERT_EQ(string_value, printed_value); +} + +TEST(Decimal128Test, PrintMinValue) { + const std::string string_value("-170141183460469231731687303715884105728"); + const Decimal128 value(string_value); + const std::string printed_value = value.ToIntegerString(); + ASSERT_EQ(string_value, printed_value); +} + +class Decimal128PrintingTest + : public ::testing::TestWithParam> {}; + +TEST_P(Decimal128PrintingTest, Print) { + int32_t test_value; + int32_t scale; + std::string expected_string; + std::tie(test_value, scale, expected_string) = GetParam(); + const Decimal128 value(test_value); + const std::string printed_value = value.ToString(scale); + ASSERT_EQ(expected_string, printed_value); +} + +INSTANTIATE_TEST_CASE_P(Decimal128PrintingTest, Decimal128PrintingTest, + ::testing::Values(std::make_tuple(123, 1, "12.3"), + std::make_tuple(123, 5, "0.00123"), + std::make_tuple(123, 10, "1.23E-8"), + std::make_tuple(123, -1, "1.23E+3"), + std::make_tuple(-123, -1, "-1.23E+3"), + std::make_tuple(123, -3, "1.23E+5"), + std::make_tuple(-123, -3, "-1.23E+5"), + std::make_tuple(12345, -3, "1.2345E+7"))); + +class Decimal128ParsingTest + : public ::testing::TestWithParam> {}; + +TEST_P(Decimal128ParsingTest, Parse) { + std::string test_string; + uint64_t expected_low_bits; + int32_t expected_scale; + std::tie(test_string, expected_low_bits, expected_scale) = GetParam(); + Decimal128 value; + int32_t scale; + ASSERT_OK(Decimal128::FromString(test_string, &value, NULLPTR, &scale)); + ASSERT_EQ(value.low_bits(), expected_low_bits); + ASSERT_EQ(expected_scale, scale); +} + +INSTANTIATE_TEST_CASE_P(Decimal128ParsingTest, 
Decimal128ParsingTest, + ::testing::Values(std::make_tuple("12.3", 123ULL, 1), + std::make_tuple("0.00123", 123ULL, 5), + std::make_tuple("1.23E-8", 123ULL, 10), + std::make_tuple("-1.23E-8", -123LL, 10), + std::make_tuple("1.23E+3", 123ULL, -1), + std::make_tuple("-1.23E+3", -123LL, -1), + std::make_tuple("1.23E+5", 123ULL, -3), + std::make_tuple("1.2345E+7", 12345ULL, -3), + std::make_tuple("1.23e-8", 123ULL, 10), + std::make_tuple("-1.23e-8", -123LL, 10), + std::make_tuple("1.23e+3", 123ULL, -1), + std::make_tuple("-1.23e+3", -123LL, -1), + std::make_tuple("1.23e+5", 123ULL, -3), + std::make_tuple("1.2345e+7", 12345ULL, -3))); + +class Decimal128ParsingTestInvalid : public ::testing::TestWithParam {}; + +TEST_P(Decimal128ParsingTestInvalid, Parse) { + std::string test_string = GetParam(); + Decimal128 value; + ASSERT_RAISES(Invalid, Decimal128::FromString(test_string, &value)); +} + +INSTANTIATE_TEST_CASE_P(Decimal128ParsingTestInvalid, Decimal128ParsingTestInvalid, + ::testing::Values("0.00123D/3", "1.23eA8", "1.23E+3A", + "-1.23E--5", "1.2345E+++07")); + +TEST(Decimal128ParseTest, WithExponentAndNullptrScale) { + Decimal128 value; + ASSERT_OK(Decimal128::FromString("1.23E-8", &value)); + + const Decimal128 expected_value(123); + ASSERT_EQ(expected_value, value); +} + +TEST(Decimal128Test, TestSmallNumberFormat) { + Decimal128 value("0.2"); + std::string expected("0.2"); + + const int32_t scale = 1; + std::string result = value.ToString(scale); + ASSERT_EQ(expected, result); +} + } // namespace arrow diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index 7196b252c5b60..e999854b10860 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -19,14 +19,10 @@ #include #include #include +#include #include #include -#ifdef _MSC_VER -#include -#pragma intrinsic(_BitScanReverse) -#endif - #include "arrow/util/bit-util.h" #include "arrow/util/decimal.h" #include "arrow/util/logging.h" @@ -47,74 +43,156 @@ 
Decimal128::Decimal128(const uint8_t* bytes) } std::array Decimal128::ToBytes() const { - const uint64_t raw[] = {BitUtil::ToLittleEndian(low_bits_), - BitUtil::ToLittleEndian(static_cast(high_bits_))}; - const auto* raw_data = reinterpret_cast(raw); std::array out{{0}}; - std::copy(raw_data, raw_data + out.size(), out.begin()); + ToBytes(out.data()); return out; } -std::string Decimal128::ToString(int precision, int scale) const { - using std::size_t; +void Decimal128::ToBytes(uint8_t* out) const { + DCHECK_NE(out, NULLPTR); + reinterpret_cast(out)[0] = BitUtil::ToLittleEndian(low_bits_); + reinterpret_cast(out)[1] = BitUtil::ToLittleEndian(high_bits_); +} - const bool is_negative = *this < 0; +static constexpr Decimal128 kTenTo36(static_cast(0xC097CE7BC90715), + 0xB34B9F1000000000); +static constexpr Decimal128 kTenTo18(0xDE0B6B3A7640000); - // Decimal values are sent to clients as strings so in the interest of - // speed the string will be created without the using stringstream with the - // whole/fractional_part(). - size_t last_char_idx = precision + (scale > 0) // Add a space for decimal place - + (scale == precision) // Add a space for leading 0 - + is_negative; // Add a space for negative sign +std::string Decimal128::ToIntegerString() const { + Decimal128 remainder; + std::stringstream buf; + bool need_fill = false; - std::string str(last_char_idx, '0'); + // get anything above 10 ** 36 and print it + Decimal128 top; + Status s = Divide(kTenTo36, &top, &remainder); + DCHECK(s.ok()) << s.message(); - // Start filling in the values in reverse order by taking the last digit - // of the value. Use a positive value and worry about the sign later. At this - // point the last_char_idx points to the string terminator. 
- Decimal128 remaining_value(*this); + if (top != 0) { + buf << static_cast(top); + remainder.Abs(); + need_fill = true; + } - const auto first_digit_idx = static_cast(is_negative); - if (is_negative) { - remaining_value.Negate(); - } - - if (scale > 0) { - int remaining_scale = scale; - do { - str[--last_char_idx] = - static_cast(remaining_value % 10 + '0'); // Ascii offset - remaining_value /= 10; - } while (--remaining_scale > 0); - str[--last_char_idx] = '.'; - DCHECK_GT(last_char_idx, first_digit_idx) << "Not enough space remaining"; - } - - do { - str[--last_char_idx] = static_cast(remaining_value % 10 + '0'); // Ascii offset - remaining_value /= 10; - if (remaining_value == 0) { - // Trim any extra leading 0's. - if (last_char_idx > first_digit_idx) { - str.erase(0, last_char_idx - first_digit_idx); - } + // now get anything above 10 ** 18 and print it + Decimal128 tail; + s = remainder.Divide(kTenTo18, &top, &tail); - break; + if (need_fill || top != 0) { + if (need_fill) { + buf << std::setw(18) << std::setfill('0'); + } else { + need_fill = true; + tail.Abs(); } - // For safety, enforce string length independent of remaining_value. - } while (last_char_idx > first_digit_idx); + + buf << static_cast(top); + } + + // finally print the tail, which is less than 10**18 + if (need_fill) { + buf << std::setw(18) << std::setfill('0'); + } + buf << static_cast(tail); + return buf.str(); +} + +Decimal128::operator int64_t() const { + DCHECK(high_bits_ == 0 || high_bits_ == -1) + << "Trying to cast an Decimal128 greater than the value range of a " + "int64_t. high_bits_ must be equal to 0 or -1, got: " + << high_bits_; + return static_cast(low_bits_); +} + +static std::string ToStringNegativeScale(const std::string& str, + int32_t adjusted_exponent, bool is_negative) { + std::stringstream buf; + + size_t offset = 0; + buf << str[offset++]; if (is_negative) { - str[0] = '-'; + buf << str[offset++]; } - return str; + buf << '.' 
<< str.substr(offset, std::string::npos) << 'E' << std::showpos + << adjusted_exponent; + return buf.str(); } +std::string Decimal128::ToString(int32_t scale) const { + const std::string str(ToIntegerString()); + + if (scale == 0) { + return str; + } + + const bool is_negative = *this < 0; + + const auto len = static_cast(str.size()); + const auto is_negative_offset = static_cast(is_negative); + const int32_t adjusted_exponent = -scale + (len - 1 - is_negative_offset); + + /// Note that the -6 is taken from the Java BigDecimal documentation. + if (scale < 0 || adjusted_exponent < -6) { + return ToStringNegativeScale(str, adjusted_exponent, is_negative); + } + + if (is_negative) { + if (len - 1 > scale) { + const auto n = static_cast(len - scale); + return str.substr(0, n) + "." + str.substr(n, static_cast(scale)); + } + + if (len - 1 == scale) { + return "-0." + str.substr(1, std::string::npos); + } + + std::string result("-0." + std::string(static_cast(scale - len + 1), '0')); + return result + str.substr(1, std::string::npos); + } + + if (len > scale) { + const auto n = static_cast(len - scale); + return str.substr(0, n) + "." + str.substr(n, static_cast(scale)); + } + + if (len == scale) { + return "0." + str; + } + + return "0." 
+ std::string(static_cast(scale - len), '0') + str; +} + +static constexpr auto kInt64DecimalDigits = + static_cast(std::numeric_limits::digits10); +static constexpr int64_t kPowersOfTen[kInt64DecimalDigits + 1] = {1LL, + 10LL, + 100LL, + 1000LL, + 10000LL, + 100000LL, + 1000000LL, + 10000000LL, + 100000000LL, + 1000000000LL, + 10000000000LL, + 100000000000LL, + 1000000000000LL, + 10000000000000LL, + 100000000000000LL, + 1000000000000000LL, + 10000000000000000LL, + 100000000000000000LL, + 1000000000000000000LL}; + +static inline bool isdigit(char value) { return std::isdigit(value) != 0; } + static void StringToInteger(const std::string& str, Decimal128* out) { using std::size_t; - DCHECK_NE(out, nullptr) << "Decimal128 output variable cannot be nullptr"; + DCHECK_NE(out, NULLPTR) << "Decimal128 output variable cannot be NULLPTR"; DCHECK_EQ(*out, 0) << "When converting a string to Decimal128 the initial output must be 0"; @@ -122,13 +200,10 @@ static void StringToInteger(const std::string& str, Decimal128* out) { DCHECK_GT(length, 0) << "length of parsed decimal string should be greater than 0"; - size_t posn = 0; - - while (posn < length) { - const size_t group = std::min(static_cast(18), length - posn); - const auto chunk = static_cast(std::stoll(str.substr(posn, group))); - const auto multiple = - static_cast(std::pow(10.0, static_cast(group))); + for (size_t posn = 0; posn < length;) { + const size_t group = std::min(kInt64DecimalDigits, length - posn); + const int64_t chunk = std::stoll(str.substr(posn, group)); + const int64_t multiple = kPowersOfTen[group]; *out *= multiple; *out += chunk; @@ -139,7 +214,7 @@ static void StringToInteger(const std::string& str, Decimal128* out) { Status Decimal128::FromString(const std::string& s, Decimal128* out, int* precision, int* scale) { - // Implements this regex: "(\\+?|-?)((0*)(\\d*))(\\.(\\d+))?"; + // Implements this regex: "(\\+?|-?)((0*)(\\d*))(\\.(\\d+))?((E|e)(\\+|-)?\\d+)?"; if (s.empty()) { return 
Status::Invalid("Empty string cannot be converted to decimal"); } @@ -165,21 +240,21 @@ Status Decimal128::FromString(const std::string& s, Decimal128* out, int* precis DCHECK_LT(charp, end); // skip leading zeros - charp = std::find_if_not(charp, end, [](char c) { return c == '0'; }); + charp = std::find_if_not(charp, end, [](char value) { return value == '0'; }); // all zeros and no decimal point if (charp == end) { - if (out != nullptr) { + if (out != NULLPTR) { *out = 0; } // Not sure what other libraries assign precision to for this case (this case of // a string consisting only of one or more zeros) - if (precision != nullptr) { + if (precision != NULLPTR) { *precision = static_cast(charp - numeric_string_start); } - if (scale != nullptr) { + if (scale != NULLPTR) { *scale = 0; } @@ -188,7 +263,7 @@ Status Decimal128::FromString(const std::string& s, Decimal128* out, int* precis std::string::const_iterator whole_part_start = charp; - charp = std::find_if_not(charp, end, [](char c) { return std::isdigit(c) != 0; }); + charp = std::find_if_not(charp, end, isdigit); std::string::const_iterator whole_part_end = charp; std::string whole_part(whole_part_start, whole_part_end); @@ -219,14 +294,13 @@ Status Decimal128::FromString(const std::string& s, Decimal128* out, int* precis std::string::const_iterator fractional_part_start = charp; - // The rest must be digits, because if we have a decimal point it must be followed by - // digits + // The rest must be digits or an exponent if (charp != end) { - charp = std::find_if_not(charp, end, [](char c) { return std::isdigit(c) != 0; }); + charp = std::find_if_not(charp, end, isdigit); // The while loop has ended before the end of the string which means we've hit a - // character that isn't a base ten digit - if (charp != end) { + // character that isn't a base ten digit or "E" for exponent + if (charp != end && *charp != 'E' && *charp != 'e') { std::stringstream ss; ss << "Found non base ten digit character '" << *charp 
<< "' before the end of the string"; @@ -237,15 +311,55 @@ Status Decimal128::FromString(const std::string& s, Decimal128* out, int* precis std::string::const_iterator fractional_part_end = charp; std::string fractional_part(fractional_part_start, fractional_part_end); - if (precision != nullptr) { + if (precision != NULLPTR) { *precision = static_cast(whole_part.size() + fractional_part.size()); } - if (scale != nullptr) { - *scale = static_cast(fractional_part.size()); + if (charp != end) { + // we must have an exponent, if this aborts then we have somehow not caught this and + // raised a proper error + DCHECK(*charp == 'E' || *charp == 'e'); + + ++charp; + + const char value = *charp; + const bool starts_with_plus_or_minus = value == '+' || value == '-'; + + // we use this to construct the adjusted exponent integer later + std::string::const_iterator digit_start = charp; + + // skip plus or minus + charp += starts_with_plus_or_minus; + + // confirm that the rest of the characters are digits + charp = std::find_if_not(charp, end, isdigit); + + if (charp != end) { + // we have something other than digits here + std::stringstream ss; + ss << "Found non decimal digit exponent value '" << *charp << "'"; + return Status::Invalid(ss.str()); + } + + if (scale != NULLPTR) { + // compute the scale from the adjusted exponent + std::string adjusted_exponent_string(digit_start, end); + DCHECK(std::all_of(adjusted_exponent_string.cbegin() + starts_with_plus_or_minus, + adjusted_exponent_string.cend(), isdigit)) + << "Non decimal digit character found in " << adjusted_exponent_string; + const auto adjusted_exponent = + static_cast(std::stol(adjusted_exponent_string)); + const auto len = static_cast(whole_part.size() + fractional_part.size()); + + *scale = -adjusted_exponent + len - 1; + } + } else { + if (scale != NULLPTR) { + *scale = static_cast(fractional_part.size()); + } } - if (out != nullptr) { + if (out != NULLPTR) { // zero out in case we've passed in a previously 
used value *out = 0; StringToInteger(whole_part + fractional_part, out); @@ -266,6 +380,8 @@ Decimal128& Decimal128::Negate() { return *this; } +Decimal128& Decimal128::Abs() { return *this < 0 ? Negate() : *this; } + Decimal128& Decimal128::operator+=(const Decimal128& right) { const uint64_t sum = low_bits_ + right.low_bits_; high_bits_ += right.high_bits_; @@ -288,20 +404,11 @@ Decimal128& Decimal128::operator-=(const Decimal128& right) { Decimal128& Decimal128::operator/=(const Decimal128& right) { Decimal128 remainder; - DCHECK(Divide(right, this, &remainder).ok()); + Status s = Divide(right, this, &remainder); + DCHECK(s.ok()); return *this; } -Decimal128::operator char() const { - DCHECK(high_bits_ == 0 || high_bits_ == -1) - << "Trying to cast an Decimal128 greater than the value range of a " - "char. high_bits_ must be equal to 0 or -1, got: " - << high_bits_; - DCHECK_LE(low_bits_, std::numeric_limits::max()) - << "low_bits_ too large for C type char, got: " << low_bits_; - return static_cast(low_bits_); -} - Decimal128& Decimal128::operator|=(const Decimal128& right) { low_bits_ |= right.low_bits_; high_bits_ |= right.high_bits_; @@ -440,18 +547,6 @@ static int64_t FillInArray(const Decimal128& value, uint32_t* array, bool& was_n return 1; } -/// \brief Find last set bit in a 32 bit integer. Bit 1 is the LSB and bit 32 is the MSB. -static int64_t FindLastSetBit(uint32_t value) { -#if defined(__clang__) || defined(__GNUC__) - // Count leading zeros - return __builtin_clz(value) + 1; -#elif defined(_MSC_VER) - unsigned long index; // NOLINT - _BitScanReverse(&index, static_cast(value)); // NOLINT - return static_cast(index + 1UL); -#endif -} - /// Shift the number in the array left by bits positions. 
/// \param array the number to shift, must have length elements /// \param length the number of entries in the array @@ -581,7 +676,7 @@ Status Decimal128::Divide(const Decimal128& divisor, Decimal128* result, // Normalize by shifting both by a multiple of 2 so that // the digit guessing is better. The requirement is that // divisor_array[0] is greater than 2**31. - int64_t normalize_bits = 32 - FindLastSetBit(divisor_array[0]); + int64_t normalize_bits = BitUtil::CountLeadingZeros(divisor_array[0]); ShiftArrayLeft(divisor_array, divisor_length, normalize_bits); ShiftArrayLeft(dividend_array, dividend_length, normalize_bits); @@ -589,7 +684,7 @@ Status Decimal128::Divide(const Decimal128& divisor, Decimal128* result, for (int64_t j = 0; j < result_length; ++j) { // Guess the next digit. At worst it is two too large uint32_t guess = std::numeric_limits::max(); - auto high_dividend = + const auto high_dividend = static_cast(dividend_array[j]) << 32 | dividend_array[j + 1]; if (dividend_array[j] != divisor_array[0]) { guess = static_cast(high_dividend / divisor_array[0]); @@ -625,10 +720,9 @@ Status Decimal128::Divide(const Decimal128& divisor, Decimal128* result, // if guess was too big, we add back divisor if (dividend_array[j] > prev) { --guess; - uint32_t carry = 0; for (int64_t i = divisor_length - 1; i >= 0; --i) { - uint64_t sum = + const auto sum = static_cast(divisor_array[i]) + dividend_array[j + i + 1] + carry; dividend_array[j + i + 1] = static_cast(sum); carry = static_cast(sum >> 32); @@ -645,6 +739,7 @@ Status Decimal128::Divide(const Decimal128& divisor, Decimal128* result, // return result and remainder RETURN_NOT_OK(BuildFromArray(result, result_array, result_length)); RETURN_NOT_OK(BuildFromArray(remainder, dividend_array, dividend_length)); + FixDivisionSigns(result, remainder, dividend_was_negative, divisor_was_negative); return Status::OK(); } @@ -679,6 +774,11 @@ Decimal128 operator-(const Decimal128& operand) { return result.Negate(); } 
+Decimal128 operator~(const Decimal128& operand) { + Decimal128 result(~operand.high_bits(), ~operand.low_bits()); + return result; +} + Decimal128 operator+(const Decimal128& left, const Decimal128& right) { Decimal128 result(left.high_bits(), left.low_bits()); result += right; @@ -700,15 +800,81 @@ Decimal128 operator*(const Decimal128& left, const Decimal128& right) { Decimal128 operator/(const Decimal128& left, const Decimal128& right) { Decimal128 remainder; Decimal128 result; - DCHECK(left.Divide(right, &result, &remainder).ok()); + Status s = left.Divide(right, &result, &remainder); + DCHECK(s.ok()); return result; } Decimal128 operator%(const Decimal128& left, const Decimal128& right) { Decimal128 remainder; Decimal128 result; - DCHECK(left.Divide(right, &result, &remainder).ok()); + Status s = left.Divide(right, &result, &remainder); + DCHECK(s.ok()); return remainder; } +static const Decimal128 ScaleMultipliers[] = { + Decimal128(1), + Decimal128(10), + Decimal128(100), + Decimal128(1000), + Decimal128(10000), + Decimal128(100000), + Decimal128(1000000), + Decimal128(10000000), + Decimal128(100000000), + Decimal128(1000000000), + Decimal128(10000000000), + Decimal128(100000000000), + Decimal128(1000000000000), + Decimal128(10000000000000), + Decimal128(100000000000000), + Decimal128(1000000000000000), + Decimal128(10000000000000000), + Decimal128(100000000000000000), + Decimal128(1000000000000000000), + Decimal128("10000000000000000000"), + Decimal128("100000000000000000000"), + Decimal128("1000000000000000000000"), + Decimal128("10000000000000000000000"), + Decimal128("100000000000000000000000"), + Decimal128("1000000000000000000000000"), + Decimal128("10000000000000000000000000"), + Decimal128("100000000000000000000000000"), + Decimal128("1000000000000000000000000000"), + Decimal128("10000000000000000000000000000"), + Decimal128("100000000000000000000000000000"), + Decimal128("1000000000000000000000000000000"), + 
Decimal128("10000000000000000000000000000000"), + Decimal128("100000000000000000000000000000000"), + Decimal128("1000000000000000000000000000000000"), + Decimal128("10000000000000000000000000000000000"), + Decimal128("100000000000000000000000000000000000"), + Decimal128("1000000000000000000000000000000000000"), + Decimal128("10000000000000000000000000000000000000"), + Decimal128("100000000000000000000000000000000000000")}; + +Status Decimal128::Rescale(int32_t original_scale, int32_t new_scale, + Decimal128* out) const { + DCHECK_NE(out, NULLPTR); + DCHECK_NE(original_scale, new_scale); + const int32_t delta_scale = original_scale - new_scale; + const int32_t abs_delta_scale = std::abs(delta_scale); + DCHECK_GE(abs_delta_scale, 1); + DCHECK_LE(abs_delta_scale, 38); + + const Decimal128 scale_multiplier = ScaleMultipliers[abs_delta_scale]; + const Decimal128 result = *this * scale_multiplier; + + if (ARROW_PREDICT_FALSE(result < *this)) { + std::stringstream buf; + buf << "Rescaling decimal value from original scale " << original_scale + << " to new scale " << new_scale << " would cause overflow"; + return Status::Invalid(buf.str()); + } + + *out = result; + return Status::OK(); +} + } // namespace arrow diff --git a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h index 72da5547907db..1594090a0d30e 100644 --- a/cpp/src/arrow/util/decimal.h +++ b/cpp/src/arrow/util/decimal.h @@ -39,15 +39,16 @@ namespace arrow { class ARROW_EXPORT Decimal128 { public: /// \brief Create an Decimal128 from the two's complement representation. - constexpr Decimal128(int64_t high, uint64_t low) : high_bits_(high), low_bits_(low) {} + constexpr Decimal128(int64_t high, uint64_t low) noexcept + : high_bits_(high), low_bits_(low) {} /// \brief Empty constructor creates an Decimal128 with a value of 0. - constexpr Decimal128() : Decimal128(0, 0) {} + constexpr Decimal128() noexcept : Decimal128(0, 0) {} /// \brief Convert any integer value into an Decimal128. 
template ::value, T>::type> - constexpr Decimal128(T value) + constexpr Decimal128(T value) noexcept : Decimal128(static_cast(value) >= 0 ? 0 : -1, static_cast(value)) {} @@ -61,6 +62,9 @@ class ARROW_EXPORT Decimal128 { /// \brief Negate the current value Decimal128& Negate(); + /// \brief Absolute value + Decimal128& Abs(); + /// \brief Add a number to this one. The result is truncated to 128 bits. Decimal128& operator+=(const Decimal128& right); @@ -85,9 +89,6 @@ class ARROW_EXPORT Decimal128 { /// \brief In-place division. Decimal128& operator/=(const Decimal128& right); - /// \brief Cast the value to char. This is used when converting the value a string. - explicit operator char() const; - /// \brief Bitwise or between two Decimal128. Decimal128& operator|=(const Decimal128& right); @@ -101,23 +102,33 @@ class ARROW_EXPORT Decimal128 { Decimal128& operator>>=(uint32_t bits); /// \brief Get the high bits of the two's complement representation of the number. - int64_t high_bits() const { return high_bits_; } + inline int64_t high_bits() const { return high_bits_; } /// \brief Get the low bits of the two's complement representation of the number. - uint64_t low_bits() const { return low_bits_; } + inline uint64_t low_bits() const { return low_bits_; } /// \brief Return the raw bytes of the value in little-endian byte order. std::array ToBytes() const; + void ToBytes(uint8_t* out) const; /// \brief Convert the Decimal128 value to a base 10 decimal string with the given - /// precision and scale. - std::string ToString(int precision, int scale) const; + /// scale. + std::string ToString(int32_t scale) const; + + /// \brief Convert the value to an integer string + std::string ToIntegerString() const; + + /// \brief Cast this value to an int64_t. + explicit operator int64_t() const; /// \brief Convert a decimal string to an Decimal128 value, optionally including /// precision and scale if they're passed in and not null. 
static Status FromString(const std::string& s, Decimal128* out, int* precision = NULLPTR, int* scale = NULLPTR); + /// \brief Convert Decimal128 from one scale to another + Status Rescale(int32_t original_scale, int32_t new_scale, Decimal128* out) const; + private: int64_t high_bits_; uint64_t low_bits_; @@ -131,6 +142,7 @@ ARROW_EXPORT bool operator>(const Decimal128& left, const Decimal128& right); ARROW_EXPORT bool operator>=(const Decimal128& left, const Decimal128& right); ARROW_EXPORT Decimal128 operator-(const Decimal128& operand); +ARROW_EXPORT Decimal128 operator~(const Decimal128& operand); ARROW_EXPORT Decimal128 operator+(const Decimal128& left, const Decimal128& right); ARROW_EXPORT Decimal128 operator-(const Decimal128& left, const Decimal128& right); ARROW_EXPORT Decimal128 operator*(const Decimal128& left, const Decimal128& right); diff --git a/cpp/src/arrow/util/hash.cc b/cpp/src/arrow/util/hash.cc new file mode 100644 index 0000000000000..94ba524560faf --- /dev/null +++ b/cpp/src/arrow/util/hash.cc @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/util/hash.h" + +#include "arrow/buffer.h" +#include "arrow/status.h" + +namespace arrow { +namespace internal { + +Status NewHashTable(int64_t size, MemoryPool* pool, std::shared_ptr* out) { + auto hash_table = std::make_shared(pool); + + RETURN_NOT_OK(hash_table->Resize(sizeof(hash_slot_t) * size)); + int32_t* slots = reinterpret_cast(hash_table->mutable_data()); + std::fill(slots, slots + size, kHashSlotEmpty); + + *out = hash_table; + return Status::OK(); +} + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/util/hash.h b/cpp/src/arrow/util/hash.h new file mode 100644 index 0000000000000..3597342716388 --- /dev/null +++ b/cpp/src/arrow/util/hash.h @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_HASH_H +#define ARROW_UTIL_HASH_H + +#include +#include +#include + +namespace arrow { + +class Buffer; +class MemoryPool; +class Status; + +typedef int32_t hash_slot_t; +static constexpr hash_slot_t kHashSlotEmpty = std::numeric_limits::max(); + +// Initially 1024 elements +static constexpr int kInitialHashTableSize = 1 << 10; + +// The maximum load factor for the hash table before resizing. 
+static constexpr double kMaxHashTableLoad = 0.5; + +namespace internal { + +#define DOUBLE_TABLE_SIZE(SETUP_CODE, COMPUTE_HASH) \ + do { \ + int64_t new_size = hash_table_size_ * 2; \ + \ + std::shared_ptr new_hash_table; \ + RETURN_NOT_OK(internal::NewHashTable(new_size, pool_, &new_hash_table)); \ + int32_t* new_hash_slots = \ + reinterpret_cast(new_hash_table->mutable_data()); \ + int64_t new_mod_bitmask = new_size - 1; \ + \ + SETUP_CODE; \ + \ + for (int i = 0; i < hash_table_size_; ++i) { \ + hash_slot_t index = hash_slots_[i]; \ + \ + if (index == kHashSlotEmpty) { \ + continue; \ + } \ + \ + COMPUTE_HASH; \ + while (kHashSlotEmpty != new_hash_slots[j]) { \ + ++j; \ + if (ARROW_PREDICT_FALSE(j == new_size)) { \ + j = 0; \ + } \ + } \ + \ + new_hash_slots[j] = index; \ + } \ + \ + hash_table_ = new_hash_table; \ + hash_slots_ = reinterpret_cast(hash_table_->mutable_data()); \ + hash_table_size_ = new_size; \ + hash_table_load_threshold_ = \ + static_cast(static_cast(new_size) * kMaxHashTableLoad); \ + mod_bitmask_ = new_size - 1; \ + } while (false) + +Status NewHashTable(int64_t size, MemoryPool* pool, std::shared_ptr* out); + +} // namespace internal +} // namespace arrow + +#endif // ARROW_UTIL_HASH_H diff --git a/cpp/src/arrow/util/io-util.h b/cpp/src/arrow/util/io-util.h index dbca0d8be394e..7e2a94ca82320 100644 --- a/cpp/src/arrow/util/io-util.h +++ b/cpp/src/arrow/util/io-util.h @@ -40,7 +40,7 @@ class StdoutStream : public OutputStream { return Status::OK(); } - Status Write(const uint8_t* data, int64_t nbytes) override { + Status Write(const void* data, int64_t nbytes) override { pos_ += nbytes; std::cout.write(reinterpret_cast(data), nbytes); return Status::OK(); @@ -63,7 +63,7 @@ class StdinStream : public InputStream { return Status::OK(); } - Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) override { + Status Read(int64_t nbytes, int64_t* bytes_read, void* out) override { std::cin.read(reinterpret_cast(out), nbytes); if (std::cin) 
{ *bytes_read = nbytes; diff --git a/cpp/src/arrow/util/random.h b/cpp/src/arrow/util/random.h deleted file mode 100644 index 2e05a73033d0f..0000000000000 --- a/cpp/src/arrow/util/random.h +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -// Moved from Kudu http://github.com/cloudera/kudu - -#ifndef ARROW_UTIL_RANDOM_H_ -#define ARROW_UTIL_RANDOM_H_ - -#include - -#include - -namespace arrow { -namespace internal { -namespace random { - -static const uint32_t M = 2147483647L; // 2^31-1 -const double kTwoPi = 6.283185307179586476925286; - -} // namespace random -} // namespace internal - -// A very simple random number generator. Not especially good at -// generating truly random bits, but good enough for our needs in this -// package. This implementation is not thread-safe. -class Random { - public: - explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { - // Avoid bad seeds. - if (seed_ == 0 || seed_ == internal::random::M) { - seed_ = 1; - } - } - - // Next pseudo-random 32-bit unsigned integer. - // FIXME: This currently only generates 31 bits of randomness. - // The MSB will always be zero. - uint32_t Next() { - static const uint64_t A = 16807; // bits 14, 8, 7, 5, 2, 1, 0 - // We are computing - // seed_ = (seed_ * A) % M, where M = 2^31-1 - // - // seed_ must not be zero or M, or else all subsequent computed values - // will be zero or M respectively. For all other values, seed_ will end - // up cycling through every number in [1,M-1] - uint64_t product = seed_ * A; - - // Compute (product % M) using the fact that ((x << 31) % M) == x. - seed_ = static_cast((product >> 31) + (product & internal::random::M)); - // The first reduction may overflow by 1 bit, so we may need to - // repeat. 
mod == M is not possible; using > allows the faster - // sign-bit-based test. - if (seed_ > internal::random::M) { - seed_ -= internal::random::M; - } - return seed_; - } - - // Alias for consistency with Next64 - uint32_t Next32() { return Next(); } - - // Next pseudo-random 64-bit unsigned integer. - // FIXME: This currently only generates 62 bits of randomness due to Next() - // only giving 31 bits of randomness. The 2 most significant bits will always - // be zero. - uint64_t Next64() { - uint64_t large = Next(); - // Only shift by 31 bits so we end up with zeros in MSB and not scattered - // throughout the 64-bit word. This is due to the weakness in Next() noted - // above. - large <<= 31; - large |= Next(); - return large; - } - - // Returns a uniformly distributed value in the range [0..n-1] - // REQUIRES: n > 0 - uint32_t Uniform(uint32_t n) { return Next() % n; } - - // Alias for consistency with Uniform64 - uint32_t Uniform32(uint32_t n) { return Uniform(n); } - - // Returns a uniformly distributed 64-bit value in the range [0..n-1] - // REQUIRES: n > 0 - uint64_t Uniform64(uint64_t n) { return Next64() % n; } - - // Randomly returns true ~"1/n" of the time, and false otherwise. - // REQUIRES: n > 0 - bool OneIn(int n) { return (Next() % n) == 0; } - - // Skewed: pick "base" uniformly from range [0,max_log] and then - // return "base" random bits. The effect is to pick a number in the - // range [0,2^max_log-1] with exponential bias towards smaller numbers. - uint32_t Skewed(int max_log) { return Uniform(1 << Uniform(max_log + 1)); } - - // Creates a normal distribution variable using the - // Box-Muller transform. 
See: - // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform - // Adapted from WebRTC source code at: - // webrtc/trunk/modules/video_coding/main/test/test_util.cc - double Normal(double mean, double std_dev) { - double uniform1 = (Next() + 1.0) / (internal::random::M + 1.0); - double uniform2 = (Next() + 1.0) / (internal::random::M + 1.0); - return (mean + - std_dev * sqrt(-2 * ::log(uniform1)) * - cos(internal::random::kTwoPi * uniform2)); - } - - // Return a random number between 0.0 and 1.0 inclusive. - double NextDoubleFraction() { - return Next() / static_cast(internal::random::M + 1.0); - } - - private: - uint32_t seed_; -}; - -uint32_t random_seed() { - // TODO(wesm): use system time to get a reasonably random seed - return 0; -} - -} // namespace arrow - -#endif // ARROW_UTIL_RANDOM_H_ diff --git a/cpp/src/arrow/util/variant.h b/cpp/src/arrow/util/variant.h new file mode 100644 index 0000000000000..9bfc52cb1c9a2 --- /dev/null +++ b/cpp/src/arrow/util/variant.h @@ -0,0 +1,1117 @@ +// Copyright (c) MapBox +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, this +// list of conditions and the following disclaimer in the documentation and/or +// other materials provided with the distribution. +// - Neither the name "MapBox" nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef ARROW_UTIL_VARIANT_H +#define ARROW_UTIL_VARIANT_H + +#include +#include // size_t +#include // operator new +#include // runtime_error +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +// clang-format off +// [[deprecated]] is only available in C++14, use this for the time being +#if __cplusplus <= 201103L +# ifdef __GNUC__ +# define ARROW_VARIANT_DEPRECATED __attribute__((deprecated)) +# elif defined(_MSC_VER) +# define ARROW_VARIANT_DEPRECATED __declspec(deprecated) +# else +# define ARROW_VARIANT_DEPRECATED +# endif +#else +# define ARROW_VARIANT_DEPRECATED [[deprecated]] +#endif + + +#ifdef _MSC_VER +// https://msdn.microsoft.com/en-us/library/bw1hbe6y.aspx +# ifdef NDEBUG +# define VARIANT_INLINE __forceinline +# else +# define VARIANT_INLINE //__declspec(noinline) +# endif +#else +# ifdef NDEBUG +# define VARIANT_INLINE //inline __attribute__((always_inline)) +# else +# define VARIANT_INLINE __attribute__((noinline)) +# endif +#endif +// clang-format on + +// Exceptions +#if defined( __EXCEPTIONS) || defined( _MSC_VER) +#define HAS_EXCEPTIONS +#endif + +#define VARIANT_MAJOR_VERSION 1 +#define VARIANT_MINOR_VERSION 1 
+#define VARIANT_PATCH_VERSION 0 + +#define VARIANT_VERSION (VARIANT_MAJOR_VERSION * 100000) + (VARIANT_MINOR_VERSION * 100) + (VARIANT_PATCH_VERSION) + +namespace arrow { +namespace util { + +// XXX This should derive from std::logic_error instead of std::runtime_error. +// See https://github.com/mapbox/variant/issues/48 for details. +class bad_variant_access : public std::runtime_error +{ + +public: + explicit bad_variant_access(const std::string& what_arg) + : runtime_error(what_arg) {} + + explicit bad_variant_access(const char* what_arg) + : runtime_error(what_arg) {} + +}; // class bad_variant_access + +#if !defined(ARROW_VARIANT_MINIMIZE_SIZE) +using type_index_t = unsigned int; +#else +#if defined(ARROW_VARIANT_OPTIMIZE_FOR_SPEED) +using type_index_t = std::uint_fast8_t; +#else +using type_index_t = std::uint_least8_t; +#endif +#endif + +namespace detail { + +static constexpr type_index_t invalid_value = type_index_t(-1); + +template +struct direct_type; + +template +struct direct_type +{ + static constexpr type_index_t index = std::is_same::value + ? sizeof...(Types) + : direct_type::index; +}; + +template +struct direct_type +{ + static constexpr type_index_t index = invalid_value; +}; + +#if __cpp_lib_logical_traits >= 201510L + +using std::conjunction; +using std::disjunction; + +#else + +template +struct conjunction : std::true_type {}; + +template +struct conjunction : B1 {}; + +template +struct conjunction : std::conditional::type {}; + +template +struct conjunction : std::conditional, B1>::type {}; + +template +struct disjunction : std::false_type {}; + +template +struct disjunction : B1 {}; + +template +struct disjunction : std::conditional::type {}; + +template +struct disjunction : std::conditional>::type {}; + +#endif + +template +struct convertible_type; + +template +struct convertible_type +{ + static constexpr type_index_t index = std::is_convertible::value + ? disjunction...>::value ? 
invalid_value : sizeof...(Types) + : convertible_type::index; +}; + +template +struct convertible_type +{ + static constexpr type_index_t index = invalid_value; +}; + +template +struct value_traits +{ + using value_type = typename std::remove_const::type>::type; + using value_type_wrapper = recursive_wrapper; + static constexpr type_index_t direct_index = direct_type::index; + static constexpr bool is_direct = direct_index != invalid_value; + static constexpr type_index_t index_direct_or_wrapper = is_direct ? direct_index : direct_type::index; + static constexpr bool is_direct_or_wrapper = index_direct_or_wrapper != invalid_value; + static constexpr type_index_t index = is_direct_or_wrapper ? index_direct_or_wrapper : convertible_type::index; + static constexpr bool is_valid = index != invalid_value; + static constexpr type_index_t tindex = is_valid ? sizeof...(Types)-index : 0; + using target_type = typename std::tuple_element>::type; +}; + +template +struct enable_if_type +{ + using type = R; +}; + +template +struct result_of_unary_visit +{ + using type = typename std::result_of::type; +}; + +template +struct result_of_unary_visit::type> +{ + using type = typename F::result_type; +}; + +template +struct result_of_binary_visit +{ + using type = typename std::result_of::type; +}; + +template +struct result_of_binary_visit::type> +{ + using type = typename F::result_type; +}; + +template +struct static_max; + +template +struct static_max +{ + static const type_index_t value = arg; +}; + +template +struct static_max +{ + static const type_index_t value = arg1 >= arg2 ? 
static_max::value : static_max::value; +}; + +template +struct variant_helper; + +template +struct variant_helper +{ + VARIANT_INLINE static void destroy(const type_index_t type_index, void* data) + { + if (type_index == sizeof...(Types)) + { + reinterpret_cast(data)->~T(); + } + else + { + variant_helper::destroy(type_index, data); + } + } + + VARIANT_INLINE static void move(const type_index_t old_type_index, void* old_value, void* new_value) + { + if (old_type_index == sizeof...(Types)) + { + new (new_value) T(std::move(*reinterpret_cast(old_value))); + } + else + { + variant_helper::move(old_type_index, old_value, new_value); + } + } + + VARIANT_INLINE static void copy(const type_index_t old_type_index, const void* old_value, void* new_value) + { + if (old_type_index == sizeof...(Types)) + { + new (new_value) T(*reinterpret_cast(old_value)); + } + else + { + variant_helper::copy(old_type_index, old_value, new_value); + } + } +}; + +template <> +struct variant_helper<> +{ + VARIANT_INLINE static void destroy(const type_index_t, void*) {} + VARIANT_INLINE static void move(const type_index_t, void*, void*) {} + VARIANT_INLINE static void copy(const type_index_t, const void*, void*) {} +}; + +template +struct unwrapper +{ + static T const& apply_const(T const& obj) { return obj; } + static T& apply(T& obj) { return obj; } +}; + +template +struct unwrapper> +{ + static auto apply_const(recursive_wrapper const& obj) + -> typename recursive_wrapper::type const& + { + return obj.get(); + } + static auto apply(recursive_wrapper& obj) + -> typename recursive_wrapper::type& + { + return obj.get(); + } +}; + +template +struct unwrapper> +{ + static auto apply_const(std::reference_wrapper const& obj) + -> typename std::reference_wrapper::type const& + { + return obj.get(); + } + static auto apply(std::reference_wrapper& obj) + -> typename std::reference_wrapper::type& + { + return obj.get(); + } +}; + +template +struct dispatcher; + +template +struct dispatcher +{ + 
VARIANT_INLINE static R apply_const(V const& v, F&& f) + { + if (v.template is()) + { + return f(unwrapper::apply_const(v.template get_unchecked())); + } + else + { + return dispatcher::apply_const(v, std::forward(f)); + } + } + + VARIANT_INLINE static R apply(V& v, F&& f) + { + if (v.template is()) + { + return f(unwrapper::apply(v.template get_unchecked())); + } + else + { + return dispatcher::apply(v, std::forward(f)); + } + } +}; + +template +struct dispatcher +{ + VARIANT_INLINE static R apply_const(V const& v, F&& f) + { + return f(unwrapper::apply_const(v.template get_unchecked())); + } + + VARIANT_INLINE static R apply(V& v, F&& f) + { + return f(unwrapper::apply(v.template get_unchecked())); + } +}; + +template +struct binary_dispatcher_rhs; + +template +struct binary_dispatcher_rhs +{ + VARIANT_INLINE static R apply_const(V const& lhs, V const& rhs, F&& f) + { + if (rhs.template is()) // call binary functor + { + return f(unwrapper::apply_const(lhs.template get_unchecked()), + unwrapper::apply_const(rhs.template get_unchecked())); + } + else + { + return binary_dispatcher_rhs::apply_const(lhs, rhs, std::forward(f)); + } + } + + VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) + { + if (rhs.template is()) // call binary functor + { + return f(unwrapper::apply(lhs.template get_unchecked()), + unwrapper::apply(rhs.template get_unchecked())); + } + else + { + return binary_dispatcher_rhs::apply(lhs, rhs, std::forward(f)); + } + } +}; + +template +struct binary_dispatcher_rhs +{ + VARIANT_INLINE static R apply_const(V const& lhs, V const& rhs, F&& f) + { + return f(unwrapper::apply_const(lhs.template get_unchecked()), + unwrapper::apply_const(rhs.template get_unchecked())); + } + + VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) + { + return f(unwrapper::apply(lhs.template get_unchecked()), + unwrapper::apply(rhs.template get_unchecked())); + } +}; + +template +struct binary_dispatcher_lhs; + +template +struct binary_dispatcher_lhs +{ + VARIANT_INLINE 
static R apply_const(V const& lhs, V const& rhs, F&& f) + { + if (lhs.template is()) // call binary functor + { + return f(unwrapper::apply_const(lhs.template get_unchecked()), + unwrapper::apply_const(rhs.template get_unchecked())); + } + else + { + return binary_dispatcher_lhs::apply_const(lhs, rhs, std::forward(f)); + } + } + + VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) + { + if (lhs.template is()) // call binary functor + { + return f(unwrapper::apply(lhs.template get_unchecked()), + unwrapper::apply(rhs.template get_unchecked())); + } + else + { + return binary_dispatcher_lhs::apply(lhs, rhs, std::forward(f)); + } + } +}; + +template +struct binary_dispatcher_lhs +{ + VARIANT_INLINE static R apply_const(V const& lhs, V const& rhs, F&& f) + { + return f(unwrapper::apply_const(lhs.template get_unchecked()), + unwrapper::apply_const(rhs.template get_unchecked())); + } + + VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) + { + return f(unwrapper::apply(lhs.template get_unchecked()), + unwrapper::apply(rhs.template get_unchecked())); + } +}; + +template +struct binary_dispatcher; + +template +struct binary_dispatcher +{ + VARIANT_INLINE static R apply_const(V const& v0, V const& v1, F&& f) + { + if (v0.template is()) + { + if (v1.template is()) + { + return f(unwrapper::apply_const(v0.template get_unchecked()), + unwrapper::apply_const(v1.template get_unchecked())); // call binary functor + } + else + { + return binary_dispatcher_rhs::apply_const(v0, v1, std::forward(f)); + } + } + else if (v1.template is()) + { + return binary_dispatcher_lhs::apply_const(v0, v1, std::forward(f)); + } + return binary_dispatcher::apply_const(v0, v1, std::forward(f)); + } + + VARIANT_INLINE static R apply(V& v0, V& v1, F&& f) + { + if (v0.template is()) + { + if (v1.template is()) + { + return f(unwrapper::apply(v0.template get_unchecked()), + unwrapper::apply(v1.template get_unchecked())); // call binary functor + } + else + { + return binary_dispatcher_rhs::apply(v0, 
v1, std::forward(f)); + } + } + else if (v1.template is()) + { + return binary_dispatcher_lhs::apply(v0, v1, std::forward(f)); + } + return binary_dispatcher::apply(v0, v1, std::forward(f)); + } +}; + +template +struct binary_dispatcher +{ + VARIANT_INLINE static R apply_const(V const& v0, V const& v1, F&& f) + { + return f(unwrapper::apply_const(v0.template get_unchecked()), + unwrapper::apply_const(v1.template get_unchecked())); // call binary functor + } + + VARIANT_INLINE static R apply(V& v0, V& v1, F&& f) + { + return f(unwrapper::apply(v0.template get_unchecked()), + unwrapper::apply(v1.template get_unchecked())); // call binary functor + } +}; + +// comparator functors +struct equal_comp +{ + template + bool operator()(T const& lhs, T const& rhs) const + { + return lhs == rhs; + } +}; + +struct less_comp +{ + template + bool operator()(T const& lhs, T const& rhs) const + { + return lhs < rhs; + } +}; + +template +class comparer +{ +public: + explicit comparer(Variant const& lhs) noexcept + : lhs_(lhs) {} + comparer& operator=(comparer const&) = delete; + // visitor + template + bool operator()(T const& rhs_content) const + { + T const& lhs_content = lhs_.template get_unchecked(); + return Comp()(lhs_content, rhs_content); + } + +private: + Variant const& lhs_; +}; + +// hashing visitor +struct hasher +{ + template + std::size_t operator()(const T& hashable) const + { + return std::hash{}(hashable); + } +}; + +} // namespace detail + +struct no_init {}; + +template +class variant +{ + static_assert(sizeof...(Types) > 0, "Template parameter type list of variant can not be empty."); + static_assert(!detail::disjunction...>::value, "Variant can not hold reference types. 
Maybe use std::reference_wrapper?"); + static_assert(!detail::disjunction...>::value, "Variant can not hold array types."); + static_assert(sizeof...(Types) < std::numeric_limits::max(), "Internal index type must be able to accommodate all alternatives."); +private: + static const std::size_t data_size = detail::static_max::value; + static const std::size_t data_align = detail::static_max::value; +public: + struct adapted_variant_tag; + using types = std::tuple; +private: + using first_type = typename std::tuple_element<0, types>::type; + using data_type = typename std::aligned_storage::type; + using helper_type = detail::variant_helper; + + type_index_t type_index; + data_type data; + +public: + VARIANT_INLINE variant() noexcept(std::is_nothrow_default_constructible::value) + : type_index(sizeof...(Types)-1) + { + static_assert(std::is_default_constructible::value, "First type in variant must be default constructible to allow default construction of variant."); + new (&data) first_type(); + } + + VARIANT_INLINE variant(no_init) noexcept + : type_index(detail::invalid_value) {} + + // http://isocpp.org/blog/2012/11/universal-references-in-c11-scott-meyers + template , + typename Enable = typename std::enable_if, typename Traits::value_type>::value>::type > + VARIANT_INLINE variant(T&& val) noexcept(std::is_nothrow_constructible::value) + : type_index(Traits::index) + { + new (&data) typename Traits::target_type(std::forward(val)); + } + + VARIANT_INLINE variant(variant const& old) + : type_index(old.type_index) + { + helper_type::copy(old.type_index, &old.data, &data); + } + + VARIANT_INLINE variant(variant&& old) + noexcept(detail::conjunction...>::value) + : type_index(old.type_index) + { + helper_type::move(old.type_index, &old.data, &data); + } + +private: + VARIANT_INLINE void copy_assign(variant const& rhs) + { + helper_type::destroy(type_index, &data); + type_index = detail::invalid_value; + helper_type::copy(rhs.type_index, &rhs.data, &data); + type_index = 
rhs.type_index; + } + + VARIANT_INLINE void move_assign(variant&& rhs) + { + helper_type::destroy(type_index, &data); + type_index = detail::invalid_value; + helper_type::move(rhs.type_index, &rhs.data, &data); + type_index = rhs.type_index; + } + +public: + VARIANT_INLINE variant& operator=(variant&& other) + { + move_assign(std::move(other)); + return *this; + } + + VARIANT_INLINE variant& operator=(variant const& other) + { + copy_assign(other); + return *this; + } + + // conversions + // move-assign + template + VARIANT_INLINE variant& operator=(T&& rhs) noexcept + { + variant temp(std::forward(rhs)); + move_assign(std::move(temp)); + return *this; + } + + // copy-assign + template + VARIANT_INLINE variant& operator=(T const& rhs) + { + variant temp(rhs); + copy_assign(temp); + return *this; + } + + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE bool is() const + { + return type_index == detail::direct_type::index; + } + + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE bool is() const + { + return type_index == detail::direct_type, Types...>::index; + } + + VARIANT_INLINE bool valid() const + { + return type_index != detail::invalid_value; + } + + template + VARIANT_INLINE void set(Args&&... 
args) + { + helper_type::destroy(type_index, &data); + type_index = detail::invalid_value; + new (&data) T(std::forward(args)...); + type_index = detail::direct_type::index; + } + + // get_unchecked() + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get_unchecked() + { + return *reinterpret_cast(&data); + } + +#ifdef HAS_EXCEPTIONS + // get() + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get() + { + if (type_index == detail::direct_type::index) + { + return *reinterpret_cast(&data); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get_unchecked() const + { + return *reinterpret_cast(&data); + } + +#ifdef HAS_EXCEPTIONS + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get() const + { + if (type_index == detail::direct_type::index) + { + return *reinterpret_cast(&data); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + // get_unchecked() - T stored as recursive_wrapper + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get_unchecked() + { + return (*reinterpret_cast*>(&data)).get(); + } + +#ifdef HAS_EXCEPTIONS + // get() - T stored as recursive_wrapper + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get() + { + if (type_index == detail::direct_type, Types...>::index) + { + return (*reinterpret_cast*>(&data)).get(); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get_unchecked() const + { + return (*reinterpret_cast const*>(&data)).get(); + } + +#ifdef HAS_EXCEPTIONS + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get() const + { + if (type_index == 
detail::direct_type, Types...>::index) + { + return (*reinterpret_cast const*>(&data)).get(); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + // get_unchecked() - T stored as std::reference_wrapper + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get_unchecked() + { + return (*reinterpret_cast*>(&data)).get(); + } + +#ifdef HAS_EXCEPTIONS + // get() - T stored as std::reference_wrapper + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get() + { + if (type_index == detail::direct_type, Types...>::index) + { + return (*reinterpret_cast*>(&data)).get(); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get_unchecked() const + { + return (*reinterpret_cast const*>(&data)).get(); + } + +#ifdef HAS_EXCEPTIONS + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get() const + { + if (type_index == detail::direct_type, Types...>::index) + { + return (*reinterpret_cast const*>(&data)).get(); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + // This function is deprecated because it returns an internal index field. + // Use which() instead. 
+ ARROW_VARIANT_DEPRECATED VARIANT_INLINE type_index_t get_type_index() const + { + return type_index; + } + + VARIANT_INLINE int which() const noexcept + { + return static_cast(sizeof...(Types) - type_index - 1); + } + + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE static constexpr int which() noexcept + { + return static_cast(sizeof...(Types)-detail::direct_type::index - 1); + } + + // visitor + // unary + template ::type> + auto VARIANT_INLINE static visit(V const& v, F&& f) + -> decltype(detail::dispatcher::apply_const(v, std::forward(f))) + { + return detail::dispatcher::apply_const(v, std::forward(f)); + } + // non-const + template ::type> + auto VARIANT_INLINE static visit(V& v, F&& f) + -> decltype(detail::dispatcher::apply(v, std::forward(f))) + { + return detail::dispatcher::apply(v, std::forward(f)); + } + + // binary + // const + template ::type> + auto VARIANT_INLINE static binary_visit(V const& v0, V const& v1, F&& f) + -> decltype(detail::binary_dispatcher::apply_const(v0, v1, std::forward(f))) + { + return detail::binary_dispatcher::apply_const(v0, v1, std::forward(f)); + } + // non-const + template ::type> + auto VARIANT_INLINE static binary_visit(V& v0, V& v1, F&& f) + -> decltype(detail::binary_dispatcher::apply(v0, v1, std::forward(f))) + { + return detail::binary_dispatcher::apply(v0, v1, std::forward(f)); + } + + // match + // unary + template + auto VARIANT_INLINE match(Fs&&... fs) const + -> decltype(variant::visit(*this, ::arrow::util::make_visitor(std::forward(fs)...))) + { + return variant::visit(*this, ::arrow::util::make_visitor(std::forward(fs)...)); + } + // non-const + template + auto VARIANT_INLINE match(Fs&&... 
fs) + -> decltype(variant::visit(*this, ::arrow::util::make_visitor(std::forward(fs)...))) + { + return variant::visit(*this, ::arrow::util::make_visitor(std::forward(fs)...)); + } + + ~variant() noexcept // no-throw destructor + { + helper_type::destroy(type_index, &data); + } + + // comparison operators + // equality + VARIANT_INLINE bool operator==(variant const& rhs) const + { + assert(valid() && rhs.valid()); + if (this->which() != rhs.which()) + { + return false; + } + detail::comparer visitor(*this); + return visit(rhs, visitor); + } + + VARIANT_INLINE bool operator!=(variant const& rhs) const + { + return !(*this == rhs); + } + + // less than + VARIANT_INLINE bool operator<(variant const& rhs) const + { + assert(valid() && rhs.valid()); + if (this->which() != rhs.which()) + { + return this->which() < rhs.which(); + } + detail::comparer visitor(*this); + return visit(rhs, visitor); + } + VARIANT_INLINE bool operator>(variant const& rhs) const + { + return rhs < *this; + } + VARIANT_INLINE bool operator<=(variant const& rhs) const + { + return !(*this > rhs); + } + VARIANT_INLINE bool operator>=(variant const& rhs) const + { + return !(*this < rhs); + } +}; + +// unary visitor interface +// const +template +auto VARIANT_INLINE apply_visitor(F&& f, V const& v) -> decltype(V::visit(v, std::forward(f))) +{ + return V::visit(v, std::forward(f)); +} + +// non-const +template +auto VARIANT_INLINE apply_visitor(F&& f, V& v) -> decltype(V::visit(v, std::forward(f))) +{ + return V::visit(v, std::forward(f)); +} + +// binary visitor interface +// const +template +auto VARIANT_INLINE apply_visitor(F&& f, V const& v0, V const& v1) -> decltype(V::binary_visit(v0, v1, std::forward(f))) +{ + return V::binary_visit(v0, v1, std::forward(f)); +} + +// non-const +template +auto VARIANT_INLINE apply_visitor(F&& f, V& v0, V& v1) -> decltype(V::binary_visit(v0, v1, std::forward(f))) +{ + return V::binary_visit(v0, v1, std::forward(f)); +} + +// getter interface + +#ifdef 
HAS_EXCEPTIONS +template +auto get(T& var)->decltype(var.template get()) +{ + return var.template get(); +} +#endif + +template +ResultType& get_unchecked(T& var) +{ + return var.template get_unchecked(); +} + +#ifdef HAS_EXCEPTIONS +template +auto get(T const& var)->decltype(var.template get()) +{ + return var.template get(); +} +#endif + +template +ResultType const& get_unchecked(T const& var) +{ + return var.template get_unchecked(); +} +// variant_size +template +struct variant_size; + +//variable templates is c++14 +//template +//constexpr std::size_t variant_size_v = variant_size::value; + +template +struct variant_size + : variant_size {}; + +template +struct variant_size + : variant_size {}; + +template +struct variant_size + : variant_size {}; + +template +struct variant_size> + : std::integral_constant {}; + +// variant_alternative +template +struct variant_alternative; + +#if defined(__clang__) +#if __has_builtin(__type_pack_element) +#define has_type_pack_element +#endif +#endif + +#if defined(has_type_pack_element) +template +struct variant_alternative> +{ + static_assert(sizeof...(Types) > Index , "Index out of range"); + using type = __type_pack_element; +}; +#else +template +struct variant_alternative> + : variant_alternative> +{ + static_assert(sizeof...(Types) > Index -1 , "Index out of range"); +}; + +template +struct variant_alternative<0, variant> +{ + using type = First; +}; + +#endif + +template +using variant_alternative_t = typename variant_alternative::type; + +template +struct variant_alternative + : std::add_const> {}; + +template +struct variant_alternative + : std::add_volatile> {}; + +template +struct variant_alternative + : std::add_cv> {}; + +} // namespace util +} // namespace arrow + +#endif // ARROW_UTIL_VARIANT_H diff --git a/cpp/src/arrow/util/variant/CMakeLists.txt b/cpp/src/arrow/util/variant/CMakeLists.txt new file mode 100644 index 0000000000000..0ebb2516246ed --- /dev/null +++ b/cpp/src/arrow/util/variant/CMakeLists.txt @@ 
-0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +####################################### +# arrow_util_variant +####################################### + +install(FILES + optional.h + recursive_wrapper.h + variant_cast.h + variant_io.h + variant_visitor.h + DESTINATION include/arrow/util/variant) diff --git a/cpp/src/arrow/util/variant/optional.h b/cpp/src/arrow/util/variant/optional.h new file mode 100644 index 0000000000000..4c6671061fe80 --- /dev/null +++ b/cpp/src/arrow/util/variant/optional.h @@ -0,0 +1,100 @@ +// Copyright (c) MapBox +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, this +// list of conditions and the following disclaimer in the documentation and/or +// other materials provided with the distribution. 
+// - Neither the name "MapBox" nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef ARROW_UTIL_VARIANT_OPTIONAL_H +#define ARROW_UTIL_VARIANT_OPTIONAL_H + +#pragma message("This implementation of optional is deprecated. 
See https://github.com/mapbox/variant/issues/64.") + +#include +#include + +#include + +namespace arrow { +namespace util { + +template +class optional +{ + static_assert(!std::is_reference::value, "optional doesn't support references"); + + struct none_type + { + }; + + variant variant_; + +public: + optional() = default; + + optional(optional const& rhs) + { + if (this != &rhs) + { // protect against invalid self-assignment + variant_ = rhs.variant_; + } + } + + optional(T const& v) { variant_ = v; } + + explicit operator bool() const noexcept { return variant_.template is(); } + + T const& get() const { return variant_.template get(); } + T& get() { return variant_.template get(); } + + T const& operator*() const { return this->get(); } + T operator*() { return this->get(); } + + optional& operator=(T const& v) + { + variant_ = v; + return *this; + } + + optional& operator=(optional const& rhs) + { + if (this != &rhs) + { + variant_ = rhs.variant_; + } + return *this; + } + + template + void emplace(Args&&... args) + { + variant_ = T{std::forward(args)...}; + } + + void reset() { variant_ = none_type{}; } + +}; // class optional + +} // namespace util +} // namespace arrow + +#endif // ARROW_UTIL_VARIANT_OPTIONAL_H diff --git a/cpp/src/arrow/util/variant/recursive_wrapper.h b/cpp/src/arrow/util/variant/recursive_wrapper.h new file mode 100644 index 0000000000000..c9d9385394b38 --- /dev/null +++ b/cpp/src/arrow/util/variant/recursive_wrapper.h @@ -0,0 +1,122 @@ +#ifndef ARROW_UTIL_VARIANT_RECURSIVE_WRAPPER_H +#define ARROW_UTIL_VARIANT_RECURSIVE_WRAPPER_H + +// Based on variant/recursive_wrapper.h from boost. +// +// Original license: +// +// Copyright (c) 2002-2003 +// Eric Friedman, Itay Maman +// +// Distributed under the Boost Software License, Version 1.0. 
(See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#include +#include + +namespace arrow { +namespace util { + +template +class recursive_wrapper +{ + + T* p_; + + void assign(T const& rhs) + { + this->get() = rhs; + } + +public: + using type = T; + + /** + * Default constructor default initializes the internally stored value. + * For POD types this means nothing is done and the storage is + * uninitialized. + * + * @throws std::bad_alloc if there is insufficient memory for an object + * of type T. + * @throws any exception thrown by the default constructur of T. + */ + recursive_wrapper() + : p_(new T){} + + ~recursive_wrapper() noexcept { delete p_; } + + recursive_wrapper(recursive_wrapper const& operand) + : p_(new T(operand.get())) {} + + recursive_wrapper(T const& operand) + : p_(new T(operand)) {} + + recursive_wrapper(recursive_wrapper&& operand) + : p_(new T(std::move(operand.get()))) {} + + recursive_wrapper(T&& operand) + : p_(new T(std::move(operand))) {} + + inline recursive_wrapper& operator=(recursive_wrapper const& rhs) + { + assign(rhs.get()); + return *this; + } + + inline recursive_wrapper& operator=(T const& rhs) + { + assign(rhs); + return *this; + } + + inline void swap(recursive_wrapper& operand) noexcept + { + T* temp = operand.p_; + operand.p_ = p_; + p_ = temp; + } + + recursive_wrapper& operator=(recursive_wrapper&& rhs) noexcept + { + swap(rhs); + return *this; + } + + recursive_wrapper& operator=(T&& rhs) + { + get() = std::move(rhs); + return *this; + } + + T& get() + { + assert(p_); + return *get_pointer(); + } + + T const& get() const + { + assert(p_); + return *get_pointer(); + } + + T* get_pointer() { return p_; } + + const T* get_pointer() const { return p_; } + + operator T const&() const { return this->get(); } + + operator T&() { return this->get(); } + +}; // class recursive_wrapper + +template +inline void swap(recursive_wrapper& lhs, recursive_wrapper& rhs) noexcept +{ + 
lhs.swap(rhs); +} +} // namespace util +} // namespace arrow + +#endif // ARROW_UTIL_VARIANT_RECURSIVE_WRAPPER_H diff --git a/cpp/src/arrow/util/variant/variant_cast.h b/cpp/src/arrow/util/variant/variant_cast.h new file mode 100644 index 0000000000000..558f1d9a60c20 --- /dev/null +++ b/cpp/src/arrow/util/variant/variant_cast.h @@ -0,0 +1,112 @@ +// Copyright (c) MapBox +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, this +// list of conditions and the following disclaimer in the documentation and/or +// other materials provided with the distribution. +// - Neither the name "MapBox" nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#ifndef ARROW_UTIL_VARIANT_CAST_H +#define ARROW_UTIL_VARIANT_CAST_H + +#include + +namespace arrow { +namespace util { + +namespace detail { + +template +class static_caster +{ +public: + template + T& operator()(V& v) const + { + return static_cast(v); + } +}; + +template +class dynamic_caster +{ +public: + using result_type = T&; + template + T& operator()(V& v, typename std::enable_if::value>::type* = nullptr) const + { + throw std::bad_cast(); + } + template + T& operator()(V& v, typename std::enable_if::value>::type* = nullptr) const + { + return dynamic_cast(v); + } +}; + +template +class dynamic_caster +{ +public: + using result_type = T*; + template + T* operator()(V& v, typename std::enable_if::value>::type* = nullptr) const + { + return nullptr; + } + template + T* operator()(V& v, typename std::enable_if::value>::type* = nullptr) const + { + return dynamic_cast(&v); + } +}; +} + +template +typename detail::dynamic_caster::result_type +dynamic_variant_cast(V& v) +{ + return arrow::util::apply_visitor(detail::dynamic_caster(), v); +} + +template +typename detail::dynamic_caster::result_type +dynamic_variant_cast(const V& v) +{ + return arrow::util::apply_visitor(detail::dynamic_caster(), v); +} + +template +T& static_variant_cast(V& v) +{ + return arrow::util::apply_visitor(detail::static_caster(), v); +} + +template +const T& static_variant_cast(const V& v) +{ + return arrow::util::apply_visitor(detail::static_caster(), v); +} + +} // namespace util +} // namespace arrow + +#endif // ARROW_UTIL_VARIANT_CAST_H diff --git a/cpp/src/arrow/util/variant/variant_io.h b/cpp/src/arrow/util/variant/variant_io.h new file mode 100644 index 0000000000000..5541a81f7035f --- /dev/null +++ b/cpp/src/arrow/util/variant/variant_io.h @@ -0,0 +1,72 @@ +// Copyright (c) MapBox +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, this +// list of conditions and the following disclaimer in the documentation and/or +// other materials provided with the distribution. +// - Neither the name "MapBox" nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#ifndef ARROW_UTIL_VARIANT_IO_H +#define ARROW_UTIL_VARIANT_IO_H + +#include + +#include + +namespace arrow { +namespace util { + +namespace detail { +// operator<< helper +template +class printer +{ +public: + explicit printer(Out& out) + : out_(out) {} + printer& operator=(printer const&) = delete; + + // visitor + template + void operator()(T const& operand) const + { + out_ << operand; + } + +private: + Out& out_; +}; +} + +// operator<< +template +VARIANT_INLINE std::basic_ostream& +operator<<(std::basic_ostream& out, variant const& rhs) +{ + detail::printer> visitor(out); + apply_visitor(visitor, rhs); + return out; +} + +} // namespace util +} // namespace arrow + +#endif // ARROW_UTIL_VARIANT_IO_H diff --git a/cpp/src/arrow/util/variant/variant_visitor.h b/cpp/src/arrow/util/variant/variant_visitor.h new file mode 100644 index 0000000000000..66b1dfea3d7c9 --- /dev/null +++ b/cpp/src/arrow/util/variant/variant_visitor.h @@ -0,0 +1,69 @@ +// Copyright (c) MapBox +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, this +// list of conditions and the following disclaimer in the documentation and/or +// other materials provided with the distribution. +// - Neither the name "MapBox" nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef ARROW_UTIL_VARIANT_VISITOR_HPP +#define ARROW_UTIL_VARIANT_VISITOR_HPP + +#include + +namespace arrow { +namespace util { + +template +struct visitor; + +template +struct visitor : Fn +{ + using Fn::operator(); + + template + visitor(T&& fn) : Fn(std::forward(fn)) {} +}; + +template +struct visitor : Fn, visitor +{ + using Fn::operator(); + using visitor::operator(); + + template + visitor(T&& fn, Ts&&... fns) + : Fn(std::forward(fn)) + , visitor(std::forward(fns)...) {} +}; + +template +visitor::type...> make_visitor(Fns&&... 
fns) +{ + return visitor::type...> + (std::forward(fns)...); +} + +} // namespace util +} // namespace arrow + +#endif // ARROW_UTIL_VARIANT_VISITOR_HPP diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc index a7b01b0f6315a..47dba6cd8ddf2 100644 --- a/cpp/src/arrow/visitor.cc +++ b/cpp/src/arrow/visitor.cc @@ -56,7 +56,7 @@ ARRAY_VISITOR_DEFAULT(ListArray); ARRAY_VISITOR_DEFAULT(StructArray); ARRAY_VISITOR_DEFAULT(UnionArray); ARRAY_VISITOR_DEFAULT(DictionaryArray); -ARRAY_VISITOR_DEFAULT(DecimalArray); +ARRAY_VISITOR_DEFAULT(Decimal128Array); #undef ARRAY_VISITOR_DEFAULT @@ -90,7 +90,7 @@ TYPE_VISITOR_DEFAULT(Time32Type); TYPE_VISITOR_DEFAULT(Time64Type); TYPE_VISITOR_DEFAULT(TimestampType); TYPE_VISITOR_DEFAULT(IntervalType); -TYPE_VISITOR_DEFAULT(DecimalType); +TYPE_VISITOR_DEFAULT(Decimal128Type); TYPE_VISITOR_DEFAULT(ListType); TYPE_VISITOR_DEFAULT(StructType); TYPE_VISITOR_DEFAULT(UnionType); diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h index 6c36e465ec436..030ffc85744e4 100644 --- a/cpp/src/arrow/visitor.h +++ b/cpp/src/arrow/visitor.h @@ -50,7 +50,7 @@ class ARROW_EXPORT ArrayVisitor { virtual Status Visit(const Time64Array& array); virtual Status Visit(const TimestampArray& array); virtual Status Visit(const IntervalArray& array); - virtual Status Visit(const DecimalArray& array); + virtual Status Visit(const Decimal128Array& array); virtual Status Visit(const ListArray& array); virtual Status Visit(const StructArray& array); virtual Status Visit(const UnionArray& array); @@ -83,7 +83,7 @@ class ARROW_EXPORT TypeVisitor { virtual Status Visit(const Time64Type& type); virtual Status Visit(const TimestampType& type); virtual Status Visit(const IntervalType& type); - virtual Status Visit(const DecimalType& type); + virtual Status Visit(const Decimal128Type& type); virtual Status Visit(const ListType& type); virtual Status Visit(const StructType& type); virtual Status Visit(const UnionType& type); diff --git 
a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h index 5ecabd2a4fbb1..41b0108aeb116 100644 --- a/cpp/src/arrow/visitor_inline.h +++ b/cpp/src/arrow/visitor_inline.h @@ -55,7 +55,7 @@ inline Status VisitTypeInline(const DataType& type, VISITOR* visitor) { TYPE_VISIT_INLINE(TimestampType); TYPE_VISIT_INLINE(Time32Type); TYPE_VISIT_INLINE(Time64Type); - TYPE_VISIT_INLINE(DecimalType); + TYPE_VISIT_INLINE(Decimal128Type); TYPE_VISIT_INLINE(ListType); TYPE_VISIT_INLINE(StructType); TYPE_VISIT_INLINE(UnionType); @@ -97,7 +97,7 @@ inline Status VisitArrayInline(const Array& array, VISITOR* visitor) { ARRAY_VISIT_INLINE(TimestampType); ARRAY_VISIT_INLINE(Time32Type); ARRAY_VISIT_INLINE(Time64Type); - ARRAY_VISIT_INLINE(DecimalType); + ARRAY_VISIT_INLINE(Decimal128Type); ARRAY_VISIT_INLINE(ListType); ARRAY_VISIT_INLINE(StructType); ARRAY_VISIT_INLINE(UnionType); diff --git a/cpp/src/plasma/client.cc b/cpp/src/plasma/client.cc index e57a2a6f3008c..d74c0f412d97f 100644 --- a/cpp/src/plasma/client.cc +++ b/cpp/src/plasma/client.cc @@ -40,6 +40,7 @@ #include #include +#include "arrow/buffer.h" #include "plasma/common.h" #include "plasma/fling.h" #include "plasma/io.h" @@ -53,6 +54,8 @@ namespace plasma { +using arrow::MutableBuffer; + // Number of threads used for memcopy and hash computations. 
constexpr int64_t kThreadPoolSize = 8; constexpr int64_t kBytesInMB = 1 << 20; @@ -145,7 +148,8 @@ void PlasmaClient::increment_object_count(const ObjectID& object_id, PlasmaObjec } Status PlasmaClient::Create(const ObjectID& object_id, int64_t data_size, - uint8_t* metadata, int64_t metadata_size, uint8_t** data) { + uint8_t* metadata, int64_t metadata_size, + std::shared_ptr* data) { ARROW_LOG(DEBUG) << "called plasma_create on conn " << store_conn_ << " with size " << data_size << " and metadata size " << metadata_size; RETURN_NOT_OK(SendCreateRequest(store_conn_, object_id, data_size, metadata_size)); @@ -162,14 +166,16 @@ Status PlasmaClient::Create(const ObjectID& object_id, int64_t data_size, ARROW_CHECK(object.metadata_size == metadata_size); // The metadata should come right after the data. ARROW_CHECK(object.metadata_offset == object.data_offset + data_size); - *data = lookup_or_mmap(fd, object.handle.store_fd, object.handle.mmap_size) + - object.data_offset; + *data = std::make_shared( + lookup_or_mmap(fd, object.handle.store_fd, object.handle.mmap_size) + + object.data_offset, + data_size); // If plasma_create is being called from a transfer, then we will not copy the // metadata here. The metadata will be written along with the data streamed // from the transfer. if (metadata != NULL) { // Copy the metadata to the buffer. - memcpy(*data + object.data_size, metadata, metadata_size); + memcpy((*data)->mutable_data() + object.data_size, metadata, metadata_size); } // Increment the count of the number of instances of this object that this // client is using. 
A call to PlasmaClient::Release is required to decrement @@ -203,10 +209,12 @@ Status PlasmaClient::Get(const ObjectID* object_ids, int64_t num_objects, ARROW_CHECK(object_entry->second->is_sealed) << "Plasma client called get on an unsealed object that it created"; PlasmaObject* object = &object_entry->second->object; - object_buffers[i].data = lookup_mmapped_file(object->handle.store_fd); - object_buffers[i].data = object_buffers[i].data + object->data_offset; + uint8_t* data = lookup_mmapped_file(object->handle.store_fd); + object_buffers[i].data = + std::make_shared(data + object->data_offset, object->data_size); + object_buffers[i].metadata = std::make_shared( + data + object->data_offset + object->data_size, object->metadata_size); object_buffers[i].data_size = object->data_size; - object_buffers[i].metadata = object_buffers[i].data + object->data_size; object_buffers[i].metadata_size = object->metadata_size; // Increment the count of the number of instances of this object that this // client is using. A call to PlasmaClient::Release is required to @@ -254,13 +262,15 @@ Status PlasmaClient::Get(const ObjectID* object_ids, int64_t num_objects, // The object was retrieved. The user will be responsible for releasing // this object. int fd = recv_fd(store_conn_); - ARROW_CHECK(fd >= 0); - object_buffers[i].data = + uint8_t* data = lookup_or_mmap(fd, object->handle.store_fd, object->handle.mmap_size); + ARROW_CHECK(fd >= 0); // Finish filling out the return values. 
- object_buffers[i].data = object_buffers[i].data + object->data_offset; + object_buffers[i].data = + std::make_shared(data + object->data_offset, object->data_size); + object_buffers[i].metadata = std::make_shared( + data + object->data_offset + object->data_size, object->metadata_size); object_buffers[i].data_size = object->data_size; - object_buffers[i].metadata = object_buffers[i].data + object->data_size; object_buffers[i].metadata_size = object->metadata_size; // Increment the count of the number of instances of this object that this // client is using. A call to PlasmaClient::Release is required to @@ -278,6 +288,39 @@ Status PlasmaClient::Get(const ObjectID* object_ids, int64_t num_objects, return Status::OK(); } +Status PlasmaClient::UnmapObject(const ObjectID& object_id) { + auto object_entry = objects_in_use_.find(object_id); + ARROW_CHECK(object_entry != objects_in_use_.end()); + ARROW_CHECK(object_entry->second->count == 0); + + // Decrement the count of the number of objects in this memory-mapped file + // that the client is using. The corresponding increment should have + // happened in plasma_get. + int fd = object_entry->second->object.handle.store_fd; + auto entry = mmap_table_.find(fd); + ARROW_CHECK(entry != mmap_table_.end()); + ARROW_CHECK(entry->second.count >= 1); + if (entry->second.count == 1) { + // If no other objects are being used, then unmap the file. + int err = munmap(entry->second.pointer, entry->second.length); + if (err == -1) { + return Status::IOError("Error during munmap"); + } + // Remove the corresponding entry from the hash table. + mmap_table_.erase(fd); + } else { + // If there are other objects being used, decrement the reference count. + entry->second.count -= 1; + } + // Update the in_use_object_bytes_. 
+ in_use_object_bytes_ -= (object_entry->second->object.data_size + + object_entry->second->object.metadata_size); + DCHECK_GE(in_use_object_bytes_, 0); + // Remove the entry from the hash table of objects currently in use. + objects_in_use_.erase(object_id); + return Status::OK(); +} + /// This is a helper method for implementing plasma_release. We maintain a /// buffer /// of release calls and only perform them once the buffer becomes full (as @@ -297,28 +340,9 @@ Status PlasmaClient::PerformRelease(const ObjectID& object_id) { ARROW_CHECK(object_entry->second->count >= 0); // Check if the client is no longer using this object. if (object_entry->second->count == 0) { - // Decrement the count of the number of objects in this memory-mapped file - // that the client is using. The corresponding increment should have - // happened in plasma_get. - int fd = object_entry->second->object.handle.store_fd; - auto entry = mmap_table_.find(fd); - ARROW_CHECK(entry != mmap_table_.end()); - entry->second.count -= 1; - ARROW_CHECK(entry->second.count >= 0); - // If none are being used then unmap the file. - if (entry->second.count == 0) { - munmap(entry->second.pointer, entry->second.length); - // Remove the corresponding entry from the hash table. - mmap_table_.erase(fd); - } // Tell the store that the client no longer needs the object. + RETURN_NOT_OK(UnmapObject(object_id)); RETURN_NOT_OK(SendReleaseRequest(store_conn_, object_id)); - // Update the in_use_object_bytes_. - in_use_object_bytes_ -= (object_entry->second->object.data_size + - object_entry->second->object.metadata_size); - DCHECK_GE(in_use_object_bytes_, 0); - // Remove the entry from the hash table of objects currently in use. - objects_in_use_.erase(object_id); } return Status::OK(); } @@ -344,6 +368,20 @@ Status PlasmaClient::Release(const ObjectID& object_id) { return Status::OK(); } +Status PlasmaClient::FlushReleaseHistory() { + // If the client is already disconnected, ignore the flush. 
+ if (store_conn_ < 0) { + return Status::OK(); + } + while (release_history_.size() > 0) { + // Perform a release for the object ID for the first pending release. + RETURN_NOT_OK(PerformRelease(release_history_.back())); + // Remove the last entry from the release history. + release_history_.pop_back(); + } + return Status::OK(); +} + // This method is used to query whether the plasma store contains an object. Status PlasmaClient::Contains(const ObjectID& object_id, bool* has_object) { // Check if we already have a reference to the object. @@ -410,14 +448,16 @@ static uint64_t compute_object_hash(const ObjectBuffer& obj_buffer) { XXH64_state_t hash_state; XXH64_reset(&hash_state, XXH64_DEFAULT_SEED); if (obj_buffer.data_size >= kBytesInMB) { - compute_object_hash_parallel(&hash_state, - reinterpret_cast(obj_buffer.data), - obj_buffer.data_size); + compute_object_hash_parallel( + &hash_state, reinterpret_cast(obj_buffer.data->data()), + obj_buffer.data_size); } else { - XXH64_update(&hash_state, reinterpret_cast(obj_buffer.data), + XXH64_update(&hash_state, + reinterpret_cast(obj_buffer.data->data()), obj_buffer.data_size); } - XXH64_update(&hash_state, reinterpret_cast(obj_buffer.metadata), + XXH64_update(&hash_state, + reinterpret_cast(obj_buffer.metadata->data()), obj_buffer.metadata_size); return XXH64_digest(&hash_state); } @@ -443,10 +483,50 @@ Status PlasmaClient::Seal(const ObjectID& object_id) { return Release(object_id); } +Status PlasmaClient::Abort(const ObjectID& object_id) { + auto object_entry = objects_in_use_.find(object_id); + ARROW_CHECK(object_entry != objects_in_use_.end()) + << "Plasma client called abort on an object without a reference to it"; + ARROW_CHECK(!object_entry->second->is_sealed) + << "Plasma client called abort on a sealed object"; + + // Flush the release history. + RETURN_NOT_OK(FlushReleaseHistory()); + // Make sure that the Plasma client only has one reference to the object. 
If + // it has more, then the client needs to release the buffer before calling + // abort. + if (object_entry->second->count > 1) { + return Status::Invalid("Plasma client cannot have a reference to the buffer."); + } + + // Send the abort request. + RETURN_NOT_OK(SendAbortRequest(store_conn_, object_id)); + // Decrease the reference count to zero, then remove the object. + object_entry->second->count--; + RETURN_NOT_OK(UnmapObject(object_id)); + + std::vector buffer; + ObjectID id; + int64_t type; + RETURN_NOT_OK(ReadMessage(store_conn_, &type, &buffer)); + return ReadAbortReply(buffer.data(), buffer.size(), &id); +} + Status PlasmaClient::Delete(const ObjectID& object_id) { - // TODO(rkn): In the future, we can use this method to give hints to the - // eviction policy about when an object will no longer be needed. - return Status::NotImplemented("PlasmaClient::Delete is not implemented."); + RETURN_NOT_OK(FlushReleaseHistory()); + // If the object is in use, the client can't send the remove message. 
+ if (objects_in_use_.count(object_id) > 0) { + return Status::UnknownError("PlasmaClient::Object is in use."); + } else { + // If we don't already have a reference to the object, we can try to remove the object + RETURN_NOT_OK(SendDeleteRequest(store_conn_, object_id)); + std::vector buffer; + RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType_PlasmaDeleteReply, &buffer)); + ObjectID object_id2; + DCHECK_GT(buffer.size(), 0); + RETURN_NOT_OK(ReadDeleteReply(buffer.data(), buffer.size(), &object_id2)); + return Status::OK(); + } } Status PlasmaClient::Evict(int64_t num_bytes, int64_t& num_bytes_evicted) { @@ -560,7 +640,7 @@ Status PlasmaClient::Fetch(int num_object_ids, const ObjectID* object_ids) { return SendFetchRequest(manager_conn_, object_ids, num_object_ids); } -int PlasmaClient::get_manager_fd() { return manager_conn_; } +int PlasmaClient::get_manager_fd() const { return manager_conn_; } Status PlasmaClient::Info(const ObjectID& object_id, int* object_status) { ARROW_CHECK(manager_conn_ >= 0); diff --git a/cpp/src/plasma/client.h b/cpp/src/plasma/client.h index 145942441c9f1..35182f8403201 100644 --- a/cpp/src/plasma/client.h +++ b/cpp/src/plasma/client.h @@ -26,11 +26,13 @@ #include #include +#include "arrow/buffer.h" #include "arrow/status.h" #include "arrow/util/visibility.h" #include "plasma/common.h" using arrow::Status; +using arrow::Buffer; namespace plasma { @@ -41,14 +43,16 @@ constexpr int64_t kL3CacheSizeBytes = 100000000; /// Object buffer data structure. struct ObjectBuffer { + /// The data buffer. + std::shared_ptr data; /// The size in bytes of the data object. int64_t data_size; - /// The address of the data object. - uint8_t* data; + /// The metadata buffer. + std::shared_ptr metadata; /// The metadata size in bytes. int64_t metadata_size; - /// The address of the metadata. - uint8_t* metadata; + /// The device number. + int device_num; }; /// Configuration options for the plasma client. 
@@ -107,11 +111,11 @@ class ARROW_EXPORT PlasmaClient { /// should be NULL. /// \param metadata_size The size in bytes of the metadata. If there is no /// metadata, this should be 0. - /// \param data The address of the newly created object will be written here. + /// \param data A buffer containing the address of the newly created object + /// will be written here. /// \return The return status. Status Create(const ObjectID& object_id, int64_t data_size, uint8_t* metadata, - int64_t metadata_size, uint8_t** data); - + int64_t metadata_size, std::shared_ptr* data); /// Get some objects from the Plasma Store. This function will block until the /// objects have all been created and sealed in the Plasma Store or the /// timeout @@ -152,6 +156,15 @@ class ARROW_EXPORT PlasmaClient { /// \return The return status. Status Contains(const ObjectID& object_id, bool* has_object); + /// Abort an unsealed object in the object store. If the abort succeeds, then + /// it will be as if the object was never created at all. The unsealed object + /// must have only a single reference (the one that would have been removed by + /// calling Seal). + /// + /// \param object_id The ID of the object to abort. + /// \return The return status. + Status Abort(const ObjectID& object_id); + /// Seal an object in the object store. The object will be immutable after /// this /// call. @@ -161,7 +174,8 @@ class ARROW_EXPORT PlasmaClient { Status Seal(const ObjectID& object_id); /// Delete an object from the object store. This currently assumes that the - /// object is present and has been sealed. + /// object is present, has been sealed, and is not in use by another client. Otherwise, + /// it is a no-op. /// /// @todo We may want to allow the deletion of objects that are not present or /// haven't been sealed. @@ -304,9 +318,19 @@ class ARROW_EXPORT PlasmaClient { /// /// \return The file descriptor for the manager connection. If there is no /// connection to the manager, this is -1. 
- int get_manager_fd(); + int get_manager_fd() const; private: + /// This is a helper method for unmapping objects for which all references have + /// gone out of scope, either by calling Release or Abort. + /// + /// @param object_id The object ID whose data we should unmap. + Status UnmapObject(const ObjectID& object_id); + + /// This is a helper method that flushes all pending release calls to the + /// store. + Status FlushReleaseHistory(); + Status PerformRelease(const ObjectID& object_id); uint8_t* lookup_or_mmap(int fd, int store_fd_val, int64_t map_size); diff --git a/cpp/src/plasma/eviction_policy.cc b/cpp/src/plasma/eviction_policy.cc index 6c2309f1709d2..66a3b2ea298ac 100644 --- a/cpp/src/plasma/eviction_policy.cc +++ b/cpp/src/plasma/eviction_policy.cc @@ -61,38 +61,32 @@ int64_t EvictionPolicy::choose_objects_to_evict(int64_t num_bytes_required, } /* Update the number of bytes used. */ memory_used_ -= bytes_evicted; + ARROW_CHECK(memory_used_ >= 0); return bytes_evicted; } void EvictionPolicy::object_created(const ObjectID& object_id) { auto entry = store_info_->objects[object_id].get(); cache_.add(object_id, entry->info.data_size + entry->info.metadata_size); + int64_t size = entry->info.data_size + entry->info.metadata_size; + memory_used_ += size; + ARROW_CHECK(memory_used_ <= store_info_->memory_capacity); } bool EvictionPolicy::require_space(int64_t size, std::vector* objects_to_evict) { /* Check if there is enough space to create the object. */ int64_t required_space = memory_used_ + size - store_info_->memory_capacity; - int64_t num_bytes_evicted; - if (required_space > 0) { - /* Try to free up at least as much space as we need right now but ideally - * up to 20% of the total capacity. */ - int64_t space_to_free = std::max(size, store_info_->memory_capacity / 5); - ARROW_LOG(DEBUG) << "not enough space to create this object, so evicting objects"; - /* Choose some objects to evict, and update the return pointers. 
*/ - num_bytes_evicted = choose_objects_to_evict(space_to_free, objects_to_evict); - ARROW_LOG(INFO) << "There is not enough space to create this object, so evicting " - << objects_to_evict->size() << " objects to free up " - << num_bytes_evicted << " bytes."; - } else { - num_bytes_evicted = 0; - } - if (num_bytes_evicted >= required_space) { - /* We only increment the space used if there is enough space to create the - * object. */ - memory_used_ += size; - } - return num_bytes_evicted >= required_space; + /* Try to free up at least as much space as we need right now but ideally + * up to 20% of the total capacity. */ + int64_t space_to_free = std::max(required_space, store_info_->memory_capacity / 5); + ARROW_LOG(DEBUG) << "not enough space to create this object, so evicting objects"; + /* Choose some objects to evict, and update the return pointers. */ + int64_t num_bytes_evicted = choose_objects_to_evict(space_to_free, objects_to_evict); + ARROW_LOG(INFO) << "There is not enough space to create this object, so evicting " + << objects_to_evict->size() << " objects to free up " + << num_bytes_evicted << " bytes."; + return num_bytes_evicted >= required_space && num_bytes_evicted > 0; } void EvictionPolicy::begin_object_access(const ObjectID& object_id, @@ -108,4 +102,14 @@ void EvictionPolicy::end_object_access(const ObjectID& object_id, cache_.add(object_id, entry->info.data_size + entry->info.metadata_size); } +void EvictionPolicy::remove_object(const ObjectID& object_id) { + /* If the object is in the LRU cache, remove it. 
*/ + cache_.remove(object_id); + + auto entry = store_info_->objects[object_id].get(); + int64_t size = entry->info.data_size + entry->info.metadata_size; + ARROW_CHECK(memory_used_ >= size); + memory_used_ -= size; +} + } // namespace plasma diff --git a/cpp/src/plasma/eviction_policy.h b/cpp/src/plasma/eviction_policy.h index de33dabcbafb3..b076309552952 100644 --- a/cpp/src/plasma/eviction_policy.h +++ b/cpp/src/plasma/eviction_policy.h @@ -73,8 +73,7 @@ class EvictionPolicy { void object_created(const ObjectID& object_id); /// This method will be called when the Plasma store needs more space, perhaps - /// to create a new object. If the required amount of space cannot be freed up, - /// then a fatal error will be thrown. When this method is called, the eviction + /// to create a new object. When this method is called, the eviction /// policy will assume that the objects chosen to be evicted will in fact be /// evicted from the Plasma store by the caller. /// @@ -121,6 +120,11 @@ class EvictionPolicy { int64_t choose_objects_to_evict(int64_t num_bytes_required, std::vector* objects_to_evict); + /// This method will be called when an object is going to be removed. + /// + /// @param object_id The ID of the object that is being removed. + void remove_object(const ObjectID& object_id); + private: /// The amount of memory (in bytes) currently being used. int64_t memory_used_; diff --git a/cpp/src/plasma/format/plasma.fbs b/cpp/src/plasma/format/plasma.fbs index 23782ade539d4..ea6dc8bb98da5 100644 --- a/cpp/src/plasma/format/plasma.fbs +++ b/cpp/src/plasma/format/plasma.fbs @@ -21,6 +21,8 @@ enum MessageType:int { // Create a new object. PlasmaCreateRequest = 1, PlasmaCreateReply, + PlasmaAbortRequest, + PlasmaAbortReply, // Seal an object. PlasmaSealRequest, PlasmaSealReply, @@ -74,7 +76,11 @@ enum PlasmaError:int { // Trying to access an object that doesn't exist. ObjectNonexistent, // Trying to create an object but there isn't enough space in the store. 
- OutOfMemory + OutOfMemory, + // Trying to delete an object but it's not sealed. + ObjectNotSealed, + // Trying to delete an object but it's in use. + ObjectInUse } // Plasma store messages @@ -113,6 +119,16 @@ table PlasmaCreateReply { error: PlasmaError; } +table PlasmaAbortRequest { + // ID of the object to be aborted. + object_id: string; +} + +table PlasmaAbortReply { + // ID of the object that was aborted. + object_id: string; +} + table PlasmaSealRequest { // ID of the object to be sealed. object_id: string; diff --git a/cpp/src/plasma/io.cc b/cpp/src/plasma/io.cc index 2228ad94c6346..2cba8970e26ae 100644 --- a/cpp/src/plasma/io.cc +++ b/cpp/src/plasma/io.cc @@ -156,19 +156,14 @@ Status ConnectIpcSocketRetry(const std::string& pathname, int num_retries, if (timeout < 0) { timeout = CONNECT_TIMEOUT_MS; } - - *fd = -1; - for (int num_attempts = 0; num_attempts < num_retries; ++num_attempts) { - *fd = connect_ipc_sock(pathname); - if (*fd >= 0) { - break; - } - if (num_attempts == 0) { - ARROW_LOG(ERROR) << "Connection to IPC socket failed for pathname " << pathname - << ", retrying " << num_retries << " times"; - } + *fd = connect_ipc_sock(pathname); + while (*fd < 0 && num_retries > 0) { + ARROW_LOG(ERROR) << "Connection to IPC socket failed for pathname " << pathname + << ", retrying " << num_retries << " more times"; /* Sleep for timeout milliseconds. */ usleep(static_cast(timeout * 1000)); + *fd = connect_ipc_sock(pathname); + --num_retries; } /* If we could not connect to the socket, exit. 
*/ if (*fd == -1) { diff --git a/cpp/src/plasma/protocol.cc b/cpp/src/plasma/protocol.cc index 2261b6a624a8c..c0ebb88fe5019 100644 --- a/cpp/src/plasma/protocol.cc +++ b/cpp/src/plasma/protocol.cc @@ -100,6 +100,34 @@ Status ReadCreateReply(uint8_t* data, size_t size, ObjectID* object_id, return plasma_error_status(message->error()); } +Status SendAbortRequest(int sock, ObjectID object_id) { + flatbuffers::FlatBufferBuilder fbb; + auto message = CreatePlasmaAbortRequest(fbb, fbb.CreateString(object_id.binary())); + return PlasmaSend(sock, MessageType_PlasmaAbortRequest, &fbb, message); +} + +Status ReadAbortRequest(uint8_t* data, size_t size, ObjectID* object_id) { + DCHECK(data); + auto message = flatbuffers::GetRoot(data); + DCHECK(verify_flatbuffer(message, data, size)); + *object_id = ObjectID::from_binary(message->object_id()->str()); + return Status::OK(); +} + +Status SendAbortReply(int sock, ObjectID object_id) { + flatbuffers::FlatBufferBuilder fbb; + auto message = CreatePlasmaAbortReply(fbb, fbb.CreateString(object_id.binary())); + return PlasmaSend(sock, MessageType_PlasmaAbortReply, &fbb, message); +} + +Status ReadAbortReply(uint8_t* data, size_t size, ObjectID* object_id) { + DCHECK(data); + auto message = flatbuffers::GetRoot(data); + DCHECK(verify_flatbuffer(message, data, size)); + *object_id = ObjectID::from_binary(message->object_id()->str()); + return Status::OK(); +} + // Seal messages. 
Status SendSealRequest(int sock, ObjectID object_id, unsigned char* digest) { diff --git a/cpp/src/plasma/protocol.h b/cpp/src/plasma/protocol.h index af4b13978c697..e8c334f9181fc 100644 --- a/cpp/src/plasma/protocol.h +++ b/cpp/src/plasma/protocol.h @@ -51,6 +51,14 @@ Status SendCreateReply(int sock, ObjectID object_id, PlasmaObject* object, int e Status ReadCreateReply(uint8_t* data, size_t size, ObjectID* object_id, PlasmaObject* object); +Status SendAbortRequest(int sock, ObjectID object_id); + +Status ReadAbortRequest(uint8_t* data, size_t size, ObjectID* object_id); + +Status SendAbortReply(int sock, ObjectID object_id); + +Status ReadAbortReply(uint8_t* data, size_t size, ObjectID* object_id); + /* Plasma Seal message functions. */ Status SendSealRequest(int sock, ObjectID object_id, unsigned char* digest); diff --git a/cpp/src/plasma/store.cc b/cpp/src/plasma/store.cc index 210cce16238f8..dde7f9cdfa8eb 100644 --- a/cpp/src/plasma/store.cc +++ b/cpp/src/plasma/store.cc @@ -393,6 +393,57 @@ void PlasmaStore::seal_object(const ObjectID& object_id, unsigned char digest[]) update_object_get_requests(object_id); } +int PlasmaStore::abort_object(const ObjectID& object_id, Client* client) { + auto entry = get_object_table_entry(&store_info_, object_id); + ARROW_CHECK(entry != NULL) << "To abort an object it must be in the object table."; + ARROW_CHECK(entry->state != PLASMA_SEALED) + << "To abort an object it must not have been sealed."; + auto it = entry->clients.find(client); + if (it == entry->clients.end()) { + // If the client requesting the abort is not the creator, do not + // perform the abort. + return 0; + } else { + // The client requesting the abort is the creator. Free the object. 
+ dlfree(entry->pointer); + store_info_.objects.erase(object_id); + return 1; + } +} + +int PlasmaStore::delete_object(ObjectID& object_id) { + auto entry = get_object_table_entry(&store_info_, object_id); + // TODO(rkn): This should probably not fail, but should instead throw an + // error. Maybe we should also support deleting objects that have been + // created but not sealed. + if (entry == NULL) { + // To delete an object it must be in the object table. + return PlasmaError_ObjectNonexistent; + } + + if (entry->state != PLASMA_SEALED) { + // To delete an object it must have been sealed. + return PlasmaError_ObjectNotSealed; + } + + if (entry->clients.size() != 0) { + // To delete an object, there must be no clients currently using it. + return PlasmaError_ObjectInUse; + } + + eviction_policy_.remove_object(object_id); + + dlfree(entry->pointer); + store_info_.objects.erase(object_id); + // Inform all subscribers that the object has been deleted. + ObjectInfoT notification; + notification.object_id = object_id.binary(); + notification.is_deletion = true; + push_notification(&notification); + + return PlasmaError_OK; +} + void PlasmaStore::delete_objects(const std::vector<ObjectID>& object_ids) { for (const auto& object_id : object_ids) { ARROW_LOG(DEBUG) << "deleting object " << object_id.hex(); @@ -442,8 +493,21 @@ void PlasmaStore::disconnect_client(int client_fd) { ARROW_LOG(INFO) << "Disconnecting client on fd " << client_fd; // If this client was using any objects, remove it from the appropriate // lists. + // TODO(swang): Avoid iteration through the object table. + auto client = it->second.get(); + std::vector<ObjectID> unsealed_objects; for (const auto& entry : store_info_.objects) { - remove_client_from_object_clients(entry.second.get(), it->second.get()); + if (entry.second->state == PLASMA_SEALED) { + remove_client_from_object_clients(entry.second.get(), client); + } else { + // Add unsealed objects to a temporary list of object IDs. 
Do not perform + // the abort here, since it potentially modifies the object table. + unsealed_objects.push_back(entry.first); + } + } + // If the client was creating any objects, abort them. + for (const auto& entry : unsealed_objects) { + abort_object(entry, client); } // Note, the store may still attempt to send a message to the disconnected @@ -582,24 +646,36 @@ Status PlasmaStore::process_message(Client* client) { warn_if_sigpipe(send_fd(client->fd, object.handle.store_fd), client->fd); } } break; + case MessageType_PlasmaAbortRequest: { + RETURN_NOT_OK(ReadAbortRequest(input, input_size, &object_id)); + ARROW_CHECK(abort_object(object_id, client) == 1) << "To abort an object, the only " + "client currently using it " + "must be the creator."; + HANDLE_SIGPIPE(SendAbortReply(client->fd, object_id), client->fd); + } break; case MessageType_PlasmaGetRequest: { std::vector object_ids_to_get; int64_t timeout_ms; RETURN_NOT_OK(ReadGetRequest(input, input_size, object_ids_to_get, &timeout_ms)); process_get_request(client, object_ids_to_get, timeout_ms); } break; - case MessageType_PlasmaReleaseRequest: + case MessageType_PlasmaReleaseRequest: { RETURN_NOT_OK(ReadReleaseRequest(input, input_size, &object_id)); release_object(object_id, client); - break; - case MessageType_PlasmaContainsRequest: + } break; + case MessageType_PlasmaDeleteRequest: { + RETURN_NOT_OK(ReadDeleteRequest(input, input_size, &object_id)); + int error_code = delete_object(object_id); + HANDLE_SIGPIPE(SendDeleteReply(client->fd, object_id, error_code), client->fd); + } break; + case MessageType_PlasmaContainsRequest: { RETURN_NOT_OK(ReadContainsRequest(input, input_size, &object_id)); if (contains_object(object_id) == OBJECT_FOUND) { HANDLE_SIGPIPE(SendContainsReply(client->fd, object_id, 1), client->fd); } else { HANDLE_SIGPIPE(SendContainsReply(client->fd, object_id, 0), client->fd); } - break; + } break; case MessageType_PlasmaSealRequest: { unsigned char digest[kDigestSize]; 
RETURN_NOT_OK(ReadSealRequest(input, input_size, &object_id, &digest[0])); @@ -638,12 +714,22 @@ class PlasmaStoreRunner { PlasmaStoreRunner() {} void Start(char* socket_name, int64_t system_memory, std::string directory, - bool hugepages_enabled) { + bool hugepages_enabled, bool use_one_memory_mapped_file) { // Create the event loop. loop_.reset(new EventLoop); store_.reset( new PlasmaStore(loop_.get(), system_memory, directory, hugepages_enabled)); plasma_config = store_->get_plasma_store_info(); + + // If the store is configured to use a single memory-mapped file, then we + // achieve that by allocating and immediately freeing a single block of the + // maximum allowed size up front. + if (use_one_memory_mapped_file) { + void* pointer = plasma::dlmemalign(BLOCK_SIZE, system_memory); + ARROW_CHECK(pointer != NULL); + plasma::dlfree(pointer); + } + int socket = bind_ipc_sock(socket_name, true); // TODO(pcm): Check return value. ARROW_CHECK(socket >= 0); @@ -678,14 +764,15 @@ void HandleSignal(int signal) { } void start_server(char* socket_name, int64_t system_memory, std::string plasma_directory, - bool hugepages_enabled) { + bool hugepages_enabled, bool use_one_memory_mapped_file) { // Ignore SIGPIPE signals. If we don't do this, then when we attempt to write // to a client that has already died, the store could die. signal(SIGPIPE, SIG_IGN); g_runner.reset(new PlasmaStoreRunner()); signal(SIGTERM, HandleSignal); - g_runner->Start(socket_name, system_memory, plasma_directory, hugepages_enabled); + g_runner->Start(socket_name, system_memory, plasma_directory, hugepages_enabled, + use_one_memory_mapped_file); } } // namespace plasma @@ -695,9 +782,11 @@ int main(int argc, char* argv[]) { // Directory where plasma memory mapped files are stored. std::string plasma_directory; bool hugepages_enabled = false; + // True if a single large memory-mapped file should be created at startup. 
+ bool use_one_memory_mapped_file = false; int64_t system_memory = -1; int c; - while ((c = getopt(argc, argv, "s:m:d:h")) != -1) { + while ((c = getopt(argc, argv, "s:m:d:hf")) != -1) { switch (c) { case 'd': plasma_directory = std::string(optarg); @@ -717,6 +806,9 @@ int main(int argc, char* argv[]) { << "GB of memory."; break; } + case 'f': + use_one_memory_mapped_file = true; + break; default: exit(-1); } @@ -770,5 +862,6 @@ int main(int argc, char* argv[]) { // available. plasma::dlmalloc_set_footprint_limit((size_t)system_memory); ARROW_LOG(DEBUG) << "starting server listening on " << socket_name; - plasma::start_server(socket_name, system_memory, plasma_directory, hugepages_enabled); + plasma::start_server(socket_name, system_memory, plasma_directory, hugepages_enabled, + use_one_memory_mapped_file); } diff --git a/cpp/src/plasma/store.h b/cpp/src/plasma/store.h index d03d11f4ef0c4..7eada5a126991 100644 --- a/cpp/src/plasma/store.h +++ b/cpp/src/plasma/store.h @@ -48,6 +48,7 @@ struct Client { class PlasmaStore { public: + // TODO: PascalCase PlasmaStore methods. PlasmaStore(EventLoop* loop, int64_t system_memory, std::string directory, bool hugetlbfs_enabled); @@ -73,6 +74,24 @@ class PlasmaStore { int create_object(const ObjectID& object_id, int64_t data_size, int64_t metadata_size, Client* client, PlasmaObject* result); + /// Abort a created but unsealed object. If the client is not the + /// creator, then the abort will fail. + /// + /// @param object_id Object ID of the object to be aborted. + /// @param client The client who created the object. If this does not + /// match the creator of the object, then the abort will fail. + /// @return 1 if the abort succeeds, else 0. + int abort_object(const ObjectID& object_id, Client* client); + + /// Delete a specific object by object_id that has been created in the hash table. + /// + /// @param object_id Object ID of the object to be deleted. 
+ /// @return One of the following error codes: + /// - PlasmaError_OK, if the object was deleted successfully. + /// - PlasmaError_ObjectNonexistent, if the object does not exist. + /// - PlasmaError_ObjectInUse, if the object is in use. + int delete_object(ObjectID& object_id); + /// Delete objects that have been created in the hash table. This should only /// be called on objects that are returned by the eviction policy to evict. /// diff --git a/cpp/src/plasma/test/client_tests.cc b/cpp/src/plasma/test/client_tests.cc index 0f19da5f72342..f19c2bfbdb380 100644 --- a/cpp/src/plasma/test/client_tests.cc +++ b/cpp/src/plasma/test/client_tests.cc @@ -45,16 +45,44 @@ class TestPlasmaStore : public ::testing::Test { "/plasma_store -m 1000000000 -s /tmp/store 1> /dev/null 2> /dev/null &"; system(plasma_command.c_str()); ARROW_CHECK_OK(client_.Connect("/tmp/store", "", PLASMA_DEFAULT_RELEASE_DELAY)); + ARROW_CHECK_OK(client2_.Connect("/tmp/store", "", PLASMA_DEFAULT_RELEASE_DELAY)); } virtual void Finish() { ARROW_CHECK_OK(client_.Disconnect()); + ARROW_CHECK_OK(client2_.Disconnect()); system("killall plasma_store &"); } protected: PlasmaClient client_; + PlasmaClient client2_; }; +TEST_F(TestPlasmaStore, DeleteTest) { + ObjectID object_id = ObjectID::from_random(); + + // Test for deleting a non-existent object. + Status result = client_.Delete(object_id); + ASSERT_EQ(result.IsPlasmaObjectNonexistent(), true); + + // Test for the object being in local Plasma store. + // First create object. + int64_t data_size = 100; + uint8_t metadata[] = {5}; + int64_t metadata_size = sizeof(metadata); + std::shared_ptr data; + ARROW_CHECK_OK(client_.Create(object_id, data_size, metadata, metadata_size, &data)); + ARROW_CHECK_OK(client_.Seal(object_id)); + + // Object is in use, can't be deleted. + result = client_.Delete(object_id); + ASSERT_EQ(result.IsUnknownError(), true); + + // Avoid race condition of Plasma Manager waiting for notification. 
+ ARROW_CHECK_OK(client_.Release(object_id)); + ARROW_CHECK_OK(client_.Delete(object_id)); +} + TEST_F(TestPlasmaStore, ContainsTest) { ObjectID object_id = ObjectID::from_random(); @@ -68,7 +96,7 @@ TEST_F(TestPlasmaStore, ContainsTest) { int64_t data_size = 100; uint8_t metadata[] = {5}; int64_t metadata_size = sizeof(metadata); - uint8_t* data; + std::shared_ptr data; ARROW_CHECK_OK(client_.Create(object_id, data_size, metadata, metadata_size, &data)); ARROW_CHECK_OK(client_.Seal(object_id)); // Avoid race condition of Plasma Manager waiting for notification. @@ -91,16 +119,20 @@ TEST_F(TestPlasmaStore, GetTest) { int64_t data_size = 4; uint8_t metadata[] = {5}; int64_t metadata_size = sizeof(metadata); + std::shared_ptr data_buffer; uint8_t* data; - ARROW_CHECK_OK(client_.Create(object_id, data_size, metadata, metadata_size, &data)); + ARROW_CHECK_OK( + client_.Create(object_id, data_size, metadata, metadata_size, &data_buffer)); + data = data_buffer->mutable_data(); for (int64_t i = 0; i < data_size; i++) { data[i] = static_cast(i % 4); } ARROW_CHECK_OK(client_.Seal(object_id)); ARROW_CHECK_OK(client_.Get(&object_id, 1, -1, &object_buffer)); + const uint8_t* object_data = object_buffer.data->data(); for (int64_t i = 0; i < data_size; i++) { - ASSERT_EQ(data[i], object_buffer.data[i]); + ASSERT_EQ(data[i], object_data[i]); } } @@ -113,18 +145,154 @@ TEST_F(TestPlasmaStore, MultipleGetTest) { int64_t data_size = 4; uint8_t metadata[] = {5}; int64_t metadata_size = sizeof(metadata); - uint8_t* data; + std::shared_ptr data; ARROW_CHECK_OK(client_.Create(object_id1, data_size, metadata, metadata_size, &data)); - data[0] = 1; + data->mutable_data()[0] = 1; ARROW_CHECK_OK(client_.Seal(object_id1)); ARROW_CHECK_OK(client_.Create(object_id2, data_size, metadata, metadata_size, &data)); - data[0] = 2; + data->mutable_data()[0] = 2; ARROW_CHECK_OK(client_.Seal(object_id2)); ARROW_CHECK_OK(client_.Get(object_ids, 2, -1, object_buffer)); - 
ASSERT_EQ(object_buffer[0].data[0], 1); - ASSERT_EQ(object_buffer[1].data[0], 2); + ASSERT_EQ(object_buffer[0].data->data()[0], 1); + ASSERT_EQ(object_buffer[1].data->data()[0], 2); +} + +TEST_F(TestPlasmaStore, AbortTest) { + ObjectID object_id = ObjectID::from_random(); + ObjectBuffer object_buffer; + + // Test for object non-existence. + ARROW_CHECK_OK(client_.Get(&object_id, 1, 0, &object_buffer)); + ASSERT_EQ(object_buffer.data_size, -1); + + // Test object abort. + // First create object. + int64_t data_size = 4; + uint8_t metadata[] = {5}; + int64_t metadata_size = sizeof(metadata); + std::shared_ptr data; + uint8_t* data_ptr; + ARROW_CHECK_OK(client_.Create(object_id, data_size, metadata, metadata_size, &data)); + data_ptr = data->mutable_data(); + // Write some data. + for (int64_t i = 0; i < data_size / 2; i++) { + data_ptr[i] = static_cast(i % 4); + } + // Attempt to abort. Test that this fails before the first release. + Status status = client_.Abort(object_id); + ASSERT_TRUE(status.IsInvalid()); + // Release, then abort. + ARROW_CHECK_OK(client_.Release(object_id)); + ARROW_CHECK_OK(client_.Abort(object_id)); + + // Test for object non-existence after the abort. + ARROW_CHECK_OK(client_.Get(&object_id, 1, 0, &object_buffer)); + ASSERT_EQ(object_buffer.data_size, -1); + + // Create the object successfully this time. + ARROW_CHECK_OK(client_.Create(object_id, data_size, metadata, metadata_size, &data)); + data_ptr = data->mutable_data(); + for (int64_t i = 0; i < data_size; i++) { + data_ptr[i] = static_cast(i % 4); + } + ARROW_CHECK_OK(client_.Seal(object_id)); + + // Test that we can get the object. 
+ ARROW_CHECK_OK(client_.Get(&object_id, 1, -1, &object_buffer)); + const uint8_t* buffer_ptr = object_buffer.data->data(); + for (int64_t i = 0; i < data_size; i++) { + ASSERT_EQ(data_ptr[i], buffer_ptr[i]); + } +} + +TEST_F(TestPlasmaStore, MultipleClientTest) { + ObjectID object_id = ObjectID::from_random(); + + // Test for object non-existence on the first client. + bool has_object; + ARROW_CHECK_OK(client_.Contains(object_id, &has_object)); + ASSERT_EQ(has_object, false); + + // Test for the object being in local Plasma store. + // First create and seal object on the second client. + int64_t data_size = 100; + uint8_t metadata[] = {5}; + int64_t metadata_size = sizeof(metadata); + std::shared_ptr data; + ARROW_CHECK_OK(client2_.Create(object_id, data_size, metadata, metadata_size, &data)); + ARROW_CHECK_OK(client2_.Seal(object_id)); + // Test that the first client can get the object. + ObjectBuffer object_buffer; + ARROW_CHECK_OK(client_.Get(&object_id, 1, -1, &object_buffer)); + ARROW_CHECK_OK(client_.Contains(object_id, &has_object)); + ASSERT_EQ(has_object, true); + + // Test that one client disconnecting does not interfere with the other. + // First create object on the second client. + object_id = ObjectID::from_random(); + ARROW_CHECK_OK(client2_.Create(object_id, data_size, metadata, metadata_size, &data)); + // Disconnect the first client. + ARROW_CHECK_OK(client_.Disconnect()); + // Test that the second client can seal and get the created object. + ARROW_CHECK_OK(client2_.Seal(object_id)); + ARROW_CHECK_OK(client2_.Get(&object_id, 1, -1, &object_buffer)); + ARROW_CHECK_OK(client2_.Contains(object_id, &has_object)); + ASSERT_EQ(has_object, true); +} + +TEST_F(TestPlasmaStore, ManyObjectTest) { + // Create many objects on the first client. Seal one third, abort one third, + // and leave the last third unsealed. 
+ std::vector object_ids; + for (int i = 0; i < 100; i++) { + ObjectID object_id = ObjectID::from_random(); + object_ids.push_back(object_id); + + // Test for object non-existence on the first client. + bool has_object; + ARROW_CHECK_OK(client_.Contains(object_id, &has_object)); + ASSERT_EQ(has_object, false); + + // Test for the object being in local Plasma store. + // First create and seal object on the first client. + int64_t data_size = 100; + uint8_t metadata[] = {5}; + int64_t metadata_size = sizeof(metadata); + std::shared_ptr data; + ARROW_CHECK_OK(client_.Create(object_id, data_size, metadata, metadata_size, &data)); + + if (i % 3 == 0) { + // Seal one third of the objects. + ARROW_CHECK_OK(client_.Seal(object_id)); + // Test that the first client can get the object. + ARROW_CHECK_OK(client_.Contains(object_id, &has_object)); + ASSERT_EQ(has_object, true); + } else if (i % 3 == 1) { + // Abort one third of the objects. + ARROW_CHECK_OK(client_.Release(object_id)); + ARROW_CHECK_OK(client_.Abort(object_id)); + } + } + // Disconnect the first client. All unsealed objects should be aborted. + ARROW_CHECK_OK(client_.Disconnect()); + + // Check that the second client can query the object store for the first + // client's objects. + int i = 0; + for (auto const& object_id : object_ids) { + bool has_object; + ARROW_CHECK_OK(client2_.Contains(object_id, &has_object)); + if (i % 3 == 0) { + // The first third should be sealed. + ASSERT_EQ(has_object, true); + } else { + // The rest were aborted, so the object is not in the store. + ASSERT_EQ(has_object, false); + } + i++; + } } } // namespace plasma diff --git a/cpp/src/plasma/test/run_tests.sh b/cpp/src/plasma/test/run_tests.sh deleted file mode 100644 index 958bd08398e23..0000000000000 --- a/cpp/src/plasma/test/run_tests.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Cause the script to exit if a single command fails. -set -e - -./src/plasma/plasma_store -s /tmp/plasma_store_socket_1 -m 0 & -sleep 1 -./src/plasma/manager_tests -killall plasma_store -./src/plasma/serialization_tests - -# Start the Redis shards. -./src/common/thirdparty/redis/src/redis-server --loglevel warning --loadmodule ./src/common/redis_module/libray_redis_module.so --port 6379 & -redis_pid1=$! -./src/common/thirdparty/redis/src/redis-server --loglevel warning --loadmodule ./src/common/redis_module/libray_redis_module.so --port 6380 & -redis_pid2=$! -sleep 1 - -# Flush the redis server -./src/common/thirdparty/redis/src/redis-cli flushall -# Register the shard location with the primary shard. -./src/common/thirdparty/redis/src/redis-cli set NumRedisShards 1 -./src/common/thirdparty/redis/src/redis-cli rpush RedisShards 127.0.0.1:6380 -sleep 1 -./src/plasma/plasma_store -s /tmp/store1 -m 1000000000 & -plasma1_pid=$! -./src/plasma/plasma_manager -m /tmp/manager1 -s /tmp/store1 -h 127.0.0.1 -p 11111 -r 127.0.0.1:6379 & -plasma2_pid=$! -./src/plasma/plasma_store -s /tmp/store2 -m 1000000000 & -plasma3_pid=$! -./src/plasma/plasma_manager -m /tmp/manager2 -s /tmp/store2 -h 127.0.0.1 -p 22222 -r 127.0.0.1:6379 & -plasma4_pid=$! 
-sleep 1 - -./src/plasma/client_tests - -kill $plasma4_pid -kill $plasma3_pid -kill $plasma2_pid -kill $plasma1_pid -kill $redis_pid1 -wait $redis_pid1 -kill $redis_pid2 -wait $redis_pid2 diff --git a/cpp/thirdparty/jemalloc/17c897976c60b0e6e4f4a365c751027244dada7a.tar.gz b/cpp/thirdparty/jemalloc/17c897976c60b0e6e4f4a365c751027244dada7a.tar.gz new file mode 100644 index 0000000000000..29d9266a12ded Binary files /dev/null and b/cpp/thirdparty/jemalloc/17c897976c60b0e6e4f4a365c751027244dada7a.tar.gz differ diff --git a/cpp/thirdparty/jemalloc/README.md b/cpp/thirdparty/jemalloc/README.md new file mode 100644 index 0000000000000..272ff9c730be1 --- /dev/null +++ b/cpp/thirdparty/jemalloc/README.md @@ -0,0 +1,22 @@ + + +This directory contains a vendored commit from the jemalloc stable-4 branch. +You can bump the version by downloading +https://github.com/jemalloc/jemalloc/archive/{{ commit }}.tar.gz diff --git a/dev/README.md b/dev/README.md index f3e963cf491bb..62ffb0a8f1416 100644 --- a/dev/README.md +++ b/dev/README.md @@ -113,3 +113,11 @@ Studio 2015): ``` dev/release/verify-release-candidate.bat apache-arrow-0.7.0.tar.gz ``` + +### Verifying the JavaScript release + +For JavaScript-specific releases, use a different verification script: + +```shell +bash dev/release/js-verify-release-candidate.sh 0.7.0 0 +``` \ No newline at end of file diff --git a/dev/dask_integration.sh b/dev/dask_integration.sh new file mode 100755 index 0000000000000..d344328b6af1e --- /dev/null +++ b/dev/dask_integration.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Pass the service name to run_docker_compose.sh +# Which validates environment and runs the service +exec "$(dirname ${BASH_SOURCE})"/run_docker_compose.sh dask_integration diff --git a/dev/dask_integration/Dockerfile b/dev/dask_integration/Dockerfile new file mode 100644 index 0000000000000..f72ef8ca0daab --- /dev/null +++ b/dev/dask_integration/Dockerfile @@ -0,0 +1,88 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +FROM ubuntu:14.04 +ADD . 
/apache-arrow +WORKDIR /apache-arrow +# Basic OS utilities +RUN apt-get update && apt-get install -y \ + wget \ + git \ + gcc \ + g++ +# This will install conda in /home/ubuntu/miniconda +RUN wget -O /tmp/miniconda.sh \ + https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \ + rm /tmp/miniconda.sh +# Create Conda environment +ENV PATH="/home/ubuntu/miniconda/bin:${PATH}" +RUN conda create -y -q -n test-environment \ + python=3.6 +# Install dependencies +RUN conda install -c conda-forge \ + numpy \ + pandas \ + bcolz \ + blosc \ + bokeh \ + boto3 \ + chest \ + cloudpickle \ + coverage \ + cytoolz \ + distributed \ + graphviz \ + h5py \ + ipython \ + partd \ + psutil \ + "pytest<=3.1.1" \ + scikit-image \ + scikit-learn \ + scipy \ + sqlalchemy \ + toolz +# install pytables from defaults for now +RUN conda install pytables + +RUN pip install -q git+https://github.com/dask/partd --upgrade --no-deps +RUN pip install -q git+https://github.com/dask/zict --upgrade --no-deps +RUN pip install -q git+https://github.com/dask/distributed --upgrade --no-deps +RUN pip install -q git+https://github.com/mrocklin/sparse --upgrade --no-deps +RUN pip install -q git+https://github.com/dask/s3fs --upgrade --no-deps + +RUN conda install -q -c conda-forge numba cython +RUN pip install -q git+https://github.com/dask/fastparquet + +RUN pip install -q \ + cachey \ + graphviz \ + moto \ + pyarrow \ + --upgrade --no-deps + +RUN pip install -q \ + cityhash \ + flake8 \ + mmh3 \ + pandas_datareader \ + pytest-xdist \ + xxhash \ + pycodestyle + +CMD arrow/dev/dask_integration/dask_integration.sh + diff --git a/dev/dask_integration/dask_integration.sh b/dev/dask_integration/dask_integration.sh new file mode 100755 index 0000000000000..f5a24e462b742 --- /dev/null +++ b/dev/dask_integration/dask_integration.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more 
+# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set up environment and working directory +cd /apache-arrow + +export ARROW_BUILD_TYPE=release +export ARROW_HOME=$(pwd)/dist +export PARQUET_HOME=$(pwd)/dist +CONDA_BASE=/home/ubuntu/miniconda +export LD_LIBRARY_PATH=$(pwd)/dist/lib:${CONDA_BASE}/lib:${LD_LIBRARY_PATH} + +# Allow for --user Python installation inside Docker +export HOME=$(pwd) + +# Clean up and get the dask master branch from github +rm -rf dask .local +export GIT_COMMITTER_NAME="Nobody" +export GIT_COMMITTER_EMAIL="nobody@nowhere.com" +git clone https://github.com/dask/dask.git +pushd dask +pip install --user -e .[complete] +# Verify integrity of the installed dask dataframe code +py.test dask/dataframe/tests/test_dataframe.py +popd + +# Run the integration test +pushd arrow/python/testing +py.test dask_tests +popd + +pushd dask/dask/dataframe/io +py.test tests/test_parquet.py +popd diff --git a/dev/docker-compose.yml b/dev/docker-compose.yml index 7bd2cd4412cec..4b9014894003b 100644 --- a/dev/docker-compose.yml +++ b/dev/docker-compose.yml @@ -28,3 +28,8 @@ services: - "4000:4000" volumes: - ../..:/apache-arrow + dask_integration: + build: + context: dask_integration + volumes: + - ../..:/apache-arrow diff --git a/dev/gen_apidocs/Dockerfile b/dev/gen_apidocs/Dockerfile index 
0b2844cc8454b..ca4718e637842 100644 --- a/dev/gen_apidocs/Dockerfile +++ b/dev/gen_apidocs/Dockerfile @@ -15,8 +15,6 @@ # limitations under the License. # FROM ubuntu:14.04 -ADD . /apache-arrow -WORKDIR /apache-arrow # Prerequsites for apt-add-repository RUN apt-get update && apt-get install -y \ software-properties-common python-software-properties @@ -28,20 +26,12 @@ RUN apt-add-repository -y ppa:ubuntu-toolchain-r/test && \ git \ gcc-4.9 \ g++-4.9 \ - build-essential + build-essential # This will install conda in /home/ubuntu/miniconda RUN wget -O /tmp/miniconda.sh \ https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \ rm /tmp/miniconda.sh -# C++ dependencies -RUN /home/ubuntu/miniconda/bin/conda install -c conda-forge \ - boost-cpp \ - doxygen \ - maven \ - cmake \ - zlib \ - thrift-cpp # C_Glib dependencies RUN apt-get install -y \ libgtk2.0-dev \ @@ -69,6 +59,7 @@ RUN /home/ubuntu/miniconda/bin/conda create -y -q -n pyarrow-dev \ six \ setuptools \ # C++ + boost-cpp \ cmake \ flatbuffers \ rapidjson \ @@ -79,5 +70,9 @@ RUN /home/ubuntu/miniconda/bin/conda create -y -q -n pyarrow-dev \ jemalloc \ lz4-c \ zstd \ + doxygen \ + maven \ -c conda-forge +ADD . /apache-arrow +WORKDIR /apache-arrow CMD arrow/dev/gen_apidocs/create_documents.sh diff --git a/dev/gen_apidocs/create_documents.sh b/dev/gen_apidocs/create_documents.sh index afbe041506d85..566d9cee79c85 100755 --- a/dev/gen_apidocs/create_documents.sh +++ b/dev/gen_apidocs/create_documents.sh @@ -16,6 +16,8 @@ # limitations under the License. 
# +set -ex + # Set up environment and output directory for C++ libraries cd /apache-arrow rm -rf dist @@ -25,8 +27,6 @@ export ARROW_HOME=$(pwd)/dist export PARQUET_HOME=$(pwd)/dist CONDA_BASE=/home/ubuntu/miniconda export LD_LIBRARY_PATH=$(pwd)/dist/lib:${CONDA_BASE}/lib:${LD_LIBRARY_PATH} -export THRIFT_HOME=${CONDA_BASE} -export BOOST_ROOT=${CONDA_BASE} export PATH=${CONDA_BASE}/bin:${PATH} # Prepare the asf-site before copying api docs @@ -41,9 +41,13 @@ popd # Make Python documentation (Depends on C++ ) # Build Arrow C++ source activate pyarrow-dev -rm -rf arrow/cpp/build -mkdir arrow/cpp/build -pushd arrow/cpp/build + +export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX +export PARQUET_BUILD_TOOLCHAIN=$CONDA_PREFIX + +rm -rf arrow/cpp/build_docs +mkdir arrow/cpp/build_docs +pushd arrow/cpp/build_docs cmake -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ -DARROW_PYTHON=on \ @@ -55,9 +59,9 @@ make install popd # Build Parquet C++ -rm -rf parquet-cpp/build -mkdir parquet-cpp/build -pushd parquet-cpp/build +rm -rf parquet-cpp/build_docs +mkdir parquet-cpp/build_docs +pushd parquet-cpp/build_docs cmake -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ -DCMAKE_INSTALL_PREFIX=$PARQUET_HOME \ -DPARQUET_BUILD_BENCHMARKS=off \ diff --git a/dev/release/README b/dev/release/README index dd2f512a0bb1b..8e0104ed83318 100644 --- a/dev/release/README +++ b/dev/release/README @@ -49,3 +49,16 @@ Logon to the apache repository: https://repository.apache.org/#stagingRepositori Select the arrow staging repository you just just created: orgapachearrow-100x Click the "close" button Once validation has passed, click the "release" button + +# Releasing JavaScript + +* Make release branch then tag the release + +git checkout -b release-js-X.Y.Z +git tag -a apache-arrow-js-X.Y.Z + +* Build the source release (requires NodeJS) + +dev/release/js-source-release.sh X.Y.Z $RC_NUM + +* After release vote push tag, and rebase master on release branch \ No newline at end of file diff 
--git a/dev/release/RELEASE_MANAGEMENT.md b/dev/release/RELEASE_MANAGEMENT.md index ce7835d5f5d85..73eaf5f95b3b0 100644 --- a/dev/release/RELEASE_MANAGEMENT.md +++ b/dev/release/RELEASE_MANAGEMENT.md @@ -63,7 +63,7 @@ in the `_release` subdirectory. The new contents of the new entry will go into a new Markdown file of the form `X.Y.Z.md`. You can start by copying one of the other release entries. -Generate a web-friendly changelog by running +Generate a web-friendly changelog by running (python3) ``` dev/release/changelog.py $VERSION 1 diff --git a/dev/release/js-source-release.sh b/dev/release/js-source-release.sh new file mode 100755 index 0000000000000..bf32acd052403 --- /dev/null +++ b/dev/release/js-source-release.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +set -e + +SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 <js-version> <rc-num>" + exit 1 +fi + +js_version=$1 +rc=$2 + +tag=apache-arrow-js-${js_version} +tagrc=${tag}-rc${rc} + +echo "Preparing source for tag ${tag}" + +tarball=${tag}.tar.gz + +# cd to $ARROW_HOME/js +cd $SOURCE_DIR/../../js +JS_SRC_DIR="$PWD" +# npm pack the js source files +npm install + +npm version --no-git-tag-version $js_version +git add package.json +git commit -m "[Release] Apache Arrow JavaScript $js_version" +git tag -a ${tag} + +release_hash=`git rev-list $tag 2> /dev/null | head -n 1 ` + +if [ -z "$release_hash" ]; then + echo "Cannot continue: unknown git tag: $tag" + exit 1 +fi + +echo "Using commit $release_hash" + +cd $SOURCE_DIR + +rm -rf js-tmp +# `npm pack` writes the .tgz file to the current dir, so cd into js-tmp +mkdir -p js-tmp +cd js-tmp +# run npm pack on `arrow/js` +npm pack ${JS_SRC_DIR} +# unzip and remove the npm pack tarball +tar -xzf *.tgz && rm *.tgz +# `npm pack` puts files in a dir called "package" +cp $JS_SRC_DIR/../NOTICE.txt package +cp $JS_SRC_DIR/../LICENSE.txt package +# rename "package" to $tag +mv package ${tag} +tar czf ${tarball} ${tag} +rm -rf ${tag} + +${SOURCE_DIR}/run-rat.sh ${tarball} + +# sign the archive +gpg --armor --output ${tarball}.asc --detach-sig ${tarball} +gpg --print-md MD5 ${tarball} > ${tarball}.md5 +sha1sum $tarball > ${tarball}.sha1 +sha256sum $tarball > ${tarball}.sha256 +sha512sum $tarball > ${tarball}.sha512 + +# check out the arrow RC folder +svn co --depth=empty https://dist.apache.org/repos/dist/dev/arrow js-rc-tmp + +# add the release candidate for the tag +mkdir -p js-rc-tmp/${tagrc} +cp ${tarball}* js-rc-tmp/${tagrc} +svn add js-rc-tmp/${tagrc} +svn ci -m "Apache Arrow JavaScript ${js_version} RC${rc}" js-rc-tmp/${tagrc} + +cd - + +# clean up +rm -rf js-tmp + +echo "Success! 
The release candidate is available here:" +echo " https://dist.apache.org/repos/dist/dev/arrow/${tagrc}" +echo "" +echo "Commit SHA1: ${release_hash}" diff --git a/dev/release/js-verify-release-candidate.sh b/dev/release/js-verify-release-candidate.sh new file mode 100755 index 0000000000000..5a37e10f74afb --- /dev/null +++ b/dev/release/js-verify-release-candidate.sh @@ -0,0 +1,100 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +# Requirements +# - nodejs >= 6.0.0 (best way is to use nvm) + +case $# in + 2) VERSION="$1" + RC_NUMBER="$2" + ;; + + *) echo "Usage: $0 X.Y.Z RC_NUMBER" + exit 1 + ;; +esac + +set -ex + +HERE=$(cd `dirname "${BASH_SOURCE[0]:-$0}"` && pwd) + +ARROW_DIST_URL='https://dist.apache.org/repos/dist/dev/arrow' + +download_dist_file() { + curl -f -O $ARROW_DIST_URL/$1 +} + +download_rc_file() { + download_dist_file apache-arrow-js-${VERSION}-rc${RC_NUMBER}/$1 +} + +import_gpg_keys() { + download_dist_file KEYS + gpg --import KEYS +} + +fetch_archive() { + local dist_name=$1 + download_rc_file ${dist_name}.tar.gz + download_rc_file ${dist_name}.tar.gz.asc + download_rc_file ${dist_name}.tar.gz.md5 + download_rc_file ${dist_name}.tar.gz.sha512 + gpg --verify ${dist_name}.tar.gz.asc ${dist_name}.tar.gz + gpg --print-md MD5 ${dist_name}.tar.gz | diff - ${dist_name}.tar.gz.md5 + if [ "$(uname)" == "Darwin" ]; then + shasum -a 512 ${dist_name}.tar.gz | diff - ${dist_name}.tar.gz.sha512 + else + sha512sum ${dist_name}.tar.gz | diff - ${dist_name}.tar.gz.sha512 + fi +} + +setup_tempdir() { + cleanup() { + rm -fr "$TMPDIR" + } + trap cleanup EXIT + TMPDIR=$(mktemp -d -t "$1.XXXXX") +} + +setup_tempdir "arrow-js-$VERSION" +echo "Working in sandbox $TMPDIR" +cd $TMPDIR + +VERSION=$1 +RC_NUMBER=$2 + +TARBALL=apache-arrow-js-$1.tar.gz + +import_gpg_keys + +DIST_NAME="apache-arrow-js-${VERSION}" +fetch_archive $DIST_NAME +tar xvzf ${DIST_NAME}.tar.gz +cd ${DIST_NAME} + +npm install +# npx run-s clean:all lint create:testdata build +# npm run test -- -t ts -u --integration +# npm run test -- --integration +npx run-s clean:all lint build +npm run test + +echo 'Release candidate looks good!' 
+exit 0 diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index db3b3aa5fac9c..bf962bcd49e42 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -10,6 +10,12 @@ cpp/src/arrow/io/mman.h cpp/src/arrow/util/random.h cpp/src/arrow/status.cc cpp/src/arrow/status.h +cpp/src/arrow/util/variant.h +cpp/src/arrow/util/variant/optional.h +cpp/src/arrow/util/variant/recursive_wrapper.h +cpp/src/arrow/util/variant/variant_cast.h +cpp/src/arrow/util/variant/variant_io.h +cpp/src/arrow/util/variant/variant_visitor.h cpp/build-support/asan_symbolize.py cpp/build-support/cpplint.py cpp/build-support/clang_format_exclusions.txt diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 8d1bed75b8b05..f33211e26a3f6 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -165,7 +165,9 @@ test_glib() { make -j$NPROC make install - NO_MAKE=yes test/run-test.sh + GI_TYPELIB_PATH=$ARROW_HOME/lib/girepository-1.0 \ + NO_MAKE=yes \ + test/run-test.sh popd } @@ -173,7 +175,18 @@ test_glib() { test_js() { pushd js npm install - npm run validate + # clean, lint, and build JS source + npx run-s clean:all lint build + npm run test + + # create initial integration test data + # npm run create:testdata + + # run once to write the snapshots + # npm test -- -t ts -u --integration + + # run again to test all builds against the snapshots + # npm test -- --integration popd } @@ -229,13 +242,14 @@ fetch_archive $DIST_NAME tar xvzf ${DIST_NAME}.tar.gz cd ${DIST_NAME} +test_package_java setup_miniconda test_and_install_cpp +test_integration +test_glib install_parquet_cpp test_python -test_glib -test_package_java -test_integration + test_js echo 'Release candidate looks good!' 
diff --git a/dev/run_docker_compose.sh b/dev/run_docker_compose.sh index f46879ed1e436..681a3a75ffe20 100755 --- a/dev/run_docker_compose.sh +++ b/dev/run_docker_compose.sh @@ -37,4 +37,4 @@ fi GID=$(id -g ${USERNAME}) docker-compose -f arrow/dev/docker-compose.yml run \ - -u "${UID}:${GID}" "${1}" + --rm -u "${UID}:${GID}" "${1}" diff --git a/format/IPC.md b/format/IPC.md index 2f79031443b17..5a5d3aef62be7 100644 --- a/format/IPC.md +++ b/format/IPC.md @@ -67,7 +67,9 @@ We provide a streaming format for record batches. It is presented as a sequence of encapsulated messages, each of which follows the format above. The schema comes first in the stream, and it is the same for all of the record batches that follow. If any fields in the schema are dictionary-encoded, one or more -`DictionaryBatch` messages will follow the schema. +`DictionaryBatch` messages will be included. `DictionaryBatch` and +`RecordBatch` messages may be interleaved, but before any dictionary key is used +in a `RecordBatch` it should be defined in a `DictionaryBatch`. ``` @@ -76,6 +78,10 @@ that follow. If any fields in the schema are dictionary-encoded, one or more ... + +... + +... ``` @@ -109,6 +115,10 @@ Schematically we have: ``` +In the file format, there is no requirement that dictionary keys should be +defined in a `DictionaryBatch` before they are used in a `RecordBatch`, as long +as the keys are defined somewhere in the file. + ### RecordBatch body structure The `RecordBatch` metadata contains a depth-first (pre-order) flattened set of @@ -181,6 +191,7 @@ the dictionaries can be properly interpreted. table DictionaryBatch { id: long; data: RecordBatch; + isDelta: boolean = false; } ``` @@ -189,6 +200,38 @@ in the schema, so that dictionaries can even be used for multiple fields. See the [Physical Layout][4] document for more about the semantics of dictionary-encoded data. +The dictionary `isDelta` flag allows dictionary batches to be modified +mid-stream. 
A dictionary batch with `isDelta` set indicates that its vector +should be concatenated with those of any previous batches with the same `id`. A +stream which encodes one column, the list of strings +`["A", "B", "C", "B", "D", "C", "E", "A"]`, with a delta dictionary batch could +take the form: + +``` + + +(0) "A" +(1) "B" +(2) "C" + + +0 +1 +2 +1 + + +(3) "D" +(4) "E" + + +3 +2 +4 +0 +EOS +``` + ### Tensor (Multi-dimensional Array) Message Format The `Tensor` message types provides a way to write a multidimensional array of @@ -209,5 +252,5 @@ shared memory region) to be a multiple of 8: [1]: https://github.com/apache/arrow/blob/master/format/File.fbs [2]: https://github.com/apache/arrow/blob/master/format/Message.fbs -[3]: https://github.com/google]/flatbuffers +[3]: https://github.com/google/flatbuffers [4]: https://github.com/apache/arrow/blob/master/format/Layout.md diff --git a/format/Layout.md b/format/Layout.md index ebf93821aab24..963202f9fb77a 100644 --- a/format/Layout.md +++ b/format/Layout.md @@ -615,9 +615,9 @@ the the types array indicates that a slot contains a different type at the index ## Dictionary encoding When a field is dictionary encoded, the values are represented by an array of Int32 representing the index of the value in the dictionary. -The Dictionary is received as a DictionaryBatch whose id is referenced by a dictionary attribute defined in the metadata ([Message.fbs][7]) in the Field table. -The dictionary has the same layout as the type of the field would dictate. Each entry in the dictionary can be accessed by its index in the DictionaryBatch. -When a Schema references a Dictionary id, it must send a DictionaryBatch for this id before any RecordBatch. +The Dictionary is received as one or more DictionaryBatches with the id referenced by a dictionary attribute defined in the metadata ([Message.fbs][7]) in the Field table. +The dictionary has the same layout as the type of the field would dictate. 
Each entry in the dictionary can be accessed by its index in the DictionaryBatches. +When a Schema references a Dictionary id, it must send at least one DictionaryBatch for this id. As an example, you could have the following data: ``` diff --git a/format/Message.fbs b/format/Message.fbs index f4a95713cea93..830718139d88c 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -61,16 +61,20 @@ table RecordBatch { buffers: [Buffer]; } -/// ---------------------------------------------------------------------- /// For sending dictionary encoding information. Any Field can be /// dictionary-encoded, but in this case none of its children may be /// dictionary-encoded. -/// There is one vector / column per dictionary -/// +/// There is one vector / column per dictionary, but that vector / column +/// may be spread across multiple dictionary batches by using the isDelta +/// flag table DictionaryBatch { id: long; data: RecordBatch; + + /// If isDelta is true the values in the dictionary are to be appended to a + /// dictionary with the indicated id + isDelta: bool = false; } /// ---------------------------------------------------------------------- diff --git a/format/Schema.fbs b/format/Schema.fbs index 186f8e362bde2..3d739342b83c3 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -20,9 +20,17 @@ namespace org.apache.arrow.flatbuf; enum MetadataVersion:short { + /// 0.1.0 V1, + + /// 0.2.0 V2, - V3 + + /// 0.3.0 -> 0.7.1 + V3, + + /// >= 0.8.0 + V4 } /// These are stored in the flatbuffer in the Type union below @@ -203,32 +211,6 @@ union Type { Map } -/// ---------------------------------------------------------------------- -/// The possible types of a vector - -enum VectorType: short { - /// used in List type, Dense Union and variable length primitive types (String, Binary) - OFFSET, - /// actual data, either wixed width primitive types in slots or variable width delimited by an OFFSET vector - DATA, - /// Bit vector indicating if each value is null - 
VALIDITY, - /// Type vector used in Union type - TYPE -} - -/// ---------------------------------------------------------------------- -/// represents the physical layout of a buffer -/// buffers have fixed width slots of a given type - -table VectorLayout { - /// the width of a slot in the buffer (typically 1, 8, 16, 32 or 64) - bit_width: short; - /// the purpose of the vector - type: VectorType; -} - - /// ---------------------------------------------------------------------- /// user defined key value pairs to add custom metadata to arrow /// key namespacing is the responsibility of the user @@ -277,10 +259,7 @@ table Field { // children apply only to Nested data types like Struct, List and Union children: [Field]; - /// layout of buffers produced for this type (as derived from the Type) - /// does not include children - /// each recordbatch will return instances of those Buffers. - layout: [ VectorLayout ]; + // User-defined metadata custom_metadata: [ KeyValue ]; } @@ -293,10 +272,6 @@ enum Endianness:short { Little, Big } /// ---------------------------------------------------------------------- /// A Buffer represents a single contiguous memory segment struct Buffer { - /// The shared memory page id where this buffer is located. Currently this is - /// not used - page: int; - /// The relative offset into the shared memory page where the bytes for this /// buffer starts offset: long; diff --git a/integration/README.md b/integration/README.md index 6e5a6c15641ec..8d6df8e5772de 100644 --- a/integration/README.md +++ b/integration/README.md @@ -77,7 +77,7 @@ export ARROW_CPP_EXE_PATH=$CPP_BUILD_DIR/debug Here `$ARROW_HOME` is the location of your Arrow git clone. The `$CPP_BUILD_DIR` may be different depending on how you built with CMake -(in-source of out-of-source). +(in-source or out-of-source). 
Once this is done, run the integration tests with (optionally adding `--debug` for additional output) @@ -88,4 +88,4 @@ python integration_test.py python integration_test.py --debug # additional output ``` -[1]: https://conda.io/miniconda.html \ No newline at end of file +[1]: https://conda.io/miniconda.html diff --git a/integration/data/simple.json b/integration/data/simple.json index fb903e7ac4b63..6634729193b4e 100644 --- a/integration/data/simple.json +++ b/integration/data/simple.json @@ -4,36 +4,20 @@ { "name": "foo", "type": {"name": "int", "isSigned": true, "bitWidth": 32}, - "nullable": true, "children": [], - "typeLayout": { - "vectors": [ - {"type": "VALIDITY", "typeBitWidth": 1}, - {"type": "DATA", "typeBitWidth": 32} - ] - } + "nullable": true, + "children": [] }, { "name": "bar", "type": {"name": "floatingpoint", "precision": "DOUBLE"}, - "nullable": true, "children": [], - "typeLayout": { - "vectors": [ - {"type": "VALIDITY", "typeBitWidth": 1}, - {"type": "DATA", "typeBitWidth": 64} - ] - } + "nullable": true, + "children": [] }, { "name": "baz", "type": {"name": "utf8"}, - "nullable": true, "children": [], - "typeLayout": { - "vectors": [ - {"type": "VALIDITY", "typeBitWidth": 1}, - {"type": "OFFSET", "typeBitWidth": 32}, - {"type": "DATA", "typeBitWidth": 8} - ] - } + "nullable": true, + "children": [] } ] }, @@ -61,6 +45,54 @@ "DATA": ["aa", "", "", "bbb", "cccc"] } ] + }, + { + "count": 5, + "columns": [ + { + "name": "foo", + "count": 5, + "VALIDITY": [1, 1, 1, 1, 1], + "DATA": [1, 2, 3, 4, 5] + }, + { + "name": "bar", + "count": 5, + "VALIDITY": [1, 1, 1, 1, 1], + "DATA": [1.0, 2.0, 3.0, 4.0, 5.0] + }, + { + "name": "baz", + "count": 5, + "VALIDITY": [1, 1, 1, 1, 1], + "OFFSET": [0, 2, 3, 4, 7, 11], + "DATA": ["aa", "b", "c", "ddd", "eeee"] + } + ] + }, + { + "count": 5, + "columns": [ + { + "name": "foo", + "count": 5, + "VALIDITY": [0, 0, 0, 0, 0], + "DATA": [1, 2, 3, 4, 5] + }, + { + "name": "bar", + "count": 5, + "VALIDITY": [0, 0, 0, 0, 
0], + "DATA": [1.0, 2.0, 3.0, 4.0, 5.0] + }, + { + "name": "baz", + "count": 5, + "VALIDITY": [0, 0, 0, 0, 0], + "OFFSET": [0, 0, 0, 0, 0, 0], + "DATA": ["", "", "", "", ""] + } + ] } ] } diff --git a/integration/data/struct_example.json b/integration/data/struct_example.json index 3ea062db7ba32..4e6cc774e3151 100644 --- a/integration/data/struct_example.json +++ b/integration/data/struct_example.json @@ -16,19 +16,7 @@ "bitWidth": 32 }, "nullable": true, - "children": [], - "typeLayout": { - "vectors": [ - { - "type": "VALIDITY", - "typeBitWidth": 1 - }, - { - "type": "DATA", - "typeBitWidth": 32 - } - ] - } + "children": [] }, { "name": "f2", @@ -36,33 +24,9 @@ "name": "utf8" }, "nullable": true, - "children": [], - "typeLayout": { - "vectors": [ - { - "type": "VALIDITY", - "typeBitWidth": 1 - }, - { - "type": "OFFSET", - "typeBitWidth": 32 - }, - { - "type": "DATA", - "typeBitWidth": 8 - } - ] - } + "children": [] } - ], - "typeLayout": { - "vectors": [ - { - "type": "VALIDITY", - "typeBitWidth": 1 - } - ] - } + ] } ] }, @@ -234,4 +198,4 @@ ] } ] -} \ No newline at end of file +} diff --git a/integration/integration_test.py b/integration/integration_test.py index 59a1de5a4639d..79b098a614a6c 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -28,6 +28,7 @@ import subprocess import tempfile import uuid +import errno import numpy as np @@ -65,24 +66,16 @@ def rands(nchars): return ''.join(np.random.choice(RANDS_CHARS, nchars)) -if six.PY2: - def frombytes(o): - return o +def tobytes(o): + if isinstance(o, six.text_type): + return o.encode('utf8') + return o - def tobytes(o): - if isinstance(o, unicode): - return o.encode('utf8') - else: - return o -else: - def tobytes(o): - if isinstance(o, str): - return o.encode('utf8') - else: - return o - def frombytes(o): +def frombytes(o): + if isinstance(o, six.binary_type): return o.decode('utf8') + return o # from the merge_arrow_pr.py script @@ -118,8 +111,7 @@ def get_json(self): 
('name', self.name), ('type', self._get_type()), ('nullable', self.nullable), - ('children', self._get_children()), - ('typeLayout', self._get_type_layout()) + ('children', self._get_children()) ]) def _make_is_valid(self, size): @@ -165,19 +157,11 @@ class PrimitiveType(DataType): def _get_children(self): return [] - def _get_type_layout(self): - return OrderedDict([ - ('vectors', - [OrderedDict([('type', 'VALIDITY'), - ('typeBitWidth', 1)]), - OrderedDict([('type', 'DATA'), - ('typeBitWidth', self.bit_width)])])]) - class PrimitiveColumn(Column): def __init__(self, name, count, is_valid, values): - Column.__init__(self, name, count) + super(PrimitiveColumn, self).__init__(name, count) self.is_valid = is_valid self.values = values @@ -191,23 +175,33 @@ def _get_buffers(self): ] -TEST_INT_MIN = - 2**31 + 1 -TEST_INT_MAX = 2**31 - 1 +TEST_INT_MAX = 2 ** 31 - 1 +TEST_INT_MIN = ~TEST_INT_MAX + class IntegerType(PrimitiveType): def __init__(self, name, is_signed, bit_width, nullable=True, min_value=TEST_INT_MIN, max_value=TEST_INT_MAX): - PrimitiveType.__init__(self, name, nullable=nullable) + super(IntegerType, self).__init__(name, nullable=nullable) self.is_signed = is_signed self.bit_width = bit_width self.min_value = min_value self.max_value = max_value - @property - def numpy_type(self): - return ('int' if self.is_signed else 'uint') + str(self.bit_width) + def _get_generated_data_bounds(self): + signed_iinfo = np.iinfo('int' + str(self.bit_width)) + if self.is_signed: + min_value, max_value = signed_iinfo.min, signed_iinfo.max + else: + # ARROW-1837 Remove this hack and restore full unsigned integer + # range + min_value, max_value = 0, signed_iinfo.max + + lower_bound = max(min_value, self.min_value) + upper_bound = min(max_value, self.max_value) + return lower_bound, upper_bound def _get_type(self): return OrderedDict([ @@ -217,9 +211,7 @@ def _get_type(self): ]) def generate_column(self, size, name=None): - iinfo = np.iinfo(self.numpy_type) - lower_bound = 
max(iinfo.min, self.min_value) - upper_bound = min(iinfo.max, self.max_value) + lower_bound, upper_bound = self._get_generated_data_bounds() return self.generate_range(size, lower_bound, upper_bound, name=name) def generate_range(self, size, lower, upper, name=None): @@ -238,10 +230,21 @@ class DateType(IntegerType): DAY = 0 MILLISECOND = 1 + # 1/1/1 to 12/31/9999 + _ranges = { + DAY: [-719162, 2932896], + MILLISECOND: [-62135596800000, 253402214400000] + } + def __init__(self, name, unit, nullable=True): - self.unit = unit bit_width = 32 if unit == self.DAY else 64 - IntegerType.__init__(self, name, True, bit_width, nullable=nullable) + + min_value, max_value = self._ranges[unit] + super(DateType, self).__init__( + name, True, bit_width, nullable=nullable, + min_value=min_value, max_value=max_value + ) + self.unit = unit def _get_type(self): return OrderedDict([ @@ -267,10 +270,20 @@ class TimeType(IntegerType): 'ns': 64 } + _ranges = { + 's': [0, 86400], + 'ms': [0, 86400000], + 'us': [0, 86400000000], + 'ns': [0, 86400000000000] + } + def __init__(self, name, unit='s', nullable=True): + min_val, max_val = self._ranges[unit] + super(TimeType, self).__init__(name, True, self.BIT_WIDTHS[unit], + nullable=nullable, + min_value=min_val, + max_value=max_val) self.unit = unit - IntegerType.__init__(self, name, True, self.BIT_WIDTHS[unit], - nullable=nullable) def _get_type(self): return OrderedDict([ @@ -282,10 +295,23 @@ def _get_type(self): class TimestampType(IntegerType): + # 1/1/1 to 12/31/9999 + _ranges = { + 's': [-62135596800, 253402214400], + 'ms': [-62135596800000, 253402214400000], + 'us': [-62135596800000000, 253402214400000000], + + # Physical range for int64, ~584 years and change + 'ns': [np.iinfo('int64').min, np.iinfo('int64').max] + } + def __init__(self, name, unit='s', tz=None, nullable=True): + min_val, max_val = self._ranges[unit] + super(TimestampType, self).__init__(name, True, 64, nullable=nullable, + min_value=min_val, + max_value=max_val) 
self.unit = unit self.tz = tz - IntegerType.__init__(self, name, True, 64, nullable=nullable) def _get_type(self): fields = [ @@ -302,7 +328,7 @@ def _get_type(self): class FloatingPointType(PrimitiveType): def __init__(self, name, bit_width, nullable=True): - PrimitiveType.__init__(self, name, nullable=nullable) + super(FloatingPointType, self).__init__(name, nullable=nullable) self.bit_width = bit_width self.precision = { @@ -331,13 +357,30 @@ def generate_column(self, size, name=None): return PrimitiveColumn(name, size, is_valid, values) -class DecimalType(PrimitiveType): - def __init__(self, name, bit_width, precision, scale, nullable=True): - PrimitiveType.__init__(self, name, nullable=True) +DECIMAL_PRECISION_TO_VALUE = { + key: (1 << (8 * i - 1)) - 1 for i, key in enumerate( + [1, 3, 5, 7, 10, 12, 15, 17, 19, 22, 24, 27, 29, 32, 34, 36], + start=1, + ) +} - self.bit_width = bit_width + +def decimal_range_from_precision(precision): + assert 1 <= precision <= 38 + try: + max_value = DECIMAL_PRECISION_TO_VALUE[precision] + except KeyError: + return decimal_range_from_precision(precision - 1) + else: + return ~max_value, max_value + + +class DecimalType(PrimitiveType): + def __init__(self, name, precision, scale, bit_width=128, nullable=True): + super(DecimalType, self).__init__(name, nullable=True) self.precision = precision self.scale = scale + self.bit_width = bit_width @property def numpy_type(self): @@ -350,16 +393,9 @@ def _get_type(self): ('scale', self.scale), ]) - def _get_type_layout(self): - return OrderedDict([ - ('vectors', - [OrderedDict([('type', 'VALIDITY'), - ('typeBitWidth', 1)]), - OrderedDict([('type', 'DATA'), - ('typeBitWidth', self.bit_width)])])]) - def generate_column(self, size, name=None): - values = [random.randint(0, 2**self.bit_width - 1) for x in range(size)] + min_value, max_value = decimal_range_from_precision(self.precision) + values = [random.randint(min_value, max_value) for _ in range(size)] is_valid = 
self._make_is_valid(size) if name is None: @@ -369,14 +405,12 @@ def generate_column(self, size, name=None): class DecimalColumn(PrimitiveColumn): - def __init__(self, name, count, is_valid, values, bit_width): - PrimitiveColumn.__init__(self, name, count, is_valid, values) + def __init__(self, name, count, is_valid, values, bit_width=128): + super(DecimalColumn, self).__init__(name, count, is_valid, values) self.bit_width = bit_width - self.hex_width = bit_width / 4 def _encode_value(self, x): - hex_format_str = '%%0%dx' % self.hex_width - return (hex_format_str % x).upper() + return str(x) class BooleanType(PrimitiveType): @@ -410,16 +444,6 @@ def column_class(self): def _get_type(self): return OrderedDict([('name', 'binary')]) - def _get_type_layout(self): - return OrderedDict([ - ('vectors', - [OrderedDict([('type', 'VALIDITY'), - ('typeBitWidth', 1)]), - OrderedDict([('type', 'OFFSET'), - ('typeBitWidth', 32)]), - OrderedDict([('type', 'DATA'), - ('typeBitWidth', 8)])])]) - def generate_column(self, size, name=None): K = 7 is_valid = self._make_is_valid(size) @@ -478,7 +502,7 @@ def get_json(self): class BinaryColumn(PrimitiveColumn): def _encode_value(self, x): - return frombytes(binascii.hexlify(x)) + return frombytes(binascii.hexlify(x).upper()) def _get_buffers(self): offset = 0 @@ -510,7 +534,7 @@ def _encode_value(self, x): class ListType(DataType): def __init__(self, name, value_type, nullable=True): - DataType.__init__(self, name, nullable=nullable) + super(ListType, self).__init__(name, nullable=nullable) self.value_type = value_type def _get_type(self): @@ -521,14 +545,6 @@ def _get_type(self): def _get_children(self): return [self.value_type.get_json()] - def _get_type_layout(self): - return OrderedDict([ - ('vectors', - [OrderedDict([('type', 'VALIDITY'), - ('typeBitWidth', 1)]), - OrderedDict([('type', 'OFFSET'), - ('typeBitWidth', 32)])])]) - def generate_column(self, size, name=None): MAX_LIST_SIZE = 4 @@ -553,7 +569,7 @@ def 
generate_column(self, size, name=None): class ListColumn(Column): def __init__(self, name, count, is_valid, offsets, values): - Column.__init__(self, name, count) + super(ListColumn, self).__init__(name, count) self.is_valid = is_valid self.offsets = offsets self.values = values @@ -571,7 +587,7 @@ def _get_children(self): class StructType(DataType): def __init__(self, name, field_types, nullable=True): - DataType.__init__(self, name, nullable=nullable) + super(StructType, self).__init__(name, nullable=nullable) self.field_types = field_types def _get_type(self): @@ -582,12 +598,6 @@ def _get_type(self): def _get_children(self): return [type_.get_json() for type_ in self.field_types] - def _get_type_layout(self): - return OrderedDict([ - ('vectors', - [OrderedDict([('type', 'VALIDITY'), - ('typeBitWidth', 1)])])]) - def generate_column(self, size, name=None): is_valid = self._make_is_valid(size) @@ -620,7 +630,7 @@ def get_json(self): class DictionaryType(DataType): def __init__(self, name, index_type, dictionary, nullable=True): - DataType.__init__(self, name, nullable=nullable) + super(DictionaryType, self).__init__(name, nullable=nullable) assert isinstance(index_type, IntegerType) assert isinstance(dictionary, Dictionary) @@ -638,13 +648,9 @@ def get_json(self): ('id', self.dictionary.id_), ('indexType', self.index_type._get_type()), ('isOrdered', self.dictionary.ordered) - ])), - ('typeLayout', self.index_type._get_type_layout()) + ])) ]) - def _get_type_layout(self): - return self.index_type._get_type_layout() - def generate_column(self, size, name=None): if name is None: name = self.name @@ -655,7 +661,7 @@ def generate_column(self, size, name=None): class StructColumn(Column): def __init__(self, name, count, is_valid, field_values): - Column.__init__(self, name, count) + super(StructColumn, self).__init__(name, count) self.is_valid = is_valid self.field_values = field_values @@ -742,7 +748,7 @@ def _generate_file(name, fields, batch_sizes, 
dictionaries=None): return JsonFile(name, schema, batches, dictionaries) -def generate_primitive_case(batch_sizes): +def generate_primitive_case(batch_sizes, name='primitive'): types = ['bool', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64', 'float32', 'float64', 'binary', 'utf8'] @@ -753,16 +759,17 @@ def generate_primitive_case(batch_sizes): fields.append(get_field(type_ + "_nullable", type_, True)) fields.append(get_field(type_ + "_nonnullable", type_, False)) - return _generate_file("primitive", fields, batch_sizes) + return _generate_file(name, fields, batch_sizes) def generate_decimal_case(): fields = [ - DecimalType('f1', 128, 24, 10, True), - DecimalType('f2', 128, 32, -10, True) + DecimalType(name='f{}'.format(i), precision=precision, scale=2) + for i, precision in enumerate(range(3, 39)) ] - batch_sizes = [7, 10] + possible_batch_sizes = 7, 10 + batch_sizes = [possible_batch_sizes[i % 2] for i in range(len(fields))] return _generate_file('decimal', fields, batch_sizes) @@ -830,8 +837,8 @@ def _temp_path(): return file_objs = [ - generate_primitive_case([7, 10]), - generate_primitive_case([0, 0, 0]), + generate_primitive_case([17, 20], name='primitive'), + generate_primitive_case([0, 0, 0], name='primitive_zerolength'), generate_decimal_case(), generate_datetime_case(), generate_nested_case(), @@ -861,14 +868,15 @@ def __init__(self, json_files, testers, debug=False): self.debug = debug def run(self): - for producer, consumer in itertools.product(self.testers, - self.testers): + for producer, consumer in itertools.product(filter(lambda t: t.PRODUCER, self.testers), + filter(lambda t: t.CONSUMER, self.testers)): self._compare_implementations(producer, consumer) def _compare_implementations(self, producer, consumer): print('##########################################################') - print('{0} producing, {1} consuming'.format(producer.name, - consumer.name)) + print( + '{0} producing, {1} consuming'.format(producer.name, 
consumer.name) + ) print('##########################################################') for json_path in self.json_files: @@ -901,6 +909,8 @@ def _compare_implementations(self, producer, consumer): class Tester(object): + PRODUCER = False + CONSUMER = False def __init__(self, debug=False): self.debug = debug @@ -919,6 +929,8 @@ def validate(self, json_path, arrow_path): class JavaTester(Tester): + PRODUCER = True + CONSUMER = True _arrow_version = load_version_from_pom() ARROW_TOOLS_JAR = os.environ.get( @@ -970,6 +982,8 @@ def file_to_stream(self, file_path, stream_path): class CPPTester(Tester): + PRODUCER = True + CONSUMER = True EXE_PATH = os.environ.get( 'ARROW_CPP_EXE_PATH', @@ -1017,6 +1031,41 @@ def file_to_stream(self, file_path, stream_path): print(cmd) os.system(cmd) +class JSTester(Tester): + PRODUCER = False + CONSUMER = True + + INTEGRATION_EXE = os.path.join(ARROW_HOME, 'js/bin/integration.js') + + name = 'JS' + + def _run(self, arrow_path=None, json_path=None, command='VALIDATE'): + cmd = [self.INTEGRATION_EXE] + + if arrow_path is not None: + cmd.extend(['-a', arrow_path]) + + if json_path is not None: + cmd.extend(['-j', json_path]) + + cmd.extend(['--mode', command]) + + if self.debug: + print(' '.join(cmd)) + + run_cmd(cmd) + + def validate(self, json_path, arrow_path): + return self._run(arrow_path, json_path, 'VALIDATE') + + def stream_to_file(self, stream_path, file_path): + # Just copy stream to file, we can read the stream directly + cmd = ['cp', stream_path, file_path] + cmd = ' '.join(cmd) + if self.debug: + print(cmd) + os.system(cmd) + def get_static_json_files(): glob_pattern = os.path.join(ARROW_HOME, 'integration', 'data', '*.json') @@ -1024,7 +1073,7 @@ def get_static_json_files(): def run_all_tests(debug=False): - testers = [CPPTester(debug=debug), JavaTester(debug=debug)] + testers = [CPPTester(debug=debug), JavaTester(debug=debug), JSTester(debug=debug)] static_json_files = get_static_json_files() generated_json_files = 
get_generated_json_files() json_files = static_json_files + generated_json_files @@ -1033,11 +1082,34 @@ def run_all_tests(debug=False): runner.run() print('-- All tests passed!') + +def write_js_test_json(directory): + generate_nested_case().write(os.path.join(directory, 'nested.json')) + generate_decimal_case().write(os.path.join(directory, 'decimal.json')) + generate_datetime_case().write(os.path.join(directory, 'datetime.json')) + (generate_dictionary_case() + .write(os.path.join(directory, 'dictionary.json'))) + (generate_primitive_case([7, 10]) + .write(os.path.join(directory, 'primitive.json'))) + (generate_primitive_case([0, 0, 0]) + .write(os.path.join(directory, 'primitive-empty.json'))) + + if __name__ == '__main__': parser = argparse.ArgumentParser(description='Arrow integration test CLI') + parser.add_argument('--write_generated_json', dest='generated_json_path', + action='store', default=False, + help='Generate test JSON') parser.add_argument('--debug', dest='debug', action='store_true', default=False, help='Run executables in debug mode as relevant') - args = parser.parse_args() - run_all_tests(debug=args.debug) + if args.generated_json_path: + try: + os.makedirs(args.generated_json_path) + except OSError as e: + if e.errno != errno.EEXIST: + raise + write_js_test_json(args.generated_json_path) + else: + run_all_tests(debug=args.debug) diff --git a/java/.gitattributes b/java/.gitattributes new file mode 100644 index 0000000000000..596615322fb3e --- /dev/null +++ b/java/.gitattributes @@ -0,0 +1,2 @@ +.gitattributes export-ignore +.gitignore export-ignore diff --git a/java/vector/src/main/java/org/apache/arrow/vector/NullableVector.java b/java/dev/checkstyle/checkstyle.license similarity index 83% rename from java/vector/src/main/java/org/apache/arrow/vector/NullableVector.java rename to java/dev/checkstyle/checkstyle.license index b2455e9e42b4b..c06c90cd287af 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/NullableVector.java +++ 
b/java/dev/checkstyle/checkstyle.license @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -15,12 +15,3 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -package org.apache.arrow.vector; - -public interface NullableVector extends ValueVector { - - BitVector getValidityVector(); - - ValueVector getValuesVector(); -} diff --git a/java/dev/checkstyle/checkstyle.xml b/java/dev/checkstyle/checkstyle.xml new file mode 100644 index 0000000000000..14dbede169f19 --- /dev/null +++ b/java/dev/checkstyle/checkstyle.xml @@ -0,0 +1,254 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/java/dev/checkstyle/suppressions.xml b/java/dev/checkstyle/suppressions.xml new file mode 100644 index 0000000000000..36697256d69d9 --- /dev/null +++ b/java/dev/checkstyle/suppressions.xml @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + diff --git a/java/format/pom.xml b/java/format/pom.xml index cd2b6c0de6710..3e6582d3fd639 100644 --- a/java/format/pom.xml +++ b/java/format/pom.xml @@ -15,7 +15,7 @@ arrow-java-root org.apache.arrow - 0.8.0-SNAPSHOT + 0.9.0-SNAPSHOT arrow-format diff --git a/java/memory/pom.xml b/java/memory/pom.xml index 7efc8e6aa470c..74ec45c1d2f97 100644 --- a/java/memory/pom.xml +++ b/java/memory/pom.xml @@ -14,7 +14,7 @@ org.apache.arrow arrow-java-root - 0.8.0-SNAPSHOT + 0.9.0-SNAPSHOT arrow-memory Arrow Memory diff --git a/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java 
b/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java index e2bbe35480b66..23f5d65fbb550 100644 --- a/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java +++ b/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java @@ -23,6 +23,7 @@ import java.io.OutputStream; import java.nio.ByteBuffer; import java.nio.ByteOrder; +import java.nio.channels.FileChannel; import java.nio.channels.GatheringByteChannel; import java.nio.channels.ScatteringByteChannel; import java.nio.charset.Charset; @@ -493,6 +494,16 @@ public ArrowBuf retain() { return retain(1); } + @Override + public ByteBuf touch() { + return this; + } + + @Override + public ByteBuf touch(Object hint) { + return this; + } + @Override public long getLong(int index) { chk(index, 8); @@ -505,6 +516,17 @@ public float getFloat(int index) { return Float.intBitsToFloat(getInt(index)); } + /** + * Gets a 64-bit long integer at the specified absolute {@code index} in + * this buffer in Big Endian Byte Order. + */ + @Override + public long getLongLE(int index) { + chk(index, 8); + final long v = PlatformDependent.getLong(addr(index)); + return Long.reverseBytes(v); + } + @Override public double getDouble(int index) { return Double.longBitsToDouble(getLong(index)); @@ -527,6 +549,17 @@ public int getInt(int index) { return v; } + /** + * Gets a 32-bit integer at the specified absolute {@code index} in + * this buffer in Big Endian Byte Order. 
+ */ + @Override + public int getIntLE(int index) { + chk(index, 4); + final int v = PlatformDependent.getInt(addr(index)); + return Integer.reverseBytes(v); + } + @Override public int getUnsignedShort(int index) { return getShort(index) & 0xFFFF; @@ -535,10 +568,44 @@ public int getUnsignedShort(int index) { @Override public short getShort(int index) { chk(index, 2); - short v = PlatformDependent.getShort(addr(index)); + final short v = PlatformDependent.getShort(addr(index)); return v; } + /** + * Gets a 16-bit short integer at the specified absolute {@code index} in + * this buffer in Big Endian Byte Order. + */ + @Override + public short getShortLE(int index) { + final short v = PlatformDependent.getShort(addr(index)); + return Short.reverseBytes(v); + } + + /** + * Gets an unsigned 24-bit medium integer at the specified absolute + * {@code index} in this buffer. + */ + @Override + public int getUnsignedMedium(int index) { + chk(index, 3); + final long addr = addr(index); + return (PlatformDependent.getByte(addr) & 0xff) << 16 | + (PlatformDependent.getShort(addr + 1) & 0xffff); + } + + /** + * Gets an unsigned 24-bit medium integer at the specified absolute {@code index} in + * this buffer in Big Endian Byte Order. + */ + @Override + public int getUnsignedMediumLE(int index) { + chk(index, 3); + final long addr = addr(index); + return (PlatformDependent.getByte(addr) & 0xff) | + (Short.reverseBytes(PlatformDependent.getShort(addr + 1)) & 0xffff) << 8; + } + @Override public ArrowBuf setShort(int index, int value) { chk(index, 2); @@ -546,6 +613,44 @@ public ArrowBuf setShort(int index, int value) { return this; } + /** + * Sets the specified 16-bit short integer at the specified absolute {@code index} + * in this buffer with Big Endian byte order. 
+ */ + @Override + public ByteBuf setShortLE(int index, int value) { + chk(index, 2); + PlatformDependent.putShort(addr(index), Short.reverseBytes((short) value)); + return this; + } + + /** + * Sets the specified 24-bit medium integer at the specified absolute + * {@code index} in this buffer. + */ + @Override + public ByteBuf setMedium(int index, int value) { + chk(index, 3); + final long addr = addr(index); + PlatformDependent.putByte(addr, (byte) (value >>> 16)); + PlatformDependent.putShort(addr + 1, (short) value); + return this; + } + + + /** + * Sets the specified 24-bit medium integer at the specified absolute {@code index} + * in this buffer with Big Endian byte order. + */ + @Override + public ByteBuf setMediumLE(int index, int value) { + chk(index, 3); + final long addr = addr(index); + PlatformDependent.putByte(addr, (byte) value); + PlatformDependent.putShort(addr + 1, Short.reverseBytes((short) (value >>> 8))); + return this; + } + @Override public ArrowBuf setInt(int index, int value) { chk(index, 4); @@ -553,6 +658,17 @@ public ArrowBuf setInt(int index, int value) { return this; } + /** + * Sets the specified 32-bit integer at the specified absolute {@code index} + * in this buffer with Big Endian byte order. + */ + @Override + public ByteBuf setIntLE(int index, int value) { + chk(index, 4); + PlatformDependent.putInt(addr(index), Integer.reverseBytes(value)); + return this; + } + @Override public ArrowBuf setLong(int index, long value) { chk(index, 8); @@ -560,6 +676,17 @@ public ArrowBuf setLong(int index, long value) { return this; } + /** + * Sets the specified 64-bit long integer at the specified absolute {@code index} + * in this buffer with Big Endian byte order. 
+ */ + @Override + public ByteBuf setLongLE(int index, long value) { + chk(index, 8); + PlatformDependent.putLong(addr(index), Long.reverseBytes(value)); + return this; + } + @Override public ArrowBuf setChar(int index, int value) { chk(index, 2); @@ -668,16 +795,46 @@ protected short _getShort(int index) { return getShort(index); } + /** @see {@link #getShortLE(int)} */ + @Override + protected short _getShortLE(int index) { + return getShortLE(index); + } + @Override protected int _getInt(int index) { return getInt(index); } + /** @see {@link #getIntLE(int)} */ + @Override + protected int _getIntLE(int index) { + return getIntLE(index); + } + + /** @see {@link #getUnsignedMedium(int)} */ + @Override + protected int _getUnsignedMedium(int index) { + return getUnsignedMedium(index); + } + + /** @see {@link #getUnsignedMediumLE(int)} */ + @Override + protected int _getUnsignedMediumLE(int index) { + return getUnsignedMediumLE(index); + } + @Override protected long _getLong(int index) { return getLong(index); } + /** @see {@link #getLongLE(int)} */ + @Override + protected long _getLongLE(int index) { + return getLongLE(index); + } + @Override protected void _setByte(int index, int value) { setByte(index, value); @@ -688,21 +845,45 @@ protected void _setShort(int index, int value) { setShort(index, value); } + /** @see {@link #setShortLE(int, int)} */ + @Override + protected void _setShortLE(int index, int value) { + setShortLE(index, value); + } + @Override protected void _setMedium(int index, int value) { setMedium(index, value); } + /** @see {@link #setMediumLE(int, int)} */ + @Override + protected void _setMediumLE(int index, int value) { + setMediumLE(index, value); + } + @Override protected void _setInt(int index, int value) { setInt(index, value); } + /** @see {@link #setIntLE(int, int)} */ + @Override + protected void _setIntLE(int index, int value) { + setIntLE(index, value); + } + @Override protected void _setLong(int index, long value) { setLong(index, 
value); } + /** @see {@link #setLongLE(int, long)} */ + @Override + public void _setLongLE(int index, long value) { + setLongLE(index, value); + } + @Override public ArrowBuf getBytes(int index, ByteBuf dst, int dstIndex, int length) { udle.getBytes(index + offset, dst, dstIndex, length); @@ -716,16 +897,13 @@ public ArrowBuf getBytes(int index, OutputStream out, int length) throws IOExcep } @Override - protected int _getUnsignedMedium(int index) { - final long addr = addr(index); - return (PlatformDependent.getByte(addr) & 0xff) << 16 | - (PlatformDependent.getByte(addr + 1) & 0xff) << 8 | - PlatformDependent.getByte(addr + 2) & 0xff; + public int getBytes(int index, GatheringByteChannel out, int length) throws IOException { + return udle.getBytes(index + offset, out, length); } @Override - public int getBytes(int index, GatheringByteChannel out, int length) throws IOException { - return udle.getBytes(index + offset, out, length); + public int getBytes(int index, FileChannel out, long position, int length) throws IOException { + return udle.getBytes(index + offset, out, position, length); } @Override @@ -776,6 +954,11 @@ public int setBytes(int index, ScatteringByteChannel in, int length) throws IOEx return udle.setBytes(index + offset, in, length); } + @Override + public int setBytes(int index, FileChannel in, long position, int length) throws IOException { + return udle.setBytes(index + offset, in, position, length); + } + @Override public byte getByte(int index) { chk(index, 1); diff --git a/java/memory/src/main/java/io/netty/buffer/MutableWrappedByteBuf.java b/java/memory/src/main/java/io/netty/buffer/MutableWrappedByteBuf.java index a5683adccbc32..f0bc84cdc2db2 100644 --- a/java/memory/src/main/java/io/netty/buffer/MutableWrappedByteBuf.java +++ b/java/memory/src/main/java/io/netty/buffer/MutableWrappedByteBuf.java @@ -23,9 +23,12 @@ import java.io.OutputStream; import java.nio.ByteBuffer; import java.nio.ByteOrder; +import java.nio.channels.FileChannel; 
import java.nio.channels.GatheringByteChannel; import java.nio.channels.ScatteringByteChannel; +import io.netty.util.ByteProcessor; + /** * This is basically a complete copy of DuplicatedByteBuf. We copy because we want to override * some behaviors and make @@ -128,6 +131,16 @@ protected short _getShort(int index) { return buffer.getShort(index); } + @Override + public short getShortLE(int index) { + return buffer.getShortLE(index); + } + + @Override + protected short _getShortLE(int index) { + return buffer.getShortLE(index); + } + @Override public int getUnsignedMedium(int index) { return _getUnsignedMedium(index); @@ -138,6 +151,16 @@ protected int _getUnsignedMedium(int index) { return buffer.getUnsignedMedium(index); } + @Override + public int getUnsignedMediumLE(int index) { + return buffer.getUnsignedMediumLE(index); + } + + @Override + protected int _getUnsignedMediumLE(int index) { + return buffer.getUnsignedMediumLE(index); + } + @Override public int getInt(int index) { return _getInt(index); @@ -148,6 +171,16 @@ protected int _getInt(int index) { return buffer.getInt(index); } + @Override + public int getIntLE(int index) { + return buffer.getIntLE(index); + } + + @Override + protected int _getIntLE(int index) { + return buffer.getIntLE(index); + } + @Override public long getLong(int index) { return _getLong(index); @@ -158,6 +191,16 @@ protected long _getLong(int index) { return buffer.getLong(index); } + @Override + public long getLongLE(int index) { + return buffer.getLongLE(index); + } + + @Override + protected long _getLongLE(int index) { + return buffer.getLongLE(index); + } + @Override public abstract ByteBuf copy(int index, int length); @@ -206,6 +249,17 @@ protected void _setShort(int index, int value) { buffer.setShort(index, value); } + @Override + public ByteBuf setShortLE(int index, int value) { + buffer.setShortLE(index, value); + return this; + } + + @Override + protected void _setShortLE(int index, int value) { + buffer.setShortLE(index, 
value); + } + @Override public ByteBuf setMedium(int index, int value) { _setMedium(index, value); @@ -217,6 +271,17 @@ protected void _setMedium(int index, int value) { buffer.setMedium(index, value); } + @Override + public ByteBuf setMediumLE(int index, int value) { + buffer.setMediumLE(index, value); + return this; + } + + @Override + protected void _setMediumLE(int index, int value) { + buffer.setMediumLE(index, value); + } + @Override public ByteBuf setInt(int index, int value) { _setInt(index, value); @@ -228,6 +293,17 @@ protected void _setInt(int index, int value) { buffer.setInt(index, value); } + @Override + public ByteBuf setIntLE(int index, int value) { + buffer.setIntLE(index, value); + return this; + } + + @Override + protected void _setIntLE(int index, int value) { + buffer.setIntLE(index, value); + } + @Override public ByteBuf setLong(int index, long value) { _setLong(index, value); @@ -239,6 +315,17 @@ protected void _setLong(int index, long value) { buffer.setLong(index, value); } + @Override + public ByteBuf setLongLE(int index, long value) { + buffer.setLongLE(index, value); + return this; + } + + @Override + protected void _setLongLE(int index, long value) { + buffer.setLongLE(index, value); + } + @Override public ByteBuf setBytes(int index, byte[] src, int srcIndex, int length) { buffer.setBytes(index, src, srcIndex, length); @@ -257,6 +344,12 @@ public ByteBuf setBytes(int index, ByteBuffer src) { return this; } + @Override + public int setBytes(int index, FileChannel in, long position, int length) + throws IOException { + return buffer.setBytes(index, in, position, length); + } + @Override public ByteBuf getBytes(int index, OutputStream out, int length) throws IOException { @@ -282,6 +375,13 @@ public int setBytes(int index, ScatteringByteChannel in, int length) return buffer.setBytes(index, in, length); } + + @Override + public int getBytes(int index, FileChannel out, long position, int length) + throws IOException { + return 
buffer.getBytes(index, out, position, length); + } + @Override public int nioBufferCount() { return buffer.nioBufferCount(); @@ -298,12 +398,12 @@ public ByteBuffer internalNioBuffer(int index, int length) { } @Override - public int forEachByte(int index, int length, ByteBufProcessor processor) { + public int forEachByte(int index, int length, ByteProcessor processor) { return buffer.forEachByte(index, length, processor); } @Override - public int forEachByteDesc(int index, int length, ByteBufProcessor processor) { + public int forEachByteDesc(int index, int length, ByteProcessor processor) { return buffer.forEachByteDesc(index, length, processor); } @@ -312,6 +412,18 @@ public final int refCnt() { return unwrap().refCnt(); } + @Override + public final ByteBuf touch() { + unwrap().touch(); + return this; + } + + @Override + public final ByteBuf touch(Object hint) { + unwrap().touch(hint); + return this; + } + @Override public final ByteBuf retain() { unwrap().retain(); diff --git a/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java index 14687b54634ac..419be3429721b 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java @@ -76,7 +76,7 @@ public class AllocationManager { private final UnsafeDirectLittleEndian underlying; // ARROW-1627 Trying to minimize memory overhead caused by previously used IdentityHashMap // see JIRA for details - private final LowCostIdentityHasMap map = new LowCostIdentityHasMap<>(); + private final LowCostIdentityHashMap map = new LowCostIdentityHashMap<>(); private final ReadWriteLock lock = new ReentrantReadWriteLock(); private final AutoCloseableLock readLock = new AutoCloseableLock(lock.readLock()); private final AutoCloseableLock writeLock = new AutoCloseableLock(lock.writeLock()); diff --git 
a/java/memory/src/main/java/org/apache/arrow/memory/ArrowByteBufAllocator.java b/java/memory/src/main/java/org/apache/arrow/memory/ArrowByteBufAllocator.java index b8b5283423c82..94102992139d8 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/ArrowByteBufAllocator.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/ArrowByteBufAllocator.java @@ -18,8 +18,8 @@ package org.apache.arrow.memory; +import io.netty.buffer.AbstractByteBufAllocator; import io.netty.buffer.ByteBuf; -import io.netty.buffer.ByteBufAllocator; import io.netty.buffer.CompositeByteBuf; import io.netty.buffer.ExpandableByteBuf; @@ -32,7 +32,7 @@ * otherwise non-expandable * ArrowBufs to be expandable. */ -public class ArrowByteBufAllocator implements ByteBufAllocator { +public class ArrowByteBufAllocator extends AbstractByteBufAllocator { private static final int DEFAULT_BUFFER_SIZE = 4096; private static final int DEFAULT_MAX_COMPOSITE_COMPONENTS = 16; @@ -142,8 +142,17 @@ public CompositeByteBuf compositeHeapBuffer(int maxNumComponents) { throw fail(); } + @Override + protected ByteBuf newHeapBuffer(int initialCapacity, int maxCapacity) { + throw fail(); + } + + @Override + protected ByteBuf newDirectBuffer(int initialCapacity, int maxCapacity) { + return buffer(initialCapacity, maxCapacity); + } + private RuntimeException fail() { throw new UnsupportedOperationException("Allocator doesn't support heap-based memory."); } - } diff --git a/java/memory/src/main/java/org/apache/arrow/memory/LowCostIdentityHasMap.java b/java/memory/src/main/java/org/apache/arrow/memory/LowCostIdentityHashMap.java similarity index 98% rename from java/memory/src/main/java/org/apache/arrow/memory/LowCostIdentityHasMap.java rename to java/memory/src/main/java/org/apache/arrow/memory/LowCostIdentityHashMap.java index 1153fb5936471..fb7033815d5f4 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/LowCostIdentityHasMap.java +++ 
b/java/memory/src/main/java/org/apache/arrow/memory/LowCostIdentityHashMap.java @@ -28,7 +28,7 @@ * that provides "getKey" method * @param */ -public class LowCostIdentityHasMap> { +public class LowCostIdentityHashMap> { /* * The internal data structure to hold values. @@ -52,7 +52,7 @@ public class LowCostIdentityHasMap> { /** * Creates an Map with default expected maximum size. */ - public LowCostIdentityHasMap() { + public LowCostIdentityHashMap() { this(DEFAULT_MIN_SIZE); } @@ -63,7 +63,7 @@ public LowCostIdentityHasMap() { * The estimated maximum number of entries that will be put in * this map. */ - public LowCostIdentityHasMap(int maxSize) { + public LowCostIdentityHashMap(int maxSize) { if (maxSize >= 0) { this.size = 0; threshold = getThreshold(maxSize); @@ -96,7 +96,7 @@ private int computeElementArraySize() { private Object[] newElementArray(int s) { return new Object[s]; } - + /** * Removes all elements from this map, leaving it empty. * @@ -331,4 +331,4 @@ public V getNextValue() { } return null; } -} \ No newline at end of file +} diff --git a/java/memory/src/test/java/org/apache/arrow/memory/TestLowCostIdentityHasMap.java b/java/memory/src/test/java/org/apache/arrow/memory/TestLowCostIdentityHashMap.java similarity index 95% rename from java/memory/src/test/java/org/apache/arrow/memory/TestLowCostIdentityHasMap.java rename to java/memory/src/test/java/org/apache/arrow/memory/TestLowCostIdentityHashMap.java index c1196147b4925..0237b38048f52 100644 --- a/java/memory/src/test/java/org/apache/arrow/memory/TestLowCostIdentityHasMap.java +++ b/java/memory/src/test/java/org/apache/arrow/memory/TestLowCostIdentityHashMap.java @@ -27,11 +27,11 @@ /** * To test simplified implementation of IdentityHashMap */ -public class TestLowCostIdentityHasMap { +public class TestLowCostIdentityHashMap { @Test public void testIdentityHashMap() throws Exception { - LowCostIdentityHasMap hashMap = new LowCostIdentityHasMap<>(); + LowCostIdentityHashMap hashMap = new 
LowCostIdentityHashMap<>(); StringWithKey obj1 = new StringWithKey("s1key", "s1value"); StringWithKey obj2 = new StringWithKey("s2key", "s2value"); @@ -88,7 +88,7 @@ public void testIdentityHashMap() throws Exception { @Test public void testLargeMap() throws Exception { - LowCostIdentityHasMap hashMap = new LowCostIdentityHasMap<>(); + LowCostIdentityHashMap hashMap = new LowCostIdentityHashMap<>(); String [] keys = new String[200]; for (int i = 0; i < 200; i++) { diff --git a/java/pom.xml b/java/pom.xml index 0a0f2e0ce8f65..152deaa9c428e 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -20,7 +20,7 @@ org.apache.arrow arrow-java-root - 0.8.0-SNAPSHOT + 0.9.0-SNAPSHOT pom Apache Arrow Java Root POM @@ -32,7 +32,7 @@ 4.11 1.7.25 18.0 - 4.0.49.Final + 4.1.17.Final 2.7.9 2.7.1 1.2.0-3f79e055 @@ -41,8 +41,8 @@ - scm:git:https://git-wip-us.apache.org/repos/asf/arrow.git - scm:git:https://git-wip-us.apache.org/repos/asf/arrow.git + scm:git:https://github.com/apache/arrow.git + scm:git:https://github.com/apache/arrow.git https://github.com/apache/arrow apache-arrow-0.7.1 @@ -304,7 +304,9 @@ - google_checks.xml + dev/checkstyle/checkstyle.xml + dev/checkstyle/checkstyle.license + dev/checkstyle/suppressions.xml UTF-8 true ${checkstyle.failOnViolation} diff --git a/java/tools/pom.xml b/java/tools/pom.xml index f3c86688134dc..246f264a6def8 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -14,7 +14,7 @@ org.apache.arrow arrow-java-root - 0.8.0-SNAPSHOT + 0.9.0-SNAPSHOT arrow-tools Arrow Tools diff --git a/java/tools/src/main/java/org/apache/arrow/tools/EchoServer.java b/java/tools/src/main/java/org/apache/arrow/tools/EchoServer.java index 3091bc4dab123..ce6b5164a8cbb 100644 --- a/java/tools/src/main/java/org/apache/arrow/tools/EchoServer.java +++ b/java/tools/src/main/java/org/apache/arrow/tools/EchoServer.java @@ -23,8 +23,8 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import 
org.apache.arrow.vector.VectorSchemaRoot; -import org.apache.arrow.vector.stream.ArrowStreamReader; -import org.apache.arrow.vector.stream.ArrowStreamWriter; +import org.apache.arrow.vector.ipc.ArrowStreamReader; +import org.apache.arrow.vector.ipc.ArrowStreamWriter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/java/tools/src/main/java/org/apache/arrow/tools/FileRoundtrip.java b/java/tools/src/main/java/org/apache/arrow/tools/FileRoundtrip.java index ab8fa6e45cecd..6e45305bf6cd4 100644 --- a/java/tools/src/main/java/org/apache/arrow/tools/FileRoundtrip.java +++ b/java/tools/src/main/java/org/apache/arrow/tools/FileRoundtrip.java @@ -22,8 +22,8 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.VectorSchemaRoot; -import org.apache.arrow.vector.file.ArrowFileReader; -import org.apache.arrow.vector.file.ArrowFileWriter; +import org.apache.arrow.vector.ipc.ArrowFileReader; +import org.apache.arrow.vector.ipc.ArrowFileWriter; import org.apache.arrow.vector.types.pojo.Schema; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; diff --git a/java/tools/src/main/java/org/apache/arrow/tools/FileToStream.java b/java/tools/src/main/java/org/apache/arrow/tools/FileToStream.java index 6722b30fa7f50..3db01f40c590e 100644 --- a/java/tools/src/main/java/org/apache/arrow/tools/FileToStream.java +++ b/java/tools/src/main/java/org/apache/arrow/tools/FileToStream.java @@ -21,8 +21,8 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.VectorSchemaRoot; -import org.apache.arrow.vector.file.ArrowFileReader; -import org.apache.arrow.vector.stream.ArrowStreamWriter; +import org.apache.arrow.vector.ipc.ArrowFileReader; +import org.apache.arrow.vector.ipc.ArrowStreamWriter; import java.io.File; import java.io.FileInputStream; diff --git 
a/java/tools/src/main/java/org/apache/arrow/tools/Integration.java b/java/tools/src/main/java/org/apache/arrow/tools/Integration.java index d2b35e65a8172..666f1ddeabcc9 100644 --- a/java/tools/src/main/java/org/apache/arrow/tools/Integration.java +++ b/java/tools/src/main/java/org/apache/arrow/tools/Integration.java @@ -22,11 +22,11 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.VectorSchemaRoot; -import org.apache.arrow.vector.file.ArrowBlock; -import org.apache.arrow.vector.file.ArrowFileReader; -import org.apache.arrow.vector.file.ArrowFileWriter; -import org.apache.arrow.vector.file.json.JsonFileReader; -import org.apache.arrow.vector.file.json.JsonFileWriter; +import org.apache.arrow.vector.ipc.message.ArrowBlock; +import org.apache.arrow.vector.ipc.ArrowFileReader; +import org.apache.arrow.vector.ipc.ArrowFileWriter; +import org.apache.arrow.vector.ipc.JsonFileReader; +import org.apache.arrow.vector.ipc.JsonFileWriter; import org.apache.arrow.vector.types.pojo.DictionaryEncoding; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; diff --git a/java/tools/src/main/java/org/apache/arrow/tools/StreamToFile.java b/java/tools/src/main/java/org/apache/arrow/tools/StreamToFile.java index ef1a11f6bfac8..42d336af9b040 100644 --- a/java/tools/src/main/java/org/apache/arrow/tools/StreamToFile.java +++ b/java/tools/src/main/java/org/apache/arrow/tools/StreamToFile.java @@ -21,8 +21,8 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.VectorSchemaRoot; -import org.apache.arrow.vector.file.ArrowFileWriter; -import org.apache.arrow.vector.stream.ArrowStreamReader; +import org.apache.arrow.vector.ipc.ArrowFileWriter; +import org.apache.arrow.vector.ipc.ArrowStreamReader; import java.io.File; import java.io.FileInputStream; diff --git 
a/java/tools/src/test/java/org/apache/arrow/tools/ArrowFileTestFixtures.java b/java/tools/src/test/java/org/apache/arrow/tools/ArrowFileTestFixtures.java index 6d9a6c1323e76..eac517d96bd48 100644 --- a/java/tools/src/test/java/org/apache/arrow/tools/ArrowFileTestFixtures.java +++ b/java/tools/src/test/java/org/apache/arrow/tools/ArrowFileTestFixtures.java @@ -28,9 +28,9 @@ import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; import org.apache.arrow.vector.complex.writer.BigIntWriter; import org.apache.arrow.vector.complex.writer.IntWriter; -import org.apache.arrow.vector.file.ArrowBlock; -import org.apache.arrow.vector.file.ArrowFileReader; -import org.apache.arrow.vector.file.ArrowFileWriter; +import org.apache.arrow.vector.ipc.message.ArrowBlock; +import org.apache.arrow.vector.ipc.ArrowFileReader; +import org.apache.arrow.vector.ipc.ArrowFileWriter; import org.apache.arrow.vector.types.pojo.Schema; import org.junit.Assert; @@ -78,8 +78,8 @@ static void validateOutput(File testOutFile, BufferAllocator allocator) throws E static void validateContent(int count, VectorSchemaRoot root) { Assert.assertEquals(count, root.getRowCount()); for (int i = 0; i < count; i++) { - Assert.assertEquals(i, root.getVector("int").getAccessor().getObject(i)); - Assert.assertEquals(Long.valueOf(i), root.getVector("bigInt").getAccessor().getObject(i)); + Assert.assertEquals(i, root.getVector("int").getObject(i)); + Assert.assertEquals(Long.valueOf(i), root.getVector("bigInt").getObject(i)); } } diff --git a/java/tools/src/test/java/org/apache/arrow/tools/EchoServerTest.java b/java/tools/src/test/java/org/apache/arrow/tools/EchoServerTest.java index ecac6d6e53067..47b5541d17e4e 100644 --- a/java/tools/src/test/java/org/apache/arrow/tools/EchoServerTest.java +++ b/java/tools/src/test/java/org/apache/arrow/tools/EchoServerTest.java @@ -35,17 +35,17 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import 
org.apache.arrow.vector.FieldVector; -import org.apache.arrow.vector.NullableIntVector; -import org.apache.arrow.vector.NullableTinyIntVector; -import org.apache.arrow.vector.NullableVarCharVector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.TinyIntVector; +import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.impl.UnionListWriter; import org.apache.arrow.vector.dictionary.Dictionary; import org.apache.arrow.vector.dictionary.DictionaryProvider; import org.apache.arrow.vector.dictionary.DictionaryProvider.MapDictionaryProvider; -import org.apache.arrow.vector.stream.ArrowStreamReader; -import org.apache.arrow.vector.stream.ArrowStreamWriter; +import org.apache.arrow.vector.ipc.ArrowStreamReader; +import org.apache.arrow.vector.ipc.ArrowStreamWriter; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.ArrowType.Int; @@ -92,7 +92,7 @@ public static void stopEchoServer() throws IOException, InterruptedException { private void testEchoServer(int serverPort, Field field, - NullableTinyIntVector vector, + TinyIntVector vector, int batches) throws UnknownHostException, IOException { VectorSchemaRoot root = new VectorSchemaRoot(asList(field), asList((FieldVector) vector), 0); @@ -104,10 +104,10 @@ private void testEchoServer(int serverPort, for (int i = 0; i < batches; i++) { vector.allocateNew(16); for (int j = 0; j < 8; j++) { - vector.getMutator().set(j, j + i); - vector.getMutator().set(j + 8, 0, (byte) (j + i)); + vector.set(j, j + i); + vector.set(j + 8, 0, (byte) (j + i)); } - vector.getMutator().setValueCount(16); + vector.setValueCount(16); root.setRowCount(16); writer.writeBatch(); } @@ -115,15 +115,15 @@ private void testEchoServer(int serverPort, assertEquals(new Schema(asList(field)), 
reader.getVectorSchemaRoot().getSchema()); - NullableTinyIntVector readVector = (NullableTinyIntVector) reader.getVectorSchemaRoot() + TinyIntVector readVector = (TinyIntVector) reader.getVectorSchemaRoot() .getFieldVectors().get(0); for (int i = 0; i < batches; i++) { Assert.assertTrue(reader.loadNextBatch()); assertEquals(16, reader.getVectorSchemaRoot().getRowCount()); - assertEquals(16, readVector.getAccessor().getValueCount()); + assertEquals(16, readVector.getValueCount()); for (int j = 0; j < 8; j++) { - assertEquals(j + i, readVector.getAccessor().get(j)); - assertTrue(readVector.getAccessor().isNull(j + 8)); + assertEquals(j + i, readVector.get(j)); + assertTrue(readVector.isNull(j + 8)); } } Assert.assertFalse(reader.loadNextBatch()); @@ -140,8 +140,8 @@ public void basicTest() throws InterruptedException, IOException { "testField", new FieldType(true, new ArrowType.Int(8, true), null, null), Collections.emptyList()); - NullableTinyIntVector vector = - new NullableTinyIntVector("testField", FieldType.nullable(TINYINT.getType()), alloc); + TinyIntVector vector = + new TinyIntVector("testField", FieldType.nullable(TINYINT.getType()), alloc); Schema schema = new Schema(asList(field)); // Try an empty stream, just the header. 
@@ -158,31 +158,29 @@ public void basicTest() throws InterruptedException, IOException { public void testFlatDictionary() throws IOException { DictionaryEncoding writeEncoding = new DictionaryEncoding(1L, false, null); try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); - NullableIntVector writeVector = - new NullableIntVector( + IntVector writeVector = + new IntVector( "varchar", new FieldType(true, MinorType.INT.getType(), writeEncoding, null), allocator); - NullableVarCharVector writeDictionaryVector = - new NullableVarCharVector( + VarCharVector writeDictionaryVector = + new VarCharVector( "dict", FieldType.nullable(VARCHAR.getType()), allocator)) { writeVector.allocateNewSafe(); - NullableIntVector.Mutator mutator = writeVector.getMutator(); - mutator.set(0, 0); - mutator.set(1, 1); - mutator.set(3, 2); - mutator.set(4, 1); - mutator.set(5, 2); - mutator.setValueCount(6); + writeVector.set(0, 0); + writeVector.set(1, 1); + writeVector.set(3, 2); + writeVector.set(4, 1); + writeVector.set(5, 2); + writeVector.setValueCount(6); writeDictionaryVector.allocateNewSafe(); - NullableVarCharVector.Mutator dictionaryMutator = writeDictionaryVector.getMutator(); - dictionaryMutator.set(0, "foo".getBytes(StandardCharsets.UTF_8)); - dictionaryMutator.set(1, "bar".getBytes(StandardCharsets.UTF_8)); - dictionaryMutator.set(2, "baz".getBytes(StandardCharsets.UTF_8)); - dictionaryMutator.setValueCount(3); + writeDictionaryVector.set(0, "foo".getBytes(StandardCharsets.UTF_8)); + writeDictionaryVector.set(1, "bar".getBytes(StandardCharsets.UTF_8)); + writeDictionaryVector.set(2, "baz".getBytes(StandardCharsets.UTF_8)); + writeDictionaryVector.setValueCount(3); List fields = ImmutableList.of(writeVector.getField()); List vectors = ImmutableList.of((FieldVector) writeVector); @@ -210,23 +208,21 @@ public void testFlatDictionary() throws IOException { Assert.assertNotNull(readEncoding); Assert.assertEquals(1L, readEncoding.getId()); - FieldVector.Accessor accessor 
= readVector.getAccessor(); - Assert.assertEquals(6, accessor.getValueCount()); - Assert.assertEquals(0, accessor.getObject(0)); - Assert.assertEquals(1, accessor.getObject(1)); - Assert.assertEquals(null, accessor.getObject(2)); - Assert.assertEquals(2, accessor.getObject(3)); - Assert.assertEquals(1, accessor.getObject(4)); - Assert.assertEquals(2, accessor.getObject(5)); + Assert.assertEquals(6, readVector.getValueCount()); + Assert.assertEquals(0, readVector.getObject(0)); + Assert.assertEquals(1, readVector.getObject(1)); + Assert.assertEquals(null, readVector.getObject(2)); + Assert.assertEquals(2, readVector.getObject(3)); + Assert.assertEquals(1, readVector.getObject(4)); + Assert.assertEquals(2, readVector.getObject(5)); Dictionary dictionary = reader.lookup(1L); Assert.assertNotNull(dictionary); - NullableVarCharVector.Accessor dictionaryAccessor = ((NullableVarCharVector) dictionary - .getVector()).getAccessor(); - Assert.assertEquals(3, dictionaryAccessor.getValueCount()); - Assert.assertEquals(new Text("foo"), dictionaryAccessor.getObject(0)); - Assert.assertEquals(new Text("bar"), dictionaryAccessor.getObject(1)); - Assert.assertEquals(new Text("baz"), dictionaryAccessor.getObject(2)); + VarCharVector dictionaryVector = ((VarCharVector) dictionary.getVector()); + Assert.assertEquals(3, dictionaryVector.getValueCount()); + Assert.assertEquals(new Text("foo"), dictionaryVector.getObject(0)); + Assert.assertEquals(new Text("bar"), dictionaryVector.getObject(1)); + Assert.assertEquals(new Text("baz"), dictionaryVector.getObject(2)); } } } @@ -235,17 +231,17 @@ public void testFlatDictionary() throws IOException { public void testNestedDictionary() throws IOException { DictionaryEncoding writeEncoding = new DictionaryEncoding(2L, false, null); try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); - NullableVarCharVector writeDictionaryVector = - new NullableVarCharVector("dictionary", FieldType.nullable(VARCHAR.getType()), allocator); + 
VarCharVector writeDictionaryVector = + new VarCharVector("dictionary", FieldType.nullable(VARCHAR.getType()), allocator); ListVector writeVector = ListVector.empty("list", allocator)) { // data being written: // [['foo', 'bar'], ['foo'], ['bar']] -> [[0, 1], [0], [1]] writeDictionaryVector.allocateNew(); - writeDictionaryVector.getMutator().set(0, "foo".getBytes(StandardCharsets.UTF_8)); - writeDictionaryVector.getMutator().set(1, "bar".getBytes(StandardCharsets.UTF_8)); - writeDictionaryVector.getMutator().setValueCount(2); + writeDictionaryVector.set(0, "foo".getBytes(StandardCharsets.UTF_8)); + writeDictionaryVector.set(1, "bar".getBytes(StandardCharsets.UTF_8)); + writeDictionaryVector.setValueCount(2); writeVector.addOrGetVector(new FieldType(true, MinorType.INT.getType(), writeEncoding, null)); writeVector.allocateNew(); @@ -297,19 +293,17 @@ public void testNestedDictionary() throws IOException { Assert.assertEquals(2L, encoding.getId()); Assert.assertEquals(new Int(32, true), encoding.getIndexType()); - ListVector.Accessor accessor = readVector.getAccessor(); - Assert.assertEquals(3, accessor.getValueCount()); - Assert.assertEquals(Arrays.asList(0, 1), accessor.getObject(0)); - Assert.assertEquals(Arrays.asList(0), accessor.getObject(1)); - Assert.assertEquals(Arrays.asList(1), accessor.getObject(2)); + Assert.assertEquals(3, readVector.getValueCount()); + Assert.assertEquals(Arrays.asList(0, 1), readVector.getObject(0)); + Assert.assertEquals(Arrays.asList(0), readVector.getObject(1)); + Assert.assertEquals(Arrays.asList(1), readVector.getObject(2)); Dictionary readDictionary = reader.lookup(2L); Assert.assertNotNull(readDictionary); - NullableVarCharVector.Accessor dictionaryAccessor = ((NullableVarCharVector) - readDictionary.getVector()).getAccessor(); - Assert.assertEquals(2, dictionaryAccessor.getValueCount()); - Assert.assertEquals(new Text("foo"), dictionaryAccessor.getObject(0)); - Assert.assertEquals(new Text("bar"), 
dictionaryAccessor.getObject(1)); + VarCharVector dictionaryVector = ((VarCharVector) readDictionary.getVector()); + Assert.assertEquals(2, dictionaryVector.getValueCount()); + Assert.assertEquals(new Text("foo"), dictionaryVector.getObject(0)); + Assert.assertEquals(new Text("bar"), dictionaryVector.getObject(1)); } } } diff --git a/java/vector/pom.xml b/java/vector/pom.xml index 46e06aa1e3f97..0f3e03e6902da 100644 --- a/java/vector/pom.xml +++ b/java/vector/pom.xml @@ -14,7 +14,7 @@ org.apache.arrow arrow-java-root - 0.8.0-SNAPSHOT + 0.9.0-SNAPSHOT arrow-vector Arrow Vectors diff --git a/java/vector/src/main/codegen/templates/AbstractFieldWriter.java b/java/vector/src/main/codegen/templates/AbstractFieldWriter.java index 853f67fd0dd56..fce6876025a91 100644 --- a/java/vector/src/main/codegen/templates/AbstractFieldWriter.java +++ b/java/vector/src/main/codegen/templates/AbstractFieldWriter.java @@ -67,6 +67,10 @@ public void write(${name}Holder holder) { public void write${minor.class}(${friendlyType} value) { fail("${name}"); } + + public void writeBigEndianBytesToDecimal(byte[] value) { + fail("${name}"); + } diff --git a/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java b/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java index 228c2c531f98f..7f4a13d4f06e8 100644 --- a/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java +++ b/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java @@ -16,6 +16,8 @@ * limitations under the License. 
*/ +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.vector.types.Types; import org.apache.drill.common.types.TypeProtos.MinorType; <@pp.dropOutputFile /> @@ -82,6 +84,12 @@ public void write(${name}Holder holder) { getWriter(MinorType.${name?upper_case}).write${minor.class}(<#list fields as field>${field.name}<#if field_has_next>, ); } + <#if minor.class == "Decimal"> + public void writeBigEndianBytesToDecimal(byte[] value) { + getWriter(Types.MinorType.DECIMAL).writeBigEndianBytesToDecimal(value); + } + + public void writeNull() { } diff --git a/java/vector/src/main/codegen/templates/ComplexReaders.java b/java/vector/src/main/codegen/templates/ComplexReaders.java index 38cd1bfdeb3c5..4863ecdb63284 100644 --- a/java/vector/src/main/codegen/templates/ComplexReaders.java +++ b/java/vector/src/main/codegen/templates/ComplexReaders.java @@ -53,9 +53,9 @@ @SuppressWarnings("unused") public class ${name}ReaderImpl extends AbstractFieldReader { - private final ${nullMode}${name}Vector vector; + private final ${name}Vector vector; - public ${name}ReaderImpl(${nullMode}${name}Vector vector){ + public ${name}ReaderImpl(${name}Vector vector){ super(); this.vector = vector; } @@ -69,11 +69,7 @@ public Field getField(){ } public boolean isSet(){ - <#if nullMode == "Nullable"> - return !vector.getAccessor().isNull(idx()); - <#else> - return true; - + return !vector.isNull(idx()); } public void copyAsValue(${minor.class?cap_first}Writer writer){ @@ -88,16 +84,16 @@ public void copyAsField(String name, MapWriter writer){ <#if nullMode != "Nullable"> public void read(${minor.class?cap_first}Holder h){ - vector.getAccessor().get(idx(), h); + vector.get(idx(), h); } public void read(Nullable${minor.class?cap_first}Holder h){ - vector.getAccessor().get(idx(), h); + vector.get(idx(), h); } public ${friendlyType} read${safeType}(){ - return vector.getAccessor().getObject(idx()); + return vector.getObject(idx()); } <#if minor.class == "TimeStampSec" || @@ -106,7 +102,7 @@ 
public void read(Nullable${minor.class?cap_first}Holder h){ minor.class == "TimeStampNano"> @Override public ${minor.boxedType} read${minor.boxedType}(){ - return vector.getAccessor().get(idx()); + return vector.get(idx()); } @@ -115,7 +111,7 @@ public void copyValue(FieldWriter w){ } public Object readObject(){ - return vector.getAccessor().getObject(idx()); + return (Object)vector.getObject(idx()); } } diff --git a/java/vector/src/main/codegen/templates/ComplexWriters.java b/java/vector/src/main/codegen/templates/ComplexWriters.java index fe099bede3568..24994d72ac1d8 100644 --- a/java/vector/src/main/codegen/templates/ComplexWriters.java +++ b/java/vector/src/main/codegen/templates/ComplexWriters.java @@ -39,11 +39,9 @@ @SuppressWarnings("unused") public class ${eName}WriterImpl extends AbstractFieldWriter { - private final Nullable${name}Vector.Mutator mutator; - final Nullable${name}Vector vector; + final ${name}Vector vector; - public ${eName}WriterImpl(Nullable${name}Vector vector) { - this.mutator = vector.getMutator(); + public ${eName}WriterImpl(${name}Vector vector) { this.vector = vector; } @@ -81,17 +79,17 @@ protected int idx() { public void write(${minor.class?cap_first}Holder h) { mutator.addSafe(idx(), h); - vector.getMutator().setValueCount(idx()+1); + vector.setValueCount(idx()+1); } - public void write(Nullable${minor.class?cap_first}Holder h) { + public void write(${minor.class?cap_first}Holder h) { mutator.addSafe(idx(), h); - vector.getMutator().setValueCount(idx()+1); + vector.setValueCount(idx()+1); } public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ) { mutator.addSafe(idx(), <#list fields as field>${field.name}<#if field_has_next>, ); - vector.getMutator().setValueCount(idx()+1); + vector.setValueCount(idx()+1); } public void setPosition(int idx) { @@ -103,34 +101,37 @@ public void setPosition(int idx) { <#else> public void write(${minor.class}Holder h) { - mutator.setSafe(idx(), h); - 
vector.getMutator().setValueCount(idx()+1); + vector.setSafe(idx(), h); + vector.setValueCount(idx()+1); } public void write(Nullable${minor.class}Holder h) { - mutator.setSafe(idx(), h); - vector.getMutator().setValueCount(idx()+1); + vector.setSafe(idx(), h); + vector.setValueCount(idx()+1); } public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ) { - mutator.setSafe(idx()<#if mode == "Nullable">, 1<#list fields as field><#if field.include!true >, ${field.name}); - vector.getMutator().setValueCount(idx()+1); + vector.setSafe(idx(), 1<#list fields as field><#if field.include!true >, ${field.name}); + vector.setValueCount(idx()+1); } - <#if minor.class == "Decimal"> + <#if minor.class == "Decimal"> public void write${minor.class}(${friendlyType} value) { - mutator.setSafe(idx(), value); - vector.getMutator().setValueCount(idx()+1); + vector.setSafe(idx(), value); + vector.setValueCount(idx()+1); + } + + public void writeBigEndianBytesToDecimal(byte[] value) { + vector.setBigEndianSafe(idx(), value); + vector.setValueCount(idx()+1); } - <#if mode == "Nullable"> public void writeNull() { - mutator.setNull(idx()); - vector.getMutator().setValueCount(idx()+1); + vector.setNull(idx()); + vector.setValueCount(idx()+1); } - } <@pp.changeOutputFile name="/org/apache/arrow/vector/complex/writer/${eName}Writer.java" /> @@ -150,6 +151,8 @@ public interface ${eName}Writer extends BaseWriter { <#if minor.class == "Decimal"> public void write${minor.class}(${friendlyType} value); + + public void writeBigEndianBytesToDecimal(byte[] value); } diff --git a/java/vector/src/main/codegen/templates/FixedValueVectors.java b/java/vector/src/main/codegen/templates/FixedValueVectors.java deleted file mode 100644 index e07416ba984d4..0000000000000 --- a/java/vector/src/main/codegen/templates/FixedValueVectors.java +++ /dev/null @@ -1,765 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor 
license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.arrow.vector.util.DecimalUtility; - -import java.lang.Override; -import java.util.concurrent.TimeUnit; - -<@pp.dropOutputFile /> -<#list vv.types as type> -<#list type.minor as minor> -<#assign friendlyType = (minor.friendlyType!minor.boxedType!type.boxedType) /> -<#assign className = "${minor.class}Vector" /> - -<#if type.major == "Fixed"> -<@pp.changeOutputFile name="/org/apache/arrow/vector/${className}.java" /> -<#include "/@includes/license.ftl" /> - -package org.apache.arrow.vector; - -<#include "/@includes/vv_imports.ftl" /> - -/** - * ${minor.class} implements a vector of fixed width values. Elements in the vector are accessed - * by position, starting from the logical start of the vector. Values should be pushed onto the - * vector sequentially, but may be randomly accessed. - * The width of each element is ${type.width} byte(s) - * The equivalent Java primitive is '${minor.javaType!type.javaType}' - * - * NB: this class is automatically generated from ${.template_name} and ValueVectorTypes.tdd using FreeMarker. 
- */ -public final class ${className} extends BaseDataValueVector implements FixedWidthVector{ - private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(${className}.class); - - public static final int TYPE_WIDTH = ${type.width}; - - private final Accessor accessor = new Accessor(); - private final Mutator mutator = new Mutator(); - - private int allocationSizeInBytes = INITIAL_VALUE_ALLOCATION * ${type.width}; - private int allocationMonitor = 0; - <#if minor.typeParams??> - - <#assign typeParams = minor.typeParams?reverse /> - <#list typeParams as typeParam> - private final ${typeParam.type} ${typeParam.name}; - - - public ${className}(String name, BufferAllocator allocator<#list typeParams as typeParam>, ${typeParam.type} ${typeParam.name}) { - super(name, allocator); - <#list typeParams as typeParam> - this.${typeParam.name} = ${typeParam.name}; - - } - <#else> - public ${className}(String name, BufferAllocator allocator) { - super(name, allocator); - } - - - @Override - public MinorType getMinorType() { - return MinorType.${minor.class?upper_case}; - } - - @Override - public Field getField() { - throw new UnsupportedOperationException("internal vector"); - } - - @Override - public FieldReader getReader(){ - throw new UnsupportedOperationException("non-nullable vectors cannot be used in readers"); - } - - @Override - public int getBufferSizeFor(final int valueCount) { - if (valueCount == 0) { - return 0; - } - return valueCount * ${type.width}; - } - - @Override - public ArrowBuf getValidityBuffer() { - /* this operation is not supported for non-nullable vectors */ - throw new UnsupportedOperationException(); - } - - @Override - public ArrowBuf getDataBuffer() { - /* we are not throwing away getBuffer() of BaseDataValueVector so use it wherever applicable */ - return getBuffer(); - } - - @Override - public ArrowBuf getOffsetBuffer() { - /* this operation is not supported for fixed-width vectors */ - throw new 
UnsupportedOperationException(); - } - - @Override - public int getValueCapacity(){ - return (int) (data.capacity() *1.0 / ${type.width}); - } - - @Override - public Accessor getAccessor(){ - return accessor; - } - - @Override - public Mutator getMutator(){ - return mutator; - } - - int getAllocationSize() { - return allocationSizeInBytes; - } - - @Override - public void setInitialCapacity(final int valueCount) { - final long size = 1L * valueCount * ${type.width}; - if (size > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Requested amount of memory is more than max allowed allocation size"); - } - allocationSizeInBytes = (int)size; - } - - @Override - public void allocateNew() { - if(!allocateNewSafe()){ - throw new OutOfMemoryException("Failure while allocating buffer."); - } - } - - @Override - public boolean allocateNewSafe() { - long curAllocationSize = allocationSizeInBytes; - if (allocationMonitor > 10) { - curAllocationSize = Math.max(8, curAllocationSize / 2); - allocationMonitor = 0; - } else if (allocationMonitor < -2) { - curAllocationSize = allocationSizeInBytes * 2L; - allocationMonitor = 0; - } - - try{ - allocateBytes(curAllocationSize); - } catch (RuntimeException ex) { - return false; - } - return true; - } - - /** - * Allocate a new buffer that supports setting at least the provided number of values. May actually be sized bigger - * depending on underlying buffer rounding size. Must be called prior to using the ValueVector. - * - * Note that the maximum number of values a vector can allocate is Integer.MAX_VALUE / value width. 
- * - * @param valueCount the number of values to allocate for - * @throws org.apache.arrow.memory.OutOfMemoryException if it can't allocate the new buffer - */ - @Override - public void allocateNew(final int valueCount) { - allocateBytes(valueCount * ${type.width}); - } - - @Override - public void reset() { - allocationSizeInBytes = INITIAL_VALUE_ALLOCATION * ${type.width}; - allocationMonitor = 0; - zeroVector(); - super.reset(); - } - - private void allocateBytes(final long size) { - if (size > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Requested amount of memory is more than max allowed allocation size"); - } - - final int curSize = (int)size; - clear(); - data = allocator.buffer(curSize); - data.readerIndex(0); - allocationSizeInBytes = curSize; - } - - /** - * Allocate new buffer with double capacity, and copy data into the new buffer. Replace vector's buffer with new buffer, and release old one - * - * @throws org.apache.arrow.memory.OutOfMemoryException if it can't allocate the new buffer - */ - public void reAlloc() { - long baseSize = allocationSizeInBytes; - final int currentBufferCapacity = data.capacity(); - if (baseSize < (long)currentBufferCapacity) { - baseSize = (long)currentBufferCapacity; - } - long newAllocationSize = baseSize * 2L; - newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); - - if (newAllocationSize > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Unable to expand the buffer. Max allowed buffer size is reached."); - } - - logger.debug("Reallocating vector [{}]. 
# of bytes: [{}] -> [{}]", name, allocationSizeInBytes, newAllocationSize); - final ArrowBuf newBuf = allocator.buffer((int)newAllocationSize); - newBuf.setBytes(0, data, 0, currentBufferCapacity); - final int halfNewCapacity = newBuf.capacity() / 2; - newBuf.setZero(halfNewCapacity, halfNewCapacity); - newBuf.writerIndex(data.writerIndex()); - data.release(1); - data = newBuf; - allocationSizeInBytes = (int)newAllocationSize; - } - - /** - * {@inheritDoc} - */ - @Override - public void zeroVector() { - data.setZero(0, data.capacity()); - } - - public TransferPair getTransferPair(BufferAllocator allocator){ - return new TransferImpl(name, allocator); - } - - @Override - public TransferPair getTransferPair(String ref, BufferAllocator allocator){ - return new TransferImpl(ref, allocator); - } - - @Override - public TransferPair makeTransferPair(ValueVector to) { - return new TransferImpl((${className}) to); - } - - public void transferTo(${className} target){ - target.clear(); - target.data = data.transferOwnership(target.allocator).buffer; - target.data.writerIndex(data.writerIndex()); - clear(); - } - - public void splitAndTransferTo(int startIndex, int length, ${className} target) { - final int startPoint = startIndex * ${type.width}; - final int sliceLength = length * ${type.width}; - target.clear(); - target.data = data.slice(startPoint, sliceLength).transferOwnership(target.allocator).buffer; - target.data.writerIndex(sliceLength); - } - - private class TransferImpl implements TransferPair{ - private ${className} to; - - public TransferImpl(String name, BufferAllocator allocator){ - to = new ${className}(name, allocator<#if minor.typeParams??><#list typeParams as typeParam>, ${className}.this.${typeParam.name}); - } - - public TransferImpl(${className} to) { - this.to = to; - } - - @Override - public ${className} getTo(){ - return to; - } - - @Override - public void transfer(){ - transferTo(to); - } - - @Override - public void splitAndTransfer(int startIndex, 
int length) { - splitAndTransferTo(startIndex, length, to); - } - - @Override - public void copyValueSafe(int fromIndex, int toIndex) { - to.copyFromSafe(fromIndex, toIndex, ${className}.this); - } - } - - public void copyFrom(int fromIndex, int thisIndex, ${className} from){ - <#if (type.width > 8 || minor.class == "IntervalDay")> - from.data.getBytes(fromIndex * ${type.width}, data, thisIndex * ${type.width}, ${type.width}); - <#else> <#-- type.width <= 8 --> - data.set${(minor.javaType!type.javaType)?cap_first}(thisIndex * ${type.width}, - from.data.get${(minor.javaType!type.javaType)?cap_first}(fromIndex * ${type.width}) - ); - <#-- type.width --> - } - - public void copyFromSafe(int fromIndex, int thisIndex, ${className} from){ - while(thisIndex >= getValueCapacity()) { - reAlloc(); - } - copyFrom(fromIndex, thisIndex, from); - } - - public void decrementAllocationMonitor() { - if (allocationMonitor > 0) { - allocationMonitor = 0; - } - --allocationMonitor; - } - - private void incrementAllocationMonitor() { - ++allocationMonitor; - } - - public final class Accessor extends BaseDataValueVector.BaseAccessor { - @Override - public int getValueCount() { - return data.writerIndex() / ${type.width}; - } - - @Override - public boolean isNull(int index){ - return false; - } - - <#if (type.width > 8 || minor.class == "IntervalDay")> - public ${minor.javaType!type.javaType} get(int index) { - return data.slice(index * ${type.width}, ${type.width}); - } - - <#if (minor.class == "IntervalDay")> - public void get(int index, ${minor.class}Holder holder){ - final int offsetIndex = index * ${type.width}; - holder.days = data.getInt(offsetIndex); - holder.milliseconds = data.getInt(offsetIndex + ${minor.millisecondsOffset}); - } - - public void get(int index, Nullable${minor.class}Holder holder){ - final int offsetIndex = index * ${type.width}; - holder.isSet = 1; - holder.days = data.getInt(offsetIndex); - holder.milliseconds = data.getInt(offsetIndex + 
${minor.millisecondsOffset}); - } - - @Override - public ${friendlyType} getObject(int index) { - final int offsetIndex = index * ${type.width}; - final int millis = data.getInt(offsetIndex + ${minor.millisecondsOffset}); - final int days = data.getInt(offsetIndex); - final Period p = new Period(); - return p.plusDays(days).plusMillis(millis); - } - - public StringBuilder getAsStringBuilder(int index) { - final int offsetIndex = index * ${type.width}; - - int millis = data.getInt(offsetIndex + ${minor.millisecondsOffset}); - final int days = data.getInt(offsetIndex); - - final int hours = millis / (org.apache.arrow.vector.util.DateUtility.hoursToMillis); - millis = millis % (org.apache.arrow.vector.util.DateUtility.hoursToMillis); - - final int minutes = millis / (org.apache.arrow.vector.util.DateUtility.minutesToMillis); - millis = millis % (org.apache.arrow.vector.util.DateUtility.minutesToMillis); - - final int seconds = millis / (org.apache.arrow.vector.util.DateUtility.secondsToMillis); - millis = millis % (org.apache.arrow.vector.util.DateUtility.secondsToMillis); - - final String dayString = (Math.abs(days) == 1) ? " day " : " days "; - - return(new StringBuilder(). - append(days).append(dayString). - append(hours).append(":"). - append(minutes).append(":"). - append(seconds).append("."). 
- append(millis)); - } - - <#elseif minor.class == "Decimal"> - public void get(int index, ${minor.class}Holder holder) { - holder.start = index * ${type.width}; - holder.buffer = data; - holder.scale = scale; - holder.precision = precision; - } - - public void get(int index, Nullable${minor.class}Holder holder) { - holder.isSet = 1; - holder.start = index * ${type.width}; - holder.buffer = data; - holder.scale = scale; - holder.precision = precision; - } - - @Override - public ${friendlyType} getObject(int index) { - return DecimalUtility.getBigDecimalFromArrowBuf(data, index, scale); - } - - <#else> - public void get(int index, ${minor.class}Holder holder){ - holder.buffer = data; - holder.start = index * ${type.width}; - } - - public void get(int index, Nullable${minor.class}Holder holder){ - holder.isSet = 1; - holder.buffer = data; - holder.start = index * ${type.width}; - } - - @Override - public ${friendlyType} getObject(int index) { - return data.slice(index * ${type.width}, ${type.width}) - } - - - <#else> <#-- type.width <= 8 --> - - public ${minor.javaType!type.javaType} get(int index) { - return data.get${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}); - } - <#if type.width == 4> - public long getTwoAsLong(int index) { - return data.getLong(index * ${type.width}); - } - - - <#if minor.class == "DateDay" || - minor.class == "TimeSec" || - minor.class == "TimeMicro" || - minor.class == "TimeNano"> - @Override - public ${friendlyType} getObject(int index) { - return get(index); - } - - <#elseif minor.class == "DateMilli" || minor.class == "TimeMilli" || minor.class == "TimeStampMilli"> - @Override - public ${friendlyType} getObject(int index) { - org.joda.time.LocalDateTime ldt = new org.joda.time.LocalDateTime(get(index), org.joda.time.DateTimeZone.UTC); - return ldt; - } - - <#elseif minor.class == "TimeStampSec"> - @Override - public ${friendlyType} getObject(int index) { - long secs = 
java.util.concurrent.TimeUnit.SECONDS.toMillis(get(index)); - org.joda.time.LocalDateTime date = new org.joda.time.LocalDateTime(secs, org.joda.time.DateTimeZone.UTC); - return date; - } - - <#elseif minor.class == "TimeStampMicro"> - @Override - public ${friendlyType} getObject(int index) { - // value is truncated when converting microseconds to milliseconds in order to use DateTime type - long micros = java.util.concurrent.TimeUnit.MICROSECONDS.toMillis(get(index)); - org.joda.time.LocalDateTime date = new org.joda.time.LocalDateTime(micros, org.joda.time.DateTimeZone.UTC); - return date; - } - - <#elseif minor.class == "TimeStampNano"> - @Override - public ${friendlyType} getObject(int index) { - // value is truncated when converting nanoseconds to milliseconds in order to use DateTime type - long millis = java.util.concurrent.TimeUnit.NANOSECONDS.toMillis(get(index)); - org.joda.time.LocalDateTime date = new org.joda.time.LocalDateTime(millis, org.joda.time.DateTimeZone.UTC); - return date; - } - - <#elseif minor.class == "IntervalYear"> - @Override - public ${friendlyType} getObject(int index) { - - final int value = get(index); - - final int years = (value / org.apache.arrow.vector.util.DateUtility.yearsToMonths); - final int months = (value % org.apache.arrow.vector.util.DateUtility.yearsToMonths); - final Period p = new Period(); - return p.plusYears(years).plusMonths(months); - } - - public StringBuilder getAsStringBuilder(int index) { - int months = data.getInt(index); - - final int years = (months / org.apache.arrow.vector.util.DateUtility.yearsToMonths); - months = (months % org.apache.arrow.vector.util.DateUtility.yearsToMonths); - - final String yearString = (Math.abs(years) == 1) ? " year " : " years "; - final String monthString = (Math.abs(months) == 1) ? " month " : " months "; - - return(new StringBuilder(). - append(years).append(yearString). 
- append(months).append(monthString)); - } - - <#else> - @Override - public ${friendlyType} getObject(int index) { - return get(index); - } - public ${minor.javaType!type.javaType} getPrimitiveObject(int index) { - return get(index); - } - - - public void get(int index, ${minor.class}Holder holder){ - holder.value = data.get${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}); - } - - public void get(int index, Nullable${minor.class}Holder holder){ - holder.isSet = 1; - holder.value = data.get${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}); - } - - <#-- type.width --> - } - - /** - * ${minor.class}.Mutator implements a mutable vector of fixed width values. Elements in the - * vector are accessed by position from the logical start of the vector. Values should be pushed - * onto the vector sequentially, but may be randomly accessed. - * The width of each element is ${type.width} byte(s) - * The equivalent Java primitive is '${minor.javaType!type.javaType}' - * - * NB: this class is automatically generated from FixedValueVectorTypes.tdd using FreeMarker. - */ - public final class Mutator extends BaseDataValueVector.BaseMutator { - - private Mutator(){}; - - /** - * Set the element at the given index to the given value. Note that widths smaller than - * 32 bits are handled by the ArrowBuf interface. 
- * - * @param index position of the bit to set - * @param value value to set - */ - <#if (type.width > 8) || minor.class == "IntervalDay"> - public void set(int index, <#if (type.width > 4)>${minor.javaType!type.javaType}<#else>int value) { - data.setBytes(index * ${type.width}, value, 0, ${type.width}); - } - - public void setSafe(int index, <#if (type.width > 4)>${minor.javaType!type.javaType}<#else>int value) { - while(index >= getValueCapacity()) { - reAlloc(); - } - data.setBytes(index * ${type.width}, value, 0, ${type.width}); - } - - <#if (minor.class == "IntervalDay")> - public void set(int index, int days, int milliseconds){ - final int offsetIndex = index * ${type.width}; - data.setInt(offsetIndex, days); - data.setInt((offsetIndex + ${minor.millisecondsOffset}), milliseconds); - } - - protected void set(int index, ${minor.class}Holder holder){ - set(index, holder.days, holder.milliseconds); - } - - protected void set(int index, Nullable${minor.class}Holder holder){ - set(index, holder.days, holder.milliseconds); - } - - public void setSafe(int index, int days, int milliseconds){ - while(index >= getValueCapacity()) { - reAlloc(); - } - set(index, days, milliseconds); - } - - public void setSafe(int index, ${minor.class}Holder holder){ - setSafe(index, holder.days, holder.milliseconds); - } - - public void setSafe(int index, Nullable${minor.class}Holder holder){ - setSafe(index, holder.days, holder.milliseconds); - } - - <#elseif minor.class == "Decimal"> - public void set(int index, ${minor.class}Holder holder){ - set(index, holder.start, holder.buffer); - } - - void set(int index, Nullable${minor.class}Holder holder){ - set(index, holder.start, holder.buffer); - } - - public void setSafe(int index, Nullable${minor.class}Holder holder){ - setSafe(index, holder.start, holder.buffer); - } - public void setSafe(int index, ${minor.class}Holder holder){ - setSafe(index, holder.start, holder.buffer); - } - - public void setSafe(int index, int start, ArrowBuf 
buffer){ - while(index >= getValueCapacity()) { - reAlloc(); - } - set(index, start, buffer); - } - - public void set(int index, int start, ArrowBuf buffer){ - data.setBytes(index * ${type.width}, buffer, start, ${type.width}); - } - - public void set(int index, ${friendlyType} value){ - DecimalUtility.checkPrecisionAndScale(value, precision, scale); - DecimalUtility.writeBigDecimalToArrowBuf(value, data, index); - } - - public void setSafe(int index, ${friendlyType} value){ - while(index >= getValueCapacity()) { - reAlloc(); - } - set(index, value); - } - - <#else> - protected void set(int index, ${minor.class}Holder holder){ - set(index, holder.start, holder.buffer); - } - - public void set(int index, Nullable${minor.class}Holder holder){ - set(index, holder.start, holder.buffer); - } - - public void set(int index, int start, ArrowBuf buffer){ - data.setBytes(index * ${type.width}, buffer, start, ${type.width}); - } - - public void setSafe(int index, ${minor.class}Holder holder){ - setSafe(index, holder.start, holder.buffer); - } - - public void setSafe(int index, Nullable${minor.class}Holder holder){ - setSafe(index, holder.start, holder.buffer); - } - - public void setSafe(int index, int start, ArrowBuf buffer){ - while(index >= getValueCapacity()) { - reAlloc(); - } - set(index, holder); - } - - public void set(int index, Nullable${minor.class}Holder holder){ - data.setBytes(index * ${type.width}, holder.buffer, holder.start, ${type.width}); - } - - - @Override - public void generateTestData(int count) { - setValueCount(count); - boolean even = true; - final int valueCount = getAccessor().getValueCount(); - for(int i = 0; i < valueCount; i++, even = !even) { - final byte b = even ? 
Byte.MIN_VALUE : Byte.MAX_VALUE; - for(int w = 0; w < ${type.width}; w++){ - data.setByte(i + w, b); - } - } - } - - <#else> <#-- type.width <= 8 --> - public void set(int index, <#if (type.width >= 4)>${minor.javaType!type.javaType}<#else>int value) { - data.set${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}, value); - } - - public void setSafe(int index, <#if (type.width >= 4)>${minor.javaType!type.javaType}<#else>int value) { - while(index >= getValueCapacity()) { - reAlloc(); - } - set(index, value); - } - - protected void set(int index, ${minor.class}Holder holder){ - data.set${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}, holder.value); - } - - public void setSafe(int index, ${minor.class}Holder holder){ - while(index >= getValueCapacity()) { - reAlloc(); - } - set(index, holder); - } - - protected void set(int index, Nullable${minor.class}Holder holder){ - data.set${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}, holder.value); - } - - public void setSafe(int index, Nullable${minor.class}Holder holder){ - while(index >= getValueCapacity()) { - reAlloc(); - } - set(index, holder); - } - - @Override - public void generateTestData(int size) { - setValueCount(size); - boolean even = true; - final int valueCount = getAccessor().getValueCount(); - for(int i = 0; i < valueCount; i++, even = !even) { - if(even){ - set(i, ${minor.boxedType!type.boxedType}.MIN_VALUE); - }else{ - set(i, ${minor.boxedType!type.boxedType}.MAX_VALUE); - } - } - } - - public void generateTestDataAlt(int size) { - setValueCount(size); - boolean even = true; - final int valueCount = getAccessor().getValueCount(); - for(int i = 0; i < valueCount; i++, even = !even) { - if(even){ - set(i, (${(minor.javaType!type.javaType)}) 1); - }else{ - set(i, (${(minor.javaType!type.javaType)}) 0); - } - } - } - - <#-- type.width --> - - @Override - public void setValueCount(int valueCount) { - final int currentValueCapacity = getValueCapacity(); - final 
int idx = (${type.width} * valueCount); - while(valueCount > getValueCapacity()) { - reAlloc(); - } - if (valueCount > 0 && currentValueCapacity > valueCount * 2) { - incrementAllocationMonitor(); - } else if (allocationMonitor > 0) { - allocationMonitor = 0; - } - VectorTrimmer.trim(data, idx); - data.writerIndex(valueCount * ${type.width}); - } - } -} - - <#-- type.major --> - - diff --git a/java/vector/src/main/codegen/templates/MapWriters.java b/java/vector/src/main/codegen/templates/MapWriters.java index b89f91457e8b2..ac59e59b30888 100644 --- a/java/vector/src/main/codegen/templates/MapWriters.java +++ b/java/vector/src/main/codegen/templates/MapWriters.java @@ -184,7 +184,7 @@ public ListWriter list(String name) { } public void setValueCount(int count) { - container.getMutator().setValueCount(count); + container.setValueCount(count); } @Override @@ -199,7 +199,7 @@ public void setPosition(int index) { public void start() { <#if mode == "Single"> <#else> - container.getMutator().setIndexDefined(idx()); + container.setIndexDefined(idx()); } @@ -214,7 +214,6 @@ public void end() { <#assign upperName = minor.class?upper_case /> <#assign capName = minor.class?cap_first /> <#assign vectName = capName /> - <#assign vectName = "Nullable${capName}" /> <#if minor.typeParams?? > @Override diff --git a/java/vector/src/main/codegen/templates/NullableValueVectors.java b/java/vector/src/main/codegen/templates/NullableValueVectors.java deleted file mode 100644 index 122cd23648286..0000000000000 --- a/java/vector/src/main/codegen/templates/NullableValueVectors.java +++ /dev/null @@ -1,776 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -<@pp.dropOutputFile /> -<#list vv.types as type> -<#list type.minor as minor> - -<#assign className = "Nullable${minor.class}Vector" /> -<#assign valuesName = "${minor.class}Vector" /> -<#assign friendlyType = (minor.friendlyType!minor.boxedType!type.boxedType) /> - -<@pp.changeOutputFile name="/org/apache/arrow/vector/${className}.java" /> - -<#include "/@includes/license.ftl" /> - -package org.apache.arrow.vector; - -import org.apache.arrow.vector.schema.ArrowFieldNode; -import java.util.Collections; - -<#include "/@includes/vv_imports.ftl" /> - -import org.apache.arrow.flatbuf.Precision; - -/** - * ${className} implements a vector of values which could be null. Elements in the vector - * are first checked against a fixed length vector of boolean values. Then the element is retrieved - * from the base class (if not null). - * - * NB: this class is automatically generated from ${.template_name} and ValueVectorTypes.tdd using FreeMarker. 
- */ -@SuppressWarnings("unused") -public final class ${className} extends BaseValueVector implements <#if type.major == "VarLen">VariableWidth<#else>FixedWidthVector, NullableVector, FieldVector { - private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(${className}.class); - -protected final static byte[] emptyByteArray = new byte[]{}; - private final FieldReader reader = new ${minor.class}ReaderImpl(${className}.this); - - private final String bitsField = "$bits$"; - private final String valuesField = "$values$"; - private final Field field; - - final BitVector bits = new BitVector(bitsField, allocator); - final ${valuesName} values; - - private final Mutator mutator; - private final Accessor accessor; - - private final List innerVectors; - - <#if minor.typeParams??> - <#assign typeParams = minor.typeParams?reverse> - <#list typeParams as typeParam> - private final ${typeParam.type} ${typeParam.name}; - - - /** - * Assumes the type is nullable and not dictionary encoded - * @param name name of the field - * @param allocator allocator to use to resize the vector<#list typeParams as typeParam> - * @param ${typeParam.name} type parameter ${typeParam.name} - */ - public ${className}(String name, BufferAllocator allocator<#list typeParams as typeParam>, ${typeParam.type} ${typeParam.name}) { - <#if minor.arrowTypeConstructorParams??> - <#assign constructorParams = minor.arrowTypeConstructorParams /> - <#else> - <#assign constructorParams = [] /> - <#list typeParams as typeParam> - <#assign constructorParams = constructorParams + [ typeParam.name ] /> - - - this(name, FieldType.nullable(new ${minor.arrowType}(${constructorParams?join(", ")})), allocator); - } - <#else> - public ${className}(String name, BufferAllocator allocator) { - this(name, FieldType.nullable(org.apache.arrow.vector.types.Types.MinorType.${minor.class?upper_case}.getType()), allocator); - } - - - public ${className}(String name, FieldType fieldType, BufferAllocator 
allocator) { - super(name, allocator); - <#if minor.typeParams??> - <#assign typeParams = minor.typeParams?reverse> - ${minor.arrowType} arrowType = (${minor.arrowType})fieldType.getType(); - <#list typeParams as typeParam> - this.${typeParam.name} = arrowType.get${typeParam.name?cap_first}(); - - this.values = new ${valuesName}(valuesField, allocator<#list typeParams as typeParam>, ${typeParam.name}); - <#else> - this.values = new ${valuesName}(valuesField, allocator); - - this.mutator = new Mutator(); - this.accessor = new Accessor(); - this.field = new Field(name, fieldType, null); - innerVectors = Collections.unmodifiableList(Arrays.asList( - bits, - <#if type.major = "VarLen"> - values.offsetVector, - - values - )); - } - - @Override - public BitVector getValidityVector() { - return bits; - } - - @Override - public List getFieldInnerVectors() { - return innerVectors; - } - - @Override - public void initializeChildrenFromFields(List children) { - if (!children.isEmpty()) { - throw new IllegalArgumentException("primitive type vector ${className} can not have children: " + children); - } - } - - @Override - public List getChildrenFromFields() { - return Collections.emptyList(); - } - - @Override - public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { - <#if type.major = "VarLen"> - // variable width values: truncate offset vector buffer to size (#1) - org.apache.arrow.vector.BaseDataValueVector.truncateBufferBasedOnSize(ownBuffers, 1, - values.offsetVector.getBufferSizeFor( - fieldNode.getLength() == 0? 
0 : fieldNode.getLength() + 1)); - mutator.lastSet = fieldNode.getLength() - 1; - <#else> - // fixed width values truncate value vector to size (#1) - org.apache.arrow.vector.BaseDataValueVector.truncateBufferBasedOnSize(ownBuffers, 1, values.getBufferSizeFor(fieldNode.getLength())); - - org.apache.arrow.vector.BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers); - bits.valueCount = fieldNode.getLength(); - } - - public List getFieldBuffers() { - return org.apache.arrow.vector.BaseDataValueVector.unload(getFieldInnerVectors()); - } - - @Override - public Field getField() { - return field; - } - - @Override - public MinorType getMinorType() { - return MinorType.${minor.class?upper_case}; - } - - @Override - public FieldReader getReader(){ - return reader; - } - - @Override - public int getValueCapacity(){ - return Math.min(bits.getValueCapacity(), values.getValueCapacity()); - } - - @Override - public ArrowBuf[] getBuffers(boolean clear) { - final ArrowBuf[] buffers = ObjectArrays.concat(bits.getBuffers(false), values.getBuffers(false), ArrowBuf.class); - if (clear) { - for (final ArrowBuf buffer:buffers) { - buffer.retain(1); - } - clear(); - } - return buffers; - } - - @Override - public void close() { - bits.close(); - values.close(); - super.close(); - } - - @Override - public void clear() { - bits.clear(); - values.clear(); - super.clear(); - } - - @Override - public int getBufferSize(){ - return values.getBufferSize() + bits.getBufferSize(); - } - - @Override - public int getBufferSizeFor(final int valueCount) { - if (valueCount == 0) { - return 0; - } - - return values.getBufferSizeFor(valueCount) - + bits.getBufferSizeFor(valueCount); - } - - public ArrowBuf getBuffer() { - return values.getDataBuffer(); - } - - @Override - public ${valuesName} getValuesVector() { - return values; - } - - @Override - public void setInitialCapacity(int numRecords) { - bits.setInitialCapacity(numRecords); - values.setInitialCapacity(numRecords); - } - - 
@Override - public void allocateNew() { - if(!allocateNewSafe()){ - throw new OutOfMemoryException("Failure while allocating buffer."); - } - } - - @Override - public boolean allocateNewSafe() { - /* Boolean to keep track if all the memory allocations were successful - * Used in the case of composite vectors when we need to allocate multiple - * buffers for multiple vectors. If one of the allocations failed we need to - * clear all the memory that we allocated - */ - boolean success = false; - try { - success = values.allocateNewSafe() && bits.allocateNewSafe(); - } finally { - if (!success) { - clear(); - } - } - bits.zeroVector(); - mutator.reset(); - accessor.reset(); - return success; - } - - @Override - public void reAlloc() { - bits.reAlloc(); - values.reAlloc(); - } - - public void reset() { - bits.zeroVector(); - mutator.reset(); - accessor.reset(); - } - - <#if type.major == "VarLen"> - @Override - public void allocateNew(int totalBytes, int valueCount) { - try { - values.allocateNew(totalBytes, valueCount); - bits.allocateNew(valueCount); - } catch(RuntimeException e) { - clear(); - throw e; - } - bits.zeroVector(); - mutator.reset(); - accessor.reset(); - } - - @Override - public int getByteCapacity(){ - return values.getByteCapacity(); - } - - @Override - public int getCurrentSizeInBytes(){ - return values.getCurrentSizeInBytes(); - } - - <#else> - @Override - public void allocateNew(int valueCount) { - try { - values.allocateNew(valueCount); - bits.allocateNew(valueCount); - } catch(OutOfMemoryException e) { - clear(); - throw e; - } - bits.zeroVector(); - mutator.reset(); - accessor.reset(); - } - - /** - * {@inheritDoc} - */ - @Override - public void zeroVector() { - bits.zeroVector(); - values.zeroVector(); - } - - - @Override - public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { - return getTransferPair(ref, allocator); - } - - @Override - public TransferPair getTransferPair(BufferAllocator allocator){ - 
return new TransferImpl(name, allocator); - - } - - @Override - public TransferPair getTransferPair(String ref, BufferAllocator allocator){ - return new TransferImpl(ref, allocator); - } - - @Override - public TransferPair makeTransferPair(ValueVector to) { - return new TransferImpl((${className}) to); - } - - public void transferTo(${className} target){ - bits.transferTo(target.bits); - values.transferTo(target.values); - <#if type.major == "VarLen"> - target.mutator.lastSet = mutator.lastSet; - - clear(); - } - - public void splitAndTransferTo(int startIndex, int length, ${className} target) { - bits.splitAndTransferTo(startIndex, length, target.bits); - values.splitAndTransferTo(startIndex, length, target.values); - <#if type.major == "VarLen"> - target.mutator.lastSet = length - 1; - - } - - private class TransferImpl implements TransferPair { - ${className} to; - - public TransferImpl(String ref, BufferAllocator allocator){ - to = new ${className}(ref, field.getFieldType(), allocator); - } - - public TransferImpl(${className} to){ - this.to = to; - } - - @Override - public ${className} getTo(){ - return to; - } - - @Override - public void transfer(){ - transferTo(to); - } - - @Override - public void splitAndTransfer(int startIndex, int length) { - splitAndTransferTo(startIndex, length, to); - } - - @Override - public void copyValueSafe(int fromIndex, int toIndex) { - to.copyFromSafe(fromIndex, toIndex, ${className}.this); - } - } - - @Override - public Accessor getAccessor(){ - return accessor; - } - - @Override - public Mutator getMutator(){ - return mutator; - } - - public void copyFrom(int fromIndex, int thisIndex, ${className} from){ - final Accessor fromAccessor = from.getAccessor(); - if (!fromAccessor.isNull(fromIndex)) { - mutator.set(thisIndex, fromAccessor.get(fromIndex)); - } - <#if type.major == "VarLen">mutator.lastSet = thisIndex; - } - - public void copyFromSafe(int fromIndex, int thisIndex, ${valuesName} from){ - <#if type.major == "VarLen"> - 
mutator.fillEmpties(thisIndex); - - values.copyFromSafe(fromIndex, thisIndex, from); - bits.getMutator().setSafeToOne(thisIndex); - <#if type.major == "VarLen">mutator.lastSet = thisIndex; - } - - public void copyFromSafe(int fromIndex, int thisIndex, ${className} from){ - <#if type.major == "VarLen"> - mutator.fillEmpties(thisIndex); - - bits.copyFromSafe(fromIndex, thisIndex, from.bits); - values.copyFromSafe(fromIndex, thisIndex, from.values); - <#if type.major == "VarLen">mutator.lastSet = thisIndex; - } - - @Override - public long getValidityBufferAddress() { - /* address of the databuffer associated with the bitVector */ - return (bits.getDataBuffer().memoryAddress()); - } - - @Override - public long getDataBufferAddress() { - /* address of the dataBuffer associated with the valueVector */ - return (values.getDataBuffer().memoryAddress()); - } - - @Override - public long getOffsetBufferAddress() { - /* address of the dataBuffer associated with the offsetVector - * this operation is not supported for fixed-width vector types. 
- */ - <#if type.major != "VarLen"> - throw new UnsupportedOperationException(); - <#else> - return (values.getOffsetAddr()); - - } - - @Override - public ArrowBuf getValidityBuffer() { - /* dataBuffer associated with the bitVector */ - return (bits.getDataBuffer()); - } - - @Override - public ArrowBuf getDataBuffer() { - /* dataBuffer associated with the valueVector */ - return (values.getDataBuffer()); - } - - @Override - public ArrowBuf getOffsetBuffer() { - /* dataBuffer associated with the offsetVector of the valueVector */ - <#if type.major != "VarLen"> - throw new UnsupportedOperationException(); - <#else> - return (values.getOffsetBuffer()); - - } - - public final class Accessor extends BaseDataValueVector.BaseAccessor <#if type.major = "VarLen">implements VariableWidthVector.VariableWidthAccessor { - final BitVector.Accessor bAccessor = bits.getAccessor(); - final ${valuesName}.Accessor vAccessor = values.getAccessor(); - - /** - * Get the element at the specified position. - * - * @param index position of the value - * @return value of the element, if not null - */ - public <#if type.major == "VarLen">byte[]<#else>${minor.javaType!type.javaType} get(int index) { - if (isNull(index)) { - throw new IllegalStateException("Can't get a null value"); - } - return vAccessor.get(index); - } - - @Override - public boolean isNull(int index) { - return isSet(index) == 0; - } - - public int isSet(int index){ - return bAccessor.get(index); - } - - <#if type.major == "VarLen"> - public long getStartEnd(int index){ - return vAccessor.getStartEnd(index); - } - - @Override - public int getValueLength(int index) { - return values.getAccessor().getValueLength(index); - } - - - public void get(int index, Nullable${minor.class}Holder holder){ - vAccessor.get(index, holder); - holder.isSet = bAccessor.get(index); - } - - @Override - public ${friendlyType} getObject(int index) { - if (isNull(index)) { - return null; - }else{ - return vAccessor.getObject(index); - } - } - - <#if 
minor.class == "IntervalYear" || minor.class == "IntervalDay"> - public StringBuilder getAsStringBuilder(int index) { - if (isNull(index)) { - return null; - }else{ - return vAccessor.getAsStringBuilder(index); - } - } - - - @Override - public int getValueCount(){ - return bits.getAccessor().getValueCount(); - } - - public void reset(){} - } - - public final class Mutator extends BaseDataValueVector.BaseMutator implements NullableVectorDefinitionSetter<#if type.major = "VarLen">, VariableWidthVector.VariableWidthMutator { - private int setCount; - <#if type.major = "VarLen"> private int lastSet = -1; - - private Mutator(){ - } - - public ${valuesName} getVectorWithValues(){ - return values; - } - - @Override - public void setIndexDefined(int index){ - bits.getMutator().setToOne(index); - } - - /** - * Set the variable length element at the specified index to the supplied byte array. - * - * @param index position of the bit to set - * @param value array of bytes (or int if smaller than 4 bytes) to write - */ - public void set(int index, <#if type.major == "VarLen">byte[]<#elseif (type.width < 4)>int<#else>${minor.javaType!type.javaType} value) { - setCount++; - final ${valuesName}.Mutator valuesMutator = values.getMutator(); - final BitVector.Mutator bitsMutator = bits.getMutator(); - <#if type.major == "VarLen"> - for (int i = lastSet + 1; i < index; i++) { - valuesMutator.set(i, emptyByteArray); - } - - bitsMutator.setToOne(index); - valuesMutator.set(index, value); - <#if type.major == "VarLen">lastSet = index; - } - - <#if type.major == "VarLen"> - - public void fillEmpties(int index){ - final ${valuesName}.Mutator valuesMutator = values.getMutator(); - for (int i = lastSet + 1; i < index; i++) { - valuesMutator.setSafe(i, emptyByteArray); - } - while(index > bits.getValueCapacity()) { - bits.reAlloc(); - } - lastSet = index - 1; - } - - @Override - public void setValueLengthSafe(int index, int length) { - values.getMutator().setValueLengthSafe(index, length); - 
lastSet = index; - } - - - public void setSafe(int index, byte[] value, int start, int length) { - <#if type.major != "VarLen"> - throw new UnsupportedOperationException(); - <#else> - fillEmpties(index); - - bits.getMutator().setSafeToOne(index); - values.getMutator().setSafe(index, value, start, length); - setCount++; - <#if type.major == "VarLen">lastSet = index; - - } - - public void setSafe(int index, ByteBuffer value, int start, int length) { - <#if type.major != "VarLen"> - throw new UnsupportedOperationException(); - <#else> - fillEmpties(index); - - bits.getMutator().setSafeToOne(index); - values.getMutator().setSafe(index, value, start, length); - setCount++; - <#if type.major == "VarLen">lastSet = index; - - } - - public void setNull(int index){ - bits.getMutator().setSafe(index, 0); - } - - public void setSkipNull(int index, ${minor.class}Holder holder){ - values.getMutator().set(index, holder); - } - - public void setSkipNull(int index, Nullable${minor.class}Holder holder){ - values.getMutator().set(index, holder); - } - - public void set(int index, Nullable${minor.class}Holder holder){ - final ${valuesName}.Mutator valuesMutator = values.getMutator(); - <#if type.major == "VarLen"> - for (int i = lastSet + 1; i < index; i++) { - valuesMutator.set(i, emptyByteArray); - } - - bits.getMutator().set(index, holder.isSet); - valuesMutator.set(index, holder); - <#if type.major == "VarLen">lastSet = index; - } - - public void set(int index, ${minor.class}Holder holder){ - final ${valuesName}.Mutator valuesMutator = values.getMutator(); - <#if type.major == "VarLen"> - for (int i = lastSet + 1; i < index; i++) { - valuesMutator.set(i, emptyByteArray); - } - - bits.getMutator().setToOne(index); - valuesMutator.set(index, holder); - <#if type.major == "VarLen">lastSet = index; - } - - public boolean isSafe(int outIndex) { - return outIndex < ${className}.this.getValueCapacity(); - } - - <#assign fields = minor.fields!type.fields /> - public void set(int index, 
int isSet<#list fields as field>, ${field.type} ${field.name}Field ){ - final ${valuesName}.Mutator valuesMutator = values.getMutator(); - <#if type.major == "VarLen"> - for (int i = lastSet + 1; i < index; i++) { - valuesMutator.set(i, emptyByteArray); - } - - bits.getMutator().set(index, isSet); - valuesMutator.set(index<#list fields as field><#if field.include!true >, ${field.name}Field); - <#if type.major == "VarLen">lastSet = index; - } - - public void setSafe(int index, int isSet<#list fields as field><#if field.include!true >, ${field.type} ${field.name}Field ) { - <#if type.major == "VarLen"> - fillEmpties(index); - - bits.getMutator().setSafe(index, isSet); - values.getMutator().setSafe(index<#list fields as field><#if field.include!true >, ${field.name}Field); - setCount++; - <#if type.major == "VarLen">lastSet = index; - } - - - public void setSafe(int index, Nullable${minor.class}Holder value) { - <#if type.major == "VarLen"> - fillEmpties(index); - - bits.getMutator().setSafe(index, value.isSet); - values.getMutator().setSafe(index, value); - setCount++; - <#if type.major == "VarLen">lastSet = index; - } - - public void setSafe(int index, ${minor.class}Holder value) { - <#if type.major == "VarLen"> - fillEmpties(index); - - bits.getMutator().setSafeToOne(index); - values.getMutator().setSafe(index, value); - setCount++; - <#if type.major == "VarLen">lastSet = index; - } - - <#if !(type.major == "VarLen" || minor.class == "IntervalDay")> - public void setSafe(int index, ${minor.javaType!type.javaType} value) { - bits.getMutator().setSafeToOne(index); - values.getMutator().setSafe(index, value); - setCount++; - } - - - <#if minor.class == "Decimal"> - public void set(int index, ${friendlyType} value) { - bits.getMutator().setToOne(index); - values.getMutator().set(index, value); - } - - public void setSafe(int index, ${friendlyType} value) { - bits.getMutator().setSafeToOne(index); - values.getMutator().setSafe(index, value); - setCount++; - } - - - 
@Override - public void setValueCount(int valueCount) { - assert valueCount >= 0; - <#if type.major == "VarLen"> - fillEmpties(valueCount); - - values.getMutator().setValueCount(valueCount); - bits.getMutator().setValueCount(valueCount); - } - - @Override - public void generateTestData(int valueCount){ - bits.getMutator().generateTestDataAlt(valueCount); - values.getMutator().generateTestData(valueCount); - <#if type.major = "VarLen">lastSet = valueCount; - setValueCount(valueCount); - } - - @Override - public void reset(){ - setCount = 0; - <#if type.major = "VarLen">lastSet = -1; - } - - public void setLastSet(int value) { - <#if type.major = "VarLen"> - lastSet = value; - <#else> - throw new UnsupportedOperationException(); - - } - - public int getLastSet() { - <#if type.major != "VarLen"> - throw new UnsupportedOperationException(); - <#else> - return lastSet; - - } - } -} - - diff --git a/java/vector/src/main/codegen/templates/UnionListWriter.java b/java/vector/src/main/codegen/templates/UnionListWriter.java index 9fe41d0d96335..8ac23fe46f7d2 100644 --- a/java/vector/src/main/codegen/templates/UnionListWriter.java +++ b/java/vector/src/main/codegen/templates/UnionListWriter.java @@ -36,11 +36,11 @@ public class UnionListWriter extends AbstractFieldWriter { private ListVector vector; - private UInt4Vector offsets; private PromotableWriter writer; private boolean inMap = false; private String mapName; private int lastIndex = 0; + private static final int OFFSET_WIDTH = 4; public UnionListWriter(ListVector vector) { this(vector, NullableMapWriterFactory.getNullableMapWriterFactoryInstance()); @@ -49,7 +49,6 @@ public UnionListWriter(ListVector vector) { public UnionListWriter(ListVector vector, NullableMapWriterFactory nullableMapWriterFactory) { this.vector = vector; this.writer = new PromotableWriter(vector.getDataVector(), vector, nullableMapWriterFactory); - this.offsets = vector.getOffsetVector(); } public UnionListWriter(ListVector vector, 
AbstractFieldWriter parent) { @@ -72,7 +71,7 @@ public Field getField() { } public void setValueCount(int count) { - vector.getMutator().setValueCount(count); + vector.setValueCount(count); } @Override @@ -133,13 +132,13 @@ public MapWriter map(String name) { @Override public void startList() { - vector.getMutator().startNewValue(idx()); - writer.setPosition(offsets.getAccessor().get(idx() + 1)); + vector.startNewValue(idx()); + writer.setPosition(vector.getOffsetBuffer().getInt((idx() + 1) * OFFSET_WIDTH)); } @Override public void endList() { - offsets.getMutator().set(idx() + 1, writer.idx()); + vector.getOffsetBuffer().setInt((idx() + 1) * OFFSET_WIDTH, writer.idx()); setPosition(idx() + 1); } diff --git a/java/vector/src/main/codegen/templates/UnionReader.java b/java/vector/src/main/codegen/templates/UnionReader.java index fd3a766fc2cb4..98bb7c1f53d84 100644 --- a/java/vector/src/main/codegen/templates/UnionReader.java +++ b/java/vector/src/main/codegen/templates/UnionReader.java @@ -60,7 +60,7 @@ public Field getField() { } public boolean isSet(){ - return !data.getAccessor().isNull(idx()); + return !data.isNull(idx()); } public void read(UnionHolder holder) { diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index fe24a8674bdc5..aa8178a92f80a 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -25,13 +25,16 @@ <#include "/@includes/vv_imports.ftl" /> import com.google.common.collect.ImmutableList; +import io.netty.buffer.ArrowBuf; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; -import org.apache.arrow.vector.BaseDataValueVector; import org.apache.arrow.vector.complex.impl.ComplexCopier; import org.apache.arrow.vector.util.CallBack; -import org.apache.arrow.vector.schema.ArrowFieldNode; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import 
org.apache.arrow.memory.BaseAllocator; +import org.apache.arrow.vector.BaseValueVector; +import org.apache.arrow.vector.util.OversizedAllocationException; import static org.apache.arrow.vector.types.UnionMode.Sparse; @@ -44,8 +47,8 @@ /** - * A vector which can hold values of different types. It does so by using a MapVector which contains a vector for each - * primitive type that is stored. MapVector is used in order to take advantage of its serialization/deserialization methods, + * A vector which can hold values of different types. It does so by using a NullableMapVector which contains a vector for each + * primitive type that is stored. NullableMapVector is used in order to take advantage of its serialization/deserialization methods, * as well as the addOrGet method. * * For performance reasons, UnionVector stores a cached reference to each subtype vector, to avoid having to do the map lookup @@ -56,12 +59,10 @@ public class UnionVector implements FieldVector { private String name; private BufferAllocator allocator; - private Accessor accessor = new Accessor(); - private Mutator mutator = new Mutator(); int valueCount; MapVector internalMap; - UInt1Vector typeVector; + protected ArrowBuf typeBuffer; private NullableMapVector mapVector; private ListVector listVector; @@ -71,16 +72,17 @@ public class UnionVector implements FieldVector { private int singleType = 0; private ValueVector singleVector; + private static final byte TYPE_WIDTH = 1; private final CallBack callBack; - private final List innerVectors; + private int typeBufferAllocationSizeInBytes; public UnionVector(String name, BufferAllocator allocator, CallBack callBack) { this.name = name; this.allocator = allocator; this.internalMap = new MapVector("internal", allocator, new FieldType(false, ArrowType.Struct.INSTANCE, null, null), callBack); - this.typeVector = new UInt1Vector("types", allocator); + this.typeBuffer = allocator.getEmpty(); this.callBack = callBack; - this.innerVectors = 
Collections.unmodifiableList(Arrays.asList(typeVector)); + this.typeBufferAllocationSizeInBytes = BaseValueVector.INITIAL_VALUE_ALLOCATION * TYPE_WIDTH; } public BufferAllocator getAllocator() { @@ -104,20 +106,35 @@ public List getChildrenFromFields() { @Override public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { - // truncate types vector buffer to size (#0) - org.apache.arrow.vector.BaseDataValueVector.truncateBufferBasedOnSize(ownBuffers, 0, typeVector.getBufferSizeFor(fieldNode.getLength())); - BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers); + if (ownBuffers.size() != 1) { + throw new IllegalArgumentException("Illegal buffer count, expected " + 1 + ", got: " + ownBuffers.size()); + } + + ArrowBuf buffer = ownBuffers.get(0); + typeBuffer.release(); + typeBuffer = buffer.retain(allocator); + typeBufferAllocationSizeInBytes = typeBuffer.capacity(); this.valueCount = fieldNode.getLength(); } @Override public List getFieldBuffers() { - return BaseDataValueVector.unload(getFieldInnerVectors()); + List result = new ArrayList<>(1); + setReaderAndWriterIndex(); + result.add(typeBuffer); + + return result; + } + + private void setReaderAndWriterIndex() { + typeBuffer.readerIndex(0); + typeBuffer.writerIndex(valueCount * TYPE_WIDTH); } @Override + @Deprecated public List getFieldInnerVectors() { - return this.innerVectors; + throw new UnsupportedOperationException("There are no inner vectors. 
Use geFieldBuffers"); } private String fieldName(MinorType type) { @@ -134,7 +151,7 @@ private T addOrGet(MinorType minorType, Class c) { @Override public long getValidityBufferAddress() { - return typeVector.getDataBuffer().memoryAddress(); + return typeBuffer.memoryAddress(); } @Override @@ -148,7 +165,7 @@ public long getOffsetBufferAddress() { } @Override - public ArrowBuf getValidityBuffer() { return typeVector.getDataBuffer(); } + public ArrowBuf getValidityBuffer() { return typeBuffer; } @Override public ArrowBuf getDataBuffer() { throw new UnsupportedOperationException(); } @@ -177,12 +194,12 @@ public NullableMapVector getMap() { <#assign lowerCaseName = name?lower_case/> <#if !minor.typeParams?? > - private Nullable${name}Vector ${uncappedName}Vector; + private ${name}Vector ${uncappedName}Vector; - public Nullable${name}Vector get${name}Vector() { + public ${name}Vector get${name}Vector() { if (${uncappedName}Vector == null) { int vectorCount = internalMap.size(); - ${uncappedName}Vector = addOrGet(MinorType.${name?upper_case}, Nullable${name}Vector.class); + ${uncappedName}Vector = addOrGet(MinorType.${name?upper_case}, ${name}Vector.class); if (internalMap.size() > vectorCount) { ${uncappedName}Vector.allocateNew(); if (callBack != null) { @@ -211,47 +228,80 @@ public ListVector getList() { } public int getTypeValue(int index) { - return typeVector.getAccessor().get(index); - } - - public UInt1Vector getTypeVector() { - return typeVector; + return typeBuffer.getByte(index * TYPE_WIDTH); } @Override public void allocateNew() throws OutOfMemoryException { + /* new allocation -- clear the current buffers */ + clear(); internalMap.allocateNew(); - typeVector.allocateNew(); - if (typeVector != null) { - typeVector.zeroVector(); + try { + allocateTypeBuffer(); + } catch (Exception e) { + clear(); + throw e; } } @Override public boolean allocateNewSafe() { + /* new allocation -- clear the current buffers */ + clear(); boolean safe = 
internalMap.allocateNewSafe(); - safe = safe && typeVector.allocateNewSafe(); - if (safe) { - if (typeVector != null) { - typeVector.zeroVector(); - } + if (!safe) { return false; } + try { + allocateTypeBuffer(); + } catch (Exception e) { + clear(); + return false; } - return safe; + + return true; + } + + private void allocateTypeBuffer() { + typeBuffer = allocator.buffer(typeBufferAllocationSizeInBytes); + typeBuffer.readerIndex(0); + typeBuffer.setZero(0, typeBuffer.capacity()); } @Override public void reAlloc() { internalMap.reAlloc(); - typeVector.reAlloc(); + reallocTypeBuffer(); } - @Override - public void setInitialCapacity(int numRecords) { + private void reallocTypeBuffer() { + final int currentBufferCapacity = typeBuffer.capacity(); + long baseSize = typeBufferAllocationSizeInBytes; + + if (baseSize < (long)currentBufferCapacity) { + baseSize = (long)currentBufferCapacity; + } + + long newAllocationSize = baseSize * 2L; + newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); + + if (newAllocationSize > BaseValueVector.MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + + final ArrowBuf newBuf = allocator.buffer((int)newAllocationSize); + newBuf.setBytes(0, typeBuffer, 0, currentBufferCapacity); + final int halfNewCapacity = newBuf.capacity() / 2; + newBuf.setZero(halfNewCapacity, halfNewCapacity); + typeBuffer.release(1); + typeBuffer = newBuf; + typeBufferAllocationSizeInBytes = (int)newAllocationSize; } + @Override + public void setInitialCapacity(int numRecords) { } + @Override public int getValueCapacity() { - return Math.min(typeVector.getValueCapacity(), internalMap.getValueCapacity()); + return Math.min(getTypeBufferValueCapacity(), internalMap.getValueCapacity()); } @Override @@ -261,10 +311,19 @@ public void close() { @Override public void clear() { - typeVector.clear(); + valueCount = 0; + typeBuffer.release(); + typeBuffer = allocator.getEmpty(); internalMap.clear(); } + 
@Override + public void reset() { + valueCount = 0; + typeBuffer.setZero(0, typeBuffer.capacity()); + internalMap.reset(); + } + @Override public Field getField() { List childFields = new ArrayList<>(); @@ -300,7 +359,7 @@ public TransferPair makeTransferPair(ValueVector target) { public void copyFrom(int inIndex, int outIndex, UnionVector from) { from.getReader().setPosition(inIndex); getWriter().setPosition(outIndex); - ComplexCopier.copy(from.reader, mutator.writer); + ComplexCopier.copy(from.reader, writer); } public void copyFromSafe(int inIndex, int outIndex, UnionVector from) { @@ -321,33 +380,35 @@ public FieldVector addVector(FieldVector v) { private class TransferImpl implements TransferPair { private final TransferPair internalMapVectorTransferPair; - private final TransferPair typeVectorTransferPair; private final UnionVector to; public TransferImpl(String name, BufferAllocator allocator, CallBack callBack) { to = new UnionVector(name, allocator, callBack); internalMapVectorTransferPair = internalMap.makeTransferPair(to.internalMap); - typeVectorTransferPair = typeVector.makeTransferPair(to.typeVector); } public TransferImpl(UnionVector to) { this.to = to; internalMapVectorTransferPair = internalMap.makeTransferPair(to.internalMap); - typeVectorTransferPair = typeVector.makeTransferPair(to.typeVector); } @Override public void transfer() { + to.clear(); + to.typeBuffer = typeBuffer.transferOwnership(to.allocator).buffer; internalMapVectorTransferPair.transfer(); - typeVectorTransferPair.transfer(); to.valueCount = valueCount; + clear(); } @Override public void splitAndTransfer(int startIndex, int length) { + to.clear(); internalMapVectorTransferPair.splitAndTransfer(startIndex, length); - typeVectorTransferPair.splitAndTransfer(startIndex, length); - to.getMutator().setValueCount(length); + final int startPoint = startIndex * TYPE_WIDTH; + final int sliceLength = length * TYPE_WIDTH; + to.typeBuffer = typeBuffer.slice(startPoint, 
sliceLength).transferOwnership(to.allocator).buffer; + to.setValueCount(length); } @Override @@ -361,16 +422,6 @@ public void copyValueSafe(int from, int to) { } } - @Override - public Accessor getAccessor() { - return accessor; - } - - @Override - public Mutator getMutator() { - return mutator; - } - @Override public FieldReader getReader() { if (reader == null) { @@ -380,15 +431,17 @@ public FieldReader getReader() { } public FieldWriter getWriter() { - if (mutator.writer == null) { - mutator.writer = new UnionWriter(this); + if (writer == null) { + writer = new UnionWriter(this); } - return mutator.writer; + return writer; } @Override public int getBufferSize() { - return typeVector.getBufferSize() + internalMap.getBufferSize(); + if (valueCount == 0) { return 0; } + + return (valueCount * TYPE_WIDTH) + internalMap.getBufferSize(); } @Override @@ -402,14 +455,23 @@ public int getBufferSizeFor(final int valueCount) { bufferSize += v.getBufferSizeFor(valueCount); } - return (int) bufferSize; + return (int) bufferSize + (valueCount * TYPE_WIDTH); } @Override public ArrowBuf[] getBuffers(boolean clear) { ImmutableList.Builder builder = ImmutableList.builder(); - builder.add(typeVector.getBuffers(clear)); - builder.add(internalMap.getBuffers(clear)); + setReaderAndWriterIndex(); + if (getBufferSize() != 0) { + builder.add(typeBuffer); + builder.add(internalMap.getBuffers(clear)); + } + if (clear) { + valueCount = 0; + typeBuffer.retain(); + typeBuffer.release(); + typeBuffer = allocator.getEmpty(); + } List list = builder.build(); return list.toArray(new ArrowBuf[list.size()]); } @@ -417,15 +479,12 @@ public ArrowBuf[] getBuffers(boolean clear) { @Override public Iterator iterator() { List vectors = Lists.newArrayList(internalMap.iterator()); - vectors.add(typeVector); return vectors.iterator(); } - public class Accessor extends BaseValueVector.BaseAccessor { - @Override public Object getObject(int index) { - int type = typeVector.getAccessor().get(index); + int type 
= typeBuffer.getByte(index * TYPE_WIDTH); switch (MinorType.values()[type]) { case NULL: return null; @@ -436,14 +495,14 @@ public Object getObject(int index) { <#assign uncappedName = name?uncap_first/> <#if !minor.typeParams?? > case ${name?upper_case}: - return get${name}Vector().getAccessor().getObject(index); + return get${name}Vector().getObject(index); case MAP: - return getMap().getAccessor().getObject(index); + return getMap().getObject(index); case LIST: - return getList().getAccessor().getObject(index); + return getList().getObject(index); default: throw new UnsupportedOperationException("Cannot support type: " + MinorType.values()[type]); } @@ -462,30 +521,37 @@ public void get(int index, UnionHolder holder) { holder.reader = reader; } - @Override public int getValueCount() { return valueCount; } - @Override public boolean isNull(int index) { - return typeVector.getAccessor().get(index) == 0; + return (typeBuffer.getByte(index * TYPE_WIDTH) == 0); + } + + @Override + public int getNullCount() { + int nullCount = 0; + for (int i = 0; i < getValueCount(); i++) { + if (isNull(i)) { + nullCount++; + } + } + return nullCount; } public int isSet(int index) { return isNull(index) ? 0 : 1; } - } - - public class Mutator extends BaseValueVector.BaseMutator { UnionWriter writer; - @Override public void setValueCount(int valueCount) { - UnionVector.this.valueCount = valueCount; - typeVector.getMutator().setValueCount(valueCount); - internalMap.getMutator().setValueCount(valueCount); + this.valueCount = valueCount; + while (valueCount > getTypeBufferValueCapacity()) { + reallocTypeBuffer(); + } + internalMap.setValueCount(valueCount); } public void setSafe(int index, UnionHolder holder) { @@ -530,7 +596,7 @@ public void setSafe(int index, UnionHolder holder) { <#if !minor.typeParams?? 
> public void setSafe(int index, Nullable${name}Holder holder) { setType(index, MinorType.${name?upper_case}); - get${name}Vector().getMutator().setSafe(index, holder); + get${name}Vector().setSafe(index, holder); } @@ -538,13 +604,13 @@ public void setSafe(int index, Nullable${name}Holder holder) { public void setType(int index, MinorType type) { - typeVector.getMutator().setSafe(index, (byte) type.ordinal()); + while (index >= getTypeBufferValueCapacity()) { + reallocTypeBuffer(); + } + typeBuffer.setByte(index * TYPE_WIDTH , (byte) type.ordinal()); } - @Override - public void reset() { } - - @Override - public void generateTestData(int values) { } - } + private int getTypeBufferValueCapacity() { + return (int) ((typeBuffer.capacity() * 1.0) / TYPE_WIDTH); + } } diff --git a/java/vector/src/main/codegen/templates/UnionWriter.java b/java/vector/src/main/codegen/templates/UnionWriter.java index f892bac913392..526708a4c328b 100644 --- a/java/vector/src/main/codegen/templates/UnionWriter.java +++ b/java/vector/src/main/codegen/templates/UnionWriter.java @@ -62,7 +62,7 @@ public void setPosition(int index) { @Override public void start() { - data.getMutator().setType(idx(), MinorType.MAP); + data.setType(idx(), MinorType.MAP); getMapWriter().start(); } @@ -74,7 +74,7 @@ public void end() { @Override public void startList() { getListWriter().startList(); - data.getMutator().setType(idx(), MinorType.LIST); + data.setType(idx(), MinorType.LIST); } @Override @@ -92,7 +92,7 @@ private MapWriter getMapWriter() { } public MapWriter asMap() { - data.getMutator().setType(idx(), MinorType.MAP); + data.setType(idx(), MinorType.MAP); return getMapWriter(); } @@ -106,7 +106,7 @@ private ListWriter getListWriter() { } public ListWriter asList() { - data.getMutator().setType(idx(), MinorType.LIST); + data.setType(idx(), MinorType.LIST); return getListWriter(); } @@ -150,19 +150,19 @@ BaseWriter getWriter(MinorType minorType) { } public ${name}Writer as${name}() { - 
data.getMutator().setType(idx(), MinorType.${name?upper_case}); + data.setType(idx(), MinorType.${name?upper_case}); return get${name}Writer(); } @Override public void write(${name}Holder holder) { - data.getMutator().setType(idx(), MinorType.${name?upper_case}); + data.setType(idx(), MinorType.${name?upper_case}); get${name}Writer().setPosition(idx()); get${name}Writer().write${name}(<#list fields as field>holder.${field.name}<#if field_has_next>, ); } public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ) { - data.getMutator().setType(idx(), MinorType.${name?upper_case}); + data.setType(idx(), MinorType.${name?upper_case}); get${name}Writer().setPosition(idx()); get${name}Writer().write${name}(<#list fields as field>${field.name}<#if field_has_next>, ); } @@ -175,28 +175,28 @@ public void writeNull() { @Override public MapWriter map() { - data.getMutator().setType(idx(), MinorType.LIST); + data.setType(idx(), MinorType.LIST); getListWriter().setPosition(idx()); return getListWriter().map(); } @Override public ListWriter list() { - data.getMutator().setType(idx(), MinorType.LIST); + data.setType(idx(), MinorType.LIST); getListWriter().setPosition(idx()); return getListWriter().list(); } @Override public ListWriter list(String name) { - data.getMutator().setType(idx(), MinorType.MAP); + data.setType(idx(), MinorType.MAP); getMapWriter().setPosition(idx()); return getMapWriter().list(name); } @Override public MapWriter map(String name) { - data.getMutator().setType(idx(), MinorType.MAP); + data.setType(idx(), MinorType.MAP); getMapWriter().setPosition(idx()); return getMapWriter().map(name); } @@ -209,14 +209,14 @@ public MapWriter map(String name) { <#if !minor.typeParams?? 
> @Override public ${capName}Writer ${lowerName}(String name) { - data.getMutator().setType(idx(), MinorType.MAP); + data.setType(idx(), MinorType.MAP); getMapWriter().setPosition(idx()); return getMapWriter().${lowerName}(name); } @Override public ${capName}Writer ${lowerName}() { - data.getMutator().setType(idx(), MinorType.LIST); + data.setType(idx(), MinorType.LIST); getListWriter().setPosition(idx()); return getListWriter().${lowerName}(); } diff --git a/java/vector/src/main/codegen/templates/VariableLengthVectors.java b/java/vector/src/main/codegen/templates/VariableLengthVectors.java deleted file mode 100644 index 3934e74f11b2d..0000000000000 --- a/java/vector/src/main/codegen/templates/VariableLengthVectors.java +++ /dev/null @@ -1,677 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.lang.Override; - -import org.apache.drill.exec.exception.OutOfMemoryException; -import org.apache.drill.exec.vector.BaseDataValueVector; -import org.apache.drill.exec.vector.BaseValueVector; -import org.apache.drill.exec.vector.VariableWidthVector; - -<@pp.dropOutputFile /> -<#list vv.types as type> -<#list type.minor as minor> - -<#assign friendlyType = (minor.friendlyType!minor.boxedType!type.boxedType) /> -<#assign className = "${minor.class}Vector" /> - -<#if type.major == "VarLen"> -<@pp.changeOutputFile name="/org/apache/arrow/vector/${minor.class}Vector.java" /> - -<#include "/@includes/license.ftl" /> - -package org.apache.arrow.vector; - -<#include "/@includes/vv_imports.ftl" /> - -/** - * ${minor.class}Vector implements a vector of variable width values. Elements in the vector - * are accessed by position from the logical start of the vector. A fixed width offsetVector - * is used to convert an element's position to it's offset from the start of the (0-based) - * ArrowBuf. Size is inferred by adjacent elements. - * The width of each element is ${type.width} byte(s) - * The equivalent Java primitive is '${minor.javaType!type.javaType}' - * - * NB: this class is automatically generated from ${.template_name} and ValueVectorTypes.tdd using FreeMarker. 
- */ -public final class ${className} extends BaseDataValueVector implements VariableWidthVector{ - private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(${className}.class); - - private static final int DEFAULT_RECORD_BYTE_COUNT = 8; - private static final int INITIAL_BYTE_COUNT = 4096 * DEFAULT_RECORD_BYTE_COUNT; - private static final int MIN_BYTE_COUNT = 4096; - - public final static String OFFSETS_VECTOR_NAME = "$offsets$"; - final UInt${type.width}Vector offsetVector = new UInt${type.width}Vector(OFFSETS_VECTOR_NAME, allocator); - - private final Accessor accessor; - private final Mutator mutator; - - private final UInt${type.width}Vector.Accessor oAccessor; - - private int allocationSizeInBytes = INITIAL_BYTE_COUNT; - private int allocationMonitor = 0; - - <#if minor.typeParams??> - <#list minor.typeParams as typeParam> - private final ${typeParam.type} ${typeParam.name}; - - - public ${className}(String name, BufferAllocator allocator<#list minor.typeParams as typeParam>, ${typeParam.type} ${typeParam.name}) { - super(name, allocator); - this.oAccessor = offsetVector.getAccessor(); - this.accessor = new Accessor(); - this.mutator = new Mutator(); - <#list minor.typeParams as typeParam> - this.${typeParam.name} = ${typeParam.name}; - - } - <#else> - public ${className}(String name, BufferAllocator allocator) { - super(name, allocator); - this.oAccessor = offsetVector.getAccessor(); - this.accessor = new Accessor(); - this.mutator = new Mutator(); - } - - - @Override - public Field getField() { - throw new UnsupportedOperationException("internal vector"); - } - - @Override - public MinorType getMinorType() { - return MinorType.${minor.class?upper_case}; - } - - @Override - public FieldReader getReader(){ - throw new UnsupportedOperationException("internal vector"); - } - - @Override - public int getBufferSize(){ - if (getAccessor().getValueCount() == 0) { - return 0; - } - return offsetVector.getBufferSize() + data.writerIndex(); - 
} - - @Override - public int getBufferSizeFor(final int valueCount) { - if (valueCount == 0) { - return 0; - } - - final int idx = offsetVector.getAccessor().get(valueCount); - return offsetVector.getBufferSizeFor(valueCount + 1) + idx; - } - - @Override - public ArrowBuf getValidityBuffer() { - /* this operation is not supported for non-nullable vectors */ - throw new UnsupportedOperationException(); - } - - @Override - public ArrowBuf getDataBuffer() { - /* we are not throwing away getBuffer() of BaseDataValueVector so use it wherever applicable */ - return getBuffer(); - } - - @Override - public ArrowBuf getOffsetBuffer() { - /* dataBuffer associated with the underlying offsetVector */ - return offsetVector.getDataBuffer(); - } - - @Override - public int getValueCapacity(){ - return Math.max(offsetVector.getValueCapacity() - 1, 0); - } - - @Override - public int getByteCapacity(){ - return data.capacity(); - } - - @Override - public int getCurrentSizeInBytes() { - return offsetVector.getAccessor().get(getAccessor().getValueCount()); - } - - /** - * Return the number of bytes contained in the current var len byte vector. - * @return the number of bytes contained in the current var len byte vector - */ - public int getVarByteLength(){ - final int valueCount = getAccessor().getValueCount(); - if(valueCount == 0) { - return 0; - } - return offsetVector.getAccessor().get(valueCount); - } - - @Override - public void clear() { - super.clear(); - offsetVector.clear(); - } - - @Override - public ArrowBuf[] getBuffers(boolean clear) { - final ArrowBuf[] buffers = ObjectArrays.concat(offsetVector.getBuffers(false), super.getBuffers(false), ArrowBuf.class); - if (clear) { - // does not make much sense but we have to retain buffers even when clear is set. refactor this interface. 
- for (final ArrowBuf buffer:buffers) { - buffer.retain(1); - } - clear(); - } - return buffers; - } - - public long getOffsetAddr(){ - return offsetVector.getDataBuffer().memoryAddress(); - } - - public UInt${type.width}Vector getOffsetVector(){ - return offsetVector; - } - - @Override - public TransferPair getTransferPair(BufferAllocator allocator){ - return new TransferImpl(name, allocator); - } - - @Override - public TransferPair getTransferPair(String ref, BufferAllocator allocator){ - return new TransferImpl(ref, allocator); - } - - @Override - public TransferPair makeTransferPair(ValueVector to) { - return new TransferImpl((${className}) to); - } - - public void transferTo(${className} target){ - target.clear(); - this.offsetVector.transferTo(target.offsetVector); - target.data = data.transferOwnership(target.allocator).buffer; - target.data.writerIndex(data.writerIndex()); - clear(); - } - - public void splitAndTransferTo(int startIndex, int length, ${className} target) { - UInt${type.width}Vector.Accessor offsetVectorAccessor = this.offsetVector.getAccessor(); - final int startPoint = offsetVectorAccessor.get(startIndex); - final int sliceLength = offsetVectorAccessor.get(startIndex + length) - startPoint; - target.clear(); - target.offsetVector.allocateNew(length + 1); - offsetVectorAccessor = this.offsetVector.getAccessor(); - final UInt4Vector.Mutator targetOffsetVectorMutator = target.offsetVector.getMutator(); - for (int i = 0; i < length + 1; i++) { - targetOffsetVectorMutator.set(i, offsetVectorAccessor.get(startIndex + i) - startPoint); - } - target.data = data.slice(startPoint, sliceLength).transferOwnership(target.allocator).buffer; - target.getMutator().setValueCount(length); -} - - protected void copyFrom(int fromIndex, int thisIndex, ${className} from){ - final UInt4Vector.Accessor fromOffsetVectorAccessor = from.offsetVector.getAccessor(); - final int start = fromOffsetVectorAccessor.get(fromIndex); - final int end = 
fromOffsetVectorAccessor.get(fromIndex + 1); - final int len = end - start; - - final int outputStart = offsetVector.data.get${(minor.javaType!type.javaType)?cap_first}(thisIndex * ${type.width}); - from.data.getBytes(start, data, outputStart, len); - offsetVector.data.set${(minor.javaType!type.javaType)?cap_first}( (thisIndex+1) * ${type.width}, outputStart + len); - } - - public boolean copyFromSafe(int fromIndex, int thisIndex, ${className} from){ - final UInt${type.width}Vector.Accessor fromOffsetVectorAccessor = from.offsetVector.getAccessor(); - final int start = fromOffsetVectorAccessor.get(fromIndex); - final int end = fromOffsetVectorAccessor.get(fromIndex + 1); - final int len = end - start; - final int outputStart = offsetVector.data.get${(minor.javaType!type.javaType)?cap_first}(thisIndex * ${type.width}); - - while(data.capacity() < outputStart + len) { - reAlloc(); - } - - offsetVector.getMutator().setSafe(thisIndex + 1, outputStart + len); - from.data.getBytes(start, data, outputStart, len); - return true; - } - - private class TransferImpl implements TransferPair{ - ${className} to; - - public TransferImpl(String name, BufferAllocator allocator){ - to = new ${className}(name, allocator<#if minor.typeParams??><#list minor.typeParams as typeParam>, ${className}.this.${typeParam.name}); - } - - public TransferImpl(${className} to){ - this.to = to; - } - - @Override - public ${className} getTo(){ - return to; - } - - @Override - public void transfer(){ - transferTo(to); - } - - @Override - public void splitAndTransfer(int startIndex, int length) { - splitAndTransferTo(startIndex, length, to); - } - - @Override - public void copyValueSafe(int fromIndex, int toIndex) { - to.copyFromSafe(fromIndex, toIndex, ${className}.this); - } - } - - @Override - public void setInitialCapacity(final int valueCount) { - final long size = 1L * valueCount * ${type.width}; - if (size > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Requested amount of 
memory is more than max allowed allocation size"); - } - allocationSizeInBytes = (int)size; - offsetVector.setInitialCapacity(valueCount + 1); - } - - @Override - public void allocateNew() { - if(!allocateNewSafe()){ - throw new OutOfMemoryException("Failure while allocating buffer."); - } - } - - @Override - public boolean allocateNewSafe() { - long curAllocationSize = allocationSizeInBytes; - if (allocationMonitor > 10) { - curAllocationSize = Math.max(MIN_BYTE_COUNT, curAllocationSize / 2); - allocationMonitor = 0; - } else if (allocationMonitor < -2) { - curAllocationSize = curAllocationSize * 2L; - allocationMonitor = 0; - } - - if (curAllocationSize > MAX_ALLOCATION_SIZE) { - return false; - } - - clear(); - /* Boolean to keep track if all the memory allocations were successful - * Used in the case of composite vectors when we need to allocate multiple - * buffers for multiple vectors. If one of the allocations failed we need to - * clear all the memory that we allocated - */ - try { - final int requestedSize = (int)curAllocationSize; - data = allocator.buffer(requestedSize); - allocationSizeInBytes = requestedSize; - offsetVector.allocateNew(); - } catch (OutOfMemoryException e) { - clear(); - return false; - } - data.readerIndex(0); - offsetVector.zeroVector(); - return true; - } - - @Override - public void allocateNew(int totalBytes, int valueCount) { - clear(); - assert totalBytes >= 0; - try { - data = allocator.buffer(totalBytes); - offsetVector.allocateNew(valueCount + 1); - } catch (RuntimeException e) { - clear(); - throw e; - } - data.readerIndex(0); - allocationSizeInBytes = totalBytes; - offsetVector.zeroVector(); - } - - @Override - public void reset() { - allocationSizeInBytes = INITIAL_BYTE_COUNT; - allocationMonitor = 0; - data.readerIndex(0); - offsetVector.zeroVector(); - super.reset(); - } - - public void reAlloc() { - long baseSize = allocationSizeInBytes; - final int currentBufferCapacity = data.capacity(); - if (baseSize < 
(long)currentBufferCapacity) { - baseSize = (long)currentBufferCapacity; - } - long newAllocationSize = baseSize * 2L; - newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); - - if (newAllocationSize > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Unable to expand the buffer. Max allowed buffer size is reached."); - } - - final ArrowBuf newBuf = allocator.buffer((int)newAllocationSize); - newBuf.setBytes(0, data, 0, currentBufferCapacity); - data.release(); - data = newBuf; - allocationSizeInBytes = (int)newAllocationSize; - } - - public void decrementAllocationMonitor() { - if (allocationMonitor > 0) { - allocationMonitor = 0; - } - --allocationMonitor; - } - - private void incrementAllocationMonitor() { - ++allocationMonitor; - } - - @Override - public Accessor getAccessor(){ - return accessor; - } - - @Override - public Mutator getMutator() { - return mutator; - } - - public final class Accessor extends BaseValueVector.BaseAccessor implements VariableWidthAccessor { - final UInt${type.width}Vector.Accessor oAccessor = offsetVector.getAccessor(); - public long getStartEnd(int index){ - return oAccessor.getTwoAsLong(index); - } - - public byte[] get(int index) { - assert index >= 0; - final int startIdx = oAccessor.get(index); - final int length = oAccessor.get(index + 1) - startIdx; - assert length >= 0; - final byte[] dst = new byte[length]; - data.getBytes(startIdx, dst, 0, length); - return dst; - } - - @Override - public int getValueLength(int index) { - final UInt${type.width}Vector.Accessor offsetVectorAccessor = offsetVector.getAccessor(); - return offsetVectorAccessor.get(index + 1) - offsetVectorAccessor.get(index); - } - - public void get(int index, ${minor.class}Holder holder){ - holder.start = oAccessor.get(index); - holder.end = oAccessor.get(index + 1); - holder.buffer = data; - } - - public void get(int index, Nullable${minor.class}Holder holder){ - holder.isSet = 1; - holder.start = oAccessor.get(index); - 
holder.end = oAccessor.get(index + 1); - holder.buffer = data; - } - - <#switch minor.class> - <#case "VarChar"> - @Override - public ${friendlyType} getObject(int index) { - Text text = new Text(); - text.set(get(index)); - return text; - } - <#break> - <#case "Decimal"> - @Override - public ${friendlyType} getObject(int index) { - return new BigDecimal(new BigInteger(get(index)), scale); - } - <#break> - <#default> - @Override - public ${friendlyType} getObject(int index) { - return get(index); - } - - - @Override - public int getValueCount() { - return Math.max(offsetVector.getAccessor().getValueCount()-1, 0); - } - - @Override - public boolean isNull(int index){ - return false; - } - - public UInt${type.width}Vector getOffsetVector(){ - return offsetVector; - } - } - - /** - * Mutable${minor.class} implements a vector of variable width values. Elements in the vector - * are accessed by position from the logical start of the vector. A fixed width offsetVector - * is used to convert an element's position to it's offset from the start of the (0-based) - * ArrowBuf. Size is inferred by adjacent elements. - * The width of each element is ${type.width} byte(s) - * The equivalent Java primitive is '${minor.javaType!type.javaType}' - * - * NB: this class is automatically generated from ValueVectorTypes.tdd using FreeMarker. - */ - public final class Mutator extends BaseValueVector.BaseMutator implements VariableWidthVector.VariableWidthMutator { - - /** - * Set the variable length element at the specified index to the supplied byte array. 
- * - * @param index position of the bit to set - * @param bytes array of bytes to write - */ - protected void set(int index, byte[] bytes) { - assert index >= 0; - final int currentOffset = offsetVector.getAccessor().get(index); - offsetVector.getMutator().set(index + 1, currentOffset + bytes.length); - data.setBytes(currentOffset, bytes, 0, bytes.length); - } - - public void setSafe(int index, byte[] bytes) { - assert index >= 0; - - final int currentOffset = offsetVector.getAccessor().get(index); - while (data.capacity() < currentOffset + bytes.length) { - reAlloc(); - } - offsetVector.getMutator().setSafe(index + 1, currentOffset + bytes.length); - data.setBytes(currentOffset, bytes, 0, bytes.length); - } - - /** - * Set the variable length element at the specified index to the supplied byte array. - * - * @param index position of the bit to set - * @param bytes array of bytes to write - * @param start start index of bytes to write - * @param length length of bytes to write - */ - protected void set(int index, byte[] bytes, int start, int length) { - assert index >= 0; - final int currentOffset = offsetVector.getAccessor().get(index); - offsetVector.getMutator().set(index + 1, currentOffset + length); - data.setBytes(currentOffset, bytes, start, length); - } - - public void setSafe(int index, ByteBuffer bytes, int start, int length) { - assert index >= 0; - - int currentOffset = offsetVector.getAccessor().get(index); - - while (data.capacity() < currentOffset + length) { - reAlloc(); - } - offsetVector.getMutator().setSafe(index + 1, currentOffset + length); - data.setBytes(currentOffset, bytes, start, length); - } - - public void setSafe(int index, byte[] bytes, int start, int length) { - assert index >= 0; - - final int currentOffset = offsetVector.getAccessor().get(index); - - while (data.capacity() < currentOffset + length) { - reAlloc(); - } - offsetVector.getMutator().setSafe(index + 1, currentOffset + length); - data.setBytes(currentOffset, bytes, start, 
length); - } - - @Override - public void setValueLengthSafe(int index, int length) { - final int offset = offsetVector.getAccessor().get(index); - while(data.capacity() < offset + length ) { - reAlloc(); - } - offsetVector.getMutator().setSafe(index + 1, offsetVector.getAccessor().get(index) + length); - } - - - public void setSafe(int index, int start, int end, ArrowBuf buffer){ - final int len = end - start; - final int outputStart = offsetVector.data.get${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}); - - while(data.capacity() < outputStart + len) { - reAlloc(); - } - - offsetVector.getMutator().setSafe( index+1, outputStart + len); - buffer.getBytes(start, data, outputStart, len); - } - - public void setSafe(int index, Nullable${minor.class}Holder holder){ - assert holder.isSet == 1; - - final int start = holder.start; - final int end = holder.end; - final int len = end - start; - - int outputStart = offsetVector.data.get${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}); - - while(data.capacity() < outputStart + len) { - reAlloc(); - } - - holder.buffer.getBytes(start, data, outputStart, len); - offsetVector.getMutator().setSafe( index+1, outputStart + len); - } - - public void setSafe(int index, ${minor.class}Holder holder){ - final int start = holder.start; - final int end = holder.end; - final int len = end - start; - final int outputStart = offsetVector.data.get${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}); - - while(data.capacity() < outputStart + len) { - reAlloc(); - } - - holder.buffer.getBytes(start, data, outputStart, len); - offsetVector.getMutator().setSafe( index+1, outputStart + len); - } - - protected void set(int index, int start, int length, ArrowBuf buffer){ - assert index >= 0; - final int currentOffset = offsetVector.getAccessor().get(index); - offsetVector.getMutator().set(index + 1, currentOffset + length); - final ArrowBuf bb = buffer.slice(start, length); - 
data.setBytes(currentOffset, bb); - } - - protected void set(int index, Nullable${minor.class}Holder holder){ - final int length = holder.end - holder.start; - final int currentOffset = offsetVector.getAccessor().get(index); - offsetVector.getMutator().set(index + 1, currentOffset + length); - data.setBytes(currentOffset, holder.buffer, holder.start, length); - } - - protected void set(int index, ${minor.class}Holder holder){ - final int length = holder.end - holder.start; - final int currentOffset = offsetVector.getAccessor().get(index); - offsetVector.getMutator().set(index + 1, currentOffset + length); - data.setBytes(currentOffset, holder.buffer, holder.start, length); - } - - @Override - public void setValueCount(int valueCount) { - if (valueCount == 0) { - // if no values in vector, don't try to retrieve the current value count. - offsetVector.getMutator().setValueCount(0); - } else { - final int currentByteCapacity = getByteCapacity(); - final int idx = offsetVector.getAccessor().get(valueCount); - data.writerIndex(idx); - if (currentByteCapacity > idx * 2) { - incrementAllocationMonitor(); - } else if (allocationMonitor > 0) { - allocationMonitor = 0; - } - VectorTrimmer.trim(data, idx); - offsetVector.getMutator().setValueCount(valueCount+1); - } - } - - @Override - public void generateTestData(int size){ - boolean even = true; - <#switch minor.class> - <#case "Var16Char"> - final java.nio.charset.Charset charset = Charsets.UTF_16; - <#break> - <#case "VarChar"> - <#default> - final java.nio.charset.Charset charset = Charsets.UTF_8; - - final byte[] evenValue = new String("aaaaa").getBytes(charset); - final byte[] oddValue = new String("bbbbbbbbbb").getBytes(charset); - for(int i =0; i < size; i++, even = !even){ - set(i, even ? 
evenValue : oddValue); - } - setValueCount(size); - } - } -} - - <#-- type.major --> - - diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java deleted file mode 100644 index 01340f66c4095..0000000000000 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java +++ /dev/null @@ -1,129 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.arrow.vector; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.vector.schema.ArrowFieldNode; - -import io.netty.buffer.ArrowBuf; -import org.apache.arrow.vector.util.CallBack; -import org.apache.arrow.vector.util.TransferPair; - - -public abstract class BaseDataValueVector extends BaseValueVector implements BufferBacked { - - public static void load(ArrowFieldNode fieldNode, List vectors, List buffers) { - int expectedSize = vectors.size(); - if (buffers.size() != expectedSize) { - throw new IllegalArgumentException("Illegal buffer count, expected " + expectedSize + ", got: " + buffers.size()); - } - for (int i = 0; i < expectedSize; i++) { - vectors.get(i).load(fieldNode, buffers.get(i)); - } - } - - public static void truncateBufferBasedOnSize(List buffers, int bufferIndex, int byteSize) { - if (bufferIndex >= buffers.size()) { - throw new IllegalArgumentException("no buffer at index " + bufferIndex + ": " + buffers); - } - ArrowBuf buffer = buffers.get(bufferIndex); - if (buffer.writerIndex() < byteSize) { - throw new IllegalArgumentException("can not truncate buffer to a larger size " + byteSize + ": " + buffer.writerIndex()); - } - buffer.writerIndex(byteSize); - } - - public static List unload(List vectors) { - List result = new ArrayList<>(vectors.size()); - for (BufferBacked vector : vectors) { - result.add(vector.unLoad()); - } - return result; - } - - protected ArrowBuf data; - - public BaseDataValueVector(String name, BufferAllocator allocator) { - super(name, allocator); - data = allocator.getEmpty(); - } - - @Override - public void clear() { - data.release(); - data = allocator.getEmpty(); - super.clear(); - } - - @Override - public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { - return getTransferPair(ref, allocator); - } - - @Override - public ArrowBuf[] getBuffers(boolean clear) { - ArrowBuf[] out; - 
if (getBufferSize() == 0) { - out = new ArrowBuf[0]; - } else { - out = new ArrowBuf[] {data}; - data.readerIndex(0); - if (clear) { - data.retain(1); - } - } - if (clear) { - clear(); - } - return out; - } - - @Override - public int getBufferSize() { - if (getAccessor().getValueCount() == 0) { - return 0; - } - return data.writerIndex(); - } - - public ArrowBuf getBuffer() { - return data; - } - - @Override - public void load(ArrowFieldNode fieldNode, ArrowBuf data) { - this.data.release(); - this.data = data.retain(allocator); - } - - @Override - public ArrowBuf unLoad() { - return this.data.readerIndex(0); - } - - /** - * This method has a similar effect of allocateNew() without actually clearing and reallocating - * the value vector. The purpose is to move the value vector to a "mutate" state - */ - public void reset() { - } -} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java new file mode 100644 index 0000000000000..702db9f528152 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java @@ -0,0 +1,843 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.memory.BaseAllocator; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.TransferPair; + +/** + * BaseFixedWidthVector provides an abstract interface for + * implementing vectors of fixed width values. The vectors are nullable + * implying that zero or more elements in the vector could be NULL. 
+ */ +public abstract class BaseFixedWidthVector extends BaseValueVector + implements FixedWidthVector, FieldVector, VectorDefinitionSetter { + private final byte typeWidth; + + protected int valueAllocationSizeInBytes; + protected int validityAllocationSizeInBytes; + + protected final Field field; + private int allocationMonitor; + protected ArrowBuf validityBuffer; + protected ArrowBuf valueBuffer; + protected int valueCount; + + public BaseFixedWidthVector(final String name, final BufferAllocator allocator, + FieldType fieldType, final byte typeWidth) { + super(name, allocator); + this.typeWidth = typeWidth; + field = new Field(name, fieldType, null); + valueCount = 0; + allocationMonitor = 0; + validityBuffer = allocator.getEmpty(); + valueBuffer = allocator.getEmpty(); + if (typeWidth > 0) { + valueAllocationSizeInBytes = INITIAL_VALUE_ALLOCATION * typeWidth; + validityAllocationSizeInBytes = getValidityBufferSizeFromCount(INITIAL_VALUE_ALLOCATION); + } else { + /* specialized handling for BitVector */ + valueAllocationSizeInBytes = getValidityBufferSizeFromCount(INITIAL_VALUE_ALLOCATION); + validityAllocationSizeInBytes = valueAllocationSizeInBytes; + } + } + + + /* TODO: + * see if getNullCount() can be made faster -- O(1) + */ + + /* TODO: + * Once the entire hierarchy has been refactored, move common functions + * like getNullCount(), splitAndTransferValidityBuffer to top level + * base class BaseValueVector. + * + * Along with this, some class members (validityBuffer) can also be + * abstracted out to top level base class. + * + * Right now BaseValueVector is the top level base class for other + * vector types in ValueVector hierarchy (non-nullable) and those + * vectors have not yet been refactored/removed so moving things to + * the top class as of now is not a good idea. + */ + + /** + * Get the memory address of buffer that manages the validity + * (NULL or NON-NULL nature) of elements in the vector. 
+ * @return starting address of the buffer + */ + @Override + public long getValidityBufferAddress() { + return (validityBuffer.memoryAddress()); + } + + /** + * Get the memory address of buffer that stores the data for elements + * in the vector. + * @return starting address of the buffer + */ + @Override + public long getDataBufferAddress() { + return (valueBuffer.memoryAddress()); + } + + /** + * Get the memory address of buffer that stores the offsets for elements + * in the vector. This operation is not supported for fixed-width vectors. + * @return starting address of the buffer + * @throws UnsupportedOperationException for fixed width vectors + */ + @Override + public long getOffsetBufferAddress() { + throw new UnsupportedOperationException("not supported for fixed-width vectors"); + } + + /** + * Get buffer that manages the validity (NULL or NON-NULL nature) of + * elements in the vector. Consider it as a buffer for internal bit vector + * data structure. + * @return buffer + */ + @Override + public ArrowBuf getValidityBuffer() { + return validityBuffer; + } + + /** + * Get the buffer that stores the data for elements in the vector. + * @return buffer + */ + @Override + public ArrowBuf getDataBuffer() { + return valueBuffer; + } + + /** + * buffer that stores the offsets for elements + * in the vector. This operation is not supported for fixed-width vectors. + * @return buffer + * @throws UnsupportedOperationException for fixed width vectors + */ + @Override + public ArrowBuf getOffsetBuffer() { + throw new UnsupportedOperationException("not supported for fixed-width vectors"); + } + + /** + * Sets the desired value capacity for the vector. This function doesn't + * allocate any memory for the vector. 
+ * @param valueCount desired number of elements in the vector + */ + @Override + public void setInitialCapacity(int valueCount) { + final long size = (long) valueCount * typeWidth; + if (size > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); + } + valueAllocationSizeInBytes = (int) size; + validityAllocationSizeInBytes = getValidityBufferSizeFromCount(valueCount); + } + + /** + * Get the current value capacity for the vector + * @return number of elements that vector can hold. + */ + @Override + public int getValueCapacity() { + return Math.min(getValueBufferValueCapacity(), getValidityBufferValueCapacity()); + } + + private int getValueBufferValueCapacity() { + return (int) ((valueBuffer.capacity() * 1.0) / typeWidth); + } + + private int getValidityBufferValueCapacity() { + return (int) (validityBuffer.capacity() * 8L); + } + + /** + * zero out the vector and the data in associated buffers. + */ + @Override + public void zeroVector() { + initValidityBuffer(); + initValueBuffer(); + } + + /* zero out the validity buffer */ + private void initValidityBuffer() { + validityBuffer.setZero(0, validityBuffer.capacity()); + } + + /* zero out the data buffer */ + private void initValueBuffer() { + valueBuffer.setZero(0, valueBuffer.capacity()); + } + + /** + * Reset the vector to initial state. Same as {@link #zeroVector()}. + * Note that this method doesn't release any memory. + */ + @Override + public void reset() { + valueCount = 0; + zeroVector(); + } + + /** + * Close the vector and release the associated buffers. 
+ */ + @Override + public void close() { + clear(); + } + + /** + * Same as {@link #close()} + */ + @Override + public void clear() { + valueCount = 0; + validityBuffer = releaseBuffer(validityBuffer); + valueBuffer = releaseBuffer(valueBuffer); + } + + /* used to step down the memory allocation */ + protected void incrementAllocationMonitor() { + if (allocationMonitor < 0) { + allocationMonitor = 0; + } + allocationMonitor++; + } + + /* used to step up the memory allocation */ + protected void decrementAllocationMonitor() { + if (allocationMonitor > 0) { + allocationMonitor = 0; + } + allocationMonitor--; + } + + /** + * Same as {@link #allocateNewSafe()}. + */ + @Override + public void allocateNew() { + if (!allocateNewSafe()) { + throw new OutOfMemoryException("Failure while allocating memory."); + } + } + + /** + * Allocate memory for the vector. We internally use a default value count + * of 4096 to allocate memory for at least these many elements in the + * vector. See {@link #allocateNew(int)} for allocating memory for specific + * number of elements in the vector. + * + * @return false if memory allocation fails, true otherwise. + */ + @Override + public boolean allocateNewSafe() { + long curAllocationSizeValue = valueAllocationSizeInBytes; + long curAllocationSizeValidity = validityAllocationSizeInBytes; + + if (curAllocationSizeValue > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Requested amount of memory exceeds limit"); + } + + /* we are doing a new allocation -- release the current buffers */ + clear(); + + try { + allocateBytes(curAllocationSizeValue, curAllocationSizeValidity); + } catch (Exception e) { + e.printStackTrace(); + clear(); + return false; + } + + return true; + } + + /** + * Allocate memory for the vector to support storing at least the provided number of + * elements in the vector. This method must be called prior to using the ValueVector. 
+ * + * @param valueCount the desired number of elements in the vector + * @throws org.apache.arrow.memory.OutOfMemoryException + */ + public void allocateNew(int valueCount) { + long valueBufferSize = valueCount * typeWidth; + long validityBufferSize = getValidityBufferSizeFromCount(valueCount); + if (typeWidth == 0) { + /* specialized handling for BitVector */ + valueBufferSize = validityBufferSize; + } + + if (valueBufferSize > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); + } + + /* we are doing a new allocation -- release the current buffers */ + clear(); + + try { + allocateBytes(valueBufferSize, validityBufferSize); + } catch (Exception e) { + e.printStackTrace(); + clear(); + throw e; + } + } + + /** + * Actual memory allocation is done by this function. All the calculations + * and knowledge about what size to allocate is upto the callers of this + * method. + * Callers appropriately handle errors if memory allocation fails here. + * Callers should also take care of determining that desired size is + * within the bounds of max allocation allowed and any other error + * conditions. + */ + private void allocateBytes(final long valueBufferSize, final long validityBufferSize) { + /* allocate data buffer */ + int curSize = (int) valueBufferSize; + valueBuffer = allocator.buffer(curSize); + valueBuffer.readerIndex(0); + valueAllocationSizeInBytes = curSize; + /* allocate validity buffer */ + allocateValidityBuffer((int) validityBufferSize); + zeroVector(); + } + + /** + * During splitAndTransfer, if we splitting from a random position within a byte, + * we can't just slice the source buffer so we have to explicitly allocate the + * validityBuffer of the target vector. This is unlike the databuffer which we can + * always slice for the target vector. 
+ */ + private void allocateValidityBuffer(final int validityBufferSize) { + validityBuffer = allocator.buffer(validityBufferSize); + validityBuffer.readerIndex(0); + validityAllocationSizeInBytes = validityBufferSize; + } + + /** + * Get the potential buffer size for a particular number of records. + * @param count desired number of elements in the vector + * @return estimated size of underlying buffers if the vector holds + * a given number of elements + */ + @Override + public int getBufferSizeFor(final int count) { + if (count == 0) { + return 0; + } + return (count * typeWidth) + getValidityBufferSizeFromCount(count); + } + + /** + * Get the size (number of bytes) of underlying buffers used by this + * vector + * @return size of underlying buffers. + */ + @Override + public int getBufferSize() { + if (valueCount == 0) { + return 0; + } + return (valueCount * typeWidth) + getValidityBufferSizeFromCount(valueCount); + } + + /** + * Get information about how this field is materialized. + * @return the field corresponding to this vector + */ + @Override + public Field getField() { + return field; + } + + /** + * Return the underlying buffers associated with this vector. Note that this doesn't + * impact the reference counts for this buffer so it only should be used for in-context + * access. Also note that this buffer changes regularly thus + * external classes shouldn't hold a reference to it (unless they change it). + * + * @param clear Whether to clear vector before returning; the buffers will still be refcounted + * but the returned array will be the only reference to them + * @return The underlying {@link io.netty.buffer.ArrowBuf buffers} that is used by this + * vector instance. 
+ */ + @Override + public ArrowBuf[] getBuffers(boolean clear) { + final ArrowBuf[] buffers; + setReaderAndWriterIndex(); + if (getBufferSize() == 0) { + buffers = new ArrowBuf[0]; + } else { + buffers = new ArrowBuf[2]; + buffers[0] = validityBuffer; + buffers[1] = valueBuffer; + } + if (clear) { + for (final ArrowBuf buffer : buffers) { + buffer.retain(1); + } + clear(); + } + return buffers; + } + + /** + * Resize the vector to increase the capacity. The internal behavior is to + * double the current value capacity. + */ + @Override + public void reAlloc() { + valueBuffer = reallocBufferHelper(valueBuffer, true); + validityBuffer = reallocBufferHelper(validityBuffer, false); + } + + /** + * Helper method for reallocating a particular internal buffer + * Returns the new buffer. + */ + private ArrowBuf reallocBufferHelper(ArrowBuf buffer, final boolean dataBuffer) { + final int currentBufferCapacity = buffer.capacity(); + long baseSize = (dataBuffer ? valueAllocationSizeInBytes + : validityAllocationSizeInBytes); + + if (baseSize < (long) currentBufferCapacity) { + baseSize = (long) currentBufferCapacity; + } + + long newAllocationSize = baseSize * 2L; + newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); + + if (newAllocationSize > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + + final ArrowBuf newBuf = allocator.buffer((int) newAllocationSize); + newBuf.setBytes(0, buffer, 0, currentBufferCapacity); + final int halfNewCapacity = newBuf.capacity() / 2; + newBuf.setZero(halfNewCapacity, halfNewCapacity); + buffer.release(1); + buffer = newBuf; + if (dataBuffer) { + valueAllocationSizeInBytes = (int) newAllocationSize; + } else { + validityAllocationSizeInBytes = (int) newAllocationSize; + } + + return buffer; + } + + @Override + @Deprecated + public List getFieldInnerVectors() { + throw new UnsupportedOperationException("There are no inner vectors. 
Use getFieldBuffers"); + } + + /** + * Initialize the children in schema for this Field. This operation is a + * NO-OP for scalar types since they don't have any children. + * @param children the schema + * @throws IllegalArgumentException if children is a non-empty list for scalar types. + */ + @Override + public void initializeChildrenFromFields(List children) { + if (!children.isEmpty()) { + throw new IllegalArgumentException("primitive type vector can not have children"); + } + } + + /** + * Get the inner child vectors. + * @return list of child vectors for complex types, empty list for scalar vector + * types + */ + @Override + public List getChildrenFromFields() { + return Collections.emptyList(); + } + + /** + * Load the buffers of this vector with provided source buffers. + * The caller manages the source buffers and populates them before invoking + * this method. + * @param fieldNode the fieldNode indicating the value count + * @param ownBuffers the buffers for this Field (own buffers only, children not included) + */ + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + if (ownBuffers.size() != 2) { + throw new IllegalArgumentException("Illegal buffer count, expected " + 2 + ", got: " + ownBuffers.size()); + } + + ArrowBuf bitBuffer = ownBuffers.get(0); + ArrowBuf dataBuffer = ownBuffers.get(1); + + validityBuffer.release(); + validityBuffer = BitVectorHelper.loadValidityBuffer(fieldNode, bitBuffer, allocator); + valueBuffer.release(); + valueBuffer = dataBuffer.retain(allocator); + + valueCount = fieldNode.getLength(); + + valueAllocationSizeInBytes = valueBuffer.capacity(); + validityAllocationSizeInBytes = validityBuffer.capacity(); + } + + /** + * Get the buffers belonging to this vector + * @return the inner buffers. 
+ */ + public List getFieldBuffers() { + List result = new ArrayList<>(2); + setReaderAndWriterIndex(); + result.add(validityBuffer); + result.add(valueBuffer); + + return result; + } + + /** + * Set the reader and writer indexes for the inner buffers. + */ + private void setReaderAndWriterIndex() { + validityBuffer.readerIndex(0); + valueBuffer.readerIndex(0); + if (valueCount == 0) { + validityBuffer.writerIndex(0); + valueBuffer.writerIndex(0); + } else { + validityBuffer.writerIndex(getValidityBufferSizeFromCount(valueCount)); + if (typeWidth == 0) { + /* specialized handling for BitVector */ + valueBuffer.writerIndex(getValidityBufferSizeFromCount(valueCount)); + } else { + valueBuffer.writerIndex(valueCount * typeWidth); + } + } + } + + /** + * Construct a transfer pair of this vector and another vector of same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @param callBack + * @return TransferPair + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { + return getTransferPair(ref, allocator); + } + + /** + * Construct a transfer pair of this vector and another vector of same type. + * @param allocator allocator for the target vector + * @return TransferPair + */ + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return getTransferPair(name, allocator); + } + + /** + * Construct a transfer pair of this vector and another vector of same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return TransferPair + */ + public abstract TransferPair getTransferPair(String ref, BufferAllocator allocator); + + /** + * Transfer this vector'data to another vector. The memory associated + * with this vector is transferred to the allocator of target vector + * for accounting and management purposes. 
+ * @param target destination vector for transfer + */ + public void transferTo(BaseFixedWidthVector target) { + compareTypes(target, "transferTo"); + target.clear(); + target.validityBuffer = validityBuffer.transferOwnership(target.allocator).buffer; + target.valueBuffer = valueBuffer.transferOwnership(target.allocator).buffer; + target.valueCount = valueCount; + clear(); + } + + /** + * Slice this vector at desired index and length and transfer the + * corresponding data to the target vector. + * @param startIndex start position of the split in source vector. + * @param length length of the split. + * @param target destination vector + */ + public void splitAndTransferTo(int startIndex, int length, + BaseFixedWidthVector target) { + compareTypes(target, "splitAndTransferTo"); + target.clear(); + splitAndTransferValidityBuffer(startIndex, length, target); + splitAndTransferValueBuffer(startIndex, length, target); + target.setValueCount(length); + } + + /** + * Data buffer can always be split and transferred using slicing. + */ + private void splitAndTransferValueBuffer(int startIndex, int length, + BaseFixedWidthVector target) { + final int startPoint = startIndex * typeWidth; + final int sliceLength = length * typeWidth; + target.valueBuffer = valueBuffer.slice(startPoint, sliceLength).transferOwnership(target.allocator).buffer; + } + + /** + * Validity buffer has multiple cases of split and transfer depending on + * the starting position of the source index. 
+ */ + private void splitAndTransferValidityBuffer(int startIndex, int length, + BaseFixedWidthVector target) { + assert startIndex + length <= valueCount; + int firstByteSource = BitVectorHelper.byteIndex(startIndex); + int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1); + int byteSizeTarget = getValidityBufferSizeFromCount(length); + int offset = startIndex % 8; + + if (length > 0) { + if (offset == 0) { + /* slice */ + if (target.validityBuffer != null) { + target.validityBuffer.release(); + } + target.validityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget); + target.validityBuffer.retain(1); + } else { + /* Copy data + * When the first bit starts from the middle of a byte (offset != 0), + * copy data from src BitVector. + * Each byte in the target is composed by a part in i-th byte, + * another part in (i+1)-th byte. + */ + target.allocateValidityBuffer(byteSizeTarget); + + for (int i = 0; i < byteSizeTarget - 1; i++) { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer, + firstByteSource + i, offset); + byte b2 = BitVectorHelper.getBitsFromNextByte(this.validityBuffer, + firstByteSource + i + 1, offset); + + target.validityBuffer.setByte(i, (b1 + b2)); + } + + /* Copying the last piece is done in the following manner: + * if the source vector has 1 or more bytes remaining, we copy + * the last piece as a byte formed by shifting data + * from the current byte and the next byte. + * + * if the source vector has no more bytes remaining + * (we are at the last byte), we copy the last piece as a byte + * by shifting data from the current byte. 
+ */ + if ((firstByteSource + byteSizeTarget - 1) < lastByteSource) { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer, + firstByteSource + byteSizeTarget - 1, offset); + byte b2 = BitVectorHelper.getBitsFromNextByte(this.validityBuffer, + firstByteSource + byteSizeTarget, offset); + + target.validityBuffer.setByte(byteSizeTarget - 1, b1 + b2); + } else { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer, + firstByteSource + byteSizeTarget - 1, offset); + target.validityBuffer.setByte(byteSizeTarget - 1, b1); + } + } + } + } + + + /****************************************************************** + * * + * common getters and setters * + * * + ******************************************************************/ + + + /** + * Get the number of elements that are null in the vector + * + * @return the number of null elements. + */ + @Override + public int getNullCount() { + return BitVectorHelper.getNullCount(validityBuffer, valueCount); + } + + /** + * Get the value count of vector. This will always be zero unless + * {@link #setValueCount(int)} has been called prior to calling this. + * + * @return valueCount for the vector + */ + @Override + public int getValueCount() { + return valueCount; + } + + /** + * Set value count for the vector. + * + * @param valueCount value count to set + */ + @Override + public void setValueCount(int valueCount) { + this.valueCount = valueCount; + final int currentValueCapacity = getValueCapacity(); + while (valueCount > getValueCapacity()) { + reAlloc(); + } + /* + * We are trying to understand the pattern of memory allocation. + * If initially, the user did vector.allocateNew(), we would have + * allocated memory of default size (4096 * type width). + * Later on user invokes setValueCount(count). 
+ * + * If the existing value capacity is twice as large as the + * valueCount, we know that we over-provisioned memory in the + * first place when default memory allocation was done because user + * really needs a much less value count in the vector. + * + * We record this by bumping up the allocationMonitor. If this pattern + * happens for certain number of times and allocationMonitor + * reaches the threshold (internal hardcoded) value, subsequent + * call to allocateNew() will take care of stepping down the + * default memory allocation size. + * + * Another case would be under-provisioning the initial memory and + * thus going through a lot of realloc(). Here the goal is to + * see if we can minimize the number of reallocations. Again the + * state is recorded in allocationMonitor by decrementing it + * (negative value). If a threshold is hit, realloc will try to + * allocate more memory in order to possibly avoid a future realloc. + * This case is also applicable to setSafe() methods which can trigger + * a realloc() and thus we record the state there as well. + */ + if (valueCount > 0) { + if (currentValueCapacity >= (valueCount * 2)) { + incrementAllocationMonitor(); + } else if (currentValueCapacity <= (valueCount / 2)) { + decrementAllocationMonitor(); + } + } + setReaderAndWriterIndex(); + } + + /** + * Check if the given index is within the current value capacity + * of the vector + * + * @param index position to check + * @return true if index is within the current value capacity + */ + public boolean isSafe(int index) { + return index < getValueCapacity(); + } + + /** + * Check if element at given index is null. + * + * @param index position of element + * @return true if element at given index is null, false otherwise + */ + @Override + public boolean isNull(int index) { + return (isSet(index) == 0); + } + + /** + * Same as {@link #isNull(int)}. 
+ * + * @param index position of element + * @return 1 if element at given index is not null, 0 otherwise + */ + public int isSet(int index) { + final int byteIndex = index >> 3; + final byte b = validityBuffer.getByte(byteIndex); + final int bitIndex = index & 7; + return Long.bitCount(b & (1L << bitIndex)); + } + + /** + * Mark the particular position in the vector as non-null. + * + * @param index position of the element. + */ + @Override + public void setIndexDefined(int index) { + handleSafe(index); + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + } + + public void set(int index, byte[] value, int start, int length) { + throw new UnsupportedOperationException(); + } + + public void setSafe(int index, byte[] value, int start, int length) { + throw new UnsupportedOperationException(); + } + + public void set(int index, ByteBuffer value, int start, int length) { + throw new UnsupportedOperationException(); + } + + public void setSafe(int index, ByteBuffer value, int start, int length) { + throw new UnsupportedOperationException(); + } + + + /****************************************************************** + * * + * helper methods for setters * + * * + ******************************************************************/ + + + protected void handleSafe(int index) { + while (index >= getValueCapacity()) { + decrementAllocationMonitor(); + reAlloc(); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java index 598e578e55a6d..6418ea4f8a069 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + *

* http://www.apache.org/licenses/LICENSE-2.0 - * + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -18,8 +18,10 @@ package org.apache.arrow.vector; +import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; +import java.util.List; import com.google.flatbuffers.FlatBufferBuilder; import org.apache.arrow.memory.BufferAllocator; @@ -54,7 +56,6 @@ public String toString() { @Override public void clear() { - getMutator().reset(); } @Override @@ -67,42 +68,6 @@ public TransferPair getTransferPair(BufferAllocator allocator) { return getTransferPair(name, allocator); } - public abstract static class BaseAccessor implements ValueVector.Accessor { - protected BaseAccessor() { - } - - @Override - public boolean isNull(int index) { - return false; - } - - @Override - // override this in case your implementation is faster, see BitVector - public int getNullCount() { - int nullCount = 0; - for (int i = 0; i < getValueCount(); i++) { - if (isNull(i)) { - nullCount++; - } - } - return nullCount; - } - } - - public abstract static class BaseMutator implements ValueVector.Mutator { - protected BaseMutator() { - } - - @Override - public void generateTestData(int values) { - } - - //TODO: consider making mutator stateless(if possible) on another issue. 
- @Override - public void reset() { - } - } - @Override public Iterator iterator() { return Collections.emptyIterator(); @@ -122,5 +87,22 @@ public static boolean checkBufRefs(final ValueVector vv) { public BufferAllocator getAllocator() { return allocator; } + + protected void compareTypes(BaseValueVector target, String caller) { + if (this.getMinorType() != target.getMinorType()) { + throw new UnsupportedOperationException(caller + " should have vectors of exact same type"); + } + } + + protected ArrowBuf releaseBuffer(ArrowBuf buffer) { + buffer.release(); + buffer = allocator.getEmpty(); + return buffer; + } + + /* number of bytes for the validity buffer for the given valueCount */ + protected static int getValidityBufferSizeFromCount(final int valueCount) { + return (int) Math.ceil(valueCount / 8.0); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java new file mode 100644 index 0000000000000..fff329a9b9d66 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -0,0 +1,1240 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + + +import io.netty.buffer.ArrowBuf; + +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.memory.BaseAllocator; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.NullableMapVector; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.TransferPair; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public abstract class BaseVariableWidthVector extends BaseValueVector + implements VariableWidthVector, FieldVector, VectorDefinitionSetter { + private static final int DEFAULT_RECORD_BYTE_COUNT = 8; + private static final int INITIAL_BYTE_COUNT = INITIAL_VALUE_ALLOCATION * DEFAULT_RECORD_BYTE_COUNT; + + private int valueAllocationSizeInBytes; + private int validityAllocationSizeInBytes; + private int offsetAllocationSizeInBytes; + + /* protected members */ + public static final int OFFSET_WIDTH = 4; /* 4 byte unsigned int to track offsets */ + protected static final byte[] emptyByteArray = new byte[]{}; + protected ArrowBuf validityBuffer; + protected ArrowBuf valueBuffer; + protected ArrowBuf offsetBuffer; + protected int valueCount; + protected int lastSet; + protected final Field field; + private boolean cleared; + + public BaseVariableWidthVector(final String name, final BufferAllocator allocator, + FieldType 
fieldType) { + super(name, allocator); + valueAllocationSizeInBytes = INITIAL_BYTE_COUNT; + validityAllocationSizeInBytes = getValidityBufferSizeFromCount(INITIAL_VALUE_ALLOCATION); + offsetAllocationSizeInBytes = (INITIAL_VALUE_ALLOCATION) * OFFSET_WIDTH; + field = new Field(name, fieldType, null); + valueCount = 0; + lastSet = -1; + offsetBuffer = allocator.getEmpty(); + validityBuffer = allocator.getEmpty(); + valueBuffer = allocator.getEmpty(); + cleared = false; + } + + /* TODO: + * see if getNullCount() can be made faster -- O(1) + */ + + /* TODO: + * Once the entire hierarchy has been refactored, move common functions + * like getNullCount(), splitAndTransferValidityBuffer to top level + * base class BaseValueVector. + * + * Along with this, some class members (validityBuffer) can also be + * abstracted out to top level base class. + * + * Right now BaseValueVector is the top level base class for other + * vector types in ValueVector hierarchy (non-nullable) and those + * vectors have not yet been refactored/removed so moving things to + * the top class as of now is not a good idea. + */ + + /** + * Get buffer that manages the validity (NULL or NON-NULL nature) of + * elements in the vector. Consider it as a buffer for internal bit vector + * data structure. + * @return buffer + */ + @Override + public ArrowBuf getValidityBuffer() { + return validityBuffer; + } + + /** + * Get the buffer that stores the data for elements in the vector. + * @return buffer + */ + @Override + public ArrowBuf getDataBuffer() { + return valueBuffer; + } + + /** + * buffer that stores the offsets for elements + * in the vector. This operation is not supported for fixed-width vectors. + * @return buffer + */ + @Override + public ArrowBuf getOffsetBuffer() { + return offsetBuffer; + } + + /** + * Get the memory address of buffer that stores the offsets for elements + * in the vector. 
+ * @return starting address of the buffer + */ + @Override + public long getOffsetBufferAddress() { + return offsetBuffer.memoryAddress(); + } + + /** + * Get the memory address of buffer that manages the validity + * (NULL or NON-NULL nature) of elements in the vector. + * @return starting address of the buffer + */ + @Override + public long getValidityBufferAddress() { + return validityBuffer.memoryAddress(); + } + + /** + * Get the memory address of buffer that stores the data for elements + * in the vector. + * @return starting address of the buffer + */ + @Override + public long getDataBufferAddress() { + return valueBuffer.memoryAddress(); + } + + /** + * Sets the desired value capacity for the vector. This function doesn't + * allocate any memory for the vector. + * @param valueCount desired number of elements in the vector + */ + @Override + public void setInitialCapacity(int valueCount) { + final long size = (long) valueCount * DEFAULT_RECORD_BYTE_COUNT; + if (size > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); + } + valueAllocationSizeInBytes = (int) size; + validityAllocationSizeInBytes = getValidityBufferSizeFromCount(valueCount); + /* to track the end offset of last data element in vector, we need + * an additional slot in offset buffer. + */ + offsetAllocationSizeInBytes = (valueCount + 1) * OFFSET_WIDTH; + } + + /** + * Get the current value capacity for the vector + * @return number of elements that vector can hold. 
+ */ + @Override + public int getValueCapacity() { + final int offsetValueCapacity = Math.max(getOffsetBufferValueCapacity() - 1, 0); + return Math.min(offsetValueCapacity, getValidityBufferValueCapacity()); + } + + private int getValidityBufferValueCapacity() { + return (int) (validityBuffer.capacity() * 8L); + } + + private int getOffsetBufferValueCapacity() { + return (int) ((offsetBuffer.capacity() * 1.0) / OFFSET_WIDTH); + } + + /** + * zero out the vector and the data in associated buffers. + */ + public void zeroVector() { + initValidityBuffer(); + initOffsetBuffer(); + valueBuffer.setZero(0, valueBuffer.capacity()); + } + + /* zero out the validity buffer */ + private void initValidityBuffer() { + validityBuffer.setZero(0, validityBuffer.capacity()); + } + + /* zero out the offset buffer */ + private void initOffsetBuffer() { + offsetBuffer.setZero(0, offsetBuffer.capacity()); + } + + /** + * Reset the vector to initial state. Same as {@link #zeroVector()}. + * Note that this method doesn't release any memory. + */ + public void reset() { + zeroVector(); + lastSet = -1; + valueCount = 0; + } + + /** + * Close the vector and release the associated buffers. + */ + @Override + public void close() { + clear(); + } + + /** + * Same as {@link #close()} + */ + @Override + public void clear() { + validityBuffer = releaseBuffer(validityBuffer); + valueBuffer = releaseBuffer(valueBuffer); + offsetBuffer = releaseBuffer(offsetBuffer); + cleared = true; + lastSet = -1; + valueCount = 0; + } + + @Override + @Deprecated + public List getFieldInnerVectors() { + throw new UnsupportedOperationException("There are no inner vectors. Use getFieldBuffers"); + } + + /** + * Initialize the children in schema for this Field. This operation is a + * NO-OP for scalar types since they don't have any children. + * @param children the schema + * @throws IllegalArgumentException if children is a non-empty list for scalar types. 
+ */ + @Override + public void initializeChildrenFromFields(List children) { + if (!children.isEmpty()) { + throw new IllegalArgumentException("primitive type vector can not have children"); + } + } + + /** + * Get the inner child vectors. + * @return list of child vectors for complex types, empty list for scalar vector + * types + */ + @Override + public List getChildrenFromFields() { + return Collections.emptyList(); + } + + + /** + * Load the buffers of this vector with provided source buffers. + * The caller manages the source buffers and populates them before invoking + * this method. + * @param fieldNode the fieldNode indicating the value count + * @param ownBuffers the buffers for this Field (own buffers only, children not included) + */ + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + ArrowBuf bitBuffer = ownBuffers.get(0); + ArrowBuf offBuffer = ownBuffers.get(1); + ArrowBuf dataBuffer = ownBuffers.get(2); + + validityBuffer.release(); + validityBuffer = BitVectorHelper.loadValidityBuffer(fieldNode, bitBuffer, allocator); + offsetBuffer.release(); + offsetBuffer = offBuffer.retain(allocator); + valueBuffer.release(); + valueBuffer = dataBuffer.retain(allocator); + + lastSet = fieldNode.getLength() - 1; + valueCount = fieldNode.getLength(); + } + + /** + * Get the buffers belonging to this vector + * @return the inner buffers. + */ + public List getFieldBuffers() { + List result = new ArrayList<>(3); + setReaderAndWriterIndex(); + result.add(validityBuffer); + result.add(offsetBuffer); + result.add(valueBuffer); + + return result; + } + + /** + * Set the reader and writer indexes for the inner buffers. 
+ */ + private void setReaderAndWriterIndex() { + validityBuffer.readerIndex(0); + offsetBuffer.readerIndex(0); + valueBuffer.readerIndex(0); + if (valueCount == 0) { + validityBuffer.writerIndex(0); + offsetBuffer.writerIndex(0); + valueBuffer.writerIndex(0); + } else { + final int lastDataOffset = getstartOffset(valueCount); + validityBuffer.writerIndex(getValidityBufferSizeFromCount(valueCount)); + offsetBuffer.writerIndex((valueCount + 1) * OFFSET_WIDTH); + valueBuffer.writerIndex(lastDataOffset); + } + } + + /** + * Allocate memory for the vector using {@link #allocateNewSafe()}, but + * report a failed allocation by throwing instead of returning false. + * + * @throws OutOfMemoryException if {@link #allocateNewSafe()} returns false + */ + @Override + public void allocateNew() { + if (!allocateNewSafe()) { + throw new OutOfMemoryException("Failure while allocating memory."); + } + } + + /** + * Allocate memory for the vector. We internally use a default value count + * of 4096 to allocate memory for at least these many elements in the + * vector. See {@link #allocateNew(int, int)} for allocating memory for specific + * number of elements in the vector. + * + * The sizes actually requested are the currently recorded allocation sizes + * of the data, validity and offset buffers. Any buffers currently held are + * released before the new allocation is attempted. If the allocation raises + * an exception, its stack trace is printed, the vector is cleared and false + * is returned. + * + * @return false if memory allocation fails, true otherwise. + * @throws OversizedAllocationException if the recorded data or offset + * allocation size is more than max allowed + */ + @Override + public boolean allocateNewSafe() { + long curAllocationSizeValue = valueAllocationSizeInBytes; + long curAllocationSizeValidity = validityAllocationSizeInBytes; + long curAllocationSizeOffset = offsetAllocationSizeInBytes; + + if (curAllocationSizeValue > MAX_ALLOCATION_SIZE || + curAllocationSizeOffset > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Requested amount of memory exceeds limit"); + } + + /* we are doing a new allocation -- release the current buffers */ + clear(); + + try { + allocateBytes(curAllocationSizeValue, curAllocationSizeValidity, curAllocationSizeOffset); + } catch (Exception e) { + e.printStackTrace(); + clear(); + return false; + } + + return true; + } + + /** + * Allocate memory for the vector to support storing at least the provided number of + * elements in the vector. This method must be called prior to using the ValueVector. 
+ * + * @param totalBytes desired total memory capacity + * @param valueCount the desired number of elements in the vector + * @throws OversizedAllocationException if the requested data or offset buffer + * size is more than max allowed + * @throws org.apache.arrow.memory.OutOfMemoryException declared for allocation failures; + * whatever exception the allocation raises is rethrown to the caller + * after the vector has been cleared + */ + @Override + public void allocateNew(int totalBytes, int valueCount) { + assert totalBytes >= 0; + /* computed as a long: (valueCount + 1) * OFFSET_WIDTH can overflow an int and would then slip past the limit check below */ + final long offsetBufferSize = (valueCount + 1L) * OFFSET_WIDTH; + final int validityBufferSize = getValidityBufferSizeFromCount(valueCount); + + if (totalBytes > MAX_ALLOCATION_SIZE || + offsetBufferSize > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Requested amount of memory exceeds limit"); + } + + /* we are doing a new allocation -- release the current buffers */ + clear(); + + try { + allocateBytes(totalBytes, validityBufferSize, offsetBufferSize); + } catch (Exception e) { + /* release whatever was allocated so far, then let the caller see the failure instead of silently continuing with an empty vector */ + clear(); + throw e; + } + } + + /* allocate the inner buffers */ + private void allocateBytes(final long valueBufferSize, final long validityBufferSize, + final long offsetBufferSize) { + /* allocate data buffer */ + int curSize = (int) valueBufferSize; + valueBuffer = allocator.buffer(curSize); + valueBuffer.readerIndex(0); + valueAllocationSizeInBytes = curSize; + allocateValidityBuffer(validityBufferSize); + allocateOffsetBuffer(offsetBufferSize); + } + + /* allocate offset buffer */ + private void allocateOffsetBuffer(final long size) { + final int curSize = (int) size; + offsetBuffer = allocator.buffer(curSize); + offsetBuffer.readerIndex(0); + offsetAllocationSizeInBytes = curSize; + initOffsetBuffer(); + } + + /* allocate validity buffer */ + private void allocateValidityBuffer(final long size) { + final int curSize = (int) size; + validityBuffer = allocator.buffer(curSize); + validityBuffer.readerIndex(0); + validityAllocationSizeInBytes = curSize; + initValidityBuffer(); + } + + /** + * Resize the vector to increase the capacity. The internal behavior is to + * double the current value capacity. 
+ */ + public void reAlloc() { + reallocDataBuffer(); + reallocValidityAndOffsetBuffers(); + } + + /** + * Reallocate the data buffer. Data Buffer stores the actual data for + * VARCHAR or VARBINARY elements in the vector. The behavior is to double + * the size of buffer. + * @throws OversizedAllocationException if the desired new size is more than + * max allowed + * @throws OutOfMemoryException if the internal memory allocation fails + */ + public void reallocDataBuffer() { + long baseSize = valueAllocationSizeInBytes; + final int currentBufferCapacity = valueBuffer.capacity(); + + if (baseSize < (long) currentBufferCapacity) { + baseSize = (long) currentBufferCapacity; + } + + long newAllocationSize = baseSize * 2L; + newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); + + if (newAllocationSize > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + + final ArrowBuf newBuf = allocator.buffer((int) newAllocationSize); + newBuf.setBytes(0, valueBuffer, 0, currentBufferCapacity); + valueBuffer.release(); + valueBuffer = newBuf; + valueAllocationSizeInBytes = (int) newAllocationSize; + } + + /** + * Reallocate the validity and offset buffers for this vector. Validity + * buffer is used to track the NULL or NON-NULL nature of elements in + * the vector and offset buffer is used to store the lengths of variable + * width elements in the vector. + * + * Note that data buffer for variable length vectors moves independent + * of the companion validity and offset buffers. This is in + * contrast to what we have for fixed width vectors. + * + * So even though we may have setup an initial capacity of 1024 + * elements in the vector, it is quite possible + * that we need to reAlloc() the data buffer when we are setting + * the 5th element in the vector simply because previous + * variable length elements have exhausted the buffer capacity. 
+ * However, we really don't need to reAlloc() validity and + * offset buffers until we try to set the 1025th element + * This is why we do a separate check for safe methods to + * determine which buffer needs reallocation. + * @throws OversizedAllocationException if the desired new size is more than + * max allowed + * @throws OutOfMemoryException if the internal memory allocation fails + */ + public void reallocValidityAndOffsetBuffers() { + offsetBuffer = reallocBufferHelper(offsetBuffer, true); + validityBuffer = reallocBufferHelper(validityBuffer, false); + } + + /* helper method to realloc a particular buffer. returns the allocated buffer */ + private ArrowBuf reallocBufferHelper(ArrowBuf buffer, final boolean offsetBuffer) { + final int currentBufferCapacity = buffer.capacity(); + long baseSize = (offsetBuffer ? offsetAllocationSizeInBytes + : validityAllocationSizeInBytes); + + if (baseSize < (long) currentBufferCapacity) { + baseSize = (long) currentBufferCapacity; + } + + long newAllocationSize = baseSize * 2L; + newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); + + if (newAllocationSize > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + + final ArrowBuf newBuf = allocator.buffer((int) newAllocationSize); + newBuf.setBytes(0, buffer, 0, currentBufferCapacity); + final int halfNewCapacity = newBuf.capacity() / 2; + newBuf.setZero(halfNewCapacity, halfNewCapacity); + buffer.release(1); + buffer = newBuf; + if (offsetBuffer) { + offsetAllocationSizeInBytes = (int) newAllocationSize; + } else { + validityAllocationSizeInBytes = (int) newAllocationSize; + } + + return buffer; + } + + /** + * Get the size (number of bytes) of underlying data buffer. 
+ * @return + */ + @Override + public int getByteCapacity() { + return valueBuffer.capacity(); + } + + @Override + public int getCurrentSizeInBytes() { + /* TODO */ + return 0; + } + + /** + * Get the size (number of bytes) of underlying buffers used by this + * vector + * @return size of underlying buffers. + */ + @Override + public int getBufferSize() { + return getBufferSizeFor(this.valueCount); + } + + /** + * Get the potential buffer size for a particular number of records. + * @param valueCount desired number of elements in the vector + * @return estimated size of underlying buffers if the vector holds + * a given number of elements + */ + @Override + public int getBufferSizeFor(final int valueCount) { + if (valueCount == 0) { + return 0; + } + + final int validityBufferSize = getValidityBufferSizeFromCount(valueCount); + final int offsetBufferSize = (valueCount + 1) * OFFSET_WIDTH; + /* get the end offset for this valueCount */ + final int dataBufferSize = offsetBuffer.getInt(valueCount * OFFSET_WIDTH); + return validityBufferSize + offsetBufferSize + dataBufferSize; + } + + /** + * Get information about how this field is materialized. + * @return the field corresponding to this vector + */ + @Override + public Field getField() { + return field; + } + + /** + * Return the underlying buffers associated with this vector. Note that this doesn't + * impact the reference counts for this buffer so it only should be used for in-context + * access. Also note that this buffer changes regularly thus + * external classes shouldn't hold a reference to it (unless they change it). + * + * @param clear Whether to clear vector before returning; the buffers will still be refcounted + * but the returned array will be the only reference to them + * @return The underlying {@link io.netty.buffer.ArrowBuf buffers} that is used by this + * vector instance. 
+ */ + @Override + public ArrowBuf[] getBuffers(boolean clear) { + final ArrowBuf[] buffers; + setReaderAndWriterIndex(); + if (getBufferSize() == 0) { + buffers = new ArrowBuf[0]; + } else { + buffers = new ArrowBuf[3]; + buffers[0] = validityBuffer; + buffers[1] = offsetBuffer; + buffers[2] = valueBuffer; + } + if (clear) { + for (final ArrowBuf buffer : buffers) { + buffer.retain(1); /* extra reference so the buffer outlives the release done by clear() below */ + } + clear(); + } + return buffers; + } + + /** + * Construct a transfer pair of this vector and another vector of same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @param callBack not used by this implementation; the call is forwarded to + * {@link #getTransferPair(String, BufferAllocator)} + * @return TransferPair + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { + return getTransferPair(ref, allocator); + } + + /** + * Construct a transfer pair of this vector and another vector of same type. + * The target vector is given the same name as this vector. + * @param allocator allocator for the target vector + * @return TransferPair + */ + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return getTransferPair(name, allocator); + } + + /** + * Construct a transfer pair of this vector and another vector of same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return TransferPair + */ + public abstract TransferPair getTransferPair(String ref, BufferAllocator allocator); + + /** + * Transfer this vector's data to another vector. The memory associated + * with this vector is transferred to the allocator of target vector + * for accounting and management purposes. 
+ * @param target destination vector for transfer + */ + public void transferTo(BaseVariableWidthVector target) { + compareTypes(target, "transferTo"); + target.clear(); + target.validityBuffer = validityBuffer.transferOwnership(target.allocator).buffer; + target.valueBuffer = valueBuffer.transferOwnership(target.allocator).buffer; + target.offsetBuffer = offsetBuffer.transferOwnership(target.allocator).buffer; + target.setLastSet(this.lastSet); + if (this.valueCount > 0) { + target.setValueCount(this.valueCount); + } + clear(); + } + + /** + * Slice this vector at desired index and length and transfer the + * corresponding data to the target vector. The target vector is cleared + * first and ends up holding exactly {@code length} elements; an empty + * split leaves its value count at zero. + * @param startIndex start position of the split in source vector. + * @param length length of the split. + * @param target destination vector + * @throws UnsupportedOperationException if target is not of the exact same type + */ + public void splitAndTransferTo(int startIndex, int length, + BaseVariableWidthVector target) { + compareTypes(target, "splitAndTransferTo"); + target.clear(); + splitAndTransferValidityBuffer(startIndex, length, target); + splitAndTransferOffsetBuffer(startIndex, length, target); + target.setLastSet(length - 1); + /* the target holds only the split, so its value count is the split length and not the value count of the source vector */ + if (length > 0) { + target.setValueCount(length); + } + } + + /* + * Transfer the offsets along with data. Unlike the data buffer, we cannot simply + * slice the offset buffer for split and transfer. The reason is that offsets + * in the target vector have to be adjusted and made relative to the starting + * offset in source vector from the start index of split. This is why, we + * need to explicitly allocate the offset buffer and set the adjusted offsets + * in the target vector. 
+ */ + private void splitAndTransferOffsetBuffer(int startIndex, int length, BaseVariableWidthVector target) { + final int start = offsetBuffer.getInt(startIndex * OFFSET_WIDTH); + final int end = offsetBuffer.getInt((startIndex + length) * OFFSET_WIDTH); + final int dataLength = end - start; + target.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH); + for (int i = 0; i < length + 1; i++) { + final int relativeSourceOffset = offsetBuffer.getInt((startIndex + i) * OFFSET_WIDTH) - start; + target.offsetBuffer.setInt(i * OFFSET_WIDTH, relativeSourceOffset); + } + target.valueBuffer = valueBuffer.slice(start, dataLength).transferOwnership(target.allocator).buffer; + } + + /* + * Transfer the validity. + */ + private void splitAndTransferValidityBuffer(int startIndex, int length, + BaseVariableWidthVector target) { + assert startIndex + length <= valueCount; + int firstByteSource = BitVectorHelper.byteIndex(startIndex); + int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1); + int byteSizeTarget = getValidityBufferSizeFromCount(length); + int offset = startIndex % 8; + + if (length > 0) { + if (offset == 0) { + // slice + if (target.validityBuffer != null) { + target.validityBuffer.release(); + } + target.validityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget); + target.validityBuffer.retain(1); + } else { + /* Copy data + * When the first bit starts from the middle of a byte (offset != 0), + * copy data from src BitVector. + * Each byte in the target is composed by a part in i-th byte, + * another part in (i+1)-th byte. 
+ */ + target.allocateValidityBuffer(byteSizeTarget); + + for (int i = 0; i < byteSizeTarget - 1; i++) { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer, firstByteSource + i, offset); + byte b2 = BitVectorHelper.getBitsFromNextByte(this.validityBuffer, firstByteSource + i + 1, offset); + + target.validityBuffer.setByte(i, (b1 + b2)); + } + /* Copying the last piece is done in the following manner: + * if the source vector has 1 or more bytes remaining, we copy + * the last piece as a byte formed by shifting data + * from the current byte and the next byte. + * + * if the source vector has no more bytes remaining + * (we are at the last byte), we copy the last piece as a byte + * by shifting data from the current byte. + */ + if ((firstByteSource + byteSizeTarget - 1) < lastByteSource) { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer, + firstByteSource + byteSizeTarget - 1, offset); + byte b2 = BitVectorHelper.getBitsFromNextByte(this.validityBuffer, + firstByteSource + byteSizeTarget, offset); + + target.validityBuffer.setByte(byteSizeTarget - 1, b1 + b2); + } else { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer, + firstByteSource + byteSizeTarget - 1, offset); + target.validityBuffer.setByte(byteSizeTarget - 1, b1); + } + } + } + } + + + /****************************************************************** + * * + * common getters and setters * + * * + ******************************************************************/ + + + /** + * Get the number of elements that are null in the vector + * + * @return the number of null elements. 
+ */ + public int getNullCount() { + return BitVectorHelper.getNullCount(validityBuffer, valueCount); + } + + /** + * Check if the given index is within the current value capacity + * of the vector + * + * @param index position to check + * @return true if index is within the current value capacity + */ + public boolean isSafe(int index) { + return index < getValueCapacity(); + } + + /** + * Check if element at given index is null. + * + * @param index position of element + * @return true if element at given index is null + */ + public boolean isNull(int index) { + return (isSet(index) == 0); + } + + /** + * Read the validity bit of the element at the given index. This is the + * inverse of {@link #isNull(int)}, expressed as an int flag. + * + * @param index position of element + * @return 1 if element at given index is not null, 0 otherwise + */ + public int isSet(int index) { + final int byteIndex = index >> 3; /* index / 8: byte of the validity buffer holding the bit */ + final byte b = validityBuffer.getByte(byteIndex); + final int bitIndex = index & 7; /* index % 8: position of the bit inside that byte */ + return Long.bitCount(b & (1L << bitIndex)); + } + + /** + * Get the value count of vector. This will always be zero unless + * setValueCount(int) has been called prior to calling this. + * + * @return valueCount for the vector + */ + public int getValueCount() { + return valueCount; + } + + /** + * Sets the value count for the vector. The validity and offset buffers + * are reallocated until the vector can hold the given number of elements, + * holes up to valueCount are filled, the last set index becomes + * valueCount - 1 and the reader and writer indexes of the buffers are + * updated. + * + * @param valueCount value count + */ + public void setValueCount(int valueCount) { + assert valueCount >= 0; + this.valueCount = valueCount; + while (valueCount > getValueCapacity()) { + reallocValidityAndOffsetBuffers(); + } + fillHoles(valueCount); + lastSet = valueCount - 1; + setReaderAndWriterIndex(); + } + + /** + * Create holes in the vector up to the given index (exclusive). + * Holes will be created from the current last set position in + * the vector. + * + * @param index target index + */ + public void fillEmpties(int index) { + handleSafe(index, emptyByteArray.length); + fillHoles(index); + lastSet = index - 1; + } + + /** + * Set the index of last non-null element in the vector. 
+ * It is important to call this method with appropriate value + * before calling {@link #setValueCount(int)}. + * + * @param value desired index of last non-null element. + */ + public void setLastSet(int value) { + lastSet = value; + } + + /** + * Get the index of last non-null element in the vector. + * + * @return index of the last non-null element + */ + public int getLastSet() { + return lastSet; + } + + /** + * Read the offset buffer entry of the given element as a single long. + * The read begins at the slot holding the element's start offset and, + * because a long is twice OFFSET_WIDTH bytes wide, also covers the next + * slot, which holds the element's end offset. Which half of the long + * carries which offset depends on the byte order of the buffer. + * + * @param index position of the element in the vector + * @return start and end offsets of the element packed into one long + */ + public long getStartEnd(int index) { + return offsetBuffer.getLong(index * OFFSET_WIDTH); + } + + /** + * Mark the particular position in the vector as non-null. Only the + * validity buffer is grown if the index is beyond its current capacity. + * + * @param index position of the element. + */ + @Override + public void setIndexDefined(int index) { + while (index >= getValidityBufferValueCapacity()) { + validityBuffer = reallocBufferHelper(validityBuffer, false); + } + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + } + + /** + * Sets the value length for an element. The end offset of the element is + * written as its start offset plus the given length, and the element + * becomes the last set position. The validity bit is not modified. + * + * @param index position of the element to set + * @param length length of the element + */ + public void setValueLengthSafe(int index, int length) { + assert index >= 0; + handleSafe(index, length); + fillHoles(index); + final int startOffset = getstartOffset(index); + offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + length); + lastSet = index; + } + + /** + * Get the length (in bytes) of the variable width element at the specified index. 
+ * + * @param index position of element to get + * @return greater than 0 length for non-null element, 0 otherwise + */ + public int getValueLength(int index) { + assert index >= 0; + if (isSet(index) == 0) { + return 0; + } + final int startOffset = getstartOffset(index); + final int dataLength = + offsetBuffer.getInt((index + 1) * OFFSET_WIDTH) - startOffset; + return dataLength; + } + + /** + * Set the variable length element at the specified index to the supplied + * byte array. This is same as using {@link #set(int, byte[], int, int)} + * with start as 0 and length as value.length + * + * @param index position of the element to set + * @param value array of bytes to write + */ + public void set(int index, byte[] value) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setBytes(index, value, 0, value.length); + lastSet = index; + } + + /** + * Same as {@link #set(int, byte[])} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param value array of bytes to write + */ + public void setSafe(int index, byte[] value) { + assert index >= 0; + fillEmpties(index); + handleSafe(index, value.length); + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setBytes(index, value, 0, value.length); + lastSet = index; + } + + /** + * Set the variable length element at the specified index to the supplied + * byte array. 
+ * + * @param index position of the element to set + * @param value array of bytes to write + * @param start start index in array of bytes + * @param length length of data in array of bytes + */ + public void set(int index, byte[] value, int start, int length) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setBytes(index, value, start, length); + lastSet = index; + } + + /** + * Same as {@link #set(int, byte[], int, int)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param value array of bytes to write + * @param start start index in array of bytes + * @param length length of data in array of bytes + */ + public void setSafe(int index, byte[] value, int start, int length) { + assert index >= 0; + fillEmpties(index); + handleSafe(index, length); + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setBytes(index, value, start, length); + lastSet = index; + } + + /** + * Set the variable length element at the specified index to the + * content in supplied ByteBuffer + * + * @param index position of the element to set + * @param value ByteBuffer with data + * @param start start index in ByteBuffer + * @param length length of data in ByteBuffer + */ + public void set(int index, ByteBuffer value, int start, int length) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + final int startOffset = getstartOffset(index); + offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + length); + valueBuffer.setBytes(startOffset, value, start, length); + lastSet = index; + } + + /** + * Same as {@link #set(int, ByteBuffer, int, int)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. 
+ * + * @param index position of the element to set + * @param value ByteBuffer with data + * @param start start index in ByteBuffer + * @param length length of data in ByteBuffer + */ + public void setSafe(int index, ByteBuffer value, int start, int length) { + assert index >= 0; + fillEmpties(index); + handleSafe(index, length); + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + final int startOffset = getstartOffset(index); + offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + length); + valueBuffer.setBytes(startOffset, value, start, length); + lastSet = index; + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + while (index >= getValidityBufferValueCapacity()) { + validityBuffer = reallocBufferHelper(validityBuffer, false); + } + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param start start position of data in buffer + * @param end end position of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void set(int index, int isSet, int start, int end, ArrowBuf buffer) { + assert index >= 0; + final int dataLength = end - start; + fillHoles(index); + BitVectorHelper.setValidityBit(validityBuffer, index, isSet); + final int startOffset = offsetBuffer.getInt(index * OFFSET_WIDTH); + offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, buffer, start, dataLength); + lastSet = index; + } + + /** + * Same as {@link #set(int, int, int, int, ArrowBuf)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. 
+ * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param start start position of data in buffer + * @param end end position of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void setSafe(int index, int isSet, int start, int end, ArrowBuf buffer) { + assert index >= 0; + final int dataLength = end - start; + fillEmpties(index); + /* only dataLength bytes are written below, so grow the value buffer for that many bytes and not for end */ + handleSafe(index, dataLength); + BitVectorHelper.setValidityBit(validityBuffer, index, isSet); + final int startOffset = offsetBuffer.getInt(index * OFFSET_WIDTH); + offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, buffer, start, dataLength); + lastSet = index; + } + + /** + * Store the given value at a particular position in the vector and mark + * the position as non-null. + * @param index position of the new value + * @param start start position of data in buffer + * @param length length of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void set(int index, int start, int length, ArrowBuf buffer) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + final int startOffset = offsetBuffer.getInt(index * OFFSET_WIDTH); + offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + length); + final ArrowBuf bb = buffer.slice(start, length); + valueBuffer.setBytes(startOffset, bb); + lastSet = index; + } + + /** + * Same as {@link #set(int, int, int, ArrowBuf)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector.
+ * @param index position of the new value + * @param start start position of data in buffer + * @param length length of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void setSafe(int index, int start, int length, ArrowBuf buffer) { + assert index >= 0; + fillEmpties(index); + handleSafe(index, length); + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + final int startOffset = offsetBuffer.getInt(index * OFFSET_WIDTH); + offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + length); + final ArrowBuf bb = buffer.slice(start, length); + valueBuffer.setBytes(startOffset, bb); + lastSet = index; + } + + + /****************************************************************** + * * + * helper methods for setters * + * * + ******************************************************************/ + + + protected final void fillHoles(int index) { + for (int i = lastSet + 1; i < index; i++) { + setBytes(i, emptyByteArray, 0, emptyByteArray.length); + } + lastSet = index - 1; + } + + protected final void setBytes(int index, byte[] value, int start, int length) { + /* end offset of current last element in the vector. this will + * be the start offset of new element we are trying to store. + */ + final int startOffset = getstartOffset(index); + /* set new end offset */ + offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + length); + /* store the var length data in value buffer */ + valueBuffer.setBytes(startOffset, value, start, length); + } + + protected final int getstartOffset(int index) { + return offsetBuffer.getInt(index * OFFSET_WIDTH); + } + + protected final void handleSafe(int index, int dataLength) { + /* + * IMPORTANT: + * value buffer for variable length vectors moves independent + * of the companion validity and offset buffers. This is in + * contrast to what we have for fixed width vectors. 
+ * + * Here there is no concept of getValueCapacity() in the + * data stream. getValueCapacity() is applicable only to validity + * and offset buffers. + * + * So even though we may have setup an initial capacity of 1024 + * elements in the vector, it is quite possible + * that we need to reAlloc() the data buffer when we are setting + * the 5th element in the vector simply because previous + * variable length elements have exhausted the buffer capacity. + * However, we really don't need to reAlloc() validity and + * offset buffers until we try to set the 1025th element + * This is why we do a separate check for safe methods to + * determine which buffer needs reallocation. + */ + while (index >= getValueCapacity()) { + reallocValidityAndOffsetBuffers(); + } + final int startOffset = getstartOffset(index); + while (valueBuffer.capacity() < (startOffset + dataLength)) { + reallocDataBuffer(); + } + } + + /** + * Method used by Json Writer to read a variable width element from + * the variable width vector and write to Json. + * + * This method should not be used externally. + * + * @param data buffer storing the variable width vector elements + * @param offset buffer storing the offsets of variable width vector elements + * @param index position of the element in the vector + * @return array of bytes + */ + public static byte[] get(final ArrowBuf data, final ArrowBuf offset, int index) { + final int currentStartOffset = offset.getInt(index * OFFSET_WIDTH); + final int dataLength = + offset.getInt((index + 1) * OFFSET_WIDTH) - currentStartOffset; + final byte[] result = new byte[dataLength]; + data.getBytes(currentStartOffset, result, 0, dataLength); + return result; + } + + /** + * Method used by Json Reader to explicitly set the offsets of the variable + * width vector data. The method takes care of allocating the memory for + * offsets if the caller hasn't done so. + * + * This method should not be used externally. 
+ * + * @param buffer ArrowBuf to store offsets for variable width elements + * @param allocator memory allocator + * @param valueCount number of elements + * @param index position of the element + * @param value offset of the element + * @return buffer holding the offsets + */ + public static ArrowBuf set(ArrowBuf buffer, BufferAllocator allocator, + int valueCount, int index, int value) { + if (buffer == null) { + buffer = allocator.buffer(valueCount * OFFSET_WIDTH); + } + buffer.setInt(index * OFFSET_WIDTH, value); + if (index == (valueCount - 1)) { + buffer.writerIndex(valueCount * OFFSET_WIDTH); + } + + return buffer; + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BigIntVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BigIntVector.java new file mode 100644 index 0000000000000..ccf0c3067e043 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/BigIntVector.java @@ -0,0 +1,366 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.BigIntReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.BigIntHolder; +import org.apache.arrow.vector.holders.NullableBigIntHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * BigIntVector implements a fixed width vector (8 bytes) of + * integer values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public class BigIntVector extends BaseFixedWidthVector { + public static final byte TYPE_WIDTH = 8; + private final FieldReader reader; + + /** + * Instantiate a BigIntVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public BigIntVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(Types.MinorType.BIGINT.getType()), + allocator); + } + + /** + * Instantiate a BigIntVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. 
+ */ + public BigIntVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, TYPE_WIDTH); + reader = new BigIntReaderImpl(BigIntVector.this); + } + + /** + * Get a reader that supports reading values from this vector + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.BIGINT; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + */ + public long get(int index) throws IllegalStateException { + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getLong(index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableBigIntHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getLong(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. 
+ * + * @param index position of element + * @return element at given index + */ + public Long getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getLong(index * TYPE_WIDTH); + } + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFrom(int fromIndex, int thisIndex, BigIntVector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + final long value = from.valueBuffer.getLong(fromIndex * TYPE_WIDTH); + valueBuffer.setLong(thisIndex * TYPE_WIDTH, value); + } + + /** + * Same as {@link #copyFrom(int, int, BigIntVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFromSafe(int fromIndex, int thisIndex, BigIntVector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + private void setValue(int index, long value) { + valueBuffer.setLong(index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, long value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. 
+ * If the value in holder is not indicated as set, the element + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + * @throws IllegalArgumentException if holder.isSet is negative + */ + public void set(int index, NullableBigIntHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, BigIntHolder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, long)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, long value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableBigIntHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + * @throws IllegalArgumentException if holder.isSet is negative + */ + public void setSafe(int index, NullableBigIntHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, BigIntHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, BigIntHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + /* not really needed to set the bit to 0 as long as + * the buffer always starts from 0. + */ + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void set(int index, int isSet, long value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Same as {@link #set(int, int, long)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, long value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + * This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. 
+ */ + public static long get(final ArrowBuf buffer, final int index) { + return buffer.getLong(index * TYPE_WIDTH); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + /** + * Construct a TransferPair comprising this vector and a new target vector of + * the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((BigIntVector) to); + } + + private class TransferImpl implements TransferPair { + BigIntVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new BigIntVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(BigIntVector to) { + this.to = to; + } + + @Override + public BigIntVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, BigIntVector.this); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java index 591d13c4bd5b4..3887da4a618f0 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java @@ -18,342 +18,535 @@ package
org.apache.arrow.vector; +import io.netty.buffer.ArrowBuf; import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.memory.BaseAllocator; -import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.vector.complex.impl.BitReaderImpl; import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.holders.BitHolder; import org.apache.arrow.vector.holders.NullableBitHolder; -import org.apache.arrow.vector.schema.ArrowFieldNode; -import org.apache.arrow.vector.types.Types.MinorType; -import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.OversizedAllocationException; import org.apache.arrow.vector.util.TransferPair; -import io.netty.buffer.ArrowBuf; - /** - * Bit implements a vector of bit-width values. Elements in the vector are accessed by position from the logical start - * of the vector. The width of each element is 1 bit. The equivalent Java primitive is an int containing the value '0' - * or '1'. + * BitVector implements a fixed width (1 bit) vector of + * boolean values which could be null. Each value in the vector corresponds + * to a single bit in the underlying data stream backing the vector. */ -public final class BitVector extends BaseDataValueVector implements FixedWidthVector { - static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(BitVector.class); - - private final Accessor accessor = new Accessor(); - private final Mutator mutator = new Mutator(); - - int valueCount; - private int allocationSizeInBytes = getSizeFromCount(INITIAL_VALUE_ALLOCATION); - private int allocationMonitor = 0; +public class BitVector extends BaseFixedWidthVector { + private final FieldReader reader; + /** + * Instantiate a BitVector. This doesn't allocate any memory for + * the data in vector. 
+ * + * @param name name of the vector + * @param allocator allocator for memory management. + */ public BitVector(String name, BufferAllocator allocator) { - super(name, allocator); + this(name, FieldType.nullable(Types.MinorType.BIT.getType()), + allocator); } - @Override - public void load(ArrowFieldNode fieldNode, ArrowBuf data) { - // When the vector is all nulls or all defined, the content of the buffer can be omitted - if (data.readableBytes() == 0 && fieldNode.getLength() != 0) { - int count = fieldNode.getLength(); - allocateNew(count); - int n = getSizeFromCount(count); - if (fieldNode.getNullCount() == 0) { - // all defined - // create an all 1s buffer - // set full bytes - int fullBytesCount = count / 8; - for (int i = 0; i < fullBytesCount; ++i) { - this.data.setByte(i, 0xFF); - } - int remainder = count % 8; - // set remaining bits - if (remainder > 0) { - byte bitMask = (byte) (0xFFL >>> ((8 - remainder) & 7)); - this.data.setByte(fullBytesCount, bitMask); - } - } else if (fieldNode.getNullCount() == fieldNode.getLength()) { - // all null - // create an all 0s buffer - zeroVector(); - } else { - throw new IllegalArgumentException("The buffer can be empty only if there's no data or it's all null or all defined"); - } - this.data.writerIndex(n); - } else { - super.load(fieldNode, data); - } - this.valueCount = fieldNode.getLength(); + /** + * Instantiate a BitVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. 
+ */ + public BitVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, (byte) 0); + reader = new BitReaderImpl(BitVector.this); } + /** + * Get a reader that supports reading values from this vector + * + * @return Field Reader for this vector + */ @Override - public Field getField() { - throw new UnsupportedOperationException("internal vector"); + public FieldReader getReader() { + return reader; } + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ @Override - public MinorType getMinorType() { - return MinorType.BIT; + public Types.MinorType getMinorType() { + return Types.MinorType.BIT; } + /** + * Sets the desired value capacity for the vector. This function doesn't + * allocate any memory for the vector. + * + * @param valueCount desired number of elements in the vector + */ @Override - public FieldReader getReader() { - throw new UnsupportedOperationException("internal vector"); + public void setInitialCapacity(int valueCount) { + final int size = getValidityBufferSizeFromCount(valueCount); + if (size > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); + } + valueAllocationSizeInBytes = size; + validityAllocationSizeInBytes = size; } + /** + * Get the current value capacity for the vector + * + * @return number of elements that vector can hold. + */ @Override - public int getBufferSize() { - return getSizeFromCount(valueCount); + public int getValueCapacity() { + return (int) (validityBuffer.capacity() * 8L); } + /** + * Get the potential buffer size for a particular number of records. 
+ * + * @param count desired number of elements in the vector + * @return estimated size of underlying buffers if the vector holds + * a given number of elements + */ @Override - public int getBufferSizeFor(final int valueCount) { - return getSizeFromCount(valueCount); + public int getBufferSizeFor(final int count) { + if (count == 0) { + return 0; + } + return 2 * getValidityBufferSizeFromCount(count); } + /** + * Get the size (number of bytes) of underlying buffers used by this + * vector + * + * @return size of underlying buffers. + */ @Override - public ArrowBuf getValidityBuffer() { - /* this operation is not supported for non-nullable vectors */ - throw new UnsupportedOperationException(); + public int getBufferSize() { + return getBufferSizeFor(valueCount); } - @Override - public ArrowBuf getDataBuffer() { - /* we are not throwing away getBuffer() of BaseDataValueVector so use it wherever applicable */ - return getBuffer(); - } + /** + * Slice this vector at desired index and length and transfer the + * corresponding data to the target vector. + * + * @param startIndex start position of the split in source vector. + * @param length length of the split. 
+ * @param target destination vector + */ + public void splitAndTransferTo(int startIndex, int length, + BaseFixedWidthVector target) { + compareTypes(target, "splitAndTransferTo"); + target.clear(); + target.validityBuffer = splitAndTransferBuffer(startIndex, length, target, + validityBuffer, target.validityBuffer); + target.valueBuffer = splitAndTransferBuffer(startIndex, length, target, + valueBuffer, target.valueBuffer); - @Override - public ArrowBuf getOffsetBuffer() { - /* this operation is not supported for fixed-width vectors */ - throw new UnsupportedOperationException(); + target.setValueCount(length); } - int getSizeFromCount(int valueCount) { - return (int) Math.ceil(valueCount / 8.0); - } + private ArrowBuf splitAndTransferBuffer(int startIndex, int length, + BaseFixedWidthVector target, + ArrowBuf sourceBuffer, ArrowBuf destBuffer) { + assert startIndex + length <= valueCount; + int firstByteSource = BitVectorHelper.byteIndex(startIndex); + int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1); + int byteSizeTarget = getValidityBufferSizeFromCount(length); + int offset = startIndex % 8; - @Override - public int getValueCapacity() { - return (int) Math.min((long) Integer.MAX_VALUE, data.capacity() * 8L); - } + if (length > 0) { + if (offset == 0) { + /* slice */ + if (destBuffer != null) { + destBuffer.release(); + } + destBuffer = sourceBuffer.slice(firstByteSource, byteSizeTarget); + destBuffer.retain(1); + } else { + /* Copy data + * When the first bit starts from the middle of a byte (offset != 0), + * copy data from src BitVector. + * Each byte in the target is composed by a part in i-th byte, + * another part in (i+1)-th byte. 
+ */ + destBuffer = allocator.buffer(byteSizeTarget); + destBuffer.readerIndex(0); + destBuffer.setZero(0, destBuffer.capacity()); + + for (int i = 0; i < byteSizeTarget - 1; i++) { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(sourceBuffer, firstByteSource + i, offset); + byte b2 = BitVectorHelper.getBitsFromNextByte(sourceBuffer, firstByteSource + i + 1, offset); + + destBuffer.setByte(i, (b1 + b2)); + } + + /* Copying the last piece is done in the following manner: + * if the source vector has 1 or more bytes remaining, we copy + * the last piece as a byte formed by shifting data + * from the current byte and the next byte. + * + * if the source vector has no more bytes remaining + * (we are at the last byte), we copy the last piece as a byte + * by shifting data from the current byte. + */ + if ((firstByteSource + byteSizeTarget - 1) < lastByteSource) { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(sourceBuffer, + firstByteSource + byteSizeTarget - 1, offset); + byte b2 = BitVectorHelper.getBitsFromNextByte(sourceBuffer, + firstByteSource + byteSizeTarget, offset); + + destBuffer.setByte(byteSizeTarget - 1, b1 + b2); + } else { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(sourceBuffer, + firstByteSource + byteSizeTarget - 1, offset); + destBuffer.setByte(byteSizeTarget - 1, b1); + } + } + } - private int getByteIndex(int index) { - return (int) Math.floor(index / 8.0); + return destBuffer; } - @Override - public void setInitialCapacity(final int valueCount) { - allocationSizeInBytes = getSizeFromCount(valueCount); + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + private int getBit(int index) { + final int byteIndex = index >> 3; + final byte b = valueBuffer.getByte(byteIndex); + final int bitIndex = index & 7; + return Long.bitCount(b & (1L << bitIndex)); } - @Override - public void allocateNew() { - 
if (!allocateNewSafe()) { - throw new OutOfMemoryException(); + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + */ + public int get(int index) throws IllegalStateException { + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); } + return getBit(index); } - @Override - public boolean allocateNewSafe() { - long curAllocationSize = allocationSizeInBytes; - if (allocationMonitor > 10) { - curAllocationSize = Math.max(8, allocationSizeInBytes / 2); - allocationMonitor = 0; - } else if (allocationMonitor < -2) { - curAllocationSize = allocationSizeInBytes * 2L; - allocationMonitor = 0; + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableBitHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; } + holder.isSet = 1; + holder.value = getBit(index); + } - try { - allocateBytes(curAllocationSize); - } catch (OutOfMemoryException ex) { - return false; + /** + * Same as {@link #get(int)}. 
+ * + * @param index position of element + * @return element at given index + */ + public Boolean getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return new Boolean(getBit(index) != 0); } - return true; } - @Override - public void reset() { - valueCount = 0; - allocationSizeInBytes = getSizeFromCount(INITIAL_VALUE_ALLOCATION); - allocationMonitor = 0; - zeroVector(); - super.reset(); + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector + * + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFrom(int fromIndex, int thisIndex, BitVector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + BitVectorHelper.setValidityBit(valueBuffer, thisIndex, from.getBit(fromIndex)); } /** - * Allocate a new memory space for this vector. Must be called prior to using the ValueVector. + * Same as {@link #copyFrom(int, int, BitVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. * - * @param valueCount The number of values which can be contained within this vector. 
+ * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector */ - @Override - public void allocateNew(int valueCount) { - final int size = getSizeFromCount(valueCount); - allocateBytes(size); + public void copyFromSafe(int fromIndex, int thisIndex, BitVector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); } - private void allocateBytes(final long size) { - if (size > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Requested amount of memory is more than max allowed allocation size"); - } - final int curSize = (int) size; - clear(); - data = allocator.buffer(curSize); - zeroVector(); - allocationSizeInBytes = curSize; - } + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + /** - * Allocate new buffer with double capacity, and copy data into the new buffer. Replace vector's buffer with new buffer, and release old one + * Set the element at the given index to the given value. 
+ * + * @param index position of element + * @param value value of element */ - public void reAlloc() { - long baseSize = allocationSizeInBytes; - final int currentBufferCapacity = data.capacity(); - if (baseSize < (long)currentBufferCapacity) { - baseSize = (long)currentBufferCapacity; + public void set(int index, int value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + if (value != 0) { + BitVectorHelper.setValidityBitToOne(valueBuffer, index); + } else { + BitVectorHelper.setValidityBit(valueBuffer, index, 0); } - long newAllocationSize = baseSize * 2L; - newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); + } - if (newAllocationSize > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Requested amount of memory is more than max allowed allocation size"); + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, element in the + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableBitHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + if (holder.value != 0) { + BitVectorHelper.setValidityBitToOne(valueBuffer, index); + } else { + BitVectorHelper.setValidityBit(valueBuffer, index, 0); + } + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); } - - final int curSize = (int) newAllocationSize; - final ArrowBuf newBuf = allocator.buffer(curSize); - newBuf.setZero(0, newBuf.capacity()); - newBuf.setBytes(0, data, 0, currentBufferCapacity); - data.release(); - data = newBuf; - allocationSizeInBytes = curSize; } /** - * {@inheritDoc} + * Set the element at the given index to the value set in data holder. 
+ * + * @param index position of element + * @param holder data holder for value of element */ - @Override - public void zeroVector() { - data.setZero(0, data.capacity()); + public void set(int index, BitHolder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + if (holder.value != 0) { + BitVectorHelper.setValidityBitToOne(valueBuffer, index); + } else { + BitVectorHelper.setValidityBit(valueBuffer, index, 0); + } } - public void copyFrom(int inIndex, int outIndex, BitVector from) { - this.mutator.set(outIndex, from.accessor.get(inIndex)); + /** + * Same as {@link #set(int, int)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, int value) { + handleSafe(index); + set(index, value); } - public void copyFromSafe(int inIndex, int outIndex, BitVector from) { - if (outIndex >= this.getValueCapacity()) { - reAlloc(); - } - copyFrom(inIndex, outIndex, from); + /** + * Same as {@link #set(int, NullableBitHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableBitHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); } - @Override - public Mutator getMutator() { - return new Mutator(); + /** + * Same as {@link #set(int, BitHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. 
+ * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, BitHolder holder) { + handleSafe(index); + set(index, holder); } - @Override - public Accessor getAccessor() { - return new Accessor(); + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + /* not really needed to set the bit to 0 as long as + * the buffer always starts from 0. + */ + BitVectorHelper.setValidityBit(validityBuffer, index, 0); } - @Override - public TransferPair getTransferPair(BufferAllocator allocator) { - return new TransferImpl(name, allocator); + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void set(int index, int isSet, int value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } } - @Override - public TransferPair getTransferPair(String ref, BufferAllocator allocator) { - return new TransferImpl(ref, allocator); + /** + * Same as {@link #set(int, int, int)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, int value) { + handleSafe(index); + set(index, isSet, value); } - @Override - public TransferPair makeTransferPair(ValueVector to) { - return new TransferImpl((BitVector) to); + /** + * Set the element at the given index to one. 
+ * + * @param index position of element + */ + public void setToOne(int index) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + BitVectorHelper.setValidityBitToOne(valueBuffer, index); } - - public void transferTo(BitVector target) { - target.clear(); - target.data = data.transferOwnership(target.allocator).buffer; - target.valueCount = valueCount; - clear(); + /** + * Same as {@link #setToOne(int)} except that it handles the case when + * index is greater than or equal to current value capacity of the vector. + * + * @param index position of the element + */ + public void setSafeToOne(int index) { + handleSafe(index); + setToOne(index); } - public void splitAndTransferTo(int startIndex, int length, BitVector target) { - assert startIndex + length <= valueCount; - int firstByteSource = getByteIndex(startIndex); - int lastByteSource = getByteIndex(valueCount - 1); - int byteSizeTarget = getSizeFromCount(length); - int offset = startIndex % 8; - - if (length > 0) { - if (offset == 0) { - target.clear(); - // slice - if (target.data != null) { - target.data.release(); - } - target.data = data.slice(firstByteSource, byteSizeTarget); - target.data.retain(1); + /** + * Set count bits to 1 in data starting at firstBitIndex + * + * @param firstBitIndex the index of the first bit to set + * @param count the number of bits to set + */ + public void setRangeToOne(int firstBitIndex, int count) { + int startByteIndex = BitVectorHelper.byteIndex(firstBitIndex); + final int lastBitIndex = firstBitIndex + count; + final int endByteIndex = BitVectorHelper.byteIndex(lastBitIndex); + final int startByteBitIndex = BitVectorHelper.bitIndex(firstBitIndex); + final int endBytebitIndex = BitVectorHelper.bitIndex(lastBitIndex); + if (count < 8 && startByteIndex == endByteIndex) { + // handles the case where we don't have a first and a last byte + byte bitMask = 0; + for (int i = startByteBitIndex; i < endBytebitIndex; ++i) { + bitMask |= (byte) (1L << i); + } + 
BitVectorHelper.setBitMaskedByte(validityBuffer, startByteIndex, bitMask); + BitVectorHelper.setBitMaskedByte(valueBuffer, startByteIndex, bitMask); + } else { + // fill in first byte (if it's not full) + if (startByteBitIndex != 0) { + final byte bitMask = (byte) (0xFFL << startByteBitIndex); + BitVectorHelper.setBitMaskedByte(validityBuffer, startByteIndex, bitMask); + BitVectorHelper.setBitMaskedByte(valueBuffer, startByteIndex, bitMask); + ++startByteIndex; } - else { - // Copy data - // When the first bit starts from the middle of a byte (offset != 0), copy data from src BitVector. - // Each byte in the target is composed by a part in i-th byte, another part in (i+1)-th byte. - - target.clear(); - target.allocateNew(byteSizeTarget * 8); - - // TODO maybe do this one word at a time, rather than byte? - - for (int i = 0; i < byteSizeTarget - 1; i++) { - byte b1 = getBitsFromCurrentByte(this.data, firstByteSource + i, offset); - byte b2 = getBitsFromNextByte(this.data, firstByteSource + i + 1, offset); - - target.data.setByte(i, (b1 + b2)); - } - /* Copying the last piece is done in the following manner: - * if the source vector has 1 or more bytes remaining, we copy - * the last piece as a byte formed by shifting data - * from the current byte and the next byte. - * - * if the source vector has no more bytes remaining - * (we are at the last byte), we copy the last piece as a byte - * by shifting data from the current byte. 
- */ - if((firstByteSource + byteSizeTarget - 1) < lastByteSource) { - byte b1 = getBitsFromCurrentByte(this.data, firstByteSource + byteSizeTarget - 1, offset); - byte b2 = getBitsFromNextByte(this.data, firstByteSource + byteSizeTarget, offset); - - target.data.setByte(byteSizeTarget - 1, b1 + b2); - } - else { - byte b1 = getBitsFromCurrentByte(this.data, firstByteSource + byteSizeTarget - 1, offset); + // fill in one full byte at a time + for (int i = startByteIndex; i < endByteIndex; i++) { + validityBuffer.setByte(i, 0xFF); + valueBuffer.setByte(i, 0xFF); + } - target.data.setByte(byteSizeTarget - 1, b1); - } + // fill in the last byte (if it's not full) + if (endBytebitIndex != 0) { + final int byteIndex = BitVectorHelper.byteIndex(lastBitIndex - endBytebitIndex); + final byte bitMask = (byte) (0xFFL >>> ((8 - endBytebitIndex) & 7)); + BitVectorHelper.setBitMaskedByte(validityBuffer, byteIndex, bitMask); + BitVectorHelper.setBitMaskedByte(valueBuffer, byteIndex, bitMask); } } - target.getMutator().setValueCount(length); } - private static byte getBitsFromCurrentByte(ArrowBuf data, int index, int offset) { - return (byte)((data.getByte(index) & 0xFF) >>> offset); + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + /** + * Construct a TransferPair comprising of this and and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); } - private static byte getBitsFromNextByte(ArrowBuf data, int index, int offset) { - return (byte)((data.getByte(index) << (8 - offset))); + /** + * Construct a TransferPair with a desired target vector of the same type. 
+ * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((BitVector) to); } private class TransferImpl implements TransferPair { BitVector to; - public TransferImpl(String name, BufferAllocator allocator) { - this.to = new BitVector(name, allocator); + public TransferImpl(String ref, BufferAllocator allocator) { + to = new BitVector(ref, field.getFieldType(), allocator); } public TransferImpl(BitVector to) { @@ -380,269 +573,4 @@ public void copyValueSafe(int fromIndex, int toIndex) { to.copyFromSafe(fromIndex, toIndex, BitVector.this); } } - - private void decrementAllocationMonitor() { - if (allocationMonitor > 0) { - allocationMonitor = 0; - } - --allocationMonitor; - } - - private void incrementAllocationMonitor() { - ++allocationMonitor; - } - - public class Accessor extends BaseAccessor { - - /** - * Get the byte holding the desired bit, then mask all other bits. Iff the result is 0, the bit was not set. 
- * - * @param index position of the bit in the vector - * @return 1 if set, otherwise 0 - */ - public final int get(int index) { - int byteIndex = index >> 3; - byte b = data.getByte(byteIndex); - int bitIndex = index & 7; - return Long.bitCount(b & (1L << bitIndex)); - } - - @Override - public boolean isNull(int index) { - return false; - } - - @Override - public final Boolean getObject(int index) { - return new Boolean(get(index) != 0); - } - - @Override - public final int getValueCount() { - return valueCount; - } - - public final void get(int index, BitHolder holder) { - holder.value = get(index); - } - - public final void get(int index, NullableBitHolder holder) { - holder.isSet = 1; - holder.value = get(index); - } - - /** - * Get the number nulls, this correspond to the number of bits set to 0 in the vector - * - * @return the number of bits set to 0 - */ - @Override - public final int getNullCount() { - int count = 0; - int sizeInBytes = getSizeFromCount(valueCount); - - for (int i = 0; i < sizeInBytes; ++i) { - byte byteValue = data.getByte(i); - // Java uses two's complement binary representation, hence 11111111_b which is -1 when converted to Int - // will have 32bits set to 1. Masking the MSB and then adding it back solves the issue. - count += Integer.bitCount(byteValue & 0x7F) - (byteValue >> 7); - } - int nullCount = (sizeInBytes * 8) - count; - // if the valueCount is not a multiple of 8, the bits on the right were counted as null bits - int remainder = valueCount % 8; - nullCount -= remainder == 0 ? 0 : 8 - remainder; - return nullCount; - } - } - - /** - * MutableBit implements a vector of bit-width values. Elements in the vector are accessed by position from the - * logical start of the vector. Values should be pushed onto the vector sequentially, but may be randomly accessed. - * - * NB: this class is automatically generated from ValueVectorTypes.tdd using FreeMarker. 
- */ - public class Mutator extends BaseMutator { - - private Mutator() { - } - - /** - * Set the bit at the given index to the specified value. - * - * @param index position of the bit to set - * @param value value to set (either 1 or 0) - */ - public final void set(int index, int value) { - int byteIndex = byteIndex(index); - int bitIndex = bitIndex(index); - byte currentByte = data.getByte(byteIndex); - byte bitMask = (byte) (1L << bitIndex); - if (value != 0) { - currentByte |= bitMask; - } else { - currentByte -= (bitMask & currentByte); - } - data.setByte(byteIndex, currentByte); - } - - /** - * Set the bit at the given index to 1. - * - * @param index position of the bit to set - */ - public final void setToOne(int index) { - int byteIndex = byteIndex(index); - int bitIndex = bitIndex(index); - byte currentByte = data.getByte(byteIndex); - byte bitMask = (byte) (1L << bitIndex); - currentByte |= bitMask; - data.setByte(byteIndex, currentByte); - } - - /** - * set count bits to 1 in data starting at firstBitIndex - * - * @param firstBitIndex the index of the first bit to set - * @param count the number of bits to set - */ - public void setRangeToOne(int firstBitIndex, int count) { - int starByteIndex = byteIndex(firstBitIndex); - final int lastBitIndex = firstBitIndex + count; - final int endByteIndex = byteIndex(lastBitIndex); - final int startByteBitIndex = bitIndex(firstBitIndex); - final int endBytebitIndex = bitIndex(lastBitIndex); - if (count < 8 && starByteIndex == endByteIndex) { - // handles the case where we don't have a first and a last byte - byte bitMask = 0; - for (int i = startByteBitIndex; i < endBytebitIndex; ++i) { - bitMask |= (byte) (1L << i); - } - byte currentByte = data.getByte(starByteIndex); - currentByte |= bitMask; - data.setByte(starByteIndex, currentByte); - } else { - // fill in first byte (if it's not full) - if (startByteBitIndex != 0) { - byte currentByte = data.getByte(starByteIndex); - final byte bitMask = (byte) (0xFFL << 
startByteBitIndex); - currentByte |= bitMask; - data.setByte(starByteIndex, currentByte); - ++starByteIndex; - } - - // fill in one full byte at a time - for (int i = starByteIndex; i < endByteIndex; i++) { - data.setByte(i, 0xFF); - } - - // fill in the last byte (if it's not full) - if (endBytebitIndex != 0) { - final int byteIndex = byteIndex(lastBitIndex - endBytebitIndex); - byte currentByte = data.getByte(byteIndex); - final byte bitMask = (byte) (0xFFL >>> ((8 - endBytebitIndex) & 7)); - currentByte |= bitMask; - data.setByte(byteIndex, currentByte); - } - - } - } - - /** - * @param absoluteBitIndex the index of the bit in the buffer - * @return the index of the byte containing that bit - */ - private int byteIndex(int absoluteBitIndex) { - return absoluteBitIndex >> 3; - } - - /** - * @param absoluteBitIndex the index of the bit in the buffer - * @return the index of the bit inside the byte - */ - private int bitIndex(int absoluteBitIndex) { - return absoluteBitIndex & 7; - } - - public final void set(int index, BitHolder holder) { - set(index, holder.value); - } - - final void set(int index, NullableBitHolder holder) { - set(index, holder.value); - } - - public void setSafe(int index, int value) { - while (index >= getValueCapacity()) { - reAlloc(); - } - set(index, value); - } - - public void setSafeToOne(int index) { - while (index >= getValueCapacity()) { - reAlloc(); - } - setToOne(index); - } - - public void setSafe(int index, BitHolder holder) { - while (index >= getValueCapacity()) { - reAlloc(); - } - set(index, holder.value); - } - - public void setSafe(int index, NullableBitHolder holder) { - while (index >= getValueCapacity()) { - reAlloc(); - } - set(index, holder.value); - } - - @Override - public final void setValueCount(int valueCount) { - int currentValueCapacity = getValueCapacity(); - BitVector.this.valueCount = valueCount; - int idx = getSizeFromCount(valueCount); - while (valueCount > getValueCapacity()) { - reAlloc(); - } - if 
(valueCount > 0 && currentValueCapacity > valueCount * 2) { - incrementAllocationMonitor(); - } else if (allocationMonitor > 0) { - allocationMonitor = 0; - } - VectorTrimmer.trim(data, idx); - } - - @Override - public final void generateTestData(int values) { - boolean even = true; - for (int i = 0; i < values; i++, even = !even) { - if (even) { - set(i, 1); - } - } - setValueCount(values); - } - - public void generateTestDataAlt(int size) { - setValueCount(size); - boolean even = true; - final int valueCount = getAccessor().getValueCount(); - for (int i = 0; i < valueCount; i++, even = !even) { - if (even) { - set(i, (byte) 1); - } else { - set(i, (byte) 0); - } - } - } - } - - @Override - public void clear() { - this.valueCount = 0; - super.clear(); - } -} +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java new file mode 100644 index 0000000000000..8322a1ac8fc40 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java @@ -0,0 +1,215 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; + +/** + * Helper class for performing generic operations on a bit vector buffer. + * External use of this class is not recommended. + */ +public class BitVectorHelper { + + /** + * Get the index of byte corresponding to bit index in validity buffer + */ + public static int byteIndex(int absoluteBitIndex) { + return absoluteBitIndex >> 3; + } + + /** + * Get the relative index of bit within the byte in validity buffer + */ + public static int bitIndex(int absoluteBitIndex) { + return absoluteBitIndex & 7; + } + + /** + * Set the bit at provided index to 1. 
+ * + * @param validityBuffer + * @param index + */ + public static void setValidityBitToOne(ArrowBuf validityBuffer, int index) { + final int byteIndex = byteIndex(index); + final int bitIndex = bitIndex(index); + byte currentByte = validityBuffer.getByte(byteIndex); + final byte bitMask = (byte) (1L << bitIndex); + currentByte |= bitMask; + validityBuffer.setByte(byteIndex, currentByte); + } + + /** + * Set the bit at a given index to provided value (1 or 0) + * + * @param validityBuffer + * @param index + * @param value + */ + public static void setValidityBit(ArrowBuf validityBuffer, int index, int value) { + final int byteIndex = byteIndex(index); + final int bitIndex = bitIndex(index); + byte currentByte = validityBuffer.getByte(byteIndex); + final byte bitMask = (byte) (1L << bitIndex); + if (value != 0) { + currentByte |= bitMask; + } else { + currentByte -= (bitMask & currentByte); + } + validityBuffer.setByte(byteIndex, currentByte); + } + + /** + * Set the bit at a given index to provided value (1 or 0). Internally + * takes care of allocating the buffer if the caller didn't do so. + * + * @param validityBuffer + * @param allocator + * @param valueCount + * @param index + * @param value + * @return ArrowBuf + */ + public static ArrowBuf setValidityBit(ArrowBuf validityBuffer, BufferAllocator allocator, + int valueCount, int index, int value) { + if (validityBuffer == null) { + validityBuffer = allocator.buffer(getValidityBufferSize(valueCount)); + } + setValidityBit(validityBuffer, index, value); + if (index == (valueCount - 1)) { + validityBuffer.writerIndex(getValidityBufferSize(valueCount)); + } + + return validityBuffer; + } + + /** + * Check if a bit at a given index is set or not. + * + * @param buffer + * @param index + * @return 1 if bit is set, 0 otherwise. 
+ */ + public static int get(final ArrowBuf buffer, int index) { + final int byteIndex = index >> 3; + final byte b = buffer.getByte(byteIndex); + final int bitIndex = index & 7; + return Long.bitCount(b & (1L << bitIndex)); + } + + /** + * Compute the size of validity buffer required to manage a given number + * of elements in a vector. + * + * @param valueCount + * @return buffer size + */ + public static int getValidityBufferSize(int valueCount) { + return ((int) Math.ceil(valueCount / 8.0)); + } + + /** + * Given a validity buffer, find the number of bits that are not set. + * This is used to compute the number of null elements in a nullable vector. + * + * @param validityBuffer + * @param valueCount + * @return number of bits not set. + */ + public static int getNullCount(final ArrowBuf validityBuffer, final int valueCount) { + if (valueCount == 0) { + return 0; + } + int count = 0; + final int sizeInBytes = getValidityBufferSize(valueCount); + + for (int i = 0; i < sizeInBytes; ++i) { + final byte byteValue = validityBuffer.getByte(i); + /* Java uses two's complement binary representation, hence 11111111_b which is -1 + * when converted to Int will have 32bits set to 1. Masking the MSB and then + * adding it back solves the issue. + */ + count += Integer.bitCount(byteValue & 0x7F) - (byteValue >> 7); + } + int nullCount = (sizeInBytes * 8) - count; + /* if the valueCount is not a multiple of 8, + * the bits on the right were counted as null bits. + */ + int remainder = valueCount % 8; + nullCount -= remainder == 0 ? 
0 : 8 - remainder; + return nullCount; + } + + public static byte getBitsFromCurrentByte(final ArrowBuf data, final int index, final int offset) { + return (byte) ((data.getByte(index) & 0xFF) >>> offset); + } + + public static byte getBitsFromNextByte(ArrowBuf data, int index, int offset) { + return (byte) ((data.getByte(index) << (8 - offset))); + } + + public static ArrowBuf loadValidityBuffer(final ArrowFieldNode fieldNode, + final ArrowBuf sourceValidityBuffer, + final BufferAllocator allocator) { + final int valueCount = fieldNode.getLength(); + ArrowBuf newBuffer = null; + /* either all NULLs or all non-NULLs */ + if (fieldNode.getNullCount() == 0 || fieldNode.getNullCount() == valueCount) { + newBuffer = allocator.buffer(getValidityBufferSize(valueCount)); + newBuffer.setZero(0, newBuffer.capacity()); + if (fieldNode.getNullCount() != 0) { + /* all NULLs */ + return newBuffer; + } + /* all non-NULLs */ + int fullBytesCount = valueCount / 8; + for (int i = 0; i < fullBytesCount; ++i) { + newBuffer.setByte(i, 0xFF); + } + int remainder = valueCount % 8; + if (remainder > 0) { + byte bitMask = (byte) (0xFFL >>> ((8 - remainder) & 7)); + newBuffer.setByte(fullBytesCount, bitMask); + } + } else { + /* mixed byte pattern -- create another ArrowBuf associated with the + * target allocator + */ + newBuffer = sourceValidityBuffer.retain(allocator); + } + + return newBuffer; + } + + /** + * Set the byte of the given index in the data buffer by applying a bit mask to + * the current byte at that index. 
+ * + * @param data + * @param byteIndex + * @param bitMask + */ + static void setBitMaskedByte(ArrowBuf data, int byteIndex, byte bitMask) { + byte currentByte = data.getByte(byteIndex); + currentByte |= bitMask; + data.setByte(byteIndex, currentByte); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java b/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java index a0dbf2bdcf101..332ca228a43fd 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java @@ -18,7 +18,7 @@ package org.apache.arrow.vector; -import org.apache.arrow.vector.schema.ArrowFieldNode; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; import io.netty.buffer.ArrowBuf; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java new file mode 100644 index 0000000000000..f6529d8e55bba --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java @@ -0,0 +1,134 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import com.google.common.base.Preconditions; + +public class BufferLayout { + + public enum BufferType { + DATA("DATA"), + OFFSET("OFFSET"), + VALIDITY("VALIDITY"), + TYPE("TYPE"); + + final private String name; + + BufferType(String name) { + this.name = name; + } + + public String getName() { + return name; + } + } + + private static final BufferLayout VALIDITY_BUFFER = new BufferLayout(BufferType.VALIDITY, 1); + private static final BufferLayout OFFSET_BUFFER = new BufferLayout(BufferType.OFFSET, 32); + private static final BufferLayout TYPE_BUFFER = new BufferLayout(BufferType.TYPE, 32); + private static final BufferLayout BIT_BUFFER = new BufferLayout(BufferType.DATA, 1); + private static final BufferLayout VALUES_128 = new BufferLayout(BufferType.DATA, 128); + private static final BufferLayout VALUES_64 = new BufferLayout(BufferType.DATA, 64); + private static final BufferLayout VALUES_32 = new BufferLayout(BufferType.DATA, 32); + private static final BufferLayout VALUES_16 = new BufferLayout(BufferType.DATA, 16); + private static final BufferLayout VALUES_8 = new BufferLayout(BufferType.DATA, 8); + + public static BufferLayout typeBuffer() { + return TYPE_BUFFER; + } + + public static BufferLayout offsetBuffer() { + return OFFSET_BUFFER; + } + + public static BufferLayout dataBuffer(int typeBitWidth) { + switch (typeBitWidth) { + case 8: + return VALUES_8; + case 16: + return VALUES_16; + case 32: + return VALUES_32; + case 64: + return VALUES_64; + case 128: + return VALUES_128; + default: + throw new IllegalArgumentException("only 8, 16, 32, or 64 bits supported"); + } + } + + public static BufferLayout booleanVector() { + return BIT_BUFFER; + } + + public static BufferLayout validityVector() { + return VALIDITY_BUFFER; + } + + public static BufferLayout byteVector() { + return dataBuffer(8); + } + + private final short typeBitWidth; + + private final BufferType type; + + private BufferLayout(BufferType type, int 
typeBitWidth) { + super(); + this.type = Preconditions.checkNotNull(type); + this.typeBitWidth = (short) typeBitWidth; + if (typeBitWidth <= 0) { + throw new IllegalArgumentException("bitWidth invalid: " + typeBitWidth); + } + } + + public int getTypeBitWidth() { + return typeBitWidth; + } + + public BufferType getType() { + return type; + } + + @Override + public String toString() { + return String.format("%s(%s)", type, typeBitWidth); + } + + @Override + public int hashCode() { + return 31 * (31 + type.hashCode()) + typeBitWidth; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + BufferLayout other = (BufferLayout) obj; + return type.equals(other.type) && (typeBitWidth == other.typeBitWidth); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/DateDayVector.java b/java/vector/src/main/java/org/apache/arrow/vector/DateDayVector.java new file mode 100644 index 0000000000000..ed8956c1eaa3d --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/DateDayVector.java @@ -0,0 +1,368 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.DateDayReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.DateDayHolder; +import org.apache.arrow.vector.holders.NullableDateDayHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.slf4j.Logger; + +/** + * DateDayVector implements a fixed width (4 bytes) vector of + * date values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public class DateDayVector extends BaseFixedWidthVector { + private static final byte TYPE_WIDTH = 4; + private final FieldReader reader; + + /** + * Instantiate a DateDayVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public DateDayVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(Types.MinorType.DATEDAY.getType()), + allocator); + } + + /** + * Instantiate a DateDayVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. 
+ */ + public DateDayVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, TYPE_WIDTH); + reader = new DateDayReaderImpl(DateDayVector.this); + } + + /** + * Get a reader that supports reading values from this vector + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.DATEDAY; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + * @throws IllegalStateException if the element at the given index is null + */ + public int get(int index) throws IllegalStateException { + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getInt(index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + * @param holder holder that receives isSet and, for a non-null element, the value + */ + public void get(int index, NullableDateDayHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getInt(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)} except that a null element is returned as + * {@code null} instead of raising an exception. 
+ * + * @param index position of element + * @return element at given index + */ + public Integer getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getInt(index * TYPE_WIDTH); + } + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector. The value bytes are copied even when the source + * element is null; only the copied validity bit marks the element as null. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFrom(int fromIndex, int thisIndex, DateDayVector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + final int value = from.valueBuffer.getInt(fromIndex * TYPE_WIDTH); + valueBuffer.setInt(thisIndex * TYPE_WIDTH, value); + } + + /** + * Same as {@link #copyFrom(int, int, DateDayVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFromSafe(int fromIndex, int thisIndex, DateDayVector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + private void setValue(int index, int value) { + valueBuffer.setInt(index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, int value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. 
+ * If the value in holder is not indicated as set, the element + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + * @throws IllegalArgumentException if holder.isSet is negative + */ + public void set(int index, NullableDateDayHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, DateDayHolder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, int)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, int value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableDateDayHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + * @throws IllegalArgumentException if holder.isSet is negative + */ + public void setSafe(int index, NullableDateDayHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, DateDayHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. 
+ * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, DateDayHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + /* not really needed to set the bit to 0 as long as + * the buffer always starts from 0. + */ + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void set(int index, int isSet, int value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Same as {@link #set(int, int, int)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, int value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + * This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. 
+ */ + public static int get(final ArrowBuf buffer, final int index) { + return buffer.getInt(index * TYPE_WIDTH); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + /** + * Construct a TransferPair comprising this vector and a new target vector of + * the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * @param to target vector; it is cast to DateDayVector, so passing any + * other vector type fails with a ClassCastException + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((DateDayVector) to); + } + + /** + * TransferPair whose operations delegate to the enclosing vector's + * transferTo / splitAndTransferTo and to the target's copyFromSafe. + */ + private class TransferImpl implements TransferPair { + DateDayVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new DateDayVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(DateDayVector to) { + this.to = to; + } + + @Override + public DateDayVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, DateDayVector.this); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/DateMilliVector.java b/java/vector/src/main/java/org/apache/arrow/vector/DateMilliVector.java new file mode 100644 index 0000000000000..f21b58f85fb44 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/DateMilliVector.java @@ -0,0 +1,373 @@ +/** + * Licensed to the Apache Software 
Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.DateMilliReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.DateMilliHolder; +import org.apache.arrow.vector.holders.NullableDateMilliHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.joda.time.LocalDateTime; +import org.joda.time.LocalDateTimes; +import org.slf4j.Logger; + +/** + * DateMilliVector implements a fixed width vector (8 bytes) of + * date values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public class DateMilliVector extends BaseFixedWidthVector { + private static final byte TYPE_WIDTH = 8; + private final FieldReader reader; + + /** + * Instantiate a DateMilliVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public DateMilliVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(Types.MinorType.DATEMILLI.getType()), + allocator); + } + + /** + * Instantiate a DateMilliVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. 
+ */ + public DateMilliVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, TYPE_WIDTH); + reader = new DateMilliReaderImpl(DateMilliVector.this); + } + + /** + * Get a reader that supports reading values from this vector + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.DATEMILLI; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + * @throws IllegalStateException if the element at the given index is null + */ + public long get(int index) throws IllegalStateException { + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getLong(index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + * @param holder holder that receives isSet and, for a non-null element, the value + */ + public void get(int index, NullableDateMilliHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getLong(index * TYPE_WIDTH); + } + + /** + * Get the element at the given index as a LocalDateTime built from the + * stored millisecond value in UTC, or {@code null} if the element is null. 
+ * + * @param index position of element + * @return element at given index + */ + public LocalDateTime getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + final long millis = valueBuffer.getLong(index * TYPE_WIDTH); + final LocalDateTime localDateTime = new org.joda.time.LocalDateTime(millis, + org.joda.time.DateTimeZone.UTC); + return localDateTime; + } + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFrom(int fromIndex, int thisIndex, DateMilliVector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + final long value = from.valueBuffer.getLong(fromIndex * TYPE_WIDTH); + valueBuffer.setLong(thisIndex * TYPE_WIDTH, value); + } + + /** + * Same as {@link #copyFrom(int, int, DateMilliVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFromSafe(int fromIndex, int thisIndex, DateMilliVector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + private void setValue(int index, long value) { + valueBuffer.setLong(index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value. 
+ * + * @param index position of element + * @param value value of element + */ + public void set(int index, long value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + * @throws IllegalArgumentException if holder.isSet is negative + */ + public void set(int index, NullableDateMilliHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, DateMilliHolder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, long)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, long value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableDateMilliHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. 
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableDateMilliHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, DateMilliHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, DateMilliHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + /* not really needed to set the bit to 0 as long as + * the buffer always starts from 0. + */ + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void set(int index, int isSet, long value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Same as {@link #set(int, int, long)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, long value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + * This method should not be used externally. 
+ * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + public static long get(final ArrowBuf buffer, final int index) { + return buffer.getLong(index * TYPE_WIDTH); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + /** + * Construct a TransferPair comprising this vector and a new target vector of + * the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * @param to target vector; it is cast to DateMilliVector, so passing any + * other vector type fails with a ClassCastException + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((DateMilliVector) to); + } + + /** + * TransferPair whose operations delegate to the enclosing vector's + * transferTo / splitAndTransferTo and to the target's copyFromSafe. + */ + private class TransferImpl implements TransferPair { + DateMilliVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new DateMilliVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(DateMilliVector to) { + this.to = to; + } + + @Override + public DateMilliVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, DateMilliVector.this); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/DecimalVector.java b/java/vector/src/main/java/org/apache/arrow/vector/DecimalVector.java new file mode 100644 index 0000000000000..a043575081fb3 --- /dev/null +++ 
b/java/vector/src/main/java/org/apache/arrow/vector/DecimalVector.java @@ -0,0 +1,470 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.DecimalReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.DecimalHolder; +import org.apache.arrow.vector.holders.NullableDecimalHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.DecimalUtility; +import org.apache.arrow.vector.util.TransferPair; + +import java.math.BigDecimal; + +/** + * DecimalVector implements a fixed width vector (16 bytes) of + * decimal values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public class DecimalVector extends BaseFixedWidthVector { + public static final byte TYPE_WIDTH = 16; + private final FieldReader reader; + + private final int precision; + private final int scale; + + /** + * Instantiate a DecimalVector with a nullable decimal field type built + * from the given precision and scale. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + * @param precision precision of the decimal type + * @param scale scale of the decimal type + */ + public DecimalVector(String name, BufferAllocator allocator, + int precision, int scale) { + this(name, FieldType.nullable(new org.apache.arrow.vector.types.pojo.ArrowType.Decimal(precision, scale)), + allocator); + } + + /** + * Instantiate a DecimalVector. This doesn't allocate any memory for + * the data in vector. 
+ * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public DecimalVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, TYPE_WIDTH); + org.apache.arrow.vector.types.pojo.ArrowType.Decimal arrowType = (org.apache.arrow.vector.types.pojo.ArrowType.Decimal) fieldType.getType(); + reader = new DecimalReaderImpl(DecimalVector.this); + this.precision = arrowType.getPrecision(); + this.scale = arrowType.getScale(); + } + + /** + * Get a reader that supports reading values from this vector + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.DECIMAL; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + */ + public ArrowBuf get(int index) throws IllegalStateException { + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.slice(index * TYPE_WIDTH, TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. 
+ * + * @param index position of element + */ + public void get(int index, NullableDecimalHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.buffer = valueBuffer; + holder.precision = precision; + holder.scale = scale; + holder.start = index * TYPE_WIDTH; + } + + /** + * Same as {@link #get(int)}. + * + * @param index position of element + * @return element at given index + */ + public BigDecimal getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return DecimalUtility.getBigDecimalFromArrowBuf(valueBuffer, index, scale); + } + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFrom(int fromIndex, int thisIndex, DecimalVector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + from.valueBuffer.getBytes(fromIndex * TYPE_WIDTH, valueBuffer, + thisIndex * TYPE_WIDTH, TYPE_WIDTH); + } + + /** + * Same as {@link #copyFrom(int, int, DecimalVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFromSafe(int fromIndex, int thisIndex, DecimalVector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + /** + * Return scale for the decimal value + */ + public int getScale() { + return scale; + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + /** + * Set the element at the given index to the given value. 
+ * + * @param index position of element + * @param buffer ArrowBuf containing decimal value. + */ + public void set(int index, ArrowBuf buffer) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + valueBuffer.setBytes(index * TYPE_WIDTH, buffer, 0, TYPE_WIDTH); + } + + /** + * Set the decimal element at given index to the provided array of bytes. + * Decimal is now implemented as Little Endian. This API allows the user + * to pass a decimal value in the form of byte array in BE byte order. + * + * Consumers of Arrow code can use this API instead of first swapping + * the source bytes (doing a write and read) and then finally writing to + * ArrowBuf of decimal vector. + * + * This method takes care of adding the necessary padding if the length + * of byte array is less than 16 (length of decimal type). The padding is a + * sign extension: 0xFF bytes when the most significant input byte is + * negative, zero bytes otherwise, so that a short negative value keeps + * its sign (assumes the input is two's complement, e.g. the output of + * BigInteger.toByteArray()). + * + * @param index position of element + * @param value array of bytes containing decimal in big endian byte order. + * @throws IllegalArgumentException if value is longer than 16 bytes + */ + public void setBigEndian(int index, byte[] value) { + final int length = value.length; + /* explicit check instead of an assert: asserts are disabled by default and + * a longer array would overwrite the neighbouring elements. + */ + if (length > TYPE_WIDTH) { + throw new IllegalArgumentException("Invalid decimal value length: " + length + ", expected at most " + TYPE_WIDTH + " bytes"); + } + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + int startIndex = index * TYPE_WIDTH; + if (length == TYPE_WIDTH) { + for (int i = TYPE_WIDTH - 1; i >= 3; i-=4) { + valueBuffer.setByte(startIndex, value[i]); + valueBuffer.setByte(startIndex + 1, value[i-1]); + valueBuffer.setByte(startIndex + 2, value[i-2]); + valueBuffer.setByte(startIndex + 3, value[i-3]); + startIndex += 4; + } + } else { + /* reverse the input so the least significant byte is written first */ + for (int i = length - 1; i >= 0; i--) { + valueBuffer.setByte(startIndex, value[i]); + startIndex++; + } + /* fill the remaining high-order bytes with the sign of the input */ + final byte pad = (byte) ((length > 0 && value[0] < 0) ? 0xFF : 0x00); + for (int i = length; i < TYPE_WIDTH; i++) { + valueBuffer.setByte(startIndex, pad); + startIndex++; + } + } + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param start start index of data in the buffer + * @param buffer ArrowBuf containing decimal value. 
+ */ + public void set(int index, int start, ArrowBuf buffer) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + valueBuffer.setBytes(index * TYPE_WIDTH, buffer, start, TYPE_WIDTH); + } + + /** + * Set the element at the given index to the given value. The value is + * checked against this vector's precision and scale before anything is + * written. + * + * @param index position of element + * @param value BigDecimal containing decimal value. + */ + public void set(int index, BigDecimal value) { + /* validate first so that a rejected value does not leave the element + * marked as set over stale bytes (assumes checkPrecisionAndScale throws + * on a mismatch -- TODO confirm against DecimalUtility). + */ + DecimalUtility.checkPrecisionAndScale(value, precision, scale); + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + DecimalUtility.writeBigDecimalToArrowBuf(value, valueBuffer, index); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + * @throws IllegalArgumentException if holder.isSet is negative + */ + public void set(int index, NullableDecimalHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + valueBuffer.setBytes(index * TYPE_WIDTH, holder.buffer, holder.start, TYPE_WIDTH); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, DecimalHolder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + valueBuffer.setBytes(index * TYPE_WIDTH, holder.buffer, holder.start, TYPE_WIDTH); + } + + /** + * Same as {@link #set(int, ArrowBuf)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param buffer ArrowBuf containing decimal value. 
+ */ + public void setSafe(int index, ArrowBuf buffer) { + handleSafe(index); + set(index, buffer); + } + + /** + * Same as {@link #setBigEndian(int, byte[])} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value array of bytes containing decimal in big endian byte order. + */ + public void setBigEndianSafe(int index, byte[] value) { + handleSafe(index); + setBigEndian(index, value); + } + + /** + * Same as {@link #set(int, int, ArrowBuf)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param start start index of data in the buffer + * @param buffer ArrowBuf containing decimal value. + */ + public void setSafe(int index, int start, ArrowBuf buffer) { + handleSafe(index); + set(index, start, buffer); + } + + /** + * Same as {@link #set(int, BigDecimal)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value BigDecimal containing decimal value. + */ + public void setSafe(int index, BigDecimal value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableDecimalHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableDecimalHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, DecimalHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, DecimalHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + /* not really needed to set the bit to 0 as long as + * the buffer always starts from 0. + */ + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * @param index position of the new value + * @param isSet greater than 0 to store the value; any other value marks the element as NULL + * @param start start position of the value in the buffer + * @param buffer buffer containing the value to be stored in the vector + */ + public void set(int index, int isSet, int start, ArrowBuf buffer) { + if (isSet > 0) { + set(index, start, buffer); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Same as {@link #set(int, int, int, ArrowBuf)} except that it handles + * the case when the position of new value is beyond the current value + * capacity of the vector. + * @param index position of the new value + * @param isSet greater than 0 to store the value; any other value marks the element as NULL + * @param start start position of the value in the buffer + * @param buffer buffer containing the value to be stored in the vector + */ + public void setSafe(int index, int isSet, int start, ArrowBuf buffer) { + handleSafe(index); + set(index, isSet, start, buffer); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + /** + * Construct a TransferPair comprising of this and and a target vector of + * the same type.
+ * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((DecimalVector) to); + } + + private class TransferImpl implements TransferPair { + DecimalVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new DecimalVector(ref, allocator, DecimalVector.this.precision, + DecimalVector.this.scale); + } + + public TransferImpl(DecimalVector to) { + this.to = to; + } + + @Override + public DecimalVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, DecimalVector.this); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java b/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java index c2ed17eb4dd31..509eeda75000b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java @@ -20,7 +20,7 @@ import java.util.List; -import org.apache.arrow.vector.schema.ArrowFieldNode; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; import org.apache.arrow.vector.types.pojo.Field; import io.netty.buffer.ArrowBuf; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/Float4Vector.java b/java/vector/src/main/java/org/apache/arrow/vector/Float4Vector.java 
new file mode 100644 index 0000000000000..dc78bfde2a228 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/Float4Vector.java @@ -0,0 +1,367 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.Float4ReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.Float4Holder; +import org.apache.arrow.vector.holders.NullableFloat4Holder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * Float4Vector implements a fixed width vector (4 bytes) of + * float values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public class Float4Vector extends BaseFixedWidthVector { + public static final byte TYPE_WIDTH = 4; + private final FieldReader reader; + + /** + * Instantiate a Float4Vector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public Float4Vector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(Types.MinorType.FLOAT4.getType()), + allocator); + } + + /** + * Instantiate a Float4Vector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. 
+ */ + public Float4Vector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, TYPE_WIDTH); + reader = new Float4ReaderImpl(Float4Vector.this); + } + + /** + * Get a reader that supports reading values from this vector + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.FLOAT4; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + */ + public float get(int index) throws IllegalStateException { + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getFloat(index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableFloat4Holder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getFloat(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. 
+ * + * @param index position of element + * @return element at given index + */ + public Float getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getFloat(index * TYPE_WIDTH); + } + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFrom(int fromIndex, int thisIndex, Float4Vector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + final float value = from.valueBuffer.getFloat(fromIndex * TYPE_WIDTH); + valueBuffer.setFloat(thisIndex * TYPE_WIDTH, value); + } + + /** + * Same as {@link #copyFrom(int, int, Float4Vector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFromSafe(int fromIndex, int thisIndex, Float4Vector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + private void setValue(int index, float value) { + valueBuffer.setFloat(index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, float value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. 
+ * If the value in holder is not indicated as set, element in the + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableFloat4Holder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, Float4Holder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, float)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, float value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableFloat4Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableFloat4Holder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, Float4Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. 
+ * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, Float4Holder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + /* not really needed to set the bit to 0 as long as + * the buffer always starts from 0. + */ + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void set(int index, int isSet, float value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Same as {@link #set(int, int, float)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, float value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + * This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. 
+ */ + public static float get(final ArrowBuf buffer, final int index) { + return buffer.getFloat(index * TYPE_WIDTH); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + /** + * Construct a TransferPair comprising of this and and a target vector of + * the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((Float4Vector) to); + } + + private class TransferImpl implements TransferPair { + Float4Vector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new Float4Vector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(Float4Vector to) { + this.to = to; + } + + @Override + public Float4Vector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, Float4Vector.this); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/Float8Vector.java b/java/vector/src/main/java/org/apache/arrow/vector/Float8Vector.java new file mode 100644 index 0000000000000..1b410b868795d --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/Float8Vector.java @@ -0,0 +1,367 @@ +/** + * Licensed to the Apache Software Foundation 
(ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.Float8ReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.Float8Holder; +import org.apache.arrow.vector.holders.NullableFloat8Holder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * Float8Vector implements a fixed width vector (8 bytes) of + * double values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public class Float8Vector extends BaseFixedWidthVector { + public static final byte TYPE_WIDTH = 8; + private final FieldReader reader; + + /** + * Instantiate a Float8Vector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public Float8Vector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(Types.MinorType.FLOAT8.getType()), + allocator); + } + + /** + * Instantiate a Float8Vector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. 
+ */ + public Float8Vector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, TYPE_WIDTH); + reader = new Float8ReaderImpl(Float8Vector.this); + } + + /** + * Get a reader that supports reading values from this vector + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.FLOAT8; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + */ + public double get(int index) throws IllegalStateException { + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getDouble(index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableFloat8Holder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getDouble(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. 
+ * + * @param index position of element + * @return element at given index + */ + public Double getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getDouble(index * TYPE_WIDTH); + } + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFrom(int fromIndex, int thisIndex, Float8Vector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + final double value = from.valueBuffer.getDouble(fromIndex * TYPE_WIDTH); + valueBuffer.setDouble(thisIndex * TYPE_WIDTH, value); + } + + /** + * Same as {@link #copyFrom(int, int, Float8Vector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFromSafe(int fromIndex, int thisIndex, Float8Vector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + private void setValue(int index, double value) { + valueBuffer.setDouble(index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, double value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. 
+ * If the value in holder is not indicated as set, element in the + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableFloat8Holder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, Float8Holder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, double)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, double value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableFloat8Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableFloat8Holder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, Float8Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. 
+ * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, Float8Holder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + /* not really needed to set the bit to 0 as long as + * the buffer always starts from 0. + */ + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void set(int index, int isSet, double value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Same as {@link #set(int, int, double)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, double value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + * This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. 
+ */ + public static double get(final ArrowBuf buffer, final int index) { + return buffer.getDouble(index * TYPE_WIDTH); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + /** + * Construct a TransferPair comprising of this and and a target vector of + * the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((Float8Vector) to); + } + + private class TransferImpl implements TransferPair { + Float8Vector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new Float8Vector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(Float8Vector to) { + this.to = to; + } + + @Override + public Float8Vector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, Float8Vector.this); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/GenerateSampleData.java b/java/vector/src/main/java/org/apache/arrow/vector/GenerateSampleData.java new file mode 100644 index 0000000000000..89e2a02f6ac2d --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/GenerateSampleData.java @@ -0,0 +1,336 @@ +/** + * Licensed to the Apache 
Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import java.math.BigDecimal; +import java.nio.charset.Charset; + +/** + * Helper class to generate test data for Nullable fixed and variable + * width scalar vectors. Previous implementations of java vector classes + * provided generateTestData(now deprecated) API to populate the vector + * with sample data. This class should be used for that purpose. + */ +public class GenerateSampleData { + + public static void generateTestData(final ValueVector vector, final int valueCount) { + if (vector instanceof IntVector) { + writeIntData((IntVector) vector, valueCount); + } else if (vector instanceof DecimalVector) { + writeDecimalData((DecimalVector) vector, valueCount); + } else if (vector instanceof BitVector) { + writeBooleanData((BitVector) vector, valueCount); + } else if (vector instanceof VarCharVector) { + writeVarCharData((VarCharVector) vector, valueCount); + } else if (vector instanceof VarBinaryVector) { + writeVarBinaryData((VarBinaryVector) vector, valueCount); + } else if (vector instanceof BigIntVector) { + writeBigIntData((BigIntVector) vector, valueCount); + } else if (vector instanceof Float4Vector) { + writeFloatData((Float4Vector) vector, valueCount); + } else if (vector instanceof Float8Vector) { + writeDoubleData((Float8Vector) vector, valueCount); + } else if (vector instanceof DateDayVector) { + writeDateDayData((DateDayVector) vector, valueCount); + } else if (vector instanceof DateMilliVector) { + writeDateMilliData((DateMilliVector) vector, valueCount); + } else if (vector instanceof IntervalDayVector) { + writeIntervalDayData((IntervalDayVector) vector, valueCount); + } else if 
(vector instanceof IntervalYearVector) { + writeIntervalYearData((IntervalYearVector) vector, valueCount); + } else if (vector instanceof SmallIntVector) { + writeSmallIntData((SmallIntVector) vector, valueCount); + } else if (vector instanceof TinyIntVector) { + writeTinyIntData((TinyIntVector) vector, valueCount); + } else if (vector instanceof TimeMicroVector) { + writeTimeMicroData((TimeMicroVector) vector, valueCount); + } else if (vector instanceof TimeMilliVector) { + writeTimeMilliData((TimeMilliVector) vector, valueCount); + } else if (vector instanceof TimeNanoVector) { + writeTimeNanoData((TimeNanoVector) vector, valueCount); + } else if (vector instanceof TimeSecVector) { + writeTimeSecData((TimeSecVector) vector, valueCount); + } else if (vector instanceof TimeStampSecVector) { + writeTimeStampData((TimeStampSecVector) vector, valueCount); + } else if (vector instanceof TimeStampMicroVector) { + writeTimeStampData((TimeStampMicroVector) vector, valueCount); + } else if (vector instanceof TimeStampMilliVector) { + writeTimeStampData((TimeStampMilliVector) vector, valueCount); + } else if (vector instanceof TimeStampNanoVector) { + writeTimeStampData((TimeStampNanoVector) vector, valueCount); + } else if (vector instanceof TimeStampSecTZVector) { + writeTimeStampData((TimeStampSecTZVector) vector, valueCount); + } else if (vector instanceof TimeStampMicroTZVector) { + writeTimeStampData((TimeStampMicroTZVector) vector, valueCount); + } else if (vector instanceof TimeStampMilliTZVector) { + writeTimeStampData((TimeStampMilliTZVector) vector, valueCount); + } else if (vector instanceof TimeStampNanoTZVector) { + writeTimeStampData((TimeStampNanoTZVector) vector, valueCount); + } + } + + private static void writeTimeStampData(TimeStampVector vector, int valueCount) { + final long even = 100000; + final long odd = 200000; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + 
vector.setValueCount(valueCount); + } + + private static void writeDecimalData(DecimalVector vector, int valueCount) { + final BigDecimal even = new BigDecimal(0.0543278923); + final BigDecimal odd = new BigDecimal(2.0543278923); + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeIntData(IntVector vector, int valueCount) { + final int even = 1000; + final int odd = 2000; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeBooleanData(BitVector vector, int valueCount) { + final int even = 0; + final int odd = 1; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeIntervalYearData(IntervalYearVector vector, int valueCount) { + final int even = 1; + final int odd = 2; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeIntervalDayData(IntervalDayVector vector, int valueCount) { + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, 1, 50); + } else { + vector.setSafe(i, 2, 100); + } + } + vector.setValueCount(valueCount); + } + + private static void writeTimeSecData(TimeSecVector vector, int valueCount) { + final int even = 500; + final int odd = 900; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeTimeMilliData(TimeMilliVector vector, int valueCount) { + final int even = 1000; + final int odd = 2000; 
+ for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeTimeMicroData(TimeMicroVector vector, int valueCount) { + final long even = 1000000000; + final long odd = 2000000000; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + + } + + private static void writeTimeNanoData(TimeNanoVector vector, int valueCount) { + final long even = 1000000000; + final long odd = 2000000000; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeDateDayData(DateDayVector vector, int valueCount) { + final int even = 1000; + final int odd = 2000; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeDateMilliData(DateMilliVector vector, int valueCount) { + final long even = 1000000000; + final long odd = 2000000000; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeSmallIntData(SmallIntVector vector, int valueCount) { + final short even = 10; + final short odd = 20; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeTinyIntData(TinyIntVector vector, int valueCount) { + final byte even = 1; + final byte odd = 2; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } 
+ } + vector.setValueCount(valueCount); + } + + private static void writeBigIntData(BigIntVector vector, int valueCount) { + final long even = 1000000000; + final long odd = 2000000000; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeFloatData(Float4Vector vector, int valueCount) { + final float even = 20.3f; + final float odd = 40.2f; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeDoubleData(Float8Vector vector, int valueCount) { + final double even = 20.2373; + final double odd = 40.2378; + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeVarBinaryData(VarBinaryVector vector, int valueCount) { + Charset utf8Charset = Charset.forName("UTF-8"); + final byte[] even = "AAAAA1".getBytes(utf8Charset); + final byte[] odd = "BBBBBBBBB2".getBytes(utf8Charset); + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } + + private static void writeVarCharData(VarCharVector vector, int valueCount) { + Charset utf8Charset = Charset.forName("UTF-8"); + final byte[] even = "AAAAA1".getBytes(utf8Charset); + final byte[] odd = "BBBBBBBBB2".getBytes(utf8Charset); + for (int i = 0; i < valueCount; i++) { + if (i % 2 == 0) { + vector.setSafe(i, even); + } else { + vector.setSafe(i, odd); + } + } + vector.setValueCount(valueCount); + } +} + diff --git a/java/vector/src/main/java/org/apache/arrow/vector/IntVector.java b/java/vector/src/main/java/org/apache/arrow/vector/IntVector.java new file mode 100644 index 
0000000000000..2364310e6d5fc --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/IntVector.java @@ -0,0 +1,377 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.IntReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.IntHolder; +import org.apache.arrow.vector.holders.NullableIntHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * IntVector implements a fixed width (4 bytes) vector of + * integer values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public class IntVector extends BaseFixedWidthVector { + public static final byte TYPE_WIDTH = 4; + private final FieldReader reader; + + /** + * Instantiate a IntVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public IntVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(org.apache.arrow.vector.types.Types.MinorType.INT.getType()), + allocator); + } + + /** + * Instantiate a IntVector. This doesn't allocate any memory for + * the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. 
+ */ + public IntVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, TYPE_WIDTH); + reader = new IntReaderImpl(IntVector.this); + } + + /** + * Get a reader that supports reading values from this vector + * + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.INT; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + */ + public int get(int index) throws IllegalStateException { + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getInt(index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableIntHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getInt(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. 
+ * + * @param index position of element + * @return element at given index + */ + public Integer getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getInt(index * TYPE_WIDTH); + } + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector + * + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFrom(int fromIndex, int thisIndex, IntVector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + final int value = from.valueBuffer.getInt(fromIndex * TYPE_WIDTH); + valueBuffer.setInt(thisIndex * TYPE_WIDTH, value); + } + + /** + * Same as {@link #copyFrom(int, int, IntVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFromSafe(int fromIndex, int thisIndex, IntVector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + private void setValue(int index, int value) { + valueBuffer.setInt(index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, int value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. 
+ * If the value in holder is not indicated as set, element in the + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableIntHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, IntHolder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, int)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, int value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableIntHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableIntHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, IntHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. 
+ * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, IntHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + /* not really needed to set the bit to 0 as long as + * the buffer always starts from 0. + */ + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void set(int index, int isSet, int value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Same as {@link #set(int, int, int)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, int value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + *

+ * This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + public static int get(final ArrowBuf buffer, final int index) { + return buffer.getInt(index * TYPE_WIDTH); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + /** + * Construct a TransferPair comprising of this and and a target vector of + * the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((IntVector) to); + } + + private class TransferImpl implements TransferPair { + IntVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new IntVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(IntVector to) { + this.to = to; + } + + @Override + public IntVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, IntVector.this); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/IntervalDayVector.java b/java/vector/src/main/java/org/apache/arrow/vector/IntervalDayVector.java new file mode 100644 index 0000000000000..481a66f4e7b29 
--- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/IntervalDayVector.java @@ -0,0 +1,425 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.IntervalDayReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.IntervalDayHolder; +import org.apache.arrow.vector.holders.NullableIntervalDayHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.joda.time.Period; + +/** + * IntervalDayVector implements a fixed width vector (8 bytes) of + * interval (days and milliseconds) values which could be null. + * A validity buffer (bit vector) is maintained to track which elements in the + * vector are null. + */ +public class IntervalDayVector extends BaseFixedWidthVector { + private static final byte TYPE_WIDTH = 8; + private static final byte MILLISECOND_OFFSET = 4; + private final FieldReader reader; + + /** + * Instantiate a IntervalDayVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public IntervalDayVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(Types.MinorType.INTERVALDAY.getType()), + allocator); + } + + /** + * Instantiate a IntervalDayVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. 
+ */ + public IntervalDayVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, TYPE_WIDTH); + reader = new IntervalDayReaderImpl(IntervalDayVector.this); + } + + /** + * Get a reader that supports reading values from this vector + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.INTERVALDAY; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + */ + public ArrowBuf get(int index) throws IllegalStateException { + if (isSet(index) == 0) { + return null; + } + return valueBuffer.slice(index * TYPE_WIDTH, TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableIntervalDayHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + final int startIndex = index * TYPE_WIDTH; + holder.isSet = 1; + holder.days = valueBuffer.getInt(startIndex); + holder.milliseconds = valueBuffer.getInt(startIndex + MILLISECOND_OFFSET); + } + + /** + * Same as {@link #get(int)}. 
+ * + * @param index position of element + * @return element at given index + */ + public Period getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + final int startIndex = index * TYPE_WIDTH; + final int days = valueBuffer.getInt(startIndex); + final int milliseconds = valueBuffer.getInt(startIndex + MILLISECOND_OFFSET); + final Period p = new Period(); + return p.plusDays(days).plusMillis(milliseconds); + } + } + + /** + * Get the Interval value at a given index as a {@link StringBuilder} object + * @param index position of the element + * @return String Builder object with Interval value as + * [days, hours, minutes, seconds, millis] + */ + public StringBuilder getAsStringBuilder(int index) { + if (isSet(index) == 0) { + return null; + } else { + return getAsStringBuilderHelper(index); + } + } + + private StringBuilder getAsStringBuilderHelper(int index) { + final int startIndex = index * TYPE_WIDTH; + + final int days = valueBuffer.getInt(startIndex); + int millis = valueBuffer.getInt(startIndex + MILLISECOND_OFFSET); + + final int hours = millis / (org.apache.arrow.vector.util.DateUtility.hoursToMillis); + millis = millis % (org.apache.arrow.vector.util.DateUtility.hoursToMillis); + + final int minutes = millis / (org.apache.arrow.vector.util.DateUtility.minutesToMillis); + millis = millis % (org.apache.arrow.vector.util.DateUtility.minutesToMillis); + + final int seconds = millis / (org.apache.arrow.vector.util.DateUtility.secondsToMillis); + millis = millis % (org.apache.arrow.vector.util.DateUtility.secondsToMillis); + + final String dayString = (Math.abs(days) == 1) ? " day " : " days "; + + return (new StringBuilder(). + append(days).append(dayString). + append(hours).append(":"). + append(minutes).append(":"). + append(seconds).append("."). 
+ append(millis)); + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFrom(int fromIndex, int thisIndex, IntervalDayVector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + from.valueBuffer.getBytes(fromIndex * TYPE_WIDTH, this.valueBuffer, + thisIndex * TYPE_WIDTH, TYPE_WIDTH); + } + + /** + * Same as {@link #copyFrom(int, int, IntervalDayVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFromSafe(int fromIndex, int thisIndex, IntervalDayVector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, ArrowBuf value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + valueBuffer.setBytes(index * TYPE_WIDTH, value, 0, TYPE_WIDTH); + } + + /** + * Set the element at the given index to the given value. 
+ * + * @param index position of element + * @param days days for the interval + * @param milliseconds milliseconds for the interval + */ + public void set(int index, int days, int milliseconds) { + final int offsetIndex = index * TYPE_WIDTH; + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + valueBuffer.setInt(offsetIndex, days); + valueBuffer.setInt((offsetIndex + MILLISECOND_OFFSET), milliseconds); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, element in the + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableIntervalDayHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + set(index, holder.days, holder.milliseconds); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, IntervalDayHolder holder) { + set(index, holder.days, holder.milliseconds); + } + + /** + * Same as {@link #set(int, ArrowBuf)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, ArrowBuf value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, int, int)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. 
+ * + * @param index position of element + * @param days days for the interval + * @param milliseconds milliseconds for the interval + */ + public void setSafe(int index, int days, int milliseconds) { + handleSafe(index); + set(index, days, milliseconds); + } + + /** + * Same as {@link #set(int, NullableIntervalDayHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableIntervalDayHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, IntervalDayHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, IntervalDayHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + /* not really needed to set the bit to 0 as long as + * the buffer always starts from 0. + */ + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. 
+ * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param days days component of interval + * @param milliseconds millisecond component of interval + */ + public void set(int index, int isSet, int days, int milliseconds) { + if (isSet > 0) { + set(index, days, milliseconds); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Same as {@link #set(int, int, int, int)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param days days component of interval + * @param milliseconds millisecond component of interval + */ + public void setSafe(int index, int isSet, int days, int milliseconds) { + handleSafe(index); + set(index, isSet, days, milliseconds); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + /** + * Construct a TransferPair comprising of this and and a target vector of + * the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. 
+ * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((IntervalDayVector) to); + } + + private class TransferImpl implements TransferPair { + IntervalDayVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new IntervalDayVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(IntervalDayVector to) { + this.to = to; + } + + @Override + public IntervalDayVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, IntervalDayVector.this); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/IntervalYearVector.java b/java/vector/src/main/java/org/apache/arrow/vector/IntervalYearVector.java new file mode 100644 index 0000000000000..2aa728f3d6337 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/IntervalYearVector.java @@ -0,0 +1,385 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.IntervalYearReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.IntervalYearHolder; +import org.apache.arrow.vector.holders.NullableIntervalYearHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.joda.time.Period; + +/** + * IntervalYearVector implements a fixed width (4 bytes) vector of + * interval (years and months) values which could be null. A validity buffer + * (bit vector) is maintained to track which elements in the vector are null. + */ +public class IntervalYearVector extends BaseFixedWidthVector { + private static final byte TYPE_WIDTH = 4; + private final FieldReader reader; + + /** + * Instantiate an IntervalYearVector with a nullable INTERVALYEAR field type. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public IntervalYearVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(Types.MinorType.INTERVALYEAR.getType()), + allocator); + } + + /** + * Instantiate an IntervalYearVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management.
+ */ + public IntervalYearVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, TYPE_WIDTH); + reader = new IntervalYearReaderImpl(IntervalYearVector.this); + } + + /** + * Get the reader, created once at construction, that supports reading values from this vector + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.INTERVALYEAR; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector; a null element raises IllegalStateException. + * + * @param index position of element + * @return element at given index + */ + public int get(int index) throws IllegalStateException { + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getInt(index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * set the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableIntervalYearHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getInt(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}, but returns the value as a {@link Period} and yields null for a null element.
+ * + * @param index position of element + * @return element at given index, or null if the element is null + */ + public Period getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + final int interval = valueBuffer.getInt(index * TYPE_WIDTH); + final int years = (interval / org.apache.arrow.vector.util.DateUtility.yearsToMonths); + final int months = (interval % org.apache.arrow.vector.util.DateUtility.yearsToMonths); + final Period p = new Period(); + return p.plusYears(years).plusMonths(months); + } + } + + /** + * Get the Interval value at a given index as a {@link StringBuilder} object + * @param index position of the element + * @return String Builder object with Interval value as + * [years, months], or null if the element is null + */ + public StringBuilder getAsStringBuilder(int index) { + if (isSet(index) == 0) { + return null; + } else { + return getAsStringBuilderHelper(index); + } + } + + private StringBuilder getAsStringBuilderHelper(int index) { + int value = valueBuffer.getInt(index * TYPE_WIDTH); + + final int years = (value / org.apache.arrow.vector.util.DateUtility.yearsToMonths); + final int months = (value % org.apache.arrow.vector.util.DateUtility.yearsToMonths); + + final String yearString = (Math.abs(years) == 1) ? " year " : " years "; + final String monthString = (Math.abs(months) == 1) ? " month " : " months "; + + return (new StringBuilder(). + append(years).append(yearString).
+ append(months).append(monthString)); + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector; the validity bit is copied along with the value. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFrom(int fromIndex, int thisIndex, IntervalYearVector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + final int value = from.valueBuffer.getInt(fromIndex * TYPE_WIDTH); + valueBuffer.setInt(thisIndex * TYPE_WIDTH, value); + } + + /** + * Same as {@link #copyFrom(int, int, IntervalYearVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFromSafe(int fromIndex, int thisIndex, IntervalYearVector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + private void setValue(int index, int value) { + valueBuffer.setInt(index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value and mark it non-null. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, int value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null.
+ * + * @param index position of element + * @param holder nullable data holder for value of element; a negative isSet raises IllegalArgumentException + */ + public void set(int index, NullableIntervalYearHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. The element is always marked non-null. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, IntervalYearHolder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, int)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, int value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableIntervalYearHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableIntervalYearHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, IntervalYearHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, IntervalYearHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + /* not really needed to set the bit to 0 as long as + * the buffer always starts from 0. + */ + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. When the element is NULL the value argument is ignored. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void set(int index, int isSet, int value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Same as {@link #set(int, int, int)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, int value) { + handleSafe(index); + set(index, isSet, value); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + /** + * Construct a TransferPair comprising this vector and a new target vector of + * the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type.
+ * @param to target vector; must be an IntervalYearVector (the argument is cast to that type) + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((IntervalYearVector) to); + } + + private class TransferImpl implements TransferPair { + IntervalYearVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new IntervalYearVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(IntervalYearVector to) { + this.to = to; + } + + @Override + public IntervalYearVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, IntervalYearVector.this); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/SmallIntVector.java b/java/vector/src/main/java/org/apache/arrow/vector/SmallIntVector.java new file mode 100644 index 0000000000000..859e62dbb8bbb --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/SmallIntVector.java @@ -0,0 +1,394 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.SmallIntReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.SmallIntHolder; +import org.apache.arrow.vector.holders.NullableSmallIntHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * SmallIntVector implements a fixed width (2 bytes) vector of + * short values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public class SmallIntVector extends BaseFixedWidthVector { + public static final byte TYPE_WIDTH = 2; + private final FieldReader reader; + + /** + * Instantiate a SmallIntVector with a nullable SMALLINT field type. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public SmallIntVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(Types.MinorType.SMALLINT.getType()), + allocator); + } + + /** + * Instantiate a SmallIntVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management.
+ */ + public SmallIntVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, TYPE_WIDTH); + reader = new SmallIntReaderImpl(SmallIntVector.this); + } + + /** + * Get the reader, created once at construction, that supports reading values from this vector + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.SMALLINT; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector; a null element raises IllegalStateException. + * + * @param index position of element + * @return element at given index + */ + public short get(int index) throws IllegalStateException { + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getShort(index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * set the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableSmallIntHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getShort(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}, but returns a boxed Short and yields null for a null element.
+ * + * @param index position of element + * @return element at given index, or null if the element is null + */ + public Short getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getShort(index * TYPE_WIDTH); + } + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector; the validity bit is copied along with the value. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFrom(int fromIndex, int thisIndex, SmallIntVector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + final short value = from.valueBuffer.getShort(fromIndex * TYPE_WIDTH); + valueBuffer.setShort(thisIndex * TYPE_WIDTH, value); + } + + /** + * Same as {@link #copyFrom(int, int, SmallIntVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFromSafe(int fromIndex, int thisIndex, SmallIntVector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + private void setValue(int index, int value) { + valueBuffer.setShort(index * TYPE_WIDTH, value); + } + + private void setValue(int index, short value) { + valueBuffer.setShort(index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value and mark it non-null.
+ * + * @param index position of element + * @param value value of element, passed as an int to setShort (presumably only the low 16 bits are stored -- TODO confirm) + */ + public void set(int index, int value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the given value and mark it non-null. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, short value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element; a negative isSet raises IllegalArgumentException + */ + public void set(int index, NullableSmallIntHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. The element is always marked non-null. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, SmallIntHolder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, int)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, int value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, short)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, short value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableSmallIntHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableSmallIntHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, SmallIntHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, SmallIntHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + /* not really needed to set the bit to 0 as long as + * the buffer always starts from 0. + */ + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. When the element is NULL the value argument is ignored.
+ * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void set(int index, int isSet, short value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Same as {@link #set(int, int, short)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, short value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. No validity (null) check is performed. + * + * This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + public static short get(final ArrowBuf buffer, final int index) { + return buffer.getShort(index * TYPE_WIDTH); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + /** + * Construct a TransferPair comprising this vector and a new target vector of + * the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type.
+ * @param to target vector; must be a SmallIntVector (the argument is cast to that type) + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((SmallIntVector) to); + } + + private class TransferImpl implements TransferPair { + SmallIntVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new SmallIntVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(SmallIntVector to) { + this.to = to; + } + + @Override + public SmallIntVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, SmallIntVector.this); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TimeMicroVector.java b/java/vector/src/main/java/org/apache/arrow/vector/TimeMicroVector.java new file mode 100644 index 0000000000000..604cedffd4258 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/TimeMicroVector.java @@ -0,0 +1,368 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.TimeMicroReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.TimeMicroHolder; +import org.apache.arrow.vector.holders.NullableTimeMicroHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.slf4j.Logger; + +/** + * TimeMicroVector implements a fixed width vector (8 bytes) of + * time (microsecond resolution) values which could be null. + * A validity buffer (bit vector) is maintained to track which elements in the + * vector are null. + */ +public class TimeMicroVector extends BaseFixedWidthVector { + private static final byte TYPE_WIDTH = 8; + private final FieldReader reader; + + /** + * Instantiate a TimeMicroVector with a nullable TIMEMICRO field type. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public TimeMicroVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(Types.MinorType.TIMEMICRO.getType()), + allocator); + } + + /** + * Instantiate a TimeMicroVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management.
+ */ + public TimeMicroVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, TYPE_WIDTH); + reader = new TimeMicroReaderImpl(TimeMicroVector.this); + } + + /** + * Get the reader, created once at construction, that supports reading values from this vector + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.TIMEMICRO; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + /** + * Get the element at the given index from the vector; a null element raises IllegalStateException. + * + * @param index position of element + * @return element at given index + */ + public long get(int index) throws IllegalStateException { + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getLong(index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * set the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableTimeMicroHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getLong(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}, but returns a boxed Long and yields null for a null element.
+ * + * @param index position of element + * @return element at given index, or null if the element is null + */ + public Long getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getLong(index * TYPE_WIDTH); + } + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector; the validity bit is copied along with the value. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFrom(int fromIndex, int thisIndex, TimeMicroVector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + final long value = from.valueBuffer.getLong(fromIndex * TYPE_WIDTH); + valueBuffer.setLong(thisIndex * TYPE_WIDTH, value); + } + + /** + * Same as {@link #copyFrom(int, int, TimeMicroVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFromSafe(int fromIndex, int thisIndex, TimeMicroVector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + private void setValue(int index, long value) { + valueBuffer.setLong(index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value and mark it non-null. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, long value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder.
+ * If the value in holder is not indicated as set, the element + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element; a negative isSet raises IllegalArgumentException + */ + public void set(int index, NullableTimeMicroHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. The element is always marked non-null. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, TimeMicroHolder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, long)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, long value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableTimeMicroHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableTimeMicroHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, TimeMicroHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, TimeMicroHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + /* not really needed to set the bit to 0 as long as + * the buffer always starts from 0. + */ + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. When the element is NULL the value argument is ignored. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void set(int index, int isSet, long value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Same as {@link #set(int, int, long)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, long value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. No validity (null) check is performed. + * + * This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index.
+ */ + public static long get(final ArrowBuf buffer, int index) { + return buffer.getLong(index * TYPE_WIDTH); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + /** + * Construct a TransferPair comprising this vector and a new target vector of + * the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * @param to target vector; must be a TimeMicroVector (the argument is cast to that type) + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((TimeMicroVector) to); + } + + private class TransferImpl implements TransferPair { + TimeMicroVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new TimeMicroVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(TimeMicroVector to) { + this.to = to; + } + + @Override + public TimeMicroVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, TimeMicroVector.this); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TimeMilliVector.java b/java/vector/src/main/java/org/apache/arrow/vector/TimeMilliVector.java new file mode 100644 index 0000000000000..c3d100c5b6194 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/TimeMilliVector.java @@ -0,0 +1,369 @@ +/** + * Licensed to the Apache
Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.TimeMilliReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.TimeMilliHolder; +import org.apache.arrow.vector.holders.NullableTimeMilliHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.joda.time.LocalDateTime; +import org.slf4j.Logger; + +/** + * TimeMilliVector implements a fixed width (4 bytes) vector of + * time (millisecond resolution) values which could be null. A validity buffer + * (bit vector) is maintained to track which elements in the vector are null. + */ +public class TimeMilliVector extends BaseFixedWidthVector { + private static final byte TYPE_WIDTH = 4; + private final FieldReader reader; + + /** + * Instantiate a TimeMilliVector. This doesn't allocate any memory for + * the data in vector. The field type defaults to a nullable TIMEMILLI type. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public TimeMilliVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(Types.MinorType.TIMEMILLI.getType()), + allocator); + } + + /** + * Instantiate a TimeMilliVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management.
+ */ + public TimeMilliVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, TYPE_WIDTH); + reader = new TimeMilliReaderImpl(TimeMilliVector.this); + } + + /** + * Get a reader that supports reading values from this vector + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.TIMEMILLI; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + /** + * Get the element at the given index from the vector. Throws IllegalStateException if it is null. + * + * @param index position of element + * @return element at given index + */ + public int get(int index) throws IllegalStateException { + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getInt(index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableTimeMilliHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getInt(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}, except that a null element yields null and the value is wrapped in a LocalDateTime (UTC zone).
+ * + * @param index position of element + * @return element at given index as a LocalDateTime (UTC zone), or null if the element is null + */ + public LocalDateTime getObject(int index) { + if (isSet(index) == 0) { + return null; + } + org.joda.time.LocalDateTime ldt = new org.joda.time.LocalDateTime(get(index), + org.joda.time.DateTimeZone.UTC); + return ldt; + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector. The validity bit is copied as well. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFrom(int fromIndex, int thisIndex, TimeMilliVector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + final int value = from.valueBuffer.getInt(fromIndex * TYPE_WIDTH); + valueBuffer.setInt(thisIndex * TYPE_WIDTH, value); + } + + /** + * Same as {@link #copyFrom(int, int, TimeMilliVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFromSafe(int fromIndex, int thisIndex, TimeMilliVector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + private void setValue(int index, int value) { + valueBuffer.setInt(index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value and mark it as non-null. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, int value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder.
+ * If the value in holder is not indicated as set, the element + * at the given index will be null. A negative holder.isSet raises IllegalArgumentException. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableTimeMilliHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, TimeMilliHolder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, int)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, int value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableTimeMilliHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableTimeMilliHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, TimeMilliHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, TimeMilliHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + /* not really needed to set the bit to 0 as long as + * the buffer always starts from 0. + */ + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * @param index position of the new value + * @param isSet 0 or negative for NULL value, any positive value otherwise + * @param value element value + */ + public void set(int index, int isSet, int value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Same as {@link #set(int, int, int)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * @param index position of the new value + * @param isSet 0 or negative for NULL value, any positive value otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, int value) { + handleSafe(index); + set(index, isSet, value); + } + + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + * This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index.
+ */ + public static int get(final ArrowBuf buffer, final int index) { + return buffer.getInt(index * TYPE_WIDTH); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + /** + * Construct a TransferPair comprising this vector and a new target vector of + * the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * @param to target vector (it is cast to TimeMilliVector) + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((TimeMilliVector) to); + } + + private class TransferImpl implements TransferPair { + TimeMilliVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new TimeMilliVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(TimeMilliVector to) { + this.to = to; + } + + @Override + public TimeMilliVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, TimeMilliVector.this); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TimeNanoVector.java b/java/vector/src/main/java/org/apache/arrow/vector/TimeNanoVector.java new file mode 100644 index 0000000000000..97401ec8aaa0b --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/TimeNanoVector.java @@ -0,0 +1,366 @@ +/** + * Licensed to the Apache
Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.TimeNanoReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.TimeNanoHolder; +import org.apache.arrow.vector.holders.NullableTimeNanoHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * TimeNanoVector implements a fixed width vector (8 bytes) of + * time (nanosecond resolution) values which could be null. A validity buffer + * (bit vector) is maintained to track which elements in the vector are null. + */ +public class TimeNanoVector extends BaseFixedWidthVector { + private static final byte TYPE_WIDTH = 8; + private final FieldReader reader; + + /** + * Instantiate a TimeNanoVector. This doesn't allocate any memory for + * the data in vector. The field type defaults to a nullable TIMENANO type. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public TimeNanoVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(Types.MinorType.TIMENANO.getType()), + allocator); + } + + /** + * Instantiate a TimeNanoVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management.
+ */ + public TimeNanoVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, TYPE_WIDTH); + reader = new TimeNanoReaderImpl(TimeNanoVector.this); + } + + /** + * Get a reader that supports reading values from this vector + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.TIMENANO; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector. Throws IllegalStateException if it is null. + * + * @param index position of element + * @return element at given index + */ + public long get(int index) throws IllegalStateException { + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getLong(index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableTimeNanoHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getLong(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}, except that a null element yields null instead of an exception.
+ * + * @param index position of element + * @return element at given index as a boxed Long, or null if the element is null + */ + public Long getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getLong(index * TYPE_WIDTH); + } + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector. The validity bit is copied as well. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFrom(int fromIndex, int thisIndex, TimeNanoVector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + final long value = from.valueBuffer.getLong(fromIndex * TYPE_WIDTH); + valueBuffer.setLong(thisIndex * TYPE_WIDTH, value); + } + + /** + * Same as {@link #copyFrom(int, int, TimeNanoVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFromSafe(int fromIndex, int thisIndex, TimeNanoVector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + private void setValue(int index, long value) { + valueBuffer.setLong(index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value and mark it as non-null. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, long value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder.
+ * If the value in holder is not indicated as set, the element + * at the given index will be null. A negative holder.isSet raises IllegalArgumentException. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableTimeNanoHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, TimeNanoHolder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, long)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, long value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableTimeNanoHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableTimeNanoHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, TimeNanoHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, TimeNanoHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + /* not really needed to set the bit to 0 as long as + * the buffer always starts from 0. + */ + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * @param index position of the new value + * @param isSet 0 or negative for NULL value, any positive value otherwise + * @param value element value + */ + public void set(int index, int isSet, long value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Same as {@link #set(int, int, long)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * @param index position of the new value + * @param isSet 0 or negative for NULL value, any positive value otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, long value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + * This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index.
+ */ + public static long get(final ArrowBuf buffer, final int index) { + return buffer.getLong(index * TYPE_WIDTH); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + /** + * Construct a TransferPair comprising this vector and a new target vector of + * the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * @param to target vector (it is cast to TimeNanoVector) + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((TimeNanoVector) to); + } + + private class TransferImpl implements TransferPair { + TimeNanoVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new TimeNanoVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(TimeNanoVector to) { + this.to = to; + } + + @Override + public TimeNanoVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, TimeNanoVector.this); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TimeSecVector.java b/java/vector/src/main/java/org/apache/arrow/vector/TimeSecVector.java new file mode 100644 index 0000000000000..a7823a916f3d3 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/TimeSecVector.java @@ -0,0 +1,367 @@ +/** + * Licensed to the Apache Software
Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.TimeSecReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.TimeSecHolder; +import org.apache.arrow.vector.holders.NullableTimeSecHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * TimeSecVector implements a fixed width (4 bytes) vector of + * time (seconds resolution) values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public class TimeSecVector extends BaseFixedWidthVector { + private static final byte TYPE_WIDTH = 4; + private final FieldReader reader; + + /** + * Instantiate a TimeSecVector. This doesn't allocate any memory for + * the data in vector. The field type defaults to a nullable TIMESEC type. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public TimeSecVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(Types.MinorType.TIMESEC.getType()), + allocator); + } + + /** + * Instantiate a TimeSecVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management.
+ */ + public TimeSecVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, TYPE_WIDTH); + reader = new TimeSecReaderImpl(TimeSecVector.this); + } + + /** + * Get a reader that supports reading values from this vector + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.TIMESEC; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector. Throws IllegalStateException if it is null. + * + * @param index position of element + * @return element at given index + */ + public int get(int index) throws IllegalStateException { + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getInt(index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableTimeSecHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getInt(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}, except that a null element yields null instead of an exception.
+ * + * @param index position of element + * @return element at given index as a boxed Integer, or null if the element is null + */ + public Integer getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getInt(index * TYPE_WIDTH); + } + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector. The validity bit is copied as well. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFrom(int fromIndex, int thisIndex, TimeSecVector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + final int value = from.valueBuffer.getInt(fromIndex * TYPE_WIDTH); + valueBuffer.setInt(thisIndex * TYPE_WIDTH, value); + } + + /** + * Same as {@link #copyFrom(int, int, TimeSecVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFromSafe(int fromIndex, int thisIndex, TimeSecVector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + private void setValue(int index, int value) { + valueBuffer.setInt(index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value and mark it as non-null. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, int value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder.
+ * If the value in holder is not indicated as set, the element + * at the given index will be null. A negative holder.isSet raises IllegalArgumentException. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableTimeSecHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, TimeSecHolder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, int)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, int value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableTimeSecHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableTimeSecHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, TimeSecHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, TimeSecHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + /* not really needed to set the bit to 0 as long as + * the buffer always starts from 0. + */ + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * @param index position of the new value + * @param isSet 0 or negative for NULL value, any positive value otherwise + * @param value element value + */ + public void set(int index, int isSet, int value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Same as {@link #set(int, int, int)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * @param index position of the new value + * @param isSet 0 or negative for NULL value, any positive value otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, int value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + * This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index.
+ */ + public static int get(final ArrowBuf buffer, final int index) { + return buffer.getInt(index * TYPE_WIDTH); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + /** + * Construct a TransferPair comprising this vector and a new target vector of + * the same type, created with this vector's field type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * @param to target vector; it is cast to TimeSecVector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((TimeSecVector) to); + } + + private class TransferImpl implements TransferPair { + TimeSecVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new TimeSecVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(TimeSecVector to) { + this.to = to; + } + + @Override + public TimeSecVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, TimeSecVector.this); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMicroTZVector.java b/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMicroTZVector.java new file mode 100644 index 0000000000000..bfe330a1e40c5 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMicroTZVector.java @@ -0,0 +1,217 @@ +/** + * Licensed
to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.TimeStampMicroTZReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.TimeStampMicroTZHolder; +import org.apache.arrow.vector.holders.NullableTimeStampMicroTZHolder; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * TimeStampMicroTZVector implements a fixed width vector (8 bytes) of + * timestamp (microsecond resolution, with a time zone) values which could be null. A validity buffer + * (bit vector) is maintained to track which elements in the vector are null. + */ +public class TimeStampMicroTZVector extends TimeStampVector { + private final FieldReader reader; + private final String timeZone; + + /** + * Instantiate a nullable TimeStampMicroTZVector. This doesn't allocate any memory for the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + * @param timeZone time zone used to build the vector's Timestamp type + */ + public TimeStampMicroTZVector(String name, BufferAllocator allocator, String timeZone) { + this(name, FieldType.nullable(new org.apache.arrow.vector.types.pojo.ArrowType.Timestamp(TimeUnit.MICROSECOND, timeZone)), + allocator); + } + + /** + * Instantiate a TimeStampMicroTZVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management.
+ */ + public TimeStampMicroTZVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, fieldType, allocator); + org.apache.arrow.vector.types.pojo.ArrowType.Timestamp arrowType = (org.apache.arrow.vector.types.pojo.ArrowType.Timestamp) fieldType.getType(); + timeZone = arrowType.getTimezone(); + reader = new TimeStampMicroTZReaderImpl(TimeStampMicroTZVector.this); + } + + /** + * Get the reader that was created for this vector at construction time. + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType#TIMESTAMPMICROTZ} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.TIMESTAMPMICROTZ; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector and + * set the state in holder. If the element at the given index + * is null, holder.isSet is set to zero and holder.value is left unchanged. + * @param index position of element + * @param holder holder that receives the isSet flag and the value + */ + public void get(int index, NullableTimeStampMicroTZHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getLong(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}.
+ * + * @param index position of element + * @return element at given index + */ + public Long getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getLong(index * TYPE_WIDTH); + } + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. + * @param index position of element + * @param holder nullable data holder for value of element + * @throws IllegalArgumentException if holder.isSet is negative + */ + public void set(int index, NullableTimeStampMicroTZHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the validity bit at the given index and store the value from the data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, TimeStampMicroTZHolder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, NullableTimeStampMicroTZHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableTimeStampMicroTZHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, TimeStampMicroTZHolder)} except that it first calls + * handleSafe(index) to handle the case when index is greater than or equal to the + * existing value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, TimeStampMicroTZHolder holder) { + handleSafe(index); + set(index, holder); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + /** + * Construct a TransferPair comprising this vector and a new target vector of + * the same type, created with this vector's field type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + TimeStampMicroTZVector to = new TimeStampMicroTZVector(ref, + field.getFieldType(), allocator); + return new TransferImpl(to); + } + + /** + * Construct a TransferPair with a desired target vector of the same type.
+ * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((TimeStampMicroTZVector) to); + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMicroVector.java b/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMicroVector.java new file mode 100644 index 0000000000000..85b615d8f578b --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMicroVector.java @@ -0,0 +1,219 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.TimeStampMicroReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.TimeStampMicroHolder; +import org.apache.arrow.vector.holders.NullableTimeStampMicroHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.joda.time.LocalDateTime; + +/** + * TimeStampMicroVector implements a fixed width vector (8 bytes) of + * timestamp (microsecond resolution) values which could be null. A validity buffer + * (bit vector) is maintained to track which elements in the vector are null. + */ +public class TimeStampMicroVector extends TimeStampVector { + private final FieldReader reader; + + /** + * Instantiate a TimeStampMicroVector with a nullable TIMESTAMPMICRO field type. + * This doesn't allocate any memory for the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public TimeStampMicroVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(Types.MinorType.TIMESTAMPMICRO.getType()), + allocator); + } + + /** + * Instantiate a TimeStampMicroVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management.
+ */ + public TimeStampMicroVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, fieldType, allocator); + reader = new TimeStampMicroReaderImpl(TimeStampMicroVector.this); + } + + /** + * Get the reader that was created for this vector at construction time. + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType#TIMESTAMPMICRO} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.TIMESTAMPMICRO; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector and + * set the state in holder. If the element at the given index + * is null, holder.isSet is set to zero and holder.value is left unchanged. + * @param index position of element + * @param holder holder that receives the isSet flag and the value + */ + public void get(int index, NullableTimeStampMicroHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getLong(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}.
+ * + * @param index position of element + * @return element at given index + */ + public LocalDateTime getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + /* value is truncated when converting microseconds to milliseconds in order to use the Joda LocalDateTime type */ + final long micros = valueBuffer.getLong(index * TYPE_WIDTH); + final long millis = java.util.concurrent.TimeUnit.MICROSECONDS.toMillis(micros); + final org.joda.time.LocalDateTime localDateTime = new org.joda.time.LocalDateTime(millis, + org.joda.time.DateTimeZone.UTC); + return localDateTime; + } + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. + * @param index position of element + * @param holder nullable data holder for value of element + * @throws IllegalArgumentException if holder.isSet is negative + */ + public void set(int index, NullableTimeStampMicroHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the validity bit at the given index and store the value from the data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, TimeStampMicroHolder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, NullableTimeStampMicroHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableTimeStampMicroHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, TimeStampMicroHolder)} except that it first calls + * handleSafe(index) to handle the case when index is greater than or equal to the + * existing value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, TimeStampMicroHolder holder) { + handleSafe(index); + set(index, holder); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + /** + * Construct a TransferPair comprising this vector and a new target vector of + * the same type, created with this vector's field type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + TimeStampMicroVector to = new TimeStampMicroVector(ref, + field.getFieldType(), allocator); + return new TransferImpl(to); + } + + /** + * Construct a TransferPair with a desired target vector of the same type.
+ * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((TimeStampMicroVector) to); + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMilliTZVector.java b/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMilliTZVector.java new file mode 100644 index 0000000000000..9d68b564492b6 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMilliTZVector.java @@ -0,0 +1,216 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.TimeStampMilliTZReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.TimeStampMilliTZHolder; +import org.apache.arrow.vector.holders.NullableTimeStampMilliTZHolder; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * TimeStampMilliTZVector implements a fixed width vector (8 bytes) of + * timestamp (millisecond resolution, with a time zone) values which could be null. A validity buffer + * (bit vector) is maintained to track which elements in the vector are null. + */ +public class TimeStampMilliTZVector extends TimeStampVector { + private final FieldReader reader; + private final String timeZone; + + /** + * Instantiate a nullable TimeStampMilliTZVector. This doesn't allocate any memory for the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + * @param timeZone time zone used to build the vector's Timestamp type + */ + public TimeStampMilliTZVector(String name, BufferAllocator allocator, String timeZone) { + this(name, FieldType.nullable(new org.apache.arrow.vector.types.pojo.ArrowType.Timestamp(TimeUnit.MILLISECOND, timeZone)), + allocator); + } + + /** + * Instantiate a TimeStampMilliTZVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management.
+ */ + public TimeStampMilliTZVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, fieldType, allocator); + org.apache.arrow.vector.types.pojo.ArrowType.Timestamp arrowType = (org.apache.arrow.vector.types.pojo.ArrowType.Timestamp) fieldType.getType(); + timeZone = arrowType.getTimezone(); + reader = new TimeStampMilliTZReaderImpl(TimeStampMilliTZVector.this); + } + + /** + * Get the reader that was created for this vector at construction time. + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType#TIMESTAMPMILLITZ} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.TIMESTAMPMILLITZ; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector and + * set the state in holder. If the element at the given index + * is null, holder.isSet is set to zero and holder.value is left unchanged. + * @param index position of element + * @param holder holder that receives the isSet flag and the value + */ + public void get(int index, NullableTimeStampMilliTZHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getLong(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}.
+ * + * @param index position of element + * @return element at given index + */ + public Long getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getLong(index * TYPE_WIDTH); + } + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. + * @param index position of element + * @param holder nullable data holder for value of element + * @throws IllegalArgumentException if holder.isSet is negative + */ + public void set(int index, NullableTimeStampMilliTZHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the validity bit at the given index and store the value from the data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, TimeStampMilliTZHolder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, NullableTimeStampMilliTZHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableTimeStampMilliTZHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, TimeStampMilliTZHolder)} except that it first calls + * handleSafe(index) to handle the case when index is greater than or equal to the + * existing value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, TimeStampMilliTZHolder holder) { + handleSafe(index); + set(index, holder); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + /** + * Construct a TransferPair comprising this vector and a new target vector of + * the same type, created with this vector's field type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + TimeStampMilliTZVector to = new TimeStampMilliTZVector(ref, + field.getFieldType(), allocator); + return new TransferImpl(to); + } + + /** + * Construct a TransferPair with a desired target vector of the same type.
+ * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((TimeStampMilliTZVector) to); + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMilliVector.java b/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMilliVector.java new file mode 100644 index 0000000000000..7e8a1d0e2a854 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/TimeStampMilliVector.java @@ -0,0 +1,217 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.TimeStampMilliReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.TimeStampMilliHolder; +import org.apache.arrow.vector.holders.NullableTimeStampMilliHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.joda.time.LocalDateTime; + +/** + * TimeStampMilliVector implements a fixed width vector (8 bytes) of + * timestamp (millisecond resolution) values which could be null. A validity buffer + * (bit vector) is maintained to track which elements in the vector are null. + */ +public class TimeStampMilliVector extends TimeStampVector { + private final FieldReader reader; + + /** + * Instantiate a TimeStampMilliVector with a nullable TIMESTAMPMILLI field type. + * This doesn't allocate any memory for the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public TimeStampMilliVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(Types.MinorType.TIMESTAMPMILLI.getType()), + allocator); + } + + /** + * Instantiate a TimeStampMilliVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management.
+ */ + public TimeStampMilliVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, fieldType, allocator); + reader = new TimeStampMilliReaderImpl(TimeStampMilliVector.this); + } + + /** + * Get the reader that was created for this vector at construction time. + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType#TIMESTAMPMILLI} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.TIMESTAMPMILLI; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector and + * set the state in holder. If the element at the given index + * is null, holder.isSet is set to zero and holder.value is left unchanged. + * @param index position of element + * @param holder holder that receives the isSet flag and the value + */ + public void get(int index, NullableTimeStampMilliHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getLong(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}, but returns the stored millisecond value as a + * LocalDateTime built with the UTC zone. + * @param index position of element + * @return element at given index, or null if the element is not set + */ + public LocalDateTime getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + final long millis = valueBuffer.getLong(index * TYPE_WIDTH); + final org.joda.time.LocalDateTime localDateTime = new org.joda.time.LocalDateTime(millis, + org.joda.time.DateTimeZone.UTC); + return localDateTime; + } + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + /** + * Set the element at the given index to the value set in data holder.
+ * If the value in holder is not indicated as set, element in the + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableTimeStampMilliHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the validity bit at the given index and store the value from the data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, TimeStampMilliHolder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, NullableTimeStampMilliHolder)} except that it first calls + * handleSafe(index) to handle the case when index is greater than or equal to the + * existing value capacity {@link #getValueCapacity()}. + * @param index position of element + * @param holder nullable data holder for value of element + * @throws IllegalArgumentException if holder.isSet is negative + */ + public void setSafe(int index, NullableTimeStampMilliHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, TimeStampMilliHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, TimeStampMilliHolder holder) { + handleSafe(index); + set(index, holder); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + /** + * Construct a TransferPair comprising this vector and a new target vector of + * the same type, created with this vector's field type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + TimeStampMilliVector to = new TimeStampMilliVector(ref, + field.getFieldType(), allocator); + return new TransferImpl(to); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * @param to target vector; it is cast to TimeStampMilliVector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((TimeStampMilliVector) to); + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TimeStampNanoTZVector.java b/java/vector/src/main/java/org/apache/arrow/vector/TimeStampNanoTZVector.java new file mode 100644 index 0000000000000..e0361820137e2 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/TimeStampNanoTZVector.java @@ -0,0 +1,217 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.TimeStampNanoTZReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.TimeStampNanoTZHolder; +import org.apache.arrow.vector.holders.NullableTimeStampNanoTZHolder; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * TimeStampNanoTZVector implements a fixed width vector (8 bytes) of + * timestamp (nanosecond resolution) values which could be null. A validity buffer + * (bit vector) is maintained to track which elements in the vector are null. + */ +public class TimeStampNanoTZVector extends TimeStampVector { + private final FieldReader reader; + private final String timeZone; + + /** + * Instantiate a TimeStampNanoTZVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public TimeStampNanoTZVector(String name, BufferAllocator allocator, String timeZone) { + this(name, FieldType.nullable(new org.apache.arrow.vector.types.pojo.ArrowType.Timestamp(TimeUnit.NANOSECOND, timeZone)), + allocator); + } + + /** + * Instantiate a TimeStampNanoTZVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. 
+ */ + public TimeStampNanoTZVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, fieldType, allocator); + org.apache.arrow.vector.types.pojo.ArrowType.Timestamp arrowType = (org.apache.arrow.vector.types.pojo.ArrowType.Timestamp) fieldType.getType(); + timeZone = arrowType.getTimezone(); + reader = new TimeStampNanoTZReaderImpl(TimeStampNanoTZVector.this); + } + + /** + * Get a reader that supports reading values from this vector + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.TIMESTAMPNANOTZ; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableTimeStampNanoTZHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getLong(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. 
+ * + * @param index position of element + * @return element at given index + */ + public Long getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getLong(index * TYPE_WIDTH); + } + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableTimeStampNanoTZHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, TimeStampNanoTZHolder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, NullableTimeStampNanoTZHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableTimeStampNanoTZHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, TimeStampNanoTZHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, TimeStampNanoTZHolder holder) { + handleSafe(index); + set(index, holder); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + /** + * Construct a TransferPair comprising this and a target vector of + * the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + TimeStampNanoTZVector to = new TimeStampNanoTZVector(ref, + field.getFieldType(), allocator); + return new TransferImpl(to); + } + + /** + * Construct a TransferPair with a desired target vector of the same type.
+ * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((TimeStampNanoTZVector) to); + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TimeStampNanoVector.java b/java/vector/src/main/java/org/apache/arrow/vector/TimeStampNanoVector.java new file mode 100644 index 0000000000000..fdf5d26945b90 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/TimeStampNanoVector.java @@ -0,0 +1,218 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.TimeStampNanoReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.TimeStampNanoHolder; +import org.apache.arrow.vector.holders.NullableTimeStampNanoHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.joda.time.LocalDateTime; + +/** + * TimeStampNanoVector implements a fixed width vector (8 bytes) of + * timestamp (nanosecond resolution) values which could be null. A validity buffer + * (bit vector) is maintained to track which elements in the vector are null. + */ +public class TimeStampNanoVector extends TimeStampVector { + private final FieldReader reader; + + /** + * Instantiate a TimeStampNanoVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public TimeStampNanoVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(Types.MinorType.TIMESTAMPNANO.getType()), + allocator); + } + + /** + * Instantiate a TimeStampNanoVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. 
+ */ + public TimeStampNanoVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, fieldType, allocator); + reader = new TimeStampNanoReaderImpl(TimeStampNanoVector.this); + } + + /** + * Get a reader that supports reading values from this vector + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.TIMESTAMPNANO; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableTimeStampNanoHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getLong(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. 
+ * + * @param index position of element + * @return element at given index + */ + public LocalDateTime getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + final long nanos = valueBuffer.getLong(index * TYPE_WIDTH); + final long millis = java.util.concurrent.TimeUnit.NANOSECONDS.toMillis(nanos); + final org.joda.time.LocalDateTime localDateTime = new org.joda.time.LocalDateTime(millis, + org.joda.time.DateTimeZone.UTC); + return localDateTime; + } + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableTimeStampNanoHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, TimeStampNanoHolder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, NullableTimeStampNanoHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableTimeStampNanoHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, TimeStampNanoHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, TimeStampNanoHolder holder) { + handleSafe(index); + set(index, holder); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + /** + * Construct a TransferPair comprising this and a target vector of + * the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + TimeStampNanoVector to = new TimeStampNanoVector(ref, + field.getFieldType(), allocator); + return new TransferImpl(to); + } + + /** + * Construct a TransferPair with a desired target vector of the same type.
+ * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((TimeStampNanoVector) to); + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TimeStampSecTZVector.java b/java/vector/src/main/java/org/apache/arrow/vector/TimeStampSecTZVector.java new file mode 100644 index 0000000000000..201f1c317d02d --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/TimeStampSecTZVector.java @@ -0,0 +1,215 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.TimeStampSecTZReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.TimeStampSecTZHolder; +import org.apache.arrow.vector.holders.NullableTimeStampSecTZHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * TimeStampSecTZVector implements a fixed width vector (8 bytes) of + * timestamp (seconds resolution) values which could be null. A validity buffer + * (bit vector) is maintained to track which elements in the vector are null. + */ +public class TimeStampSecTZVector extends TimeStampVector { + private final FieldReader reader; + private final String timeZone; + + /** + * Instantiate a TimeStampSecTZVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public TimeStampSecTZVector(String name, BufferAllocator allocator, String timeZone) { + this(name, FieldType.nullable(new org.apache.arrow.vector.types.pojo.ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.SECOND, timeZone)), + allocator); + } + + /** + * Instantiate a TimeStampSecTZVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. 
+ */ + public TimeStampSecTZVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, fieldType, allocator); + org.apache.arrow.vector.types.pojo.ArrowType.Timestamp arrowType = (org.apache.arrow.vector.types.pojo.ArrowType.Timestamp) fieldType.getType(); + timeZone = arrowType.getTimezone(); + reader = new TimeStampSecTZReaderImpl(TimeStampSecTZVector.this); + } + + /** + * Get a reader that supports reading values from this vector + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.TIMESTAMPSECTZ; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableTimeStampSecTZHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getLong(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. 
+ * + * @param index position of element + * @return element at given index + */ + public Long getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getLong(index * TYPE_WIDTH); + } + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableTimeStampSecTZHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, TimeStampSecTZHolder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, NullableTimeStampSecTZHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableTimeStampSecTZHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, TimeStampSecTZHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, TimeStampSecTZHolder holder) { + handleSafe(index); + set(index, holder); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + /** + * Construct a TransferPair comprising this and a target vector of + * the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + TimeStampSecTZVector to = new TimeStampSecTZVector(ref, + field.getFieldType(), allocator); + return new TransferImpl(to); + } + + /** + * Construct a TransferPair with a desired target vector of the same type.
+ * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((TimeStampSecTZVector) to); + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TimeStampSecVector.java b/java/vector/src/main/java/org/apache/arrow/vector/TimeStampSecVector.java new file mode 100644 index 0000000000000..4bcd4f7bf0026 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/TimeStampSecVector.java @@ -0,0 +1,218 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.TimeStampSecReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.TimeStampSecHolder; +import org.apache.arrow.vector.holders.NullableTimeStampSecHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.joda.time.LocalDateTime; + +/** + * TimeStampSecVector implements a fixed width vector (8 bytes) of + * timestamp (seconds resolution) values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public class TimeStampSecVector extends TimeStampVector { + private final FieldReader reader; + + /** + * Instantiate a TimeStampSecVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public TimeStampSecVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(Types.MinorType.TIMESTAMPSEC.getType()), + allocator); + } + + /** + * Instantiate a TimeStampSecVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. 
+ */ + public TimeStampSecVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, fieldType, allocator); + reader = new TimeStampSecReaderImpl(TimeStampSecVector.this); + } + + /** + * Get a reader that supports reading values from this vector + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.TIMESTAMPSEC; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableTimeStampSecHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getLong(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. 
+ * + * @param index position of element + * @return element at given index + */ + public LocalDateTime getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + final long secs = valueBuffer.getLong(index * TYPE_WIDTH); + final long millis = java.util.concurrent.TimeUnit.SECONDS.toMillis(secs); + final org.joda.time.LocalDateTime localDateTime = new org.joda.time.LocalDateTime(millis, + org.joda.time.DateTimeZone.UTC); + return localDateTime; + } + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableTimeStampSecHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, TimeStampSecHolder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, NullableTimeStampSecHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableTimeStampSecHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, TimeStampSecHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, TimeStampSecHolder holder) { + handleSafe(index); + set(index, holder); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + /** + * Construct a TransferPair comprising this and a target vector of + * the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + TimeStampSecVector to = new TimeStampSecVector(ref, + field.getFieldType(), allocator); + return new TransferImpl(to); + } + + /** + * Construct a TransferPair with a desired target vector of the same type.
+ * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((TimeStampSecVector) to); + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TimeStampVector.java b/java/vector/src/main/java/org/apache/arrow/vector/TimeStampVector.java new file mode 100644 index 0000000000000..4c70b819cbf37 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/TimeStampVector.java @@ -0,0 +1,217 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * TimeStampVector is an abstract interface for fixed width vector (8 bytes) + * of timestamp values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public abstract class TimeStampVector extends BaseFixedWidthVector { + protected static final byte TYPE_WIDTH = 8; + + /** + * Instantiate a TimeStampVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public TimeStampVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, TYPE_WIDTH); + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + /** + * Get the element at the given index from the vector. 
+ * + * @param index position of element + * @return element at given index + */ + public long get(int index) throws IllegalStateException { + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getLong(index * TYPE_WIDTH); + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFrom(int fromIndex, int thisIndex, TimeStampVector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + final long value = from.valueBuffer.getLong(fromIndex * TYPE_WIDTH); + valueBuffer.setLong(thisIndex * TYPE_WIDTH, value); + } + + /** + * Same as {@link #copyFrom(int, int, TimeStampVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFromSafe(int fromIndex, int thisIndex, TimeStampVector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + protected void setValue(int index, long value) { + valueBuffer.setLong(index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value.
+ * + * @param index position of element + * @param value value of element + */ + public void set(int index, long value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Same as {@link #set(int, long)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, long value) { + handleSafe(index); + set(index, value); + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + /* not really needed to set the bit to 0 as long as + * the buffer always starts from 0. + */ + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void set(int index, int isSet, long value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Same as {@link #set(int, int, long)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, long value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + * This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. 
+ */ + public static long get(final ArrowBuf buffer, final int index) { + return buffer.getLong(index * TYPE_WIDTH); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + public class TransferImpl implements TransferPair { + TimeStampVector to; + + public TransferImpl(TimeStampVector to) { + this.to = to; + } + + @Override + public TimeStampVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, TimeStampVector.this); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TinyIntVector.java b/java/vector/src/main/java/org/apache/arrow/vector/TinyIntVector.java new file mode 100644 index 0000000000000..306437f0fe9d6 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/TinyIntVector.java @@ -0,0 +1,394 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.TinyIntReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.TinyIntHolder; +import org.apache.arrow.vector.holders.NullableTinyIntHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * TinyIntVector implements a fixed width (1 bytes) vector of + * byte values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public class TinyIntVector extends BaseFixedWidthVector { + public static final byte TYPE_WIDTH = 1; + private final FieldReader reader; + + /** + * Instantiate a TinyIntVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public TinyIntVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(Types.MinorType.TINYINT.getType()), + allocator); + } + + /** + * Instantiate a TinyIntVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. 
+ */ + public TinyIntVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, TYPE_WIDTH); + reader = new TinyIntReaderImpl(TinyIntVector.this); + } + + /** + * Get a reader that supports reading values from this vector + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.TINYINT; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector. + * + * @param index position of element + * @return element at given index + */ + public byte get(int index) throws IllegalStateException { + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getByte(index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableTinyIntHolder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getByte(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. 
+ * + * @param index position of element + * @return element at given index + */ + public Byte getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getByte(index * TYPE_WIDTH); + } + } + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFrom(int fromIndex, int thisIndex, TinyIntVector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + final byte value = from.valueBuffer.getByte(fromIndex * TYPE_WIDTH); + valueBuffer.setByte(thisIndex * TYPE_WIDTH, value); + } + + /** + * Same as {@link #copyFrom(int, int, TinyIntVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFromSafe(int fromIndex, int thisIndex, TinyIntVector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + private void setValue(int index, int value) { + valueBuffer.setByte(index * TYPE_WIDTH, value); + } + + private void setValue(int index, byte value) { + valueBuffer.setByte(index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, int value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the given value. 
+ * + * @param index position of element + * @param value value of element + */ + public void set(int index, byte value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableTinyIntHolder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, TinyIntHolder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, int)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, int value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, byte)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, byte value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableTinyIntHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableTinyIntHolder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, TinyIntHolder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, TinyIntHolder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + /* not really needed to set the bit to 0 as long as + * the buffer always starts from 0. + */ + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void set(int index, int isSet, byte value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Same as {@link #set(int, int, byte)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. 
+ * @param index position of the new value + * @param isSet 0 for NULL value, 1 otherwise + * @param value element value + */ + public void setSafe(int index, int isSet, byte value) { + handleSafe(index); + set(index, isSet, value); + } + + /** + * Given a data buffer, get the value stored at a particular position + * in the vector. + * + * This method should not be used externally. + * + * @param buffer data buffer + * @param index position of the element. + * @return value stored at the index. + */ + public static byte get(final ArrowBuf buffer, final int index) { + return buffer.getByte(index * TYPE_WIDTH); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + /** + * Construct a TransferPair comprising this vector and a target vector of + * the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type.
+ * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((TinyIntVector) to); + } + + private class TransferImpl implements TransferPair { + TinyIntVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new TinyIntVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(TinyIntVector to) { + this.to = to; + } + + @Override + public TinyIntVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, TinyIntVector.this); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java similarity index 64% rename from java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java rename to java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java index 29407bf1ab4e1..d6f32b4b4b1d5 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java @@ -16,20 +16,15 @@ * limitations under the License. 
*/ -package org.apache.arrow.vector.schema; +package org.apache.arrow.vector; import static java.util.Arrays.asList; -import static org.apache.arrow.vector.schema.VectorLayout.booleanVector; -import static org.apache.arrow.vector.schema.VectorLayout.byteVector; -import static org.apache.arrow.vector.schema.VectorLayout.dataVector; -import static org.apache.arrow.vector.schema.VectorLayout.offsetVector; -import static org.apache.arrow.vector.schema.VectorLayout.typeVector; -import static org.apache.arrow.vector.schema.VectorLayout.validityVector; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import org.apache.arrow.vector.BufferLayout.BufferType; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeVisitor; import org.apache.arrow.vector.types.pojo.ArrowType.Binary; @@ -47,14 +42,11 @@ import org.apache.arrow.vector.types.pojo.ArrowType.Union; import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonIgnore; -import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; /** - * The layout of vectors for a given type - * It defines its own vectors followed by the vectors for the children + * The buffer layout of vectors for a given type + * It defines its own buffers followed by the buffers for the children * if it is a nested type (Struct_, List, Union) */ public class TypeLayout { @@ -64,24 +56,24 @@ public static TypeLayout getTypeLayout(final ArrowType arrowType) { @Override public TypeLayout visit(Int type) { - return newFixedWidthTypeLayout(dataVector(type.getBitWidth())); + return newFixedWidthTypeLayout(BufferLayout.dataBuffer(type.getBitWidth())); } @Override public TypeLayout visit(Union type) { - List vectors; + List vectors; switch (type.getMode()) { case Dense: vectors = asList( // TODO: validate this - validityVector(), - 
typeVector(), - offsetVector() // offset to find the vector + BufferLayout.validityVector(), + BufferLayout.typeBuffer(), + BufferLayout.offsetBuffer() // offset to find the vector ); break; case Sparse: vectors = asList( - typeVector() // type of the value at the index or 0 if null + BufferLayout.typeBuffer() // type of the value at the index or 0 if null ); break; default: @@ -92,30 +84,30 @@ public TypeLayout visit(Union type) { @Override public TypeLayout visit(Struct type) { - List vectors = asList( - validityVector() + List vectors = asList( + BufferLayout.validityVector() ); return new TypeLayout(vectors); } @Override public TypeLayout visit(Timestamp type) { - return newFixedWidthTypeLayout(dataVector(64)); + return newFixedWidthTypeLayout(BufferLayout.dataBuffer(64)); } @Override public TypeLayout visit(org.apache.arrow.vector.types.pojo.ArrowType.List type) { - List vectors = asList( - validityVector(), - offsetVector() + List vectors = asList( + BufferLayout.validityVector(), + BufferLayout.offsetBuffer() ); return new TypeLayout(vectors); } @Override public TypeLayout visit(FixedSizeList type) { - List vectors = asList( - validityVector() + List vectors = asList( + BufferLayout.validityVector() ); return new TypeLayout(vectors); } @@ -136,18 +128,17 @@ public TypeLayout visit(FloatingPoint type) { default: throw new UnsupportedOperationException("Unsupported Precision: " + type.getPrecision()); } - return newFixedWidthTypeLayout(dataVector(bitWidth)); + return newFixedWidthTypeLayout(BufferLayout.dataBuffer(bitWidth)); } @Override public TypeLayout visit(Decimal type) { - // TODO: check size - return newFixedWidthTypeLayout(dataVector(64)); // actually depends on the type fields + return newFixedWidthTypeLayout(BufferLayout.dataBuffer(128)); } @Override public TypeLayout visit(Bool type) { - return newFixedWidthTypeLayout(booleanVector()); + return newFixedWidthTypeLayout(BufferLayout.booleanVector()); } @Override @@ -161,39 +152,46 @@ public TypeLayout 
visit(Utf8 type) { } private TypeLayout newVariableWidthTypeLayout() { - return newPrimitiveTypeLayout(validityVector(), offsetVector(), byteVector()); + return newPrimitiveTypeLayout(BufferLayout.validityVector(), BufferLayout.offsetBuffer(), BufferLayout.byteVector()); } - private TypeLayout newPrimitiveTypeLayout(VectorLayout... vectors) { + private TypeLayout newPrimitiveTypeLayout(BufferLayout... vectors) { return new TypeLayout(asList(vectors)); } - public TypeLayout newFixedWidthTypeLayout(VectorLayout dataVector) { - return newPrimitiveTypeLayout(validityVector(), dataVector); + public TypeLayout newFixedWidthTypeLayout(BufferLayout dataVector) { + return newPrimitiveTypeLayout(BufferLayout.validityVector(), dataVector); } @Override public TypeLayout visit(Null type) { - return new TypeLayout(Collections.emptyList()); + return new TypeLayout(Collections.emptyList()); } @Override public TypeLayout visit(Date type) { - return newFixedWidthTypeLayout(dataVector(64)); + switch (type.getUnit()) { + case DAY: + return newFixedWidthTypeLayout(BufferLayout.dataBuffer(32)); + case MILLISECOND: + return newFixedWidthTypeLayout(BufferLayout.dataBuffer(64)); + default: + throw new UnsupportedOperationException("Unknown unit " + type.getUnit()); + } } @Override public TypeLayout visit(Time type) { - return newFixedWidthTypeLayout(dataVector(type.getBitWidth())); + return newFixedWidthTypeLayout(BufferLayout.dataBuffer(type.getBitWidth())); } @Override - public TypeLayout visit(Interval type) { // TODO: check size + public TypeLayout visit(Interval type) { switch (type.getUnit()) { case DAY_TIME: - return newFixedWidthTypeLayout(dataVector(64)); + return newFixedWidthTypeLayout(BufferLayout.dataBuffer(64)); case YEAR_MONTH: - return newFixedWidthTypeLayout(dataVector(64)); + return newFixedWidthTypeLayout(BufferLayout.dataBuffer(32)); default: throw new UnsupportedOperationException("Unknown unit " + type.getUnit()); } @@ -203,39 +201,37 @@ public TypeLayout 
visit(Interval type) { // TODO: check size return layout; } - private final List vectors; + private final List bufferLayouts; - @JsonCreator - public TypeLayout(@JsonProperty("vectors") List vectors) { + public TypeLayout(List bufferLayouts) { super(); - this.vectors = Preconditions.checkNotNull(vectors); + this.bufferLayouts = Preconditions.checkNotNull(bufferLayouts); } - public TypeLayout(VectorLayout... vectors) { - this(asList(vectors)); + public TypeLayout(BufferLayout... bufferLayouts) { + this(asList(bufferLayouts)); } - public List getVectors() { - return vectors; + public List getBufferLayouts() { + return bufferLayouts; } - @JsonIgnore - public List getVectorTypes() { - List types = new ArrayList<>(vectors.size()); - for (VectorLayout vector : vectors) { + public List getBufferTypes() { + List types = new ArrayList<>(bufferLayouts.size()); + for (BufferLayout vector : bufferLayouts) { types.add(vector.getType()); } return types; } public String toString() { - return vectors.toString(); + return bufferLayouts.toString(); } @Override public int hashCode() { - return vectors.hashCode(); + return bufferLayouts.hashCode(); } @Override @@ -250,7 +246,7 @@ public boolean equals(Object obj) { return false; } TypeLayout other = (TypeLayout) obj; - return vectors.equals(other.vectors); + return bufferLayouts.equals(other.bufferLayouts); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/UInt1Vector.java b/java/vector/src/main/java/org/apache/arrow/vector/UInt1Vector.java new file mode 100644 index 0000000000000..6901a889a6dcf --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/UInt1Vector.java @@ -0,0 +1,316 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.UInt1ReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.UInt1Holder; +import org.apache.arrow.vector.holders.NullableUInt1Holder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * UInt1Vector implements a fixed width (1 bytes) vector of + * integer values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public class UInt1Vector extends BaseFixedWidthVector { + private static final byte TYPE_WIDTH = 1; + private final FieldReader reader; + + public UInt1Vector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(org.apache.arrow.vector.types.Types.MinorType.UINT1.getType()), + allocator); + } + + public UInt1Vector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, TYPE_WIDTH); + reader = new UInt1ReaderImpl(UInt1Vector.this); + } + + @Override + public FieldReader getReader() { + return reader; + } + + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.UINT1; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector. 
+ * + * @param index position of element + * @return element at given index + */ + public byte get(int index) throws IllegalStateException { + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getByte(index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * set the state in holder. If the element at the given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableUInt1Holder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getByte(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}, except that a null element yields null instead of an exception. + * + * @param index position of element + * @return element at given index, or null if the element is not set + */ + public Byte getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getByte(index * TYPE_WIDTH); + } + } + + public void copyFrom(int fromIndex, int thisIndex, UInt1Vector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + final byte value = from.valueBuffer.getByte(fromIndex * TYPE_WIDTH); + valueBuffer.setByte(thisIndex * TYPE_WIDTH, value); + } + + public void copyFromSafe(int fromIndex, int thisIndex, UInt1Vector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + private void setValue(int index, int value) { + valueBuffer.setByte(index * TYPE_WIDTH, value); + } + + private void setValue(int index, byte value) { + valueBuffer.setByte(index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value.
+ * + * @param index position of element + * @param value value of element + */ + public void set(int index, int value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, byte value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, the element + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableUInt1Holder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, UInt1Holder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, int)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, int value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, byte)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}.
+ * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, byte value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableUInt1Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableUInt1Holder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, UInt1Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, UInt1Holder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + /* not really needed to set the bit to 0 as long as + * the buffer always starts from 0. 
+ */ + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + public void set(int index, int isSet, byte value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + public void setSafe(int index, int isSet, byte value) { + handleSafe(index); + set(index, isSet, value); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((UInt1Vector) to); + } + + private class TransferImpl implements TransferPair { + UInt1Vector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new UInt1Vector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(UInt1Vector to) { + this.to = to; + } + + @Override + public UInt1Vector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, UInt1Vector.this); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/UInt2Vector.java b/java/vector/src/main/java/org/apache/arrow/vector/UInt2Vector.java new file mode 100644 index 0000000000000..968ce9151fae0 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/UInt2Vector.java @@ -0,0 +1,317 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.UInt2ReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.UInt2Holder; +import org.apache.arrow.vector.holders.NullableUInt2Holder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.slf4j.Logger; + +/** + * UInt2Vector implements a fixed width (2 bytes) vector of + * integer values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public class UInt2Vector extends BaseFixedWidthVector { + private static final byte TYPE_WIDTH = 2; + private final FieldReader reader; + + public UInt2Vector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(org.apache.arrow.vector.types.Types.MinorType.UINT2.getType()), + allocator); + } + + public UInt2Vector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, TYPE_WIDTH); + reader = new UInt2ReaderImpl(UInt2Vector.this); + } + + @Override + public FieldReader getReader() { + return reader; + } + + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.UINT2; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector. 
+ * + * @param index position of element + * @return element at given index + */ + public char get(int index) throws IllegalStateException { + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getChar(index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableUInt2Holder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getChar(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. + * + * @param index position of element + * @return element at given index + */ + public Character getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getChar(index * TYPE_WIDTH); + } + } + + public void copyFrom(int fromIndex, int thisIndex, UInt2Vector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + final char value = from.valueBuffer.getChar(fromIndex * TYPE_WIDTH); + valueBuffer.setChar(thisIndex * TYPE_WIDTH, value); + } + + public void copyFromSafe(int fromIndex, int thisIndex, UInt2Vector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + private void setValue(int index, int value) { + valueBuffer.setChar(index * TYPE_WIDTH, value); + } + + private void setValue(int index, char value) { + valueBuffer.setChar(index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value. 
+ * + * @param index position of element + * @param value value of element + */ + public void set(int index, int value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the given value. + * + * @param index position of element + * @param value value of element + */ + public void set(int index, char value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, element in the + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableUInt2Holder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, UInt2Holder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, int)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, int value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, char)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. 
+ * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, char value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableUInt2Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableUInt2Holder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, UInt2Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, UInt2Holder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + /* not really needed to set the bit to 0 as long as + * the buffer always starts from 0. 
+ */ + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + public void set(int index, int isSet, char value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + public void setSafe(int index, int isSet, char value) { + handleSafe(index); + set(index, isSet, value); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((UInt2Vector) to); + } + + private class TransferImpl implements TransferPair { + UInt2Vector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new UInt2Vector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(UInt2Vector to) { + this.to = to; + } + + @Override + public UInt2Vector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, UInt2Vector.this); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/UInt4Vector.java b/java/vector/src/main/java/org/apache/arrow/vector/UInt4Vector.java new file mode 100644 index 0000000000000..af219cb061e48 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/UInt4Vector.java @@ -0,0 +1,289 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.UInt4ReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.UInt4Holder; +import org.apache.arrow.vector.holders.NullableUInt4Holder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.slf4j.Logger; + +/** + * UInt4Vector implements a fixed width (4 bytes) vector of + * integer values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public class UInt4Vector extends BaseFixedWidthVector { + private static final byte TYPE_WIDTH = 4; + private final FieldReader reader; + + public UInt4Vector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(org.apache.arrow.vector.types.Types.MinorType.UINT4.getType()), + allocator); + } + + public UInt4Vector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, TYPE_WIDTH); + reader = new UInt4ReaderImpl(UInt4Vector.this); + } + + @Override + public FieldReader getReader() { + return reader; + } + + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.UINT4; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector. 
+ * + * @param index position of element + * @return element at given index + */ + public int get(int index) throws IllegalStateException { + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getInt(index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableUInt4Holder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getInt(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. + * + * @param index position of element + * @return element at given index + */ + public Integer getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getInt(index * TYPE_WIDTH); + } + } + + public void copyFrom(int fromIndex, int thisIndex, UInt4Vector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + final int value = from.valueBuffer.getInt(fromIndex * TYPE_WIDTH); + valueBuffer.setInt(thisIndex * TYPE_WIDTH, value); + } + + public void copyFromSafe(int fromIndex, int thisIndex, UInt4Vector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + private void setValue(int index, int value) { + valueBuffer.setInt(index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value. 
+ * + * @param index position of element + * @param value value of element + */ + public void set(int index, int value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, element in the + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableUInt4Holder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, UInt4Holder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, int)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, int value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableUInt4Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. 
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableUInt4Holder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, UInt4Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, UInt4Holder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + /* not really needed to set the bit to 0 as long as + * the buffer always starts from 0. + */ + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + public void set(int index, int isSet, int value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + public void setSafe(int index, int isSet, int value) { + handleSafe(index); + set(index, isSet, value); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((UInt4Vector) to); + } + + private class TransferImpl implements TransferPair { + UInt4Vector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new UInt4Vector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(UInt4Vector to) { + this.to = to; + } + + @Override + public UInt4Vector getTo() { + return to; 
+ } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, UInt4Vector.this); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/UInt8Vector.java b/java/vector/src/main/java/org/apache/arrow/vector/UInt8Vector.java new file mode 100644 index 0000000000000..eae4a31729955 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/UInt8Vector.java @@ -0,0 +1,289 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.UInt8ReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.UInt8Holder; +import org.apache.arrow.vector.holders.NullableUInt8Holder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; +import org.slf4j.Logger; + +/** + * UInt8Vector implements a fixed width vector (8 bytes) of + * integer values which could be null. A validity buffer (bit vector) is + * maintained to track which elements in the vector are null. + */ +public class UInt8Vector extends BaseFixedWidthVector { + private static final byte TYPE_WIDTH = 8; + private final FieldReader reader; + + public UInt8Vector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(Types.MinorType.UINT8.getType()), + allocator); + } + + public UInt8Vector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType, TYPE_WIDTH); + reader = new UInt8ReaderImpl(UInt8Vector.this); + } + + @Override + public FieldReader getReader() { + return reader; + } + + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.UINT8; + } + + + /****************************************************************** + * * + * vector value retrieval methods * + * * + ******************************************************************/ + + + /** + * Get the element at the given index from the vector. 
+ * + * @param index position of element + * @return element at given index + */ + public long get(int index) throws IllegalStateException { + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + return valueBuffer.getLong(index * TYPE_WIDTH); + } + + /** + * Get the element at the given index from the vector and + * sets the state in holder. If element at given index + * is null, holder.isSet will be zero. + * + * @param index position of element + */ + public void get(int index, NullableUInt8Holder holder) { + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.value = valueBuffer.getLong(index * TYPE_WIDTH); + } + + /** + * Same as {@link #get(int)}. + * + * @param index position of element + * @return element at given index + */ + public Long getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return valueBuffer.getLong(index * TYPE_WIDTH); + } + } + + public void copyFrom(int fromIndex, int thisIndex, UInt8Vector from) { + BitVectorHelper.setValidityBit(validityBuffer, thisIndex, from.isSet(fromIndex)); + final long value = from.valueBuffer.getLong(fromIndex * TYPE_WIDTH); + valueBuffer.setLong(thisIndex * TYPE_WIDTH, value); + } + + public void copyFromSafe(int fromIndex, int thisIndex, UInt8Vector from) { + handleSafe(thisIndex); + copyFrom(fromIndex, thisIndex, from); + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + private void setValue(int index, long value) { + valueBuffer.setLong(index * TYPE_WIDTH, value); + } + + /** + * Set the element at the given index to the given value. 
+ * + * @param index position of element + * @param value value of element + */ + public void set(int index, long value) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, value); + } + + /** + * Set the element at the given index to the value set in data holder. + * If the value in holder is not indicated as set, element in the + * at the given index will be null. + * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void set(int index, NullableUInt8Holder holder) throws IllegalArgumentException { + if (holder.isSet < 0) { + throw new IllegalArgumentException(); + } else if (holder.isSet > 0) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + /** + * Set the element at the given index to the value set in data holder. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void set(int index, UInt8Holder holder) { + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + setValue(index, holder.value); + } + + /** + * Same as {@link #set(int, long)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param value value of element + */ + public void setSafe(int index, long value) { + handleSafe(index); + set(index, value); + } + + /** + * Same as {@link #set(int, NullableUInt8Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. 
+ * + * @param index position of element + * @param holder nullable data holder for value of element + */ + public void setSafe(int index, NullableUInt8Holder holder) throws IllegalArgumentException { + handleSafe(index); + set(index, holder); + } + + /** + * Same as {@link #set(int, UInt8Holder)} except that it handles the + * case when index is greater than or equal to existing + * value capacity {@link #getValueCapacity()}. + * + * @param index position of element + * @param holder data holder for value of element + */ + public void setSafe(int index, UInt8Holder holder) { + handleSafe(index); + set(index, holder); + } + + /** + * Set the element at the given index to null. + * + * @param index position of element + */ + public void setNull(int index) { + handleSafe(index); + /* not really needed to set the bit to 0 as long as + * the buffer always starts from 0. + */ + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + + public void set(int index, int isSet, long value) { + if (isSet > 0) { + set(index, value); + } else { + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } + } + + public void setSafe(int index, int isSet, long value) { + handleSafe(index); + set(index, isSet, value); + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((UInt8Vector) to); + } + + private class TransferImpl implements TransferPair { + UInt8Vector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new UInt8Vector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(UInt8Vector to) { + this.to = to; + } + + @Override + public UInt8Vector getTo() { + return 
to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, UInt8Vector.this); + } + } +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java index fb7286f852c06..24cf59a0da81d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + *

* http://www.apache.org/licenses/LICENSE-2.0 - * + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -100,10 +100,19 @@ public interface ValueVector extends Closeable, Iterable { void close(); /** - * Release the underlying ArrowBuf and reset the ValueVector to empty. + * Release any owned ArrowBuf and reset the ValueVector to the initial state. If the + * vector has any child vectors, they will also be cleared. */ void clear(); + /** + * Reset the ValueVector to the initial state without releasing any owned ArrowBuf. + * Buffer capacities will remain unchanged and any previous data will be zeroed out. + * This includes buffers for data, validity, offset, etc. If the vector has any + * child vectors, they will also be reset. + */ + void reset(); + /** * Get information about how this field is materialized. * @@ -135,18 +144,6 @@ public interface ValueVector extends Closeable, Iterable { */ TransferPair makeTransferPair(ValueVector target); - /** - * @return an {@link org.apache.arrow.vector.ValueVector.Accessor accessor} that is used to read from this vector - * instance. - */ - Accessor getAccessor(); - - /** - * @return an {@link org.apache.arrow.vector.ValueVector.Mutator mutator} that is used to write to this vector - * instance. - */ - Mutator getMutator(); - /** * @return a {@link org.apache.arrow.vector.complex.reader.FieldReader field reader} that supports reading values * from this vector. @@ -160,7 +157,7 @@ public interface ValueVector extends Closeable, Iterable { /** * Returns the number of bytes that is used by this vector if it holds the given number - * of values. The result will be the same as if Mutator.setValueCount() were called, followed + * of values. 
The result will be the same as if setValueCount() were called, followed * by calling getBufferSize(), but without any of the closing side-effects that setValueCount() * implies wrt finishing off the population of a vector. Some operations might wish to use * this to determine how much memory has been used by a vector so far, even though it is @@ -182,77 +179,56 @@ public interface ValueVector extends Closeable, Iterable { */ ArrowBuf[] getBuffers(boolean clear); - /** - * An abstraction that is used to read from this vector instance. - */ - interface Accessor { - /** - * Get the Java Object representation of the element at the specified position. Useful for testing. - * - * @param index Index of the value to get - * @return the friendly java type - */ - Object getObject(int index); - - /** - * @return the number of values that is stored in this vector. - */ - int getValueCount(); - - /** - * @param index the index to check for nullity - * @return true if the value at the given index is null, false otherwise. - */ - boolean isNull(int index); - - /** - * @return the number of null values - */ - int getNullCount(); - } - - /** - * An abstraction that is used to write into this vector instance. - */ - interface Mutator { - /** - * Sets the number of values that is stored in this vector to the given value count. - * - * @param valueCount value count to set. - */ - void setValueCount(int valueCount); - - /** - * Resets the mutator to pristine state. - */ - void reset(); - - /** - * @param values the number of values to generate - * @deprecated this has nothing to do with value vector abstraction and should be removed. 
- */ - @Deprecated - void generateTestData(int values); - } - /** * Gets the underlying buffer associated with validity vector * * @return buffer */ - public ArrowBuf getValidityBuffer(); + ArrowBuf getValidityBuffer(); /** * Gets the underlying buffer associated with data vector * * @return buffer */ - public ArrowBuf getDataBuffer(); + ArrowBuf getDataBuffer(); /** * Gets the underlying buffer associated with offset vector * * @return buffer */ - public ArrowBuf getOffsetBuffer(); + ArrowBuf getOffsetBuffer(); + + /** + * Gets the number of values + * @return + */ + int getValueCount(); + + /** + * Set number of values in the vector + * @return + */ + void setValueCount(int valueCount); + + /** + * Get friendly type object from the vector + * @param index + * @return + */ + Object getObject(int index); + + /** + * Returns number of null elements in the vector + * @return + */ + int getNullCount(); + + /** + * Check whether an element in the vector is null + * @param index + * @return + */ + boolean isNull(int index); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java b/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java new file mode 100644 index 0000000000000..893ad7cb594a7 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java @@ -0,0 +1,328 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.VarBinaryReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.VarBinaryHolder; +import org.apache.arrow.vector.holders.NullableVarBinaryHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +import java.nio.ByteBuffer; + +/** + * VarBinaryVector implements a variable width vector of binary + * values which could be NULL. A validity buffer (bit vector) is maintained + * to track which elements in the vector are null. + */ +public class VarBinaryVector extends BaseVariableWidthVector { + private final FieldReader reader; + + /** + * Instantiate a VarBinaryVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public VarBinaryVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(Types.MinorType.VARBINARY.getType()), allocator); + } + + /** + * Instantiate a VarBinaryVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. 
+ */ + public VarBinaryVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType); + reader = new VarBinaryReaderImpl(VarBinaryVector.this); + } + + /** + * Get a reader that supports reading values from this vector + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.VARBINARY; + } + + + /****************************************************************** + * * + * vector value getter methods * + * * + ******************************************************************/ + + + /** + * Get the variable length element at specified index as byte array. + * + * @param index position of element to get + * @return array of bytes for non-null element; throws IllegalStateException if the element is null + */ + public byte[] get(int index) { + assert index >= 0; + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + final int startOffset = getstartOffset(index); + final int dataLength = + offsetBuffer.getInt((index + 1) * OFFSET_WIDTH) - startOffset; + final byte[] result = new byte[dataLength]; + valueBuffer.getBytes(startOffset, result, 0, dataLength); + return result; + } + + /** + * Get the variable length element at specified index as a byte array. + * + * @param index position of element to get + * @return byte array for non-null element, null otherwise + */ + public byte[] getObject(int index) { + byte[] b; + try { + b = get(index); + } catch (IllegalStateException e) { + return null; + } + return b; + } + + /** + * Get the variable length element at specified index and sets the state + * in provided holder.
+ * + * @param index position of element to get + * @param holder data holder to be populated by this function + */ + public void get(int index, NullableVarBinaryHolder holder) { + assert index >= 0; + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.start = getstartOffset(index); + holder.end = offsetBuffer.getInt((index + 1) * OFFSET_WIDTH); + holder.buffer = valueBuffer; + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFrom(int fromIndex, int thisIndex, VarBinaryVector from) { + final int start = from.offsetBuffer.getInt(fromIndex * OFFSET_WIDTH); + final int end = from.offsetBuffer.getInt((fromIndex + 1) * OFFSET_WIDTH); + final int length = end - start; + fillHoles(thisIndex); + BitVectorHelper.setValidityBit(this.validityBuffer, thisIndex, from.isSet(fromIndex)); + final int copyStart = offsetBuffer.getInt(thisIndex * OFFSET_WIDTH); + from.valueBuffer.getBytes(start, this.valueBuffer, copyStart, length); + offsetBuffer.setInt((thisIndex + 1) * OFFSET_WIDTH, copyStart + length); + lastSet = thisIndex; + } + + /** + * Same as {@link #copyFrom(int, int, VarBinaryVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. 
+ * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFromSafe(int fromIndex, int thisIndex, VarBinaryVector from) { + final int start = from.offsetBuffer.getInt(fromIndex * OFFSET_WIDTH); + final int end = from.offsetBuffer.getInt((fromIndex + 1) * OFFSET_WIDTH); + final int length = end - start; + handleSafe(thisIndex, length); + fillHoles(thisIndex); + BitVectorHelper.setValidityBit(this.validityBuffer, thisIndex, from.isSet(fromIndex)); + final int copyStart = offsetBuffer.getInt(thisIndex * OFFSET_WIDTH); + from.valueBuffer.getBytes(start, this.valueBuffer, copyStart, length); + offsetBuffer.setInt((thisIndex + 1) * OFFSET_WIDTH, copyStart + length); + lastSet = thisIndex; + } + + /** + * Set the variable length element at the specified index to the data + * buffer supplied in the holder + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, VarBinaryHolder holder) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + final int dataLength = holder.end - holder.start; + final int startOffset = getstartOffset(index); + offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + lastSet = index; + } + + /** + * Same as {@link #set(int, VarBinaryHolder)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. 
+ */ + public void setSafe(int index, VarBinaryHolder holder) { + assert index >= 0; + final int dataLength = holder.end - holder.start; + fillEmpties(index); + handleSafe(index, dataLength); + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + final int startOffset = getstartOffset(index); + offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + lastSet = index; + } + + /** + * Set the variable length element at the specified index to the data + * buffer supplied in the holder + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, NullableVarBinaryHolder holder) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setValidityBit(validityBuffer, index, holder.isSet); + final int dataLength = holder.end - holder.start; + final int startOffset = getstartOffset(index); + offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + lastSet = index; + } + + /** + * Same as {@link #set(int, NullableVarBinaryHolder)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. 
+ */ + public void setSafe(int index, NullableVarBinaryHolder holder) { + assert index >= 0; + final int dataLength = holder.end - holder.start; + fillEmpties(index); + handleSafe(index, dataLength); + BitVectorHelper.setValidityBit(validityBuffer, index, holder.isSet); + final int startOffset = getstartOffset(index); + offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + lastSet = index; + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + /** + * Construct a TransferPair comprising this vector and a target vector of + * the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type.
+ * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((VarBinaryVector) to); + } + + private class TransferImpl implements TransferPair { + VarBinaryVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new VarBinaryVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(VarBinaryVector to) { + this.to = to; + } + + @Override + public VarBinaryVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, VarBinaryVector.this); + } + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java b/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java new file mode 100644 index 0000000000000..8a38b1d455d20 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java @@ -0,0 +1,331 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.impl.VarCharReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.VarCharHolder; +import org.apache.arrow.vector.holders.NullableVarCharHolder; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.Text; +import org.apache.arrow.vector.util.TransferPair; + +import java.nio.ByteBuffer; + +/** + * VarCharVector implements a variable width vector of VARCHAR + * values which could be NULL. A validity buffer (bit vector) is maintained + * to track which elements in the vector are null. + */ +public class VarCharVector extends BaseVariableWidthVector { + private final FieldReader reader; + + /** + * Instantiate a VarCharVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public VarCharVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(org.apache.arrow.vector.types.Types.MinorType.VARCHAR.getType()), allocator); + } + + /** + * Instantiate a VarCharVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. 
+ */ + public VarCharVector(String name, FieldType fieldType, BufferAllocator allocator) { + super(name, allocator, fieldType); + reader = new VarCharReaderImpl(VarCharVector.this); + } + + /** + * Get a reader that supports reading values from this vector + * @return Field Reader for this vector + */ + @Override + public FieldReader getReader() { + return reader; + } + + /** + * Get minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public Types.MinorType getMinorType() { + return Types.MinorType.VARCHAR; + } + + + /****************************************************************** + * * + * vector value getter methods * + * * + ******************************************************************/ + + + /** + * Get the variable length element at specified index as byte array. + * + * @param index position of element to get + * @return array of bytes for non-null element; throws IllegalStateException if the element is null + */ + public byte[] get(int index) { + assert index >= 0; + if (isSet(index) == 0) { + throw new IllegalStateException("Value at index is null"); + } + final int startOffset = getstartOffset(index); + final int dataLength = + offsetBuffer.getInt((index + 1) * OFFSET_WIDTH) - startOffset; + final byte[] result = new byte[dataLength]; + valueBuffer.getBytes(startOffset, result, 0, dataLength); + return result; + } + + /** + * Get the variable length element at specified index as Text. + * + * @param index position of element to get + * @return Text object for non-null element, null otherwise + */ + public Text getObject(int index) { + Text result = new Text(); + byte[] b; + try { + b = get(index); + } catch (IllegalStateException e) { + return null; + } + result.set(b); + return result; + } + + /** + * Get the variable length element at specified index and sets the state + * in provided holder.
+ * + * @param index position of element to get + * @param holder data holder to be populated by this function + */ + public void get(int index, NullableVarCharHolder holder) { + assert index >= 0; + if (isSet(index) == 0) { + holder.isSet = 0; + return; + } + holder.isSet = 1; + holder.start = getstartOffset(index); + holder.end = offsetBuffer.getInt((index + 1) * OFFSET_WIDTH); + holder.buffer = valueBuffer; + } + + + /****************************************************************** + * * + * vector value setter methods * + * * + ******************************************************************/ + + + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFrom(int fromIndex, int thisIndex, VarCharVector from) { + final int start = from.offsetBuffer.getInt(fromIndex * OFFSET_WIDTH); + final int end = from.offsetBuffer.getInt((fromIndex + 1) * OFFSET_WIDTH); + final int length = end - start; + fillHoles(thisIndex); + BitVectorHelper.setValidityBit(this.validityBuffer, thisIndex, from.isSet(fromIndex)); + final int copyStart = offsetBuffer.getInt(thisIndex * OFFSET_WIDTH); + from.valueBuffer.getBytes(start, this.valueBuffer, copyStart, length); + offsetBuffer.setInt((thisIndex + 1) * OFFSET_WIDTH, copyStart + length); + lastSet = thisIndex; + } + + /** + * Same as {@link #copyFrom(int, int, VarCharVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. 
+ * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + public void copyFromSafe(int fromIndex, int thisIndex, VarCharVector from) { + final int start = from.offsetBuffer.getInt(fromIndex * OFFSET_WIDTH); + final int end = from.offsetBuffer.getInt((fromIndex + 1) * OFFSET_WIDTH); + final int length = end - start; + handleSafe(thisIndex, length); + fillHoles(thisIndex); + BitVectorHelper.setValidityBit(this.validityBuffer, thisIndex, from.isSet(fromIndex)); + final int copyStart = offsetBuffer.getInt(thisIndex * OFFSET_WIDTH); + from.valueBuffer.getBytes(start, this.valueBuffer, copyStart, length); + offsetBuffer.setInt((thisIndex + 1) * OFFSET_WIDTH, copyStart + length); + lastSet = thisIndex; + } + + /** + * Set the variable length element at the specified index to the data + * buffer supplied in the holder + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, VarCharHolder holder) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + final int dataLength = holder.end - holder.start; + final int startOffset = getstartOffset(index); + offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + lastSet = index; + } + + /** + * Same as {@link #set(int, VarCharHolder)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. 
+ */ + public void setSafe(int index, VarCharHolder holder) { + assert index >= 0; + final int dataLength = holder.end - holder.start; + fillEmpties(index); + handleSafe(index, dataLength); + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + final int startOffset = getstartOffset(index); + offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + lastSet = index; + } + + /** + * Set the variable length element at the specified index to the data + * buffer supplied in the holder + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, NullableVarCharHolder holder) { + assert index >= 0; + fillHoles(index); + BitVectorHelper.setValidityBit(validityBuffer, index, holder.isSet); + final int dataLength = holder.end - holder.start; + final int startOffset = getstartOffset(index); + offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + lastSet = index; + } + + /** + * Same as {@link #set(int, NullableVarCharHolder)} except that it handles the + * case where index and length of new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. 
+ */ + public void setSafe(int index, NullableVarCharHolder holder) { + assert index >= 0; + final int dataLength = holder.end - holder.start; + fillEmpties(index); + handleSafe(index, dataLength); + BitVectorHelper.setValidityBit(validityBuffer, index, holder.isSet); + final int startOffset = getstartOffset(index); + offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + dataLength); + valueBuffer.setBytes(startOffset, holder.buffer, holder.start, dataLength); + lastSet = index; + } + + + /****************************************************************** + * * + * vector transfer * + * * + ******************************************************************/ + + /** + * Construct a TransferPair comprising this vector and a target vector of + * the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(ref, allocator); + } + + /** + * Construct a TransferPair with a desired target vector of the same type.
+ * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((VarCharVector) to); + } + + private class TransferImpl implements TransferPair { + VarCharVector to; + + public TransferImpl(String ref, BufferAllocator allocator) { + to = new VarCharVector(ref, field.getFieldType(), allocator); + } + + public TransferImpl(VarCharVector to) { + this.to = to; + } + + @Override + public VarCharVector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, VarCharVector.this); + } + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java index 04c00b7c8349c..593d4dceaf782 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java @@ -35,17 +35,5 @@ public interface VariableWidthVector extends ValueVector { */ int getByteCapacity(); - VariableWidthMutator getMutator(); - - VariableWidthAccessor getAccessor(); - - interface VariableWidthAccessor extends Accessor { - int getValueLength(int index); - } - int getCurrentSizeInBytes(); - - interface VariableWidthMutator extends Mutator { - void setValueLengthSafe(int index, int length); - } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/NullableVectorDefinitionSetter.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorDefinitionSetter.java similarity index 94% rename from java/vector/src/main/java/org/apache/arrow/vector/NullableVectorDefinitionSetter.java rename to 
java/vector/src/main/java/org/apache/arrow/vector/VectorDefinitionSetter.java index 1e0746aabaa61..2f45d3a5b6316 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/NullableVectorDefinitionSetter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorDefinitionSetter.java @@ -18,7 +18,7 @@ package org.apache.arrow.vector; -public interface NullableVectorDefinitionSetter { +public interface VectorDefinitionSetter { public void setIndexDefined(int index); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java index 58fc80bbba17c..c933d149f8d00 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java @@ -24,9 +24,8 @@ import java.util.Iterator; import java.util.List; -import org.apache.arrow.vector.schema.ArrowFieldNode; -import org.apache.arrow.vector.schema.ArrowRecordBatch; -import org.apache.arrow.vector.schema.VectorLayout; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; import org.apache.arrow.vector.types.pojo.Field; import com.google.common.collect.Iterators; @@ -71,9 +70,9 @@ private void loadBuffers(FieldVector vector, Field field, Iterator buf checkArgument(nodes.hasNext(), "no more field nodes for for field " + field + " and vector " + vector); ArrowFieldNode fieldNode = nodes.next(); - List typeLayout = field.getTypeLayout().getVectors(); - List ownBuffers = new ArrayList<>(typeLayout.size()); - for (int j = 0; j < typeLayout.size(); j++) { + List bufferLayouts = TypeLayout.getTypeLayout(field.getType()).getBufferLayouts(); + List ownBuffers = new ArrayList<>(bufferLayouts.size()); + for (int j = 0; j < bufferLayouts.size(); j++) { ownBuffers.add(buffers.next()); } try { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java 
b/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java index 0c8868cad55b5..3fd33d66d4ef5 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java @@ -38,7 +38,7 @@ public class VectorSchemaRoot implements AutoCloseable { private final Map fieldVectorsMap = new HashMap<>(); public VectorSchemaRoot(FieldVector parent) { - this(parent.getField().getChildren(), parent.getChildrenFromFields(), parent.getAccessor().getValueCount()); + this(parent.getField().getChildren(), parent.getChildrenFromFields(), parent.getValueCount()); } public VectorSchemaRoot(List fields, List fieldVectors, int rowCount) { @@ -140,7 +140,7 @@ public String contentToTSVString() { for (int i = 0; i < rowCount; i++) { row.clear(); for (FieldVector v : fieldVectors) { - row.add(v.getAccessor().getObject(i)); + row.add(v.getObject(i)); } printRow(sb, row); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java index f8385a7262a21..94981ef02f657 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java @@ -22,10 +22,9 @@ import java.util.List; import io.netty.buffer.ArrowBuf; -import org.apache.arrow.vector.ValueVector.Accessor; -import org.apache.arrow.vector.schema.ArrowFieldNode; -import org.apache.arrow.vector.schema.ArrowRecordBatch; -import org.apache.arrow.vector.schema.ArrowVectorType; +import org.apache.arrow.vector.BufferLayout.BufferType; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; public class VectorUnloader { @@ -53,10 +52,9 @@ public ArrowRecordBatch getRecordBatch() { } private void appendNodes(FieldVector vector, List nodes, List buffers) { - Accessor accessor = vector.getAccessor(); 
- nodes.add(new ArrowFieldNode(accessor.getValueCount(), includeNullCount ? accessor.getNullCount() : -1)); + nodes.add(new ArrowFieldNode(vector.getValueCount(), includeNullCount ? vector.getNullCount() : -1)); List fieldBuffers = vector.getFieldBuffers(); - List expectedBuffers = vector.getField().getTypeLayout().getVectorTypes(); + List expectedBuffers = TypeLayout.getTypeLayout(vector.getField().getType()).getBufferTypes(); if (fieldBuffers.size() != expectedBuffers.size()) { throw new IllegalArgumentException(String.format( "wrong number of buffers for field %s in vector %s. found: %s", diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java index b267b2087d05c..2d3c543d03ffa 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java @@ -18,6 +18,8 @@ package org.apache.arrow.vector; +import static org.apache.arrow.vector.complex.BaseRepeatedValueVector.DATA_VECTOR_NAME; + import java.util.Collections; import java.util.Iterator; import java.util.List; @@ -26,7 +28,7 @@ import org.apache.arrow.memory.OutOfMemoryException; import org.apache.arrow.vector.complex.impl.NullReader; import org.apache.arrow.vector.complex.reader.FieldReader; -import org.apache.arrow.vector.schema.ArrowFieldNode; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType.Null; import org.apache.arrow.vector.types.pojo.Field; @@ -39,8 +41,6 @@ public class ZeroVector implements FieldVector { public final static ZeroVector INSTANCE = new ZeroVector(); - private final String name = "[DEFAULT]"; - private final TransferPair defaultPair = new TransferPair() { @Override public void transfer() { @@ -60,41 +60,6 @@ public void copyValueSafe(int from, int to) { } }; - private final Accessor defaultAccessor = 
new Accessor() { - @Override - public Object getObject(int index) { - return null; - } - - @Override - public int getValueCount() { - return 0; - } - - @Override - public boolean isNull(int index) { - return true; - } - - @Override - public int getNullCount() { - return 0; - } - }; - - private final Mutator defaultMutator = new Mutator() { - @Override - public void setValueCount(int valueCount) { - } - - @Override - public void reset() { - } - - @Override - public void generateTestData(int values) { - } - }; public ZeroVector() { } @@ -107,9 +72,13 @@ public void close() { public void clear() { } + @Override + public void reset() { + } + @Override public Field getField() { - return new Field(name, FieldType.nullable(new Null()), null); + return new Field(DATA_VECTOR_NAME, FieldType.nullable(new Null()), null); } @Override @@ -186,16 +155,6 @@ public TransferPair makeTransferPair(ValueVector target) { return defaultPair; } - @Override - public Accessor getAccessor() { - return defaultAccessor; - } - - @Override - public Mutator getMutator() { - return defaultMutator; - } - @Override public FieldReader getReader() { return NullReader.INSTANCE; @@ -259,4 +218,19 @@ public ArrowBuf getDataBuffer() { public ArrowBuf getOffsetBuffer() { throw new UnsupportedOperationException(); } + + @Override + public int getValueCount() { return 0; } + + @Override + public void setValueCount(int valueCount) { } + + @Override + public Object getObject(int index) { return null; } + + @Override + public int getNullCount() { return 0; } + + @Override + public boolean isNull(int index) { return false; } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java index 26b0f90581ffc..1b20b09d81b0b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java +++ 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java @@ -141,8 +141,8 @@ public T addOrGet(String childName, FieldType fieldType, } private boolean nullFilled(ValueVector vector) { - for (int r = 0; r < vector.getAccessor().getValueCount(); r++) { - if (!vector.getAccessor().isNull(r)) { + for (int r = 0; r < vector.getValueCount(); r++) { + if (!vector.isNull(r)) { return false; } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java index 8e2877f892a64..d0a664ac01da2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + *

* http://www.apache.org/licenses/LICENSE-2.0 - * + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -21,6 +21,7 @@ import java.util.Collections; import java.util.Iterator; +import org.apache.arrow.memory.BaseAllocator; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.AddOrGetResult; import org.apache.arrow.vector.BaseValueVector; @@ -28,9 +29,12 @@ import org.apache.arrow.vector.UInt4Vector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.ZeroVector; +import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeID; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.OversizedAllocationException; import org.apache.arrow.vector.util.SchemaChangeRuntimeException; import com.google.common.base.Preconditions; @@ -41,12 +45,14 @@ public abstract class BaseRepeatedValueVector extends BaseValueVector implements RepeatedValueVector { public final static FieldVector DEFAULT_DATA_VECTOR = ZeroVector.INSTANCE; - public final static String OFFSETS_VECTOR_NAME = "$offsets$"; public final static String DATA_VECTOR_NAME = "$data$"; - protected final UInt4Vector offsets; + public final static byte OFFSET_WIDTH = 4; + protected ArrowBuf offsetBuffer; protected FieldVector vector; protected final CallBack callBack; + protected int valueCount; + protected int offsetAllocationSizeInBytes = INITIAL_VALUE_ALLOCATION * OFFSET_WIDTH; protected BaseRepeatedValueVector(String name, BufferAllocator allocator, CallBack callBack) { this(name, allocator, DEFAULT_DATA_VECTOR, callBack); @@ -54,42 +60,72 @@ protected BaseRepeatedValueVector(String name, BufferAllocator allocator, CallBa protected BaseRepeatedValueVector(String name, 
BufferAllocator allocator, FieldVector vector, CallBack callBack) { super(name, allocator); - this.offsets = new UInt4Vector(OFFSETS_VECTOR_NAME, allocator); + this.offsetBuffer = allocator.getEmpty(); this.vector = Preconditions.checkNotNull(vector, "data vector cannot be null"); this.callBack = callBack; + this.valueCount = 0; } @Override public boolean allocateNewSafe() { - /* boolean to keep track if all the memory allocation were successful - * Used in the case of composite vectors when we need to allocate multiple - * buffers for multiple vectors. If one of the allocations failed we need to - * clear all the memory that we allocated - */ - boolean success = false; + boolean dataAlloc = false; try { - if (!offsets.allocateNewSafe()) { - return false; - } - success = vector.allocateNewSafe(); + allocateOffsetBuffer(offsetAllocationSizeInBytes); + dataAlloc = vector.allocateNewSafe(); + } catch (Exception e) { + e.printStackTrace(); + clear(); + return false; } finally { - if (!success) { + if (!dataAlloc) { clear(); } } - offsets.zeroVector(); - return success; + return dataAlloc; + } + + protected void allocateOffsetBuffer(final long size) { + final int curSize = (int) size; + offsetBuffer = allocator.buffer(curSize); + offsetBuffer.readerIndex(0); + offsetAllocationSizeInBytes = curSize; + offsetBuffer.setZero(0, offsetBuffer.capacity()); } @Override public void reAlloc() { - offsets.reAlloc(); + reallocOffsetBuffer(); vector.reAlloc(); } + protected void reallocOffsetBuffer() { + final int currentBufferCapacity = offsetBuffer.capacity(); + long baseSize = offsetAllocationSizeInBytes; + + if (baseSize < (long) currentBufferCapacity) { + baseSize = (long) currentBufferCapacity; + } + + long newAllocationSize = baseSize * 2L; + newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); + + if (newAllocationSize > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + + final ArrowBuf newBuf = 
allocator.buffer((int) newAllocationSize); + newBuf.setBytes(0, offsetBuffer, 0, currentBufferCapacity); + final int halfNewCapacity = newBuf.capacity() / 2; + newBuf.setZero(halfNewCapacity, halfNewCapacity); + offsetBuffer.release(1); + offsetBuffer = newBuf; + offsetAllocationSizeInBytes = (int) newAllocationSize; + } + @Override + @Deprecated public UInt4Vector getOffsetVector() { - return offsets; + throw new UnsupportedOperationException("There is no inner offset vector"); } @Override @@ -99,25 +135,33 @@ public FieldVector getDataVector() { @Override public void setInitialCapacity(int numRecords) { - offsets.setInitialCapacity(numRecords + 1); - vector.setInitialCapacity(numRecords * RepeatedValueVector.DEFAULT_REPEAT_PER_RECORD); + offsetAllocationSizeInBytes = (numRecords + 1) * OFFSET_WIDTH; + if (vector instanceof BaseFixedWidthVector || vector instanceof BaseVariableWidthVector) { + vector.setInitialCapacity(numRecords * RepeatedValueVector.DEFAULT_REPEAT_PER_RECORD); + } else { + vector.setInitialCapacity(numRecords); + } } @Override public int getValueCapacity() { - final int offsetValueCapacity = Math.max(offsets.getValueCapacity() - 1, 0); + final int offsetValueCapacity = Math.max(getOffsetBufferValueCapacity() - 1, 0); if (vector == DEFAULT_DATA_VECTOR) { return offsetValueCapacity; } return Math.min(vector.getValueCapacity(), offsetValueCapacity); } + protected int getOffsetBufferValueCapacity() { + return (int) ((offsetBuffer.capacity() * 1.0) / OFFSET_WIDTH); + } + @Override public int getBufferSize() { - if (getAccessor().getValueCount() == 0) { + if (getValueCount() == 0) { return 0; } - return offsets.getBufferSize() + vector.getBufferSize(); + return ((valueCount + 1) * OFFSET_WIDTH) + vector.getBufferSize(); } @Override @@ -126,7 +170,7 @@ public int getBufferSizeFor(int valueCount) { return 0; } - return offsets.getBufferSizeFor(valueCount + 1) + vector.getBufferSizeFor(valueCount); + return ((valueCount + 1) * OFFSET_WIDTH) + 
vector.getBufferSizeFor(valueCount); } @Override @@ -136,14 +180,28 @@ public Iterator iterator() { @Override public void clear() { - offsets.clear(); + offsetBuffer = releaseBuffer(offsetBuffer); vector.clear(); + valueCount = 0; super.clear(); } + @Override + public void reset() { + offsetBuffer.setZero(0, offsetBuffer.capacity()); + vector.reset(); + valueCount = 0; + } + @Override public ArrowBuf[] getBuffers(boolean clear) { - final ArrowBuf[] buffers = ObjectArrays.concat(offsets.getBuffers(false), vector.getBuffers(false), ArrowBuf.class); + final ArrowBuf[] buffers; + if (getBufferSize() == 0) { + buffers = new ArrowBuf[0]; + } else { + buffers = ObjectArrays.concat(new ArrowBuf[]{offsetBuffer}, vector.getBuffers(false), + ArrowBuf.class); + } if (clear) { for (ArrowBuf buffer : buffers) { buffer.retain(); @@ -167,15 +225,15 @@ public AddOrGetResult addOrGetVector(FieldType fieldT // returned vector must have the same field created = true; if (callBack != null && - // not a schema change if changing from ZeroVector to ZeroVector - (fieldType.getType().getTypeID() != ArrowTypeID.Null)) { + // not a schema change if changing from ZeroVector to ZeroVector + (fieldType.getType().getTypeID() != ArrowTypeID.Null)) { callBack.doWork(); } } if (vector.getField().getType().getTypeID() != fieldType.getType().getTypeID()) { final String msg = String.format("Inner vector type mismatch. 
Requested type: [%s], actual type: [%s]", - fieldType.getType().getTypeID(), vector.getField().getType().getTypeID()); + fieldType.getType().getTypeID(), vector.getField().getType().getTypeID()); throw new SchemaChangeRuntimeException(msg); } @@ -187,54 +245,49 @@ protected void replaceDataVector(FieldVector v) { vector = v; } - public abstract class BaseRepeatedAccessor extends BaseValueVector.BaseAccessor implements RepeatedAccessor { - @Override - public int getValueCount() { - return Math.max(offsets.getAccessor().getValueCount() - 1, 0); - } + @Override + public int getValueCount() { + return valueCount; + } - @Override - public int getInnerValueCount() { - return vector.getAccessor().getValueCount(); - } + /* returns the value count for inner data vector for this list vector */ + public int getInnerValueCount() { + return vector.getValueCount(); + } - @Override - public int getInnerValueCountAt(int index) { - return offsets.getAccessor().get(index + 1) - offsets.getAccessor().get(index); - } - @Override - public boolean isNull(int index) { - return false; - } + /* returns the value count for inner data vector at a particular index */ + public int getInnerValueCountAt(int index) { + return offsetBuffer.getInt((index + 1) * OFFSET_WIDTH) - + offsetBuffer.getInt(index * OFFSET_WIDTH); + } - @Override - public boolean isEmpty(int index) { - return false; - } + public boolean isNull(int index) { + return false; } - public abstract class BaseRepeatedMutator extends BaseValueVector.BaseMutator implements RepeatedMutator { + public boolean isEmpty(int index) { + return false; + } - @Override - public int startNewValue(int index) { - while (offsets.getValueCapacity() <= index) { - offsets.reAlloc(); - } - int offset = offsets.getAccessor().get(index); - offsets.getMutator().setSafe(index + 1, offset); - setValueCount(index + 1); - return offset; + public int startNewValue(int index) { + while (index >= getOffsetBufferValueCapacity()) { + reallocOffsetBuffer(); } + int 
offset = offsetBuffer.getInt(index * OFFSET_WIDTH); + offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, offset); + setValueCount(index + 1); + return offset; + } - @Override - public void setValueCount(int valueCount) { - // TODO: populate offset end points - offsets.getMutator().setValueCount(valueCount == 0 ? 0 : valueCount + 1); - final int childValueCount = valueCount == 0 ? 0 : offsets.getAccessor().get(valueCount); - vector.getMutator().setValueCount(childValueCount); + public void setValueCount(int valueCount) { + this.valueCount = valueCount; + while (valueCount > getOffsetBufferValueCapacity()) { + reallocOffsetBuffer(); } + final int childValueCount = valueCount == 0 ? 0 : + offsetBuffer.getInt(valueCount * OFFSET_WIDTH); + vector.setValueCount(childValueCount); } - } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/EmptyValuePopulator.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/EmptyValuePopulator.java index a76fbbe11a1fb..cd3a2ae9e6f3c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/EmptyValuePopulator.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/EmptyValuePopulator.java @@ -42,14 +42,12 @@ public void populate(int lastIndex) { if (lastIndex < 0) { throw new IndexOutOfBoundsException("index cannot be negative"); } - final UInt4Vector.Accessor accessor = offsets.getAccessor(); - final UInt4Vector.Mutator mutator = offsets.getMutator(); - final int lastSet = Math.max(accessor.getValueCount() - 1, 0); - final int previousEnd = accessor.get(lastSet);//0 ? 0 : accessor.get(lastSet); + final int lastSet = Math.max(offsets.getValueCount() - 1, 0); + final int previousEnd = offsets.get(lastSet);//0 ? 
0 : accessor.get(lastSet); for (int i = lastSet; i < lastIndex; i++) { - mutator.setSafe(i + 1, previousEnd); + offsets.setSafe(i + 1, previousEnd); } - mutator.setValueCount(lastIndex + 1); + offsets.setValueCount(lastIndex + 1); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java index b3be37541c172..9314a2566b124 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java @@ -1,5 +1,4 @@ -/******************************************************************************* - +/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -7,52 +6,40 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + *

* http://www.apache.org/licenses/LICENSE-2.0 - * + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + */ package org.apache.arrow.vector.complex; import static java.util.Collections.singletonList; import static org.apache.arrow.vector.complex.BaseRepeatedValueVector.DATA_VECTOR_NAME; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; -import java.util.Objects; +import java.util.*; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; import com.google.common.collect.ObjectArrays; import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BaseAllocator; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.OutOfMemoryException; -import org.apache.arrow.vector.AddOrGetResult; -import org.apache.arrow.vector.BaseDataValueVector; -import org.apache.arrow.vector.BaseValueVector; -import org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.BufferBacked; -import org.apache.arrow.vector.FieldVector; -import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.ZeroVector; +import org.apache.arrow.vector.*; import org.apache.arrow.vector.complex.impl.UnionFixedSizeListReader; -import org.apache.arrow.vector.schema.ArrowFieldNode; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.DictionaryEncoding; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; -import org.apache.arrow.vector.util.CallBack; -import 
org.apache.arrow.vector.util.JsonStringArrayList; -import org.apache.arrow.vector.util.SchemaChangeRuntimeException; -import org.apache.arrow.vector.util.TransferPair; +import org.apache.arrow.vector.util.*; public class FixedSizeListVector extends BaseValueVector implements FieldVector, PromotableVector { @@ -62,15 +49,13 @@ public static FixedSizeListVector empty(String name, int size, BufferAllocator a } private FieldVector vector; - private final BitVector bits; + private ArrowBuf validityBuffer; private final int listSize; private final FieldType fieldType; - private final List innerVectors; private UnionFixedSizeListReader reader; - - private Mutator mutator = new Mutator(); - private Accessor accessor = new Accessor(); + private int valueCount; + private int validityAllocationSizeInBytes; // deprecated, use FieldType or static constructor instead @Deprecated @@ -87,13 +72,14 @@ public FixedSizeListVector(String name, FieldType fieldType, CallBack schemaChangeCallback) { super(name, allocator); - this.bits = new BitVector("$bits$", allocator); + this.validityBuffer = allocator.getEmpty(); this.vector = ZeroVector.INSTANCE; this.fieldType = fieldType; this.listSize = ((ArrowType.FixedSizeList) fieldType.getType()).getListSize(); Preconditions.checkArgument(listSize > 0, "list size must be positive"); - this.innerVectors = Collections.singletonList((BufferBacked) bits); this.reader = new UnionFixedSizeListReader(this); + this.valueCount = 0; + this.validityAllocationSizeInBytes = getValidityBufferSizeFromCount(INITIAL_VALUE_ALLOCATION); } @Override @@ -131,27 +117,37 @@ public List getChildrenFromFields() { @Override public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { - BaseDataValueVector.load(fieldNode, innerVectors, ownBuffers); + if (ownBuffers.size() != 1) { + throw new IllegalArgumentException("Illegal buffer count, expected " + 1 + ", got: " + ownBuffers.size()); + } + + ArrowBuf bitBuffer = ownBuffers.get(0); + + 
validityBuffer.release(); + validityBuffer = BitVectorHelper.loadValidityBuffer(fieldNode, bitBuffer, allocator); + valueCount = fieldNode.getLength(); + + validityAllocationSizeInBytes = validityBuffer.capacity(); } @Override public List getFieldBuffers() { - return BaseDataValueVector.unload(innerVectors); - } + List result = new ArrayList<>(1); + setReaderAndWriterIndex(); + result.add(validityBuffer); - @Override - public List getFieldInnerVectors() { - return innerVectors; + return result; } - @Override - public Accessor getAccessor() { - return accessor; + private void setReaderAndWriterIndex() { + validityBuffer.readerIndex(0); + validityBuffer.writerIndex(getValidityBufferSizeFromCount(valueCount)); } @Override - public Mutator getMutator() { - return mutator; + @Deprecated + public List getFieldInnerVectors() { + throw new UnsupportedOperationException("There are no inner vectors. Use getFieldBuffers"); } @Override @@ -161,7 +157,9 @@ public UnionFixedSizeListReader getReader() { @Override public void allocateNew() throws OutOfMemoryException { - allocateNewSafe(); + if (!allocateNewSafe()) { + throw new OutOfMemoryException("Failure while allocating memory"); + } } @Override @@ -173,31 +171,65 @@ public boolean allocateNewSafe() { */ boolean success = false; try { - success = bits.allocateNewSafe() && vector.allocateNewSafe(); + /* we are doing a new allocation -- release the current buffers */ + clear(); + /* allocate validity buffer */ + allocateValidityBuffer(validityAllocationSizeInBytes); + success = vector.allocateNewSafe(); } finally { if (!success) { clear(); + return false; } } - if (success) { - bits.zeroVector(); - } - return success; + + return true; + } + + private void allocateValidityBuffer(final long size) { + final int curSize = (int) size; + validityBuffer = allocator.buffer(curSize); + validityBuffer.readerIndex(0); + validityAllocationSizeInBytes = curSize; + validityBuffer.setZero(0, validityBuffer.capacity()); } @Override public void 
reAlloc() { - bits.reAlloc(); + reallocValidityBuffer(); vector.reAlloc(); } + private void reallocValidityBuffer() { + final int currentBufferCapacity = validityBuffer.capacity(); + long baseSize = validityAllocationSizeInBytes; + + if (baseSize < (long) currentBufferCapacity) { + baseSize = (long) currentBufferCapacity; + } + + long newAllocationSize = baseSize * 2L; + newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); + + if (newAllocationSize > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + + final ArrowBuf newBuf = allocator.buffer((int) newAllocationSize); + newBuf.setZero(0, newBuf.capacity()); + newBuf.setBytes(0, validityBuffer, 0, currentBufferCapacity); + validityBuffer.release(1); + validityBuffer = newBuf; + validityAllocationSizeInBytes = (int) newAllocationSize; + } + public FieldVector getDataVector() { return vector; } @Override public void setInitialCapacity(int numRecords) { - bits.setInitialCapacity(numRecords); + validityAllocationSizeInBytes = getValidityBufferSizeFromCount(numRecords); vector.setInitialCapacity(numRecords * listSize); } @@ -206,15 +238,15 @@ public int getValueCapacity() { if (vector == ZeroVector.INSTANCE) { return 0; } - return vector.getValueCapacity() / listSize; + return Math.min(vector.getValueCapacity() / listSize, getValidityBufferValueCapacity()); } @Override public int getBufferSize() { - if (accessor.getValueCount() == 0) { + if (getValueCount() == 0) { return 0; } - return bits.getBufferSize() + vector.getBufferSize(); + return getValidityBufferSizeFromCount(valueCount) + vector.getBufferSize(); } @Override @@ -222,7 +254,8 @@ public int getBufferSizeFor(int valueCount) { if (valueCount == 0) { return 0; } - return bits.getBufferSizeFor(valueCount) + vector.getBufferSizeFor(valueCount * listSize); + return getValidityBufferSizeFromCount(valueCount) + + vector.getBufferSizeFor(valueCount * listSize); } @Override @@ -232,14 +265,29 @@ public 
Iterator iterator() { @Override public void clear() { - bits.clear(); + validityBuffer = releaseBuffer(validityBuffer); vector.clear(); + valueCount = 0; super.clear(); } + @Override + public void reset() { + validityBuffer.setZero(0, validityBuffer.capacity()); + vector.reset(); + valueCount = 0; + } + @Override public ArrowBuf[] getBuffers(boolean clear) { - final ArrowBuf[] buffers = ObjectArrays.concat(bits.getBuffers(false), vector.getBuffers(false), ArrowBuf.class); + setReaderAndWriterIndex(); + final ArrowBuf[] buffers; + if (getBufferSize() == 0) { + buffers = new ArrowBuf[0]; + } else { + buffers = ObjectArrays.concat(new ArrowBuf[]{validityBuffer}, vector.getBuffers(false), + ArrowBuf.class); + } if (clear) { for (ArrowBuf buffer : buffers) { buffer.retain(); @@ -268,7 +316,7 @@ public AddOrGetResult addOrGetVector(FieldType type) // returned vector must have the same field if (!Objects.equals(vector.getField().getType(), type.getType())) { final String msg = String.format("Inner vector type mismatch. 
Requested type: [%s], actual type: [%s]", - type.getType(), vector.getField().getType()); + type.getType(), vector.getField().getType()); throw new SchemaChangeRuntimeException(msg); } @@ -295,7 +343,7 @@ public UnionVector promoteToUnion() { @Override public long getValidityBufferAddress() { - return (bits.getDataBuffer().memoryAddress()); + return validityBuffer.memoryAddress(); } @Override @@ -310,7 +358,7 @@ public long getOffsetBufferAddress() { @Override public ArrowBuf getValidityBuffer() { - return (bits.getDataBuffer()); + return validityBuffer; } @Override @@ -323,52 +371,64 @@ public ArrowBuf getOffsetBuffer() { throw new UnsupportedOperationException(); } - public class Accessor extends BaseValueVector.BaseAccessor { - - @Override - public Object getObject(int index) { - if (isNull(index)) { - return null; - } - final List vals = new JsonStringArrayList<>(listSize); - final ValueVector.Accessor valuesAccessor = vector.getAccessor(); - for (int i = 0; i < listSize; i++) { - vals.add(valuesAccessor.getObject(index * listSize + i)); - } - return vals; + @Override + public Object getObject(int index) { + if (isSet(index) == 0) { + return null; } - - @Override - public boolean isNull(int index) { - return bits.getAccessor().get(index) == 0; + final List vals = new JsonStringArrayList<>(listSize); + for (int i = 0; i < listSize; i++) { + vals.add(vector.getObject(index * listSize + i)); } + return vals; + } - @Override - public int getNullCount() { - return bits.getAccessor().getNullCount(); - } + public boolean isNull(int index) { + return (isSet(index) == 0); + } - @Override - public int getValueCount() { - return bits.getAccessor().getValueCount(); - } + public int isSet(int index) { + final int byteIndex = index >> 3; + final byte b = validityBuffer.getByte(byteIndex); + final int bitIndex = index & 7; + return Long.bitCount(b & (1L << bitIndex)); } - public class Mutator extends BaseValueVector.BaseMutator { + @Override + public int getNullCount() { + 
return BitVectorHelper.getNullCount(validityBuffer, valueCount); + } + + @Override + public int getValueCount() { + return valueCount; + } + + private int getValidityBufferValueCapacity() { + return (int) (validityBuffer.capacity() * 8L); + } - public void setNull(int index) { - bits.getMutator().setSafe(index, 0); + public void setNull(int index) { + while (index >= getValidityBufferValueCapacity()) { + reallocValidityBuffer(); } + BitVectorHelper.setValidityBit(validityBuffer, index, 0); + } - public void setNotNull(int index) { - bits.getMutator().setSafe(index, 1); + public void setNotNull(int index) { + while (index >= getValidityBufferValueCapacity()) { + reallocValidityBuffer(); } + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + } - @Override - public void setValueCount(int valueCount) { - bits.getMutator().setValueCount(valueCount); - vector.getMutator().setValueCount(valueCount * listSize); + @Override + public void setValueCount(int valueCount) { + this.valueCount = valueCount; + while (valueCount > getValidityBufferValueCapacity()) { + reallocValidityBuffer(); } + vector.setValueCount(valueCount * listSize); } @Override @@ -389,7 +449,7 @@ public TransferPair makeTransferPair(ValueVector target) { private class TransferImpl implements TransferPair { FixedSizeListVector to; - TransferPair pairs[] = new TransferPair[2]; + TransferPair dataPair; public TransferImpl(String name, BufferAllocator allocator, CallBack callBack) { this(new FixedSizeListVector(name, allocator, fieldType, callBack)); @@ -398,19 +458,21 @@ public TransferImpl(String name, BufferAllocator allocator, CallBack callBack) { public TransferImpl(FixedSizeListVector to) { this.to = to; to.addOrGetVector(vector.getField().getFieldType()); - pairs[0] = bits.makeTransferPair(to.bits); - pairs[1] = vector.makeTransferPair(to.vector); + dataPair = vector.makeTransferPair(to.vector); } @Override public void transfer() { - for (TransferPair pair : pairs) { - pair.transfer(); - } + 
to.clear(); + dataPair.transfer(); + to.validityBuffer = validityBuffer.transferOwnership(to.allocator).buffer; + to.setValueCount(valueCount); + clear(); } @Override public void splitAndTransfer(int startIndex, int length) { + to.clear(); to.allocateNew(); for (int i = 0; i < length; i++) { copyValueSafe(startIndex + i, i); @@ -423,12 +485,15 @@ public ValueVector getTo() { } @Override - public void copyValueSafe(int from, int to) { - pairs[0].copyValueSafe(from, to); - int fromOffset = from * listSize; - int toOffset = to * listSize; + public void copyValueSafe(int fromIndex, int toIndex) { + while (toIndex >= to.getValueCapacity()) { + to.reAlloc(); + } + BitVectorHelper.setValidityBit(to.validityBuffer, toIndex, isSet(fromIndex)); + int fromOffset = fromIndex * listSize; + int toOffset = toIndex * listSize; for (int i = 0; i < listSize; i++) { - pairs[1].copyValueSafe(fromOffset + i, toOffset + i); + dataPair.copyValueSafe(fromOffset + i, toOffset + i); } } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 6511efcb7d513..8aeeb7e5a2886 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -1,5 +1,4 @@ -/******************************************************************************* - +/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -7,54 +6,51 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + *

* http://www.apache.org/licenses/LICENSE-2.0 - * + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + */ package org.apache.arrow.vector.complex; import static com.google.common.base.Preconditions.checkNotNull; import static java.util.Collections.singletonList; -import java.util.Arrays; -import java.util.Collections; +import java.util.ArrayList; import java.util.List; import com.google.common.collect.ImmutableList; import com.google.common.collect.ObjectArrays; import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BaseAllocator; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.OutOfMemoryException; import org.apache.arrow.vector.AddOrGetResult; -import org.apache.arrow.vector.BaseDataValueVector; -import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.BufferBacked; import org.apache.arrow.vector.FieldVector; -import org.apache.arrow.vector.UInt4Vector; import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.ZeroVector; +import org.apache.arrow.vector.BitVectorHelper; import org.apache.arrow.vector.complex.impl.ComplexCopier; import org.apache.arrow.vector.complex.impl.UnionListReader; import org.apache.arrow.vector.complex.impl.UnionListWriter; import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.complex.writer.FieldWriter; -import org.apache.arrow.vector.schema.ArrowFieldNode; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType; -import org.apache.arrow.vector.types.pojo.ArrowType.Null; import 
org.apache.arrow.vector.types.pojo.DictionaryEncoding; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.CallBack; import org.apache.arrow.vector.util.JsonStringArrayList; +import org.apache.arrow.vector.util.OversizedAllocationException; import org.apache.arrow.vector.util.TransferPair; public class ListVector extends BaseRepeatedValueVector implements FieldVector, PromotableVector { @@ -63,14 +59,12 @@ public static ListVector empty(String name, BufferAllocator allocator) { return new ListVector(name, allocator, FieldType.nullable(ArrowType.List.INSTANCE), null); } - final UInt4Vector offsets; - final BitVector bits; - private final List innerVectors; - private Mutator mutator = new Mutator(); - private Accessor accessor = new Accessor(); + protected ArrowBuf validityBuffer; private UnionListReader reader; private CallBack callBack; private final FieldType fieldType; + private int validityAllocationSizeInBytes; + private int lastSet; // deprecated, use FieldType or static constructor instead @Deprecated @@ -86,12 +80,12 @@ public ListVector(String name, BufferAllocator allocator, DictionaryEncoding dic public ListVector(String name, BufferAllocator allocator, FieldType fieldType, CallBack callBack) { super(name, allocator, callBack); - this.bits = new BitVector("$bits$", allocator); - this.offsets = getOffsetVector(); - this.innerVectors = Collections.unmodifiableList(Arrays.asList(bits, offsets)); + this.validityBuffer = allocator.getEmpty(); this.reader = new UnionListReader(this); this.fieldType = checkNotNull(fieldType); this.callBack = callBack; + this.validityAllocationSizeInBytes = getValidityBufferSizeFromCount(INITIAL_VALUE_ALLOCATION); + this.lastSet = 0; } @Override @@ -113,44 +107,176 @@ public List getChildrenFromFields() { return singletonList(getDataVector()); } + /** + * Load the buffers of this vector with provided source buffers. 
+ * The caller manages the source buffers and populates them before invoking + * this method. + * @param fieldNode the fieldNode indicating the value count + * @param ownBuffers the buffers for this Field (own buffers only, children not included) + */ @Override public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { - // variable width values: truncate offset vector buffer to size (#1) - org.apache.arrow.vector.BaseDataValueVector.truncateBufferBasedOnSize(ownBuffers, 1, offsets.getBufferSizeFor(fieldNode.getLength() + 1)); - BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers); + if (ownBuffers.size() != 2) { + throw new IllegalArgumentException("Illegal buffer count, expected " + 2 + ", got: " + ownBuffers.size()); + } + + ArrowBuf bitBuffer = ownBuffers.get(0); + ArrowBuf offBuffer = ownBuffers.get(1); + + validityBuffer.release(); + validityBuffer = BitVectorHelper.loadValidityBuffer(fieldNode, bitBuffer, allocator); + offsetBuffer.release(); + offsetBuffer = offBuffer.retain(allocator); + + validityAllocationSizeInBytes = validityBuffer.capacity(); + offsetAllocationSizeInBytes = offsetBuffer.capacity(); + lastSet = fieldNode.getLength(); + valueCount = fieldNode.getLength(); } + /** + * Get the buffers belonging to this vector + * @return the inner buffers. + */ @Override public List getFieldBuffers() { - return BaseDataValueVector.unload(getFieldInnerVectors()); + List result = new ArrayList<>(2); + setReaderAndWriterIndex(); + result.add(validityBuffer); + result.add(offsetBuffer); + + return result; + } + + /** + * Set the reader and writer indexes for the inner buffers. 
+ */ + private void setReaderAndWriterIndex() { + validityBuffer.readerIndex(0); + offsetBuffer.readerIndex(0); + if (valueCount == 0) { + validityBuffer.writerIndex(0); + offsetBuffer.writerIndex(0); + } else { + validityBuffer.writerIndex(getValidityBufferSizeFromCount(valueCount)); + offsetBuffer.writerIndex((valueCount + 1) * OFFSET_WIDTH); + } } @Override + @Deprecated public List getFieldInnerVectors() { - return innerVectors; + throw new UnsupportedOperationException("There are no inner vectors. Use getFieldBuffers"); } public UnionListWriter getWriter() { return new UnionListWriter(this); } + /** + * Same as {@link #allocateNewSafe()}. + */ @Override public void allocateNew() throws OutOfMemoryException { - super.allocateNewSafe(); - bits.allocateNewSafe(); + if (!allocateNewSafe()) { + throw new OutOfMemoryException("Failure while allocating memory"); + } + } + + /** + * Allocate memory for the vector. We internally use a default value count + * of 4096 to allocate memory for at least these many elements in the + * vector. + * + * @return false if memory allocation fails, true otherwise. + */ + public boolean allocateNewSafe() { + boolean success = false; + try { + /* we are doing a new allocation -- release the current buffers */ + clear(); + /* allocate validity buffer */ + allocateValidityBuffer(validityAllocationSizeInBytes); + /* allocate offset and data buffer */ + success = super.allocateNewSafe(); + } finally { + if (!success) { + clear(); + return false; + } + } + return true; + } + + private void allocateValidityBuffer(final long size) { + final int curSize = (int) size; + validityBuffer = allocator.buffer(curSize); + validityBuffer.readerIndex(0); + validityAllocationSizeInBytes = curSize; + validityBuffer.setZero(0, validityBuffer.capacity()); } + /** + * Resize the vector to increase the capacity. The internal behavior is to + * double the current value capacity. 
+ */ @Override public void reAlloc() { + /* reallocate the validity buffer */ + reallocValidityBuffer(); + /* reallocate the offset and data */ super.reAlloc(); - bits.reAlloc(); } + private void reallocValidityAndOffsetBuffers() { + reallocOffsetBuffer(); + reallocValidityBuffer(); + } + + private void reallocValidityBuffer() { + final int currentBufferCapacity = validityBuffer.capacity(); + long baseSize = validityAllocationSizeInBytes; + + if (baseSize < (long) currentBufferCapacity) { + baseSize = (long) currentBufferCapacity; + } + + long newAllocationSize = baseSize * 2L; + newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); + + if (newAllocationSize > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + + final ArrowBuf newBuf = allocator.buffer((int) newAllocationSize); + newBuf.setBytes(0, validityBuffer, 0, currentBufferCapacity); + final int halfNewCapacity = newBuf.capacity() / 2; + newBuf.setZero(halfNewCapacity, halfNewCapacity); + validityBuffer.release(1); + validityBuffer = newBuf; + validityAllocationSizeInBytes = (int) newAllocationSize; + } + + /** + * Same as {@link #copyFrom(int, int, ListVector)} except that + * it handles the case when the capacity of the vector needs to be expanded + * before copy. 
+ * @param inIndex position to copy from in source vector + * @param outIndex position to copy to in this vector + * @param from source vector + */ public void copyFromSafe(int inIndex, int outIndex, ListVector from) { copyFrom(inIndex, outIndex, from); } + /** + * Copy a cell value from a particular index in source vector to a particular + * position in this vector + * @param inIndex position to copy from in source vector + * @param outIndex position to copy to in this vector + * @param from source vector + */ public void copyFrom(int inIndex, int outIndex, ListVector from) { FieldReader in = from.getReader(); in.setPosition(inIndex); @@ -159,6 +285,10 @@ public void copyFrom(int inIndex, int outIndex, ListVector from) { ComplexCopier.copy(in, out); } + /** + * Get the inner data vector for this list vector + * @return data vector + */ @Override public FieldVector getDataVector() { return vector; @@ -181,7 +311,7 @@ public TransferPair makeTransferPair(ValueVector target) { @Override public long getValidityBufferAddress() { - return (bits.getDataBuffer().memoryAddress()); + return (validityBuffer.memoryAddress()); } @Override @@ -191,11 +321,13 @@ public long getDataBufferAddress() { @Override public long getOffsetBufferAddress() { - return (offsets.getDataBuffer().memoryAddress()); + return (offsetBuffer.memoryAddress()); } @Override - public ArrowBuf getValidityBuffer() { return bits.getDataBuffer(); } + public ArrowBuf getValidityBuffer() { + return validityBuffer; + } @Override public ArrowBuf getDataBuffer() { @@ -203,17 +335,15 @@ public ArrowBuf getDataBuffer() { } @Override - public ArrowBuf getOffsetBuffer() { return offsets.getDataBuffer(); } + public ArrowBuf getOffsetBuffer() { + return offsetBuffer; + } private class TransferImpl implements TransferPair { ListVector to; - TransferPair bitsTransferPair; - TransferPair offsetsTransferPair; TransferPair dataTransferPair; - TransferPair[] pairs; - public TransferImpl(String name, BufferAllocator 
allocator, CallBack callBack) { this(new ListVector(name, allocator, fieldType, callBack)); } @@ -221,39 +351,112 @@ public TransferImpl(String name, BufferAllocator allocator, CallBack callBack) { public TransferImpl(ListVector to) { this.to = to; to.addOrGetVector(vector.getField().getFieldType()); - offsetsTransferPair = offsets.makeTransferPair(to.offsets); - bitsTransferPair = bits.makeTransferPair(to.bits); if (to.getDataVector() instanceof ZeroVector) { to.addOrGetVector(vector.getField().getFieldType()); } dataTransferPair = getDataVector().makeTransferPair(to.getDataVector()); - pairs = new TransferPair[] {bitsTransferPair, offsetsTransferPair, dataTransferPair}; } + /** + * Transfer this vector'data to another vector. The memory associated + * with this vector is transferred to the allocator of target vector + * for accounting and management purposes. + */ @Override public void transfer() { - for (TransferPair pair : pairs) { - pair.transfer(); - } + to.clear(); + dataTransferPair.transfer(); + to.validityBuffer = validityBuffer.transferOwnership(to.allocator).buffer; + to.offsetBuffer = offsetBuffer.transferOwnership(to.allocator).buffer; to.lastSet = lastSet; + if (valueCount > 0) { + to.setValueCount(valueCount); + } + clear(); } + /** + * Slice this vector at desired index and length and transfer the + * corresponding data to the target vector. + * @param startIndex start position of the split in source vector. + * @param length length of the split. 
+ */ @Override public void splitAndTransfer(int startIndex, int length) { - UInt4Vector.Accessor offsetVectorAccessor = ListVector.this.offsets.getAccessor(); - final int startPoint = offsetVectorAccessor.get(startIndex); - final int sliceLength = offsetVectorAccessor.get(startIndex + length) - startPoint; + final int startPoint = offsetBuffer.getInt(startIndex * OFFSET_WIDTH); + final int sliceLength = offsetBuffer.getInt((startIndex + length) * OFFSET_WIDTH) - startPoint; to.clear(); - to.offsets.allocateNew(length + 1); - offsetVectorAccessor = ListVector.this.offsets.getAccessor(); - final UInt4Vector.Mutator targetOffsetVectorMutator = to.offsets.getMutator(); + to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH); + /* splitAndTransfer offset buffer */ for (int i = 0; i < length + 1; i++) { - targetOffsetVectorMutator.set(i, offsetVectorAccessor.get(startIndex + i) - startPoint); + final int relativeOffset = offsetBuffer.getInt((startIndex + i) * OFFSET_WIDTH) - startPoint; + to.offsetBuffer.setInt(i * OFFSET_WIDTH, relativeOffset); } - bitsTransferPair.splitAndTransfer(startIndex, length); + /* splitAndTransfer validity buffer */ + splitAndTransferValidityBuffer(startIndex, length, to); + /* splitAndTransfer data buffer */ dataTransferPair.splitAndTransfer(startPoint, sliceLength); to.lastSet = length; - to.mutator.setValueCount(length); + to.setValueCount(length); + } + + /* + * transfer the validity. 
+ */ + private void splitAndTransferValidityBuffer(int startIndex, int length, ListVector target) { + assert startIndex + length <= valueCount; + int firstByteSource = BitVectorHelper.byteIndex(startIndex); + int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1); + int byteSizeTarget = getValidityBufferSizeFromCount(length); + int offset = startIndex % 8; + + if (length > 0) { + if (offset == 0) { + // slice + if (target.validityBuffer != null) { + target.validityBuffer.release(); + } + target.validityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget); + target.validityBuffer.retain(1); + } else { + /* Copy data + * When the first bit starts from the middle of a byte (offset != 0), + * copy data from src BitVector. + * Each byte in the target is composed by a part in i-th byte, + * another part in (i+1)-th byte. + */ + target.allocateValidityBuffer(byteSizeTarget); + + for (int i = 0; i < byteSizeTarget - 1; i++) { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(validityBuffer, firstByteSource + i, offset); + byte b2 = BitVectorHelper.getBitsFromNextByte(validityBuffer, firstByteSource + i + 1, offset); + + target.validityBuffer.setByte(i, (b1 + b2)); + } + + /* Copying the last piece is done in the following manner: + * if the source vector has 1 or more bytes remaining, we copy + * the last piece as a byte formed by shifting data + * from the current byte and the next byte. + * + * if the source vector has no more bytes remaining + * (we are at the last byte), we copy the last piece as a byte + * by shifting data from the current byte. 
+ */ + if ((firstByteSource + byteSizeTarget - 1) < lastByteSource) { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(validityBuffer, + firstByteSource + byteSizeTarget - 1, offset); + byte b2 = BitVectorHelper.getBitsFromNextByte(validityBuffer, + firstByteSource + byteSizeTarget, offset); + + target.validityBuffer.setByte(byteSizeTarget - 1, b1 + b2); + } else { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(validityBuffer, + firstByteSource + byteSizeTarget - 1, offset); + target.validityBuffer.setByte(byteSizeTarget - 1, b1); + } + } + } } @Override @@ -267,66 +470,34 @@ public void copyValueSafe(int from, int to) { } } - @Override - public Accessor getAccessor() { - return accessor; - } - - @Override - public Mutator getMutator() { - return mutator; - } - @Override public UnionListReader getReader() { return reader; } - @Override - public boolean allocateNewSafe() { - /* boolean to keep track if all the memory allocation were successful - * Used in the case of composite vectors when we need to allocate multiple - * buffers for multiple vectors. If one of the allocations failed we need to - * clear all the memory that we allocated - */ - boolean success = false; - try { - if (!offsets.allocateNewSafe()) { - return false; - } - success = vector.allocateNewSafe(); - success = success && bits.allocateNewSafe(); - } finally { - if (!success) { - clear(); - } - } - if (success) { - offsets.zeroVector(); - bits.zeroVector(); - } - return success; - } - public AddOrGetResult addOrGetVector(FieldType fieldType) { AddOrGetResult result = super.addOrGetVector(fieldType); reader = new UnionListReader(this); return result; } + /** + * Get the size (number of bytes) of underlying buffers used by this + * vector + * @return size of underlying buffers. 
+ */ @Override public int getBufferSize() { - if (getAccessor().getValueCount() == 0) { + if (valueCount == 0) { return 0; } - return offsets.getBufferSize() + bits.getBufferSize() + vector.getBufferSize(); + final int offsetBufferSize = (valueCount + 1) * OFFSET_WIDTH; + final int validityBufferSize = getValidityBufferSizeFromCount(valueCount); + return offsetBufferSize + validityBufferSize + vector.getBufferSize(); } @Override public Field getField() { - if (getDataVector() instanceof ZeroVector) { - return new Field(name, fieldType, ImmutableList.of(new Field(DATA_VECTOR_NAME, FieldType.nullable(Null.INSTANCE), null))); - } return new Field(name, fieldType, ImmutableList.of(getDataVector().getField())); } @@ -337,17 +508,40 @@ public MinorType getMinorType() { @Override public void clear() { - offsets.clear(); - vector.clear(); - bits.clear(); - lastSet = 0; super.clear(); + validityBuffer = releaseBuffer(validityBuffer); + lastSet = 0; } + @Override + public void reset() { + super.reset(); + validityBuffer.setZero(0, validityBuffer.capacity()); + lastSet = 0; + } + + /** + * Return the underlying buffers associated with this vector. Note that this doesn't + * impact the reference counts for this buffer so it only should be used for in-context + * access. Also note that this buffer changes regularly thus + * external classes shouldn't hold a reference to it (unless they change it). + * + * @param clear Whether to clear vector before returning; the buffers will still be refcounted + * but the returned array will be the only reference to them + * @return The underlying {@link io.netty.buffer.ArrowBuf buffers} that is used by this + * vector instance. 
+ */ @Override public ArrowBuf[] getBuffers(boolean clear) { - final ArrowBuf[] buffers = ObjectArrays.concat(offsets.getBuffers(false), ObjectArrays.concat(bits.getBuffers(false), - vector.getBuffers(false), ArrowBuf.class), ArrowBuf.class); + setReaderAndWriterIndex(); + final ArrowBuf[] buffers; + if (getBufferSize() == 0) { + buffers = new ArrowBuf[0]; + } else { + buffers = ObjectArrays.concat(new ArrowBuf[]{offsetBuffer}, + ObjectArrays.concat(new ArrowBuf[]{validityBuffer}, + vector.getBuffers(false), ArrowBuf.class), ArrowBuf.class); + } if (clear) { for (ArrowBuf buffer : buffers) { buffer.retain(); @@ -368,86 +562,150 @@ public UnionVector promoteToUnion() { return vector; } - private int lastSet = 0; + /** + * Get the element in the list vector at a particular index + * @param index position of the element + * @return Object at given position + */ + @Override + public Object getObject(int index) { + if (isSet(index) == 0) { + return null; + } + final List vals = new JsonStringArrayList<>(); + final int start = offsetBuffer.getInt(index * OFFSET_WIDTH); + final int end = offsetBuffer.getInt((index + 1) * OFFSET_WIDTH); + final ValueVector vv = getDataVector(); + for (int i = start; i < end; i++) { + vals.add(vv.getObject(i)); + } - public class Accessor extends BaseRepeatedAccessor { + return vals; + } - @Override - public Object getObject(int index) { - if (isNull(index)) { - return null; - } - final List vals = new JsonStringArrayList<>(); - final UInt4Vector.Accessor offsetsAccessor = offsets.getAccessor(); - final int start = offsetsAccessor.get(index); - final int end = offsetsAccessor.get(index + 1); - final ValueVector.Accessor valuesAccessor = getDataVector().getAccessor(); - for (int i = start; i < end; i++) { - vals.add(valuesAccessor.getObject(i)); - } - return vals; - } + /** + * Check if element at given index is null. 
+ * + * @param index position of element + * @return true if element at given index is null, false otherwise + */ + @Override + public boolean isNull(int index) { + return (isSet(index) == 0); + } + + /** + * Same as {@link #isNull(int)}. + * + * @param index position of element + * @return 1 if element at given index is not null, 0 otherwise + */ + public int isSet(int index) { + final int byteIndex = index >> 3; + final byte b = validityBuffer.getByte(byteIndex); + final int bitIndex = index & 7; + return Long.bitCount(b & (1L << bitIndex)); + } + + /** + * Get the number of elements that are null in the vector + * + * @return the number of null elements. + */ + @Override + public int getNullCount() { + return BitVectorHelper.getNullCount(validityBuffer, valueCount); + } - @Override - public boolean isNull(int index) { - return bits.getAccessor().get(index) == 0; - } + /** + * Get the current value capacity for the vector + * @return number of elements that vector can hold. + */ + @Override + public int getValueCapacity() { + return Math.min(getValidityBufferValueCapacity(), super.getValueCapacity()); + } - @Override - public int getNullCount() { - return bits.getAccessor().getNullCount(); - } + private int getValidityAndOffsetValueCapacity() { + final int offsetValueCapacity = Math.max(getOffsetBufferValueCapacity() - 1, 0); + return Math.min(offsetValueCapacity, getValidityBufferValueCapacity()); } - public class Mutator extends BaseRepeatedMutator { - public void setNotNull(int index) { - bits.getMutator().setSafe(index, 1); - lastSet = index + 1; - } + private int getValidityBufferValueCapacity() { + return (int) (validityBuffer.capacity() * 8L); + } - @Override - public int startNewValue(int index) { - for (int i = lastSet; i <= index; i++) { - offsets.getMutator().setSafe(i + 1, offsets.getAccessor().get(i)); - } - setNotNull(index); - lastSet = index + 1; - return offsets.getAccessor().get(lastSet); + public void setNotNull(int index) { + while (index >= 
getValidityAndOffsetValueCapacity()) { + reallocValidityAndOffsetBuffers(); } + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + lastSet = index + 1; + } - /** - * End the current value - * - * @param index index of the value to end - * @param size number of elements in the list that was written - */ - public void endValue(int index, int size) { - offsets.getMutator().set(index + 1, offsets.getAccessor().get(index + 1) + size); + /** + * Start a new value in the list vector + * + * @param index index of the value to start + */ + @Override + public int startNewValue(int index) { + while (index >= getValidityAndOffsetValueCapacity()) { + reallocValidityAndOffsetBuffers(); } - - @Override - public void setValueCount(int valueCount) { - // TODO: populate offset end points - if (valueCount == 0) { - offsets.getMutator().setValueCount(0); - } else { - for (int i = lastSet; i < valueCount; i++) { - offsets.getMutator().setSafe(i + 1, offsets.getAccessor().get(i)); - } - offsets.getMutator().setValueCount(valueCount + 1); - } - final int childValueCount = valueCount == 0 ? 
0 : offsets.getAccessor().get(valueCount); - vector.getMutator().setValueCount(childValueCount); - bits.getMutator().setValueCount(valueCount); + for (int i = lastSet; i <= index; i++) { + final int currentOffset = offsetBuffer.getInt(i * OFFSET_WIDTH); + offsetBuffer.setInt((i + 1) * OFFSET_WIDTH, currentOffset); } - - public void setLastSet(int value) { - lastSet = value; + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + lastSet = index + 1; + return offsetBuffer.getInt(lastSet * OFFSET_WIDTH); + } + + /** + * End the current value + * + * @param index index of the value to end + * @param size number of elements in the list that was written + */ + public void endValue(int index, int size) { + final int currentOffset = offsetBuffer.getInt((index + 1) * OFFSET_WIDTH); + offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, currentOffset + size); + } + + /** + * Sets the value count for the vector + * + * @param valueCount value count + */ + @Override + public void setValueCount(int valueCount) { + this.valueCount = valueCount; + if (valueCount > 0) { + while (valueCount > getValidityAndOffsetValueCapacity()) { + /* check if validity and offset buffers need to be re-allocated */ + reallocValidityAndOffsetBuffers(); + } + for (int i = lastSet; i < valueCount; i++) { + /* fill the holes with offsets */ + final int currentOffset = offsetBuffer.getInt(i * OFFSET_WIDTH); + offsetBuffer.setInt((i + 1) * OFFSET_WIDTH, currentOffset); + } } + /* valueCount for the data vector is the current end offset */ + final int childValueCount = (valueCount == 0) ? 0 : + offsetBuffer.getInt(valueCount * OFFSET_WIDTH); + /* set the value count of data vector and this will take care of + * checking whether data buffer needs to be reallocated. 
+ */ + vector.setValueCount(childValueCount); + } - public int getLastSet() { - return lastSet; - } + public void setLastSet(int value) { + lastSet = value; } + public int getLastSet() { + return lastSet; + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java index f46635ad76021..6eab6ef3dcdcc 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java @@ -35,9 +35,7 @@ import io.netty.buffer.ArrowBuf; import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.vector.BaseValueVector; -import org.apache.arrow.vector.FieldVector; -import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.*; import org.apache.arrow.vector.complex.impl.SingleMapReaderImpl; import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.holders.ComplexHolder; @@ -58,8 +56,6 @@ public static MapVector empty(String name, BufferAllocator allocator) { } private final SingleMapReaderImpl reader = new SingleMapReaderImpl(this); - private final Accessor accessor = new Accessor(); - private final Mutator mutator = new Mutator(); protected final FieldType fieldType; public int valueCount; @@ -72,6 +68,7 @@ public MapVector(String name, BufferAllocator allocator, CallBack callBack) { public MapVector(String name, BufferAllocator allocator, FieldType fieldType, CallBack callBack) { super(name, allocator, callBack); this.fieldType = checkNotNull(fieldType); + this.valueCount = 0; } @Override @@ -232,7 +229,7 @@ public void splitAndTransfer(int startIndex, int length) { for (TransferPair p : pairs) { p.splitAndTransfer(startIndex, length); } - to.getMutator().setValueCount(length); + to.setValueCount(length); } } @@ -256,64 +253,45 @@ public int compare(@Nullable ValueVector left, @Nullable ValueVector right) { } @Override - 
public Accessor getAccessor() { - return accessor; - } - - @Override - public Mutator getMutator() { - return mutator; - } - - public class Accessor extends BaseValueVector.BaseAccessor { - - @Override - public Object getObject(int index) { - Map vv = new JsonStringHashMap<>(); - for (String child : getChildFieldNames()) { - ValueVector v = getChild(child); - if (v != null && index < v.getAccessor().getValueCount()) { - Object value = v.getAccessor().getObject(index); - if (value != null) { - vv.put(child, value); - } + public Object getObject(int index) { + Map vv = new JsonStringHashMap<>(); + for (String child : getChildFieldNames()) { + ValueVector v = getChild(child); + if (v != null && index < v.getValueCount()) { + Object value = v.getObject(index); + if (value != null) { + vv.put(child, value); } } - return vv; } + return vv; + } - public void get(int index, ComplexHolder holder) { - reader.setPosition(index); - holder.reader = reader; - } + @Override + public boolean isNull(int index) { return false; } + @Override + public int getNullCount() { return 0; } - @Override - public int getValueCount() { - return valueCount; - } + public void get(int index, ComplexHolder holder) { + reader.setPosition(index); + holder.reader = reader; } - public ValueVector getVectorById(int id) { - return getChildByOrdinal(id); + @Override + public int getValueCount() { + return valueCount; } - public class Mutator extends BaseValueVector.BaseMutator { - - @Override - public void setValueCount(int valueCount) { - for (final ValueVector v : getChildren()) { - v.getMutator().setValueCount(valueCount); - } - MapVector.this.valueCount = valueCount; - } - - @Override - public void reset() { - } + public ValueVector getVectorById(int id) { + return getChildByOrdinal(id); +} - @Override - public void generateTestData(int values) { + @Override + public void setValueCount(int valueCount) { + for (final ValueVector v : getChildren()) { + v.setValueCount(valueCount); } + 
MapVector.this.valueCount = valueCount; } @Override @@ -324,6 +302,14 @@ public void clear() { valueCount = 0; } + @Override + public void reset() { + for (final ValueVector v : getChildren()) { + v.reset(); + } + valueCount = 0; + } + @Override public Field getField() { List children = new ArrayList<>(); @@ -361,5 +347,4 @@ public void initializeChildrenFromFields(List children) { public List getChildrenFromFields() { return getChildren(); } - } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java index 1c9d5aa265963..fb84d23b5074e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + *

* http://www.apache.org/licenses/LICENSE-2.0 - * + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,6 +20,7 @@ import static com.google.common.base.Preconditions.checkNotNull; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -27,23 +28,20 @@ import com.google.common.collect.ObjectArrays; import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BaseAllocator; import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.vector.BaseDataValueVector; -import org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.BufferBacked; -import org.apache.arrow.vector.FieldVector; -import org.apache.arrow.vector.NullableVectorDefinitionSetter; -import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.*; import org.apache.arrow.vector.complex.impl.NullableMapReaderImpl; import org.apache.arrow.vector.complex.impl.NullableMapWriter; import org.apache.arrow.vector.holders.ComplexHolder; -import org.apache.arrow.vector.schema.ArrowFieldNode; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.ArrowType.Struct; import org.apache.arrow.vector.types.pojo.DictionaryEncoding; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.OversizedAllocationException; import org.apache.arrow.vector.util.TransferPair; public class NullableMapVector extends MapVector implements FieldVector { @@ -56,12 +54,8 @@ public static NullableMapVector empty(String name, BufferAllocator allocator) { private final NullableMapReaderImpl reader = new NullableMapReaderImpl(this); private final NullableMapWriter writer = new NullableMapWriter(this); - 
protected final BitVector bits; - - private final List innerVectors; - - private final Accessor accessor; - private final Mutator mutator; + protected ArrowBuf validityBuffer; + private int validityAllocationSizeInBytes; // deprecated, use FieldType or static constructor instead @Deprecated @@ -77,10 +71,8 @@ public NullableMapVector(String name, BufferAllocator allocator, DictionaryEncod public NullableMapVector(String name, BufferAllocator allocator, FieldType fieldType, CallBack callBack) { super(name, checkNotNull(allocator), fieldType, callBack); - this.bits = new BitVector("$bits$", allocator); - this.innerVectors = Collections.unmodifiableList(Arrays.asList(bits)); - this.accessor = new Accessor(); - this.mutator = new Mutator(); + this.validityBuffer = allocator.getEmpty(); + this.validityAllocationSizeInBytes = BitVectorHelper.getValidityBufferSize(BaseValueVector.INITIAL_VALUE_ALLOCATION); } @Override @@ -92,18 +84,36 @@ public Field getField() { @Override public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { - BaseDataValueVector.load(fieldNode, getFieldInnerVectors(), ownBuffers); - this.valueCount = fieldNode.getLength(); + if (ownBuffers.size() != 1) { + throw new IllegalArgumentException("Illegal buffer count, expected " + 1 + ", got: " + ownBuffers.size()); + } + + ArrowBuf bitBuffer = ownBuffers.get(0); + + validityBuffer.release(); + validityBuffer = BitVectorHelper.loadValidityBuffer(fieldNode, bitBuffer, allocator); + valueCount = fieldNode.getLength(); + validityAllocationSizeInBytes = validityBuffer.capacity(); } @Override public List getFieldBuffers() { - return BaseDataValueVector.unload(getFieldInnerVectors()); + List result = new ArrayList<>(1); + setReaderAndWriterIndex(); + result.add(validityBuffer); + + return result; + } + + private void setReaderAndWriterIndex() { + validityBuffer.readerIndex(0); + validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSize(valueCount)); } @Override + @Deprecated public List 
getFieldInnerVectors() { - return innerVectors; + throw new UnsupportedOperationException("There are no inner vectors. Use getFieldBuffers"); } @Override @@ -146,63 +156,204 @@ protected NullableMapTransferPair(NullableMapVector from, NullableMapVector to, @Override public void transfer() { - bits.transferTo(target.bits); + target.clear(); + target.validityBuffer = validityBuffer.transferOwnership(target.allocator).buffer; super.transfer(); + clear(); } @Override public void copyValueSafe(int fromIndex, int toIndex) { - target.bits.copyFromSafe(fromIndex, toIndex, bits); + while (toIndex >= target.getValidityBufferValueCapacity()) { + target.reallocValidityBuffer(); + } + BitVectorHelper.setValidityBit(target.validityBuffer, toIndex, isSet(fromIndex)); super.copyValueSafe(fromIndex, toIndex); } @Override public void splitAndTransfer(int startIndex, int length) { - bits.splitAndTransferTo(startIndex, length, target.bits); + target.clear(); + splitAndTransferValidityBuffer(startIndex, length, target); super.splitAndTransfer(startIndex, length); } } - @Override - public int getValueCapacity() { - return Math.min(bits.getValueCapacity(), super.getValueCapacity()); + /* + * transfer the validity. + */ + private void splitAndTransferValidityBuffer(int startIndex, int length, NullableMapVector target) { + assert startIndex + length <= valueCount; + int firstByteSource = BitVectorHelper.byteIndex(startIndex); + int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1); + int byteSizeTarget = BitVectorHelper.getValidityBufferSize(length); + int offset = startIndex % 8; + + if (length > 0) { + if (offset == 0) { + // slice + if (target.validityBuffer != null) { + target.validityBuffer.release(); + } + target.validityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget); + target.validityBuffer.retain(1); + } else { + /* Copy data + * When the first bit starts from the middle of a byte (offset != 0), + * copy data from src BitVector. 
+ * Each byte in the target is composed by a part in i-th byte, + * another part in (i+1)-th byte. + */ + target.allocateValidityBuffer(byteSizeTarget); + + for (int i = 0; i < byteSizeTarget - 1; i++) { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(validityBuffer, firstByteSource + i, offset); + byte b2 = BitVectorHelper.getBitsFromNextByte(validityBuffer, firstByteSource + i + 1, offset); + + target.validityBuffer.setByte(i, (b1 + b2)); + } + + /* Copying the last piece is done in the following manner: + * if the source vector has 1 or more bytes remaining, we copy + * the last piece as a byte formed by shifting data + * from the current byte and the next byte. + * + * if the source vector has no more bytes remaining + * (we are at the last byte), we copy the last piece as a byte + * by shifting data from the current byte. + */ + if ((firstByteSource + byteSizeTarget - 1) < lastByteSource) { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(validityBuffer, + firstByteSource + byteSizeTarget - 1, offset); + byte b2 = BitVectorHelper.getBitsFromNextByte(validityBuffer, + firstByteSource + byteSizeTarget, offset); + + target.validityBuffer.setByte(byteSizeTarget - 1, b1 + b2); + } else { + byte b1 = BitVectorHelper.getBitsFromCurrentByte(validityBuffer, + firstByteSource + byteSizeTarget - 1, offset); + target.validityBuffer.setByte(byteSizeTarget - 1, b1); + } + } + } } + /** + * Get the value capacity of the internal validity buffer. + * @return number of elements that validity buffer can hold + */ + private int getValidityBufferValueCapacity() { + return (int) (validityBuffer.capacity() * 8L); + } + + /** + * Get the current value capacity for the vector + * @return number of elements that vector can hold. + */ + @Override + public int getValueCapacity() { + return Math.min(getValidityBufferValueCapacity(), + super.getValueCapacity()); + } + + /** + * Return the underlying buffers associated with this vector. 
Note that this doesn't + * impact the reference counts for this buffer so it only should be used for in-context + * access. Also note that this buffer changes regularly thus + * external classes shouldn't hold a reference to it (unless they change it). + * + * @param clear Whether to clear vector before returning; the buffers will still be refcounted + * but the returned array will be the only reference to them + * @return The underlying {@link io.netty.buffer.ArrowBuf buffers} that is used by this + * vector instance. + */ @Override public ArrowBuf[] getBuffers(boolean clear) { - return ObjectArrays.concat(bits.getBuffers(clear), super.getBuffers(clear), ArrowBuf.class); + setReaderAndWriterIndex(); + final ArrowBuf[] buffers; + if (getBufferSize() == 0) { + buffers = new ArrowBuf[0]; + } else { + buffers = ObjectArrays.concat(new ArrowBuf[]{validityBuffer}, super.getBuffers(false), + ArrowBuf.class); + } + if (clear) { + for (ArrowBuf buffer : buffers) { + buffer.retain(); + } + clear(); + } + + return buffers; } + /** + * Close the vector and release the associated buffers. + */ @Override public void close() { - bits.close(); + clearValidityBuffer(); super.close(); } + /** + * Same as {@link #close()} + */ @Override public void clear() { - bits.clear(); + clearValidityBuffer(); super.clear(); } + /** + * Reset this vector to empty, does not release buffers + */ + @Override + public void reset() { + super.reset(); + validityBuffer.setZero(0, validityBuffer.capacity()); + } + + /** + * Release the validity buffer + */ + private void clearValidityBuffer() { + validityBuffer.release(); + validityBuffer = allocator.getEmpty(); + } + /** + * Get the size (number of bytes) of underlying buffers used by this + * vector + * @return size of underlying buffers. 
+ */ @Override public int getBufferSize() { - return super.getBufferSize() + bits.getBufferSize(); + if (valueCount == 0) { + return 0; + } + return super.getBufferSize() + + BitVectorHelper.getValidityBufferSize(valueCount); } + /** + * Get the potential buffer size for a particular number of records. + * @param valueCount desired number of elements in the vector + * @return estimated size of underlying buffers if the vector holds + * a given number of elements + */ @Override public int getBufferSizeFor(final int valueCount) { if (valueCount == 0) { return 0; } return super.getBufferSizeFor(valueCount) - + bits.getBufferSizeFor(valueCount); + + BitVectorHelper.getValidityBufferSize(valueCount); } @Override public void setInitialCapacity(int numRecords) { - bits.setInitialCapacity(numRecords); + validityAllocationSizeInBytes = BitVectorHelper.getValidityBufferSize(numRecords); super.setInitialCapacity(numRecords); } @@ -215,25 +366,59 @@ public boolean allocateNewSafe() { */ boolean success = false; try { - success = super.allocateNewSafe() && bits.allocateNewSafe(); + clear(); + allocateValidityBuffer(validityAllocationSizeInBytes); + success = super.allocateNewSafe(); } finally { if (!success) { clear(); + return false; } } - bits.zeroVector(); - return success; + return true; + } + + private void allocateValidityBuffer(final long size) { + final int curSize = (int) size; + validityBuffer = allocator.buffer(curSize); + validityBuffer.readerIndex(0); + validityAllocationSizeInBytes = curSize; + validityBuffer.setZero(0, validityBuffer.capacity()); } @Override public void reAlloc() { - bits.reAlloc(); + /* reallocate the validity buffer */ + reallocValidityBuffer(); super.reAlloc(); } + private void reallocValidityBuffer() { + final int currentBufferCapacity = validityBuffer.capacity(); + long baseSize = validityAllocationSizeInBytes; + + if (baseSize < (long) currentBufferCapacity) { + baseSize = (long) currentBufferCapacity; + } + + long newAllocationSize = 
baseSize * 2L; + newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); + + if (newAllocationSize > BaseValueVector.MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + + final ArrowBuf newBuf = allocator.buffer((int) newAllocationSize); + newBuf.setZero(0, newBuf.capacity()); + newBuf.setBytes(0, validityBuffer, 0, currentBufferCapacity); + validityBuffer.release(1); + validityBuffer = newBuf; + validityAllocationSizeInBytes = (int) newAllocationSize; + } + @Override public long getValidityBufferAddress() { - return bits.getBuffer().memoryAddress(); + return validityBuffer.memoryAddress(); } @Override @@ -248,7 +433,7 @@ public long getOffsetBufferAddress() { @Override public ArrowBuf getValidityBuffer() { - return bits.getDataBuffer(); + return validityBuffer; } @Override @@ -261,82 +446,60 @@ public ArrowBuf getOffsetBuffer() { throw new UnsupportedOperationException(); } - public final class Accessor extends MapVector.Accessor { - final BitVector.Accessor bAccessor = bits.getAccessor(); - - @Override - public Object getObject(int index) { - if (isNull(index)) { - return null; - } else { - return super.getObject(index); - } - } - - @Override - public void get(int index, ComplexHolder holder) { - holder.isSet = isSet(index); - super.get(index, holder); - } - - @Override - public int getNullCount() { - return bits.getAccessor().getNullCount(); - } - - @Override - public boolean isNull(int index) { - return isSet(index) == 0; - } - - public int isSet(int index) { - return bAccessor.get(index); + @Override + public Object getObject(int index) { + if (isSet(index) == 0) { + return null; + } else { + return super.getObject(index); } - } - public final class Mutator extends MapVector.Mutator implements NullableVectorDefinitionSetter { - - private Mutator() { - } - - @Override - public void setIndexDefined(int index) { - bits.getMutator().setSafe(index, 1); - } + @Override + public void get(int index, 
ComplexHolder holder) { + holder.isSet = isSet(index); + super.get(index, holder); + } - public void setNull(int index) { - bits.getMutator().setSafe(index, 0); - } + public int getNullCount() { + return BitVectorHelper.getNullCount(validityBuffer, valueCount); + } - @Override - public void setValueCount(int valueCount) { - assert valueCount >= 0; - super.setValueCount(valueCount); - bits.getMutator().setValueCount(valueCount); - } + public boolean isNull(int index) { + return isSet(index) == 0; + } - @Override - public void generateTestData(int valueCount) { - super.generateTestData(valueCount); - bits.getMutator().generateTestDataAlt(valueCount); - } + public int isSet(int index) { + final int byteIndex = index >> 3; + final byte b = validityBuffer.getByte(byteIndex); + final int bitIndex = index & 7; + return Long.bitCount(b & (1L << bitIndex)); + } - @Override - public void reset() { - bits.getMutator().setValueCount(0); + public void setIndexDefined(int index) { + while (index >= getValidityBufferValueCapacity()) { + /* realloc the inner buffers if needed */ + reallocValidityBuffer(); } - + BitVectorHelper.setValidityBitToOne(validityBuffer, index); } - @Override - public Accessor getAccessor() { - return accessor; + public void setNull(int index) { + while (index >= getValidityBufferValueCapacity()) { + /* realloc the inner buffers if needed */ + reallocValidityBuffer(); + } + BitVectorHelper.setValidityBit(validityBuffer, index, 0); } @Override - public Mutator getMutator() { - return mutator; + public void setValueCount(int valueCount) { + assert valueCount >= 0; + while (valueCount > getValidityBufferValueCapacity()) { + /* realloc the inner buffers if needed */ + reallocValidityBuffer(); + } + super.setValueCount(valueCount); + this.valueCount = valueCount; } - } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java index 
91147c663f248..36401172994c5 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java @@ -41,43 +41,4 @@ public interface RepeatedValueVector extends ValueVector { * @return the underlying data vector or null if none exists. */ ValueVector getDataVector(); - - @Override - RepeatedAccessor getAccessor(); - - @Override - RepeatedMutator getMutator(); - - interface RepeatedAccessor extends ValueVector.Accessor { - /** - * The result includes empty, null valued cells. - * - * @return total number of cells that vector contains. - */ - int getInnerValueCount(); - - - /** - * @param index the index of the value for which we want the size - * @return number of cells that the value at the given index contains. - */ - int getInnerValueCountAt(int index); - - /** - * @param index value index - * @return true if the value at the given index is empty, false otherwise. - */ - boolean isEmpty(int index); - } - - interface RepeatedMutator extends ValueVector.Mutator { - - /** - * Starts a new value that is a container of cells. 
- * - * @param index index of new value to start - * @return index into the child vector - */ - int startNewValue(int index); - } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapReaderImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapReaderImpl.java index 614c266acf147..06b0f4d43272f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapReaderImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapReaderImpl.java @@ -52,6 +52,6 @@ public void copyAsField(String name, MapWriter writer) { @Override public boolean isSet() { - return !nullableMapVector.getAccessor().isNull(idx()); + return !nullableMapVector.isNull(idx()); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java index 9722196ed7cd2..5bd439cac4269 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java @@ -162,7 +162,7 @@ private FieldWriter promoteToUnion() { writer = new UnionWriter(unionVector, nullableMapWriterFactory); writer.setPosition(idx()); for (int i = 0; i <= idx(); i++) { - unionVector.getMutator().setType(i, vector.getMinorType()); + unionVector.setType(i, vector.getMinorType()); } vector = null; state = State.UNION; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java index 3ebd0cd7dd959..c77ca4e8f23ba 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java @@ -78,7 +78,7 @@ public void setPosition(int index) { 
@Override public Object readObject() { - return vector.getAccessor().getObject(idx()); + return vector.getObject(idx()); } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionFixedSizeListReader.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionFixedSizeListReader.java index f3e9b8773f25e..56ae379dca210 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionFixedSizeListReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionFixedSizeListReader.java @@ -46,7 +46,7 @@ public UnionFixedSizeListReader(FixedSizeListVector vector) { @Override public boolean isSet() { - return !vector.getAccessor().isNull(idx()); + return !vector.isNull(idx()); } @Override @@ -56,7 +56,7 @@ public FieldReader reader() { @Override public Object readObject() { - return vector.getAccessor().getObject(idx()); + return vector.getObject(idx()); } @Override @@ -80,7 +80,7 @@ public void read(int index, UnionHolder holder) { } } holder.reader = data.getReader(); - holder.isSet = vector.getAccessor().isNull(idx()) ? 0 : 1; + holder.isSet = vector.isNull(idx()) ? 
0 : 1; } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java index b98c36d2bf721..62bc271257fb2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java @@ -19,7 +19,7 @@ package org.apache.arrow.vector.complex.impl; -import org.apache.arrow.vector.UInt4Vector; +import io.netty.buffer.ArrowBuf; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.reader.FieldReader; @@ -33,12 +33,11 @@ public class UnionListReader extends AbstractFieldReader { private ListVector vector; private ValueVector data; - private UInt4Vector offsets; + private static final int OFFSET_WIDTH = 4; public UnionListReader(ListVector vector) { this.vector = vector; this.data = vector.getDataVector(); - this.offsets = vector.getOffsetVector(); } @Override @@ -48,7 +47,7 @@ public Field getField() { @Override public boolean isSet() { - return !vector.getAccessor().isNull(idx()); + return !vector.isNull(idx()); } private int currentOffset; @@ -57,8 +56,8 @@ public boolean isSet() { @Override public void setPosition(int index) { super.setPosition(index); - currentOffset = offsets.getAccessor().get(index) - 1; - maxOffset = offsets.getAccessor().get(index + 1); + currentOffset = vector.getOffsetBuffer().getInt(index * OFFSET_WIDTH) - 1; + maxOffset = vector.getOffsetBuffer().getInt((index + 1) * OFFSET_WIDTH); } @Override @@ -68,7 +67,7 @@ public FieldReader reader() { @Override public Object readObject() { - return vector.getAccessor().getObject(idx()); + return vector.getObject(idx()); } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java 
b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java index 3b7dc4a56e1ac..762a442c983fe 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java @@ -47,11 +47,10 @@ public class DictionaryEncoder { public static ValueVector encode(ValueVector vector, Dictionary dictionary) { validateType(vector.getMinorType()); // load dictionary values into a hashmap for lookup - ValueVector.Accessor dictionaryAccessor = dictionary.getVector().getAccessor(); - Map lookUps = new HashMap<>(dictionaryAccessor.getValueCount()); - for (int i = 0; i < dictionaryAccessor.getValueCount(); i++) { + Map lookUps = new HashMap<>(dictionary.getVector().getValueCount()); + for (int i = 0; i < dictionary.getVector().getValueCount(); i++) { // for primitive array types we need a wrapper that implements equals and hashcode appropriately - lookUps.put(dictionaryAccessor.getObject(i), i); + lookUps.put(dictionary.getVector().getObject(i), i); } Field valueField = vector.getField(); @@ -61,14 +60,13 @@ public static ValueVector encode(ValueVector vector, Dictionary dictionary) { // vector to hold our indices (dictionary encoded values) FieldVector indices = indexField.createVector(vector.getAllocator()); - ValueVector.Mutator mutator = indices.getMutator(); // use reflection to pull out the set method // TODO implement a common interface for int vectors Method setter = null; for (Class c : ImmutableList.of(int.class, long.class)) { try { - setter = mutator.getClass().getMethod("setSafe", int.class, c); + setter = indices.getClass().getMethod("setSafe", int.class, c); break; } catch (NoSuchMethodException e) { // ignore @@ -78,21 +76,20 @@ public static ValueVector encode(ValueVector vector, Dictionary dictionary) { throw new IllegalArgumentException("Dictionary encoding does not have a valid int type:" + indices.getClass()); } - 
ValueVector.Accessor accessor = vector.getAccessor(); - int count = accessor.getValueCount(); + int count = vector.getValueCount(); indices.allocateNew(); try { for (int i = 0; i < count; i++) { - Object value = accessor.getObject(i); + Object value = vector.getObject(i); if (value != null) { // if it's null leave it null // note: this may fail if value was not included in the dictionary Object encoded = lookUps.get(value); if (encoded == null) { throw new IllegalArgumentException("Dictionary encoding not defined for value:" + value); } - setter.invoke(mutator, i, encoded); + setter.invoke(indices, i, encoded); } } } catch (IllegalAccessException e) { @@ -101,7 +98,7 @@ public static ValueVector encode(ValueVector vector, Dictionary dictionary) { throw new RuntimeException("InvocationTargetException invoking vector mutator set():", e.getCause()); } - mutator.setValueCount(count); + indices.setValueCount(count); return indices; } @@ -114,15 +111,14 @@ public static ValueVector encode(ValueVector vector, Dictionary dictionary) { * @return vector with values restored from dictionary */ public static ValueVector decode(ValueVector indices, Dictionary dictionary) { - ValueVector.Accessor accessor = indices.getAccessor(); - int count = accessor.getValueCount(); + int count = indices.getValueCount(); ValueVector dictionaryVector = dictionary.getVector(); - int dictionaryCount = dictionaryVector.getAccessor().getValueCount(); + int dictionaryCount = dictionaryVector.getValueCount(); // copy the dictionary values into the decoded vector TransferPair transfer = dictionaryVector.getTransferPair(indices.getAllocator()); transfer.getTo().allocateNewSafe(); for (int i = 0; i < count; i++) { - Object index = accessor.getObject(i); + Object index = indices.getObject(i); if (index != null) { int indexAsInt = ((Number) index).intValue(); if (indexAsInt > dictionaryCount) { @@ -133,7 +129,7 @@ public static ValueVector decode(ValueVector indices, Dictionary dictionary) { } // TODO do 
we need to worry about the field? ValueVector decoded = transfer.getTo(); - decoded.getMutator().setValueCount(count); + decoded.setValueCount(count); return decoded; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java deleted file mode 100644 index c6ebd61aa07b9..0000000000000 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java +++ /dev/null @@ -1,421 +0,0 @@ -/******************************************************************************* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- ******************************************************************************/ - -package org.apache.arrow.vector.file.json; - -import static com.fasterxml.jackson.core.JsonToken.END_ARRAY; -import static com.fasterxml.jackson.core.JsonToken.END_OBJECT; -import static com.fasterxml.jackson.core.JsonToken.START_ARRAY; -import static com.fasterxml.jackson.core.JsonToken.START_OBJECT; -import static java.nio.charset.StandardCharsets.UTF_8; -import static org.apache.arrow.vector.schema.ArrowVectorType.OFFSET; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import com.google.common.collect.ImmutableList; -import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.vector.BigIntVector; -import org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.BufferBacked; -import org.apache.arrow.vector.DateDayVector; -import org.apache.arrow.vector.DateMilliVector; -import org.apache.arrow.vector.DecimalVector; -import org.apache.arrow.vector.FieldVector; -import org.apache.arrow.vector.Float4Vector; -import org.apache.arrow.vector.Float8Vector; -import org.apache.arrow.vector.IntVector; -import org.apache.arrow.vector.NullableVarBinaryVector; -import org.apache.arrow.vector.NullableVarCharVector; -import org.apache.arrow.vector.SmallIntVector; -import org.apache.arrow.vector.TimeMicroVector; -import org.apache.arrow.vector.TimeMilliVector; -import org.apache.arrow.vector.TimeNanoVector; -import org.apache.arrow.vector.TimeSecVector; -import org.apache.arrow.vector.TimeStampMicroTZVector; -import org.apache.arrow.vector.TimeStampMicroVector; -import org.apache.arrow.vector.TimeStampMilliTZVector; -import org.apache.arrow.vector.TimeStampMilliVector; -import org.apache.arrow.vector.TimeStampNanoTZVector; -import org.apache.arrow.vector.TimeStampNanoVector; -import org.apache.arrow.vector.TimeStampSecTZVector; -import 
org.apache.arrow.vector.TimeStampSecVector; -import org.apache.arrow.vector.TinyIntVector; -import org.apache.arrow.vector.UInt1Vector; -import org.apache.arrow.vector.UInt2Vector; -import org.apache.arrow.vector.UInt4Vector; -import org.apache.arrow.vector.UInt8Vector; -import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.VarBinaryVector; -import org.apache.arrow.vector.VarCharVector; -import org.apache.arrow.vector.VectorSchemaRoot; -import org.apache.arrow.vector.complex.ListVector; -import org.apache.arrow.vector.dictionary.Dictionary; -import org.apache.arrow.vector.dictionary.DictionaryProvider; -import org.apache.arrow.vector.schema.ArrowVectorType; -import org.apache.arrow.vector.types.pojo.Field; -import org.apache.arrow.vector.types.pojo.Schema; -import org.apache.arrow.vector.util.DecimalUtility; -import org.apache.arrow.vector.util.DictionaryUtility; -import org.apache.commons.codec.DecoderException; -import org.apache.commons.codec.binary.Hex; - -import com.fasterxml.jackson.core.JsonParseException; -import com.fasterxml.jackson.core.JsonParser; -import com.fasterxml.jackson.core.JsonToken; -import com.fasterxml.jackson.databind.MappingJsonFactory; -import com.google.common.base.Objects; - -public class JsonFileReader implements AutoCloseable, DictionaryProvider { - private final JsonParser parser; - private final BufferAllocator allocator; - private Schema schema; - private Map dictionaries; - private Boolean started = false; - - public JsonFileReader(File inputFile, BufferAllocator allocator) throws JsonParseException, IOException { - super(); - this.allocator = allocator; - MappingJsonFactory jsonFactory = new MappingJsonFactory(); - this.parser = jsonFactory.createParser(inputFile); - } - - @Override - public Dictionary lookup(long id) { - if (!started) { - throw new IllegalStateException("Unable to lookup until after read() has started"); - } - - return dictionaries.get(id); - } - - public Schema start() throws 
JsonParseException, IOException { - readToken(START_OBJECT); - { - Schema originalSchema = readNextField("schema", Schema.class); - List fields = new ArrayList<>(); - dictionaries = new HashMap<>(); - - // Convert fields with dictionaries to have the index type - for (Field field : originalSchema.getFields()) { - fields.add(DictionaryUtility.toMemoryFormat(field, allocator, dictionaries)); - } - this.schema = new Schema(fields, originalSchema.getCustomMetadata()); - - if (!dictionaries.isEmpty()) { - nextFieldIs("dictionaries"); - readDictionaryBatches(); - } - - nextFieldIs("batches"); - readToken(START_ARRAY); - started = true; - return this.schema; - } - } - - private void readDictionaryBatches() throws JsonParseException, IOException { - readToken(START_ARRAY); - JsonToken token = parser.nextToken(); - boolean haveDictionaryBatch = token == START_OBJECT; - while (haveDictionaryBatch) { - - // Lookup what dictionary for the batch about to be read - long id = readNextField("id", Long.class); - Dictionary dict = dictionaries.get(id); - if (dict == null) { - throw new IllegalArgumentException("Dictionary with id: " + id + " missing encoding from schema Field"); - } - - // Read the dictionary record batch - nextFieldIs("data"); - FieldVector vector = dict.getVector(); - List fields = ImmutableList.of(vector.getField()); - List vectors = ImmutableList.of(vector); - VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors, vector.getAccessor().getValueCount()); - read(root); - - readToken(END_OBJECT); - token = parser.nextToken(); - haveDictionaryBatch = token == START_OBJECT; - } - - if (token != END_ARRAY) { - throw new IllegalArgumentException("Invalid token: " + token + " expected end of array at " + parser.getTokenLocation()); - } - } - - public boolean read(VectorSchemaRoot root) throws IOException { - JsonToken t = parser.nextToken(); - if (t == START_OBJECT) { - { - int count = readNextField("count", Integer.class); - root.setRowCount(count); - 
nextFieldIs("columns"); - readToken(START_ARRAY); - { - for (Field field : root.getSchema().getFields()) { - FieldVector vector = root.getVector(field.getName()); - readVector(field, vector); - } - } - readToken(END_ARRAY); - } - readToken(END_OBJECT); - return true; - } else if (t == END_ARRAY) { - root.setRowCount(0); - return false; - } else { - throw new IllegalArgumentException("Invalid token: " + t); - } - } - - public VectorSchemaRoot read() throws IOException { - JsonToken t = parser.nextToken(); - if (t == START_OBJECT) { - VectorSchemaRoot recordBatch = VectorSchemaRoot.create(schema, allocator); - { - int count = readNextField("count", Integer.class); - recordBatch.setRowCount(count); - nextFieldIs("columns"); - readToken(START_ARRAY); - { - for (Field field : schema.getFields()) { - FieldVector vector = recordBatch.getVector(field.getName()); - readVector(field, vector); - } - } - readToken(END_ARRAY); - } - readToken(END_OBJECT); - return recordBatch; - } else if (t == END_ARRAY) { - return null; - } else { - throw new IllegalArgumentException("Invalid token: " + t); - } - } - - /** - * TODO: A better way of implementing this function is to use `loadFieldBuffers` methods in - * FieldVector to set the inner-vector data as done in `ArrowFileReader`. 
- */ - private void readVector(Field field, FieldVector vector) throws JsonParseException, IOException { - List vectorTypes = field.getTypeLayout().getVectorTypes(); - List fieldInnerVectors = vector.getFieldInnerVectors(); - if (vectorTypes.size() != fieldInnerVectors.size()) { - throw new IllegalArgumentException("vector types and inner vectors are not the same size: " + vectorTypes.size() + " != " + fieldInnerVectors.size()); - } - readToken(START_OBJECT); - { - // If currently reading dictionaries, field name is not important so don't check - String name = readNextField("name", String.class); - if (started && !Objects.equal(field.getName(), name)) { - throw new IllegalArgumentException("Expected field " + field.getName() + " but got " + name); - } - - // Initialize the vector with required capacity - int count = readNextField("count", Integer.class); - vector.setInitialCapacity(count); - vector.allocateNew(); - - // Read inner vectors - for (int v = 0; v < vectorTypes.size(); v++) { - ArrowVectorType vectorType = vectorTypes.get(v); - ValueVector valueVector = (ValueVector) fieldInnerVectors.get(v); - nextFieldIs(vectorType.getName()); - readToken(START_ARRAY); - int innerVectorCount = vectorType.equals(OFFSET) ? 
count + 1 : count; - for (int i = 0; i < innerVectorCount; i++) { - parser.nextToken(); - setValueFromParser(valueVector, i); - } - readToken(END_ARRAY); - } - - // Set lastSet before valueCount to prevent setValueCount from filling empty values - switch (vector.getMinorType()) { - case LIST: - // ListVector starts lastSet from index 0, so lastSet value is always last index written + 1 - ((ListVector) vector).getMutator().setLastSet(count); - break; - case VARBINARY: - ((NullableVarBinaryVector) vector).getMutator().setLastSet(count - 1); - break; - case VARCHAR: - ((NullableVarCharVector) vector).getMutator().setLastSet(count - 1); - break; - } - vector.getMutator().setValueCount(count); - - // read child vectors, if any - List fields = field.getChildren(); - if (!fields.isEmpty()) { - List vectorChildren = vector.getChildrenFromFields(); - if (fields.size() != vectorChildren.size()) { - throw new IllegalArgumentException("fields and children are not the same size: " + fields.size() + " != " + vectorChildren.size()); - } - nextFieldIs("children"); - readToken(START_ARRAY); - for (int i = 0; i < fields.size(); i++) { - Field childField = fields.get(i); - FieldVector childVector = vectorChildren.get(i); - readVector(childField, childVector); - } - readToken(END_ARRAY); - } - } - readToken(END_OBJECT); - } - - private byte[] decodeHexSafe(String hexString) throws IOException { - try { - return Hex.decodeHex(hexString.toCharArray()); - } catch (DecoderException e) { - throw new IOException("Unable to decode hex string: " + hexString, e); - } - } - - private void setValueFromParser(ValueVector valueVector, int i) throws IOException { - switch (valueVector.getMinorType()) { - case BIT: - ((BitVector) valueVector).getMutator().set(i, parser.readValueAs(Boolean.class) ? 
1 : 0); - break; - case TINYINT: - ((TinyIntVector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case SMALLINT: - ((SmallIntVector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case INT: - ((IntVector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case BIGINT: - ((BigIntVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case UINT1: - ((UInt1Vector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case UINT2: - ((UInt2Vector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case UINT4: - ((UInt4Vector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case UINT8: - ((UInt8Vector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case FLOAT4: - ((Float4Vector) valueVector).getMutator().set(i, parser.readValueAs(Float.class)); - break; - case FLOAT8: - ((Float8Vector) valueVector).getMutator().set(i, parser.readValueAs(Double.class)); - break; - case DECIMAL: { - DecimalVector decimalVector = ((DecimalVector) valueVector); - byte[] value = decodeHexSafe(parser.readValueAs(String.class)); - DecimalUtility.writeByteArrayToArrowBuf(value, decimalVector.getBuffer(), i); - } - break; - case VARBINARY: - ((VarBinaryVector) valueVector).getMutator().setSafe(i, decodeHexSafe(parser.readValueAs(String.class))); - break; - case VARCHAR: - ((VarCharVector) valueVector).getMutator().setSafe(i, parser.readValueAs(String.class).getBytes(UTF_8)); - break; - case DATEDAY: - ((DateDayVector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case DATEMILLI: - ((DateMilliVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESEC: - ((TimeSecVector) valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case TIMEMILLI: - ((TimeMilliVector) 
valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); - break; - case TIMEMICRO: - ((TimeMicroVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMENANO: - ((TimeNanoVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPSEC: - ((TimeStampSecVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPMILLI: - ((TimeStampMilliVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPMICRO: - ((TimeStampMicroVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPNANO: - ((TimeStampNanoVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPSECTZ: - ((TimeStampSecTZVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPMILLITZ: - ((TimeStampMilliTZVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPMICROTZ: - ((TimeStampMicroTZVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - case TIMESTAMPNANOTZ: - ((TimeStampNanoTZVector) valueVector).getMutator().set(i, parser.readValueAs(Long.class)); - break; - default: - throw new UnsupportedOperationException("minor type: " + valueVector.getMinorType()); - } - } - - @Override - public void close() throws IOException { - parser.close(); - for (Dictionary dictionary : dictionaries.values()) { - dictionary.getVector().close(); - } - } - - private T readNextField(String expectedFieldName, Class c) throws IOException, JsonParseException { - nextFieldIs(expectedFieldName); - parser.nextToken(); - return parser.readValueAs(c); - } - - private void nextFieldIs(String expectedFieldName) throws IOException, JsonParseException { - String name = parser.nextFieldName(); - if (name == null || !name.equals(expectedFieldName)) { - throw new 
IllegalStateException("Expected " + expectedFieldName + " but got " + name); - } - } - - private void readToken(JsonToken expected) throws JsonParseException, IOException { - JsonToken t = parser.nextToken(); - if (t != expected) { - throw new IllegalStateException("Expected " + expected + " but got " + t); - } - } - -} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowFileReader.java similarity index 77% rename from java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileReader.java rename to java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowFileReader.java index d711b9c6c1e26..4cd70262261e8 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowFileReader.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.arrow.vector.file; +package org.apache.arrow.vector.ipc; import java.io.IOException; import java.nio.ByteBuffer; @@ -26,32 +26,45 @@ import org.apache.arrow.flatbuf.Footer; import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.vector.schema.ArrowDictionaryBatch; -import org.apache.arrow.vector.schema.ArrowMessage; -import org.apache.arrow.vector.schema.ArrowRecordBatch; -import org.apache.arrow.vector.stream.MessageSerializer; +import org.apache.arrow.vector.ipc.message.ArrowBlock; +import org.apache.arrow.vector.ipc.message.ArrowFooter; +import org.apache.arrow.vector.ipc.message.ArrowDictionaryBatch; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; +import org.apache.arrow.vector.ipc.message.MessageSerializer; import org.apache.arrow.vector.types.pojo.Schema; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class ArrowFileReader extends ArrowReader { +public class ArrowFileReader extends ArrowReader { private static final Logger LOGGER = 
LoggerFactory.getLogger(ArrowFileReader.class); + private SeekableReadChannel in; private ArrowFooter footer; private int currentDictionaryBatch = 0; private int currentRecordBatch = 0; + public ArrowFileReader(SeekableReadChannel in, BufferAllocator allocator) { + super(allocator); + this.in = in; + } + public ArrowFileReader(SeekableByteChannel in, BufferAllocator allocator) { - super(new SeekableReadChannel(in), allocator); + this(new SeekableReadChannel(in), allocator); } - public ArrowFileReader(SeekableReadChannel in, BufferAllocator allocator) { - super(in, allocator); + @Override + public long bytesRead() { + return in.bytesRead(); + } + + @Override + protected void closeReadSource() throws IOException { + in.close(); } @Override - protected Schema readSchema(SeekableReadChannel in) throws IOException { + protected Schema readSchema() throws IOException { if (footer == null) { if (in.size() <= (ArrowMagic.MAGIC_LENGTH * 2 + 4)) { throw new InvalidArrowFileException("file too small: " + in.size()); @@ -82,18 +95,30 @@ protected Schema readSchema(SeekableReadChannel in) throws IOException { } @Override - protected ArrowMessage readMessage(SeekableReadChannel in, BufferAllocator allocator) throws IOException { - if (currentDictionaryBatch < footer.getDictionaries().size()) { - ArrowBlock block = footer.getDictionaries().get(currentDictionaryBatch++); - return readDictionaryBatch(in, block, allocator); - } else if (currentRecordBatch < footer.getRecordBatches().size()) { + public ArrowDictionaryBatch readDictionary() throws IOException { + if (currentDictionaryBatch >= footer.getDictionaries().size()) { + throw new IOException("Requested more dictionaries than defined in footer: " + currentDictionaryBatch); + } + ArrowBlock block = footer.getDictionaries().get(currentDictionaryBatch++); + return readDictionaryBatch(in, block, allocator); + } + + // Returns true if a batch was read, false if no more batches + @Override + public boolean loadNextBatch() throws 
IOException { + prepareLoadNextBatch(); + + if (currentRecordBatch < footer.getRecordBatches().size()) { ArrowBlock block = footer.getRecordBatches().get(currentRecordBatch++); - return readRecordBatch(in, block, allocator); + ArrowRecordBatch batch = readRecordBatch(in, block, allocator); + loadRecordBatch(batch); + return true; } else { - return null; + return false; } } + public List getDictionaryBlocks() throws IOException { ensureInitialized(); return footer.getDictionaries(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowFileWriter.java similarity index 94% rename from java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileWriter.java rename to java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowFileWriter.java index 1d92d2bde1c6f..1b687c9f2697a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFileWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowFileWriter.java @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -package org.apache.arrow.vector.file; +package org.apache.arrow.vector.ipc; import java.io.IOException; import java.nio.channels.WritableByteChannel; @@ -24,6 +24,8 @@ import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.dictionary.DictionaryProvider; +import org.apache.arrow.vector.ipc.message.ArrowBlock; +import org.apache.arrow.vector.ipc.message.ArrowFooter; import org.apache.arrow.vector.types.pojo.Schema; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowMagic.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowMagic.java similarity index 95% rename from java/vector/src/main/java/org/apache/arrow/vector/file/ArrowMagic.java rename to java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowMagic.java index 68313e7878b71..f71318ee678ad 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowMagic.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowMagic.java @@ -16,13 +16,13 @@ * limitations under the License. */ -package org.apache.arrow.vector.file; +package org.apache.arrow.vector.ipc; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.Arrays; -public class ArrowMagic { +class ArrowMagic { private static final byte[] MAGIC = "ARROW1".getBytes(StandardCharsets.UTF_8); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowReader.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowReader.java similarity index 65% rename from java/vector/src/main/java/org/apache/arrow/vector/file/ArrowReader.java rename to java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowReader.java index 21fb2207eb019..6d708a03cadb0 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowReader.java @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -package org.apache.arrow.vector.file; +package org.apache.arrow.vector.ipc; import java.io.IOException; import java.util.ArrayList; @@ -33,32 +33,25 @@ import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.dictionary.Dictionary; import org.apache.arrow.vector.dictionary.DictionaryProvider; -import org.apache.arrow.vector.schema.ArrowDictionaryBatch; -import org.apache.arrow.vector.schema.ArrowMessage; -import org.apache.arrow.vector.schema.ArrowMessage.ArrowMessageVisitor; -import org.apache.arrow.vector.schema.ArrowRecordBatch; +import org.apache.arrow.vector.ipc.message.ArrowDictionaryBatch; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; import org.apache.arrow.vector.util.DictionaryUtility; /** - * Abstract class to read ArrowRecordBatches from a ReadChannel. + * Abstract class to read Schema and ArrowRecordBatches. * - * @param Type of ReadChannel to use */ -public abstract class ArrowReader implements DictionaryProvider, AutoCloseable { - - private final T in; - private final BufferAllocator allocator; +public abstract class ArrowReader implements DictionaryProvider, AutoCloseable { + protected final BufferAllocator allocator; private VectorLoader loader; private VectorSchemaRoot root; private Map dictionaries; - private boolean initialized = false; - protected ArrowReader(T in, BufferAllocator allocator) { - this.in = in; + protected ArrowReader(BufferAllocator allocator) { this.allocator = allocator; } @@ -105,58 +98,18 @@ public Dictionary lookup(long id) { * @return true if a batch was read, false on EOS * @throws IOException */ - public boolean loadNextBatch() throws IOException { - ensureInitialized(); - // read in all dictionary batches, then stop after our first record batch - ArrowMessageVisitor visitor = new ArrowMessageVisitor() { - @Override - public Boolean visit(ArrowDictionaryBatch message) { - try { - 
load(message); - } finally { - message.close(); - } - return true; - } - - @Override - public Boolean visit(ArrowRecordBatch message) { - try { - loader.load(message); - } finally { - message.close(); - } - return false; - } - }; - root.setRowCount(0); - ArrowMessage message = readMessage(in, allocator); - - boolean readBatch = false; - while (message != null) { - if (!message.accepts(visitor)) { - readBatch = true; - break; - } - // else read a dictionary - message = readMessage(in, allocator); - } - - return readBatch; - } + public abstract boolean loadNextBatch() throws IOException; /** * Return the number of bytes read from the ReadChannel. * * @return number of bytes read */ - public long bytesRead() { - return in.bytesRead(); - } + public abstract long bytesRead(); /** * Close resources, including vector schema root and dictionary vectors, and the - * underlying ReadChannel. + * underlying read source. * * @throws IOException */ @@ -167,12 +120,12 @@ public void close() throws IOException { /** * Close resources, including vector schema root and dictionary vectors. If the flag - * closeReadChannel is true then close the underlying ReadChannel, otherwise leave it open. + * closeReadChannel is true then close the underlying read source, otherwise leave it open. * - * @param closeReadChannel Flag to control if closing the underlying ReadChannel + * @param closeReadSource Flag to control if closing the underlying read source * @throws IOException */ - public void close(boolean closeReadChannel) throws IOException { + public void close(boolean closeReadSource) throws IOException { if (initialized) { root.close(); for (Dictionary dictionary : dictionaries.values()) { @@ -180,15 +133,40 @@ public void close(boolean closeReadChannel) throws IOException { } } - if (closeReadChannel) { - in.close(); + if (closeReadSource) { + closeReadSource(); } } - protected abstract Schema readSchema(T in) throws IOException; + /** + * Close the underlying read source. 
+ * + * @throws IOException + */ + protected abstract void closeReadSource() throws IOException; + + /** + * Read the Schema from the source, will be invoked at the beginning the initialization. + * + * @return the read Schema + * @throws IOException + */ + protected abstract Schema readSchema() throws IOException; - protected abstract ArrowMessage readMessage(T in, BufferAllocator allocator) throws IOException; + /** + * Read a dictionary batch from the source, will be invoked after the schema has been read and + * called N times, where N is the number of dictionaries indicated by the schema Fields. + * + * @return the read ArrowDictionaryBatch + * @throws IOException + */ + protected abstract ArrowDictionaryBatch readDictionary() throws IOException; + /** + * Initialize if not done previously. + * + * @throws IOException + */ protected void ensureInitialized() throws IOException { if (!initialized) { initialize(); @@ -200,7 +178,7 @@ protected void ensureInitialized() throws IOException { * Reads the schema and initializes the vectors */ private void initialize() throws IOException { - Schema originalSchema = readSchema(in); + Schema originalSchema = readSchema(); List fields = new ArrayList<>(); List vectors = new ArrayList<>(); Map dictionaries = new HashMap<>(); @@ -216,9 +194,43 @@ private void initialize() throws IOException { this.root = new VectorSchemaRoot(schema, vectors, 0); this.loader = new VectorLoader(root); this.dictionaries = Collections.unmodifiableMap(dictionaries); + + // Read and load all dictionaries from schema + for (int i = 0; i < dictionaries.size(); i++) { + ArrowDictionaryBatch dictionaryBatch = readDictionary(); + loadDictionary(dictionaryBatch); + } } - private void load(ArrowDictionaryBatch dictionaryBatch) { + /** + * Ensure the reader has been initialized and reset the VectorSchemaRoot row count to 0. 
+ * + * @throws IOException + */ + protected void prepareLoadNextBatch() throws IOException { + ensureInitialized(); + root.setRowCount(0); + } + + /** + * Load an ArrowRecordBatch to the readers VectorSchemaRoot. + * + * @param batch the record batch to load + */ + protected void loadRecordBatch(ArrowRecordBatch batch) { + try { + loader.load(batch); + } finally { + batch.close(); + } + } + + /** + * Load an ArrowDictionaryBatch to the readers dictionary vectors. + * + * @param dictionaryBatch + */ + protected void loadDictionary(ArrowDictionaryBatch dictionaryBatch) { long id = dictionaryBatch.getDictionaryId(); Dictionary dictionary = dictionaries.get(id); if (dictionary == null) { @@ -227,6 +239,10 @@ private void load(ArrowDictionaryBatch dictionaryBatch) { FieldVector vector = dictionary.getVector(); VectorSchemaRoot root = new VectorSchemaRoot(ImmutableList.of(vector.getField()), ImmutableList.of(vector), 0); VectorLoader loader = new VectorLoader(root); - loader.load(dictionaryBatch.getDictionary()); + try { + loader.load(dictionaryBatch.getDictionary()); + } finally { + dictionaryBatch.close(); + } } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowStreamReader.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowStreamReader.java new file mode 100644 index 0000000000000..d1e48021885d5 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowStreamReader.java @@ -0,0 +1,148 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.channels.Channels; +import java.nio.channels.ReadableByteChannel; + +import org.apache.arrow.flatbuf.Message; +import org.apache.arrow.flatbuf.MessageHeader; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.ipc.ArrowReader; +import org.apache.arrow.vector.ipc.message.ArrowDictionaryBatch; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; +import org.apache.arrow.vector.ipc.message.MessageChannelReader; +import org.apache.arrow.vector.ipc.message.MessageReader; +import org.apache.arrow.vector.ipc.message.MessageSerializer; +import org.apache.arrow.vector.ipc.ReadChannel; +import org.apache.arrow.vector.types.pojo.Schema; + +/** + * This classes reads from an input stream and produces ArrowRecordBatches. + */ +public class ArrowStreamReader extends ArrowReader { + + private MessageReader messageReader; + + /** + * Constructs a streaming reader using the MessageReader interface. Non-blocking. + * + * @param messageReader interface to get read messages + * @param allocator to allocate new buffers + */ + public ArrowStreamReader(MessageReader messageReader, BufferAllocator allocator) { + super(allocator); + this.messageReader = messageReader; + } + + /** + * Constructs a streaming reader from a ReadableByteChannel input. Non-blocking. 
+ * + * @param in ReadableByteChannel to read messages from + * @param allocator to allocate new buffers + */ + public ArrowStreamReader(ReadableByteChannel in, BufferAllocator allocator) { + this(new MessageChannelReader(new ReadChannel(in)), allocator); + } + + /** + * Constructs a streaming reader from an InputStream. Non-blocking. + * + * @param in InputStream to read messages from + * @param allocator to allocate new buffers + */ + public ArrowStreamReader(InputStream in, BufferAllocator allocator) { + this(Channels.newChannel(in), allocator); + } + + /** + * Get the number of bytes read from the stream since constructing the reader. + * + * @return number of bytes + */ + @Override + public long bytesRead() { + return messageReader.bytesRead(); + } + + /** + * Closes the underlying read source. + * + * @throws IOException + */ + @Override + protected void closeReadSource() throws IOException { + messageReader.close(); + } + + /** + * Load the next ArrowRecordBatch to the vector schema root if available. + * + * @return true if a batch was read, false on EOS + * @throws IOException + */ + public boolean loadNextBatch() throws IOException { + prepareLoadNextBatch(); + + Message message = messageReader.readNextMessage(); + + // Reached EOS + if (message == null) { + return false; + } + + if (message.headerType() != MessageHeader.RecordBatch) { + throw new IOException("Expected RecordBatch but header was " + message.headerType()); + } + + ArrowRecordBatch batch = MessageSerializer.deserializeRecordBatch(messageReader, message, allocator); + loadRecordBatch(batch); + return true; + } + + /** + * Reads the schema message from the beginning of the stream. + * + * @return the deserialized arrow schema + */ + @Override + protected Schema readSchema() throws IOException { + return MessageSerializer.deserializeSchema(messageReader); + } + + /** + * Read a dictionary batch message, will be invoked after the schema and before normal record + * batches are read. 
+ * + * @return the deserialized dictionary batch + * @throws IOException + */ + @Override + protected ArrowDictionaryBatch readDictionary() throws IOException { + Message message = messageReader.readNextMessage(); + + if (message.headerType() != MessageHeader.DictionaryBatch) { + throw new IOException("Expected DictionaryBatch but header was " + message.headerType()); + } + + return MessageSerializer.deserializeDictionaryBatch(messageReader, message, allocator); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowStreamWriter.java similarity index 84% rename from java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamWriter.java rename to java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowStreamWriter.java index b854cd2bb6e74..d731d05b81f16 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowStreamWriter.java @@ -16,16 +16,13 @@ * limitations under the License. 
*/ -package org.apache.arrow.vector.stream; +package org.apache.arrow.vector.ipc; -import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.dictionary.DictionaryProvider; -import org.apache.arrow.vector.file.ArrowBlock; -import org.apache.arrow.vector.file.ArrowWriter; -import org.apache.arrow.vector.file.WriteChannel; -import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.ipc.message.ArrowBlock; +import org.apache.arrow.vector.ipc.ArrowWriter; +import org.apache.arrow.vector.ipc.WriteChannel; import org.apache.arrow.vector.types.pojo.Schema; import java.io.IOException; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowWriter.java similarity index 94% rename from java/vector/src/main/java/org/apache/arrow/vector/file/ArrowWriter.java rename to java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowWriter.java index b35aba5426e4a..4b483d0105004 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowWriter.java @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -package org.apache.arrow.vector.file; +package org.apache.arrow.vector.ipc; import java.io.IOException; import java.nio.channels.WritableByteChannel; @@ -30,9 +30,10 @@ import org.apache.arrow.vector.VectorUnloader; import org.apache.arrow.vector.dictionary.Dictionary; import org.apache.arrow.vector.dictionary.DictionaryProvider; -import org.apache.arrow.vector.schema.ArrowDictionaryBatch; -import org.apache.arrow.vector.schema.ArrowRecordBatch; -import org.apache.arrow.vector.stream.MessageSerializer; +import org.apache.arrow.vector.ipc.message.ArrowBlock; +import org.apache.arrow.vector.ipc.message.ArrowDictionaryBatch; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; +import org.apache.arrow.vector.ipc.message.MessageSerializer; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; import org.apache.arrow.vector.util.DictionaryUtility; @@ -82,7 +83,7 @@ protected ArrowWriter(VectorSchemaRoot root, DictionaryProvider provider, Writab for (long id : dictionaryIdsUsed) { Dictionary dictionary = provider.lookup(id); FieldVector vector = dictionary.getVector(); - int count = vector.getAccessor().getValueCount(); + int count = vector.getValueCount(); VectorSchemaRoot dictRoot = new VectorSchemaRoot(ImmutableList.of(vector.getField()), ImmutableList.of(vector), count); VectorUnloader unloader = new VectorUnloader(dictRoot); ArrowRecordBatch batch = unloader.getRecordBatch(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/InvalidArrowFileException.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/InvalidArrowFileException.java similarity index 96% rename from java/vector/src/main/java/org/apache/arrow/vector/file/InvalidArrowFileException.java rename to java/vector/src/main/java/org/apache/arrow/vector/ipc/InvalidArrowFileException.java index 607207f41b06c..ad9d8776e33f4 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/InvalidArrowFileException.java +++ 
b/java/vector/src/main/java/org/apache/arrow/vector/ipc/InvalidArrowFileException.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.arrow.vector.file; +package org.apache.arrow.vector.ipc; public class InvalidArrowFileException extends RuntimeException { private static final long serialVersionUID = 1L; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java new file mode 100644 index 0000000000000..d0a9b9e18b8f2 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java @@ -0,0 +1,584 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ ******************************************************************************/ + +package org.apache.arrow.vector.ipc; + +import static com.fasterxml.jackson.core.JsonToken.END_ARRAY; +import static com.fasterxml.jackson.core.JsonToken.END_OBJECT; +import static com.fasterxml.jackson.core.JsonToken.START_ARRAY; +import static com.fasterxml.jackson.core.JsonToken.START_OBJECT; +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.arrow.vector.BufferLayout.BufferType.*; + +import java.io.File; +import java.io.IOException; +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.*; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Lists; +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.*; +import org.apache.arrow.vector.dictionary.Dictionary; +import org.apache.arrow.vector.dictionary.DictionaryProvider; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.BufferLayout.BufferType; +import org.apache.arrow.vector.TypeLayout; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.DecimalUtility; +import org.apache.arrow.vector.util.DictionaryUtility; +import org.apache.commons.codec.DecoderException; +import org.apache.commons.codec.binary.Hex; + +import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonToken; +import com.fasterxml.jackson.databind.MappingJsonFactory; +import com.google.common.base.Objects; + +public class JsonFileReader implements AutoCloseable, DictionaryProvider { + private final JsonParser parser; + private final BufferAllocator allocator; + private Schema schema; + private Map 
dictionaries; + private Boolean started = false; + + public JsonFileReader(File inputFile, BufferAllocator allocator) throws JsonParseException, IOException { + super(); + this.allocator = allocator; + MappingJsonFactory jsonFactory = new MappingJsonFactory(); + this.parser = jsonFactory.createParser(inputFile); + // Allow reading NaN for floating point values + this.parser.configure(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS, true); + } + + @Override + public Dictionary lookup(long id) { + if (!started) { + throw new IllegalStateException("Unable to lookup until after read() has started"); + } + + return dictionaries.get(id); + } + + public Schema start() throws JsonParseException, IOException { + readToken(START_OBJECT); + { + Schema originalSchema = readNextField("schema", Schema.class); + List fields = new ArrayList<>(); + dictionaries = new HashMap<>(); + + // Convert fields with dictionaries to have the index type + for (Field field : originalSchema.getFields()) { + fields.add(DictionaryUtility.toMemoryFormat(field, allocator, dictionaries)); + } + this.schema = new Schema(fields, originalSchema.getCustomMetadata()); + + if (!dictionaries.isEmpty()) { + nextFieldIs("dictionaries"); + readDictionaryBatches(); + } + + nextFieldIs("batches"); + readToken(START_ARRAY); + started = true; + return this.schema; + } + } + + private void readDictionaryBatches() throws JsonParseException, IOException { + readToken(START_ARRAY); + JsonToken token = parser.nextToken(); + boolean haveDictionaryBatch = token == START_OBJECT; + while (haveDictionaryBatch) { + + // Lookup what dictionary for the batch about to be read + long id = readNextField("id", Long.class); + Dictionary dict = dictionaries.get(id); + if (dict == null) { + throw new IllegalArgumentException("Dictionary with id: " + id + " missing encoding from schema Field"); + } + + // Read the dictionary record batch + nextFieldIs("data"); + FieldVector vector = dict.getVector(); + List fields = 
ImmutableList.of(vector.getField()); + List vectors = ImmutableList.of(vector); + VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors, vector.getValueCount()); + read(root); + + readToken(END_OBJECT); + token = parser.nextToken(); + haveDictionaryBatch = token == START_OBJECT; + } + + if (token != END_ARRAY) { + throw new IllegalArgumentException("Invalid token: " + token + " expected end of array at " + parser.getTokenLocation()); + } + } + + public boolean read(VectorSchemaRoot root) throws IOException { + JsonToken t = parser.nextToken(); + if (t == START_OBJECT) { + { + int count = readNextField("count", Integer.class); + root.setRowCount(count); + nextFieldIs("columns"); + readToken(START_ARRAY); + { + for (Field field : root.getSchema().getFields()) { + FieldVector vector = root.getVector(field.getName()); + readFromJsonIntoVector(field, vector); + } + } + readToken(END_ARRAY); + } + readToken(END_OBJECT); + return true; + } else if (t == END_ARRAY) { + root.setRowCount(0); + return false; + } else { + throw new IllegalArgumentException("Invalid token: " + t); + } + } + + public VectorSchemaRoot read() throws IOException { + JsonToken t = parser.nextToken(); + if (t == START_OBJECT) { + VectorSchemaRoot recordBatch = VectorSchemaRoot.create(schema, allocator); + { + int count = readNextField("count", Integer.class); + recordBatch.setRowCount(count); + nextFieldIs("columns"); + readToken(START_ARRAY); + { + for (Field field : schema.getFields()) { + FieldVector vector = recordBatch.getVector(field.getName()); + readFromJsonIntoVector(field, vector); + } + } + readToken(END_ARRAY); + } + readToken(END_OBJECT); + return recordBatch; + } else if (t == END_ARRAY) { + return null; + } else { + throw new IllegalArgumentException("Invalid token: " + t); + } + } + + private abstract class BufferReader { + abstract protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException; + + ArrowBuf readBuffer(BufferAllocator allocator, int count) 
throws IOException { + readToken(START_ARRAY); + ArrowBuf buf = read(allocator, count); + readToken(END_ARRAY); + return buf; + } + } + + private class BufferHelper { + BufferReader BIT = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final int bufferSize = BitVectorHelper.getValidityBufferSize(count); + ArrowBuf buf = allocator.buffer(bufferSize); + + // C++ integration test fails without this. + buf.setZero(0, bufferSize); + + for (int i = 0; i < count; i++) { + parser.nextToken(); + BitVectorHelper.setValidityBit(buf, i, parser.readValueAs(Boolean.class) ? 1 : 0); + } + + buf.writerIndex(bufferSize); + return buf; + } + }; + + BufferReader INT1 = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final int size = count * TinyIntVector.TYPE_WIDTH; + ArrowBuf buf = allocator.buffer(size); + + for (int i = 0; i < count; i++) { + parser.nextToken(); + buf.writeByte(parser.getByteValue()); + } + + return buf; + } + }; + + BufferReader INT2 = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final int size = count * SmallIntVector.TYPE_WIDTH; + ArrowBuf buf = allocator.buffer(size); + + for (int i = 0; i < count; i++) { + parser.nextToken(); + buf.writeShort(parser.getShortValue()); + } + + return buf; + } + }; + + BufferReader INT4 = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final int size = count * IntVector.TYPE_WIDTH; + ArrowBuf buf = allocator.buffer(size); + + for (int i = 0; i < count; i++) { + parser.nextToken(); + buf.writeInt(parser.getIntValue()); + } + + return buf; + } + }; + + BufferReader INT8 = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final int size = count * BigIntVector.TYPE_WIDTH; + 
ArrowBuf buf = allocator.buffer(size); + + for (int i = 0; i < count; i++) { + parser.nextToken(); + buf.writeLong(parser.getLongValue()); + } + + return buf; + } + }; + + BufferReader FLOAT4 = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final int size = count * Float4Vector.TYPE_WIDTH; + ArrowBuf buf = allocator.buffer(size); + + for (int i = 0; i < count; i++) { + parser.nextToken(); + buf.writeFloat(parser.getFloatValue()); + } + + return buf; + } + }; + + BufferReader FLOAT8 = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final int size = count * Float8Vector.TYPE_WIDTH; + ArrowBuf buf = allocator.buffer(size); + + for (int i = 0; i < count; i++) { + parser.nextToken(); + buf.writeDouble(parser.getDoubleValue()); + } + + return buf; + } + }; + + BufferReader DECIMAL = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + final int size = count * DecimalVector.TYPE_WIDTH; + ArrowBuf buf = allocator.buffer(size); + + for (int i = 0; i < count; i++) { + parser.nextToken(); + BigDecimal decimalValue = new BigDecimal(parser.readValueAs(String.class)); + DecimalUtility.writeBigDecimalToArrowBuf(decimalValue, buf, i); + } + + buf.writerIndex(size); + return buf; + } + }; + + BufferReader VARCHAR = new BufferReader() { + @Override + protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + ArrayList values = Lists.newArrayList(); + int bufferSize = 0; + for (int i = 0; i < count; i++) { + parser.nextToken(); + final byte[] value = parser.getValueAsString().getBytes(UTF_8); + values.add(value); + bufferSize += value.length; + + } + + ArrowBuf buf = allocator.buffer(bufferSize); + + for (byte[] value : values) { + buf.writeBytes(value); + } + + return buf; + } + }; + + BufferReader VARBINARY = new BufferReader() { + @Override + 
protected ArrowBuf read(BufferAllocator allocator, int count) throws IOException { + ArrayList values = Lists.newArrayList(); + int bufferSize = 0; + for (int i = 0; i < count; i++) { + parser.nextToken(); + final byte[] value = decodeHexSafe(parser.readValueAs(String.class)); + values.add(value); + bufferSize += value.length; + + } + + ArrowBuf buf = allocator.buffer(bufferSize); + + for (byte[] value : values) { + buf.writeBytes(value); + } + + return buf; + } + }; + } + + private ArrowBuf readIntoBuffer(BufferAllocator allocator, BufferType bufferType, + Types.MinorType type, int count) throws IOException { + ArrowBuf buf; + + BufferHelper helper = new BufferHelper(); + + BufferReader reader = null; + + if (bufferType.equals(VALIDITY)) { + reader = helper.BIT; + } else if (bufferType.equals(OFFSET)) { + reader = helper.INT4; + } else if (bufferType.equals(TYPE)) { + reader = helper.INT1; + } else if (bufferType.equals(DATA)) { + switch (type) { + case BIT: + reader = helper.BIT; + break; + case TINYINT: + reader = helper.INT1; + break; + case SMALLINT: + reader = helper.INT2; + break; + case INT: + reader = helper.INT4; + break; + case BIGINT: + reader = helper.INT8; + break; + case UINT1: + reader = helper.INT1; + break; + case UINT2: + reader = helper.INT2; + break; + case UINT4: + reader = helper.INT4; + break; + case UINT8: + reader = helper.INT8; + break; + case FLOAT4: + reader = helper.FLOAT4; + break; + case FLOAT8: + reader = helper.FLOAT8; + break; + case DECIMAL: + reader = helper.DECIMAL; + break; + case VARCHAR: + reader = helper.VARCHAR; + break; + case VARBINARY: + reader = helper.VARBINARY; + break; + case DATEDAY: + reader = helper.INT4; + break; + case DATEMILLI: + reader = helper.INT8; + break; + case TIMESEC: + case TIMEMILLI: + reader = helper.INT4; + break; + case TIMEMICRO: + case TIMENANO: + reader = helper.INT8; + break; + case TIMESTAMPNANO: + case TIMESTAMPMICRO: + case TIMESTAMPMILLI: + case TIMESTAMPSEC: + case TIMESTAMPNANOTZ: + 
case TIMESTAMPMICROTZ: + case TIMESTAMPMILLITZ: + case TIMESTAMPSECTZ: + reader = helper.INT8; + break; + default: + throw new UnsupportedOperationException("Cannot read array of type " + type); + } + } else { + throw new InvalidArrowFileException("Unrecognized buffer type " + bufferType); + } + + buf = reader.readBuffer(allocator, count); + + assert buf != null; + return buf; + } + + private void readFromJsonIntoVector(Field field, FieldVector vector) throws JsonParseException, IOException { + TypeLayout typeLayout = TypeLayout.getTypeLayout(field.getType()); + List vectorTypes = typeLayout.getBufferTypes(); + ArrowBuf[] vectorBuffers = new ArrowBuf[vectorTypes.size()]; + /* + * The order of inner buffers is : + * Fixed width vector: + * -- validity buffer + * -- data buffer + * Variable width vector: + * -- validity buffer + * -- offset buffer + * -- data buffer + * + * This is similar to what getFieldInnerVectors() used to give but now that we don't have + * inner vectors anymore, we will work directly at the buffer level -- populate buffers + * locally as we read from Json parser and do loadFieldBuffers on the vector followed by + * releasing the local buffers. + */ + readToken(START_OBJECT); + { + // If currently reading dictionaries, field name is not important so don't check + String name = readNextField("name", String.class); + if (started && !Objects.equal(field.getName(), name)) { + throw new IllegalArgumentException("Expected field " + field.getName() + " but got " + name); + } + + /* Initialize the vector with required capacity but don't allocateNew since we would + * be doing loadFieldBuffers. 
+ */ + int valueCount = readNextField("count", Integer.class); + vector.setInitialCapacity(valueCount); + + for (int v = 0; v < vectorTypes.size(); v++) { + BufferType bufferType = vectorTypes.get(v); + nextFieldIs(bufferType.getName()); + int innerBufferValueCount = valueCount; + if (bufferType.equals(OFFSET)) { + /* offset buffer has 1 additional value capacity */ + innerBufferValueCount = valueCount + 1; + } + + vectorBuffers[v] = readIntoBuffer(allocator, bufferType, vector.getMinorType(), innerBufferValueCount); + } + + final int nullCount = BitVectorHelper.getNullCount(vectorBuffers[0], valueCount); + final ArrowFieldNode fieldNode = new ArrowFieldNode(valueCount, nullCount); + vector.loadFieldBuffers(fieldNode, Arrays.asList(vectorBuffers)); + + /* read child vectors (if any) */ + List fields = field.getChildren(); + if (!fields.isEmpty()) { + List vectorChildren = vector.getChildrenFromFields(); + if (fields.size() != vectorChildren.size()) { + throw new IllegalArgumentException( + "fields and children are not the same size: " + fields.size() + " != " + vectorChildren.size()); + } + nextFieldIs("children"); + readToken(START_ARRAY); + for (int i = 0; i < fields.size(); i++) { + Field childField = fields.get(i); + FieldVector childVector = vectorChildren.get(i); + readFromJsonIntoVector(childField, childVector); + } + readToken(END_ARRAY); + } + } + readToken(END_OBJECT); + + for (ArrowBuf buffer: vectorBuffers) { + buffer.release(); + } + } + + private byte[] decodeHexSafe(String hexString) throws IOException { + try { + return Hex.decodeHex(hexString.toCharArray()); + } catch (DecoderException e) { + throw new IOException("Unable to decode hex string: " + hexString, e); + } + } + + @Override + public void close() throws IOException { + parser.close(); + for (Dictionary dictionary : dictionaries.values()) { + dictionary.getVector().close(); + } + } + + private T readNextField(String expectedFieldName, Class c) throws IOException, JsonParseException { + 
nextFieldIs(expectedFieldName); + parser.nextToken(); + return parser.readValueAs(c); + } + + private void nextFieldIs(String expectedFieldName) throws IOException, JsonParseException { + String name = parser.nextFieldName(); + if (name == null || !name.equals(expectedFieldName)) { + throw new IllegalStateException("Expected " + expectedFieldName + " but got " + name); + } + } + + private void readToken(JsonToken expected) throws JsonParseException, IOException { + JsonToken t = parser.nextToken(); + if (t != expected) { + throw new IllegalStateException("Expected " + expected + " but got " + t); + } + } + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java similarity index 52% rename from java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java rename to java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java index 04e44379e5dfa..6eb76a7a14723 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java @@ -16,10 +16,13 @@ * limitations under the License. 
******************************************************************************/ -package org.apache.arrow.vector.file.json; +package org.apache.arrow.vector.ipc; + +import static org.apache.arrow.vector.BufferLayout.BufferType.*; import java.io.File; import java.io.IOException; +import java.math.BigDecimal; import java.util.ArrayList; import java.util.HashSet; import java.util.List; @@ -27,27 +30,12 @@ import com.google.common.collect.ImmutableList; import io.netty.buffer.ArrowBuf; -import org.apache.arrow.vector.BitVector; -import org.apache.arrow.vector.BufferBacked; -import org.apache.arrow.vector.DateDayVector; -import org.apache.arrow.vector.DateMilliVector; -import org.apache.arrow.vector.DecimalVector; -import org.apache.arrow.vector.FieldVector; -import org.apache.arrow.vector.TimeMicroVector; -import org.apache.arrow.vector.TimeMilliVector; -import org.apache.arrow.vector.TimeNanoVector; -import org.apache.arrow.vector.TimeSecVector; -import org.apache.arrow.vector.TimeStampMicroVector; -import org.apache.arrow.vector.TimeStampMilliVector; -import org.apache.arrow.vector.TimeStampNanoVector; -import org.apache.arrow.vector.TimeStampSecVector; -import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.ValueVector.Accessor; -import org.apache.arrow.vector.VarBinaryVector; -import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.*; import org.apache.arrow.vector.dictionary.Dictionary; import org.apache.arrow.vector.dictionary.DictionaryProvider; -import org.apache.arrow.vector.schema.ArrowVectorType; +import org.apache.arrow.vector.BufferLayout.BufferType; +import org.apache.arrow.vector.TypeLayout; +import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; @@ -97,6 +85,8 @@ public JsonFileWriter(File outputFile, JSONWriteConfig config) throws IOExceptio prettyPrinter.indentArraysWith(NopIndenter.instance); 
this.generator.setPrettyPrinter(prettyPrinter); } + // Allow writing of floating point NaN values not as strings + this.generator.configure(JsonGenerator.Feature.QUOTE_NON_NUMERIC_NUMBERS, false); } public void start(Schema schema, DictionaryProvider provider) throws IOException { @@ -133,7 +123,7 @@ private void writeDictionaryBatches(JsonGenerator generator, Set dictionar FieldVector vector = dictionary.getVector(); List fields = ImmutableList.of(vector.getField()); List vectors = ImmutableList.of(vector); - VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors, vector.getAccessor().getValueCount()); + VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors, vector.getValueCount()); writeBatch(root); generator.writeEndObject(); @@ -155,31 +145,38 @@ private void writeBatch(VectorSchemaRoot recordBatch) throws IOException { generator.writeArrayFieldStart("columns"); for (Field field : recordBatch.getSchema().getFields()) { FieldVector vector = recordBatch.getVector(field.getName()); - writeVector(field, vector); + writeFromVectorIntoJson(field, vector); } generator.writeEndArray(); } generator.writeEndObject(); } - private void writeVector(Field field, FieldVector vector) throws IOException { - List vectorTypes = field.getTypeLayout().getVectorTypes(); - List fieldInnerVectors = vector.getFieldInnerVectors(); - if (vectorTypes.size() != fieldInnerVectors.size()) { - throw new IllegalArgumentException("vector types and inner vectors are not the same size: " + vectorTypes.size() + " != " + fieldInnerVectors.size()); + private void writeFromVectorIntoJson(Field field, FieldVector vector) throws IOException { + List vectorTypes = TypeLayout.getTypeLayout(field.getType()).getBufferTypes(); + List vectorBuffers = vector.getFieldBuffers(); + if (vectorTypes.size() != vectorBuffers.size()) { + throw new IllegalArgumentException("vector types and inner vector buffers are not the same size: " + vectorTypes.size() + " != " + vectorBuffers.size()); } 
generator.writeStartObject(); { generator.writeObjectField("name", field.getName()); - int valueCount = vector.getAccessor().getValueCount(); + int valueCount = vector.getValueCount(); generator.writeObjectField("count", valueCount); + final int scale = (vector instanceof DecimalVector) ? + ((DecimalVector) vector).getScale() : 0; for (int v = 0; v < vectorTypes.size(); v++) { - ArrowVectorType vectorType = vectorTypes.get(v); - BufferBacked innerVector = fieldInnerVectors.get(v); - generator.writeArrayFieldStart(vectorType.getName()); - ValueVector valueVector = (ValueVector) innerVector; - for (int i = 0; i < valueVector.getAccessor().getValueCount(); i++) { - writeValueToGenerator(valueVector, i); + BufferType bufferType = vectorTypes.get(v); + ArrowBuf vectorBuffer = vectorBuffers.get(v); + generator.writeArrayFieldStart(bufferType.getName()); + final int bufferValueCount = (bufferType.equals(OFFSET)) ? valueCount + 1 : valueCount; + for (int i = 0; i < bufferValueCount; i++) { + if (bufferType.equals(DATA) && (vector.getMinorType() == Types.MinorType.VARCHAR || + vector.getMinorType() == Types.MinorType.VARBINARY)) { + writeValueToGenerator(bufferType, vectorBuffer, vectorBuffers.get(v-1), vector, i, scale); + } else { + writeValueToGenerator(bufferType, vectorBuffer, null, vector, i, scale); + } } generator.writeEndArray(); } @@ -193,7 +190,7 @@ private void writeVector(Field field, FieldVector vector) throws IOException { for (int i = 0; i < fields.size(); i++) { Field childField = fields.get(i); FieldVector childVector = children.get(i); - writeVector(childField, childVector); + writeFromVectorIntoJson(childField, childVector); } generator.writeEndArray(); } @@ -201,62 +198,102 @@ private void writeVector(Field field, FieldVector vector) throws IOException { generator.writeEndObject(); } - private void writeValueToGenerator(ValueVector valueVector, int i) throws IOException { - switch (valueVector.getMinorType()) { - case DATEDAY: - 
generator.writeNumber(((DateDayVector) valueVector).getAccessor().get(i)); - break; - case DATEMILLI: - generator.writeNumber(((DateMilliVector) valueVector).getAccessor().get(i)); - break; - case TIMESEC: - generator.writeNumber(((TimeSecVector) valueVector).getAccessor().get(i)); - break; - case TIMEMILLI: - generator.writeNumber(((TimeMilliVector) valueVector).getAccessor().get(i)); - break; - case TIMEMICRO: - generator.writeNumber(((TimeMicroVector) valueVector).getAccessor().get(i)); - break; - case TIMENANO: - generator.writeNumber(((TimeNanoVector) valueVector).getAccessor().get(i)); - break; - case TIMESTAMPSEC: - generator.writeNumber(((TimeStampSecVector) valueVector).getAccessor().get(i)); - break; - case TIMESTAMPMILLI: - generator.writeNumber(((TimeStampMilliVector) valueVector).getAccessor().get(i)); - break; - case TIMESTAMPMICRO: - generator.writeNumber(((TimeStampMicroVector) valueVector).getAccessor().get(i)); - break; - case TIMESTAMPNANO: - generator.writeNumber(((TimeStampNanoVector) valueVector).getAccessor().get(i)); - break; - case BIT: - generator.writeNumber(((BitVector) valueVector).getAccessor().get(i)); - break; - case VARBINARY: { - String hexString = Hex.encodeHexString(((VarBinaryVector) valueVector).getAccessor().get(i)); - generator.writeString(hexString); + private void writeValueToGenerator(BufferType bufferType, ArrowBuf buffer, + ArrowBuf offsetBuffer, FieldVector vector, + final int index, final int scale) throws IOException { + if (bufferType.equals(TYPE)) { + generator.writeNumber(buffer.getByte(index * TinyIntVector.TYPE_WIDTH)); + } else if (bufferType.equals(OFFSET)) { + generator.writeNumber(buffer.getInt(index * BaseVariableWidthVector.OFFSET_WIDTH)); + } else if(bufferType.equals(VALIDITY)) { + generator.writeNumber(vector.isNull(index) ? 
0 : 1); + } else if (bufferType.equals(DATA)) { + switch (vector.getMinorType()) { + case TINYINT: + generator.writeNumber(TinyIntVector.get(buffer, index)); + break; + case SMALLINT: + generator.writeNumber(SmallIntVector.get(buffer, index)); + break; + case INT: + generator.writeNumber(IntVector.get(buffer, index)); + break; + case BIGINT: + generator.writeNumber(BigIntVector.get(buffer, index)); + break; + case FLOAT4: + generator.writeNumber(Float4Vector.get(buffer, index)); + break; + case FLOAT8: + generator.writeNumber(Float8Vector.get(buffer, index)); + break; + case DATEDAY: + generator.writeNumber(DateDayVector.get(buffer, index)); + break; + case DATEMILLI: + generator.writeNumber(DateMilliVector.get(buffer, index)); + break; + case TIMESEC: + generator.writeNumber(TimeSecVector.get(buffer, index)); + break; + case TIMEMILLI: + generator.writeNumber(TimeMilliVector.get(buffer, index)); + break; + case TIMEMICRO: + generator.writeNumber(TimeMicroVector.get(buffer, index)); + break; + case TIMENANO: + generator.writeNumber(TimeNanoVector.get(buffer, index)); + break; + case TIMESTAMPSEC: + generator.writeNumber(TimeStampSecVector.get(buffer, index)); + break; + case TIMESTAMPMILLI: + generator.writeNumber(TimeStampMilliVector.get(buffer, index)); + break; + case TIMESTAMPMICRO: + generator.writeNumber(TimeStampMicroVector.get(buffer, index)); + break; + case TIMESTAMPNANO: + generator.writeNumber(TimeStampNanoVector.get(buffer, index)); + break; + case TIMESTAMPSECTZ: + generator.writeNumber(TimeStampSecTZVector.get(buffer, index)); + break; + case TIMESTAMPMILLITZ: + generator.writeNumber(TimeStampMilliTZVector.get(buffer, index)); + break; + case TIMESTAMPMICROTZ: + generator.writeNumber(TimeStampMicroTZVector.get(buffer, index)); + break; + case TIMESTAMPNANOTZ: + generator.writeNumber(TimeStampNanoTZVector.get(buffer, index)); + break; + case BIT: + generator.writeNumber(BitVectorHelper.get(buffer, index)); + break; + case VARBINARY: { + assert 
offsetBuffer != null; + String hexString = Hex.encodeHexString(BaseVariableWidthVector.get(buffer, + offsetBuffer, index)); + generator.writeObject(hexString); + break; } - break; - case DECIMAL: { - ArrowBuf bytebuf = valueVector.getDataBuffer(); - String hexString = Hex.encodeHexString(DecimalUtility.getByteArrayFromArrowBuf(bytebuf, i)); - generator.writeString(hexString); + case VARCHAR: { + assert offsetBuffer != null; + byte[] b = (BaseVariableWidthVector.get(buffer, offsetBuffer, index)); + generator.writeString(new String(b, "UTF-8")); + break; } - break; - default: - // TODO: each type - Accessor accessor = valueVector.getAccessor(); - Object value = accessor.getObject(i); - if (value instanceof Number || value instanceof Boolean) { - generator.writeObject(value); - } else { - generator.writeObject(value.toString()); + case DECIMAL: { + BigDecimal decimalValue = DecimalUtility.getBigDecimalFromArrowBuf(buffer, index, scale); + // We write the unscaled value, because the scale is stored in the type metadata. + generator.writeString(decimalValue.unscaledValue().toString()); + break; } - break; + default: + throw new UnsupportedOperationException("minor type: " + vector.getMinorType()); + } } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ReadChannel.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ReadChannel.java similarity index 98% rename from java/vector/src/main/java/org/apache/arrow/vector/file/ReadChannel.java rename to java/vector/src/main/java/org/apache/arrow/vector/ipc/ReadChannel.java index b0eb8f3d84d9a..395fd7db5975f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ReadChannel.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ReadChannel.java @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -package org.apache.arrow.vector.file; +package org.apache.arrow.vector.ipc; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/SeekableReadChannel.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/SeekableReadChannel.java similarity index 97% rename from java/vector/src/main/java/org/apache/arrow/vector/file/SeekableReadChannel.java rename to java/vector/src/main/java/org/apache/arrow/vector/ipc/SeekableReadChannel.java index 46bea1314da63..62ba3b73e5377 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/SeekableReadChannel.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/SeekableReadChannel.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.arrow.vector.file; +package org.apache.arrow.vector.ipc; import java.io.IOException; import java.nio.channels.SeekableByteChannel; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/WriteChannel.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/WriteChannel.java similarity index 97% rename from java/vector/src/main/java/org/apache/arrow/vector/file/WriteChannel.java rename to java/vector/src/main/java/org/apache/arrow/vector/ipc/WriteChannel.java index 89c9d1f9b7a44..da500aa97bee9 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/WriteChannel.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/WriteChannel.java @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -package org.apache.arrow.vector.file; +package org.apache.arrow.vector.ipc; import java.io.IOException; import java.nio.ByteBuffer; @@ -25,7 +25,7 @@ import com.google.flatbuffers.FlatBufferBuilder; import io.netty.buffer.ArrowBuf; -import org.apache.arrow.vector.schema.FBSerializable; +import org.apache.arrow.vector.ipc.message.FBSerializable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowBlock.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowBlock.java similarity index 96% rename from java/vector/src/main/java/org/apache/arrow/vector/file/ArrowBlock.java rename to java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowBlock.java index e1b4d6a8b215e..8731f77ac2c4c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowBlock.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowBlock.java @@ -16,10 +16,9 @@ * limitations under the License. */ -package org.apache.arrow.vector.file; +package org.apache.arrow.vector.ipc.message; import org.apache.arrow.flatbuf.Block; -import org.apache.arrow.vector.schema.FBSerializable; import com.google.flatbuffers.FlatBufferBuilder; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowBuffer.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowBuffer.java similarity index 81% rename from java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowBuffer.java rename to java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowBuffer.java index d8c9e3001d0a5..6b0eeaad4d177 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowBuffer.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowBuffer.java @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -package org.apache.arrow.vector.schema; +package org.apache.arrow.vector.ipc.message; import org.apache.arrow.flatbuf.Buffer; @@ -24,21 +24,15 @@ public class ArrowBuffer implements FBSerializable { - private int page; private long offset; private long size; - public ArrowBuffer(int page, long offset, long size) { + public ArrowBuffer(long offset, long size) { super(); - this.page = page; this.offset = offset; this.size = size; } - public int getPage() { - return page; - } - public long getOffset() { return offset; } @@ -52,7 +46,6 @@ public int hashCode() { final int prime = 31; int result = 1; result = prime * result + (int) (offset ^ (offset >>> 32)); - result = prime * result + page; result = prime * result + (int) (size ^ (size >>> 32)); return result; } @@ -72,9 +65,6 @@ public boolean equals(Object obj) { if (offset != other.offset) { return false; } - if (page != other.page) { - return false; - } if (size != other.size) { return false; } @@ -83,12 +73,12 @@ public boolean equals(Object obj) { @Override public int writeTo(FlatBufferBuilder builder) { - return Buffer.createBuffer(builder, page, offset, size); + return Buffer.createBuffer(builder, offset, size); } @Override public String toString() { - return "ArrowBuffer [page=" + page + ", offset=" + offset + ", size=" + size + "]"; + return "ArrowBuffer [offset=" + offset + ", size=" + size + "]"; } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowDictionaryBatch.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowDictionaryBatch.java similarity index 97% rename from java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowDictionaryBatch.java rename to java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowDictionaryBatch.java index 635fa3fb42307..cd23cb96b6bd7 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowDictionaryBatch.java +++ 
b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowDictionaryBatch.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.arrow.vector.schema; +package org.apache.arrow.vector.ipc.message; import com.google.flatbuffers.FlatBufferBuilder; import org.apache.arrow.flatbuf.DictionaryBatch; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowFieldNode.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowFieldNode.java similarity index 97% rename from java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowFieldNode.java rename to java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowFieldNode.java index 3ed384ed7e280..ca0087f7089ce 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowFieldNode.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowFieldNode.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.arrow.vector.schema; +package org.apache.arrow.vector.ipc.message; import org.apache.arrow.flatbuf.FieldNode; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFooter.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowFooter.java similarity index 96% rename from java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFooter.java rename to java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowFooter.java index 1e95321fdec5b..f7794f7364bd3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFooter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowFooter.java @@ -16,16 +16,15 @@ * limitations under the License. 
*/ -package org.apache.arrow.vector.file; +package org.apache.arrow.vector.ipc.message; -import static org.apache.arrow.vector.schema.FBSerializables.writeAllStructsToVector; +import static org.apache.arrow.vector.ipc.message.FBSerializables.writeAllStructsToVector; import java.util.ArrayList; import java.util.List; import org.apache.arrow.flatbuf.Block; import org.apache.arrow.flatbuf.Footer; -import org.apache.arrow.vector.schema.FBSerializable; import org.apache.arrow.vector.types.pojo.Schema; import com.google.flatbuffers.FlatBufferBuilder; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowMessage.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowMessage.java similarity index 96% rename from java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowMessage.java rename to java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowMessage.java index f59b4b6c1721e..92fb58e16fe6b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowMessage.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowMessage.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.arrow.vector.schema; +package org.apache.arrow.vector.ipc.message; public interface ArrowMessage extends FBSerializable, AutoCloseable { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatch.java similarity index 93% rename from java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java rename to java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatch.java index c842d4c3f9a74..6c6481e74dd93 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatch.java @@ -16,9 +16,7 @@ * limitations under the License. 
*/ -package org.apache.arrow.vector.schema; - -import static org.apache.arrow.vector.schema.FBSerializables.writeAllStructsToVector; +package org.apache.arrow.vector.ipc.message; import java.nio.ByteBuffer; import java.util.ArrayList; @@ -72,7 +70,7 @@ public ArrowRecordBatch(int length, List nodes, List b for (ArrowBuf arrowBuf : buffers) { arrowBuf.retain(); long size = arrowBuf.readableBytes(); - arrowBuffers.add(new ArrowBuffer(0, offset, size)); + arrowBuffers.add(new ArrowBuffer(offset, size)); LOGGER.debug(String.format("Buffer in RecordBatch at %d, length: %d", offset, size)); offset += size; if (alignBuffers && offset % 8 != 0) { // align on 8 byte boundaries @@ -113,9 +111,9 @@ public List getBuffersLayout() { @Override public int writeTo(FlatBufferBuilder builder) { RecordBatch.startNodesVector(builder, nodes.size()); - int nodesOffset = writeAllStructsToVector(builder, nodes); + int nodesOffset = FBSerializables.writeAllStructsToVector(builder, nodes); RecordBatch.startBuffersVector(builder, buffers.size()); - int buffersOffset = writeAllStructsToVector(builder, buffersLayout); + int buffersOffset = FBSerializables.writeAllStructsToVector(builder, buffersLayout); RecordBatch.startRecordBatch(builder); RecordBatch.addLength(builder, length); RecordBatch.addNodes(builder, nodesOffset); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializable.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializable.java similarity index 95% rename from java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializable.java rename to java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializable.java index 91d60ea995b89..31f55bd522c68 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializable.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializable.java @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -package org.apache.arrow.vector.schema; +package org.apache.arrow.vector.ipc.message; import com.google.flatbuffers.FlatBufferBuilder; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializables.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializables.java similarity index 96% rename from java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializables.java rename to java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializables.java index ae5aa555e745e..6717ed7ab313e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializables.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializables.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.arrow.vector.schema; +package org.apache.arrow.vector.ipc.message; import java.util.ArrayList; import java.util.Collections; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageChannelReader.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageChannelReader.java new file mode 100644 index 0000000000000..5bc3e1fff6f96 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageChannelReader.java @@ -0,0 +1,115 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc.message; + + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.flatbuf.Message; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.ipc.ReadChannel; + +import java.io.IOException; +import java.nio.ByteBuffer; + +/** + * Reads a sequence of messages using a ReadChannel. + */ +public class MessageChannelReader implements MessageReader { + + private ReadChannel in; + + /** + * Construct from an existing ReadChannel. + * + * @param in Channel to read messages from + */ + public MessageChannelReader(ReadChannel in) { + this.in = in; + } + + /** + * Read the next message from the ReadChannel. + * + * @return A Message or null if ReadChannel has no more messages, indicated by message length of 0 + * @throws IOException + */ + @Override + public Message readNextMessage() throws IOException { + // Read the message size. There is an i32 little endian prefix. + ByteBuffer buffer = ByteBuffer.allocate(4); + if (in.readFully(buffer) != 4) { + return null; + } + int messageLength = MessageSerializer.bytesToInt(buffer.array()); + if (messageLength == 0) { + return null; + } + + buffer = ByteBuffer.allocate(messageLength); + if (in.readFully(buffer) != messageLength) { + throw new IOException( + "Unexpected end of stream trying to read message."); + } + buffer.rewind(); + + return Message.getRootAsMessage(buffer); + } + + /** + * Read a message body from the ReadChannel. 
+ * + * @param message Read message that is followed by a body of data + * @param allocator BufferAllocator to allocate memory for body data + * @return ArrowBuf containing the message body data + * @throws IOException + */ + @Override + public ArrowBuf readMessageBody(Message message, BufferAllocator allocator) throws IOException { + + int bodyLength = (int) message.bodyLength(); + + // Now read the record batch body + ArrowBuf buffer = allocator.buffer(bodyLength); + if (in.readFully(buffer, bodyLength) != bodyLength) { + throw new IOException("Unexpected end of input trying to read batch."); + } + + return buffer; + } + + /** + * Get the number of bytes read from the ReadChannel. + * + * @return number of bytes + */ + @Override + public long bytesRead() { + return in.bytesRead(); + } + + /** + * Close the ReadChannel. + * + * @throws IOException + */ + @Override + public void close() throws IOException { + in.close(); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageReader.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageReader.java new file mode 100644 index 0000000000000..b277c58295059 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageReader.java @@ -0,0 +1,65 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.ipc.message; + + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.flatbuf.Message; +import org.apache.arrow.memory.BufferAllocator; + +import java.io.IOException; + +/** + * Interface for reading a sequence of messages. + */ +public interface MessageReader { + + /** + * Read the next message in the sequence. + * + * @return The read message or null if reached the end of the message sequence + * @throws IOException + */ + Message readNextMessage() throws IOException; + + /** + * When a message is followed by a body of data, read that data into an ArrowBuf. This should + * only be called when a Message has a body length > 0. + * + * @param message Read message that is followed by a body of data + * @param allocator BufferAllocator to allocate memory for body data + * @return An ArrowBuf containing the body of the message that was read + * @throws IOException + */ + ArrowBuf readMessageBody(Message message, BufferAllocator allocator) throws IOException; + + /** + * Return the current number of bytes that have been read. + * + * @return number of bytes read + */ + long bytesRead(); + + /** + * Close any resource opened by the message reader, not including message body allocations. 
+ * + * @throws IOException + */ + void close() throws IOException; +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageSerializer.java similarity index 85% rename from java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java rename to java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageSerializer.java index f69aa41e7f6bd..e2f8f7d9a8d7e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/MessageSerializer.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.arrow.vector.stream; +package org.apache.arrow.vector.ipc.message; import java.io.IOException; import java.nio.ByteBuffer; @@ -31,14 +31,8 @@ import org.apache.arrow.flatbuf.MetadataVersion; import org.apache.arrow.flatbuf.RecordBatch; import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.vector.file.ArrowBlock; -import org.apache.arrow.vector.file.ReadChannel; -import org.apache.arrow.vector.file.WriteChannel; -import org.apache.arrow.vector.schema.ArrowBuffer; -import org.apache.arrow.vector.schema.ArrowDictionaryBatch; -import org.apache.arrow.vector.schema.ArrowFieldNode; -import org.apache.arrow.vector.schema.ArrowMessage; -import org.apache.arrow.vector.schema.ArrowRecordBatch; +import org.apache.arrow.vector.ipc.ReadChannel; +import org.apache.arrow.vector.ipc.WriteChannel; import org.apache.arrow.vector.types.pojo.Schema; import com.google.flatbuffers.FlatBufferBuilder; @@ -102,12 +96,12 @@ public static long serialize(WriteChannel out, Schema schema) throws IOException /** * Deserializes a schema object. Format is from serialize(). 
* - * @param in the channel to deserialize from + * @param reader the reader interface to deserialize from * @return the deserialized object * @throws IOException if something went wrong */ - public static Schema deserializeSchema(ReadChannel in) throws IOException { - Message message = deserializeMessage(in); + public static Schema deserializeSchema(MessageReader reader) throws IOException { + Message message = reader.readNextMessage(); if (message == null) { throw new IOException("Unexpected end of input. Missing schema."); } @@ -119,6 +113,16 @@ public static Schema deserializeSchema(ReadChannel in) throws IOException { message.header(new org.apache.arrow.flatbuf.Schema())); } + /** + * Deserializes a schema object. Format is from serialize(). + * + * @param in the channel to deserialize from + * @return the deserialized object + * @throws IOException if something went wrong + */ + public static Schema deserializeSchema(ReadChannel in) throws IOException { + return deserializeSchema(new MessageChannelReader(in)); + } /** * Serializes an ArrowRecordBatch. Returns the offset and length of the written batch. @@ -184,25 +188,20 @@ public static long writeBatchBuffers(WriteChannel out, ArrowRecordBatch batch) t } /** - * Deserializes a RecordBatch + * Deserializes a RecordBatch. 
* - * @param in the channel to deserialize from + * @param reader the reader interface to deserialize from * @param message the object to derialize to * @param alloc to allocate buffers * @return the deserialized object * @throws IOException if something went wrong */ - public static ArrowRecordBatch deserializeRecordBatch(ReadChannel in, Message message, BufferAllocator alloc) + public static ArrowRecordBatch deserializeRecordBatch(MessageReader reader, Message message, BufferAllocator alloc) throws IOException { RecordBatch recordBatchFB = (RecordBatch) message.header(new RecordBatch()); - int bodyLength = (int) message.bodyLength(); - // Now read the record batch body - ArrowBuf buffer = alloc.buffer(bodyLength); - if (in.readFully(buffer, bodyLength) != bodyLength) { - throw new IOException("Unexpected end of input trying to read batch."); - } + ArrowBuf buffer = reader.readMessageBody(message, alloc); return deserializeRecordBatch(recordBatchFB, buffer); } @@ -243,7 +242,14 @@ public static ArrowRecordBatch deserializeRecordBatch(ReadChannel in, ArrowBlock return deserializeRecordBatch(recordBatchFB, body); } - // Deserializes a record batch given the Flatbuffer metadata and in-memory body + /** + * Deserializes a record batch given the Flatbuffer metadata and in-memory body. + * + * @param recordBatchFB Deserialized FlatBuffer record batch + * @param body Read body of the record batch + * @return ArrowRecordBatch from metadata and in-memory body + * @throws IOException + */ public static ArrowRecordBatch deserializeRecordBatch(RecordBatch recordBatchFB, ArrowBuf body) throws IOException { // Now read the body @@ -314,26 +320,21 @@ public static ArrowBlock serialize(WriteChannel out, ArrowDictionaryBatch batch) } /** - * Deserializes a DictionaryBatch + * Deserializes a DictionaryBatch. 
* - * @param in where to read from + * @param reader where to read from * @param message the message message metadata to deserialize * @param alloc the allocator for new buffers * @return the corresponding dictionary batch * @throws IOException if something went wrong */ - public static ArrowDictionaryBatch deserializeDictionaryBatch(ReadChannel in, + public static ArrowDictionaryBatch deserializeDictionaryBatch(MessageReader reader, Message message, BufferAllocator alloc) throws IOException { DictionaryBatch dictionaryBatchFB = (DictionaryBatch) message.header(new DictionaryBatch()); - int bodyLength = (int) message.bodyLength(); - // Now read the record batch body - ArrowBuf body = alloc.buffer(bodyLength); - if (in.readFully(body, bodyLength) != bodyLength) { - throw new IOException("Unexpected end of input trying to read batch."); - } + ArrowBuf body = reader.readMessageBody(message, alloc); ArrowRecordBatch recordBatch = deserializeRecordBatch(dictionaryBatchFB.data(), body); return new ArrowDictionaryBatch(dictionaryBatchFB.id(), recordBatch); } @@ -377,24 +378,48 @@ public static ArrowDictionaryBatch deserializeDictionaryBatch(ReadChannel in, return new ArrowDictionaryBatch(dictionaryBatchFB.id(), recordBatch); } - public static ArrowMessage deserializeMessageBatch(ReadChannel in, BufferAllocator alloc) throws IOException { - Message message = deserializeMessage(in); + /** + * Deserialize a message that is either an ArrowDictionaryBatch or ArrowRecordBatch. 
+ * + * @param reader Interface to read messages from + * @param alloc Allocator for message data + * @return The deserialized record batch + * @throws IOException if the message is not an ArrowDictionaryBatch or ArrowRecordBatch + */ + public static ArrowMessage deserializeMessageBatch(MessageReader reader, BufferAllocator alloc) throws IOException { + Message message = reader.readNextMessage(); if (message == null) { return null; } else if (message.bodyLength() > Integer.MAX_VALUE) { throw new IOException("Cannot currently deserialize record batches over 2GB"); } + if (message.version() != MetadataVersion.V4) { + throw new IOException("Received metadata with an incompatible version number"); + } + switch (message.headerType()) { case MessageHeader.RecordBatch: - return deserializeRecordBatch(in, message, alloc); + return deserializeRecordBatch(reader, message, alloc); case MessageHeader.DictionaryBatch: - return deserializeDictionaryBatch(in, message, alloc); + return deserializeDictionaryBatch(reader, message, alloc); default: throw new IOException("Unexpected message header type " + message.headerType()); } } + /** + * Deserialize a message that is either an ArrowDictionaryBatch or ArrowRecordBatch. + * + * @param in ReadChannel to read messages from + * @param alloc Allocator for message data + * @return The deserialized record batch + * @throws IOException if the message is not an ArrowDictionaryBatch or ArrowRecordBatch + */ + public static ArrowMessage deserializeMessageBatch(ReadChannel in, BufferAllocator alloc) throws IOException { + return deserializeMessageBatch(new MessageChannelReader(in), alloc); + } + /** * Serializes a message header. 
* @@ -409,7 +434,7 @@ public static ByteBuffer serializeMessage(FlatBufferBuilder builder, byte header Message.startMessage(builder); Message.addHeaderType(builder, headerType); Message.addHeader(builder, headerOffset); - Message.addVersion(builder, MetadataVersion.V3); + Message.addVersion(builder, MetadataVersion.V4); Message.addBodyLength(builder, bodyLength); builder.finish(Message.endMessage(builder)); return builder.dataBuffer(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java deleted file mode 100644 index 9d2fdfaafe4aa..0000000000000 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java +++ /dev/null @@ -1,101 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.arrow.vector.schema; - -import java.util.Map; - -import org.apache.arrow.flatbuf.VectorType; - -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonValue; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableMap.Builder; - -public class ArrowVectorType { - - public static final ArrowVectorType DATA = new ArrowVectorType(VectorType.DATA); - public static final ArrowVectorType OFFSET = new ArrowVectorType(VectorType.OFFSET); - public static final ArrowVectorType VALIDITY = new ArrowVectorType(VectorType.VALIDITY); - public static final ArrowVectorType TYPE = new ArrowVectorType(VectorType.TYPE); - - private static final Map typeByName; - - static { - ArrowVectorType[] types = {DATA, OFFSET, VALIDITY, TYPE}; - Builder builder = ImmutableMap.builder(); - for (ArrowVectorType type : types) { - builder.put(type.getName(), type); - } - typeByName = builder.build(); - } - - public static ArrowVectorType fromName(String name) { - ArrowVectorType type = typeByName.get(name); - if (type == null) { - throw new IllegalArgumentException("Unknown type " + name); - } - return type; - } - - private final short type; - - public ArrowVectorType(short type) { - this.type = type; - // validate that the type is valid - getName(); - } - - @JsonCreator - private ArrowVectorType(String name) { - this.type = fromName(name).type; - } - - public short getType() { - return type; - } - - @JsonValue - public String getName() { - try { - return VectorType.name(type); - } catch (ArrayIndexOutOfBoundsException e) { - throw new IllegalArgumentException("Unknown type " + type); - } - } - - @Override - public String toString() { - return getName(); - } - - @Override - public int hashCode() { - return type; - } - - @Override - public boolean equals(Object obj) { - if (obj instanceof ArrowVectorType) { - ArrowVectorType other = (ArrowVectorType) obj; - return type == other.type; - } - return 
false; - } - -} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java deleted file mode 100644 index 0871baf38edaa..0000000000000 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java +++ /dev/null @@ -1,135 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.arrow.vector.schema; - -import static org.apache.arrow.vector.schema.ArrowVectorType.DATA; -import static org.apache.arrow.vector.schema.ArrowVectorType.OFFSET; -import static org.apache.arrow.vector.schema.ArrowVectorType.TYPE; -import static org.apache.arrow.vector.schema.ArrowVectorType.VALIDITY; - -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; -import com.google.common.base.Preconditions; -import com.google.flatbuffers.FlatBufferBuilder; - -public class VectorLayout implements FBSerializable { - - private static final VectorLayout VALIDITY_VECTOR = new VectorLayout(VALIDITY, 1); - private static final VectorLayout OFFSET_VECTOR = new VectorLayout(OFFSET, 32); - private static final VectorLayout TYPE_VECTOR = new VectorLayout(TYPE, 32); - private static final VectorLayout BOOLEAN_VECTOR = new VectorLayout(DATA, 1); - private static final VectorLayout VALUES_64 = new VectorLayout(DATA, 64); - private static final VectorLayout VALUES_32 = new VectorLayout(DATA, 32); - private static final VectorLayout VALUES_16 = new VectorLayout(DATA, 16); - private static final VectorLayout VALUES_8 = new VectorLayout(DATA, 8); - - public static VectorLayout typeVector() { - return TYPE_VECTOR; - } - - public static VectorLayout offsetVector() { - return OFFSET_VECTOR; - } - - public static VectorLayout dataVector(int typeBitWidth) { - switch (typeBitWidth) { - case 8: - return VALUES_8; - case 16: - return VALUES_16; - case 32: - return VALUES_32; - case 64: - return VALUES_64; - default: - throw new IllegalArgumentException("only 8, 16, 32, or 64 bits supported"); - } - } - - public static VectorLayout booleanVector() { - return BOOLEAN_VECTOR; - } - - public static VectorLayout validityVector() { - return VALIDITY_VECTOR; - } - - public static VectorLayout byteVector() { - return dataVector(8); - } - - private final short typeBitWidth; - - private final ArrowVectorType type; - - 
@JsonCreator - private VectorLayout(@JsonProperty("type") ArrowVectorType type, @JsonProperty("typeBitWidth") int typeBitWidth) { - super(); - this.type = Preconditions.checkNotNull(type); - this.typeBitWidth = (short) typeBitWidth; - if (typeBitWidth <= 0) { - throw new IllegalArgumentException("bitWidth invalid: " + typeBitWidth); - } - } - - public VectorLayout(org.apache.arrow.flatbuf.VectorLayout layout) { - this(new ArrowVectorType(layout.type()), layout.bitWidth()); - } - - public int getTypeBitWidth() { - return typeBitWidth; - } - - public ArrowVectorType getType() { - return type; - } - - @Override - public String toString() { - return String.format("%s(%s)", type, typeBitWidth); - } - - @Override - public int hashCode() { - return 31 * (31 + type.hashCode()) + typeBitWidth; - } - - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - if (obj == null) { - return false; - } - if (getClass() != obj.getClass()) { - return false; - } - VectorLayout other = (VectorLayout) obj; - return type.equals(other.type) && (typeBitWidth == other.typeBitWidth); - } - - @Override - public int writeTo(FlatBufferBuilder builder) { - ; - return org.apache.arrow.flatbuf.VectorLayout.createVectorLayout(builder, typeBitWidth, type.getType()); - } - - -} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamReader.java b/java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamReader.java deleted file mode 100644 index 5b6300076b6c2..0000000000000 --- a/java/vector/src/main/java/org/apache/arrow/vector/stream/ArrowStreamReader.java +++ /dev/null @@ -1,66 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.arrow.vector.stream; - -import java.io.IOException; -import java.io.InputStream; -import java.nio.channels.Channels; -import java.nio.channels.ReadableByteChannel; - -import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.vector.file.ArrowReader; -import org.apache.arrow.vector.file.ReadChannel; -import org.apache.arrow.vector.schema.ArrowMessage; -import org.apache.arrow.vector.types.pojo.Schema; - -/** - * This classes reads from an input stream and produces ArrowRecordBatches. - */ -public class ArrowStreamReader extends ArrowReader { - - /** - * Constructs a streaming read, reading bytes from 'in'. Non-blocking. - * - * @param in the stream to read from - * @param allocator to allocate new buffers - */ - public ArrowStreamReader(ReadableByteChannel in, BufferAllocator allocator) { - super(new ReadChannel(in), allocator); - } - - public ArrowStreamReader(InputStream in, BufferAllocator allocator) { - this(Channels.newChannel(in), allocator); - } - - /** - * Reads the schema message from the beginning of the stream. 
- * - * @param in to allocate new buffers - * @return the deserialized arrow schema - */ - @Override - protected Schema readSchema(ReadChannel in) throws IOException { - return MessageSerializer.deserializeSchema(in); - } - - @Override - protected ArrowMessage readMessage(ReadChannel in, BufferAllocator allocator) throws IOException { - return MessageSerializer.deserializeMessageBatch(in, allocator); - } -} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index c57dd6dafe9e6..3c5fd81d572d0 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -24,36 +24,36 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.FieldVector; -import org.apache.arrow.vector.NullableBigIntVector; -import org.apache.arrow.vector.NullableBitVector; -import org.apache.arrow.vector.NullableDateDayVector; -import org.apache.arrow.vector.NullableDateMilliVector; -import org.apache.arrow.vector.NullableDecimalVector; -import org.apache.arrow.vector.NullableFloat4Vector; -import org.apache.arrow.vector.NullableFloat8Vector; -import org.apache.arrow.vector.NullableIntVector; -import org.apache.arrow.vector.NullableIntervalDayVector; -import org.apache.arrow.vector.NullableIntervalYearVector; -import org.apache.arrow.vector.NullableSmallIntVector; -import org.apache.arrow.vector.NullableTimeMicroVector; -import org.apache.arrow.vector.NullableTimeMilliVector; -import org.apache.arrow.vector.NullableTimeNanoVector; -import org.apache.arrow.vector.NullableTimeSecVector; -import org.apache.arrow.vector.NullableTimeStampMicroTZVector; -import org.apache.arrow.vector.NullableTimeStampMicroVector; -import org.apache.arrow.vector.NullableTimeStampMilliTZVector; -import org.apache.arrow.vector.NullableTimeStampMilliVector; -import 
org.apache.arrow.vector.NullableTimeStampNanoTZVector; -import org.apache.arrow.vector.NullableTimeStampNanoVector; -import org.apache.arrow.vector.NullableTimeStampSecTZVector; -import org.apache.arrow.vector.NullableTimeStampSecVector; -import org.apache.arrow.vector.NullableTinyIntVector; -import org.apache.arrow.vector.NullableUInt1Vector; -import org.apache.arrow.vector.NullableUInt2Vector; -import org.apache.arrow.vector.NullableUInt4Vector; -import org.apache.arrow.vector.NullableUInt8Vector; -import org.apache.arrow.vector.NullableVarBinaryVector; -import org.apache.arrow.vector.NullableVarCharVector; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.DateDayVector; +import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.IntervalDayVector; +import org.apache.arrow.vector.IntervalYearVector; +import org.apache.arrow.vector.SmallIntVector; +import org.apache.arrow.vector.TimeMicroVector; +import org.apache.arrow.vector.TimeMilliVector; +import org.apache.arrow.vector.TimeNanoVector; +import org.apache.arrow.vector.TimeSecVector; +import org.apache.arrow.vector.TimeStampMicroTZVector; +import org.apache.arrow.vector.TimeStampMicroVector; +import org.apache.arrow.vector.TimeStampMilliTZVector; +import org.apache.arrow.vector.TimeStampMilliVector; +import org.apache.arrow.vector.TimeStampNanoTZVector; +import org.apache.arrow.vector.TimeStampNanoVector; +import org.apache.arrow.vector.TimeStampSecTZVector; +import org.apache.arrow.vector.TimeStampSecVector; +import org.apache.arrow.vector.TinyIntVector; +import org.apache.arrow.vector.UInt1Vector; +import org.apache.arrow.vector.UInt2Vector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.UInt8Vector; +import 
org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.ZeroVector; import org.apache.arrow.vector.complex.FixedSizeListVector; @@ -142,293 +142,293 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { TINYINT(new Int(8, true)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableTinyIntVector(name, fieldType, allocator); + return new TinyIntVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new TinyIntWriterImpl((NullableTinyIntVector) vector); + return new TinyIntWriterImpl((TinyIntVector) vector); } }, SMALLINT(new Int(16, true)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableSmallIntVector(name, fieldType, allocator); + return new SmallIntVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new SmallIntWriterImpl((NullableSmallIntVector) vector); + return new SmallIntWriterImpl((SmallIntVector) vector); } }, INT(new Int(32, true)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableIntVector(name, fieldType, allocator); + return new IntVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new IntWriterImpl((NullableIntVector) vector); + return new IntWriterImpl((IntVector) vector); } }, BIGINT(new Int(64, true)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableBigIntVector(name, fieldType, allocator); + return new BigIntVector(name, 
fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new BigIntWriterImpl((NullableBigIntVector) vector); + return new BigIntWriterImpl((BigIntVector) vector); } }, DATEDAY(new Date(DateUnit.DAY)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableDateDayVector(name, fieldType, allocator); + return new DateDayVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new DateDayWriterImpl((NullableDateDayVector) vector); + return new DateDayWriterImpl((DateDayVector) vector); } }, DATEMILLI(new Date(DateUnit.MILLISECOND)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableDateMilliVector(name, fieldType, allocator); + return new DateMilliVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new DateMilliWriterImpl((NullableDateMilliVector) vector); + return new DateMilliWriterImpl((DateMilliVector) vector); } }, TIMESEC(new Time(TimeUnit.SECOND, 32)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableTimeSecVector(name, fieldType, allocator); + return new TimeSecVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new TimeSecWriterImpl((NullableTimeSecVector) vector); + return new TimeSecWriterImpl((TimeSecVector) vector); } }, TIMEMILLI(new Time(TimeUnit.MILLISECOND, 32)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableTimeMilliVector(name, fieldType, allocator); + return new TimeMilliVector(name, fieldType, 
allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new TimeMilliWriterImpl((NullableTimeMilliVector) vector); + return new TimeMilliWriterImpl((TimeMilliVector) vector); } }, TIMEMICRO(new Time(TimeUnit.MICROSECOND, 64)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableTimeMicroVector(name, fieldType, allocator); + return new TimeMicroVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new TimeMicroWriterImpl((NullableTimeMicroVector) vector); + return new TimeMicroWriterImpl((TimeMicroVector) vector); } }, TIMENANO(new Time(TimeUnit.NANOSECOND, 64)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableTimeNanoVector(name, fieldType, allocator); + return new TimeNanoVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new TimeNanoWriterImpl((NullableTimeNanoVector) vector); + return new TimeNanoWriterImpl((TimeNanoVector) vector); } }, // time in second from the Unix epoch, 00:00:00.000000 on 1 January 1970, UTC. TIMESTAMPSEC(new Timestamp(org.apache.arrow.vector.types.TimeUnit.SECOND, null)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableTimeStampSecVector(name, fieldType, allocator); + return new TimeStampSecVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new TimeStampSecWriterImpl((NullableTimeStampSecVector) vector); + return new TimeStampSecWriterImpl((TimeStampSecVector) vector); } }, // time in millis from the Unix epoch, 00:00:00.000 on 1 January 1970, UTC. 
TIMESTAMPMILLI(new Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, null)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableTimeStampMilliVector(name, fieldType, allocator); + return new TimeStampMilliVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new TimeStampMilliWriterImpl((NullableTimeStampMilliVector) vector); + return new TimeStampMilliWriterImpl((TimeStampMilliVector) vector); } }, // time in microsecond from the Unix epoch, 00:00:00.000000 on 1 January 1970, UTC. TIMESTAMPMICRO(new Timestamp(org.apache.arrow.vector.types.TimeUnit.MICROSECOND, null)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableTimeStampMicroVector(name, fieldType, allocator); + return new TimeStampMicroVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new TimeStampMicroWriterImpl((NullableTimeStampMicroVector) vector); + return new TimeStampMicroWriterImpl((TimeStampMicroVector) vector); } }, // time in nanosecond from the Unix epoch, 00:00:00.000000000 on 1 January 1970, UTC. 
TIMESTAMPNANO(new Timestamp(org.apache.arrow.vector.types.TimeUnit.NANOSECOND, null)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableTimeStampNanoVector(name, fieldType, allocator); + return new TimeStampNanoVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new TimeStampNanoWriterImpl((NullableTimeStampNanoVector) vector); + return new TimeStampNanoWriterImpl((TimeStampNanoVector) vector); } }, INTERVALDAY(new Interval(IntervalUnit.DAY_TIME)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableIntervalDayVector(name, fieldType, allocator); + return new IntervalDayVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new IntervalDayWriterImpl((NullableIntervalDayVector) vector); + return new IntervalDayWriterImpl((IntervalDayVector) vector); } }, INTERVALYEAR(new Interval(IntervalUnit.YEAR_MONTH)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableIntervalYearVector(name, fieldType, allocator); + return new IntervalYearVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new IntervalYearWriterImpl((NullableIntervalYearVector) vector); + return new IntervalYearWriterImpl((IntervalYearVector) vector); } }, // 4 byte ieee 754 FLOAT4(new FloatingPoint(SINGLE)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableFloat4Vector(name, fieldType, allocator); + return new Float4Vector(name, fieldType, allocator); } @Override public FieldWriter 
getNewFieldWriter(ValueVector vector) { - return new Float4WriterImpl((NullableFloat4Vector) vector); + return new Float4WriterImpl((Float4Vector) vector); } }, // 8 byte ieee 754 FLOAT8(new FloatingPoint(DOUBLE)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableFloat8Vector(name, fieldType, allocator); + return new Float8Vector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new Float8WriterImpl((NullableFloat8Vector) vector); + return new Float8WriterImpl((Float8Vector) vector); } }, BIT(Bool.INSTANCE) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableBitVector(name, fieldType, allocator); + return new BitVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new BitWriterImpl((NullableBitVector) vector); + return new BitWriterImpl((BitVector) vector); } }, VARCHAR(Utf8.INSTANCE) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableVarCharVector(name, fieldType, allocator); + return new VarCharVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new VarCharWriterImpl((NullableVarCharVector) vector); + return new VarCharWriterImpl((VarCharVector) vector); } }, VARBINARY(Binary.INSTANCE) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableVarBinaryVector(name, fieldType, allocator); + return new VarBinaryVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new 
VarBinaryWriterImpl((NullableVarBinaryVector) vector); + return new VarBinaryWriterImpl((VarBinaryVector) vector); } }, DECIMAL(null) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableDecimalVector(name, fieldType, allocator); + return new DecimalVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new DecimalWriterImpl((NullableDecimalVector) vector); + return new DecimalWriterImpl((DecimalVector) vector); } }, UINT1(new Int(8, false)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableUInt1Vector(name, fieldType, allocator); + return new UInt1Vector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new UInt1WriterImpl((NullableUInt1Vector) vector); + return new UInt1WriterImpl((UInt1Vector) vector); } }, UINT2(new Int(16, false)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableUInt2Vector(name, fieldType, allocator); + return new UInt2Vector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new UInt2WriterImpl((NullableUInt2Vector) vector); + return new UInt2WriterImpl((UInt2Vector) vector); } }, UINT4(new Int(32, false)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableUInt4Vector(name, fieldType, allocator); + return new UInt4Vector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new UInt4WriterImpl((NullableUInt4Vector) vector); + return new UInt4WriterImpl((UInt4Vector) vector); } }, 
UINT8(new Int(64, false)) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableUInt8Vector(name, fieldType, allocator); + return new UInt8Vector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new UInt8WriterImpl((NullableUInt8Vector) vector); + return new UInt8WriterImpl((UInt8Vector) vector); } }, LIST(List.INSTANCE) { @@ -470,45 +470,45 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { TIMESTAMPSECTZ(null) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableTimeStampSecTZVector(name, fieldType, allocator); + return new TimeStampSecTZVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new TimeStampSecTZWriterImpl((NullableTimeStampSecTZVector) vector); + return new TimeStampSecTZWriterImpl((TimeStampSecTZVector) vector); } }, TIMESTAMPMILLITZ(null) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableTimeStampMilliTZVector(name, fieldType, allocator); + return new TimeStampMilliTZVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new TimeStampMilliTZWriterImpl((NullableTimeStampMilliTZVector) vector); + return new TimeStampMilliTZWriterImpl((TimeStampMilliTZVector) vector); } }, TIMESTAMPMICROTZ(null) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableTimeStampMicroTZVector(name, fieldType, allocator); + return new TimeStampMicroTZVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return 
new TimeStampMicroTZWriterImpl((NullableTimeStampMicroTZVector) vector); + return new TimeStampMicroTZWriterImpl((TimeStampMicroTZVector) vector); } }, TIMESTAMPNANOTZ(null) { @Override public FieldVector getNewVector(String name, FieldType fieldType, BufferAllocator allocator, CallBack schemaChangeCallback) { - return new NullableTimeStampNanoTZVector(name, fieldType, allocator); + return new TimeStampNanoTZVector(name, fieldType, allocator); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new TimeStampNanoTZWriterImpl((NullableTimeStampNanoTZVector) vector); + return new TimeStampNanoTZWriterImpl((TimeStampNanoTZVector) vector); } }; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java index 48e71a976c0e8..b1f036a34a9a6 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java @@ -20,6 +20,7 @@ import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.arrow.vector.complex.BaseRepeatedValueVector.DATA_VECTOR_NAME; import static org.apache.arrow.vector.types.pojo.ArrowType.getTypeForField; import java.util.Iterator; @@ -39,10 +40,10 @@ import com.google.flatbuffers.FlatBufferBuilder; import org.apache.arrow.flatbuf.KeyValue; +import org.apache.arrow.flatbuf.Type; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.FieldVector; -import org.apache.arrow.vector.schema.TypeLayout; -import org.apache.arrow.vector.schema.VectorLayout; +import org.apache.arrow.vector.TypeLayout; import org.apache.arrow.vector.types.pojo.ArrowType.Int; public class Field { @@ -58,7 +59,6 @@ public static Field nullable(String name, ArrowType type) { private final String name; private final FieldType fieldType; private final List children; - private final TypeLayout typeLayout; @JsonCreator 
private Field( @@ -67,16 +67,14 @@ private Field( @JsonProperty("type") ArrowType type, @JsonProperty("dictionary") DictionaryEncoding dictionary, @JsonProperty("children") List children, - @JsonProperty("typeLayout") TypeLayout typeLayout, @JsonProperty("metadata") Map metadata) { - this(name, new FieldType(nullable, type, dictionary, metadata), children, typeLayout); + this(name, new FieldType(nullable, type, dictionary, metadata), children); } private Field(String name, FieldType fieldType, List children, TypeLayout typeLayout) { this.name = name; this.fieldType = checkNotNull(fieldType); this.children = children == null ? ImmutableList.of() : ImmutableList.copyOf(children); - this.typeLayout = checkNotNull(typeLayout); } // deprecated, use FieldType or static constructor instead @@ -115,13 +113,11 @@ public static Field convertField(org.apache.arrow.flatbuf.Field field) { } dictionary = new DictionaryEncoding(dictionaryFB.id(), dictionaryFB.isOrdered(), indexType); } - ImmutableList.Builder layout = ImmutableList.builder(); - for (int i = 0; i < field.layoutLength(); ++i) { - layout.add(new org.apache.arrow.vector.schema.VectorLayout(field.layout(i))); - } ImmutableList.Builder childrenBuilder = ImmutableList.builder(); for (int i = 0; i < field.childrenLength(); i++) { - childrenBuilder.add(convertField(field.children(i))); + Field childField = convertField(field.children(i)); + childField = mutateOriginalNameIfNeeded(field, childField); + childrenBuilder.add(childField); } List children = childrenBuilder.build(); ImmutableMap.Builder metadataBuilder = ImmutableMap.builder(); @@ -131,14 +127,27 @@ public static Field convertField(org.apache.arrow.flatbuf.Field field) { metadataBuilder.put(key == null ? "" : key, value == null ? 
"" : value); } Map metadata = metadataBuilder.build(); - return new Field(name, nullable, type, dictionary, children, new TypeLayout(layout.build()), metadata); + return new Field(name, nullable, type, dictionary, children, metadata); } - public void validate() { - TypeLayout expectedLayout = TypeLayout.getTypeLayout(getType()); - if (!expectedLayout.equals(typeLayout)) { - throw new IllegalArgumentException("Deserialized field does not match expected vectors. expected: " + expectedLayout + " got " + typeLayout); + /** + * Helper method to ensure backward compatibility with schemas generated prior to ARROW-1347, ARROW-1663 + * @param field + * @param originalChildField original field which name might be mutated + * @return original or mutated field + */ + private static Field mutateOriginalNameIfNeeded(org.apache.arrow.flatbuf.Field field, Field originalChildField) { + if ((field.typeType() == Type.List || field.typeType() == Type.FixedSizeList) + && originalChildField.getName().equals("[DEFAULT]")) { + return + new Field(DATA_VECTOR_NAME, + originalChildField.isNullable(), + originalChildField.getType(), + originalChildField.getDictionary(), + originalChildField.getChildren(), + originalChildField.getMetadata()); } + return originalChildField; } public int getField(FlatBufferBuilder builder) { @@ -159,12 +168,6 @@ public int getField(FlatBufferBuilder builder) { childrenData[i] = children.get(i).getField(builder); } int childrenOffset = org.apache.arrow.flatbuf.Field.createChildrenVector(builder, childrenData); - int[] buffersData = new int[typeLayout.getVectors().size()]; - for (int i = 0; i < buffersData.length; i++) { - VectorLayout vectorLayout = typeLayout.getVectors().get(i); - buffersData[i] = vectorLayout.writeTo(builder); - } - int layoutOffset = org.apache.arrow.flatbuf.Field.createLayoutVector(builder, buffersData); int[] metadataOffsets = new int[getMetadata().size()]; Iterator> metadataIterator = getMetadata().entrySet().iterator(); for (int i = 0; i 
< metadataOffsets.length; i++) { @@ -185,7 +188,6 @@ public int getField(FlatBufferBuilder builder) { org.apache.arrow.flatbuf.Field.addTypeType(builder, getType().getTypeID().getFlatbufID()); org.apache.arrow.flatbuf.Field.addType(builder, typeOffset); org.apache.arrow.flatbuf.Field.addChildren(builder, childrenOffset); - org.apache.arrow.flatbuf.Field.addLayout(builder, layoutOffset); org.apache.arrow.flatbuf.Field.addCustomMetadata(builder, metadataOffset); if (dictionary != null) { org.apache.arrow.flatbuf.Field.addDictionary(builder, dictionaryOffset); @@ -219,10 +221,6 @@ public List getChildren() { return children; } - public TypeLayout getTypeLayout() { - return typeLayout; - } - @JsonInclude(Include.NON_EMPTY) public Map getMetadata() { return fieldType.getMetadata(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java index 033ae6c09914d..c6d734d18d89b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java @@ -19,7 +19,6 @@ package org.apache.arrow.vector.util; import io.netty.buffer.ArrowBuf; -import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.types.pojo.ArrowType; import java.math.BigDecimal; @@ -142,8 +141,18 @@ public static StringBuilder toStringWithZeroes(long number, int desiredLength) { */ public static BigDecimal getBigDecimalFromArrowBuf(ArrowBuf bytebuf, int index, int scale) { byte[] value = new byte[DECIMAL_BYTE_LENGTH]; + byte temp; final int startIndex = index * DECIMAL_BYTE_LENGTH; + + // Decimal stored as little endian, need to swap bytes to make BigDecimal bytebuf.getBytes(startIndex, value, 0, DECIMAL_BYTE_LENGTH); + int stop = DECIMAL_BYTE_LENGTH / 2; + for (int i = 0, j; i < stop; i++) { + temp = value[i]; + j = (DECIMAL_BYTE_LENGTH - 1) - i; + value[i] = value[j]; + value[j] = temp; + 
} BigInteger unscaledValue = new BigInteger(value); return new BigDecimal(unscaledValue, scale); } @@ -212,10 +221,26 @@ private static void writeByteArrayToArrowBuf(byte[] bytes, ArrowBuf bytebuf, int if (bytes.length > DECIMAL_BYTE_LENGTH) { throw new UnsupportedOperationException("Decimal size greater than 16 bytes"); } - final int padLength = DECIMAL_BYTE_LENGTH - bytes.length; - for (int i = 0; i < padLength; i++) { + + // Decimal stored as little endian, need to swap data bytes before writing to ArrowBuf + byte[] bytesLE = new byte[bytes.length]; + int stop = bytes.length / 2; + for (int i = 0, j; i < stop; i++) { + j = (bytes.length - 1) - i; + bytesLE[i] = bytes[j]; + bytesLE[j] = bytes[i]; + } + if (bytes.length % 2 != 0) { + int i = (bytes.length / 2); + bytesLE[i] = bytes[i]; + } + + // Write LE data + bytebuf.setBytes(startIndex, bytesLE, 0, bytes.length); + + // Write padding after data + for (int i = bytes.length; i < DECIMAL_BYTE_LENGTH; i++) { bytebuf.setByte(startIndex + i, padValue); } - bytebuf.setBytes(startIndex + padLength, bytes, 0, bytes.length); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java index cf0596c8c1fb4..6b46dbae385bb 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java @@ -73,7 +73,7 @@ public static Field toMessageFormat(Field field, DictionaryProvider provider, Se } /** - * Convert field and child fields that have a dictionary encoding to message format, so fields + * Convert field and child fields that have a dictionary encoding to memory format, so fields * have the index type */ public static Field toMemoryFormat(Field field, BufferAllocator allocator, Map dictionaries) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java 
b/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java index 480bd76d445b0..b6db29a7fc64e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java @@ -19,7 +19,6 @@ package org.apache.arrow.vector.util; import java.util.ArrayList; -import java.util.List; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; @@ -40,21 +39,6 @@ public JsonStringArrayList(int size) { super(size); } - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - if (obj == null) { - return false; - } - if (!(obj instanceof List)) { - return false; - } - List other = (List) obj; - return this.size() == other.size() && this.containsAll(other); - } - @Override public final String toString() { try { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java b/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java index 6455389d582b9..cdb44fbeeaf3b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java @@ -19,7 +19,6 @@ package org.apache.arrow.vector.util; import java.util.LinkedHashMap; -import java.util.Map; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; @@ -36,36 +35,6 @@ public class JsonStringHashMap extends LinkedHashMap { mapper = new ObjectMapper(); } - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - if (obj == null) { - return false; - } - if (!(obj instanceof Map)) { - return false; - } - Map other = (Map) obj; - if (this.size() != other.size()) { - return false; - } - for (K key : this.keySet()) { - if (this.get(key) == null) { - if (other.get(key) == null) { - continue; - 
} else { - return false; - } - } - if (!this.get(key).equals(other.get(key))) { - return false; - } - } - return true; - } - @Override public final String toString() { try { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java b/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java index 6d3b390379a56..b863fa8af86fd 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java @@ -134,9 +134,9 @@ public Set keySet() { @Override public Collection values() { - return Lists.newArrayList(Iterables.transform(secondary.entries(), new Function, V>() { + return Lists.newArrayList(Iterables.transform(secondary.entries(), new Function, V>() { @Override - public V apply(IntObjectMap.Entry entry) { + public V apply(IntObjectMap.PrimitiveEntry entry) { return Preconditions.checkNotNull(entry).value(); } })); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java index 5851bd5fa5d97..c27e5e5c85c30 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java @@ -114,13 +114,13 @@ public static void compareFieldVectors(FieldVector vector1, FieldVector vector2) if (!field1.equals(vector2.getField())) { throw new IllegalArgumentException("Different Fields:\n" + field1 + "\n!=\n" + vector2.getField()); } - int valueCount = vector1.getAccessor().getValueCount(); - if (valueCount != vector2.getAccessor().getValueCount()) { - throw new IllegalArgumentException("Different value count for field " + field1 + " : " + valueCount + " != " + vector2.getAccessor().getValueCount()); + int valueCount = vector1.getValueCount(); + if (valueCount != vector2.getValueCount()) { + throw new IllegalArgumentException("Different value count for field 
" + field1 + " : " + valueCount + " != " + vector2.getValueCount()); } for (int j = 0; j < valueCount; j++) { - Object obj1 = vector1.getAccessor().getObject(j); - Object obj2 = vector2.getAccessor().getObject(j); + Object obj1 = vector1.getObject(j); + Object obj2 = vector2.getObject(j); if (!equals(field1.getType(), obj1, obj2)) { throw new IllegalArgumentException( "Different values in column:\n" + field1 + " at index " + j + ": " + obj1 + " != " + obj2); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestBitVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestBitVector.java index 17fcf05fcd9bd..a59e5cdd48fce 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestBitVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestBitVector.java @@ -55,17 +55,17 @@ public void testBitVectorCopyFromSafe() { dst.allocateNew(10); for (int i = 0; i < size; i++) { - src.getMutator().set(i, i % 2); + src.set(i, i % 2); } - src.getMutator().setValueCount(size); + src.setValueCount(size); for (int i = 0; i < size; i++) { dst.copyFromSafe(i, i, src); } - dst.getMutator().setValueCount(size); + dst.setValueCount(size); for (int i = 0; i < size; i++) { - assertEquals(src.getAccessor().getObject(i), dst.getAccessor().getObject(i)); + assertEquals(src.getObject(i), dst.getObject(i)); } } } @@ -74,25 +74,23 @@ public void testBitVectorCopyFromSafe() { public void testSplitAndTransfer() throws Exception { try (final BitVector sourceVector = new BitVector("bitvector", allocator)) { - final BitVector.Mutator sourceMutator = sourceVector.getMutator(); - final BitVector.Accessor sourceAccessor = sourceVector.getAccessor(); sourceVector.allocateNew(40); /* populate the bitvector -- 010101010101010101010101..... 
*/ for (int i = 0; i < 40; i++) { if ((i & 1) == 1) { - sourceMutator.set(i, 1); + sourceVector.set(i, 1); } else { - sourceMutator.set(i, 0); + sourceVector.set(i, 0); } } - sourceMutator.setValueCount(40); + sourceVector.setValueCount(40); /* check the vector output */ for (int i = 0; i < 40; i++) { - int result = sourceAccessor.get(i); + int result = sourceVector.get(i); if ((i & 1) == 1) { assertEquals(Integer.toString(1), Integer.toString(result)); } else { @@ -102,8 +100,6 @@ public void testSplitAndTransfer() throws Exception { try (final BitVector toVector = new BitVector("toVector", allocator)) { final TransferPair transferPair = sourceVector.makeTransferPair(toVector); - final BitVector.Accessor toAccessor = toVector.getAccessor(); - final BitVector.Mutator toMutator = toVector.getMutator(); /* * form test cases such that we cover: @@ -123,8 +119,8 @@ public void testSplitAndTransfer() throws Exception { /* check the toVector output after doing splitAndTransfer */ for (int i = 0; i < length; i++) { - int actual = toAccessor.get(i); - int expected = sourceAccessor.get(start + i); + int actual = toVector.get(i); + int expected = sourceVector.get(start + i); assertEquals("different data values not expected --> sourceVector index: " + (start + i) + " toVector index: " + i, expected, actual); } @@ -137,28 +133,24 @@ public void testSplitAndTransfer() throws Exception { public void testSplitAndTransfer1() throws Exception { try (final BitVector sourceVector = new BitVector("bitvector", allocator)) { - final BitVector.Mutator sourceMutator = sourceVector.getMutator(); - final BitVector.Accessor sourceAccessor = sourceVector.getAccessor(); sourceVector.allocateNew(8190); /* populate the bitvector */ for (int i = 0; i < 8190; i++) { - sourceMutator.set(i, 1); + sourceVector.set(i, 1); } - sourceMutator.setValueCount(8190); + sourceVector.setValueCount(8190); /* check the vector output */ for (int i = 0; i < 8190; i++) { - int result = sourceAccessor.get(i); + int 
result = sourceVector.get(i); assertEquals(Integer.toString(1), Integer.toString(result)); } try (final BitVector toVector = new BitVector("toVector", allocator)) { final TransferPair transferPair = sourceVector.makeTransferPair(toVector); - final BitVector.Accessor toAccessor = toVector.getAccessor(); - final BitVector.Mutator toMutator = toVector.getMutator(); final int[][] transferLengths = {{0, 4095}, {4095, 4095}}; @@ -170,8 +162,8 @@ public void testSplitAndTransfer1() throws Exception { /* check the toVector output after doing splitAndTransfer */ for (int i = 0; i < length; i++) { - int actual = toAccessor.get(i); - int expected = sourceAccessor.get(start + i); + int actual = toVector.get(i); + int expected = sourceVector.get(start + i); assertEquals("different data values not expected --> sourceVector index: " + (start + i) + " toVector index: " + i, expected, actual); } @@ -184,25 +176,23 @@ public void testSplitAndTransfer1() throws Exception { public void testSplitAndTransfer2() throws Exception { try (final BitVector sourceVector = new BitVector("bitvector", allocator)) { - final BitVector.Mutator sourceMutator = sourceVector.getMutator(); - final BitVector.Accessor sourceAccessor = sourceVector.getAccessor(); sourceVector.allocateNew(32); /* populate the bitvector */ for (int i = 0; i < 32; i++) { if ((i & 1) == 1) { - sourceMutator.set(i, 1); + sourceVector.set(i, 1); } else { - sourceMutator.set(i, 0); + sourceVector.set(i, 0); } } - sourceMutator.setValueCount(32); + sourceVector.setValueCount(32); /* check the vector output */ for (int i = 0; i < 32; i++) { - int result = sourceAccessor.get(i); + int result = sourceVector.get(i); if ((i & 1) == 1) { assertEquals(Integer.toString(1), Integer.toString(result)); } else { @@ -212,8 +202,6 @@ public void testSplitAndTransfer2() throws Exception { try (final BitVector toVector = new BitVector("toVector", allocator)) { final TransferPair transferPair = sourceVector.makeTransferPair(toVector); - final 
BitVector.Accessor toAccessor = toVector.getAccessor(); - final BitVector.Mutator toMutator = toVector.getMutator(); final int[][] transferLengths = {{5,22}, {5,24}, {5,25}, {5,27}, {0,31}, {5,7}, {2,3}}; @@ -225,8 +213,8 @@ public void testSplitAndTransfer2() throws Exception { /* check the toVector output after doing splitAndTransfer */ for (int i = 0; i < length; i++) { - int actual = toAccessor.get(i); - int expected = sourceAccessor.get(start + i); + int actual = toVector.get(i); + int expected = sourceVector.get(start + i); assertEquals("different data values not expected --> sourceVector index: " + (start + i) + " toVector index: " + i, expected, actual); } @@ -242,62 +230,56 @@ public void testReallocAfterVectorTransfer1() { int valueCapacity = vector.getValueCapacity(); assertEquals(4096, valueCapacity); - final BitVector.Mutator mutator = vector.getMutator(); - final BitVector.Accessor accessor = vector.getAccessor(); - for (int i = 0; i < valueCapacity; i++) { if ((i & 1) == 1) { - mutator.setToOne(i); + vector.setToOne(i); } } for (int i = 0; i < valueCapacity; i++) { - int val = accessor.get(i); if ((i & 1) == 1) { - assertEquals("unexpected cleared bit at index: " + i, 1, val); + assertEquals("unexpected cleared bit at index: " + i, 1, vector.get(i)); } else { - assertEquals("unexpected set bit at index: " + i, 0, val); + assertTrue("unexpected set bit at index: " + i, vector.isNull(i)); } } /* trigger first realloc */ - mutator.setSafeToOne(valueCapacity); + vector.setSafeToOne(valueCapacity); assertEquals(valueCapacity * 2, vector.getValueCapacity()); for (int i = valueCapacity; i < valueCapacity*2; i++) { if ((i & 1) == 1) { - mutator.setToOne(i); + vector.setToOne(i); } } for (int i = 0; i < valueCapacity*2; i++) { - int val = accessor.get(i); if (((i & 1) == 1) || (i == valueCapacity)) { - assertEquals("unexpected cleared bit at index: " + i, 1, val); + assertEquals("unexpected cleared bit at index: " + i, 1, vector.get(i)); } else { - 
assertEquals("unexpected set bit at index: " + i, 0, val); + assertTrue("unexpected set bit at index: " + i, vector.isNull(i)); } } /* trigger second realloc */ - mutator.setSafeToOne(valueCapacity*2); + vector.setSafeToOne(valueCapacity*2); assertEquals(valueCapacity * 4, vector.getValueCapacity()); for (int i = valueCapacity*2; i < valueCapacity*4; i++) { if ((i & 1) == 1) { - mutator.setToOne(i); + vector.setToOne(i); } } for (int i = 0; i < valueCapacity*4; i++) { - int val = accessor.get(i); if (((i & 1) == 1) || (i == valueCapacity) || (i == valueCapacity*2)) { - assertEquals("unexpected cleared bit at index: " + i, 1, val); + assertEquals("unexpected cleared bit at index: " + i, 1, vector.get(i)); } else { - assertEquals("unexpected set bit at index: " + i, 0, val); + assertTrue("unexpected set bit at index: " + i, vector.isNull(i)); } } @@ -305,27 +287,24 @@ public void testReallocAfterVectorTransfer1() { TransferPair transferPair = vector.getTransferPair(allocator); transferPair.transfer(); final BitVector toVector = (BitVector)transferPair.getTo(); - final BitVector.Accessor toAccessor = toVector.getAccessor(); - final BitVector.Mutator toMutator = toVector.getMutator(); assertEquals(valueCapacity * 4, toVector.getValueCapacity()); /* realloc the toVector */ - toMutator.setSafeToOne(valueCapacity * 4); + toVector.setSafeToOne(valueCapacity * 4); for (int i = 0; i < toVector.getValueCapacity(); i++) { - int val = toAccessor.get(i); if (i <= valueCapacity * 4) { if (((i & 1) == 1) || (i == valueCapacity) || (i == valueCapacity*2) || (i == valueCapacity*4)) { - assertEquals("unexpected cleared bit at index: " + i, 1, val); + assertEquals("unexpected cleared bit at index: " + i, 1, toVector.get(i)); } else { - assertEquals("unexpected set bit at index: " + i, 0, val); + assertTrue("unexpected set bit at index: " + i, toVector.isNull(i)); } } else { - assertEquals("unexpected set bit at index: " + i, 0, val); + assertTrue("unexpected set bit at index: " + i, 
toVector.isNull(i)); } } @@ -335,91 +314,86 @@ public void testReallocAfterVectorTransfer1() { @Test public void testReallocAfterVectorTransfer2() { - try (final NullableBitVector vector = new NullableBitVector(EMPTY_SCHEMA_PATH, allocator)) { + try (final BitVector vector = new BitVector(EMPTY_SCHEMA_PATH, allocator)) { vector.allocateNew(4096); int valueCapacity = vector.getValueCapacity(); assertEquals(4096, valueCapacity); - final NullableBitVector.Mutator mutator = vector.getMutator(); - final NullableBitVector.Accessor accessor = vector.getAccessor(); - for (int i = 0; i < valueCapacity; i++) { if ((i & 1) == 1) { - mutator.set(i, 1); + vector.set(i, 1); } } for (int i = 0; i < valueCapacity; i++) { if ((i & 1) == 1) { - assertFalse("unexpected cleared bit at index: " + i, accessor.isNull(i)); + assertFalse("unexpected cleared bit at index: " + i, vector.isNull(i)); } else { - assertTrue("unexpected set bit at index: " + i, accessor.isNull(i)); + assertTrue("unexpected set bit at index: " + i, vector.isNull(i)); } } /* trigger first realloc */ - mutator.setSafe(valueCapacity, 1, 1); + vector.setSafe(valueCapacity, 1, 1); assertEquals(valueCapacity * 2, vector.getValueCapacity()); for (int i = valueCapacity; i < valueCapacity*2; i++) { if ((i & 1) == 1) { - mutator.set(i, 1); + vector.set(i, 1); } } for (int i = 0; i < valueCapacity*2; i++) { if (((i & 1) == 1) || (i == valueCapacity)) { - assertFalse("unexpected cleared bit at index: " + i, accessor.isNull(i)); + assertFalse("unexpected cleared bit at index: " + i, vector.isNull(i)); } else { - assertTrue("unexpected set bit at index: " + i, accessor.isNull(i)); + assertTrue("unexpected set bit at index: " + i, vector.isNull(i)); } } /* trigger second realloc */ - mutator.setSafe(valueCapacity*2, 1, 1); + vector.setSafe(valueCapacity*2, 1, 1); assertEquals(valueCapacity * 4, vector.getValueCapacity()); for (int i = valueCapacity*2; i < valueCapacity*4; i++) { if ((i & 1) == 1) { - mutator.set(i, 1); + 
vector.set(i, 1); } } for (int i = 0; i < valueCapacity*4; i++) { if (((i & 1) == 1) || (i == valueCapacity) || (i == valueCapacity*2)) { - assertFalse("unexpected cleared bit at index: " + i, accessor.isNull(i)); + assertFalse("unexpected cleared bit at index: " + i, vector.isNull(i)); } else { - assertTrue("unexpected set bit at index: " + i, accessor.isNull(i)); + assertTrue("unexpected set bit at index: " + i, vector.isNull(i)); } } /* now transfer the vector */ TransferPair transferPair = vector.getTransferPair(allocator); transferPair.transfer(); - final NullableBitVector toVector = (NullableBitVector)transferPair.getTo(); - final NullableBitVector.Accessor toAccessor = toVector.getAccessor(); - final NullableBitVector.Mutator toMutator = toVector.getMutator(); + final BitVector toVector = (BitVector)transferPair.getTo(); assertEquals(valueCapacity * 4, toVector.getValueCapacity()); /* realloc the toVector */ - toMutator.setSafe(valueCapacity * 4, 1, 1); + toVector.setSafe(valueCapacity * 4, 1, 1); for (int i = 0; i < toVector.getValueCapacity(); i++) { if (i <= valueCapacity * 4) { if (((i & 1) == 1) || (i == valueCapacity) || (i == valueCapacity*2) || (i == valueCapacity*4)) { - assertFalse("unexpected cleared bit at index: " + i, toAccessor.isNull(i)); + assertFalse("unexpected cleared bit at index: " + i, toVector.isNull(i)); } else { - assertTrue("unexpected set bit at index: " + i, toAccessor.isNull(i)); + assertTrue("unexpected set bit at index: " + i, toVector.isNull(i)); } } else { - assertTrue("unexpected set bit at index: " + i, toAccessor.isNull(i)); + assertTrue("unexpected set bit at index: " + i, toVector.isNull(i)); } } @@ -431,81 +405,81 @@ public void testReallocAfterVectorTransfer2() { public void testBitVector() { // Create a new value vector for 1024 integers try (final BitVector vector = new BitVector(EMPTY_SCHEMA_PATH, allocator)) { - final BitVector.Mutator m = vector.getMutator(); vector.allocateNew(1024); - m.setValueCount(1024); + 
vector.setValueCount(1024); // Put and set a few values - m.set(0, 1); - m.set(1, 0); - m.set(100, 0); - m.set(1022, 1); + vector.set(0, 1); + vector.set(1, 0); + vector.set(100, 0); + vector.set(1022, 1); - m.setValueCount(1024); + vector.setValueCount(1024); - final BitVector.Accessor accessor = vector.getAccessor(); - assertEquals(1, accessor.get(0)); - assertEquals(0, accessor.get(1)); - assertEquals(0, accessor.get(100)); - assertEquals(1, accessor.get(1022)); + assertEquals(1, vector.get(0)); + assertEquals(0, vector.get(1)); + assertEquals(0, vector.get(100)); + assertEquals(1, vector.get(1022)); - assertEquals(1022, accessor.getNullCount()); + assertEquals(1020, vector.getNullCount()); // test setting the same value twice - m.set(0, 1); - m.set(0, 1); - m.set(1, 0); - m.set(1, 0); - assertEquals(1, accessor.get(0)); - assertEquals(0, accessor.get(1)); + vector.set(0, 1); + vector.set(0, 1); + vector.set(1, 0); + vector.set(1, 0); + assertEquals(1, vector.get(0)); + assertEquals(0, vector.get(1)); // test toggling the values - m.set(0, 0); - m.set(1, 1); - assertEquals(0, accessor.get(0)); - assertEquals(1, accessor.get(1)); + vector.set(0, 0); + vector.set(1, 1); + assertEquals(0, vector.get(0)); + assertEquals(1, vector.get(1)); // should not change - assertEquals(1022, accessor.getNullCount()); + assertEquals(1020, vector.getNullCount()); - // Ensure unallocated space returns 0 - assertEquals(0, accessor.get(3)); + // Ensure null value + assertTrue(vector.isNull(3)); // unset the previously set bits - m.set(1, 0); - m.set(1022, 0); + vector.setNull(0); + vector.setNull(1); + vector.setNull(100); + vector.setNull(1022); // this should set all the array to 0 - assertEquals(1024, accessor.getNullCount()); + assertEquals(1024, vector.getNullCount()); // set all the array to 1 for (int i = 0; i < 1024; ++i) { - assertEquals(1024 - i, accessor.getNullCount()); - m.set(i, 1); + assertEquals(1024 - i, vector.getNullCount()); + vector.set(i, 1); } - 
assertEquals(0, accessor.getNullCount()); + assertEquals(0, vector.getNullCount()); vector.allocateNew(1015); - m.setValueCount(1015); + vector.setValueCount(1015); // ensure it has been zeroed - assertEquals(1015, accessor.getNullCount()); + assertEquals(1015, vector.getNullCount()); - m.set(0, 1); - m.set(1014, 1); // ensure that the last item of the last byte is allocated + vector.set(0, 1); + vector.set(1014, 1); // ensure that the last item of the last byte is allocated - assertEquals(1013, accessor.getNullCount()); + assertEquals(1013, vector.getNullCount()); vector.zeroVector(); - assertEquals(1015, accessor.getNullCount()); + assertEquals(1015, vector.getNullCount()); // set all the array to 1 for (int i = 0; i < 1015; ++i) { - assertEquals(1015 - i, accessor.getNullCount()); - m.set(i, 1); + assertEquals(1015 - i, vector.getNullCount()); + vector.set(i, 1); } - assertEquals(0, accessor.getNullCount()); + assertEquals(0, vector.getNullCount()); } } @@ -531,15 +505,15 @@ private void validateRange(int length, int start, int count) { try (BitVector bitVector = new BitVector("bits", allocator)) { bitVector.reset(); bitVector.allocateNew(length); - bitVector.getMutator().setRangeToOne(start, count); + bitVector.setRangeToOne(start, count); for (int i = 0; i < start; i++) { - Assert.assertEquals(desc + i, 0, bitVector.getAccessor().get(i)); + Assert.assertTrue(desc + i, bitVector.isNull(i)); } for (int i = start; i < start + count; i++) { - Assert.assertEquals(desc + i, 1, bitVector.getAccessor().get(i)); + Assert.assertEquals(desc + i, 1, bitVector.get(i)); } for (int i = start + count; i < length; i++) { - Assert.assertEquals(desc + i, 0, bitVector.getAccessor().get(i)); + Assert.assertTrue(desc + i, bitVector.isNull(i)); } } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java b/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java index 1a801a63ec688..20f6754be2def 100644 --- 
a/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java @@ -38,11 +38,11 @@ public void testTransferFixedWidth() { BufferAllocator childAllocator1 = allocator.newChildAllocator("child1", 100000, 100000); BufferAllocator childAllocator2 = allocator.newChildAllocator("child2", 100000, 100000); - NullableIntVector v1 = new NullableIntVector("v1", childAllocator1); + IntVector v1 = new IntVector("v1", childAllocator1); v1.allocateNew(); - v1.getMutator().setValueCount(4095); + v1.setValueCount(4095); - NullableIntVector v2 = new NullableIntVector("v2", childAllocator2); + IntVector v2 = new IntVector("v2", childAllocator2); v1.makeTransferPair(v2).transfer(); @@ -58,12 +58,12 @@ public void testTransferVariableidth() { BufferAllocator childAllocator1 = allocator.newChildAllocator("child1", 100000, 100000); BufferAllocator childAllocator2 = allocator.newChildAllocator("child2", 100000, 100000); - NullableVarCharVector v1 = new NullableVarCharVector("v1", childAllocator1); + VarCharVector v1 = new VarCharVector("v1", childAllocator1); v1.allocateNew(); - v1.getMutator().setSafe(4094, "hello world".getBytes(), 0, 11); - v1.getMutator().setValueCount(4001); + v1.setSafe(4094, "hello world".getBytes(), 0, 11); + v1.setValueCount(4001); - NullableVarCharVector v2 = new NullableVarCharVector("v2", childAllocator2); + VarCharVector v2 = new VarCharVector("v2", childAllocator2); v1.makeTransferPair(v2).transfer(); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestCopyFrom.java b/java/vector/src/test/java/org/apache/arrow/vector/TestCopyFrom.java new file mode 100644 index 0000000000000..87ffcafecd198 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestCopyFrom.java @@ -0,0 +1,1037 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.types.Types; +import org.joda.time.Period; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.math.BigDecimal; + +import static org.apache.arrow.vector.TestUtils.newVector; +import static org.junit.Assert.*; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + +/** + * Tested field types: + * + * NullableInt + * NullableBigInt + * NullableFloat4 + * NullableFloat8 + * NullableBit + * NullableDecimal + * NullableIntervalDay + * NullableIntervalYear + * NullableSmallInt + * NullableTinyInt + * NullableVarChar + * NullableTimeMicro + * NullableTimeMilli + * NullableTimeStamp* + */ + +public class TestCopyFrom { + + private final static String EMPTY_SCHEMA_PATH = ""; + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test /* NullableVarChar */ + public void testCopyFromWithNulls() { + try (final VarCharVector vector = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, 
Types.MinorType.VARCHAR, allocator); + final VarCharVector vector2 = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, Types.MinorType.VARCHAR, allocator)) { + + vector.allocateNew(); + int capacity = vector.getValueCapacity(); + assertEquals(4095, capacity); + + for (int i = 0; i < 4095; i++) { + if (i % 3 == 0) { + continue; + } + byte[] b = Integer.toString(i).getBytes(); + vector.setSafe(i, b, 0, b.length); + } + + /* NO reAlloc() should have happened in setSafe() */ + capacity = vector.getValueCapacity(); + assertEquals(4095, capacity); + + vector.setValueCount(4095); + + for (int i = 0; i < 4095; i++) { + if (i % 3 == 0) { + assertNull(vector.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, Integer.toString(i), vector.getObject(i).toString()); + } + } + + vector2.allocateNew(); + capacity = vector2.getValueCapacity(); + assertEquals(4095, capacity); + + for (int i = 0; i < 4095; i++) { + vector2.copyFromSafe(i, i, vector); + if (i % 3 == 0) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, Integer.toString(i), vector2.getObject(i).toString()); + } + } + + /* NO reAlloc() should have happened in copyFrom */ + capacity = vector2.getValueCapacity(); + assertEquals(4095, capacity); + + vector2.setValueCount(4095); + + for (int i = 0; i < 4095; i++) { + if (i % 3 == 0) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, Integer.toString(i), vector2.getObject(i).toString()); + } + } + } + } + + @Test /* NullableVarChar */ + public void testCopyFromWithNulls1() { + try (final VarCharVector vector = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, Types.MinorType.VARCHAR, allocator); + final VarCharVector vector2 = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, Types.MinorType.VARCHAR, allocator)) { + + vector.allocateNew(); + int capacity = vector.getValueCapacity(); + assertEquals(4095, capacity); + + for (int i = 0; i < 4095; i++) { + if 
(i % 3 == 0) { + continue; + } + byte[] b = Integer.toString(i).getBytes(); + vector.setSafe(i, b, 0, b.length); + } + + /* NO reAlloc() should have happened in setSafe() */ + capacity = vector.getValueCapacity(); + assertEquals(4095, capacity); + + vector.setValueCount(4095); + + for (int i = 0; i < 4095; i++) { + if (i % 3 == 0) { + assertNull(vector.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, Integer.toString(i), vector.getObject(i).toString()); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(1024 * 10, 1024); + + capacity = vector2.getValueCapacity(); + assertEquals(1024, capacity); + + for (int i = 0; i < 4095; i++) { + vector2.copyFromSafe(i, i, vector); + if (i % 3 == 0) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, Integer.toString(i), vector2.getObject(i).toString()); + } + } + + /* 2 reAllocs should have happened in copyFromSafe() */ + capacity = vector2.getValueCapacity(); + assertEquals(4096, capacity); + + vector2.setValueCount(4095); + + for (int i = 0; i < 4095; i++) { + if (i % 3 == 0) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, Integer.toString(i), vector2.getObject(i).toString()); + } + } + } + } + + @Test /* IntVector */ + public void testCopyFromWithNulls2() { + try (final IntVector vector1 = new IntVector(EMPTY_SCHEMA_PATH, allocator); + final IntVector vector2 = new IntVector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(0, vector1.getValueCount()); + + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + continue; + } + vector1.setSafe(i, 1000 + i); + } + + vector1.setValueCount(4096); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(4096, 
vector1.getValueCount()); + + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + assertNull(vector1.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, 1000 + i, vector1.get(i)); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(1024); + assertEquals(1024, vector2.getValueCapacity()); + + for (int i = 0; i < 4096; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertEquals(4096, vector2.getValueCapacity()); + vector2.setValueCount(8192); + /* setValueCount() should have done another realloc */ + assertEquals(8192, vector2.getValueCount()); + assertEquals(8192, vector2.getValueCapacity()); + + /* check vector data after copy and realloc */ + for (int i = 0; i < 8192; i++) { + if (((i&1) == 0) || (i >= 4096)) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, 1000 + i, vector2.get(i)); + } + } + } + } + + @Test /* BigIntVector */ + public void testCopyFromWithNulls3() { + try (final BigIntVector vector1 = new BigIntVector(EMPTY_SCHEMA_PATH, allocator); + final BigIntVector vector2 = new BigIntVector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(0, vector1.getValueCount()); + + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + continue; + } + vector1.setSafe(i, 10000000000L + (long)i); + } + + vector1.setValueCount(4096); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(4096, vector1.getValueCount()); + + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + assertNull(vector1.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, + 10000000000L + (long)i, vector1.get(i)); + } + } + + /* set lesser initial capacity than actually needed + * to trigger 
reallocs in copyFromSafe() + */ + vector2.allocateNew(1024); + assertEquals(1024, vector2.getValueCapacity()); + + for (int i = 0; i < 4096; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertEquals(4096, vector2.getValueCapacity()); + vector2.setValueCount(8192); + /* setValueCount() should have done another realloc */ + assertEquals(8192, vector2.getValueCount()); + assertEquals(8192, vector2.getValueCapacity()); + + /* check vector data after copy and realloc */ + for (int i = 0; i < 8192; i++) { + if (((i&1) == 0) || (i >= 4096)) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, + 10000000000L + (long)i, vector2.get(i)); + } + } + } + } + + @Test /* BitVector */ + public void testCopyFromWithNulls4() { + try (final BitVector vector1 = new BitVector(EMPTY_SCHEMA_PATH, allocator); + final BitVector vector2 = new BitVector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(0, vector1.getValueCount()); + + int counter = 0; + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + continue; + } + if ((counter&1) == 0) { + vector1.setSafe(i, 1); + } else { + vector1.setSafe(i, 0); + } + counter++; + } + + vector1.setValueCount(4096); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(4096, vector1.getValueCount()); + + counter = 0; + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + assertNull(vector1.getObject(i)); + } else { + if ((counter&1) == 0) { + assertTrue(vector1.getObject(i)); + } else { + assertFalse(vector1.getObject(i)); + } + counter++; + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(1024); + assertEquals(1024, vector2.getValueCapacity()); + + for (int i = 0; i < 4096; i++) { + 
vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertEquals(4096, vector2.getValueCapacity()); + vector2.setValueCount(8192); + /* setValueCount() should have done another realloc */ + assertEquals(8192, vector2.getValueCount()); + assertEquals(8192, vector2.getValueCapacity()); + + /* check vector data after copy and realloc */ + counter = 0; + for (int i = 0; i < 8192; i++) { + if (((i&1) == 0) || (i >= 4096)) { + assertNull(vector2.getObject(i)); + } else { + if ((counter&1) == 0) { + assertTrue(vector2.getObject(i)); + } else { + assertFalse(vector2.getObject(i)); + } + counter++; + } + } + } + } + + @Test /* Float4Vector */ + public void testCopyFromWithNulls5() { + try (final Float4Vector vector1 = new Float4Vector(EMPTY_SCHEMA_PATH, allocator); + final Float4Vector vector2 = new Float4Vector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(0, vector1.getValueCount()); + + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + continue; + } + vector1.setSafe(i, 100.25f + (float)i); + } + + vector1.setValueCount(4096); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(4096, vector1.getValueCount()); + + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + assertNull(vector1.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, + 100.25f + (float)i, vector1.get(i), 0); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(1024); + assertEquals(1024, vector2.getValueCapacity()); + + for (int i = 0; i < 4096; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertEquals(4096, vector2.getValueCapacity()); + vector2.setValueCount(8192); + /* setValueCount() should have done another 
realloc */ + assertEquals(8192, vector2.getValueCount()); + assertEquals(8192, vector2.getValueCapacity()); + + /* check vector data after copy and realloc */ + for (int i = 0; i < 8192; i++) { + if (((i&1) == 0) || (i >= 4096)) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, + 100.25f + i*1.0f, vector2.get(i), 0); + } + } + } + } + + @Test /* Float8Vector */ + public void testCopyFromWithNulls6() { + try (final Float8Vector vector1 = new Float8Vector(EMPTY_SCHEMA_PATH, allocator); + final Float8Vector vector2 = new Float8Vector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(0, vector1.getValueCount()); + + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + continue; + } + vector1.setSafe(i, 123456.7865 + (double) i); + } + + vector1.setValueCount(4096); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(4096, vector1.getValueCount()); + + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + assertNull(vector1.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, + 123456.7865 + (double) i, vector1.get(i), 0); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(1024); + assertEquals(1024, vector2.getValueCapacity()); + + for (int i = 0; i < 4096; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertEquals(4096, vector2.getValueCapacity()); + vector2.setValueCount(8192); + /* setValueCount() should have done another realloc */ + assertEquals(8192, vector2.getValueCount()); + assertEquals(8192, vector2.getValueCapacity()); + + /* check vector data after copy and realloc */ + for (int i = 0; i < 8192; i++) { + if (((i&1) == 0) || (i >= 4096)) { + assertNull(vector2.getObject(i)); + } 
else { + assertEquals("unexpected value at index: " + i, + 123456.7865 + (double) i, vector2.get(i), 0); + } + } + } + } + + @Test /* IntervalDayVector */ + public void testCopyFromWithNulls7() { + try (final IntervalDayVector vector1 = new IntervalDayVector(EMPTY_SCHEMA_PATH, allocator); + final IntervalDayVector vector2 = new IntervalDayVector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(0, vector1.getValueCount()); + + final int days = 10; + final int milliseconds = 10000; + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + continue; + } + vector1.setSafe(i, days + i, milliseconds + i); + } + + vector1.setValueCount(4096); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(4096, vector1.getValueCount()); + + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + assertNull(vector1.getObject(i)); + } else { + final Period p = vector1.getObject(i); + assertEquals(days + i, p.getDays()); + assertEquals(milliseconds + i, p.getMillis()); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(1024); + assertEquals(1024, vector2.getValueCapacity()); + + for (int i = 0; i < 4096; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertEquals(4096, vector2.getValueCapacity()); + vector2.setValueCount(8192); + /* setValueCount() should have done another realloc */ + assertEquals(8192, vector2.getValueCount()); + assertEquals(8192, vector2.getValueCapacity()); + + /* check vector data after copy and realloc */ + for (int i = 0; i < 8192; i++) { + if (((i&1) == 0) || (i >= 4096)) { + assertNull(vector2.getObject(i)); + } else { + final Period p = vector2.getObject(i); + assertEquals(days + i, p.getDays()); + assertEquals(milliseconds + i, p.getMillis()); + } + } + } 
+ } + + @Test /* IntervalYearVector */ + public void testCopyFromWithNulls8() { + try (final IntervalYearVector vector1 = new IntervalYearVector(EMPTY_SCHEMA_PATH, allocator); + final IntervalYearVector vector2 = new IntervalYearVector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(0, vector1.getValueCount()); + + final int interval = 30; /* 2 years 6 months */ + final Period[] periods = new Period[4096]; + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + continue; + } + vector1.setSafe(i, interval + i); + final Period p = new Period(); + final int years = (interval + i) / org.apache.arrow.vector.util.DateUtility.yearsToMonths; + final int months = (interval + i) % org.apache.arrow.vector.util.DateUtility.yearsToMonths; + periods[i] = p.plusYears(years).plusMonths(months);; + } + + vector1.setValueCount(4096); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(4096, vector1.getValueCount()); + + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + assertNull(vector1.getObject(i)); + } else { + final Period p = vector1.getObject(i); + assertEquals(interval + i, vector1.get(i)); + assertEquals(periods[i], p); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(1024); + assertEquals(1024, vector2.getValueCapacity()); + + for (int i = 0; i < 4096; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertEquals(4096, vector2.getValueCapacity()); + vector2.setValueCount(8192); + /* setValueCount() should have done another realloc */ + assertEquals(8192, vector2.getValueCount()); + assertEquals(8192, vector2.getValueCapacity()); + + /* check vector data after copy and realloc */ + for (int i = 0; i < 8192; i++) { + if (((i&1) == 0) || (i >= 4096)) { + 
assertNull(vector2.getObject(i)); + } else { + final Period p = vector2.getObject(i); + assertEquals(periods[i], p); + } + } + } + } + + @Test /* SmallIntVector */ + public void testCopyFromWithNulls9() { + try (final SmallIntVector vector1 = new SmallIntVector(EMPTY_SCHEMA_PATH, allocator); + final SmallIntVector vector2 = new SmallIntVector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(0, vector1.getValueCount()); + + final short val = 1000; + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + continue; + } + vector1.setSafe(i, val + (short)i); + } + + vector1.setValueCount(4096); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(4096, vector1.getValueCount()); + + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + assertNull(vector1.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, + val + (short)i, vector1.get(i)); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(1024); + assertEquals(1024, vector2.getValueCapacity()); + + for (int i = 0; i < 4096; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertEquals(4096, vector2.getValueCapacity()); + vector2.setValueCount(8192); + /* setValueCount() should have done another realloc */ + assertEquals(8192, vector2.getValueCount()); + assertEquals(8192, vector2.getValueCapacity()); + + /* check vector data after copy and realloc */ + for (int i = 0; i < 8192; i++) { + if (((i&1) == 0) || (i >= 4096)) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, + val + (short)i, vector2.get(i)); + } + } + } + } + + @Test /* TimeMicroVector */ + public void testCopyFromWithNulls10() { + try (final TimeMicroVector vector1 = new 
TimeMicroVector(EMPTY_SCHEMA_PATH, allocator); + final TimeMicroVector vector2 = new TimeMicroVector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(0, vector1.getValueCount()); + + final long val = 100485765432L; + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + continue; + } + vector1.setSafe(i, val + (long)i); + } + + vector1.setValueCount(4096); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(4096, vector1.getValueCount()); + + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + assertNull(vector1.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, + val + (long)i, vector1.get(i)); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(1024); + assertEquals(1024, vector2.getValueCapacity()); + + for (int i = 0; i < 4096; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertEquals(4096, vector2.getValueCapacity()); + vector2.setValueCount(8192); + /* setValueCount() should have done another realloc */ + assertEquals(8192, vector2.getValueCount()); + assertEquals(8192, vector2.getValueCapacity()); + + /* check vector data after copy and realloc */ + for (int i = 0; i < 8192; i++) { + if (((i&1) == 0) || (i >= 4096)) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, + val + (long) i, vector2.get(i)); + } + } + } + } + + @Test /* TimeMilliVector */ + public void testCopyFromWithNulls11() { + try (final TimeMilliVector vector1 = new TimeMilliVector(EMPTY_SCHEMA_PATH, allocator); + final TimeMilliVector vector2 = new TimeMilliVector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(0, 
vector1.getValueCount()); + + final int val = 1000; + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + continue; + } + vector1.setSafe(i, val + i); + } + + vector1.setValueCount(4096); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(4096, vector1.getValueCount()); + + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + assertNull(vector1.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, + val + i, vector1.get(i)); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(1024); + assertEquals(1024, vector2.getValueCapacity()); + + for (int i = 0; i < 4096; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertEquals(4096, vector2.getValueCapacity()); + vector2.setValueCount(8192); + /* setValueCount() should have done another realloc */ + assertEquals(8192, vector2.getValueCount()); + assertEquals(8192, vector2.getValueCapacity()); + + /* check vector data after copy and realloc */ + for (int i = 0; i < 8192; i++) { + if (((i&1) == 0) || (i >= 4096)) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, + val + i, vector2.get(i)); + } + } + } + } + + @Test /* TinyIntVector */ + public void testCopyFromWithNulls12() { + try (final TinyIntVector vector1 = new TinyIntVector(EMPTY_SCHEMA_PATH, allocator); + final TinyIntVector vector2 = new TinyIntVector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(0, vector1.getValueCount()); + + byte val = -128; + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + continue; + } + vector1.setSafe(i, val); + val++; + } + + vector1.setValueCount(4096); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(4096, 
vector1.getValueCapacity()); + assertEquals(4096, vector1.getValueCount()); + + val = -128; + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + assertNull(vector1.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, val, vector1.get(i)); + val++; + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(1024); + assertEquals(1024, vector2.getValueCapacity()); + + for (int i = 0; i < 4096; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertEquals(4096, vector2.getValueCapacity()); + vector2.setValueCount(8192); + /* setValueCount() should have done another realloc */ + assertEquals(8192, vector2.getValueCount()); + assertEquals(8192, vector2.getValueCapacity()); + + /* check vector data after copy and realloc */ + val = -128; + for (int i = 0; i < 8192; i++) { + if (((i&1) == 0) || (i >= 4096)) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, val, vector2.get(i)); + val++; + } + } + } + } + + @Test /* DecimalVector */ + public void testCopyFromWithNulls13() { + try (final DecimalVector vector1 = new DecimalVector(EMPTY_SCHEMA_PATH, allocator, 30, 16); + final DecimalVector vector2 = new DecimalVector(EMPTY_SCHEMA_PATH, allocator, 30, 16)) { + + vector1.allocateNew(); + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(0, vector1.getValueCount()); + + final double baseValue = 104567897654.876543654; + final BigDecimal[] decimals = new BigDecimal[4096]; + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + continue; + } + BigDecimal decimal = new BigDecimal(baseValue + (double)i); + vector1.setSafe(i, decimal); + decimals[i] = decimal; + } + + vector1.setValueCount(4096); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(4096, 
vector1.getValueCount()); + + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + assertNull(vector1.getObject(i)); + } else { + final BigDecimal decimal = vector1.getObject(i); + assertEquals(decimals[i], decimal); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(1024); + assertEquals(1024, vector2.getValueCapacity()); + + for (int i = 0; i < 4096; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertEquals(4096, vector2.getValueCapacity()); + vector2.setValueCount(8192); + /* setValueCount() should have done another realloc */ + assertEquals(8192, vector2.getValueCount()); + assertEquals(8192, vector2.getValueCapacity()); + + /* check vector data after copy and realloc */ + for (int i = 0; i < 8192; i++) { + if (((i&1) == 0) || (i >= 4096)) { + assertNull(vector2.getObject(i)); + } else { + final BigDecimal decimal = vector2.getObject(i); + assertEquals(decimals[i], decimal); + } + } + } + } + + @Test /* TimeStampVector */ + public void testCopyFromWithNulls14() { + try (final TimeStampVector vector1 = new TimeStampMicroVector(EMPTY_SCHEMA_PATH, allocator); + final TimeStampVector vector2 = new TimeStampMicroVector(EMPTY_SCHEMA_PATH, allocator)) { + + vector1.allocateNew(); + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(0, vector1.getValueCount()); + + final long val = 20145678912L; + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + continue; + } + vector1.setSafe(i, val + (long)i); + } + + vector1.setValueCount(4096); + + /* No realloc should have happened in setSafe or + * setValueCount + */ + assertEquals(4096, vector1.getValueCapacity()); + assertEquals(4096, vector1.getValueCount()); + + for (int i = 0; i < 4096; i++) { + if ((i&1) == 0) { + assertNull(vector1.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, + val + (long)i, vector1.get(i)); + } + } + + /* set 
lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(1024); + assertEquals(1024, vector2.getValueCapacity()); + + for (int i = 0; i < 4096; i++) { + vector2.copyFromSafe(i, i, vector1); + } + + /* 2 realloc should have happened in copyFromSafe() */ + assertEquals(4096, vector2.getValueCapacity()); + vector2.setValueCount(8192); + /* setValueCount() should have done another realloc */ + assertEquals(8192, vector2.getValueCount()); + assertEquals(8192, vector2.getValueCapacity()); + + /* check vector data after copy and realloc */ + for (int i = 0; i < 8192; i++) { + if (((i&1) == 0) || (i >= 4096)) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, + val + (long) i, vector2.get(i)); + } + } + } + } +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java index 56d2293276404..8c86452fcc3bf 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java @@ -60,9 +60,9 @@ public void terminate() throws Exception { @Test public void testValuesWriteRead() { - try (NullableDecimalVector decimalVector = TestUtils.newVector(NullableDecimalVector.class, "decimal", new ArrowType.Decimal(10, scale), allocator);) { + try (DecimalVector decimalVector = TestUtils.newVector(DecimalVector.class, "decimal", new ArrowType.Decimal(10, scale), allocator);) { - try (NullableDecimalVector oldConstructor = new NullableDecimalVector("decimal", allocator, 10, scale);) { + try (DecimalVector oldConstructor = new DecimalVector("decimal", allocator, 10, scale);) { assertEquals(decimalVector.getField().getType(), oldConstructor.getField().getType()); } @@ -71,28 +71,28 @@ public void testValuesWriteRead() { for (int i = 0; i < intValues.length; i++) { BigDecimal decimal = new 
BigDecimal(BigInteger.valueOf(intValues[i]), scale); values[i] = decimal; - decimalVector.getMutator().setSafe(i, decimal); + decimalVector.setSafe(i, decimal); } - decimalVector.getMutator().setValueCount(intValues.length); + decimalVector.setValueCount(intValues.length); for (int i = 0; i < intValues.length; i++) { - BigDecimal value = decimalVector.getAccessor().getObject(i); - assertEquals(values[i], value); + BigDecimal value = decimalVector.getObject(i); + assertEquals("unexpected data at index: " + i, values[i], value); } } } @Test public void testBigDecimalDifferentScaleAndPrecision() { - try (NullableDecimalVector decimalVector = TestUtils.newVector(NullableDecimalVector.class, "decimal", new ArrowType.Decimal(4, 2), allocator);) { + try (DecimalVector decimalVector = TestUtils.newVector(DecimalVector.class, "decimal", new ArrowType.Decimal(4, 2), allocator);) { decimalVector.allocateNew(); // test BigDecimal with different scale boolean hasError = false; try { BigDecimal decimal = new BigDecimal(BigInteger.valueOf(0), 3); - decimalVector.getMutator().setSafe(0, decimal); + decimalVector.setSafe(0, decimal); } catch (UnsupportedOperationException ue) { hasError = true; } finally { @@ -103,7 +103,7 @@ public void testBigDecimalDifferentScaleAndPrecision() { hasError = false; try { BigDecimal decimal = new BigDecimal(BigInteger.valueOf(12345), 2); - decimalVector.getMutator().setSafe(0, decimal); + decimalVector.setSafe(0, decimal); } catch (UnsupportedOperationException ue) { hasError = true; } finally { @@ -111,4 +111,83 @@ public void testBigDecimalDifferentScaleAndPrecision() { } } } + + @Test + public void testWriteBigEndian() { + try (DecimalVector decimalVector = TestUtils.newVector(DecimalVector.class, "decimal", new ArrowType.Decimal(38, 9), allocator);) { + decimalVector.allocateNew(); + BigDecimal decimal1 = new BigDecimal("123456789.000000000"); + BigDecimal decimal2 = new BigDecimal("11.123456789"); + BigDecimal decimal3 = new 
BigDecimal("1.000000000"); + BigDecimal decimal4 = new BigDecimal("0.111111111"); + BigDecimal decimal5 = new BigDecimal("987654321.123456789"); + BigDecimal decimal6 = new BigDecimal("222222222222.222222222"); + BigDecimal decimal7 = new BigDecimal("7777777777777.666666667"); + BigDecimal decimal8 = new BigDecimal("1212121212.343434343"); + + byte[] decimalValue1 = decimal1.unscaledValue().toByteArray(); + byte[] decimalValue2 = decimal2.unscaledValue().toByteArray(); + byte[] decimalValue3 = decimal3.unscaledValue().toByteArray(); + byte[] decimalValue4 = decimal4.unscaledValue().toByteArray(); + byte[] decimalValue5 = decimal5.unscaledValue().toByteArray(); + byte[] decimalValue6 = decimal6.unscaledValue().toByteArray(); + byte[] decimalValue7 = decimal7.unscaledValue().toByteArray(); + byte[] decimalValue8 = decimal8.unscaledValue().toByteArray(); + + decimalVector.setBigEndian(0, decimalValue1); + decimalVector.setBigEndian(1, decimalValue2); + decimalVector.setBigEndian(2, decimalValue3); + decimalVector.setBigEndian(3, decimalValue4); + decimalVector.setBigEndian(4, decimalValue5); + decimalVector.setBigEndian(5, decimalValue6); + decimalVector.setBigEndian(6, decimalValue7); + decimalVector.setBigEndian(7, decimalValue8); + + decimalVector.setValueCount(8); + assertEquals(8, decimalVector.getValueCount()); + assertEquals(decimal1, decimalVector.getObject(0)); + assertEquals(decimal2, decimalVector.getObject(1)); + assertEquals(decimal3, decimalVector.getObject(2)); + assertEquals(decimal4, decimalVector.getObject(3)); + assertEquals(decimal5, decimalVector.getObject(4)); + assertEquals(decimal6, decimalVector.getObject(5)); + assertEquals(decimal7, decimalVector.getObject(6)); + assertEquals(decimal8, decimalVector.getObject(7)); + } + } + + @Test + public void testBigDecimalReadWrite() { + try (DecimalVector decimalVector = TestUtils.newVector(DecimalVector.class, "decimal", new ArrowType.Decimal(38, 9), allocator);) { + decimalVector.allocateNew(); + 
BigDecimal decimal1 = new BigDecimal("123456789.000000000"); + BigDecimal decimal2 = new BigDecimal("11.123456789"); + BigDecimal decimal3 = new BigDecimal("1.000000000"); + BigDecimal decimal4 = new BigDecimal("-0.111111111"); + BigDecimal decimal5 = new BigDecimal("-987654321.123456789"); + BigDecimal decimal6 = new BigDecimal("-222222222222.222222222"); + BigDecimal decimal7 = new BigDecimal("7777777777777.666666667"); + BigDecimal decimal8 = new BigDecimal("1212121212.343434343"); + + decimalVector.set(0, decimal1); + decimalVector.set(1, decimal2); + decimalVector.set(2, decimal3); + decimalVector.set(3, decimal4); + decimalVector.set(4, decimal5); + decimalVector.set(5, decimal6); + decimalVector.set(6, decimal7); + decimalVector.set(7, decimal8); + + decimalVector.setValueCount(8); + assertEquals(8, decimalVector.getValueCount()); + assertEquals(decimal1, decimalVector.getObject(0)); + assertEquals(decimal2, decimalVector.getObject(1)); + assertEquals(decimal3, decimalVector.getObject(2)); + assertEquals(decimal4, decimalVector.getObject(3)); + assertEquals(decimal5, decimalVector.getObject(4)); + assertEquals(decimal6, decimalVector.getObject(5)); + assertEquals(decimal7, decimalVector.getObject(6)); + assertEquals(decimal8, decimalVector.getObject(7)); + } + } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java index f8c16e7fc8176..ba149bcb8cb9f 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java @@ -18,7 +18,7 @@ package org.apache.arrow.vector; -import static org.apache.arrow.vector.TestUtils.newNullableVarCharVector; +import static org.apache.arrow.vector.TestUtils.newVarCharVector; import static org.junit.Assert.assertEquals; import java.nio.charset.StandardCharsets; @@ -54,47 +54,45 @@ public void terminate() throws 
Exception { @Test public void testEncodeStrings() { // Create a new value vector - try (final NullableVarCharVector vector = newNullableVarCharVector("foo", allocator); - final NullableVarCharVector dictionaryVector = newNullableVarCharVector("dict", allocator);) { - final NullableVarCharVector.Mutator m = vector.getMutator(); + try (final VarCharVector vector = newVarCharVector("foo", allocator); + final VarCharVector dictionaryVector = newVarCharVector("dict", allocator);) { vector.allocateNew(512, 5); // set some values - m.setSafe(0, zero, 0, zero.length); - m.setSafe(1, one, 0, one.length); - m.setSafe(2, one, 0, one.length); - m.setSafe(3, two, 0, two.length); - m.setSafe(4, zero, 0, zero.length); - m.setValueCount(5); + vector.setSafe(0, zero, 0, zero.length); + vector.setSafe(1, one, 0, one.length); + vector.setSafe(2, one, 0, one.length); + vector.setSafe(3, two, 0, two.length); + vector.setSafe(4, zero, 0, zero.length); + vector.setValueCount(5); // set some dictionary values - final NullableVarCharVector.Mutator m2 = dictionaryVector.getMutator(); dictionaryVector.allocateNew(512, 3); - m2.setSafe(0, zero, 0, zero.length); - m2.setSafe(1, one, 0, one.length); - m2.setSafe(2, two, 0, two.length); - m2.setValueCount(3); + dictionaryVector.setSafe(0, zero, 0, zero.length); + dictionaryVector.setSafe(1, one, 0, one.length); + dictionaryVector.setSafe(2, two, 0, two.length); + dictionaryVector.setValueCount(3); Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null)); try (final ValueVector encoded = (FieldVector) DictionaryEncoder.encode(vector, dictionary)) { // verify indices - assertEquals(NullableIntVector.class, encoded.getClass()); + assertEquals(IntVector.class, encoded.getClass()); - NullableIntVector.Accessor indexAccessor = ((NullableIntVector) encoded).getAccessor(); - assertEquals(5, indexAccessor.getValueCount()); - assertEquals(0, indexAccessor.get(0)); - assertEquals(1, indexAccessor.get(1)); - 
assertEquals(1, indexAccessor.get(2)); - assertEquals(2, indexAccessor.get(3)); - assertEquals(0, indexAccessor.get(4)); + IntVector index = ((IntVector)encoded); + assertEquals(5, index.getValueCount()); + assertEquals(0, index.get(0)); + assertEquals(1, index.get(1)); + assertEquals(1, index.get(2)); + assertEquals(2, index.get(3)); + assertEquals(0, index.get(4)); // now run through the decoder and verify we get the original back try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) { assertEquals(vector.getClass(), decoded.getClass()); - assertEquals(vector.getAccessor().getValueCount(), decoded.getAccessor().getValueCount()); + assertEquals(vector.getValueCount(), ((VarCharVector)decoded).getValueCount()); for (int i = 0; i < 5; i++) { - assertEquals(vector.getAccessor().getObject(i), decoded.getAccessor().getObject(i)); + assertEquals(vector.getObject(i), ((VarCharVector)decoded).getObject(i)); } } } @@ -104,43 +102,42 @@ public void testEncodeStrings() { @Test public void testEncodeLargeVector() { // Create a new value vector - try (final NullableVarCharVector vector = newNullableVarCharVector("foo", allocator); - final NullableVarCharVector dictionaryVector = newNullableVarCharVector("dict", allocator);) { - final NullableVarCharVector.Mutator m = vector.getMutator(); + try (final VarCharVector vector = newVarCharVector("foo", allocator); + final VarCharVector dictionaryVector = newVarCharVector("dict", allocator);) { vector.allocateNew(); int count = 10000; for (int i = 0; i < 10000; ++i) { - vector.getMutator().setSafe(i, data[i % 3], 0, data[i % 3].length); + vector.setSafe(i, data[i % 3], 0, data[i % 3].length); } - vector.getMutator().setValueCount(count); + vector.setValueCount(count); dictionaryVector.allocateNew(512, 3); - dictionaryVector.getMutator().setSafe(0, zero, 0, zero.length); - dictionaryVector.getMutator().setSafe(1, one, 0, one.length); - dictionaryVector.getMutator().setSafe(2, two, 0, two.length); - 
dictionaryVector.getMutator().setValueCount(3); + dictionaryVector.setSafe(0, zero, 0, zero.length); + dictionaryVector.setSafe(1, one, 0, one.length); + dictionaryVector.setSafe(2, two, 0, two.length); + dictionaryVector.setValueCount(3); Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null)); try (final ValueVector encoded = (FieldVector) DictionaryEncoder.encode(vector, dictionary)) { // verify indices - assertEquals(NullableIntVector.class, encoded.getClass()); + assertEquals(IntVector.class, encoded.getClass()); - NullableIntVector.Accessor indexAccessor = ((NullableIntVector) encoded).getAccessor(); - assertEquals(count, indexAccessor.getValueCount()); + IntVector index = ((IntVector) encoded); + assertEquals(count, index.getValueCount()); for (int i = 0; i < count; ++i) { - assertEquals(i % 3, indexAccessor.get(i)); + assertEquals(i % 3, index.get(i)); } // now run through the decoder and verify we get the original back try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) { assertEquals(vector.getClass(), decoded.getClass()); - assertEquals(vector.getAccessor().getValueCount(), decoded.getAccessor().getValueCount()); + assertEquals(vector.getValueCount(), decoded.getValueCount()); for (int i = 0; i < count; ++i) { - assertEquals(vector.getAccessor().getObject(i), decoded.getAccessor().getObject(i)); + assertEquals(vector.getObject(i), decoded.getObject(i)); } } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java index 5677f2566797a..50438ce116dfd 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java @@ -26,6 +26,7 @@ import org.apache.arrow.vector.complex.impl.UnionFixedSizeListReader; import org.apache.arrow.vector.complex.impl.UnionListReader; import 
org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.FieldType; @@ -52,16 +53,15 @@ public void terminate() throws Exception { @Test public void testIntType() { try (FixedSizeListVector vector = FixedSizeListVector.empty("list", 2, allocator)) { - NullableIntVector nested = (NullableIntVector) vector.addOrGetVector(FieldType.nullable(MinorType.INT.getType())).getVector(); - NullableIntVector.Mutator mutator = nested.getMutator(); + IntVector nested = (IntVector) vector.addOrGetVector(FieldType.nullable(MinorType.INT.getType())).getVector(); vector.allocateNew(); for (int i = 0; i < 10; i++) { - vector.getMutator().setNotNull(i); - mutator.set(i * 2, i); - mutator.set(i * 2 + 1, i + 10); + vector.setNotNull(i); + nested.set(i * 2, i); + nested.set(i * 2 + 1, i + 10); } - vector.getMutator().setValueCount(10); + vector.setValueCount(10); UnionFixedSizeListReader reader = vector.getReader(); for (int i = 0; i < 10; i++) { @@ -80,18 +80,17 @@ public void testIntType() { @Test public void testFloatTypeNullable() { try (FixedSizeListVector vector = FixedSizeListVector.empty("list", 2, allocator)) { - NullableFloat4Vector nested = (NullableFloat4Vector) vector.addOrGetVector(FieldType.nullable(MinorType.FLOAT4.getType())).getVector(); - NullableFloat4Vector.Mutator mutator = nested.getMutator(); + Float4Vector nested = (Float4Vector) vector.addOrGetVector(FieldType.nullable(MinorType.FLOAT4.getType())).getVector(); vector.allocateNew(); for (int i = 0; i < 10; i++) { if (i % 2 == 0) { - vector.getMutator().setNotNull(i); - mutator.set(i * 2, i + 0.1f); - mutator.set(i * 2 + 1, i + 10.1f); + vector.setNotNull(i); + nested.set(i * 2, i + 0.1f); + nested.set(i * 2 + 1, i + 10.1f); } } - vector.getMutator().setValueCount(10); + vector.setValueCount(10); UnionFixedSizeListReader reader = 
vector.getReader(); for (int i = 0; i < 10; i++) { @@ -115,24 +114,22 @@ public void testFloatTypeNullable() { @Test public void testNestedInList() { try (ListVector vector = ListVector.empty("list", allocator)) { - ListVector.Mutator mutator = vector.getMutator(); FixedSizeListVector tuples = (FixedSizeListVector) vector.addOrGetVector(FieldType.nullable(new ArrowType.FixedSizeList(2))).getVector(); - FixedSizeListVector.Mutator tupleMutator = tuples.getMutator(); - NullableIntVector.Mutator innerMutator = (NullableIntVector.Mutator) tuples.addOrGetVector(FieldType.nullable(MinorType.INT.getType())).getVector().getMutator(); + IntVector innerVector = (IntVector) tuples.addOrGetVector(FieldType.nullable(MinorType.INT.getType())).getVector(); vector.allocateNew(); for (int i = 0; i < 10; i++) { if (i % 2 == 0) { - int position = mutator.startNewValue(i); + int position = vector.startNewValue(i); for (int j = 0; j < i % 7; j++) { - tupleMutator.setNotNull(position + j); - innerMutator.set((position + j) * 2, j); - innerMutator.set((position + j) * 2 + 1, j + 1); + tuples.setNotNull(position + j); + innerVector.set((position + j) * 2, j); + innerVector.set((position + j) * 2 + 1, j + 1); } - mutator.endValue(i, i % 7); + vector.endValue(i, i % 7); } } - mutator.setValueCount(10); + vector.setValueCount(10); UnionListReader reader = vector.getReader(); for (int i = 0; i < 10; i++) { @@ -160,25 +157,25 @@ public void testNestedInList() { public void testTransferPair() { try (FixedSizeListVector from = new FixedSizeListVector("from", allocator, 2, null, null); FixedSizeListVector to = new FixedSizeListVector("to", allocator, 2, null, null)) { - NullableFloat4Vector nested = (NullableFloat4Vector) from.addOrGetVector(FieldType.nullable(MinorType.FLOAT4.getType())).getVector(); - NullableFloat4Vector.Mutator mutator = nested.getMutator(); + Float4Vector nested = (Float4Vector) from.addOrGetVector(FieldType.nullable(MinorType.FLOAT4.getType())).getVector(); 
from.allocateNew(); for (int i = 0; i < 10; i++) { if (i % 2 == 0) { - from.getMutator().setNotNull(i); - mutator.set(i * 2, i + 0.1f); - mutator.set(i * 2 + 1, i + 10.1f); + from.setNotNull(i); + nested.set(i * 2, i + 0.1f); + nested.set(i * 2 + 1, i + 10.1f); } } - from.getMutator().setValueCount(10); + from.setValueCount(10); TransferPair pair = from.makeTransferPair(to); pair.copyValueSafe(0, 1); pair.copyValueSafe(2, 2); to.copyFromSafe(4, 3, from); - to.getMutator().setValueCount(10); + + to.setValueCount(10); UnionFixedSizeListReader reader = to.getReader(); @@ -220,4 +217,17 @@ public void testTransferPair() { } } } + + @Test + public void testConsistentChildName() throws Exception { + try (FixedSizeListVector listVector = FixedSizeListVector.empty("sourceVector", 2, allocator)) { + String emptyListStr = listVector.getField().toString(); + Assert.assertTrue(emptyListStr.contains(ListVector.DATA_VECTOR_NAME)); + + listVector.addOrGetVector(FieldType.nullable(Types.MinorType.INT.getType())); + String emptyVectorStr = listVector.getField().toString(); + Assert.assertTrue(emptyVectorStr.contains(ListVector.DATA_VECTOR_NAME)); + } + } + } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java index 1c9b574998018..e2023f4461879 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java @@ -26,12 +26,9 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.impl.UnionListWriter; -import org.apache.arrow.vector.complex.impl.UnionListReader; import org.apache.arrow.vector.complex.reader.FieldReader; -import org.apache.arrow.vector.complex.writer.FieldWriter; -import org.apache.arrow.vector.holders.NullableBigIntHolder; import org.apache.arrow.vector.types.Types; -import 
org.apache.arrow.vector.types.Types.*; +import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.TransferPair; import org.junit.After; @@ -88,7 +85,7 @@ public void testCopyFrom() throws Exception { for (int i = 0; i < 3; i++) { outVector.copyFrom(i, i, inVector); } - outVector.getMutator().setValueCount(3); + outVector.setValueCount(3); // assert the output vector is correct FieldReader reader = outVector.getReader(); @@ -98,11 +95,9 @@ public void testCopyFrom() throws Exception { reader.setPosition(2); Assert.assertTrue("shouldn't be null", reader.isSet()); - /* check the exact contents of vector */ - final ListVector.Accessor accessor = outVector.getAccessor(); /* index 0 */ - Object result = accessor.getObject(0); + Object result = outVector.getObject(0); ArrayList resultSet = (ArrayList) result; assertEquals(3, resultSet.size()); assertEquals(new Long(1), (Long) resultSet.get(0)); @@ -110,11 +105,11 @@ public void testCopyFrom() throws Exception { assertEquals(new Long(3), (Long) resultSet.get(2)); /* index 1 */ - result = accessor.getObject(1); + result = outVector.getObject(1); assertNull(result); /* index 2 */ - result = accessor.getObject(2); + result = outVector.getObject(2); resultSet = (ArrayList) result; assertEquals(0, resultSet.size()); } @@ -131,60 +126,60 @@ public void testSetLastSetUsage() throws Exception { /* allocate memory */ listVector.allocateNew(); - /* get inner vectors; bitVector and offsetVector */ - List innerVectors = listVector.getFieldInnerVectors(); - BitVector bitVector = (BitVector) innerVectors.get(0); - UInt4Vector offsetVector = (UInt4Vector) innerVectors.get(1); + /* get inner buffers; validityBuffer and offsetBuffer */ - /* get the underlying data vector -- NullableBigIntVector */ - NullableBigIntVector dataVector = (NullableBigIntVector) listVector.getDataVector(); + ArrowBuf validityBuffer = listVector.getValidityBuffer(); + ArrowBuf 
offsetBuffer = listVector.getOffsetBuffer(); + + /* get the underlying data vector -- BigIntVector */ + BigIntVector dataVector = (BigIntVector) listVector.getDataVector(); /* check current lastSet */ - assertEquals(Integer.toString(0), Integer.toString(listVector.getMutator().getLastSet())); + assertEquals(Integer.toString(0), Integer.toString(listVector.getLastSet())); int index = 0; int offset = 0; - /* write [10, 11, 12] to the list vector at index */ - bitVector.getMutator().setSafe(index, 1); - dataVector.getMutator().setSafe(0, 1, 10); - dataVector.getMutator().setSafe(1, 1, 11); - dataVector.getMutator().setSafe(2, 1, 12); - offsetVector.getMutator().setSafe(index + 1, 3); + /* write [10, 11, 12] to the list vector at index 0 */ + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + dataVector.setSafe(0, 1, 10); + dataVector.setSafe(1, 1, 11); + dataVector.setSafe(2, 1, 12); + offsetBuffer.setInt((index + 1) * ListVector.OFFSET_WIDTH, 3); index += 1; /* write [13, 14] to the list vector at index 1 */ - bitVector.getMutator().setSafe(index, 1); - dataVector.getMutator().setSafe(3, 1, 13); - dataVector.getMutator().setSafe(4, 1, 14); - offsetVector.getMutator().setSafe(index + 1, 5); + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + dataVector.setSafe(3, 1, 13); + dataVector.setSafe(4, 1, 14); + offsetBuffer.setInt((index + 1) * ListVector.OFFSET_WIDTH, 5); index += 1; /* write [15, 16, 17] to the list vector at index 2 */ - bitVector.getMutator().setSafe(index, 1); - dataVector.getMutator().setSafe(5, 1, 15); - dataVector.getMutator().setSafe(6, 1, 16); - dataVector.getMutator().setSafe(7, 1, 17); - offsetVector.getMutator().setSafe(index + 1, 8); + BitVectorHelper.setValidityBitToOne(validityBuffer, index); + dataVector.setSafe(5, 1, 15); + dataVector.setSafe(6, 1, 16); + dataVector.setSafe(7, 1, 17); + offsetBuffer.setInt((index + 1) * ListVector.OFFSET_WIDTH, 8); /* check current lastSet */ - assertEquals(Integer.toString(0), 
Integer.toString(listVector.getMutator().getLastSet())); + assertEquals(Integer.toString(0), Integer.toString(listVector.getLastSet())); /* set lastset and arbitrary valuecount for list vector. * * NOTE: if we don't execute setLastSet() before setLastValueCount(), then - * the latter will corrupt the offsetVector and thus the accessor will not - * retrieve the correct values from underlying dataVector. Run the test + * the latter will corrupt the offsetBuffer and thus the accessor will not + * retrieve the correct values from underlying dataBuffer. Run the test * by commenting out next line and we should see failures from 5th assert * onwards. This is why doing setLastSet() is important before setValueCount() * once the vector has been loaded. * * Another important thing to remember is the value of lastSet itself. * Even though the listVector has elements till index 2 only, the lastSet should - * be set as 3. This is because the offsetVector has valid offsets filled till index 3. - * If we do setLastSet(2), the offsetVector at index 3 will contain incorrect value + * be set as 3. This is because the offsetBuffer has valid offsets filled till index 3. + * If we do setLastSet(2), the offsetBuffer at index 3 will contain incorrect value * after execution of setValueCount(). 
* * correct state of the listVector @@ -211,54 +206,50 @@ public void testSetLastSetUsage() throws Exception { * [15, 16, 17] * } */ - listVector.getMutator().setLastSet(3); - listVector.getMutator().setValueCount(10); - - /* check the vector output */ - final UInt4Vector.Accessor offsetAccessor = offsetVector.getAccessor(); - final ValueVector.Accessor valueAccessor = dataVector.getAccessor(); + listVector.setLastSet(3); + listVector.setValueCount(10); index = 0; - offset = offsetAccessor.get(index); + offset = offsetBuffer.getInt(index * ListVector.OFFSET_WIDTH); assertEquals(Integer.toString(0), Integer.toString(offset)); - Object actual = valueAccessor.getObject(offset); + Object actual = dataVector.getObject(offset); assertEquals(new Long(10), (Long) actual); offset++; - actual = valueAccessor.getObject(offset); + actual = dataVector.getObject(offset); assertEquals(new Long(11), (Long) actual); offset++; - actual = valueAccessor.getObject(offset); + actual = dataVector.getObject(offset); assertEquals(new Long(12), (Long) actual); index++; - offset = offsetAccessor.get(index); + offset = offsetBuffer.getInt(index * ListVector.OFFSET_WIDTH); assertEquals(Integer.toString(3), Integer.toString(offset)); - actual = valueAccessor.getObject(offset); + actual = dataVector.getObject(offset); assertEquals(new Long(13), (Long) actual); offset++; - actual = valueAccessor.getObject(offset); + actual = dataVector.getObject(offset); assertEquals(new Long(14), (Long) actual); index++; - offset = offsetAccessor.get(index); + offset = offsetBuffer.getInt(index * ListVector.OFFSET_WIDTH); assertEquals(Integer.toString(5), Integer.toString(offset)); - actual = valueAccessor.getObject(offsetAccessor.get(index)); + actual = dataVector.getObject(offset); assertEquals(new Long(15), (Long) actual); offset++; - actual = valueAccessor.getObject(offset); + actual = dataVector.getObject(offset); assertEquals(new Long(16), (Long) actual); offset++; - actual = 
valueAccessor.getObject(offset); + actual = dataVector.getObject(offset); assertEquals(new Long(17), (Long) actual); index++; - offset = offsetAccessor.get(index); + offset = offsetBuffer.getInt(index * ListVector.OFFSET_WIDTH); assertEquals(Integer.toString(8), Integer.toString(offset)); - actual = valueAccessor.getObject(offsetAccessor.get(index)); + actual = dataVector.getObject(offset); assertNull(actual); } } @@ -311,99 +302,97 @@ public void testSplitAndTransfer() throws Exception { listWriter.bigInt().writeBigInt(23); listWriter.endList(); - listVector.getMutator().setValueCount(5); + listVector.setValueCount(5); - assertEquals(5, listVector.getMutator().getLastSet()); + assertEquals(5, listVector.getLastSet()); - /* get offsetVector */ - UInt4Vector offsetVector = (UInt4Vector) listVector.getOffsetVector(); + /* get offset buffer */ + final ArrowBuf offsetBuffer = listVector.getOffsetBuffer(); /* get dataVector */ - NullableBigIntVector dataVector = (NullableBigIntVector) listVector.getDataVector(); + BigIntVector dataVector = (BigIntVector) listVector.getDataVector(); /* check the vector output */ - final UInt4Vector.Accessor offsetAccessor = offsetVector.getAccessor(); - final ValueVector.Accessor valueAccessor = dataVector.getAccessor(); int index = 0; int offset = 0; Object actual = null; /* index 0 */ - assertFalse(listVector.getAccessor().isNull(index)); - offset = offsetAccessor.get(index); + assertFalse(listVector.isNull(index)); + offset = offsetBuffer.getInt(index * ListVector.OFFSET_WIDTH); assertEquals(Integer.toString(0), Integer.toString(offset)); - actual = valueAccessor.getObject(offset); + actual = dataVector.getObject(offset); assertEquals(new Long(10), (Long) actual); offset++; - actual = valueAccessor.getObject(offset); + actual = dataVector.getObject(offset); assertEquals(new Long(11), (Long) actual); offset++; - actual = valueAccessor.getObject(offset); + actual = dataVector.getObject(offset); assertEquals(new Long(12), (Long) actual); 
/* index 1 */ index++; - assertFalse(listVector.getAccessor().isNull(index)); - offset = offsetAccessor.get(index); + assertFalse(listVector.isNull(index)); + offset = offsetBuffer.getInt(index * ListVector.OFFSET_WIDTH); assertEquals(Integer.toString(3), Integer.toString(offset)); - actual = valueAccessor.getObject(offset); + actual = dataVector.getObject(offset); assertEquals(new Long(13), (Long) actual); offset++; - actual = valueAccessor.getObject(offset); + actual = dataVector.getObject(offset); assertEquals(new Long(14), (Long) actual); /* index 2 */ index++; - assertFalse(listVector.getAccessor().isNull(index)); - offset = offsetAccessor.get(index); + assertFalse(listVector.isNull(index)); + offset = offsetBuffer.getInt(index * ListVector.OFFSET_WIDTH); assertEquals(Integer.toString(5), Integer.toString(offset)); - actual = valueAccessor.getObject(offset); + actual = dataVector.getObject(offset); assertEquals(new Long(15), (Long) actual); offset++; - actual = valueAccessor.getObject(offset); + actual = dataVector.getObject(offset); assertEquals(new Long(16), (Long) actual); offset++; - actual = valueAccessor.getObject(offset); + actual = dataVector.getObject(offset); assertEquals(new Long(17), (Long) actual); offset++; - actual = valueAccessor.getObject(offset); + actual = dataVector.getObject(offset); assertEquals(new Long(18), (Long) actual); /* index 3 */ index++; - assertFalse(listVector.getAccessor().isNull(index)); - offset = offsetAccessor.get(index); + assertFalse(listVector.isNull(index)); + offset = offsetBuffer.getInt(index * ListVector.OFFSET_WIDTH); assertEquals(Integer.toString(9), Integer.toString(offset)); - actual = valueAccessor.getObject(offset); + actual = dataVector.getObject(offset); assertEquals(new Long(19), (Long) actual); /* index 4 */ index++; - assertFalse(listVector.getAccessor().isNull(index)); - offset = offsetAccessor.get(index); + assertFalse(listVector.isNull(index)); + offset = offsetBuffer.getInt(index * 
ListVector.OFFSET_WIDTH); assertEquals(Integer.toString(10), Integer.toString(offset)); - actual = valueAccessor.getObject(offset); + actual = dataVector.getObject(offset); assertEquals(new Long(20), (Long) actual); offset++; - actual = valueAccessor.getObject(offset); + actual = dataVector.getObject(offset); assertEquals(new Long(21), (Long) actual); offset++; - actual = valueAccessor.getObject(offset); + actual = dataVector.getObject(offset); assertEquals(new Long(22), (Long) actual); offset++; - actual = valueAccessor.getObject(offset); + actual = dataVector.getObject(offset); assertEquals(new Long(23), (Long) actual); /* index 5 */ index++; - assertTrue(listVector.getAccessor().isNull(index)); - offset = offsetAccessor.get(index); + assertTrue(listVector.isNull(index)); + offset = offsetBuffer.getInt(index * ListVector.OFFSET_WIDTH); assertEquals(Integer.toString(14), Integer.toString(offset)); /* do split and transfer */ @@ -425,27 +414,27 @@ public void testSplitAndTransfer() throws Exception { transferPair.splitAndTransfer(start, splitLength); - /* get offsetVector of toVector */ - UInt4Vector offsetVector1 = (UInt4Vector) toVector.getOffsetVector(); - UInt4Vector.Accessor offsetAccessor1 = offsetVector1.getAccessor(); + /* get offsetBuffer of toVector */ + final ArrowBuf toOffsetBuffer = toVector.getOffsetBuffer(); /* get dataVector of toVector */ - NullableBigIntVector dataVector1 = (NullableBigIntVector) toVector.getDataVector(); - NullableBigIntVector.Accessor valueAccessor1 = dataVector1.getAccessor(); + BigIntVector dataVector1 = (BigIntVector) toVector.getDataVector(); for (int i = 0; i < splitLength; i++) { - dataLength1 = offsetAccessor.get(start + i + 1) - offsetAccessor.get(start + i); - dataLength2 = offsetAccessor1.get(i + 1) - offsetAccessor1.get(i); + dataLength1 = offsetBuffer.getInt((start + i + 1) * ListVector.OFFSET_WIDTH) - + offsetBuffer.getInt((start + i) * ListVector.OFFSET_WIDTH); + dataLength2 = toOffsetBuffer.getInt((i + 1) * 
ListVector.OFFSET_WIDTH) + - toOffsetBuffer.getInt(i * ListVector.OFFSET_WIDTH); assertEquals("Different data lengths at index: " + i + " and start: " + start, dataLength1, dataLength2); - offset1 = offsetAccessor.get(start + i); - offset2 = offsetAccessor1.get(i); + offset1 = offsetBuffer.getInt((start + i) * ListVector.OFFSET_WIDTH); + offset2 = toOffsetBuffer.getInt(i * ListVector.OFFSET_WIDTH); for (int j = 0; j < dataLength1; j++) { assertEquals("Different data at indexes: " + offset1 + " and " + offset2, - valueAccessor.getObject(offset1), valueAccessor1.getObject(offset2)); + dataVector.getObject(offset1), dataVector1.getObject(offset2)); offset1++; offset2++; @@ -509,15 +498,14 @@ public void testNestedListVector() throws Exception { listWriter.endList(); - assertEquals(2, listVector.getMutator().getLastSet()); + assertEquals(2, listVector.getLastSet()); - listVector.getMutator().setValueCount(2); + listVector.setValueCount(2); - final ListVector.Accessor accessor = listVector.getAccessor(); - assertEquals(2, accessor.getValueCount()); + assertEquals(2, listVector.getValueCount()); /* get listVector value at index 0 -- the value itself is a listvector */ - Object result = accessor.getObject(0); + Object result = listVector.getObject(0); ArrayList> resultSet = (ArrayList>) result; ArrayList list; @@ -537,7 +525,7 @@ public void testNestedListVector() throws Exception { assertEquals(new Long(175), list.get(3)); /* get listVector value at index 1 -- the value itself is a listvector */ - result = accessor.getObject(1); + result = listVector.getObject(1); resultSet = (ArrayList>) result; assertEquals(3, resultSet.size()); /* 3 inner lists at index 1 */ @@ -558,17 +546,144 @@ public void testNestedListVector() throws Exception { assertEquals(new Long(35), list.get(2)); /* check underlying bitVector */ - assertFalse(accessor.isNull(0)); - assertFalse(accessor.isNull(1)); + assertFalse(listVector.isNull(0)); + assertFalse(listVector.isNull(1)); + + /* check 
underlying offsets */ + final ArrowBuf offsetBuffer = listVector.getOffsetBuffer(); + + /* listVector has 2 lists at index 0 and 3 lists at index 1 */ + assertEquals(0, offsetBuffer.getInt(0 * ListVector.OFFSET_WIDTH)); + assertEquals(2, offsetBuffer.getInt(1 * ListVector.OFFSET_WIDTH)); + assertEquals(5, offsetBuffer.getInt(2 * ListVector.OFFSET_WIDTH)); + } + } + + @Test + public void testNestedListVector1() throws Exception { + try (ListVector listVector = ListVector.empty("sourceVector", allocator)) { + + MinorType listType = MinorType.LIST; + MinorType scalarType = MinorType.BIGINT; + + listVector.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList1 = (ListVector)listVector.getDataVector(); + innerList1.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList2 = (ListVector)innerList1.getDataVector(); + innerList2.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList3 = (ListVector)innerList2.getDataVector(); + innerList3.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList4 = (ListVector)innerList3.getDataVector(); + innerList4.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList5 = (ListVector)innerList4.getDataVector(); + innerList5.addOrGetVector(FieldType.nullable(listType.getType())); + + ListVector innerList6 = (ListVector)innerList5.getDataVector(); + innerList6.addOrGetVector(FieldType.nullable(scalarType.getType())); + + listVector.setInitialCapacity(128); + } + } + + @Test + public void testNestedListVector2() throws Exception { + try (ListVector listVector = ListVector.empty("sourceVector", allocator)) { + listVector.setInitialCapacity(1); + UnionListWriter listWriter = listVector.getWriter(); + /* allocate memory */ + listWriter.allocate(); + + /* write one or more inner lists at index 0 */ + listWriter.setPosition(0); + listWriter.startList(); + + listWriter.list().startList(); + 
listWriter.list().bigInt().writeBigInt(50); + listWriter.list().bigInt().writeBigInt(100); + listWriter.list().bigInt().writeBigInt(200); + listWriter.list().endList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(75); + listWriter.list().bigInt().writeBigInt(125); + listWriter.list().endList(); + + listWriter.endList(); + + /* write one or more inner lists at index 1 */ + listWriter.setPosition(1); + listWriter.startList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(15); + listWriter.list().bigInt().writeBigInt(20); + listWriter.list().endList(); + + listWriter.list().startList(); + listWriter.list().bigInt().writeBigInt(25); + listWriter.list().bigInt().writeBigInt(30); + listWriter.list().bigInt().writeBigInt(35); + listWriter.list().endList(); + + listWriter.endList(); + + assertEquals(2, listVector.getLastSet()); + + listVector.setValueCount(2); + + assertEquals(2, listVector.getValueCount()); + + /* get listVector value at index 0 -- the value itself is a listvector */ + Object result = listVector.getObject(0); + ArrayList<ArrayList<Long>> resultSet = (ArrayList<ArrayList<Long>>) result; + ArrayList list; + + assertEquals(2, resultSet.size()); /* 2 inner lists at index 0 */ + assertEquals(3, resultSet.get(0).size()); /* size of first inner list */ + assertEquals(2, resultSet.get(1).size()); /* size of second inner list */ + + list = resultSet.get(0); + assertEquals(new Long(50), list.get(0)); + assertEquals(new Long(100), list.get(1)); + assertEquals(new Long(200), list.get(2)); + + list = resultSet.get(1); + assertEquals(new Long(75), list.get(0)); + assertEquals(new Long(125), list.get(1)); + + /* get listVector value at index 1 -- the value itself is a listvector */ + result = listVector.getObject(1); + resultSet = (ArrayList<ArrayList<Long>>) result; + + assertEquals(2, resultSet.size()); /* 2 inner lists at index 1 */ + assertEquals(2, resultSet.get(0).size()); /* size of first inner list */ + assertEquals(3, resultSet.get(1).size()); /*
size of second inner list */ + + list = resultSet.get(0); + assertEquals(new Long(15), list.get(0)); + assertEquals(new Long(20), list.get(1)); + + list = resultSet.get(1); + assertEquals(new Long(25), list.get(0)); + assertEquals(new Long(30), list.get(1)); + assertEquals(new Long(35), list.get(2)); + + /* check underlying bitVector */ + assertFalse(listVector.isNull(0)); + assertFalse(listVector.isNull(1)); - /* check underlying offsetVector */ - UInt4Vector offsetVector = listVector.getOffsetVector(); - final UInt4Vector.Accessor offsetAccessor = offsetVector.getAccessor(); + /* check underlying offsets */ + final ArrowBuf offsetBuffer = listVector.getOffsetBuffer(); /* listVector has 2 lists at index 0 and 3 lists at index 1 */ - assertEquals(0, offsetAccessor.get(0)); - assertEquals(2, offsetAccessor.get(1)); - assertEquals(5, offsetAccessor.get(2)); + assertEquals(0, offsetBuffer.getInt(0 * ListVector.OFFSET_WIDTH)); + assertEquals(2, offsetBuffer.getInt(1 * ListVector.OFFSET_WIDTH)); + assertEquals(4, offsetBuffer.getInt(2 * ListVector.OFFSET_WIDTH)); } } @@ -594,17 +709,15 @@ public void testGetBufferAddress() throws Exception { listWriter.bigInt().writeBigInt(300); listWriter.endList(); - final ListVector.Accessor accessor = listVector.getAccessor(); - /* check listVector contents */ - Object result = accessor.getObject(0); + Object result = listVector.getObject(0); ArrayList resultSet = (ArrayList) result; assertEquals(3, resultSet.size()); assertEquals(new Long(50), resultSet.get(0)); assertEquals(new Long(100), resultSet.get(1)); assertEquals(new Long(200), resultSet.get(2)); - result = accessor.getObject(1); + result = listVector.getObject(1); resultSet = (ArrayList) result; assertEquals(2, resultSet.size()); assertEquals(new Long(250), resultSet.get(0)); @@ -635,7 +748,7 @@ public void testConsistentChildName() throws Exception { String emptyListStr = listVector.getField().toString(); assertTrue(emptyListStr.contains(ListVector.DATA_VECTOR_NAME)); - 
listVector.addOrGetVector(FieldType.nullable(MinorType.INT.getType())); + listVector.addOrGetVector(FieldType.nullable(Types.MinorType.INT.getType())); String emptyVectorStr = listVector.getField().toString(); assertTrue(emptyVectorStr.contains(ListVector.DATA_VECTOR_NAME)); } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java index ba2ebbf05ad0d..f14dbd6637356 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java @@ -112,10 +112,10 @@ public void testVariableVectorReallocation() { try { vector.allocateNew(expectedAllocationInBytes, 10); assertTrue(expectedOffsetSize <= vector.getValueCapacity()); - assertTrue(expectedAllocationInBytes <= vector.getBuffer().capacity()); + assertTrue(expectedAllocationInBytes <= vector.getDataBuffer().capacity()); vector.reAlloc(); assertTrue(expectedOffsetSize * 2 <= vector.getValueCapacity()); - assertTrue(expectedAllocationInBytes * 2 <= vector.getBuffer().capacity()); + assertTrue(expectedAllocationInBytes * 2 <= vector.getDataBuffer().capacity()); } finally { vector.close(); } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java b/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java index 66e5375e3bd0d..80d5fe19700ac 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java @@ -24,8 +24,7 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.NullableVarCharVector; -import org.apache.arrow.vector.NullableVarCharVector.Accessor; +import org.apache.arrow.vector.VarCharVector; import 
org.apache.arrow.vector.util.TransferPair; import org.junit.After; @@ -46,40 +45,38 @@ public void terminate() throws Exception { allocator.close(); } - @Test /* NullableVarCharVector */ + @Test /* VarCharVector */ public void test() throws Exception { - try(final NullableVarCharVector varCharVector = new NullableVarCharVector("myvector", allocator)) { + try(final VarCharVector varCharVector = new VarCharVector("myvector", allocator)) { varCharVector.allocateNew(10000, 1000); final int valueCount = 500; final String[] compareArray = new String[valueCount]; - final NullableVarCharVector.Mutator mutator = varCharVector.getMutator(); for (int i = 0; i < valueCount; i += 3) { final String s = String.format("%010d", i); - mutator.set(i, s.getBytes()); + varCharVector.set(i, s.getBytes()); compareArray[i] = s; } - mutator.setValueCount(valueCount); + varCharVector.setValueCount(valueCount); final TransferPair tp = varCharVector.getTransferPair(allocator); - final NullableVarCharVector newVarCharVector = (NullableVarCharVector) tp.getTo(); - final Accessor accessor = newVarCharVector.getAccessor(); + final VarCharVector newVarCharVector = (VarCharVector) tp.getTo(); final int[][] startLengths = {{0, 201}, {201, 200}, {401, 99}}; for (final int[] startLength : startLengths) { final int start = startLength[0]; final int length = startLength[1]; tp.splitAndTransfer(start, length); - newVarCharVector.getMutator().setValueCount(length); + newVarCharVector.setValueCount(length); for (int i = 0; i < length; i++) { final boolean expectedSet = ((start + i) % 3) == 0; if (expectedSet) { final byte[] expectedValue = compareArray[start + i].getBytes(); - assertFalse(accessor.isNull(i)); - assertArrayEquals(expectedValue, accessor.get(i)); + assertFalse(newVarCharVector.isNull(i)); + assertArrayEquals(expectedValue, newVarCharVector.get(i)); } else { - assertTrue(accessor.isNull(i)); + assertTrue(newVarCharVector.isNull(i)); } } newVarCharVector.clear(); diff --git 
a/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java index 86f0bf337f9d4..aec7d0f327389 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java @@ -20,6 +20,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.assertFalse; import java.util.List; @@ -64,26 +65,24 @@ public void testUnionVector() throws Exception { unionVector.allocateNew(); // write some data - final UnionVector.Mutator mutator = unionVector.getMutator(); - mutator.setType(0, Types.MinorType.UINT4); - mutator.setSafe(0, uInt4Holder); - mutator.setType(2, Types.MinorType.UINT4); - mutator.setSafe(2, uInt4Holder); - mutator.setValueCount(4); + unionVector.setType(0, Types.MinorType.UINT4); + unionVector.setSafe(0, uInt4Holder); + unionVector.setType(2, Types.MinorType.UINT4); + unionVector.setSafe(2, uInt4Holder); + unionVector.setValueCount(4); // check that what we wrote is correct - final UnionVector.Accessor accessor = unionVector.getAccessor(); - assertEquals(4, accessor.getValueCount()); + assertEquals(4, unionVector.getValueCount()); - assertEquals(false, accessor.isNull(0)); - assertEquals(100, accessor.getObject(0)); + assertEquals(false, unionVector.isNull(0)); + assertEquals(100, unionVector.getObject(0)); - assertEquals(true, accessor.isNull(1)); + assertEquals(true, unionVector.isNull(1)); - assertEquals(false, accessor.isNull(2)); - assertEquals(100, accessor.getObject(2)); + assertEquals(false, unionVector.isNull(2)); + assertEquals(100, unionVector.getObject(2)); - assertEquals(true, accessor.isNull(3)); + assertEquals(true, unionVector.isNull(3)); } } @@ -93,16 +92,15 @@ public void testTransfer() throws Exception { srcVector.allocateNew(); // write some data - final UnionVector.Mutator mutator = srcVector.getMutator(); 
- mutator.setType(0, MinorType.INT); - mutator.setSafe(0, newIntHolder(5)); - mutator.setType(1, MinorType.BIT); - mutator.setSafe(1, newBitHolder(false)); - mutator.setType(3, MinorType.INT); - mutator.setSafe(3, newIntHolder(10)); - mutator.setType(5, MinorType.BIT); - mutator.setSafe(5, newBitHolder(false)); - mutator.setValueCount(6); + srcVector.setType(0, MinorType.INT); + srcVector.setSafe(0, newIntHolder(5)); + srcVector.setType(1, MinorType.BIT); + srcVector.setSafe(1, newBitHolder(false)); + srcVector.setType(3, MinorType.INT); + srcVector.setSafe(3, newIntHolder(10)); + srcVector.setType(5, MinorType.BIT); + srcVector.setSafe(5, newBitHolder(false)); + srcVector.setValueCount(6); try (UnionVector destVector = new UnionVector(EMPTY_SCHEMA_PATH, allocator, null)) { TransferPair pair = srcVector.makeTransferPair(destVector); @@ -116,10 +114,23 @@ public void testTransfer() throws Exception { assertEquals(srcVector.getField(), destVector.getField()); // now check the values are transferred - assertEquals(srcVector.getAccessor().getValueCount(), destVector.getAccessor().getValueCount()); - for (int i = 0; i < srcVector.getAccessor().getValueCount(); i++) { - assertEquals("Different values at index " + i, srcVector.getAccessor().get(i), destVector.getAccessor().get(i)); - } + assertEquals(6, destVector.getValueCount()); + + assertFalse(destVector.isNull(0)); + assertEquals(5, destVector.getObject(0)); + + assertFalse(destVector.isNull(1)); + assertEquals(false, destVector.getObject(1)); + + assertTrue(destVector.isNull(2)); + + assertFalse(destVector.isNull(3)); + assertEquals(10, destVector.getObject(3)); + + assertTrue(destVector.isNull(4)); + + assertFalse(destVector.isNull(5)); + assertEquals(false, destVector.getObject(5)); } } } @@ -127,61 +138,58 @@ public void testTransfer() throws Exception { @Test public void testSplitAndTransfer() throws Exception { try (UnionVector sourceVector = new UnionVector(EMPTY_SCHEMA_PATH, allocator, null)) { - final 
UnionVector.Mutator sourceMutator = sourceVector.getMutator(); - final UnionVector.Accessor sourceAccessor = sourceVector.getAccessor(); sourceVector.allocateNew(); /* populate the UnionVector */ - sourceMutator.setType(0, MinorType.INT); - sourceMutator.setSafe(0, newIntHolder(5)); - sourceMutator.setType(1, MinorType.INT); - sourceMutator.setSafe(1, newIntHolder(10)); - sourceMutator.setType(2, MinorType.INT); - sourceMutator.setSafe(2, newIntHolder(15)); - sourceMutator.setType(3, MinorType.INT); - sourceMutator.setSafe(3, newIntHolder(20)); - sourceMutator.setType(4, MinorType.INT); - sourceMutator.setSafe(4, newIntHolder(25)); - sourceMutator.setType(5, MinorType.INT); - sourceMutator.setSafe(5, newIntHolder(30)); - sourceMutator.setType(6, MinorType.INT); - sourceMutator.setSafe(6, newIntHolder(35)); - sourceMutator.setType(7, MinorType.INT); - sourceMutator.setSafe(7, newIntHolder(40)); - sourceMutator.setType(8, MinorType.INT); - sourceMutator.setSafe(8, newIntHolder(45)); - sourceMutator.setType(9, MinorType.INT); - sourceMutator.setSafe(9, newIntHolder(50)); - sourceMutator.setValueCount(10); + sourceVector.setType(0, MinorType.INT); + sourceVector.setSafe(0, newIntHolder(5)); + sourceVector.setType(1, MinorType.INT); + sourceVector.setSafe(1, newIntHolder(10)); + sourceVector.setType(2, MinorType.INT); + sourceVector.setSafe(2, newIntHolder(15)); + sourceVector.setType(3, MinorType.INT); + sourceVector.setSafe(3, newIntHolder(20)); + sourceVector.setType(4, MinorType.INT); + sourceVector.setSafe(4, newIntHolder(25)); + sourceVector.setType(5, MinorType.INT); + sourceVector.setSafe(5, newIntHolder(30)); + sourceVector.setType(6, MinorType.INT); + sourceVector.setSafe(6, newIntHolder(35)); + sourceVector.setType(7, MinorType.INT); + sourceVector.setSafe(7, newIntHolder(40)); + sourceVector.setType(8, MinorType.INT); + sourceVector.setSafe(8, newIntHolder(45)); + sourceVector.setType(9, MinorType.INT); + sourceVector.setSafe(9, newIntHolder(50)); + 
sourceVector.setValueCount(10); /* check the vector output */ - assertEquals(10, sourceAccessor.getValueCount()); - assertEquals(false, sourceAccessor.isNull(0)); - assertEquals(5, sourceAccessor.getObject(0)); - assertEquals(false, sourceAccessor.isNull(1)); - assertEquals(10, sourceAccessor.getObject(1)); - assertEquals(false, sourceAccessor.isNull(2)); - assertEquals(15, sourceAccessor.getObject(2)); - assertEquals(false, sourceAccessor.isNull(3)); - assertEquals(20, sourceAccessor.getObject(3)); - assertEquals(false, sourceAccessor.isNull(4)); - assertEquals(25, sourceAccessor.getObject(4)); - assertEquals(false, sourceAccessor.isNull(5)); - assertEquals(30, sourceAccessor.getObject(5)); - assertEquals(false, sourceAccessor.isNull(6)); - assertEquals(35, sourceAccessor.getObject(6)); - assertEquals(false, sourceAccessor.isNull(7)); - assertEquals(40, sourceAccessor.getObject(7)); - assertEquals(false, sourceAccessor.isNull(8)); - assertEquals(45, sourceAccessor.getObject(8)); - assertEquals(false, sourceAccessor.isNull(9)); - assertEquals(50, sourceAccessor.getObject(9)); + assertEquals(10, sourceVector.getValueCount()); + assertEquals(false, sourceVector.isNull(0)); + assertEquals(5, sourceVector.getObject(0)); + assertEquals(false, sourceVector.isNull(1)); + assertEquals(10, sourceVector.getObject(1)); + assertEquals(false, sourceVector.isNull(2)); + assertEquals(15, sourceVector.getObject(2)); + assertEquals(false, sourceVector.isNull(3)); + assertEquals(20, sourceVector.getObject(3)); + assertEquals(false, sourceVector.isNull(4)); + assertEquals(25, sourceVector.getObject(4)); + assertEquals(false, sourceVector.isNull(5)); + assertEquals(30, sourceVector.getObject(5)); + assertEquals(false, sourceVector.isNull(6)); + assertEquals(35, sourceVector.getObject(6)); + assertEquals(false, sourceVector.isNull(7)); + assertEquals(40, sourceVector.getObject(7)); + assertEquals(false, sourceVector.isNull(8)); + assertEquals(45, sourceVector.getObject(8)); + 
assertEquals(false, sourceVector.isNull(9)); + assertEquals(50, sourceVector.getObject(9)); try (UnionVector toVector = new UnionVector(EMPTY_SCHEMA_PATH, allocator, null)) { final TransferPair transferPair = sourceVector.makeTransferPair(toVector); - final UnionVector.Accessor toAccessor = toVector.getAccessor(); final int[][] transferLengths = {{0, 3}, {3, 1}, @@ -199,8 +207,8 @@ public void testSplitAndTransfer() throws Exception { /* check the toVector output after doing the splitAndTransfer */ for (int i = 0; i < length; i++) { - assertEquals("Different data at indexes: " + (start + i) + "and " + i, sourceAccessor.getObject(start + i), - toAccessor.getObject(i)); + assertEquals("Different data at indexes: " + (start + i) + "and " + i, sourceVector.getObject(start + i), + toVector.getObject(i)); } } } @@ -210,70 +218,67 @@ public void testSplitAndTransfer() throws Exception { @Test public void testSplitAndTransferWithMixedVectors() throws Exception { try (UnionVector sourceVector = new UnionVector(EMPTY_SCHEMA_PATH, allocator, null)) { - final UnionVector.Mutator sourceMutator = sourceVector.getMutator(); - final UnionVector.Accessor sourceAccessor = sourceVector.getAccessor(); sourceVector.allocateNew(); /* populate the UnionVector */ - sourceMutator.setType(0, MinorType.INT); - sourceMutator.setSafe(0, newIntHolder(5)); + sourceVector.setType(0, MinorType.INT); + sourceVector.setSafe(0, newIntHolder(5)); - sourceMutator.setType(1, MinorType.FLOAT4); - sourceMutator.setSafe(1, newFloat4Holder(5.5f)); + sourceVector.setType(1, MinorType.FLOAT4); + sourceVector.setSafe(1, newFloat4Holder(5.5f)); - sourceMutator.setType(2, MinorType.INT); - sourceMutator.setSafe(2, newIntHolder(10)); + sourceVector.setType(2, MinorType.INT); + sourceVector.setSafe(2, newIntHolder(10)); - sourceMutator.setType(3, MinorType.FLOAT4); - sourceMutator.setSafe(3, newFloat4Holder(10.5f)); + sourceVector.setType(3, MinorType.FLOAT4); + sourceVector.setSafe(3, newFloat4Holder(10.5f)); - 
sourceMutator.setType(4, MinorType.INT); - sourceMutator.setSafe(4, newIntHolder(15)); + sourceVector.setType(4, MinorType.INT); + sourceVector.setSafe(4, newIntHolder(15)); - sourceMutator.setType(5, MinorType.FLOAT4); - sourceMutator.setSafe(5, newFloat4Holder(15.5f)); + sourceVector.setType(5, MinorType.FLOAT4); + sourceVector.setSafe(5, newFloat4Holder(15.5f)); - sourceMutator.setType(6, MinorType.INT); - sourceMutator.setSafe(6, newIntHolder(20)); + sourceVector.setType(6, MinorType.INT); + sourceVector.setSafe(6, newIntHolder(20)); - sourceMutator.setType(7, MinorType.FLOAT4); - sourceMutator.setSafe(7, newFloat4Holder(20.5f)); + sourceVector.setType(7, MinorType.FLOAT4); + sourceVector.setSafe(7, newFloat4Holder(20.5f)); - sourceMutator.setType(8, MinorType.INT); - sourceMutator.setSafe(8, newIntHolder(30)); + sourceVector.setType(8, MinorType.INT); + sourceVector.setSafe(8, newIntHolder(30)); - sourceMutator.setType(9, MinorType.FLOAT4); - sourceMutator.setSafe(9, newFloat4Holder(30.5f)); - sourceMutator.setValueCount(10); + sourceVector.setType(9, MinorType.FLOAT4); + sourceVector.setSafe(9, newFloat4Holder(30.5f)); + sourceVector.setValueCount(10); /* check the vector output */ - assertEquals(10, sourceAccessor.getValueCount()); - assertEquals(false, sourceAccessor.isNull(0)); - assertEquals(5, sourceAccessor.getObject(0)); - assertEquals(false, sourceAccessor.isNull(1)); - assertEquals(5.5f, sourceAccessor.getObject(1)); - assertEquals(false, sourceAccessor.isNull(2)); - assertEquals(10, sourceAccessor.getObject(2)); - assertEquals(false, sourceAccessor.isNull(3)); - assertEquals(10.5f, sourceAccessor.getObject(3)); - assertEquals(false, sourceAccessor.isNull(4)); - assertEquals(15, sourceAccessor.getObject(4)); - assertEquals(false, sourceAccessor.isNull(5)); - assertEquals(15.5f, sourceAccessor.getObject(5)); - assertEquals(false, sourceAccessor.isNull(6)); - assertEquals(20, sourceAccessor.getObject(6)); - assertEquals(false, 
sourceAccessor.isNull(7)); - assertEquals(20.5f, sourceAccessor.getObject(7)); - assertEquals(false, sourceAccessor.isNull(8)); - assertEquals(30, sourceAccessor.getObject(8)); - assertEquals(false, sourceAccessor.isNull(9)); - assertEquals(30.5f, sourceAccessor.getObject(9)); + assertEquals(10, sourceVector.getValueCount()); + assertEquals(false, sourceVector.isNull(0)); + assertEquals(5, sourceVector.getObject(0)); + assertEquals(false, sourceVector.isNull(1)); + assertEquals(5.5f, sourceVector.getObject(1)); + assertEquals(false, sourceVector.isNull(2)); + assertEquals(10, sourceVector.getObject(2)); + assertEquals(false, sourceVector.isNull(3)); + assertEquals(10.5f, sourceVector.getObject(3)); + assertEquals(false, sourceVector.isNull(4)); + assertEquals(15, sourceVector.getObject(4)); + assertEquals(false, sourceVector.isNull(5)); + assertEquals(15.5f, sourceVector.getObject(5)); + assertEquals(false, sourceVector.isNull(6)); + assertEquals(20, sourceVector.getObject(6)); + assertEquals(false, sourceVector.isNull(7)); + assertEquals(20.5f, sourceVector.getObject(7)); + assertEquals(false, sourceVector.isNull(8)); + assertEquals(30, sourceVector.getObject(8)); + assertEquals(false, sourceVector.isNull(9)); + assertEquals(30.5f, sourceVector.getObject(9)); try (UnionVector toVector = new UnionVector(EMPTY_SCHEMA_PATH, allocator, null)) { final TransferPair transferPair = sourceVector.makeTransferPair(toVector); - final UnionVector.Accessor toAccessor = toVector.getAccessor(); final int[][] transferLengths = {{0, 2}, {2, 1}, @@ -290,7 +295,7 @@ public void testSplitAndTransferWithMixedVectors() throws Exception { /* check the toVector output after doing the splitAndTransfer */ for (int i = 0; i < length; i++) { - assertEquals("Different values at index: " + i, sourceAccessor.getObject(start + i), toAccessor.getObject(i)); + assertEquals("Different values at index: " + i, sourceVector.getObject(start + i), toVector.getObject(i)); } } } @@ -300,37 +305,35 @@ 
public void testSplitAndTransferWithMixedVectors() throws Exception { @Test public void testGetBufferAddress() throws Exception { try (UnionVector vector = new UnionVector(EMPTY_SCHEMA_PATH, allocator, null)) { - final UnionVector.Mutator mutator = vector.getMutator(); - final UnionVector.Accessor accessor = vector.getAccessor(); boolean error = false; vector.allocateNew(); /* populate the UnionVector */ - mutator.setType(0, MinorType.INT); - mutator.setSafe(0, newIntHolder(5)); + vector.setType(0, MinorType.INT); + vector.setSafe(0, newIntHolder(5)); - mutator.setType(1, MinorType.FLOAT4); - mutator.setSafe(1, newFloat4Holder(5.5f)); + vector.setType(1, MinorType.FLOAT4); + vector.setSafe(1, newFloat4Holder(5.5f)); - mutator.setType(2, MinorType.INT); - mutator.setSafe(2, newIntHolder(10)); + vector.setType(2, MinorType.INT); + vector.setSafe(2, newIntHolder(10)); - mutator.setType(3, MinorType.FLOAT4); - mutator.setSafe(3, newFloat4Holder(10.5f)); + vector.setType(3, MinorType.FLOAT4); + vector.setSafe(3, newFloat4Holder(10.5f)); - mutator.setValueCount(10); + vector.setValueCount(10); /* check the vector output */ - assertEquals(10, accessor.getValueCount()); - assertEquals(false, accessor.isNull(0)); - assertEquals(5, accessor.getObject(0)); - assertEquals(false, accessor.isNull(1)); - assertEquals(5.5f, accessor.getObject(1)); - assertEquals(false, accessor.isNull(2)); - assertEquals(10, accessor.getObject(2)); - assertEquals(false, accessor.isNull(3)); - assertEquals(10.5f, accessor.getObject(3)); + assertEquals(10, vector.getValueCount()); + assertEquals(false, vector.isNull(0)); + assertEquals(5, vector.getObject(0)); + assertEquals(false, vector.isNull(1)); + assertEquals(5.5f, vector.getObject(1)); + assertEquals(false, vector.isNull(2)); + assertEquals(10, vector.getObject(2)); + assertEquals(false, vector.isNull(3)); + assertEquals(10.5f, vector.getObject(3)); List buffers = vector.getFieldBuffers(); diff --git 
a/java/vector/src/test/java/org/apache/arrow/vector/TestUtils.java b/java/vector/src/test/java/org/apache/arrow/vector/TestUtils.java index a148813090900..99a1d89071c71 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestUtils.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestUtils.java @@ -25,13 +25,13 @@ public class TestUtils { - public static NullableVarCharVector newNullableVarCharVector(String name, BufferAllocator allocator) { - return (NullableVarCharVector) + public static VarCharVector newVarCharVector(String name, BufferAllocator allocator) { + return (VarCharVector) FieldType.nullable(new ArrowType.Utf8()).createNewSingleVector(name, allocator, null); } - public static NullableVarBinaryVector newNullableVarBinaryVector(String name, BufferAllocator allocator) { - return (NullableVarBinaryVector) + public static VarBinaryVector newVarBinaryVector(String name, BufferAllocator allocator) { + return (VarBinaryVector) FieldType.nullable(new ArrowType.Binary()).createNewSingleVector(name, allocator, null); } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index a239861d9b32f..601b2062ff698 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -17,11 +17,10 @@ */ package org.apache.arrow.vector; -import org.apache.arrow.vector.holders.VarCharHolder; import org.apache.arrow.vector.util.OversizedAllocationException; -import static org.apache.arrow.vector.TestUtils.newNullableVarBinaryVector; -import static org.apache.arrow.vector.TestUtils.newNullableVarCharVector; +import static org.apache.arrow.vector.TestUtils.newVarBinaryVector; +import static org.apache.arrow.vector.TestUtils.newVarCharVector; import static org.apache.arrow.vector.TestUtils.newVector; import static org.junit.Assert.assertArrayEquals; import static 
org.junit.Assert.assertEquals; @@ -38,15 +37,13 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.schema.ArrowRecordBatch; -import org.apache.arrow.vector.schema.TypeLayout; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.Schema; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.util.TransferPair; import org.junit.After; -import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -90,9 +87,9 @@ public void terminate() throws Exception { * -- Float4Vector * -- Float8Vector * - * -- NullableUInt4Vector - * -- NullableIntVector - * -- NullableFloat4Vector + * -- UInt4Vector + * -- IntVector + * -- Float4Vector * * TODO: * @@ -109,28 +106,26 @@ public void testFixedType1() { boolean error = false; int initialCapacity = 0; - final UInt4Vector.Mutator mutator = vector.getMutator(); - final UInt4Vector.Accessor accessor = vector.getAccessor(); vector.allocateNew(1024); initialCapacity = vector.getValueCapacity(); assertEquals(1024, initialCapacity); // Put and set a few values - mutator.setSafe(0, 100); - mutator.setSafe(1, 101); - mutator.setSafe(100, 102); - mutator.setSafe(1022, 103); - mutator.setSafe(1023, 104); - - assertEquals(100, accessor.get(0)); - assertEquals(101, accessor.get(1)); - assertEquals(102, accessor.get(100)); - assertEquals(103, accessor.get(1022)); - assertEquals(104, accessor.get(1023)); + vector.setSafe(0, 100); + vector.setSafe(1, 101); + vector.setSafe(100, 102); + vector.setSafe(1022, 103); + vector.setSafe(1023, 104); + + assertEquals(100, vector.get(0)); + assertEquals(101, vector.get(1)); + assertEquals(102, vector.get(100)); + assertEquals(103, vector.get(1022)); + assertEquals(104, vector.get(1023)); try { - mutator.set(1024, 10000); + vector.set(1024, 
10000); } catch (IndexOutOfBoundsException ie) { error = true; @@ -141,7 +136,7 @@ public void testFixedType1() { } try { - accessor.get(1024); + vector.get(1024); } catch (IndexOutOfBoundsException ie) { error = true; @@ -152,18 +147,18 @@ public void testFixedType1() { } /* this should trigger a realloc() */ - mutator.setSafe(1024, 10000); + vector.setSafe(1024, 10000); /* underlying buffer should now be able to store double the number of values */ assertEquals(initialCapacity * 2, vector.getValueCapacity()); /* check vector data after realloc */ - assertEquals(100, accessor.get(0)); - assertEquals(101, accessor.get(1)); - assertEquals(102, accessor.get(100)); - assertEquals(103, accessor.get(1022)); - assertEquals(104, accessor.get(1023)); - assertEquals(10000, accessor.get(1024)); + assertEquals(100, vector.get(0)); + assertEquals(101, vector.get(1)); + assertEquals(102, vector.get(100)); + assertEquals(103, vector.get(1022)); + assertEquals(104, vector.get(1023)); + assertEquals(10000, vector.get(1024)); /* reset the vector */ vector.reset(); @@ -173,7 +168,8 @@ public void testFixedType1() { /* vector data should have been zeroed out */ for(int i = 0; i < (initialCapacity * 2); i++) { - assertEquals("non-zero data not expected at index: " + i, 0, accessor.get(i)); + // TODO: test vector.get(i) is 0 after unsafe get added + assertEquals("non-zero data not expected at index: " + i, true, vector.isNull(i)); } } } @@ -181,8 +177,6 @@ public void testFixedType1() { @Test /* IntVector */ public void testFixedType2() { try (final IntVector intVector = new IntVector(EMPTY_SCHEMA_PATH, allocator)) { - final IntVector.Mutator mutator = intVector.getMutator(); - final IntVector.Accessor accessor = intVector.getAccessor(); boolean error = false; int initialCapacity = 16; @@ -213,12 +207,12 @@ public void testFixedType2() { /* populate the vector */ int j = 1; for(int i = 0; i < 16; i += 2) { - mutator.set(i, j); + intVector.set(i, j); j++; } try { - mutator.set(16, 9); + 
intVector.set(16, 9); } catch (IndexOutOfBoundsException ie) { error = true; @@ -231,12 +225,12 @@ public void testFixedType2() { /* check vector contents */ j = 1; for(int i = 0; i < 16; i += 2) { - assertEquals("unexpected value at index: " + i, j, accessor.get(i)); + assertEquals("unexpected value at index: " + i, j, intVector.get(i)); j++; } try { - accessor.get(16); + intVector.get(16); } catch (IndexOutOfBoundsException ie) { error = true; @@ -247,7 +241,7 @@ public void testFixedType2() { } /* this should trigger a realloc() */ - mutator.setSafe(16, 9); + intVector.setSafe(16, 9); /* underlying buffer should now be able to store double the number of values */ assertEquals(initialCapacity * 2, intVector.getValueCapacity()); @@ -255,7 +249,7 @@ public void testFixedType2() { /* vector data should still be intact after realloc */ j = 1; for(int i = 0; i <= 16; i += 2) { - assertEquals("unexpected value at index: " + i, j, accessor.get(i)); + assertEquals("unexpected value at index: " + i, j, intVector.get(i)); j++; } @@ -267,7 +261,7 @@ public void testFixedType2() { /* vector data should have been zeroed out */ for(int i = 0; i < (initialCapacity * 2); i++) { - assertEquals("non-zero data not expected at index: " + i, 0, accessor.get(i)); + assertEquals("non-zero data not expected at index: " + i, true, intVector.isNull(i)); } } } @@ -275,8 +269,6 @@ public void testFixedType2() { @Test /* Float4Vector */ public void testFixedType3() { try (final Float4Vector floatVector = new Float4Vector(EMPTY_SCHEMA_PATH, allocator)) { - final Float4Vector.Mutator mutator = floatVector.getMutator(); - final Float4Vector.Accessor accessor = floatVector.getAccessor(); boolean error = false; int initialCapacity = 16; @@ -306,18 +298,18 @@ public void testFixedType3() { floatVector.zeroVector(); - /* populate the vector */ - mutator.set(0, 1.5f); - mutator.set(2, 2.5f); - mutator.set(4, 3.3f); - mutator.set(6, 4.8f); - mutator.set(8, 5.6f); - mutator.set(10, 6.6f); - 
mutator.set(12, 7.8f); - mutator.set(14, 8.5f); + /* populate the floatVector */ + floatVector.set(0, 1.5f); + floatVector.set(2, 2.5f); + floatVector.set(4, 3.3f); + floatVector.set(6, 4.8f); + floatVector.set(8, 5.6f); + floatVector.set(10, 6.6f); + floatVector.set(12, 7.8f); + floatVector.set(14, 8.5f); try { - mutator.set(16, 9.5f); + floatVector.set(16, 9.5f); } catch (IndexOutOfBoundsException ie) { error = true; @@ -328,17 +320,17 @@ public void testFixedType3() { } /* check vector contents */ - assertEquals(1.5f, accessor.get(0), 0); - assertEquals(2.5f, accessor.get(2), 0); - assertEquals(3.3f, accessor.get(4), 0); - assertEquals(4.8f, accessor.get(6), 0); - assertEquals(5.6f, accessor.get(8), 0); - assertEquals(6.6f, accessor.get(10), 0); - assertEquals(7.8f, accessor.get(12), 0); - assertEquals(8.5f, accessor.get(14), 0); + assertEquals(1.5f, floatVector.get(0), 0); + assertEquals(2.5f, floatVector.get(2), 0); + assertEquals(3.3f, floatVector.get(4), 0); + assertEquals(4.8f, floatVector.get(6), 0); + assertEquals(5.6f, floatVector.get(8), 0); + assertEquals(6.6f, floatVector.get(10), 0); + assertEquals(7.8f, floatVector.get(12), 0); + assertEquals(8.5f, floatVector.get(14), 0); try { - accessor.get(16); + floatVector.get(16); } catch (IndexOutOfBoundsException ie) { error = true; @@ -349,21 +341,21 @@ public void testFixedType3() { } /* this should trigger a realloc() */ - mutator.setSafe(16, 9.5f); + floatVector.setSafe(16, 9.5f); /* underlying buffer should now be able to store double the number of values */ assertEquals(initialCapacity * 2, floatVector.getValueCapacity()); /* vector data should still be intact after realloc */ - assertEquals(1.5f, accessor.get(0), 0); - assertEquals(2.5f, accessor.get(2), 0); - assertEquals(3.3f, accessor.get(4), 0); - assertEquals(4.8f, accessor.get(6), 0); - assertEquals(5.6f, accessor.get(8), 0); - assertEquals(6.6f, accessor.get(10), 0); - assertEquals(7.8f, accessor.get(12), 0); - assertEquals(8.5f, 
accessor.get(14), 0); - assertEquals(9.5f, accessor.get(16), 0); + assertEquals(1.5f, floatVector.get(0), 0); + assertEquals(2.5f, floatVector.get(2), 0); + assertEquals(3.3f, floatVector.get(4), 0); + assertEquals(4.8f, floatVector.get(6), 0); + assertEquals(5.6f, floatVector.get(8), 0); + assertEquals(6.6f, floatVector.get(10), 0); + assertEquals(7.8f, floatVector.get(12), 0); + assertEquals(8.5f, floatVector.get(14), 0); + assertEquals(9.5f, floatVector.get(16), 0); /* reset the vector */ floatVector.reset(); @@ -373,7 +365,7 @@ public void testFixedType3() { /* vector data should be zeroed out */ for(int i = 0; i < (initialCapacity * 2); i++) { - assertEquals("non-zero data not expected at index: " + i, 0, accessor.get(i), 0); + assertEquals("non-zero data not expected at index: " + i, true, floatVector.isNull(i)); } } } @@ -381,8 +373,6 @@ public void testFixedType3() { @Test /* Float8Vector */ public void testFixedType4() { try (final Float8Vector floatVector = new Float8Vector(EMPTY_SCHEMA_PATH, allocator)) { - final Float8Vector.Mutator mutator = floatVector.getMutator(); - final Float8Vector.Accessor accessor = floatVector.getAccessor(); boolean error = false; int initialCapacity = 16; @@ -411,17 +401,17 @@ public void testFixedType4() { assertEquals(initialCapacity, floatVector.getValueCapacity()); /* populate the vector */ - mutator.set(0, 1.55); - mutator.set(2, 2.53); - mutator.set(4, 3.36); - mutator.set(6, 4.82); - mutator.set(8, 5.67); - mutator.set(10, 6.67); - mutator.set(12, 7.87); - mutator.set(14, 8.56); + floatVector.set(0, 1.55); + floatVector.set(2, 2.53); + floatVector.set(4, 3.36); + floatVector.set(6, 4.82); + floatVector.set(8, 5.67); + floatVector.set(10, 6.67); + floatVector.set(12, 7.87); + floatVector.set(14, 8.56); try { - mutator.set(16, 9.53); + floatVector.set(16, 9.53); } catch (IndexOutOfBoundsException ie) { error = true; @@ -431,18 +421,18 @@ public void testFixedType4() { error = false; } - /* check vector contents */ - 
assertEquals(1.55, accessor.get(0), 0); - assertEquals(2.53, accessor.get(2), 0); - assertEquals(3.36, accessor.get(4), 0); - assertEquals(4.82, accessor.get(6), 0); - assertEquals(5.67, accessor.get(8), 0); - assertEquals(6.67, accessor.get(10), 0); - assertEquals(7.87, accessor.get(12), 0); - assertEquals(8.56, accessor.get(14), 0); + /* check floatVector contents */ + assertEquals(1.55, floatVector.get(0), 0); + assertEquals(2.53, floatVector.get(2), 0); + assertEquals(3.36, floatVector.get(4), 0); + assertEquals(4.82, floatVector.get(6), 0); + assertEquals(5.67, floatVector.get(8), 0); + assertEquals(6.67, floatVector.get(10), 0); + assertEquals(7.87, floatVector.get(12), 0); + assertEquals(8.56, floatVector.get(14), 0); try { - accessor.get(16); + floatVector.get(16); } catch (IndexOutOfBoundsException ie) { error = true; @@ -453,21 +443,21 @@ public void testFixedType4() { } /* this should trigger a realloc() */ - mutator.setSafe(16, 9.53); + floatVector.setSafe(16, 9.53); /* underlying buffer should now be able to store double the number of values */ assertEquals(initialCapacity * 2, floatVector.getValueCapacity()); /* vector data should still be intact after realloc */ - assertEquals(1.55, accessor.get(0), 0); - assertEquals(2.53, accessor.get(2), 0); - assertEquals(3.36, accessor.get(4), 0); - assertEquals(4.82, accessor.get(6), 0); - assertEquals(5.67, accessor.get(8), 0); - assertEquals(6.67, accessor.get(10), 0); - assertEquals(7.87, accessor.get(12), 0); - assertEquals(8.56, accessor.get(14), 0); - assertEquals(9.53, accessor.get(16), 0); + assertEquals(1.55, floatVector.get(0), 0); + assertEquals(2.53, floatVector.get(2), 0); + assertEquals(3.36, floatVector.get(4), 0); + assertEquals(4.82, floatVector.get(6), 0); + assertEquals(5.67, floatVector.get(8), 0); + assertEquals(6.67, floatVector.get(10), 0); + assertEquals(7.87, floatVector.get(12), 0); + assertEquals(8.56, floatVector.get(14), 0); + assertEquals(9.53, floatVector.get(16), 0); /* reset the 
vector */ floatVector.reset(); @@ -477,18 +467,16 @@ public void testFixedType4() { /* vector data should be zeroed out */ for(int i = 0; i < (initialCapacity * 2); i++) { - assertEquals("non-zero data not expected at index: " + i, 0, accessor.get(i), 0); + assertEquals("non-zero data not expected at index: " + i, true, floatVector.isNull(i)); } } } - @Test /* NullableUInt4Vector */ + @Test /* UInt4Vector */ public void testNullableFixedType1() { // Create a new value vector for 1024 integers. - try (final NullableUInt4Vector vector = newVector(NullableUInt4Vector.class, EMPTY_SCHEMA_PATH, new ArrowType.Int(32, false), allocator);) { - final NullableUInt4Vector.Mutator mutator = vector.getMutator(); - final NullableUInt4Vector.Accessor accessor = vector.getAccessor(); + try (final UInt4Vector vector = newVector(UInt4Vector.class, EMPTY_SCHEMA_PATH, new ArrowType.Int(32, false), allocator);) { boolean error = false; int initialCapacity = 1024; @@ -500,33 +488,33 @@ public void testNullableFixedType1() { assertEquals(initialCapacity, vector.getValueCapacity()); // Put and set a few values - mutator.set(0, 100); - mutator.set(1, 101); - mutator.set(100, 102); - mutator.set(1022, 103); - mutator.set(1023, 104); + vector.set(0, 100); + vector.set(1, 101); + vector.set(100, 102); + vector.set(1022, 103); + vector.set(1023, 104); /* check vector contents */ - assertEquals(100, accessor.get(0)); - assertEquals(101, accessor.get(1)); - assertEquals(102, accessor.get(100)); - assertEquals(103, accessor.get(1022)); - assertEquals(104, accessor.get(1023)); + assertEquals(100, vector.get(0)); + assertEquals(101, vector.get(1)); + assertEquals(102, vector.get(100)); + assertEquals(103, vector.get(1022)); + assertEquals(104, vector.get(1023)); int val = 0; /* check unset bits/null values */ for (int i = 2, j = 101; i <= 99 || j <= 1021; i++, j++) { if (i <= 99) { - assertTrue(accessor.isNull(i)); + assertTrue(vector.isNull(i)); } if(j <= 1021) { - assertTrue(accessor.isNull(j)); 
+ assertTrue(vector.isNull(j)); } } try { - mutator.set(1024, 10000); + vector.set(1024, 10000); } catch (IndexOutOfBoundsException ie) { error = true; @@ -537,7 +525,7 @@ public void testNullableFixedType1() { } try { - accessor.get(1024); + vector.get(1024); } catch (IndexOutOfBoundsException ie) { error = true; @@ -548,28 +536,28 @@ public void testNullableFixedType1() { } /* should trigger a realloc of the underlying bitvector and valuevector */ - mutator.setSafe(1024, 10000); + vector.setSafe(1024, 10000); /* check new capacity */ assertEquals(initialCapacity * 2, vector.getValueCapacity()); /* vector contents should still be intact after realloc */ - assertEquals(100, accessor.get(0)); - assertEquals(101, accessor.get(1)); - assertEquals(102, accessor.get(100)); - assertEquals(103, accessor.get(1022)); - assertEquals(104, accessor.get(1023)); - assertEquals(10000, accessor.get(1024)); + assertEquals(100, vector.get(0)); + assertEquals(101, vector.get(1)); + assertEquals(102, vector.get(100)); + assertEquals(103, vector.get(1022)); + assertEquals(104, vector.get(1023)); + assertEquals(10000, vector.get(1024)); val = 0; /* check unset bits/null values */ for (int i = 2, j = 101; i < 99 || j < 1021; i++, j++) { if (i <= 99) { - assertTrue(accessor.isNull(i)); + assertTrue(vector.isNull(i)); } if(j <= 1021) { - assertTrue(accessor.isNull(j)); + assertTrue(vector.isNull(j)); } } @@ -581,17 +569,15 @@ public void testNullableFixedType1() { /* vector data should be zeroed out */ for(int i = 0; i < (initialCapacity * 2); i++) { - assertTrue("non-null data not expected at index: " + i, accessor.isNull(i)); + assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); } } } - @Test /* NullableFloat4Vector */ + @Test /* Float4Vector */ public void testNullableFixedType2() { // Create a new value vector for 1024 integers - try (final NullableFloat4Vector vector = newVector(NullableFloat4Vector.class, EMPTY_SCHEMA_PATH, MinorType.FLOAT4, allocator);) { - 
final NullableFloat4Vector.Mutator mutator = vector.getMutator(); - final NullableFloat4Vector.Accessor accessor = vector.getAccessor(); + try (final Float4Vector vector = newVector(Float4Vector.class, EMPTY_SCHEMA_PATH, MinorType.FLOAT4, allocator);) { boolean error = false; int initialCapacity = 16; @@ -603,17 +589,17 @@ public void testNullableFixedType2() { assertEquals(initialCapacity, vector.getValueCapacity()); /* populate the vector */ - mutator.set(0, 100.5f); - mutator.set(2, 201.5f); - mutator.set(4, 300.3f); - mutator.set(6, 423.8f); - mutator.set(8, 555.6f); - mutator.set(10, 66.6f); - mutator.set(12, 78.8f); - mutator.set(14, 89.5f); + vector.set(0, 100.5f); + vector.set(2, 201.5f); + vector.set(4, 300.3f); + vector.set(6, 423.8f); + vector.set(8, 555.6f); + vector.set(10, 66.6f); + vector.set(12, 78.8f); + vector.set(14, 89.5f); try { - mutator.set(16, 90.5f); + vector.set(16, 90.5f); } catch (IndexOutOfBoundsException ie) { error = true; @@ -624,25 +610,25 @@ public void testNullableFixedType2() { } /* check vector contents */ - assertEquals(100.5f, accessor.get(0), 0); - assertTrue(accessor.isNull(1)); - assertEquals(201.5f, accessor.get(2), 0); - assertTrue(accessor.isNull(3)); - assertEquals(300.3f, accessor.get(4), 0); - assertTrue(accessor.isNull(5)); - assertEquals(423.8f, accessor.get(6), 0); - assertTrue(accessor.isNull(7)); - assertEquals(555.6f, accessor.get(8), 0); - assertTrue(accessor.isNull(9)); - assertEquals(66.6f, accessor.get(10), 0); - assertTrue(accessor.isNull(11)); - assertEquals(78.8f, accessor.get(12), 0); - assertTrue(accessor.isNull(13)); - assertEquals(89.5f, accessor.get(14), 0); - assertTrue(accessor.isNull(15)); + assertEquals(100.5f, vector.get(0), 0); + assertTrue(vector.isNull(1)); + assertEquals(201.5f, vector.get(2), 0); + assertTrue(vector.isNull(3)); + assertEquals(300.3f, vector.get(4), 0); + assertTrue(vector.isNull(5)); + assertEquals(423.8f, vector.get(6), 0); + assertTrue(vector.isNull(7)); + 
assertEquals(555.6f, vector.get(8), 0); + assertTrue(vector.isNull(9)); + assertEquals(66.6f, vector.get(10), 0); + assertTrue(vector.isNull(11)); + assertEquals(78.8f, vector.get(12), 0); + assertTrue(vector.isNull(13)); + assertEquals(89.5f, vector.get(14), 0); + assertTrue(vector.isNull(15)); try { - accessor.get(16); + vector.get(16); } catch (IndexOutOfBoundsException ie) { error = true; @@ -653,29 +639,28 @@ public void testNullableFixedType2() { } /* this should trigger a realloc() */ - mutator.setSafe(16, 90.5f); + vector.setSafe(16, 90.5f); /* underlying buffer should now be able to store double the number of values */ assertEquals(initialCapacity * 2, vector.getValueCapacity()); /* vector data should still be intact after realloc */ - assertEquals(100.5f, accessor.get(0), 0); - assertTrue(accessor.isNull(1)); - assertEquals(201.5f, accessor.get(2), 0); - assertTrue(accessor.isNull(3)); - assertEquals(300.3f, accessor.get(4), 0); - assertTrue(accessor.isNull(5)); - assertEquals(423.8f, accessor.get(6), 0); - assertTrue(accessor.isNull(7)); - assertEquals(555.6f, accessor.get(8), 0); - assertTrue(accessor.isNull(9)); - assertEquals(66.6f, accessor.get(10), 0); - assertTrue(accessor.isNull(11)); - assertEquals(78.8f, accessor.get(12), 0); - assertTrue(accessor.isNull(13)); - assertEquals(89.5f, accessor.get(14), 0); - assertTrue(accessor.isNull(15)); - assertEquals(90.5f, accessor.get(16), 0); + assertEquals(100.5f, vector.get(0), 0); + assertTrue(vector.isNull(1)); + assertEquals(201.5f, vector.get(2), 0); + assertTrue(vector.isNull(3)); + assertEquals(300.3f, vector.get(4), 0); + assertTrue(vector.isNull(5)); + assertEquals(423.8f, vector.get(6), 0); + assertTrue(vector.isNull(7)); + assertEquals(555.6f, vector.get(8), 0); + assertTrue(vector.isNull(9)); + assertEquals(66.6f, vector.get(10), 0); + assertTrue(vector.isNull(11)); + assertEquals(78.8f, vector.get(12), 0); + assertTrue(vector.isNull(13)); + assertEquals(89.5f, vector.get(14), 0); + 
assertTrue(vector.isNull(15)); /* reset the vector */ vector.reset(); @@ -685,17 +670,15 @@ public void testNullableFixedType2() { /* vector data should be zeroed out */ for(int i = 0; i < (initialCapacity * 2); i++) { - assertTrue("non-null data not expected at index: " + i, accessor.isNull(i)); + assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); } } } - @Test /* NullableIntVector */ + @Test /* IntVector */ public void testNullableFixedType3() { // Create a new value vector for 1024 integers - try (final NullableIntVector vector = newVector(NullableIntVector.class, EMPTY_SCHEMA_PATH, MinorType.INT, allocator)) { - final NullableIntVector.Mutator mutator = vector.getMutator(); - final NullableIntVector.Accessor accessor = vector.getAccessor(); + try (final IntVector vector = newVector(IntVector.class, EMPTY_SCHEMA_PATH, MinorType.INT, allocator)) { boolean error = false; int initialCapacity = 1024; @@ -706,32 +689,30 @@ public void testNullableFixedType3() { /* underlying buffer should be able to store 16 values */ assertEquals(initialCapacity, vector.getValueCapacity()); - mutator.set(0, 1); - mutator.set(1, 2); - mutator.set(100, 3); - mutator.set(1022, 4); - mutator.set(1023, 5); + vector.set(0, 1); + vector.set(1, 2); + vector.set(100, 3); + vector.set(1022, 4); + vector.set(1023, 5); /* check vector contents */ int j = 1; for(int i = 0; i <= 1023; i++) { if((i >= 2 && i <= 99) || (i >= 101 && i <= 1021)) { - assertTrue("non-null data not expected at index: " + i, accessor.isNull(i)); + assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); } else { - assertFalse("null data not expected at index: " + i, accessor.isNull(i)); - assertEquals("unexpected value at index: " + i, j, accessor.get(i)); + assertFalse("null data not expected at index: " + i, vector.isNull(i)); + assertEquals("unexpected value at index: " + i, j, vector.get(i)); j++; } } - mutator.setValueCount(1024); + vector.setValueCount(1024); Field field = 
vector.getField(); - TypeLayout typeLayout = field.getTypeLayout(); List buffers = vector.getFieldBuffers(); - assertEquals(2, typeLayout.getVectors().size()); assertEquals(2, buffers.size()); ArrowBuf validityVectorBuf = buffers.get(0); @@ -749,7 +730,7 @@ public void testNullableFixedType3() { assertEquals(-64, validityVectorBuf.getByte(127)); // 1022nd and 1023rd bit defined /* this should trigger a realloc() */ - mutator.setSafe(1024, 6); + vector.setSafe(1024, 6); /* underlying buffer should now be able to store double the number of values */ assertEquals(initialCapacity * 2, vector.getValueCapacity()); @@ -758,11 +739,11 @@ public void testNullableFixedType3() { j = 1; for(int i = 0; i < (initialCapacity * 2); i++) { if((i > 1024) || (i >= 2 && i <= 99) || (i >= 101 && i <= 1021)) { - assertTrue("non-null data not expected at index: " + i, accessor.isNull(i)); + assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); } else { - assertFalse("null data not expected at index: " + i, accessor.isNull(i)); - assertEquals("unexpected value at index: " + i, j, accessor.get(i)); + assertFalse("null data not expected at index: " + i, vector.isNull(i)); + assertEquals("unexpected value at index: " + i, j, vector.get(i)); j++; } } @@ -775,13 +756,101 @@ public void testNullableFixedType3() { /* vector data should have been zeroed out */ for(int i = 0; i < (initialCapacity * 2); i++) { - assertTrue("non-null data not expected at index: " + i, accessor.isNull(i)); + assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); } vector.allocateNew(4096); // vector has been erased for(int i = 0; i < 4096; i++) { - assertTrue("non-null data not expected at index: " + i, accessor.isNull(i)); + assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); + } + } + } + + @Test /* IntVector */ + public void testNullableFixedType4() { + try (final IntVector vector = newVector(IntVector.class, EMPTY_SCHEMA_PATH, MinorType.INT, 
allocator)) { + + /* no memory allocation has happened yet */ + assertEquals(0, vector.getValueCapacity()); + + vector.allocateNew(); + int valueCapacity = vector.getValueCapacity(); + assertEquals(vector.INITIAL_VALUE_ALLOCATION, valueCapacity); + + int baseValue = 20000; + + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + vector.set(i, baseValue + i); + } + } + + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + assertFalse("unexpected null value at index: " + i, vector.isNull(i)); + assertEquals("unexpected value at index: " + i, (baseValue + i), vector.get(i)); + } else { + assertTrue("unexpected non-null value at index: " + i, vector.isNull(i)); + } + } + + vector.setSafe(valueCapacity, 20000000); + assertEquals(valueCapacity * 2, vector.getValueCapacity()); + + for (int i = 0; i < vector.getValueCapacity(); i++) { + if (i == valueCapacity) { + assertFalse("unexpected null value at index: " + i, vector.isNull(i)); + assertEquals("unexpected value at index: " + i, 20000000, vector.get(i)); + } else if (i < valueCapacity) { + if ((i & 1) == 1) { + assertFalse("unexpected null value at index: " + i, vector.isNull(i)); + assertEquals("unexpected value at index: " + i, (baseValue + i), vector.get(i)); + } + } else { + assertTrue("unexpected non-null value at index: " + i, vector.isNull(i)); + } + } + + vector.zeroVector(); + + for (int i = 0; i < vector.getValueCapacity(); i+=2) { + vector.set(i, baseValue + i); + } + + for (int i = 0; i < vector.getValueCapacity(); i++) { + if (i%2 == 0) { + assertFalse("unexpected null value at index: " + i, vector.isNull(i)); + assertEquals("unexpected value at index: " + i, (baseValue + i), vector.get(i)); + } else { + assertTrue("unexpected non-null value at index: " + i, vector.isNull(i)); + } + } + + vector.setSafe((valueCapacity * 2) + 1000, 400000000); + assertEquals(valueCapacity * 4, vector.getValueCapacity()); + + for (int i = 0; i < vector.getValueCapacity(); i++) { + if (i == 
(valueCapacity*2 + 1000)) { + assertFalse("unexpected null value at index: " + i, vector.isNull(i)); + assertEquals("unexpected value at index: " + i, 400000000, vector.get(i)); + } else if (i < valueCapacity*2 && (i%2) == 0) { + assertFalse("unexpected null value at index: " + i, vector.isNull(i)); + assertEquals("unexpected value at index: " + i, baseValue + i, vector.get(i)); + } else { + assertTrue("unexpected non-null value at index: " + i, vector.isNull(i)); + } + } + + /* reset the vector */ + vector.reset(); + + /* capacity shouldn't change after reset */ + assertEquals(valueCapacity * 4, vector.getValueCapacity()); + + /* vector data should be zeroed out */ + for(int i = 0; i < (valueCapacity * 4); i++) { + assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); } } } @@ -791,8 +860,8 @@ public void testNullableFixedType3() { * * Covered types as of now * - * -- NullableVarCharVector - * -- NullableVarBinaryVector + * -- VarCharVector + * -- VarBinaryVector * * TODO: * @@ -800,37 +869,35 @@ public void testNullableFixedType3() { * -- VarBinaryVector */ - @Test /* NullableVarCharVector */ + @Test /* VarCharVector */ public void testNullableVarType1() { // Create a new value vector for 1024 integers. 
- try (final NullableVarCharVector vector = newNullableVarCharVector(EMPTY_SCHEMA_PATH, allocator)) { - final NullableVarCharVector.Mutator m = vector.getMutator(); + try (final VarCharVector vector = newVarCharVector(EMPTY_SCHEMA_PATH, allocator)) { vector.allocateNew(1024 * 10, 1024); - m.set(0, STR1); - m.set(1, STR2); - m.set(2, STR3); - m.setSafe(3, STR3, 1, STR3.length - 1); - m.setSafe(4, STR3, 2, STR3.length - 2); + vector.set(0, STR1); + vector.set(1, STR2); + vector.set(2, STR3); + vector.setSafe(3, STR3, 1, STR3.length - 1); + vector.setSafe(4, STR3, 2, STR3.length - 2); ByteBuffer STR3ByteBuffer = ByteBuffer.wrap(STR3); - m.setSafe(5, STR3ByteBuffer, 1, STR3.length - 1); - m.setSafe(6, STR3ByteBuffer, 2, STR3.length - 2); + vector.setSafe(5, STR3ByteBuffer, 1, STR3.length - 1); + vector.setSafe(6, STR3ByteBuffer, 2, STR3.length - 2); // Check the sample strings. - final NullableVarCharVector.Accessor accessor = vector.getAccessor(); - assertArrayEquals(STR1, accessor.get(0)); - assertArrayEquals(STR2, accessor.get(1)); - assertArrayEquals(STR3, accessor.get(2)); - assertArrayEquals(Arrays.copyOfRange(STR3, 1, STR3.length), accessor.get(3)); - assertArrayEquals(Arrays.copyOfRange(STR3, 2, STR3.length), accessor.get(4)); - assertArrayEquals(Arrays.copyOfRange(STR3, 1, STR3.length), accessor.get(5)); - assertArrayEquals(Arrays.copyOfRange(STR3, 2, STR3.length), accessor.get(6)); + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(Arrays.copyOfRange(STR3, 1, STR3.length), vector.get(3)); + assertArrayEquals(Arrays.copyOfRange(STR3, 2, STR3.length), vector.get(4)); + assertArrayEquals(Arrays.copyOfRange(STR3, 1, STR3.length), vector.get(5)); + assertArrayEquals(Arrays.copyOfRange(STR3, 2, STR3.length), vector.get(6)); // Ensure null value throws. 
boolean b = false; try { - vector.getAccessor().get(7); + vector.get(7); } catch (IllegalStateException e) { b = true; } finally { @@ -839,37 +906,35 @@ public void testNullableVarType1() { } } - @Test /* NullableVarBinaryVector */ + @Test /* VarBinaryVector */ public void testNullableVarType2() { // Create a new value vector for 1024 integers. - try (final NullableVarBinaryVector vector = newNullableVarBinaryVector(EMPTY_SCHEMA_PATH, allocator)) { - final NullableVarBinaryVector.Mutator m = vector.getMutator(); + try (final VarBinaryVector vector = newVarBinaryVector(EMPTY_SCHEMA_PATH, allocator)) { vector.allocateNew(1024 * 10, 1024); - m.set(0, STR1); - m.set(1, STR2); - m.set(2, STR3); - m.setSafe(3, STR3, 1, STR3.length - 1); - m.setSafe(4, STR3, 2, STR3.length - 2); + vector.set(0, STR1); + vector.set(1, STR2); + vector.set(2, STR3); + vector.setSafe(3, STR3, 1, STR3.length - 1); + vector.setSafe(4, STR3, 2, STR3.length - 2); ByteBuffer STR3ByteBuffer = ByteBuffer.wrap(STR3); - m.setSafe(5, STR3ByteBuffer, 1, STR3.length - 1); - m.setSafe(6, STR3ByteBuffer, 2, STR3.length - 2); + vector.setSafe(5, STR3ByteBuffer, 1, STR3.length - 1); + vector.setSafe(6, STR3ByteBuffer, 2, STR3.length - 2); // Check the sample strings. 
- final NullableVarBinaryVector.Accessor accessor = vector.getAccessor(); - assertArrayEquals(STR1, accessor.get(0)); - assertArrayEquals(STR2, accessor.get(1)); - assertArrayEquals(STR3, accessor.get(2)); - assertArrayEquals(Arrays.copyOfRange(STR3, 1, STR3.length), accessor.get(3)); - assertArrayEquals(Arrays.copyOfRange(STR3, 2, STR3.length), accessor.get(4)); - assertArrayEquals(Arrays.copyOfRange(STR3, 1, STR3.length), accessor.get(5)); - assertArrayEquals(Arrays.copyOfRange(STR3, 2, STR3.length), accessor.get(6)); + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(Arrays.copyOfRange(STR3, 1, STR3.length), vector.get(3)); + assertArrayEquals(Arrays.copyOfRange(STR3, 2, STR3.length), vector.get(4)); + assertArrayEquals(Arrays.copyOfRange(STR3, 1, STR3.length), vector.get(5)); + assertArrayEquals(Arrays.copyOfRange(STR3, 2, STR3.length), vector.get(6)); // Ensure null value throws. boolean b = false; try { - vector.getAccessor().get(7); + vector.get(7); } catch (IllegalStateException e) { b = true; } finally { @@ -896,8 +961,6 @@ public void testNullableVarType2() { @Test /* Float8Vector */ public void testReallocAfterVectorTransfer1() { try (final Float8Vector vector = new Float8Vector(EMPTY_SCHEMA_PATH, allocator)) { - final Float8Vector.Mutator mutator = vector.getMutator(); - final Float8Vector.Accessor accessor = vector.getAccessor(); final int initialDefaultCapacity = 4096; boolean error = false; @@ -909,7 +972,7 @@ public void testReallocAfterVectorTransfer1() { double baseValue = 100.375; for (int i = 0; i < initialDefaultCapacity; i++) { - mutator.setSafe(i, baseValue + (double)i); + vector.setSafe(i, baseValue + (double)i); } /* the above setSafe calls should not have triggered a realloc as @@ -918,33 +981,33 @@ public void testReallocAfterVectorTransfer1() { assertEquals(initialDefaultCapacity, vector.getValueCapacity()); for (int i = 0; i < 
initialDefaultCapacity; i++) { - double value = accessor.get(i); + double value = vector.get(i); assertEquals(baseValue + (double)i, value, 0); } /* this should trigger a realloc */ - mutator.setSafe(initialDefaultCapacity, baseValue + (double)initialDefaultCapacity); + vector.setSafe(initialDefaultCapacity, baseValue + (double)initialDefaultCapacity); assertEquals(initialDefaultCapacity * 2, vector.getValueCapacity()); for (int i = initialDefaultCapacity + 1; i < (initialDefaultCapacity * 2); i++) { - mutator.setSafe(i, baseValue + (double)i); + vector.setSafe(i, baseValue + (double)i); } for (int i = 0; i < (initialDefaultCapacity * 2); i++) { - double value = accessor.get(i); + double value = vector.get(i); assertEquals(baseValue + (double)i, value, 0); } /* this should trigger a realloc */ - mutator.setSafe(initialDefaultCapacity * 2, baseValue + (double)(initialDefaultCapacity * 2)); + vector.setSafe(initialDefaultCapacity * 2, baseValue + (double)(initialDefaultCapacity * 2)); assertEquals(initialDefaultCapacity * 4, vector.getValueCapacity()); for (int i = (initialDefaultCapacity * 2) + 1; i < (initialDefaultCapacity * 4); i++) { - mutator.setSafe(i, baseValue + (double)i); + vector.setSafe(i, baseValue + (double)i); } for (int i = 0; i < (initialDefaultCapacity * 4); i++) { - double value = accessor.get(i); + double value = vector.get(i); assertEquals(baseValue + (double)i, value, 0); } @@ -961,15 +1024,12 @@ public void testReallocAfterVectorTransfer1() { toVector.reAlloc(); assertEquals(initialDefaultCapacity * 8, toVector.getValueCapacity()); - final Float8Vector.Accessor toAccessor = toVector.getAccessor(); - for (int i = 0; i < (initialDefaultCapacity * 8); i++) { - double value = toAccessor.get(i); if (i < (initialDefaultCapacity * 4)) { - assertEquals(baseValue + (double)i, value, 0); + assertEquals(baseValue + (double)i, toVector.get(i), 0); } else { - assertEquals(0, value, 0); + assertTrue(toVector.isNull(i)); } } @@ -977,11 +1037,9 @@ public void 
testReallocAfterVectorTransfer1() { } } - @Test /* NullableFloat8Vector */ + @Test /* Float8Vector */ public void testReallocAfterVectorTransfer2() { - try (final NullableFloat8Vector vector = new NullableFloat8Vector(EMPTY_SCHEMA_PATH, allocator)) { - final NullableFloat8Vector.Mutator mutator = vector.getMutator(); - final NullableFloat8Vector.Accessor accessor = vector.getAccessor(); + try (final Float8Vector vector = new Float8Vector(EMPTY_SCHEMA_PATH, allocator)) { final int initialDefaultCapacity = 4096; boolean error = false; @@ -992,7 +1050,7 @@ public void testReallocAfterVectorTransfer2() { double baseValue = 100.375; for (int i = 0; i < initialDefaultCapacity; i++) { - mutator.setSafe(i, baseValue + (double)i); + vector.setSafe(i, baseValue + (double)i); } /* the above setSafe calls should not have triggered a realloc as @@ -1001,33 +1059,33 @@ public void testReallocAfterVectorTransfer2() { assertEquals(initialDefaultCapacity, vector.getValueCapacity()); for (int i = 0; i < initialDefaultCapacity; i++) { - double value = accessor.get(i); + double value = vector.get(i); assertEquals(baseValue + (double)i, value, 0); } /* this should trigger a realloc */ - mutator.setSafe(initialDefaultCapacity, baseValue + (double)initialDefaultCapacity); + vector.setSafe(initialDefaultCapacity, baseValue + (double)initialDefaultCapacity); assertEquals(initialDefaultCapacity * 2, vector.getValueCapacity()); for (int i = initialDefaultCapacity + 1; i < (initialDefaultCapacity * 2); i++) { - mutator.setSafe(i, baseValue + (double)i); + vector.setSafe(i, baseValue + (double)i); } for (int i = 0; i < (initialDefaultCapacity * 2); i++) { - double value = accessor.get(i); + double value = vector.get(i); assertEquals(baseValue + (double)i, value, 0); } /* this should trigger a realloc */ - mutator.setSafe(initialDefaultCapacity * 2, baseValue + (double)(initialDefaultCapacity * 2)); + vector.setSafe(initialDefaultCapacity * 2, baseValue + (double)(initialDefaultCapacity * 2)); 
assertEquals(initialDefaultCapacity * 4, vector.getValueCapacity()); for (int i = (initialDefaultCapacity * 2) + 1; i < (initialDefaultCapacity * 4); i++) { - mutator.setSafe(i, baseValue + (double)i); + vector.setSafe(i, baseValue + (double)i); } for (int i = 0; i < (initialDefaultCapacity * 4); i++) { - double value = accessor.get(i); + double value = vector.get(i); assertEquals(baseValue + (double)i, value, 0); } @@ -1038,13 +1096,12 @@ public void testReallocAfterVectorTransfer2() { TransferPair transferPair = vector.getTransferPair(allocator); transferPair.transfer(); - NullableFloat8Vector toVector = (NullableFloat8Vector)transferPair.getTo(); - final NullableFloat8Vector.Accessor toAccessor = toVector.getAccessor(); + Float8Vector toVector = (Float8Vector)transferPair.getTo(); /* check toVector contents before realloc */ for (int i = 0; i < (initialDefaultCapacity * 4); i++) { - assertFalse("unexpected null value at index: " + i, toAccessor.isNull(i)); - double value = toAccessor.get(i); + assertFalse("unexpected null value at index: " + i, toVector.isNull(i)); + double value = toVector.get(i); assertEquals("unexpected value at index: " + i, baseValue + (double)i, value, 0); } @@ -1054,12 +1111,12 @@ public void testReallocAfterVectorTransfer2() { for (int i = 0; i < (initialDefaultCapacity * 8); i++) { if (i < (initialDefaultCapacity * 4)) { - assertFalse("unexpected null value at index: " + i, toAccessor.isNull(i)); - double value = toAccessor.get(i); + assertFalse("unexpected null value at index: " + i, toVector.isNull(i)); + double value = toVector.get(i); assertEquals("unexpected value at index: " + i, baseValue + (double)i, value, 0); } else { - assertTrue("unexpected non-null value at index: " + i, toAccessor.isNull(i)); + assertTrue("unexpected non-null value at index: " + i, toVector.isNull(i)); } } @@ -1067,46 +1124,45 @@ public void testReallocAfterVectorTransfer2() { } } - @Test /* NullableVarCharVector */ + @Test /* VarCharVector */ public void 
testReallocAfterVectorTransfer3() { - try (final NullableVarCharVector vector = new NullableVarCharVector(EMPTY_SCHEMA_PATH, allocator)) { - final NullableVarCharVector.Mutator mutator = vector.getMutator(); - final NullableVarCharVector.Accessor accessor = vector.getAccessor(); - + try (final VarCharVector vector = new VarCharVector(EMPTY_SCHEMA_PATH, allocator)) { /* 4096 values with 10 byte per record */ vector.allocateNew(4096 * 10, 4096); int valueCapacity = vector.getValueCapacity(); + assertEquals(4096, valueCapacity); /* populate the vector */ for (int i = 0; i < valueCapacity; i++) { if ((i & 1) == 1) { - mutator.set(i, STR1); + vector.set(i, STR1); } else { - mutator.set(i, STR2); + vector.set(i, STR2); } } /* Check the vector output */ for (int i = 0; i < valueCapacity; i++) { if ((i & 1) == 1) { - assertArrayEquals(STR1, accessor.get(i)); + assertArrayEquals(STR1, vector.get(i)); } else { - assertArrayEquals(STR2, accessor.get(i)); + assertArrayEquals(STR2, vector.get(i)); } } /* trigger first realloc */ - mutator.setSafe(valueCapacity, STR2, 0, STR2.length); + vector.setSafe(valueCapacity, STR2, 0, STR2.length); + assertEquals(valueCapacity * 2, vector.getValueCapacity()); /* populate the remaining vector */ for (int i = valueCapacity; i < vector.getValueCapacity(); i++) { if ((i & 1) == 1) { - mutator.set(i, STR1); + vector.set(i, STR1); } else { - mutator.set(i, STR2); + vector.set(i, STR2); } } @@ -1114,23 +1170,24 @@ public void testReallocAfterVectorTransfer3() { valueCapacity = vector.getValueCapacity(); for (int i = 0; i < valueCapacity; i++) { if ((i & 1) == 1) { - assertArrayEquals(STR1, accessor.get(i)); + assertArrayEquals(STR1, vector.get(i)); } else { - assertArrayEquals(STR2, accessor.get(i)); + assertArrayEquals(STR2, vector.get(i)); } } /* trigger second realloc */ - mutator.setSafe(valueCapacity + 10, STR2, 0, STR2.length); + vector.setSafe(valueCapacity + 10, STR2, 0, STR2.length); + assertEquals(valueCapacity * 2, 
vector.getValueCapacity()); /* populate the remaining vector */ for (int i = valueCapacity; i < vector.getValueCapacity(); i++) { if ((i & 1) == 1) { - mutator.set(i, STR1); + vector.set(i, STR1); } else { - mutator.set(i, STR2); + vector.set(i, STR2); } } @@ -1138,10 +1195,10 @@ public void testReallocAfterVectorTransfer3() { valueCapacity = vector.getValueCapacity(); for (int i = 0; i < valueCapacity; i++) { if ((i & 1) == 1) { - assertArrayEquals(STR1, accessor.get(i)); + assertArrayEquals(STR1, vector.get(i)); } else { - assertArrayEquals(STR2, accessor.get(i)); + assertArrayEquals(STR2, vector.get(i)); } } @@ -1151,134 +1208,328 @@ public void testReallocAfterVectorTransfer3() { TransferPair transferPair = vector.getTransferPair(allocator); transferPair.transfer(); - NullableVarCharVector toVector = (NullableVarCharVector)transferPair.getTo(); - NullableVarCharVector.Mutator toMutator = toVector.getMutator(); - NullableVarCharVector.Accessor toAccessor = toVector.getAccessor(); - + VarCharVector toVector = (VarCharVector)transferPair.getTo(); valueCapacity = toVector.getValueCapacity(); - /* trigger a realloc of this toVector */ - toMutator.setSafe(valueCapacity + 10, STR2, 0, STR2.length); + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 1) { + assertArrayEquals(STR1, toVector.get(i)); + } + else { + assertArrayEquals(STR2, toVector.get(i)); + } + } + + toVector.close(); + } + } + + @Test /* IntVector */ + public void testReallocAfterVectorTransfer4() { + try (final IntVector vector = new IntVector(EMPTY_SCHEMA_PATH, allocator)) { + + /* 4096 values */ + vector.allocateNew(4096); + int valueCapacity = vector.getValueCapacity(); + assertEquals(4096, valueCapacity); + + /* populate the vector */ + int baseValue = 1000; + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 0) { + vector.set(i, 1000 + i); + } + } + + /* Check the vector output */ + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 0) { + assertEquals(1000 + i, 
vector.get(i)); + } + else { + assertTrue(vector.isNull(i)); + } + } + + /* trigger first realloc */ + vector.setSafe(valueCapacity, 10000000); + assertEquals(valueCapacity * 2, vector.getValueCapacity()); + + /* populate the remaining vector */ + for (int i = valueCapacity; i < vector.getValueCapacity(); i++) { + if ((i & 1) == 0) { + vector.set(i, 1000 + i); + } + } + + /* Check the vector output */ + valueCapacity = vector.getValueCapacity(); + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 0) { + assertEquals(1000 + i, vector.get(i)); + } + else { + assertTrue(vector.isNull(i)); + } + } + + /* trigger second realloc */ + vector.setSafe(valueCapacity, 10000000); + assertEquals(valueCapacity * 2, vector.getValueCapacity()); + + /* populate the remaining vector */ + for (int i = valueCapacity; i < vector.getValueCapacity(); i++) { + if ((i & 1) == 0) { + vector.set(i, 1000 + i); + } + } + + /* Check the vector output */ + valueCapacity = vector.getValueCapacity(); + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 0) { + assertEquals(1000 + i, vector.get(i)); + } + else { + assertTrue(vector.isNull(i)); + } + } + + /* we are potentially working with 4x the size of vector buffer + * that we initially started with. Now let's transfer the vector. + */ + + TransferPair transferPair = vector.getTransferPair(allocator); + transferPair.transfer(); + IntVector toVector = (IntVector)transferPair.getTo(); + /* value capacity of source and target vectors should be same after + * the transfer. 
+ */ + assertEquals(valueCapacity, toVector.getValueCapacity()); + + for (int i = 0; i < valueCapacity; i++) { + if ((i & 1) == 0) { + assertEquals(1000 + i, toVector.get(i)); + } + else { + assertTrue(toVector.isNull(i)); + } + } toVector.close(); } } @Test - public void testReAllocNullableFixedWidthVector() { + public void testReAllocFixedWidthVector() { // Create a new value vector for 1024 integers - try (final NullableFloat4Vector vector = newVector(NullableFloat4Vector.class, EMPTY_SCHEMA_PATH, MinorType.FLOAT4, allocator)) { - final NullableFloat4Vector.Mutator m = vector.getMutator(); + try (final Float4Vector vector = newVector(Float4Vector.class, EMPTY_SCHEMA_PATH, MinorType.FLOAT4, allocator)) { vector.allocateNew(1024); assertEquals(1024, vector.getValueCapacity()); // Put values in indexes that fall within the initial allocation - m.setSafe(0, 100.1f); - m.setSafe(100, 102.3f); - m.setSafe(1023, 104.5f); + vector.setSafe(0, 100.1f); + vector.setSafe(100, 102.3f); + vector.setSafe(1023, 104.5f); // Now try to put values in space that falls beyond the initial allocation - m.setSafe(2000, 105.5f); + vector.setSafe(2000, 105.5f); // Check valueCapacity is more than initial allocation assertEquals(1024 * 2, vector.getValueCapacity()); - final NullableFloat4Vector.Accessor accessor = vector.getAccessor(); - assertEquals(100.1f, accessor.get(0), 0); - assertEquals(102.3f, accessor.get(100), 0); - assertEquals(104.5f, accessor.get(1023), 0); - assertEquals(105.5f, accessor.get(2000), 0); + assertEquals(100.1f, vector.get(0), 0); + assertEquals(102.3f, vector.get(100), 0); + assertEquals(104.5f, vector.get(1023), 0); + assertEquals(105.5f, vector.get(2000), 0); - // Set the valueCount to be more than valueCapacity of current allocation. This is possible for NullableValueVectors + // Set the valueCount to be more than valueCapacity of current allocation. 
This is possible for ValueVectors // as we don't call setSafe for null values, but we do call setValueCount when all values are inserted into the // vector - m.setValueCount(vector.getValueCapacity() + 200); + vector.setValueCount(vector.getValueCapacity() + 200); } } @Test - public void testReAllocNullableVariableWidthVector() { - // Create a new value vector for 1024 integers - try (final NullableVarCharVector vector = newVector(NullableVarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { - final NullableVarCharVector.Mutator m = vector.getMutator(); + public void testReAllocVariableWidthVector() { + try (final VarCharVector vector = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { vector.allocateNew(); int initialCapacity = vector.getValueCapacity(); + assertEquals(4095, initialCapacity); - // Put values in indexes that fall within the initial allocation - m.setSafe(0, STR1, 0, STR1.length); - m.setSafe(initialCapacity - 1, STR2, 0, STR2.length); + /* Put values in indexes that fall within the initial allocation */ + vector.setSafe(0, STR1, 0, STR1.length); + vector.setSafe(initialCapacity - 1, STR2, 0, STR2.length); - // Now try to put values in space that falls beyond the initial allocation - m.setSafe(initialCapacity + 200, STR3, 0, STR3.length); + /* the above set calls should NOT have triggered a realloc */ + initialCapacity = vector.getValueCapacity(); + assertEquals(4095, initialCapacity); - // Check valueCapacity is more than initial allocation - assertEquals((initialCapacity + 1) * 2 - 1, vector.getValueCapacity()); + /* Now try to put values in space that falls beyond the initial allocation */ + vector.setSafe(initialCapacity + 200, STR3, 0, STR3.length); - final NullableVarCharVector.Accessor accessor = vector.getAccessor(); - assertArrayEquals(STR1, accessor.get(0)); - assertArrayEquals(STR2, accessor.get(initialCapacity - 1)); - assertArrayEquals(STR3, accessor.get(initialCapacity + 200)); + /* 
Check valueCapacity is more than initial allocation */ + assertEquals(((initialCapacity + 1) * 2) - 1, vector.getValueCapacity()); - // Set the valueCount to be more than valueCapacity of current allocation. This is possible for NullableValueVectors + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(initialCapacity - 1)); + assertArrayEquals(STR3, vector.get(initialCapacity + 200)); + + // Set the valueCount to be more than valueCapacity of current allocation. This is possible for ValueVectors // as we don't call setSafe for null values, but we do call setValueCount when the current batch is processed. - m.setValueCount(vector.getValueCapacity() + 200); + vector.setValueCount(vector.getValueCapacity() + 200); } } @Test public void testFillEmptiesNotOverfill() { - try (final NullableVarCharVector vector = newVector(NullableVarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + try (final VarCharVector vector = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { vector.allocateNew(); - vector.getMutator().setSafe(4094, "hello".getBytes(), 0, 5); - vector.getMutator().setValueCount(4095); + int initialCapacity = vector.getValueCapacity(); + assertEquals(4095, initialCapacity); + + vector.setSafe(4094, "hello".getBytes(), 0, 5); + /* the above set method should NOT have trigerred a realloc */ + initialCapacity = vector.getValueCapacity(); + assertEquals(4095, initialCapacity); - assertEquals(4096 * 4, vector.getFieldBuffers().get(1).capacity()); + vector.setValueCount(4095); + assertEquals(4096 * vector.OFFSET_WIDTH, vector.getFieldBuffers().get(1).capacity()); + initialCapacity = vector.getValueCapacity(); + assertEquals(4095, initialCapacity); } } @Test public void testCopyFromWithNulls() { - try (final NullableVarCharVector vector = newVector(NullableVarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator); - final NullableVarCharVector vector2 = 
newVector(NullableVarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + try (final VarCharVector vector = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator); + final VarCharVector vector2 = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + vector.allocateNew(); + int capacity = vector.getValueCapacity(); + assertEquals(4095, capacity); for (int i = 0; i < 4095; i++) { if (i % 3 == 0) { continue; } byte[] b = Integer.toString(i).getBytes(); - vector.getMutator().setSafe(i, b, 0, b.length); + vector.setSafe(i, b, 0, b.length); } - vector.getMutator().setValueCount(4095); + /* NO reAlloc() should have happened in setSafe() */ + capacity = vector.getValueCapacity(); + assertEquals(4095, capacity); + + vector.setValueCount(4095); + + for (int i = 0; i < 4095; i++) { + if (i % 3 == 0) { + assertNull(vector.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, Integer.toString(i), vector.getObject(i).toString()); + } + } vector2.allocateNew(); + capacity = vector2.getValueCapacity(); + assertEquals(4095, capacity); for (int i = 0; i < 4095; i++) { vector2.copyFromSafe(i, i, vector); + if (i % 3 == 0) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, Integer.toString(i), vector2.getObject(i).toString()); + } } - vector2.getMutator().setValueCount(4095); + /* NO reAlloc() should have happened in copyFrom */ + capacity = vector2.getValueCapacity(); + assertEquals(4095, capacity); + + vector2.setValueCount(4095); for (int i = 0; i < 4095; i++) { if (i % 3 == 0) { - assertNull(vector2.getAccessor().getObject(i)); + assertNull(vector2.getObject(i)); } else { - assertEquals(Integer.toString(i), vector2.getAccessor().getObject(i).toString()); + assertEquals("unexpected value at index: " + i, Integer.toString(i), vector2.getObject(i).toString()); } } } } @Test - public void testSetLastSetUsage() { - try (final 
NullableVarCharVector vector = new NullableVarCharVector("myvector", allocator)) { + public void testCopyFromWithNulls1() { + try (final VarCharVector vector = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator); + final VarCharVector vector2 = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { - final NullableVarCharVector.Mutator mutator = vector.getMutator(); + vector.allocateNew(); + int capacity = vector.getValueCapacity(); + assertEquals(4095, capacity); + for (int i = 0; i < 4095; i++) { + if (i % 3 == 0) { + continue; + } + byte[] b = Integer.toString(i).getBytes(); + vector.setSafe(i, b, 0, b.length); + } + + /* NO reAlloc() should have happened in setSafe() */ + capacity = vector.getValueCapacity(); + assertEquals(4095, capacity); + + vector.setValueCount(4095); + + for (int i = 0; i < 4095; i++) { + if (i % 3 == 0) { + assertNull(vector.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, Integer.toString(i), vector.getObject(i).toString()); + } + } + + /* set lesser initial capacity than actually needed + * to trigger reallocs in copyFromSafe() + */ + vector2.allocateNew(1024 * 10, 1024); + + capacity = vector2.getValueCapacity(); + assertEquals(1024, capacity); + + for (int i = 0; i < 4095; i++) { + vector2.copyFromSafe(i, i, vector); + if (i % 3 == 0) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, Integer.toString(i), vector2.getObject(i).toString()); + } + } + + /* 2 reAllocs should have happened in copyFromSafe() */ + capacity = vector2.getValueCapacity(); + assertEquals(4096, capacity); + + vector2.setValueCount(4095); + + for (int i = 0; i < 4095; i++) { + if (i % 3 == 0) { + assertNull(vector2.getObject(i)); + } else { + assertEquals("unexpected value at index: " + i, Integer.toString(i), vector2.getObject(i).toString()); + } + } + } + } + + @Test + public void testSetLastSetUsage() { + try (final VarCharVector 
vector = new VarCharVector("myvector", allocator)) { vector.allocateNew(1024 * 10, 1024); setBytes(0, STR1, vector); @@ -1289,62 +1540,128 @@ public void testSetLastSetUsage() { setBytes(5, STR6, vector); /* Check current lastSet */ - assertEquals(Integer.toString(-1), Integer.toString(mutator.getLastSet())); + assertEquals(Integer.toString(-1), Integer.toString(vector.getLastSet())); /* Check the vector output */ - final NullableVarCharVector.Accessor accessor = vector.getAccessor(); - assertArrayEquals(STR1, accessor.get(0)); - assertArrayEquals(STR2, accessor.get(1)); - assertArrayEquals(STR3, accessor.get(2)); - assertArrayEquals(STR4, accessor.get(3)); - assertArrayEquals(STR5, accessor.get(4)); - assertArrayEquals(STR6, accessor.get(5)); + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); /* * If we don't do setLastSe(5) before setValueCount(), then the latter will corrupt * the value vector by filling in all positions [0,valuecount-1] will empty byte arrays. * Run the test by commenting out next line and we should see incorrect vector output. 
*/ - mutator.setLastSet(5); - mutator.setValueCount(20); + vector.setLastSet(5); + vector.setValueCount(20); + + /* Check current lastSet */ + assertEquals(Integer.toString(19), Integer.toString(vector.getLastSet())); /* Check the vector output again */ - assertArrayEquals(STR1, accessor.get(0)); - assertArrayEquals(STR2, accessor.get(1)); - assertArrayEquals(STR3, accessor.get(2)); - assertArrayEquals(STR4, accessor.get(3)); - assertArrayEquals(STR5, accessor.get(4)); - assertArrayEquals(STR6, accessor.get(5)); + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(6))); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(7))); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(8))); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(9))); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(10))); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(11))); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(12))); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(13))); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(14))); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(15))); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(16))); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(17))); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(18))); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(19))); + + /* Check offsets */ + assertEquals(Integer.toString(0), + 
Integer.toString(vector.offsetBuffer.getInt(0 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(6), + Integer.toString(vector.offsetBuffer.getInt(1 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(16), + Integer.toString(vector.offsetBuffer.getInt(2 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(21), + Integer.toString(vector.offsetBuffer.getInt(3 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(30), + Integer.toString(vector.offsetBuffer.getInt(4 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(34), + Integer.toString(vector.offsetBuffer.getInt(5 * vector.OFFSET_WIDTH))); + + assertEquals(Integer.toString(40), + Integer.toString(vector.offsetBuffer.getInt(6 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(40), + Integer.toString(vector.offsetBuffer.getInt(7 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(40), + Integer.toString(vector.offsetBuffer.getInt(8 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(40), + Integer.toString(vector.offsetBuffer.getInt(9 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(40), + Integer.toString(vector.offsetBuffer.getInt(10 * vector.OFFSET_WIDTH))); + + assertEquals(Integer.toString(40), + Integer.toString(vector.offsetBuffer.getInt(11 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(40), + Integer.toString(vector.offsetBuffer.getInt(12 * vector.OFFSET_WIDTH))); + + assertEquals(Integer.toString(40), + Integer.toString(vector.offsetBuffer.getInt(13 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(40), + Integer.toString(vector.offsetBuffer.getInt(14 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(40), + Integer.toString(vector.offsetBuffer.getInt(15 * vector.OFFSET_WIDTH))); + + assertEquals(Integer.toString(40), + Integer.toString(vector.offsetBuffer.getInt(16 * vector.OFFSET_WIDTH))); + + assertEquals(Integer.toString(40), + Integer.toString(vector.offsetBuffer.getInt(17 * vector.OFFSET_WIDTH))); + 
assertEquals(Integer.toString(40), + Integer.toString(vector.offsetBuffer.getInt(18 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(40), + Integer.toString(vector.offsetBuffer.getInt(19 * vector.OFFSET_WIDTH))); + + vector.set(19, STR6); + assertArrayEquals(STR6, vector.get(19)); + assertEquals(Integer.toString(40), + Integer.toString(vector.offsetBuffer.getInt(19 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(46), + Integer.toString(vector.offsetBuffer.getInt(20 * vector.OFFSET_WIDTH))); } } @Test public void testVectorLoadUnload() { - try (final NullableVarCharVector vector1 = new NullableVarCharVector("myvector", allocator)) { - - final NullableVarCharVector.Mutator mutator1 = vector1.getMutator(); - + try (final VarCharVector vector1 = new VarCharVector("myvector", allocator)) { vector1.allocateNew(1024 * 10, 1024); - mutator1.set(0, STR1); - mutator1.set(1, STR2); - mutator1.set(2, STR3); - mutator1.set(3, STR4); - mutator1.set(4, STR5); - mutator1.set(5, STR6); - assertEquals(Integer.toString(5), Integer.toString(mutator1.getLastSet())); - mutator1.setValueCount(15); - assertEquals(Integer.toString(14), Integer.toString(mutator1.getLastSet())); + vector1.set(0, STR1); + vector1.set(1, STR2); + vector1.set(2, STR3); + vector1.set(3, STR4); + vector1.set(4, STR5); + vector1.set(5, STR6); + assertEquals(Integer.toString(5), Integer.toString(vector1.getLastSet())); + vector1.setValueCount(15); + assertEquals(Integer.toString(14), Integer.toString(vector1.getLastSet())); /* Check the vector output */ - final NullableVarCharVector.Accessor accessor1 = vector1.getAccessor(); - assertArrayEquals(STR1, accessor1.get(0)); - assertArrayEquals(STR2, accessor1.get(1)); - assertArrayEquals(STR3, accessor1.get(2)); - assertArrayEquals(STR4, accessor1.get(3)); - assertArrayEquals(STR5, accessor1.get(4)); - assertArrayEquals(STR6, accessor1.get(5)); + assertArrayEquals(STR1, vector1.get(0)); + assertArrayEquals(STR2, vector1.get(1)); + 
assertArrayEquals(STR3, vector1.get(2)); + assertArrayEquals(STR4, vector1.get(3)); + assertArrayEquals(STR5, vector1.get(4)); + assertArrayEquals(STR6, vector1.get(5)); Field field = vector1.getField(); String fieldName = field.getName(); @@ -1357,7 +1674,7 @@ public void testVectorLoadUnload() { Schema schema = new Schema(fields); - VectorSchemaRoot schemaRoot1 = new VectorSchemaRoot(schema, fieldVectors, accessor1.getValueCount()); + VectorSchemaRoot schemaRoot1 = new VectorSchemaRoot(schema, fieldVectors, vector1.getValueCount()); VectorUnloader vectorUnloader = new VectorUnloader(schemaRoot1); try ( @@ -1369,34 +1686,29 @@ public void testVectorLoadUnload() { VectorLoader vectorLoader = new VectorLoader(schemaRoot2); vectorLoader.load(recordBatch); - NullableVarCharVector vector2 = (NullableVarCharVector) schemaRoot2.getVector(fieldName); - NullableVarCharVector.Mutator mutator2 = vector2.getMutator(); - + VarCharVector vector2 = (VarCharVector) schemaRoot2.getVector(fieldName); /* * lastSet would have internally been set by VectorLoader.load() when it invokes * loadFieldBuffers. 
*/ - assertEquals(Integer.toString(14), Integer.toString(mutator2.getLastSet())); - mutator2.setValueCount(25); - assertEquals(Integer.toString(24), Integer.toString(mutator2.getLastSet())); + assertEquals(Integer.toString(14), Integer.toString(vector2.getLastSet())); + vector2.setValueCount(25); + assertEquals(Integer.toString(24), Integer.toString(vector2.getLastSet())); /* Check the vector output */ - final NullableVarCharVector.Accessor accessor2 = vector2.getAccessor(); - assertArrayEquals(STR1, accessor2.get(0)); - assertArrayEquals(STR2, accessor2.get(1)); - assertArrayEquals(STR3, accessor2.get(2)); - assertArrayEquals(STR4, accessor2.get(3)); - assertArrayEquals(STR5, accessor2.get(4)); - assertArrayEquals(STR6, accessor2.get(5)); + assertArrayEquals(STR1, vector2.get(0)); + assertArrayEquals(STR2, vector2.get(1)); + assertArrayEquals(STR3, vector2.get(2)); + assertArrayEquals(STR4, vector2.get(3)); + assertArrayEquals(STR5, vector2.get(4)); + assertArrayEquals(STR6, vector2.get(5)); } } } @Test public void testFillEmptiesUsage() { - try (final NullableVarCharVector vector = new NullableVarCharVector("myvector", allocator)) { - - final NullableVarCharVector.Mutator mutator = vector.getMutator(); + try (final VarCharVector vector = new VarCharVector("myvector", allocator)) { vector.allocateNew(1024 * 10, 1024); @@ -1408,114 +1720,124 @@ public void testFillEmptiesUsage() { setBytes(5, STR6, vector); /* Check current lastSet */ - assertEquals(Integer.toString(-1), Integer.toString(mutator.getLastSet())); + assertEquals(Integer.toString(-1), Integer.toString(vector.getLastSet())); /* Check the vector output */ - final NullableVarCharVector.Accessor accessor = vector.getAccessor(); - assertArrayEquals(STR1, accessor.get(0)); - assertArrayEquals(STR2, accessor.get(1)); - assertArrayEquals(STR3, accessor.get(2)); - assertArrayEquals(STR4, accessor.get(3)); - assertArrayEquals(STR5, accessor.get(4)); - assertArrayEquals(STR6, accessor.get(5)); - - 
mutator.setLastSet(5); + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + + vector.setLastSet(5); /* fill empty byte arrays from index [6, 9] */ - mutator.fillEmpties(10); + vector.fillEmpties(10); /* Check current lastSet */ - assertEquals(Integer.toString(9), Integer.toString(mutator.getLastSet())); + assertEquals(Integer.toString(9), Integer.toString(vector.getLastSet())); /* Check the vector output */ - assertArrayEquals(STR1, accessor.get(0)); - assertArrayEquals(STR2, accessor.get(1)); - assertArrayEquals(STR3, accessor.get(2)); - assertArrayEquals(STR4, accessor.get(3)); - assertArrayEquals(STR5, accessor.get(4)); - assertArrayEquals(STR6, accessor.get(5)); - assertEquals(Integer.toString(0), Integer.toString(accessor.getValueLength(6))); - assertEquals(Integer.toString(0), Integer.toString(accessor.getValueLength(7))); - assertEquals(Integer.toString(0), Integer.toString(accessor.getValueLength(8))); - assertEquals(Integer.toString(0), Integer.toString(accessor.getValueLength(9))); + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(6))); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(7))); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(8))); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(9))); setBytes(10, STR1, vector); setBytes(11, STR2, vector); - mutator.setLastSet(11); + vector.setLastSet(11); /* fill empty byte arrays from index [12, 14] */ - mutator.setValueCount(15); + vector.setValueCount(15); /* 
Check current lastSet */ - assertEquals(Integer.toString(14), Integer.toString(mutator.getLastSet())); + assertEquals(Integer.toString(14), Integer.toString(vector.getLastSet())); /* Check the vector output */ - assertArrayEquals(STR1, accessor.get(0)); - assertArrayEquals(STR2, accessor.get(1)); - assertArrayEquals(STR3, accessor.get(2)); - assertArrayEquals(STR4, accessor.get(3)); - assertArrayEquals(STR5, accessor.get(4)); - assertArrayEquals(STR6, accessor.get(5)); - assertEquals(Integer.toString(0), Integer.toString(accessor.getValueLength(6))); - assertEquals(Integer.toString(0), Integer.toString(accessor.getValueLength(7))); - assertEquals(Integer.toString(0), Integer.toString(accessor.getValueLength(8))); - assertEquals(Integer.toString(0), Integer.toString(accessor.getValueLength(9))); - assertArrayEquals(STR1, accessor.get(10)); - assertArrayEquals(STR2, accessor.get(11)); - assertEquals(Integer.toString(0), Integer.toString(accessor.getValueLength(12))); - assertEquals(Integer.toString(0), Integer.toString(accessor.getValueLength(13))); - assertEquals(Integer.toString(0), Integer.toString(accessor.getValueLength(14))); + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(6))); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(7))); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(8))); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(9))); + assertArrayEquals(STR1, vector.get(10)); + assertArrayEquals(STR2, vector.get(11)); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(12))); + assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(13))); + 
assertEquals(Integer.toString(0), Integer.toString(vector.getValueLength(14))); /* Check offsets */ - final UInt4Vector.Accessor offsetAccessor = vector.values.offsetVector.getAccessor(); - assertEquals(Integer.toString(0), Integer.toString(offsetAccessor.get(0))); - assertEquals(Integer.toString(6), Integer.toString(offsetAccessor.get(1))); - assertEquals(Integer.toString(16), Integer.toString(offsetAccessor.get(2))); - assertEquals(Integer.toString(21), Integer.toString(offsetAccessor.get(3))); - assertEquals(Integer.toString(30), Integer.toString(offsetAccessor.get(4))); - assertEquals(Integer.toString(34), Integer.toString(offsetAccessor.get(5))); - - assertEquals(Integer.toString(40), Integer.toString(offsetAccessor.get(6))); - assertEquals(Integer.toString(40), Integer.toString(offsetAccessor.get(7))); - assertEquals(Integer.toString(40), Integer.toString(offsetAccessor.get(8))); - assertEquals(Integer.toString(40), Integer.toString(offsetAccessor.get(9))); - assertEquals(Integer.toString(40), Integer.toString(offsetAccessor.get(10))); - - assertEquals(Integer.toString(46), Integer.toString(offsetAccessor.get(11))); - assertEquals(Integer.toString(56), Integer.toString(offsetAccessor.get(12))); - - assertEquals(Integer.toString(56), Integer.toString(offsetAccessor.get(13))); - assertEquals(Integer.toString(56), Integer.toString(offsetAccessor.get(14))); - assertEquals(Integer.toString(56), Integer.toString(offsetAccessor.get(15))); + assertEquals(Integer.toString(0), + Integer.toString(vector.offsetBuffer.getInt(0 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(6), + Integer.toString(vector.offsetBuffer.getInt(1 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(16), + Integer.toString(vector.offsetBuffer.getInt(2 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(21), + Integer.toString(vector.offsetBuffer.getInt(3 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(30), + Integer.toString(vector.offsetBuffer.getInt(4 * 
vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(34), + Integer.toString(vector.offsetBuffer.getInt(5 * vector.OFFSET_WIDTH))); + + assertEquals(Integer.toString(40), + Integer.toString(vector.offsetBuffer.getInt(6 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(40), + Integer.toString(vector.offsetBuffer.getInt(7 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(40), + Integer.toString(vector.offsetBuffer.getInt(8 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(40), + Integer.toString(vector.offsetBuffer.getInt(9 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(40), + Integer.toString(vector.offsetBuffer.getInt(10 * vector.OFFSET_WIDTH))); + + assertEquals(Integer.toString(46), + Integer.toString(vector.offsetBuffer.getInt(11 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(56), + Integer.toString(vector.offsetBuffer.getInt(12 * vector.OFFSET_WIDTH))); + + assertEquals(Integer.toString(56), + Integer.toString(vector.offsetBuffer.getInt(13 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(56), + Integer.toString(vector.offsetBuffer.getInt(14 * vector.OFFSET_WIDTH))); + assertEquals(Integer.toString(56), + Integer.toString(vector.offsetBuffer.getInt(15 * vector.OFFSET_WIDTH))); } } - @Test /* NullableVarCharVector */ + @Test /* VarCharVector */ public void testGetBufferAddress1() { - try (final NullableVarCharVector vector = new NullableVarCharVector("myvector", allocator)) { - - final NullableVarCharVector.Mutator mutator = vector.getMutator(); - final NullableVarCharVector.Accessor accessor = vector.getAccessor(); - + try (final VarCharVector vector = new VarCharVector("myvector", allocator)) { vector.allocateNew(1024 * 10, 1024); /* populate the vector */ - mutator.set(0, STR1); - mutator.set(1, STR2); - mutator.set(2, STR3); - mutator.set(3, STR4); - mutator.set(4, STR5); - mutator.set(5, STR6); + vector.set(0, STR1); + vector.set(1, STR2); + vector.set(2, STR3); + vector.set(3, STR4); + vector.set(4, 
STR5); + vector.set(5, STR6); - mutator.setValueCount(15); + vector.setValueCount(15); /* check the vector output */ - assertArrayEquals(STR1, accessor.get(0)); - assertArrayEquals(STR2, accessor.get(1)); - assertArrayEquals(STR3, accessor.get(2)); - assertArrayEquals(STR4, accessor.get(3)); - assertArrayEquals(STR5, accessor.get(4)); - assertArrayEquals(STR6, accessor.get(5)); + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertArrayEquals(STR5, vector.get(4)); + assertArrayEquals(STR6, vector.get(5)); List buffers = vector.getFieldBuffers(); long bitAddress = vector.getValidityBufferAddress(); @@ -1529,25 +1851,20 @@ public void testGetBufferAddress1() { } } - @Test /* NullableIntVector */ + @Test /* IntVector */ public void testGetBufferAddress2() { - - try (final NullableIntVector vector = new NullableIntVector("myvector", allocator)) { - - final NullableIntVector.Mutator mutator = vector.getMutator(); - final NullableIntVector.Accessor accessor = vector.getAccessor(); + try (final IntVector vector = new IntVector("myvector", allocator)) { boolean error = false; - vector.allocateNew(16); /* populate the vector */ for(int i = 0; i < 16; i += 2) { - mutator.set(i, i+10); + vector.set(i, i+10); } /* check the vector output */ for(int i = 0; i < 16; i += 2) { - assertEquals(i+10, accessor.get(i)); + assertEquals(i+10, vector.get(i)); } List buffers = vector.getFieldBuffers(); @@ -1573,18 +1890,22 @@ public void testGetBufferAddress2() { @Test public void testMultipleClose() { BufferAllocator vectorAllocator = allocator.newChildAllocator("vector_allocator", 0, Long.MAX_VALUE); - NullableIntVector vector = newVector(NullableIntVector.class, EMPTY_SCHEMA_PATH, MinorType.INT, vectorAllocator); + IntVector vector = newVector(IntVector.class, EMPTY_SCHEMA_PATH, MinorType.INT, vectorAllocator); vector.close(); vectorAllocator.close(); vector.close(); 
vectorAllocator.close(); } - public static void setBytes(int index, byte[] bytes, NullableVarCharVector vector) { - final int currentOffset = vector.values.offsetVector.getAccessor().get(index); + /* this method is used by the tests to bypass the vector set methods that manipulate + * lastSet. The method is to test the lastSet property and that's why we load the vector + * in a way that lastSet is not set automatically. + */ + public static void setBytes(int index, byte[] bytes, VarCharVector vector) { + final int currentOffset = vector.offsetBuffer.getInt(index * vector.OFFSET_WIDTH); - vector.bits.getMutator().setToOne(index); - vector.values.offsetVector.getMutator().set(index + 1, currentOffset + bytes.length); - vector.values.data.setBytes(currentOffset, bytes, 0, bytes.length); + BitVectorHelper.setValidityBitToOne(vector.validityBuffer, index); + vector.offsetBuffer.setInt((index + 1) * vector.OFFSET_WIDTH, currentOffset + bytes.length); + vector.valueBuffer.setBytes(currentOffset, bytes, 0, bytes.length); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java index 4ac7536c017db..f8edf8904c53e 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java @@ -28,6 +28,7 @@ import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.NullableMapVector; import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.FieldType; import org.junit.After; import org.junit.Assert; @@ -52,14 +53,13 @@ public void terminate() throws Exception { @Test public void testFixedType() { try (final UInt4Vector vector = new UInt4Vector("", allocator)) { - final UInt4Vector.Mutator m = vector.getMutator(); vector.setInitialCapacity(512); 
vector.allocateNew(); assertEquals(512, vector.getValueCapacity()); try { - m.set(512, 0); + vector.set(512, 0); Assert.fail("Expected out of bounds exception"); } catch (Exception e) { // ok @@ -68,32 +68,31 @@ public void testFixedType() { vector.reAlloc(); assertEquals(1024, vector.getValueCapacity()); - m.set(512, 100); - assertEquals(100, vector.getAccessor().get(512)); + vector.set(512, 100); + assertEquals(100, vector.get(512)); } } @Test public void testNullableType() { - try (final NullableVarCharVector vector = new NullableVarCharVector("", allocator)) { - final NullableVarCharVector.Mutator m = vector.getMutator(); + try (final VarCharVector vector = new VarCharVector("", allocator)) { vector.setInitialCapacity(512); vector.allocateNew(); assertEquals(512, vector.getValueCapacity()); try { - m.set(512, "foo".getBytes(StandardCharsets.UTF_8)); + vector.set(512, "foo".getBytes(StandardCharsets.UTF_8)); Assert.fail("Expected out of bounds exception"); } catch (Exception e) { // ok } vector.reAlloc(); - assertEquals(1023, vector.getValueCapacity()); + assertEquals(1024, vector.getValueCapacity()); - m.set(512, "foo".getBytes(StandardCharsets.UTF_8)); - assertEquals("foo", new String(vector.getAccessor().get(512), StandardCharsets.UTF_8)); + vector.set(512, "foo".getBytes(StandardCharsets.UTF_8)); + assertEquals("foo", new String(vector.get(512), StandardCharsets.UTF_8)); } } @@ -105,10 +104,10 @@ public void testListType() { vector.setInitialCapacity(512); vector.allocateNew(); - assertEquals(1023, vector.getValueCapacity()); // TODO this doubles for some reason... 
+ assertEquals(1023, vector.getValueCapacity()); try { - vector.getOffsetVector().getAccessor().get(2014); + vector.getInnerValueCountAt(2014); Assert.fail("Expected out of bounds exception"); } catch (Exception e) { // ok @@ -116,14 +115,14 @@ public void testListType() { vector.reAlloc(); assertEquals(2047, vector.getValueCapacity()); // note: size - 1 - assertEquals(0, vector.getOffsetVector().getAccessor().get(2014)); + assertEquals(0, vector.getOffsetBuffer().getInt(2014 * ListVector.OFFSET_WIDTH)); } } @Test public void testMapType() { try (final NullableMapVector vector = NullableMapVector.empty("", allocator)) { - vector.addOrGet("", FieldType.nullable(MinorType.INT.getType()), NullableIntVector.class); + vector.addOrGet("", FieldType.nullable(MinorType.INT.getType()), IntVector.class); vector.setInitialCapacity(512); vector.allocateNew(); @@ -131,7 +130,7 @@ public void testMapType() { assertEquals(512, vector.getValueCapacity()); try { - vector.getAccessor().getObject(513); + vector.getObject(513); Assert.fail("Expected out of bounds exception"); } catch (Exception e) { // ok @@ -139,7 +138,7 @@ public void testMapType() { vector.reAlloc(); assertEquals(1024, vector.getValueCapacity()); - assertNull(vector.getAccessor().getObject(513)); + assertNull(vector.getObject(513)); } } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java index d53f69489d4da..84ea9657f8c9f 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java @@ -19,13 +19,23 @@ package org.apache.arrow.vector; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import io.netty.buffer.ArrowBuf; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.complex.*; +import 
org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeList; +import org.apache.arrow.vector.types.pojo.ArrowType.Int; +import org.apache.arrow.vector.types.pojo.FieldType; + import org.junit.After; import org.junit.Before; import org.junit.Test; +import java.nio.charset.StandardCharsets; + public class TestVectorReset { private BufferAllocator allocator; @@ -40,16 +50,100 @@ public void terminate() throws Exception { allocator.close(); } + private void resetVectorAndVerify(ValueVector vector, ArrowBuf[] bufs) { + int[] sizeBefore = new int[bufs.length]; + for (int i = 0; i < bufs.length; i++) { + sizeBefore[i] = bufs[i].capacity(); + } + vector.reset(); + for (int i = 0; i < bufs.length; i++) { + assertEquals(sizeBefore[i], bufs[i].capacity()); + verifyBufferZeroed(bufs[i]); + } + assertEquals(0, vector.getValueCount()); + } + + private void verifyBufferZeroed(ArrowBuf buf) { + for (int i = 0; i < buf.capacity(); i++) { + assertTrue((byte) 0 == buf.getByte(i)); + } + } + @Test public void testFixedTypeReset() { - try (final UInt4Vector vector = new UInt4Vector("", allocator)) { - final UInt4Vector.Mutator m = vector.getMutator(); - vector.allocateNew(); - final int sizeBefore = vector.getAllocationSize(); - vector.reAlloc(); - vector.reset(); - final int sizeAfter = vector.getAllocationSize(); - assertEquals(sizeBefore, sizeAfter); + try (final UInt4Vector vector = new UInt4Vector("UInt4", allocator)) { + vector.allocateNewSafe(); + vector.setNull(0); + vector.setValueCount(1); + resetVectorAndVerify(vector, vector.getBuffers(false)); + } + } + + @Test + public void testVariableTypeReset() { + try (final VarCharVector vector = new VarCharVector("VarChar", allocator)) { + vector.allocateNewSafe(); + vector.set(0, "a".getBytes(StandardCharsets.UTF_8)); + vector.setLastSet(0); + vector.setValueCount(1); + resetVectorAndVerify(vector, vector.getBuffers(false)); + assertEquals(-1, vector.getLastSet()); + } + } + + 
@Test + public void testListTypeReset() { + try (final ListVector variableList = new ListVector("VarList", allocator, FieldType.nullable(MinorType.INT.getType()), null); + final FixedSizeListVector fixedList = new FixedSizeListVector("FixedList", allocator, FieldType.nullable(new FixedSizeList(2)), null) + ) { + // ListVector + variableList.allocateNewSafe(); + variableList.startNewValue(0); + variableList.endValue(0, 0); + variableList.setValueCount(1); + resetVectorAndVerify(variableList, variableList.getBuffers(false)); + assertEquals(0, variableList.getLastSet()); + + // FixedSizeListVector + fixedList.allocateNewSafe(); + fixedList.setNull(0); + fixedList.setValueCount(1); + resetVectorAndVerify(fixedList, fixedList.getBuffers(false)); + } + } + + @Test + public void testMapTypeReset() { + try (final MapVector mapVector = new MapVector("Map", allocator, FieldType.nullable(MinorType.INT.getType()), null); + final NullableMapVector nullableMapVector = new NullableMapVector("NullableMap", allocator, FieldType.nullable(MinorType.INT.getType()), null) + ) { + // MapVector + mapVector.allocateNewSafe(); + IntVector mapChild = mapVector.addOrGet("child", FieldType.nullable(new Int(32, true)), IntVector.class); + mapChild.setNull(0); + mapVector.setValueCount(1); + resetVectorAndVerify(mapVector, mapVector.getBuffers(false)); + + // NullableMapVector + nullableMapVector.allocateNewSafe(); + nullableMapVector.setNull(0); + nullableMapVector.setValueCount(1); + resetVectorAndVerify(nullableMapVector, nullableMapVector.getBuffers(false)); + } + } + + @Test + public void testUnionTypeReset() { + try (final UnionVector vector = new UnionVector("Union", allocator, null); + final IntVector dataVector = new IntVector("Int", allocator) + ) { + vector.getBufferSize(); + vector.allocateNewSafe(); + dataVector.allocateNewSafe(); + vector.addVector(dataVector); + dataVector.setNull(0); + vector.setValueCount(1); + resetVectorAndVerify(vector, vector.getBuffers(false)); } } } diff 
--git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java index 7facf73f511da..439a62725e49b 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java @@ -39,19 +39,27 @@ import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; import org.apache.arrow.vector.complex.writer.BigIntWriter; import org.apache.arrow.vector.complex.writer.IntWriter; -import org.apache.arrow.vector.schema.ArrowFieldNode; -import org.apache.arrow.vector.schema.ArrowRecordBatch; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.types.pojo.Schema; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.Test; +import org.junit.*; public class TestVectorUnloadLoad { - static final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } @Test public void testUnloadLoad() throws IOException { @@ -183,24 +191,42 @@ public void testUnloadLoadAddPadding() throws IOException { * @throws IOException */ @Test - public void testLoadEmptyValidityBuffer() throws IOException { + public void testLoadValidityBuffer() throws IOException { Schema schema = new Schema(asList( new Field("intDefined", FieldType.nullable(new ArrowType.Int(32, true)), Collections.emptyList()), new Field("intNull", FieldType.nullable(new ArrowType.Int(32, true)), Collections.emptyList()) )); int count = 10; - ArrowBuf validity 
= allocator.buffer(10).slice(0, 0); - ArrowBuf[] values = new ArrowBuf[2]; - for (int i = 0; i < values.length; i++) { - ArrowBuf arrowBuf = allocator.buffer(count * 4); // integers - values[i] = arrowBuf; + ArrowBuf[] values = new ArrowBuf[4]; + for (int i = 0; i < 4; i+=2) { + ArrowBuf buf1 = allocator.buffer(BitVectorHelper.getValidityBufferSize(count)); + ArrowBuf buf2 = allocator.buffer(count * 4); // integers + buf1.setZero(0, buf1.capacity()); + buf2.setZero(0, buf2.capacity()); + values[i] = buf1; + values[i+1] = buf2; for (int j = 0; j < count; j++) { - arrowBuf.setInt(j * 4, j); + if (i == 2) { + BitVectorHelper.setValidityBit(buf1, j, 0); + } else { + BitVectorHelper.setValidityBitToOne(buf1, j); + } + + buf2.setInt(j * 4, j); } - arrowBuf.writerIndex(count * 4); + buf1.writerIndex((int)Math.ceil(count / 8.0)); /* round up to whole bytes: count / 8 on ints truncates before ceil and would drop the last partial validity byte */ + buf2.writerIndex(count * 4); } + + /* + * values[0] - validity buffer for first vector + * values[1] - data buffer for first vector + * values[2] - validity buffer for second vector + * values[3] - data buffer for second vector + */ + try ( - ArrowRecordBatch recordBatch = new ArrowRecordBatch(count, asList(new ArrowFieldNode(count, 0), new ArrowFieldNode(count, count)), asList(validity, values[0], validity, values[1])); + ArrowRecordBatch recordBatch = new ArrowRecordBatch(count, asList(new ArrowFieldNode(count, 0), new ArrowFieldNode(count, count)), asList(values[0], values[1], values[2], values[3])); BufferAllocator finalVectorsAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); VectorSchemaRoot newRoot = VectorSchemaRoot.create(schema, finalVectorsAllocator); ) { @@ -210,35 +236,34 @@ public void testLoadEmptyValidityBuffer() throws IOException { vectorLoader.load(recordBatch); - NullableIntVector intDefinedVector = (NullableIntVector) newRoot.getVector("intDefined"); - NullableIntVector intNullVector = (NullableIntVector) newRoot.getVector("intNull"); + IntVector intDefinedVector = (IntVector) 
newRoot.getVector("intDefined"); + IntVector intNullVector = (IntVector) newRoot.getVector("intNull"); for (int i = 0; i < count; i++) { - assertFalse("#" + i, intDefinedVector.getAccessor().isNull(i)); - assertEquals("#" + i, i, intDefinedVector.getAccessor().get(i)); - assertTrue("#" + i, intNullVector.getAccessor().isNull(i)); + assertFalse("#" + i, intDefinedVector.isNull(i)); + assertEquals("#" + i, i, intDefinedVector.get(i)); + assertTrue("#" + i, intNullVector.isNull(i)); } - intDefinedVector.getMutator().setSafe(count + 10, 1234); - assertTrue(intDefinedVector.getAccessor().isNull(count + 1)); + intDefinedVector.setSafe(count + 10, 1234); + assertTrue(intDefinedVector.isNull(count + 1)); // empty slots should still default to unset - intDefinedVector.getMutator().setSafe(count + 1, 789); - assertFalse(intDefinedVector.getAccessor().isNull(count + 1)); - assertEquals(789, intDefinedVector.getAccessor().get(count + 1)); - assertTrue(intDefinedVector.getAccessor().isNull(count)); - assertTrue(intDefinedVector.getAccessor().isNull(count + 2)); - assertTrue(intDefinedVector.getAccessor().isNull(count + 3)); - assertTrue(intDefinedVector.getAccessor().isNull(count + 4)); - assertTrue(intDefinedVector.getAccessor().isNull(count + 5)); - assertTrue(intDefinedVector.getAccessor().isNull(count + 6)); - assertTrue(intDefinedVector.getAccessor().isNull(count + 7)); - assertTrue(intDefinedVector.getAccessor().isNull(count + 8)); - assertTrue(intDefinedVector.getAccessor().isNull(count + 9)); - assertFalse(intDefinedVector.getAccessor().isNull(count + 10)); - assertEquals(1234, intDefinedVector.getAccessor().get(count + 10)); + intDefinedVector.setSafe(count + 1, 789); + assertFalse(intDefinedVector.isNull(count + 1)); + assertEquals(789, intDefinedVector.get(count + 1)); + assertTrue(intDefinedVector.isNull(count)); + assertTrue(intDefinedVector.isNull(count + 2)); + assertTrue(intDefinedVector.isNull(count + 3)); + assertTrue(intDefinedVector.isNull(count + 4)); + 
assertTrue(intDefinedVector.isNull(count + 5)); + assertTrue(intDefinedVector.isNull(count + 6)); + assertTrue(intDefinedVector.isNull(count + 7)); + assertTrue(intDefinedVector.isNull(count + 8)); + assertTrue(intDefinedVector.isNull(count + 9)); + assertFalse(intDefinedVector.isNull(count + 10)); + assertEquals(1234, intDefinedVector.get(count + 10)); } finally { for (ArrowBuf arrowBuf : values) { arrowBuf.release(); } - validity.release(); } } @@ -258,11 +283,11 @@ public void testUnloadLoadDuplicates() throws IOException { FieldVector vector = field.createVector(originalVectorsAllocator); vector.allocateNew(); sources.add(vector); - NullableIntVector.Mutator mutator = (NullableIntVector.Mutator) vector.getMutator(); + IntVector intVector = (IntVector)vector; for (int i = 0; i < count; i++) { - mutator.set(i, i); + intVector.set(i, i); } - mutator.setValueCount(count); + intVector.setValueCount(count); } try (VectorSchemaRoot root = new VectorSchemaRoot(schema.getFields(), sources, count)) { @@ -277,8 +302,8 @@ public void testUnloadLoadDuplicates() throws IOException { List targets = newRoot.getFieldVectors(); Assert.assertEquals(sources.size(), targets.size()); for (int k = 0; k < sources.size(); k++) { - NullableIntVector.Accessor src = (NullableIntVector.Accessor) sources.get(k).getAccessor(); - NullableIntVector.Accessor tgt = (NullableIntVector.Accessor) targets.get(k).getAccessor(); + IntVector src = (IntVector) sources.get(k); + IntVector tgt = (IntVector) targets.get(k); Assert.assertEquals(src.getValueCount(), tgt.getValueCount()); for (int i = 0; i < count; i++) { Assert.assertEquals(src.get(i), tgt.get(i)); @@ -291,14 +316,9 @@ public void testUnloadLoadDuplicates() throws IOException { public static VectorUnloader newVectorUnloader(FieldVector root) { Schema schema = new Schema(root.getField().getChildren()); - int valueCount = root.getAccessor().getValueCount(); + int valueCount = root.getValueCount(); List fields = root.getChildrenFromFields(); 
VectorSchemaRoot vsr = new VectorSchemaRoot(schema.getFields(), fields, valueCount); return new VectorUnloader(vsr); } - - @AfterClass - public static void afterClass() { - allocator.close(); - } } \ No newline at end of file diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java index 97efb7d5a6d30..b0d6cf555e98d 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java @@ -80,24 +80,23 @@ public void testPromoteToUnion() throws Exception { writer.end(); - container.getMutator().setValueCount(5); + container.setValueCount(5); final UnionVector uv = v.getChild("A", UnionVector.class); - final UnionVector.Accessor accessor = uv.getAccessor(); - assertFalse("0 shouldn't be null", accessor.isNull(0)); - assertEquals(false, accessor.getObject(0)); + assertFalse("0 shouldn't be null", uv.isNull(0)); + assertEquals(false, uv.getObject(0)); - assertFalse("1 shouldn't be null", accessor.isNull(1)); - assertEquals(true, accessor.getObject(1)); + assertFalse("1 shouldn't be null", uv.isNull(1)); + assertEquals(true, uv.getObject(1)); - assertFalse("2 shouldn't be null", accessor.isNull(2)); - assertEquals(10, accessor.getObject(2)); + assertFalse("2 shouldn't be null", uv.isNull(2)); + assertEquals(10, uv.getObject(2)); - assertTrue("3 should be null", accessor.isNull(3)); + assertTrue("3 should be null", uv.isNull(3)); - assertFalse("4 shouldn't be null", accessor.isNull(4)); - assertEquals(100, accessor.getObject(4)); + assertFalse("4 shouldn't be null", uv.isNull(4)); + assertEquals(100, uv.getObject(4)); container.clear(); container.allocateNew(); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java 
b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java index 856d60724b085..29d39aabe6b47 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java @@ -28,10 +28,10 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.SchemaChangeCallBack; -import org.apache.arrow.vector.NullableFloat8Vector; -import org.apache.arrow.vector.NullableFloat4Vector; -import org.apache.arrow.vector.NullableBigIntVector; -import org.apache.arrow.vector.NullableIntVector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.NullableMapVector; @@ -456,7 +456,7 @@ public void simpleUnion() { unionWriter.writeFloat4((float) i); } } - vector.getMutator().setValueCount(COUNT); + vector.setValueCount(COUNT); UnionReader unionReader = new UnionReader(vector); for (int i = 0; i < COUNT; i++) { unionReader.setPosition(i); @@ -834,7 +834,7 @@ public void complexCopierWithList() { TransferPair tp = mapVector.getTransferPair(allocator); tp.splitAndTransfer(0, 1); MapVector toMapVector = (MapVector) tp.getTo(); - JsonStringHashMap toMapValue = (JsonStringHashMap) toMapVector.getAccessor().getObject(0); + JsonStringHashMap toMapValue = (JsonStringHashMap) toMapVector.getObject(0); JsonStringArrayList object = (JsonStringArrayList) toMapValue.get("list"); assertEquals(1, object.get(0)); assertEquals(2, object.get(1)); @@ -885,10 +885,10 @@ public void testSingleMapWriter1() { singleMapWriter.end(); } - NullableIntVector intVector = (NullableIntVector)parent.getChild("intField"); - 
NullableBigIntVector bigIntVector = (NullableBigIntVector)parent.getChild("bigIntField"); - NullableFloat4Vector float4Vector = (NullableFloat4Vector)parent.getChild("float4Field"); - NullableFloat8Vector float8Vector = (NullableFloat8Vector)parent.getChild("float8Field"); + IntVector intVector = (IntVector)parent.getChild("intField"); + BigIntVector bigIntVector = (BigIntVector)parent.getChild("bigIntField"); + Float4Vector float4Vector = (Float4Vector)parent.getChild("float4Field"); + Float8Vector float8Vector = (Float8Vector)parent.getChild("float8Field"); assertEquals(initialCapacity, singleMapWriter.getValueCapacity()); assertEquals(initialCapacity, intVector.getValueCapacity()); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java b/java/vector/src/test/java/org/apache/arrow/vector/ipc/BaseFileTest.java similarity index 69% rename from java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java rename to java/vector/src/test/java/org/apache/arrow/vector/ipc/BaseFileTest.java index ba62de0a6d93a..9eb55c9bbdaee 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/ipc/BaseFileTest.java @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -package org.apache.arrow.vector.file; +package org.apache.arrow.vector.ipc; import java.math.BigDecimal; import java.math.BigInteger; @@ -28,16 +28,14 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.FieldVector; -import org.apache.arrow.vector.NullableDateMilliVector; -import org.apache.arrow.vector.NullableDecimalVector; -import org.apache.arrow.vector.NullableIntVector; -import org.apache.arrow.vector.NullableTimeMilliVector; -import org.apache.arrow.vector.NullableVarBinaryVector; -import org.apache.arrow.vector.NullableVarCharVector; -import org.apache.arrow.vector.ValueVector.Accessor; +import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.TimeMilliVector; +import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.complex.ListVector; -import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.NullableMapVector; import org.apache.arrow.vector.complex.impl.ComplexWriterImpl; import org.apache.arrow.vector.complex.impl.UnionListWriter; @@ -47,6 +45,7 @@ import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; import org.apache.arrow.vector.complex.writer.BigIntWriter; import org.apache.arrow.vector.complex.writer.DateMilliWriter; +import org.apache.arrow.vector.complex.writer.Float4Writer; import org.apache.arrow.vector.complex.writer.IntWriter; import org.apache.arrow.vector.complex.writer.TimeMilliWriter; import org.apache.arrow.vector.complex.writer.TimeStampMilliTZWriter; @@ -71,7 +70,7 @@ import io.netty.buffer.ArrowBuf; -import static org.apache.arrow.vector.TestUtils.newNullableVarCharVector; +import static org.apache.arrow.vector.TestUtils.newVarCharVector; /** * Helps testing the file formats @@ -95,14 
+94,32 @@ public void tearDown() { DateTimeZone.setDefault(defaultTimezone); } + protected void writeData(int count, NullableMapVector parent) { + ComplexWriter writer = new ComplexWriterImpl("root", parent); + MapWriter rootWriter = writer.rootAsMap(); + IntWriter intWriter = rootWriter.integer("int"); + BigIntWriter bigIntWriter = rootWriter.bigInt("bigInt"); + Float4Writer float4Writer = rootWriter.float4("float"); + for (int i = 0; i < count; i++) { + intWriter.setPosition(i); + intWriter.writeInt(i); + bigIntWriter.setPosition(i); + bigIntWriter.writeBigInt(i); + float4Writer.setPosition(i); + float4Writer.writeFloat4(i == 0 ? Float.NaN : i); + } + writer.setValueCount(count); + } + protected void validateContent(int count, VectorSchemaRoot root) { for (int i = 0; i < count; i++) { - Assert.assertEquals(i, root.getVector("int").getAccessor().getObject(i)); - Assert.assertEquals(Long.valueOf(i), root.getVector("bigInt").getAccessor().getObject(i)); + Assert.assertEquals(i, root.getVector("int").getObject(i)); + Assert.assertEquals(Long.valueOf(i), root.getVector("bigInt").getObject(i)); + Assert.assertEquals(i == 0 ? 
Float.NaN : i, root.getVector("float").getObject(i)); } } - protected void writeComplexData(int count, MapVector parent) { + protected void writeComplexData(int count, NullableMapVector parent) { ArrowBuf varchar = allocator.buffer(3); varchar.readerIndex(0); varchar.setByte(0, 'a'); @@ -140,10 +157,9 @@ protected void writeComplexData(int count, MapVector parent) { public void printVectors(List vectors) { for (FieldVector vector : vectors) { LOGGER.debug(vector.getField().getName()); - Accessor accessor = vector.getAccessor(); - int valueCount = accessor.getValueCount(); + int valueCount = vector.getValueCount(); for (int i = 0; i < valueCount; i++) { - LOGGER.debug(String.valueOf(accessor.getObject(i))); + LOGGER.debug(String.valueOf(vector.getObject(i))); } } } @@ -152,14 +168,15 @@ protected void validateComplexContent(int count, VectorSchemaRoot root) { Assert.assertEquals(count, root.getRowCount()); printVectors(root.getFieldVectors()); for (int i = 0; i < count; i++) { - Object intVal = root.getVector("int").getAccessor().getObject(i); + + Object intVal = root.getVector("int").getObject(i); if (i % 5 != 3) { Assert.assertEquals(i, intVal); } else { Assert.assertNull(intVal); } - Assert.assertEquals(Long.valueOf(i), root.getVector("bigInt").getAccessor().getObject(i)); - Assert.assertEquals(i % 3, ((List) root.getVector("list").getAccessor().getObject(i)).size()); + Assert.assertEquals(Long.valueOf(i), root.getVector("bigInt").getObject(i)); + Assert.assertEquals(i % 3, ((List) root.getVector("list").getObject(i)).size()); NullableTimeStampMilliHolder h = new NullableTimeStampMilliHolder(); FieldReader mapReader = root.getVector("map").getReader(); mapReader.setPosition(i); @@ -202,15 +219,15 @@ protected void validateDateTimeContent(int count, VectorSchemaRoot root) { Assert.assertEquals(count, root.getRowCount()); printVectors(root.getFieldVectors()); for (int i = 0; i < count; i++) { - long dateVal = ((NullableDateMilliVector) 
root.getVector("date")).getAccessor().get(i); + long dateVal = ((DateMilliVector) root.getVector("date")).get(i); LocalDateTime dt = makeDateTimeFromCount(i); LocalDateTime dateExpected = dt.minusMillis(dt.getMillisOfDay()); Assert.assertEquals(DateUtility.toMillis(dateExpected), dateVal); - long timeVal = ((NullableTimeMilliVector) root.getVector("time")).getAccessor().get(i); + long timeVal = ((TimeMilliVector) root.getVector("time")).get(i); Assert.assertEquals(dt.getMillisOfDay(), timeVal); - Object timestampMilliVal = root.getVector("timestamp-milli").getAccessor().getObject(i); + Object timestampMilliVal = root.getVector("timestamp-milli").getObject(i); Assert.assertEquals(dt, timestampMilliVal); - Object timestampMilliTZVal = root.getVector("timestamp-milliTZ").getAccessor().getObject(i); + Object timestampMilliTZVal = root.getVector("timestamp-milliTZ").getObject(i); Assert.assertEquals(DateUtility.toMillis(dt), timestampMilliTZVal); } } @@ -218,61 +235,56 @@ protected void validateDateTimeContent(int count, VectorSchemaRoot root) { protected VectorSchemaRoot writeFlatDictionaryData(BufferAllocator bufferAllocator, DictionaryProvider.MapDictionaryProvider provider) { // Define dictionaries and add to provider - NullableVarCharVector dictionary1Vector = newNullableVarCharVector("D1", bufferAllocator); + VarCharVector dictionary1Vector = newVarCharVector("D1", bufferAllocator); dictionary1Vector.allocateNewSafe(); - NullableVarCharVector.Mutator mutator = dictionary1Vector.getMutator(); - mutator.set(0, "foo".getBytes(StandardCharsets.UTF_8)); - mutator.set(1, "bar".getBytes(StandardCharsets.UTF_8)); - mutator.set(2, "baz".getBytes(StandardCharsets.UTF_8)); - mutator.setValueCount(3); + dictionary1Vector.set(0, "foo".getBytes(StandardCharsets.UTF_8)); + dictionary1Vector.set(1, "bar".getBytes(StandardCharsets.UTF_8)); + dictionary1Vector.set(2, "baz".getBytes(StandardCharsets.UTF_8)); + dictionary1Vector.setValueCount(3); Dictionary dictionary1 = new 
Dictionary(dictionary1Vector, new DictionaryEncoding(1L, false, null)); provider.put(dictionary1); - NullableVarCharVector dictionary2Vector = newNullableVarCharVector("D2", bufferAllocator); + VarCharVector dictionary2Vector = newVarCharVector("D2", bufferAllocator); dictionary2Vector.allocateNewSafe(); - mutator = dictionary2Vector.getMutator(); - mutator.set(0, "micro".getBytes(StandardCharsets.UTF_8)); - mutator.set(1, "small".getBytes(StandardCharsets.UTF_8)); - mutator.set(2, "large".getBytes(StandardCharsets.UTF_8)); - mutator.setValueCount(3); + dictionary2Vector.set(0, "micro".getBytes(StandardCharsets.UTF_8)); + dictionary2Vector.set(1, "small".getBytes(StandardCharsets.UTF_8)); + dictionary2Vector.set(2, "large".getBytes(StandardCharsets.UTF_8)); + dictionary2Vector.setValueCount(3); Dictionary dictionary2 = new Dictionary(dictionary2Vector, new DictionaryEncoding(2L, false, null)); provider.put(dictionary2); // Populate the vectors - NullableVarCharVector vector1A = newNullableVarCharVector("varcharA", bufferAllocator); + VarCharVector vector1A = newVarCharVector("varcharA", bufferAllocator); vector1A.allocateNewSafe(); - mutator = vector1A.getMutator(); - mutator.set(0, "foo".getBytes(StandardCharsets.UTF_8)); - mutator.set(1, "bar".getBytes(StandardCharsets.UTF_8)); - mutator.set(3, "baz".getBytes(StandardCharsets.UTF_8)); - mutator.set(4, "bar".getBytes(StandardCharsets.UTF_8)); - mutator.set(5, "baz".getBytes(StandardCharsets.UTF_8)); - mutator.setValueCount(6); + vector1A.set(0, "foo".getBytes(StandardCharsets.UTF_8)); + vector1A.set(1, "bar".getBytes(StandardCharsets.UTF_8)); + vector1A.set(3, "baz".getBytes(StandardCharsets.UTF_8)); + vector1A.set(4, "bar".getBytes(StandardCharsets.UTF_8)); + vector1A.set(5, "baz".getBytes(StandardCharsets.UTF_8)); + vector1A.setValueCount(6); FieldVector encodedVector1A = (FieldVector) DictionaryEncoder.encode(vector1A, dictionary1); vector1A.close(); // Done with this vector after encoding // Write this vector 
using indices instead of encoding - NullableIntVector encodedVector1B = new NullableIntVector("varcharB", bufferAllocator); + IntVector encodedVector1B = new IntVector("varcharB", bufferAllocator); encodedVector1B.allocateNewSafe(); - NullableIntVector.Mutator mutator1B = encodedVector1B.getMutator(); - mutator1B.set(0, 2); // "baz" - mutator1B.set(1, 1); // "bar" - mutator1B.set(2, 2); // "baz" - mutator1B.set(4, 1); // "bar" - mutator1B.set(5, 0); // "foo" - mutator1B.setValueCount(6); - - NullableVarCharVector vector2 = newNullableVarCharVector("sizes", bufferAllocator); + encodedVector1B.set(0, 2); // "baz" + encodedVector1B.set(1, 1); // "bar" + encodedVector1B.set(2, 2); // "baz" + encodedVector1B.set(4, 1); // "bar" + encodedVector1B.set(5, 0); // "foo" + encodedVector1B.setValueCount(6); + + VarCharVector vector2 = newVarCharVector("sizes", bufferAllocator); vector2.allocateNewSafe(); - mutator = vector2.getMutator(); - mutator.set(1, "large".getBytes(StandardCharsets.UTF_8)); - mutator.set(2, "small".getBytes(StandardCharsets.UTF_8)); - mutator.set(3, "small".getBytes(StandardCharsets.UTF_8)); - mutator.set(4, "large".getBytes(StandardCharsets.UTF_8)); - mutator.setValueCount(6); + vector2.set(1, "large".getBytes(StandardCharsets.UTF_8)); + vector2.set(2, "small".getBytes(StandardCharsets.UTF_8)); + vector2.set(3, "small".getBytes(StandardCharsets.UTF_8)); + vector2.set(4, "large".getBytes(StandardCharsets.UTF_8)); + vector2.setValueCount(6); FieldVector encodedVector2 = (FieldVector) DictionaryEncoder.encode(vector2, dictionary2); vector2.close(); // Done with this vector after encoding @@ -280,7 +292,7 @@ protected VectorSchemaRoot writeFlatDictionaryData(BufferAllocator bufferAllocat List fields = ImmutableList.of(encodedVector1A.getField(), encodedVector1B.getField(), encodedVector2.getField()); List vectors = ImmutableList.of(encodedVector1A, encodedVector1B, encodedVector2); - return new VectorSchemaRoot(fields, vectors, 
encodedVector1A.getAccessor().getValueCount()); + return new VectorSchemaRoot(fields, vectors, encodedVector1A.getValueCount()); } protected void validateFlatDictionary(VectorSchemaRoot root, DictionaryProvider provider) { @@ -291,14 +303,13 @@ protected void validateFlatDictionary(VectorSchemaRoot root, DictionaryProvider Assert.assertNotNull(encoding1A); Assert.assertEquals(1L, encoding1A.getId()); - FieldVector.Accessor accessor = vector1A.getAccessor(); - Assert.assertEquals(6, accessor.getValueCount()); - Assert.assertEquals(0, accessor.getObject(0)); - Assert.assertEquals(1, accessor.getObject(1)); - Assert.assertEquals(null, accessor.getObject(2)); - Assert.assertEquals(2, accessor.getObject(3)); - Assert.assertEquals(1, accessor.getObject(4)); - Assert.assertEquals(2, accessor.getObject(5)); + Assert.assertEquals(6, vector1A.getValueCount()); + Assert.assertEquals(0, vector1A.getObject(0)); + Assert.assertEquals(1, vector1A.getObject(1)); + Assert.assertEquals(null, vector1A.getObject(2)); + Assert.assertEquals(2, vector1A.getObject(3)); + Assert.assertEquals(1, vector1A.getObject(4)); + Assert.assertEquals(2, vector1A.getObject(5)); FieldVector vector1B = root.getVector("varcharB"); Assert.assertNotNull(vector1B); @@ -308,14 +319,13 @@ protected void validateFlatDictionary(VectorSchemaRoot root, DictionaryProvider Assert.assertTrue(encoding1A.equals(encoding1B)); Assert.assertEquals(1L, encoding1B.getId()); - accessor = vector1B.getAccessor(); - Assert.assertEquals(6, accessor.getValueCount()); - Assert.assertEquals(2, accessor.getObject(0)); - Assert.assertEquals(1, accessor.getObject(1)); - Assert.assertEquals(2, accessor.getObject(2)); - Assert.assertEquals(null, accessor.getObject(3)); - Assert.assertEquals(1, accessor.getObject(4)); - Assert.assertEquals(0, accessor.getObject(5)); + Assert.assertEquals(6, vector1B.getValueCount()); + Assert.assertEquals(2, vector1B.getObject(0)); + Assert.assertEquals(1, vector1B.getObject(1)); + 
Assert.assertEquals(2, vector1B.getObject(2)); + Assert.assertEquals(null, vector1B.getObject(3)); + Assert.assertEquals(1, vector1B.getObject(4)); + Assert.assertEquals(0, vector1B.getObject(5)); FieldVector vector2 = root.getVector("sizes"); Assert.assertNotNull(vector2); @@ -324,40 +334,39 @@ protected void validateFlatDictionary(VectorSchemaRoot root, DictionaryProvider Assert.assertNotNull(encoding2); Assert.assertEquals(2L, encoding2.getId()); - accessor = vector2.getAccessor(); - Assert.assertEquals(6, accessor.getValueCount()); - Assert.assertEquals(null, accessor.getObject(0)); - Assert.assertEquals(2, accessor.getObject(1)); - Assert.assertEquals(1, accessor.getObject(2)); - Assert.assertEquals(1, accessor.getObject(3)); - Assert.assertEquals(2, accessor.getObject(4)); - Assert.assertEquals(null, accessor.getObject(5)); + Assert.assertEquals(6, vector2.getValueCount()); + Assert.assertEquals(null, vector2.getObject(0)); + Assert.assertEquals(2, vector2.getObject(1)); + Assert.assertEquals(1, vector2.getObject(2)); + Assert.assertEquals(1, vector2.getObject(3)); + Assert.assertEquals(2, vector2.getObject(4)); + Assert.assertEquals(null, vector2.getObject(5)); Dictionary dictionary1 = provider.lookup(1L); Assert.assertNotNull(dictionary1); - NullableVarCharVector.Accessor dictionaryAccessor = ((NullableVarCharVector) dictionary1.getVector()).getAccessor(); - Assert.assertEquals(3, dictionaryAccessor.getValueCount()); - Assert.assertEquals(new Text("foo"), dictionaryAccessor.getObject(0)); - Assert.assertEquals(new Text("bar"), dictionaryAccessor.getObject(1)); - Assert.assertEquals(new Text("baz"), dictionaryAccessor.getObject(2)); + VarCharVector dictionaryVector = ((VarCharVector) dictionary1.getVector()); + Assert.assertEquals(3, dictionaryVector.getValueCount()); + Assert.assertEquals(new Text("foo"), dictionaryVector.getObject(0)); + Assert.assertEquals(new Text("bar"), dictionaryVector.getObject(1)); + Assert.assertEquals(new Text("baz"), 
dictionaryVector.getObject(2)); Dictionary dictionary2 = provider.lookup(2L); Assert.assertNotNull(dictionary2); - dictionaryAccessor = ((NullableVarCharVector) dictionary2.getVector()).getAccessor(); - Assert.assertEquals(3, dictionaryAccessor.getValueCount()); - Assert.assertEquals(new Text("micro"), dictionaryAccessor.getObject(0)); - Assert.assertEquals(new Text("small"), dictionaryAccessor.getObject(1)); - Assert.assertEquals(new Text("large"), dictionaryAccessor.getObject(2)); + dictionaryVector = ((VarCharVector) dictionary2.getVector()); + Assert.assertEquals(3, dictionaryVector.getValueCount()); + Assert.assertEquals(new Text("micro"), dictionaryVector.getObject(0)); + Assert.assertEquals(new Text("small"), dictionaryVector.getObject(1)); + Assert.assertEquals(new Text("large"), dictionaryVector.getObject(2)); } protected VectorSchemaRoot writeNestedDictionaryData(BufferAllocator bufferAllocator, DictionaryProvider.MapDictionaryProvider provider) { // Define the dictionary and add to the provider - NullableVarCharVector dictionaryVector = newNullableVarCharVector("D2", bufferAllocator); + VarCharVector dictionaryVector = newVarCharVector("D2", bufferAllocator); dictionaryVector.allocateNewSafe(); - dictionaryVector.getMutator().set(0, "foo".getBytes(StandardCharsets.UTF_8)); - dictionaryVector.getMutator().set(1, "bar".getBytes(StandardCharsets.UTF_8)); - dictionaryVector.getMutator().setValueCount(2); + dictionaryVector.set(0, "foo".getBytes(StandardCharsets.UTF_8)); + dictionaryVector.set(1, "bar".getBytes(StandardCharsets.UTF_8)); + dictionaryVector.setValueCount(2); Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(2L, false, null)); provider.put(dictionary); @@ -396,24 +405,23 @@ protected void validateNestedDictionary(VectorSchemaRoot root, DictionaryProvide Assert.assertEquals(2L, encoding.getId()); Assert.assertEquals(new ArrowType.Int(32, true), encoding.getIndexType()); - FieldVector.Accessor accessor = 
vector.getAccessor(); - Assert.assertEquals(3, accessor.getValueCount()); - Assert.assertEquals(Arrays.asList(0, 1), accessor.getObject(0)); - Assert.assertEquals(Arrays.asList(0), accessor.getObject(1)); - Assert.assertEquals(Arrays.asList(1), accessor.getObject(2)); + Assert.assertEquals(3, vector.getValueCount()); + Assert.assertEquals(Arrays.asList(0, 1), vector.getObject(0)); + Assert.assertEquals(Arrays.asList(0), vector.getObject(1)); + Assert.assertEquals(Arrays.asList(1), vector.getObject(2)); Dictionary dictionary = provider.lookup(2L); Assert.assertNotNull(dictionary); - NullableVarCharVector.Accessor dictionaryAccessor = ((NullableVarCharVector) dictionary.getVector()).getAccessor(); - Assert.assertEquals(2, dictionaryAccessor.getValueCount()); - Assert.assertEquals(new Text("foo"), dictionaryAccessor.getObject(0)); - Assert.assertEquals(new Text("bar"), dictionaryAccessor.getObject(1)); + VarCharVector dictionaryVector = ((VarCharVector) dictionary.getVector()); + Assert.assertEquals(2, dictionaryVector.getValueCount()); + Assert.assertEquals(new Text("foo"), dictionaryVector.getObject(0)); + Assert.assertEquals(new Text("bar"), dictionaryVector.getObject(1)); } protected VectorSchemaRoot writeDecimalData(BufferAllocator bufferAllocator) { - NullableDecimalVector decimalVector1 = new NullableDecimalVector("decimal1", bufferAllocator, 10, 3); - NullableDecimalVector decimalVector2 = new NullableDecimalVector("decimal2", bufferAllocator, 4, 2); - NullableDecimalVector decimalVector3 = new NullableDecimalVector("decimal3", bufferAllocator, 16, 8); + DecimalVector decimalVector1 = new DecimalVector("decimal1", bufferAllocator, 10, 3); + DecimalVector decimalVector2 = new DecimalVector("decimal2", bufferAllocator, 4, 2); + DecimalVector decimalVector3 = new DecimalVector("decimal3", bufferAllocator, 16, 8); int count = 10; decimalVector1.allocateNew(count); @@ -421,14 +429,14 @@ protected VectorSchemaRoot writeDecimalData(BufferAllocator bufferAllocator) { 
decimalVector3.allocateNew(count); for (int i = 0; i < count; i++) { - decimalVector1.getMutator().setSafe(i, new BigDecimal(BigInteger.valueOf(i), 3)); - decimalVector2.getMutator().setSafe(i, new BigDecimal(BigInteger.valueOf(i * (1 << 10)), 2)); - decimalVector3.getMutator().setSafe(i, new BigDecimal(BigInteger.valueOf(i * 1111111111111111L), 8)); + decimalVector1.setSafe(i, new BigDecimal(BigInteger.valueOf(i), 3)); + decimalVector2.setSafe(i, new BigDecimal(BigInteger.valueOf(i * (1 << 10)), 2)); + decimalVector3.setSafe(i, new BigDecimal(BigInteger.valueOf(i * 1111111111111111L), 8)); } - decimalVector1.getMutator().setValueCount(count); - decimalVector2.getMutator().setValueCount(count); - decimalVector3.getMutator().setValueCount(count); + decimalVector1.setValueCount(count); + decimalVector2.setValueCount(count); + decimalVector3.setValueCount(count); List fields = ImmutableList.of(decimalVector1.getField(), decimalVector2.getField(), decimalVector3.getField()); List vectors = ImmutableList.of(decimalVector1, decimalVector2, decimalVector3); @@ -436,47 +444,33 @@ protected VectorSchemaRoot writeDecimalData(BufferAllocator bufferAllocator) { } protected void validateDecimalData(VectorSchemaRoot root) { - NullableDecimalVector decimalVector1 = (NullableDecimalVector) root.getVector("decimal1"); - NullableDecimalVector decimalVector2 = (NullableDecimalVector) root.getVector("decimal2"); - NullableDecimalVector decimalVector3 = (NullableDecimalVector) root.getVector("decimal3"); + DecimalVector decimalVector1 = (DecimalVector) root.getVector("decimal1"); + DecimalVector decimalVector2 = (DecimalVector) root.getVector("decimal2"); + DecimalVector decimalVector3 = (DecimalVector) root.getVector("decimal3"); int count = 10; Assert.assertEquals(count, root.getRowCount()); for (int i = 0; i < count; i++) { // Verify decimal 1 vector - BigDecimal readValue = decimalVector1.getAccessor().getObject(i); + BigDecimal readValue = decimalVector1.getObject(i); 
ArrowType.Decimal type = (ArrowType.Decimal) decimalVector1.getField().getType(); BigDecimal genValue = new BigDecimal(BigInteger.valueOf(i), type.getScale()); Assert.assertEquals(genValue, readValue); // Verify decimal 2 vector - readValue = decimalVector2.getAccessor().getObject(i); + readValue = decimalVector2.getObject(i); type = (ArrowType.Decimal) decimalVector2.getField().getType(); genValue = new BigDecimal(BigInteger.valueOf(i * (1 << 10)), type.getScale()); Assert.assertEquals(genValue, readValue); // Verify decimal 3 vector - readValue = decimalVector3.getAccessor().getObject(i); + readValue = decimalVector3.getObject(i); type = (ArrowType.Decimal) decimalVector3.getField().getType(); genValue = new BigDecimal(BigInteger.valueOf(i * 1111111111111111L), type.getScale()); Assert.assertEquals(genValue, readValue); } } - protected void writeData(int count, MapVector parent) { - ComplexWriter writer = new ComplexWriterImpl("root", parent); - MapWriter rootWriter = writer.rootAsMap(); - IntWriter intWriter = rootWriter.integer("int"); - BigIntWriter bigIntWriter = rootWriter.bigInt("bigInt"); - for (int i = 0; i < count; i++) { - intWriter.setPosition(i); - intWriter.writeInt(i); - bigIntWriter.setPosition(i); - bigIntWriter.writeBigInt(i); - } - writer.setValueCount(count); - } - public void validateUnionData(int count, VectorSchemaRoot root) { FieldReader unionReader = root.getVector("union").getReader(); for (int i = 0; i < count; i++) { @@ -569,7 +563,7 @@ protected void validateVarBinary(int count, VectorSchemaRoot root) { int numVarBinaryValues = 0; for (int i = 0; i < count; i++) { expectedArray[i] = (byte) i; - Object obj = listVector.getAccessor().getObject(i); + Object obj = listVector.getObject(i); List objList = (List) obj; if (i % 3 == 0) { Assert.assertTrue(objList.isEmpty()); @@ -584,10 +578,10 @@ protected void validateVarBinary(int count, VectorSchemaRoot root) { } // ListVector lastSet should be the index of last value + 1 - 
Assert.assertEquals(listVector.getMutator().getLastSet(), count); + Assert.assertEquals(listVector.getLastSet(), count); - // NullableVarBinaryVector lastSet should be the index of last value - NullableVarBinaryVector binaryVector = (NullableVarBinaryVector) listVector.getChildrenFromFields().get(0); - Assert.assertEquals(binaryVector.getMutator().getLastSet(), numVarBinaryValues - 1); + // VarBinaryVector lastSet should be the index of last value + VarBinaryVector binaryVector = (VarBinaryVector) listVector.getChildrenFromFields().get(0); + Assert.assertEquals(binaryVector.getLastSet(), numVarBinaryValues - 1); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/stream/MessageSerializerTest.java b/java/vector/src/test/java/org/apache/arrow/vector/ipc/MessageSerializerTest.java similarity index 94% rename from java/vector/src/test/java/org/apache/arrow/vector/stream/MessageSerializerTest.java rename to java/vector/src/test/java/org/apache/arrow/vector/ipc/MessageSerializerTest.java index f968768f5e67d..239d3034ad135 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/stream/MessageSerializerTest.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/ipc/MessageSerializerTest.java @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -package org.apache.arrow.vector.stream; +package org.apache.arrow.vector.ipc; import static java.util.Arrays.asList; import static org.junit.Assert.assertArrayEquals; @@ -33,12 +33,11 @@ import io.netty.buffer.ArrowBuf; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.file.ArrowBlock; -import org.apache.arrow.vector.file.ReadChannel; -import org.apache.arrow.vector.file.WriteChannel; -import org.apache.arrow.vector.schema.ArrowFieldNode; -import org.apache.arrow.vector.schema.ArrowMessage; -import org.apache.arrow.vector.schema.ArrowRecordBatch; +import org.apache.arrow.vector.ipc.message.MessageSerializer; +import org.apache.arrow.vector.ipc.message.ArrowBlock; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.ipc.message.ArrowMessage; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.DictionaryEncoding; import org.apache.arrow.vector.types.pojo.Field; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFile.java similarity index 88% rename from java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java rename to java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFile.java index 81e58989fccc4..055c34e7010f3 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFile.java @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -package org.apache.arrow.vector.file; +package org.apache.arrow.vector.ipc; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; @@ -36,20 +36,17 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.FieldVector; -import org.apache.arrow.vector.NullableFloat4Vector; -import org.apache.arrow.vector.NullableIntVector; -import org.apache.arrow.vector.NullableTinyIntVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.TinyIntVector; import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.VectorUnloader; import org.apache.arrow.vector.complex.FixedSizeListVector; -import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.NullableMapVector; import org.apache.arrow.vector.dictionary.DictionaryProvider.MapDictionaryProvider; -import org.apache.arrow.vector.schema.ArrowBuffer; -import org.apache.arrow.vector.schema.ArrowMessage; -import org.apache.arrow.vector.schema.ArrowRecordBatch; -import org.apache.arrow.vector.stream.ArrowStreamReader; -import org.apache.arrow.vector.stream.ArrowStreamWriter; -import org.apache.arrow.vector.stream.MessageSerializerTest; +import org.apache.arrow.vector.ipc.message.ArrowBlock; +import org.apache.arrow.vector.ipc.message.ArrowBuffer; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; import org.apache.arrow.vector.types.FloatingPointPrecision; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType; @@ -72,7 +69,7 @@ public void testWrite() throws IOException { int count = COUNT; try ( BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); - MapVector parent = MapVector.empty("parent", vectorAllocator)) { + NullableMapVector parent = NullableMapVector.empty("parent", vectorAllocator)) { writeData(count, parent); write(parent.getChild("root"), 
file, new ByteArrayOutputStream()); } @@ -100,7 +97,7 @@ public void testWriteRead() throws IOException { // write try (BufferAllocator originalVectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); - MapVector parent = MapVector.empty("parent", originalVectorAllocator)) { + NullableMapVector parent = NullableMapVector.empty("parent", originalVectorAllocator)) { writeData(count, parent); write(parent.getChild("root"), file, stream); } @@ -108,52 +105,41 @@ public void testWriteRead() throws IOException { // read try (BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); FileInputStream fileInputStream = new FileInputStream(file); - ArrowFileReader arrowReader = new ArrowFileReader(fileInputStream.getChannel(), readerAllocator) { - @Override - protected ArrowMessage readMessage(SeekableReadChannel in, BufferAllocator allocator) throws IOException { - ArrowMessage message = super.readMessage(in, allocator); - if (message != null) { - ArrowRecordBatch batch = (ArrowRecordBatch) message; - List buffersLayout = batch.getBuffersLayout(); - for (ArrowBuffer arrowBuffer : buffersLayout) { - Assert.assertEquals(0, arrowBuffer.getOffset() % 8); - } - } - return message; - } - }) { - Schema schema = arrowReader.getVectorSchemaRoot().getSchema(); - LOGGER.debug("reading schema: " + schema); + ArrowFileReader arrowReader = new ArrowFileReader(fileInputStream.getChannel(), readerAllocator)) { + VectorSchemaRoot root = arrowReader.getVectorSchemaRoot(); + VectorUnloader unloader = new VectorUnloader(root); + Schema schema = root.getSchema(); + LOGGER.debug("reading schema: " + schema); for (ArrowBlock rbBlock : arrowReader.getRecordBlocks()) { arrowReader.loadRecordBatch(rbBlock); Assert.assertEquals(count, root.getRowCount()); + ArrowRecordBatch batch = unloader.getRecordBatch(); + List buffersLayout = batch.getBuffersLayout(); + for (ArrowBuffer arrowBuffer : buffersLayout) { + Assert.assertEquals(0, 
arrowBuffer.getOffset() % 8); + } validateContent(count, root); + batch.close(); } } // Read from stream. try (BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); ByteArrayInputStream input = new ByteArrayInputStream(stream.toByteArray()); - ArrowStreamReader arrowReader = new ArrowStreamReader(input, readerAllocator) { - @Override - protected ArrowMessage readMessage(ReadChannel in, BufferAllocator allocator) throws IOException { - ArrowMessage message = super.readMessage(in, allocator); - if (message != null) { - ArrowRecordBatch batch = (ArrowRecordBatch) message; - List buffersLayout = batch.getBuffersLayout(); - for (ArrowBuffer arrowBuffer : buffersLayout) { - Assert.assertEquals(0, arrowBuffer.getOffset() % 8); - } - } - return message; - } - }) { + ArrowStreamReader arrowReader = new ArrowStreamReader(input, readerAllocator)) { VectorSchemaRoot root = arrowReader.getVectorSchemaRoot(); + VectorUnloader unloader = new VectorUnloader(root); Schema schema = root.getSchema(); LOGGER.debug("reading schema: " + schema); Assert.assertTrue(arrowReader.loadNextBatch()); + ArrowRecordBatch batch = unloader.getRecordBatch(); + List buffersLayout = batch.getBuffersLayout(); + for (ArrowBuffer arrowBuffer : buffersLayout) { + Assert.assertEquals(0, arrowBuffer.getOffset() % 8); + } + batch.close(); Assert.assertEquals(count, root.getRowCount()); validateContent(count, root); } @@ -167,7 +153,7 @@ public void testWriteReadComplex() throws IOException { // write try (BufferAllocator originalVectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); - MapVector parent = MapVector.empty("parent", originalVectorAllocator)) { + NullableMapVector parent = NullableMapVector.empty("parent", originalVectorAllocator)) { writeComplexData(count, parent); write(parent.getChild("root"), file, stream); } @@ -208,7 +194,7 @@ public void testWriteReadMultipleRBs() throws IOException { // write try (BufferAllocator 
originalVectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); - MapVector parent = MapVector.empty("parent", originalVectorAllocator); + NullableMapVector parent = NullableMapVector.empty("parent", originalVectorAllocator); FileOutputStream fileOutputStream = new FileOutputStream(file)) { writeData(counts[0], parent); VectorSchemaRoot root = new VectorSchemaRoot(parent.getChild("root")); @@ -317,11 +303,11 @@ public void testWriteReadTiny() throws IOException { try (VectorSchemaRoot root = VectorSchemaRoot.create(MessageSerializerTest.testSchema(), allocator)) { root.getFieldVectors().get(0).allocateNew(); - NullableTinyIntVector.Mutator mutator = (NullableTinyIntVector.Mutator) root.getFieldVectors().get(0).getMutator(); + TinyIntVector vector = (TinyIntVector) root.getFieldVectors().get(0); for (int i = 0; i < 16; i++) { - mutator.set(i, i < 8 ? 1 : 0, (byte) (i + 1)); + vector.set(i, i < 8 ? 1 : 0, (byte) (i + 1)); } - mutator.setValueCount(16); + vector.setValueCount(16); root.setRowCount(16); // write file @@ -365,12 +351,12 @@ public void testWriteReadTiny() throws IOException { private void validateTinyData(VectorSchemaRoot root) { Assert.assertEquals(16, root.getRowCount()); - NullableTinyIntVector vector = (NullableTinyIntVector) root.getFieldVectors().get(0); + TinyIntVector vector = (TinyIntVector) root.getFieldVectors().get(0); for (int i = 0; i < 16; i++) { if (i < 8) { - Assert.assertEquals((byte) (i + 1), vector.getAccessor().get(i)); + Assert.assertEquals((byte) (i + 1), vector.get(i)); } else { - Assert.assertTrue(vector.getAccessor().isNull(i)); + Assert.assertTrue(vector.isNull(i)); } } } @@ -397,7 +383,7 @@ public void testWriteReadMetadata() throws IOException { try (BufferAllocator originalVectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); NullableMapVector vector = (NullableMapVector) field.createVector(originalVectorAllocator)) { vector.allocateNewSafe(); - 
vector.getMutator().setValueCount(0); + vector.setValueCount(0); List vectors = ImmutableList.of(vector); VectorSchemaRoot root = new VectorSchemaRoot(originalSchema, vectors, 0); @@ -573,18 +559,18 @@ public void testWriteReadFixedSizeList() throws IOException { try (BufferAllocator originalVectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); NullableMapVector parent = NullableMapVector.empty("parent", originalVectorAllocator)) { FixedSizeListVector tuples = parent.addOrGet("float-pairs", FieldType.nullable(new FixedSizeList(2)), FixedSizeListVector.class); - NullableFloat4Vector floats = (NullableFloat4Vector) tuples.addOrGetVector(FieldType.nullable(MinorType.FLOAT4.getType())).getVector(); - NullableIntVector ints = parent.addOrGet("ints", FieldType.nullable(new Int(32, true)), NullableIntVector.class); + Float4Vector floats = (Float4Vector) tuples.addOrGetVector(FieldType.nullable(MinorType.FLOAT4.getType())).getVector(); + IntVector ints = parent.addOrGet("ints", FieldType.nullable(new Int(32, true)), IntVector.class); parent.allocateNew(); for (int i = 0; i < 10; i++) { - tuples.getMutator().setNotNull(i); - floats.getMutator().set(i * 2, i + 0.1f); - floats.getMutator().set(i * 2 + 1, i + 10.1f); - ints.getMutator().set(i, i); + tuples.setNotNull(i); + floats.set(i * 2, i + 0.1f); + floats.set(i * 2 + 1, i + 10.1f); + ints.set(i, i); } - parent.getMutator().setValueCount(10); + parent.setValueCount(10); write(parent, file, stream); } @@ -600,8 +586,8 @@ public void testWriteReadFixedSizeList() throws IOException { arrowReader.loadRecordBatch(rbBlock); Assert.assertEquals(count, root.getRowCount()); for (int i = 0; i < 10; i++) { - Assert.assertEquals(Lists.newArrayList(i + 0.1f, i + 10.1f), root.getVector("float-pairs").getAccessor().getObject(i)); - Assert.assertEquals(i, root.getVector("ints").getAccessor().getObject(i)); + Assert.assertEquals(Lists.newArrayList(i + 0.1f, i + 10.1f), 
root.getVector("float-pairs").getObject(i)); + Assert.assertEquals(i, root.getVector("ints").getObject(i)); } } } @@ -616,8 +602,8 @@ public void testWriteReadFixedSizeList() throws IOException { arrowReader.loadNextBatch(); Assert.assertEquals(count, root.getRowCount()); for (int i = 0; i < 10; i++) { - Assert.assertEquals(Lists.newArrayList(i + 0.1f, i + 10.1f), root.getVector("float-pairs").getAccessor().getObject(i)); - Assert.assertEquals(i, root.getVector("ints").getAccessor().getObject(i)); + Assert.assertEquals(Lists.newArrayList(i + 0.1f, i + 10.1f), root.getVector("float-pairs").getObject(i)); + Assert.assertEquals(i, root.getVector("ints").getObject(i)); } } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFooter.java b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFooter.java similarity index 93% rename from java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFooter.java rename to java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFooter.java index 4612465323130..235e8c1646712 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFooter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowFooter.java @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -package org.apache.arrow.vector.file; +package org.apache.arrow.vector.ipc; import static java.util.Arrays.asList; import static org.junit.Assert.assertEquals; @@ -27,6 +27,8 @@ import java.util.List; import org.apache.arrow.flatbuf.Footer; +import org.apache.arrow.vector.ipc.message.ArrowBlock; +import org.apache.arrow.vector.ipc.message.ArrowFooter; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowReaderWriter.java similarity index 85% rename from java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java rename to java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowReaderWriter.java index 3ce01a26835d5..bf42fbb83c84b 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowReaderWriter.java @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -package org.apache.arrow.vector.file; +package org.apache.arrow.vector.ipc; import static java.nio.channels.Channels.newChannel; import static java.util.Arrays.asList; @@ -37,9 +37,15 @@ import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.TestUtils; +import org.apache.arrow.vector.VectorLoader; import org.apache.arrow.vector.VectorSchemaRoot; -import org.apache.arrow.vector.schema.ArrowFieldNode; -import org.apache.arrow.vector.schema.ArrowRecordBatch; +import org.apache.arrow.vector.VectorUnloader; +import org.apache.arrow.vector.ipc.ArrowFileReader; +import org.apache.arrow.vector.ipc.ArrowFileWriter; +import org.apache.arrow.vector.ipc.SeekableReadChannel; +import org.apache.arrow.vector.ipc.message.ArrowBlock; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; @@ -87,7 +93,10 @@ public void test() throws IOException { ArrowFileWriter writer = new ArrowFileWriter(root, null, newChannel(out))) { ArrowBuf validityb = buf(validity); ArrowBuf valuesb = buf(values); - writer.writeRecordBatch(new ArrowRecordBatch(16, asList(new ArrowFieldNode(16, 8)), asList(validityb, valuesb))); + ArrowRecordBatch batch = new ArrowRecordBatch(16, asList(new ArrowFieldNode(16, 8)), asList(validityb, valuesb)); + VectorLoader loader = new VectorLoader(root); + loader.load(batch); + writer.writeBatch(); } byte[] byteArray = out.toByteArray(); @@ -96,11 +105,12 @@ public void test() throws IOException { ArrowFileReader reader = new ArrowFileReader(channel, allocator)) { Schema readSchema = reader.getVectorSchemaRoot().getSchema(); assertEquals(schema, readSchema); - assertTrue(readSchema.getFields().get(0).getTypeLayout().getVectorTypes().toString(), 
readSchema.getFields().get(0).getTypeLayout().getVectors().size() > 0); // TODO: dictionaries List recordBatches = reader.getRecordBlocks(); assertEquals(1, recordBatches.size()); - ArrowRecordBatch recordBatch = (ArrowRecordBatch) reader.readMessage(channel, allocator); + reader.loadNextBatch(); + VectorUnloader unloader = new VectorUnloader(reader.getVectorSchemaRoot()); + ArrowRecordBatch recordBatch = unloader.getRecordBatch(); List nodes = recordBatch.getNodes(); assertEquals(1, nodes.size()); ArrowFieldNode node = nodes.get(0); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStream.java b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowStream.java similarity index 83% rename from java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStream.java rename to java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowStream.java index e2efabef0095b..f87a0ebd54a0f 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStream.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowStream.java @@ -16,9 +16,8 @@ * limitations under the License. 
*/ -package org.apache.arrow.vector.file; +package org.apache.arrow.vector.ipc; -import static java.util.Arrays.asList; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -29,14 +28,12 @@ import io.netty.buffer.ArrowBuf; import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.vector.NullableTinyIntVector; +import org.apache.arrow.vector.TinyIntVector; import org.apache.arrow.vector.VectorSchemaRoot; -import org.apache.arrow.vector.schema.ArrowFieldNode; -import org.apache.arrow.vector.schema.ArrowMessage; -import org.apache.arrow.vector.schema.ArrowRecordBatch; -import org.apache.arrow.vector.stream.ArrowStreamReader; -import org.apache.arrow.vector.stream.ArrowStreamWriter; -import org.apache.arrow.vector.stream.MessageSerializerTest; +import org.apache.arrow.vector.ipc.ArrowStreamReader; +import org.apache.arrow.vector.ipc.ArrowStreamWriter; +import org.apache.arrow.vector.ipc.BaseFileTest; +import org.apache.arrow.vector.ipc.MessageSerializerTest; import org.apache.arrow.vector.types.pojo.Schema; import org.junit.Assert; import org.junit.Test; @@ -70,11 +67,11 @@ public void testReadWrite() throws IOException { int numBatches = 1; root.getFieldVectors().get(0).allocateNew(); - NullableTinyIntVector.Mutator mutator = (NullableTinyIntVector.Mutator) root.getFieldVectors().get(0).getMutator(); + TinyIntVector vector = (TinyIntVector)root.getFieldVectors().get(0); for (int i = 0; i < 16; i++) { - mutator.set(i, i < 8 ? 1 : 0, (byte) (i + 1)); + vector.set(i, i < 8 ? 
1 : 0, (byte) (i + 1)); } - mutator.setValueCount(16); + vector.setValueCount(16); root.setRowCount(16); ByteArrayOutputStream out = new ByteArrayOutputStream(); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStreamPipe.java b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowStreamPipe.java similarity index 69% rename from java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStreamPipe.java rename to java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowStreamPipe.java index 40716942f02a2..9ec9a078f7ef0 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowStreamPipe.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestArrowStreamPipe.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.arrow.vector.file; +package org.apache.arrow.vector.ipc; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -28,12 +28,11 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.NullableTinyIntVector; +import org.apache.arrow.vector.TinyIntVector; import org.apache.arrow.vector.VectorSchemaRoot; -import org.apache.arrow.vector.schema.ArrowMessage; -import org.apache.arrow.vector.stream.ArrowStreamReader; -import org.apache.arrow.vector.stream.ArrowStreamWriter; -import org.apache.arrow.vector.stream.MessageSerializerTest; +import org.apache.arrow.vector.ipc.ArrowStreamReader; +import org.apache.arrow.vector.ipc.ArrowStreamWriter; +import org.apache.arrow.vector.ipc.MessageSerializerTest; import org.apache.arrow.vector.types.pojo.Schema; import org.junit.Assert; import org.junit.Test; @@ -62,13 +61,13 @@ public void run() { writer.start(); for (int j = 0; j < numBatches; j++) { root.getFieldVectors().get(0).allocateNew(); - NullableTinyIntVector.Mutator mutator = (NullableTinyIntVector.Mutator) root.getFieldVectors().get(0).getMutator(); + 
TinyIntVector vector = (TinyIntVector) root.getFieldVectors().get(0); // Send a changing batch id first - mutator.set(0, j); + vector.set(0, j); for (int i = 1; i < 16; i++) { - mutator.set(i, i < 8 ? 1 : 0, (byte) (i + 1)); + vector.set(i, i < 8 ? 1 : 0, (byte) (i + 1)); } - mutator.setValueCount(16); + vector.setValueCount(16); root.setRowCount(16); writer.writeBatch(); @@ -95,37 +94,27 @@ private final class ReaderThread extends Thread { public ReaderThread(ReadableByteChannel sourceChannel) throws IOException { reader = new ArrowStreamReader(sourceChannel, alloc) { - @Override - protected ArrowMessage readMessage(ReadChannel in, BufferAllocator allocator) throws IOException { - // Read all the batches. Each batch contains an incrementing id and then some - // constant data. Verify both. - ArrowMessage message = super.readMessage(in, allocator); - if (message == null) { - done = true; - } else { - batchesRead++; - } - return message; - } @Override public boolean loadNextBatch() throws IOException { - if (!super.loadNextBatch()) { + if (super.loadNextBatch()) { + batchesRead++; + } else { + done = true; return false; } - if (!done) { - VectorSchemaRoot root = getVectorSchemaRoot(); - Assert.assertEquals(16, root.getRowCount()); - NullableTinyIntVector vector = (NullableTinyIntVector) root.getFieldVectors().get(0); - Assert.assertEquals((byte) (batchesRead - 1), vector.getAccessor().get(0)); - for (int i = 1; i < 16; i++) { - if (i < 8) { - Assert.assertEquals((byte) (i + 1), vector.getAccessor().get(i)); - } else { - Assert.assertTrue(vector.getAccessor().isNull(i)); - } + VectorSchemaRoot root = getVectorSchemaRoot(); + Assert.assertEquals(16, root.getRowCount()); + TinyIntVector vector = (TinyIntVector) root.getFieldVectors().get(0); + Assert.assertEquals((byte) (batchesRead - 1), vector.get(0)); + for (int i = 1; i < 16; i++) { + if (i < 8) { + Assert.assertEquals((byte) (i + 1), vector.get(i)); + } else { + Assert.assertTrue(vector.isNull(i)); } } + return 
true; } }; @@ -135,11 +124,8 @@ public boolean loadNextBatch() throws IOException { public void run() { try { assertEquals(schema, reader.getVectorSchemaRoot().getSchema()); - assertTrue( - reader.getVectorSchemaRoot().getSchema().getFields().get(0).getTypeLayout().getVectorTypes().toString(), - reader.getVectorSchemaRoot().getSchema().getFields().get(0).getTypeLayout().getVectors().size() > 0); while (!done) { - assertTrue(reader.loadNextBatch()); + assertTrue(reader.loadNextBatch() != done); } reader.close(); } catch (IOException e) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java similarity index 90% rename from java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java rename to java/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java index ee90d340d7cc8..6a23a8438de48 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -package org.apache.arrow.vector.file.json; +package org.apache.arrow.vector.ipc; import java.io.File; import java.io.IOException; @@ -24,11 +24,9 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.VectorSchemaRoot; -import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.NullableMapVector; import org.apache.arrow.vector.dictionary.DictionaryProvider; import org.apache.arrow.vector.dictionary.DictionaryProvider.MapDictionaryProvider; -import org.apache.arrow.vector.file.BaseFileTest; import org.apache.arrow.vector.types.pojo.Schema; import org.apache.arrow.vector.util.Validator; import org.junit.Assert; @@ -39,6 +37,33 @@ public class TestJSONFile extends BaseFileTest { private static final Logger LOGGER = LoggerFactory.getLogger(TestJSONFile.class); + @Test + public void testWriteRead() throws IOException { + File file = new File("target/mytest.json"); + int count = COUNT; + + // write + try (BufferAllocator originalVectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + NullableMapVector parent = NullableMapVector.empty("parent", originalVectorAllocator)) { + writeData(count, parent); + writeJSON(file, new VectorSchemaRoot(parent.getChild("root")), null); + } + + // read + try ( + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + JsonFileReader reader = new JsonFileReader(file, readerAllocator) + ) { + Schema schema = reader.start(); + LOGGER.debug("reading schema: " + schema); + + // initialize vectors + try (VectorSchemaRoot root = reader.read();) { + validateContent(count, root); + } + } + } + @Test public void testWriteReadComplexJSON() throws IOException { File file = new File("target/mytest_complex.json"); @@ -47,7 +72,7 @@ public void testWriteReadComplexJSON() throws IOException { // write try ( BufferAllocator originalVectorAllocator = 
allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); - MapVector parent = MapVector.empty("parent", originalVectorAllocator)) { + NullableMapVector parent = NullableMapVector.empty("parent", originalVectorAllocator)) { writeComplexData(count, parent); writeJSON(file, new VectorSchemaRoot(parent.getChild("root")), null); } @@ -280,7 +305,7 @@ public void testSetStructLength() throws IOException { // initialize vectors try (VectorSchemaRoot root = reader.read();) { FieldVector vector = root.getVector("struct_nullable"); - Assert.assertEquals(7, vector.getAccessor().getValueCount()); + Assert.assertEquals(7, vector.getValueCount()); } } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java index f98aeac8c8196..f6f1ad221f3d1 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java @@ -21,13 +21,19 @@ import static org.apache.arrow.vector.types.FloatingPointPrecision.DOUBLE; import static org.apache.arrow.vector.types.FloatingPointPrecision.SINGLE; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import java.nio.ByteBuffer; import java.util.HashMap; import java.util.Map; import com.google.common.collect.ImmutableList; import com.google.flatbuffers.FlatBufferBuilder; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.types.TimeUnit; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.UnionMode; @@ -64,6 +70,40 @@ public void complex() { run(initialField); } + @Test + public void list() throws Exception { + ImmutableList.Builder childrenBuilder = ImmutableList.builder(); + try 
(BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); + ListVector writeVector = ListVector.empty("list", allocator); + FixedSizeListVector writeFixedVector = FixedSizeListVector.empty("fixedlist", 5, allocator)) { + Field listVectorField = writeVector.getField(); + childrenBuilder.add(listVectorField); + Field listFixedVectorField = writeFixedVector.getField(); + childrenBuilder.add(listFixedVectorField); + } + + Field initialField = new Field("a", FieldType.nullable(Struct.INSTANCE), childrenBuilder.build()); + ImmutableList.Builder parentBuilder = ImmutableList.builder(); + parentBuilder.add(initialField); + FlatBufferBuilder builder = new FlatBufferBuilder(); + builder.finish(initialField.getField(builder)); + org.apache.arrow.flatbuf.Field flatBufField = org.apache.arrow.flatbuf.Field.getRootAsField(builder.dataBuffer()); + Field finalField = Field.convertField(flatBufField); + assertEquals(initialField, finalField); + assertFalse(finalField.toString().contains("[DEFAULT]")); + + Schema initialSchema = new Schema(parentBuilder.build()); + String jsonSchema = initialSchema.toJson(); + String modifiedSchema = jsonSchema.replace("$data$", "[DEFAULT]"); + + Schema tempSchema = Schema.fromJSON(modifiedSchema); + FlatBufferBuilder schemaBuilder = new FlatBufferBuilder(); + org.apache.arrow.vector.types.pojo.Schema schema = new org.apache.arrow.vector.types.pojo.Schema(tempSchema.getFields()); + schemaBuilder.finish(schema.getSchema(schemaBuilder)); + Schema finalSchema = Schema.deserialize(ByteBuffer.wrap(schemaBuilder.sizedByteArray())); + assertFalse(finalSchema.toString().contains("[DEFAULT]")); + } + @Test public void schema() { ImmutableList.Builder childrenBuilder = ImmutableList.builder(); diff --git a/js/.gitignore b/js/.gitignore index 6d0f88d191cb0..f705f2510f93f 100644 --- a/js/.gitignore +++ b/js/.gitignore @@ -18,6 +18,7 @@ # Logs logs *.log +.esm-cache npm-debug.log* yarn-debug.log* yarn-error.log* @@ -57,10 +58,6 @@ build/Release 
node_modules/ jspm_packages/ -# Typescript declaration files -types/ -typings/ - # Optional npm cache directory .npm @@ -85,6 +82,9 @@ package-lock.json # compilation targets dist -targets/es5 -targets/es2015 -targets/esnext +targets + +# test data files +test/data/ +# jest snapshots (too big) +test/__snapshots__/ \ No newline at end of file diff --git a/js/.npmrc b/js/.npmrc index 71ffabdd55d95..b6b25d1f1816d 100644 --- a/js/.npmrc +++ b/js/.npmrc @@ -1,2 +1,2 @@ save-prefix= -package-lock=false +package-lock=false \ No newline at end of file diff --git a/js/DEVELOP.md b/js/DEVELOP.md index 5b4ac14ed3466..1dd999a9efbb6 100644 --- a/js/DEVELOP.md +++ b/js/DEVELOP.md @@ -17,6 +17,30 @@ under the License. --> +# Getting Involved +Even if you do not plan to contribute to Apache Arrow itself or Arrow +integrations in other projects, we'd be happy to have you involved: + +* Join the mailing list: send an email to + [dev-subscribe@arrow.apache.org][1]. Share your ideas and use cases for the + project. +* [Follow our activity on JIRA][3] +* [Learn the format][2] +* Contribute code to one of the reference implementations + +We prefer to receive contributions in the form of GitHub pull requests. Please send pull requests against the [github.com/apache/arrow][4] repository. + +If you are looking for some ideas on what to contribute, check out the [JIRA +issues][3] for the Apache Arrow project. Comment on the issue and/or contact +[dev@arrow.apache.org](http://mail-archives.apache.org/mod_mbox/arrow-dev/) +with your questions and ideas. + +If you’d like to report a bug but don’t have time to fix it, you can still post +it on JIRA, or email the mailing list +[dev@arrow.apache.org](http://mail-archives.apache.org/mod_mbox/arrow-dev/) + + + # The npm scripts * `npm run clean` - cleans targets @@ -50,91 +74,220 @@ Once generated, the flatbuffers format code needs to be adjusted for our TS and 1. 
Generate the flatbuffers TypeScript source from the Arrow project root directory: ```sh + cd $ARROW_HOME + flatc --ts -o ./js/src/format ./format/*.fbs + + cd ./js/src/format + + # Delete Tensor_generated.ts (skip this when we support Tensors) + rm ./Tensor_generated.ts + + # Remove "_generated" suffix from TS files + mv ./File_generated.ts ./File.ts + mv ./Schema_generated.ts ./Schema.ts + mv ./Message_generated.ts ./Message.ts ``` -1. Change all the `flatbuffers` imports to +1. Remove Tensor import from `Schema.ts` +1. Fix all the `flatbuffers` imports ```ts - import { flatbuffers } from "flatbuffers" + import { flatbuffers } from "./flatbuffers" // <-- change + import { flatbuffers } from "flatbuffers" // <-- to this ``` -1. Delete `Tensor_generated.ts` (remove this step once we support Tensors) -1. Remove Tensor import from `Schema_generated.ts` -1. Add `/* tslint:disable:class-name */` to the top of `Schema_generated.ts` +1. Remove `_generated` from the ES6 imports of the generated files + ```ts + import * as NS16187549871986683199 from "./Schema_generated"; // <-- change + import * as NS16187549871986683199 from "./Schema"; // <------- to this + ``` +1. Add `/* tslint:disable:class-name */` to the top of `Schema.ts` +1. Execute `npm run lint` to fix all the linting errors ## JavaScript (for Google Closure Compiler builds) 1. Generate the flatbuffers JS source from the Arrow project root directory ```sh - flatc --js -o ./js/closure-compiler-scripts ./format/*.fbs - ``` -1. Delete `Tensor_generated.js` (remove this step once we support Tensors) -1.
Add `goog.module` declarations to the top of each generated file + cd $ARROW_HOME - Each file starts with a header that looks like this: - ```js - // automatically generated by the FlatBuffers compiler, do not modify + flatc --js --no-js-exports -o ./js/src/format ./format/*.fbs - /** - * @const - * @namespace - */ - var org = org || {}; - ``` + cd ./js/src/format + + # Delete Tensor_generated.js (skip this when we support Tensors) + rm Tensor_generated.js + + # append an ES6 export to Schema_generated.js + echo "$(cat Schema_generated.js) + export { org }; + " > Schema_generated.js - Update the header of each file to explicitly declare its module. + # import Schema's "org" namespace and + # append an ES6 export to File_generated.js + echo "import { org } from './Schema'; + $(cat File_generated.js) + export { org }; + " > File_generated.js - `Schema_generated.js`: + # import Schema's "org" namespace and + # append an ES6 export to Message_generated.js + echo "import { org } from './Schema'; + $(cat Message_generated.js) + export { org }; + " > Message_generated.js + ``` +1. 
Fixup the generated JS enums with the reverse value-to-key mappings to match TypeScript + `Message_generated.js` ```js - // automatically generated by the FlatBuffers compiler, do not modify - goog.module("module$targets$es5$cls$format$Schema_generated"); - goog.module.declareLegacyNamespace(); + // Replace this + org.apache.arrow.flatbuf.MessageHeader = { + NONE: 0, + Schema: 1, + DictionaryBatch: 2, + RecordBatch: 3, + Tensor: 4 + }; + // With this + org.apache.arrow.flatbuf.MessageHeader = { + NONE: 0, 0: 'NONE', + Schema: 1, 1: 'Schema', + DictionaryBatch: 2, 2: 'DictionaryBatch', + RecordBatch: 3, 3: 'RecordBatch', + Tensor: 4, 4: 'Tensor' + }; + ``` + `Schema_generated.js` + ```js + /** + * @enum + */ + org.apache.arrow.flatbuf.MetadataVersion = { + /** + * 0.1.0 + */ + V1: 0, 0: 'V1', + + /** + * 0.2.0 + */ + V2: 1, 1: 'V2', + + /** + * 0.3.0 -> 0.7.1 + */ + V3: 2, 2: 'V3', + + /** + * >= 0.8.0 + */ + V4: 3, 3: 'V4' + }; /** - * @const - * @namespace - */ - var org = org || {}; - ``` + * @enum + */ + org.apache.arrow.flatbuf.UnionMode = { + Sparse: 0, 0: 'Sparse', + Dense: 1, 1: 'Dense', + }; - `File_generated.js`: + /** + * @enum + */ + org.apache.arrow.flatbuf.Precision = { + HALF: 0, 0: 'HALF', + SINGLE: 1, 1: 'SINGLE', + DOUBLE: 2, 2: 'DOUBLE', + }; - ```js - // automatically generated by the FlatBuffers compiler, do not modify - goog.module("module$targets$es5$cls$format$File_generated"); - goog.module.declareLegacyNamespace(); - var Schema_ = goog.require("module$targets$es5$cls$format$Schema_generated"); /** - * @const - * @namespace - */ - var org = Schema_.org; - ``` + * @enum + */ + org.apache.arrow.flatbuf.DateUnit = { + DAY: 0, 0: 'DAY', + MILLISECOND: 1, 1: 'MILLISECOND', + }; + + /** + * @enum + */ + org.apache.arrow.flatbuf.TimeUnit = { + SECOND: 0, 0: 'SECOND', + MILLISECOND: 1, 1: 'MILLISECOND', + MICROSECOND: 2, 2: 'MICROSECOND', + NANOSECOND: 3, 3: 'NANOSECOND', + }; - `Message_generated.js`: + /** + * @enum + */ + 
org.apache.arrow.flatbuf.IntervalUnit = { + YEAR_MONTH: 0, 0: 'YEAR_MONTH', + DAY_TIME: 1, 1: 'DAY_TIME', + }; - ```js - // automatically generated by the FlatBuffers compiler, do not modify - goog.module("module$targets$es5$cls$format$Message_generated"); - goog.module.declareLegacyNamespace(); - var Schema_ = goog.require("module$targets$es5$cls$format$Schema_generated"); /** - * @const - * @namespace - */ - var org = Schema_.org; - ``` + * ---------------------------------------------------------------------- + * Top-level Type value, enabling extensible type-specific metadata. We can + * add new logical types to Type without breaking backwards compatibility + * + * @enum + */ + org.apache.arrow.flatbuf.Type = { + NONE: 0, 0: 'NONE', + Null: 1, 1: 'Null', + Int: 2, 2: 'Int', + FloatingPoint: 3, 3: 'FloatingPoint', + Binary: 4, 4: 'Binary', + Utf8: 5, 5: 'Utf8', + Bool: 6, 6: 'Bool', + Decimal: 7, 7: 'Decimal', + Date: 8, 8: 'Date', + Time: 9, 9: 'Time', + Timestamp: 10, 10: 'Timestamp', + Interval: 11, 11: 'Interval', + List: 12, 12: 'List', + Struct_: 13, 13: 'Struct_', + Union: 14, 14: 'Union', + FixedSizeBinary: 15, 15: 'FixedSizeBinary', + FixedSizeList: 16, 16: 'FixedSizeList', + Map: 17, 17: 'Map' + }; -1. 
Replace the last line's export declaration + /** + * ---------------------------------------------------------------------- + * The possible types of a vector + * + * @enum + */ + org.apache.arrow.flatbuf.VectorType = { + /** + * used in List type, Dense Union and variable length primitive types (String, Binary) + */ + OFFSET: 0, 0: 'OFFSET', - The last line of each file is: + /** + * actual data, either fixed width primitive types in slots or variable width delimited by an OFFSET vector + */ + DATA: 1, 1: 'DATA', - ```js - // Exports for Node.js and RequireJS - this.org = org; - ``` + /** + * Bit vector indicating if each value is null + */ + VALIDITY: 2, 2: 'VALIDITY', - This should instead read: + /** + * Type vector used in Union type + */ + TYPE: 3, 3: 'TYPE' + }; - ```js - // Exports for Node.js and RequireJS - exports.org = org; - ``` \ No newline at end of file + /** + * ---------------------------------------------------------------------- + * Endianness of the platform producing the data + * + * @enum + */ + org.apache.arrow.flatbuf.Endianness = { + Little: 0, 0: 'Little', + Big: 1, 1: 'Big', + }; + ``` diff --git a/js/LICENSE b/js/LICENSE deleted file mode 100644 index 02e7948085055..0000000000000 --- a/js/LICENSE +++ /dev/null @@ -1,39 +0,0 @@ -## 3rd-party licenses for code that has been adapted for the Arrow JavaScript - library - --------------------------------------------------------------------------------- - -This project includes code from the FlatBuffers project - -Copyright 2014 Google Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and -limitations under the License. - --------------------------------------------------------------------------------- - -This project includes code from the tslib project - -Copyright 2015 Microsoft Corporation. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - diff --git a/js/README.md b/js/README.md index bee3a9c7d42db..b427923e37ea1 100644 --- a/js/README.md +++ b/js/README.md @@ -22,7 +22,7 @@ [![Build Status](https://travis-ci.org/apache/arrow.svg?branch=master)](https://travis-ci.org/apache/arrow) [![Coverage Status](https://coveralls.io/repos/github/apache/arrow/badge.svg)](https://coveralls.io/github/apache/arrow) -Arrow is a set of technologies that enable big-data systems to process and move data fast. +Arrow is a set of technologies that enable big data systems to process and transfer data quickly. ## install [apache-arrow from npm](https://www.npmjs.com/package/apache-arrow) @@ -32,14 +32,9 @@ Arrow is a set of technologies that enable big-data systems to process and move # Powering Columnar In-Memory Analytics -Apache Arrow is a columnar memory layout specification for encoding vectors and table-like containers of flat and nested data. The Arrow spec aligns columnar data in memory to minimize cache misses and take advantage of the latest SIMD (Single input multiple data) and GPU operations on modern processors. 
+[Apache Arrow](https://github.com/apache/arrow) is a columnar memory layout specification for encoding vectors and table-like containers of flat and nested data. The Arrow spec aligns columnar data in memory to minimize cache misses and take advantage of the latest SIMD (Single instruction, multiple data) and GPU operations on modern processors. -Apache Arrow is the emerging standard for large in-memory columnar data ([Spark](https://spark.apache.org/), [Pandas](http://wesmckinney.com/blog/pandas-and-apache-arrow/), [Drill](https://drill.apache.org/), ...). By standardizing on a common binary interchange format, big data systems can reduce the costs and friction associated with cross-system communication. - -# Related Projects - -* [GoAI](http://gpuopenanalytics.com/) -- Arrow-powered GPU analytics -* [rxjs-mapd](https://github.com/graphistry/rxjs-mapd) -- A MapD Core node-driver that returns query results as Arrow columns +Apache Arrow is the emerging standard for large in-memory columnar data ([Spark](https://spark.apache.org/), [Pandas](http://wesmckinney.com/blog/pandas-and-apache-arrow/), [Drill](https://drill.apache.org/), [Graphistry](https://www.graphistry.com), ...). By standardizing on a common binary interchange format, big data systems can reduce the costs and friction associated with cross-system communication.
# Usage @@ -50,7 +45,7 @@ import { readFileSync } from 'fs'; import { Table } from 'apache-arrow'; const arrow = readFileSync('simple.arrow'); -const table = Table.from(arrow); +const table = Table.from([arrow]); console.log(table.toString()); @@ -70,7 +65,7 @@ null, null, null import { readFileSync } from 'fs'; import { Table } from 'apache-arrow'; -const table = Table.from(...[ +const table = Table.from([ 'latlong/schema.arrow', 'latlong/records.arrow' ].map((file) => readFileSync(file))); @@ -93,12 +88,12 @@ console.log(table.toString()); import { readFileSync } from 'fs'; import { Table } from 'apache-arrow'; -const table = Table.from(...[ +const table = Table.from([ 'latlong/schema.arrow', 'latlong/records.arrow' ].map(readFileSync)); -const column = table.getColumn('origin_lat'); +const column = table.col('origin_lat'); const typed = column.slice(); assert(typed instanceof Float32Array); @@ -135,7 +130,7 @@ MapD.open(host, port) // Create Arrow Table from results Table.from(schema, records)) .map((table) => - // Stringify the table to CSV + // Stringify the table to CSV with row numbers table.toString({ index: true })) .subscribe((csvStr) => console.log(csvStr)); @@ -183,22 +178,46 @@ The base `apache-arrow` package includes all the compilation targets for conveni The targets are also published under the `@apache-arrow` namespace: ```sh -npm install @apache-arrow/es5-cjs # ES5 CommonJS target -npm install @apache-arrow/es5-esm # ES5 ESModules target -npm install @apache-arrow/es5-umd # ES5 UMD target -npm install @apache-arrow/es2015-cjs # ES2015 CommonJS target -npm install @apache-arrow/es2015-esm # ES2015 ESModules target -npm install @apache-arrow/es2015-umd # ES2015 UMD target -npm install @apache-arrow/esnext-esm # ESNext CommonJS target -npm install @apache-arrow/esnext-esm # ESNext ESModules target -npm install @apache-arrow/esnext-umd # ESNext UMD target +npm install apache-arrow # <-- combined es5/UMD, es2015/CommonJS/ESModules/UMD, and TypeScript 
package +npm install @apache-arrow/ts # standalone TypeScript package +npm install @apache-arrow/es5-cjs # standalone es5/CommonJS package +npm install @apache-arrow/es5-esm # standalone es5/ESModules package +npm install @apache-arrow/es5-umd # standalone es5/UMD package +npm install @apache-arrow/es2015-cjs # standalone es2015/CommonJS package +npm install @apache-arrow/es2015-esm # standalone es2015/ESModules package +npm install @apache-arrow/es2015-umd # standalone es2015/UMD package +npm install @apache-arrow/esnext-cjs # standalone esNext/CommonJS package +npm install @apache-arrow/esnext-esm # standalone esNext/ESModules package +npm install @apache-arrow/esnext-umd # standalone esNext/UMD package ``` ### Why we package like this The JS community is a diverse group with a varied list of target environments and tool chains. Publishing multiple packages accommodates projects of all stripes. -If you think we missed a compilation target and it's a blocker for adoption, please open an issue. We're here for you ❤️. +If you think we missed a compilation target and it's a blocker for adoption, please open an issue. + +# People + +Full list of broader Apache Arrow [committers](https://arrow.apache.org/committers/). + +* Brian Hulette, CCRi, _contributor_ +* Paul Taylor, Graphistry, Inc., _committer_ + +# Powered By Apache Arrow in JS + +Full list of broader Apache Arrow [projects & organizations](https://github.com/apache/arrow/blob/master/site/powered_by.md).
+ +## Open Source Projects + +* [Apache Arrow](https://arrow.apache.org) -- Parent project for Powering Columnar In-Memory Analytics, including affiliated open source projects +* [rxjs-mapd](https://github.com/graphistry/rxjs-mapd) -- A MapD Core node-driver that returns query results as Arrow columns + +## Companies & Organizations + +* [CCRi](http://www.ccri.com/) -- Commonwealth Computer Research Inc, or CCRi, is a Central Virginia based data science and software engineering company +* [GOAI](http://gpuopenanalytics.com/) -- GPU Open Analytics Initiative standardizes on Arrow as part of creating common data frameworks that enable developers and statistical researchers to accelerate data science on GPUs +* [Graphistry, Inc.](https://www.graphistry.com/) - An end-to-end GPU accelerated visual investigation platform used by teams for security, anti-fraud, and related investigations. Graphistry uses Arrow in its NodeJS GPU backend and client libraries, and is an early contributing member to GOAI and Arrow\[JS\] working to bring these technologies to the enterprise. # License @@ -207,4 +226,4 @@ If you think we missed a compilation target and it's a blocker for adoption, ple [1]: mailto:dev-subscribe@arrow.apache.org [2]: https://github.com/apache/arrow/tree/master/format [3]: https://issues.apache.org/jira/browse/ARROW -[4]: https://github.com/apache/arrow \ No newline at end of file +[4]: https://github.com/apache/arrow diff --git a/js/bin/integration.js b/js/bin/integration.js new file mode 100755 index 0000000000000..fe32433d3845a --- /dev/null +++ b/js/bin/integration.js @@ -0,0 +1,86 @@ +#! /usr/bin/env node + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +var path = require('path'); +var gulp = require.resolve(path.join(`..`, `node_modules/gulp/bin/gulp.js`)); +var child_process = require(`child_process`); +var optionList = [ + { + type: String, + name: 'mode', + description: 'The integration test to run' + }, + { + type: String, + name: 'arrow', alias: 'a', + description: 'The Arrow file to read/write' + }, + { + type: String, + name: 'json', alias: 'j', + description: 'The JSON file to read/write' + } +]; + +var argv = require(`command-line-args`)(optionList, { partial: true }); + +function print_usage() { + console.log(require('command-line-usage')([ + { + header: 'integration', + content: 'Script for running Arrow integration tests' + }, + { + header: 'Synopsis', + content: [ + '$ integration.js -j file.json -a file.arrow --mode validate' + ] + }, + { + header: 'Options', + optionList: [ + ...optionList, + { + name: 'help', + description: 'Print this usage guide.' 
+ } + ] + }, + ])); + process.exit(1); +} + +if (!argv.arrow || !argv.json || !argv.mode) { + return print_usage(); +} + +switch (argv.mode.toUpperCase()) { + case 'VALIDATE': + child_process.spawnSync( + gulp, + [`test`, `-i`].concat(process.argv.slice(2)), + { + cwd: path.resolve(__dirname, '..'), + stdio: ['ignore', 'inherit', 'inherit'] + } + ); + break; + default: + print_usage(); +} diff --git a/js/closure-compiler-scripts/flatbuffers.js b/js/closure-compiler-scripts/flatbuffers.js deleted file mode 100644 index e51a4a039536f..0000000000000 --- a/js/closure-compiler-scripts/flatbuffers.js +++ /dev/null @@ -1,1204 +0,0 @@ -/** - * closure-compiler-friendly flatbuffers - * copied from node_modules/flatbuffers/js/flatbuffers.js - * update as needed - */ - - /// @file -/// @addtogroup flatbuffers_javascript_api -/// @{ -/// @cond FLATBUFFERS_INTERNAL - -goog.module("module$flatbuffers"); -goog.module.declareLegacyNamespace(); -/** - * @fileoverview - * - * Need to suppress 'global this' error so the Node.js export line doesn't cause - * closure compile to error out. 
- * @suppress {globalThis} - */ - -/** - * @const - * @namespace - */ -var flatbuffers = {}; - -/** - * @typedef {number} - */ -flatbuffers.Offset; - -/** - * @typedef {{ - * bb: flatbuffers.ByteBuffer, - * bb_pos: number - * }} - */ -flatbuffers.Table; - -/** - * @type {number} - * @const - */ -flatbuffers.SIZEOF_SHORT = 2; - -/** - * @type {number} - * @const - */ -flatbuffers.SIZEOF_INT = 4; - -/** - * @type {number} - * @const - */ -flatbuffers.FILE_IDENTIFIER_LENGTH = 4; - -/** - * @enum {number} - */ -flatbuffers.Encoding = { - UTF8_BYTES: 1, - UTF16_STRING: 2 -}; - -/** - * @type {Int32Array} - * @const - */ -flatbuffers.int32 = new Int32Array(2); - -/** - * @type {Float32Array} - * @const - */ -flatbuffers.float32 = new Float32Array(flatbuffers.int32.buffer); - -/** - * @type {Float64Array} - * @const - */ -flatbuffers.float64 = new Float64Array(flatbuffers.int32.buffer); - -/** - * @type {boolean} - * @const - */ -flatbuffers.isLittleEndian = new Uint16Array(new Uint8Array([1, 0]).buffer)[0] === 1; - -//////////////////////////////////////////////////////////////////////////////// - -/** - * @constructor - * @param {number} low - * @param {number} high - */ -flatbuffers.Long = function(low, high) { - /** - * @type {number} - * @const - */ - this.low = low | 0; - - /** - * @type {number} - * @const - */ - this.high = high | 0; -}; - -/** - * @param {number} low - * @param {number} high - * @returns {flatbuffers.Long} - */ -flatbuffers.Long.create = function(low, high) { - // Special-case zero to avoid GC overhead for default values - return low == 0 && high == 0 ? 
flatbuffers.Long.ZERO : new flatbuffers.Long(low, high); -}; - -/** - * @returns {number} - */ -flatbuffers.Long.prototype.toFloat64 = function() { - return (this.low >>> 0) + this.high * 0x100000000; -}; - -/** - * @param {flatbuffers.Long} other - * @returns {boolean} - */ -flatbuffers.Long.prototype.equals = function(other) { - return this.low == other.low && this.high == other.high; -}; - -/** - * @type {flatbuffers.Long} - * @const - */ -flatbuffers.Long.ZERO = new flatbuffers.Long(0, 0); - -/// @endcond -//////////////////////////////////////////////////////////////////////////////// -/** - * Create a FlatBufferBuilder. - * - * @constructor - * @param {number=} opt_initial_size - */ -flatbuffers.Builder = function(opt_initial_size) { - if (!opt_initial_size) { - var initial_size = 1024; - } else { - var initial_size = opt_initial_size; - } - - /** - * @type {flatbuffers.ByteBuffer} - * @private - */ - this.bb = flatbuffers.ByteBuffer.allocate(initial_size); - - /** - * Remaining space in the ByteBuffer. - * - * @type {number} - * @private - */ - this.space = initial_size; - - /** - * Minimum alignment encountered so far. - * - * @type {number} - * @private - */ - this.minalign = 1; - - /** - * The vtable for the current table. - * - * @type {Array.} - * @private - */ - this.vtable = null; - - /** - * The amount of fields we're actually using. - * - * @type {number} - * @private - */ - this.vtable_in_use = 0; - - /** - * Whether we are currently serializing a table. - * - * @type {boolean} - * @private - */ - this.isNested = false; - - /** - * Starting offset of the current struct/table. - * - * @type {number} - * @private - */ - this.object_start = 0; - - /** - * List of offsets of all vtables. - * - * @type {Array.} - * @private - */ - this.vtables = []; - - /** - * For the current vector being built. 
- * - * @type {number} - * @private - */ - this.vector_num_elems = 0; - - /** - * False omits default values from the serialized data - * - * @type {boolean} - * @private - */ - this.force_defaults = false; -}; - -/** - * In order to save space, fields that are set to their default value - * don't get serialized into the buffer. Forcing defaults provides a - * way to manually disable this optimization. - * - * @param {boolean} forceDefaults true always serializes default values - */ -flatbuffers.Builder.prototype.forceDefaults = function(forceDefaults) { - this.force_defaults = forceDefaults; -}; - -/** - * Get the ByteBuffer representing the FlatBuffer. Only call this after you've - * called finish(). The actual data starts at the ByteBuffer's current position, - * not necessarily at 0. - * - * @returns {flatbuffers.ByteBuffer} - */ -flatbuffers.Builder.prototype.dataBuffer = function() { - return this.bb; -}; - -/** - * Get the bytes representing the FlatBuffer. Only call this after you've - * called finish(). - * - * @returns {Uint8Array} - */ -flatbuffers.Builder.prototype.asUint8Array = function() { - return this.bb.bytes().subarray(this.bb.position(), this.bb.position() + this.offset()); -}; - -/// @cond FLATBUFFERS_INTERNAL -/** - * Prepare to write an element of `size` after `additional_bytes` have been - * written, e.g. if you write a string, you need to align such the int length - * field is aligned to 4 bytes, and the string data follows it directly. If all - * you need to do is alignment, `additional_bytes` will be 0. - * - * @param {number} size This is the of the new element to write - * @param {number} additional_bytes The padding size - */ -flatbuffers.Builder.prototype.prep = function(size, additional_bytes) { - // Track the biggest thing we've ever aligned to. 
- if (size > this.minalign) { - this.minalign = size; - } - - // Find the amount of alignment needed such that `size` is properly - // aligned after `additional_bytes` - var align_size = ((~(this.bb.capacity() - this.space + additional_bytes)) + 1) & (size - 1); - - // Reallocate the buffer if needed. - while (this.space < align_size + size + additional_bytes) { - var old_buf_size = this.bb.capacity(); - this.bb = flatbuffers.Builder.growByteBuffer(this.bb); - this.space += this.bb.capacity() - old_buf_size; - } - - this.pad(align_size); -}; - -/** - * @param {number} byte_size - */ -flatbuffers.Builder.prototype.pad = function(byte_size) { - for (var i = 0; i < byte_size; i++) { - this.bb.writeInt8(--this.space, 0); - } -}; - -/** - * @param {number} value - */ -flatbuffers.Builder.prototype.writeInt8 = function(value) { - this.bb.writeInt8(this.space -= 1, value); -}; - -/** - * @param {number} value - */ -flatbuffers.Builder.prototype.writeInt16 = function(value) { - this.bb.writeInt16(this.space -= 2, value); -}; - -/** - * @param {number} value - */ -flatbuffers.Builder.prototype.writeInt32 = function(value) { - this.bb.writeInt32(this.space -= 4, value); -}; - -/** - * @param {flatbuffers.Long} value - */ -flatbuffers.Builder.prototype.writeInt64 = function(value) { - this.bb.writeInt64(this.space -= 8, value); -}; - -/** - * @param {number} value - */ -flatbuffers.Builder.prototype.writeFloat32 = function(value) { - this.bb.writeFloat32(this.space -= 4, value); -}; - -/** - * @param {number} value - */ -flatbuffers.Builder.prototype.writeFloat64 = function(value) { - this.bb.writeFloat64(this.space -= 8, value); -}; -/// @endcond - -/** - * Add an `int8` to the buffer, properly aligned, and grows the buffer (if necessary). - * @param {number} value The `int8` to add the the buffer. 
- */ -flatbuffers.Builder.prototype.addInt8 = function(value) { - this.prep(1, 0); - this.writeInt8(value); -}; - -/** - * Add an `int16` to the buffer, properly aligned, and grows the buffer (if necessary). - * @param {number} value The `int16` to add the the buffer. - */ -flatbuffers.Builder.prototype.addInt16 = function(value) { - this.prep(2, 0); - this.writeInt16(value); -}; - -/** - * Add an `int32` to the buffer, properly aligned, and grows the buffer (if necessary). - * @param {number} value The `int32` to add the the buffer. - */ -flatbuffers.Builder.prototype.addInt32 = function(value) { - this.prep(4, 0); - this.writeInt32(value); -}; - -/** - * Add an `int64` to the buffer, properly aligned, and grows the buffer (if necessary). - * @param {flatbuffers.Long} value The `int64` to add the the buffer. - */ -flatbuffers.Builder.prototype.addInt64 = function(value) { - this.prep(8, 0); - this.writeInt64(value); -}; - -/** - * Add a `float32` to the buffer, properly aligned, and grows the buffer (if necessary). - * @param {number} value The `float32` to add the the buffer. - */ -flatbuffers.Builder.prototype.addFloat32 = function(value) { - this.prep(4, 0); - this.writeFloat32(value); -}; - -/** - * Add a `float64` to the buffer, properly aligned, and grows the buffer (if necessary). - * @param {number} value The `float64` to add the the buffer. 
- */ -flatbuffers.Builder.prototype.addFloat64 = function(value) { - this.prep(8, 0); - this.writeFloat64(value); -}; - -/// @cond FLATBUFFERS_INTERNAL -/** - * @param {number} voffset - * @param {number} value - * @param {number} defaultValue - */ -flatbuffers.Builder.prototype.addFieldInt8 = function(voffset, value, defaultValue) { - if (this.force_defaults || value != defaultValue) { - this.addInt8(value); - this.slot(voffset); - } -}; - -/** - * @param {number} voffset - * @param {number} value - * @param {number} defaultValue - */ -flatbuffers.Builder.prototype.addFieldInt16 = function(voffset, value, defaultValue) { - if (this.force_defaults || value != defaultValue) { - this.addInt16(value); - this.slot(voffset); - } -}; - -/** - * @param {number} voffset - * @param {number} value - * @param {number} defaultValue - */ -flatbuffers.Builder.prototype.addFieldInt32 = function(voffset, value, defaultValue) { - if (this.force_defaults || value != defaultValue) { - this.addInt32(value); - this.slot(voffset); - } -}; - -/** - * @param {number} voffset - * @param {flatbuffers.Long} value - * @param {flatbuffers.Long} defaultValue - */ -flatbuffers.Builder.prototype.addFieldInt64 = function(voffset, value, defaultValue) { - if (this.force_defaults || !value.equals(defaultValue)) { - this.addInt64(value); - this.slot(voffset); - } -}; - -/** - * @param {number} voffset - * @param {number} value - * @param {number} defaultValue - */ -flatbuffers.Builder.prototype.addFieldFloat32 = function(voffset, value, defaultValue) { - if (this.force_defaults || value != defaultValue) { - this.addFloat32(value); - this.slot(voffset); - } -}; - -/** - * @param {number} voffset - * @param {number} value - * @param {number} defaultValue - */ -flatbuffers.Builder.prototype.addFieldFloat64 = function(voffset, value, defaultValue) { - if (this.force_defaults || value != defaultValue) { - this.addFloat64(value); - this.slot(voffset); - } -}; - -/** - * @param {number} voffset - * @param 
{flatbuffers.Offset} value - * @param {flatbuffers.Offset} defaultValue - */ -flatbuffers.Builder.prototype.addFieldOffset = function(voffset, value, defaultValue) { - if (this.force_defaults || value != defaultValue) { - this.addOffset(value); - this.slot(voffset); - } -}; - -/** - * Structs are stored inline, so nothing additional is being added. `d` is always 0. - * - * @param {number} voffset - * @param {flatbuffers.Offset} value - * @param {flatbuffers.Offset} defaultValue - */ -flatbuffers.Builder.prototype.addFieldStruct = function(voffset, value, defaultValue) { - if (value != defaultValue) { - this.nested(value); - this.slot(voffset); - } -}; - -/** - * Structures are always stored inline, they need to be created right - * where they're used. You'll get this assertion failure if you - * created it elsewhere. - * - * @param {flatbuffers.Offset} obj The offset of the created object - */ -flatbuffers.Builder.prototype.nested = function(obj) { - if (obj != this.offset()) { - throw new Error('FlatBuffers: struct must be serialized inline.'); - } -}; - -/** - * Should not be creating any other object, string or vector - * while an object is being constructed - */ -flatbuffers.Builder.prototype.notNested = function() { - if (this.isNested) { - throw new Error('FlatBuffers: object serialization must not be nested.'); - } -}; - -/** - * Set the current vtable at `voffset` to the current location in the buffer. - * - * @param {number} voffset - */ -flatbuffers.Builder.prototype.slot = function(voffset) { - this.vtable[voffset] = this.offset(); -}; - -/** - * @returns {flatbuffers.Offset} Offset relative to the end of the buffer. - */ -flatbuffers.Builder.prototype.offset = function() { - return this.bb.capacity() - this.space; -}; - -/** - * Doubles the size of the backing ByteBuffer and copies the old data towards - * the end of the new buffer (since we build the buffer backwards). 
- * - * @param {flatbuffers.ByteBuffer} bb The current buffer with the existing data - * @returns {flatbuffers.ByteBuffer} A new byte buffer with the old data copied - * to it. The data is located at the end of the buffer. - * - * uint8Array.set() formally takes {Array|ArrayBufferView}, so to pass - * it a uint8Array we need to suppress the type check: - * @suppress {checkTypes} - */ -flatbuffers.Builder.growByteBuffer = function(bb) { - var old_buf_size = bb.capacity(); - - // Ensure we don't grow beyond what fits in an int. - if (old_buf_size & 0xC0000000) { - throw new Error('FlatBuffers: cannot grow buffer beyond 2 gigabytes.'); - } - - var new_buf_size = old_buf_size << 1; - var nbb = flatbuffers.ByteBuffer.allocate(new_buf_size); - nbb.setPosition(new_buf_size - old_buf_size); - nbb.bytes().set(bb.bytes(), new_buf_size - old_buf_size); - return nbb; -}; -/// @endcond - -/** - * Adds on offset, relative to where it will be written. - * - * @param {flatbuffers.Offset} offset The offset to add. - */ -flatbuffers.Builder.prototype.addOffset = function(offset) { - this.prep(flatbuffers.SIZEOF_INT, 0); // Ensure alignment is already done. - this.writeInt32(this.offset() - offset + flatbuffers.SIZEOF_INT); -}; - -/// @cond FLATBUFFERS_INTERNAL -/** - * Start encoding a new object in the buffer. Users will not usually need to - * call this directly. The FlatBuffers compiler will generate helper methods - * that call this method internally. - * - * @param {number} numfields - */ -flatbuffers.Builder.prototype.startObject = function(numfields) { - this.notNested(); - if (this.vtable == null) { - this.vtable = []; - } - this.vtable_in_use = numfields; - for (var i = 0; i < numfields; i++) { - this.vtable[i] = 0; // This will push additional elements as needed - } - this.isNested = true; - this.object_start = this.offset(); -}; - -/** - * Finish off writing the object that is under construction. 
- * - * @returns {flatbuffers.Offset} The offset to the object inside `dataBuffer` - */ -flatbuffers.Builder.prototype.endObject = function() { - if (this.vtable == null || !this.isNested) { - throw new Error('FlatBuffers: endObject called without startObject'); - } - - this.addInt32(0); - var vtableloc = this.offset(); - - // Write out the current vtable. - for (var i = this.vtable_in_use - 1; i >= 0; i--) { - // Offset relative to the start of the table. - this.addInt16(this.vtable[i] != 0 ? vtableloc - this.vtable[i] : 0); - } - - var standard_fields = 2; // The fields below: - this.addInt16(vtableloc - this.object_start); - this.addInt16((this.vtable_in_use + standard_fields) * flatbuffers.SIZEOF_SHORT); - - // Search for an existing vtable that matches the current one. - var existing_vtable = 0; -outer_loop: - for (var i = 0; i < this.vtables.length; i++) { - var vt1 = this.bb.capacity() - this.vtables[i]; - var vt2 = this.space; - var len = this.bb.readInt16(vt1); - if (len == this.bb.readInt16(vt2)) { - for (var j = flatbuffers.SIZEOF_SHORT; j < len; j += flatbuffers.SIZEOF_SHORT) { - if (this.bb.readInt16(vt1 + j) != this.bb.readInt16(vt2 + j)) { - continue outer_loop; - } - } - existing_vtable = this.vtables[i]; - break; - } - } - - if (existing_vtable) { - // Found a match: - // Remove the current vtable. - this.space = this.bb.capacity() - vtableloc; - - // Point table to existing vtable. - this.bb.writeInt32(this.space, existing_vtable - vtableloc); - } else { - // No match: - // Add the location of the current vtable to the list of vtables. - this.vtables.push(this.offset()); - - // Point table to current vtable. - this.bb.writeInt32(this.bb.capacity() - vtableloc, this.offset() - vtableloc); - } - - this.isNested = false; - return vtableloc; -}; -/// @endcond - -/** - * Finalize a buffer, poiting to the given `root_table`. 
- * - * @param {flatbuffers.Offset} root_table - * @param {string=} opt_file_identifier - */ -flatbuffers.Builder.prototype.finish = function(root_table, opt_file_identifier) { - if (opt_file_identifier) { - var file_identifier = opt_file_identifier; - this.prep(this.minalign, flatbuffers.SIZEOF_INT + - flatbuffers.FILE_IDENTIFIER_LENGTH); - if (file_identifier.length != flatbuffers.FILE_IDENTIFIER_LENGTH) { - throw new Error('FlatBuffers: file identifier must be length ' + - flatbuffers.FILE_IDENTIFIER_LENGTH); - } - for (var i = flatbuffers.FILE_IDENTIFIER_LENGTH - 1; i >= 0; i--) { - this.writeInt8(file_identifier.charCodeAt(i)); - } - } - this.prep(this.minalign, flatbuffers.SIZEOF_INT); - this.addOffset(root_table); - this.bb.setPosition(this.space); -}; - -/// @cond FLATBUFFERS_INTERNAL -/** - * This checks a required field has been set in a given table that has - * just been constructed. - * - * @param {flatbuffers.Offset} table - * @param {number} field - */ -flatbuffers.Builder.prototype.requiredField = function(table, field) { - var table_start = this.bb.capacity() - table; - var vtable_start = table_start - this.bb.readInt32(table_start); - var ok = this.bb.readInt16(vtable_start + field) != 0; - - // If this fails, the caller will show what field needs to be set. - if (!ok) { - throw new Error('FlatBuffers: field ' + field + ' must be set'); - } -}; - -/** - * Start a new array/vector of objects. Users usually will not call - * this directly. The FlatBuffers compiler will create a start/end - * method for vector types in generated code. 
- * - * @param {number} elem_size The size of each element in the array - * @param {number} num_elems The number of elements in the array - * @param {number} alignment The alignment of the array - */ -flatbuffers.Builder.prototype.startVector = function(elem_size, num_elems, alignment) { - this.notNested(); - this.vector_num_elems = num_elems; - this.prep(flatbuffers.SIZEOF_INT, elem_size * num_elems); - this.prep(alignment, elem_size * num_elems); // Just in case alignment > int. -}; - -/** - * Finish off the creation of an array and all its elements. The array must be - * created with `startVector`. - * - * @returns {flatbuffers.Offset} The offset at which the newly created array - * starts. - */ -flatbuffers.Builder.prototype.endVector = function() { - this.writeInt32(this.vector_num_elems); - return this.offset(); -}; -/// @endcond - -/** - * Encode the string `s` in the buffer using UTF-8. If a Uint8Array is passed - * instead of a string, it is assumed to contain valid UTF-8 encoded data. 
- * - * @param {string|Uint8Array} s The string to encode - * @return {flatbuffers.Offset} The offset in the buffer where the encoded string starts - */ -flatbuffers.Builder.prototype.createString = function(s) { - if (s instanceof Uint8Array) { - var utf8 = s; - } else { - var utf8 = []; - var i = 0; - - while (i < s.length) { - var codePoint; - - // Decode UTF-16 - var a = s.charCodeAt(i++); - if (a < 0xD800 || a >= 0xDC00) { - codePoint = a; - } else { - var b = s.charCodeAt(i++); - codePoint = (a << 10) + b + (0x10000 - (0xD800 << 10) - 0xDC00); - } - - // Encode UTF-8 - if (codePoint < 0x80) { - utf8.push(codePoint); - } else { - if (codePoint < 0x800) { - utf8.push(((codePoint >> 6) & 0x1F) | 0xC0); - } else { - if (codePoint < 0x10000) { - utf8.push(((codePoint >> 12) & 0x0F) | 0xE0); - } else { - utf8.push( - ((codePoint >> 18) & 0x07) | 0xF0, - ((codePoint >> 12) & 0x3F) | 0x80); - } - utf8.push(((codePoint >> 6) & 0x3F) | 0x80); - } - utf8.push((codePoint & 0x3F) | 0x80); - } - } - } - - this.addInt8(0); - this.startVector(1, utf8.length, 1); - this.bb.setPosition(this.space -= utf8.length); - for (var i = 0, offset = this.space, bytes = this.bb.bytes(); i < utf8.length; i++) { - bytes[offset++] = utf8[i]; - } - return this.endVector(); -}; - -/** - * A helper function to avoid generated code depending on this file directly. - * - * @param {number} low - * @param {number} high - * @returns {flatbuffers.Long} - */ -flatbuffers.Builder.prototype.createLong = function(low, high) { - return flatbuffers.Long.create(low, high); -}; -//////////////////////////////////////////////////////////////////////////////// -/// @cond FLATBUFFERS_INTERNAL -/** - * Create a new ByteBuffer with a given array of bytes (`Uint8Array`). 
- * - * @constructor - * @param {Uint8Array} bytes - */ -flatbuffers.ByteBuffer = function(bytes) { - /** - * @type {Uint8Array} - * @private - */ - this.bytes_ = bytes; - - /** - * @type {number} - * @private - */ - this.position_ = 0; -}; - -/** - * Create and allocate a new ByteBuffer with a given size. - * - * @param {number} byte_size - * @returns {flatbuffers.ByteBuffer} - */ -flatbuffers.ByteBuffer.allocate = function(byte_size) { - return new flatbuffers.ByteBuffer(new Uint8Array(byte_size)); -}; - -/** - * Get the underlying `Uint8Array`. - * - * @returns {Uint8Array} - */ -flatbuffers.ByteBuffer.prototype.bytes = function() { - return this.bytes_; -}; - -/** - * Get the buffer's position. - * - * @returns {number} - */ -flatbuffers.ByteBuffer.prototype.position = function() { - return this.position_; -}; - -/** - * Set the buffer's position. - * - * @param {number} position - */ -flatbuffers.ByteBuffer.prototype.setPosition = function(position) { - this.position_ = position; -}; - -/** - * Get the buffer's capacity. 
- * - * @returns {number} - */ -flatbuffers.ByteBuffer.prototype.capacity = function() { - return this.bytes_.length; -}; - -/** - * @param {number} offset - * @returns {number} - */ -flatbuffers.ByteBuffer.prototype.readInt8 = function(offset) { - return this.readUint8(offset) << 24 >> 24; -}; - -/** - * @param {number} offset - * @returns {number} - */ -flatbuffers.ByteBuffer.prototype.readUint8 = function(offset) { - return this.bytes_[offset]; -}; - -/** - * @param {number} offset - * @returns {number} - */ -flatbuffers.ByteBuffer.prototype.readInt16 = function(offset) { - return this.readUint16(offset) << 16 >> 16; -}; - -/** - * @param {number} offset - * @returns {number} - */ -flatbuffers.ByteBuffer.prototype.readUint16 = function(offset) { - return this.bytes_[offset] | this.bytes_[offset + 1] << 8; -}; - -/** - * @param {number} offset - * @returns {number} - */ -flatbuffers.ByteBuffer.prototype.readInt32 = function(offset) { - return this.bytes_[offset] | this.bytes_[offset + 1] << 8 | this.bytes_[offset + 2] << 16 | this.bytes_[offset + 3] << 24; -}; - -/** - * @param {number} offset - * @returns {number} - */ -flatbuffers.ByteBuffer.prototype.readUint32 = function(offset) { - return this.readInt32(offset) >>> 0; -}; - -/** - * @param {number} offset - * @returns {flatbuffers.Long} - */ -flatbuffers.ByteBuffer.prototype.readInt64 = function(offset) { - return new flatbuffers.Long(this.readInt32(offset), this.readInt32(offset + 4)); -}; - -/** - * @param {number} offset - * @returns {flatbuffers.Long} - */ -flatbuffers.ByteBuffer.prototype.readUint64 = function(offset) { - return new flatbuffers.Long(this.readUint32(offset), this.readUint32(offset + 4)); -}; - -/** - * @param {number} offset - * @returns {number} - */ -flatbuffers.ByteBuffer.prototype.readFloat32 = function(offset) { - flatbuffers.int32[0] = this.readInt32(offset); - return flatbuffers.float32[0]; -}; - -/** - * @param {number} offset - * @returns {number} - */ 
-flatbuffers.ByteBuffer.prototype.readFloat64 = function(offset) { - flatbuffers.int32[flatbuffers.isLittleEndian ? 0 : 1] = this.readInt32(offset); - flatbuffers.int32[flatbuffers.isLittleEndian ? 1 : 0] = this.readInt32(offset + 4); - return flatbuffers.float64[0]; -}; - -/** - * @param {number} offset - * @param {number|boolean} value - */ -flatbuffers.ByteBuffer.prototype.writeInt8 = function(offset, value) { - this.bytes_[offset] = /** @type {number} */(value); -}; - -/** - * @param {number} offset - * @param {number} value - */ -flatbuffers.ByteBuffer.prototype.writeUint8 = function(offset, value) { - this.bytes_[offset] = value; -}; - -/** - * @param {number} offset - * @param {number} value - */ -flatbuffers.ByteBuffer.prototype.writeInt16 = function(offset, value) { - this.bytes_[offset] = value; - this.bytes_[offset + 1] = value >> 8; -}; - -/** - * @param {number} offset - * @param {number} value - */ -flatbuffers.ByteBuffer.prototype.writeUint16 = function(offset, value) { - this.bytes_[offset] = value; - this.bytes_[offset + 1] = value >> 8; -}; - -/** - * @param {number} offset - * @param {number} value - */ -flatbuffers.ByteBuffer.prototype.writeInt32 = function(offset, value) { - this.bytes_[offset] = value; - this.bytes_[offset + 1] = value >> 8; - this.bytes_[offset + 2] = value >> 16; - this.bytes_[offset + 3] = value >> 24; -}; - -/** - * @param {number} offset - * @param {number} value - */ -flatbuffers.ByteBuffer.prototype.writeUint32 = function(offset, value) { - this.bytes_[offset] = value; - this.bytes_[offset + 1] = value >> 8; - this.bytes_[offset + 2] = value >> 16; - this.bytes_[offset + 3] = value >> 24; -}; - -/** - * @param {number} offset - * @param {flatbuffers.Long} value - */ -flatbuffers.ByteBuffer.prototype.writeInt64 = function(offset, value) { - this.writeInt32(offset, value.low); - this.writeInt32(offset + 4, value.high); -}; - -/** - * @param {number} offset - * @param {flatbuffers.Long} value - */ 
-flatbuffers.ByteBuffer.prototype.writeUint64 = function(offset, value) { - this.writeUint32(offset, value.low); - this.writeUint32(offset + 4, value.high); -}; - -/** - * @param {number} offset - * @param {number} value - */ -flatbuffers.ByteBuffer.prototype.writeFloat32 = function(offset, value) { - flatbuffers.float32[0] = value; - this.writeInt32(offset, flatbuffers.int32[0]); -}; - -/** - * @param {number} offset - * @param {number} value - */ -flatbuffers.ByteBuffer.prototype.writeFloat64 = function(offset, value) { - flatbuffers.float64[0] = value; - this.writeInt32(offset, flatbuffers.int32[flatbuffers.isLittleEndian ? 0 : 1]); - this.writeInt32(offset + 4, flatbuffers.int32[flatbuffers.isLittleEndian ? 1 : 0]); -}; - -/** - * Look up a field in the vtable, return an offset into the object, or 0 if the - * field is not present. - * - * @param {number} bb_pos - * @param {number} vtable_offset - * @returns {number} - */ -flatbuffers.ByteBuffer.prototype.__offset = function(bb_pos, vtable_offset) { - var vtable = bb_pos - this.readInt32(bb_pos); - return vtable_offset < this.readInt16(vtable) ? this.readInt16(vtable + vtable_offset) : 0; -}; - -/** - * Initialize any Table-derived type to point to the union at the given offset. - * - * @param {flatbuffers.Table} t - * @param {number} offset - * @returns {flatbuffers.Table} - */ -flatbuffers.ByteBuffer.prototype.__union = function(t, offset) { - t.bb_pos = offset + this.readInt32(offset); - t.bb = this; - return t; -}; - -/** - * Create a JavaScript string from UTF-8 data stored inside the FlatBuffer. - * This allocates a new string and converts to wide chars upon each access. - * - * To avoid the conversion to UTF-16, pass flatbuffers.Encoding.UTF8_BYTES as - * the "optionalEncoding" argument. This is useful for avoiding conversion to - * and from UTF-16 when the data will just be packaged back up in another - * FlatBuffer later on. 
- * - * @param {number} offset - * @param {flatbuffers.Encoding=} opt_encoding Defaults to UTF16_STRING - * @returns {string|Uint8Array} - */ -flatbuffers.ByteBuffer.prototype.__string = function(offset, opt_encoding) { - offset += this.readInt32(offset); - - var length = this.readInt32(offset); - var result = ''; - var i = 0; - - offset += flatbuffers.SIZEOF_INT; - - if (opt_encoding === flatbuffers.Encoding.UTF8_BYTES) { - return this.bytes_.subarray(offset, offset + length); - } - - while (i < length) { - var codePoint; - - // Decode UTF-8 - var a = this.readUint8(offset + i++); - if (a < 0xC0) { - codePoint = a; - } else { - var b = this.readUint8(offset + i++); - if (a < 0xE0) { - codePoint = - ((a & 0x1F) << 6) | - (b & 0x3F); - } else { - var c = this.readUint8(offset + i++); - if (a < 0xF0) { - codePoint = - ((a & 0x0F) << 12) | - ((b & 0x3F) << 6) | - (c & 0x3F); - } else { - var d = this.readUint8(offset + i++); - codePoint = - ((a & 0x07) << 18) | - ((b & 0x3F) << 12) | - ((c & 0x3F) << 6) | - (d & 0x3F); - } - } - } - - // Encode UTF-16 - if (codePoint < 0x10000) { - result += String.fromCharCode(codePoint); - } else { - codePoint -= 0x10000; - result += String.fromCharCode( - (codePoint >> 10) + 0xD800, - (codePoint & ((1 << 10) - 1)) + 0xDC00); - } - } - - return result; -}; - -/** - * Retrieve the relative offset stored at "offset" - * @param {number} offset - * @returns {number} - */ -flatbuffers.ByteBuffer.prototype.__indirect = function(offset) { - return offset + this.readInt32(offset); -}; - -/** - * Get the start of data of a vector whose offset is stored at "offset" in this object. - * - * @param {number} offset - * @returns {number} - */ -flatbuffers.ByteBuffer.prototype.__vector = function(offset) { - return offset + this.readInt32(offset) + flatbuffers.SIZEOF_INT; // data starts after the length -}; - -/** - * Get the length of a vector whose offset is stored at "offset" in this object. 
- * - * @param {number} offset - * @returns {number} - */ -flatbuffers.ByteBuffer.prototype.__vector_len = function(offset) { - return this.readInt32(offset + this.readInt32(offset)); -}; - -/** - * @param {string} ident - * @returns {boolean} - */ -flatbuffers.ByteBuffer.prototype.__has_identifier = function(ident) { - if (ident.length != flatbuffers.FILE_IDENTIFIER_LENGTH) { - throw new Error('FlatBuffers: file identifier must be length ' + - flatbuffers.FILE_IDENTIFIER_LENGTH); - } - for (var i = 0; i < flatbuffers.FILE_IDENTIFIER_LENGTH; i++) { - if (ident.charCodeAt(i) != this.readInt8(this.position_ + flatbuffers.SIZEOF_INT + i)) { - return false; - } - } - return true; -}; - -/** - * A helper function to avoid generated code depending on this file directly. - * - * @param {number} low - * @param {number} high - * @returns {flatbuffers.Long} - */ -flatbuffers.ByteBuffer.prototype.createLong = function(low, high) { - return flatbuffers.Long.create(low, high); -}; - -// Exports for Node.js and RequireJS -exports.flatbuffers = flatbuffers; - -/// @endcond -/// @} diff --git a/js/closure-compiler-scripts/text-encoding.js b/js/closure-compiler-scripts/text-encoding.js deleted file mode 100644 index ca9154f88ecba..0000000000000 --- a/js/closure-compiler-scripts/text-encoding.js +++ /dev/null @@ -1,648 +0,0 @@ -/** - * closure-compiler-friendly text-encoding-utf-8 - * copied from node_modules/text-encoding-utf-8/lib/encoding.cjs.js - * update as needed - */ - - // This is free and unencumbered software released into the public domain. -// See LICENSE.md for more information. - -// -// Utilities -// - -goog.module("module$text_encoding"); -goog.module.declareLegacyNamespace(); -/** - * @param {number} a The number to test. - * @param {number} min The minimum value in the range, inclusive. - * @param {number} max The maximum value in the range, inclusive. - * @return {boolean} True if a >= min and a <= max. 
- */ -function inRange(a, min, max) { - return min <= a && a <= max; -} - -/** - * @param {*} o - * @return {Object} - */ -function ToDictionary(o) { - if (o === undefined) return {}; - if (o === Object(o)) return o; - throw TypeError('Could not convert argument to dictionary'); -} - -/** - * @param {string} string Input string of UTF-16 code units. - * @return {!Array.} Code points. - */ -function stringToCodePoints(string) { - // https://heycam.github.io/webidl/#dfn-obtain-unicode - - // 1. Let S be the DOMString value. - var s = String(string); - - // 2. Let n be the length of S. - var n = s.length; - - // 3. Initialize i to 0. - var i = 0; - - // 4. Initialize U to be an empty sequence of Unicode characters. - var u = []; - - // 5. While i < n: - while (i < n) { - - // 1. Let c be the code unit in S at index i. - var c = s.charCodeAt(i); - - // 2. Depending on the value of c: - - // c < 0xD800 or c > 0xDFFF - if (c < 0xD800 || c > 0xDFFF) { - // Append to U the Unicode character with code point c. - u.push(c); - } - - // 0xDC00 ≤ c ≤ 0xDFFF - else if (0xDC00 <= c && c <= 0xDFFF) { - // Append to U a U+FFFD REPLACEMENT CHARACTER. - u.push(0xFFFD); - } - - // 0xD800 ≤ c ≤ 0xDBFF - else if (0xD800 <= c && c <= 0xDBFF) { - // 1. If i = n−1, then append to U a U+FFFD REPLACEMENT - // CHARACTER. - if (i === n - 1) { - u.push(0xFFFD); - } - // 2. Otherwise, i < n−1: - else { - // 1. Let d be the code unit in S at index i+1. - var d = string.charCodeAt(i + 1); - - // 2. If 0xDC00 ≤ d ≤ 0xDFFF, then: - if (0xDC00 <= d && d <= 0xDFFF) { - // 1. Let a be c & 0x3FF. - var a = c & 0x3FF; - - // 2. Let b be d & 0x3FF. - var b = d & 0x3FF; - - // 3. Append to U the Unicode character with code point - // 2^16+2^10*a+b. - u.push(0x10000 + (a << 10) + b); - - // 4. Set i to i+1. - i += 1; - } - - // 3. Otherwise, d < 0xDC00 or d > 0xDFFF. Append to U a - // U+FFFD REPLACEMENT CHARACTER. - else { - u.push(0xFFFD); - } - } - } - - // 3. Set i to i+1. - i += 1; - } - - // 6. 
Return U. - return u; -} - -/** - * @param {!Array.} code_points Array of code points. - * @return {string} string String of UTF-16 code units. - */ -function codePointsToString(code_points) { - var s = ''; - for (var i = 0; i < code_points.length; ++i) { - var cp = code_points[i]; - if (cp <= 0xFFFF) { - s += String.fromCharCode(cp); - } else { - cp -= 0x10000; - s += String.fromCharCode((cp >> 10) + 0xD800, - (cp & 0x3FF) + 0xDC00); - } - } - return s; -} - - -// -// Implementation of Encoding specification -// https://encoding.spec.whatwg.org/ -// - -// -// 3. Terminology -// - -/** - * End-of-stream is a special token that signifies no more tokens - * are in the stream. - * @const - */ var end_of_stream = -1; - -/** - * A stream represents an ordered sequence of tokens. - * - * @constructor - * @param {!(Array.|Uint8Array)} tokens Array of tokens that provide the - * stream. - */ -function Stream(tokens) { - /** @type {!Array.} */ - this.tokens = [].slice.call(tokens); -} - -Stream.prototype = { - /** - * @return {boolean} True if end-of-stream has been hit. - */ - endOfStream: function() { - return !this.tokens.length; - }, - - /** - * When a token is read from a stream, the first token in the - * stream must be returned and subsequently removed, and - * end-of-stream must be returned otherwise. - * - * @return {number} Get the next token from the stream, or - * end_of_stream. - */ - read: function() { - if (!this.tokens.length) - return end_of_stream; - return this.tokens.shift(); - }, - - /** - * When one or more tokens are prepended to a stream, those tokens - * must be inserted, in given order, before the first token in the - * stream. - * - * @param {(number|!Array.)} token The token(s) to prepend to the stream. 
- */ - prepend: function(token) { - if (Array.isArray(token)) { - var tokens = /**@type {!Array.}*/(token); - while (tokens.length) - this.tokens.unshift(tokens.pop()); - } else { - this.tokens.unshift(token); - } - }, - - /** - * When one or more tokens are pushed to a stream, those tokens - * must be inserted, in given order, after the last token in the - * stream. - * - * @param {(number|!Array.)} token The tokens(s) to prepend to the stream. - */ - push: function(token) { - if (Array.isArray(token)) { - var tokens = /**@type {!Array.}*/(token); - while (tokens.length) - this.tokens.push(tokens.shift()); - } else { - this.tokens.push(token); - } - } -}; - -// -// 4. Encodings -// - -// 4.1 Encoders and decoders - -/** @const */ -var finished = -1; - -/** - * @param {boolean} fatal If true, decoding errors raise an exception. - * @param {number=} opt_code_point Override the standard fallback code point. - * @return {number} The code point to insert on a decoding error. - */ -function decoderError(fatal, opt_code_point) { - if (fatal) - throw TypeError('Decoder error'); - return opt_code_point || 0xFFFD; -} - -// -// 7. API -// - -/** @const */ var DEFAULT_ENCODING = 'utf-8'; - -// 7.1 Interface TextDecoder - -/** - * @constructor - * @param {string=} encoding The label of the encoding; - * defaults to 'utf-8'. - * @param {Object=} options - */ -function TextDecoder(encoding, options) { - if (!(this instanceof TextDecoder)) { - return new TextDecoder(encoding, options); - } - encoding = encoding !== undefined ? String(encoding).toLowerCase() : DEFAULT_ENCODING; - if (encoding !== DEFAULT_ENCODING) { - throw new Error('Encoding not supported. 
Only utf-8 is supported'); - } - options = ToDictionary(options); - - /** @private @type {boolean} */ - this._streaming = false; - /** @private @type {boolean} */ - this._BOMseen = false; - /** @private @type {?Decoder} */ - this._decoder = null; - /** @private @type {boolean} */ - this._fatal = Boolean(options['fatal']); - /** @private @type {boolean} */ - this._ignoreBOM = Boolean(options['ignoreBOM']); - - Object.defineProperty(this, 'encoding', {value: 'utf-8'}); - Object.defineProperty(this, 'fatal', {value: this._fatal}); - Object.defineProperty(this, 'ignoreBOM', {value: this._ignoreBOM}); -} - -TextDecoder.prototype = { - /** - * @param {ArrayBufferView=} input The buffer of bytes to decode. - * @param {Object=} options - * @return {string} The decoded string. - */ - decode: function decode(input, options) { - var bytes; - if (typeof input === 'object' && input instanceof ArrayBuffer) { - bytes = new Uint8Array(input); - } else if (typeof input === 'object' && 'buffer' in input && - input.buffer instanceof ArrayBuffer) { - bytes = new Uint8Array(input.buffer, - input.byteOffset, - input.byteLength); - } else { - bytes = new Uint8Array(0); - } - - options = ToDictionary(options); - - if (!this._streaming) { - this._decoder = new UTF8Decoder({fatal: this._fatal}); - this._BOMseen = false; - } - this._streaming = Boolean(options['stream']); - - var input_stream = new Stream(bytes); - - var code_points = []; - - /** @type {?(number|!Array.)} */ - var result; - - while (!input_stream.endOfStream()) { - result = this._decoder.handler(input_stream, input_stream.read()); - if (result === finished) - break; - if (result === null) - continue; - if (Array.isArray(result)) - code_points.push.apply(code_points, /**@type {!Array.}*/(result)); - else - code_points.push(result); - } - if (!this._streaming) { - do { - result = this._decoder.handler(input_stream, input_stream.read()); - if (result === finished) - break; - if (result === null) - continue; - if 
(Array.isArray(result)) - code_points.push.apply(code_points, /**@type {!Array.}*/(result)); - else - code_points.push(result); - } while (!input_stream.endOfStream()); - this._decoder = null; - } - - if (code_points.length) { - // If encoding is one of utf-8, utf-16be, and utf-16le, and - // ignore BOM flag and BOM seen flag are unset, run these - // subsubsteps: - if (['utf-8'].indexOf(this.encoding) !== -1 && - !this._ignoreBOM && !this._BOMseen) { - // If token is U+FEFF, set BOM seen flag. - if (code_points[0] === 0xFEFF) { - this._BOMseen = true; - code_points.shift(); - } else { - // Otherwise, if token is not end-of-stream, set BOM seen - // flag and append token to output. - this._BOMseen = true; - } - } - } - - return codePointsToString(code_points); - } -}; - -// 7.2 Interface TextEncoder - -/** - * @constructor - * @param {string=} encoding The label of the encoding; - * defaults to 'utf-8'. - * @param {Object=} options - */ -function TextEncoder(encoding, options) { - if (!(this instanceof TextEncoder)) - return new TextEncoder(encoding, options); - encoding = encoding !== undefined ? String(encoding).toLowerCase() : DEFAULT_ENCODING; - if (encoding !== DEFAULT_ENCODING) { - throw new Error('Encoding not supported. Only utf-8 is supported'); - } - options = ToDictionary(options); - - /** @private @type {boolean} */ - this._streaming = false; - /** @private @type {?Encoder} */ - this._encoder = null; - /** @private @type {{fatal: boolean}} */ - this._options = {fatal: Boolean(options['fatal'])}; - - Object.defineProperty(this, 'encoding', {value: 'utf-8'}); -} - -TextEncoder.prototype = { - /** - * @param {string=} opt_string The string to encode. - * @param {Object=} options - * @return {Uint8Array} Encoded bytes, as a Uint8Array. - */ - encode: function encode(opt_string, options) { - opt_string = opt_string ? String(opt_string) : ''; - options = ToDictionary(options); - - // NOTE: This option is nonstandard. 
None of the encodings - // permitted for encoding (i.e. UTF-8, UTF-16) are stateful, - // so streaming is not necessary. - if (!this._streaming) - this._encoder = new UTF8Encoder(this._options); - this._streaming = Boolean(options['stream']); - - var bytes = []; - var input_stream = new Stream(stringToCodePoints(opt_string)); - /** @type {?(number|!Array.)} */ - var result; - while (!input_stream.endOfStream()) { - result = this._encoder.handler(input_stream, input_stream.read()); - if (result === finished) - break; - if (Array.isArray(result)) - bytes.push.apply(bytes, /**@type {!Array.}*/(result)); - else - bytes.push(result); - } - if (!this._streaming) { - while (true) { - result = this._encoder.handler(input_stream, input_stream.read()); - if (result === finished) - break; - if (Array.isArray(result)) - bytes.push.apply(bytes, /**@type {!Array.}*/(result)); - else - bytes.push(result); - } - this._encoder = null; - } - return new Uint8Array(bytes); - } -}; - -// -// 8. The encoding -// - -// 8.1 utf-8 - -/** - * @constructor - * @implements {Decoder} - * @param {{fatal: boolean}} options - */ -function UTF8Decoder(options) { - var fatal = options.fatal; - - // utf-8's decoder's has an associated utf-8 code point, utf-8 - // bytes seen, and utf-8 bytes needed (all initially 0), a utf-8 - // lower boundary (initially 0x80), and a utf-8 upper boundary - // (initially 0xBF). - var /** @type {number} */ utf8_code_point = 0, - /** @type {number} */ utf8_bytes_seen = 0, - /** @type {number} */ utf8_bytes_needed = 0, - /** @type {number} */ utf8_lower_boundary = 0x80, - /** @type {number} */ utf8_upper_boundary = 0xBF; - - /** - * @param {Stream} stream The stream of bytes being decoded. - * @param {number} bite The next byte read from the stream. - * @return {?(number|!Array.)} The next code point(s) - * decoded, or null if not enough data exists in the input - * stream to decode a complete code point. - */ - this.handler = function(stream, bite) { - // 1. 
If byte is end-of-stream and utf-8 bytes needed is not 0, - // set utf-8 bytes needed to 0 and return error. - if (bite === end_of_stream && utf8_bytes_needed !== 0) { - utf8_bytes_needed = 0; - return decoderError(fatal); - } - - // 2. If byte is end-of-stream, return finished. - if (bite === end_of_stream) - return finished; - - // 3. If utf-8 bytes needed is 0, based on byte: - if (utf8_bytes_needed === 0) { - - // 0x00 to 0x7F - if (inRange(bite, 0x00, 0x7F)) { - // Return a code point whose value is byte. - return bite; - } - - // 0xC2 to 0xDF - if (inRange(bite, 0xC2, 0xDF)) { - // Set utf-8 bytes needed to 1 and utf-8 code point to byte - // − 0xC0. - utf8_bytes_needed = 1; - utf8_code_point = bite - 0xC0; - } - - // 0xE0 to 0xEF - else if (inRange(bite, 0xE0, 0xEF)) { - // 1. If byte is 0xE0, set utf-8 lower boundary to 0xA0. - if (bite === 0xE0) - utf8_lower_boundary = 0xA0; - // 2. If byte is 0xED, set utf-8 upper boundary to 0x9F. - if (bite === 0xED) - utf8_upper_boundary = 0x9F; - // 3. Set utf-8 bytes needed to 2 and utf-8 code point to - // byte − 0xE0. - utf8_bytes_needed = 2; - utf8_code_point = bite - 0xE0; - } - - // 0xF0 to 0xF4 - else if (inRange(bite, 0xF0, 0xF4)) { - // 1. If byte is 0xF0, set utf-8 lower boundary to 0x90. - if (bite === 0xF0) - utf8_lower_boundary = 0x90; - // 2. If byte is 0xF4, set utf-8 upper boundary to 0x8F. - if (bite === 0xF4) - utf8_upper_boundary = 0x8F; - // 3. Set utf-8 bytes needed to 3 and utf-8 code point to - // byte − 0xF0. - utf8_bytes_needed = 3; - utf8_code_point = bite - 0xF0; - } - - // Otherwise - else { - // Return error. - return decoderError(fatal); - } - - // Then (byte is in the range 0xC2 to 0xF4) set utf-8 code - // point to utf-8 code point << (6 × utf-8 bytes needed) and - // return continue. - utf8_code_point = utf8_code_point << (6 * utf8_bytes_needed); - return null; - } - - // 4. 
If byte is not in the range utf-8 lower boundary to utf-8 - // upper boundary, run these substeps: - if (!inRange(bite, utf8_lower_boundary, utf8_upper_boundary)) { - - // 1. Set utf-8 code point, utf-8 bytes needed, and utf-8 - // bytes seen to 0, set utf-8 lower boundary to 0x80, and set - // utf-8 upper boundary to 0xBF. - utf8_code_point = utf8_bytes_needed = utf8_bytes_seen = 0; - utf8_lower_boundary = 0x80; - utf8_upper_boundary = 0xBF; - - // 2. Prepend byte to stream. - stream.prepend(bite); - - // 3. Return error. - return decoderError(fatal); - } - - // 5. Set utf-8 lower boundary to 0x80 and utf-8 upper boundary - // to 0xBF. - utf8_lower_boundary = 0x80; - utf8_upper_boundary = 0xBF; - - // 6. Increase utf-8 bytes seen by one and set utf-8 code point - // to utf-8 code point + (byte − 0x80) << (6 × (utf-8 bytes - // needed − utf-8 bytes seen)). - utf8_bytes_seen += 1; - utf8_code_point += (bite - 0x80) << (6 * (utf8_bytes_needed - utf8_bytes_seen)); - - // 7. If utf-8 bytes seen is not equal to utf-8 bytes needed, - // continue. - if (utf8_bytes_seen !== utf8_bytes_needed) - return null; - - // 8. Let code point be utf-8 code point. - var code_point = utf8_code_point; - - // 9. Set utf-8 code point, utf-8 bytes needed, and utf-8 bytes - // seen to 0. - utf8_code_point = utf8_bytes_needed = utf8_bytes_seen = 0; - - // 10. Return a code point whose value is code point. - return code_point; - }; -} - -/** - * @constructor - * @implements {Encoder} - * @param {{fatal: boolean}} options - */ -function UTF8Encoder(options) { - var fatal = options.fatal; - /** - * @param {Stream} stream Input stream. - * @param {number} code_point Next code point read from the stream. - * @return {(number|!Array.)} Byte(s) to emit. - */ - this.handler = function(stream, code_point) { - // 1. If code point is end-of-stream, return finished. - if (code_point === end_of_stream) - return finished; - - // 2. 
If code point is in the range U+0000 to U+007F, return a - // byte whose value is code point. - if (inRange(code_point, 0x0000, 0x007f)) - return code_point; - - // 3. Set count and offset based on the range code point is in: - var count, offset; - // U+0080 to U+07FF: 1 and 0xC0 - if (inRange(code_point, 0x0080, 0x07FF)) { - count = 1; - offset = 0xC0; - } - // U+0800 to U+FFFF: 2 and 0xE0 - else if (inRange(code_point, 0x0800, 0xFFFF)) { - count = 2; - offset = 0xE0; - } - // U+10000 to U+10FFFF: 3 and 0xF0 - else if (inRange(code_point, 0x10000, 0x10FFFF)) { - count = 3; - offset = 0xF0; - } - - // 4.Let bytes be a byte sequence whose first byte is (code - // point >> (6 × count)) + offset. - var bytes = [(code_point >> (6 * count)) + offset]; - - // 5. Run these substeps while count is greater than 0: - while (count > 0) { - - // 1. Set temp to code point >> (6 × (count − 1)). - var temp = code_point >> (6 * (count - 1)); - - // 2. Append to bytes 0x80 | (temp & 0x3F). - bytes.push(0x80 | (temp & 0x3F)); - - // 3. Decrease count by one. - count -= 1; - } - - // 6. Return bytes bytes, in order. - return bytes; - }; -} - -exports.TextEncoder = TextEncoder; -exports.TextDecoder = TextDecoder; diff --git a/js/closure-compiler-scripts/tslib.js b/js/closure-compiler-scripts/tslib.js deleted file mode 100644 index b5a722a652c34..0000000000000 --- a/js/closure-compiler-scripts/tslib.js +++ /dev/null @@ -1,151 +0,0 @@ -/** - * closure-compiler-friendly tslib - * copied from node_modules/tslib/tslib.js - * update as needed - */ - -var extendStatics = Object.setPrototypeOf || - ({ __proto__: [] } instanceof Array && function (d, b) { d.__proto__ = b; }) || - function (d, b) { for (var p in b) if (b.hasOwnProperty(p)) d[p] = b[p]; }; - -function __extends(d, b) { - extendStatics(d, b); - function __() { this.constructor = d; } - d.prototype = b === null ? 
Object.create(b) : (__.prototype = b.prototype, new __()); -}; - -var __assign = Object.assign || function (t) { - for (var s, i = 1, n = arguments.length; i < n; i++) { - s = arguments[i]; - for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p)) t[p] = s[p]; - } - return t; -}; - -function __rest(s, e) { - var t = {}; - for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p) && e.indexOf(p) < 0) - t[p] = s[p]; - if (s != null && typeof Object.getOwnPropertySymbols === "function") - for (var i = 0, p = Object.getOwnPropertySymbols(s); i < p.length; i++) if (e.indexOf(p[i]) < 0) - t[p[i]] = s[p[i]]; - return t; -}; - -function __decorate(decorators, target, key, desc) { - var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d; - if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc); - else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r; - return c > 3 && r && Object.defineProperty(target, key, r), r; -}; - -function __param(paramIndex, decorator) { - return function (target, key) { decorator(target, key, paramIndex); } -}; - -function __metadata(metadataKey, metadataValue) { - if (typeof Reflect === "object" && typeof Reflect.metadata === "function") return Reflect.metadata(metadataKey, metadataValue); -}; - -function __awaiter(thisArg, _arguments, P, generator) { - return new (P || (P = Promise))(function (resolve, reject) { - function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } - function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } - function step(result) { result.done ? 
resolve(result.value) : new P(function (resolve) { resolve(result.value); }).then(fulfilled, rejected); } - step((generator = generator.apply(thisArg, _arguments || [])).next()); - }); -}; - -function __generator(thisArg, body) { - var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g; - return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g; - function verb(n) { return function (v) { return step([n, v]); }; } - function step(op) { - if (f) throw new TypeError("Generator is already executing."); - while (_) try { - if (f = 1, y && (t = y[op[0] & 2 ? "return" : op[0] ? "throw" : "next"]) && !(t = t.call(y, op[1])).done) return t; - if (y = 0, t) op = [0, t.value]; - switch (op[0]) { - case 0: case 1: t = op; break; - case 4: _.label++; return { value: op[1], done: false }; - case 5: _.label++; y = op[1]; op = [0]; continue; - case 7: op = _.ops.pop(); _.trys.pop(); continue; - default: - if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; } - if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; } - if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; } - if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; } - if (t[2]) _.ops.pop(); - _.trys.pop(); continue; - } - op = body.call(thisArg, _); - } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; } - if (op[0] & 5) throw op[1]; return { value: op[0] ? 
op[1] : void 0, done: true }; - } -}; - -function __exportStar(m, exports) { - for (var p in m) if (!exports.hasOwnProperty(p)) exports[p] = m[p]; -}; - -function __values(o) { - var m = typeof Symbol === "function" && o[Symbol.iterator], i = 0; - if (m) return m.call(o); - return { - next: function () { - if (o && i >= o.length) o = void 0; - return { value: o && o[i++], done: !o }; - } - }; -}; - -function __read(o, n) { - var m = typeof Symbol === "function" && o[Symbol.iterator]; - if (!m) return o; - var i = m.call(o), r, ar = [], e; - try { - while ((n === void 0 || n-- > 0) && !(r = i.next()).done) ar.push(r.value); - } - catch (error) { e = { error: error }; } - finally { - try { - if (r && !r.done && (m = i["return"])) m.call(i); - } - finally { if (e) throw e.error; } - } - return ar; -}; - -function __spread() { - for (var ar = [], i = 0; i < arguments.length; i++) - ar = ar.concat(__read(arguments[i])); - return ar; -}; - -function __await(v) { - return this instanceof __await ? (this.v = v, this) : new __await(v); -}; - -function __asyncGenerator(thisArg, _arguments, generator) { - if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined."); - var g = generator.apply(thisArg, _arguments || []), i, q = []; - return i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i; - function verb(n) { if (g[n]) i[n] = function (v) { return new Promise(function (a, b) { q.push([n, v, a, b]) > 1 || resume(n, v); }); }; } - function resume(n, v) { try { step(g[n](v)); } catch (e) { settle(q[0][3], e); } } - function step(r) { r.value instanceof __await ? 
Promise.resolve(r.value.v).then(fulfill, reject) : settle(q[0][2], r); } - function fulfill(value) { resume("next", value); } - function reject(value) { resume("throw", value); } - function settle(f, v) { if (f(v), q.shift(), q.length) resume(q[0][0], q[0][1]); } -}; - -function __asyncDelegator(o) { - var i, p; - return i = {}, verb("next"), verb("throw", function (e) { throw e; }), verb("return"), i[Symbol.iterator] = function () { return this; }, i; - function verb(n, f) { if (o[n]) i[n] = function (v) { return (p = !p) ? { value: __await(o[n](v)), done: n === "return" } : f ? f(v) : v; }; } -}; - -function __asyncValues(o) { - if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined."); - var m = o[Symbol.asyncIterator]; - return m ? m.call(o) : typeof __values === "function" ? __values(o) : o[Symbol.iterator](); -}; diff --git a/js/examples/read_file.html b/js/examples/read_file.html index 2a1ebaba82c50..3093622fcc1d6 100644 --- a/js/examples/read_file.html +++ b/js/examples/read_file.html @@ -40,7 +40,7 @@ } reader.onload = function (evt) { - var arrowTable = Arrow.Table.from(new Uint8Array(evt.target.result)); + var arrowTable = Arrow.Table.from([new Uint8Array(evt.target.result)]); var thead = document.getElementById("thead"); var tbody = document.getElementById("tbody"); @@ -53,13 +53,13 @@ } var header_row = document.createElement("tr"); - for (let column of arrowTable.cols()) { + for (let column of arrowTable.columns) { addCell(header_row, "th", column.name); } thead.appendChild(header_row); - for (let row of arrowTable.rows(true)) { + for (let row of arrowTable) { var tr = document.createElement("tr"); for (let cell of row) { addCell(tr, "td", @@ -85,6 +85,6 @@

- + diff --git a/js/gulp/argv.js b/js/gulp/argv.js new file mode 100644 index 0000000000000..6f80912e97e52 --- /dev/null +++ b/js/gulp/argv.js @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +const argv = require(`command-line-args`)([ + { name: `all`, type: Boolean }, + { name: 'update', alias: 'u', type: Boolean }, + { name: 'verbose', alias: 'v', type: Boolean }, + { name: `target`, type: String, defaultValue: `` }, + { name: `module`, type: String, defaultValue: `` }, + { name: `coverage`, type: Boolean, defaultValue: false }, + { name: `json_file`, alias: `j`, type: String, defaultValue: null }, + { name: `arrow_file`, alias: `a`, type: String, defaultValue: null }, + { name: `integration`, alias: `i`, type: Boolean, defaultValue: false }, + { name: `targets`, alias: `t`, type: String, multiple: true, defaultValue: [] }, + { name: `modules`, alias: `m`, type: String, multiple: true, defaultValue: [] }, + { name: `sources`, alias: `s`, type: String, multiple: true, defaultValue: [`cpp`, `java`] }, + { name: `formats`, alias: `f`, type: String, multiple: true, defaultValue: [`file`, `stream`] }, +], { partial: true }); + +const { targets, modules } = argv; + +argv.target && !targets.length && 
targets.push(argv.target); +argv.module && !modules.length && modules.push(argv.module); +(argv.all || !targets.length) && targets.push(`all`); +(argv.all || !modules.length) && modules.push(`all`); + +module.exports = { argv, targets, modules }; diff --git a/js/gulp/arrow-task.js b/js/gulp/arrow-task.js new file mode 100644 index 0000000000000..d1e8046e67ab9 --- /dev/null +++ b/js/gulp/arrow-task.js @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +const { + mainExport, gCCLanguageNames, + targetDir, observableFromStreams +} = require('./util'); + +const gulp = require('gulp'); +const path = require('path'); +const gulpRename = require(`gulp-rename`); +const { memoizeTask } = require('./memoize-task'); +const { Observable, ReplaySubject } = require('rxjs'); + +const arrowTask = ((cache) => memoizeTask(cache, function copyMain(target, format) { + const out = targetDir(target); + const dtsGlob = `${targetDir(`es2015`, `cjs`)}/**/*.ts`; + const cjsGlob = `${targetDir(`es2015`, `cjs`)}/**/*.js`; + const esmGlob = `${targetDir(`es2015`, `esm`)}/**/*.js`; + const es5UmdGlob = `${targetDir(`es5`, `umd`)}/**/*.js`; + const es5UmdMaps = `${targetDir(`es5`, `umd`)}/**/*.map`; + const es2015UmdGlob = `${targetDir(`es2015`, `umd`)}/**/*.js`; + const es2015UmdMaps = `${targetDir(`es2015`, `umd`)}/**/*.map`; + const ch_ext = (ext) => gulpRename((p) => { p.extname = ext; }); + const append = (ap) => gulpRename((p) => { p.basename += ap; }); + return Observable.forkJoin( + observableFromStreams(gulp.src(dtsGlob), gulp.dest(out)), // copy d.ts files + observableFromStreams(gulp.src(cjsGlob), gulp.dest(out)), // copy es2015 cjs files + observableFromStreams(gulp.src(esmGlob), ch_ext(`.mjs`), gulp.dest(out)), // copy es2015 esm files and rename to `.mjs` + observableFromStreams(gulp.src(es5UmdGlob), append(`.es5.min`), gulp.dest(out)), // copy es5 umd files and add `.min` + observableFromStreams(gulp.src(es5UmdMaps), gulp.dest(out)), // copy es5 umd sourcemap files, but don't rename + observableFromStreams(gulp.src(es2015UmdGlob), append(`.es2015.min`), gulp.dest(out)), // copy es2015 umd files and add `.es6.min` + observableFromStreams(gulp.src(es2015UmdMaps), gulp.dest(out)), // copy es2015 umd sourcemap files, but don't rename + ).publish(new ReplaySubject()).refCount(); +}))({}); + +const arrowTSTask = ((cache) => memoizeTask(cache, function copyTS(target, format) { + return observableFromStreams(gulp.src(`src/**/*.ts`), 
gulp.dest(targetDir(target, format))); +}))({}); + + +module.exports = arrowTask; +module.exports.arrowTask = arrowTask; +module.exports.arrowTSTask = arrowTSTask; \ No newline at end of file diff --git a/js/gulp/build-task.js b/js/gulp/build-task.js new file mode 100644 index 0000000000000..01152e662fcec --- /dev/null +++ b/js/gulp/build-task.js @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +const { npmPkgName } = require('./util'); +const { memoizeTask } = require('./memoize-task'); + +const uglifyTask = require('./uglify-task'); +const closureTask = require('./closure-task'); +const typescriptTask = require('./typescript-task'); +const { arrowTask, arrowTSTask } = require('./arrow-task'); + +const buildTask = ((cache) => memoizeTask(cache, function build(target, format, ...args) { + return target === npmPkgName ? arrowTask(target, format, ...args)() + : target === `ts` ? arrowTSTask(target, format, ...args)() + : format === `umd` ? target === `es5` ? 
closureTask(target, format, ...args)() + : uglifyTask(target, format, ...args)() + : typescriptTask(target, format, ...args)(); +}))({}); + +module.exports = buildTask; +module.exports.buildTask = buildTask; \ No newline at end of file diff --git a/js/gulp/clean-task.js b/js/gulp/clean-task.js new file mode 100644 index 0000000000000..d6c90f4637c8b --- /dev/null +++ b/js/gulp/clean-task.js @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +const del = require('del'); +const { targetDir } = require('./util'); +const { memoizeTask } = require('./memoize-task'); +const { Observable, ReplaySubject } = require('rxjs'); + +const cleanTask = ((cache) => memoizeTask(cache, function clean(target, format) { + return Observable + .from(del(`${targetDir(target, format)}/**`)) + .catch((e) => Observable.empty()) + .multicast(new ReplaySubject()).refCount(); +}))({}); + +module.exports = cleanTask; +module.exports.cleanTask = cleanTask; \ No newline at end of file diff --git a/js/gulp/closure-task.js b/js/gulp/closure-task.js new file mode 100644 index 0000000000000..1bd872fd3044a --- /dev/null +++ b/js/gulp/closure-task.js @@ -0,0 +1,124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +const { + targetDir, + mainExport, + gCCLanguageNames, + UMDSourceTargets, + observableFromStreams +} = require('./util'); + +const gulp = require('gulp'); +const path = require('path'); +const sourcemaps = require('gulp-sourcemaps'); +const { memoizeTask } = require('./memoize-task'); +const ASTBuilders = require('ast-types').builders; +const transformAST = require('gulp-transform-js-ast'); +const { Observable, ReplaySubject } = require('rxjs'); +const closureCompiler = require('google-closure-compiler').gulp(); + +const closureTask = ((cache) => memoizeTask(cache, function closure(target, format) { + const src = targetDir(target, `cls`); + const out = targetDir(target, format); + const entry = path.join(src, mainExport); + const externs = path.join(src, `${mainExport}.externs`); + return observableFromStreams( + gulp.src([ +/* external libs first --> */ `node_modules/tslib/package.json`, + `node_modules/tslib/tslib.es6.js`, + `node_modules/flatbuffers/package.json`, + `node_modules/flatbuffers/js/flatbuffers.mjs`, + `node_modules/text-encoding-utf-8/package.json`, + `node_modules/text-encoding-utf-8/src/encoding.js`, +/* then sources globs --> */ `${src}/**/*.js`, +/* and exclusions last --> */ `!${src}/Arrow.externs.js`, + ], { base: `./` }), + sourcemaps.init(), + closureCompiler(createClosureArgs(entry, externs)), + // Strip out closure compiler's error-throwing iterator-return methods + // see this issue: https://github.com/google/closure-compiler/issues/2728 + transformAST(iteratorReturnVisitor), + // rename the sourcemaps from *.js.map files to *.min.js.map + sourcemaps.write(`.`, { mapFile: (mapPath) => mapPath.replace(`.js.map`, `.${target}.min.js.map`) }), + gulp.dest(out) + ).publish(new ReplaySubject()).refCount(); +}))({}); + +const createClosureArgs = (entry, externs) => ({ + third_party: true, + warning_level: `QUIET`, + dependency_mode: `STRICT`, + rewrite_polyfills: false, + externs: `${externs}.js`, + entry_point: `${entry}.js`, + 
module_resolution: `NODE`, + // formatting: `PRETTY_PRINT`, debug: true, + compilation_level: `ADVANCED`, + allow_method_call_decomposing: true, + package_json_entry_names: `module,jsnext:main,main`, + assume_function_wrapper: true, + js_output_file: `${mainExport}.js`, + language_in: gCCLanguageNames[`es2015`], + language_out: gCCLanguageNames[`es5`], + output_wrapper: +`// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +(function (global, factory) { + typeof exports === 'object' && typeof module !== 'undefined' ? factory(exports) : + typeof define === 'function' && define.amd ? 
define(['exports'], factory) : + (factory(global.Arrow = global.Arrow || {})); +}(this, (function (exports) {%output%}.bind(this))));` +}); + +module.exports = closureTask; +module.exports.closureTask = closureTask; + +const iteratorReturnVisitor = { + visitObjectExpression(p) { + const node = p.node, value = p.value; + if (!node.properties || !(node.properties.length === 3)) { return value; } + if (!propertyIsThrowingIteratorReturn(node.properties[2])) { return value; } + value.properties = value.properties.slice(0, 2); + return value; + } +}; + +function propertyIsThrowingIteratorReturn(p) { + if (!p || !(p.kind === 'init')) { return false; } + if (!p.key || !(p.key.type === 'Identifier') || !(p.key.name === 'return')) { return false; } + if (!p.value || !(p.value.type === 'FunctionExpression') || !p.value.params || !(p.value.params.length === 0)) { return false; } + if (!p.value.body || !p.value.body.body || !(p.value.body.body.length === 1) || !(p.value.body.body[0].type === 'ThrowStatement')) { return false; } + if (!p.value.body.body[0].argument || !(p.value.body.body[0].argument.type === 'CallExpression')) { return false; } + if (!p.value.body.body[0].argument.arguments || !(p.value.body.body[0].argument.arguments.length === 1)) { return false; } + if (!p.value.body.body[0].argument.arguments[0] || !(p.value.body.body[0].argument.arguments[0].type === 'Literal')) { return false; } + return p.value.body.body[0].argument.arguments[0].value === 'Not yet implemented'; +} \ No newline at end of file diff --git a/js/gulp/memoize-task.js b/js/gulp/memoize-task.js new file mode 100644 index 0000000000000..0b0fc843c451a --- /dev/null +++ b/js/gulp/memoize-task.js @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +const { taskName } = require('./util'); + +const memoizeTask = ((cache, taskFn) => ((target, format, ...args) => { + // Give the memoized fn a displayName so gulp's output is easier to follow. + const fn = () => ( + cache[taskName(target, format)] || ( + cache[taskName(target, format)] = taskFn(target, format, ...args))); + fn.displayName = `${taskFn.name || ``}:${taskName(target, format, ...args)}:task`; + return fn; +})); + +module.exports = memoizeTask; +module.exports.memoizeTask = memoizeTask; \ No newline at end of file diff --git a/js/gulp/package-task.js b/js/gulp/package-task.js new file mode 100644 index 0000000000000..2976d0ad45d09 --- /dev/null +++ b/js/gulp/package-task.js @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +const { + metadataFiles, packageJSONFields, + mainExport, npmPkgName, npmOrgName, + targetDir, packageName, observableFromStreams +} = require('./util'); + +const gulp = require('gulp'); +const { memoizeTask } = require('./memoize-task'); +const { Observable, ReplaySubject } = require('rxjs'); +const gulpJsonTransform = require('gulp-json-transform'); + +const packageTask = ((cache) => memoizeTask(cache, function bundle(target, format) { + const out = targetDir(target, format); + const jsonTransform = gulpJsonTransform(target === npmPkgName ? createMainPackageJson(target, format) : + target === `ts` ? 
createTypeScriptPackageJson(target, format) + : createScopedPackageJSON(target, format), + 2); + return Observable.forkJoin( + observableFromStreams(gulp.src(metadataFiles), gulp.dest(out)), // copy metadata files + observableFromStreams(gulp.src(`package.json`), jsonTransform, gulp.dest(out)) // write packageJSONs + ).publish(new ReplaySubject()).refCount(); +}))({}); + +module.exports = packageTask; +module.exports.packageTask = packageTask; + +const createMainPackageJson = (target, format) => (orig) => ({ + ...createTypeScriptPackageJson(target, format)(orig), + name: npmPkgName, + main: mainExport, + module: `${mainExport}.mjs`, + dist: `${mainExport}.es5.min.js`, + [`dist:es2015`]: `${mainExport}.es2015.min.js`, + [`@std/esm`]: { esm: `mjs` } +}); + +const createTypeScriptPackageJson = (target, format) => (orig) => ({ + ...createScopedPackageJSON(target, format)(orig), + main: `${mainExport}.ts`, types: `${mainExport}.ts`, + dependencies: { + '@types/flatbuffers': '*', + '@types/node': '*', + ...orig.dependencies + } +}); + +const createScopedPackageJSON = (target, format) => (({ name, ...orig }) => + conditionallyAddStandardESMEntry(target, format)( + packageJSONFields.reduce( + (xs, key) => ({ ...xs, [key]: xs[key] || orig[key] }), + { name: `${npmOrgName}/${packageName(target, format)}`, + version: undefined, main: `${mainExport}.js`, types: `${mainExport}.d.ts`, + dist: undefined, [`dist:es2015`]: undefined, module: undefined, [`@std/esm`]: undefined } + ) + ) +); + +const conditionallyAddStandardESMEntry = (target, format) => (packageJSON) => ( + format !== `esm` + ? packageJSON + : { ...packageJSON, [`@std/esm`]: { esm: `js` } } +); + \ No newline at end of file diff --git a/js/gulp/test-task.js b/js/gulp/test-task.js new file mode 100644 index 0000000000000..ab280b092635c --- /dev/null +++ b/js/gulp/test-task.js @@ -0,0 +1,168 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +const del = require('del'); +const path = require('path'); +const { argv } = require('./argv'); +const { promisify } = require('util'); +const glob = promisify(require('glob')); +const stat = promisify(require('fs').stat); +const mkdirp = promisify(require('mkdirp')); +const rimraf = promisify(require('rimraf')); +const child_process = require(`child_process`); +const { memoizeTask } = require('./memoize-task'); +const readFile = promisify(require('fs').readFile); +const exec = promisify(require('child_process').exec); +const parseXML = promisify(require('xml2js').parseString); + +const jestArgv = []; +argv.update && jestArgv.push(`-u`); +argv.verbose && jestArgv.push(`--verbose`); +argv.coverage && jestArgv.push(`--coverage`); + +const debugArgv = [`--runInBand`, `--env`, `node-debug`]; +const jest = require.resolve(path.join(`..`, `node_modules`, `.bin`, `jest`)); +const testOptions = { + env: { ...process.env }, + stdio: [`ignore`, `inherit`, `inherit`], +}; + +const testTask = ((cache, execArgv, testOptions) => memoizeTask(cache, function test(target, format, debug = false) { + const opts = { ...testOptions }; + const args = !debug ? [...execArgv] : [...debugArgv, ...execArgv]; + args.push(`test/${argv.integration ? 
`integration/*` : `unit/*`}`); + opts.env = { ...opts.env, + TEST_TARGET: target, + TEST_MODULE: format, + JSON_PATH: argv.json_file, + ARROW_PATH: argv.arrow_file, + TEST_TS_SOURCE: !!argv.coverage, + TEST_SOURCES: JSON.stringify(Array.isArray(argv.sources) ? argv.sources : [argv.sources]), + TEST_FORMATS: JSON.stringify(Array.isArray(argv.formats) ? argv.formats : [argv.formats]), + }; + return !debug ? + child_process.spawn(jest, args, opts) : + child_process.exec(`node --inspect-brk ${jest} ${args.join(` `)}`, opts); +}))({}, jestArgv, testOptions); + +module.exports = testTask; +module.exports.testTask = testTask; +module.exports.cleanTestData = cleanTestData; +module.exports.createTestData = createTestData; + +// Pull C++ and Java paths from environment vars first, otherwise sane defaults +const ARROW_HOME = process.env.ARROW_HOME || path.resolve('../'); +const ARROW_JAVA_DIR = process.env.ARROW_JAVA_DIR || path.join(ARROW_HOME, 'java'); +const CPP_EXE_PATH = process.env.ARROW_CPP_EXE_PATH || path.join(ARROW_HOME, 'cpp/build/debug'); +const ARROW_INTEGRATION_DIR = process.env.ARROW_INTEGRATION_DIR || path.join(ARROW_HOME, 'integration'); +const CPP_JSON_TO_ARROW = path.join(CPP_EXE_PATH, 'json-integration-test'); +const CPP_STREAM_TO_FILE = path.join(CPP_EXE_PATH, 'stream-to-file'); +const CPP_FILE_TO_STREAM = path.join(CPP_EXE_PATH, 'file-to-stream'); + +const testFilesDir = path.join(ARROW_HOME, 'js/test/data'); +const snapshotsDir = path.join(ARROW_HOME, 'js/test/__snapshots__'); +const cppFilesDir = path.join(testFilesDir, 'cpp'); +const javaFilesDir = path.join(testFilesDir, 'java'); +const jsonFilesDir = path.join(testFilesDir, 'json'); + +async function cleanTestData() { + return await del([`${testFilesDir}/**`, `${snapshotsDir}/**`]); +} + +async function createTestJSON() { + await mkdirp(jsonFilesDir); + await exec(`shx cp ${ARROW_INTEGRATION_DIR}/data/*.json ${jsonFilesDir}`); + await exec(`python ${ARROW_INTEGRATION_DIR}/integration_test.py 
--write_generated_json ${jsonFilesDir}`); +} + +async function createTestData() { + + let JAVA_TOOLS_JAR = process.env.ARROW_JAVA_INTEGRATION_JAR; + if (!JAVA_TOOLS_JAR) { + const pom_version = await + readFile(path.join(ARROW_JAVA_DIR, 'pom.xml')) + .then((pom) => parseXML(pom.toString())) + .then((pomXML) => pomXML.project.version[0]); + JAVA_TOOLS_JAR = path.join(ARROW_JAVA_DIR, `/tools/target/arrow-tools-${pom_version}-jar-with-dependencies.jar`); + } + + await cleanTestData().then(createTestJSON); + await mkdirp(path.join(cppFilesDir, 'file')); + await mkdirp(path.join(javaFilesDir, 'file')); + await mkdirp(path.join(cppFilesDir, 'stream')); + await mkdirp(path.join(javaFilesDir, 'stream')); + + const errors = []; + const names = await glob(path.join(jsonFilesDir, '*.json')); + + for (let jsonPath of names) { + const name = path.parse(path.basename(jsonPath)).name; + const arrowCppFilePath = path.join(cppFilesDir, 'file', `${name}.arrow`); + const arrowJavaFilePath = path.join(javaFilesDir, 'file', `${name}.arrow`); + const arrowCppStreamPath = path.join(cppFilesDir, 'stream', `${name}.arrow`); + const arrowJavaStreamPath = path.join(javaFilesDir, 'stream', `${name}.arrow`); + try { + await generateCPPFile(path.resolve(jsonPath), arrowCppFilePath); + await generateCPPStream(arrowCppFilePath, arrowCppStreamPath); + } catch (e) { errors.push(`${e.stdout}\n${e.message}`); } + try { + await generateJavaFile(path.resolve(jsonPath), arrowJavaFilePath); + await generateJavaStream(arrowJavaFilePath, arrowJavaStreamPath); + } catch (e) { errors.push(`${e.stdout}\n${e.message}`); } + } + if (errors.length) { + console.error(errors.join(`\n`)); + process.exit(1); + } + + async function generateCPPFile(jsonPath, filePath) { + await rimraf(filePath); + return await exec( + `${CPP_JSON_TO_ARROW} ${ + `--integration --mode=JSON_TO_ARROW`} ${ + `--json=${jsonPath} --arrow=${filePath}`}`, + { maxBuffer: Math.pow(2, 53) - 1 } + ); + } + + async function 
generateCPPStream(filePath, streamPath) { + await rimraf(streamPath); + return await exec( + `${CPP_FILE_TO_STREAM} ${filePath} > ${streamPath}`, + { maxBuffer: Math.pow(2, 53) - 1 } + ); + } + + async function generateJavaFile(jsonPath, filePath) { + await rimraf(filePath); + return await exec( + `java -cp ${JAVA_TOOLS_JAR} ${ + `org.apache.arrow.tools.Integration -c JSON_TO_ARROW`} ${ + `-j ${path.resolve(jsonPath)} -a ${filePath}`}`, + { maxBuffer: Math.pow(2, 53) - 1 } + ); + } + + async function generateJavaStream(filePath, streamPath) { + await rimraf(streamPath); + return await exec( + `java -cp ${JAVA_TOOLS_JAR} ${ + `org.apache.arrow.tools.FileToStream`} ${filePath} ${streamPath}`, + { maxBuffer: Math.pow(2, 53) - 1 } + ); + } +} diff --git a/js/gulp/typescript-task.js b/js/gulp/typescript-task.js new file mode 100644 index 0000000000000..8b755cf7f1624 --- /dev/null +++ b/js/gulp/typescript-task.js @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +const { + targetDir, tsconfigName, observableFromStreams +} = require('./util'); + +const del = require('del'); +const gulp = require('gulp'); +const path = require('path'); +const ts = require(`gulp-typescript`); +const gulpRename = require(`gulp-rename`); +const sourcemaps = require('gulp-sourcemaps'); +const { memoizeTask } = require('./memoize-task'); +const { Observable, ReplaySubject } = require('rxjs'); + +const typescriptTask = ((cache) => memoizeTask(cache, function typescript(target, format) { + const out = targetDir(target, format); + const tsconfigFile = `tsconfig.${tsconfigName(target, format)}.json`; + const tsProject = ts.createProject(path.join(`tsconfig`, tsconfigFile), { typescript: require(`typescript`) }); + const { stream: { js, dts } } = observableFromStreams( + tsProject.src(), sourcemaps.init(), + tsProject(ts.reporter.fullReporter(true)) + ); + const writeDTypes = observableFromStreams(dts, gulp.dest(out)); + const writeJS = observableFromStreams(js, sourcemaps.write(), gulp.dest(out)); + return Observable + .forkJoin(writeDTypes, writeJS) + .concat(maybeCopyRawJSArrowFormatFiles(target, format)) + .publish(new ReplaySubject()).refCount(); +}))({}); + +module.exports = typescriptTask; +module.exports.typescriptTask = typescriptTask; + +function maybeCopyRawJSArrowFormatFiles(target, format) { + if (target !== `es5` || format !== `cls`) { + return Observable.empty(); + } + return Observable.defer(async () => { + const outFormatDir = path.join(targetDir(target, format), `format`, `fb`); + await del(path.join(outFormatDir, '*.js')); + await observableFromStreams( + gulp.src(path.join(`src`, `format`, `fb`, `*_generated.js`)), + gulpRename((p) => { p.basename = p.basename.replace(`_generated`, ``); }), + gulp.dest(outFormatDir) + ).toPromise(); + }); +} \ No newline at end of file diff --git a/js/gulp/uglify-task.js b/js/gulp/uglify-task.js new file mode 100644 index 0000000000000..5c605cb7882bd --- /dev/null +++ b/js/gulp/uglify-task.js @@ 
-0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +const { + targetDir, + mainExport, + ESKeywords, + UMDSourceTargets, + uglifyLanguageNames, + observableFromStreams +} = require('./util'); + +const path = require('path'); +const webpack = require(`webpack`); +const { memoizeTask } = require('./memoize-task'); +const { Observable, ReplaySubject } = require('rxjs'); +const UglifyJSPlugin = require(`uglifyjs-webpack-plugin`); +const esmRequire = require(`@std/esm`)(module, { cjs: true, esm: `js` }); + +const uglifyTask = ((cache, commonConfig) => memoizeTask(cache, function uglifyJS(target, format) { + + const sourceTarget = UMDSourceTargets[target]; + const PublicNames = reservePublicNames(sourceTarget, `cls`); + const out = targetDir(target, format), src = targetDir(sourceTarget, `cls`); + + const targetConfig = { ...commonConfig, + output: { ...commonConfig.output, + path: path.resolve(`./${out}`) } }; + + const webpackConfigs = [ + [mainExport, PublicNames] + ].map(([entry, reserved]) => ({ + ...targetConfig, + name: entry, + entry: { [entry]: path.resolve(`${src}/${entry}.js`) }, + plugins: [ + ...(targetConfig.plugins || []), + new webpack.SourceMapDevToolPlugin({ + filename: 
`[name].${target}.min.js.map`, + moduleFilenameTemplate: ({ resourcePath }) => + resourcePath + .replace(/\s/, `_`) + .replace(/\.\/node_modules\//, ``) + }), + new UglifyJSPlugin({ + sourceMap: true, + uglifyOptions: { + ecma: uglifyLanguageNames[target], + compress: { unsafe: true }, + output: { comments: false, beautify: false }, + mangle: { eval: true, safari10: true, // <-- Works around a Safari 10 bug: // https://github.com/mishoo/UglifyJS2/issues/1753 + properties: { reserved, keep_quoted: true } + } + }, + }) + ] + })); + + const compilers = webpack(webpackConfigs); + return Observable + .bindNodeCallback(compilers.run.bind(compilers))() + .multicast(new ReplaySubject()).refCount(); +}))({}, { + resolve: { mainFields: [`module`, `main`] }, + module: { rules: [{ test: /\.js$/, enforce: `pre`, use: [`source-map-loader`] }] }, + output: { filename: '[name].js', library: mainExport, libraryTarget: `umd`, umdNamedDefine: true }, +}); + +module.exports = uglifyTask; +module.exports.uglifyTask = uglifyTask; + +const reservePublicNames = ((ESKeywords) => function reservePublicNames(target, format) { + const publicModulePath = `../${targetDir(target, format)}/${mainExport}.js`; + return [ + ...ESKeywords, + ...reserveExportedNames(esmRequire(publicModulePath)) + ]; +})(ESKeywords); + +// Reflect on the Arrow modules to come up with a list of keys to save from Uglify's +// mangler. Assume all the non-inherited static and prototype members of the Arrow +// module and its direct exports are public, and should be preserved through minification. 
+const reserveExportedNames = (entryModule) => ( + Object + .getOwnPropertyNames(entryModule) + .filter((name) => ( + typeof entryModule[name] === `object` || + typeof entryModule[name] === `function` + )) + .map((name) => [name, entryModule[name]]) + .reduce((reserved, [name, value]) => { + const fn = function() {}; + const ownKeys = value && Object.getOwnPropertyNames(value) || []; + const protoKeys = typeof value === `function` && Object.getOwnPropertyNames(value.prototype) || []; + const publicNames = [...ownKeys, ...protoKeys].filter((x) => x !== `default` && x !== `undefined` && !(x in fn)); + return [...reserved, name, ...publicNames]; + }, [] + ) +); diff --git a/js/gulp/util.js b/js/gulp/util.js new file mode 100644 index 0000000000000..ba6ebece51bba --- /dev/null +++ b/js/gulp/util.js @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +const fs = require('fs'); +const path = require(`path`); +const pump = require(`pump`); +const { Observable, ReplaySubject } = require('rxjs'); + +const mainExport = `Arrow`; +const npmPkgName = `apache-arrow`; +const npmOrgName = `@${npmPkgName}`; + +const releasesRootDir = `targets`; +const knownTargets = [`es5`, `es2015`, `esnext`]; +const knownModules = [`cjs`, `esm`, `cls`, `umd`]; +const moduleFormatsToSkipCombosOf = { cls: { test: true, integration: true } }; +const packageJSONFields = [ + `version`, `license`, `description`, + `author`, `homepage`, `repository`, + `bugs`, `keywords`, `dependencies`, + `bin` +]; + +const metadataFiles = [`LICENSE.txt`, `NOTICE.txt`, `README.md`].map((filename) => { + let err = false, prefixes = [`./`, `../`]; + let p = prefixes.find((prefix) => { + try { + fs.statSync(path.resolve(path.join(prefix, filename))); + } catch (e) { return false; } + return true; + }); + if (!p) { + throw new Error(`Couldn't find ${filename} in ./ or ../`); + } + return path.join(p, filename); +}); + +// see: https://github.com/google/closure-compiler/blob/c1372b799d94582eaf4b507a4a22558ff26c403c/src/com/google/javascript/jscomp/CompilerOptions.java#L2988 +const gCCLanguageNames = { + es5: `ECMASCRIPT5`, + es2015: `ECMASCRIPT_2015`, + es2016: `ECMASCRIPT_2016`, + es2017: `ECMASCRIPT_2017`, + esnext: `ECMASCRIPT_NEXT` +}; + +const UMDSourceTargets = { + es5: `es5`, + es2015: `es2015`, + es2016: `es2015`, + es2017: `es2015`, + esnext: `es2015` +}; + +const uglifyLanguageNames = { + es5: 5, es2015: 6, + es2016: 7, es2017: 8, + esnext: 8 // <--- ? +}; + +// ES7+ keywords Uglify shouldn't mangle +// Hardcoded here since some are from ES7+, others are +// only defined in interfaces, so difficult to get by reflection. 
+const ESKeywords = [ + // PropertyDescriptors + `configurable`, `enumerable`, + // IteratorResult, Symbol.asyncIterator + `done`, `value`, `Symbol.asyncIterator`, `asyncIterator`, + // AsyncObserver + `values`, `hasError`, `hasCompleted`,`errorValue`, `closed`, + // Observable/Subscription/Scheduler + `next`, `error`, `complete`, `subscribe`, `unsubscribe`, `isUnsubscribed`, + // EventTarget + `addListener`, `removeListener`, `addEventListener`, `removeEventListener`, + // Arrow properties + `low`, `high`, `data`, `index`, `field`, `validity`, `columns`, `fieldNode`, `subarray`, +]; + +function taskName(target, format) { + return !format ? target : `${target}:${format}`; +} + +function packageName(target, format) { + return !format ? target : `${target}-${format}`; +} + +function tsconfigName(target, format) { + return !format ? target : `${target}.${format}`; +} + +function targetDir(target, format) { + return path.join(releasesRootDir, ...(!format ? [target] : [target, format])); +} + +function logAndDie(e) { + if (e) { + console.error(e); + process.exit(1); + } +} + +function observableFromStreams(...streams) { + const pumped = streams.length <= 1 ? 
streams[0] + : pump(...streams, logAndDie); + const fromEvent = Observable.fromEvent.bind(null, pumped); + const streamObs = fromEvent(`data`) + .merge(fromEvent(`error`).flatMap((e) => Observable.throw(e))) + .takeUntil(fromEvent(`end`).merge(fromEvent(`close`))) + .defaultIfEmpty(`empty stream`) + .multicast(new ReplaySubject()).refCount(); + streamObs.stream = pumped; + streamObs.observable = streamObs; + return streamObs; +} + +function* combinations(_targets, _modules) { + + const targets = known(knownTargets, _targets || [`all`]); + const modules = known(knownModules, _modules || [`all`]); + + if (_targets[0] === `all` && _modules[0] === `all`) { + yield [`ts`, ``]; + yield [npmPkgName, ``]; + } + + for (const format of modules) { + for (const target of targets) { + yield [target, format]; + } + } + + function known(known, values) { + return ~values.indexOf(`all`) + ? known + : Object.keys( + values.reduce((map, arg) => (( + (known.indexOf(arg) !== -1) && + (map[arg.toLowerCase()] = true) + || true) && map + ), {}) + ).sort((a, b) => known.indexOf(a) - known.indexOf(b)); + } +} + +module.exports = { + + mainExport, npmPkgName, npmOrgName, metadataFiles, packageJSONFields, + + knownTargets, knownModules, moduleFormatsToSkipCombosOf, + ESKeywords, gCCLanguageNames, UMDSourceTargets, uglifyLanguageNames, + + taskName, packageName, tsconfigName, targetDir, combinations, observableFromStreams, +}; \ No newline at end of file diff --git a/js/gulpfile.js b/js/gulpfile.js index 9f8e564bd9e3a..7b82962035e46 100644 --- a/js/gulpfile.js +++ b/js/gulpfile.js @@ -15,278 +15,94 @@ // specific language governing permissions and limitations // under the License. 
-const del = require(`del`); -const gulp = require(`gulp`); -const path = require(`path`); -const pump = require(`pump`); -const ts = require(`gulp-typescript`); -const streamMerge = require(`merge2`); -const sourcemaps = require(`gulp-sourcemaps`); -const child_process = require(`child_process`); -const gulpJsonTransform = require(`gulp-json-transform`); -const closureCompiler = require(`google-closure-compiler`).gulp(); - -const knownTargets = [`es5`, `es2015`, `esnext`]; -const knownModules = [`cjs`, `esm`, `cls`, `umd`]; - -// see: https://github.com/google/closure-compiler/blob/c1372b799d94582eaf4b507a4a22558ff26c403c/src/com/google/javascript/jscomp/CompilerOptions.java#L2988 -const gCCTargets = { - es5: `ECMASCRIPT5`, - es2015: `ECMASCRIPT_2015`, - es2016: `ECMASCRIPT_2016`, - es2017: `ECMASCRIPT_2017`, - esnext: `ECMASCRIPT_NEXT` -}; - -const tsProjects = []; -const argv = require(`command-line-args`)([ - { name: `all`, alias: `a`, type: Boolean }, - { name: 'update', alias: 'u', type: Boolean }, - { name: 'verbose', alias: 'v', type: Boolean }, - { name: `target`, type: String, defaultValue: `` }, - { name: `module`, type: String, defaultValue: `` }, - { name: `coverage`, type: Boolean, defaultValue: false }, - { name: `targets`, alias: `t`, type: String, multiple: true, defaultValue: [] }, - { name: `modules`, alias: `m`, type: String, multiple: true, defaultValue: [] } -]); - -const { targets, modules } = argv; - -argv.target && !targets.length && targets.push(argv.target); -argv.module && !modules.length && modules.push(argv.module); -(argv.all || !targets.length) && targets.push(`all`); -(argv.all || !modules.length) && modules.push(`all`); - -for (const [target, format] of combinations([`all`, `all`])) { - const combo = `${target}:${format}`; - gulp.task(`test:${combo}`, gulp.series(testTask(target, format, combo, `targets/${target}/${format}`))); - gulp.task(`clean:${combo}`, gulp.series(cleanTask(target, format, combo, 
`targets/${target}/${format}`))); - gulp.task(`build:${combo}`, gulp.series(buildTask(target, format, combo, `targets/${target}/${format}`))); - gulp.task(`bundle:${combo}`, gulp.series(bundleTask(target, format, combo, `targets/${target}/${format}`))); - gulp.task(`package:${combo}`, gulp.series(packageTask(target, format, combo, `targets/${target}/${format}`))); - gulp.task(`test:debug:${combo}`, gulp.series(testTask(target, format, combo, `targets/${target}/${format}`, true))); +const del = require('del'); +const gulp = require('gulp'); +const path = require('path'); +const { Observable } = require('rxjs'); +const buildTask = require('./gulp/build-task'); +const cleanTask = require('./gulp/clean-task'); +const packageTask = require('./gulp/package-task'); +const { targets, modules } = require('./gulp/argv'); +const { testTask, createTestData, cleanTestData } = require('./gulp/test-task'); +const { + targetDir, + taskName, combinations, + knownTargets, knownModules, + npmPkgName, UMDSourceTargets, + moduleFormatsToSkipCombosOf +} = require('./gulp/util'); + +for (const [target, format] of combinations([`all`], [`all`])) { + const task = taskName(target, format); + gulp.task(`clean:${task}`, cleanTask(target, format)); + gulp.task( `test:${task}`, testTask(target, format)); + gulp.task(`debug:${task}`, testTask(target, format, true)); + gulp.task(`build:${task}`, gulp.series(`clean:${task}`, + buildTask(target, format), + packageTask(target, format))); } -gulp.task(`test`, gulp.series(runTaskCombos(`test`))); -gulp.task(`clean`, gulp.parallel(runTaskCombos(`clean`))); -gulp.task(`build`, gulp.parallel(runTaskCombos(`build`))); -gulp.task(`bundle`, gulp.parallel(runTaskCombos(`bundle`))); -gulp.task(`package`, gulp.parallel(runTaskCombos(`package`))); -gulp.task(`test:debug`, gulp.series(runTaskCombos(`test:debug`))); -gulp.task(`default`, gulp.task(`package`)); - -function runTaskCombos(name) { - const combos = []; +// The UMD bundles build temporary es5/6/next 
targets via TS, +// then run the TS source through either closure-compiler or +// uglify, so we special case that here. +knownTargets.forEach((target) => + gulp.task(`build:${target}:umd`, + gulp.series( + gulp.parallel( + cleanTask(target, `umd`), + cleanTask(UMDSourceTargets[target], `cls`) + ), + buildTask(UMDSourceTargets[target], `cls`), + buildTask(target, `umd`), packageTask(target, `umd`) + ) + ) +); + +// The main "apache-arrow" module builds the es5/cjs, es5/umd, +// es2015/esm, es2015/umd, and ts targets, then copies and +// renames the compiled output into the apache-arrow folder +gulp.task(`build:${npmPkgName}`, + gulp.series( + cleanTask(npmPkgName), + gulp.parallel( + `build:${taskName(`es5`, `cjs`)}`, + `build:${taskName(`es5`, `umd`)}`, + `build:${taskName(`es2015`, `esm`)}`, + `build:${taskName(`es2015`, `umd`)}` + ), + buildTask(npmPkgName), packageTask(npmPkgName) + ) +); + + +function gulpConcurrent(tasks) { + return () => Observable.bindCallback((tasks, cb) => gulp.parallel(tasks)(cb))(tasks); +} + +const buildConcurrent = (tasks) => () => + gulpConcurrent(tasks)() + .concat(Observable + .defer(() => Observable + .merge(...knownTargets.map((target) => + del(`${targetDir(target, `cls`)}/**`))))); + +gulp.task(`clean:testdata`, cleanTestData); +gulp.task(`create:testdata`, createTestData); +gulp.task( `test`, gulp.series(getTasks(`test`))); +gulp.task(`debug`, gulp.series(getTasks(`debug`))); +gulp.task(`clean`, gulp.parallel(getTasks(`clean`))); +gulp.task(`build`, buildConcurrent(getTasks(`build`))); +gulp.task(`default`, gulp.series(`build`, `test`)); + +function getTasks(name) { + const tasks = []; + if (targets.indexOf(`ts`) !== -1) tasks.push(`${name}:ts`); + if (targets.indexOf(npmPkgName) !== -1) tasks.push(`${name}:${npmPkgName}`); for (const [target, format] of combinations(targets, modules)) { - if (format === `cls`) { + if (moduleFormatsToSkipCombosOf[format] && moduleFormatsToSkipCombosOf[format][name]) { continue; } - 
combos.push(`${name}:${target}:${format}`); - } - return combos; -} - -function cleanTask(target, format, taskName, outDir) { - return function cleanTask() { - const globs = [`${outDir}/**`]; - if (target === `es5` && format === `cjs`) { - globs.push(`types`, `typings`); - } - return del(globs); - }; -} - -function buildTask(target, format, taskName, outDir) { - return format === `umd` - ? closureTask(target, format, taskName, outDir) - : typescriptTask(target, format, taskName, outDir); -} - -function bundleTask(target, format, taskName, outDir) { - return function bundleTask() { - return streamMerge([ - pump(gulp.src([`LICENSE`, `README.md`]), gulp.dest(outDir), onError), - pump( - gulp.src(`package.json`), - gulpJsonTransform((orig) => [ - `version`, `description`, `keywords`, - `repository`, `author`, `homepage`, `bugs`, `license`, - `dependencies`, `peerDependencies` - ].reduce((copy, key) => ( - (copy[key] = orig[key]) && copy || copy - ), { - main: `Arrow.js`, - types: `Arrow.d.ts`, - typings: `Arrow.d.ts`, - name: `@apache-arrow/${target}-${format}` - }), 2), - gulp.dest(outDir), - onError - ) - ]); - } -} - -function packageTask(target, format, taskName, outDir) { - return [`build:${taskName}`, `bundle:${taskName}`]; -} - -function testTask(target, format, taskName, outDir, debug) { - const jestOptions = !debug ? [] : [ - `--runInBand`, `--env`, `jest-environment-node-debug`]; - argv.update && jestOptions.unshift(`-u`); - argv.verbose && jestOptions.unshift(`--verbose`); - argv.coverage && jestOptions.unshift(`--coverage`); - const jestPath = `./node_modules/.bin/jest`; - const debugOpts = jestOptions.join(' '); - const spawnOptions = { - stdio: [`ignore`, `inherit`, `inherit`], - env: Object.assign({}, process.env, { - TEST_TARGET: target, TEST_MODULE: format - }) - }; - return function testTask() { - return !debug ? 
- child_process.spawn(jestPath, jestOptions, spawnOptions) : - child_process.exec(`node --inspect-brk ${jestPath} ${debugOpts}`, spawnOptions); - } -} - -function closureTask(target, format, taskName, outDir) { - const clsTarget = `es5`; - const googleRoot = `targets/${clsTarget}/cls`; - const languageIn = clsTarget === `es5` ? `es2015` : clsTarget; - return [ - [`clean:${taskName}`, `build:${clsTarget}:cls`], - function closureTask() { - return closureStream( - closureSrcs(), - closureCompiler(closureArgs()) - ).on('end', () => del([`targets/${target}/cls/**`])); - } - ]; - function closureSrcs() { - return gulp.src([ - `closure-compiler-scripts/*.js`, - `${googleRoot}/**/*.js`, - `!${googleRoot}/format/*.js`, - `!${googleRoot}/Arrow.externs.js`, - ], { base: `./` }); - } - function closureStream(sources, compiler) { - const streams = [ - sources, - sourcemaps.init(), - compiler, - sourcemaps.write('.'), - gulp.dest(outDir) - ]; - // copy the ES5 UMD bundle to dist - if (target === `es5`) { - streams.push(gulp.dest(`dist`)); - } - return pump(...streams, onError); - } - function closureArgs() { - return { - third_party: true, - externs: `${googleRoot}/Arrow.externs.js`, - warning_level: `QUIET`, - dependency_mode: `LOOSE`, - rewrite_polyfills: false, - // formatting: `PRETTY_PRINT`, - compilation_level: `ADVANCED`, - assume_function_wrapper: true, - js_output_file: `Arrow.js`, - language_in: gCCTargets[languageIn], - language_out: gCCTargets[clsTarget], - entry_point: `${googleRoot}/Arrow.js`, - output_wrapper: -`// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -(function (global, factory) { - typeof exports === 'object' && typeof module !== 'undefined' ? factory(exports) : - typeof define === 'function' && define.amd ? define(['exports'], factory) : - (factory(global.Arrow = global.Arrow || {})); -}(this, (function (exports) {%output%}.bind(this))));` - }; - } -} - -function typescriptTask(target, format, taskName, outDir) { - return [ - [`clean:${taskName}`], - function typescriptTask() { - const tsconfigPath = `tsconfig/tsconfig.${target}.${format}.json`; - let { js, dts } = tsProjects.find((p) => p.target === target && p.format === format) || {}; - if (!js || !dts) { - let tsProject = ts.createProject(tsconfigPath); - ({ js, dts } = pump( - tsProject.src(), - sourcemaps.init(), - tsProject(ts.reporter.fullReporter(true)), - onError - )); - dts = [dts, gulp.dest(outDir)]; - js = [js, sourcemaps.write(), gulp.dest(outDir)]; - // copy types to the root - if (target === `es5` && format === `cjs`) { - dts.push(gulp.dest(`types`)); - } - tsProjects.push({ - target, format, - js: js = pump(...js, onError), - dts: dts = pump(...dts, onError) - }); - } - return streamMerge([ dts, js ]); - } - ]; -} - -function* combinations(_targets, _modules) { - - const targets = known(knownTargets, _targets || [`all`]); - const modules = known(knownModules, _modules || [`all`]); - - for (const format of modules) { - for (const target of targets) { - yield [target, format]; - } - } - - function known(known, values) { - return ~values.indexOf(`all`) - ? 
known - : Object.keys( - values.reduce((map, arg) => (( - (known.indexOf(arg) !== -1) && - (map[arg.toLowerCase()] = true) - || true) && map - ), {}) - ).sort((a, b) => known.indexOf(a) - known.indexOf(b)); + tasks.push(`${name}:${taskName(target, format)}`); } + return tasks.length && tasks || [(done) => done()]; } - -function onError(err) { - if (typeof err === 'number') { - process.exit(err); - } else if (err) { - console.error(err.stack || err.toString()); - process.exit(1); - } -} \ No newline at end of file diff --git a/js/lerna.json b/js/lerna.json index c8fb8c072c61e..0bf16fdfd57be 100644 --- a/js/lerna.json +++ b/js/lerna.json @@ -2,8 +2,10 @@ "lerna": "2.0.0", "version": "0.1.1", "packages": [ + "targets/ts", "targets/es5/*", "targets/es2015/*", - "targets/esnext/*" + "targets/esnext/*", + "targets/apache-arrow" ] } diff --git a/cpp/src/plasma/test/run_valgrind.sh b/js/npm-release.sh old mode 100644 new mode 100755 similarity index 71% rename from cpp/src/plasma/test/run_valgrind.sh rename to js/npm-release.sh index 0472194128679..42cd73c0cfe59 --- a/cpp/src/plasma/test/run_valgrind.sh +++ b/js/npm-release.sh @@ -16,12 +16,15 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - -# Cause the script to exit if a single command fails. 
set -e -./src/plasma/plasma_store -s /tmp/plasma_store_socket_1 -m 0 & -sleep 1 -valgrind --leak-check=full --error-exitcode=1 ./src/plasma/manager_tests -killall plasma_store -valgrind --leak-check=full --error-exitcode=1 ./src/plasma/serialization_tests +# validate the targets pass all tests before publishing +npm install +# npx run-s clean:all lint create:testdata build +# npm run test -- -t ts -u --integration +# npm run test -- --integration +npx run-s clean:all lint build +npm run test + +# publish the JS target modules to npm +npx lerna publish --yes --skip-git --force-publish=* diff --git a/js/package.json b/js/package.json index 03687a8b25ca2..d68e7a6279e61 100644 --- a/js/package.json +++ b/js/package.json @@ -1,31 +1,28 @@ { + "version": "0.2.0", "name": "apache-arrow", - "version": "0.1.2", - "types": "./types/Arrow.d.ts", - "typings": "./types/Arrow.d.ts", - "main": "./targets/es5/cjs/Arrow.js", - "module": "./targets/es5/esm/Arrow.js", - "browser": "./targets/es5/umd/Arrow.js", - "jsnext:main": "./targets/es2015/esm/Arrow.js", - "esnext:main": "./targets/esnext/esm/Arrow.js", "description": "Apache Arrow columnar in-memory format", + "bin": { + "arrow2csv": "bin/arrow2csv" + }, "scripts": { "lerna": "lerna", "test": "gulp test", "build": "gulp build", "clean": "gulp clean", - "bundle": "gulp bundle", - "package": "gulp package", + "debug": "gulp debug", "perf": "node ./perf/index.js", - "test:debug": "gulp test:debug", - "test:coverage": "gulp test -t esnext -m esm --coverage", - "validate": "npm-run-all clean lint build test bundle", - "lerna:publish": "lerna exec --bail=false npm publish", - "prepublishOnly": "sh ./prepublish.sh", + "release": "./npm-release.sh", + "clean:all": "run-p clean clean:testdata", + "clean:testdata": "gulp clean:testdata", + "create:testdata": "gulp create:testdata", + "test:coverage": "gulp test -t ts --coverage", "doc": "shx rm -rf ./doc && esdoc", - "lint": "npm-run-all -p lint:*", - "lint:src": "tslint --fix 
--type-check -p tsconfig.json -c tslint.json \"src/**/*.ts\"", - "lint:test": "tslint --fix --type-check -p test/tsconfig.json -c tslint.json \"test/**/*.ts\"" + "lint": "run-p lint:*", + "lint:src": "tslint --fix --project -p tsconfig.json -c tslint.json \"src/**/*.ts\"", + "lint:test": "tslint --fix --project -p test/tsconfig.json -c tslint.json \"test/**/*.ts\"", + "prepublishOnly": "echo \"Error: do 'npm run release' instead of 'npm publish'\" && exit 1", + "version": "npm install && npm run clean:all" }, "repository": { "type": "git", @@ -42,54 +39,66 @@ }, "homepage": "https://github.com/apache/arrow/blob/master/js/README.md", "files": [ + "bin", "src", - "dist", - "types", - "targets", - "LICENSE", - "README.md" + "gulp", + "test", + "*.json", + "tsconfig", + "README.md", + "gulpfile.js", + "npm-release.sh" ], - "peerDependencies": { - "tslib": "~1.7.1", - "command-line-usage": "4.0.1" - }, "dependencies": { - "flatbuffers": "1.7.0", - "text-encoding": "0.6.4" + "@types/text-encoding-utf-8": "1.0.1", + "command-line-args": "4.0.7", + "command-line-usage": "4.0.2", + "flatbuffers": "trxcllnt/flatbuffers-esm", + "json-bignum": "0.0.3", + "text-encoding-utf-8": "^1.0.2", + "tslib": "1.8.1" }, "devDependencies": { - "@types/flatbuffers": "1.6.4", - "@types/jest": "20.0.8", - "@types/node": "^8.0.24", - "@types/text-encoding": "0.0.32", + "@std/esm": "0.19.1", + "@types/flatbuffers": "1.6.5", + "@types/glob": "5.0.34", + "@types/jest": "22.0.1", + "@types/node": "9.3.0", + "ast-types": "0.10.1", "benchmark": "2.1.4", - "coveralls": "2.13.1", - "command-line-args": "4.0.7", + "coveralls": "3.0.0", "del": "3.0.0", - "esdoc": "1.0.3", + "esdoc": "1.0.4", "esdoc-standard-plugin": "1.0.0", - "google-closure-compiler": "20170910.0.0", - "gulp": "github:gulpjs/gulp#4.0", - "gulp-json-transform": "0.4.2", - "gulp-sourcemaps": "2.6.1", - "gulp-typescript": "3.2.2", - "jest": "21.1.0", + "glob": "7.1.2", + "google-closure-compiler": "20180101.0.0", + "gulp": 
"github:gulpjs/gulp#6d71a658c61edb3090221579d8f97dbe086ba2ed", + "gulp-json-transform": "0.4.5", + "gulp-rename": "1.2.2", + "gulp-sourcemaps": "2.6.3", + "gulp-transform-js-ast": "1.0.2", + "gulp-typescript": "3.2.3", + "ix": "2.3.4", + "jest": "22.0.5", "jest-environment-node-debug": "2.0.0", "json": "9.0.6", - "lerna": "2.2.0", - "lint-staged": "4.2.1", - "merge2": "1.2.0", + "lerna": "2.6.0", + "lint-staged": "6.0.0", + "merge2": "1.2.1", "mkdirp": "0.5.1", - "npm-run-all": "4.1.1", + "npm-run-all": "4.1.2", "pump": "1.0.2", "rimraf": "2.6.2", + "rxjs": "5.5.6", "shx": "0.2.2", - "text-encoding-utf-8": "1.0.1", - "trash": "4.1.0", - "ts-jest": "21.0.1", - "tslib": "1.7.1", - "tslint": "5.7.0", - "typescript": "2.5.2" + "source-map-loader": "0.2.3", + "trash": "4.2.1", + "ts-jest": "22.0.1", + "tslint": "5.9.1", + "typescript": "2.6.2", + "uglifyjs-webpack-plugin": "1.1.6", + "webpack": "3.10.0", + "xml2js": "0.4.19" }, "lint-staged": { "*.@(ts)": [ @@ -122,9 +131,12 @@ "/node_modules/" ], "transform": { - ".(ts|tsx)": "/node_modules/ts-jest/preprocessor.js", - ".(js|jsx)": "/node_modules/babel-jest/build/index.js" + ".(ts|tsx)": "./node_modules/ts-jest/preprocessor.js", + ".(js|jsx)": "./node_modules/babel-jest/build/index.js" }, + "transformIgnorePatterns": [ + "/node_modules/", "/(es2015|esnext)\/umd/" + ], "testRegex": "(.*(-|\\.)(test|spec)s?)\\.(ts|tsx|js)$" } } diff --git a/js/perf/arrows/file/dictionary.arrow b/js/perf/arrows/file/dictionary.arrow deleted file mode 100644 index 34d41db1f2001..0000000000000 Binary files a/js/perf/arrows/file/dictionary.arrow and /dev/null differ diff --git a/js/perf/arrows/file/simple.arrow b/js/perf/arrows/file/simple.arrow deleted file mode 100644 index 838db6dc8eda5..0000000000000 Binary files a/js/perf/arrows/file/simple.arrow and /dev/null differ diff --git a/js/perf/arrows/file/struct.arrow b/js/perf/arrows/file/struct.arrow deleted file mode 100644 index 3d2c018e6c27c..0000000000000 Binary files 
a/js/perf/arrows/file/struct.arrow and /dev/null differ diff --git a/js/perf/arrows/multi/count/records.arrow b/js/perf/arrows/multi/count/records.arrow deleted file mode 100644 index 00d883762d369..0000000000000 Binary files a/js/perf/arrows/multi/count/records.arrow and /dev/null differ diff --git a/js/perf/arrows/multi/count/schema.arrow b/js/perf/arrows/multi/count/schema.arrow deleted file mode 100644 index dfd24e9e0018c..0000000000000 Binary files a/js/perf/arrows/multi/count/schema.arrow and /dev/null differ diff --git a/js/perf/arrows/multi/latlong/records.arrow b/js/perf/arrows/multi/latlong/records.arrow deleted file mode 100644 index 563d12d175d4e..0000000000000 Binary files a/js/perf/arrows/multi/latlong/records.arrow and /dev/null differ diff --git a/js/perf/arrows/multi/latlong/schema.arrow b/js/perf/arrows/multi/latlong/schema.arrow deleted file mode 100644 index 638b2ab622f8e..0000000000000 Binary files a/js/perf/arrows/multi/latlong/schema.arrow and /dev/null differ diff --git a/js/perf/arrows/multi/origins/records.arrow b/js/perf/arrows/multi/origins/records.arrow deleted file mode 100644 index 49a8c407e176e..0000000000000 Binary files a/js/perf/arrows/multi/origins/records.arrow and /dev/null differ diff --git a/js/perf/arrows/multi/origins/schema.arrow b/js/perf/arrows/multi/origins/schema.arrow deleted file mode 100644 index 0d10fb0e2d135..0000000000000 Binary files a/js/perf/arrows/multi/origins/schema.arrow and /dev/null differ diff --git a/js/perf/arrows/stream/dictionary.arrow b/js/perf/arrows/stream/dictionary.arrow deleted file mode 100644 index 17ca48b3a97f5..0000000000000 Binary files a/js/perf/arrows/stream/dictionary.arrow and /dev/null differ diff --git a/js/perf/arrows/stream/simple.arrow b/js/perf/arrows/stream/simple.arrow deleted file mode 100644 index 2c68c0e44b0af..0000000000000 Binary files a/js/perf/arrows/stream/simple.arrow and /dev/null differ diff --git a/js/perf/arrows/stream/struct.arrow 
b/js/perf/arrows/stream/struct.arrow deleted file mode 100644 index 4e97b7084f6b7..0000000000000 Binary files a/js/perf/arrows/stream/struct.arrow and /dev/null differ diff --git a/js/perf/config.js b/js/perf/config.js index 4fbcda3799587..cca1080154790 100644 --- a/js/perf/config.js +++ b/js/perf/config.js @@ -17,22 +17,14 @@ const fs = require('fs'); const path = require('path'); -const arrowFormats = ['file', 'stream']; -const arrowFileNames = ['simple', 'struct', 'dictionary']; -const multipartArrows = ['count', 'latlong', 'origins']; -let arrowTestConfigurations = []; +const glob = require('glob'); -arrowTestConfigurations = multipartArrows.reduce((configs, folder) => { - const schemaPath = path.resolve(__dirname, `./arrows/multi/${folder}/schema.arrow`); - const recordsPath = path.resolve(__dirname, `./arrows/multi/${folder}/records.arrow`); - return [...configs, [`multipart ${folder}`, fs.readFileSync(schemaPath), fs.readFileSync(recordsPath)]]; -}, arrowTestConfigurations); +const config = []; +const filenames = glob.sync(path.resolve(__dirname, `../test/data/cpp/stream`, `*.arrow`)); -arrowTestConfigurations = arrowFormats.reduce((configs, format) => { - return arrowFileNames.reduce((configs, name) => { - const arrowPath = path.resolve(__dirname, `./arrows/${format}/${name}.arrow`); - return [...configs, [`${name} ${format}`, fs.readFileSync(arrowPath)]]; - }, configs); -}, arrowTestConfigurations); +for (const filename of filenames) { + const { name } = path.parse(filename); + config.push({ name, buffers: [fs.readFileSync(filename)] }); +} -module.exports = arrowTestConfigurations; +module.exports = config; diff --git a/js/perf/index.js b/js/perf/index.js index 669f690122d10..9eac40e64ac71 100644 --- a/js/perf/index.js +++ b/js/perf/index.js @@ -16,31 +16,29 @@ // under the License. // Use the ES5 UMD target as perf baseline -// ES6/7 iterators are faster in turbofan, but something about the -// ES5 transpilation (rewriting let and const to var?) 
JITs better -const { Table, readBuffers } = require('../dist/Arrow'); -// const { Table, readBuffers } = require('../targets/es5/cjs'); -// const { Table, readBuffers } = require('../targets/es2015/cjs'); -// const { Table, readBuffers } = require('../targets/esnext/cjs'); +// const { Table, readVectors } = require('../targets/es5/umd'); +// const { Table, readVectors } = require('../targets/es5/cjs'); +const { Table, readVectors } = require('../targets/es2015/umd'); +// const { Table, readVectors } = require('../targets/es2015/cjs'); +const config = require('./config'); const Benchmark = require('benchmark'); -const arrowTestConfigurations = require('./config'); const suites = []; -for (let [name, ...buffers] of arrowTestConfigurations) { +for (let { name, buffers} of config) { const parseSuite = new Benchmark.Suite(`Parse ${name}`, { async: true }); const sliceSuite = new Benchmark.Suite(`Slice ${name} vectors`, { async: true }); const iterateSuite = new Benchmark.Suite(`Iterate ${name} vectors`, { async: true }); const getByIndexSuite = new Benchmark.Suite(`Get ${name} values by index`, { async: true }); parseSuite.add(createFromTableTest(name, buffers)); - parseSuite.add(createReadBuffersTest(name, buffers)); - for (const vector of Table.from(...buffers).cols()) { + parseSuite.add(createReadVectorsTest(name, buffers)); + for (const vector of Table.from(buffers).columns) { sliceSuite.add(createSliceTest(vector)); iterateSuite.add(createIterateTest(vector)); getByIndexSuite.add(createGetByIndexTest(vector)); } - suites.push(parseSuite, sliceSuite, getByIndexSuite, iterateSuite); + suites.push(getByIndexSuite, iterateSuite, sliceSuite, parseSuite); } console.log('Running apache-arrow performance tests...\n'); @@ -68,16 +66,16 @@ function createFromTableTest(name, buffers) { return { async: true, name: `Table.from`, - fn() { table = Table.from(...buffers); } + fn() { table = Table.from(buffers); } }; } -function createReadBuffersTest(name, buffers) { +function 
createReadVectorsTest(name, buffers) { let vectors; return { async: true, - name: `readBuffers`, - fn() { for (vectors of readBuffers(...buffers)) {} } + name: `readVectors`, + fn() { for (vectors of readVectors(buffers)) {} } }; } diff --git a/js/src/Arrow.externs.ts b/js/src/Arrow.externs.ts index 7289d6d2732b6..c23930271183d 100644 --- a/js/src/Arrow.externs.ts +++ b/js/src/Arrow.externs.ts @@ -24,23 +24,30 @@ Symbol.iterator; /** @type {symbol} */ Symbol.asyncIterator; -let Table = function() {}; + +let RowVector = function() {}; /** @type {?} */ -Table.prototype.length; +RowVector.prototype.toJSON; /** @type {?} */ -Table.prototype.rows; +RowVector.prototype.toArray; /** @type {?} */ -Table.prototype.cols; +RowVector.prototype.toObject; /** @type {?} */ -Table.prototype.getRow; +RowVector.prototype.toString; + +let Table = function() {}; +/** @type {?} */ +( Table).from; /** @type {?} */ -Table.prototype.getCell; +Table.prototype.columns; /** @type {?} */ -Table.prototype.getCellAt; +Table.prototype.length; /** @type {?} */ -Table.prototype.getColumn; +Table.prototype.col; /** @type {?} */ -Table.prototype.getColumnAt; +Table.prototype.key; +/** @type {?} */ +Table.prototype.select; /** @type {?} */ Table.prototype.toString; @@ -52,24 +59,26 @@ Vector.prototype.name; /** @type {?} */ Vector.prototype.type; /** @type {?} */ -Vector.prototype.props; -/** @type {?} */ Vector.prototype.get; /** @type {?} */ Vector.prototype.concat; /** @type {?} */ Vector.prototype.slice; - -let TypedVector = function() {}; /** @type {?} */ -TypedVector.prototype.arrayType; +Vector.prototype.metadata; +/** @type {?} */ +Vector.prototype.nullable; +/** @type {?} */ +Vector.prototype.nullCount; -let ValidityVector = function() {}; +let BoolVector = function() {}; +/** @type {?} */ +( BoolVector).pack; /** @type {?} */ -( ValidityVector).pack; +BoolVector.prototype.set; let DictionaryVector = function() {}; /** @type {?} */ -DictionaryVector.prototype.index; 
+DictionaryVector.prototype.getKey; /** @type {?} */ -DictionaryVector.prototype.value; +DictionaryVector.prototype.getValue; diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts index ea8a5c3e1d9bb..3a8943434eece 100644 --- a/js/src/Arrow.ts +++ b/js/src/Arrow.ts @@ -15,31 +15,52 @@ // specific language governing permissions and limitations // under the License. -import { Table } from './table'; -import { readBuffers } from './reader/arrow'; +import { Table } from './vector/table'; import { Vector } from './vector/vector'; -import { StructVector } from './vector/struct'; +import { Utf8Vector } from './vector/utf8'; import { DictionaryVector } from './vector/dictionary'; -import { ListVector, Utf8Vector, FixedSizeListVector } from './vector/list'; +import { StructVector, StructRow } from './vector/struct'; +import { read, readAsync } from './reader/arrow'; +import { Uint64, Int64, Int128 } from './util/int'; +import { ListVector, BinaryVector, FixedSizeListVector } from './vector/list'; + import { - TypedVector, BitVector, - DateVector, IndexVector, - Int8Vector, Int16Vector, - Int32Vector, Int64Vector, - Uint8Vector, Uint16Vector, - Uint32Vector, Uint64Vector, - Float32Vector, Float64Vector, -} from './vector/typed'; + BoolVector, + Int8Vector, + Int16Vector, + Int32Vector, + Int64Vector, + Uint8Vector, + Uint16Vector, + Uint32Vector, + Uint64Vector, + Float16Vector, + Float32Vector, + Float64Vector, + Date32Vector, + Date64Vector, + Time32Vector, + Time64Vector, + DecimalVector, + TimestampVector, +} from './vector/numeric'; + +// closure compiler always erases static method names: +// https://github.com/google/closure-compiler/issues/1776 +// set them via string indexers to save them from the mangler +Table['from'] = Table.from; +Table['fromAsync'] = Table.fromAsync; +BoolVector['pack'] = BoolVector.pack; +export { read, readAsync }; +export { Table, Vector, StructRow }; +export { Uint64, Int64, Int128 }; +export { NumericVectorConstructor } from './vector/numeric'; 
+export { List, TypedArray, TypedArrayConstructor } from './vector/types'; export { - Table, readBuffers, - Vector, - BitVector, + BoolVector, ListVector, Utf8Vector, - DateVector, - IndexVector, - TypedVector, Int8Vector, Int16Vector, Int32Vector, @@ -48,9 +69,17 @@ export { Uint16Vector, Uint32Vector, Uint64Vector, + Date32Vector, + Date64Vector, + Time32Vector, + Time64Vector, + BinaryVector, + StructVector, + Float16Vector, Float32Vector, Float64Vector, - StructVector, + DecimalVector, + TimestampVector, DictionaryVector, FixedSizeListVector, }; @@ -60,15 +89,14 @@ try { const Arrow = eval('exports'); if (typeof Arrow === 'object') { // string indexers tell closure compiler not to rename these properties + Arrow['read'] = read; + Arrow['readAsync'] = readAsync; Arrow['Table'] = Table; - Arrow['readBuffers'] = readBuffers; Arrow['Vector'] = Vector; - Arrow['BitVector'] = BitVector; + Arrow['StructRow'] = StructRow; + Arrow['BoolVector'] = BoolVector; Arrow['ListVector'] = ListVector; Arrow['Utf8Vector'] = Utf8Vector; - Arrow['DateVector'] = DateVector; - Arrow['IndexVector'] = IndexVector; - Arrow['TypedVector'] = TypedVector; Arrow['Int8Vector'] = Int8Vector; Arrow['Int16Vector'] = Int16Vector; Arrow['Int32Vector'] = Int32Vector; @@ -77,9 +105,17 @@ try { Arrow['Uint16Vector'] = Uint16Vector; Arrow['Uint32Vector'] = Uint32Vector; Arrow['Uint64Vector'] = Uint64Vector; + Arrow['Date32Vector'] = Date32Vector; + Arrow['Date64Vector'] = Date64Vector; + Arrow['Time32Vector'] = Time32Vector; + Arrow['Time64Vector'] = Time64Vector; + Arrow['BinaryVector'] = BinaryVector; + Arrow['StructVector'] = StructVector; + Arrow['Float16Vector'] = Float16Vector; Arrow['Float32Vector'] = Float32Vector; Arrow['Float64Vector'] = Float64Vector; - Arrow['StructVector'] = StructVector; + Arrow['DecimalVector'] = DecimalVector; + Arrow['TimestampVector'] = TimestampVector; Arrow['DictionaryVector'] = DictionaryVector; Arrow['FixedSizeListVector'] = FixedSizeListVector; } diff --git 
a/js/bin/arrow2csv.js b/js/src/bin/arrow2csv.ts old mode 100755 new mode 100644 similarity index 53% rename from js/bin/arrow2csv.js rename to js/src/bin/arrow2csv.ts index f316b84b69e58..01ef0b848ce75 --- a/js/bin/arrow2csv.js +++ b/js/src/bin/arrow2csv.ts @@ -1,4 +1,4 @@ -#! /usr/bin/env node +// #! /usr/bin/env node // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file @@ -17,13 +17,19 @@ // specific language governing permissions and limitations // under the License. -var fs = require('fs'); -var Table = require('../dist/Arrow.js').Table; -var optionList = [ +/* tslint:disable */ + +import * as Arrow from '../Arrow'; + +(function() { + +const fs = require('fs'); +const { parse } = require('json-bignum'); +const optionList = [ { type: String, - name: 'schema', - alias: 's', multiple: true, + name: 'schema', alias: 's', + optional: true, multiple: true, typeLabel: '[underline]{columns}', description: 'A space-delimited list of column names' }, @@ -34,12 +40,10 @@ var optionList = [ } ]; -var argv = require(`command-line-args`)(optionList, { partial: true }); -var files = [argv.file, ...(argv._unknown || [])].filter(Boolean); - -// console.log(JSON.stringify(argv)); +const argv = require(`command-line-args`)(optionList, { partial: true }); +const files = [argv.file, ...(argv._unknown || [])].filter(Boolean); -if (!argv.schema || !files.length) { +if (!files.length) { console.log(require('command-line-usage')([ { header: 'arrow2csv', @@ -81,9 +85,51 @@ if (!argv.schema || !files.length) { } files.forEach((source) => { - var allColumns = Table.from(fs.readFileSync(source)); - var selectedColumns = new Table(argv.schema.map((columnName) => { - return allColumns.getColumn(columnName); - })); - console.log(selectedColumns.toString()); + let table: any, input = fs.readFileSync(source); + try { + table = Arrow.Table.from([input]); + } catch (e) { + table = Arrow.Table.from(parse(input + '')); + } + if 
(argv.schema && argv.schema.length) { + table = table.select(...argv.schema); + } + printTable(table); }); + +function printTable(table: Arrow.Table) { + let header = [...table.columns.map((_, i) => table.key(i))].map(stringify); + let maxColumnWidths = header.map(x => x.length); + // Pass one to convert to strings and count max column widths + for (let i = -1, n = table.length - 1; ++i < n;) { + let val, + row = [i, ...table.get(i)]; + for (let j = -1, k = row.length; ++j < k; ) { + val = stringify(row[j]); + maxColumnWidths[j] = Math.max(maxColumnWidths[j], val.length); + } + } + console.log(header.map((x, j) => leftPad(x, ' ', maxColumnWidths[j])).join(' | ')); + // Pass two to pad each one to max column width + for (let i = -1, n = table.length; ++i < n; ) { + console.log( + [...table.get(i)] + .map(stringify) + .map((x, j) => leftPad(x, ' ', maxColumnWidths[j])) + .join(' | ') + ); + } +} + +function leftPad(str: string, fill: string, n: number) { + return (new Array(n + 1).join(fill) + str).slice(-1 * n); +} + +function stringify(x: any) { + return typeof x === 'string' ? `"${x}"` + : Array.isArray(x) ? JSON.stringify(x) + : ArrayBuffer.isView(x) ? `[${x}]` + : `${x}`; +} + +})(); \ No newline at end of file diff --git a/js/src/format/arrow.ts b/js/src/format/arrow.ts new file mode 100644 index 0000000000000..14adf9040a47f --- /dev/null +++ b/js/src/format/arrow.ts @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { footerFromByteBuffer, messageFromByteBuffer } from './fb'; +import { schemaFromJSON, recordBatchFromJSON, dictionaryBatchFromJSON } from './json'; +import { + IntBitWidth, TimeBitWidth, + VisitorNode, Visitor, Footer, Block, Message, Schema, RecordBatch, DictionaryBatch, Field, DictionaryEncoding, Buffer, FieldNode, + Null, Int, FloatingPoint, Binary, Bool, Utf8, Decimal, Date, Time, Timestamp, Interval, List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, +} from './types'; + +export { + IntBitWidth, TimeBitWidth, + footerFromByteBuffer, messageFromByteBuffer, + schemaFromJSON, recordBatchFromJSON, dictionaryBatchFromJSON, + VisitorNode, Visitor, Footer, Block, Message, Schema, RecordBatch, DictionaryBatch, Field, DictionaryEncoding, Buffer, FieldNode, + Null, Int, FloatingPoint, Binary, Bool, Utf8, Decimal, Date, Time, Timestamp, Interval, List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_ as Map, +}; diff --git a/js/src/format/fb.ts b/js/src/format/fb.ts new file mode 100644 index 0000000000000..fdf7f7b0ed99a --- /dev/null +++ b/js/src/format/fb.ts @@ -0,0 +1,234 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import * as File_ from './fb/File'; +import * as Schema_ from './fb/Schema'; +import * as Message_ from './fb/Message'; +import { flatbuffers } from 'flatbuffers'; +import ByteBuffer = flatbuffers.ByteBuffer; +import Type = Schema_.org.apache.arrow.flatbuf.Type; +import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader; +import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; +import _Footer = File_.org.apache.arrow.flatbuf.Footer; +import _Block = File_.org.apache.arrow.flatbuf.Block; +import _Message = Message_.org.apache.arrow.flatbuf.Message; +import _Schema = Schema_.org.apache.arrow.flatbuf.Schema; +import _Field = Schema_.org.apache.arrow.flatbuf.Field; +import _RecordBatch = Message_.org.apache.arrow.flatbuf.RecordBatch; +import _DictionaryBatch = Message_.org.apache.arrow.flatbuf.DictionaryBatch; +import _FieldNode = Message_.org.apache.arrow.flatbuf.FieldNode; +import _Buffer = Schema_.org.apache.arrow.flatbuf.Buffer; +import _DictionaryEncoding = Schema_.org.apache.arrow.flatbuf.DictionaryEncoding; +import _Null = Schema_.org.apache.arrow.flatbuf.Null; +import _Int = Schema_.org.apache.arrow.flatbuf.Int; +import _FloatingPoint = Schema_.org.apache.arrow.flatbuf.FloatingPoint; +import _Binary = Schema_.org.apache.arrow.flatbuf.Binary; +import _Bool = Schema_.org.apache.arrow.flatbuf.Bool; +import _Utf8 = Schema_.org.apache.arrow.flatbuf.Utf8; +import _Decimal = Schema_.org.apache.arrow.flatbuf.Decimal; +import _Date = Schema_.org.apache.arrow.flatbuf.Date; +import _Time = 
Schema_.org.apache.arrow.flatbuf.Time; +import _Timestamp = Schema_.org.apache.arrow.flatbuf.Timestamp; +import _Interval = Schema_.org.apache.arrow.flatbuf.Interval; +import _List = Schema_.org.apache.arrow.flatbuf.List; +import _Struct = Schema_.org.apache.arrow.flatbuf.Struct_; +import _Union = Schema_.org.apache.arrow.flatbuf.Union; +import _FixedSizeBinary = Schema_.org.apache.arrow.flatbuf.FixedSizeBinary; +import _FixedSizeList = Schema_.org.apache.arrow.flatbuf.FixedSizeList; +import _Map = Schema_.org.apache.arrow.flatbuf.Map; + +import { + IntBitWidth, TimeBitWidth, + Footer, Block, Schema, RecordBatch, DictionaryBatch, Field, DictionaryEncoding, Buffer, FieldNode, + Null, Int, FloatingPoint, Binary, Bool, Utf8, Decimal, Date, Time, Timestamp, Interval, List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, +} from './types'; + +export function footerFromByteBuffer(bb: ByteBuffer) { + const f = _Footer.getRootAsFooter(bb), s = f.schema()!; + return new Footer( + dictionaryBatchesFromFooter(f), recordBatchesFromFooter(f), + new Schema(f.version(), fieldsFromSchema(s), customMetadata(s), s.endianness()) + ); +} + +export function messageFromByteBuffer(bb: ByteBuffer) { + const m = _Message.getRootAsMessage(bb)!, type = m.headerType(), version = m.version(); + switch (type) { + case MessageHeader.Schema: return schemaFromMessage(version, m.header(new _Schema())!); + case MessageHeader.RecordBatch: return recordBatchFromMessage(version, m.header(new _RecordBatch())!); + case MessageHeader.DictionaryBatch: return dictionaryBatchFromMessage(version, m.header(new _DictionaryBatch())!); + } + return null; + // throw new Error(`Unrecognized Message type '${type}'`); +} + +function schemaFromMessage(version: MetadataVersion, s: _Schema) { + return new Schema(version, fieldsFromSchema(s), customMetadata(s), s.endianness()); +} + +function recordBatchFromMessage(version: MetadataVersion, b: _RecordBatch) { + return new RecordBatch(version, b.length(), 
fieldNodesFromRecordBatch(b), buffersFromRecordBatch(b, version)); +} + +function dictionaryBatchFromMessage(version: MetadataVersion, d: _DictionaryBatch) { + return new DictionaryBatch(version, recordBatchFromMessage(version, d.data()!), d.id(), d.isDelta()); +} + +function dictionaryBatchesFromFooter(f: _Footer) { + const blocks = [] as Block[]; + for (let b: _Block, i = -1, n = f && f.dictionariesLength(); ++i < n;) { + if (b = f.dictionaries(i)!) { + blocks.push(new Block(b.metaDataLength(), b.bodyLength(), b.offset())); + } + } + return blocks; +} + +function recordBatchesFromFooter(f: _Footer) { + const blocks = [] as Block[]; + for (let b: _Block, i = -1, n = f && f.recordBatchesLength(); ++i < n;) { + if (b = f.recordBatches(i)!) { + blocks.push(new Block(b.metaDataLength(), b.bodyLength(), b.offset())); + } + } + return blocks; +} + +function fieldsFromSchema(s: _Schema) { + const fields = [] as Field[]; + for (let i = -1, n = s && s.fieldsLength(); ++i < n;) { + fields.push(field(s.fields(i)!)); + } + return fields; +} + +function fieldsFromField(f: _Field) { + const fields = [] as Field[]; + for (let i = -1, n = f && f.childrenLength(); ++i < n;) { + fields.push(field(f.children(i)!)); + } + return fields; +} + +function fieldNodesFromRecordBatch(b: _RecordBatch) { + const fieldNodes = [] as FieldNode[]; + for (let i = -1, n = b.nodesLength(); ++i < n;) { + fieldNodes.push(fieldNodeFromRecordBatch(b.nodes(i)!)); + } + return fieldNodes; +} + +function buffersFromRecordBatch(b: _RecordBatch, version: MetadataVersion) { + const buffers = [] as Buffer[]; + for (let i = -1, n = b.buffersLength(); ++i < n;) { + let buffer = b.buffers(i)!; + // If this Arrow buffer was written before version 4, + // advance the buffer's bb_pos 8 bytes to skip past + // the now-removed page id field. 
+ if (version < MetadataVersion.V4) { + buffer.bb_pos += (8 * (i + 1)); + } + buffers.push(bufferFromRecordBatch(buffer)); + } + return buffers; +} + +function field(f: _Field) { + return new Field( + f.name()!, + typeFromField(f), + f.typeType(), + f.nullable(), + fieldsFromField(f), + customMetadata(f), + dictionaryEncodingFromField(f) + ); +} + +function dictionaryEncodingFromField(f: _Field) { + let t: _Int | null; + let e: _DictionaryEncoding | null; + if (e = f.dictionary()) { + if (t = e.indexType()) { + return new DictionaryEncoding(new Int(t.isSigned(), t.bitWidth() as IntBitWidth), e.id(), e.isOrdered()); + } + return new DictionaryEncoding(null, e.id(), e.isOrdered()); + } + return undefined; +} + +function customMetadata(parent?: _Schema | _Field | null) { + const data = new Map(); + if (parent) { + for (let entry, key, i = -1, n = parent.customMetadataLength() | 0; ++i < n;) { + if ((entry = parent.customMetadata(i)) && (key = entry.key()) != null) { + data.set(key, entry.value()!); + } + } + } + return data; +} + +function fieldNodeFromRecordBatch(f: _FieldNode) { + return new FieldNode(f.length(), f.nullCount()); +} + +function bufferFromRecordBatch(b: _Buffer) { + return new Buffer(b.offset(), b.length()); +} + +function typeFromField(f: _Field) { + switch (f.typeType()) { + case Type.NONE: return nullFromField(f.type(new _Null())!); + case Type.Null: return nullFromField(f.type(new _Null())!); + case Type.Int: return intFromField(f.type(new _Int())!); + case Type.FloatingPoint: return floatingPointFromField(f.type(new _FloatingPoint())!); + case Type.Binary: return binaryFromField(f.type(new _Binary())!); + case Type.Utf8: return utf8FromField(f.type(new _Utf8())!); + case Type.Bool: return boolFromField(f.type(new _Bool())!); + case Type.Decimal: return decimalFromField(f.type(new _Decimal())!); + case Type.Date: return dateFromField(f.type(new _Date())!); + case Type.Time: return timeFromField(f.type(new _Time())!); + case Type.Timestamp: return 
timestampFromField(f.type(new _Timestamp())!); + case Type.Interval: return intervalFromField(f.type(new _Interval())!); + case Type.List: return listFromField(f.type(new _List())!); + case Type.Struct_: return structFromField(f.type(new _Struct())!); + case Type.Union: return unionFromField(f.type(new _Union())!); + case Type.FixedSizeBinary: return fixedSizeBinaryFromField(f.type(new _FixedSizeBinary())!); + case Type.FixedSizeList: return fixedSizeListFromField(f.type(new _FixedSizeList())!); + case Type.Map: return mapFromField(f.type(new _Map())!); + } + throw new Error(`Unrecognized type ${f.typeType()}`); +} + +function nullFromField(_type: _Null) { return new Null(); } +function intFromField(_type: _Int) { return new Int(_type.isSigned(), _type.bitWidth() as IntBitWidth); } +function floatingPointFromField(_type: _FloatingPoint) { return new FloatingPoint(_type.precision()); } +function binaryFromField(_type: _Binary) { return new Binary(); } +function utf8FromField(_type: _Utf8) { return new Utf8(); } +function boolFromField(_type: _Bool) { return new Bool(); } +function decimalFromField(_type: _Decimal) { return new Decimal(_type.scale(), _type.precision()); } +function dateFromField(_type: _Date) { return new Date(_type.unit()); } +function timeFromField(_type: _Time) { return new Time(_type.unit(), _type.bitWidth() as TimeBitWidth); } +function timestampFromField(_type: _Timestamp) { return new Timestamp(_type.unit(), _type.timezone()); } +function intervalFromField(_type: _Interval) { return new Interval(_type.unit()); } +function listFromField(_type: _List) { return new List(); } +function structFromField(_type: _Struct) { return new Struct(); } +function unionFromField(_type: _Union) { return new Union(_type.mode(), (_type.typeIdsArray() || []) as Type[]); } +function fixedSizeBinaryFromField(_type: _FixedSizeBinary) { return new FixedSizeBinary(_type.byteWidth()); } +function fixedSizeListFromField(_type: _FixedSizeList) { return new 
FixedSizeList(_type.listSize()); } +function mapFromField(_type: _Map) { return new Map_(_type.keysSorted()); } diff --git a/js/src/format/File_generated.ts b/js/src/format/fb/File.ts similarity index 99% rename from js/src/format/File_generated.ts rename to js/src/format/fb/File.ts index d0b774ae34095..56f50ed20e936 100644 --- a/js/src/format/File_generated.ts +++ b/js/src/format/fb/File.ts @@ -1,7 +1,7 @@ // automatically generated by the FlatBuffers compiler, do not modify import { flatbuffers } from 'flatbuffers'; -import * as NS16187549871986683199 from './Schema_generated'; +import * as NS16187549871986683199 from './Schema'; /** * ---------------------------------------------------------------------- * Arrow File metadata diff --git a/js/closure-compiler-scripts/File_generated.js b/js/src/format/fb/File_generated.js similarity index 95% rename from js/closure-compiler-scripts/File_generated.js rename to js/src/format/fb/File_generated.js index bb82cc4ccc6e4..12aae293ea4eb 100644 --- a/js/closure-compiler-scripts/File_generated.js +++ b/js/src/format/fb/File_generated.js @@ -1,12 +1,5 @@ +import { org } from './Schema'; // automatically generated by the FlatBuffers compiler, do not modify -goog.module("module$targets$es5$cls$format$File_generated"); -goog.module.declareLegacyNamespace(); -var Schema_ = goog.require("module$targets$es5$cls$format$Schema_generated"); -/** - * @const - * @namespace - */ -var org = Schema_.org; /** * @const @@ -259,6 +252,5 @@ org.apache.arrow.flatbuf.Block.createBlock = function(builder, offset, metaDataL builder.writeInt64(offset); return builder.offset(); }; +export { org }; -// Exports for Node.js and RequireJS -exports.org = org; diff --git a/js/src/format/Message_generated.ts b/js/src/format/fb/Message.ts similarity index 94% rename from js/src/format/Message_generated.ts rename to js/src/format/fb/Message.ts index daa781f9b9290..4610fbef2e1c8 100644 --- a/js/src/format/Message_generated.ts +++ b/js/src/format/fb/Message.ts 
@@ -1,7 +1,7 @@ // automatically generated by the FlatBuffers compiler, do not modify import { flatbuffers } from 'flatbuffers'; -import * as NS16187549871986683199 from './Schema_generated'; +import * as NS16187549871986683199 from './Schema'; export namespace org.apache.arrow.flatbuf { export import Schema = NS16187549871986683199.org.apache.arrow.flatbuf.Schema; } @@ -181,7 +181,7 @@ export namespace org.apache.arrow.flatbuf { */ buffers(index: number, obj?: NS16187549871986683199.org.apache.arrow.flatbuf.Buffer): NS16187549871986683199.org.apache.arrow.flatbuf.Buffer | null { let offset = this.bb.__offset(this.bb_pos, 8); - return offset ? (obj || new NS16187549871986683199.org.apache.arrow.flatbuf.Buffer).__init(this.bb.__vector(this.bb_pos + offset) + index * 24, this.bb) : null; + return offset ? (obj || new NS16187549871986683199.org.apache.arrow.flatbuf.Buffer).__init(this.bb.__vector(this.bb_pos + offset) + index * 16, this.bb) : null; } /** @@ -236,7 +236,7 @@ export namespace org.apache.arrow.flatbuf { * @param {number} numElems */ static startBuffersVector(builder: flatbuffers.Builder, numElems: number) { - builder.startVector(24, numElems, 8); + builder.startVector(16, numElems, 8); } /** @@ -251,12 +251,12 @@ export namespace org.apache.arrow.flatbuf { } } /** - * ---------------------------------------------------------------------- * For sending dictionary encoding information. Any Field can be * dictionary-encoded, but in this case none of its children may be * dictionary-encoded. - * There is one vector / column per dictionary - * + * There is one vector / column per dictionary, but that vector / column + * may be spread across multiple dictionary batches by using the isDelta + * flag * * @constructor */ @@ -308,11 +308,22 @@ export namespace org.apache.arrow.flatbuf { return offset ? 
(obj || new org.apache.arrow.flatbuf.RecordBatch).__init(this.bb.__indirect(this.bb_pos + offset), this.bb) : null; } + /** + * If isDelta is true the values in the dictionary are to be appended to a + * dictionary with the indicated id + * + * @returns {boolean} + */ + isDelta(): boolean { + let offset = this.bb.__offset(this.bb_pos, 8); + return offset ? !!this.bb.readInt8(this.bb_pos + offset) : false; + } + /** * @param {flatbuffers.Builder} builder */ static startDictionaryBatch(builder: flatbuffers.Builder) { - builder.startObject(2); + builder.startObject(3); } /** @@ -331,6 +342,14 @@ export namespace org.apache.arrow.flatbuf { builder.addFieldOffset(1, dataOffset, 0); } + /** + * @param {flatbuffers.Builder} builder + * @param {boolean} isDelta + */ + static addIsDelta(builder: flatbuffers.Builder, isDelta: boolean) { + builder.addFieldInt8(2, +isDelta, +false); + } + /** * @param {flatbuffers.Builder} builder * @returns {flatbuffers.Offset} diff --git a/js/closure-compiler-scripts/Message_generated.js b/js/src/format/fb/Message_generated.js similarity index 93% rename from js/closure-compiler-scripts/Message_generated.js rename to js/src/format/fb/Message_generated.js index 0c1a1a99d74ca..ef46c98057c9f 100644 --- a/js/closure-compiler-scripts/Message_generated.js +++ b/js/src/format/fb/Message_generated.js @@ -1,12 +1,5 @@ +import { org } from './Schema'; // automatically generated by the FlatBuffers compiler, do not modify -goog.module("module$targets$es5$cls$format$Message_generated"); -goog.module.declareLegacyNamespace(); -var Schema_ = goog.require("module$targets$es5$cls$format$Schema_generated"); -/** - * @const - * @namespace - */ -var org = Schema_.org; /** * @const @@ -200,7 +193,7 @@ org.apache.arrow.flatbuf.RecordBatch.prototype.nodesLength = function() { */ org.apache.arrow.flatbuf.RecordBatch.prototype.buffers = function(index, obj) { var offset = this.bb.__offset(this.bb_pos, 8); - return offset ? 
(obj || new org.apache.arrow.flatbuf.Buffer).__init(this.bb.__vector(this.bb_pos + offset) + index * 24, this.bb) : null; + return offset ? (obj || new org.apache.arrow.flatbuf.Buffer).__init(this.bb.__vector(this.bb_pos + offset) + index * 16, this.bb) : null; }; /** @@ -255,7 +248,7 @@ org.apache.arrow.flatbuf.RecordBatch.addBuffers = function(builder, buffersOffse * @param {number} numElems */ org.apache.arrow.flatbuf.RecordBatch.startBuffersVector = function(builder, numElems) { - builder.startVector(24, numElems, 8); + builder.startVector(16, numElems, 8); }; /** @@ -268,12 +261,12 @@ org.apache.arrow.flatbuf.RecordBatch.endRecordBatch = function(builder) { }; /** - * ---------------------------------------------------------------------- * For sending dictionary encoding information. Any Field can be * dictionary-encoded, but in this case none of its children may be * dictionary-encoded. - * There is one vector / column per dictionary - * + * There is one vector / column per dictionary, but that vector / column + * may be spread across multiple dictionary batches by using the isDelta + * flag * * @constructor */ @@ -326,11 +319,22 @@ org.apache.arrow.flatbuf.DictionaryBatch.prototype.data = function(obj) { return offset ? (obj || new org.apache.arrow.flatbuf.RecordBatch).__init(this.bb.__indirect(this.bb_pos + offset), this.bb) : null; }; +/** + * If isDelta is true the values in the dictionary are to be appended to a + * dictionary with the indicated id + * + * @returns {boolean} + */ +org.apache.arrow.flatbuf.DictionaryBatch.prototype.isDelta = function() { + var offset = this.bb.__offset(this.bb_pos, 8); + return offset ? 
!!this.bb.readInt8(this.bb_pos + offset) : false; +}; + /** * @param {flatbuffers.Builder} builder */ org.apache.arrow.flatbuf.DictionaryBatch.startDictionaryBatch = function(builder) { - builder.startObject(2); + builder.startObject(3); }; /** @@ -349,6 +353,14 @@ org.apache.arrow.flatbuf.DictionaryBatch.addData = function(builder, dataOffset) builder.addFieldOffset(1, dataOffset, 0); }; +/** + * @param {flatbuffers.Builder} builder + * @param {boolean} isDelta + */ +org.apache.arrow.flatbuf.DictionaryBatch.addIsDelta = function(builder, isDelta) { + builder.addFieldInt8(2, +isDelta, +false); +}; + /** * @param {flatbuffers.Builder} builder * @returns {flatbuffers.Offset} @@ -481,6 +493,5 @@ org.apache.arrow.flatbuf.Message.endMessage = function(builder) { org.apache.arrow.flatbuf.Message.finishMessageBuffer = function(builder, offset) { builder.finish(offset); }; +export { org }; -// Exports for Node.js and RequireJS -exports.org = org; diff --git a/js/src/format/Schema_generated.ts b/js/src/format/fb/Schema.ts similarity index 99% rename from js/src/format/Schema_generated.ts rename to js/src/format/fb/Schema.ts index 65493b7f685ec..d9b45ed20089c 100644 --- a/js/src/format/Schema_generated.ts +++ b/js/src/format/fb/Schema.ts @@ -7,9 +7,25 @@ import { flatbuffers } from 'flatbuffers'; */ export namespace org.apache.arrow.flatbuf { export enum MetadataVersion { + /** + * 0.1.0 + */ V1 = 0, + + /** + * 0.2.0 + */ V2 = 1, - V3 = 2 + + /** + * 0.3.0 -> 0.7.1 + */ + V3 = 2, + + /** + * >= 0.8.0 + */ + V4 = 3 } } @@ -2027,16 +2043,6 @@ export namespace org.apache.arrow.flatbuf { return this; } - /** - * The shared memory page id where this buffer is located. 
Currently this is - * not used - * - * @returns {number} - */ - page(): number { - return this.bb.readInt32(this.bb_pos); - } - /** * The relative offset into the shared memory page where the bytes for this * buffer starts @@ -2044,7 +2050,7 @@ export namespace org.apache.arrow.flatbuf { * @returns {flatbuffers.Long} */ offset(): flatbuffers.Long { - return this.bb.readInt64(this.bb_pos + 8); + return this.bb.readInt64(this.bb_pos); } /** @@ -2054,22 +2060,19 @@ export namespace org.apache.arrow.flatbuf { * @returns {flatbuffers.Long} */ length(): flatbuffers.Long { - return this.bb.readInt64(this.bb_pos + 16); + return this.bb.readInt64(this.bb_pos + 8); } /** * @param {flatbuffers.Builder} builder - * @param {number} page * @param {flatbuffers.Long} offset * @param {flatbuffers.Long} length * @returns {flatbuffers.Offset} */ - static createBuffer(builder: flatbuffers.Builder, page: number, offset: flatbuffers.Long, length: flatbuffers.Long): flatbuffers.Offset { - builder.prep(8, 24); + static createBuffer(builder: flatbuffers.Builder, offset: flatbuffers.Long, length: flatbuffers.Long): flatbuffers.Offset { + builder.prep(8, 16); builder.writeInt64(length); builder.writeInt64(offset); - builder.pad(4); - builder.writeInt32(page); return builder.offset(); } diff --git a/js/closure-compiler-scripts/Schema_generated.js b/js/src/format/fb/Schema_generated.js similarity index 97% rename from js/closure-compiler-scripts/Schema_generated.js rename to js/src/format/fb/Schema_generated.js index 5b76443886543..ebed8a90645c8 100644 --- a/js/closure-compiler-scripts/Schema_generated.js +++ b/js/src/format/fb/Schema_generated.js @@ -1,6 +1,4 @@ // automatically generated by the FlatBuffers compiler, do not modify -goog.module("module$targets$es5$cls$format$Schema_generated"); -goog.module.declareLegacyNamespace(); /** * @const @@ -30,52 +28,68 @@ org.apache.arrow.flatbuf = org.apache.arrow.flatbuf || {}; * @enum */ org.apache.arrow.flatbuf.MetadataVersion = { - V1: 0, 0: 
'V1', - V2: 1, 1: 'V2', - V3: 2, 2: 'V3', + /** + * 0.1.0 + */ + 'V1': 0, 0: 'V1', + + /** + * 0.2.0 + */ + 'V2': 1, 1: 'V2', + + /** + * 0.3.0 -> 0.7.1 + */ + 'V3': 2, 2: 'V3', + + /** + * >= 0.8.0 + */ + 'V4': 3, 3: 'V4' }; /** * @enum */ org.apache.arrow.flatbuf.UnionMode = { - Sparse: 0, 0: 'Sparse', - Dense: 1, 1: 'Dense', + 'Sparse': 0, 0: 'Sparse', + 'Dense': 1, 1: 'Dense', }; /** * @enum */ org.apache.arrow.flatbuf.Precision = { - HALF: 0, 0: 'HALF', - SINGLE: 1, 1: 'SINGLE', - DOUBLE: 2, 2: 'DOUBLE', + 'HALF': 0, 0: 'HALF', + 'SINGLE': 1, 1: 'SINGLE', + 'DOUBLE': 2, 2: 'DOUBLE', }; /** * @enum */ org.apache.arrow.flatbuf.DateUnit = { - DAY: 0, 0: 'DAY', - MILLISECOND: 1, 1: 'MILLISECOND', + 'DAY': 0, 0: 'DAY', + 'MILLISECOND': 1, 1: 'MILLISECOND', }; /** * @enum */ org.apache.arrow.flatbuf.TimeUnit = { - SECOND: 0, 0: 'SECOND', - MILLISECOND: 1, 1: 'MILLISECOND', - MICROSECOND: 2, 2: 'MICROSECOND', - NANOSECOND: 3, 3: 'NANOSECOND', + 'SECOND': 0, 0: 'SECOND', + 'MILLISECOND': 1, 1: 'MILLISECOND', + 'MICROSECOND': 2, 2: 'MICROSECOND', + 'NANOSECOND': 3, 3: 'NANOSECOND', }; /** * @enum */ org.apache.arrow.flatbuf.IntervalUnit = { - YEAR_MONTH: 0, 0: 'YEAR_MONTH', - DAY_TIME: 1, 1: 'DAY_TIME', + 'YEAR_MONTH': 0, 0: 'YEAR_MONTH', + 'DAY_TIME': 1, 1: 'DAY_TIME', }; /** @@ -86,24 +100,24 @@ org.apache.arrow.flatbuf.IntervalUnit = { * @enum */ org.apache.arrow.flatbuf.Type = { - NONE: 0, 0: 'NONE', - Null: 1, 1: 'Null', - Int: 2, 2: 'Int', - FloatingPoint: 3, 3: 'FloatingPoint', - Binary: 4, 4: 'Binary', - Utf8: 5, 5: 'Utf8', - Bool: 6, 6: 'Bool', - Decimal: 7, 7: 'Decimal', - Date: 8, 8: 'Date', - Time: 9, 9: 'Time', - Timestamp: 10, 10: 'Timestamp', - Interval: 11, 11: 'Interval', - List: 12, 12: 'List', - Struct_: 13, 13: 'Struct_', - Union: 14, 14: 'Union', - FixedSizeBinary: 15, 15: 'FixedSizeBinary', - FixedSizeList: 16, 16: 'FixedSizeList', - Map: 17, 17: 'Map', + 'NONE': 0, 0: 'NONE', + 'Null': 1, 1: 'Null', + 'Int': 2, 2: 'Int', + 'FloatingPoint': 3, 3: 
'FloatingPoint', + 'Binary': 4, 4: 'Binary', + 'Utf8': 5, 5: 'Utf8', + 'Bool': 6, 6: 'Bool', + 'Decimal': 7, 7: 'Decimal', + 'Date': 8, 8: 'Date', + 'Time': 9, 9: 'Time', + 'Timestamp': 10, 10: 'Timestamp', + 'Interval': 11, 11: 'Interval', + 'List': 12, 12: 'List', + 'Struct_': 13, 13: 'Struct_', + 'Union': 14, 14: 'Union', + 'FixedSizeBinary': 15, 15: 'FixedSizeBinary', + 'FixedSizeList': 16, 16: 'FixedSizeList', + 'Map': 17, 17: 'Map' }; /** @@ -116,22 +130,22 @@ org.apache.arrow.flatbuf.VectorType = { /** * used in List type, Dense Union and variable length primitive types (String, Binary) */ - OFFSET: 0, 0: 'OFFSET', + 'OFFSET': 0, 0: 'OFFSET', /** * actual data, either wixed width primitive types in slots or variable width delimited by an OFFSET vector */ - DATA: 1, 1: 'DATA', + 'DATA': 1, 1: 'DATA', /** * Bit vector indicating if each value is null */ - VALIDITY: 2, 2: 'VALIDITY', + 'VALIDITY': 2, 2: 'VALIDITY', /** * Type vector used in Union type */ - TYPE: 3, 3: 'TYPE', + 'TYPE': 3, 3: 'TYPE' }; /** @@ -141,8 +155,8 @@ org.apache.arrow.flatbuf.VectorType = { * @enum */ org.apache.arrow.flatbuf.Endianness = { - Little: 0, 0: 'Little', - Big: 1, 1: 'Big', + 'Little': 0, 0: 'Little', + 'Big': 1, 1: 'Big', }; /** @@ -2005,16 +2019,6 @@ org.apache.arrow.flatbuf.Buffer.prototype.__init = function(i, bb) { return this; }; -/** - * The shared memory page id where this buffer is located. 
Currently this is - * not used - * - * @returns {number} - */ -org.apache.arrow.flatbuf.Buffer.prototype.page = function() { - return this.bb.readInt32(this.bb_pos); -}; - /** * The relative offset into the shared memory page where the bytes for this * buffer starts @@ -2022,7 +2026,7 @@ org.apache.arrow.flatbuf.Buffer.prototype.page = function() { * @returns {flatbuffers.Long} */ org.apache.arrow.flatbuf.Buffer.prototype.offset = function() { - return this.bb.readInt64(this.bb_pos + 8); + return this.bb.readInt64(this.bb_pos); }; /** @@ -2032,22 +2036,19 @@ org.apache.arrow.flatbuf.Buffer.prototype.offset = function() { * @returns {flatbuffers.Long} */ org.apache.arrow.flatbuf.Buffer.prototype.length = function() { - return this.bb.readInt64(this.bb_pos + 16); + return this.bb.readInt64(this.bb_pos + 8); }; /** * @param {flatbuffers.Builder} builder - * @param {number} page * @param {flatbuffers.Long} offset * @param {flatbuffers.Long} length * @returns {flatbuffers.Offset} */ -org.apache.arrow.flatbuf.Buffer.createBuffer = function(builder, page, offset, length) { - builder.prep(8, 24); +org.apache.arrow.flatbuf.Buffer.createBuffer = function(builder, offset, length) { + builder.prep(8, 16); builder.writeInt64(length); builder.writeInt64(offset); - builder.pad(4); - builder.writeInt32(page); return builder.offset(); }; @@ -2226,6 +2227,5 @@ org.apache.arrow.flatbuf.Schema.endSchema = function(builder) { org.apache.arrow.flatbuf.Schema.finishSchemaBuffer = function(builder, offset) { builder.finish(offset); }; +export { org }; -// Exports for Node.js and RequireJS -exports.org = org; diff --git a/js/src/format/json.ts b/js/src/format/json.ts new file mode 100644 index 0000000000000..3da3db6d5fea3 --- /dev/null +++ b/js/src/format/json.ts @@ -0,0 +1,173 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import * as Schema_ from './fb/Schema'; +import { flatbuffers } from 'flatbuffers'; +import Long = flatbuffers.Long; +import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; +import Type = Schema_.org.apache.arrow.flatbuf.Type; +import DateUnit = Schema_.org.apache.arrow.flatbuf.DateUnit; +import TimeUnit = Schema_.org.apache.arrow.flatbuf.TimeUnit; +import Precision = Schema_.org.apache.arrow.flatbuf.Precision; +import IntervalUnit = Schema_.org.apache.arrow.flatbuf.IntervalUnit; +import { + IntBitWidth, TimeBitWidth, + Schema, RecordBatch, DictionaryBatch, Field, DictionaryEncoding, Buffer, FieldNode, + Null, Int, FloatingPoint, Binary, Bool, Utf8, Decimal, Date, Time, Timestamp, Interval, List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, +} from './types'; + +export function schemaFromJSON(s: any): Schema { + // todo: metadataFromJSON + return new Schema( + MetadataVersion.V4, + fieldsFromJSON(s['fields']), + customMetadata(s['customMetadata']) + ); +} + +export function recordBatchFromJSON(b: any): RecordBatch { + return new RecordBatch( + MetadataVersion.V4, + new Long(b['count'], 0), + fieldNodesFromJSON(b['columns']), + buffersFromJSON(b['columns']) + ); +} + +export function dictionaryBatchFromJSON(b: any): DictionaryBatch { + return 
new DictionaryBatch( + MetadataVersion.V4, + recordBatchFromJSON(b['data']), + new Long(b['id'], 0), b['isDelta'] + ); +} + +function fieldsFromJSON(fs: any[]): Field[] { + return (fs || []).map(fieldFromJSON); +} + +function fieldNodesFromJSON(xs: any[]): FieldNode[] { + return (xs || []).reduce((fieldNodes, column: any) => [ + ...fieldNodes, + new FieldNode( + new Long(column['count'], 0), + new Long(nullCountFromJSON(column['VALIDITY']), 0) + ), + ...fieldNodesFromJSON(column['children']) + ], [] as FieldNode[]); +} + +function buffersFromJSON(xs: any[], buffers: Buffer[] = []): Buffer[] { + for (let i = -1, n = (xs || []).length; ++i < n;) { + const column = xs[i]; + column['VALIDITY'] && buffers.push(new Buffer(new Long(buffers.length, 0), new Long(column['VALIDITY'].length, 0))); + column['OFFSET'] && buffers.push(new Buffer(new Long(buffers.length, 0), new Long(column['OFFSET'].length, 0))); + column['DATA'] && buffers.push(new Buffer(new Long(buffers.length, 0), new Long(column['DATA'].length, 0))); + buffers = buffersFromJSON(column['children'], buffers); + } + return buffers; +} + +function nullCountFromJSON(validity: number[]) { + return (validity || []).reduce((sum, val) => sum + +(val === 0), 0); +} + +function fieldFromJSON(f: any) { + return new Field( + f['name'], + typeFromJSON(f['type']), + namesToTypeMap[f['type']['name']], + f.nullable, + fieldsFromJSON(f['children']), + customMetadata(f['customMetadata']), + dictionaryEncodingFromJSON(f['dictionary']) + ); +} + +function dictionaryEncodingFromJSON(d: any) { + return !d ? null : new DictionaryEncoding( + d.indexType ? 
intFromJSON(d.indexType) : null, + new Long(d.id, 0), d.isOrdered + ); +} + +function customMetadata(metadata?: any) { + return new Map(Object.entries(metadata || {})); +} + +const namesToTypeMap: { [n: string]: Type } = { + 'NONE': Type.NONE, + 'null': Type.Null, + 'int': Type.Int, + 'floatingpoint': Type.FloatingPoint, + 'binary': Type.Binary, + 'bool': Type.Bool, + 'utf8': Type.Utf8, + 'decimal': Type.Decimal, + 'date': Type.Date, + 'time': Type.Time, + 'timestamp': Type.Timestamp, + 'interval': Type.Interval, + 'list': Type.List, + 'struct': Type.Struct_, + 'union': Type.Union, + 'fixedsizebinary': Type.FixedSizeBinary, + 'fixedsizelist': Type.FixedSizeList, + 'map': Type.Map, +}; + +function typeFromJSON(t: any) { + switch (namesToTypeMap[t['name']]) { + case Type.NONE: return nullFromJSON(t); + case Type.Null: return nullFromJSON(t); + case Type.Int: return intFromJSON(t); + case Type.FloatingPoint: return floatingPointFromJSON(t); + case Type.Binary: return binaryFromJSON(t); + case Type.Utf8: return utf8FromJSON(t); + case Type.Bool: return boolFromJSON(t); + case Type.Decimal: return decimalFromJSON(t); + case Type.Date: return dateFromJSON(t); + case Type.Time: return timeFromJSON(t); + case Type.Timestamp: return timestampFromJSON(t); + case Type.Interval: return intervalFromJSON(t); + case Type.List: return listFromJSON(t); + case Type.Struct_: return structFromJSON(t); + case Type.Union: return unionFromJSON(t); + case Type.FixedSizeBinary: return fixedSizeBinaryFromJSON(t); + case Type.FixedSizeList: return fixedSizeListFromJSON(t); + case Type.Map: return mapFromJSON(t); + } + throw new Error(`Unrecognized type ${t['name']}`); +} + +function nullFromJSON(_type: any) { return new Null(); } +function intFromJSON(_type: any) { return new Int(_type['isSigned'], _type['bitWidth'] as IntBitWidth); } +function floatingPointFromJSON(_type: any) { return new FloatingPoint(Precision[_type['precision']] as any); } +function binaryFromJSON(_type: any) { return 
new Binary(); } +function utf8FromJSON(_type: any) { return new Utf8(); } +function boolFromJSON(_type: any) { return new Bool(); } +function decimalFromJSON(_type: any) { return new Decimal(_type['scale'], _type['precision']); } +function dateFromJSON(_type: any) { return new Date(DateUnit[_type['unit']] as any); } +function timeFromJSON(_type: any) { return new Time(TimeUnit[_type['unit']] as any, _type['bitWidth'] as TimeBitWidth); } +function timestampFromJSON(_type: any) { return new Timestamp(TimeUnit[_type['unit']] as any, _type['timezone']); } +function intervalFromJSON(_type: any) { return new Interval(IntervalUnit[_type['unit']] as any); } +function listFromJSON(_type: any) { return new List(); } +function structFromJSON(_type: any) { return new Struct(); } +function unionFromJSON(_type: any) { return new Union(_type['mode'], (_type['typeIdsArray'] || []) as Type[]); } +function fixedSizeBinaryFromJSON(_type: any) { return new FixedSizeBinary(_type['byteWidth']); } +function fixedSizeListFromJSON(_type: any) { return new FixedSizeList(_type['listSize']); } +function mapFromJSON(_type: any) { return new Map_(_type['keysSorted']); } diff --git a/js/src/format/types.ts b/js/src/format/types.ts new file mode 100644 index 0000000000000..09df8ccbbdf7c --- /dev/null +++ b/js/src/format/types.ts @@ -0,0 +1,393 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/* tslint:disable:class-name */ + +import { align } from '../util/layout'; +import * as Schema_ from './fb/Schema'; +import * as Message_ from './fb/Message'; +import { flatbuffers } from 'flatbuffers'; +import Long = flatbuffers.Long; +import Type = Schema_.org.apache.arrow.flatbuf.Type; +import DateUnit = Schema_.org.apache.arrow.flatbuf.DateUnit; +import TimeUnit = Schema_.org.apache.arrow.flatbuf.TimeUnit; +import Precision = Schema_.org.apache.arrow.flatbuf.Precision; +import UnionMode = Schema_.org.apache.arrow.flatbuf.UnionMode; +import Endianness = Schema_.org.apache.arrow.flatbuf.Endianness; +import IntervalUnit = Schema_.org.apache.arrow.flatbuf.IntervalUnit; +import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader; +import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; + +export type IntBitWidth = 8 | 16 | 32 | 64; +export type TimeBitWidth = IntBitWidth | 128; + +export interface VisitorNode { + accept(visitor: Visitor): any; +} + +export abstract class Visitor { + visit(node: VisitorNode): T { + return node.accept(this); + } + visitMany(nodes: VisitorNode[]): T[] { + return nodes.map((node) => this.visit(node)); + } + abstract visitFooter(node: Footer): any; + abstract visitBlock(node: Block): any; + abstract visitMessage(node: Message): any; + abstract visitSchema(node: Schema): any; + abstract visitField(node: Field): any; + abstract visitBuffer(node: Buffer): any; + abstract visitFieldNode(node: FieldNode): any; + abstract visitRecordBatch(node: RecordBatch): any; + abstract 
visitDictionaryBatch(node: DictionaryBatch): any; + abstract visitDictionaryEncoding(node: DictionaryEncoding): any; + abstract visitNullFieldType(node: Null): any; + abstract visitIntFieldType(node: Int): any; + abstract visitFloatingPointFieldType(node: FloatingPoint): any; + abstract visitBinaryFieldType(node: Binary): any; + abstract visitBoolFieldType(node: Bool): any; + abstract visitUtf8FieldType(node: Utf8): any; + abstract visitDecimalFieldType(node: Decimal): any; + abstract visitDateFieldType(node: Date): any; + abstract visitTimeFieldType(node: Time): any; + abstract visitTimestampFieldType(node: Timestamp): any; + abstract visitIntervalFieldType(node: Interval): any; + abstract visitListFieldType(node: List): any; + abstract visitStructFieldType(node: Struct): any; + abstract visitUnionFieldType(node: Union): any; + abstract visitFixedSizeBinaryFieldType(node: FixedSizeBinary): any; + abstract visitFixedSizeListFieldType(node: FixedSizeList): any; + abstract visitMapFieldType(node: Map_): any; +} + +export class Footer implements VisitorNode { + constructor(public dictionaryBatches: Block[], public recordBatches: Block[], public schema: Schema) {} + accept(visitor: Visitor): any { + return visitor.visitFooter(this); + } +} + +export class Block implements VisitorNode { + constructor(public metaDataLength: number, public bodyLength: Long, public offset: Long) {} + accept(visitor: Visitor): any { + return visitor.visitBlock(this); + } +} + +export class Message implements VisitorNode { + constructor(public version: MetadataVersion, public bodyLength: Long, public headerType: MessageHeader) {} + isSchema(): this is Schema { return this.headerType === MessageHeader.Schema; } + isRecordBatch(): this is RecordBatch { return this.headerType === MessageHeader.RecordBatch; } + isDictionaryBatch(): this is DictionaryBatch { return this.headerType === MessageHeader.DictionaryBatch; } + accept(visitor: Visitor): any { + visitor.visitMessage(this); + } +} + +export 
class Schema extends Message { + public dictionaries: Map; + constructor(version: MetadataVersion, public fields: Field[], public customMetadata?: Map, public endianness = Endianness.Little) { + super(version, Long.ZERO, MessageHeader.Schema); + const dictionaries = [] as Field[]; + for (let f: Field, i = -1, n = fields.length; ++i < n;) { + if ((f = fields[i])) { + f.dictionary && dictionaries.push(f); + dictionaries.push(...f.dictionaries); + } + } + this.dictionaries = new Map(dictionaries.map<[string, Field]>((f) => [ + f.dictionary!.dictionaryId.toFloat64().toString(), f + ])); + } + accept(visitor: Visitor): any { + return visitor.visitSchema(this); + } +} + +export class RecordBatch extends Message { + constructor(version: MetadataVersion, public length: Long, public fieldNodes: FieldNode[], public buffers: Buffer[]) { + super(version, new Long(buffers.reduce((s, b) => align(s + b.length.low + (b.offset.low - s), 8), 0), 0), MessageHeader.RecordBatch); + } + accept(visitor: Visitor) { + return visitor.visitRecordBatch(this); + } +} + +export class DictionaryBatch extends Message { + constructor(version: MetadataVersion, public dictionary: RecordBatch, public dictionaryId: Long, public isDelta: boolean) { + super(version, dictionary.bodyLength, MessageHeader.DictionaryBatch); + } + get fieldNodes(): FieldNode[] { return this.dictionary.fieldNodes; } + get buffers(): Buffer[] { return this.dictionary.buffers; } + accept(visitor: Visitor) { + return visitor.visitDictionaryBatch(this); + } + static atomicDictionaryId = 0; +} + +export class Field implements VisitorNode { + public dictionaries: Field[]; + constructor(public name: string, + public type: FieldType, + public typeType: Type, + public nullable = false, + public children: Field[] = [], + public metadata?: Map | null, + public dictionary?: DictionaryEncoding | null) { + const dictionaries = [] as Field[]; + for (let f: Field, i = -1, n = children.length; ++i < n;) { + if ((f = children[i])) { + 
f.dictionary && dictionaries.push(f); + dictionaries.push(...f.dictionaries); + } + } + this.dictionaries = dictionaries; + } + accept(visitor: Visitor): any { + return visitor.visitField(this); + } + indexField() { + return !this.dictionary ? this : new Field( + this.name, + this.dictionary.indexType, this.dictionary.indexType.type, + this.nullable, this.children, this.metadata, this.dictionary + ); + } + toString() { return `Field name[${this.name}], nullable[${this.nullable}], type[${this.type.toString()}]`; } +} + +export class Buffer implements VisitorNode { + constructor(public offset: Long, public length: Long) {} + accept(visitor: Visitor) { + return visitor.visitBuffer(this); + } +} + +export class FieldNode implements VisitorNode { + constructor(public length: Long, public nullCount: Long) {} + accept(visitor: Visitor) { + return visitor.visitFieldNode(this); + } +} + +export class DictionaryEncoding implements VisitorNode { + public isOrdered: boolean; + public dictionaryId: Long; + public indexType: Int; + constructor(indexType?: Int | null, dictionaryId?: Long | null, isOrdered?: boolean | null) { + this.isOrdered = isOrdered || false; + /* a dictionary index defaults to signed 32 bit int if unspecified */ + this.indexType = indexType || new Int(true, 32); + this.dictionaryId = dictionaryId || new Long(DictionaryBatch.atomicDictionaryId++, 0); + } + accept(visitor: Visitor): any { + return visitor.visitDictionaryEncoding(this); + } +} + +export abstract class FieldType implements VisitorNode { + constructor(public type: Type) {} + abstract accept(visitor: Visitor): any; + isNull(): this is Null { return this.type === Type.Null; } + isInt(): this is Int { return this.type === Type.Int; } + isFloatingPoint(): this is FloatingPoint { return this.type === Type.FloatingPoint; } + isBinary(): this is Binary { return this.type === Type.Binary; } + isUtf8(): this is Utf8 { return this.type === Type.Utf8; } + isBool(): this is Bool { return this.type === 
Type.Bool; } + isDecimal(): this is Decimal { return this.type === Type.Decimal; } + isDate(): this is Date { return this.type === Type.Date; } + isTime(): this is Time { return this.type === Type.Time; } + isTimestamp(): this is Timestamp { return this.type === Type.Timestamp; } + isInterval(): this is Interval { return this.type === Type.Interval; } + isList(): this is List { return this.type === Type.List; } + isStruct(): this is Struct { return this.type === Type.Struct_; } + isUnion(): this is Union { return this.type === Type.Union; } + isFixedSizeBinary(): this is FixedSizeBinary { return this.type === Type.FixedSizeBinary; } + isFixedSizeList(): this is FixedSizeList { return this.type === Type.FixedSizeList; } + isMap(): this is Map_ { return this.type === Type.Map; } +} + +export class Null extends FieldType { + toString() { return `Null`; } + constructor() { + super(Type.Null); + } + accept(visitor: Visitor) { + return visitor.visitNullFieldType(this); + } +} + +export class Int extends FieldType { + toString() { return `Int isSigned[${this.isSigned}], bitWidth[${this.bitWidth}]`; } + constructor(public isSigned: boolean, public bitWidth: IntBitWidth) { + super(Type.Int); + } + accept(visitor: Visitor) { + return visitor.visitIntFieldType(this); + } +} + +export class FloatingPoint extends FieldType { + toString() { return `FloatingPoint precision`; } + constructor(public precision: Precision) { + super(Type.FloatingPoint); + } + accept(visitor: Visitor) { + return visitor.visitFloatingPointFieldType(this); + } +} + +export class Binary extends FieldType { + toString() { return `Binary`; } + constructor() { + super(Type.Binary); + } + accept(visitor: Visitor) { + return visitor.visitBinaryFieldType(this); + } +} + +export class Utf8 extends FieldType { + toString() { return `Utf8`; } + constructor() { + super(Type.Utf8); + } + accept(visitor: Visitor) { + return visitor.visitUtf8FieldType(this); + } +} + +export class Bool extends FieldType { + 
toString() { return `Bool`; } + constructor() { + super(Type.Bool); + } + accept(visitor: Visitor) { + return visitor.visitBoolFieldType(this); + } +} + +export class Decimal extends FieldType { + toString() { return `Decimal scale[${this.scale}], precision[${this.precision}]`; } + constructor(public scale: number, public precision: number) { + super(Type.Decimal); + } + accept(visitor: Visitor) { + return visitor.visitDecimalFieldType(this); + } +} + +export class Date extends FieldType { + toString() { return `Date unit[${this.unit}]`; } + constructor(public unit: DateUnit) { + super(Type.Date); + } + accept(visitor: Visitor) { + return visitor.visitDateFieldType(this); + } +} + +export class Time extends FieldType { + toString() { return `Time unit[${this.unit}], bitWidth[${this.bitWidth}]`; } + constructor(public unit: TimeUnit, public bitWidth: TimeBitWidth) { + super(Type.Time); + } + accept(visitor: Visitor) { + return visitor.visitTimeFieldType(this); + } +} + +export class Timestamp extends FieldType { + toString() { return `Timestamp unit[${this.unit}], timezone[${this.timezone}]`; } + constructor(public unit: TimeUnit, public timezone?: string | null) { + super(Type.Timestamp); + } + accept(visitor: Visitor) { + return visitor.visitTimestampFieldType(this); + } +} + +export class Interval extends FieldType { + toString() { return `Interval unit[${this.unit}]`; } + constructor(public unit: IntervalUnit) { + super(Type.Interval); + } + accept(visitor: Visitor) { + return visitor.visitIntervalFieldType(this); + } +} + +export class List extends FieldType { + toString() { return `List`; } + constructor() { + super(Type.List); + } + accept(visitor: Visitor) { + return visitor.visitListFieldType(this); + } +} + +export class Struct extends FieldType { + toString() { return `Struct`; } + constructor() { + super(Type.Struct_); + } + accept(visitor: Visitor) { + return visitor.visitStructFieldType(this); + } +} + +export class Union extends FieldType { + 
toString() { return `Union mode[${this.mode}], typeIds[${this.typeIds}]`; } + constructor(public mode: UnionMode, public typeIds: Type[]) { + super(Type.Union); + } + accept(visitor: Visitor) { + return visitor.visitUnionFieldType(this); + } +} + +export class FixedSizeBinary extends FieldType { + toString() { return `FixedSizeBinary byteWidth[${this.byteWidth}]`; } + constructor(public byteWidth: number) { + super(Type.FixedSizeBinary); + } + accept(visitor: Visitor) { + return visitor.visitFixedSizeBinaryFieldType(this); + } +} + +export class FixedSizeList extends FieldType { + toString() { return `FixedSizeList listSize[${this.listSize}]`; } + constructor(public listSize: number) { + super(Type.FixedSizeList); + } + accept(visitor: Visitor) { + return visitor.visitFixedSizeListFieldType(this); + } +} + +export class Map_ extends FieldType { + toString() { return `Map keysSorted[${this.keysSorted}]`; } + constructor(public keysSorted: boolean) { + super(Type.Map); + } + accept(visitor: Visitor) { + return visitor.visitMapFieldType(this); + } +} diff --git a/js/src/reader/arrow.ts b/js/src/reader/arrow.ts index dbb6acd0e79e8..cf8a3d6a281a2 100644 --- a/js/src/reader/arrow.ts +++ b/js/src/reader/arrow.ts @@ -15,68 +15,34 @@ // specific language governing permissions and limitations // under the License. 
-import { flatbuffers } from 'flatbuffers'; -import * as Schema_ from '../format/Schema_generated'; -import * as Message_ from '../format/Message_generated'; - -import { readFile } from './file'; -import { readStream } from './stream'; -import { readVector } from './vector'; +import { readJSON } from './json'; +import { readBuffers, readBuffersAsync } from './buffer'; +import { readVectors, readVectorsAsync } from './vector'; import { Vector } from '../vector/vector'; -import { readDictionary } from './dictionary'; -import ByteBuffer = flatbuffers.ByteBuffer; -export import Schema = Schema_.org.apache.arrow.flatbuf.Schema; -export import RecordBatch = Message_.org.apache.arrow.flatbuf.RecordBatch; -export type Dictionaries = { [k: string]: Vector }; -export type IteratorState = { nodeIndex: number; bufferIndex: number }; +export { readJSON }; +export { readBuffers, readBuffersAsync }; +export { readVectors, readVectorsAsync }; -export function* readRecords(...bytes: ByteBuffer[]) { - try { - yield* readFile(...bytes); - } catch (e) { - try { - yield* readStream(...bytes); - } catch (e) { - throw new Error('Invalid Arrow buffer'); - } +export function* read(sources: Iterable | object | string) { + let input: any = sources; + let batches: Iterable; + if (typeof input === 'string') { + try { input = JSON.parse(input); } + catch (e) { input = sources; } } -} - -export function* readBuffers(...bytes: Array) { - const dictionaries: Dictionaries = {}; - const byteBuffers = bytes.map(toByteBuffer); - for (let { schema, batch } of readRecords(...byteBuffers)) { - let vectors: Vector[] = []; - let state = { nodeIndex: 0, bufferIndex: 0 }; - let index = -1, fieldsLength = schema.fieldsLength(); - if (batch.id) { - // A dictionary batch only contain a single vector. 
Traverse each - // field and its children until we find one that uses this dictionary - while (++index < fieldsLength) { - let vector = readDictionary(schema.fields(index), batch, state, dictionaries); - if (vector) { - dictionaries[batch.id] = dictionaries[batch.id] && dictionaries[batch.id].concat(vector) || vector; - break; - } - } - } else { - while (++index < fieldsLength) { - vectors[index] = readVector(schema.fields(index), batch, state, dictionaries); - } - yield vectors; - } + if (!input || typeof input !== 'object') { + batches = (typeof input === 'string') ? readVectors(readBuffers([input])) : []; + } else { + batches = (typeof input[Symbol.iterator] === 'function') + ? readVectors(readBuffers(input)) + : readVectors(readJSON(input)); } + yield* batches; } -function toByteBuffer(bytes?: Uint8Array | Buffer | string) { - let arr: Uint8Array = bytes as any || new Uint8Array(0); - if (typeof bytes === 'string') { - arr = new Uint8Array(bytes.length); - for (let i = -1, n = bytes.length; ++i < n;) { - arr[i] = bytes.charCodeAt(i); - } - return new ByteBuffer(arr); +export async function* readAsync(sources: AsyncIterable) { + for await (let vectors of readVectorsAsync(readBuffersAsync(sources))) { + yield vectors; } - return new ByteBuffer(arr); } diff --git a/js/src/reader/buffer.ts b/js/src/reader/buffer.ts new file mode 100644 index 0000000000000..c7b90507e396f --- /dev/null +++ b/js/src/reader/buffer.ts @@ -0,0 +1,229 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { flatbuffers } from 'flatbuffers'; +import { VectorLayoutReader } from './vector'; +import { TypedArray, TypedArrayConstructor } from '../vector/types'; +import { footerFromByteBuffer, messageFromByteBuffer } from '../format/fb'; +import { Footer, Schema, RecordBatch, DictionaryBatch, Field, Buffer, FieldNode } from '../format/arrow'; +import ByteBuffer = flatbuffers.ByteBuffer; + +export function* readBuffers(sources: Iterable) { + let schema: Schema | null = null; + let readMessages: ((bb: ByteBuffer) => IterableIterator) | null = null; + for (const source of sources) { + const bb = toByteBuffer(source); + if ((!schema && ({ schema, readMessages } = readSchema(bb))) && schema && readMessages) { + for (const message of readMessages(bb)) { + yield { + schema, message, reader: new BufferVectorLayoutReader( + bb, + (function* (fieldNodes) { yield* fieldNodes; })(message.fieldNodes), + (function* (buffers) { yield* buffers; })(message.buffers) + ) as VectorLayoutReader + }; + } + } + } +} + +export async function* readBuffersAsync(sources: AsyncIterable) { + let schema: Schema | null = null; + let readMessages: ((bb: ByteBuffer) => IterableIterator) | null = null; + for await (const source of sources) { + const bb = toByteBuffer(source); + if ((!schema && ({ schema, readMessages } = readSchema(bb))) && schema && readMessages) { + for (const message of readMessages(bb)) { + yield { + schema, message, reader: new BufferVectorLayoutReader( + bb, + (function* (fieldNodes) { yield* fieldNodes; })(message.fieldNodes), + (function* (buffers) { 
yield* buffers; })(message.buffers) + ) as VectorLayoutReader + }; + } + } + } +} + +function toByteBuffer(bytes?: Uint8Array | NodeBuffer | string) { + let arr: Uint8Array = bytes as any || new Uint8Array(0); + if (typeof bytes === 'string') { + arr = new Uint8Array(bytes.length); + for (let i = -1, n = bytes.length; ++i < n;) { + arr[i] = bytes.charCodeAt(i); + } + return new ByteBuffer(arr); + } + return new ByteBuffer(arr); +} + +function readSchema(bb: ByteBuffer) { + let schema: Schema, readMessages, footer: Footer | null; + if (footer = readFileSchema(bb)) { + schema = footer.schema!; + readMessages = readFileMessages(footer); + } else if (schema = readStreamSchema(bb)!) { + readMessages = readStreamMessages; + } else { + throw new Error('Invalid Arrow buffer'); + } + return { schema, readMessages }; +} + +const PADDING = 4; +const MAGIC_STR = 'ARROW1'; +const MAGIC = new Uint8Array(MAGIC_STR.length); +for (let i = 0; i < MAGIC_STR.length; i += 1 | 0) { + MAGIC[i] = MAGIC_STR.charCodeAt(i); +} + +function checkForMagicArrowString(buffer: Uint8Array, index = 0) { + for (let i = -1, n = MAGIC.length; ++i < n;) { + if (MAGIC[i] !== buffer[index + i]) { + return false; + } + } + return true; +} + +const magicLength = MAGIC.length; +const magicAndPadding = magicLength + PADDING; +const magicX2AndPadding = magicLength * 2 + PADDING; + +function readStreamSchema(bb: ByteBuffer) { + if (!checkForMagicArrowString(bb.bytes(), 0)) { + for (const message of readMessages(bb)) { + if (message.isSchema()) { + return message as Schema; + } + } + } + return null; +} + +function* readStreamMessages(bb: ByteBuffer) { + for (const message of readMessages(bb)) { + if (message.isRecordBatch()) { + yield message; + } else if (message.isDictionaryBatch()) { + yield message; + } else { + continue; + } + // position the buffer after the body to read the next message + bb.setPosition(bb.position() + message.bodyLength.low); + } +} + +function readFileSchema(bb: ByteBuffer) { + let 
fileLength = bb.capacity(), footerLength: number, footerOffset: number; + if ((fileLength < magicX2AndPadding /* Arrow buffer too small */) || + (!checkForMagicArrowString(bb.bytes(), 0) /* Missing magic start */) || + (!checkForMagicArrowString(bb.bytes(), fileLength - magicLength) /* Missing magic end */) || + (/* Invalid footer length */ + (footerLength = bb.readInt32(footerOffset = fileLength - magicAndPadding)) < 1 && + (footerLength + magicX2AndPadding > fileLength))) { + return null; + } + bb.setPosition(footerOffset - footerLength); + return footerFromByteBuffer(bb); +} + +function readFileMessages(footer: Footer) { + return function* (bb: ByteBuffer) { + for (let i = -1, batches = footer.dictionaryBatches, n = batches.length; ++i < n;) { + bb.setPosition(batches[i].offset.low); + yield readMessage(bb, bb.readInt32(bb.position())) as DictionaryBatch; + } + for (let i = -1, batches = footer.recordBatches, n = batches.length; ++i < n;) { + bb.setPosition(batches[i].offset.low); + yield readMessage(bb, bb.readInt32(bb.position())) as RecordBatch; + } + }; +} + +function* readMessages(bb: ByteBuffer) { + let length: number, message: Schema | RecordBatch | DictionaryBatch; + while (bb.position() < bb.capacity() && + (length = bb.readInt32(bb.position())) > 0) { + if (message = readMessage(bb, length)!) 
{ + yield message; + } + } +} + +function readMessage(bb: ByteBuffer, length: number) { + bb.setPosition(bb.position() + PADDING); + const message = messageFromByteBuffer(bb); + bb.setPosition(bb.position() + length); + return message; +} + +class BufferVectorLayoutReader implements VectorLayoutReader { + private offset: number; + private bytes: Uint8Array; + constructor(bb: ByteBuffer, private fieldNodes: Iterator, private buffers: Iterator) { + this.bytes = bb.bytes(); + this.offset = bb.position(); + } + readContainerLayout(field: Field) { + const { bytes, offset, buffers } = this, fieldNode = this.fieldNodes.next().value; + return { + field, fieldNode, + validity: createValidityArray(bytes, field, fieldNode, offset, buffers.next().value) + }; + } + readFixedWidthLayout(field: Field, dataType: TypedArrayConstructor) { + const { bytes, offset, buffers } = this, fieldNode = this.fieldNodes.next().value; + return { + field, fieldNode, + validity: createValidityArray(bytes, field, fieldNode, offset, buffers.next().value), + data: createTypedArray(bytes, field, fieldNode, offset, buffers.next().value, dataType) + }; + } + readBinaryLayout(field: Field) { + const { bytes, offset, buffers } = this, fieldNode = this.fieldNodes.next().value; + return { + field, fieldNode, + validity: createValidityArray(bytes, field, fieldNode, offset, buffers.next().value), + offsets: createTypedArray(bytes, field, fieldNode, offset, buffers.next().value, Int32Array), + data: createTypedArray(bytes, field, fieldNode, offset, buffers.next().value, Uint8Array) + }; + } + readVariableWidthLayout(field: Field) { + const { bytes, offset, buffers } = this, fieldNode = this.fieldNodes.next().value; + return { + field, fieldNode, + validity: createValidityArray(bytes, field, fieldNode, offset, buffers.next().value), + offsets: createTypedArray(bytes, field, fieldNode, offset, buffers.next().value, Int32Array) + }; + } +} + +function createValidityArray(bytes: Uint8Array, field: Field, 
fieldNode: FieldNode, offset: number, buffer: Buffer) { + return field.nullable && fieldNode.nullCount.low > 0 && + createTypedArray(bytes, field, fieldNode, offset, buffer, Uint8Array) || null; +} + +function createTypedArray(bytes: Uint8Array, _field: Field, _fieldNode: FieldNode, offset: number, buffer: Buffer, ArrayConstructor: TypedArrayConstructor): T { + return new ArrayConstructor( + bytes.buffer, + bytes.byteOffset + offset + buffer.offset.low, + buffer.length.low / ArrayConstructor.BYTES_PER_ELEMENT + ); +} diff --git a/js/src/reader/dictionary.ts b/js/src/reader/dictionary.ts deleted file mode 100644 index 93a9ba76bba3a..0000000000000 --- a/js/src/reader/dictionary.ts +++ /dev/null @@ -1,38 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -import { readVector } from './vector'; -import { MessageBatch } from './message'; -import * as Schema_ from '../format/Schema_generated'; -import { IteratorState, Dictionaries } from './arrow'; -import Field = Schema_.org.apache.arrow.flatbuf.Field; - -export function readDictionary(field: Field | null, - batch: MessageBatch, - iterator: IteratorState, - dictionaries: Dictionaries) { - let id: string, encoding = field && field.dictionary(); - if (encoding && batch.id === (id = encoding.id().toFloat64().toString())) { - return readVector(field, batch, iterator, null); - } - for (let i = -1, n = field && field.childrenLength() || 0; ++i < n;) { - let vector = readDictionary(field.children(i), batch, iterator, dictionaries); - if (vector) { - return vector; - } - } -} diff --git a/js/src/reader/file.ts b/js/src/reader/file.ts deleted file mode 100644 index b05b99a5e6dcf..0000000000000 --- a/js/src/reader/file.ts +++ /dev/null @@ -1,79 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -import { flatbuffers } from 'flatbuffers'; -import * as File_ from '../format/File_generated'; -import * as Schema_ from '../format/Schema_generated'; -import * as Message_ from '../format/Message_generated'; -import { PADDING, readMessageBatches } from './message'; - -import ByteBuffer = flatbuffers.ByteBuffer; -import Footer = File_.org.apache.arrow.flatbuf.Footer; -export import Schema = Schema_.org.apache.arrow.flatbuf.Schema; -export import RecordBatch = Message_.org.apache.arrow.flatbuf.RecordBatch; - -const MAGIC_STR = 'ARROW1'; -const MAGIC = new Uint8Array(MAGIC_STR.length); -for (let i = 0; i < MAGIC_STR.length; i += 1 | 0) { - MAGIC[i] = MAGIC_STR.charCodeAt(i); -} - -export function _checkMagic(buffer: Uint8Array, index = 0) { - for (let i = -1, n = MAGIC.length; ++i < n;) { - if (MAGIC[i] !== buffer[index + i]) { - return false; - } - } - return true; -} - -const magicLength = MAGIC.length; -const magicAndPadding = magicLength + PADDING; -const magicX2AndPadding = magicLength * 2 + PADDING; - -export function* readFile(...bbs: ByteBuffer[]) { - for (let bb of bbs) { - let fileLength = bb.capacity(); - let footerLength: number, footerOffset: number; - if ((fileLength < magicX2AndPadding /* Arrow buffer too small */) || - (!_checkMagic(bb.bytes(), 0) /* Missing magic start */) || - (!_checkMagic(bb.bytes(), fileLength - magicLength) /* Missing magic end */) || - (/* Invalid footer length */ - (footerLength = bb.readInt32(footerOffset = fileLength - magicAndPadding)) < 1 && - (footerLength + magicX2AndPadding > fileLength))) { - throw new Error('Invalid file'); - } - bb.setPosition(footerOffset - footerLength); - let footer = Footer.getRootAsFooter(bb), schema = footer.schema(); - for (let i = -1, n = footer.dictionariesLength(); ++i < n;) { - let block = footer.dictionaries(i); - bb.setPosition(block.offset().low); - for (let batch of readMessageBatches(bb)) { - yield { schema, batch }; - break; - } - } - for (let i = -1, n = 
footer.recordBatchesLength(); ++i < n;) { - const block = footer.recordBatches(i); - bb.setPosition(block.offset().low); - for (let batch of readMessageBatches(bb)) { - yield { schema, batch }; - break; - } - } - } -} diff --git a/js/src/reader/json.ts b/js/src/reader/json.ts new file mode 100644 index 0000000000000..49431496354e8 --- /dev/null +++ b/js/src/reader/json.ts @@ -0,0 +1,176 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import * as Schema_ from '../format/fb/Schema'; +import { Int64, Int128 } from '../util/int'; +import { VectorLayoutReader } from './vector'; +import { TextEncoder } from 'text-encoding-utf-8'; +import DateUnit = Schema_.org.apache.arrow.flatbuf.DateUnit; +import { TypedArray, TypedArrayConstructor } from '../vector/types'; +import { schemaFromJSON, recordBatchFromJSON, dictionaryBatchFromJSON } from '../format/json'; +import { Schema, RecordBatch, DictionaryBatch, Field, Buffer, FieldNode } from '../format/arrow'; +export { Schema, RecordBatch, DictionaryBatch }; + +export function* readJSON(json: any) { + const schema = schemaFromJSON(json['schema']); + for (const batch of (json['dictionaries'] || [])) { + const message = dictionaryBatchFromJSON(batch); + yield { + schema, message, reader: new JSONVectorLayoutReader( + flattenDataSources(batch['data']['columns']), + (function* (fieldNodes) { yield* fieldNodes; })(message.fieldNodes), + (function* (buffers) { yield* buffers; })(message.buffers) + ) as VectorLayoutReader + }; + } + for (const batch of (json['batches'] || [])) { + const message = recordBatchFromJSON(batch); + yield { + schema, message, reader: new JSONVectorLayoutReader( + flattenDataSources(batch['columns']), + (function* (fieldNodes) { yield* fieldNodes; })(message.fieldNodes), + (function* (buffers) { yield* buffers; })(message.buffers) + ) as VectorLayoutReader + }; + } +} + +function flattenDataSources(xs: any[]): any[][] { + return (xs || []).reduce((buffers, column: any) => [ + ...buffers, + ...(column['VALIDITY'] && [column['VALIDITY']] || []), + ...(column['OFFSET'] && [column['OFFSET']] || []), + ...(column['DATA'] && [column['DATA']] || []), + ...flattenDataSources(column['children']) + ], [] as any[][]); +} + +class JSONVectorLayoutReader implements VectorLayoutReader { + constructor(private sources: any[][], private fieldNodes: Iterator, private buffers: Iterator) {} + readContainerLayout(field: Field) { + const { sources, buffers } 
= this, fieldNode = this.fieldNodes.next().value; + return { + field, fieldNode, + validity: createValidityArray(sources, field, fieldNode, buffers.next().value) + }; + } + readFixedWidthLayout(field: Field, dataType: TypedArrayConstructor) { + const { sources, buffers } = this, fieldNode = this.fieldNodes.next().value; + return { + field, fieldNode, + validity: createValidityArray(sources, field, fieldNode, buffers.next().value), + data: createDataArray(sources, field, fieldNode, buffers.next().value, dataType) + }; + } + readBinaryLayout(field: Field) { + const { sources, buffers } = this, fieldNode = this.fieldNodes.next().value; + return { + field, fieldNode, + validity: createValidityArray(sources, field, fieldNode, buffers.next().value), + offsets: new Int32Array(sources[buffers.next().value.offset.low]), + data: createDataArray(sources, field, fieldNode, buffers.next().value, Uint8Array) + }; + } + readVariableWidthLayout(field: Field) { + const { sources, buffers } = this, fieldNode = this.fieldNodes.next().value; + return { + field, fieldNode, + validity: createValidityArray(sources, field, fieldNode, buffers.next().value), + offsets: new Int32Array(sources[buffers.next().value.offset.low]), + }; + } +} + +function createValidityArray(sources: any[][], field: Field, fieldNode: FieldNode, buffer: Buffer) { + return field.nullable && fieldNode.nullCount.low > 0 && + booleanFromJSON(sources[buffer.offset.low]) || null; +} + +const encoder = new TextEncoder('utf-8'); + +function createDataArray(sources: any[][], field: Field, _fieldNode: FieldNode, buffer: Buffer, ArrayConstructor: TypedArrayConstructor): T { + let type = field.type, data: ArrayLike | ArrayBufferLike; + if (type.isTimestamp() === true) { + data = int64sFromJSON(sources[buffer.offset.low] as string[]); + } else if ((type.isInt() || type.isTime()) && type.bitWidth === 64) { + data = int64sFromJSON(sources[buffer.offset.low] as string[]); + } else if (type.isDate() && type.unit === 
DateUnit.MILLISECOND) { + data = int64sFromJSON(sources[buffer.offset.low] as string[]); + } else if (type.isDecimal() === true) { + data = decimalFromJSON(sources[buffer.offset.low] as string[]); + } else if (type.isBinary() === true) { + data = binaryFromJSON(sources[buffer.offset.low] as string[]); + } else if (type.isBool() === true) { + data = booleanFromJSON(sources[buffer.offset.low] as number[]).buffer; + } else if (type.isUtf8() === true) { + data = encoder.encode((sources[buffer.offset.low] as string[]).join('')); + } else { + data = (sources[buffer.offset.low]).map((x) => +x); + } + return new ArrayConstructor(data); +} + +function int64sFromJSON(values: string[]) { + const data = new Uint32Array(values.length * 2); + for (let i = -1, n = values.length; ++i < n;) { + // Force all values (even numbers) to be parsed as strings since + // pulling out high and low bits seems to lose precision sometimes + // For example: + // > -4613034156400212000 >>> 0 + // 721782784 + // The correct lower 32-bits are 721782752 + Int64.fromString(values[i].toString(), new Uint32Array(data.buffer, data.byteOffset + 2 * i * 4, 2)); + } + return data.buffer; +} + +function decimalFromJSON(values: string[]) { + const data = new Uint32Array(values.length * 4); + for (let i = -1, n = values.length; ++i < n;) { + Int128.fromString(values[i], new Uint32Array(data.buffer, data.byteOffset + 4 * 4 * i, 4)); + } + return data.buffer; +} + +function binaryFromJSON(values: string[]) { + // "DATA": ["49BC7D5B6C47D2","3F5FB6D9322026"] + // There are definitely more efficient ways to do this... but it gets the + // job done. 
+ const joined = values.join(''); + const data = new Uint8Array(joined.length / 2); + for (let i = 0; i < joined.length; i += 2) { + data[i >> 1] = parseInt(joined.substr(i, 2), 16); + } + return data.buffer; +} + +function booleanFromJSON(arr: number[]) { + let xs = [], n, i = 0; + let bit = 0, byte = 0; + for (const value of arr) { + value && (byte |= 1 << bit); + if (++bit === 8) { + xs[i++] = byte; + byte = bit = 0; + } + } + if (i === 0 || bit > 0) { xs[i++] = byte; } + if (i % 8 && (n = i + 8 - i % 8)) { + do { xs[i] = 0; } while (++i < n); + } + return new Uint8Array(xs); +} diff --git a/js/src/reader/message.ts b/js/src/reader/message.ts deleted file mode 100644 index 5472f10833878..0000000000000 --- a/js/src/reader/message.ts +++ /dev/null @@ -1,63 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -import { flatbuffers } from 'flatbuffers'; -import * as Message_ from '../format/Message_generated'; -import ByteBuffer = flatbuffers.ByteBuffer; -import Message = Message_.org.apache.arrow.flatbuf.Message; -import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader; -import RecordBatch = Message_.org.apache.arrow.flatbuf.RecordBatch; -import DictionaryBatch = Message_.org.apache.arrow.flatbuf.DictionaryBatch; - -export const PADDING = 4; -export type MessageBatch = { - id?: string; - offset: number; - bytes: Uint8Array; - data: RecordBatch; -}; - -export function* readMessages(bb: ByteBuffer) { - let message, length; - while (bb.position() < bb.capacity() && - (length = bb.readInt32(bb.position())) > 0) { - bb.setPosition(bb.position() + PADDING); - message = Message.getRootAsMessage(bb); - bb.setPosition(bb.position() + length); - yield message; - } -} - -export function* readMessageBatches(bb: ByteBuffer) { - let bytes = bb.bytes(); - for (let message of readMessages(bb)) { - let type = message.headerType(); - let id: string, data: RecordBatch; - if (type === MessageHeader.RecordBatch) { - data = message.header(new RecordBatch()); - } else if (type === MessageHeader.DictionaryBatch) { - let header = message.header(new DictionaryBatch()); - id = header.id().toFloat64().toString(); - data = header.data(); - } else { - continue; - } - yield { id, data, bytes, offset: bytes.byteOffset + bb.position() }; - // position the buffer after the body to read the next message - bb.setPosition(bb.position() + message.bodyLength().low); - } -} diff --git a/js/src/reader/stream.ts b/js/src/reader/stream.ts deleted file mode 100644 index 9869f633d08f4..0000000000000 --- a/js/src/reader/stream.ts +++ /dev/null @@ -1,43 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -import { _checkMagic } from './file'; -import { flatbuffers } from 'flatbuffers'; -import * as Schema_ from '../format/Schema_generated'; -import * as Message_ from '../format/Message_generated'; -import { readMessages, readMessageBatches } from './message'; - -import ByteBuffer = flatbuffers.ByteBuffer; -import Schema = Schema_.org.apache.arrow.flatbuf.Schema; -import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader; - -export function* readStream(...bbs: ByteBuffer[]) { - if (!bbs.length || _checkMagic(bbs[0].bytes(), 0)) { - throw new Error('Invalid Arrow Stream'); - } - for (const message of readMessages(bbs[0])) { - if (message.headerType() === MessageHeader.Schema) { - const schema = message.header(new Schema()); - for (const bb of bbs) { - for (const batch of readMessageBatches(bb)) { - yield { schema, batch }; - } - } - break; - } - } -} diff --git a/js/src/reader/vector.ts b/js/src/reader/vector.ts index 3b6663be89bdc..3bd6d2bb67650 100644 --- a/js/src/reader/vector.ts +++ b/js/src/reader/vector.ts @@ -15,262 +15,241 @@ // specific language governing permissions and limitations // under the License. 
-import { flatbuffers } from 'flatbuffers'; -import { MessageBatch } from './message'; -import { Vector } from '../vector/vector'; -import * as Schema_ from '../format/Schema_generated'; -import { StructVector } from '../vector/struct'; -import { IteratorState, Dictionaries } from './arrow'; -import { DictionaryVector } from '../vector/dictionary'; -import { Utf8Vector, ListVector, FixedSizeListVector } from '../vector/list'; +import * as Schema_ from '../format/fb/Schema'; +import { TypedArray, TypedArrayConstructor } from '../vector/types'; +import { Schema, RecordBatch, DictionaryBatch, Field, FieldNode } from '../format/arrow'; +import { Int, Date, Time, Timestamp, Decimal, FixedSizeList, FixedSizeBinary, FloatingPoint } from '../format/arrow'; import { - TypedArray, TypedArrayCtor, IntArray, FloatArray, + Vector, BoolVector, BinaryVector, DictionaryVector, Int8Vector, Int16Vector, Int32Vector, Int64Vector, Uint8Vector, Uint16Vector, Uint32Vector, Uint64Vector, - Float32Vector, Float64Vector, IndexVector, DateVector, -} from '../vector/typed'; + Utf8Vector, ListVector, FixedSizeListVector, StructVector, + Float16Vector, Float32Vector, Float64Vector, DecimalVector, + Date32Vector, Date64Vector, Time32Vector, Time64Vector, TimestampVector, +} from '../vector/arrow'; -import Int = Schema_.org.apache.arrow.flatbuf.Int; import Type = Schema_.org.apache.arrow.flatbuf.Type; -import Field = Schema_.org.apache.arrow.flatbuf.Field; +import DateUnit = Schema_.org.apache.arrow.flatbuf.DateUnit; +import TimeUnit = Schema_.org.apache.arrow.flatbuf.TimeUnit; import Precision = Schema_.org.apache.arrow.flatbuf.Precision; -import VectorType = Schema_.org.apache.arrow.flatbuf.VectorType; -import VectorLayout = Schema_.org.apache.arrow.flatbuf.VectorLayout; -import FixedSizeList = Schema_.org.apache.arrow.flatbuf.FixedSizeList; -import FloatingPoint = Schema_.org.apache.arrow.flatbuf.FloatingPoint; -import DictionaryEncoding = Schema_.org.apache.arrow.flatbuf.DictionaryEncoding; 
+// import IntervalUnit = Schema_.org.apache.arrow.flatbuf.IntervalUnit; -export function readVector(field: Field, batch: MessageBatch, state: IteratorState, dictionaries: Dictionaries) { - return readDictionaryVector(field, batch, state, dictionaries) || - readTypedVector(field, batch, state, dictionaries); +export interface ContainerLayout { + fieldNode: FieldNode; + validity: Uint8Array | null | void; } -function readTypedVector(field: Field, batch: MessageBatch, iterator: IteratorState, dictionaries: Dictionaries) { - let typeType = field.typeType(), readTyped = typedVectorReaders[typeType]; - if (!readTyped) { - throw new Error('Unrecognized vector name "' + Type[typeType] + '" type "' + typeType + '"'); - } - return readTyped(field, batch, iterator, dictionaries); +export interface VariableWidthLayout { + fieldNode: FieldNode; + offsets: Int32Array; + validity: Uint8Array | null | void; } -function readDictionaryVector(field: Field, batch: MessageBatch, iterator: IteratorState, dictionaries: Dictionaries) { - let encoding: DictionaryEncoding | null; - if (dictionaries && (encoding = field.dictionary())) { - let id = encoding.id().toFloat64().toString(); - let fieldType = encoding.indexType() || - /* a dictionary index defaults to signed 32 bit int if unspecified */ - { bitWidth: () => 32, isSigned: () => true }; - // workaround for https://issues.apache.org/jira/browse/ARROW-1363 - let indexField = createSyntheticDictionaryIndexField(field, fieldType); - let index = readIntVector(indexField, batch, iterator, null, fieldType); - return DictionaryVector.create(field, index.length, index, dictionaries[id]); - } +export interface BinaryLayout extends FixedWidthLayout { + offsets: Int32Array; } -const IntViews = [Int8Array, Int16Array, Int32Array, Int32Array ]; -const Int32Views = [Int32Array, Int32Array, Int32Array, Int32Array ]; -const UintViews = [Uint8Array, Uint16Array, Uint32Array, Uint32Array ]; -const Uint8Views = [Uint8Array, Uint8Array, Uint8Array, 
Uint8Array ]; -const Uint32Views = [Uint32Array, Uint32Array, Uint32Array, Uint32Array ]; -const FloatViews = [Int8Array, Int16Array, Float32Array, Float64Array]; +export interface FixedWidthLayout { + fieldNode: FieldNode; + data: T; + validity: Uint8Array | null | void; +} -const createIntDataViews = createDataView.bind(null, IntViews, null); -const createUintDataViews = createDataView.bind(null, UintViews, null); -const createDateDataViews = createDataView.bind(null, Uint32Views, null); -const createFloatDataViews = createDataView.bind(null, FloatViews, null); -const createNestedDataViews = createDataView.bind(null, Uint32Views, null); -const createValidityDataViews = createDataView.bind(null, Uint8Views, null); -const createUtf8DataViews = createDataView.bind(null, Uint8Views, Int32Views); +export function* readVectors(messages: Iterable<{ schema: Schema, message: RecordBatch | DictionaryBatch, reader: VectorLayoutReader }>) { + const dictionaries = new Map(); + for (const { schema, message, reader } of messages) { + yield* readMessageVectors(schema, message, new VectorReader(dictionaries, reader)); + } +} -const floatVectors = { - [Precision.SINGLE]: Float32Vector, - [Precision.DOUBLE]: Float64Vector -}; -const intVectors = [ - [/* unsigned */ Uint8Vector, /* signed */ Int8Vector ], - [/* unsigned */ Uint16Vector, /* signed */ Int16Vector], - [/* unsigned */ Uint32Vector, /* signed */ Int32Vector], - [/* unsigned */ Uint64Vector, /* signed */ Int64Vector] -]; +export async function* readVectorsAsync(messages: AsyncIterable<{ schema: Schema, message: RecordBatch | DictionaryBatch, reader: VectorLayoutReader }>) { + const dictionaries = new Map(); + for await (const { schema, message, reader } of messages) { + yield* readMessageVectors(schema, message, new VectorReader(dictionaries, reader)); + } +} -function readIntVector(field: Field, batch: MessageBatch, iterator: IteratorState, dictionaries: Dictionaries, fieldType?: FieldType) { - let type = (fieldType || 
field.type(new Int())); - return type.isSigned() ? - read_IntVector(field, batch, iterator, dictionaries, type) : - readUintVector(field, batch, iterator, dictionaries, type); +function* readMessageVectors(schema: Schema, message: RecordBatch | DictionaryBatch, reader: VectorReader) { + if (message.isRecordBatch() === true) { + yield schema.fields.map((field) => reader.readVector(field)); + } else if (message.isDictionaryBatch()) { + let id = message.dictionaryId.toFloat64().toString(); + let vector = reader.readValueVector(schema.dictionaries.get(id)!); + if (message.isDelta) { + vector = reader.dictionaries.get(id)!.concat(vector); + } + reader.dictionaries.set(id, vector); + } } -const read_IntVector = readVectorLayout(createIntDataViews, createIntVector); -const readUintVector = readVectorLayout(createUintDataViews, createIntVector); -function createIntVector(field, length, data, validity, offsets, fieldType, batch, iterator, dictionaries) { - let type = fieldType || field.type(new Int()), bitWidth = type.bitWidth(); - let Vector = valueForBitWidth(bitWidth, intVectors)[+type.isSigned()]; - return Vector.create(field, length, validity, data || offsets); - // ----------------------------------------------- 👆: - // Workaround for https://issues.apache.org/jira/browse/ARROW-1363 - // This bug causes dictionary encoded vector indicies' IntVector data - // buffers to be tagged as VectorType.OFFSET (0) in the field metadata - // instead of VectorType.DATA. The `readVectorLayout` routine strictly - // obeys the types in the field metadata, so if we're parsing an Arrow - // file written by a version of the library published before ARROW-1363 - // was fixed, the IntVector's data buffer will be null, and the offset - // buffer will be the actual data. If data is null, it's safe to assume - // the offset buffer is the data, because IntVectors don't have offsets. 
+export interface VectorLayoutReader { + readBinaryLayout(field: Field): BinaryLayout; + readContainerLayout(field: Field): ContainerLayout; + readVariableWidthLayout(field: Field): VariableWidthLayout; + readFixedWidthLayout(field: Field, TypedArrayConstructor: TypedArrayConstructor): FixedWidthLayout; } -const readFloatVector = readVectorLayout( - createFloatDataViews, - (field, length, data, validity, offsets, fieldType, batch, iterator, dictionaries) => { - let type = field.type(new FloatingPoint()); - let Vector = floatVectors[type.precision()]; - return Vector.create(field, length, validity, data); +export class VectorReader implements VectorLayoutReader { + constructor(public dictionaries: Map, protected layout: VectorLayoutReader) {} + readVector(field: Field): Vector { + return this.readDictionaryVector(field) || this.readValueVector(field); } -); - -const readDateVector = readVectorLayout( - createDateDataViews, - (field, length, data, validity, offsets, fieldType, batch, iterator, dictionaries) => { - return DateVector.create(field, length, validity, data); + readDictionaryVector(field: Field) { + const encoding = field.dictionary; + if (encoding) { + const keys = this.readIntVector(field.indexField()); + const data = this.dictionaries.get(encoding.dictionaryId.toFloat64().toString())!; + return new DictionaryVector({ + field, data, keys, + validity: (keys as any).validity, + fieldNode: (keys as any).fieldNode, + }); + } + return null; } -); - -const readUtf8Vector = readVectorLayout( - createUtf8DataViews, - (field, length, data, validity, offsets, fieldType, batch, iterator, dictionaries) => { - let offsetsAdjusted = new Int32Array(offsets.buffer, offsets.byteOffset, length + 1); - return Utf8Vector.create( - field, length, validity, - Uint8Vector.create(field, data.length, null, data), - IndexVector.create(field, length + 1, null, offsetsAdjusted) - ); + readValueVector(field: Field) { + switch (field.typeType) { + case Type.NONE: return 
this.readNullVector(); + case Type.Null: return this.readNullVector(); + // case Type.Map: return this.readMapVector(field); + case Type.Int: return this.readIntVector(field); + case Type.Bool: return this.readBoolVector(field); + case Type.Date: return this.readDateVector(field); + case Type.List: return this.readListVector(field); + case Type.Utf8: return this.readUtf8Vector(field); + case Type.Time: return this.readTimeVector(field); + // case Type.Union: return this.readUnionVector(field); + case Type.Binary: return this.readBinaryVector(field); + case Type.Decimal: return this.readDecimalVector(field); + case Type.Struct_: return this.readStructVector(field); + case Type.FloatingPoint: return this.readFloatVector(field); + case Type.Timestamp: return this.readTimestampVector(field); + case Type.FixedSizeList: return this.readFixedSizeListVector(field); + case Type.FixedSizeBinary: return this.readFixedSizeBinaryVector(field); + } + throw new Error(`Unrecognized ${field.toString()}`); } -); - -const readListVector = readVectorLayout( - createNestedDataViews, - (field, length, data, validity, offsets, fieldType, batch, iterator, dictionaries) => { - let offsetsAdjusted = new Int32Array(offsets.buffer, offsets.byteOffset, length + 1); - return ListVector.create( - field, length, validity, - readVector(field.children(0), batch, iterator, dictionaries), - IndexVector.create(field, length + 1, null, offsetsAdjusted) - ); + readNullVector() { + return new Vector(); } -); - -const readFixedSizeListVector = readVectorLayout( - createNestedDataViews, - (field, length, data, validity, offsets, fieldType, batch, iterator, dictionaries) => { - let size = field.type(new FixedSizeList()).listSize(); - return FixedSizeListVector.create( - field, length, size, validity, - readVector(field.children(0), batch, iterator, dictionaries) - ); + readBoolVector(field: Field) { + return new BoolVector(this.readFixedWidthLayout(field, Uint8Array)); } -); - -const readStructVector = 
readVectorLayout>( - createNestedDataViews, - (field, length, data, validity, offsets, fieldType, batch, iterator, dictionaries) => { - let vectors: Vector[] = []; - for (let i = -1, n = field.childrenLength(); ++i < n;) { - vectors[i] = readVector(field.children(i), batch, iterator, dictionaries); + readDateVector(field: Field) { + const type = field.type as Date; + switch (type.unit) { + case DateUnit.DAY: return new Date32Vector({ ...this.readFixedWidthLayout(field, Int32Array), unit: DateUnit[type.unit] }); + case DateUnit.MILLISECOND: return new Date64Vector({ ...this.readFixedWidthLayout(field, Int32Array), unit: DateUnit[type.unit] }); } - return StructVector.create(field, length, validity, ...vectors); + throw new Error(`Unrecognized ${type.toString()}`); } -); - -const typedVectorReaders = { - [Type.Int]: readIntVector, - [Type.Date]: readDateVector, - [Type.List]: readListVector, - [Type.Utf8]: readUtf8Vector, - [Type.Struct_]: readStructVector, - [Type.FloatingPoint]: readFloatVector, - [Type.FixedSizeList]: readFixedSizeListVector, -}; - -type FieldType = { bitWidth(): number; isSigned(): boolean }; -type dataViewFactory = (batch: MessageBatch, type: VectorType, bitWidth: number, offset: number, length: number) => V; -type vectorFactory> = (field: Field, - length: number, - data: TList, - nulls: Uint8Array, - offsets: TypedArray, - fieldType: FieldType, - chunk: MessageBatch, - iterable: IteratorState, - dictionaries: Dictionaries) => V; - -function readVectorLayout(createDataView: dataViewFactory, createVector: vectorFactory>) { - return function readLayout( - field: Field, - chunk: MessageBatch, - iterator: IteratorState, - dictionaries: Dictionaries, - integerFieldType?: FieldType - ) { - let batch = chunk.data; - let layoutLength = field.layoutLength(); - let node = batch.nodes(iterator.nodeIndex++); - let data: TList, offsets: any, validity: Uint8Array; - let type, bitWidth, bufferLength, nodeLength = node.length().low; - for (let i = -1; ++i < 
layoutLength;) { - let layout = field.layout(i); - let buffer = batch.buffers(iterator.bufferIndex++); - if ((type = layout.type()) === VectorType.TYPE || - (bufferLength = buffer.length().low) <= 0 || - (bitWidth = layout.bitWidth()) <= 0) { - continue; - } else if (type === VectorType.DATA) { - data = createDataView(chunk, type, bitWidth, buffer.offset().low, bufferLength); - } else if (type === VectorType.OFFSET) { - offsets = createDataView(chunk, type, bitWidth, buffer.offset().low, bufferLength); - } else if (node.nullCount().low > 0) { - validity = createValidityDataViews(chunk, type, bitWidth, buffer.offset().low, nodeLength); + readTimeVector(field: Field) { + const type = field.type as Time; + switch (type.bitWidth) { + case 32: return new Time32Vector({ ...this.readFixedWidthLayout(field, Int32Array), unit: TimeUnit[type.unit] }); + case 64: return new Time64Vector({ ...this.readFixedWidthLayout(field, Uint32Array), unit: TimeUnit[type.unit] }); + } + throw new Error(`Unrecognized ${type.toString()}`); + } + readTimestampVector(field: Field) { + const type = field.type as Timestamp; + const { fieldNode, validity, data } = this.readFixedWidthLayout(field, Uint32Array); + return new TimestampVector({ + field, fieldNode, validity, data, + timezone: type.timezone!, + unit: TimeUnit[type.unit], + }); + } + readListVector(field: Field) { + const { fieldNode, validity, offsets } = this.readVariableWidthLayout(field); + return new ListVector({ + field, fieldNode, validity, offsets, + values: this.readVector(field.children[0]) + }); + } + readStructVector(field: Field) { + const { fieldNode, validity } = this.readContainerLayout(field); + return new StructVector({ + field, fieldNode, validity, + columns: field.children.map((field) => this.readVector(field)) + }); + } + readBinaryVector(field: Field) { + return new BinaryVector(this.readBinaryLayout(field)); + } + readDecimalVector(field: Field) { + const type = field.type as Decimal; + const { fieldNode, 
validity, data } = this.readFixedWidthLayout(field, Uint32Array); + return new DecimalVector({ + scale: type.scale, + precision: type.precision, + field, fieldNode, validity, data + }); + } + readUtf8Vector(field: Field) { + const { fieldNode, validity, offsets, data } = this.readBinaryLayout(field); + return new Utf8Vector({ + field, fieldNode, + values: new BinaryVector({ + validity, offsets, data + }) + }); + } + readFixedSizeListVector(field: Field) { + const type = field.type as FixedSizeList; + const { fieldNode, validity } = this.readContainerLayout(field); + return new FixedSizeListVector({ + field, fieldNode, validity, + size: type.listSize, + values: this.readVector(field.children[0]) + }); + } + readFixedSizeBinaryVector(field: Field) { + const type = field.type as FixedSizeBinary; + const { fieldNode, validity, data } = this.readFixedWidthLayout(field, Uint8Array); + return new FixedSizeListVector({ + size: type.byteWidth, + field, fieldNode, validity, + values: new Uint8Vector({ data }) + }); + } + readFloatVector(field: Field) { + const type = field.type as FloatingPoint; + switch (type.precision) { + case Precision.HALF: return new Float16Vector(this.readFixedWidthLayout(field, Uint16Array)); + case Precision.SINGLE: return new Float32Vector(this.readFixedWidthLayout(field, Float32Array)); + case Precision.DOUBLE: return new Float64Vector(this.readFixedWidthLayout(field, Float64Array)); + } + throw new Error(`Unrecognized FloatingPoint { precision: ${type.precision} }`); + } + readIntVector(field: Field) { + const type = field.type as Int; + if (type.isSigned) { + switch (type.bitWidth) { + case 8: return new Int8Vector(this.readFixedWidthLayout(field, Int8Array)); + case 16: return new Int16Vector(this.readFixedWidthLayout(field, Int16Array)); + case 32: return new Int32Vector(this.readFixedWidthLayout(field, Int32Array)); + case 64: return new Int64Vector(this.readFixedWidthLayout(field, Int32Array)); } } - return createVector(field, nodeLength, 
data, validity, offsets, integerFieldType, chunk, iterator, dictionaries); - }; -} - -function createDataView( - dataViews: TypedArrayCtor[], offsetViews: TypedArrayCtor[] | null, - batch: MessageBatch, type: VectorType, bitWidth: number, offset: number, length: number -) { - const buffer = batch.bytes.buffer; - const byteLength = buffer.byteLength; - const byteOffset = batch.offset + offset; - const DataViewType = valueForBitWidth(bitWidth, type === VectorType.OFFSET && offsetViews || dataViews); - const dataViewLength = ((byteOffset + length) <= byteLength - ? length - : byteLength - byteOffset - ) / DataViewType['BYTES_PER_ELEMENT']; - return new DataViewType(buffer, byteOffset, dataViewLength); -} - -function valueForBitWidth(bitWidth: number, values: any[]) { - return values[bitWidth >> 4] || values[3]; -} - -function createSyntheticDictionaryIndexField(field: Field, type: FieldType) { - let layouts = []; - let builder = new flatbuffers.Builder(); - if (field.nullable()) { - VectorLayout.startVectorLayout(builder); - VectorLayout.addBitWidth(builder, 8); - VectorLayout.addType(builder, VectorType.VALIDITY); - builder.finish(VectorLayout.endVectorLayout(builder)); - layouts.push(VectorLayout.getRootAsVectorLayout(builder.dataBuffer())); - builder = new flatbuffers.Builder(); + switch (type.bitWidth) { + case 8: return new Uint8Vector(this.readFixedWidthLayout(field, Uint8Array)); + case 16: return new Uint16Vector(this.readFixedWidthLayout(field, Uint16Array)); + case 32: return new Uint32Vector(this.readFixedWidthLayout(field, Uint32Array)); + case 64: return new Uint64Vector(this.readFixedWidthLayout(field, Uint32Array)); + } + throw new Error(`Unrecognized Int { isSigned: ${type.isSigned}, bitWidth: ${type.bitWidth} }`); } - VectorLayout.startVectorLayout(builder); - VectorLayout.addBitWidth(builder, type.bitWidth()); - VectorLayout.addType(builder, VectorType.DATA); - builder.finish(VectorLayout.endVectorLayout(builder)); - 
layouts.push(VectorLayout.getRootAsVectorLayout(builder.dataBuffer())); - return Object.create(field, { - layout: { value(i) { return layouts[i]; } }, - layoutLength: { value() { return layouts.length; } } - }); -} \ No newline at end of file + readContainerLayout(field: Field) { + return this.layout.readContainerLayout(field); + } + readBinaryLayout(field: Field) { + return this.layout.readBinaryLayout(field); + } + readVariableWidthLayout(field: Field) { + return this.layout.readVariableWidthLayout(field); + } + readFixedWidthLayout(field: Field, TypedArrayConstructor: TypedArrayConstructor) { + return this.layout.readFixedWidthLayout(field, TypedArrayConstructor); + } +} diff --git a/js/src/table.ts b/js/src/table.ts deleted file mode 100644 index 5e781054daf31..0000000000000 --- a/js/src/table.ts +++ /dev/null @@ -1,143 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -import { readBuffers } from './reader/arrow'; -import { StructVector } from './vector/struct'; -import { Vector, sliceToRangeArgs } from './vector/vector'; - -export type RowObject = { [k: string]: any }; - -export class Table implements Iterable> { - public length: number; - protected _columns: Vector[]; - protected _columnsMap: { [k: string]: Vector }; - static from(...bytes: Array) { - let columns: Vector[]; - for (let vectors of readBuffers(...bytes)) { - columns = !columns ? vectors : columns.map((v, i) => v.concat(vectors[i])); - } - return new Table(columns); - } - static fromStruct(vector: StructVector) { - return new Table(( vector).vectors); - } - constructor(columns: Vector[]) { - this._columns = columns || []; - this.length = Math.max(...this._columns.map((v) => v.length)); - this._columnsMap = this._columns.reduce((map, vec) => { - return (map[vec.name] = vec) && map || map; - }, {}); - } - *[Symbol.iterator]() { - for (let cols = this._columns, i = -1, n = this.length; ++i < n;) { - yield rowAsMap(i, cols); - } - } - *rows(startRow?: number | boolean, endRow?: number | boolean, compact?: boolean) { - let start = startRow as number, end = endRow as number; - if (typeof startRow === 'boolean') { - compact = startRow; - start = end; - end = undefined; - } else if (typeof endRow === 'boolean') { - compact = endRow; - end = undefined; - } - let rowIndex = -1, { length } = this; - const [rowOffset, rowsTotal] = sliceToRangeArgs(length, start, end); - while (++rowIndex < rowsTotal) { - yield this.getRow((rowIndex + rowOffset) % length, compact); - } - } - *cols(startCol?: number, endCol?: number) { - for (const column of this._columns.slice(startCol, endCol)) { - yield column; - } - } - getRow(rowIndex: number): RowObject; - getRow(rowIndex: number, compact: boolean): Array; - getRow(rowIndex: number, compact?: boolean) { - return (compact && rowAsArray || rowAsObject)(rowIndex, this._columns); - } - getCell(columnName: string, rowIndex: number) { - 
return this.getColumn>(columnName).get(rowIndex); - } - getCellAt(columnIndex: number, rowIndex: number) { - return this.getColumnAt>(columnIndex).get(rowIndex); - } - getColumn>(columnName: string) { - return this._columnsMap[columnName] as T; - } - getColumnAt>(columnIndex: number) { - return this._columns[columnIndex] as T; - } - toString(): string; - toString(index: boolean): string; - toString(options: { index: boolean }): string; - toString(options?: any) { - const index = typeof options === 'object' ? options && !!options.index - : typeof options === 'boolean' ? !!options - : false; - const { length } = this; - if (length <= 0) { return ''; } - const maxColumnWidths = []; - const rows = new Array(length + 1); - rows[0] = this._columns.map((c) => c.name); - index && rows[0].unshift('Index'); - for (let i = -1, n = rows.length - 1; ++i < n;) { - rows[i + 1] = this.getRow(i, true); - index && rows[i + 1].unshift(i); - } - // Pass one to convert to strings and count max column widths - for (let i = -1, n = rows.length; ++i < n;) { - const row = rows[i]; - for (let j = -1, k = row.length; ++j < k;) { - const val = row[j] = `${row[j]}`; - maxColumnWidths[j] = !maxColumnWidths[j] - ? 
val.length - : Math.max(maxColumnWidths[j], val.length); - } - } - // Pass two to pad each one to max column width - for (let i = -1, n = rows.length; ++i < n;) { - const row = rows[i]; - for (let j = -1, k = row.length; ++j < k;) { - row[j] = leftPad(row[j], ' ', maxColumnWidths[j]); - } - rows[i] = row.join(', '); - } - return rows.join('\n'); - } -} - -Table.prototype.length = 0; - -function leftPad(str, fill, n) { - return (new Array(n + 1).join(fill) + str).slice(-1 * n); -} - -function rowAsMap(row: number, columns: Vector[]) { - return columns.reduce((map, vector) => map.set(vector.name, vector.get(row)), new Map()); -} - -function rowAsObject(rowIndex: number, columns: Vector[]) { - return columns.reduce((row, vector) => (row[vector.name] = vector.get(rowIndex)) && row || row, Object.create(null)); -} - -function rowAsArray(rowIndex: number, columns: Vector[]) { - return columns.reduce((row, vector, columnIndex) => (row[columnIndex] = vector.get(rowIndex)) && row || row, new Array(columns.length)); -} diff --git a/js/src/util/int.ts b/js/src/util/int.ts new file mode 100644 index 0000000000000..9088e7b995573 --- /dev/null +++ b/js/src/util/int.ts @@ -0,0 +1,320 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +const carryBit16 = 1 << 16; + +function intAsHex(value: number): string { + if (value < 0) { + value = 0xFFFFFFFF + value + 1; + } + return `0x${value.toString(16)}`; +} + +const kInt32DecimalDigits = 8; +const kPowersOfTen = [1, + 10, + 100, + 1000, + 10000, + 100000, + 1000000, + 10000000, + 100000000]; + +export class BaseInt64 { + constructor (protected buffer: Uint32Array) {} + + high(): number { return this.buffer[1]; } + low (): number { return this.buffer[0]; } + + protected _times(other: BaseInt64) { + // Break the left and right numbers into 16 bit chunks + // so that we can multiply them without overflow. + const L = new Uint32Array([ + this.buffer[1] >>> 16, + this.buffer[1] & 0xFFFF, + this.buffer[0] >>> 16, + this.buffer[0] & 0xFFFF + ]); + + const R = new Uint32Array([ + other.buffer[1] >>> 16, + other.buffer[1] & 0xFFFF, + other.buffer[0] >>> 16, + other.buffer[0] & 0xFFFF + ]); + + let product = L[3] * R[3]; + this.buffer[0] = product & 0xFFFF; + + let sum = product >>> 16; + + product = L[2] * R[3]; + sum += product; + + product = (L[3] * R[2]) >>> 0; + sum += product; + + this.buffer[0] += sum << 16; + + this.buffer[1] = (sum >>> 0 < product ? 
carryBit16 : 0); + + this.buffer[1] += sum >>> 16; + this.buffer[1] += L[1] * R[3] + L[2] * R[2] + L[3] * R[1]; + this.buffer[1] += (L[0] * R[3] + L[1] * R[2] + L[2] * R[1] + L[3] * R[0]) << 16; + + return this; + } + + protected _plus(other: BaseInt64) { + const sum = (this.buffer[0] + other.buffer[0]) >>> 0; + this.buffer[1] += other.buffer[1]; + if (sum < (this.buffer[0] >>> 0)) { + ++this.buffer[1]; + } + this.buffer[0] = sum; + } + + lessThan(other: BaseInt64): boolean { + return this.buffer[1] < other.buffer[1] || + (this.buffer[1] === other.buffer[1] && this.buffer[0] < other.buffer[0]); + } + + equals(other: BaseInt64): boolean { + return this.buffer[1] === other.buffer[1] && this.buffer[0] == other.buffer[0]; + } + + greaterThan(other: BaseInt64): boolean { + return other.lessThan(this); + } + + hex(): string { + return `${intAsHex(this.buffer[1])} ${intAsHex(this.buffer[0])}`; + } +} + +export class Uint64 extends BaseInt64 { + times(other: Uint64): Uint64 { + this._times(other); + return this; + } + + plus(other: Uint64): Uint64 { + this._plus(other); + return this; + } + + static multiply(left: Uint64, right: Uint64): Uint64 { + let rtrn = new Uint64(new Uint32Array(left.buffer)); + return rtrn.times(right); + } + + static add(left: Uint64, right: Uint64): Uint64 { + let rtrn = new Uint64(new Uint32Array(left.buffer)); + return rtrn.plus(right); + } +} + +export class Int64 extends BaseInt64 { + negate(): Int64 { + this.buffer[0] = ~this.buffer[0] + 1; + this.buffer[1] = ~this.buffer[1]; + + if (this.buffer[0] == 0) { ++this.buffer[1]; } + return this; + } + + times(other: Int64): Int64 { + this._times(other); + return this; + } + + plus(other: Int64): Int64 { + this._plus(other); + return this; + } + + lessThan(other: Int64): boolean { + // force high bytes to be signed + const this_high = this.buffer[1] << 0; + const other_high = other.buffer[1] << 0; + return this_high < other_high || + (this_high === other_high && this.buffer[0] < other.buffer[0]); 
+ } + + static fromString(str: string, out_buffer = new Uint32Array(2)): Int64 { + // TODO: Assert that out_buffer is 0 and length = 2 + const negate = str.startsWith('-'); + const length = str.length; + + let out = new Int64(out_buffer); + for (let posn = negate ? 1 : 0; posn < length;) { + const group = kInt32DecimalDigits < length - posn ? + kInt32DecimalDigits : length - posn; + const chunk = new Int64(new Uint32Array([parseInt(str.substr(posn, group), 10), 0])); + const multiple = new Int64(new Uint32Array([kPowersOfTen[group], 0])); + + out.times(multiple); + out.plus(chunk); + + posn += group; + } + + return negate ? out.negate() : out; + } + + static multiply(left: Int64, right: Int64): Int64 { + let rtrn = new Int64(new Uint32Array(left.buffer)); + return rtrn.times(right); + } + + static add(left: Int64, right: Int64): Int64 { + let rtrn = new Int64(new Uint32Array(left.buffer)); + return rtrn.plus(right); + } +} + +export class Int128 { + constructor (private buffer: Uint32Array) { + // buffer[3] MSB (high) + // buffer[2] + // buffer[1] + // buffer[0] LSB (low) + } + + high(): Int64 { + return new Int64(new Uint32Array(this.buffer.buffer, this.buffer.byteOffset + 8, 2)); + } + + low(): Int64 { + return new Int64(new Uint32Array(this.buffer.buffer, this.buffer.byteOffset, 2)); + } + + negate(): Int128 { + this.buffer[0] = ~this.buffer[0] + 1; + this.buffer[1] = ~this.buffer[1]; + this.buffer[2] = ~this.buffer[2]; + this.buffer[3] = ~this.buffer[3]; + + if (this.buffer[0] == 0) { ++this.buffer[1]; } + if (this.buffer[1] == 0) { ++this.buffer[2]; } + if (this.buffer[2] == 0) { ++this.buffer[3]; } + return this; + } + + times(other: Int128): Int128 { + // Break the left and right numbers into 32 bit chunks + // so that we can multiply them without overflow. 
+ const L0 = new Uint64(new Uint32Array([this.buffer[3], 0])); + const L1 = new Uint64(new Uint32Array([this.buffer[2], 0])); + const L2 = new Uint64(new Uint32Array([this.buffer[1], 0])); + const L3 = new Uint64(new Uint32Array([this.buffer[0], 0])); + + const R0 = new Uint64(new Uint32Array([other.buffer[3], 0])); + const R1 = new Uint64(new Uint32Array([other.buffer[2], 0])); + const R2 = new Uint64(new Uint32Array([other.buffer[1], 0])); + const R3 = new Uint64(new Uint32Array([other.buffer[0], 0])); + + let product = Uint64.multiply(L3, R3); + this.buffer[0] = product.low(); + + let sum = new Uint64(new Uint32Array([product.high(), 0])); + + product = Uint64.multiply(L2, R3); + sum.plus(product); + + product = Uint64.multiply(L3, R2); + sum.plus(product); + + this.buffer[1] = sum.low(); + + this.buffer[3] = (sum.lessThan(product) ? 1 : 0); + + this.buffer[2] = sum.high(); + let high = new Uint64(new Uint32Array(this.buffer.buffer, this.buffer.byteOffset + 8, 2)); + + high.plus(Uint64.multiply(L1, R3)) + .plus(Uint64.multiply(L2, R2)) + .plus(Uint64.multiply(L3, R1)); + this.buffer[3] += Uint64.multiply(L0, R3) + .plus(Uint64.multiply(L1, R2)) + .plus(Uint64.multiply(L2, R1)) + .plus(Uint64.multiply(L3, R0)).low(); + + return this; + } + + plus(other: Int128): Int128 { + let sums = new Uint32Array(4); + sums[3] = (this.buffer[3] + other.buffer[3]) >>> 0; + sums[2] = (this.buffer[2] + other.buffer[2]) >>> 0; + sums[1] = (this.buffer[1] + other.buffer[1]) >>> 0; + sums[0] = (this.buffer[0] + other.buffer[0]) >>> 0; + + if (sums[0] < (this.buffer[0] >>> 0)) { + ++sums[1]; + } + if (sums[1] < (this.buffer[1] >>> 0)) { + ++sums[2]; + } + if (sums[2] < (this.buffer[2] >>> 0)) { + ++sums[3]; + } + + this.buffer[3] = sums[3]; + this.buffer[2] = sums[2]; + this.buffer[1] = sums[1]; + this.buffer[0] = sums[0]; + + return this; + } + + hex(): string { + return `${intAsHex(this.buffer[3])} ${intAsHex(this.buffer[2])} ${intAsHex(this.buffer[1])} 
${intAsHex(this.buffer[0])}`; + } + + static multiply(left: Int128, right: Int128): Int128 { + let rtrn = new Int128(new Uint32Array(left.buffer)); + return rtrn.times(right); + } + + static add(left: Int128, right: Int128): Int128 { + let rtrn = new Int128(new Uint32Array(left.buffer)); + return rtrn.plus(right); + } + + static fromString(str: string, out_buffer = new Uint32Array(4)): Int128 { + // TODO: Assert that out_buffer is 0 and length = 4 + const negate = str.startsWith('-'); + const length = str.length; + + let out = new Int128(out_buffer); + for (let posn = negate ? 1 : 0; posn < length;) { + const group = kInt32DecimalDigits < length - posn ? + kInt32DecimalDigits : length - posn; + const chunk = new Int128(new Uint32Array([parseInt(str.substr(posn, group), 10), 0, 0, 0])); + const multiple = new Int128(new Uint32Array([kPowersOfTen[group], 0, 0, 0])); + + out.times(multiple); + out.plus(chunk); + + posn += group; + } + + return negate ? out.negate() : out; + } +} diff --git a/js/src/util/layout.ts b/js/src/util/layout.ts new file mode 100644 index 0000000000000..c064ee9d7d0b0 --- /dev/null +++ b/js/src/util/layout.ts @@ -0,0 +1,200 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import { TextEncoder } from 'text-encoding-utf-8'; +import { TypedArrayConstructor, TypedArray } from '../vector/types'; + +export function align(value: number, alignment: number) { + return value + padding(value, alignment); +} + +export function padding(value: number, alignment: number) { + return (value % alignment === 0 ? 0 : alignment - value % alignment); +} + +export type NullableLayout = { nullCount: number, validity: Uint8Array }; +export type BufferLayout> = { data: TArray }; +export type DictionaryLayout> = { data: TArray, keys: number[] }; +export type VariableWidthLayout> = { data: TArray, offsets: number[] }; +export type VariableWidthDictionaryLayout> = { data: TArray, keys: number[], offsets: number[] }; + +export type values = ArrayLike; +export type BufferValueWriter = (src: ArrayLike, dst: number[], index: number) => boolean | void; +export type BufferWriter = (values: values, nulls?: ArrayLike) => BufferLayout; +export type BufferLayoutWriter = (write: BufferValueWriter, values: values, nulls?: ArrayLike) => BufferLayout; + +const writeNumeric64Value = writeFixedWidthValue.bind(null, 64); +const writeNumeric128Value = writeFixedWidthValue.bind(null, 128); +const utf8Encoder = new TextEncoder() as { encode: (s: string) => Uint8Array }; + +const stride1Encode = writeValidityLayout.bind(null, writeFixedWidthLayoutWithStride.bind(null, 1)); +const stride1FixedWidth = writeFixedWidthLayout.bind(null, writeValidityLayout.bind(null, stride1Encode)); +const stride2FixedWidth = writeFixedWidthLayout.bind(null, writeValidityLayout.bind(null, writeFixedWidthLayoutWithStride.bind(null, 2))); +const stride4FixedWidth = writeFixedWidthLayout.bind(null, writeValidityLayout.bind(null, writeFixedWidthLayoutWithStride.bind(null, 4))); + +export const writeBools = writeTypedLayout.bind(null, stride1FixedWidth.bind(null, writeBooleanValue), Uint8Array) as (values: values, nulls?: ArrayLike) => BufferLayout; +export const writeInt8s = writeTypedLayout.bind(null, 
stride1FixedWidth.bind(null, writeNumericValue), Int8Array) as (values: values, nulls?: ArrayLike) => BufferLayout; +export const writeInt16s = writeTypedLayout.bind(null, stride1FixedWidth.bind(null, writeNumericValue), Int16Array) as (values: values, nulls?: ArrayLike) => BufferLayout; +export const writeInt32s = writeTypedLayout.bind(null, stride1FixedWidth.bind(null, writeNumericValue), Int32Array) as (values: values, nulls?: ArrayLike) => BufferLayout; +export const writeInt64s = writeTypedLayout.bind(null, stride2FixedWidth.bind(null, writeNumeric64Value), Int32Array) as (values: values, nulls?: ArrayLike) => BufferLayout; +export const writeUint8s = writeTypedLayout.bind(null, stride1FixedWidth.bind(null, writeNumericValue), Uint8Array) as (values: values, nulls?: ArrayLike) => BufferLayout; +export const writeUint16s = writeTypedLayout.bind(null, stride1FixedWidth.bind(null, writeNumericValue), Uint16Array) as (values: values, nulls?: ArrayLike) => BufferLayout; +export const writeUint32s = writeTypedLayout.bind(null, stride1FixedWidth.bind(null, writeNumericValue), Uint32Array) as (values: values, nulls?: ArrayLike) => BufferLayout; +export const writeUint64s = writeTypedLayout.bind(null, stride2FixedWidth.bind(null, writeNumeric64Value), Uint32Array) as (values: values, nulls?: ArrayLike) => BufferLayout; +export const writeDecimals = writeTypedLayout.bind(null, stride4FixedWidth.bind(null, writeNumeric128Value), Uint32Array) as (values: values, nulls?: ArrayLike) => BufferLayout; +export const writeFloat32s = writeTypedLayout.bind(null, stride1FixedWidth.bind(null, writeNumericValue), Float32Array) as (values: values, nulls?: ArrayLike) => BufferLayout; +export const writeFloat64s = writeTypedLayout.bind(null, stride1FixedWidth.bind(null, writeNumericValue), Float64Array) as (values: values, nulls?: ArrayLike) => BufferLayout; +export const writeVariableWidth = writeVariableWidthLayout.bind(null, stride1Encode) as (writeValue: BufferValueWriter, values: 
values, nulls?: ArrayLike) => VariableWidthLayout; +export const writeBinary = writeTypedLayout.bind(null, writeVariableWidth.bind(null, writeBinaryValue)) as (values: values, TNull>, nulls?: ArrayLike) => VariableWidthLayout; +export const writeUtf8s = writeTypedLayout.bind(null, writeVariableWidth.bind(null, writeUtf8Value), Uint8Array) as (values: values, nulls?: ArrayLike) => VariableWidthLayout; +export const writeDictionaryEncoded = writeDictionaryLayout.bind(null, stride1Encode) as (writeValue: BufferValueWriter, values: values, nulls?: ArrayLike) => DictionaryLayout; +export const writeDictionaryEncodedBools = writeTypedLayout.bind(null, writeDictionaryLayout.bind(null, stride1FixedWidth, writeBooleanValue), Uint8Array) as (values: values, nulls?: ArrayLike) => DictionaryLayout; +export const writeDictionaryEncodedInt8s = writeTypedLayout.bind(null, writeDictionaryLayout.bind(null, stride1FixedWidth, writeNumericValue), Int8Array) as (values: values, nulls?: ArrayLike) => DictionaryLayout; +export const writeDictionaryEncodedInt16s = writeTypedLayout.bind(null, writeDictionaryLayout.bind(null, stride1FixedWidth, writeNumericValue), Int16Array) as (values: values, nulls?: ArrayLike) => DictionaryLayout; +export const writeDictionaryEncodedInt32s = writeTypedLayout.bind(null, writeDictionaryLayout.bind(null, stride1FixedWidth, writeNumericValue), Int32Array) as (values: values, nulls?: ArrayLike) => DictionaryLayout; +export const writeDictionaryEncodedInt64s = writeTypedLayout.bind(null, writeDictionaryLayout.bind(null, stride2FixedWidth, writeNumeric64Value), Int32Array) as (values: values, nulls?: ArrayLike) => DictionaryLayout; +export const writeDictionaryEncodedUint8s = writeTypedLayout.bind(null, writeDictionaryLayout.bind(null, stride1FixedWidth, writeNumericValue), Uint8Array) as (values: values, nulls?: ArrayLike) => DictionaryLayout; +export const writeDictionaryEncodedUint16s = writeTypedLayout.bind(null, writeDictionaryLayout.bind(null, 
stride1FixedWidth, writeNumericValue), Uint16Array) as (values: values, nulls?: ArrayLike) => DictionaryLayout; +export const writeDictionaryEncodedUint32s = writeTypedLayout.bind(null, writeDictionaryLayout.bind(null, stride1FixedWidth, writeNumericValue), Uint32Array) as (values: values, nulls?: ArrayLike) => DictionaryLayout; +export const writeDictionaryEncodedUint64s = writeTypedLayout.bind(null, writeDictionaryLayout.bind(null, stride2FixedWidth, writeNumeric64Value), Uint32Array) as (values: values, nulls?: ArrayLike) => DictionaryLayout; +export const writeDictionaryEncodedDecimals = writeTypedLayout.bind(null, writeDictionaryLayout.bind(null, stride4FixedWidth, writeNumeric128Value), Uint32Array) as (values: values, nulls?: ArrayLike) => DictionaryLayout; +export const writeDictionaryEncodedFloat32s = writeTypedLayout.bind(null, writeDictionaryLayout.bind(null, stride1FixedWidth, writeNumericValue), Float32Array) as (values: values, nulls?: ArrayLike) => DictionaryLayout; +export const writeDictionaryEncodedFloat64s = writeTypedLayout.bind(null, writeDictionaryLayout.bind(null, stride1FixedWidth, writeNumericValue), Float64Array) as (values: values, nulls?: ArrayLike) => DictionaryLayout; +export const writeDictionaryEncodedVariableWidth = writeDictionaryLayout.bind(null, writeVariableWidth) as (writeValue: BufferValueWriter, values: values, nulls?: ArrayLike) => VariableWidthDictionaryLayout; +export const writeDictionaryEncodedBinary = writeTypedLayout.bind(null, writeDictionaryEncodedVariableWidth.bind(null, writeBinaryValue)) as (values: values, TNull>, nulls?: ArrayLike) => VariableWidthDictionaryLayout; +export const writeDictionaryEncodedUtf8s = writeTypedLayout.bind(null, writeDictionaryEncodedVariableWidth.bind(null, writeUtf8Value), Uint8Array) as (values: values, nulls?: ArrayLike) => VariableWidthDictionaryLayout; + +function writeFixedWidthLayoutWithStride( + stride: number, + writeValue: BufferValueWriter, + values: values +) { + let index = 
-stride; + const data = [] as number[]; + const length = values.length; + while ((index += stride) < length) { + writeValue(values as ArrayLike, data, index); + } + return { data: data as ArrayLike }; +} + +function writeFixedWidthLayout( + writeLayout: BufferLayoutWriter, + writeValue: BufferValueWriter, + values: values, + nulls?: ArrayLike +) { + return writeLayout(writeValue, values, nulls); +} + +function writeValidityLayout( + writeLayout: BufferLayoutWriter, + writeValue: BufferValueWriter, + values: values, + nulls?: ArrayLike +) { + let nullCount = 0; + let nullsLength = nulls && nulls.length || 0; + let validity = new Uint8Array(align(values.length >>> 3, 8)).fill(255); + return { + ...writeLayout(writeValueOrValidity, values), + nullCount, validity: (nullCount > 0 && validity) || new Uint8Array(0) + } as BufferLayout & NullableLayout; + function writeValueOrValidity(src: ArrayLike, dst: number[], index: number) { + writeValue(src, dst, index); + let i = -1, x = src[index] as T | TNull; + let isNull = x === null || x === undefined; + while (!isNull && ++i < nullsLength) { + isNull = x === nulls![i]; + } + if (isNull) { + nullCount++; + validity[index >> 3] &= ~(1 << (index % 8)); + } + } +} + +function writeVariableWidthLayout( + writeLayout: BufferLayoutWriter, + writeValue: BufferValueWriter, + values: values, + nulls?: ArrayLike +) { + let offsets = [0], offsetsIndex = 0; + return { ...writeLayout(writeValueAndOffset, values, nulls), offsets } as VariableWidthLayout; + function writeValueAndOffset(src: ArrayLike, dst: number[], index: number) { + if (!writeValue(src, dst, index)) { + offsets[++offsetsIndex] = dst.length; + } + } +} + +function writeDictionaryLayout( + writeLayout: BufferLayoutWriter, + writeValue: BufferValueWriter, + values: values, + nulls?: ArrayLike +) { + let keys = [] as number[], keysIndex = 0, keysMap = Object.create(null); + return { ...writeLayout(writeKeysOrValues, values, nulls), keys }; + function writeKeysOrValues(src: 
ArrayLike, dst: number[], index: number) { + const x: any = src[index]; + if (x in keysMap) { + return (keys[index] = keysMap[x]) || true; + } else if (!writeValue(src, dst, index)) { + keys[index] = keysMap[x] = keysIndex++; + } + } +} + +function writeTypedLayout( + writeBuffers: BufferWriter, + ArrayBufferView: TypedArrayConstructor, + values: values, + nulls?: ArrayLike +) { + const result = writeBuffers(values, nulls); + result.data = new ArrayBufferView(result.data); + return result as BufferLayout; +} + +function writeBooleanValue(src: ArrayLike, dst: number[], index: number) { + if (src[index]) { + let i = index >>> 3; + let b = dst[i] || 0; + dst[i] = b | 1 << (index % 8); + } +} + +function writeNumericValue(src: ArrayLike, dst: number[], index: number) { + dst[index] = +src[index]; +} + +function writeFixedWidthValue(bitWidth: number, src: ArrayLike, dst: number[], index: number) { + const bytesLen = bitWidth / 32; + for (let i = -1; ++i < bytesLen;) { + dst[index + i] = src[index + i]; + } +} + +function writeUtf8Value(src: ArrayLike, dst: number[], index: number) { + dst.push(...utf8Encoder.encode(src[index])); +} + +function writeBinaryValue(src: ArrayLike>, dst: number[], index: number) { + dst.push(...src[index]); +} diff --git a/js/src/vector/arrow.ts b/js/src/vector/arrow.ts new file mode 100644 index 0000000000000..64a43bdab1c6b --- /dev/null +++ b/js/src/vector/arrow.ts @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Vector } from './vector'; +import * as vectors from './traits/vectors'; +import * as fieldVectors from './traits/field'; +import * as nullableVectors from './traits/nullable'; +import * as nullableFieldVectors from './traits/nullablefield'; +import { Field, FieldNode } from '../format/arrow'; +import { isFieldArgv, isNullableArgv } from './traits/mixins'; + +function MixinArrowTraits, TArgv>( + Base: new (argv: TArgv) => T, + Field: new (argv: TArgv & { field: Field, fieldNode: FieldNode }) => T, + Nullable: new (argv: TArgv & { validity: Uint8Array }) => T, + NullableField: new (argv: TArgv & { validity: Uint8Array, field: Field, fieldNode: FieldNode }) => T +) { + return function(argv: TArgv | (TArgv & { validity: Uint8Array }) | (TArgv & { field: Field, fieldNode: FieldNode })) { + return new (!isFieldArgv(argv) + ? !isNullableArgv(argv) ? Base : Nullable + : !isNullableArgv(argv) ? 
Field : NullableField + )(argv as any); + } as any as { new (argv: TArgv | (TArgv & { validity: Uint8Array }) | (TArgv & { field: Field, fieldNode: FieldNode })): T }; +} + +export { Vector }; +export const MixinListVector = MixinArrowTraits(vectors.ListVector as any, fieldVectors.ListVector as any, nullableVectors.ListVector as any, nullableFieldVectors.ListVector as any); +export class ListVector extends MixinListVector {} +export const MixinBinaryVector = MixinArrowTraits(vectors.BinaryVector as any, fieldVectors.BinaryVector as any, nullableVectors.BinaryVector as any, nullableFieldVectors.BinaryVector as any); +export class BinaryVector extends MixinBinaryVector {} +export const MixinUtf8Vector = MixinArrowTraits(vectors.Utf8Vector as any, fieldVectors.Utf8Vector as any, nullableVectors.Utf8Vector as any, nullableFieldVectors.Utf8Vector as any); +export class Utf8Vector extends MixinUtf8Vector {} +export const MixinBoolVector = MixinArrowTraits(vectors.BoolVector as any, fieldVectors.BoolVector as any, nullableVectors.BoolVector as any, nullableFieldVectors.BoolVector as any); +export class BoolVector extends MixinBoolVector {} +export const MixinInt8Vector = MixinArrowTraits(vectors.Int8Vector as any, fieldVectors.Int8Vector as any, nullableVectors.Int8Vector as any, nullableFieldVectors.Int8Vector as any); +export class Int8Vector extends MixinInt8Vector {} +export const MixinInt16Vector = MixinArrowTraits(vectors.Int16Vector as any, fieldVectors.Int16Vector as any, nullableVectors.Int16Vector as any, nullableFieldVectors.Int16Vector as any); +export class Int16Vector extends MixinInt16Vector {} +export const MixinInt32Vector = MixinArrowTraits(vectors.Int32Vector as any, fieldVectors.Int32Vector as any, nullableVectors.Int32Vector as any, nullableFieldVectors.Int32Vector as any); +export class Int32Vector extends MixinInt32Vector {} +export const MixinInt64Vector = MixinArrowTraits(vectors.Int64Vector as any, fieldVectors.Int64Vector as any, 
nullableVectors.Int64Vector as any, nullableFieldVectors.Int64Vector as any); +export class Int64Vector extends MixinInt64Vector {} +export const MixinUint8Vector = MixinArrowTraits(vectors.Uint8Vector as any, fieldVectors.Uint8Vector as any, nullableVectors.Uint8Vector as any, nullableFieldVectors.Uint8Vector as any); +export class Uint8Vector extends MixinUint8Vector {} +export const MixinUint16Vector = MixinArrowTraits(vectors.Uint16Vector as any, fieldVectors.Uint16Vector as any, nullableVectors.Uint16Vector as any, nullableFieldVectors.Uint16Vector as any); +export class Uint16Vector extends MixinUint16Vector {} +export const MixinUint32Vector = MixinArrowTraits(vectors.Uint32Vector as any, fieldVectors.Uint32Vector as any, nullableVectors.Uint32Vector as any, nullableFieldVectors.Uint32Vector as any); +export class Uint32Vector extends MixinUint32Vector {} +export const MixinUint64Vector = MixinArrowTraits(vectors.Uint64Vector as any, fieldVectors.Uint64Vector as any, nullableVectors.Uint64Vector as any, nullableFieldVectors.Uint64Vector as any); +export class Uint64Vector extends MixinUint64Vector {} +export const MixinDate32Vector = MixinArrowTraits(vectors.Date32Vector as any, fieldVectors.Date32Vector as any, nullableVectors.Date32Vector as any, nullableFieldVectors.Date32Vector as any); +export class Date32Vector extends MixinDate32Vector {} +export const MixinDate64Vector = MixinArrowTraits(vectors.Date64Vector as any, fieldVectors.Date64Vector as any, nullableVectors.Date64Vector as any, nullableFieldVectors.Date64Vector as any); +export class Date64Vector extends MixinDate64Vector {} +export const MixinTime32Vector = MixinArrowTraits(vectors.Time32Vector as any, fieldVectors.Time32Vector as any, nullableVectors.Time32Vector as any, nullableFieldVectors.Time32Vector as any); +export class Time32Vector extends MixinTime32Vector {} +export const MixinTime64Vector = MixinArrowTraits(vectors.Time64Vector as any, fieldVectors.Time64Vector as any, 
nullableVectors.Time64Vector as any, nullableFieldVectors.Time64Vector as any); +export class Time64Vector extends MixinTime64Vector {} +export const MixinFloat16Vector = MixinArrowTraits(vectors.Float16Vector as any, fieldVectors.Float16Vector as any, nullableVectors.Float16Vector as any, nullableFieldVectors.Float16Vector as any); +export class Float16Vector extends MixinFloat16Vector {} +export const MixinFloat32Vector = MixinArrowTraits(vectors.Float32Vector as any, fieldVectors.Float32Vector as any, nullableVectors.Float32Vector as any, nullableFieldVectors.Float32Vector as any); +export class Float32Vector extends MixinFloat32Vector {} +export const MixinFloat64Vector = MixinArrowTraits(vectors.Float64Vector as any, fieldVectors.Float64Vector as any, nullableVectors.Float64Vector as any, nullableFieldVectors.Float64Vector as any); +export class Float64Vector extends MixinFloat64Vector {} +export const MixinStructVector = MixinArrowTraits(vectors.StructVector as any, fieldVectors.StructVector as any, nullableVectors.StructVector as any, nullableFieldVectors.StructVector as any); +export class StructVector extends MixinStructVector {} +export const MixinDecimalVector = MixinArrowTraits(vectors.DecimalVector as any, fieldVectors.DecimalVector as any, nullableVectors.DecimalVector as any, nullableFieldVectors.DecimalVector as any); +export class DecimalVector extends MixinDecimalVector {} +export const MixinTimestampVector = MixinArrowTraits(vectors.TimestampVector as any, fieldVectors.TimestampVector as any, nullableVectors.TimestampVector as any, nullableFieldVectors.TimestampVector as any); +export class TimestampVector extends MixinTimestampVector {} +export const MixinDictionaryVector = MixinArrowTraits(vectors.DictionaryVector as any, fieldVectors.DictionaryVector as any, nullableVectors.DictionaryVector as any, nullableFieldVectors.DictionaryVector as any); +export class DictionaryVector extends MixinDictionaryVector {} +export const 
MixinFixedSizeListVector = MixinArrowTraits(vectors.FixedSizeListVector as any, fieldVectors.FixedSizeListVector as any, nullableVectors.FixedSizeListVector as any, nullableFieldVectors.FixedSizeListVector as any); +export class FixedSizeListVector extends MixinFixedSizeListVector {} diff --git a/js/src/vector/dictionary.ts b/js/src/vector/dictionary.ts index de811eaf5b050..b7375c0904276 100644 --- a/js/src/vector/dictionary.ts +++ b/js/src/vector/dictionary.ts @@ -16,36 +16,33 @@ // under the License. import { Vector } from './vector'; +import { VirtualVector } from './virtual'; export class DictionaryVector extends Vector { - protected data: Vector; - protected keys: Vector; - constructor(index: Vector, dictionary: Vector) { + readonly length: number; + readonly data: Vector; + readonly keys: Vector; + constructor(argv: { data: Vector, keys: Vector }) { super(); - this.keys = index; - this.data = dictionary; - this.length = index && index.length || 0; + this.data = argv.data; + this.keys = argv.keys; + this.length = this.keys.length; } - index(index: number) { - return this.keys.get(index); + get(index: number) { + return this.getValue(this.getKey(index)!); } - value(index: number) { - return this.data.get(index); + getKey(index: number) { + return this.keys.get(index); } - get(index: number) { - return this.value(this.index(index)); + getValue(key: number) { + return this.data.get(key); } - concat(vector: DictionaryVector) { - return DictionaryVector.from(this, - this.length + vector.length, - this.keys.concat(vector.keys), - this.data - ); + concat(...vectors: Vector[]): Vector { + return new VirtualVector(Array, this, ...vectors); } *[Symbol.iterator]() { - let { data } = this; - for (const loc of this.keys) { - yield data.get(loc); + for (let i = -1, n = this.length; ++i < n;) { + yield this.get(i); } } } diff --git a/js/src/vector/list.ts b/js/src/vector/list.ts index 7360d968b0250..97913f8d8878c 100644 --- a/js/src/vector/list.ts +++ b/js/src/vector/list.ts 
@@ -15,94 +15,60 @@ // specific language governing permissions and limitations // under the License. +import { List } from './types'; import { Vector } from './vector'; -import { TextDecoder } from 'text-encoding'; -import { IndexVector, BitVector, ValidityArgs } from './typed'; +import { VirtualVector } from './virtual'; -export class ListVectorBase extends Vector { - protected values: Vector; - protected offsets: IndexVector; - constructor(validity: ValidityArgs, values: Vector, offsets: IndexVector) { +export class BinaryVector extends Vector { + readonly data: Uint8Array; + readonly offsets: Int32Array; + constructor(argv: { offsets: Int32Array, data: Uint8Array }) { super(); - this.values = values; - this.offsets = offsets; - validity && (this.validity = BitVector.from(validity)); + this.data = argv.data; + this.offsets = argv.offsets; } get(index: number) { - let batch, from, to, { offsets } = this; - if (!this.validity.get(index) || - /* return null if `to` is null */ - ((to = offsets.get(index + 1)) === null) || !( - /* - return null if `batch` is less than than 0. this check is placed - second to avoid creating the [from, batch] tuple if `to` is null - */ - ([from, batch] = offsets.get(index, true) as number[]) && batch > -1)) { - return null; - } - return this.values.slice(from, to, batch) as any; - } - concat(vector: ListVectorBase) { - return (this.constructor as typeof ListVectorBase).from(this, - this.length + vector.length, - this.validity.concat(vector.validity), - this.values.concat(vector.values), - this.offsets.concat(vector.offsets) - ); + return this.data.subarray(this.offsets[index], this.offsets[index + 1]); } - *[Symbol.iterator]() { - let v, r1, r2, { values } = this; - let it = this.offsets[Symbol.iterator](); - let iv = this.validity[Symbol.iterator](); - while (!(v = iv.next()).done && !(r1 = it.next()).done && !(r2 = it.next()).done) { - yield !v.value ? 
null : values.slice(r1.value[0], r2.value, r1.value[1]) as any; - } + concat(...vectors: Vector[]): Vector { + return new VirtualVector(Array, this, ...vectors); } } -export class ListVector extends ListVectorBase {} -export class Utf8Vector extends ListVectorBase { - protected static decoder = new TextDecoder(`utf-8`); - get(index: number) { - let chars = super.get(index) as any; - return chars ? Utf8Vector.decoder.decode(chars) : null; +export class ListVector extends Vector { + readonly offsets: Int32Array; + readonly values: Vector; + constructor(argv: { offsets: Int32Array, values: Vector }) { + super(); + this.values = argv.values; + this.offsets = argv.offsets; } - *[Symbol.iterator]() { - let decoder = Utf8Vector.decoder; - for (const chars of super[Symbol.iterator]()) { - yield !chars ? null : decoder.decode(chars); + get(index: number) { + const { offsets, values } = this; + const from = offsets[index]; + const xs = new Array(offsets[index + 1] - from); + for (let i = -1, n = xs.length; ++i < n;) { + xs[i] = values.get(i + from); } + return xs; + } + concat(...vectors: Vector[]): Vector { + return new VirtualVector(Array, this, ...vectors); } } -export class FixedSizeListVector extends Vector { - protected size: number; - protected values: Vector; - constructor(size: number, validity: ValidityArgs, values: Vector) { +export class FixedSizeListVector> extends Vector { + readonly size: number; + readonly values: Vector; + constructor(argv: { size: number, values: Vector }) { super(); - this.values = values; - this.size = Math.abs(size | 0) || 1; - validity && (this.validity = BitVector.from(validity)); + this.size = argv.size; + this.values = argv.values; } get(index: number) { - return !this.validity.get(index) ? 
null : this.values.slice( - this.size * index, this.size * (index + 1) - ) as T[]; + return this.values.slice(this.size * index, this.size * (index + 1)); } - concat(vector: FixedSizeListVector) { - return FixedSizeListVector.from(this, - this.length + vector.length, - this.size, - this.validity.concat(vector.validity), - this.values.concat(vector.values) - ); - } - *[Symbol.iterator]() { - let v, i = -1; - let { size, length, values } = this; - let iv = this.validity[Symbol.iterator](); - while (!(v = iv.next()).done && ++i < length) { - yield !v.value ? null : values.slice(size * i, size * (i + 1)) as T[]; - } + concat(...vectors: Vector[]): Vector { + return new VirtualVector(Array, this, ...vectors); } } diff --git a/js/src/vector/numeric.ts b/js/src/vector/numeric.ts new file mode 100644 index 0000000000000..830d6082bcc4a --- /dev/null +++ b/js/src/vector/numeric.ts @@ -0,0 +1,169 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import { Vector } from './vector'; +import { VirtualVector } from './virtual'; +import { TypedArray, TypedArrayConstructor } from './types'; + +export class NumericVector extends Vector { + readonly data: TArray; + readonly length: number; + readonly stride: number; + constructor(argv: { data: TArray }) { + super(); + const data = (ArrayBuffer.isView(argv) ? argv : argv.data) as TArray; + this.length = ((this.data = data).length / this.stride) | 0; + } + get(index: number) { + return this.data[index] as any; + } + concat(...vectors: Vector[]): Vector { + return new VirtualVector(this.data.constructor as TypedArrayConstructor, this, ...vectors); + } + slice(start?: number, end?: number): R { + const { data, stride } = this, from = start! | 0; + const to = end === undefined ? data.length : Math.max(end | 0, from); + return data.subarray(Math.min(from, to) * stride | 0, to * stride | 0) as any as R; + } +} + +export class FixedWidthNumericVector extends NumericVector { + get(index: number) { + return this.data.slice(this.stride * index, this.stride * (index + 1)) as TArray; + } +} + +export class BoolVector extends NumericVector { + static pack(values: Iterable) { + let n = 0, i = 0; + let xs: number[] = []; + let bit = 0, byte = 0; + for (const value of values) { + value && (byte |= 1 << bit); + if (++bit === 8) { + xs[i++] = byte; + byte = bit = 0; + } + } + if (i === 0 || bit > 0) { xs[i++] = byte; } + if (i % 8 && (n = i + 8 - i % 8)) { + do { xs[i] = 0; } while (++i < n); + } + return new Uint8Array(xs); + } + get(index: number) { + return (this.data[index >> 3] & 1 << index % 8) !== 0; + } + set(index: number, value: boolean) { + if (index > -1 === false) { + return; + } else if (value) { + this.data[index >> 3] |= (1 << (index % 8)); + } else { + this.data[index >> 3] &= ~(1 << (index % 8)); + } + } +} + +export class Int8Vector extends NumericVector {} +export class Int16Vector extends NumericVector {} +export class Int32Vector extends NumericVector {} 
+export class Int64Vector extends FixedWidthNumericVector {} + +export class Uint8Vector extends NumericVector {} +export class Uint16Vector extends NumericVector {} +export class Uint32Vector extends NumericVector {} +export class Uint64Vector extends FixedWidthNumericVector {} + +export class Float16Vector extends NumericVector { + get(index: number) { + return Math.min((super.get(index)! - 32767) / 32767, 1); + } +} + +export class Float32Vector extends NumericVector {} +export class Float64Vector extends NumericVector {} + +export class Date32Vector extends NumericVector { + public readonly unit: string; + constructor(argv: { data: Int32Array, unit: string }) { + super(argv); + this.unit = argv.unit; + } + get(index: number): Date { + return new Date(86400000 * (super.get(index) as any)); + } +} + +export class Date64Vector extends NumericVector { + public readonly unit: string; + constructor(argv: { unit: string, data: Int32Array }) { + super(argv); + this.unit = argv.unit; + } + get(index: number): Date { + return new Date(4294967296 * /* 2^32 */ + (super.get(index * 2 + 1) as any) + /* high word, kept signed */ + ((super.get(index * 2) as any) >>> 0) /* low word: >>> 0 reinterprets the signed Int32 slot as unsigned so a set bit 31 is not subtracted */ + ); + } +} + +export class Time32Vector extends NumericVector { + public readonly unit: string; + constructor(argv: { data: Int32Array, unit: string }) { + super(argv); + this.unit = argv.unit; + } +} + +export class Time64Vector extends FixedWidthNumericVector { + public readonly unit: string; + constructor(argv: { unit: string, data: Uint32Array }) { + super(argv); + this.unit = argv.unit; + } +} + +export class DecimalVector extends FixedWidthNumericVector { + readonly scale: number; + readonly precision: number; + constructor(argv: { precision: number, scale: number, data: Uint32Array }) { + super(argv); + this.scale = argv.scale; + this.precision = argv.precision; + } +} + +export class TimestampVector extends FixedWidthNumericVector { + readonly unit: string; + readonly timezone: string; + constructor(argv: { unit: 
string, timezone: string, data: Uint32Array }) { + super(argv); + this.unit = argv.unit; + this.timezone = argv.timezone; + } +} + +export interface NumericVectorConstructor { + readonly prototype: NumericVector; + new (argv: { data: TArray }): NumericVector; +} + +(DecimalVector.prototype as any).stride = 4; +(NumericVector.prototype as any).stride = 1; +(FixedWidthNumericVector.prototype as any).stride = 2; diff --git a/js/src/vector/struct.ts b/js/src/vector/struct.ts index e59ac91e9cd08..c43f6efc48fbe 100644 --- a/js/src/vector/struct.ts +++ b/js/src/vector/struct.ts @@ -16,24 +16,112 @@ // under the License. import { Vector } from './vector'; -import { BitVector, ValidityArgs } from './typed'; +import { VirtualVector } from './virtual'; -export class StructVector extends Vector { - protected vectors: Vector[]; - constructor(validity: ValidityArgs, ...vectors: Vector[]) { +export class StructVector extends Vector> { + readonly length: number; + readonly columns: Vector[]; + constructor(argv: { columns: Vector[] }) { super(); - this.vectors = vectors; - this.length = Math.max(0, ...vectors.map((v) => v.length)); - validity && (this.validity = BitVector.from(validity)); + this.columns = argv.columns || []; + } + get(index: number): StructRow { + return new StructRow(this, index); + } + col(name: string) { + return this.columns.find((col) => col.name === name) || null; + } + key(index: number) { + return this.columns[index] ? this.columns[index].name : null; + } + select(...columns: string[]) { + return new StructVector({ columns: columns.map((name) => this.col(name)!) }); + } + concat(...structs: Vector>[]): Vector> { + return new VirtualVector(Array, this, ...structs as any[]); + } + toString(options?: any) { + const index = typeof options === 'object' ? options && !!options.index + : typeof options === 'boolean' ? 
!!options + : false; + const { length } = this; + if (length <= 0) { return ''; } + const rows = new Array(length + 1); + const maxColumnWidths = [] as number[]; + rows[0] = this.columns.map((_, i) => this.key(i)); + index && rows[0].unshift('Index'); + for (let i = -1, n = rows.length - 1; ++i < n;) { + rows[i + 1] = [...this.get(i)!]; + index && rows[i + 1].unshift(i); + } + // Pass one to convert to strings and count max column widths + for (let i = -1, n = rows.length; ++i < n;) { + const row = rows[i]; + for (let j = -1, k = row.length; ++j < k;) { + const val = row[j] = stringify(row[j]); + maxColumnWidths[j] = !maxColumnWidths[j] + ? val.length + : Math.max(maxColumnWidths[j], val.length); + } + } + // Pass two to pad each one to max column width + for (let i = -1, n = rows.length; ++i < n;) { + const row = rows[i]; + for (let j = -1, k = row.length; ++j < k;) { + row[j] = leftPad(row[j], ' ', maxColumnWidths[j]); + } + rows[i] = row.join(', '); + } + return rows.join('\n'); + } +} + +export class StructRow extends Vector { + readonly row: number; + readonly length: number; + readonly table: StructVector; + [Symbol.toStringTag]() { return 'Row'; } + constructor(table: StructVector, row: number) { + super(); + this.row = row; + this.table = table; + this.length = table.columns.length; } get(index: number) { - return this.validity.get(index) ? this.vectors.map((v) => v.get(index)) : null; + const col = this.table.columns[index]; + return col ? col.get(this.row) as T : null; + } + col(key: string) { + const col = this.table.col(key); + return col ? col.get(this.row) as T : null; + } + *[Symbol.iterator]() { + const { row } = this; + for (const col of this.table.columns) { + yield col ? 
col.get(row) : null; + } + } + concat(...rows: Vector[]): Vector { + return new VirtualVector(Array, this, ...rows as any[]); } - concat(vector: StructVector) { - return StructVector.from(this, - this.length + vector.length, - this.validity.concat(vector.validity), - ...this.vectors.map((v, i) => v.concat(vector.vectors[i])) - ); + toArray() { return [...this]; } + toJSON() { return this.toArray(); } + toString() { return JSON.stringify(this); } + toObject(): Record { + const { row } = this, map = Object.create(null); + for (const col of this.table.columns) { + if (col && col.name) { + map[col.name] = col.get(row); + } + } + return map; } } + +function leftPad(str: string, fill: string, n: number) { + return (new Array(n + 1).join(fill) + str).slice(-1 * n); +} + +function stringify(x: any) { + return Array.isArray(x) ? JSON.stringify(x) : ArrayBuffer.isView(x) ? `[${x}]` : `${x}`; +} diff --git a/js/src/vector/table.ts b/js/src/vector/table.ts new file mode 100644 index 0000000000000..ca2b66a22da80 --- /dev/null +++ b/js/src/vector/table.ts @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import { Vector } from './vector'; +import { StructVector, StructRow } from './struct'; +import { read, readAsync } from '../reader/arrow'; + +function concatVectors(tableVectors: Vector[], batchVectors: Vector[]) { + return tableVectors.length === 0 ? batchVectors : batchVectors.map((vec, i, _vs, col = tableVectors[i]) => + vec && col && col.concat(vec) || col || vec + ) as Vector[]; +} + +export class Table extends StructVector { + static from(sources?: Iterable | object | string) { + let columns: Vector[] = []; + if (sources) { + for (let vectors of read(sources)) { + columns = concatVectors(columns, vectors); + } + } + return new Table({ columns }); + } + static async fromAsync(sources?: AsyncIterable) { + let columns: Vector[] = []; + if (sources) { + for await (let vectors of readAsync(sources)) { + columns = concatVectors(columns, vectors); /* fold this batch's vectors into the running columns, index by index */ + } + } + return new Table({ columns }); + } + readonly length: number; + constructor(argv: { columns: Vector[] }) { + super(argv); + this.length = Math.max(...this.columns.map((col) => col.length)) | 0; + } + get(index: number): TableRow { + return new TableRow(this, index); + } +} + +export class TableRow extends StructRow { + toString() { + return this.toArray().map((x) => JSON.stringify(x)).join(', '); + } +} diff --git a/js/src/vector/traits/field.ts b/js/src/vector/traits/field.ts new file mode 100644 index 0000000000000..9f68f507c6f10 --- /dev/null +++ b/js/src/vector/traits/field.ts @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Vector } from '../vector'; +import * as vectors from './vectors'; +import { fieldMixin } from './mixins'; +import { Field, FieldNode } from '../../format/arrow'; +export { Field, FieldNode, Vector }; + +export const FieldListVector = fieldMixin(vectors.ListVector); +export class ListVector extends FieldListVector {} +export const FieldBinaryVector = fieldMixin(vectors.BinaryVector); +export class BinaryVector extends FieldBinaryVector {} +export const FieldUtf8Vector = fieldMixin(vectors.Utf8Vector); +export class Utf8Vector extends FieldUtf8Vector {} +export const FieldBoolVector = fieldMixin(vectors.BoolVector); +export class BoolVector extends FieldBoolVector {} +export const FieldInt8Vector = fieldMixin(vectors.Int8Vector); +export class Int8Vector extends FieldInt8Vector {} +export const FieldInt16Vector = fieldMixin(vectors.Int16Vector); +export class Int16Vector extends FieldInt16Vector {} +export const FieldInt32Vector = fieldMixin(vectors.Int32Vector); +export class Int32Vector extends FieldInt32Vector {} +export const FieldInt64Vector = fieldMixin(vectors.Int64Vector); +export class Int64Vector extends FieldInt64Vector {} +export const FieldUint8Vector = fieldMixin(vectors.Uint8Vector); +export class Uint8Vector extends FieldUint8Vector {} +export const FieldUint16Vector = fieldMixin(vectors.Uint16Vector); +export class Uint16Vector extends FieldUint16Vector {} +export const FieldUint32Vector = fieldMixin(vectors.Uint32Vector); +export class Uint32Vector extends FieldUint32Vector {} +export const FieldUint64Vector = 
fieldMixin(vectors.Uint64Vector); +export class Uint64Vector extends FieldUint64Vector {} +export const FieldDate32Vector = fieldMixin(vectors.Date32Vector); +export class Date32Vector extends FieldDate32Vector {} +export const FieldDate64Vector = fieldMixin(vectors.Date64Vector); +export class Date64Vector extends FieldDate64Vector {} +export const FieldTime32Vector = fieldMixin(vectors.Time32Vector); +export class Time32Vector extends FieldTime32Vector {} +export const FieldTime64Vector = fieldMixin(vectors.Time64Vector); +export class Time64Vector extends FieldTime64Vector {} +export const FieldFloat16Vector = fieldMixin(vectors.Float16Vector); +export class Float16Vector extends FieldFloat16Vector {} +export const FieldFloat32Vector = fieldMixin(vectors.Float32Vector); +export class Float32Vector extends FieldFloat32Vector {} +export const FieldFloat64Vector = fieldMixin(vectors.Float64Vector); +export class Float64Vector extends FieldFloat64Vector {} +export const FieldStructVector = fieldMixin(vectors.StructVector); +export class StructVector extends FieldStructVector {} +export const FieldDecimalVector = fieldMixin(vectors.DecimalVector); +export class DecimalVector extends FieldDecimalVector {} +export const FieldTimestampVector = fieldMixin(vectors.TimestampVector); +export class TimestampVector extends FieldTimestampVector {} +export const FieldDictionaryVector = fieldMixin(vectors.DictionaryVector); +export class DictionaryVector extends FieldDictionaryVector {} +export const FieldFixedSizeListVector = fieldMixin(vectors.FixedSizeListVector); +export class FixedSizeListVector extends FieldFixedSizeListVector {} \ No newline at end of file diff --git a/js/src/vector/traits/mixins.ts b/js/src/vector/traits/mixins.ts new file mode 100644 index 0000000000000..011ac31352db6 --- /dev/null +++ b/js/src/vector/traits/mixins.ts @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Vector } from '../vector'; +import { BoolVector } from '../numeric'; +import * as Schema_ from '../../format/fb/Schema'; +import { Field, FieldNode } from '../../format/arrow'; +import Type = Schema_.org.apache.arrow.flatbuf.Type; + +function isField(x: any): x is Field { + return x instanceof Field; +} + +function isFieldNode(x: any): x is FieldNode { + return x instanceof FieldNode; +} + +export function isFieldArgv(x: any): x is { field: Field, fieldNode: FieldNode } { + return x && isField(x.field) && isFieldNode(x.fieldNode); +} + +export function isNullableArgv(x: any): x is { validity: Uint8Array } { + return x && x.validity && ArrayBuffer.isView(x.validity) && x.validity instanceof Uint8Array; +} + +type Ctor = new (argv: TArgv) => Vector; + +export const nullableMixin = (superclass: new (argv: TArgv) => T) => + class extends (superclass as Ctor) { + readonly validity: Vector; + constructor(argv: TArgv & { validity: Uint8Array }) { + super(argv); + this.validity = new BoolVector({ data: argv.validity }); + } + get(index: number) { + return this.validity.get(index) ? 
super.get(index) : null; + } + }; + +export const fieldMixin = (superclass: new (argv: TArgv) => T) => + class extends (superclass as Ctor) implements Vector { + readonly field: Field; + readonly type: string; + readonly length: number; + readonly stride: number; + readonly nullable: boolean; + readonly nullCount: number; + readonly fieldNode: FieldNode; + constructor(argv: TArgv & { field: Field, fieldNode: FieldNode }) { + super(argv); + const { field, fieldNode } = argv; + this.field = field; + this.fieldNode = fieldNode; + this.nullable = field.nullable; + this.type = Type[field.typeType]; + this.length = fieldNode.length.low | 0; + this.nullCount = fieldNode.nullCount.low; + } + get name() { return this.field.name!; } + get metadata() { return this.field.metadata!; } + }; diff --git a/js/src/vector/traits/nullable.ts b/js/src/vector/traits/nullable.ts new file mode 100644 index 0000000000000..1393e5fd1bf68 --- /dev/null +++ b/js/src/vector/traits/nullable.ts @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import { Vector } from '../vector'; +import * as vectors from './vectors'; +import { nullableMixin } from './mixins'; + +export { Vector }; +export const NullableListVector = nullableMixin(vectors.ListVector); +export class ListVector extends NullableListVector {} +export const NullableBinaryVector = nullableMixin(vectors.BinaryVector); +export class BinaryVector extends NullableBinaryVector {} +export const NullableUtf8Vector = nullableMixin(vectors.Utf8Vector); +export class Utf8Vector extends NullableUtf8Vector {} +export const NullableBoolVector = nullableMixin(vectors.BoolVector); +export class BoolVector extends NullableBoolVector {} +export const NullableInt8Vector = nullableMixin(vectors.Int8Vector); +export class Int8Vector extends NullableInt8Vector {} +export const NullableInt16Vector = nullableMixin(vectors.Int16Vector); +export class Int16Vector extends NullableInt16Vector {} +export const NullableInt32Vector = nullableMixin(vectors.Int32Vector); +export class Int32Vector extends NullableInt32Vector {} +export const NullableInt64Vector = nullableMixin(vectors.Int64Vector); +export class Int64Vector extends NullableInt64Vector {} +export const NullableUint8Vector = nullableMixin(vectors.Uint8Vector); +export class Uint8Vector extends NullableUint8Vector {} +export const NullableUint16Vector = nullableMixin(vectors.Uint16Vector); +export class Uint16Vector extends NullableUint16Vector {} +export const NullableUint32Vector = nullableMixin(vectors.Uint32Vector); +export class Uint32Vector extends NullableUint32Vector {} +export const NullableUint64Vector = nullableMixin(vectors.Uint64Vector); +export class Uint64Vector extends NullableUint64Vector {} +export const NullableDate32Vector = nullableMixin(vectors.Date32Vector); +export class Date32Vector extends NullableDate32Vector {} +export const NullableDate64Vector = nullableMixin(vectors.Date64Vector); +export class Date64Vector extends NullableDate64Vector {} +export const NullableTime32Vector = 
nullableMixin(vectors.Time32Vector); +export class Time32Vector extends NullableTime32Vector {} +export const NullableTime64Vector = nullableMixin(vectors.Time64Vector); +export class Time64Vector extends NullableTime64Vector {} +export const NullableFloat16Vector = nullableMixin(vectors.Float16Vector); +export class Float16Vector extends NullableFloat16Vector {} +export const NullableFloat32Vector = nullableMixin(vectors.Float32Vector); +export class Float32Vector extends NullableFloat32Vector {} +export const NullableFloat64Vector = nullableMixin(vectors.Float64Vector); +export class Float64Vector extends NullableFloat64Vector {} +export const NullableStructVector = nullableMixin(vectors.StructVector); +export class StructVector extends NullableStructVector {} +export const NullableDecimalVector = nullableMixin(vectors.DecimalVector); +export class DecimalVector extends NullableDecimalVector {} +export const NullableTimestampVector = nullableMixin(vectors.TimestampVector); +export class TimestampVector extends NullableTimestampVector {} +export const NullableDictionaryVector = nullableMixin(vectors.DictionaryVector); +export class DictionaryVector extends NullableDictionaryVector {} +export const NullableFixedSizeListVector = nullableMixin(vectors.FixedSizeListVector); +export class FixedSizeListVector extends NullableFixedSizeListVector {} \ No newline at end of file diff --git a/js/src/vector/traits/nullablefield.ts b/js/src/vector/traits/nullablefield.ts new file mode 100644 index 0000000000000..8cbee62e43dc3 --- /dev/null +++ b/js/src/vector/traits/nullablefield.ts @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Vector } from '../vector'; +import * as vectors from './vectors'; +import { nullableMixin, fieldMixin } from './mixins'; +import { Field, FieldNode } from '../../format/arrow'; +export { Vector, Field, FieldNode }; + +export const NullableFieldListVector = nullableMixin(fieldMixin(vectors.ListVector)); +export class ListVector extends NullableFieldListVector {} +export const NullableFieldBinaryVector = nullableMixin(fieldMixin(vectors.BinaryVector)); +export class BinaryVector extends NullableFieldBinaryVector {} +export const NullableFieldUtf8Vector = nullableMixin(fieldMixin(vectors.Utf8Vector)); +export class Utf8Vector extends NullableFieldUtf8Vector {} +export const NullableFieldBoolVector = nullableMixin(fieldMixin(vectors.BoolVector)); +export class BoolVector extends NullableFieldBoolVector {} +export const NullableFieldInt8Vector = nullableMixin(fieldMixin(vectors.Int8Vector)); +export class Int8Vector extends NullableFieldInt8Vector {} +export const NullableFieldInt16Vector = nullableMixin(fieldMixin(vectors.Int16Vector)); +export class Int16Vector extends NullableFieldInt16Vector {} +export const NullableFieldInt32Vector = nullableMixin(fieldMixin(vectors.Int32Vector)); +export class Int32Vector extends NullableFieldInt32Vector {} +export const NullableFieldInt64Vector = nullableMixin(fieldMixin(vectors.Int64Vector)); +export class Int64Vector extends 
NullableFieldInt64Vector {} +export const NullableFieldUint8Vector = nullableMixin(fieldMixin(vectors.Uint8Vector)); +export class Uint8Vector extends NullableFieldUint8Vector {} +export const NullableFieldUint16Vector = nullableMixin(fieldMixin(vectors.Uint16Vector)); +export class Uint16Vector extends NullableFieldUint16Vector {} +export const NullableFieldUint32Vector = nullableMixin(fieldMixin(vectors.Uint32Vector)); +export class Uint32Vector extends NullableFieldUint32Vector {} +export const NullableFieldUint64Vector = nullableMixin(fieldMixin(vectors.Uint64Vector)); +export class Uint64Vector extends NullableFieldUint64Vector {} +export const NullableFieldDate32Vector = nullableMixin(fieldMixin(vectors.Date32Vector)); +export class Date32Vector extends NullableFieldDate32Vector {} +export const NullableFieldDate64Vector = nullableMixin(fieldMixin(vectors.Date64Vector)); +export class Date64Vector extends NullableFieldDate64Vector {} +export const NullableFieldTime32Vector = nullableMixin(fieldMixin(vectors.Time32Vector)); +export class Time32Vector extends NullableFieldTime32Vector {} +export const NullableFieldTime64Vector = nullableMixin(fieldMixin(vectors.Time64Vector)); +export class Time64Vector extends NullableFieldTime64Vector {} +export const NullableFieldFloat16Vector = nullableMixin(fieldMixin(vectors.Float16Vector)); +export class Float16Vector extends NullableFieldFloat16Vector {} +export const NullableFieldFloat32Vector = nullableMixin(fieldMixin(vectors.Float32Vector)); +export class Float32Vector extends NullableFieldFloat32Vector {} +export const NullableFieldFloat64Vector = nullableMixin(fieldMixin(vectors.Float64Vector)); +export class Float64Vector extends NullableFieldFloat64Vector {} +export const NullableFieldStructVector = nullableMixin(fieldMixin(vectors.StructVector)); +export class StructVector extends NullableFieldStructVector {} +export const NullableFieldDecimalVector = nullableMixin(fieldMixin(vectors.DecimalVector)); +export 
class DecimalVector extends NullableFieldDecimalVector {} +export const NullableFieldTimestampVector = nullableMixin(fieldMixin(vectors.TimestampVector)); +export class TimestampVector extends NullableFieldTimestampVector {} +export const NullableFieldDictionaryVector = nullableMixin(fieldMixin(vectors.DictionaryVector)); +export class DictionaryVector extends NullableFieldDictionaryVector {} +export const NullableFieldFixedSizeListVector = nullableMixin(fieldMixin(vectors.FixedSizeListVector)); +export class FixedSizeListVector extends NullableFieldFixedSizeListVector {} \ No newline at end of file diff --git a/js/src/vector/traits/vectors.ts b/js/src/vector/traits/vectors.ts new file mode 100644 index 0000000000000..f9e05fd4eff1b --- /dev/null +++ b/js/src/vector/traits/vectors.ts @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import { Vector } from '../vector'; +import { Utf8Vector } from '../utf8'; +import { StructVector } from '../struct'; +import { DictionaryVector } from '../dictionary'; +import { + ListVector, + BinaryVector, + FixedSizeListVector +} from '../list'; + +import { + BoolVector, + Int8Vector, + Int16Vector, + Int32Vector, + Int64Vector, + Uint8Vector, + Uint16Vector, + Uint32Vector, + Uint64Vector, + Float16Vector, + Float32Vector, + Float64Vector, + Date32Vector, + Date64Vector, + Time32Vector, + Time64Vector, + DecimalVector, + TimestampVector, +} from '../numeric'; + +export { + Vector, + BoolVector, + ListVector, + Utf8Vector, + Int8Vector, + Int16Vector, + Int32Vector, + Int64Vector, + Uint8Vector, + Uint16Vector, + Uint32Vector, + Uint64Vector, + Date32Vector, + Date64Vector, + Time32Vector, + Time64Vector, + BinaryVector, + StructVector, + Float16Vector, + Float32Vector, + Float64Vector, + DecimalVector, + TimestampVector, + DictionaryVector, + FixedSizeListVector, +}; diff --git a/js/src/vector/typed.ts b/js/src/vector/typed.ts deleted file mode 100644 index b38812e07d065..0000000000000 --- a/js/src/vector/typed.ts +++ /dev/null @@ -1,326 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -import { Vector } from './vector'; -import { flatbuffers } from 'flatbuffers'; - -import Long = flatbuffers.Long; - -export type VArray = { - [k: number]: T; length: number; - constructor: VArrayCtor>; -}; - -export type VArrayCtor = { - readonly prototype: VArray; - BYTES_PER_ELEMENT?: number; - new(...args: any[]): VArray; -}; - -export class VirtualVector> extends Vector { - protected lists: TArrayType[]; - protected _arrayType: VArrayCtor; - public get arrayType() { return this._arrayType; } - constructor(...lists: TArrayType[]) { - super(); - this.lists = lists.filter(Boolean); - } - get(index: number): T { - /* inlined `findVirtual` impl */ - let rows, length, lists = this.lists; - for (let batch = -1; - (rows = lists[++batch]) && - (length = rows.length) <= index && - 0 <= (index -= length);) {} - return rows && -1 < index ? rows[index] : null; - } - protected range(from: number, total: number, batch?: number) { - /* inlined `findVirtual` impl */ - let rows, local = from, length; - let { lists, _arrayType } = this; - for (batch = (batch || 0) - 1; - (rows = lists[++batch]) && - (length = rows.length) <= local && - 0 <= (local -= length);) {} - if (rows && local > -1) { - let index = 0, listsLength = lists.length; - let set: any = Array.isArray(rows) ? arraySet : typedArraySet; - let slice = _arrayType['prototype']['subarray'] || _arrayType['prototype']['slice']; - let source = slice.call(rows, local, local + total), target = source; - // Perf optimization: if the first slice contains all the values we're looking for, - // we don't have to copy values to a target Array. 
If we're slicing a TypedArray, - // this is a significant improvement as we avoid the memcpy 🎉 - if (source.length < total) { - target = new _arrayType(total); - while ((index = set(source, target, index)) < total) { - rows = lists[batch = ((batch + 1) % listsLength)]; - source = slice.call(rows, 0, Math.min(rows.length, total - index)); - } - } - return target as any; - } - return new _arrayType(0); - } - *[Symbol.iterator]() { - let index = -1, { lists, length } = this; - for (let outer = -1, n = lists.length; ++outer < n;) { - let list = lists[outer] as any; - for (let inner = -1, k = list.length; ++index < length && ++inner < k;) { - yield list[inner]; - } - } - } -} - -export type ValidityArgs = Vector | Uint8Array; -export class BitVector extends VirtualVector { - static constant: Vector = new (class ValidVector extends Vector { - get() { return true; } - *[Symbol.iterator]() { - do { yield true; } while (true); - } - })(); - static from(src: any) { - return src instanceof BitVector ? src - : src === BitVector.constant ? src - : src instanceof Uint8Array ? new BitVector(src) - : src instanceof Array ? new BitVector(BitVector.pack(src)) - : src instanceof Vector ? 
new BitVector(BitVector.pack(src)) - : BitVector.constant as Vector; - } - static pack(values: Iterable) { - let xs = [], n, i = 0; - let bit = 0, byte = 0; - for (const value of values) { - value && (byte |= 1 << bit); - if (++bit === 8) { - xs[i++] = byte; - byte = bit = 0; - } - } - if (i === 0 || bit > 0) { xs[i++] = byte; } - if (i % 8 && (n = n = i + 8 - i % 8)) { - do { xs[i] = 0; } while (++i < n); - } - return new Uint8Array(xs); - } - constructor(...lists: Uint8Array[]) { - super(...lists); - this.length = this.lists.reduce((l, xs) => l + xs['length'], 0); - } - get(index: number) { - /* inlined `findVirtual` impl */ - let rows, length, lists = this.lists; - for (let batch = -1; - (rows = lists[++batch]) && - (length = rows.length) <= index && - 0 <= (index -= length);) {} - return !(!rows || index < 0 || (rows[index >> 3 | 0] & 1 << index % 8) === 0); - } - set(index: number, value: boolean) { - /* inlined `findVirtual` impl */ - let rows, length, lists = this.lists; - for (let batch = -1; - (rows = lists[++batch]) && - (length = rows.length) <= index && - 0 <= (index -= length);) {} - if (rows && index > -1) { - value - ? 
(rows[index >> 3 | 0] |= (1 << (index % 8))) - : (rows[index >> 3 | 0] &= ~(1 << (index % 8))); - } - } - concat(vector: BitVector) { - return new BitVector(...this.lists, ...vector.lists); - } - *[Symbol.iterator]() { - for (const byte of super[Symbol.iterator]()) { - for (let i = -1; ++i < 8;) { - yield (byte & 1 << i) !== 0; - } - } - } -} - -export class TypedVector extends VirtualVector { - constructor(validity: ValidityArgs, ...lists: TArrayType[]) { - super(...lists); - validity && (this.validity = BitVector.from(validity)); - } - concat(vector: TypedVector) { - return (this.constructor as typeof TypedVector).from(this, - this.length + vector.length, - this.validity.concat(vector.validity), - ...this.lists, ...vector.lists - ); - } -} - -export class DateVector extends TypedVector { - get(index: number) { - return !this.validity.get(index) ? null : new Date( - Math.pow(2, 32) * - super.get(2 * index + 1) + - super.get(2 * index) - ); - } - *[Symbol.iterator]() { - let v, low, high; - let it = super[Symbol.iterator](); - let iv = this.validity[Symbol.iterator](); - while (!(v = iv.next()).done && !(low = it.next()).done && !(high = it.next()).done) { - yield !v.value ? null : new Date(Math.pow(2, 32) * high.value + low.value); - } - } -} - -export class IndexVector extends TypedVector { - get(index: number, returnWithBatchIndex = false) { - /* inlined `findVirtual` impl */ - let rows, length, batch = -1, lists = this.lists; - for (; - (rows = lists[++batch]) && - (length = rows.length) <= index && - 0 <= (index -= length);) {} - return !returnWithBatchIndex - ? (rows && -1 < index ? rows[index + batch] : null) as number - : (rows && -1 < index ? [rows[index + batch], batch] : [0, -1]) as number[]; - } - *[Symbol.iterator]() { - // Alternate between iterating a tuple of [from, batch], and to. The from - // and to values are relative to the record batch they're defined in, so - // `ListVectorBase` needs to know the right batch to read. 
- let xs = new Int32Array(2), { lists } = this; - for (let i = -1, n = lists.length; ++i < n;) { - let list = lists[i] as any; - for (let j = -1, k = list.length - 1; ++j < k;) { - xs[1] = i; - xs[0] = list[j]; - yield xs; - yield list[j + 1]; - } - } - } -} - -export class ByteVector extends TypedVector { - get(index: number) { - return this.validity.get(index) ? super.get(index) : null; - } - *[Symbol.iterator]() { - let v, r, { validity } = this; - let it = super[Symbol.iterator](); - // fast path the case of no nulls - if (validity === BitVector.constant) { - yield* it; - } else { - let iv = validity[Symbol.iterator](); - while (!(v = iv.next()).done && !(r = it.next()).done) { - yield !v.value ? null : r.value; - } - } - } -} - -export class LongVector extends TypedVector { - get(index: number) { - return !this.validity.get(index) ? null : new Long( - super.get(index * 2), /* low */ - super.get(index * 2 + 1) /* high */ - ); - } - *[Symbol.iterator]() { - let v, low, high; - let it = super[Symbol.iterator](); - let iv = this.validity[Symbol.iterator](); - while (!(v = iv.next()).done && !(low = it.next()).done && !(high = it.next()).done) { - yield !v.value ? 
null : new Long(low.value, high.value); - } - } -} - -export class Int8Vector extends ByteVector {} -export class Int16Vector extends ByteVector {} -export class Int32Vector extends ByteVector {} -export class Int64Vector extends LongVector {} -export class Uint8Vector extends ByteVector {} -export class Uint16Vector extends ByteVector {} -export class Uint32Vector extends ByteVector {} -export class Uint64Vector extends LongVector {} -export class Float32Vector extends ByteVector {} -export class Float64Vector extends ByteVector {} - -LongVector.prototype.stride = 2; -(Vector.prototype as any).lists = []; -(Vector.prototype as any).validity = BitVector.constant; -(VirtualVector.prototype as any)._arrayType = Array; -(BitVector.prototype as any)._arrayType = Uint8Array; -(Int8Vector.prototype as any)._arrayType = Int8Array; -(Int16Vector.prototype as any)._arrayType = Int16Array; -(Int32Vector.prototype as any)._arrayType = Int32Array; -(Int64Vector.prototype as any)._arrayType = Int32Array; -(Uint8Vector.prototype as any)._arrayType = Uint8Array; -(Uint16Vector.prototype as any)._arrayType = Uint16Array; -(Uint32Vector.prototype as any)._arrayType = Uint32Array; -(Uint64Vector.prototype as any)._arrayType = Uint32Array; -(DateVector.prototype as any)._arrayType = Uint32Array; -(IndexVector.prototype as any)._arrayType = Int32Array; -(Float32Vector.prototype as any)._arrayType = Float32Array; -(Float64Vector.prototype as any)._arrayType = Float64Array; - -function arraySet(source: Array, target: Array, index: number) { - for (let i = 0, n = source.length; i < n;) { - target[index++] = source[i++]; - } - return index; -} - -function typedArraySet(source: TypedArray, target: TypedArray, index: number) { - return target.set(source, index) || index + source.length; -} - -// Rather than eat the iterator cost, we've inlined this function into the relevant functions -// function* findVirtual(index: number, lists: TList[], batch?: number) { -// let rows, length; -// for 
(batch = (batch || 0) - 1; -// (rows = lists[++batch]) && -// (length = rows.length) <= index && -// 0 <= (index -= length);) {} -// return rows && -1 < index ? yield [rows, index, batch] : null; -// } - -export type TypedArrayCtor = { - readonly prototype: T; - readonly BYTES_PER_ELEMENT: number; - new(length: number): T; - new(array: ArrayLike): T; - new(buffer: ArrayBufferLike, byteOffset?: number, length?: number): T; -}; - -export type FloatArray = Float32Array | Float64Array; -export type IntArray = Int8Array | Int16Array | Int32Array | Uint8ClampedArray | Uint8Array | Uint16Array | Uint32Array; - -export type TypedArray = ( - Int8Array | - Uint8Array | - Int16Array | - Int32Array | - Uint16Array | - Uint32Array | - Float32Array | - Float64Array | - Uint8ClampedArray); diff --git a/js/src/vector/types.ts b/js/src/vector/types.ts new file mode 100644 index 0000000000000..363fcf2265c30 --- /dev/null +++ b/js/src/vector/types.ts @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +export interface TypedArrayConstructor { + readonly prototype: T; + readonly BYTES_PER_ELEMENT: number; + new (length: number): T; + new (elements: Iterable): T; + new (arrayOrArrayBuffer: ArrayLike | ArrayBufferLike): T; + new (buffer: ArrayBufferLike, byteOffset: number, length?: number): T; +} + +export interface TypedArray extends Iterable { + [index: number]: number; + readonly length: number; + readonly byteLength: number; + readonly byteOffset: number; + readonly buffer: ArrayBufferLike; + readonly BYTES_PER_ELEMENT: number; + [Symbol.iterator](): IterableIterator; + slice(start?: number, end?: number): TypedArray; + subarray(begin: number, end?: number): TypedArray; + set(array: ArrayLike, offset?: number): void; +} + +export type List = T[] | TypedArray; +export type FloatArray = Float32Array | Float64Array; +export type IntArray = Int8Array | Int16Array | Int32Array; +export type UintArray = Uint8ClampedArray | Uint8Array | Uint16Array | Uint32Array; diff --git a/js/src/vector/utf8.ts b/js/src/vector/utf8.ts new file mode 100644 index 0000000000000..ba875cf333fe7 --- /dev/null +++ b/js/src/vector/utf8.ts @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import { Vector } from './vector'; +import { VirtualVector } from './virtual'; +import { TextDecoder } from 'text-encoding-utf-8'; + +const decoder = new TextDecoder('utf-8'); + +export class Utf8Vector extends Vector { + readonly values: Vector; + constructor(argv: { values: Vector }) { + super(); + this.values = argv.values; + } + get(index: number) { + const chars = this.getCodePoints(index); + return chars ? decoder.decode(chars) : null; + } + getCodePoints(index: number) { + return this.values.get(index); + } + concat(...vectors: Vector[]): Vector { + return new VirtualVector(Array, this, ...vectors); + } +} diff --git a/js/src/vector/vector.ts b/js/src/vector/vector.ts index 1f39f87cbefc8..9565478ad969b 100644 --- a/js/src/vector/vector.ts +++ b/js/src/vector/vector.ts @@ -15,67 +15,30 @@ // specific language governing permissions and limitations // under the License. -import * as Schema_ from '../format/Schema_generated'; -export import Type = Schema_.org.apache.arrow.flatbuf.Type; -export import Field = Schema_.org.apache.arrow.flatbuf.Field; +import * as Schema_ from '../format/fb/Schema'; +import Type = Schema_.org.apache.arrow.flatbuf.Type; -export function sliceToRangeArgs(length: number, start: number, end?: number) { - let total = length, from = start || 0; - let to = end === end && typeof end == 'number' ? end : total; - if (to < 0) { to = total + to; } - if (from < 0) { from = total - (from * -1) % total; } - if (to < from) { from = to; to = start; } - total = !isFinite(total = (to - from)) || total < 0 ? 
0 : total; - return [from, total]; +export interface Vector extends Iterable { + readonly name: string; + readonly type: string; + readonly length: number; + readonly nullable: boolean; + readonly nullCount: number; + readonly metadata: Map; + get(index: number): T | null; + concat(...vectors: Vector[]): Vector; + slice(start?: number, end?: number): R; } -export class Vector implements Iterable { - static defaultName = ''; - static defaultProps = new Map(); - static defaultType = Type[Type.NONE]; - static create(field: Field, length: number, ...args: any[]) { - let vector = new this(...args), m; - vector.length = length; - vector.name = field.name(); - vector.type = Type[field.typeType()]; - if ((m = field.customMetadataLength()) > 0) { - let entry, i = 0, data = vector.props = new Map(); - do { - entry = field.customMetadata(i); - data[entry.key()] = entry.value(); - } while (++i < m); +export class Vector implements Vector { + slice(start?: number, end?: number): R { + let { length } = this, from = start! | 0; + let to = end === undefined ? 
length : Math.max(end | 0, from); + let result = new Array(to - Math.min(from, to)); + for (let i = -1, n = result.length; ++i < n;) { + result[i] = this.get(i + from); } - return vector; - } - static from(source: Vector, length: number, ...args: any[]) { - let vector = new this(...args); - vector.length = length; - source.name !== Vector.defaultName && (vector.name = source.name); - source.type !== Vector.defaultType && (vector.type = source.type); - source.props !== Vector.defaultProps && (vector.props = source.props); - return vector; - } - public name: string; - public type: string; - public length: number; - public stride: number; - public props: Map; - protected validity: Vector; - get(index: number): T { return null; } - concat(vector: Vector) { return vector; } - slice(start?: number, end?: number, batch?: number) { - const { stride } = this; - const [offset, length] = sliceToRangeArgs( - stride * this.length, stride * (start || 0), stride * end - ); - return this.range(offset, length, batch); - } - protected range(index: number, length: number, batch?: number) { - const result = new Array(length); - for (let i = -1, n = this.length; ++i < length;) { - result[i] = this.get((i + index) % n) as any; - } - return result as Iterable; + return result as any; } *[Symbol.iterator]() { for (let i = -1, n = this.length; ++i < n;) { @@ -84,8 +47,9 @@ export class Vector implements Iterable { } } -Vector.prototype.length = 0; -Vector.prototype.stride = 1; -Vector.prototype.name = Vector.defaultName; -Vector.prototype.type = Vector.defaultType; -Vector.prototype.props = Vector.defaultProps; +(Vector.prototype as any).name = ''; +(Vector.prototype as any).stride = 1; +(Vector.prototype as any).nullable = !1; +(Vector.prototype as any).nullCount = 0; +(Vector.prototype as any).metadata = new Map(); +(Vector.prototype as any).type = Type[Type.NONE]; diff --git a/js/src/vector/virtual.ts b/js/src/vector/virtual.ts new file mode 100644 index 0000000000000..42db78706db51 --- 
/dev/null +++ b/js/src/vector/virtual.ts @@ -0,0 +1,129 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Vector } from './vector'; +import { NumericVector } from './numeric'; +import { TypedArray, TypedArrayConstructor } from './types'; + +export class VirtualVector implements Vector { + readonly name: string; + readonly type: string; + readonly length: number; + readonly vectors: Vector[]; + readonly offsets: Uint32Array; + readonly ArrayType: ArrayConstructor | TypedArrayConstructor; + constructor(ArrayType: ArrayConstructor | TypedArrayConstructor, ...vectors: Vector[]) { + this.vectors = vectors; + this.ArrayType = ArrayType; + this.name = (vectors[0] as any).name; + this.type = (vectors[0] as any).type; + this.length = vectors.reduce((sum, vec) => sum + vec.length, 0); + this.offsets = Uint32Array.from(vectors.reduce((sums, vector, index) => [...sums, vector.length + sums[index]], [0])); + } + *[Symbol.iterator]() { + for (const vector of this.vectors) { + yield* vector; + } + } + get nullable() { + return (this.vectors as Vector[]).some((vec) => vec.nullable); + } + get nullCount() { + return (this.vectors as Vector[]).reduce((sum, v) => sum + v.nullCount | 0, 0); + } + get metadata() { + return 
new Map( + (this.vectors as Vector[]).reduce((entries, v) => [ + ...entries, ...v.metadata.entries() + ], [] as [string, string][]) + ); + } + get(index: number) { + // find the vector index and adjusted value offset (inlined) + let offsets = this.offsets, offset = 0; + let left = 0, middle = 0, right = offsets.length - 1; + while (index < offsets[right] && index >= (offset = offsets[left])) { + if (left + 1 === right) { + return this.vectors[left].get(index - offset); + } + middle = left + ((right - left) / 2) | 0; + index >= offsets[middle] ? (left = middle) : (right = middle); + } + return null; + } + concat(...vectors: Vector[]) { + return new VirtualVector(this.ArrayType, ...this.vectors, ...vectors); + } + slice(begin?: number, end?: number) { + + // clamp begin and end values between the virtual length (inlined) + // let [from, total] = clampRange(this.length, begin!, end); + let total = this.length, from = begin! | 0; + let to = end === end && typeof end == 'number' ? end : total; + if (to < 0) { to = total + to; } + if (from < 0) { from = total - (from * -1) % total; } + if (to < from) { from = to; to = begin! | 0; } + total = !isFinite(total = (to - from)) || total < 0 ? 0 : total; + + // find the vector index and adjusted value offset (inlined) + let offsets = this.offsets, ArrayType = this.ArrayType as any; + let offset = 0, index = 0, middle = 0, right = offsets.length - 1; + while (from < offsets[right] && from >= (offset = offsets[index])) { + if (index + 1 === right) { + from -= offset; + let set = ArrayType === Array ? arraySet : typedArraySet as any; + let vectors = this.vectors as any as NumericVector[]; + let vector = vectors[index], source = vector.slice(from, from + total), target = source; + // Perf optimization: if the first slice contains all the values we're looking for, + // we don't have to copy values to a target Array. 
If we're slicing a TypedArray, + // this is a significant improvement as we avoid the memcpy 🎉 + if ((source.length / vector.stride | 0) < total) { + let vectorsLength = vectors.length; + let count = 0, length = 0, sources = [] as any[]; + do { + sources.push(source); + length += source.length; + count += (source.length / vector.stride | 0); + } while ( + (count < total) && + (vector = vectors[index = (++index % vectorsLength)]) && + (source = vector.slice(0, Math.min(vector.length, total - count))) + ); + target = new ArrayType(length); + for (let i = -1, j = 0, n = sources.length; ++i < n;) { + j = set(sources[i], target, j); + } + } + return target; + } + middle = index + ((right - index) / 2) | 0; + from >= offsets[middle] ? (index = middle) : (right = middle); + } + return new ArrayType(0); + } +} + +function arraySet(source: T[], target: T[], index: number) { + for (let i = 0, n = source.length; i < n;) { + target[index++] = source[i++]; + } + return index; +} + +function typedArraySet(source: TypedArray, target: TypedArray, index: number) { + return target.set(source, index) || index + source.length; +} diff --git a/js/test/Arrow.ts b/js/test/Arrow.ts index 3f29c5409ab26..f2c4e930f92e4 100644 --- a/js/test/Arrow.ts +++ b/js/test/Arrow.ts @@ -16,11 +16,12 @@ // under the License. 
/* tslint:disable */ -// Dynamically load an Ix target build based on command line arguments +// Dynamically load an Arrow target build based on command line arguments -const target = process.env.TEST_TARGET; -const format = process.env.TEST_MODULE; -const resolve = require('path').resolve; +const path = require('path'); +const target = process.env.TEST_TARGET!; +const format = process.env.TEST_MODULE!; +const useSrc = process.env.TEST_TS_SOURCE === `true`; // these are duplicated in the gulpfile :< const targets = [`es5`, `es2015`, `esnext`]; @@ -30,55 +31,20 @@ function throwInvalidImportError(name: string, value: string, values: string[]) throw new Error('Unrecognized ' + name + ' \'' + value + '\'. Please run tests with \'--' + name + ' \''); } -if (!~targets.indexOf(target)) throwInvalidImportError('target', target, targets); -if (!~formats.indexOf(format)) throwInvalidImportError('module', format, formats); +let modulePath = ``; -let Arrow: any = require(resolve(`./targets/${target}/${format}/Arrow.js`)); +if (useSrc) modulePath = '../src'; +else if (target === `ts` || target === `apache-arrow`) modulePath = target; +else if (!~targets.indexOf(target)) throwInvalidImportError('target', target, targets); +else if (!~formats.indexOf(format)) throwInvalidImportError('module', format, formats); +else modulePath = path.join(target, format); -import { - Table as Table_, - readBuffers as readBuffers_, - Vector as Vector_, - BitVector as BitVector_, - ListVector as ListVector_, - Utf8Vector as Utf8Vector_, - DateVector as DateVector_, - IndexVector as IndexVector_, - TypedVector as TypedVector_, - Int8Vector as Int8Vector_, - Int16Vector as Int16Vector_, - Int32Vector as Int32Vector_, - Int64Vector as Int64Vector_, - Uint8Vector as Uint8Vector_, - Uint16Vector as Uint16Vector_, - Uint32Vector as Uint32Vector_, - Uint64Vector as Uint64Vector_, - Float32Vector as Float32Vector_, - Float64Vector as Float64Vector_, - StructVector as StructVector_, - DictionaryVector as 
DictionaryVector_, - FixedSizeListVector as FixedSizeListVector_, -} from '../src/Arrow'; +export { Int64, Uint64, Int128 } from '../src/Arrow'; +export { List } from '../src/Arrow'; +export { TypedArray } from '../src/Arrow'; +export { TypedArrayConstructor } from '../src/Arrow'; +export { NumericVectorConstructor } from '../src/Arrow'; -export let Table = Arrow.Table as typeof Table_; -export let readBuffers = Arrow.readBuffers as typeof readBuffers_; -export let Vector = Arrow.Vector as typeof Vector_; -export let BitVector = Arrow.BitVector as typeof BitVector_; -export let ListVector = Arrow.ListVector as typeof ListVector_; -export let Utf8Vector = Arrow.Utf8Vector as typeof Utf8Vector_; -export let DateVector = Arrow.DateVector as typeof DateVector_; -export let IndexVector = Arrow.IndexVector as typeof IndexVector_; -export let TypedVector = Arrow.TypedVector as typeof TypedVector_; -export let Int8Vector = Arrow.Int8Vector as typeof Int8Vector_; -export let Int16Vector = Arrow.Int16Vector as typeof Int16Vector_; -export let Int32Vector = Arrow.Int32Vector as typeof Int32Vector_; -export let Int64Vector = Arrow.Int64Vector as typeof Int64Vector_; -export let Uint8Vector = Arrow.Uint8Vector as typeof Uint8Vector_; -export let Uint16Vector = Arrow.Uint16Vector as typeof Uint16Vector_; -export let Uint32Vector = Arrow.Uint32Vector as typeof Uint32Vector_; -export let Uint64Vector = Arrow.Uint64Vector as typeof Uint64Vector_; -export let Float32Vector = Arrow.Float32Vector as typeof Float32Vector_; -export let Float64Vector = Arrow.Float64Vector as typeof Float64Vector_; -export let StructVector = Arrow.StructVector as typeof StructVector_; -export let DictionaryVector = Arrow.DictionaryVector as typeof DictionaryVector_; -export let FixedSizeListVector = Arrow.FixedSizeListVector as typeof FixedSizeListVector_; +import * as Arrow_ from '../src/Arrow'; +export let Arrow: typeof Arrow_ = require(path.resolve(`./targets`, modulePath, `Arrow`)); +export default 
Arrow; diff --git a/js/test/__snapshots__/reader-tests.ts.snap b/js/test/__snapshots__/reader-tests.ts.snap deleted file mode 100644 index 961ce87861c3f..0000000000000 --- a/js/test/__snapshots__/reader-tests.ts.snap +++ /dev/null @@ -1,497 +0,0 @@ -// Jest Snapshot v1, https://goo.gl/fbAQLP - -exports[`dictionary file Arrow readBuffers enumerates each batch as an Array of Vectors 1`] = `"example-csv"`; - -exports[`dictionary file Arrow readBuffers enumerates each batch as an Array of Vectors 2`] = `"Struct_"`; - -exports[`dictionary file Arrow readBuffers enumerates each batch as an Array of Vectors 3`] = `2`; - -exports[`dictionary file Arrow readBuffers enumerates each batch as an Array of Vectors 4`] = ` -Array [ - "Hermione", - 25, - Float32Array [ - -53.235599517822266, - 40.231998443603516, - ], -] -`; - -exports[`dictionary file Arrow readBuffers enumerates each batch as an Array of Vectors 5`] = ` -Array [ - "Severus", - 30, - Float32Array [ - -62.22999954223633, - 3, - ], -] -`; - -exports[`dictionary file Arrow readBuffers enumerates each batch as an Array of Vectors 6`] = `"example-csv"`; - -exports[`dictionary file Arrow readBuffers enumerates each batch as an Array of Vectors 7`] = `"Struct_"`; - -exports[`dictionary file Arrow readBuffers enumerates each batch as an Array of Vectors 8`] = `1`; - -exports[`dictionary file Arrow readBuffers enumerates each batch as an Array of Vectors 9`] = ` -Array [ - "Harry", - 20, - Float32Array [ - 23, - -100.23652648925781, - ], -] -`; - -exports[`dictionary stream Arrow readBuffers enumerates each batch as an Array of Vectors 1`] = `"example-csv"`; - -exports[`dictionary stream Arrow readBuffers enumerates each batch as an Array of Vectors 2`] = `"Struct_"`; - -exports[`dictionary stream Arrow readBuffers enumerates each batch as an Array of Vectors 3`] = `2`; - -exports[`dictionary stream Arrow readBuffers enumerates each batch as an Array of Vectors 4`] = ` -Array [ - "Hermione", - 25, - Float32Array [ - 
-53.235599517822266, - 40.231998443603516, - ], -] -`; - -exports[`dictionary stream Arrow readBuffers enumerates each batch as an Array of Vectors 5`] = ` -Array [ - "Severus", - 30, - Float32Array [ - -62.22999954223633, - 3, - ], -] -`; - -exports[`dictionary stream Arrow readBuffers enumerates each batch as an Array of Vectors 6`] = `"example-csv"`; - -exports[`dictionary stream Arrow readBuffers enumerates each batch as an Array of Vectors 7`] = `"Struct_"`; - -exports[`dictionary stream Arrow readBuffers enumerates each batch as an Array of Vectors 8`] = `1`; - -exports[`dictionary stream Arrow readBuffers enumerates each batch as an Array of Vectors 9`] = ` -Array [ - "Harry", - 20, - Float32Array [ - 23, - -100.23652648925781, - ], -] -`; - -exports[`dictionary2 file Arrow readBuffers enumerates each batch as an Array of Vectors 1`] = `"struct"`; - -exports[`dictionary2 file Arrow readBuffers enumerates each batch as an Array of Vectors 2`] = `"Struct_"`; - -exports[`dictionary2 file Arrow readBuffers enumerates each batch as an Array of Vectors 3`] = `2`; - -exports[`dictionary2 file Arrow readBuffers enumerates each batch as an Array of Vectors 4`] = ` -Array [ - "a0fb47f9-f8fb-4403-a64a-786d7611f8ef", - "Airbus", - 1502880750, - Float32Array [ - 32.45663833618164, - 1.8712350130081177, - ], -] -`; - -exports[`dictionary2 file Arrow readBuffers enumerates each batch as an Array of Vectors 5`] = ` -Array [ - "50fb46f4-fefa-42c1-919c-0121974cdd00", - "Boeing", - 1502880750, - Float32Array [ - 38.766666412353516, - -4.181231498718262, - ], -] -`; - -exports[`multi_dictionary file Arrow readBuffers enumerates each batch as an Array of Vectors 1`] = `"struct"`; - -exports[`multi_dictionary file Arrow readBuffers enumerates each batch as an Array of Vectors 2`] = `"Struct_"`; - -exports[`multi_dictionary file Arrow readBuffers enumerates each batch as an Array of Vectors 3`] = `2`; - -exports[`multi_dictionary file Arrow readBuffers enumerates each batch as an 
Array of Vectors 4`] = ` -Array [ - "a0fb47f9-f8fb-4403-a64a-786d7611f8ef", - "12345", - "Airbus", - 1502880750, - Float32Array [ - 32.45663833618164, - 1.8712350130081177, - ], -] -`; - -exports[`multi_dictionary file Arrow readBuffers enumerates each batch as an Array of Vectors 5`] = ` -Array [ - "50fb46f4-fefa-42c1-919c-0121974cdd00", - "67890", - "Boeing", - 1502880750, - Float32Array [ - 38.766666412353516, - -4.181231498718262, - ], -] -`; - -exports[`multipart count Arrow readBuffers enumerates each batch as an Array of Vectors 1`] = `"row_count"`; - -exports[`multipart count Arrow readBuffers enumerates each batch as an Array of Vectors 2`] = `"Int"`; - -exports[`multipart count Arrow readBuffers enumerates each batch as an Array of Vectors 3`] = `1`; - -exports[`multipart count Arrow readBuffers enumerates each batch as an Array of Vectors 4`] = `10000`; - -exports[`multipart latlong Arrow readBuffers enumerates each batch as an Array of Vectors 1`] = `"origin_lat"`; - -exports[`multipart latlong Arrow readBuffers enumerates each batch as an Array of Vectors 2`] = `"FloatingPoint"`; - -exports[`multipart latlong Arrow readBuffers enumerates each batch as an Array of Vectors 3`] = `5`; - -exports[`multipart latlong Arrow readBuffers enumerates each batch as an Array of Vectors 4`] = `35.393089294433594`; - -exports[`multipart latlong Arrow readBuffers enumerates each batch as an Array of Vectors 5`] = `35.393089294433594`; - -exports[`multipart latlong Arrow readBuffers enumerates each batch as an Array of Vectors 6`] = `35.393089294433594`; - -exports[`multipart latlong Arrow readBuffers enumerates each batch as an Array of Vectors 7`] = `29.533695220947266`; - -exports[`multipart latlong Arrow readBuffers enumerates each batch as an Array of Vectors 8`] = `29.533695220947266`; - -exports[`multipart latlong Arrow readBuffers enumerates each batch as an Array of Vectors 9`] = `"origin_lon"`; - -exports[`multipart latlong Arrow readBuffers enumerates each 
batch as an Array of Vectors 10`] = `"FloatingPoint"`; - -exports[`multipart latlong Arrow readBuffers enumerates each batch as an Array of Vectors 11`] = `5`; - -exports[`multipart latlong Arrow readBuffers enumerates each batch as an Array of Vectors 12`] = `-97.6007308959961`; - -exports[`multipart latlong Arrow readBuffers enumerates each batch as an Array of Vectors 13`] = `-97.6007308959961`; - -exports[`multipart latlong Arrow readBuffers enumerates each batch as an Array of Vectors 14`] = `-97.6007308959961`; - -exports[`multipart latlong Arrow readBuffers enumerates each batch as an Array of Vectors 15`] = `-98.46977996826172`; - -exports[`multipart latlong Arrow readBuffers enumerates each batch as an Array of Vectors 16`] = `-98.46977996826172`; - -exports[`multipart origins Arrow readBuffers enumerates each batch as an Array of Vectors 1`] = `"origin_city"`; - -exports[`multipart origins Arrow readBuffers enumerates each batch as an Array of Vectors 2`] = `"Utf8"`; - -exports[`multipart origins Arrow readBuffers enumerates each batch as an Array of Vectors 3`] = `5`; - -exports[`multipart origins Arrow readBuffers enumerates each batch as an Array of Vectors 4`] = `"Oklahoma City"`; - -exports[`multipart origins Arrow readBuffers enumerates each batch as an Array of Vectors 5`] = `"Oklahoma City"`; - -exports[`multipart origins Arrow readBuffers enumerates each batch as an Array of Vectors 6`] = `"Oklahoma City"`; - -exports[`multipart origins Arrow readBuffers enumerates each batch as an Array of Vectors 7`] = `"San Antonio"`; - -exports[`multipart origins Arrow readBuffers enumerates each batch as an Array of Vectors 8`] = `"San Antonio"`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 1`] = `"foo"`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 2`] = `"Int"`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 3`] = `5`; - 
-exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 4`] = `1`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 5`] = `null`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 6`] = `3`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 7`] = `4`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 8`] = `5`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 9`] = `"bar"`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 10`] = `"FloatingPoint"`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 11`] = `5`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 12`] = `1`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 13`] = `null`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 14`] = `null`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 15`] = `4`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 16`] = `5`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 17`] = `"baz"`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 18`] = `"Utf8"`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 19`] = `5`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 20`] = `"aa"`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 21`] = `null`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 22`] = `null`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of 
Vectors 23`] = `"bbb"`; - -exports[`simple file Arrow readBuffers enumerates each batch as an Array of Vectors 24`] = `"cccc"`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 1`] = `"foo"`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 2`] = `"Int"`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 3`] = `5`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 4`] = `1`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 5`] = `null`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 6`] = `3`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 7`] = `4`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 8`] = `5`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 9`] = `"bar"`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 10`] = `"FloatingPoint"`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 11`] = `5`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 12`] = `1`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 13`] = `null`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 14`] = `null`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 15`] = `4`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 16`] = `5`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 17`] = `"baz"`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 18`] = `"Utf8"`; - -exports[`simple 
stream Arrow readBuffers enumerates each batch as an Array of Vectors 19`] = `5`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 20`] = `"aa"`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 21`] = `null`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 22`] = `null`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 23`] = `"bbb"`; - -exports[`simple stream Arrow readBuffers enumerates each batch as an Array of Vectors 24`] = `"cccc"`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 1`] = `"struct_nullable"`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 2`] = `"Struct_"`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 3`] = `7`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 4`] = `null`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 5`] = ` -Array [ - null, - "MhRNxD4", -] -`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 6`] = ` -Array [ - 137773603, - "3F9HBxK", -] -`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 7`] = ` -Array [ - 410361374, - "aVd88fp", -] -`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 8`] = `null`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 9`] = ` -Array [ - null, - "3loZrRf", -] -`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 10`] = `null`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 11`] = `"struct_nullable"`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 12`] = `"Struct_"`; - 
-exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 13`] = `10`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 14`] = `null`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 15`] = ` -Array [ - null, - null, -] -`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 16`] = ` -Array [ - null, - null, -] -`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 17`] = `null`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 18`] = ` -Array [ - null, - "78SLiRw", -] -`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 19`] = `null`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 20`] = `null`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 21`] = ` -Array [ - null, - "0ilsf82", -] -`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 22`] = ` -Array [ - null, - "LjS9MbU", -] -`; - -exports[`struct file Arrow readBuffers enumerates each batch as an Array of Vectors 23`] = ` -Array [ - null, - null, -] -`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 1`] = `"struct_nullable"`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 2`] = `"Struct_"`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 3`] = `7`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 4`] = `null`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 5`] = ` -Array [ - null, - "MhRNxD4", -] -`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 6`] = ` -Array [ - 137773603, - "3F9HBxK", -] -`; - 
-exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 7`] = ` -Array [ - 410361374, - "aVd88fp", -] -`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 8`] = `null`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 9`] = ` -Array [ - null, - "3loZrRf", -] -`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 10`] = `null`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 11`] = `"struct_nullable"`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 12`] = `"Struct_"`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 13`] = `10`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 14`] = `null`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 15`] = ` -Array [ - null, - null, -] -`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 16`] = ` -Array [ - null, - null, -] -`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 17`] = `null`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 18`] = ` -Array [ - null, - "78SLiRw", -] -`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 19`] = `null`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 20`] = `null`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 21`] = ` -Array [ - null, - "0ilsf82", -] -`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 22`] = ` -Array [ - null, - "LjS9MbU", -] -`; - -exports[`struct stream Arrow readBuffers enumerates each batch as an Array of Vectors 23`] = ` -Array [ - 
null, - null, -] -`; diff --git a/js/test/__snapshots__/table-tests.ts.snap b/js/test/__snapshots__/table-tests.ts.snap deleted file mode 100644 index a7fb9c5a64c24..0000000000000 --- a/js/test/__snapshots__/table-tests.ts.snap +++ /dev/null @@ -1,1815 +0,0 @@ -// Jest Snapshot v1, https://goo.gl/fbAQLP - -exports[`dictionary file Arrow Table creates a Table from Arrow buffers 1`] = `"example-csv"`; - -exports[`dictionary file Arrow Table creates a Table from Arrow buffers 2`] = `"Struct_"`; - -exports[`dictionary file Arrow Table creates a Table from Arrow buffers 3`] = `3`; - -exports[`dictionary file Arrow Table creates a Table from Arrow buffers 4`] = ` -Array [ - "Hermione", - 25, - Float32Array [ - -53.235599517822266, - 40.231998443603516, - ], -] -`; - -exports[`dictionary file Arrow Table creates a Table from Arrow buffers 5`] = ` -Array [ - "Severus", - 30, - Float32Array [ - -62.22999954223633, - 3, - ], -] -`; - -exports[`dictionary file Arrow Table creates a Table from Arrow buffers 6`] = ` -Array [ - "Harry", - 20, - Float32Array [ - 23, - -100.23652648925781, - ], -] -`; - -exports[`dictionary file Arrow Table enumerates Table rows 1`] = ` -Object { - "example-csv": Array [ - "Hermione", - 25, - Float32Array [ - -53.235599517822266, - 40.231998443603516, - ], - ], -} -`; - -exports[`dictionary file Arrow Table enumerates Table rows 2`] = ` -Object { - "example-csv": Array [ - "Severus", - 30, - Float32Array [ - -62.22999954223633, - 3, - ], - ], -} -`; - -exports[`dictionary file Arrow Table enumerates Table rows 3`] = ` -Object { - "example-csv": Array [ - "Harry", - 20, - Float32Array [ - 23, - -100.23652648925781, - ], - ], -} -`; - -exports[`dictionary file Arrow Table enumerates Table rows compact 1`] = ` -Array [ - Array [ - "Hermione", - 25, - Float32Array [ - -53.235599517822266, - 40.231998443603516, - ], - ], -] -`; - -exports[`dictionary file Arrow Table enumerates Table rows compact 2`] = ` -Array [ - Array [ - "Severus", - 30, - 
Float32Array [ - -62.22999954223633, - 3, - ], - ], -] -`; - -exports[`dictionary file Arrow Table enumerates Table rows compact 3`] = ` -Array [ - Array [ - "Harry", - 20, - Float32Array [ - 23, - -100.23652648925781, - ], - ], -] -`; - -exports[`dictionary file Arrow Table toString() prints a pretty Table 1`] = ` -" example-csv -Hermione,25,-53.235599517822266,40.231998443603516 - Severus,30,-62.22999954223633,3 - Harry,20,23,-100.23652648925781" -`; - -exports[`dictionary file Arrow Table toString() prints an empty Table 1`] = `""`; - -exports[`dictionary file Arrow Table toString({ index: true }) prints a pretty Table with an Index column 1`] = ` -"Index, example-csv - 0, Hermione,25,-53.235599517822266,40.231998443603516 - 1, Severus,30,-62.22999954223633,3 - 2, Harry,20,23,-100.23652648925781" -`; - -exports[`dictionary stream Arrow Table creates a Table from Arrow buffers 1`] = `"example-csv"`; - -exports[`dictionary stream Arrow Table creates a Table from Arrow buffers 2`] = `"Struct_"`; - -exports[`dictionary stream Arrow Table creates a Table from Arrow buffers 3`] = `3`; - -exports[`dictionary stream Arrow Table creates a Table from Arrow buffers 4`] = ` -Array [ - "Hermione", - 25, - Float32Array [ - -53.235599517822266, - 40.231998443603516, - ], -] -`; - -exports[`dictionary stream Arrow Table creates a Table from Arrow buffers 5`] = ` -Array [ - "Severus", - 30, - Float32Array [ - -62.22999954223633, - 3, - ], -] -`; - -exports[`dictionary stream Arrow Table creates a Table from Arrow buffers 6`] = ` -Array [ - "Harry", - 20, - Float32Array [ - 23, - -100.23652648925781, - ], -] -`; - -exports[`dictionary stream Arrow Table enumerates Table rows 1`] = ` -Object { - "example-csv": Array [ - "Hermione", - 25, - Float32Array [ - -53.235599517822266, - 40.231998443603516, - ], - ], -} -`; - -exports[`dictionary stream Arrow Table enumerates Table rows 2`] = ` -Object { - "example-csv": Array [ - "Severus", - 30, - Float32Array [ - -62.22999954223633, - 
3, - ], - ], -} -`; - -exports[`dictionary stream Arrow Table enumerates Table rows 3`] = ` -Object { - "example-csv": Array [ - "Harry", - 20, - Float32Array [ - 23, - -100.23652648925781, - ], - ], -} -`; - -exports[`dictionary stream Arrow Table enumerates Table rows compact 1`] = ` -Array [ - Array [ - "Hermione", - 25, - Float32Array [ - -53.235599517822266, - 40.231998443603516, - ], - ], -] -`; - -exports[`dictionary stream Arrow Table enumerates Table rows compact 2`] = ` -Array [ - Array [ - "Severus", - 30, - Float32Array [ - -62.22999954223633, - 3, - ], - ], -] -`; - -exports[`dictionary stream Arrow Table enumerates Table rows compact 3`] = ` -Array [ - Array [ - "Harry", - 20, - Float32Array [ - 23, - -100.23652648925781, - ], - ], -] -`; - -exports[`dictionary stream Arrow Table toString() prints a pretty Table 1`] = ` -" example-csv -Hermione,25,-53.235599517822266,40.231998443603516 - Severus,30,-62.22999954223633,3 - Harry,20,23,-100.23652648925781" -`; - -exports[`dictionary stream Arrow Table toString() prints an empty Table 1`] = `""`; - -exports[`dictionary stream Arrow Table toString({ index: true }) prints a pretty Table with an Index column 1`] = ` -"Index, example-csv - 0, Hermione,25,-53.235599517822266,40.231998443603516 - 1, Severus,30,-62.22999954223633,3 - 2, Harry,20,23,-100.23652648925781" -`; - -exports[`dictionary2 file Arrow Table creates a Table from Arrow buffers 1`] = `"struct"`; - -exports[`dictionary2 file Arrow Table creates a Table from Arrow buffers 2`] = `"Struct_"`; - -exports[`dictionary2 file Arrow Table creates a Table from Arrow buffers 3`] = `2`; - -exports[`dictionary2 file Arrow Table creates a Table from Arrow buffers 4`] = ` -Array [ - "a0fb47f9-f8fb-4403-a64a-786d7611f8ef", - "Airbus", - 1502880750, - Float32Array [ - 32.45663833618164, - 1.8712350130081177, - ], -] -`; - -exports[`dictionary2 file Arrow Table creates a Table from Arrow buffers 5`] = ` -Array [ - "50fb46f4-fefa-42c1-919c-0121974cdd00", - 
"Boeing", - 1502880750, - Float32Array [ - 38.766666412353516, - -4.181231498718262, - ], -] -`; - -exports[`dictionary2 file Arrow Table enumerates Table rows 1`] = ` -Object { - "struct": Array [ - "a0fb47f9-f8fb-4403-a64a-786d7611f8ef", - "Airbus", - 1502880750, - Float32Array [ - 32.45663833618164, - 1.8712350130081177, - ], - ], -} -`; - -exports[`dictionary2 file Arrow Table enumerates Table rows 2`] = ` -Object { - "struct": Array [ - "50fb46f4-fefa-42c1-919c-0121974cdd00", - "Boeing", - 1502880750, - Float32Array [ - 38.766666412353516, - -4.181231498718262, - ], - ], -} -`; - -exports[`dictionary2 file Arrow Table enumerates Table rows compact 1`] = ` -Array [ - Array [ - "a0fb47f9-f8fb-4403-a64a-786d7611f8ef", - "Airbus", - 1502880750, - Float32Array [ - 32.45663833618164, - 1.8712350130081177, - ], - ], -] -`; - -exports[`dictionary2 file Arrow Table enumerates Table rows compact 2`] = ` -Array [ - Array [ - "50fb46f4-fefa-42c1-919c-0121974cdd00", - "Boeing", - 1502880750, - Float32Array [ - 38.766666412353516, - -4.181231498718262, - ], - ], -] -`; - -exports[`dictionary2 file Arrow Table toString() prints a pretty Table 1`] = ` -" struct - a0fb47f9-f8fb-4403-a64a-786d7611f8ef,Airbus,1502880750,32.45663833618164,1.8712350130081177 -50fb46f4-fefa-42c1-919c-0121974cdd00,Boeing,1502880750,38.766666412353516,-4.181231498718262" -`; - -exports[`dictionary2 file Arrow Table toString() prints an empty Table 1`] = `""`; - -exports[`dictionary2 file Arrow Table toString({ index: true }) prints a pretty Table with an Index column 1`] = ` -"Index, struct - 0, a0fb47f9-f8fb-4403-a64a-786d7611f8ef,Airbus,1502880750,32.45663833618164,1.8712350130081177 - 1, 50fb46f4-fefa-42c1-919c-0121974cdd00,Boeing,1502880750,38.766666412353516,-4.181231498718262" -`; - -exports[`multi_dictionary file Arrow Table creates a Table from Arrow buffers 1`] = `"struct"`; - -exports[`multi_dictionary file Arrow Table creates a Table from Arrow buffers 2`] = `"Struct_"`; - 
-exports[`multi_dictionary file Arrow Table creates a Table from Arrow buffers 3`] = `2`; - -exports[`multi_dictionary file Arrow Table creates a Table from Arrow buffers 4`] = ` -Array [ - "a0fb47f9-f8fb-4403-a64a-786d7611f8ef", - "12345", - "Airbus", - 1502880750, - Float32Array [ - 32.45663833618164, - 1.8712350130081177, - ], -] -`; - -exports[`multi_dictionary file Arrow Table creates a Table from Arrow buffers 5`] = ` -Array [ - "50fb46f4-fefa-42c1-919c-0121974cdd00", - "67890", - "Boeing", - 1502880750, - Float32Array [ - 38.766666412353516, - -4.181231498718262, - ], -] -`; - -exports[`multi_dictionary file Arrow Table enumerates Table rows 1`] = ` -Object { - "struct": Array [ - "a0fb47f9-f8fb-4403-a64a-786d7611f8ef", - "12345", - "Airbus", - 1502880750, - Float32Array [ - 32.45663833618164, - 1.8712350130081177, - ], - ], -} -`; - -exports[`multi_dictionary file Arrow Table enumerates Table rows 2`] = ` -Object { - "struct": Array [ - "50fb46f4-fefa-42c1-919c-0121974cdd00", - "67890", - "Boeing", - 1502880750, - Float32Array [ - 38.766666412353516, - -4.181231498718262, - ], - ], -} -`; - -exports[`multi_dictionary file Arrow Table enumerates Table rows compact 1`] = ` -Array [ - Array [ - "a0fb47f9-f8fb-4403-a64a-786d7611f8ef", - "12345", - "Airbus", - 1502880750, - Float32Array [ - 32.45663833618164, - 1.8712350130081177, - ], - ], -] -`; - -exports[`multi_dictionary file Arrow Table enumerates Table rows compact 2`] = ` -Array [ - Array [ - "50fb46f4-fefa-42c1-919c-0121974cdd00", - "67890", - "Boeing", - 1502880750, - Float32Array [ - 38.766666412353516, - -4.181231498718262, - ], - ], -] -`; - -exports[`multi_dictionary file Arrow Table toString() prints a pretty Table 1`] = ` -" struct - a0fb47f9-f8fb-4403-a64a-786d7611f8ef,12345,Airbus,1502880750,32.45663833618164,1.8712350130081177 -50fb46f4-fefa-42c1-919c-0121974cdd00,67890,Boeing,1502880750,38.766666412353516,-4.181231498718262" -`; - -exports[`multi_dictionary file Arrow Table toString() prints 
an empty Table 1`] = `""`; - -exports[`multi_dictionary file Arrow Table toString({ index: true }) prints a pretty Table with an Index column 1`] = ` -"Index, struct - 0, a0fb47f9-f8fb-4403-a64a-786d7611f8ef,12345,Airbus,1502880750,32.45663833618164,1.8712350130081177 - 1, 50fb46f4-fefa-42c1-919c-0121974cdd00,67890,Boeing,1502880750,38.766666412353516,-4.181231498718262" -`; - -exports[`multipart count Arrow Table creates a Table from Arrow buffers 1`] = `"row_count"`; - -exports[`multipart count Arrow Table creates a Table from Arrow buffers 2`] = `"Int"`; - -exports[`multipart count Arrow Table creates a Table from Arrow buffers 3`] = `1`; - -exports[`multipart count Arrow Table creates a Table from Arrow buffers 4`] = `10000`; - -exports[`multipart count Arrow Table enumerates Table rows 1`] = ` -Object { - "row_count": 10000, -} -`; - -exports[`multipart count Arrow Table enumerates Table rows compact 1`] = ` -Array [ - 10000, -] -`; - -exports[`multipart count Arrow Table toString() prints a pretty Table 1`] = ` -"row_count - 10000" -`; - -exports[`multipart count Arrow Table toString() prints an empty Table 1`] = `""`; - -exports[`multipart count Arrow Table toString({ index: true }) prints a pretty Table with an Index column 1`] = ` -"Index, row_count - 0, 10000" -`; - -exports[`multipart latlong Arrow Table creates a Table from Arrow buffers 1`] = `"origin_lat"`; - -exports[`multipart latlong Arrow Table creates a Table from Arrow buffers 2`] = `"FloatingPoint"`; - -exports[`multipart latlong Arrow Table creates a Table from Arrow buffers 3`] = `5`; - -exports[`multipart latlong Arrow Table creates a Table from Arrow buffers 4`] = `35.393089294433594`; - -exports[`multipart latlong Arrow Table creates a Table from Arrow buffers 5`] = `35.393089294433594`; - -exports[`multipart latlong Arrow Table creates a Table from Arrow buffers 6`] = `35.393089294433594`; - -exports[`multipart latlong Arrow Table creates a Table from Arrow buffers 7`] = 
`29.533695220947266`; - -exports[`multipart latlong Arrow Table creates a Table from Arrow buffers 8`] = `29.533695220947266`; - -exports[`multipart latlong Arrow Table creates a Table from Arrow buffers 9`] = `"origin_lon"`; - -exports[`multipart latlong Arrow Table creates a Table from Arrow buffers 10`] = `"FloatingPoint"`; - -exports[`multipart latlong Arrow Table creates a Table from Arrow buffers 11`] = `5`; - -exports[`multipart latlong Arrow Table creates a Table from Arrow buffers 12`] = `-97.6007308959961`; - -exports[`multipart latlong Arrow Table creates a Table from Arrow buffers 13`] = `-97.6007308959961`; - -exports[`multipart latlong Arrow Table creates a Table from Arrow buffers 14`] = `-97.6007308959961`; - -exports[`multipart latlong Arrow Table creates a Table from Arrow buffers 15`] = `-98.46977996826172`; - -exports[`multipart latlong Arrow Table creates a Table from Arrow buffers 16`] = `-98.46977996826172`; - -exports[`multipart latlong Arrow Table enumerates Table rows 1`] = ` -Object { - "origin_lat": 35.393089294433594, - "origin_lon": -97.6007308959961, -} -`; - -exports[`multipart latlong Arrow Table enumerates Table rows 2`] = ` -Object { - "origin_lat": 35.393089294433594, - "origin_lon": -97.6007308959961, -} -`; - -exports[`multipart latlong Arrow Table enumerates Table rows 3`] = ` -Object { - "origin_lat": 35.393089294433594, - "origin_lon": -97.6007308959961, -} -`; - -exports[`multipart latlong Arrow Table enumerates Table rows 4`] = ` -Object { - "origin_lat": 29.533695220947266, - "origin_lon": -98.46977996826172, -} -`; - -exports[`multipart latlong Arrow Table enumerates Table rows 5`] = ` -Object { - "origin_lat": 29.533695220947266, - "origin_lon": -98.46977996826172, -} -`; - -exports[`multipart latlong Arrow Table enumerates Table rows compact 1`] = ` -Array [ - 35.393089294433594, - -97.6007308959961, -] -`; - -exports[`multipart latlong Arrow Table enumerates Table rows compact 2`] = ` -Array [ - 35.393089294433594, - 
-97.6007308959961, -] -`; - -exports[`multipart latlong Arrow Table enumerates Table rows compact 3`] = ` -Array [ - 35.393089294433594, - -97.6007308959961, -] -`; - -exports[`multipart latlong Arrow Table enumerates Table rows compact 4`] = ` -Array [ - 29.533695220947266, - -98.46977996826172, -] -`; - -exports[`multipart latlong Arrow Table enumerates Table rows compact 5`] = ` -Array [ - 29.533695220947266, - -98.46977996826172, -] -`; - -exports[`multipart latlong Arrow Table toString() prints a pretty Table 1`] = ` -" origin_lat, origin_lon -35.393089294433594, -97.6007308959961 -35.393089294433594, -97.6007308959961 -35.393089294433594, -97.6007308959961 -29.533695220947266, -98.46977996826172 -29.533695220947266, -98.46977996826172" -`; - -exports[`multipart latlong Arrow Table toString() prints an empty Table 1`] = `""`; - -exports[`multipart latlong Arrow Table toString({ index: true }) prints a pretty Table with an Index column 1`] = ` -"Index, origin_lat, origin_lon - 0, 35.393089294433594, -97.6007308959961 - 1, 35.393089294433594, -97.6007308959961 - 2, 35.393089294433594, -97.6007308959961 - 3, 29.533695220947266, -98.46977996826172 - 4, 29.533695220947266, -98.46977996826172" -`; - -exports[`multipart origins Arrow Table creates a Table from Arrow buffers 1`] = `"origin_city"`; - -exports[`multipart origins Arrow Table creates a Table from Arrow buffers 2`] = `"Utf8"`; - -exports[`multipart origins Arrow Table creates a Table from Arrow buffers 3`] = `5`; - -exports[`multipart origins Arrow Table creates a Table from Arrow buffers 4`] = `"Oklahoma City"`; - -exports[`multipart origins Arrow Table creates a Table from Arrow buffers 5`] = `"Oklahoma City"`; - -exports[`multipart origins Arrow Table creates a Table from Arrow buffers 6`] = `"Oklahoma City"`; - -exports[`multipart origins Arrow Table creates a Table from Arrow buffers 7`] = `"San Antonio"`; - -exports[`multipart origins Arrow Table creates a Table from Arrow buffers 8`] = `"San 
Antonio"`; - -exports[`multipart origins Arrow Table enumerates Table rows 1`] = ` -Object { - "origin_city": "Oklahoma City", -} -`; - -exports[`multipart origins Arrow Table enumerates Table rows 2`] = ` -Object { - "origin_city": "Oklahoma City", -} -`; - -exports[`multipart origins Arrow Table enumerates Table rows 3`] = ` -Object { - "origin_city": "Oklahoma City", -} -`; - -exports[`multipart origins Arrow Table enumerates Table rows 4`] = ` -Object { - "origin_city": "San Antonio", -} -`; - -exports[`multipart origins Arrow Table enumerates Table rows 5`] = ` -Object { - "origin_city": "San Antonio", -} -`; - -exports[`multipart origins Arrow Table enumerates Table rows compact 1`] = ` -Array [ - "Oklahoma City", -] -`; - -exports[`multipart origins Arrow Table enumerates Table rows compact 2`] = ` -Array [ - "Oklahoma City", -] -`; - -exports[`multipart origins Arrow Table enumerates Table rows compact 3`] = ` -Array [ - "Oklahoma City", -] -`; - -exports[`multipart origins Arrow Table enumerates Table rows compact 4`] = ` -Array [ - "San Antonio", -] -`; - -exports[`multipart origins Arrow Table enumerates Table rows compact 5`] = ` -Array [ - "San Antonio", -] -`; - -exports[`multipart origins Arrow Table toString() prints a pretty Table 1`] = ` -" origin_city -Oklahoma City -Oklahoma City -Oklahoma City - San Antonio - San Antonio" -`; - -exports[`multipart origins Arrow Table toString() prints an empty Table 1`] = `""`; - -exports[`multipart origins Arrow Table toString({ index: true }) prints a pretty Table with an Index column 1`] = ` -"Index, origin_city - 0, Oklahoma City - 1, Oklahoma City - 2, Oklahoma City - 3, San Antonio - 4, San Antonio" -`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 1`] = `"foo"`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 2`] = `"Int"`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 3`] = `5`; - -exports[`simple file Arrow Table creates a Table 
from Arrow buffers 4`] = `1`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 5`] = `null`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 6`] = `3`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 7`] = `4`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 8`] = `5`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 9`] = `"bar"`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 10`] = `"FloatingPoint"`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 11`] = `5`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 12`] = `1`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 13`] = `null`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 14`] = `null`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 15`] = `4`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 16`] = `5`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 17`] = `"baz"`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 18`] = `"Utf8"`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 19`] = `5`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 20`] = `"aa"`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 21`] = `null`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 22`] = `null`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 23`] = `"bbb"`; - -exports[`simple file Arrow Table creates a Table from Arrow buffers 24`] = `"cccc"`; - -exports[`simple file Arrow Table enumerates Table rows 1`] = ` -Object { - "bar": 1, - "baz": "aa", - "foo": 1, -} -`; - -exports[`simple file Arrow Table enumerates Table rows 2`] = ` -Object { - "bar": null, - "baz": null, - "foo": null, -} -`; - 
-exports[`simple file Arrow Table enumerates Table rows 3`] = ` -Object { - "bar": null, - "baz": null, - "foo": 3, -} -`; - -exports[`simple file Arrow Table enumerates Table rows 4`] = ` -Object { - "bar": 4, - "baz": "bbb", - "foo": 4, -} -`; - -exports[`simple file Arrow Table enumerates Table rows 5`] = ` -Object { - "bar": 5, - "baz": "cccc", - "foo": 5, -} -`; - -exports[`simple file Arrow Table enumerates Table rows compact 1`] = ` -Array [ - 1, - 1, - "aa", -] -`; - -exports[`simple file Arrow Table enumerates Table rows compact 2`] = ` -Array [ - null, - null, - null, -] -`; - -exports[`simple file Arrow Table enumerates Table rows compact 3`] = ` -Array [ - 3, - null, - null, -] -`; - -exports[`simple file Arrow Table enumerates Table rows compact 4`] = ` -Array [ - 4, - 4, - "bbb", -] -`; - -exports[`simple file Arrow Table enumerates Table rows compact 5`] = ` -Array [ - 5, - 5, - "cccc", -] -`; - -exports[`simple file Arrow Table toString() prints a pretty Table 1`] = ` -" foo, bar, baz - 1, 1, aa -null, null, null - 3, null, null - 4, 4, bbb - 5, 5, cccc" -`; - -exports[`simple file Arrow Table toString() prints an empty Table 1`] = `""`; - -exports[`simple file Arrow Table toString({ index: true }) prints a pretty Table with an Index column 1`] = ` -"Index, foo, bar, baz - 0, 1, 1, aa - 1, null, null, null - 2, 3, null, null - 3, 4, 4, bbb - 4, 5, 5, cccc" -`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 1`] = `"foo"`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 2`] = `"Int"`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 3`] = `5`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 4`] = `1`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 5`] = `null`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 6`] = `3`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 7`] = `4`; - 
-exports[`simple stream Arrow Table creates a Table from Arrow buffers 8`] = `5`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 9`] = `"bar"`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 10`] = `"FloatingPoint"`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 11`] = `5`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 12`] = `1`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 13`] = `null`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 14`] = `null`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 15`] = `4`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 16`] = `5`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 17`] = `"baz"`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 18`] = `"Utf8"`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 19`] = `5`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 20`] = `"aa"`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 21`] = `null`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 22`] = `null`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 23`] = `"bbb"`; - -exports[`simple stream Arrow Table creates a Table from Arrow buffers 24`] = `"cccc"`; - -exports[`simple stream Arrow Table enumerates Table rows 1`] = ` -Object { - "bar": 1, - "baz": "aa", - "foo": 1, -} -`; - -exports[`simple stream Arrow Table enumerates Table rows 2`] = ` -Object { - "bar": null, - "baz": null, - "foo": null, -} -`; - -exports[`simple stream Arrow Table enumerates Table rows 3`] = ` -Object { - "bar": null, - "baz": null, - "foo": 3, -} -`; - -exports[`simple stream Arrow Table enumerates Table rows 4`] = ` -Object { - "bar": 4, - "baz": "bbb", - "foo": 4, -} -`; - 
-exports[`simple stream Arrow Table enumerates Table rows 5`] = ` -Object { - "bar": 5, - "baz": "cccc", - "foo": 5, -} -`; - -exports[`simple stream Arrow Table enumerates Table rows compact 1`] = ` -Array [ - 1, - 1, - "aa", -] -`; - -exports[`simple stream Arrow Table enumerates Table rows compact 2`] = ` -Array [ - null, - null, - null, -] -`; - -exports[`simple stream Arrow Table enumerates Table rows compact 3`] = ` -Array [ - 3, - null, - null, -] -`; - -exports[`simple stream Arrow Table enumerates Table rows compact 4`] = ` -Array [ - 4, - 4, - "bbb", -] -`; - -exports[`simple stream Arrow Table enumerates Table rows compact 5`] = ` -Array [ - 5, - 5, - "cccc", -] -`; - -exports[`simple stream Arrow Table toString() prints a pretty Table 1`] = ` -" foo, bar, baz - 1, 1, aa -null, null, null - 3, null, null - 4, 4, bbb - 5, 5, cccc" -`; - -exports[`simple stream Arrow Table toString() prints an empty Table 1`] = `""`; - -exports[`simple stream Arrow Table toString({ index: true }) prints a pretty Table with an Index column 1`] = ` -"Index, foo, bar, baz - 0, 1, 1, aa - 1, null, null, null - 2, 3, null, null - 3, 4, 4, bbb - 4, 5, 5, cccc" -`; - -exports[`struct file Arrow Table creates a Table from Arrow buffers 1`] = `"struct_nullable"`; - -exports[`struct file Arrow Table creates a Table from Arrow buffers 2`] = `"Struct_"`; - -exports[`struct file Arrow Table creates a Table from Arrow buffers 3`] = `17`; - -exports[`struct file Arrow Table creates a Table from Arrow buffers 4`] = `null`; - -exports[`struct file Arrow Table creates a Table from Arrow buffers 5`] = ` -Array [ - null, - "MhRNxD4", -] -`; - -exports[`struct file Arrow Table creates a Table from Arrow buffers 6`] = ` -Array [ - 137773603, - "3F9HBxK", -] -`; - -exports[`struct file Arrow Table creates a Table from Arrow buffers 7`] = ` -Array [ - 410361374, - "aVd88fp", -] -`; - -exports[`struct file Arrow Table creates a Table from Arrow buffers 8`] = `null`; - -exports[`struct file Arrow 
Table creates a Table from Arrow buffers 9`] = ` -Array [ - null, - "3loZrRf", -] -`; - -exports[`struct file Arrow Table creates a Table from Arrow buffers 10`] = `null`; - -exports[`struct file Arrow Table creates a Table from Arrow buffers 11`] = `null`; - -exports[`struct file Arrow Table creates a Table from Arrow buffers 12`] = ` -Array [ - null, - null, -] -`; - -exports[`struct file Arrow Table creates a Table from Arrow buffers 13`] = ` -Array [ - null, - null, -] -`; - -exports[`struct file Arrow Table creates a Table from Arrow buffers 14`] = `null`; - -exports[`struct file Arrow Table creates a Table from Arrow buffers 15`] = ` -Array [ - null, - "78SLiRw", -] -`; - -exports[`struct file Arrow Table creates a Table from Arrow buffers 16`] = `null`; - -exports[`struct file Arrow Table creates a Table from Arrow buffers 17`] = `null`; - -exports[`struct file Arrow Table creates a Table from Arrow buffers 18`] = ` -Array [ - null, - "0ilsf82", -] -`; - -exports[`struct file Arrow Table creates a Table from Arrow buffers 19`] = ` -Array [ - null, - "LjS9MbU", -] -`; - -exports[`struct file Arrow Table creates a Table from Arrow buffers 20`] = ` -Array [ - null, - null, -] -`; - -exports[`struct file Arrow Table enumerates Table rows 1`] = ` -Object { - "struct_nullable": null, -} -`; - -exports[`struct file Arrow Table enumerates Table rows 2`] = ` -Object { - "struct_nullable": Array [ - null, - "MhRNxD4", - ], -} -`; - -exports[`struct file Arrow Table enumerates Table rows 3`] = ` -Object { - "struct_nullable": Array [ - 137773603, - "3F9HBxK", - ], -} -`; - -exports[`struct file Arrow Table enumerates Table rows 4`] = ` -Object { - "struct_nullable": Array [ - 410361374, - "aVd88fp", - ], -} -`; - -exports[`struct file Arrow Table enumerates Table rows 5`] = ` -Object { - "struct_nullable": null, -} -`; - -exports[`struct file Arrow Table enumerates Table rows 6`] = ` -Object { - "struct_nullable": Array [ - null, - "3loZrRf", - ], -} -`; - 
-exports[`struct file Arrow Table enumerates Table rows 7`] = ` -Object { - "struct_nullable": null, -} -`; - -exports[`struct file Arrow Table enumerates Table rows 8`] = ` -Object { - "struct_nullable": null, -} -`; - -exports[`struct file Arrow Table enumerates Table rows 9`] = ` -Object { - "struct_nullable": Array [ - null, - null, - ], -} -`; - -exports[`struct file Arrow Table enumerates Table rows 10`] = ` -Object { - "struct_nullable": Array [ - null, - null, - ], -} -`; - -exports[`struct file Arrow Table enumerates Table rows 11`] = ` -Object { - "struct_nullable": null, -} -`; - -exports[`struct file Arrow Table enumerates Table rows 12`] = ` -Object { - "struct_nullable": Array [ - null, - "78SLiRw", - ], -} -`; - -exports[`struct file Arrow Table enumerates Table rows 13`] = ` -Object { - "struct_nullable": null, -} -`; - -exports[`struct file Arrow Table enumerates Table rows 14`] = ` -Object { - "struct_nullable": null, -} -`; - -exports[`struct file Arrow Table enumerates Table rows 15`] = ` -Object { - "struct_nullable": Array [ - null, - "0ilsf82", - ], -} -`; - -exports[`struct file Arrow Table enumerates Table rows 16`] = ` -Object { - "struct_nullable": Array [ - null, - "LjS9MbU", - ], -} -`; - -exports[`struct file Arrow Table enumerates Table rows 17`] = ` -Object { - "struct_nullable": Array [ - null, - null, - ], -} -`; - -exports[`struct file Arrow Table enumerates Table rows compact 1`] = ` -Array [ - null, -] -`; - -exports[`struct file Arrow Table enumerates Table rows compact 2`] = ` -Array [ - Array [ - null, - "MhRNxD4", - ], -] -`; - -exports[`struct file Arrow Table enumerates Table rows compact 3`] = ` -Array [ - Array [ - 137773603, - "3F9HBxK", - ], -] -`; - -exports[`struct file Arrow Table enumerates Table rows compact 4`] = ` -Array [ - Array [ - 410361374, - "aVd88fp", - ], -] -`; - -exports[`struct file Arrow Table enumerates Table rows compact 5`] = ` -Array [ - null, -] -`; - -exports[`struct file Arrow Table enumerates 
Table rows compact 6`] = ` -Array [ - Array [ - null, - "3loZrRf", - ], -] -`; - -exports[`struct file Arrow Table enumerates Table rows compact 7`] = ` -Array [ - null, -] -`; - -exports[`struct file Arrow Table enumerates Table rows compact 8`] = ` -Array [ - null, -] -`; - -exports[`struct file Arrow Table enumerates Table rows compact 9`] = ` -Array [ - Array [ - null, - null, - ], -] -`; - -exports[`struct file Arrow Table enumerates Table rows compact 10`] = ` -Array [ - Array [ - null, - null, - ], -] -`; - -exports[`struct file Arrow Table enumerates Table rows compact 11`] = ` -Array [ - null, -] -`; - -exports[`struct file Arrow Table enumerates Table rows compact 12`] = ` -Array [ - Array [ - null, - "78SLiRw", - ], -] -`; - -exports[`struct file Arrow Table enumerates Table rows compact 13`] = ` -Array [ - null, -] -`; - -exports[`struct file Arrow Table enumerates Table rows compact 14`] = ` -Array [ - null, -] -`; - -exports[`struct file Arrow Table enumerates Table rows compact 15`] = ` -Array [ - Array [ - null, - "0ilsf82", - ], -] -`; - -exports[`struct file Arrow Table enumerates Table rows compact 16`] = ` -Array [ - Array [ - null, - "LjS9MbU", - ], -] -`; - -exports[`struct file Arrow Table enumerates Table rows compact 17`] = ` -Array [ - Array [ - null, - null, - ], -] -`; - -exports[`struct file Arrow Table toString() prints a pretty Table 1`] = ` -" struct_nullable - null - ,MhRNxD4 -137773603,3F9HBxK -410361374,aVd88fp - null - ,3loZrRf - null - null - , - , - null - ,78SLiRw - null - null - ,0ilsf82 - ,LjS9MbU - ," -`; - -exports[`struct file Arrow Table toString() prints an empty Table 1`] = `""`; - -exports[`struct file Arrow Table toString({ index: true }) prints a pretty Table with an Index column 1`] = ` -"Index, struct_nullable - 0, null - 1, ,MhRNxD4 - 2, 137773603,3F9HBxK - 3, 410361374,aVd88fp - 4, null - 5, ,3loZrRf - 6, null - 7, null - 8, , - 9, , - 10, null - 11, ,78SLiRw - 12, null - 13, null - 14, ,0ilsf82 - 15, ,LjS9MbU - 
16, ," -`; - -exports[`struct stream Arrow Table creates a Table from Arrow buffers 1`] = `"struct_nullable"`; - -exports[`struct stream Arrow Table creates a Table from Arrow buffers 2`] = `"Struct_"`; - -exports[`struct stream Arrow Table creates a Table from Arrow buffers 3`] = `17`; - -exports[`struct stream Arrow Table creates a Table from Arrow buffers 4`] = `null`; - -exports[`struct stream Arrow Table creates a Table from Arrow buffers 5`] = ` -Array [ - null, - "MhRNxD4", -] -`; - -exports[`struct stream Arrow Table creates a Table from Arrow buffers 6`] = ` -Array [ - 137773603, - "3F9HBxK", -] -`; - -exports[`struct stream Arrow Table creates a Table from Arrow buffers 7`] = ` -Array [ - 410361374, - "aVd88fp", -] -`; - -exports[`struct stream Arrow Table creates a Table from Arrow buffers 8`] = `null`; - -exports[`struct stream Arrow Table creates a Table from Arrow buffers 9`] = ` -Array [ - null, - "3loZrRf", -] -`; - -exports[`struct stream Arrow Table creates a Table from Arrow buffers 10`] = `null`; - -exports[`struct stream Arrow Table creates a Table from Arrow buffers 11`] = `null`; - -exports[`struct stream Arrow Table creates a Table from Arrow buffers 12`] = ` -Array [ - null, - null, -] -`; - -exports[`struct stream Arrow Table creates a Table from Arrow buffers 13`] = ` -Array [ - null, - null, -] -`; - -exports[`struct stream Arrow Table creates a Table from Arrow buffers 14`] = `null`; - -exports[`struct stream Arrow Table creates a Table from Arrow buffers 15`] = ` -Array [ - null, - "78SLiRw", -] -`; - -exports[`struct stream Arrow Table creates a Table from Arrow buffers 16`] = `null`; - -exports[`struct stream Arrow Table creates a Table from Arrow buffers 17`] = `null`; - -exports[`struct stream Arrow Table creates a Table from Arrow buffers 18`] = ` -Array [ - null, - "0ilsf82", -] -`; - -exports[`struct stream Arrow Table creates a Table from Arrow buffers 19`] = ` -Array [ - null, - "LjS9MbU", -] -`; - -exports[`struct stream 
Arrow Table creates a Table from Arrow buffers 20`] = ` -Array [ - null, - null, -] -`; - -exports[`struct stream Arrow Table enumerates Table rows 1`] = ` -Object { - "struct_nullable": null, -} -`; - -exports[`struct stream Arrow Table enumerates Table rows 2`] = ` -Object { - "struct_nullable": Array [ - null, - "MhRNxD4", - ], -} -`; - -exports[`struct stream Arrow Table enumerates Table rows 3`] = ` -Object { - "struct_nullable": Array [ - 137773603, - "3F9HBxK", - ], -} -`; - -exports[`struct stream Arrow Table enumerates Table rows 4`] = ` -Object { - "struct_nullable": Array [ - 410361374, - "aVd88fp", - ], -} -`; - -exports[`struct stream Arrow Table enumerates Table rows 5`] = ` -Object { - "struct_nullable": null, -} -`; - -exports[`struct stream Arrow Table enumerates Table rows 6`] = ` -Object { - "struct_nullable": Array [ - null, - "3loZrRf", - ], -} -`; - -exports[`struct stream Arrow Table enumerates Table rows 7`] = ` -Object { - "struct_nullable": null, -} -`; - -exports[`struct stream Arrow Table enumerates Table rows 8`] = ` -Object { - "struct_nullable": null, -} -`; - -exports[`struct stream Arrow Table enumerates Table rows 9`] = ` -Object { - "struct_nullable": Array [ - null, - null, - ], -} -`; - -exports[`struct stream Arrow Table enumerates Table rows 10`] = ` -Object { - "struct_nullable": Array [ - null, - null, - ], -} -`; - -exports[`struct stream Arrow Table enumerates Table rows 11`] = ` -Object { - "struct_nullable": null, -} -`; - -exports[`struct stream Arrow Table enumerates Table rows 12`] = ` -Object { - "struct_nullable": Array [ - null, - "78SLiRw", - ], -} -`; - -exports[`struct stream Arrow Table enumerates Table rows 13`] = ` -Object { - "struct_nullable": null, -} -`; - -exports[`struct stream Arrow Table enumerates Table rows 14`] = ` -Object { - "struct_nullable": null, -} -`; - -exports[`struct stream Arrow Table enumerates Table rows 15`] = ` -Object { - "struct_nullable": Array [ - null, - "0ilsf82", - ], -} -`; - 
-exports[`struct stream Arrow Table enumerates Table rows 16`] = ` -Object { - "struct_nullable": Array [ - null, - "LjS9MbU", - ], -} -`; - -exports[`struct stream Arrow Table enumerates Table rows 17`] = ` -Object { - "struct_nullable": Array [ - null, - null, - ], -} -`; - -exports[`struct stream Arrow Table enumerates Table rows compact 1`] = ` -Array [ - null, -] -`; - -exports[`struct stream Arrow Table enumerates Table rows compact 2`] = ` -Array [ - Array [ - null, - "MhRNxD4", - ], -] -`; - -exports[`struct stream Arrow Table enumerates Table rows compact 3`] = ` -Array [ - Array [ - 137773603, - "3F9HBxK", - ], -] -`; - -exports[`struct stream Arrow Table enumerates Table rows compact 4`] = ` -Array [ - Array [ - 410361374, - "aVd88fp", - ], -] -`; - -exports[`struct stream Arrow Table enumerates Table rows compact 5`] = ` -Array [ - null, -] -`; - -exports[`struct stream Arrow Table enumerates Table rows compact 6`] = ` -Array [ - Array [ - null, - "3loZrRf", - ], -] -`; - -exports[`struct stream Arrow Table enumerates Table rows compact 7`] = ` -Array [ - null, -] -`; - -exports[`struct stream Arrow Table enumerates Table rows compact 8`] = ` -Array [ - null, -] -`; - -exports[`struct stream Arrow Table enumerates Table rows compact 9`] = ` -Array [ - Array [ - null, - null, - ], -] -`; - -exports[`struct stream Arrow Table enumerates Table rows compact 10`] = ` -Array [ - Array [ - null, - null, - ], -] -`; - -exports[`struct stream Arrow Table enumerates Table rows compact 11`] = ` -Array [ - null, -] -`; - -exports[`struct stream Arrow Table enumerates Table rows compact 12`] = ` -Array [ - Array [ - null, - "78SLiRw", - ], -] -`; - -exports[`struct stream Arrow Table enumerates Table rows compact 13`] = ` -Array [ - null, -] -`; - -exports[`struct stream Arrow Table enumerates Table rows compact 14`] = ` -Array [ - null, -] -`; - -exports[`struct stream Arrow Table enumerates Table rows compact 15`] = ` -Array [ - Array [ - null, - "0ilsf82", - ], 
-] -`; - -exports[`struct stream Arrow Table enumerates Table rows compact 16`] = ` -Array [ - Array [ - null, - "LjS9MbU", - ], -] -`; - -exports[`struct stream Arrow Table enumerates Table rows compact 17`] = ` -Array [ - Array [ - null, - null, - ], -] -`; - -exports[`struct stream Arrow Table toString() prints a pretty Table 1`] = ` -" struct_nullable - null - ,MhRNxD4 -137773603,3F9HBxK -410361374,aVd88fp - null - ,3loZrRf - null - null - , - , - null - ,78SLiRw - null - null - ,0ilsf82 - ,LjS9MbU - ," -`; - -exports[`struct stream Arrow Table toString() prints an empty Table 1`] = `""`; - -exports[`struct stream Arrow Table toString({ index: true }) prints a pretty Table with an Index column 1`] = ` -"Index, struct_nullable - 0, null - 1, ,MhRNxD4 - 2, 137773603,3F9HBxK - 3, 410361374,aVd88fp - 4, null - 5, ,3loZrRf - 6, null - 7, null - 8, , - 9, , - 10, null - 11, ,78SLiRw - 12, null - 13, null - 14, ,0ilsf82 - 15, ,LjS9MbU - 16, ," -`; diff --git a/js/test/arrows/file/dictionary.arrow b/js/test/arrows/file/dictionary.arrow deleted file mode 100644 index 34d41db1f2001..0000000000000 Binary files a/js/test/arrows/file/dictionary.arrow and /dev/null differ diff --git a/js/test/arrows/file/dictionary2.arrow b/js/test/arrows/file/dictionary2.arrow deleted file mode 100644 index 1537f54db79ee..0000000000000 Binary files a/js/test/arrows/file/dictionary2.arrow and /dev/null differ diff --git a/js/test/arrows/file/multi_dictionary.arrow b/js/test/arrows/file/multi_dictionary.arrow deleted file mode 100644 index 113d30da78565..0000000000000 Binary files a/js/test/arrows/file/multi_dictionary.arrow and /dev/null differ diff --git a/js/test/arrows/file/simple.arrow b/js/test/arrows/file/simple.arrow deleted file mode 100644 index 838db6dc8eda5..0000000000000 Binary files a/js/test/arrows/file/simple.arrow and /dev/null differ diff --git a/js/test/arrows/file/struct.arrow b/js/test/arrows/file/struct.arrow deleted file mode 100644 index 3d2c018e6c27c..0000000000000 
Binary files a/js/test/arrows/file/struct.arrow and /dev/null differ diff --git a/js/test/arrows/multi/count/records.arrow b/js/test/arrows/multi/count/records.arrow deleted file mode 100644 index 00d883762d369..0000000000000 Binary files a/js/test/arrows/multi/count/records.arrow and /dev/null differ diff --git a/js/test/arrows/multi/count/schema.arrow b/js/test/arrows/multi/count/schema.arrow deleted file mode 100644 index dfd24e9e0018c..0000000000000 Binary files a/js/test/arrows/multi/count/schema.arrow and /dev/null differ diff --git a/js/test/arrows/multi/latlong/records.arrow b/js/test/arrows/multi/latlong/records.arrow deleted file mode 100644 index 563d12d175d4e..0000000000000 Binary files a/js/test/arrows/multi/latlong/records.arrow and /dev/null differ diff --git a/js/test/arrows/multi/latlong/schema.arrow b/js/test/arrows/multi/latlong/schema.arrow deleted file mode 100644 index 638b2ab622f8e..0000000000000 Binary files a/js/test/arrows/multi/latlong/schema.arrow and /dev/null differ diff --git a/js/test/arrows/multi/origins/records.arrow b/js/test/arrows/multi/origins/records.arrow deleted file mode 100644 index 49a8c407e176e..0000000000000 Binary files a/js/test/arrows/multi/origins/records.arrow and /dev/null differ diff --git a/js/test/arrows/multi/origins/schema.arrow b/js/test/arrows/multi/origins/schema.arrow deleted file mode 100644 index 0d10fb0e2d135..0000000000000 Binary files a/js/test/arrows/multi/origins/schema.arrow and /dev/null differ diff --git a/js/test/arrows/stream/dictionary.arrow b/js/test/arrows/stream/dictionary.arrow deleted file mode 100644 index 17ca48b3a97f5..0000000000000 Binary files a/js/test/arrows/stream/dictionary.arrow and /dev/null differ diff --git a/js/test/arrows/stream/simple.arrow b/js/test/arrows/stream/simple.arrow deleted file mode 100644 index 2c68c0e44b0af..0000000000000 Binary files a/js/test/arrows/stream/simple.arrow and /dev/null differ diff --git a/js/test/arrows/stream/struct.arrow 
b/js/test/arrows/stream/struct.arrow deleted file mode 100644 index 4e97b7084f6b7..0000000000000 Binary files a/js/test/arrows/stream/struct.arrow and /dev/null differ diff --git a/js/test/integration/test-config.ts b/js/test/integration/test-config.ts new file mode 100644 index 0000000000000..d185ecc922c47 --- /dev/null +++ b/js/test/integration/test-config.ts @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import * as fs from 'fs'; +import * as path from 'path'; +import * as glob from 'glob'; + +export const sources = (process.env.TEST_SOURCES + ? JSON.parse(process.env.TEST_SOURCES + '') + : [`cpp`, `java`]) as ['cpp' | 'java']; + +export const formats = (process.env.TEST_FORMATS + ? 
JSON.parse(process.env.TEST_FORMATS + '') + : [`file`, `stream`]) as ['file' | 'stream']; + +export const config = sources.reduce((sources, source) => ({ + ...sources, + [source]: formats.reduce((formats, format) => ({ + ...formats, + [format]: loadArrows(source, format) + }), {}) +}), {}) as { + [k in 'cpp' | 'java']: { + [k in 'file' | 'stream']: Arrows + } +}; + +export type Arrows = { name: string, buffers: Uint8Array[] }[]; + +function loadArrows(source: string, format: string) { + const arrows = []; + const filenames = glob.sync(path.resolve(__dirname, `data/${source}/${format}`, `*.arrow`)); + for (const filename of filenames) { + const { name } = path.parse(filename); + arrows.push({ name, buffers: [fs.readFileSync(filename)] }); + } + return arrows as Arrows; +} diff --git a/js/test/integration/validate-tests.ts b/js/test/integration/validate-tests.ts new file mode 100644 index 0000000000000..c612d62ad0c04 --- /dev/null +++ b/js/test/integration/validate-tests.ts @@ -0,0 +1,128 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import * as fs from 'fs'; +import * as path from 'path'; + +if (!process.env.JSON_PATH || !process.env.ARROW_PATH) { + throw new Error('Integration tests need paths to both json and arrow files'); +} + +const jsonPath = path.resolve(process.env.JSON_PATH + ''); +const arrowPath = path.resolve(process.env.ARROW_PATH + ''); + +if (!fs.existsSync(jsonPath) || !fs.existsSync(arrowPath)) { + throw new Error('Integration tests need both json and arrow files to exist'); +} + +/* tslint:disable */ +const { parse } = require('json-bignum'); + +const jsonData = parse(fs.readFileSync(jsonPath, 'utf8')); +const arrowBuffers: Uint8Array[] = [fs.readFileSync(arrowPath)]; + +import Arrow from '../Arrow'; +import { zip } from 'ix/iterable/zip'; +import { toArray } from 'ix/iterable/toarray'; + +const { Table, read } = Arrow; + +expect.extend({ + toEqualVector(v1: any, v2: any) { + + const format = (x: any, y: any, msg= ' ') => `${ + this.utils.printExpected(x)}${ + msg}${ + this.utils.printReceived(y) + }`; + + let getFailures = new Array(); + let propsFailures = new Array(); + let iteratorFailures = new Array(); + let allFailures = [ + { title: 'get', failures: getFailures }, + { title: 'props', failures: propsFailures }, + { title: 'iterator', failures: iteratorFailures } + ]; + + let props = ['name', 'type', 'length', 'nullable', 'nullCount', 'metadata']; + for (let i = -1, n = props.length; ++i < n;) { + const prop = props[i]; + if (this.utils.stringify(v1[prop]) !== this.utils.stringify(v2[prop])) { + propsFailures.push(`${prop}: ${format(v1[prop], v2[prop], ' !== ')}`); + } + } + + for (let i = -1, n = v1.length; ++i < n;) { + let x1 = v1.get(i), x2 = v2.get(i); + if (this.utils.stringify(x1) !== this.utils.stringify(x2)) { + getFailures.push(`${i}: ${format(x1, x2, ' !== ')}`); + } + } + + let i = -1; + for (let [x1, x2] of zip(v1, v2)) { + ++i; + if (this.utils.stringify(x1) !== this.utils.stringify(x2)) { + iteratorFailures.push(`${i}: ${format(x1, x2, ' !== ')}`); + } 
+ } + + return { + pass: allFailures.every(({ failures }) => failures.length === 0), + message: () => [ + `${v1.name}: (${format('json', 'arrow', ' !== ')})\n`, + ...allFailures.map(({ failures, title }) => + !failures.length ? `` : [`${title}:`, ...failures].join(`\n`)) + ].join('\n') + }; + } +}); + +describe(`Integration`, () => { + testReaderIntegration(); + testTableFromBuffersIntegration(); +}); + +function testReaderIntegration() { + test(`json and arrow buffers report the same values`, () => { + expect.hasAssertions(); + const jsonVectors = toArray(read(jsonData)); + const binaryVectors = toArray(read(arrowBuffers)); + for (const [jVectors, bVectors] of zip(jsonVectors, binaryVectors)) { + expect(jVectors.length).toEqual(bVectors.length); + for (let i = -1, n = jVectors.length; ++i < n;) { + (expect(jVectors[i]) as any).toEqualVector(bVectors[i]); + } + } + }); +} + +function testTableFromBuffersIntegration() { + test(`json and arrow buffers report the same values`, () => { + expect.hasAssertions(); + const jsonTable = Table.from(jsonData); + const binaryTable = Table.from(arrowBuffers); + const jsonVectors = jsonTable.columns; + const binaryVectors = binaryTable.columns; + expect(jsonTable.length).toEqual(binaryTable.length); + expect(jsonVectors.length).toEqual(binaryVectors.length); + for (let i = -1, n = jsonVectors.length; ++i < n;) { + (expect(jsonVectors[i]) as any).toEqualVector(binaryVectors[i]); + } + }); +} diff --git a/js/test/reader-tests.ts b/js/test/reader-tests.ts deleted file mode 100644 index a7f9f4110237f..0000000000000 --- a/js/test/reader-tests.ts +++ /dev/null @@ -1,50 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -import { readBuffers } from './Arrow'; -import arrowTestConfigurations from './test-config'; - -for (let [name, ...buffers] of arrowTestConfigurations) { - describe(`${name} readBuffers`, () => { - test(`enumerates each batch as an Array of Vectors`, () => { - expect.hasAssertions(); - for (let vectors of readBuffers(...buffers)) { - for (let vector of vectors) { - expect(vector.name).toMatchSnapshot(); - expect(vector.type).toMatchSnapshot(); - expect(vector.length).toMatchSnapshot(); - for (let i = -1, n = vector.length; ++i < n;) { - expect(vector.get(i)).toMatchSnapshot(); - } - } - } - }); - test(`vector iterators report the same values as get`, () => { - expect.hasAssertions(); - for (let vectors of readBuffers(...buffers)) { - for (let vector of vectors) { - let i = -1, n = vector.length; - for (let v of vector) { - expect(++i).toBeLessThan(n); - expect(v).toEqual(vector.get(i)); - } - expect(++i).toEqual(n); - } - } - }); - }); -} diff --git a/js/test/table-tests.ts b/js/test/table-tests.ts deleted file mode 100644 index c840299155af4..0000000000000 --- a/js/test/table-tests.ts +++ /dev/null @@ -1,88 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -import { Table, readBuffers } from './Arrow'; -import arrowTestConfigurations from './test-config'; - -for (let [name, ...buffers] of arrowTestConfigurations) { - describe(`${name} Table`, () => { - test(`creates a Table from Arrow buffers`, () => { - expect.hasAssertions(); - const table = Table.from(...buffers); - for (const vector of table.cols()) { - expect(vector.name).toMatchSnapshot(); - expect(vector.type).toMatchSnapshot(); - expect(vector.length).toMatchSnapshot(); - for (let i = -1, n = vector.length; ++i < n;) { - expect(vector.get(i)).toMatchSnapshot(); - } - } - }); - test(`vector iterators report the same values as get`, () => { - expect.hasAssertions(); - const table = Table.from(...buffers); - for (const vector of table.cols()) { - let i = -1, n = vector.length; - for (let v of vector) { - expect(++i).toBeLessThan(n); - expect(v).toEqual(vector.get(i)); - } - expect(++i).toEqual(n); - } - }); - test(`batch and Table Vectors report the same values`, () => { - expect.hasAssertions(); - let rowsTotal = 0, table = Table.from(...buffers); - for (let vectors of readBuffers(...buffers)) { - let rowsNow = Math.max(...vectors.map((v) => v.length)); - for (let vi = -1, vn = vectors.length; ++vi < vn;) { - let v1 = vectors[vi]; - let v2 = table.getColumnAt(vi); - expect(v1.name).toEqual(v2.name); - expect(v1.type).toEqual(v2.type); - for (let i = -1, n = v1.length; ++i < n;) { 
- expect(v1.get(i)).toEqual(v2.get(i + rowsTotal)); - } - } - rowsTotal += rowsNow; - } - }); - test(`enumerates Table rows`, () => { - expect.hasAssertions(); - const table = Table.from(...buffers); - for (const row of table.rows()) { - expect(row).toMatchSnapshot(); - } - }); - test(`enumerates Table rows compact`, () => { - expect.hasAssertions(); - const table = Table.from(...buffers); - for (const row of table.rows(true)) { - expect(row).toMatchSnapshot(); - } - }); - test(`toString() prints an empty Table`, () => { - expect(Table.from().toString()).toMatchSnapshot(); - }); - test(`toString() prints a pretty Table`, () => { - expect(Table.from(...buffers).toString()).toMatchSnapshot(); - }); - test(`toString({ index: true }) prints a pretty Table with an Index column`, () => { - expect(Table.from(...buffers).toString({ index: true })).toMatchSnapshot(); - }); - }); -} diff --git a/js/test/test-config.ts b/js/test/test-config.ts deleted file mode 100644 index b31ff11ad4173..0000000000000 --- a/js/test/test-config.ts +++ /dev/null @@ -1,42 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -import * as fs from 'fs'; -import * as path from 'path'; -const arrowFormats = ['file', 'stream']; -const arrowFileNames = ['simple', 'struct', 'dictionary', 'dictionary2', 'multi_dictionary']; -const multipartArrows = ['count', 'latlong', 'origins']; -export let arrowTestConfigurations = []; - -arrowTestConfigurations = arrowFormats.reduce((configs, format) => { - return arrowFileNames.reduce((configs, name) => { - const arrowPath = path.resolve(__dirname, `./arrows/${format}/${name}.arrow`); - try { - const arrowFile = fs.readFileSync(arrowPath); - return [...configs, [`${name} ${format} Arrow`, arrowFile]]; - } catch (e) {} - return configs; - }, configs); -}, arrowTestConfigurations); - -arrowTestConfigurations = multipartArrows.reduce((configs, folder) => { - const schemaPath = path.resolve(__dirname, `./arrows/multi/${folder}/schema.arrow`); - const recordsPath = path.resolve(__dirname, `./arrows/multi/${folder}/records.arrow`); - return [...configs, [`multipart ${folder} Arrow`, fs.readFileSync(schemaPath), fs.readFileSync(recordsPath)]]; -}, arrowTestConfigurations); - -export default arrowTestConfigurations; diff --git a/js/test/tsconfig.json b/js/test/tsconfig.json index c1ae204212b8f..838bb1d7041e3 100644 --- a/js/test/tsconfig.json +++ b/js/test/tsconfig.json @@ -2,7 +2,7 @@ "extends": "../tsconfig.json", "include": ["./**/*.ts"], "compilerOptions": { - "target": "ESNEXT", + "target": "es2015", "module": "commonjs", "allowJs": true, "importHelpers": false, diff --git a/js/test/unit/int-tests.ts b/js/test/unit/int-tests.ts new file mode 100644 index 0000000000000..74c96e8efbb6d --- /dev/null +++ b/js/test/unit/int-tests.ts @@ -0,0 +1,227 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { + Int64, + Uint64, + Int128 +} from '../Arrow'; + +describe(`Uint64`, () => { + test(`gets expected high/low bytes`, () => { + let i = new Uint64(new Uint32Array([5, 0])); + expect(i.high()).toEqual(0); + expect(i.low()).toEqual(5); + }); + test(`adds 32-bit numbers`, () => { + let a = new Uint64(new Uint32Array([5, 0])); + let b = new Uint64(new Uint32Array([9, 0])); + let expected = new Uint64(new Uint32Array([14, 0])); + expect(a.plus(b)).toEqual(expected); + }); + test(`addition overflows 32-bit numbers`, () => { + let a = new Uint64(new Uint32Array([0xffffffff, 0])); + let b = new Uint64(new Uint32Array([9, 0])); + let expected = new Uint64(new Uint32Array([8, 1])); + expect(a.plus(b)).toEqual(expected); + }); + test(`multiplies 32-bit numbers`, () => { + let a = new Uint64(new Uint32Array([5, 0])); + let b = new Uint64(new Uint32Array([9, 0])); + let expected = new Uint64(new Uint32Array([45, 0])); + expect(a.times(b)).toEqual(expected); + }); + test(`multiplication overflows 32-bit numbers`, () => { + let a = new Uint64(new Uint32Array([0x80000000, 0])); + let b = new Uint64(new Uint32Array([3, 0])); + let expected = new Uint64(new Uint32Array([0x80000000, 1])); + expect(a.times(b)).toEqual(expected); + }); + test(`multiplication is associative`, () => { + let a = new Uint64(new Uint32Array([0x80000000, 0])); + let b = new Uint64(new Uint32Array([3, 0])); + 
expect(Uint64.multiply(a, b)).toEqual(Uint64.multiply(b,a)); + }); + test(`lessThan works on 32-bit numbers`, () => { + let a = new Uint64(new Uint32Array([0x0000abcd, 0])); + let b = new Uint64(new Uint32Array([0x0000abcf, 0])); + expect(a.lessThan(b)).toBeTruthy(); + }); + test(`lessThan works on 64-bit numbers`, () => { + let a = new Uint64(new Uint32Array([123, 32])); + let b = new Uint64(new Uint32Array([568, 32])); + expect(a.lessThan(b)).toBeTruthy(); + }); +}); + +describe(`Int64`, () => { + test(`gets expected high/low bytes`, () => { + let i = new Int64(new Uint32Array([5, 0])); + expect(i.high()).toEqual(0); + expect(i.low()).toEqual(5); + }); + test(`adds 32-bit numbers`, () => { + let a = new Int64(new Uint32Array([5, 0])); + let b = new Int64(new Uint32Array([9, 0])); + let expected = new Int64(new Uint32Array([14, 0])); + expect(a.plus(b)).toEqual(expected); + }); + test(`adds negative 32-bit numbers`, () => { + let a = new Int64(new Uint32Array([56789 , 0])); + let b = new Int64(new Uint32Array([-66789, -1])); + let expected = new Int64(new Uint32Array([-10000, -1])); + expect(a.plus(b)).toEqual(expected); + }); + test(`addition overflows 32-bit numbers`, () => { + let a = new Int64(new Uint32Array([0xffffffff, 0])); + let b = new Int64(new Uint32Array([9, 0])); + let expected = new Int64(new Uint32Array([8, 1])); + expect(a.plus(b)).toEqual(expected); + }); + test(`multiplies 32-bit numbers`, () => { + let a = new Int64(new Uint32Array([5, 0])); + let b = new Int64(new Uint32Array([9, 0])); + let expected = new Int64(new Uint32Array([45, 0])); + expect(a.times(b)).toEqual(expected); + }); + test(`multiplication overflows 32-bit numbers`, () => { + let a = new Int64(new Uint32Array([0x80000000, 0])); + let b = new Int64(new Uint32Array([3, 0])); + let expected = new Int64(new Uint32Array([0x80000000, 1])); + expect(a.times(b)).toEqual(expected); + }); + test(`multiplication works on negative numbers`, () => { + let a = new Int64(new Uint32Array([-5, 
-1])); + let b = new Int64(new Uint32Array([-100, -1])); + expect(a.times(b)).toEqual(new Int64(new Uint32Array([ 500, 0]))); + expect(a.times(b)).toEqual(new Int64(new Uint32Array([ -50000, -1]))); + expect(a.times(b)).toEqual(new Int64(new Uint32Array([5000000, 0]))); + }); + test(`multiplication is associative`, () => { + let a = new Int64(new Uint32Array([0x80000000, 0])); + let b = new Int64(new Uint32Array([3, 0])); + expect(Int64.multiply(a, b)).toEqual(Int64.multiply(b,a)); + }); + test(`lessThan works on 32-bit numbers`, () => { + let a = new Int64(new Uint32Array([0x0000abcd, 0])); + let b = new Int64(new Uint32Array([0x0000abcf, 0])); + expect(a.lessThan(b)).toBeTruthy(); + }); + test(`lessThan works on 64-bit numbers`, () => { + let a = new Int64(new Uint32Array([123, 32])); + let b = new Int64(new Uint32Array([568, 32])); + expect(a.lessThan(b)).toBeTruthy(); + }); + test(`lessThan works on negative numbers`, () => { + let a = new Int64(new Uint32Array([0, -158])); + let b = new Int64(new Uint32Array([-3, -1])); + expect(a.lessThan(b)).toBeTruthy(); + }); + test(`lessThan works on mixed numbers`, () => { + let a = new Int64(new Uint32Array([-3, -1])); + let b = new Int64(new Uint32Array([ 0, 3])); + expect(a.lessThan(b)).toBeTruthy(); + }); + test(`negate works on 32-bit number`, () => { + expect (new Int64(new Uint32Array([123456, 0])).negate()).toEqual(new Int64(new Uint32Array([-123456, -1]))); + }); + test(`double negation is noop`, () => { + let test = new Int64(new Uint32Array([6789, 12345])); + let expected = new Int64(new Uint32Array([6789, 12345])); + expect(test.negate().negate()).toEqual(expected); + }); + test(`negate works on 64-bit number`, () => { + expect (new Int64(new Uint32Array([0xb74abf15, 0x62c])).negate()).toEqual(new Int64(new Uint32Array([0x48b540eb, 0xfffff9d3]))); + }); + test(`fromString parses string`, () => { + expect(Int64.fromString('6789123456789')).toEqual(new Int64(new Uint32Array([0xb74abf15, 0x62c]))); + }); + 
test(`fromString parses negative string`, () => { + expect(Int64.fromString('-6789123456789')).toEqual(new Int64(new Uint32Array([0x48b540eb, 0xfffff9d3]))); + }); +}); + +describe(`Int128`, () => { + test(`gets expected bytes`, () => { + let i = new Int128(new Uint32Array([4, 3, 2, 1])); + expect(i.high().high()).toEqual(1); + expect(i.high().low() ).toEqual(2); + expect(i.low().high() ).toEqual(3); + expect(i.low().low() ).toEqual(4); + }); + test(`adds 32-bit numbers`, () => { + let a = new Int128(new Uint32Array([5, 0, 0, 0])); + let b = new Int128(new Uint32Array([9, 0, 0, 0])); + let expected = new Int128(new Uint32Array([14, 0, 0, 0])); + expect(a.plus(b)).toEqual(expected); + }); + test(`adds negative 32-bit numbers`, () => { + let a = new Int128(new Uint32Array([56789 , 0, 0, 0])); + let b = new Int128(new Uint32Array([-66789, -1, -1, -1])); + let expected = new Int128(new Uint32Array([-10000, -1, -1, -1])); + expect(a.plus(b)).toEqual(expected); + }); + test(`addition overflows 32-bit numbers`, () => { + let a = new Int128(new Uint32Array([0xffffffff, 0, 0, 0])); + let b = new Int128(new Uint32Array([9, 0, 0, 0])); + let expected = new Int128(new Uint32Array([8, 1, 0, 0])); + expect(a.plus(b)).toEqual(expected); + }); + test(`multiplies 32-bit numbers`, () => { + let a = new Int128(new Uint32Array([5, 0, 0, 0])); + let b = new Int128(new Uint32Array([9, 0, 0, 0])); + let expected = new Int128(new Uint32Array([45, 0, 0, 0])); + expect(a.times(b)).toEqual(expected); + }); + test(`multiplication overflows 32-bit numbers`, () => { + let a = new Int128(new Uint32Array([0x80000000, 0, 0, 0])); + let b = new Int128(new Uint32Array([3, 0, 0, 0])); + let expected = new Int128(new Uint32Array([0x80000000, 1, 0, 0])); + expect(a.times(b)).toEqual(expected); + }); + test(`multiplication works on negative numbers`, () => { + let a = new Int128(new Uint32Array([-5, -1, -1, -1])); + let b = new Int128(new Uint32Array([-100, -1, -1, -1])); + 
expect(a.times(b)).toEqual(new Int128(new Uint32Array([ 500, 0, 0, 0]))); + expect(a.times(b)).toEqual(new Int128(new Uint32Array([ -50000, -1, -1, -1]))); + expect(a.times(b)).toEqual(new Int128(new Uint32Array([5000000, 0, 0, 0]))); + }); + test(`multiplication is associative`, () => { + let a = new Int128(new Uint32Array([4, 3, 2, 1])); + let b = new Int128(new Uint32Array([3, 0, 0, 0])); + expect(Int128.multiply(a, b)).toEqual(Int128.multiply(b,a)); + }); + test(`multiplication can produce 128-bit number`, () => { + let a = new Int128(new Uint32Array([0, 0xf0000000, 0, 0])); + let b = new Int128(new Uint32Array([0, 0x10000000, 0, 0])); + expect(a.times(b)).toEqual(new Int128(new Uint32Array([0x00000000, 0x00000000, 0x00000000, 0xf000000]))); + }); + test(`fromString parses string`, () => { + expect(Int128.fromString('1002111867823618826746863804903129070')) + .toEqual(new Int64(new Uint32Array([0x00c0ffee, + 0x00c0ffee, + 0x00c0ffee, + 0x00c0ffee]))); + }); + test(`fromString parses negative string`, () => { + expect(Int128.fromString('-12345678901234567890123456789012345678')) + .toEqual(new Int64(new Uint32Array([0x21c70cb2, + 0x3bb66faf, + 0x0ffdccec, + 0xf6b64f09]))); + }); +}); diff --git a/js/test/vector-tests.ts b/js/test/unit/vector-tests.ts similarity index 58% rename from js/test/vector-tests.ts rename to js/test/unit/vector-tests.ts index 0c9ef4404ed6a..75706229ab172 100644 --- a/js/test/vector-tests.ts +++ b/js/test/unit/vector-tests.ts @@ -15,11 +15,14 @@ // specific language governing permissions and limitations // under the License. 
-import { flatbuffers } from 'flatbuffers'; -import Long = flatbuffers.Long; -import { - BitVector, - TypedVector, +import Arrow, { + TypedArray, + TypedArrayConstructor, + NumericVectorConstructor, +} from '../Arrow'; + +const { + BoolVector, Int64Vector, Uint64Vector, Int8Vector, @@ -28,26 +31,39 @@ import { Uint8Vector, Uint16Vector, Uint32Vector, + Float16Vector, Float32Vector, Float64Vector, -} from './Arrow'; +} = Arrow; + +const FixedSizeVectors = { + Int64Vector: [Int64Vector, Int32Array] as [NumericVectorConstructor, any], + Uint64Vector: [Uint64Vector, Uint32Array] as [NumericVectorConstructor, any] +}; -const LongVectors = { Int64Vector, Uint64Vector }; -const ByteVectors = { Int8Vector, Int16Vector, Int32Vector, Uint8Vector, Uint16Vector, Uint32Vector, Float32Vector, Float64Vector }; +const FixedWidthVectors = { + Int8Vector: [Int8Vector, Int8Array] as [NumericVectorConstructor, any], + Int16Vector: [Int16Vector, Int16Array] as [NumericVectorConstructor, any], + Int32Vector: [Int32Vector, Int32Array] as [NumericVectorConstructor, any], + Uint8Vector: [Uint8Vector, Uint8Array] as [NumericVectorConstructor, any], + Uint16Vector: [Uint16Vector, Uint16Array] as [NumericVectorConstructor, any], + Uint32Vector: [Uint32Vector, Uint32Array] as [NumericVectorConstructor, any], + Float32Vector: [Float32Vector, Float32Array] as [NumericVectorConstructor, any], + Float64Vector: [Float64Vector, Float64Array] as [NumericVectorConstructor, any] +}; -const longVectors = toMap(LongVectors, Object.keys(LongVectors)); -const byteVectors = toMap(ByteVectors, Object.keys(ByteVectors)); +const fixedSizeVectors = toMap(FixedSizeVectors, Object.keys(FixedSizeVectors)); +const fixedWidthVectors = toMap(FixedWidthVectors, Object.keys(FixedWidthVectors)); const bytes = Array.from( { length: 5 }, () => Uint8Array.from( { length: 64 }, () => Math.random() * 255 | 0)); -describe(`BitVector`, () => { - const vector = new BitVector(new Uint8Array([27, 0, 0, 0, 0, 0, 0, 0])); 
+describe(`BoolVector`, () => { + const vector = new BoolVector({ data: new Uint8Array([27, 0, 0, 0, 0, 0, 0, 0]) }); const values = [true, true, false, true, true, false, false, false]; const n = values.length; - vector.length = 1; test(`gets expected values`, () => { let i = -1; while (++i < n) { @@ -62,11 +78,11 @@ describe(`BitVector`, () => { } }); test(`can set values to true and false`, () => { - const v = new BitVector(new Uint8Array([27, 0, 0, 0, 0, 0, 0, 0])); + const v = new BoolVector({ data: new Uint8Array([27, 0, 0, 0, 0, 0, 0, 0]) }); const expected1 = [true, true, false, true, true, false, false, false]; const expected2 = [true, true, true, true, true, false, false, false]; const expected3 = [true, true, false, false, false, false, true, true]; - function validate(expected) { + function validate(expected: boolean[]) { for (let i = -1; ++i < n;) { expect(v.get(i)).toEqual(expected[i]); } @@ -88,55 +104,93 @@ describe(`BitVector`, () => { validate(expected1); }); test(`packs 0 values`, () => { - expect(BitVector.pack([])).toEqual( + expect(BoolVector.pack([])).toEqual( new Uint8Array([0, 0, 0, 0, 0, 0, 0, 0])); }); test(`packs 3 values`, () => { - expect(BitVector.pack([ + expect(BoolVector.pack([ true, false, true ])).toEqual(new Uint8Array([5, 0, 0, 0, 0, 0, 0, 0])); }); test(`packs 8 values`, () => { - expect(BitVector.pack([ + expect(BoolVector.pack([ true, true, false, true, true, false, false, false ])).toEqual(new Uint8Array([27, 0, 0, 0, 0, 0, 0, 0])); }); test(`packs 25 values`, () => { - expect(BitVector.pack([ + expect(BoolVector.pack([ true, true, false, true, true, false, false, false, false, false, false, true, true, false, true, true, false ])).toEqual(new Uint8Array([27, 216, 0, 0, 0, 0, 0, 0])); }); test(`from with boolean Array packs values`, () => { - expect(BitVector.from([ - true, false, true - ]).slice()).toEqual(new Uint8Array([5, 0, 0, 0, 0, 0, 0, 0])); + expect(new BoolVector({ + data: BoolVector.pack([true, false, true]) + 
}).slice()).toEqual(new Uint8Array([5, 0, 0, 0, 0, 0, 0, 0])); + }); +}); + +describe('Float16Vector', () => { + const values = concatTyped(Uint16Array, ...bytes); + const vector = bytes + .map((b) => new Float16Vector({ data: new Uint16Array(b.buffer) })) + .reduce((v: any, v2) => v.concat(v2)); + const n = values.length; + const clamp = (x: number) => Math.min((x - 32767) / 32767, 1); + test(`gets expected values`, () => { + let i = -1; + while (++i < n) { + expect(vector.get(i)).toEqual(clamp(values[i])); + } + }); + test(`iterates expected values`, () => { + expect.hasAssertions(); + let i = -1; + for (let v of vector) { + expect(++i).toBeLessThan(n); + expect(v).toEqual(clamp(values[i])); + } + }); + test(`slices the entire array`, () => { + expect(vector.slice()).toEqual(values); + }); + test(`slice returns a TypedArray`, () => { + expect(vector.slice()).toBeInstanceOf(Uint16Array); + }); + test(`slices from -20 to length`, () => { + expect(vector.slice(-20)).toEqual(values.slice(-20)); + }); + test(`slices from 0 to -20`, () => { + expect(vector.slice(0, -20)).toEqual(values.slice(0, -20)); + }); + test(`slices the array from 0 to length - 20`, () => { + expect(vector.slice(0, n - 20)).toEqual(values.slice(0, n - 20)); + }); + test(`slices the array from 0 to length + 20`, () => { + expect(vector.slice(0, n + 20)).toEqual( + concatTyped(Uint16Array, values, values.slice(0, 20))); }); }); -for (const [VectorName, VectorType] of longVectors) { - const ArrayType = VectorType.prototype.arrayType; +for (const [VectorName, [VectorType, ArrayType]] of fixedSizeVectors) { describe(`${VectorName}`, () => { const values = concatTyped(ArrayType, ...bytes); - const bLists = bytes.map((b) => new ArrayType(b.buffer)); - const vector = new VectorType(null, ...bLists); - const n = vector.length = values.length * 0.5; + const vector = bytes + .map((b) => new VectorType({ data: new ArrayType(b.buffer) })) + .reduce((v: any, v2) => v.concat(v2)); + const n = values.length * 
0.5; test(`gets expected values`, () => { let i = -1; while (++i < n) { - expect(vector.get(i)).toEqual(new Long( - values[i * 2], values[i * 2 + 1] - )); + expect(vector.get(i)).toEqual(values.slice(2 * i, 2 * (i + 1))); } }); test(`iterates expected values`, () => { let i = -1; for (let v of vector) { expect(++i).toBeLessThan(n); - expect(v).toEqual(new Long( - values[i * 2], values[i * 2 + 1] - )); + expect(v).toEqual(values.slice(2 * i, 2 * (i + 1))); } }); test(`slices the entire array`, () => { @@ -161,13 +215,14 @@ for (const [VectorName, VectorType] of longVectors) { }); } -for (const [VectorName, VectorType] of byteVectors) { - const ArrayType = VectorType.prototype.arrayType; +for (const [VectorName, [VectorType, ArrayType]] of fixedWidthVectors) { describe(`${VectorName}`, () => { const values = concatTyped(ArrayType, ...bytes); - const bLists = bytes.map((b) => new ArrayType(b.buffer)); - const vector = new VectorType(null, ...bLists); - const n = vector.length = values.length; + const vector = bytes + .map((b) => new VectorType({ data: new ArrayType(b.buffer) })) + .reduce((v: any, v2) => v.concat(v2)); + + const n = values.length; test(`gets expected values`, () => { let i = -1; while (++i < n) { @@ -204,21 +259,21 @@ for (const [VectorName, VectorType] of byteVectors) { }); } -function toMap(entries: any, keys: string[]) { +function toMap(entries: Record, keys: string[]) { return keys.reduce((map, key) => { map.set(key, entries[key] as T); return map; }, new Map()); } -function concatTyped(ArrayType: any, ...bytes: any[]) { - const BPM = ArrayType.BYTES_PER_ELEMENT; +function concatTyped(ArrayType: TypedArrayConstructor, ...bytes: any[]) { + const BPE = ArrayType.BYTES_PER_ELEMENT; return bytes.reduce((v, bytes) => { - const l = bytes.byteLength / BPM; + const l = bytes.byteLength / BPE; const a = new ArrayType(v.length + l); const b = new ArrayType(bytes.buffer); a.set(v); a.set(b, v.length); return a; - }, new ArrayType(0)) as Array; + }, new 
ArrayType(0)) as T; } \ No newline at end of file diff --git a/js/tsconfig/tsconfig.base.json b/js/tsconfig/tsconfig.base.json index 4a46ed1f0af9b..8b8210198960a 100644 --- a/js/tsconfig/tsconfig.base.json +++ b/js/tsconfig/tsconfig.base.json @@ -3,23 +3,33 @@ "include": ["../src/**/*.ts"], "compileOnSave": false, "compilerOptions": { - "lib": ["dom", "esnext", "esnext.asynciterable"], + + /* Basic stuff */ "moduleResolution": "node", + "lib": ["dom", "esnext", "esnext.asynciterable"], + + /* Control what is emitted */ + "declaration": true, + "noEmitOnError": true, + "removeComments": false, + "downlevelIteration": true, + + /* Create inline sourcemaps with sources */ "sourceMap": false, "inlineSources": true, "inlineSourceMap": true, - "declaration": true, - "skipLibCheck": true, + + /* The most restrictive settings possible */ + "strict": true, + "skipLibCheck": false, "importHelpers": true, "noEmitHelpers": true, - "noImplicitAny": false, - "noEmitOnError": false, - "noImplicitThis": true, + "noImplicitAny": true, "noUnusedLocals": true, - "removeComments": false, - "downlevelIteration": true, - "noImplicitUseStrict": true, - "preserveConstEnums": false, + "noImplicitReturns": true, + "allowUnusedLabels": false, + "noUnusedParameters": true, + "allowUnreachableCode": false, "noFallthroughCasesInSwitch": true, "forceConsistentCasingInFileNames": true } diff --git a/js/tsconfig/tsconfig.es2015.cls.json b/js/tsconfig/tsconfig.es2015.cls.json index 11ccc04d58375..fccacb349d023 100644 --- a/js/tsconfig/tsconfig.es2015.cls.json +++ b/js/tsconfig/tsconfig.es2015.cls.json @@ -4,7 +4,8 @@ "compilerOptions": { "target": "ES2015", "module": "es2015", + "declaration": false, "noEmitHelpers": true, - "importHelpers": false + "importHelpers": true } } diff --git a/js/tsconfig/tsconfig.es5.cls.json b/js/tsconfig/tsconfig.es5.cls.json index 55f7ea52cf362..4df18aa595d92 100644 --- a/js/tsconfig/tsconfig.es5.cls.json +++ b/js/tsconfig/tsconfig.es5.cls.json @@ -2,9 +2,10 @@ { 
"extends": "./tsconfig.base.json", "compilerOptions": { - "target": "ES5", + "target": "es2015", "module": "es2015", + "declaration": false, "noEmitHelpers": true, - "importHelpers": false + "importHelpers": true } } diff --git a/js/tsconfig/tsconfig.esnext.cls.json b/js/tsconfig/tsconfig.esnext.cls.json index 009a5ac10d644..03206c9d77d38 100644 --- a/js/tsconfig/tsconfig.esnext.cls.json +++ b/js/tsconfig/tsconfig.esnext.cls.json @@ -4,7 +4,8 @@ "compilerOptions": { "target": "ESNEXT", "module": "es2015", + "declaration": false, "noEmitHelpers": true, - "importHelpers": false + "importHelpers": true } } diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 8c7348298db5f..e9de08ba19730 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -61,9 +61,18 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF) + option(PYARROW_PARQUET_USE_SHARED + "Rely on parquet shared libraries where relevant" + ON) + option(PYARROW_BOOST_USE_SHARED + "Rely on boost shared libraries on linking static parquet" + OFF) option(PYARROW_BUILD_PLASMA "Build the PyArrow Plasma integration" OFF) + option(PYARROW_BUILD_ORC + "Build the PyArrow ORC integration" + OFF) option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF) @@ -282,24 +291,45 @@ if (PYARROW_BUILD_PARQUET) endif() include_directories(SYSTEM ${PARQUET_INCLUDE_DIR}) - if (PYARROW_BUNDLE_ARROW_CPP) - bundle_arrow_lib(PARQUET_SHARED_LIB - ABI_VERSION ${PARQUET_ABI_VERSION} - SO_VERSION ${PARQUET_SO_VERSION}) + if (PYARROW_PARQUET_USE_SHARED) + if (PYARROW_BUNDLE_ARROW_CPP) + bundle_arrow_lib(PARQUET_SHARED_LIB + ABI_VERSION ${PARQUET_ABI_VERSION} + SO_VERSION ${PARQUET_SO_VERSION}) + if (MSVC) + bundle_arrow_implib(PARQUET_SHARED_IMP_LIB) + endif() + endif() if (MSVC) - bundle_arrow_implib(PARQUET_SHARED_IMP_LIB) + ADD_THIRDPARTY_LIB(parquet + SHARED_LIB ${PARQUET_SHARED_IMP_LIB}) + else() + 
ADD_THIRDPARTY_LIB(parquet + SHARED_LIB ${PARQUET_SHARED_LIB}) endif() - endif() - if (MSVC) - ADD_THIRDPARTY_LIB(parquet - SHARED_LIB ${PARQUET_SHARED_IMP_LIB}) + set(LINK_LIBS + ${LINK_LIBS} + parquet_shared) else() + find_package(Thrift) + if (PYARROW_BOOST_USE_SHARED) + set(Boost_USE_STATIC_LIBS OFF) + else() + set(Boost_USE_STATIC_LIBS ON) + endif() + find_package(Boost COMPONENTS regex REQUIRED) + ADD_THIRDPARTY_LIB(boost_regex + STATIC_LIB ${Boost_REGEX_LIBRARY_RELEASE}) ADD_THIRDPARTY_LIB(parquet - SHARED_LIB ${PARQUET_SHARED_LIB}) + STATIC_LIB ${PARQUET_STATIC_LIB}) + ADD_THIRDPARTY_LIB(thrift + STATIC_LIB ${THRIFT_STATIC_LIB}) + set(LINK_LIBS + ${LINK_LIBS} + parquet_static + thrift_static + boost_regex_static) endif() - set(LINK_LIBS - ${LINK_LIBS} - parquet_shared) set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _parquet) @@ -330,6 +360,14 @@ if (PYARROW_BUILD_PLASMA) file(COPY ${PLASMA_EXECUTABLE} DESTINATION ${BUILD_OUTPUT_ROOT_DIRECTORY}) endif() + +if (PYARROW_BUILD_ORC) + ## ORC + set(CYTHON_EXTENSIONS + ${CYTHON_EXTENSIONS} + _orc) +endif() + ############################################################ # Setup and build Cython modules ############################################################ diff --git a/python/doc/source/api.rst b/python/doc/source/api.rst index 6bceba3c650b6..2d3e39c6954a1 100644 --- a/python/doc/source/api.rst +++ b/python/doc/source/api.rst @@ -50,7 +50,7 @@ Type and Schema Factory Functions date64 binary string - decimal + decimal128 list_ struct dictionary @@ -71,7 +71,18 @@ Type checking functions is_integer is_signed_integer is_unsigned_integer + is_int8 + is_int16 + is_int32 + is_int64 + is_uint8 + is_uint16 + is_uint32 + is_uint64 is_floating + is_float16 + is_float32 + is_float64 is_decimal is_list is_struct @@ -80,7 +91,11 @@ Type checking functions is_temporal is_timestamp is_date + is_date32 + is_date64 is_time + is_time32 + is_time64 is_null is_binary is_unicode @@ -132,6 +147,7 @@ Array Types .. 
autosummary:: :toctree: generated/ + array Array BooleanArray DictionaryArray @@ -155,7 +171,7 @@ Array Types Date32Array Date64Array TimestampArray - DecimalArray + Decimal128Array ListArray .. _api.table: @@ -168,6 +184,8 @@ Tables and Record Batches .. autosummary:: :toctree: generated/ + column + chunked_array ChunkedArray Column RecordBatch @@ -192,7 +210,11 @@ Input / Output and Shared Memory :toctree: generated/ allocate_buffer + compress + decompress + frombuffer Buffer + ResizableBuffer BufferReader BufferOutputStream NativeFile @@ -238,6 +260,7 @@ Serialization and IPC serialize serialize_to deserialize + deserialize_components deserialize_from read_serialized SerializedPyObject diff --git a/python/doc/source/development.rst b/python/doc/source/development.rst index 3ca460d463a06..01844fa18d133 100644 --- a/python/doc/source/development.rst +++ b/python/doc/source/development.rst @@ -84,7 +84,7 @@ from conda-forge: conda create -y -q -n pyarrow-dev \ python=3.6 numpy six setuptools cython pandas pytest \ cmake flatbuffers rapidjson boost-cpp thrift-cpp snappy zlib \ - brotli jemalloc lz4-c zstd -c conda-forge + gflags brotli jemalloc lz4-c zstd -c conda-forge source activate pyarrow-dev @@ -108,7 +108,7 @@ building Arrow C++: .. code-block:: shell - brew update && brew bundle --file=python/Brewfile + brew update && brew bundle --file=arrow/python/Brewfile On Debian/Ubuntu, you need the following minimal set of dependencies. All other dependencies will be automatically built by Arrow's third-party toolchain. @@ -175,6 +175,9 @@ Now build and install the Arrow C++ libraries: If you don't want to build and install the Plasma in-memory object store, you can omit the ``-DARROW_PLASMA=on`` flag. +To add support for the experimental Apache ORC integration, include +``-DARROW_ORC=on`` in these flags. 
+ Now, optionally build and install the Apache Parquet libraries in your toolchain: @@ -205,6 +208,9 @@ Now, build pyarrow: If you did not build parquet-cpp, you can omit ``--with-parquet`` and if you did not build with plasma, you can omit ``--with-plasma``. +If you built with the experimental Apache ORC integration, include +``--with-orc`` in these flags. + You should be able to run the unit tests with: .. code-block:: shell @@ -230,7 +236,7 @@ You should be able to run the unit tests with: ====================== 181 passed, 17 skipped in 0.98 seconds =========== -You can build a wheel by running: +To build a self-contained wheel (including Arrow C++ and Parquet C++), one can set ``--bundle-arrow-cpp``: .. code-block:: shell @@ -256,16 +262,11 @@ First, starting from fresh clones of Apache Arrow and parquet-cpp: .. code-block:: shell - conda create -n arrow-dev cmake git boost-cpp ^ - flatbuffers snappy zlib brotli thrift-cpp rapidjson - activate arrow-dev - -As one git housekeeping item, we must run this command in our Arrow clone: - -.. code-block:: shell - - cd arrow - git config core.symlinks true + conda create -y -q -n pyarrow-dev ^ + python=3.6 numpy six setuptools cython pandas pytest ^ + cmake flatbuffers rapidjson boost-cpp thrift-cpp snappy zlib ^ + gflags brotli lz4-c zstd -c conda-forge + activate pyarrow-dev Now, we build and install Arrow C++ libraries @@ -279,7 +280,7 @@ Now, we build and install Arrow C++ libraries -DCMAKE_INSTALL_PREFIX=%ARROW_HOME% ^ -DCMAKE_BUILD_TYPE=Release ^ -DARROW_BUILD_TESTS=on ^ - -DARROW_CXXFLAGS="/WX" ^ + -DARROW_CXXFLAGS="/WX /MP" ^ -DARROW_PYTHON=on .. cmake --build . --target INSTALL --config Release cd ..\.. 
diff --git a/python/doc/source/index.rst b/python/doc/source/index.rst index b933d2359f720..c35f20be86396 100644 --- a/python/doc/source/index.rst +++ b/python/doc/source/index.rst @@ -18,10 +18,14 @@ Apache Arrow (Python) ===================== -Arrow is a columnar in-memory analytics layer designed to accelerate big data. -It houses a set of canonical in-memory representations of flat and hierarchical -data along with multiple language-bindings for structure manipulation. It also -provides IPC and common algorithm implementations. +Apache Arrow is a cross-language development platform for in-memory data. It +specifies a standardized language-independent columnar memory format for flat +and hierarchical data, organized for efficient analytic operations on modern +hardware. It also provides computational libraries and zero-copy streaming +messaging and interprocess communication. + +The Arrow Python bindings have first-class integration with NumPy, pandas, and +built-in Python objects. This is the documentation of the Python API of Apache Arrow. For more details on the format and other language bindings see diff --git a/python/doc/source/ipc.rst b/python/doc/source/ipc.rst index 17fe84e03633d..6842cb5be9f43 100644 --- a/python/doc/source/ipc.rst +++ b/python/doc/source/ipc.rst @@ -256,6 +256,83 @@ Lastly, we use this context as an additioanl argument to ``pyarrow.serialize``: buf = pa.serialize(val, context=context).to_buffer() restored_val = pa.deserialize(buf, context=context) +The ``SerializationContext`` also has convenience methods ``serialize`` and +``deserialize``, so these are equivalent statements: + +.. 
code-block:: python + + buf = context.serialize(val).to_buffer() + restored_val = context.deserialize(buf) + +Component-based Serialization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For serializing Python objects containing some number of NumPy arrays, Arrow +buffers, or other data types, it may be desirable to transport their serialized +representation without having to produce an intermediate copy using the +``to_buffer`` method. To motivate this, suppose we have a list of NumPy arrays: + +.. ipython:: python + + import numpy as np + data = [np.random.randn(10, 10) for i in range(5)] + +The call ``pa.serialize(data)`` does not copy the memory inside each of these +NumPy arrays. This serialized representation can then be decomposed into a +dictionary containing a sequence of ``pyarrow.Buffer`` objects containing +metadata for each array and references to the memory inside the arrays. To do +this, use the ``to_components`` method: + +.. ipython:: python + + serialized = pa.serialize(data) + components = serialized.to_components() + +The particular details of the output of ``to_components`` are not too +important. The objects in the ``'data'`` field are ``pyarrow.Buffer`` objects, +which are zero-copy convertible to Python ``memoryview`` objects: + +.. ipython:: python + + memoryview(components['data'][0]) + +A memoryview can be converted back to a ``Buffer`` with ``pyarrow.frombuffer``: + +.. ipython:: python + + mv = memoryview(components['data'][0]) + buf = pa.frombuffer(mv) + +An object can be reconstructed from its component-based representation using +``deserialize_components``: + +.. ipython:: python + + restored_data = pa.deserialize_components(components) + restored_data[0] + +``deserialize_components`` is also available as a method on +``SerializationContext`` objects. + +Serializing pandas Objects +-------------------------- + +We provide a serialization context that has optimized handling of pandas +objects like ``DataFrame`` and ``Series``.
This is the +``pyarrow.pandas_serialization_context`` member. Combined with component-based +serialization above, this enables zero-copy transport of pandas DataFrame +objects not containing any Python objects: + +.. ipython:: python + + import pandas as pd + df = pd.DataFrame({'a': [1, 2, 3, 4, 5]}) + context = pa.pandas_serialization_context + serialized_df = context.serialize(df) + df_components = serialized_df.to_components() + original_df = context.deserialize_components(df_components) + original_df + Feather Format -------------- diff --git a/python/manylinux1/Dockerfile-x86_64 b/python/manylinux1/Dockerfile-x86_64 index 69d8d3dbf8cef..919a32be715b0 100644 --- a/python/manylinux1/Dockerfile-x86_64 +++ b/python/manylinux1/Dockerfile-x86_64 @@ -29,5 +29,5 @@ RUN /check_arrow_visibility.sh WORKDIR / RUN git clone https://github.com/apache/parquet-cpp.git WORKDIR /parquet-cpp -RUN ARROW_HOME=/arrow-dist cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/arrow-dist -DPARQUET_BUILD_TESTS=OFF -DPARQUET_BOOST_USE_SHARED=OFF -GNinja . +RUN ARROW_HOME=/arrow-dist cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/arrow-dist -DPARQUET_BUILD_TESTS=OFF -DPARQUET_BUILD_SHARED=OFF -DPARQUET_BUILD_STATIC=ON -DPARQUET_BOOST_USE_SHARED=OFF -GNinja . RUN ninja install diff --git a/python/manylinux1/README.md b/python/manylinux1/README.md index a74f7a27b930a..3d462ff2f7213 100644 --- a/python/manylinux1/README.md +++ b/python/manylinux1/README.md @@ -37,7 +37,7 @@ git clone ../../ arrow # Build the native baseimage docker build -t arrow-base-x86_64 -f Dockerfile-x86_64 . 
# Build the python packages -docker run --rm -t -i -v $PWD:/io arrow-base-x86_64 /io/build_arrow.sh +docker run --shm-size=2g --rm -t -i -v $PWD:/io arrow-base-x86_64 /io/build_arrow.sh # Now the new packages are located in the dist/ folder ls -l dist/ ``` diff --git a/python/manylinux1/build_arrow.sh b/python/manylinux1/build_arrow.sh index 074bd0056a948..4d816bec9b6a4 100755 --- a/python/manylinux1/build_arrow.sh +++ b/python/manylinux1/build_arrow.sh @@ -40,11 +40,11 @@ cd /arrow/python # PyArrow build configuration export PYARROW_BUILD_TYPE='release' export PYARROW_WITH_PARQUET=1 +export PYARROW_WITH_STATIC_PARQUET=1 export PYARROW_WITH_PLASMA=1 export PYARROW_BUNDLE_ARROW_CPP=1 -# Need as otherwise arrow_io is sometimes not linked -export LDFLAGS="-Wl,--no-as-needed" export PKG_CONFIG_PATH=/arrow-dist/lib64/pkgconfig +export PYARROW_CMAKE_OPTIONS='-DTHRIFT_HOME=/usr' # Ensure the target directory exists mkdir -p /io/dist @@ -58,14 +58,14 @@ for PYTHON in ${PYTHON_VERSIONS}; do ARROW_BUILD_DIR=/arrow/cpp/build-PY${PYTHON} mkdir -p "${ARROW_BUILD_DIR}" pushd "${ARROW_BUILD_DIR}" - PATH="$(cpython_path $PYTHON)/bin:$PATH" cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/arrow-dist -DARROW_BUILD_TESTS=OFF -DARROW_BUILD_SHARED=ON -DARROW_BOOST_USE_SHARED=OFF -DARROW_JEMALLOC=off -DARROW_RPATH_ORIGIN=ON -DARROW_JEMALLOC_USE_SHARED=OFF -DARROW_PYTHON=ON -DPythonInterp_FIND_VERSION=${PYTHON} -DARROW_PLASMA=ON .. + PATH="$(cpython_path $PYTHON)/bin:$PATH" cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/arrow-dist -DARROW_BUILD_TESTS=OFF -DARROW_BUILD_SHARED=ON -DARROW_BOOST_USE_SHARED=OFF -DARROW_JEMALLOC=off -DARROW_RPATH_ORIGIN=ON -DARROW_JEMALLOC_USE_SHARED=OFF -DARROW_PYTHON=ON -DPythonInterp_FIND_VERSION=${PYTHON} -DARROW_PLASMA=ON -DARROW_ORC=ON .. 
make -j5 install popd # Clear output directory rm -rf dist/ echo "=== (${PYTHON}) Building wheel ===" - PATH="$PATH:$(cpython_path $PYTHON)/bin" $PYTHON_INTERPRETER setup.py build_ext --inplace --with-parquet --bundle-arrow-cpp + PATH="$PATH:$(cpython_path $PYTHON)/bin" $PYTHON_INTERPRETER setup.py build_ext --inplace --with-parquet --with-static-parquet --bundle-arrow-cpp PATH="$PATH:$(cpython_path $PYTHON)/bin" $PYTHON_INTERPRETER setup.py bdist_wheel echo "=== (${PYTHON}) Test the existence of optional modules ===" diff --git a/python/manylinux1/scripts/check_arrow_visibility.sh b/python/manylinux1/scripts/check_arrow_visibility.sh index 27a30f7479bb6..bed357edf664a 100755 --- a/python/manylinux1/scripts/check_arrow_visibility.sh +++ b/python/manylinux1/scripts/check_arrow_visibility.sh @@ -17,10 +17,13 @@ # under the License. nm -D -C /arrow-dist/lib64/libarrow.so > nm_arrow.log +grep ' T ' nm_arrow.log | grep -v arrow > visible_symbols.log -if [[ `grep ' T ' nm_arrow.log | grep -v arrow | wc -l` -eq 2 ]] +if [[ `cat visible_symbols.log | wc -l` -eq 2 ]] then exit 0 fi +cat visible_symbols.log + exit 1 diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 1215c822d2e47..a245fe6796023 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -35,15 +35,15 @@ uint8, uint16, uint32, uint64, time32, time64, timestamp, date32, date64, float16, float32, float64, - binary, string, decimal, - list_, struct, dictionary, field, + binary, string, decimal128, + list_, struct, union, dictionary, field, type_for_alias, DataType, NAType, Field, Schema, schema, Array, Tensor, - array, + array, chunked_array, column, from_numpy_dtype, NullArray, NumericArray, IntegerArray, FloatingPointArray, @@ -52,13 +52,13 @@ Int16Array, UInt16Array, Int32Array, UInt32Array, Int64Array, UInt64Array, - ListArray, + ListArray, UnionArray, BinaryArray, StringArray, FixedSizeBinaryArray, DictionaryArray, Date32Array, Date64Array, TimestampArray, 
Time32Array, Time64Array, - DecimalArray, StructArray, + Decimal128Array, StructArray, ArrayValue, Scalar, NA, BooleanValue, Int8Value, Int16Value, Int32Value, Int64Value, @@ -71,18 +71,21 @@ # ARROW-1683: Remove after 0.8.0? from pyarrow.lib import TimestampType -from pyarrow.lib import (HdfsFile, NativeFile, PythonFile, - FixedSizeBufferWriter, - Buffer, BufferReader, BufferOutputStream, - OSFile, MemoryMappedFile, memory_map, - allocate_buffer, frombuffer, - memory_map, create_memory_map, - have_libhdfs, have_libhdfs3, MockOutputStream) +# Buffers, allocation +from pyarrow.lib import (Buffer, ResizableBuffer, compress, decompress, + allocate_buffer, frombuffer) from pyarrow.lib import (MemoryPool, total_allocated_bytes, set_memory_pool, default_memory_pool, log_memory_allocations) +from pyarrow.lib import (HdfsFile, NativeFile, PythonFile, + FixedSizeBufferWriter, + BufferReader, BufferOutputStream, + OSFile, MemoryMappedFile, memory_map, + create_memory_map, have_libhdfs, have_libhdfs3, + MockOutputStream) + from pyarrow.lib import (ChunkedArray, Column, RecordBatch, Table, concat_tables) @@ -98,6 +101,7 @@ # Serialization from pyarrow.lib import (deserialize_from, deserialize, + deserialize_components, serialize, serialize_to, read_serialized, SerializedPyObject, SerializationContext, SerializationCallbackError, @@ -121,6 +125,7 @@ localfs = LocalFileSystem.get_instance() from pyarrow.serialization import (_default_serialization_context, + pandas_serialization_context, register_default_serialization_handlers) import pyarrow.types as types diff --git a/python/pyarrow/_orc.pxd b/python/pyarrow/_orc.pxd new file mode 100644 index 0000000000000..411691510423c --- /dev/null +++ b/python/pyarrow/_orc.pxd @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# distutils: language = c++ + +from libc.string cimport const_char +from libcpp.vector cimport vector as std_vector +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport (CArray, CSchema, CStatus, + CTable, CMemoryPool, + CKeyValueMetadata, + CRecordBatch, + CTable, + RandomAccessFile, OutputStream, + TimeUnit) + + +cdef extern from "arrow/adapters/orc/adapter.h" namespace "arrow::adapters::orc" nogil: + cdef cppclass ORCFileReader: + + @staticmethod + CStatus Open(const shared_ptr[RandomAccessFile]& file, + CMemoryPool* pool, + unique_ptr[ORCFileReader]* reader) + + CStatus ReadSchema(shared_ptr[CSchema]* out) + + CStatus ReadStripe(int64_t stripe, shared_ptr[CRecordBatch]* out) + CStatus ReadStripe(int64_t stripe, std_vector[int], shared_ptr[CRecordBatch]* out) + + CStatus Read(shared_ptr[CTable]* out) + CStatus Read(std_vector[int], shared_ptr[CTable]* out) + + int64_t NumberOfStripes() + + int64_t NumberOfRows() diff --git a/python/pyarrow/_orc.pyx b/python/pyarrow/_orc.pyx new file mode 100644 index 0000000000000..7ff4bac6dc95f --- /dev/null +++ b/python/pyarrow/_orc.pyx @@ -0,0 +1,111 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True + +from cython.operator cimport dereference as deref +from libcpp.vector cimport vector as std_vector +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * +from pyarrow.lib cimport (check_status, + MemoryPool, maybe_unbox_memory_pool, + Schema, pyarrow_wrap_schema, + RecordBatch, + pyarrow_wrap_table, + get_reader) +import six + + +cdef class ORCReader: + cdef: + object source + CMemoryPool* allocator + unique_ptr[ORCFileReader] reader + + def __cinit__(self, MemoryPool memory_pool=None): + self.allocator = maybe_unbox_memory_pool(memory_pool) + + def open(self, object source): + cdef: + shared_ptr[RandomAccessFile] rd_handle + + self.source = source + + get_reader(source, &rd_handle) + with nogil: + check_status(ORCFileReader.Open(rd_handle, self.allocator, + &self.reader)) + + def schema(self): + """ + The arrow schema for this file. 
+ + Returns + ------- + schema : pyarrow.Schema + """ + cdef: + shared_ptr[CSchema] sp_arrow_schema + + with nogil: + check_status(deref(self.reader).ReadSchema(&sp_arrow_schema)) + + return pyarrow_wrap_schema(sp_arrow_schema) + + def nrows(self): + return deref(self.reader).NumberOfRows(); + + def nstripes(self): + return deref(self.reader).NumberOfStripes(); + + def read_stripe(self, n, include_indices=None): + cdef: + shared_ptr[CRecordBatch] sp_record_batch + RecordBatch batch + int64_t stripe + std_vector[int] indices + + stripe = n + + if include_indices is None: + with nogil: + check_status(deref(self.reader).ReadStripe(stripe, &sp_record_batch)) + else: + indices = include_indices + with nogil: + check_status(deref(self.reader).ReadStripe(stripe, indices, &sp_record_batch)) + + batch = RecordBatch() + batch.init(sp_record_batch) + return batch + + def read(self, include_indices=None): + cdef: + shared_ptr[CTable] sp_table + std_vector[int] indices + + if include_indices is None: + with nogil: + check_status(deref(self.reader).Read(&sp_table)) + else: + indices = include_indices + with nogil: + check_status(deref(self.reader).Read(indices, &sp_table)) + + return pyarrow_wrap_table(sp_table) diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 5094232bdc7b4..55b66b53d160d 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -37,6 +37,7 @@ cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil: cdef cppclass ColumnPath: c_string ToDotString() + vector[c_string] ToDotVector() cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: @@ -105,6 +106,11 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: ParquetVersion_V1" parquet::ParquetVersion::PARQUET_1_0" ParquetVersion_V2" parquet::ParquetVersion::PARQUET_2_0" + enum ParquetSortOrder" parquet::SortOrder::type": + ParquetSortOrder_SIGNED" parquet::SortOrder::SIGNED" + ParquetSortOrder_UNSIGNED" 
parquet::SortOrder::UNSIGNED" + ParquetSortOrder_UNKNOWN" parquet::SortOrder::UNKNOWN" + cdef cppclass ColumnDescriptor: c_bool Equals(const ColumnDescriptor& other) @@ -126,6 +132,8 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: c_bool Equals(const SchemaDescriptor& other) int num_columns() + cdef c_string FormatStatValue(ParquetType parquet_type, const char* val) + cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: cdef cppclass ColumnReader: @@ -155,10 +163,52 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: cdef cppclass RowGroupReader: pass + cdef cppclass CEncodedStatistics" parquet::EncodedStatistics": + const c_string& max() const + const c_string& min() const + int64_t null_count + int64_t distinct_count + bint has_min + bint has_max + bint has_null_count + bint has_distinct_count + + cdef cppclass CRowGroupStatistics" parquet::RowGroupStatistics": + int64_t null_count() const + int64_t distinct_count() const + int64_t num_values() const + bint HasMinMax() + void Reset() + c_string EncodeMin() + c_string EncodeMax() + CEncodedStatistics Encode() + void SetComparator() + ParquetType physical_type() const + + cdef cppclass CColumnChunkMetaData" parquet::ColumnChunkMetaData": + int64_t file_offset() const + const c_string& file_path() const + + ParquetType type() const + int64_t num_values() const + shared_ptr[ColumnPath] path_in_schema() const + bint is_stats_set() const + shared_ptr[CRowGroupStatistics] statistics() const + ParquetCompression compression() const + const vector[ParquetEncoding]& encodings() const + + bint has_dictionary_page() const + int64_t dictionary_page_offset() const + int64_t data_page_offset() const + int64_t index_page_offset() const + int64_t total_compressed_size() const + int64_t total_uncompressed_size() const + cdef cppclass CRowGroupMetaData" parquet::RowGroupMetaData": int num_columns() int64_t num_rows() int64_t total_byte_size() + unique_ptr[CColumnChunkMetaData] 
ColumnChunk(int i) const cdef cppclass CFileMetaData" parquet::FileMetaData": uint32_t size() diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index b096fa1b4d337..147af217579e7 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -35,6 +35,212 @@ from pyarrow.lib import ArrowException, NativeFile import six +try: + from textwrap import indent +except ImportError: + def indent(text, prefix): + lines = [prefix + line for line in text.splitlines(True)] + return ''.join(lines) + + +cdef class RowGroupStatistics: + cdef: + shared_ptr[CRowGroupStatistics] statistics + + def __cinit__(self): + pass + + cdef init(self, const shared_ptr[CRowGroupStatistics]& statistics): + self.statistics = statistics + + def __repr__(self): + return """{0} + has_min_max: {1} + min: {2} + max: {3} + null_count: {4} + distinct_count: {5} + num_values: {6} + physical_type: {7}""".format(object.__repr__(self), + self.has_min_max, + self.min, + self.max, + self.null_count, + self.distinct_count, + self.num_values, + self.physical_type) + + property has_min_max: + + def __get__(self): + return self.statistics.get().HasMinMax() + + property min: + + def __get__(self): + raw_physical_type = self.statistics.get().physical_type() + encode_min = self.statistics.get().EncodeMin() + + min_value = FormatStatValue(raw_physical_type, encode_min.c_str()) + return frombytes(min_value) + + property max: + + def __get__(self): + raw_physical_type = self.statistics.get().physical_type() + encode_max = self.statistics.get().EncodeMax() + + max_value = FormatStatValue(raw_physical_type, encode_max.c_str()) + return frombytes(max_value) + + property null_count: + + def __get__(self): + return self.statistics.get().null_count() + + property distinct_count: + + def __get__(self): + return self.statistics.get().distinct_count() + + property num_values: + + def __get__(self): + return self.statistics.get().num_values() + + property physical_type: + + def __get__(self): 
+ physical_type = self.statistics.get().physical_type() + return physical_type_name_from_enum(physical_type) + + +cdef class ColumnChunkMetaData: + cdef: + unique_ptr[CColumnChunkMetaData] up_metadata + CColumnChunkMetaData* metadata + + def __cinit__(self): + pass + + cdef init(self, const CRowGroupMetaData& row_group_metadata, int i): + self.up_metadata = row_group_metadata.ColumnChunk(i) + self.metadata = self.up_metadata.get() + + def __repr__(self): + statistics = indent(repr(self.statistics), 4 * ' ') + return """{0} + file_offset: {1} + file_path: {2} + type: {3} + num_values: {4} + path_in_schema: {5} + is_stats_set: {6} + statistics: +{7} + compression: {8} + encodings: {9} + has_dictionary_page: {10} + dictionary_page_offset: {11} + data_page_offset: {12} + index_page_offset: {13} + total_compressed_size: {14} + total_uncompressed_size: {15}""".format(object.__repr__(self), + self.file_offset, + self.file_path, + self.type, + self.num_values, + self.path_in_schema, + self.is_stats_set, + statistics, + self.compression, + self.encodings, + self.has_dictionary_page, + self.dictionary_page_offset, + self.data_page_offset, + self.index_page_offset, + self.total_compressed_size, + self.total_uncompressed_size) + + property file_offset: + + def __get__(self): + return self.metadata.file_offset() + + property file_path: + + def __get__(self): + return frombytes(self.metadata.file_path()) + + property type: + + def __get__(self): + return physical_type_name_from_enum(self.metadata.type()) + + property num_values: + + def __get__(self): + return self.metadata.num_values() + + property path_in_schema: + + def __get__(self): + path = self.metadata.path_in_schema().get().ToDotString() + return frombytes(path) + + property is_stats_set: + + def __get__(self): + return self.metadata.is_stats_set() + + property statistics: + + def __get__(self): + statistics = RowGroupStatistics() + statistics.init(self.metadata.statistics()) + return statistics + + property 
compression: + + def __get__(self): + return self.metadata.compression() + + property encodings: + + def __get__(self): + return map(encoding_name_from_enum, + self.metadata.encodings()) + + property has_dictionary_page: + + def __get__(self): + return self.metadata.has_dictionary_page() + + property dictionary_page_offset: + + def __get__(self): + return self.metadata.dictionary_page_offset() + + property data_page_offset: + + def __get__(self): + return self.metadata.data_page_offset() + + property index_page_offset: + + def __get__(self): + return self.metadata.index_page_offset() + + property total_compressed_size: + + def __get__(self): + return self.metadata.total_compressed_size() + + property total_uncompressed_size: + + def __get__(self): + return self.metadata.total_uncompressed_size() + cdef class RowGroupMetaData: cdef: @@ -52,6 +258,11 @@ cdef class RowGroupMetaData: self.metadata = self.up_metadata.get() self.parent = parent + def column(self, int i): + chunk = ColumnChunkMetaData() + chunk.init(deref(self.metadata), i) + return chunk + def __repr__(self): return """{0} num_columns: {1} @@ -371,14 +582,29 @@ cdef logical_type_name_from_enum(ParquetLogicalType type_): }.get(type_, 'UNKNOWN') +cdef encoding_name_from_enum (ParquetEncoding encoding_): + return { + ParquetEncoding_PLAIN: "PLAIN", + ParquetEncoding_PLAIN_DICTIONARY: "PLAIN_DICTIONARY", + ParquetEncoding_RLE: "RLE", + ParquetEncoding_BIT_PACKED: "BIT_PACKED", + ParquetEncoding_DELTA_BINARY_PACKED: "DELTA_BINARY_PACKED", + ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY: "DELTA_LENGTH_BYTE_ARRAY", + ParquetEncoding_DELTA_BYTE_ARRAY: "DELTA_BYTE_ARRAY", + ParquetEncoding_RLE_DICTIONARY: "RLE_DICTIONARY", + }.get(encoding_, 'UNKNOWN') + + cdef class ParquetReader: cdef: object source CMemoryPool* allocator unique_ptr[FileReader] reader - column_idx_map FileMetaData _metadata + cdef public: + _column_idx_map + def __cinit__(self, MemoryPool memory_pool=None): self.allocator = 
maybe_unbox_memory_pool(memory_pool) self._metadata = None @@ -400,6 +626,23 @@ cdef class ParquetReader: check_status(OpenFile(rd_handle, self.allocator, properties, c_metadata, &self.reader)) + property column_paths: + + def __get__(self): + cdef: + FileMetaData container = self.metadata + const CFileMetaData* metadata = container._metadata + vector[c_string] path + int i = 0 + + paths = [] + for i in range(0, metadata.num_columns()): + path = (metadata.schema().Column(i) + .path().get().ToDotVector()) + paths.append([frombytes(x) for x in path]) + + return paths + @property def metadata(self): cdef: @@ -505,14 +748,14 @@ cdef class ParquetReader: const CFileMetaData* metadata = container._metadata int i = 0 - if self.column_idx_map is None: - self.column_idx_map = {} + if self._column_idx_map is None: + self._column_idx_map = {} for i in range(0, metadata.num_columns()): col_bytes = tobytes(metadata.schema().Column(i) .path().get().ToDotString()) - self.column_idx_map[col_bytes] = i + self._column_idx_map[col_bytes] = i - return self.column_idx_map[tobytes(column_name)] + return self._column_idx_map[tobytes(column_name)] def read_column(self, int column_index): cdef: diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 7da5c3caffdc2..cca9425881b00 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -77,7 +77,7 @@ cdef _ndarray_to_array(object values, object mask, DataType type, return pyarrow_wrap_array(chunked_out.get().chunk(0)) -cdef DataType _ensure_type(object type): +cdef inline DataType _ensure_type(object type): if type is None: return None elif not isinstance(type, DataType): @@ -162,6 +162,7 @@ def array(object obj, type=None, mask=None, return DictionaryArray.from_arrays( values.codes, values.categories.values, mask=mask, ordered=values.ordered, + from_pandas=from_pandas, memory_pool=memory_pool) else: values, type = pdcompat.get_datetimetz_type(values, obj.dtype, @@ -227,6 +228,15 @@ cdef CFunctionContext* 
_context() nogil: return _global_ctx.ctx.get() +cdef wrap_datum(const CDatum& datum): + if datum.kind() == DatumType_ARRAY: + return pyarrow_wrap_array(MakeArray(datum.array())) + elif datum.kind() == DatumType_CHUNKED_ARRAY: + return pyarrow_wrap_chunked_array(datum.chunked_array()) + else: + raise ValueError("Unable to wrap Datum in a Python object") + + cdef class Array: cdef void init(self, const shared_ptr[CArray]& sp_array): @@ -269,6 +279,29 @@ cdef class Array: return pyarrow_wrap_array(result) + def unique(self): + """ + Compute distinct elements in array + """ + cdef shared_ptr[CArray] result + + with nogil: + check_status(Unique(_context(), CDatum(self.sp_array), &result)) + + return pyarrow_wrap_array(result) + + def dictionary_encode(self): + """ + Compute dictionary-encoded representation of array + """ + cdef CDatum out + + with nogil: + check_status(DictionaryEncode(_context(), CDatum(self.sp_array), + &out)) + + return wrap_datum(out) + @staticmethod def from_pandas(obj, mask=None, type=None, MemoryPool memory_pool=None): """ @@ -596,7 +629,7 @@ cdef class FixedSizeBinaryArray(Array): pass -cdef class DecimalArray(FixedSizeBinaryArray): +cdef class Decimal128Array(FixedSizeBinaryArray): pass @@ -630,6 +663,58 @@ cdef class ListArray(Array): return pyarrow_wrap_array(out) +cdef class UnionArray(Array): + + @staticmethod + def from_dense(Array types, Array value_offsets, list children): + """ + Construct dense UnionArray from arrays of int8 types, int32 offsets and + children arrays + + Parameters + ---------- + types : Array (int8 type) + value_offsets : Array (int32 type) + children : list + + Returns + ------- + union_array : UnionArray + """ + cdef shared_ptr[CArray] out + cdef vector[shared_ptr[CArray]] c + cdef Array child + for child in children: + c.push_back(child.sp_array) + with nogil: + check_status(CUnionArray.MakeDense( + deref(types.ap), deref(value_offsets.ap), c, &out)) + return pyarrow_wrap_array(out) + + @staticmethod + def 
from_sparse(Array types, list children): + """ + Construct sparse UnionArray from arrays of int8 types and children + arrays + + Parameters + ---------- + types : Array (int8 type) + children : list + + Returns + ------- + union_array : UnionArray + """ + cdef shared_ptr[CArray] out + cdef vector[shared_ptr[CArray]] c + cdef Array child + for child in children: + c.push_back(child.sp_array) + with nogil: + check_status(CUnionArray.MakeSparse(deref(types.ap), c, &out)) + return pyarrow_wrap_array(out) + cdef class StringArray(Array): pass @@ -649,6 +734,9 @@ cdef class DictionaryArray(Array): return box_scalar(dictionary.type, dictionary.sp_array, index.as_py()) + def dictionary_encode(self): + return self + property dictionary: def __get__(self): @@ -671,7 +759,7 @@ cdef class DictionaryArray(Array): @staticmethod def from_arrays(indices, dictionary, mask=None, ordered=False, - MemoryPool memory_pool=None): + from_pandas=False, MemoryPool memory_pool=None): """ Construct Arrow DictionaryArray from array of indices (must be non-negative integers) and corresponding array of dictionary values @@ -682,15 +770,20 @@ cdef class DictionaryArray(Array): dictionary : ndarray or pandas.Series mask : ndarray or pandas.Series, boolean type True values indicate that indices are actually null + from_pandas : boolean, default False + If True, the indices should be treated as though they originated in + a pandas.Categorical (null encoded as -1) ordered : boolean, default False Set to True if the category values are ordered + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise uses default pool Returns ------- dict_array : DictionaryArray """ cdef: - Array arrow_indices, arrow_dictionary + Array _indices, _dictionary DictionaryArray result shared_ptr[CDataType] c_type shared_ptr[CArray] c_result @@ -699,29 +792,28 @@ cdef class DictionaryArray(Array): if mask is not None: raise NotImplementedError( "mask not implemented with Arrow array inputs 
yet") - arrow_indices = indices + _indices = indices else: - if mask is None: - mask = indices == -1 - else: - mask = mask | (indices == -1) - arrow_indices = Array.from_pandas(indices, mask=mask, - memory_pool=memory_pool) + if from_pandas: + if mask is None: + mask = indices == -1 + else: + mask = mask | (indices == -1) + _indices = array(indices, mask=mask, memory_pool=memory_pool) if isinstance(dictionary, Array): - arrow_dictionary = dictionary + _dictionary = dictionary else: - arrow_dictionary = Array.from_pandas(dictionary, - memory_pool=memory_pool) + _dictionary = array(dictionary, memory_pool=memory_pool) - if not isinstance(arrow_indices, IntegerArray): + if not isinstance(_indices, IntegerArray): raise ValueError('Indices must be integer type') cdef c_bool c_ordered = ordered - c_type.reset(new CDictionaryType(arrow_indices.type.sp_type, - arrow_dictionary.sp_array, c_ordered)) - c_result.reset(new CDictionaryArray(c_type, arrow_indices.sp_array)) + c_type.reset(new CDictionaryType(_indices.type.sp_type, + _dictionary.sp_array, c_ordered)) + c_result.reset(new CDictionaryArray(c_type, _indices.sp_array)) result = DictionaryArray() result.init(c_result) @@ -784,11 +876,12 @@ cdef dict _array_classes = { _Type_FLOAT: FloatArray, _Type_DOUBLE: DoubleArray, _Type_LIST: ListArray, + _Type_UNION: UnionArray, _Type_BINARY: BinaryArray, _Type_STRING: StringArray, _Type_DICTIONARY: DictionaryArray, _Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray, - _Type_DECIMAL: DecimalArray, + _Type_DECIMAL: Decimal128Array, _Type_STRUCT: StructArray, } diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py index f9c148b14e368..1b19ca0e4029b 100644 --- a/python/pyarrow/compat.py +++ b/python/pyarrow/compat.py @@ -70,7 +70,7 @@ class Categorical(ClassPlaceholder): if PY2: - import cPickle + import cPickle as builtin_pickle try: from cdecimal import Decimal @@ -103,7 +103,12 @@ def tobytes(o): def frombytes(o): return o + + def unichar(s): + return unichr(s) else: + 
import pickle as builtin_pickle + unicode_type = str def lzip(*x): return list(zip(*x)) @@ -131,6 +136,9 @@ def tobytes(o): def frombytes(o): return o.decode('utf8') + def unichar(s): + return chr(s) + try: import cloudpickle as pickle except ImportError: diff --git a/python/pyarrow/filesystem.py b/python/pyarrow/filesystem.py index 926df0e30e565..ff7809575620a 100644 --- a/python/pyarrow/filesystem.py +++ b/python/pyarrow/filesystem.py @@ -275,8 +275,11 @@ def exists(self, path): return self.fs.exists(path) @implements(FileSystem.mkdir) - def mkdir(self, path): - return self.fs.mkdir(path) + def mkdir(self, path, create_parents=True): + if create_parents: + return self.fs.mkdirs(path) + else: + return self.fs.mkdir(path) @implements(FileSystem.open) def open(self, path, mode='rb'): diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 731ef94971da0..91bc96dc63f89 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -67,6 +67,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: _Type_DICTIONARY" arrow::Type::DICTIONARY" _Type_MAP" arrow::Type::MAP" + enum UnionMode" arrow::UnionMode::type": + _UnionMode_SPARSE" arrow::UnionMode::SPARSE" + _UnionMode_DENSE" arrow::UnionMode::DENSE" + enum TimeUnit" arrow::TimeUnit::type": TimeUnit_SECOND" arrow::TimeUnit::SECOND" TimeUnit_MILLI" arrow::TimeUnit::MILLI" @@ -86,6 +90,14 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_string ToString() + cdef cppclass CArrayData" arrow::ArrayData": + shared_ptr[CDataType] type + int64_t length + int64_t null_count + int64_t offset + vector[shared_ptr[CBuffer]] buffers + vector[shared_ptr[CArrayData]] child_data + cdef cppclass CArray" arrow::Array": shared_ptr[CDataType] type() @@ -98,9 +110,13 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_bool Equals(const CArray& arr) c_bool IsNull(int i) + shared_ptr[CArrayData] data() + shared_ptr[CArray] Slice(int64_t offset) 
shared_ptr[CArray] Slice(int64_t offset, int64_t length) + shared_ptr[CArray] MakeArray(const shared_ptr[CArrayData]& data) + CStatus DebugPrint(const CArray& arr, int indent) cdef cppclass CFixedWidthType" arrow::FixedWidthType"(CDataType): @@ -153,26 +169,27 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CBuffer" arrow::Buffer": CBuffer(const uint8_t* data, int64_t size) - uint8_t* data() + const uint8_t* data() + uint8_t* mutable_data() int64_t size() shared_ptr[CBuffer] parent() c_bool is_mutable() const + c_bool Equals(const CBuffer& other) cdef cppclass CMutableBuffer" arrow::MutableBuffer"(CBuffer): CMutableBuffer(const uint8_t* data, int64_t size) - uint8_t* mutable_data() + + cdef cppclass CResizableBuffer" arrow::ResizableBuffer"(CMutableBuffer): + CStatus Resize(const int64_t new_size, c_bool shrink_to_fit) + CStatus Reserve(const int64_t new_size) CStatus AllocateBuffer(CMemoryPool* pool, const int64_t size, shared_ptr[CBuffer]* out) CStatus AllocateResizableBuffer(CMemoryPool* pool, const int64_t size, - shared_ptr[ResizableBuffer]* out) - - cdef cppclass ResizableBuffer(CBuffer): - CStatus Resize(int64_t nbytes) - CStatus Reserve(int64_t nbytes) + shared_ptr[CResizableBuffer]* out) - cdef cppclass PoolBuffer(ResizableBuffer): + cdef cppclass PoolBuffer(CResizableBuffer): PoolBuffer() PoolBuffer(CMemoryPool*) @@ -193,10 +210,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: int byte_width() int bit_width() - cdef cppclass CDecimalType" arrow::DecimalType"(CFixedSizeBinaryType): + cdef cppclass CDecimal128Type \ + " arrow::Decimal128Type"(CFixedSizeBinaryType): + CDecimal128Type(int precision, int scale) int precision() int scale() - CDecimalType(int precision, int scale) cdef cppclass CField" arrow::Field": const c_string& name() @@ -222,6 +240,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CStructType" arrow::StructType"(CDataType): CStructType(const vector[shared_ptr[CField]]& fields) 
+ cdef cppclass CUnionType" arrow::UnionType"(CDataType): + CUnionType(const vector[shared_ptr[CField]]& fields, + const vector[uint8_t]& type_codes, UnionMode mode) + UnionMode mode() + cdef cppclass CSchema" arrow::Schema": CSchema(const vector[shared_ptr[CField]]& fields) CSchema(const vector[shared_ptr[CField]]& fields, @@ -303,7 +326,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CFixedSizeBinaryArray" arrow::FixedSizeBinaryArray"(CArray): const uint8_t* GetValue(int i) - cdef cppclass CDecimalArray" arrow::DecimalArray"(CFixedSizeBinaryArray): + cdef cppclass CDecimal128Array" arrow::Decimal128Array"( + CFixedSizeBinaryArray + ): c_string FormatValue(int i) cdef cppclass CListArray" arrow::ListArray"(CArray): @@ -317,6 +342,22 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CArray] values() shared_ptr[CDataType] value_type() + cdef cppclass CUnionArray" arrow::UnionArray"(CArray): + @staticmethod + CStatus MakeSparse(const CArray& type_ids, + const vector[shared_ptr[CArray]]& children, + shared_ptr[CArray]* out) + + @staticmethod + CStatus MakeDense(const CArray& type_ids, const CArray& value_offsets, + const vector[shared_ptr[CArray]]& children, + shared_ptr[CArray]* out) + uint8_t* raw_type_ids() + int32_t value_offset(int i) + shared_ptr[CArray] child(int pos) + const CArray* UnsafeChild(int pos) + UnionMode mode() + cdef cppclass CBinaryArray" arrow::BinaryArray"(CListArray): const uint8_t* GetValue(int i, int32_t* length) @@ -336,6 +377,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: CStatus ValidateArray(const CArray& array) cdef cppclass CChunkedArray" arrow::ChunkedArray": + CChunkedArray(const vector[shared_ptr[CArray]]& arrays) int64_t length() int64_t null_count() int num_chunks() @@ -349,8 +391,13 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: CColumn(const shared_ptr[CField]& field, const vector[shared_ptr[CArray]]& chunks) + CColumn(const shared_ptr[CField]& field, + 
const shared_ptr[CChunkedArray]& data) + c_bool Equals(const CColumn& other) + shared_ptr[CField] field() + int64_t length() int64_t null_count() const c_string& name() @@ -358,8 +405,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CChunkedArray] data() cdef cppclass CRecordBatch" arrow::RecordBatch": - CRecordBatch(const shared_ptr[CSchema]& schema, int64_t num_rows, - const vector[shared_ptr[CArray]]& columns) + @staticmethod + shared_ptr[CRecordBatch] Make( + const shared_ptr[CSchema]& schema, int64_t num_rows, + const vector[shared_ptr[CArray]]& columns) c_bool Equals(const CRecordBatch& other) @@ -382,6 +431,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: CTable(const shared_ptr[CSchema]& schema, const vector[shared_ptr[CColumn]]& columns) + @staticmethod + shared_ptr[CTable] Make( + const shared_ptr[CSchema]& schema, + const vector[shared_ptr[CColumn]]& columns) + @staticmethod CStatus FromRecordBatches( const vector[shared_ptr[CRecordBatch]]& batches, @@ -402,6 +456,13 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CTable] ReplaceSchemaMetadata( const shared_ptr[CKeyValueMetadata]& metadata) + cdef cppclass RecordBatchReader: + CStatus ReadNext(shared_ptr[CRecordBatch]* out) + + cdef cppclass TableBatchReader(RecordBatchReader): + TableBatchReader(const CTable& table) + void set_chunksize(int64_t chunksize) + cdef cppclass CTensor" arrow::Tensor": shared_ptr[CDataType] type() shared_ptr[CBuffer] data() @@ -582,7 +643,7 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil: cdef cppclass CBufferOutputStream \ " arrow::io::BufferOutputStream"(OutputStream): - CBufferOutputStream(const shared_ptr[ResizableBuffer]& buffer) + CBufferOutputStream(const shared_ptr[CResizableBuffer]& buffer) cdef cppclass CMockOutputStream \ " arrow::io::MockOutputStream"(OutputStream): @@ -608,6 +669,7 @@ cdef extern from "arrow/ipc/api.h" namespace "arrow::ipc" nogil: MessageType_V1" 
arrow::ipc::MetadataVersion::V1" MessageType_V2" arrow::ipc::MetadataVersion::V2" MessageType_V3" arrow::ipc::MetadataVersion::V3" + MessageType_V4" arrow::ipc::MetadataVersion::V4" cdef cppclass CMessage" arrow::ipc::Message": CStatus Open(const shared_ptr[CBuffer]& metadata, @@ -627,17 +689,16 @@ cdef extern from "arrow/ipc/api.h" namespace "arrow::ipc" nogil: c_string FormatMessageType(MessageType type) cdef cppclass CMessageReader" arrow::ipc::MessageReader": - CStatus ReadNextMessage(unique_ptr[CMessage]* out) + @staticmethod + unique_ptr[CMessageReader] Open(const shared_ptr[InputStream]& stream) - cdef cppclass CInputStreamMessageReader \ - " arrow::ipc::InputStreamMessageReader": - CInputStreamMessageReader(const shared_ptr[InputStream]& stream) + CStatus ReadNextMessage(unique_ptr[CMessage]* out) cdef cppclass CRecordBatchWriter" arrow::ipc::RecordBatchWriter": CStatus Close() CStatus WriteRecordBatch(const CRecordBatch& batch, c_bool allow_64bit) - CStatus WriteTable(const CTable& table) + CStatus WriteTable(const CTable& table, int64_t max_chunksize) cdef cppclass CRecordBatchReader" arrow::ipc::RecordBatchReader": shared_ptr[CSchema] schema() @@ -749,11 +810,42 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: c_bool allow_int_overflow c_bool allow_time_truncate + enum DatumType" arrow::compute::Datum::type": + DatumType_NONE" arrow::compute::Datum::NONE" + DatumType_SCALAR" arrow::compute::Datum::SCALAR" + DatumType_ARRAY" arrow::compute::Datum::ARRAY" + DatumType_CHUNKED_ARRAY" arrow::compute::Datum::CHUNKED_ARRAY" + DatumType_RECORD_BATCH" arrow::compute::Datum::RECORD_BATCH" + DatumType_TABLE" arrow::compute::Datum::TABLE" + DatumType_COLLECTION" arrow::compute::Datum::COLLECTION" + + cdef cppclass CDatum" arrow::compute::Datum": + CDatum() + CDatum(const shared_ptr[CArray]& value) + CDatum(const shared_ptr[CChunkedArray]& value) + CDatum(const shared_ptr[CRecordBatch]& value) + CDatum(const shared_ptr[CTable]& value) + + 
DatumType kind() + + shared_ptr[CArrayData] array() + shared_ptr[CChunkedArray] chunked_array() + CStatus Cast(CFunctionContext* context, const CArray& array, const shared_ptr[CDataType]& to_type, const CCastOptions& options, shared_ptr[CArray]* out) + CStatus Cast(CFunctionContext* context, const CDatum& value, + const shared_ptr[CDataType]& to_type, + const CCastOptions& options, CDatum* out) + + CStatus Unique(CFunctionContext* context, const CDatum& value, + shared_ptr[CArray]* out) + + CStatus DictionaryEncode(CFunctionContext* context, const CDatum& value, + CDatum* out) + cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: shared_ptr[CDataType] GetPrimitiveType(Type type) @@ -822,12 +914,12 @@ cdef extern from "arrow/python/api.h" namespace 'arrow::py' nogil: shared_ptr[CRecordBatch] batch vector[shared_ptr[CTensor]] tensors + CStatus WriteTo(OutputStream* dst) + CStatus GetComponents(CMemoryPool* pool, PyObject** dst) + CStatus SerializeObject(object context, object sequence, CSerializedPyObject* out) - CStatus WriteSerializedObject(const CSerializedPyObject& obj, - OutputStream* dst) - CStatus DeserializeObject(object context, const CSerializedPyObject& obj, PyObject* base, PyObject** out) @@ -835,6 +927,10 @@ cdef extern from "arrow/python/api.h" namespace 'arrow::py' nogil: CStatus ReadSerializedObject(RandomAccessFile* src, CSerializedPyObject* out) + CStatus GetSerializedFromComponents(int num_tensors, int num_buffers, + object buffers, + CSerializedPyObject* out) + cdef extern from 'arrow/python/init.h': int arrow_init_numpy() except -1 @@ -842,3 +938,26 @@ cdef extern from 'arrow/python/init.h': cdef extern from 'arrow/python/config.h' namespace 'arrow::py': void set_numpy_nan(object o) + + +cdef extern from 'arrow/util/compression.h' namespace 'arrow' nogil: + enum CompressionType" arrow::Compression::type": + CompressionType_UNCOMPRESSED" arrow::Compression::UNCOMPRESSED" + CompressionType_SNAPPY" arrow::Compression::SNAPPY" + 
CompressionType_GZIP" arrow::Compression::GZIP" + CompressionType_BROTLI" arrow::Compression::BROTLI" + CompressionType_ZSTD" arrow::Compression::ZSTD" + CompressionType_LZ4" arrow::Compression::LZ4" + + cdef cppclass CCodec" arrow::Codec": + @staticmethod + CStatus Create(CompressionType codec, unique_ptr[CCodec]* out) + + CStatus Decompress(int64_t input_len, const uint8_t* input, + int64_t output_len, uint8_t* output_buffer) + + CStatus Compress(int64_t input_len, const uint8_t* input, + int64_t output_buffer_len, uint8_t* output_buffer, + int64_t* output_length) + + int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 495e31b5a2176..619ba365c2df7 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -600,6 +600,23 @@ cdef class Buffer: # TODO(wesm): buffer slicing raise NotImplementedError + def equals(self, Buffer other): + """ + Determine if two buffers contain exactly the same data + + Parameters + ---------- + other : Buffer + + Returns + ------- + are_equal : True if buffer contents and size are equal + """ + cdef c_bool result = False + with nogil: + result = self.buffer.get().Equals(deref(other.buffer.get())) + return result + def to_pybytes(self): return cp.PyBytes_FromStringAndSize( self.buffer.get().data(), @@ -644,13 +661,37 @@ cdef class Buffer: return self.size +cdef class ResizableBuffer(Buffer): + + cdef void init_rz(self, const shared_ptr[CResizableBuffer]& buffer): + self.init( buffer) + + def resize(self, int64_t new_size, shrink_to_fit=False): + """ + Resize buffer to indicated size + + Parameters + ---------- + new_size : int64_t + New size of buffer (padding may be added internally) + shrink_to_fit : boolean, default False + If new_size is less than the current size, shrink internal + capacity, otherwise leave at current capacity + """ + cdef c_bool c_shrink_to_fit = shrink_to_fit + with nogil: + check_status(( self.buffer.get()) + .Resize(new_size, 
c_shrink_to_fit)) + + cdef shared_ptr[PoolBuffer] _allocate_buffer(CMemoryPool* pool): cdef shared_ptr[PoolBuffer] result result.reset(new PoolBuffer(pool)) return result -def allocate_buffer(int64_t size, MemoryPool pool=None): +def allocate_buffer(int64_t size, MemoryPool memory_pool=None, + resizable=False): """ Allocate mutable fixed-size buffer @@ -658,17 +699,27 @@ def allocate_buffer(int64_t size, MemoryPool pool=None): ---------- size : int Number of bytes to allocate (plus internal padding) - pool : MemoryPool, optional + memory_pool : MemoryPool, optional Uses default memory pool if not provided + resizable : boolean, default False + + Returns + ------- + buffer : Buffer or ResizableBuffer """ cdef: shared_ptr[CBuffer] buffer - CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + shared_ptr[CResizableBuffer] rz_buffer + CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) - with nogil: - check_status(AllocateBuffer(cpool, size, &buffer)) - - return pyarrow_wrap_buffer(buffer) + if resizable: + with nogil: + check_status(AllocateResizableBuffer(cpool, size, &rz_buffer)) + return pyarrow_wrap_resizable_buffer(rz_buffer) + else: + with nogil: + check_status(AllocateBuffer(cpool, size, &buffer)) + return pyarrow_wrap_buffer(buffer) cdef class BufferOutputStream(NativeFile): @@ -679,7 +730,7 @@ cdef class BufferOutputStream(NativeFile): def __cinit__(self, MemoryPool memory_pool=None): self.buffer = _allocate_buffer(maybe_unbox_memory_pool(memory_pool)) self.wr_file.reset(new CBufferOutputStream( - self.buffer)) + self.buffer)) self.is_readable = 0 self.is_writeable = 1 self.is_open = True @@ -783,3 +834,145 @@ cdef get_writer(object source, shared_ptr[OutputStream]* writer): else: raise TypeError('Unable to read from object of type: {0}' .format(type(source))) + + +# --------------------------------------------------------------------- + +cdef CompressionType _get_compression_type(object name): + if name is None or name == 'uncompressed': + return 
CompressionType_UNCOMPRESSED + elif name == 'snappy': + return CompressionType_SNAPPY + elif name == 'gzip': + return CompressionType_GZIP + elif name == 'brotli': + return CompressionType_BROTLI + elif name == 'zstd': + return CompressionType_ZSTD + elif name == 'lz4': + return CompressionType_LZ4 + else: + raise ValueError("Unrecognized compression type: {0}" + .format(str(name))) + + +def compress(object buf, codec='lz4', asbytes=False, memory_pool=None): + """ + Compress pyarrow.Buffer or Python object supporting the buffer (memoryview) + protocol + + Parameters + ---------- + buf : pyarrow.Buffer, bytes, or other object supporting buffer protocol + codec : string, default 'lz4' + Compression codec. + Supported types: {'brotli, 'gzip', 'lz4', 'snappy', 'zstd'} + asbytes : boolean, default False + Return result as Python bytes object, otherwise Buffer + memory_pool : MemoryPool, default None + Memory pool to use for buffer allocations, if any + + Returns + ------- + compressed : pyarrow.Buffer or bytes (if asbytes=True) + """ + cdef: + CompressionType c_codec = _get_compression_type(codec) + unique_ptr[CCodec] compressor + cdef CBuffer* c_buf + cdef PyObject* pyobj + cdef ResizableBuffer out_buf + + with nogil: + check_status(CCodec.Create(c_codec, &compressor)) + + if not isinstance(buf, Buffer): + buf = frombuffer(buf) + + c_buf = ( buf).buffer.get() + + cdef int64_t max_output_size = (compressor.get() + .MaxCompressedLen(c_buf.size(), + c_buf.data())) + cdef uint8_t* output_buffer = NULL + + if asbytes: + pyobj = PyBytes_FromStringAndSizeNative(NULL, max_output_size) + output_buffer = cp.PyBytes_AS_STRING( pyobj) + else: + out_buf = allocate_buffer(max_output_size, memory_pool=memory_pool, + resizable=True) + output_buffer = out_buf.buffer.get().mutable_data() + + cdef int64_t output_length = 0 + with nogil: + check_status(compressor.get() + .Compress(c_buf.size(), c_buf.data(), + max_output_size, output_buffer, + &output_length)) + + if asbytes: + 
cp._PyBytes_Resize(&pyobj, output_length) + return PyObject_to_object(pyobj) + else: + out_buf.resize(output_length) + return out_buf + + +def decompress(object buf, decompressed_size=None, codec='lz4', + asbytes=False, memory_pool=None): + """ + Decompress data from buffer-like object + + Parameters + ---------- + buf : pyarrow.Buffer, bytes, or memoryview-compatible object + decompressed_size : int64_t, default None + If not specified, will be computed if the codec is able to determine + the uncompressed buffer size + codec : string, default 'lz4' + Compression codec. + Supported types: {'brotli, 'gzip', 'lz4', 'snappy', 'zstd'} + asbytes : boolean, default False + Return result as Python bytes object, otherwise Buffer + memory_pool : MemoryPool, default None + Memory pool to use for buffer allocations, if any + + Returns + ------- + uncompressed : pyarrow.Buffer or bytes (if asbytes=True) + """ + cdef: + CompressionType c_codec = _get_compression_type(codec) + unique_ptr[CCodec] compressor + cdef CBuffer* c_buf + cdef Buffer out_buf + + with nogil: + check_status(CCodec.Create(c_codec, &compressor)) + + if not isinstance(buf, Buffer): + buf = frombuffer(buf) + + c_buf = ( buf).buffer.get() + + if decompressed_size is None: + raise ValueError("Must pass decompressed_size for {0} codec" + .format(codec)) + + cdef int64_t output_size = decompressed_size + cdef uint8_t* output_buffer = NULL + + if asbytes: + pybuf = cp.PyBytes_FromStringAndSize(NULL, output_size) + output_buffer = cp.PyBytes_AS_STRING(pybuf) + else: + out_buf = allocate_buffer(output_size, memory_pool=memory_pool) + output_buffer = out_buf.buffer.get().mutable_data() + + with nogil: + check_status(compressor.get() + .Decompress(c_buf.size(), c_buf.data(), + output_size, output_buffer)) + + return pybuf if asbytes else out_buf diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index 27e9167750991..7534b0d0e87ec 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -125,9 
+125,11 @@ cdef class MessageReader: def open_stream(source): cdef MessageReader result = MessageReader() cdef shared_ptr[InputStream] in_stream + cdef unique_ptr[CMessageReader] reader get_input_stream(source, &in_stream) with nogil: - result.reader.reset(new CInputStreamMessageReader(in_stream)) + reader = CMessageReader.Open(in_stream) + result.reader.reset(reader.release()) return result @@ -202,7 +204,7 @@ cdef class _RecordBatchWriter: check_status(self.writer.get() .WriteRecordBatch(deref(batch.batch), 1)) - def write_table(self, Table table): + def write_table(self, Table table, chunksize=None): """ Write RecordBatch to stream @@ -210,8 +212,16 @@ cdef class _RecordBatchWriter: ---------- batch : RecordBatch """ + cdef: + # Chunksize must be > 0 to have any impact + int64_t c_chunksize = -1 + + if chunksize is not None: + c_chunksize = chunksize + with nogil: - check_status(self.writer.get().WriteTable(table.table[0])) + check_status(self.writer.get().WriteTable(table.table[0], + c_chunksize)) def close(self): """ diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 8fdcf553c13fc..90f749d6db633 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -56,6 +56,11 @@ cdef class DictionaryType(DataType): const CDictionaryType* dict_type +cdef class UnionType(DataType): + cdef: + list child_types + + cdef class TimestampType(DataType): cdef: const CTimestampType* ts_type @@ -76,9 +81,9 @@ cdef class FixedSizeBinaryType(DataType): const CFixedSizeBinaryType* fixed_size_binary_type -cdef class DecimalType(FixedSizeBinaryType): +cdef class Decimal128Type(FixedSizeBinaryType): cdef: - const CDecimalType* decimal_type + const CDecimal128Type* decimal128_type cdef class Field: @@ -139,6 +144,13 @@ cdef class ListValue(ArrayValue): cdef getitem(self, int64_t i) +cdef class UnionValue(ArrayValue): + cdef: + CUnionArray* ap + list value_types + + cdef getitem(self, int64_t i) + cdef class StringValue(ArrayValue): pass @@ -234,7 +246,7 @@ cdef 
class FixedSizeBinaryArray(Array): pass -cdef class DecimalArray(FixedSizeBinaryArray): +cdef class Decimal128Array(FixedSizeBinaryArray): pass @@ -242,6 +254,10 @@ cdef class ListArray(Array): pass +cdef class UnionArray(Array): + pass + + cdef class StringArray(Array): pass @@ -307,6 +323,11 @@ cdef class Buffer: cdef void init(self, const shared_ptr[CBuffer]& buffer) +cdef class ResizableBuffer(Buffer): + + cdef void init_rz(self, const shared_ptr[CResizableBuffer]& buffer) + + cdef class NativeFile: cdef: shared_ptr[RandomAccessFile] rd_file @@ -327,6 +348,8 @@ cdef get_reader(object source, shared_ptr[RandomAccessFile]* reader) cdef get_writer(object source, shared_ptr[OutputStream]* writer) cdef public object pyarrow_wrap_buffer(const shared_ptr[CBuffer]& buf) +cdef public object pyarrow_wrap_resizable_buffer( + const shared_ptr[CResizableBuffer]& buf) cdef public object pyarrow_wrap_data_type(const shared_ptr[CDataType]& type) cdef public object pyarrow_wrap_field(const shared_ptr[CField]& field) cdef public object pyarrow_wrap_schema(const shared_ptr[CSchema]& type) diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 6f4451e3f5a41..b4ca49cafe160 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -92,6 +92,8 @@ Type_UNION = _Type_UNION Type_DICTIONARY = _Type_DICTIONARY Type_MAP = _Type_MAP +UnionMode_SPARSE = _UnionMode_SPARSE +UnionMode_DENSE = _UnionMode_DENSE # Exception types include "error.pxi" diff --git a/python/pyarrow/orc.py b/python/pyarrow/orc.py new file mode 100644 index 0000000000000..22451d521825e --- /dev/null +++ b/python/pyarrow/orc.py @@ -0,0 +1,149 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from itertools import count +from numbers import Integral + +from pyarrow import _orc +from pyarrow import types +from pyarrow.lib import Schema + + +def _is_map(typ): + return (types.is_list(typ) and + types.is_struct(typ.value_type) and + typ.value_type.num_children == 2 and + typ.value_type[0].name == 'key' and + typ.value_type[1].name == 'value') + + +def _traverse(typ, counter): + if isinstance(typ, Schema) or types.is_struct(typ): + for field in typ: + path = (field.name,) + yield path, next(counter) + for sub, c in _traverse(field.type, counter): + yield path + sub, c + elif _is_map(typ): + for sub_c in _traverse(typ.value_type, counter): + yield sub_c + elif types.is_list(typ): + # Skip one index for list type, since this can never be selected + # directly + next(counter) + for sub_c in _traverse(typ.value_type, counter): + yield sub_c + elif types.is_union(typ): + # Union types not supported, just skip the indexes + for dtype in typ: + next(counter) + for sub_c in _traverse(dtype, counter): + pass + + +def _schema_to_indices(schema): + return {'.'.join(i): c for i, c in _traverse(schema, count(1))} + + +class ORCFile(object): + """ + Reader interface for a single ORC file + + Parameters + ---------- + source : str or pyarrow.io.NativeFile + Readable source. For passing Python file objects or byte buffers, + see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader. 
+ """ + def __init__(self, source): + self.reader = _orc.ORCReader() + self.reader.open(source) + self._column_index_lookup = _schema_to_indices(self.schema) + + @property + def schema(self): + """The file schema, as an arrow schema""" + return self.reader.schema() + + @property + def nrows(self): + """The number of rows in the file""" + return self.reader.nrows() + + @property + def nstripes(self): + """The number of stripes in the file""" + return self.reader.nstripes() + + def _select_indices(self, columns=None): + if columns is None: + return None + + schema = self.schema + indices = [] + for col in columns: + if isinstance(col, Integral): + col = int(col) + if 0 <= col < len(schema): + col = schema[col].name + else: + raise ValueError("Column indices must be in 0 <= ind < %d," + " got %d" % (len(schema), col)) + if col in self._column_index_lookup: + indices.append(self._column_index_lookup[col]) + else: + raise ValueError("Unknown column name %r" % col) + + return indices + + def read_stripe(self, n, columns=None): + """Read a single stripe from the file. + + Parameters + ---------- + n : int + The stripe index + columns : list + If not None, only these columns will be read from the stripe. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e' + + Returns + ------- + pyarrow.lib.RecordBatch + Content of the stripe as a RecordBatch. + """ + include_indices = self._select_indices(columns) + return self.reader.read_stripe(n, include_indices=include_indices) + + def read(self, columns=None): + """Read the whole file. + + Parameters + ---------- + columns : list + If not None, only these columns will be read from the file. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e' + + Returns + ------- + pyarrow.lib.Table + Content of the file as a Table. 
+ """ + include_indices = self._select_indices(columns) + return self.reader.read(include_indices=include_indices) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index d6c844c8490f5..f3089d2a012a6 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -20,6 +20,7 @@ import json import re +import pandas.core.internals as _int import numpy as np import pandas as pd @@ -29,13 +30,6 @@ from pyarrow.compat import PY2, zip_longest # noqa -INDEX_LEVEL_NAME_REGEX = re.compile(r'^__index_level_\d+__$') - - -def is_unnamed_index_level(name): - return INDEX_LEVEL_NAME_REGEX.match(name) is not None - - def infer_dtype(column): try: return pd.api.types.infer_dtype(column) @@ -87,7 +81,7 @@ def get_logical_type(arrow_type): return 'list[{}]'.format(get_logical_type(arrow_type.value_type)) elif isinstance(arrow_type, pa.lib.TimestampType): return 'datetimetz' if arrow_type.tz is not None else 'datetime' - elif isinstance(arrow_type, pa.lib.DecimalType): + elif isinstance(arrow_type, pa.lib.Decimal128Type): return 'decimal' raise NotImplementedError(str(arrow_type)) @@ -116,7 +110,11 @@ def get_logical_type_from_numpy(pandas_collection): except KeyError: if hasattr(pandas_collection.dtype, 'tz'): return 'datetimetz' - return infer_dtype(pandas_collection) + result = infer_dtype(pandas_collection) + + if result == 'string': + return 'bytes' if PY2 else 'unicode' + return result def get_extension_dtype_info(column): @@ -128,7 +126,7 @@ def get_extension_dtype_info(column): 'num_categories': len(cats.categories), 'ordered': cats.ordered, } - physical_dtype = 'object' + physical_dtype = str(cats.codes.dtype) elif hasattr(dtype, 'tz'): metadata = {'timezone': str(dtype.tz)} physical_dtype = 'datetime64[ns]' @@ -138,14 +136,18 @@ def get_extension_dtype_info(column): return physical_dtype, metadata -def get_column_metadata(column, name, arrow_type): +def get_column_metadata(column, name, arrow_type, field_name): """Construct 
the metadata for a given column Parameters ---------- - column : pandas.Series + column : pandas.Series or pandas.Index name : str arrow_type : pyarrow.DataType + field_name : str + Equivalent to `name` when `column` is a `Series`, otherwise if `column` + is a pandas Index then `field_name` will not be the same as `name`. + This is the name of the field in the arrow Table's schema. Returns ------- @@ -161,7 +163,7 @@ def get_column_metadata(column, name, arrow_type): } string_dtype = 'object' - if not isinstance(name, six.string_types): + if name is not None and not isinstance(name, six.string_types): raise TypeError( 'Column name must be a string. Got column {} of type {}'.format( name, type(name).__name__ @@ -170,29 +172,14 @@ def get_column_metadata(column, name, arrow_type): return { 'name': name, + 'field_name': str(field_name), 'pandas_type': logical_type, 'numpy_type': string_dtype, 'metadata': extra_metadata, } -def index_level_name(index, i): - """Return the name of an index level or a default name if `index.name` is - None. 
- - Parameters - ---------- - index : pandas.Index - i : int - - Returns - ------- - name : str - """ - if index.name is not None: - return index.name - else: - return '__index_level_{:d}__'.format(i) +index_level_name = '__index_level_{:d}__'.format def construct_metadata(df, column_names, index_levels, preserve_index, types): @@ -215,20 +202,28 @@ def construct_metadata(df, column_names, index_levels, preserve_index, types): index_types = types[ncolumns - len(index_levels):] column_metadata = [ - get_column_metadata(df[col_name], name=sanitized_name, - arrow_type=arrow_type) - for col_name, sanitized_name, arrow_type in - zip(df.columns, column_names, df_types) + get_column_metadata( + df[col_name], + name=sanitized_name, + arrow_type=arrow_type, + field_name=sanitized_name + ) for col_name, sanitized_name, arrow_type in zip( + df.columns, column_names, df_types + ) ] if preserve_index: - index_column_names = [index_level_name(level, i) - for i, level in enumerate(index_levels)] + index_column_names = list(map( + index_level_name, range(len(index_levels)) + )) index_column_metadata = [ - get_column_metadata(level, name=index_level_name(level, i), - arrow_type=arrow_type) - for i, (level, arrow_type) in enumerate( - zip(index_levels, index_types) + get_column_metadata( + level, + name=level.name, + arrow_type=arrow_type, + field_name=field_name, + ) for i, (level, arrow_type, field_name) in enumerate( + zip(index_levels, index_types, index_column_names) ) ] @@ -236,9 +231,16 @@ def construct_metadata(df, column_names, index_levels, preserve_index, types): for level in getattr(df.columns, 'levels', [df.columns]): string_dtype, extra_metadata = get_extension_dtype_info(level) + + pandas_type = get_logical_type_from_numpy(level) + if pandas_type == 'unicode': + assert not extra_metadata + extra_metadata = {'encoding': 'UTF-8'} + column_index = { 'name': level.name, - 'pandas_type': get_logical_type_from_numpy(level), + 'field_name': level.name, + 'pandas_type': 
pandas_type, 'numpy_type': string_dtype, 'metadata': extra_metadata, } @@ -287,6 +289,8 @@ def _column_name_to_strings(name): return tuple(map(_column_name_to_strings, name)) elif isinstance(name, collections.Sequence): raise TypeError("Unsupported type for MultiIndex level") + elif name is None: + return None return str(name) @@ -301,10 +305,18 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1): columns_to_convert = [] convert_types = [] + + if not df.columns.is_unique: + raise ValueError( + 'Duplicate column names found: {}'.format(list(df.columns)) + ) + for name in df.columns: col = df[name] if not isinstance(name, six.string_types): - name = str(_column_name_to_strings(name)) + name = _column_name_to_strings(name) + if name is not None: + name = str(name) if schema is not None: field = schema.field_by_name(name) @@ -317,7 +329,7 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1): for i, column in enumerate(index_columns): columns_to_convert.append(column) convert_types.append(None) - names.append(index_level_name(column, i)) + names.append(index_level_name(i)) # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether # using a thread pool is worth it. 
Currently the heuristic is whether the @@ -357,7 +369,8 @@ def get_datetimetz_type(values, dtype, type_): if values.dtype.type != np.datetime64: return values, type_ - if isinstance(dtype, DatetimeTZDtype): + if isinstance(dtype, DatetimeTZDtype) and type_ is None: + # If no user type passed, construct a tz-aware timestamp type tz = dtype.tz unit = dtype.unit type_ = pa.timestamp(unit, tz) @@ -367,17 +380,89 @@ def get_datetimetz_type(values, dtype, type_): return values, type_ +# ---------------------------------------------------------------------- +# Converting pandas.DataFrame to a dict containing only NumPy arrays or other +# objects friendly to pyarrow.serialize + + +def dataframe_to_serialized_dict(frame): + block_manager = frame._data -def make_datetimetz(tz): + blocks = [] + axes = [ax for ax in block_manager.axes] + + for block in block_manager.blocks: + values = block.values + block_data = {} + + if isinstance(block, _int.DatetimeTZBlock): + block_data['timezone'] = values.tz.zone + values = values.values + elif isinstance(block, _int.CategoricalBlock): + block_data.update(dictionary=values.categories, + ordered=values.ordered) + values = values.codes + + block_data.update( + placement=block.mgr_locs.as_array, + block=values + ) + blocks.append(block_data) + + return { + 'blocks': blocks, + 'axes': axes + } + + +def serialized_dict_to_dataframe(data): + reconstructed_blocks = [_reconstruct_block(block) + for block in data['blocks']] + + block_mgr = _int.BlockManager(reconstructed_blocks, data['axes']) + return pd.DataFrame(block_mgr) + + +def _reconstruct_block(item): + # Construct the individual blocks converting dictionary types to pandas + # categorical types and Timestamps-with-timezones types to the proper + # pandas Blocks + + block_arr = item['block'] + placement = item['placement'] + if 'dictionary' in item: + cat = pd.Categorical.from_codes(block_arr, + categories=item['dictionary'], + ordered=item['ordered']) + block = _int.make_block(cat, 
placement=placement, + klass=_int.CategoricalBlock, + fastpath=True) + elif 'timezone' in item: + dtype = _make_datetimetz(item['timezone']) + block = _int.make_block(block_arr, placement=placement, + klass=_int.DatetimeTZBlock, + dtype=dtype, fastpath=True) + else: + block = _int.make_block(block_arr, placement=placement) + + return block + + +def _make_datetimetz(tz): from pyarrow.compat import DatetimeTZDtype return DatetimeTZDtype('ns', tz=tz) -def table_to_blockmanager(options, table, memory_pool, nthreads=1): - import pandas.core.internals as _int - import pyarrow.lib as lib +# ---------------------------------------------------------------------- +# Converting pyarrow.Table efficiently to pandas.DataFrame + + +def table_to_blockmanager(options, table, memory_pool, nthreads=1, + categoricals=None): + from pyarrow.compat import DatetimeTZDtype index_columns = [] + columns = [] column_indexes = [] index_arrays = [] index_names = [] @@ -390,56 +475,64 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1): if has_pandas_metadata: pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8')) index_columns = pandas_metadata['index_columns'] + columns = pandas_metadata['columns'] column_indexes = pandas_metadata.get('column_indexes', []) table = _add_any_metadata(table, pandas_metadata) block_table = table + index_columns_set = frozenset(index_columns) + + # 0. 'field_name' is the name of the column in the arrow Table + # 1. 'name' is the user-facing name of the column, that is, it came from + # pandas + # 2. 'field_name' and 'name' differ for index columns + # 3. 
We fall back on c['name'] for backwards compatibility + logical_index_names = [ + c['name'] for c in columns + if c.get('field_name', c['name']) in index_columns_set + ] + + # There must be the same number of field names and physical names + # (fields in the arrow Table) + assert len(logical_index_names) == len(index_columns_set) + + # It can never be the case in a released version of pyarrow that + # c['name'] is None *and* 'field_name' is not a key in the column metadata, + # because the change to allow c['name'] to be None and the change to add + # 'field_name' are in the same release (0.8.0) + assert all( + (c['name'] is None and 'field_name' in c) or c['name'] is not None + for c in columns + ) + # Build up a list of index columns and names while removing those columns # from the original table - for name in index_columns: - i = schema.get_field_index(name) + for raw_name, logical_name in zip(index_columns, logical_index_names): + i = schema.get_field_index(raw_name) if i != -1: col = table.column(i) - index_name = None if is_unnamed_index_level(name) else name col_pandas = col.to_pandas() values = col_pandas.values - if not values.flags.writeable: + if hasattr(values, 'flags') and not values.flags.writeable: # ARROW-1054: in pandas 0.19.2, factorize will reject # non-writeable arrays when calling MultiIndex.from_arrays values = values.copy() - index_arrays.append(pd.Series(values, dtype=col_pandas.dtype)) - index_names.append(index_name) + if isinstance(col_pandas.dtype, DatetimeTZDtype): + index_array = (pd.Series(values).dt.tz_localize('utc') + .dt.tz_convert(col_pandas.dtype.tz)) + else: + index_array = pd.Series(values, dtype=col_pandas.dtype) + index_arrays.append(index_array) + index_names.append( + _backwards_compatible_index_name(raw_name, logical_name) + ) block_table = block_table.remove_column( - block_table.schema.get_field_index(name) + block_table.schema.get_field_index(raw_name) ) - # Convert an arrow table to Block from the internal pandas API 
- result = lib.table_to_blocks(options, block_table, nthreads, memory_pool) - - # Construct the individual blocks converting dictionary types to pandas - # categorical types and Timestamps-with-timezones types to the proper - # pandas Blocks - blocks = [] - for item in result: - block_arr = item['block'] - placement = item['placement'] - if 'dictionary' in item: - cat = pd.Categorical(block_arr, - categories=item['dictionary'], - ordered=item['ordered'], fastpath=True) - block = _int.make_block(cat, placement=placement, - klass=_int.CategoricalBlock, - fastpath=True) - elif 'timezone' in item: - dtype = make_datetimetz(item['timezone']) - block = _int.make_block(block_arr, placement=placement, - klass=_int.DatetimeTZBlock, - dtype=dtype, fastpath=True) - else: - block = _int.make_block(block_arr, placement=placement) - blocks.append(block) + blocks = _table_to_blocks(options, block_table, nthreads, memory_pool) # Construct the row index if len(index_arrays) > 1: @@ -450,6 +543,15 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1): index = pd.RangeIndex(row_count) column_strings = [x.name for x in block_table.itercolumns()] + if columns: + columns_name_dict = { + c.get('field_name', str(c['name'])): c['name'] for c in columns + } + columns_values = [ + columns_name_dict.get(name, name) for name in column_strings + ] + else: + columns_values = column_strings # If we're passed multiple column indexes then evaluate with # ast.literal_eval, since the column index values show up as a list of @@ -459,52 +561,87 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1): # Create the column index # Construct the base index - if not column_strings: - columns = pd.Index(column_strings) + if not columns_values: + columns = pd.Index(columns_values) else: columns = pd.MultiIndex.from_tuples( - list(map(to_pair, column_strings)), + list(map(to_pair, columns_values)), names=[col_index['name'] for col_index in column_indexes] or None, ) # if we're 
reconstructing the index if has_pandas_metadata: + columns = _reconstruct_columns_from_metadata(columns, column_indexes) - # Get levels and labels, and provide sane defaults if the index has a - # single level to avoid if/else spaghetti. - levels = getattr(columns, 'levels', None) or [columns] - labels = getattr(columns, 'labels', None) or [ - pd.RangeIndex(len(level)) for level in levels - ] + # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0 + columns = _flatten_single_level_multiindex(columns) - # Convert each level to the dtype provided in the metadata - levels_dtypes = [ - (level, col_index.get('numpy_type', level.dtype)) - for level, col_index in zip_longest( - levels, column_indexes, fillvalue={} - ) - ] - new_levels = [ - _level if _level.dtype == _dtype else _level.astype(_dtype) - for _level, _dtype in levels_dtypes - ] - columns = pd.MultiIndex( - levels=new_levels, - labels=labels, - names=columns.names + axes = [columns, index] + return _int.BlockManager(blocks, axes) + + +def _backwards_compatible_index_name(raw_name, logical_name): + # Part of table_to_blockmanager + pattern = r'^__index_level_\d+__$' + if raw_name == logical_name and re.match(pattern, raw_name) is not None: + return None + else: + return logical_name + + +def _reconstruct_columns_from_metadata(columns, column_indexes): + # Part of table_to_blockmanager + + # Get levels and labels, and provide sane defaults if the index has a + # single level to avoid if/else spaghetti. 
+ levels = getattr(columns, 'levels', None) or [columns] + labels = getattr(columns, 'labels', None) or [ + pd.RangeIndex(len(level)) for level in levels + ] + + # Convert each level to the dtype provided in the metadata + levels_dtypes = [ + (level, col_index.get('numpy_type', level.dtype)) + for level, col_index in zip_longest( + levels, column_indexes, fillvalue={} ) + ] + new_levels = [ + _level if _level.dtype == _dtype else _level.astype(_dtype) + for _level, _dtype in levels_dtypes + ] + + return pd.MultiIndex( + levels=new_levels, + labels=labels, + names=columns.names + ) + + +def _table_to_blocks(options, block_table, nthreads, memory_pool): + # Part of table_to_blockmanager + + # Convert an arrow table to Block from the internal pandas API + result = pa.lib.table_to_blocks(options, block_table, nthreads, + memory_pool) + + # Defined above + return [_reconstruct_block(item) for item in result] - # flatten a single level column MultiIndex for pandas 0.21.0 :( - if isinstance(columns, pd.MultiIndex) and columns.nlevels == 1: - levels, = columns.levels - labels, = columns.labels + +def _flatten_single_level_multiindex(index): + if isinstance(index, pd.MultiIndex) and index.nlevels == 1: + levels, = index.levels + labels, = index.labels # Cheaply check that we do not somehow have duplicate column names - assert len(levels) == len(labels), 'Found non-unique column index' - columns = levels[labels] + if not index.is_unique: + raise ValueError('Found non-unique column index') - axes = [columns, index] - return _int.BlockManager(blocks, axes) + return pd.Index([levels[_label] if _label != -1 else None + for _label in labels], + name=index.names[0]) + return index def _add_any_metadata(table, pandas_metadata): @@ -512,19 +649,36 @@ def _add_any_metadata(table, pandas_metadata): schema = table.schema + index_columns = pandas_metadata['index_columns'] + n_index_levels = len(index_columns) + n_columns = len(pandas_metadata['columns']) - n_index_levels + # Add time 
zones for i, col_meta in enumerate(pandas_metadata['columns']): - if col_meta['pandas_type'] == 'datetimetz': - col = table[i] - converted = col.to_pandas() - tz = col_meta['metadata']['timezone'] - tz_aware_type = pa.timestamp('ns', tz=tz) - with_metadata = pa.Array.from_pandas(converted.values, - type=tz_aware_type) - - field = pa.field(schema[i].name, tz_aware_type) - modified_columns[i] = pa.Column.from_array(field, - with_metadata) + + raw_name = col_meta.get('field_name') + if not raw_name: + # deal with metadata written with arrow < 0.8 + raw_name = col_meta['name'] + if i >= n_columns: + # index columns + raw_name = index_columns[i - n_columns] + if raw_name is None: + raw_name = 'None' + + idx = schema.get_field_index(raw_name) + if idx != -1: + if col_meta['pandas_type'] == 'datetimetz': + col = table[idx] + converted = col.to_pandas() + tz = col_meta['metadata']['timezone'] + tz_aware_type = pa.timestamp('ns', tz=tz) + with_metadata = pa.Array.from_pandas(converted.values, + type=tz_aware_type) + + field = pa.field(schema[idx].name, tz_aware_type) + modified_columns[idx] = pa.Column.from_array(field, + with_metadata) if len(modified_columns) > 0: columns = [] diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 9dcc30c8af479..151e0df8a22d0 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
+from collections import defaultdict import os import inspect import json @@ -54,6 +55,24 @@ def __init__(self, source, metadata=None, common_metadata=None): self.reader = ParquetReader() self.reader.open(source, metadata=metadata) self.common_metadata = common_metadata + self._nested_paths_by_prefix = self._build_nested_paths() + + def _build_nested_paths(self): + paths = self.reader.column_paths + + result = defaultdict(list) + + def _visit_piece(i, key, rest): + result[key].append(i) + + if len(rest) > 0: + nested_key = '.'.join((key, rest[0])) + _visit_piece(i, nested_key, rest[1:]) + + for i, path in enumerate(paths): + _visit_piece(i, path[0], path[1:]) + + return result @property def metadata(self): @@ -75,7 +94,9 @@ def read_row_group(self, i, columns=None, nthreads=1, Parameters ---------- columns: list - If not None, only these columns will be read from the row group. + If not None, only these columns will be read from the row group. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e' nthreads : int, default 1 Number of columns to read in parallel. If > 1, requires that the underlying file source is threadsafe @@ -100,7 +121,9 @@ def read(self, columns=None, nthreads=1, use_pandas_metadata=False): Parameters ---------- columns: list - If not None, only these columns will be read from the file. + If not None, only these columns will be read from the file. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e' nthreads : int, default 1 Number of columns to read in parallel. 
If > 1, requires that the underlying file source is threadsafe @@ -143,7 +166,11 @@ def _get_column_indices(self, column_names, use_pandas_metadata=False): if column_names is None: return None - indices = list(map(self.reader.column_name_idx, column_names)) + indices = [] + + for name in column_names: + if name in self._nested_paths_by_prefix: + indices.extend(self._nested_paths_by_prefix[name]) if use_pandas_metadata: file_keyvalues = self.metadata.metadata @@ -260,7 +287,7 @@ def __init__(self, where, schema, flavor=None, self.is_open = True def __del__(self): - if self.is_open: + if getattr(self, 'is_open', False): self.close() def write_table(self, table, row_group_size=None): @@ -421,10 +448,6 @@ def read(self, columns=None, nthreads=1, partitions=None, return table -def _is_parquet_file(path): - return path.endswith('parq') or path.endswith('parquet') - - class PartitionSet(object): """A data structure for cataloguing the observed Parquet partitions at a particular level. So if we have @@ -556,14 +579,14 @@ def _visit_level(self, level, base_path, part_keys): filtered_files = [] for path in files: full_path = self.pathsep.join((base_path, path)) - if _is_parquet_file(path): - filtered_files.append(full_path) - elif path.endswith('_common_metadata'): + if path.endswith('_common_metadata'): self.common_metadata_path = full_path elif path.endswith('_metadata'): self.metadata_path = full_path - elif not self._should_silently_exclude(path): + elif self._should_silently_exclude(path): print('Ignoring path: {0}'.format(full_path)) + else: + filtered_files.append(full_path) # ARROW-1079: Filter out "private" directories starting with underscore filtered_directories = [self.pathsep.join((base_path, x)) @@ -573,7 +596,7 @@ def _visit_level(self, level, base_path, part_keys): filtered_files.sort() filtered_directories.sort() - if len(files) > 0 and len(filtered_directories) > 0: + if len(filtered_files) > 0 and len(filtered_directories) > 0: raise ValueError('Found 
files in an intermediate ' 'directory: {0}'.format(base_path)) elif len(filtered_directories) > 0: @@ -841,7 +864,9 @@ def read_table(source, columns=None, nthreads=1, metadata=None, name or directory name. For passing Python file objects or byte buffers, see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader. columns: list - If not None, only these columns will be read from the file. + If not None, only these columns will be read from the file. A column + name may be a prefix of a nested field, e.g. 'a' will select 'a.b', + 'a.c', and 'a.d.e' nthreads : int, default 1 Number of columns to read in parallel. Requires that the underlying file source is threadsafe @@ -879,7 +904,9 @@ def read_pandas(source, columns=None, nthreads=1, metadata=None): name. For passing Python file objects or byte buffers, see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader. columns: list - If not None, only these columns will be read from the file. + If not None, only these columns will be read from the file. A column + name may be a prefix of a nested field, e.g. 'a' will select 'a.b', + 'a.c', and 'a.d.e' nthreads : int, default 1 Number of columns to read in parallel. 
Requires that the underlying file source is threadsafe @@ -939,6 +966,14 @@ def write_table(table, where, row_group_size=None, version='1.0', """.format(_parquet_writer_arg_docs) +def _mkdir_if_not_exists(fs, path): + if fs._isfilestore() and not fs.exists(path): + try: + fs.mkdir(path) + except OSError: + assert fs.exists(path) + + def write_to_dataset(table, root_path, partition_cols=None, filesystem=None, preserve_index=True, **kwargs): """ @@ -985,8 +1020,7 @@ def write_to_dataset(table, root_path, partition_cols=None, else: fs = _ensure_filesystem(filesystem) - if fs._isfilestore() and not fs.exists(root_path): - fs.mkdir(root_path) + _mkdir_if_not_exists(fs, root_path) if partition_cols is not None and len(partition_cols) > 0: df = table.to_pandas() @@ -1004,8 +1038,7 @@ def write_to_dataset(table, root_path, partition_cols=None, subtable = Table.from_pandas(subgroup, preserve_index=preserve_index) prefix = "/".join([root_path, subdir]) - if fs._isfilestore() and not fs.exists(prefix): - fs.mkdir(prefix) + _mkdir_if_not_exists(fs, prefix) outfile = compat.guid() + ".parquet" full_path = "/".join([prefix, outfile]) with fs.open(full_path, 'wb') as f: diff --git a/python/pyarrow/plasma.pyx b/python/pyarrow/plasma.pyx index bc0e94e64906e..29e233b6e4e67 100644 --- a/python/pyarrow/plasma.pyx +++ b/python/pyarrow/plasma.pyx @@ -30,7 +30,7 @@ import collections import pyarrow from pyarrow.lib cimport Buffer, NativeFile, check_status -from pyarrow.includes.libarrow cimport (CMutableBuffer, CBuffer, +from pyarrow.includes.libarrow cimport (CBuffer, CMutableBuffer, CFixedSizeBufferWriter, CStatus) @@ -81,7 +81,7 @@ cdef extern from "plasma/client.h" nogil: CStatus Create(const CUniqueID& object_id, int64_t data_size, const uint8_t* metadata, int64_t metadata_size, - uint8_t** data) + const shared_ptr[CBuffer]* data) CStatus Get(const CUniqueID* object_ids, int64_t num_objects, int64_t timeout_ms, CObjectBuffer* object_buffers) @@ -118,9 +118,9 @@ cdef extern from 
"plasma/client.h" nogil: cdef struct CObjectBuffer" plasma::ObjectBuffer": int64_t data_size - uint8_t* data + shared_ptr[CBuffer] data int64_t metadata_size - uint8_t* metadata + shared_ptr[CBuffer] metadata def make_object_id(object_id): @@ -136,6 +136,9 @@ cdef class ObjectID: CUniqueID data def __cinit__(self, object_id): + if not isinstance(object_id, bytes) or len(object_id) != 20: + raise ValueError("Object ID must by 20 bytes," + " is " + str(object_id)) self.data = CUniqueID.from_binary(object_id) def __richcmp__(ObjectID self, ObjectID object_id, operation): @@ -245,10 +248,8 @@ cdef class PlasmaClient: check_status(self.client.get().Get(ids.data(), ids.size(), timeout_ms, result[0].data())) - cdef _make_plasma_buffer(self, ObjectID object_id, uint8_t* data, + cdef _make_plasma_buffer(self, ObjectID object_id, shared_ptr[CBuffer] buffer, int64_t size): - cdef shared_ptr[CBuffer] buffer - buffer.reset(new CBuffer(data, size)) result = PlasmaBuffer(object_id, self) result.init(buffer) return result @@ -296,12 +297,12 @@ cdef class PlasmaClient: not be created because the plasma store is unable to evict enough objects to create room for it. """ - cdef uint8_t* data + cdef shared_ptr[CBuffer] data with nogil: check_status(self.client.get().Create(object_id.data, data_size, (metadata.data()), metadata.size(), &data)) - return self._make_mutable_plasma_buffer(object_id, data, data_size) + return self._make_mutable_plasma_buffer(object_id, data.get().mutable_data(), data_size) def get_buffers(self, object_ids, timeout_ms=-1): """ @@ -370,7 +371,7 @@ cdef class PlasmaClient: object_buffers[i].metadata_size)) return result - def put(self, object value, ObjectID object_id=None, + def put(self, object value, ObjectID object_id=None, int memcopy_threads=6, serialization_context=None): """ Store a Python value into the object store. 
@@ -382,6 +383,9 @@ cdef class PlasmaClient: object_id : ObjectID, default None If this is provided, the specified object ID will be used to refer to the object. + memcopy_threads : int, default 6 + The number of threads to use to write the serialized object into + the object store for large objects. serialization_context : pyarrow.SerializationContext, default None Custom serialization and deserialization context. @@ -394,7 +398,7 @@ cdef class PlasmaClient: serialized = pyarrow.serialize(value, serialization_context) buffer = self.create(target_id, serialized.total_bytes) stream = pyarrow.FixedSizeBufferWriter(buffer) - stream.set_memcopy_threads(4) + stream.set_memcopy_threads(memcopy_threads) serialized.write_to(stream) self.seal(target_id) return target_id diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 9f1051228047a..2fdb606a7d1c7 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -43,6 +43,13 @@ cdef public api object pyarrow_wrap_buffer(const shared_ptr[CBuffer]& buf): return result +cdef public api object pyarrow_wrap_resizable_buffer( + const shared_ptr[CResizableBuffer]& buf): + cdef ResizableBuffer result = ResizableBuffer() + result.init_rz(buf) + return result + + cdef public api bint pyarrow_is_data_type(object type_): return isinstance(type_, DataType) @@ -72,13 +79,13 @@ cdef public api object pyarrow_wrap_data_type( elif type.get().id() == _Type_STRUCT: out = StructType() elif type.get().id() == _Type_UNION: - out = StructType() + out = UnionType() elif type.get().id() == _Type_TIMESTAMP: out = TimestampType() elif type.get().id() == _Type_FIXED_SIZE_BINARY: out = FixedSizeBinaryType() elif type.get().id() == _Type_DECIMAL: - out = DecimalType() + out = Decimal128Type() else: out = DataType() diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index c37ed3b200ea3..1bc5ed7a372a8 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -258,7 +258,7 @@ 
cdef class DecimalValue(ArrayValue): def as_py(self): cdef: - CDecimalArray* ap = self.sp_array.get() + CDecimal128Array* ap = self.sp_array.get() c_string s = ap.FormatValue(self.index) return _pydecimal.Decimal(s.decode('utf8')) @@ -315,6 +315,24 @@ cdef class ListValue(ArrayValue): return result +cdef class UnionValue(ArrayValue): + + cdef void _set_array(self, const shared_ptr[CArray]& sp_array): + self.sp_array = sp_array + self.ap = sp_array.get() + + cdef getitem(self, int64_t i): + cdef int8_t type_id = self.ap.raw_type_ids()[i] + cdef shared_ptr[CArray] child = self.ap.child(type_id) + if self.ap.mode() == _UnionMode_SPARSE: + return box_scalar(self.type[type_id], child, i) + else: + return box_scalar(self.type[type_id], child, + self.ap.value_offset(i)) + + def as_py(self): + return self.getitem(self.index).as_py() + cdef class FixedSizeBinaryValue(ArrayValue): def as_py(self): @@ -364,6 +382,7 @@ cdef dict _scalar_classes = { _Type_FLOAT: FloatValue, _Type_DOUBLE: DoubleValue, _Type_LIST: ListValue, + _Type_UNION: UnionValue, _Type_BINARY: BinaryValue, _Type_STRING: StringValue, _Type_FIXED_SIZE_BINARY: FixedSizeBinaryValue, diff --git a/python/pyarrow/serialization.pxi b/python/pyarrow/serialization.pxi index 4e9ab8eb3b374..d95d582fe537e 100644 --- a/python/pyarrow/serialization.pxi +++ b/python/pyarrow/serialization.pxi @@ -59,6 +59,22 @@ cdef class SerializationContext: self.custom_serializers = dict() self.custom_deserializers = dict() + def clone(self): + """ + Return copy of this SerializationContext + + Returns + ------- + clone : SerializationContext + """ + result = SerializationContext() + result.type_to_type_id = self.type_to_type_id.copy() + result.whitelisted_types = self.whitelisted_types.copy() + result.custom_serializers = self.custom_serializers.copy() + result.custom_deserializers = self.custom_deserializers.copy() + + return result + def register_type(self, type_, type_id, pickle=False, custom_serializer=None, 
custom_deserializer=None): """EXPERIMENTAL: Add type to the list of types we can serialize. @@ -88,17 +104,26 @@ cdef class SerializationContext: self.custom_deserializers[type_id] = custom_deserializer def _serialize_callback(self, obj): - if type(obj) not in self.type_to_type_id: + found = False + for type_ in type(obj).__mro__: + if type_ in self.type_to_type_id: + found = True + break + + if not found: raise SerializationCallbackError( "pyarrow does not know how to " - "serialize objects of type {}.".format(type(obj)), obj) - type_id = self.type_to_type_id[type(obj)] + "serialize objects of type {}.".format(type(obj)), obj + ) + + # use the closest match to type(obj) + type_id = self.type_to_type_id[type_] if type_id in self.types_to_pickle: serialized_obj = {"data": pickle.dumps(obj), "pickle": True} elif type_id in self.custom_serializers: serialized_obj = {"data": self.custom_serializers[type_id](obj)} else: - if is_named_tuple(type(obj)): + if is_named_tuple(type_): serialized_obj = {} serialized_obj["_pa_getnewargs_"] = obj.__getnewargs__() elif hasattr(obj, "__dict__"): @@ -137,6 +162,30 @@ cdef class SerializationContext: obj.__dict__.update(serialized_obj) return obj + def serialize(self, obj): + """ + Call pyarrow.serialize and pass this SerializationContext + """ + return serialize(obj, context=self) + + def serialize_to(self, object value, sink): + """ + Call pyarrow.serialize_to and pass this SerializationContext + """ + return serialize_to(value, sink, context=self) + + def deserialize(self, what): + """ + Call pyarrow.deserialize and pass this SerializationContext + """ + return deserialize(what, context=self) + + def deserialize_components(self, what): + """ + Call pyarrow.deserialize_components and pass this SerializationContext + """ + return deserialize_components(what, context=self) + _default_serialization_context = SerializationContext() @@ -156,7 +205,7 @@ cdef class SerializedPyObject: def __get__(self): cdef CMockOutputStream mock_stream 
with nogil: - check_status(WriteSerializedObject(self.data, &mock_stream)) + check_status(self.data.WriteTo(&mock_stream)) return mock_stream.GetExtentBytesWritten() @@ -170,7 +219,7 @@ cdef class SerializedPyObject: cdef _write_to(self, OutputStream* stream): with nogil: - check_status(WriteSerializedObject(self.data, stream)) + check_status(self.data.WriteTo(stream)) def deserialize(self, SerializationContext context=None): """ @@ -200,6 +249,46 @@ cdef class SerializedPyObject: self.write_to(sink) return output + @staticmethod + def from_components(components): + """ + Reconstruct SerializedPyObject from output of + SerializedPyObject.to_components + """ + cdef: + int num_tensors = components['num_tensors'] + int num_buffers = components['num_buffers'] + list buffers = components['data'] + SerializedPyObject result = SerializedPyObject() + + with nogil: + check_status(GetSerializedFromComponents(num_tensors, num_buffers, + buffers, &result.data)) + + return result + + def to_components(self, memory_pool=None): + """ + Return the decomposed dict representation of the serialized object + containing a collection of Buffer objects which maximize opportunities + for zero-copy + + Parameters + ---------- + memory_pool : MemoryPool default None + Pool to use for necessary allocations + + Returns + + """ + cdef PyObject* result + cdef CMemoryPool* c_pool = maybe_unbox_memory_pool(memory_pool) + + with nogil: + check_status(self.data.GetComponents(c_pool, &result)) + + return PyObject_to_object(result) + def serialize(object value, SerializationContext context=None): """EXPERIMENTAL: Serialize a Python sequence @@ -292,6 +381,24 @@ def deserialize_from(source, object base, SerializationContext context=None): return serialized.deserialize(context) +def deserialize_components(components, SerializationContext context=None): + """ + Reconstruct Python object from output of SerializedPyObject.to_components + + Parameters + ---------- + components : dict + Output of 
SerializedPyObject.to_components + context : SerializationContext, default None + + Returns + ------- + object : the Python object that was originally serialized + """ + serialized = SerializedPyObject.from_components(components) + return serialized.deserialize(context) + + def deserialize(obj, SerializationContext context=None): """ EXPERIMENTAL: Deserialize Python object from Buffer or other Python object diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py index 9dc8ee6dee9ad..61f2e83f3193d 100644 --- a/python/pyarrow/serialization.py +++ b/python/pyarrow/serialization.py @@ -16,12 +16,90 @@ # under the License. from collections import OrderedDict, defaultdict +import six import sys import numpy as np -from pyarrow import serialize_pandas, deserialize_pandas -from pyarrow.lib import _default_serialization_context +from pyarrow.compat import builtin_pickle +from pyarrow.lib import _default_serialization_context, frombuffer + +try: + import cloudpickle +except ImportError: + cloudpickle = builtin_pickle + + +# ---------------------------------------------------------------------- +# Set up serialization for numpy with dtype object (primitive types are +# handled efficiently with Arrow's Tensor facilities, see +# python_to_arrow.cc) + +def _serialize_numpy_array_list(obj): + return obj.tolist(), obj.dtype.str + + +def _deserialize_numpy_array_list(data): + return np.array(data[0], dtype=np.dtype(data[1])) + + +def _pickle_to_buffer(x): + pickled = builtin_pickle.dumps(x, protocol=builtin_pickle.HIGHEST_PROTOCOL) + return frombuffer(pickled) + + +def _load_pickle_from_buffer(data): + as_memoryview = memoryview(data) + if six.PY2: + return builtin_pickle.loads(as_memoryview.tobytes()) + else: + return builtin_pickle.loads(as_memoryview) + + +_serialize_numpy_array_pickle = _pickle_to_buffer +_deserialize_numpy_array_pickle = _load_pickle_from_buffer + + +# ---------------------------------------------------------------------- +# 
pandas-specific serialization matters + +def _register_custom_pandas_handlers(context): + # ARROW-1784, faster path for pandas-only visibility + + try: + import pandas as pd + except ImportError: + return + + import pyarrow.pandas_compat as pdcompat + + def _serialize_pandas_dataframe(obj): + return pdcompat.dataframe_to_serialized_dict(obj) + + def _deserialize_pandas_dataframe(data): + return pdcompat.serialized_dict_to_dataframe(data) + + def _serialize_pandas_series(obj): + return _serialize_pandas_dataframe(pd.DataFrame({obj.name: obj})) + + def _deserialize_pandas_series(data): + deserialized = _deserialize_pandas_dataframe(data) + return deserialized[deserialized.columns[0]] + + context.register_type( + pd.Series, 'pd.Series', + custom_serializer=_serialize_pandas_series, + custom_deserializer=_deserialize_pandas_series) + + context.register_type( + pd.Index, 'pd.Index', + custom_serializer=_pickle_to_buffer, + custom_deserializer=_load_pickle_from_buffer) + + context.register_type( + pd.DataFrame, 'pd.DataFrame', + custom_serializer=_serialize_pandas_dataframe, + custom_deserializer=_deserialize_pandas_dataframe) def register_default_serialization_handlers(serialization_context): @@ -69,53 +147,12 @@ def _deserialize_default_dict(data): type(lambda: 0), "function", pickle=True) - # ---------------------------------------------------------------------- - # Set up serialization for numpy with dtype object (primitive types are - # handled efficiently with Arrow's Tensor facilities, see - # python_to_arrow.cc) - - def _serialize_numpy_array(obj): - return obj.tolist(), obj.dtype.str - - def _deserialize_numpy_array(data): - return np.array(data[0], dtype=np.dtype(data[1])) + serialization_context.register_type(type, "type", pickle=True) serialization_context.register_type( np.ndarray, 'np.array', - custom_serializer=_serialize_numpy_array, - custom_deserializer=_deserialize_numpy_array) - - # 
---------------------------------------------------------------------- - # Set up serialization for pandas Series and DataFrame - - try: - import pandas as pd - - def _serialize_pandas_series(obj): - return serialize_pandas(pd.DataFrame({obj.name: obj})) - - def _deserialize_pandas_series(data): - deserialized = deserialize_pandas(data) - return deserialized[deserialized.columns[0]] - - def _serialize_pandas_dataframe(obj): - return serialize_pandas(obj) - - def _deserialize_pandas_dataframe(data): - return deserialize_pandas(data) - - serialization_context.register_type( - pd.Series, 'pd.Series', - custom_serializer=_serialize_pandas_series, - custom_deserializer=_deserialize_pandas_series) - - serialization_context.register_type( - pd.DataFrame, 'pd.DataFrame', - custom_serializer=_serialize_pandas_dataframe, - custom_deserializer=_deserialize_pandas_dataframe) - except ImportError: - # no pandas - pass + custom_serializer=_serialize_numpy_array_list, + custom_deserializer=_deserialize_numpy_array_list) # ---------------------------------------------------------------------- # Set up serialization for pytorch tensors @@ -140,5 +177,14 @@ def _deserialize_torch_tensor(data): # no torch pass + _register_custom_pandas_handlers(serialization_context) + register_default_serialization_handlers(_default_serialization_context) + +pandas_serialization_context = _default_serialization_context.clone() + +pandas_serialization_context.register_type( + np.ndarray, 'np.array', + custom_serializer=_serialize_numpy_array_pickle, + custom_deserializer=_deserialize_numpy_array_pickle) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 6165a6622b836..b03ee26702245 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -108,6 +108,15 @@ cdef class ChunkedArray: return pyarrow_wrap_array(self.chunked_array.chunk(i)) + property chunks: + + def __get__(self): + cdef int i + chunks = [] + for i in range(self.num_chunks): + chunks.append(self.chunk(i)) 
+ return chunks + def iterchunks(self): for i in range(self.num_chunks): yield self.chunk(i) @@ -122,6 +131,83 @@ cdef class ChunkedArray: return result +def chunked_array(arrays, type=None): + """ + Construct chunked array from list of array-like objects + + Parameters + ---------- + arrays : list of Array or values coercible to arrays + type : DataType or string coercible to DataType + + Returns + ------- + ChunkedArray + """ + cdef: + Array arr + vector[shared_ptr[CArray]] c_arrays + shared_ptr[CChunkedArray] sp_chunked_array + + for x in arrays: + if isinstance(x, Array): + arr = x + if type is not None: + assert x.type == type + else: + arr = array(x, type=type) + + c_arrays.push_back(arr.sp_array) + + sp_chunked_array.reset(new CChunkedArray(c_arrays)) + return pyarrow_wrap_chunked_array(sp_chunked_array) + + +def column(object field_or_name, arr): + """ + Create Column object from field/string and array-like data + + Parameters + ---------- + field_or_name : string or Field + arr : Array, list of Arrays, or ChunkedArray + + Returns + ------- + column : Column + """ + cdef: + Field boxed_field + Array _arr + ChunkedArray _carr + shared_ptr[CColumn] sp_column + + if isinstance(arr, list): + arr = chunked_array(arr) + elif not isinstance(arr, (Array, ChunkedArray)): + arr = array(arr) + + if isinstance(field_or_name, Field): + boxed_field = field_or_name + if arr.type != boxed_field.type: + raise ValueError('Passed field type does not match array') + else: + boxed_field = field(field_or_name, arr.type) + + if isinstance(arr, Array): + _arr = arr + sp_column.reset(new CColumn(boxed_field.sp_field, _arr.sp_array)) + elif isinstance(arr, ChunkedArray): + _carr = arr + sp_column.reset(new CColumn(boxed_field.sp_field, + _carr.sp_chunked_array)) + else: + raise ValueError("Unsupported type for column(...): {}" + .format(type(arr))) + + return pyarrow_wrap_column(sp_column) + + cdef class Column: """ Named vector of elements of equal type. 
@@ -143,25 +229,47 @@ cdef class Column: result = StringIO() result.write(object.__repr__(self)) data = self.data - for i in range(len(data)): - result.write('\nchunk {0}: {1}'.format(i, repr(data.chunk(0)))) + for i, chunk in enumerate(data.chunks): + result.write('\nchunk {0}: {1}'.format(i, repr(chunk))) return result.getvalue() @staticmethod - def from_array(object field_or_name, Array arr): - cdef Field boxed_field + def from_array(*args): + return column(*args) - if isinstance(field_or_name, Field): - boxed_field = field_or_name - if arr.type != boxed_field.type: - raise ValueError('Passed field type does not match array') - else: - boxed_field = field(field_or_name, arr.type) + def cast(self, object target_type, safe=True): + """ + Cast column values to another data type - cdef shared_ptr[CColumn] sp_column - sp_column.reset(new CColumn(boxed_field.sp_field, arr.sp_array)) - return pyarrow_wrap_column(sp_column) + Parameters + ---------- + target_type : DataType + Type to cast to + safe : boolean, default True + Check for overflows or other unsafe conversions + + Returns + ------- + casted : Column + """ + cdef: + CCastOptions options + shared_ptr[CArray] result + DataType type + CDatum out + + type = _ensure_type(target_type) + + options.allow_int_overflow = not safe + options.allow_time_truncate = not safe + + with nogil: + check_status(Cast(_context(), CDatum(self.column.data()), + type.sp_type, options, &out)) + + casted_data = pyarrow_wrap_chunked_array(out.chunked_array()) + return column(self.name, casted_data) def to_pandas(self, strings_to_categorical=False, zero_copy_only=False): """ @@ -241,6 +349,10 @@ cdef class Column: self._check_nullptr() return self.column.length() + @property + def field(self): + return pyarrow_wrap_field(self.column.field()) + @property def shape(self): """ @@ -345,7 +457,10 @@ cdef _schema_from_arrays(arrays, names, dict metadata, else: raise TypeError(type(val)) - c_name = tobytes(names[i]) + if names[i] is None: + 
c_name = tobytes(u'None') + else: + c_name = tobytes(names[i]) fields[i].reset(new CField(c_name, type_, True)) schema.reset(new CSchema(fields, unbox_metadata(metadata))) @@ -609,7 +724,6 @@ cdef class RecordBatch: Array arr c_string c_name shared_ptr[CSchema] schema - shared_ptr[CRecordBatch] batch vector[shared_ptr[CArray]] c_arrays int64_t num_rows int64_t i @@ -625,8 +739,8 @@ cdef class RecordBatch: for arr in arrays: c_arrays.push_back(arr.sp_array) - batch.reset(new CRecordBatch(schema, num_rows, c_arrays)) - return pyarrow_wrap_batch(batch) + return pyarrow_wrap_batch( + CRecordBatch.Make(schema, num_rows, c_arrays)) def table_to_blocks(PandasOptions options, Table table, int nthreads, @@ -831,8 +945,7 @@ cdef class Table: else: raise ValueError(type(arrays[i])) - table.reset(new CTable(c_schema, columns)) - return pyarrow_wrap_table(table) + return pyarrow_wrap_table(CTable.Make(c_schema, columns)) @staticmethod def from_batches(batches): @@ -858,6 +971,44 @@ cdef class Table: return pyarrow_wrap_table(c_table) + def to_batches(self, chunksize=None): + """ + Convert Table to list of (contiguous) RecordBatch objects, with optimal + maximum chunk size + + Parameters + ---------- + chunksize : int, default None + Maximum size for RecordBatch chunks. 
Individual chunks may be + smaller depending on the chunk layout of individual columns + + Returns + ------- + batches : list of RecordBatch + """ + cdef: + unique_ptr[TableBatchReader] reader + int64_t c_chunksize + list result = [] + shared_ptr[CRecordBatch] batch + + reader.reset(new TableBatchReader(deref(self.table))) + + if chunksize is not None: + c_chunksize = chunksize + reader.get().set_chunksize(c_chunksize) + + while True: + with nogil: + check_status(reader.get().ReadNext(&batch)) + + if batch.get() == NULL: + break + + result.append(pyarrow_wrap_batch(batch)) + + return result + def to_pandas(self, nthreads=None, strings_to_categorical=False, memory_pool=None, zero_copy_only=False): """ diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index c6bd6c9b3a2d7..e27682232a22d 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-from pytest import skip +from pytest import skip, mark groups = [ @@ -70,6 +70,18 @@ def pytest_addoption(parser): default=False, help=('Run only the {0} test group'.format(group))) + parser.addoption('--runslow', action='store_true', + default=False, help='run slow tests') + + +def pytest_collection_modifyitems(config, items): + if not config.getoption('--runslow'): + skip_slow = mark.skip(reason='need --runslow option to run') + + for item in items: + if 'slow' in item.keywords: + item.add_marker(skip_slow) + def pytest_runtest_setup(item): only_set = False diff --git a/python/pyarrow/tests/data/v0.7.1.all-named-index.parquet b/python/pyarrow/tests/data/v0.7.1.all-named-index.parquet new file mode 100644 index 0000000000000..e9efd9b390ed4 Binary files /dev/null and b/python/pyarrow/tests/data/v0.7.1.all-named-index.parquet differ diff --git a/python/pyarrow/tests/data/v0.7.1.column-metadata-handling.parquet b/python/pyarrow/tests/data/v0.7.1.column-metadata-handling.parquet new file mode 100644 index 0000000000000..d48041f518d55 Binary files /dev/null and b/python/pyarrow/tests/data/v0.7.1.column-metadata-handling.parquet differ diff --git a/python/pyarrow/tests/data/v0.7.1.parquet b/python/pyarrow/tests/data/v0.7.1.parquet new file mode 100644 index 0000000000000..44670bcd19afa Binary files /dev/null and b/python/pyarrow/tests/data/v0.7.1.parquet differ diff --git a/python/pyarrow/tests/data/v0.7.1.some-named-index.parquet b/python/pyarrow/tests/data/v0.7.1.some-named-index.parquet new file mode 100644 index 0000000000000..34097ca12c7e2 Binary files /dev/null and b/python/pyarrow/tests/data/v0.7.1.some-named-index.parquet differ diff --git a/js/prepublish.sh b/python/pyarrow/tests/deserialize_buffer.py similarity index 81% rename from js/prepublish.sh rename to python/pyarrow/tests/deserialize_buffer.py index b40504ae808cb..982dc6695d590 100644 --- a/js/prepublish.sh +++ b/python/pyarrow/tests/deserialize_buffer.py @@ -1,5 +1,3 @@ -#!/usr/bin/env bash - # 
Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,10 +15,12 @@ # specific language governing permissions and limitations # under the License. -npm run clean -npm run lint -npm run build -npm run test -npm --no-git-tag-version version patch &>/dev/null -npm run bundle -npm run lerna:publish \ No newline at end of file +# This file is called from a test in test_serialization.py. + +import sys + +import pyarrow as pa + +with open(sys.argv[1], 'rb') as f: + data = f.read() + pa.deserialize(data) diff --git a/python/pyarrow/tests/pandas_examples.py b/python/pyarrow/tests/pandas_examples.py index c145e96342668..f11da3c6ce945 100644 --- a/python/pyarrow/tests/pandas_examples.py +++ b/python/pyarrow/tests/pandas_examples.py @@ -110,6 +110,14 @@ def dataframe_with_lists(include_index=False): [0.], np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2], ] + fields.append(pa.field('bytes_list', pa.list_(pa.binary()))) + arrays['bytes_list'] = [ + [b"1", b"f"], + None, + [b"1"], + [b"1", b"2", b"3"], + [], + ] fields.append(pa.field('str_list', pa.list_(pa.string()))) arrays['str_list'] = [ [u"1", u"ä"], diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index e3a4c97567ee6..fa38c9257854e 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -235,6 +235,28 @@ def test_list_from_arrays(): assert result.equals(expected) +def test_union_from_dense(): + binary = pa.array([b'a', b'b', b'c', b'd'], type='binary') + int64 = pa.array([1, 2, 3], type='int64') + types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8') + value_offsets = pa.array([0, 0, 2, 1, 1, 2, 3], type='int32') + + result = pa.UnionArray.from_dense(types, value_offsets, [binary, int64]) + + assert result.to_pylist() == [b'a', 1, b'c', b'b', 2, 3, b'd'] + + +def test_union_from_sparse(): + binary = pa.array([b'a', 
b' ', b'b', b'c', b' ', b' ', b'd'], + type='binary') + int64 = pa.array([0, 1, 0, 0, 2, 3, 0], type='int64') + types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8') + + result = pa.UnionArray.from_sparse(types, [binary, int64]) + + assert result.to_pylist() == [b'a', 1, b'b', b'c', 2, 3, b'd'] + + def _check_cast_case(case, safe=True): in_data, in_type, out_data, out_type = case @@ -273,6 +295,18 @@ def test_cast_integers_safe(): in_arr.cast(out_type) +def test_cast_column(): + arrays = [pa.array([1, 2, 3]), pa.array([4, 5, 6])] + + col = pa.column('foo', arrays) + + target = pa.float64() + casted = col.cast(target) + + expected = pa.column('foo', [x.cast(target) for x in arrays]) + assert casted.equals(expected) + + def test_cast_integers_unsafe(): # We let NumPy do the unsafe casting unsafe_cases = [ @@ -297,8 +331,12 @@ def test_cast_timestamp_unit(): s_nyc = s.dt.tz_localize('tzlocal()').dt.tz_convert('America/New_York') us_with_tz = pa.timestamp('us', tz='America/New_York') + arr = pa.Array.from_pandas(s_nyc, type=us_with_tz) + # ARROW-1906 + assert arr.type == us_with_tz + arr2 = pa.Array.from_pandas(s, type=pa.timestamp('us')) assert arr[0].as_py() == s_nyc[0] @@ -328,6 +366,86 @@ def test_cast_signed_to_unsigned(): _check_cast_case(case) +def test_unique_simple(): + cases = [ + (pa.array([1, 2, 3, 1, 2, 3]), pa.array([1, 2, 3])), + (pa.array(['foo', None, 'bar', 'foo']), + pa.array(['foo', 'bar'])) + ] + for arr, expected in cases: + result = arr.unique() + assert result.equals(expected) + + +def test_dictionary_encode_simple(): + cases = [ + (pa.array([1, 2, 3, None, 1, 2, 3]), + pa.DictionaryArray.from_arrays( + pa.array([0, 1, 2, None, 0, 1, 2], type='int32'), + [1, 2, 3])), + (pa.array(['foo', None, 'bar', 'foo']), + pa.DictionaryArray.from_arrays( + pa.array([0, None, 1, 0], type='int32'), + ['foo', 'bar'])) + ] + for arr, expected in cases: + result = arr.dictionary_encode() + assert result.equals(expected) + + +def test_cast_time32_to_int(): + arr = 
pa.array(np.array([0, 1, 2], dtype='int32'), + type=pa.time32('s')) + expected = pa.array([0, 1, 2], type='i4') + + result = arr.cast('i4') + assert result.equals(expected) + + +def test_cast_time64_to_int(): + arr = pa.array(np.array([0, 1, 2], dtype='int64'), + type=pa.time64('us')) + expected = pa.array([0, 1, 2], type='i8') + + result = arr.cast('i8') + assert result.equals(expected) + + +def test_cast_timestamp_to_int(): + arr = pa.array(np.array([0, 1, 2], dtype='int64'), + type=pa.timestamp('us')) + expected = pa.array([0, 1, 2], type='i8') + + result = arr.cast('i8') + assert result.equals(expected) + + +def test_cast_date32_to_int(): + arr = pa.array([0, 1, 2], type='i4') + + result1 = arr.cast('date32') + result2 = result1.cast('i4') + + expected1 = pa.array([ + datetime.date(1970, 1, 1), + datetime.date(1970, 1, 2), + datetime.date(1970, 1, 3) + ]).cast('date32') + + assert result1.equals(expected1) + assert result2.equals(arr) + + +def test_cast_date64_to_int(): + arr = pa.array(np.array([0, 1, 2], dtype='int64'), + type=pa.date64()) + expected = pa.array([0, 1, 2], type='i8') + + result = arr.cast('i8') + + assert result.equals(expected) + + def test_simple_type_construction(): result = pa.lib.TimestampType() with pytest.raises(TypeError): @@ -356,7 +474,7 @@ def test_simple_type_construction(): (pa.binary(length=4), 'bytes'), (pa.string(), 'unicode'), (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'), - (pa.decimal(18, 3), 'decimal'), + (pa.decimal128(18, 3), 'decimal'), (pa.timestamp('ms'), 'datetime'), (pa.timestamp('us', 'UTC'), 'datetimetz'), (pa.time32('s'), 'time'), @@ -381,6 +499,14 @@ def test_array_conversions_no_sentinel_values(): assert arr3.null_count == 0 +def test_array_from_numpy_datetimeD(): + arr = np.array([None, datetime.date(2017, 4, 4)], dtype='datetime64[D]') + + result = pa.array(arr) + expected = pa.array([None, datetime.date(2017, 4, 4)], type=pa.date32()) + assert result.equals(expected) + + def 
test_array_from_numpy_ascii(): arr = np.array(['abcde', 'abc', ''], dtype='|S5') diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 414266ddb14ed..d7760da2f9b47 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -178,6 +178,25 @@ def test_date(self): assert arr[2].as_py() == datetime.date(1970, 1, 1) assert arr[3].as_py() == datetime.date(2040, 2, 26) + def test_date32(self): + data = [datetime.date(2000, 1, 1), None] + arr = pa.array(data, type=pa.date32()) + + data2 = [10957, None] + arr2 = pa.array(data2, type=pa.date32()) + + for x in [arr, arr2]: + assert len(x) == 2 + assert x.type == pa.date32() + assert x.null_count == 1 + assert x[0].as_py() == datetime.date(2000, 1, 1) + assert x[1] is pa.NA + + # Overflow + data3 = [2**32, None] + with pytest.raises(pa.ArrowException): + pa.array(data3, type=pa.date32()) + def test_timestamp(self): data = [ datetime.datetime(2007, 7, 13, 1, 23, 34, 123456), @@ -293,9 +312,16 @@ def test_mixed_types_fails(self): with self.assertRaises(pa.ArrowException): pa.array(data) + def test_mixed_types_with_specified_type_fails(self): + data = ['-10', '-5', {'a': 1}, '0', '5', '10'] + + type = pa.string() + with self.assertRaises(pa.ArrowInvalid): + pa.array(data, type=type) + def test_decimal(self): data = [decimal.Decimal('1234.183'), decimal.Decimal('8094.234')] - type = pa.decimal(precision=7, scale=3) + type = pa.decimal128(precision=7, scale=3) arr = pa.array(data, type=type) assert arr.to_pylist() == data @@ -303,32 +329,32 @@ def test_decimal_different_precisions(self): data = [ decimal.Decimal('1234234983.183'), decimal.Decimal('80943244.234') ] - type = pa.decimal(precision=13, scale=3) + type = pa.decimal128(precision=13, scale=3) arr = pa.array(data, type=type) assert arr.to_pylist() == data def test_decimal_no_scale(self): data = [decimal.Decimal('1234234983'), decimal.Decimal('8094324')] - type = 
pa.decimal(precision=10) + type = pa.decimal128(precision=10) arr = pa.array(data, type=type) assert arr.to_pylist() == data def test_decimal_negative(self): data = [decimal.Decimal('-1234.234983'), decimal.Decimal('-8.094324')] - type = pa.decimal(precision=10, scale=6) + type = pa.decimal128(precision=10, scale=6) arr = pa.array(data, type=type) assert arr.to_pylist() == data def test_decimal_no_whole_part(self): data = [decimal.Decimal('-.4234983'), decimal.Decimal('.0103943')] - type = pa.decimal(precision=7, scale=7) + type = pa.decimal128(precision=7, scale=7) arr = pa.array(data, type=type) assert arr.to_pylist() == data def test_decimal_large_integer(self): data = [decimal.Decimal('-394029506937548693.42983'), decimal.Decimal('32358695912932.01033')] - type = pa.decimal(precision=23, scale=5) + type = pa.decimal128(precision=23, scale=5) arr = pa.array(data, type=type) assert arr.to_pylist() == data diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index d00bf1b28eddc..83b1da135eea4 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -30,7 +30,7 @@ import pandas as pd import pandas.util.testing as tm -from pyarrow.compat import u +from pyarrow.compat import u, PY2 import pyarrow as pa import pyarrow.types as patypes @@ -60,70 +60,73 @@ def _alltypes_example(size=100): }) -class TestPandasConversion(object): +def _check_pandas_roundtrip(df, expected=None, nthreads=1, + expected_schema=None, + check_dtype=True, schema=None, + preserve_index=False, + as_batch=False): + klass = pa.RecordBatch if as_batch else pa.Table + table = klass.from_pandas(df, schema=schema, + preserve_index=preserve_index, + nthreads=nthreads) - def setUp(self): - pass - - def tearDown(self): - pass - - def _check_pandas_roundtrip(self, df, expected=None, nthreads=1, - expected_schema=None, - check_dtype=True, schema=None, - preserve_index=False, - as_batch=False): - klass = 
pa.RecordBatch if as_batch else pa.Table - table = klass.from_pandas(df, schema=schema, - preserve_index=preserve_index, - nthreads=nthreads) - - result = table.to_pandas(nthreads=nthreads) - if expected_schema: - assert table.schema.equals(expected_schema) - if expected is None: - expected = df - tm.assert_frame_equal(result, expected, check_dtype=check_dtype, - check_index_type=('equiv' if preserve_index - else False)) - - def _check_series_roundtrip(self, s, type_=None): - arr = pa.array(s, from_pandas=True, type=type_) - - result = pd.Series(arr.to_pandas(), name=s.name) - if patypes.is_timestamp(arr.type) and arr.type.tz is not None: - result = (result.dt.tz_localize('utc') - .dt.tz_convert(arr.type.tz)) - - tm.assert_series_equal(s, result) - - def _check_array_roundtrip(self, values, expected=None, mask=None, - type=None): - arr = pa.array(values, from_pandas=True, mask=mask, type=type) - result = arr.to_pandas() - - values_nulls = pd.isnull(values) - if mask is None: - assert arr.null_count == values_nulls.sum() - else: - assert arr.null_count == (mask | values_nulls).sum() + result = table.to_pandas(nthreads=nthreads) + if expected_schema: + assert table.schema.equals(expected_schema) + if expected is None: + expected = df + tm.assert_frame_equal(result, expected, check_dtype=check_dtype, + check_index_type=('equiv' if preserve_index + else False)) - if mask is None: - tm.assert_series_equal(pd.Series(result), pd.Series(values), - check_names=False) - else: - expected = pd.Series(np.ma.masked_array(values, mask=mask)) - tm.assert_series_equal(pd.Series(result), expected, - check_names=False) + +def _check_series_roundtrip(s, type_=None): + arr = pa.array(s, from_pandas=True, type=type_) + + result = pd.Series(arr.to_pandas(), name=s.name) + if patypes.is_timestamp(arr.type) and arr.type.tz is not None: + result = (result.dt.tz_localize('utc') + .dt.tz_convert(arr.type.tz)) + + tm.assert_series_equal(s, result) + + +def _check_array_roundtrip(values, 
expected=None, mask=None, + type=None): + arr = pa.array(values, from_pandas=True, mask=mask, type=type) + result = arr.to_pandas() + + values_nulls = pd.isnull(values) + if mask is None: + assert arr.null_count == values_nulls.sum() + else: + assert arr.null_count == (mask | values_nulls).sum() + + if mask is None: + tm.assert_series_equal(pd.Series(result), pd.Series(values), + check_names=False) + else: + expected = pd.Series(np.ma.masked_array(values, mask=mask)) + tm.assert_series_equal(pd.Series(result), expected, + check_names=False) + + +def _check_array_from_pandas_roundtrip(np_array): + arr = pa.array(np_array, from_pandas=True) + result = arr.to_pandas() + npt.assert_array_equal(result, np_array) + + +class TestPandasConversion(object): def test_all_none_objects(self): df = pd.DataFrame({'a': [None, None, None]}) - self._check_pandas_roundtrip(df) + _check_pandas_roundtrip(df) def test_all_none_category(self): df = pd.DataFrame({'a': [None, None, None]}) df['a'] = df['a'].astype('category') - self._check_pandas_roundtrip(df) + _check_pandas_roundtrip(df) def test_non_string_columns(self): df = pd.DataFrame({0: [1, 2, 3]}) @@ -133,14 +136,14 @@ def test_non_string_columns(self): def test_column_index_names_are_preserved(self): df = pd.DataFrame({'data': [1, 2, 3]}) df.columns.names = ['a'] - self._check_pandas_roundtrip(df, preserve_index=True) + _check_pandas_roundtrip(df, preserve_index=True) def test_multiindex_columns(self): columns = pd.MultiIndex.from_arrays([ ['one', 'two'], ['X', 'Y'] ]) df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns) - self._check_pandas_roundtrip(df, preserve_index=True) + _check_pandas_roundtrip(df, preserve_index=True) def test_multiindex_columns_with_dtypes(self): columns = pd.MultiIndex.from_arrays( @@ -151,15 +154,47 @@ def test_multiindex_columns_with_dtypes(self): names=['level_1', 'level_2'], ) df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns) - self._check_pandas_roundtrip(df, 
preserve_index=True) + _check_pandas_roundtrip(df, preserve_index=True) def test_integer_index_column(self): df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')]) - self._check_pandas_roundtrip(df, preserve_index=True) + _check_pandas_roundtrip(df, preserve_index=True) + + def test_index_metadata_field_name(self): + # test None case, and strangely named non-index columns + df = pd.DataFrame( + [(1, 'a', 3.1), (2, 'b', 2.2), (3, 'c', 1.3)], + index=pd.MultiIndex.from_arrays( + [['c', 'b', 'a'], [3, 2, 1]], + names=[None, 'foo'] + ), + columns=['a', None, '__index_level_0__'], + ) + t = pa.Table.from_pandas(df, preserve_index=True) + raw_metadata = t.schema.metadata + + js = json.loads(raw_metadata[b'pandas'].decode('utf8')) + + col1, col2, col3, idx0, foo = js['columns'] + + assert col1['name'] == 'a' + assert col1['name'] == col1['field_name'] + + assert col2['name'] is None + assert col2['field_name'] == 'None' + + assert col3['name'] == '__index_level_0__' + assert col3['name'] == col3['field_name'] + + idx0_name, foo_name = js['index_columns'] + assert idx0_name == '__index_level_0__' + assert idx0['field_name'] == idx0_name + assert idx0['name'] is None + + assert foo_name == '__index_level_1__' + assert foo['name'] == 'foo' def test_categorical_column_index(self): - # I *really* hope no one uses category dtypes for single level column - # indexes df = pd.DataFrame( [(1, 'a', 2.0), (2, 'b', 3.0), (3, 'c', 4.0)], columns=pd.Index(list('def'), dtype='category') @@ -171,15 +206,36 @@ def test_categorical_column_index(self): column_indexes, = js['column_indexes'] assert column_indexes['name'] is None assert column_indexes['pandas_type'] == 'categorical' - assert column_indexes['numpy_type'] == 'object' + assert column_indexes['numpy_type'] == 'int8' md = column_indexes['metadata'] assert md['num_categories'] == 3 assert md['ordered'] is False + def test_string_column_index(self): + df = pd.DataFrame( + [(1, 'a', 2.0), (2, 'b', 3.0), (3, 'c', 4.0)], + 
columns=pd.Index(list('def'), name='stringz') + ) + t = pa.Table.from_pandas(df, preserve_index=True) + raw_metadata = t.schema.metadata + js = json.loads(raw_metadata[b'pandas'].decode('utf8')) + + column_indexes, = js['column_indexes'] + assert column_indexes['name'] == 'stringz' + assert column_indexes['name'] == column_indexes['field_name'] + assert column_indexes['pandas_type'] == ('bytes' if PY2 else 'unicode') + assert column_indexes['numpy_type'] == 'object' + + md = column_indexes['metadata'] + + if not PY2: + assert len(md) == 1 + assert md['encoding'] == 'UTF-8' + else: + assert md is None or 'encoding' not in md + def test_datetimetz_column_index(self): - # I *really* hope no one uses category dtypes for single level column - # indexes df = pd.DataFrame( [(1, 'a', 2.0), (2, 'b', 3.0), (3, 'c', 4.0)], columns=pd.date_range( @@ -198,6 +254,23 @@ def test_datetimetz_column_index(self): md = column_indexes['metadata'] assert md['timezone'] == 'America/New_York' + def test_datetimetz_row_index(self): + df = pd.DataFrame({ + 'a': pd.date_range( + start='2017-01-01', periods=3, tz='America/New_York' + ) + }) + df = df.set_index('a') + + _check_pandas_roundtrip(df, preserve_index=True) + + def test_categorical_row_index(self): + df = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3]}) + df['a'] = df.a.astype('category') + df = df.set_index('a') + + _check_pandas_roundtrip(df, preserve_index=True) + def test_float_no_nulls(self): data = {} fields = [] @@ -211,12 +284,32 @@ def test_float_no_nulls(self): df = pd.DataFrame(data) schema = pa.schema(fields) - self._check_pandas_roundtrip(df, expected_schema=schema) + _check_pandas_roundtrip(df, expected_schema=schema) def test_zero_copy_success(self): result = pa.array([0, 1, 2]).to_pandas(zero_copy_only=True) npt.assert_array_equal(result, [0, 1, 2]) + def test_duplicate_column_names_does_not_crash(self): + df = pd.DataFrame([(1, 'a'), (2, 'b')], columns=list('aa')) + with pytest.raises(ValueError): + 
pa.Table.from_pandas(df) + + def test_dictionary_indices_boundscheck(self): + # ARROW-1658. No validation of indices leads to segfaults in pandas + indices = [[0, 1], [0, -1]] + + for inds in indices: + arr = pa.DictionaryArray.from_arrays(inds, ['a']) + batch = pa.RecordBatch.from_arrays([arr], ['foo']) + table = pa.Table.from_batches([batch, batch, batch]) + + with pytest.raises(pa.ArrowException): + arr.to_pandas() + + with pytest.raises(pa.ArrowException): + table.to_pandas() + def test_zero_copy_dictionaries(self): arr = pa.DictionaryArray.from_arrays( np.array([0, 0]), @@ -290,8 +383,8 @@ def test_float_object_nulls(self): expected = pd.DataFrame({'floats': pd.to_numeric(arr)}) field = pa.field('floats', pa.float64()) schema = pa.schema([field]) - self._check_pandas_roundtrip(df, expected=expected, - expected_schema=schema) + _check_pandas_roundtrip(df, expected=expected, + expected_schema=schema) def test_int_object_nulls(self): arr = np.array([None, 1, np.int64(3)] * 5, dtype=object) @@ -299,8 +392,8 @@ def test_int_object_nulls(self): expected = pd.DataFrame({'ints': pd.to_numeric(arr)}) field = pa.field('ints', pa.int64()) schema = pa.schema([field]) - self._check_pandas_roundtrip(df, expected=expected, - expected_schema=schema) + _check_pandas_roundtrip(df, expected=expected, + expected_schema=schema) def test_integer_no_nulls(self): data = OrderedDict() @@ -325,7 +418,7 @@ def test_integer_no_nulls(self): df = pd.DataFrame(data) schema = pa.schema(fields) - self._check_pandas_roundtrip(df, expected_schema=schema) + _check_pandas_roundtrip(df, expected_schema=schema) def test_integer_with_nulls(self): # pandas requires upcast to float dtype @@ -373,7 +466,7 @@ def test_boolean_no_nulls(self): df = pd.DataFrame({'bools': np.random.randn(num_values) > 0}) field = pa.field('bools', pa.bool_()) schema = pa.schema([field]) - self._check_pandas_roundtrip(df, expected_schema=schema) + _check_pandas_roundtrip(df, expected_schema=schema) def 
test_boolean_nulls(self): # pandas requires upcast to object dtype @@ -403,7 +496,7 @@ def test_boolean_object_nulls(self): df = pd.DataFrame({'bools': arr}) field = pa.field('bools', pa.bool_()) schema = pa.schema([field]) - self._check_pandas_roundtrip(df, expected_schema=schema) + _check_pandas_roundtrip(df, expected_schema=schema) def test_all_nulls_cast_numeric(self): arr = np.array([None], dtype=object) @@ -423,7 +516,7 @@ def test_unicode(self): field = pa.field('strings', pa.string()) schema = pa.schema([field]) - self._check_pandas_roundtrip(df, expected_schema=schema) + _check_pandas_roundtrip(df, expected_schema=schema) def test_bytes_to_binary(self): values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan] @@ -434,7 +527,7 @@ def test_bytes_to_binary(self): values2 = [b'qux', b'foo', None, b'bar', b'qux', np.nan] expected = pd.DataFrame({'strings': values2}) - self._check_pandas_roundtrip(df, expected) + _check_pandas_roundtrip(df, expected) @pytest.mark.large_memory def test_bytes_exceed_2gb(self): @@ -477,7 +570,7 @@ def test_timestamps_notimezone_no_nulls(self): }) field = pa.field('datetime64', pa.timestamp('ns')) schema = pa.schema([field]) - self._check_pandas_roundtrip( + _check_pandas_roundtrip( df, expected_schema=schema, ) @@ -492,7 +585,7 @@ def test_timestamps_notimezone_nulls(self): }) field = pa.field('datetime64', pa.timestamp('ns')) schema = pa.schema([field]) - self._check_pandas_roundtrip( + _check_pandas_roundtrip( df, expected_schema=schema, ) @@ -507,9 +600,9 @@ def test_timestamps_with_timezone(self): }) df['datetime64'] = (df['datetime64'].dt.tz_localize('US/Eastern') .to_frame()) - self._check_pandas_roundtrip(df) + _check_pandas_roundtrip(df) - self._check_series_roundtrip(df['datetime64']) + _check_series_roundtrip(df['datetime64']) # drop-in a null and ns instead of ms df = pd.DataFrame({ @@ -523,7 +616,17 @@ def test_timestamps_with_timezone(self): df['datetime64'] = (df['datetime64'].dt.tz_localize('US/Eastern') .to_frame()) 
- self._check_pandas_roundtrip(df) + _check_pandas_roundtrip(df) + + def test_datetime64_to_date32(self): + # ARROW-1718 + arr = pa.array([date(2017, 10, 23), None]) + c = pa.Column.from_array("d", arr) + s = c.to_pandas() + + arr2 = pa.Array.from_pandas(s, type=pa.date32()) + + assert arr2.equals(arr.cast('date32')) def test_date_infer(self): df = pd.DataFrame({ @@ -540,6 +643,15 @@ def test_date_infer(self): expected['date'] = pd.to_datetime(df['date']) tm.assert_frame_equal(result, expected) + def test_date_mask(self): + arr = np.array([date(2017, 4, 3), date(2017, 4, 4)], + dtype='datetime64[D]') + mask = [True, False] + result = pa.array(arr, mask=np.array(mask)) + expected = np.array([None, date(2017, 4, 4)], dtype='datetime64[D]') + expected = pa.array(expected, from_pandas=True) + assert expected.equals(result) + def test_date_objects_typed(self): arr = np.array([ date(2017, 4, 3), @@ -606,13 +718,13 @@ def test_timedelta(self): def test_column_of_arrays(self): df, schema = dataframe_with_arrays() - self._check_pandas_roundtrip(df, schema=schema, expected_schema=schema) + _check_pandas_roundtrip(df, schema=schema, expected_schema=schema) table = pa.Table.from_pandas(df, schema=schema, preserve_index=False) assert table.schema.equals(schema) for column in df.columns: field = schema.field_by_name(column) - self._check_array_roundtrip(df[column], type=field.type) + _check_array_roundtrip(df[column], type=field.type) def test_column_of_arrays_to_py(self): # Test regression in ARROW-1199 not caught in above test @@ -633,13 +745,13 @@ def test_column_of_arrays_to_py(self): def test_column_of_lists(self): df, schema = dataframe_with_lists() - self._check_pandas_roundtrip(df, schema=schema, expected_schema=schema) + _check_pandas_roundtrip(df, schema=schema, expected_schema=schema) table = pa.Table.from_pandas(df, schema=schema, preserve_index=False) assert table.schema.equals(schema) for column in df.columns: field = schema.field_by_name(column) - 
self._check_array_roundtrip(df[column], type=field.type) + _check_array_roundtrip(df[column], type=field.type) def test_column_of_lists_chunked(self): # ARROW-1357 @@ -691,7 +803,7 @@ def test_column_of_lists_strided(self): arr = df['int64'].values[::3] assert arr.strides[0] != 8 - self._check_array_roundtrip(arr) + _check_array_roundtrip(arr) def test_nested_lists_all_none(self): data = np.array([[None, None], None], dtype=object) @@ -710,8 +822,8 @@ def test_nested_lists_all_none(self): def test_threaded_conversion(self): df = _alltypes_example() - self._check_pandas_roundtrip(df, nthreads=2) - self._check_pandas_roundtrip(df, nthreads=2, as_batch=True) + _check_pandas_roundtrip(df, nthreads=2) + _check_pandas_roundtrip(df, nthreads=2, as_batch=True) def test_category(self): repeats = 5 @@ -729,7 +841,7 @@ def test_category(self): 'strings': v1 * repeats, 'strings2': v1 * repeats, 'strings3': v3 * repeats}) - self._check_pandas_roundtrip(df) + _check_pandas_roundtrip(df) arrays = [ pd.Categorical(v1 * repeats), @@ -737,7 +849,7 @@ def test_category(self): pd.Categorical(v3 * repeats) ] for values in arrays: - self._check_array_roundtrip(values) + _check_array_roundtrip(values) def test_mixed_types_fails(self): data = pd.DataFrame({'a': ['a', 1, 2.0]}) @@ -784,9 +896,9 @@ def test_strided_data_import(self): df = pd.DataFrame(case, columns=columns) col = df['a'] - self._check_pandas_roundtrip(df) - self._check_array_roundtrip(col) - self._check_array_roundtrip(col, mask=strided_mask) + _check_pandas_roundtrip(df) + _check_array_roundtrip(col) + _check_array_roundtrip(col, mask=strided_mask) def test_decimal_32_from_pandas(self): expected = pd.DataFrame({ @@ -796,7 +908,7 @@ def test_decimal_32_from_pandas(self): ] }) converted = pa.Table.from_pandas(expected, preserve_index=False) - field = pa.field('decimals', pa.decimal(7, 3)) + field = pa.field('decimals', pa.decimal128(7, 3)) schema = pa.schema([field]) assert converted.schema.equals(schema) @@ -819,7 +931,7 @@ 
def test_decimal_64_from_pandas(self): ] }) converted = pa.Table.from_pandas(expected, preserve_index=False) - field = pa.field('decimals', pa.decimal(12, 6)) + field = pa.field('decimals', pa.decimal128(12, 6)) schema = pa.schema([field]) assert converted.schema.equals(schema) @@ -842,7 +954,7 @@ def test_decimal_128_from_pandas(self): ] }) converted = pa.Table.from_pandas(expected, preserve_index=False) - field = pa.field('decimals', pa.decimal(26, 11)) + field = pa.field('decimals', pa.decimal128(26, 11)) schema = pa.schema([field]) assert converted.schema.equals(schema) @@ -946,11 +1058,6 @@ def test_arrow_time_to_pandas(self): tm.assert_frame_equal(df, expected_df) - def _check_array_from_pandas_roundtrip(self, np_array): - arr = pa.array(np_array, from_pandas=True) - result = arr.to_pandas() - npt.assert_array_equal(result, np_array) - def test_numpy_datetime64_columns(self): datetime64_ns = np.array([ '2007-07-13T01:23:34.123456789', @@ -958,7 +1065,7 @@ def test_numpy_datetime64_columns(self): '2006-01-13T12:34:56.432539784', '2010-08-13T05:46:57.437699912'], dtype='datetime64[ns]') - self._check_array_from_pandas_roundtrip(datetime64_ns) + _check_array_from_pandas_roundtrip(datetime64_ns) datetime64_us = np.array([ '2007-07-13T01:23:34.123456', @@ -966,7 +1073,7 @@ def test_numpy_datetime64_columns(self): '2006-01-13T12:34:56.432539', '2010-08-13T05:46:57.437699'], dtype='datetime64[us]') - self._check_array_from_pandas_roundtrip(datetime64_us) + _check_array_from_pandas_roundtrip(datetime64_us) datetime64_ms = np.array([ '2007-07-13T01:23:34.123', @@ -974,7 +1081,7 @@ def test_numpy_datetime64_columns(self): '2006-01-13T12:34:56.432', '2010-08-13T05:46:57.437'], dtype='datetime64[ms]') - self._check_array_from_pandas_roundtrip(datetime64_ms) + _check_array_from_pandas_roundtrip(datetime64_ms) datetime64_s = np.array([ '2007-07-13T01:23:34', @@ -982,15 +1089,16 @@ def test_numpy_datetime64_columns(self): '2006-01-13T12:34:56', '2010-08-13T05:46:57'], 
dtype='datetime64[s]') - self._check_array_from_pandas_roundtrip(datetime64_s) + _check_array_from_pandas_roundtrip(datetime64_s) + def test_numpy_datetime64_day_unit(self): datetime64_d = np.array([ '2007-07-13', None, '2006-01-15', '2010-08-19'], dtype='datetime64[D]') - self._check_array_from_pandas_roundtrip(datetime64_d) + _check_array_from_pandas_roundtrip(datetime64_d) def test_all_nones(self): def _check_series(s): @@ -1037,8 +1145,8 @@ def test_partial_schema(self): pa.field('c', pa.int64()) ]) - self._check_pandas_roundtrip(df, schema=partial_schema, - expected_schema=expected_schema) + _check_pandas_roundtrip(df, schema=partial_schema, + expected_schema=expected_schema) def test_structarray(self): ints = pa.array([None, 2, 3], type=pa.int64()) @@ -1073,7 +1181,7 @@ def test_infer_lists(self): pa.field('nested_strs', pa.list_(pa.list_(pa.string()))) ]) - self._check_pandas_roundtrip(df, expected_schema=expected_schema) + _check_pandas_roundtrip(df, expected_schema=expected_schema) def test_infer_numpy_array(self): data = OrderedDict([ @@ -1087,7 +1195,7 @@ def test_infer_numpy_array(self): pa.field('ints', pa.list_(pa.int64())) ]) - self._check_pandas_roundtrip(df, expected_schema=expected_schema) + _check_pandas_roundtrip(df, expected_schema=expected_schema) def test_metadata_with_mixed_types(self): df = pd.DataFrame({'data': [b'some_bytes', u'some_unicode']}) @@ -1142,12 +1250,12 @@ def test_table_str_to_categorical(self): def test_table_batch_empty_dataframe(self): df = pd.DataFrame({}) - self._check_pandas_roundtrip(df) - self._check_pandas_roundtrip(df, as_batch=True) + _check_pandas_roundtrip(df) + _check_pandas_roundtrip(df, as_batch=True) df2 = pd.DataFrame({}, index=[0, 1, 2]) - self._check_pandas_roundtrip(df2, preserve_index=True) - self._check_pandas_roundtrip(df2, as_batch=True, preserve_index=True) + _check_pandas_roundtrip(df2, preserve_index=True) + _check_pandas_roundtrip(df2, as_batch=True, preserve_index=True) def 
test_array_from_pandas_date_with_mask(self): m = np.array([True, False, True]) @@ -1188,6 +1296,94 @@ def test_array_from_pandas_typed_array_with_mask(self, t, data, expected): assert pa.Array.from_pandas(expected, type=pa.list_(t())).equals(result) + def test_table_column_subset_metadata(self): + # ARROW-1883 + df = pd.DataFrame({ + 'a': [1, 2, 3], + 'b': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')}) + table = pa.Table.from_pandas(df) + + table_subset = table.remove_column(1) + result = table_subset.to_pandas() + tm.assert_frame_equal(result, df[['a']]) + + table_subset2 = table_subset.remove_column(1) + result = table_subset2.to_pandas() + tm.assert_frame_equal(result, df[['a']]) + + # non-default index + for index in [ + pd.Index(['a', 'b', 'c'], name='index'), + pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')]: + df = pd.DataFrame({'a': [1, 2, 3], + 'b': [.1, .2, .3]}, index=index) + table = pa.Table.from_pandas(df) + + table_subset = table.remove_column(1) + result = table_subset.to_pandas() + tm.assert_frame_equal(result, df[['a']]) + + table_subset2 = table_subset.remove_column(1) + result = table_subset2.to_pandas() + tm.assert_frame_equal(result, df[['a']].reset_index(drop=True)) + + def test_empty_list_roundtrip(self): + empty_list_array = np.empty((3,), dtype=object) + empty_list_array.fill([]) + + df = pd.DataFrame({'a': np.array(['1', '2', '3']), + 'b': empty_list_array}) + tbl = pa.Table.from_pandas(df) + + result = tbl.to_pandas() + + tm.assert_frame_equal(result, df) + + +def _fully_loaded_dataframe_example(): + from distutils.version import LooseVersion + + index = pd.MultiIndex.from_arrays([ + pd.date_range('2000-01-01', periods=5).repeat(2), + np.tile(np.array(['foo', 'bar'], dtype=object), 5) + ]) + + c1 = pd.date_range('2000-01-01', periods=10) + data = { + 0: c1, + 1: c1.tz_localize('utc'), + 2: c1.tz_localize('US/Eastern'), + 3: c1[::2].tz_localize('utc').repeat(2).astype('category'), + 4: ['foo', 'bar'] * 5, + 
5: pd.Series(['foo', 'bar'] * 5).astype('category').values, + 6: [True, False] * 5, + 7: np.random.randn(10), + 8: np.random.randint(0, 100, size=10), + 9: pd.period_range('2013', periods=10, freq='M') + } + + if LooseVersion(pd.__version__) >= '0.21': + # There is an issue with pickling IntervalIndex in pandas 0.20.x + data[10] = pd.interval_range(start=1, freq=1, periods=10) + + return pd.DataFrame(data, index=index) + + +def _check_serialize_components_roundtrip(df): + ctx = pa.pandas_serialization_context + + components = ctx.serialize(df).to_components() + deserialized = ctx.deserialize_components(components) + + tm.assert_frame_equal(df, deserialized) + + +def test_serialize_deserialize_pandas(): + # ARROW-1784, serialize and deserialize DataFrame by decomposing + # BlockManager + df = _fully_loaded_dataframe_example() + _check_serialize_components_roundtrip(df) + def _pytime_from_micros(val): microseconds = val % 1000000 diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 9e7fc8863e759..b0764fdec1768 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -50,7 +50,7 @@ def tearDown(self): pass def test_file_not_exist(self): - with self.assertRaises(pa.ArrowIOError): + with pytest.raises(pa.ArrowIOError): FeatherReader('test_invalid_file') def _get_null_counts(self, path, columns=None): @@ -98,7 +98,7 @@ def _assert_error_on_write(self, df, exc, path=None): def f(): write_feather(df, path) - self.assertRaises(exc, f) + pytest.raises(exc, f) def test_num_rows_attr(self): df = pd.DataFrame({'foo': [1, 2, 3, 4, 5]}) @@ -466,3 +466,8 @@ def test_unsupported(self): # non-strings df = pd.DataFrame({'a': ['a', 1, 2.0]}) self._assert_error_on_write(df, ValueError) + + @pytest.mark.slow + def test_large_dataframe(self): + df = pd.DataFrame({'A': np.arange(400000000)}) + self._check_pandas_roundtrip(df) diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index 
98c465adcdb3d..e60dd35de66fe 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -182,6 +182,42 @@ def test_allocate_buffer(): assert buf.to_pybytes()[:5] == bit +def test_allocate_buffer_resizable(): + buf = pa.allocate_buffer(100, resizable=True) + assert isinstance(buf, pa.ResizableBuffer) + + buf.resize(200) + assert buf.size == 200 + + +def test_compress_decompress(): + INPUT_SIZE = 10000 + test_data = (np.random.randint(0, 255, size=INPUT_SIZE) + .astype(np.uint8) + .tostring()) + test_buf = pa.frombuffer(test_data) + + codecs = ['lz4', 'snappy', 'gzip', 'zstd', 'brotli'] + for codec in codecs: + compressed_buf = pa.compress(test_buf, codec=codec) + compressed_bytes = pa.compress(test_data, codec=codec, asbytes=True) + + assert isinstance(compressed_bytes, bytes) + + decompressed_buf = pa.decompress(compressed_buf, INPUT_SIZE, + codec=codec) + decompressed_bytes = pa.decompress(compressed_bytes, INPUT_SIZE, + codec=codec, asbytes=True) + + assert isinstance(decompressed_bytes, bytes) + + assert decompressed_buf.equals(test_buf) + assert decompressed_bytes == test_data + + with pytest.raises(ValueError): + pa.decompress(compressed_bytes, codec=codec) + + def test_buffer_memoryview_is_immutable(): val = b'some data' diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index 68c0c80aa6187..9cd5f807662d6 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -168,6 +168,29 @@ def test_stream_write_dispatch(self): assert_frame_equal(table.to_pandas(), pd.concat([df, df], ignore_index=True)) + def test_stream_write_table_batches(self): + # ARROW-504 + df = pd.DataFrame({ + 'one': np.random.randn(20), + }) + + b1 = pa.RecordBatch.from_pandas(df[:10], preserve_index=False) + b2 = pa.RecordBatch.from_pandas(df, preserve_index=False) + + table = pa.Table.from_batches([b1, b2, b1]) + + writer = self._get_writer(self.sink, table.schema) + writer.write_table(table, chunksize=15) 
+ writer.close() + + batches = list(pa.open_stream(pa.BufferReader(self._get_source()))) + + assert list(map(len, batches)) == [10, 15, 5, 10] + result_table = pa.Table.from_batches(batches) + assert_frame_equal(result_table.to_pandas(), + pd.concat([df[:10], df, df[:10]], + ignore_index=True)) + def test_simple_roundtrip(self): _, batches = self.write_batches() file_contents = pa.BufferReader(self._get_source()) @@ -432,16 +455,23 @@ def test_serialize_pandas_no_preserve_index(): def test_serialize_with_pandas_objects(): df = pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 3]) + s = pd.Series([1, 2, 3, 4]) data = { 'a_series': df['a'], - 'a_frame': df + 'a_frame': df, + 's_series': s } serialized = pa.serialize(data).to_buffer() deserialized = pa.deserialize(serialized) assert_frame_equal(deserialized['a_frame'], df) + assert_series_equal(deserialized['a_series'], df['a']) + assert deserialized['a_series'].name == 'a' + + assert_series_equal(deserialized['s_series'], s) + assert deserialized['s_series'].name is None def test_schema_batch_serialize_methods(): diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index a7fe98ce71cd1..c2bb31c9bcf51 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -16,13 +16,17 @@ # under the License. 
from os.path import join as pjoin + import datetime +import decimal import io -import os import json +import os + import pytest -from pyarrow.compat import guid, u, BytesIO +from pyarrow.compat import guid, u, BytesIO, unichar, frombytes +from pyarrow.tests import util from pyarrow.filesystem import LocalFileSystem import pyarrow as pa from .pandas_examples import dataframe_with_arrays, dataframe_with_lists @@ -114,6 +118,24 @@ def test_pandas_parquet_2_0_rountrip(tmpdir): tm.assert_frame_equal(df, df_read) +@parquet +def test_chunked_table_write(tmpdir): + # ARROW-232 + df = alltypes_sample(size=10) + + # The nanosecond->ms conversion is a nuisance, so we just avoid it here + del df['datetime'] + + batch = pa.RecordBatch.from_pandas(df) + table = pa.Table.from_batches([batch] * 3) + _check_roundtrip(table, version='2.0') + + df, _ = dataframe_with_lists() + batch = pa.RecordBatch.from_pandas(df) + table = pa.Table.from_batches([batch] * 3) + _check_roundtrip(table, version='2.0') + + @parquet def test_pandas_parquet_datetime_tz(): import pyarrow.parquet as pq @@ -469,13 +491,61 @@ def test_parquet_metadata_api(): schema[-1] # Row group - rg_meta = meta.row_group(0) - repr(rg_meta) + for rg in range(meta.num_row_groups): + rg_meta = meta.row_group(rg) + repr(rg_meta) + + for col in range(rg_meta.num_columns): + col_meta = rg_meta.column(col) + repr(col_meta) assert rg_meta.num_rows == len(df) assert rg_meta.num_columns == ncols + 1 # +1 for index +@parquet +@pytest.mark.parametrize( + 'data, dtype, min_value, max_value, null_count, num_values', + [ + ([1, 2, 2, None, 4], np.uint8, u'1', u'4', 1, 4), + ([1, 2, 2, None, 4], np.uint16, u'1', u'4', 1, 4), + ([1, 2, 2, None, 4], np.uint32, u'1', u'4', 1, 4), + ([1, 2, 2, None, 4], np.uint64, u'1', u'4', 1, 4), + ([-1, 2, 2, None, 4], np.int16, u'-1', u'4', 1, 4), + ([-1, 2, 2, None, 4], np.int32, u'-1', u'4', 1, 4), + ([-1, 2, 2, None, 4], np.int64, u'-1', u'4', 1, 4), + ([-1.1, 2.2, 2.3, None, 4.4], np.float32, 
u'-1.1', u'4.4', 1, 4), + ([-1.1, 2.2, 2.3, None, 4.4], np.float64, u'-1.1', u'4.4', 1, 4), + ( + [u'', u'b', unichar(1000), None, u'aaa'], + str, u' ', frombytes((unichar(1000) + u' ').encode('utf-8')), 1, 4 + ), + ([True, False, False, True, True], np.bool, u'0', u'1', 0, 5), + ] +) +def test_parquet_column_statistics_api( + data, + dtype, + min_value, + max_value, + null_count, + num_values): + df = pd.DataFrame({'data': data}, dtype=dtype) + + fileh = make_sample_file(df) + + meta = fileh.metadata + + rg_meta = meta.row_group(0) + col_meta = rg_meta.column(0) + + stat = col_meta.statistics + assert stat.min == min_value + assert stat.max == max_value + assert stat.null_count == null_count + assert stat.num_values == num_values + + @parquet def test_compare_schemas(): df = alltypes_sample(size=10000) @@ -972,15 +1042,18 @@ def _visit_level(base_dir, level, part_keys): if level == DEPTH - 1: # Generate example data - file_path = pjoin(level_dir, 'data.parq') + file_path = pjoin(level_dir, guid()) filtered_df = _filter_partition(df, this_part_keys) part_table = pa.Table.from_pandas(filtered_df) with fs.open(file_path, 'wb') as f: _write_table(part_table, f) assert fs.exists(file_path) + + _touch(pjoin(level_dir, '_SUCCESS')) else: _visit_level(level_dir, level + 1, this_part_keys) + _touch(pjoin(level_dir, '_SUCCESS')) _visit_level(base_dir, 0, []) @@ -1053,6 +1126,11 @@ def _filter_partition(df, part_keys): return df[predicate].drop(to_drop, axis=1) +def _touch(path): + with open(path, 'wb'): + pass + + @parquet def test_read_multiple_files(tmpdir): import pyarrow.parquet as pq @@ -1080,8 +1158,7 @@ def test_read_multiple_files(tmpdir): paths.append(path) # Write a _SUCCESS.crc file - with open(pjoin(dirpath, '_SUCCESS.crc'), 'wb') as f: - f.write(b'0') + _touch(pjoin(dirpath, '_SUCCESS.crc')) def read_multiple_files(paths, columns=None, nthreads=None, **kwargs): dataset = pq.ParquetDataset(paths, **kwargs) @@ -1171,7 +1248,8 @@ def 
test_dataset_read_pandas(tmpdir): @parquet -def test_dataset_read_pandas_common_metadata(tmpdir): +@pytest.mark.parametrize('preserve_index', [True, False]) +def test_dataset_read_pandas_common_metadata(tmpdir, preserve_index): # ARROW-1103 import pyarrow.parquet as pq @@ -1186,15 +1264,11 @@ def test_dataset_read_pandas_common_metadata(tmpdir): paths = [] for i in range(nfiles): df = _test_dataframe(size, seed=i) - df.index = pd.Index(np.arange(i * size, (i + 1) * size)) - df.index.name = 'index' + df.index = pd.Index(np.arange(i * size, (i + 1) * size), name='index') - path = pjoin(dirpath, '{0}.parquet'.format(i)) + path = pjoin(dirpath, '{:d}.parquet'.format(i)) - df_ex_index = df.reset_index(drop=True) - df_ex_index['index'] = df.index - table = pa.Table.from_pandas(df_ex_index, - preserve_index=False) + table = pa.Table.from_pandas(df, preserve_index=preserve_index) # Obliterate metadata table = table.replace_schema_metadata(None) @@ -1206,7 +1280,9 @@ def test_dataset_read_pandas_common_metadata(tmpdir): paths.append(path) # Write _metadata common file - table_for_metadata = pa.Table.from_pandas(df) + table_for_metadata = pa.Table.from_pandas( + df, preserve_index=preserve_index + ) pq.write_metadata(table_for_metadata.schema, pjoin(dirpath, '_metadata')) @@ -1214,7 +1290,7 @@ def test_dataset_read_pandas_common_metadata(tmpdir): columns = ['uint8', 'strings'] result = dataset.read_pandas(columns=columns).to_pandas() expected = pd.concat([x[columns] for x in frames]) - + expected.index.name = df.index.name if preserve_index else None tm.assert_frame_equal(result, expected) @@ -1387,3 +1463,191 @@ def test_large_table_int32_overflow(): table = pa.Table.from_arrays([parr], names=['one']) f = io.BytesIO() _write_table(table, f) + + +@parquet +def test_index_column_name_duplicate(tmpdir): + data = { + 'close': { + pd.Timestamp('2017-06-30 01:31:00'): 154.99958999999998, + pd.Timestamp('2017-06-30 01:32:00'): 154.99958999999998, + }, + 'time': { + 
pd.Timestamp('2017-06-30 01:31:00'): pd.Timestamp( + '2017-06-30 01:31:00' + ), + pd.Timestamp('2017-06-30 01:32:00'): pd.Timestamp( + '2017-06-30 01:32:00' + ), + } + } + path = str(tmpdir / 'data.parquet') + dfx = pd.DataFrame(data).set_index('time', drop=False) + tdfx = pa.Table.from_pandas(dfx) + _write_table(tdfx, path) + arrow_table = _read_table(path) + result_df = arrow_table.to_pandas() + tm.assert_frame_equal(result_df, dfx) + + +@parquet +def test_parquet_nested_convenience(tmpdir): + # ARROW-1684 + import pyarrow.parquet as pq + + df = pd.DataFrame({ + 'a': [[1, 2, 3], None, [4, 5], []], + 'b': [[1.], None, None, [6., 7.]], + }) + + path = str(tmpdir / 'nested_convenience.parquet') + + table = pa.Table.from_pandas(df, preserve_index=False) + _write_table(table, path) + + read = pq.read_table(path, columns=['a']) + tm.assert_frame_equal(read.to_pandas(), df[['a']]) + + read = pq.read_table(path, columns=['a', 'b']) + tm.assert_frame_equal(read.to_pandas(), df) + + +@parquet +def test_backwards_compatible_index_naming(): + expected_string = b"""\ +carat cut color clarity depth table price x y z + 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 + 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 + 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 + 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 + 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 + 0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48 + 0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47 + 0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53 + 0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49 + 0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39""" + expected = pd.read_csv( + io.BytesIO(expected_string), sep=r'\s{2,}', index_col=None, header=0 + ) + path = os.path.join(os.path.dirname(__file__), 'data', 'v0.7.1.parquet') + t = _read_table(path) + result = t.to_pandas() + tm.assert_frame_equal(result, expected) + + +@parquet +def test_backwards_compatible_index_multi_level_named(): + expected_string = b"""\ +carat cut 
color clarity depth table price x y z + 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 + 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 + 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 + 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 + 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 + 0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48 + 0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47 + 0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53 + 0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49 + 0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39""" + expected = pd.read_csv( + io.BytesIO(expected_string), + sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'], header=0 + ).sort_index() + path = os.path.join( + os.path.dirname(__file__), 'data', 'v0.7.1.all-named-index.parquet' + ) + t = _read_table(path) + result = t.to_pandas() + tm.assert_frame_equal(result, expected) + + +@parquet +def test_backwards_compatible_index_multi_level_some_named(): + expected_string = b"""\ +carat cut color clarity depth table price x y z + 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 + 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 + 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 + 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 + 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 + 0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48 + 0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47 + 0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53 + 0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49 + 0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39""" + expected = pd.read_csv( + io.BytesIO(expected_string), + sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'], header=0 + ).sort_index() + expected.index = expected.index.set_names(['cut', None, 'clarity']) + path = os.path.join( + os.path.dirname(__file__), 'data', 'v0.7.1.some-named-index.parquet' + ) + t = _read_table(path) + result = t.to_pandas() + tm.assert_frame_equal(result, expected) + + +@parquet +def test_backwards_compatible_column_metadata_handling(): + 
expected = pd.DataFrame( + {'a': [1, 2, 3], 'b': [.1, .2, .3], + 'c': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')}) + expected.index = pd.MultiIndex.from_arrays( + [['a', 'b', 'c'], + pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')], + names=['index', None]) + + path = os.path.join( + os.path.dirname(__file__), 'data', + 'v0.7.1.column-metadata-handling.parquet' + ) + t = _read_table(path) + result = t.to_pandas() + tm.assert_frame_equal(result, expected) + + t = _read_table(path, columns=['a']) + result = t.to_pandas() + tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True)) + + +@parquet +def test_decimal_roundtrip(tmpdir): + num_values = 10 + + columns = {} + + for precision in range(1, 39): + for scale in range(0, precision + 1): + with util.random_seed(0): + random_decimal_values = [ + util.randdecimal(precision, scale) + for _ in range(num_values) + ] + column_name = ('dec_precision_{:d}_scale_{:d}' + .format(precision, scale)) + columns[column_name] = random_decimal_values + + expected = pd.DataFrame(columns) + filename = tmpdir.join('decimals.parquet') + string_filename = str(filename) + t = pa.Table.from_pandas(expected) + _write_table(t, string_filename) + result_table = _read_table(string_filename) + result = result_table.to_pandas() + tm.assert_frame_equal(result, expected) + + +@parquet +@pytest.mark.xfail( + raises=pa.ArrowException, reason='Parquet does not support negative scale' +) +def test_decimal_roundtrip_negative_scale(tmpdir): + expected = pd.DataFrame({'decimal_num': [decimal.Decimal('1.23E4')]}) + filename = tmpdir.join('decimals.parquet') + string_filename = str(filename) + t = pa.Table.from_pandas(expected) + _write_table(t, string_filename) + result_table = _read_table(string_filename) + result = result_table.to_pandas() + tm.assert_frame_equal(result, expected) diff --git a/python/pyarrow/tests/test_plasma.py b/python/pyarrow/tests/test_plasma.py index b73d92d14fd43..9ea6476670441 100644 --- 
a/python/pyarrow/tests/test_plasma.py +++ b/python/pyarrow/tests/test_plasma.py @@ -102,7 +102,8 @@ def assert_get_object_equal(unit_test, client1, client2, object_id, def start_plasma_store(plasma_store_memory=DEFAULT_PLASMA_STORE_MEMORY, use_valgrind=False, use_profiler=False, - stdout_file=None, stderr_file=None): + stdout_file=None, stderr_file=None, + use_one_memory_mapped_file=False): """Start a plasma store process. Args: use_valgrind (bool): True if the plasma store should be started inside @@ -113,6 +114,8 @@ def start_plasma_store(plasma_store_memory=DEFAULT_PLASMA_STORE_MEMORY, no redirection should happen, then this should be None. stderr_file: A file handle opened for writing to redirect stderr to. If no redirection should happen, then this should be None. + use_one_memory_mapped_file: If True, then the store will use only a + single memory-mapped file. Return: A tuple of the name of the plasma store socket and the process ID of the plasma store process. @@ -124,6 +127,8 @@ def start_plasma_store(plasma_store_memory=DEFAULT_PLASMA_STORE_MEMORY, command = [plasma_store_executable, "-s", plasma_store_name, "-m", str(plasma_store_memory)] + if use_one_memory_mapped_file: + command += ["-f"] if use_valgrind: pid = subprocess.Popen(["valgrind", "--track-origins=yes", @@ -147,10 +152,14 @@ def start_plasma_store(plasma_store_memory=DEFAULT_PLASMA_STORE_MEMORY, class TestPlasmaClient(object): def setup_method(self, test_method): + use_one_memory_mapped_file = (test_method == + self.test_use_one_memory_mapped_file) + import pyarrow.plasma as plasma # Start Plasma store. plasma_store_name, self.p = start_plasma_store( - use_valgrind=os.getenv("PLASMA_VALGRIND") == "1") + use_valgrind=os.getenv("PLASMA_VALGRIND") == "1", + use_one_memory_mapped_file=use_one_memory_mapped_file) # Connect to Plasma. 
self.plasma_client = plasma.connect(plasma_store_name, "", 64) # For the eviction test @@ -301,12 +310,14 @@ def __init__(self, val): serialization_context = pa.SerializationContext() serialization_context.register_type(CustomType, 20*b"\x00") - object_id = self.plasma_client.put(val, None, serialization_context) + object_id = self.plasma_client.put( + val, None, serialization_context=serialization_context) with pytest.raises(pa.ArrowSerializationError): result = self.plasma_client.get(object_id) - result = self.plasma_client.get(object_id, -1, serialization_context) + result = self.plasma_client.get( + object_id, -1, serialization_context=serialization_context) assert result.val == val.val def test_store_arrow_objects(self): @@ -351,7 +362,7 @@ def test_store_pandas_dataframe(self): # Read the DataFrame. [data] = self.plasma_client.get_buffers([object_id]) reader = pa.RecordBatchStreamReader(pa.BufferReader(data)) - result = reader.get_next_batch().to_pandas() + result = reader.read_next_batch().to_pandas() pd.util.testing.assert_frame_equal(df, result) @@ -720,3 +731,27 @@ def test_subscribe_deletions(self): assert object_ids[i] == recv_objid assert -1 == recv_dsize assert -1 == recv_msize + + def test_use_one_memory_mapped_file(self): + # Fill the object store up with a large number of small objects and let + # them go out of scope. + for _ in range(100): + create_object( + self.plasma_client, + np.random.randint(1, DEFAULT_PLASMA_STORE_MEMORY // 20), 0) + # Create large objects that require the full object store size, and + # verify that they fit. + for _ in range(2): + create_object(self.plasma_client, DEFAULT_PLASMA_STORE_MEMORY, 0) + # Verify that an object that is too large does not fit. 
+ with pytest.raises(pa.lib.PlasmaStoreFull): + create_object(self.plasma_client, DEFAULT_PLASMA_STORE_MEMORY + 1, + 0) + + +@pytest.mark.plasma +def test_object_id_size(): + import pyarrow.plasma as plasma + with pytest.raises(ValueError): + plasma.ObjectID("hello") + plasma.ObjectID(20 * b"0") diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index d6b2655b7c6a0..dbca139e20570 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -319,13 +319,21 @@ def test_type_schema_pickling(): pa.field('a', 'int8'), pa.field('b', 'string') ]), + pa.union([ + pa.field('a', pa.int8()), + pa.field('b', pa.int16()) + ], pa.lib.UnionMode_SPARSE), + pa.union([ + pa.field('a', pa.int8()), + pa.field('b', pa.int16()) + ], pa.lib.UnionMode_DENSE), pa.time32('s'), pa.time64('us'), pa.date32(), pa.date64(), pa.timestamp('ms'), pa.timestamp('ns'), - pa.decimal(12, 2), + pa.decimal128(12, 2), pa.field('a', 'string', metadata={b'foo': b'bar'}) ] diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py index 7878a09228d06..6116556386b1a 100644 --- a/python/pyarrow/tests/test_serialization.py +++ b/python/pyarrow/tests/test_serialization.py @@ -21,6 +21,7 @@ from collections import namedtuple, OrderedDict, defaultdict import datetime +import os import string import sys @@ -209,13 +210,24 @@ def make_serialization_context(): serialization_context = make_serialization_context() -def serialization_roundtrip(value, f): +def serialization_roundtrip(value, f, ctx=serialization_context): f.seek(0) - pa.serialize_to(value, f, serialization_context) + pa.serialize_to(value, f, ctx) f.seek(0) - result = pa.deserialize_from(f, None, serialization_context) + result = pa.deserialize_from(f, None, ctx) assert_equal(value, result) + _check_component_roundtrip(value) + + +def _check_component_roundtrip(value): + # Test to/from components + serialized = pa.serialize(value) + components = 
serialized.to_components() + from_comp = pa.SerializedPyObject.from_components(components) + recons = from_comp.deserialize() + assert_equal(value, recons) + @pytest.yield_fixture(scope='session') def large_memory_map(tmpdir_factory, size=100*1024*1024): @@ -235,6 +247,7 @@ def test_primitive_serialization(large_memory_map): with pa.memory_map(large_memory_map, mode="r+") as mmap: for obj in PRIMITIVE_OBJECTS: serialization_roundtrip(obj, mmap) + serialization_roundtrip(obj, mmap, pa.pandas_serialization_context) def test_serialize_to_buffer(): @@ -338,7 +351,7 @@ def deserialize_dummy_class(serialized_obj): return serialized_obj pa._default_serialization_context.register_type( - DummyClass, "DummyClass", pickle=False, + DummyClass, "DummyClass", custom_serializer=serialize_dummy_class, custom_deserializer=deserialize_dummy_class) @@ -357,7 +370,7 @@ def deserialize_buffer_class(serialized_obj): return serialized_obj pa._default_serialization_context.register_type( - BufferClass, "BufferClass", pickle=False, + BufferClass, "BufferClass", custom_serializer=serialize_buffer_class, custom_deserializer=deserialize_buffer_class) @@ -416,3 +429,129 @@ class TempClass(object): with pytest.raises(pa.DeserializationCallbackError) as err: serialized_object.deserialize(deserialization_context) assert err.value.type_id == 20*b"\x00" + + +def test_fallback_to_subclasses(): + + class SubFoo(Foo): + def __init__(self): + Foo.__init__(self) + + # should be able to serialize/deserialize an instance + # if a base class has been registered + serialization_context = pa.SerializationContext() + serialization_context.register_type(Foo, "Foo") + + subfoo = SubFoo() + # should fallbact to Foo serializer + serialized_object = pa.serialize(subfoo, serialization_context) + + reconstructed_object = serialized_object.deserialize( + serialization_context + ) + assert type(reconstructed_object) == Foo + + +class Serializable(object): + pass + + +def serialize_serializable(obj): + return {"type": 
type(obj), "data": obj.__dict__} + + +def deserialize_serializable(obj): + val = obj["type"].__new__(obj["type"]) + val.__dict__.update(obj["data"]) + return val + + +class SerializableClass(Serializable): + def __init__(self): + self.value = 3 + + +def test_serialize_subclasses(): + + # This test shows how subclasses can be handled in an idiomatic way + # by having only a serializer for the base class + + # This technique should however be used with care, since pickling + # type(obj) with couldpickle will include the full class definition + # in the serialized representation. + # This means the class definition is part of every instance of the + # object, which in general is not desirable; registering all subclasses + # with register_type will result in faster and more memory + # efficient serialization. + + serialization_context.register_type( + Serializable, "Serializable", + custom_serializer=serialize_serializable, + custom_deserializer=deserialize_serializable) + + a = SerializableClass() + serialized = pa.serialize(a) + + deserialized = serialized.deserialize() + assert type(deserialized).__name__ == SerializableClass.__name__ + assert deserialized.value == 3 + + +def test_serialize_to_components_invalid_cases(): + buf = pa.frombuffer(b'hello') + + components = { + 'num_tensors': 0, + 'num_buffers': 1, + 'data': [buf] + } + + with pytest.raises(pa.ArrowException): + pa.deserialize_components(components) + + components = { + 'num_tensors': 1, + 'num_buffers': 0, + 'data': [buf, buf] + } + + with pytest.raises(pa.ArrowException): + pa.deserialize_components(components) + + +@pytest.mark.skipif(os.name == 'nt', reason="deserialize_regex not pickleable") +def test_deserialize_in_different_process(): + from multiprocessing import Process, Queue + import re + + regex = re.compile(r"\d+\.\d*") + + serialization_context = pa.SerializationContext() + serialization_context.register_type(type(regex), "Regex", pickle=True) + + serialized = pa.serialize(regex, 
serialization_context) + serialized_bytes = serialized.to_buffer().to_pybytes() + + def deserialize_regex(serialized, q): + import pyarrow as pa + q.put(pa.deserialize(serialized)) + + q = Queue() + p = Process(target=deserialize_regex, args=(serialized_bytes, q)) + p.start() + assert q.get().pattern == regex.pattern + p.join() + + +def test_deserialize_buffer_in_different_process(): + import tempfile + import subprocess + + f = tempfile.NamedTemporaryFile(delete=False) + b = pa.serialize(pa.frombuffer(b'hello')).to_buffer() + f.write(b.to_pybytes()) + f.close() + + dir_path = os.path.dirname(os.path.realpath(__file__)) + python_file = os.path.join(dir_path, 'deserialize_buffer.py') + subprocess.check_call(['python', python_file, f.name]) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 4282224660a1a..e72761d32f634 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -21,42 +21,55 @@ import pandas as pd import pytest -from pyarrow.compat import unittest import pyarrow as pa -class TestColumn(unittest.TestCase): +def test_column_basics(): + data = [ + pa.array([-10, -5, 0, 5, 10]) + ] + table = pa.Table.from_arrays(data, names=['a']) + column = table.column(0) + assert column.name == 'a' + assert column.length() == 5 + assert len(column) == 5 + assert column.shape == (5,) + assert column.to_pylist() == [-10, -5, 0, 5, 10] + + +def test_column_factory_function(): + # ARROW-1575 + arr = pa.array([0, 1, 2, 3, 4]) + arr2 = pa.array([5, 6, 7, 8]) + + col1 = pa.Column.from_array('foo', arr) + col2 = pa.Column.from_array(pa.field('foo', arr.type), arr) + + assert col1.equals(col2) - def test_basics(self): - data = [ - pa.array([-10, -5, 0, 5, 10]) - ] - table = pa.Table.from_arrays(data, names=['a']) - column = table.column(0) - assert column.name == 'a' - assert column.length() == 5 - assert len(column) == 5 - assert column.shape == (5,) - assert column.to_pylist() == [-10, -5, 0, 5, 10] + 
col3 = pa.column('foo', [arr, arr2]) + chunked_arr = pa.chunked_array([arr, arr2]) + col4 = pa.column('foo', chunked_arr) + assert col3.equals(col4) - def test_from_array(self): - arr = pa.array([0, 1, 2, 3, 4]) + col5 = pa.column('foo', arr.to_pandas()) + assert col5.equals(pa.column('foo', arr)) - col1 = pa.Column.from_array('foo', arr) - col2 = pa.Column.from_array(pa.field('foo', arr.type), arr) + # Type mismatch + with pytest.raises(ValueError): + pa.Column.from_array(pa.field('foo', pa.string()), arr) - assert col1.equals(col2) - def test_pandas(self): - data = [ - pa.array([-10, -5, 0, 5, 10]) - ] - table = pa.Table.from_arrays(data, names=['a']) - column = table.column(0) - series = column.to_pandas() - assert series.name == 'a' - assert series.shape == (5,) - assert series.iloc[0] == -10 +def test_column_to_pandas(): + data = [ + pa.array([-10, -5, 0, 5, 10]) + ] + table = pa.Table.from_arrays(data, names=['a']) + column = table.column(0) + series = column.to_pandas() + assert series.name == 'a' + assert series.shape == (5,) + assert series.iloc[0] == -10 def test_recordbatch_basics(): @@ -200,6 +213,31 @@ def test_recordbatchlist_schema_equals(): pa.Table.from_batches([batch1, batch2]) +def test_table_to_batches(): + df1 = pd.DataFrame({'a': list(range(10))}) + df2 = pd.DataFrame({'a': list(range(10, 30))}) + + batch1 = pa.RecordBatch.from_pandas(df1, preserve_index=False) + batch2 = pa.RecordBatch.from_pandas(df2, preserve_index=False) + + table = pa.Table.from_batches([batch1, batch2, batch1]) + + expected_df = pd.concat([df1, df2, df1], ignore_index=True) + + batches = table.to_batches() + assert len(batches) == 3 + + assert_frame_equal(pa.Table.from_batches(batches).to_pandas(), + expected_df) + + batches = table.to_batches(chunksize=15) + assert list(map(len, batches)) == [10, 15, 5, 10] + + assert_frame_equal(table.to_pandas(), expected_df) + assert_frame_equal(pa.Table.from_batches(batches).to_pandas(), + expected_df) + + def test_table_basics(): 
data = [ pa.array(range(5)), @@ -274,6 +312,20 @@ def test_table_remove_column(): assert t2.equals(expected) +def test_table_remove_column_empty(): + # ARROW-1865 + data = [ + pa.array(range(5)), + ] + table = pa.Table.from_arrays(data, names=['a']) + + t2 = table.remove_column(0) + assert len(t2) == len(table) + + t3 = t2.add_column(0, table[0]) + assert t3.equals(table) + + def test_concat_tables(): data = [ list(range(5)), diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index e6ff5b1560c1d..68dc499cf48b4 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +import pytest + import pyarrow as pa import pyarrow.types as types @@ -56,7 +58,7 @@ def test_is_null(): def test_is_decimal(): - assert types.is_decimal(pa.decimal(19, 4)) + assert types.is_decimal(pa.decimal128(19, 4)) assert not types.is_decimal(pa.int32()) @@ -85,16 +87,17 @@ def test_is_nested_or_struct(): assert not types.is_nested(pa.int32()) -# TODO(wesm): Union types not yet implemented in pyarrow +def test_is_union(): + assert types.is_union(pa.union([pa.field('a', pa.int32()), + pa.field('b', pa.int8()), + pa.field('c', pa.string())], + pa.lib.UnionMode_SPARSE)) + assert not types.is_union(pa.list_(pa.int32())) -# def test_is_union(): -# assert types.is_union(pa.union([pa.field('a', pa.int32()), -# pa.field('b', pa.int8()), -# pa.field('c', pa.string())])) -# assert not types.is_union(pa.list_(pa.int32())) # TODO(wesm): is_map, once implemented + def test_is_binary_string(): assert types.is_binary(pa.binary()) assert not types.is_binary(pa.string()) @@ -136,3 +139,48 @@ def test_is_temporal_date_time_timestamp(): def test_timestamp_type(): # See ARROW-1683 assert isinstance(pa.timestamp('ns'), pa.TimestampType) + + +def test_types_hashable(): + types = [ + pa.null(), + pa.int32(), + pa.time32('s'), + pa.time64('us'), + pa.date32(), 
+ pa.timestamp('us'), + pa.string(), + pa.binary(), + pa.binary(10), + pa.list_(pa.int32()), + pa.struct([pa.field('a', pa.int32()), + pa.field('b', pa.int8()), + pa.field('c', pa.string())]) + ] + + in_dict = {} + for i, type_ in enumerate(types): + assert hash(type_) == hash(type_) + in_dict[type_] = i + assert in_dict[type_] == i + + +@pytest.mark.parametrize('t,check_func', [ + (pa.date32(), types.is_date32), + (pa.date64(), types.is_date64), + (pa.time32('s'), types.is_time32), + (pa.time64('ns'), types.is_time64), + (pa.int8(), types.is_int8), + (pa.int16(), types.is_int16), + (pa.int32(), types.is_int32), + (pa.int64(), types.is_int64), + (pa.uint8(), types.is_uint8), + (pa.uint16(), types.is_uint16), + (pa.uint32(), types.is_uint32), + (pa.uint64(), types.is_uint64), + (pa.float16(), types.is_float16), + (pa.float32(), types.is_float32), + (pa.float64(), types.is_float64) +]) +def test_exact_primitive_types(t, check_func): + assert check_func(t) diff --git a/python/pyarrow/tests/util.py b/python/pyarrow/tests/util.py new file mode 100644 index 0000000000000..a3ba9000c2f62 --- /dev/null +++ b/python/pyarrow/tests/util.py @@ -0,0 +1,93 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +""" +Utility functions for testing +""" + +import decimal +import random +import contextlib + + +def randsign(): + """Randomly choose either 1 or -1. + + Returns + ------- + sign : int + """ + return random.choice((-1, 1)) + + +@contextlib.contextmanager +def random_seed(seed): + """Set the random seed inside of a context manager. + + Parameters + ---------- + seed : int + The seed to set + + Notes + ----- + This function is useful when you want to set a random seed but not affect + the random state of other functions using the random module. + """ + original_state = random.getstate() + random.seed(seed) + try: + yield + finally: + random.setstate(original_state) + + +def randdecimal(precision, scale): + """Generate a random decimal value with specified precision and scale. + + Parameters + ---------- + precision : int + The maximum number of digits to generate. Must be an integer between 1 + and 38 inclusive. + scale : int + The maximum number of digits following the decimal point. Must be an + integer greater than or equal to 0. + + Returns + ------- + decimal_value : decimal.Decimal + A random decimal.Decimal object with the specifed precision and scale. 
+ """ + assert 1 <= precision <= 38, 'precision must be between 1 and 38 inclusive' + if scale < 0: + raise ValueError( + 'randdecimal does not yet support generating decimals with ' + 'negative scale' + ) + max_whole_value = 10 ** (precision - scale) - 1 + whole = random.randint(-max_whole_value, max_whole_value) + + if not scale: + return decimal.Decimal(whole) + + max_fractional_value = 10 ** scale - 1 + fractional = random.randint(0, max_fractional_value) + + return decimal.Decimal( + '{}.{}'.format(whole, str(fractional).rjust(scale, '0')) + ) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index c9a490960ec38..1563b57855cd9 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -69,6 +69,9 @@ cdef class DataType: ) return frombytes(self.type.ToString()) + def __hash__(self): + return hash(str(self)) + def __reduce__(self): return self.__class__, (), self.__getstate__() @@ -186,7 +189,32 @@ cdef class UnionType(DataType): cdef void init(self, const shared_ptr[CDataType]& type): DataType.init(self, type) + self.child_types = [ + pyarrow_wrap_data_type(type.get().child(i).get().type()) + for i in range(self.num_children)] + property num_children: + + def __get__(self): + return self.type.num_children() + + property mode: + + def __get__(self): + cdef CUnionType* type = self.sp_type.get() + return type.mode() + + def __getitem__(self, i): + return self.child_types[i] + + def __getstate__(self): + children = [pyarrow_wrap_field(self.type.child(i)) + for i in range(self.num_children)] + return children, self.mode + + def __setstate__(self, state): + cdef DataType reconstituted = union(*state) + self.init(reconstituted.sp_type) cdef class TimestampType(DataType): @@ -262,28 +290,28 @@ cdef class FixedSizeBinaryType(DataType): return self.fixed_size_binary_type.byte_width() -cdef class DecimalType(FixedSizeBinaryType): +cdef class Decimal128Type(FixedSizeBinaryType): cdef void init(self, const shared_ptr[CDataType]& type): 
DataType.init(self, type) - self.decimal_type = type.get() + self.decimal128_type = type.get() def __getstate__(self): return (self.precision, self.scale) def __setstate__(self, state): - cdef DataType reconstituted = decimal(*state) + cdef DataType reconstituted = decimal128(*state) self.init(reconstituted.sp_type) property precision: def __get__(self): - return self.decimal_type.precision() + return self.decimal128_type.precision() property scale: def __get__(self): - return self.decimal_type.scale() + return self.decimal128_type.scale() cdef class Field: @@ -925,9 +953,9 @@ def float64(): return primitive_type(_Type_DOUBLE) -cpdef DataType decimal(int precision, int scale=0): +cpdef DataType decimal128(int precision, int scale=0): """ - Create decimal type with precision and scale + Create decimal type with precision and scale and 128bit width Parameters ---------- @@ -936,10 +964,10 @@ cpdef DataType decimal(int precision, int scale=0): Returns ------- - decimal_type : DecimalType + decimal_type : Decimal128Type """ cdef shared_ptr[CDataType] decimal_type - decimal_type.reset(new CDecimalType(precision, scale)) + decimal_type.reset(new CDecimal128Type(precision, scale)) return pyarrow_wrap_data_type(decimal_type) @@ -1056,6 +1084,31 @@ def struct(fields): return pyarrow_wrap_data_type(struct_type) +def union(children_fields, mode): + """ + Create UnionType from children fields. 
+ """ + cdef: + Field child_field + vector[shared_ptr[CField]] c_fields + vector[uint8_t] type_codes + shared_ptr[CDataType] union_type + int i + + for i, child_field in enumerate(children_fields): + type_codes.push_back(i) + c_fields.push_back(child_field.sp_field) + + if mode == UnionMode_SPARSE: + union_type.reset(new CUnionType(c_fields, type_codes, + _UnionMode_SPARSE)) + else: + union_type.reset(new CUnionType(c_fields, type_codes, + _UnionMode_DENSE)) + + return pyarrow_wrap_data_type(union_type) + + cdef dict _type_aliases = { 'null': null, 'i1': int8, diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index 440d7eb09daa9..24557148a6363 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -32,6 +32,13 @@ _NESTED_TYPES = {lib.Type_LIST, lib.Type_STRUCT, lib.Type_UNION, lib.Type_MAP} +def is_null(t): + """ + Return True if value is an instance of a null type + """ + return t.id == lib.Type_NA + + def is_boolean(t): """ Return True if value is an instance of a boolean type @@ -41,25 +48,81 @@ def is_boolean(t): def is_integer(t): """ - Return True if value is an instance of an integer type + Return True if value is an instance of any integer type """ return t.id in _INTEGER_TYPES def is_signed_integer(t): """ - Return True if value is an instance of a signed integer type + Return True if value is an instance of any signed integer type """ return t.id in _SIGNED_INTEGER_TYPES def is_unsigned_integer(t): """ - Return True if value is an instance of an unsigned integer type + Return True if value is an instance of any unsigned integer type """ return t.id in _UNSIGNED_INTEGER_TYPES +def is_int8(t): + """ + Return True if value is an instance of an int8 type + """ + return t.id == lib.Type_INT8 + + +def is_int16(t): + """ + Return True if value is an instance of an int16 type + """ + return t.id == lib.Type_INT16 + + +def is_int32(t): + """ + Return True if value is an instance of an int32 type + """ + return t.id == lib.Type_INT32 + + 
+def is_int64(t): + """ + Return True if value is an instance of an int64 type + """ + return t.id == lib.Type_INT64 + + +def is_uint8(t): + """ + Return True if value is an instance of an uint8 type + """ + return t.id == lib.Type_UINT8 + + +def is_uint16(t): + """ + Return True if value is an instance of an uint16 type + """ + return t.id == lib.Type_UINT16 + + +def is_uint32(t): + """ + Return True if value is an instance of an uint32 type + """ + return t.id == lib.Type_UINT32 + + +def is_uint64(t): + """ + Return True if value is an instance of an uint64 type + """ + return t.id == lib.Type_UINT64 + + def is_floating(t): """ Return True if value is an instance of a floating point numeric type @@ -67,6 +130,27 @@ def is_floating(t): return t.id in _FLOATING_TYPES +def is_float16(t): + """ + Return True if value is an instance of an float16 (half-precision) type + """ + return t.id == lib.Type_HALF_FLOAT + + +def is_float32(t): + """ + Return True if value is an instance of an float32 (single precision) type + """ + return t.id == lib.Type_FLOAT + + +def is_float64(t): + """ + Return True if value is an instance of an float64 (double precision) type + """ + return t.id == lib.Type_DOUBLE + + def is_list(t): """ Return True if value is an instance of a list type @@ -117,11 +201,18 @@ def is_time(t): return t.id in _TIME_TYPES -def is_null(t): +def is_time32(t): """ - Return True if value is an instance of a null type + Return True if value is an instance of a time32 type """ - return t.id == lib.Type_NA + return t.id == lib.Type_TIME32 + + +def is_time64(t): + """ + Return True if value is an instance of a time64 type + """ + return t.id == lib.Type_TIME64 def is_binary(t): @@ -159,6 +250,20 @@ def is_date(t): return t.id in _DATE_TYPES +def is_date32(t): + """ + Return True if value is an instance of a date32 (days) type + """ + return t.id == lib.Type_DATE32 + + +def is_date64(t): + """ + Return True if value is an instance of a date64 (milliseconds) type + """ 
+ return t.id == lib.Type_DATE64 + + def is_map(t): """ Return True if value is an instance of a map logical type diff --git a/python/requirements.txt b/python/requirements.txt index d2e28a7747ba8..8d0c33afa69a6 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,4 +1,4 @@ pytest -cloudpickle +cloudpickle>=0.4.0 numpy>=1.10.0 six diff --git a/python/setup.py b/python/setup.py index ccab8fb6581f2..3d3831dc048c6 100644 --- a/python/setup.py +++ b/python/setup.py @@ -82,7 +82,9 @@ def run(self): user_options = ([('extra-cmake-args=', None, 'extra arguments for CMake'), ('build-type=', None, 'build type (debug or release)'), ('with-parquet', None, 'build the Parquet extension'), + ('with-static-parquet', None, 'link parquet statically'), ('with-plasma', None, 'build the Plasma extension'), + ('with-orc', None, 'build the ORC extension'), ('bundle-arrow-cpp', None, 'bundle the Arrow C++ libraries')] + _build_ext.user_options) @@ -102,14 +104,21 @@ def initialize_options(self): self.with_parquet = strtobool( os.environ.get('PYARROW_WITH_PARQUET', '0')) + self.with_static_parquet = strtobool( + os.environ.get('PYARROW_WITH_STATIC_PARQUET', '0')) + self.with_static_boost = strtobool( + os.environ.get('PYARROW_WITH_STATIC_BOOST', '1')) self.with_plasma = strtobool( os.environ.get('PYARROW_WITH_PLASMA', '0')) + self.with_orc = strtobool( + os.environ.get('PYARROW_WITH_ORC', '0')) self.bundle_arrow_cpp = strtobool( os.environ.get('PYARROW_BUNDLE_ARROW_CPP', '0')) CYTHON_MODULE_NAMES = [ 'lib', '_parquet', + '_orc', 'plasma'] def _run_cmake(self): @@ -144,10 +153,17 @@ def _run_cmake(self): if self.with_parquet: cmake_options.append('-DPYARROW_BUILD_PARQUET=on') + if self.with_static_parquet: + cmake_options.append('-DPYARROW_PARQUET_USE_SHARED=off') + if not self.with_static_boost: + cmake_options.append('-DPYARROW_BOOST_USE_SHARED=on') if self.with_plasma: cmake_options.append('-DPYARROW_BUILD_PLASMA=on') + if self.with_orc: + 
cmake_options.append('-DPYARROW_BUILD_ORC=on') + if len(self.cmake_cxxflags) > 0: cmake_options.append('-DPYARROW_CXXFLAGS="{0}"' .format(self.cmake_cxxflags)) @@ -225,7 +241,7 @@ def _run_cmake(self): move_shared_libs(build_prefix, build_lib, "arrow_python") if self.with_plasma: move_shared_libs(build_prefix, build_lib, "plasma") - if self.with_parquet: + if self.with_parquet and not self.with_static_parquet: move_shared_libs(build_prefix, build_lib, "parquet") print('Bundling includes: ' + pjoin(build_prefix, 'include')) @@ -275,6 +291,8 @@ def _failure_permitted(self, name): return True if name == 'plasma' and not self.with_plasma: return True + if name == '_orc' and not self.with_orc: + return True return False def _get_inplace_dir(self): @@ -390,6 +408,16 @@ def has_ext_modules(foo): install_requires.append('futures') +def parse_version(root): + from setuptools_scm import version_from_scm + import setuptools_scm.git + describe = setuptools_scm.git.DEFAULT_DESCRIBE + " --match 'apache-arrow-[0-9]*'" + version = setuptools_scm.git.parse(root, describe) + if not version: + return version_from_scm(root) + else: + return version + setup( name="pyarrow", packages=['pyarrow', 'pyarrow.tests'], @@ -408,7 +436,7 @@ def has_ext_modules(foo): 'plasma_store = pyarrow:_plasma_store_entry_point' ] }, - use_scm_version={"root": "..", "relative_to": __file__}, + use_scm_version={"root": "..", "relative_to": __file__, "parse": parse_version}, setup_requires=['setuptools_scm', 'cython >= 0.23'], install_requires=install_requires, tests_require=['pytest'], diff --git a/python/testing/README.md b/python/testing/README.md index 07970a231b54b..0ebeec4a1c3e7 100644 --- a/python/testing/README.md +++ b/python/testing/README.md @@ -23,4 +23,26 @@ ```shell ./test_hdfs.sh -``` \ No newline at end of file +``` + +## Testing Dask integration + +Initial integration testing with Dask has been Dockerized. 
+To invoke the test run the following command in the `arrow` +root-directory: + +```shell +bash dev/dask_integration.sh +``` + +This script will create a `dask` directory on the same level as +`arrow`. It will clone the Dask project from Github into `dask` +and do a Python `--user` install. The Docker code will use the parent +directory of `arrow` as `$HOME` and that's where Python will +install `dask` into a `.local` directory. + +The output of the Docker session will contain the results of tests +of the Dask dataframe followed by the single integration test that +now exists for Arrow. That test creates a set of `csv`-files and then +does parallel reading of `csv`-files into a Dask dataframe. The code +for this test resides here in the `dask_test` directory. diff --git a/python/testing/dask_tests/test_dask_integration.py b/python/testing/dask_tests/test_dask_integration.py new file mode 100644 index 0000000000000..842c45f57d1f7 --- /dev/null +++ b/python/testing/dask_tests/test_dask_integration.py @@ -0,0 +1,58 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import pytest + +from datetime import date, timedelta +import csv +from random import randint + +import pyarrow as pa + +dd = pytest.importorskip('dask.dataframe') + + +def make_datafiles(tmpdir, prefix='data', num_files=20): + rowcount = 5000 + fieldnames = ['date', 'temperature', 'dewpoint'] + start_date = date(1900, 1, 1) + for i in range(num_files): + filename = '{0}/{1}-{2}.csv'.format(tmpdir, prefix, i) + with open(filename, 'w') as outcsv: + writer = csv.DictWriter(outcsv, fieldnames) + writer.writeheader() + the_date = start_date + for _ in range(rowcount): + temperature = randint(-10, 35) + dewpoint = temperature - randint(0, 10) + writer.writerow({'date': the_date, 'temperature': temperature, + 'dewpoint': dewpoint}) + the_date += timedelta(days=1) + + +def test_dask_file_read(tmpdir): + prefix = 'data' + make_datafiles(tmpdir, prefix) + # Read all datafiles in parallel + datafiles = '{0}/{1}-*.csv'.format(tmpdir, prefix) + dask_df = dd.read_csv(datafiles) + # Convert Dask dataframe to Arrow table + table = pa.Table.from_pandas(dask_df.compute()) + # Second column (1) is temperature + dask_temp = int(1000 * dask_df['temperature'].mean().compute()) + arrow_temp = int(1000 * table[1].to_pandas().mean()) + assert dask_temp == arrow_temp diff --git a/site/_config.yml b/site/_config.yml index a6c5575d0680f..cbcf97dd3b0d1 100644 --- a/site/_config.yml +++ b/site/_config.yml @@ -30,6 +30,7 @@ exclude: - Gemfile - Gemfile.lock - _docs/format/* + - ruby - asf-site - scripts - README.md diff --git a/site/_data/contributors.yml b/site/_data/contributors.yml index 2a8d95b77b239..19ca53ad48599 100644 --- a/site/_data/contributors.yml +++ b/site/_data/contributors.yml @@ -32,4 +32,7 @@ - name: Michael König apacheId: MathMagique githubId: MathMagique +- name: Siddharth Teotia + apacheId: siddharthteotia + githubId: siddharthteotia # End contributors.yml diff --git a/site/_includes/header.html b/site/_includes/header.html index 6c0ec30f39ca7..03b3c8750cdb8 100644 
--- a/site/_includes/header.html +++ b/site/_includes/header.html @@ -27,6 +27,7 @@
  • Mailing List
  • Slack Channel
  • Committers
  • +
  • Powered By