From 65db0da80b6a1fb6887b7ac1df24e2423d41dfb9 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 22 Mar 2016 18:45:13 -0700 Subject: [PATCH] ARROW-67: C++ metadata flatbuffer serialization and data movement to memory maps Several things here: * Add Google flatbuffers dependency * Flatbuffers IDL draft in collaboration with @jacques-n and @stevenmphillips * Add Schema wrapper in Cython * arrow::Schema conversion to/from flatbuffer representation * Remove unneeded physical layout types from type.h * Refactor ListType to be a nested type with a single child * Implement shared memory round-trip for numeric row batches * mmap-based shared memory interface and MemorySource abstract API Quite a bit of judicious code cleaning and consolidation as part of this. For example, List types are now internally equivalent to a nested type with 1 named child field (versus a struct, which can have any number of child fields). Associated JIRAs: ARROW-48, ARROW-57, ARROW-58 Author: Wes McKinney Closes #28 from wesm/cpp-ipc-draft and squashes the following commits: 0cef7ea [Wes McKinney] Add NullArray type now that Array is virtual, fix pyarrow build 5e841f7 [Wes McKinney] Create explicit PrimitiveArray subclasses to avoid unwanted template instantiation 6fa6319 [Wes McKinney] ARROW-28: Draft C++ shared memory IPC workflow and related refactoring / scaffolding / cleaning. 
--- ci/travis_before_script_cpp.sh | 9 +- ci/travis_script_cpp.sh | 6 +- cpp/CMakeLists.txt | 96 ++++-- cpp/cmake_modules/FindFlatbuffers.cmake | 95 ++++++ cpp/setup_build_env.sh | 5 +- cpp/src/arrow/CMakeLists.txt | 8 + cpp/src/arrow/api.h | 11 +- cpp/src/arrow/array-test.cc | 14 +- cpp/src/arrow/array.cc | 26 +- cpp/src/arrow/array.h | 27 +- cpp/src/arrow/builder.h | 2 +- cpp/src/arrow/{table => }/column-benchmark.cc | 5 +- cpp/src/arrow/{table => }/column-test.cc | 10 +- cpp/src/arrow/{table => }/column.cc | 4 +- cpp/src/arrow/{table => }/column.h | 13 +- cpp/src/arrow/ipc/.gitignore | 1 + cpp/src/arrow/ipc/CMakeLists.txt | 51 +++ cpp/src/arrow/ipc/adapter.cc | 305 +++++++++++++++++ cpp/src/arrow/ipc/adapter.h | 86 +++++ cpp/src/arrow/ipc/ipc-adapter-test.cc | 112 +++++++ cpp/src/arrow/ipc/ipc-memory-test.cc | 82 +++++ cpp/src/arrow/ipc/ipc-metadata-test.cc | 99 ++++++ cpp/src/arrow/ipc/memory.cc | 162 +++++++++ cpp/src/arrow/ipc/memory.h | 131 ++++++++ cpp/src/arrow/ipc/metadata-internal.cc | 317 ++++++++++++++++++ cpp/src/arrow/ipc/metadata-internal.h | 69 ++++ cpp/src/arrow/ipc/metadata.cc | 238 +++++++++++++ cpp/src/arrow/ipc/metadata.h | 146 ++++++++ .../{types/floating.h => ipc/test-common.h} | 43 ++- cpp/src/arrow/{table => }/schema-test.cc | 48 ++- cpp/src/arrow/{table => }/schema.cc | 11 +- cpp/src/arrow/{table => }/schema.h | 8 +- cpp/src/arrow/{table => }/table-test.cc | 18 +- cpp/src/arrow/{table => }/table.cc | 35 +- cpp/src/arrow/{table => }/table.h | 58 +++- cpp/src/arrow/table/test-common.h | 54 --- cpp/src/arrow/test-util.h | 68 +++- cpp/src/arrow/type.cc | 24 +- cpp/src/arrow/type.h | 177 ++++------ cpp/src/arrow/types/CMakeLists.txt | 2 - cpp/src/arrow/types/boolean.h | 2 +- cpp/src/arrow/types/collection.h | 2 +- cpp/src/arrow/types/construct.cc | 53 +-- cpp/src/arrow/types/construct.h | 11 +- cpp/src/arrow/types/datetime.h | 16 +- cpp/src/arrow/types/floating.cc | 22 -- cpp/src/arrow/types/integer.cc | 22 -- cpp/src/arrow/types/integer.h | 
57 ---- cpp/src/arrow/types/json.cc | 1 - cpp/src/arrow/types/json.h | 4 +- cpp/src/arrow/types/list-test.cc | 28 +- cpp/src/arrow/types/list.cc | 29 ++ cpp/src/arrow/types/list.h | 28 +- cpp/src/arrow/types/primitive-test.cc | 41 +-- cpp/src/arrow/types/primitive.cc | 16 +- cpp/src/arrow/types/primitive.h | 102 +++--- cpp/src/arrow/types/string-test.cc | 54 ++- cpp/src/arrow/types/string.h | 55 +-- cpp/src/arrow/types/struct-test.cc | 15 +- cpp/src/arrow/types/test-common.h | 5 +- cpp/src/arrow/types/union.h | 18 +- cpp/src/arrow/util/bit-util-test.cc | 4 +- cpp/src/arrow/util/bit-util.h | 1 - cpp/src/arrow/util/buffer-test.cc | 3 +- cpp/src/arrow/util/buffer.cc | 2 +- cpp/src/arrow/util/memory-pool-test.cc | 7 +- cpp/src/arrow/util/memory-pool.cc | 6 +- cpp/src/arrow/util/memory-pool.h | 2 +- cpp/src/arrow/util/status.cc | 3 + cpp/src/arrow/util/status.h | 6 + cpp/src/arrow/util/test_main.cc | 2 +- cpp/thirdparty/build_thirdparty.sh | 9 + cpp/thirdparty/download_thirdparty.sh | 5 + cpp/thirdparty/versions.sh | 4 + format/Message.fbs | 183 ++++++++++ python/pyarrow/__init__.py | 4 +- python/pyarrow/array.pxd | 2 +- python/pyarrow/array.pyx | 47 ++- python/pyarrow/includes/libarrow.pxd | 107 ++++-- python/pyarrow/includes/pyarrow.pxd | 5 +- python/pyarrow/scalar.pyx | 24 +- python/pyarrow/schema.pxd | 6 +- python/pyarrow/schema.pyx | 155 ++++++--- python/pyarrow/tests/test_schema.py | 28 +- .../pyarrow/tests/test_table.py | 39 ++- python/src/pyarrow/adapters/builtin.cc | 20 +- python/src/pyarrow/helpers.cc | 15 +- python/src/pyarrow/helpers.h | 5 +- 88 files changed, 3113 insertions(+), 838 deletions(-) create mode 100644 cpp/cmake_modules/FindFlatbuffers.cmake rename cpp/src/arrow/{table => }/column-benchmark.cc (94%) rename cpp/src/arrow/{table => }/column-test.cc (93%) rename cpp/src/arrow/{table => }/column.cc (96%) rename cpp/src/arrow/{table => }/column.h (93%) create mode 100644 cpp/src/arrow/ipc/.gitignore create mode 100644 cpp/src/arrow/ipc/CMakeLists.txt 
create mode 100644 cpp/src/arrow/ipc/adapter.cc create mode 100644 cpp/src/arrow/ipc/adapter.h create mode 100644 cpp/src/arrow/ipc/ipc-adapter-test.cc create mode 100644 cpp/src/arrow/ipc/ipc-memory-test.cc create mode 100644 cpp/src/arrow/ipc/ipc-metadata-test.cc create mode 100644 cpp/src/arrow/ipc/memory.cc create mode 100644 cpp/src/arrow/ipc/memory.h create mode 100644 cpp/src/arrow/ipc/metadata-internal.cc create mode 100644 cpp/src/arrow/ipc/metadata-internal.h create mode 100644 cpp/src/arrow/ipc/metadata.cc create mode 100644 cpp/src/arrow/ipc/metadata.h rename cpp/src/arrow/{types/floating.h => ipc/test-common.h} (59%) rename cpp/src/arrow/{table => }/schema-test.cc (72%) rename cpp/src/arrow/{table => }/schema.cc (88%) rename cpp/src/arrow/{table => }/schema.h (91%) rename cpp/src/arrow/{table => }/table-test.cc (92%) rename cpp/src/arrow/{table => }/table.cc (69%) rename cpp/src/arrow/{table => }/table.h (55%) delete mode 100644 cpp/src/arrow/table/test-common.h delete mode 100644 cpp/src/arrow/types/floating.cc delete mode 100644 cpp/src/arrow/types/integer.cc delete mode 100644 cpp/src/arrow/types/integer.h create mode 100644 format/Message.fbs rename cpp/src/arrow/table/CMakeLists.txt => python/pyarrow/tests/test_table.py (58%) diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 49dcc395fbc83..193c76feba1d7 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -19,7 +19,14 @@ echo $GTEST_HOME : ${ARROW_CPP_INSTALL=$TRAVIS_BUILD_DIR/cpp-install} -cmake -DARROW_BUILD_BENCHMARKS=ON -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL -DCMAKE_CXX_FLAGS="-Werror" $CPP_DIR +CMAKE_COMMON_FLAGS="-DARROW_BUILD_BENCHMARKS=ON -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL" + +if [ $TRAVIS_OS_NAME == "linux" ]; then + cmake -DARROW_TEST_MEMCHECK=on $CMAKE_COMMON_FLAGS -DCMAKE_CXX_FLAGS="-Werror" $CPP_DIR +else + cmake $CMAKE_COMMON_FLAGS -DCMAKE_CXX_FLAGS="-Werror" $CPP_DIR +fi + make -j4 make install diff --git 
a/ci/travis_script_cpp.sh b/ci/travis_script_cpp.sh index d96b98f8d37f5..997bdf35e83d2 100755 --- a/ci/travis_script_cpp.sh +++ b/ci/travis_script_cpp.sh @@ -8,10 +8,6 @@ pushd $CPP_BUILD_DIR make lint -if [ $TRAVIS_OS_NAME == "linux" ]; then - valgrind --tool=memcheck --leak-check=yes --error-exitcode=1 ctest -L unittest -else - ctest -L unittest -fi +ctest -L unittest popd diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 268c1d11e1e8e..6d701079b482c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -51,7 +51,9 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") option(ARROW_PARQUET "Build the Parquet adapter and link to libparquet" OFF) - + option(ARROW_TEST_MEMCHECK + "Run the test suite using valgrind --tool=memcheck" + OFF) option(ARROW_BUILD_TESTS "Build the Arrow googletest unit tests" ON) @@ -60,6 +62,10 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Build the Arrow micro benchmarks" OFF) + option(ARROW_IPC + "Build the Arrow IPC extensions" + ON) + endif() if(NOT ARROW_BUILD_TESTS) @@ -260,17 +266,17 @@ set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}") include_directories(src) ############################################################ -# Benchmarking +# Benchmarking ############################################################ # Add a new micro benchmark, with or without an executable that should be built. # If benchmarks are enabled then they will be run along side unit tests with ctest. -# 'make runbenchmark' and 'make unittest' to build/run only benchmark or unittests, +# 'make runbenchmark' and 'make unittest' to build/run only benchmark or unittests, # respectively. # # REL_BENCHMARK_NAME is the name of the benchmark app. It may be a single component # (e.g. monotime-benchmark) or contain additional components (e.g. # net/net_util-benchmark). Either way, the last component must be a globally -# unique name. +# unique name. 
# The benchmark will registered as unit test with ctest with a label # of 'benchmark'. @@ -281,7 +287,7 @@ function(ADD_ARROW_BENCHMARK REL_BENCHMARK_NAME) return() endif() get_filename_component(BENCHMARK_NAME ${REL_BENCHMARK_NAME} NAME_WE) - + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${REL_BENCHMARK_NAME}.cc) # This benchmark has a corresponding .cc file, set it up as an executable. set(BENCHMARK_PATH "${EXECUTABLE_OUTPUT_PATH}/${BENCHMARK_NAME}") @@ -294,7 +300,7 @@ function(ADD_ARROW_BENCHMARK REL_BENCHMARK_NAME) set(BENCHMARK_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${REL_BENCHMARK_NAME}) set(NO_COLOR "") endif() - + add_test(${BENCHMARK_NAME} ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} benchmark ${BENCHMARK_PATH} ${NO_COLOR}) set_tests_properties(${BENCHMARK_NAME} PROPERTIES LABELS "benchmark") @@ -345,9 +351,18 @@ function(ADD_ARROW_TEST REL_TEST_NAME) set(TEST_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${REL_TEST_NAME}) endif() - add_test(${TEST_NAME} - ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} test ${TEST_PATH}) + if (ARROW_TEST_MEMCHECK) + SET_PROPERTY(TARGET ${TEST_NAME} + APPEND_STRING PROPERTY + COMPILE_FLAGS " -DARROW_VALGRIND") + add_test(${TEST_NAME} + valgrind --tool=memcheck --leak-check=full --error-exitcode=1 ${TEST_PATH}) + else() + add_test(${TEST_NAME} + ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} test ${TEST_PATH}) + endif() set_tests_properties(${TEST_NAME} PROPERTIES LABELS "unittest") + if(ARGN) set_tests_properties(${TEST_NAME} PROPERTIES ${ARGN}) endif() @@ -403,7 +418,7 @@ if ("$ENV{GTEST_HOME}" STREQUAL "") set(GTest_HOME ${THIRDPARTY_DIR}/googletest-release-1.7.0) endif() -## Google Benchmark +## Google Benchmark if ("$ENV{GBENCHMARK_HOME}" STREQUAL "") set(GBENCHMARK_HOME ${THIRDPARTY_DIR}/installed) endif() @@ -487,24 +502,10 @@ if (UNIX) add_custom_target(lint ${BUILD_SUPPORT_DIR}/cpplint.py --verbose=2 --linelength=90 - --filter=-whitespace/comments,-readability/todo,-build/header_guard,-build/c++11 - `find 
${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h`) + --filter=-whitespace/comments,-readability/todo,-build/header_guard,-build/c++11,-runtime/references + `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`) endif (UNIX) -#---------------------------------------------------------------------- -# Parquet adapter - -if(ARROW_PARQUET) - find_package(Parquet REQUIRED) - include_directories(SYSTEM ${PARQUET_INCLUDE_DIR}) - ADD_THIRDPARTY_LIB(parquet - STATIC_LIB ${PARQUET_STATIC_LIB} - SHARED_LIB ${PARQUET_SHARED_LIB}) - - add_subdirectory(src/arrow/parquet) - list(APPEND LINK_LIBS arrow_parquet parquet) -endif() - ############################################################ # Subdirectories ############################################################ @@ -515,15 +516,18 @@ set(LIBARROW_LINK_LIBS set(ARROW_SRCS src/arrow/array.cc src/arrow/builder.cc + src/arrow/column.cc + src/arrow/schema.cc + src/arrow/table.cc src/arrow/type.cc - src/arrow/table/column.cc - src/arrow/table/schema.cc - src/arrow/table/table.cc + # IPC / Shared memory library; to be turned into an optional component + src/arrow/ipc/adapter.cc + src/arrow/ipc/memory.cc + src/arrow/ipc/metadata.cc + src/arrow/ipc/metadata-internal.cc src/arrow/types/construct.cc - src/arrow/types/floating.cc - src/arrow/types/integer.cc src/arrow/types/json.cc src/arrow/types/list.cc src/arrow/types/primitive.cc @@ -559,9 +563,39 @@ target_link_libraries(arrow ${LIBARROW_LINK_LIBS}) add_subdirectory(src/arrow) add_subdirectory(src/arrow/util) -add_subdirectory(src/arrow/table) add_subdirectory(src/arrow/types) install(TARGETS arrow LIBRARY DESTINATION lib ARCHIVE DESTINATION lib) + +#---------------------------------------------------------------------- +# Parquet adapter library + +if(ARROW_PARQUET) + find_package(Parquet REQUIRED) + include_directories(SYSTEM ${PARQUET_INCLUDE_DIR}) + ADD_THIRDPARTY_LIB(parquet + STATIC_LIB ${PARQUET_STATIC_LIB} + SHARED_LIB 
${PARQUET_SHARED_LIB}) + + add_subdirectory(src/arrow/parquet) + list(APPEND LINK_LIBS arrow_parquet parquet) +endif() + +#---------------------------------------------------------------------- +# IPC library + +## Flatbuffers +if(ARROW_IPC) + find_package(Flatbuffers REQUIRED) + message(STATUS "Flatbuffers include dir: ${FLATBUFFERS_INCLUDE_DIR}") + message(STATUS "Flatbuffers static library: ${FLATBUFFERS_STATIC_LIB}") + message(STATUS "Flatbuffers compiler: ${FLATBUFFERS_COMPILER}") + include_directories(SYSTEM ${FLATBUFFERS_INCLUDE_DIR}) + add_library(flatbuffers STATIC IMPORTED) + set_target_properties(flatbuffers PROPERTIES + IMPORTED_LOCATION ${FLATBUFFERS_STATIC_LIB}) + + add_subdirectory(src/arrow/ipc) +endif() diff --git a/cpp/cmake_modules/FindFlatbuffers.cmake b/cpp/cmake_modules/FindFlatbuffers.cmake new file mode 100644 index 0000000000000..ee472d1c8995f --- /dev/null +++ b/cpp/cmake_modules/FindFlatbuffers.cmake @@ -0,0 +1,95 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Tries to find Flatbuffers headers and libraries. +# +# Usage of this module as follows: +# +# find_package(Flatbuffers) +# +# Variables used by this module, they can change the default behaviour and need +# to be set before calling find_package: +# +# Flatbuffers_HOME - +# When set, this path is inspected instead of standard library locations as +# the root of the Flatbuffers installation. The environment variable +# FLATBUFFERS_HOME overrides this variable. 
+# +# This module defines +# FLATBUFFERS_INCLUDE_DIR, directory containing headers +# FLATBUFFERS_LIBS, directory containing flatbuffers libraries +# FLATBUFFERS_STATIC_LIB, path to libflatbuffers.a +# FLATBUFFERS_FOUND, whether flatbuffers has been found + +if( NOT "$ENV{FLATBUFFERS_HOME}" STREQUAL "") + file( TO_CMAKE_PATH "$ENV{FLATBUFFERS_HOME}" _native_path ) + list( APPEND _flatbuffers_roots ${_native_path} ) +elseif ( Flatbuffers_HOME ) + list( APPEND _flatbuffers_roots ${Flatbuffers_HOME} ) +endif() + +# Try the parameterized roots, if they exist +if ( _flatbuffers_roots ) + find_path( FLATBUFFERS_INCLUDE_DIR NAMES flatbuffers/flatbuffers.h + PATHS ${_flatbuffers_roots} NO_DEFAULT_PATH + PATH_SUFFIXES "include" ) + find_library( FLATBUFFERS_LIBRARIES NAMES flatbuffers + PATHS ${_flatbuffers_roots} NO_DEFAULT_PATH + PATH_SUFFIXES "lib" ) +else () + find_path( FLATBUFFERS_INCLUDE_DIR NAMES flatbuffers/flatbuffers.h ) + find_library( FLATBUFFERS_LIBRARIES NAMES flatbuffers ) +endif () + +find_program(FLATBUFFERS_COMPILER flatc + $ENV{FLATBUFFERS_HOME}/bin + /usr/local/bin + /usr/bin + NO_DEFAULT_PATH +) + +if (FLATBUFFERS_INCLUDE_DIR AND FLATBUFFERS_LIBRARIES) + set(FLATBUFFERS_FOUND TRUE) + get_filename_component( FLATBUFFERS_LIBS ${FLATBUFFERS_LIBRARIES} PATH ) + set(FLATBUFFERS_LIB_NAME libflatbuffers) + set(FLATBUFFERS_STATIC_LIB ${FLATBUFFERS_LIBS}/${FLATBUFFERS_LIB_NAME}.a) +else () + set(FLATBUFFERS_FOUND FALSE) +endif () + +if (FLATBUFFERS_FOUND) + if (NOT Flatbuffers_FIND_QUIETLY) + message(STATUS "Found the Flatbuffers library: ${FLATBUFFERS_LIBRARIES}") + endif () +else () + if (NOT Flatbuffers_FIND_QUIETLY) + set(FLATBUFFERS_ERR_MSG "Could not find the Flatbuffers library. 
Looked in ") + if ( _flatbuffers_roots ) + set(FLATBUFFERS_ERR_MSG "${FLATBUFFERS_ERR_MSG} in ${_flatbuffers_roots}.") + else () + set(FLATBUFFERS_ERR_MSG "${FLATBUFFERS_ERR_MSG} system search paths.") + endif () + if (Flatbuffers_FIND_REQUIRED) + message(FATAL_ERROR "${FLATBUFFERS_ERR_MSG}") + else (Flatbuffers_FIND_REQUIRED) + message(STATUS "${FLATBUFFERS_ERR_MSG}") + endif (Flatbuffers_FIND_REQUIRED) + endif () +endif () + +mark_as_advanced( + FLATBUFFERS_INCLUDE_DIR + FLATBUFFERS_LIBS + FLATBUFFERS_STATIC_LIB + FLATBUFFERS_COMPILER +) diff --git a/cpp/setup_build_env.sh b/cpp/setup_build_env.sh index 04688e7d59400..6520dbd43f705 100755 --- a/cpp/setup_build_env.sh +++ b/cpp/setup_build_env.sh @@ -2,11 +2,12 @@ SOURCE_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) -./thirdparty/download_thirdparty.sh || { echo "download_thirdparty.sh failed" ; return; } -./thirdparty/build_thirdparty.sh || { echo "build_thirdparty.sh failed" ; return; } +./thirdparty/download_thirdparty.sh || { echo "download_thirdparty.sh failed" ; return; } +./thirdparty/build_thirdparty.sh || { echo "build_thirdparty.sh failed" ; return; } source thirdparty/versions.sh export GTEST_HOME=$SOURCE_DIR/thirdparty/$GTEST_BASEDIR export GBENCHMARK_HOME=$SOURCE_DIR/thirdparty/installed +export FLATBUFFERS_HOME=$SOURCE_DIR/thirdparty/installed echo "Build env initialized" diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 73e6a9b22c94a..2d42edcfbd499 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -19,7 +19,10 @@ install(FILES api.h array.h + column.h builder.h + schema.h + table.h type.h DESTINATION include/arrow) @@ -30,3 +33,8 @@ install(FILES set(ARROW_TEST_LINK_LIBS ${ARROW_MIN_TEST_LIBS}) ADD_ARROW_TEST(array-test) +ADD_ARROW_TEST(column-test) +ADD_ARROW_TEST(schema-test) +ADD_ARROW_TEST(table-test) + +ADD_ARROW_BENCHMARK(column-benchmark) diff --git a/cpp/src/arrow/api.h b/cpp/src/arrow/api.h index c73d4b386cf54..7be7f88c22eb6 
100644 --- a/cpp/src/arrow/api.h +++ b/cpp/src/arrow/api.h @@ -22,20 +22,19 @@ #include "arrow/array.h" #include "arrow/builder.h" +#include "arrow/column.h" +#include "arrow/schema.h" +#include "arrow/table.h" #include "arrow/type.h" -#include "arrow/table/column.h" -#include "arrow/table/schema.h" -#include "arrow/table/table.h" - #include "arrow/types/boolean.h" #include "arrow/types/construct.h" -#include "arrow/types/floating.h" -#include "arrow/types/integer.h" #include "arrow/types/list.h" +#include "arrow/types/primitive.h" #include "arrow/types/string.h" #include "arrow/types/struct.h" +#include "arrow/util/buffer.h" #include "arrow/util/memory-pool.h" #include "arrow/util/status.h" diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index df827aaa113aa..eded5941e892e 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -15,30 +15,26 @@ // specific language governing permissions and limitations // under the License. -#include - #include #include #include #include +#include "gtest/gtest.h" + #include "arrow/array.h" #include "arrow/test-util.h" #include "arrow/type.h" -#include "arrow/types/integer.h" #include "arrow/types/primitive.h" #include "arrow/util/buffer.h" #include "arrow/util/memory-pool.h" -#include "arrow/util/status.h" namespace arrow { -static TypePtr int32 = TypePtr(new Int32Type()); - class TestArray : public ::testing::Test { public: void SetUp() { - pool_ = GetDefaultMemoryPool(); + pool_ = default_memory_pool(); } protected: @@ -75,10 +71,10 @@ TEST_F(TestArray, TestIsNull) { if (x > 0) ++null_count; } - std::shared_ptr null_buf = bytes_to_null_buffer(nulls.data(), + std::shared_ptr null_buf = test::bytes_to_null_buffer(nulls.data(), nulls.size()); std::unique_ptr arr; - arr.reset(new Array(int32, nulls.size(), null_count, null_buf)); + arr.reset(new Int32Array(nulls.size(), nullptr, null_count, null_buf)); ASSERT_EQ(null_count, arr->null_count()); ASSERT_EQ(5, null_buf->size()); diff --git 
a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index ee4ef66d11e26..5a5bc1069db13 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -28,11 +28,6 @@ namespace arrow { Array::Array(const TypePtr& type, int32_t length, int32_t null_count, const std::shared_ptr& nulls) { - Init(type, length, null_count, nulls); -} - -void Array::Init(const TypePtr& type, int32_t length, int32_t null_count, - const std::shared_ptr& nulls) { type_ = type; length_ = length; null_count_ = null_count; @@ -42,4 +37,25 @@ void Array::Init(const TypePtr& type, int32_t length, int32_t null_count, } } +bool Array::EqualsExact(const Array& other) const { + if (this == &other) return true; + if (length_ != other.length_ || null_count_ != other.null_count_ || + type_enum() != other.type_enum()) { + return false; + } + if (null_count_ > 0) { + return nulls_->Equals(*other.nulls_, util::bytes_for_bits(length_)); + } else { + return true; + } +} + +bool NullArray::Equals(const std::shared_ptr& arr) const { + if (this == arr.get()) return true; + if (Type::NA != arr->type_enum()) { + return false; + } + return arr->length() == length_; +} + } // namespace arrow diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 85e853e2ae5e2..65fc0aaf583e9 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -40,20 +40,11 @@ class Buffer; // explicitly increment its reference count class Array { public: - Array() : - null_count_(0), - length_(0), - nulls_(nullptr), - null_bits_(nullptr) {} - Array(const TypePtr& type, int32_t length, int32_t null_count = 0, const std::shared_ptr& nulls = nullptr); virtual ~Array() {} - void Init(const TypePtr& type, int32_t length, int32_t null_count, - const std::shared_ptr& nulls); - // Determine if a slot is null. For inner loops. 
Does *not* boundscheck bool IsNull(int i) const { return null_count_ > 0 && util::get_bit(null_bits_, i); @@ -63,12 +54,15 @@ class Array { int32_t null_count() const { return null_count_;} const std::shared_ptr& type() const { return type_;} - LogicalType::type logical_type() const { return type_->type;} + Type::type type_enum() const { return type_->type;} const std::shared_ptr& nulls() const { return nulls_; } + bool EqualsExact(const Array& arr) const; + virtual bool Equals(const std::shared_ptr& arr) const = 0; + protected: TypePtr type_; int32_t null_count_; @@ -78,9 +72,22 @@ class Array { const uint8_t* null_bits_; private: + Array() {} DISALLOW_COPY_AND_ASSIGN(Array); }; +// Degenerate null type Array +class NullArray : public Array { + public: + NullArray(const std::shared_ptr& type, int32_t length) : + Array(type, length, length, nullptr) {} + + explicit NullArray(int32_t length) : + NullArray(std::make_shared(), length) {} + + bool Equals(const std::shared_ptr& arr) const override; +}; + typedef std::shared_ptr ArrayPtr; } // namespace arrow diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 8cc689c3e81ee..d5d1fdf95af17 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -99,7 +99,7 @@ class ArrayBuilder { int32_t capacity_; // Child value array builders. 
These are owned by this class - std::vector > children_; + std::vector> children_; private: DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); diff --git a/cpp/src/arrow/table/column-benchmark.cc b/cpp/src/arrow/column-benchmark.cc similarity index 94% rename from cpp/src/arrow/table/column-benchmark.cc rename to cpp/src/arrow/column-benchmark.cc index c01146d7b096f..69ee52c3e09ea 100644 --- a/cpp/src/arrow/table/column-benchmark.cc +++ b/cpp/src/arrow/column-benchmark.cc @@ -19,15 +19,14 @@ #include "benchmark/benchmark.h" #include "arrow/test-util.h" -#include "arrow/table/test-common.h" -#include "arrow/types/integer.h" +#include "arrow/types/primitive.h" #include "arrow/util/memory-pool.h" namespace arrow { namespace { template std::shared_ptr MakePrimitive(int32_t length, int32_t null_count = 0) { - auto pool = GetDefaultMemoryPool(); + auto pool = default_memory_pool(); auto data = std::make_shared(pool); auto nulls = std::make_shared(pool); data->Resize(length * sizeof(typename ArrayType::value_type)); diff --git a/cpp/src/arrow/table/column-test.cc b/cpp/src/arrow/column-test.cc similarity index 93% rename from cpp/src/arrow/table/column-test.cc rename to cpp/src/arrow/column-test.cc index 3b102e48c87cf..0630785630e81 100644 --- a/cpp/src/arrow/table/column-test.cc +++ b/cpp/src/arrow/column-test.cc @@ -15,18 +15,18 @@ // specific language governing permissions and limitations // under the License. 
-#include #include #include #include #include -#include "arrow/table/column.h" -#include "arrow/table/schema.h" -#include "arrow/table/test-common.h" +#include "gtest/gtest.h" + +#include "arrow/column.h" +#include "arrow/schema.h" #include "arrow/test-util.h" #include "arrow/type.h" -#include "arrow/types/integer.h" +#include "arrow/types/primitive.h" using std::shared_ptr; using std::vector; diff --git a/cpp/src/arrow/table/column.cc b/cpp/src/arrow/column.cc similarity index 96% rename from cpp/src/arrow/table/column.cc rename to cpp/src/arrow/column.cc index 573e650875944..46acf8df2ff57 100644 --- a/cpp/src/arrow/table/column.cc +++ b/cpp/src/arrow/column.cc @@ -15,11 +15,12 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/table/column.h" +#include "arrow/column.h" #include #include +#include "arrow/array.h" #include "arrow/type.h" #include "arrow/util/status.h" @@ -28,6 +29,7 @@ namespace arrow { ChunkedArray::ChunkedArray(const ArrayVector& chunks) : chunks_(chunks) { length_ = 0; + null_count_ = 0; for (const std::shared_ptr& chunk : chunks) { length_ += chunk->length(); null_count_ += chunk->null_count(); diff --git a/cpp/src/arrow/table/column.h b/cpp/src/arrow/column.h similarity index 93% rename from cpp/src/arrow/table/column.h rename to cpp/src/arrow/column.h index dfc7516e26aac..1ad97b20863c8 100644 --- a/cpp/src/arrow/table/column.h +++ b/cpp/src/arrow/column.h @@ -15,19 +15,22 @@ // specific language governing permissions and limitations // under the License. 
-#ifndef ARROW_TABLE_COLUMN_H -#define ARROW_TABLE_COLUMN_H +#ifndef ARROW_COLUMN_H +#define ARROW_COLUMN_H +#include #include #include #include -#include "arrow/array.h" #include "arrow/type.h" namespace arrow { -typedef std::vector > ArrayVector; +class Array; +class Status; + +typedef std::vector> ArrayVector; // A data structure managing a list of primitive Arrow arrays logically as one // large array @@ -102,4 +105,4 @@ class Column { } // namespace arrow -#endif // ARROW_TABLE_COLUMN_H +#endif // ARROW_COLUMN_H diff --git a/cpp/src/arrow/ipc/.gitignore b/cpp/src/arrow/ipc/.gitignore new file mode 100644 index 0000000000000..8150d7efe33c4 --- /dev/null +++ b/cpp/src/arrow/ipc/.gitignore @@ -0,0 +1 @@ +*_generated.h \ No newline at end of file diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt new file mode 100644 index 0000000000000..383684f42f952 --- /dev/null +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +####################################### +# arrow_ipc +####################################### + +# Headers: top level +install(FILES + adapter.h + metadata.h + memory.h + DESTINATION include/arrow/ipc) + +ADD_ARROW_TEST(ipc-adapter-test) +ADD_ARROW_TEST(ipc-memory-test) +ADD_ARROW_TEST(ipc-metadata-test) + +# make clean will delete the generated file +set_source_files_properties(Metadata_generated.h PROPERTIES GENERATED TRUE) + +set(OUTPUT_DIR ${CMAKE_SOURCE_DIR}/src/arrow/ipc) +set(FBS_OUTPUT_FILES "${OUTPUT_DIR}/Message_generated.h") + +set(FBS_SRC ${CMAKE_SOURCE_DIR}/../format/Message.fbs) +get_filename_component(ABS_FBS_SRC ${FBS_SRC} ABSOLUTE) + +add_custom_command( + OUTPUT ${FBS_OUTPUT_FILES} + COMMAND ${FLATBUFFERS_COMPILER} -c -o ${OUTPUT_DIR} ${ABS_FBS_SRC} + DEPENDS ${ABS_FBS_SRC} + COMMENT "Running flatc compiler on ${FBS_SRC}" + VERBATIM +) + +add_custom_target(metadata_fbs DEPENDS ${FBS_OUTPUT_FILES}) +add_dependencies(arrow metadata_fbs) diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc new file mode 100644 index 0000000000000..7cdb965f5f45c --- /dev/null +++ b/cpp/src/arrow/ipc/adapter.cc @@ -0,0 +1,305 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#include "arrow/ipc/adapter.h"
+
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/ipc/memory.h"
+#include "arrow/ipc/Message_generated.h"
+#include "arrow/ipc/metadata.h"
+#include "arrow/ipc/metadata-internal.h"
+#include "arrow/schema.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+#include "arrow/types/construct.h"
+#include "arrow/types/primitive.h"
+#include "arrow/util/buffer.h"
+#include "arrow/util/status.h"
+
+namespace arrow {
+
+namespace flatbuf = apache::arrow::flatbuf;
+
+namespace ipc {
+
+// Returns true for the types that are written as a null buffer followed by a
+// single data buffer (see VisitArray)
+static bool IsPrimitive(const DataType* type) {
+  switch (type->type) {
+    // NA is null type or "no type", considered primitive for now
+    case Type::NA:
+    case Type::BOOL:
+    case Type::UINT8:
+    case Type::INT8:
+    case Type::UINT16:
+    case Type::INT16:
+    case Type::UINT32:
+    case Type::INT32:
+    case Type::UINT64:
+    case Type::INT64:
+    case Type::FLOAT:
+    case Type::DOUBLE:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// ----------------------------------------------------------------------
+// Row batch write path
+
+// Append the field node (length, null count) and the buffers of one array to
+// the flattened output vectors. Every primitive array contributes exactly two
+// buffers: the null buffer (or a zero-length placeholder) and the data buffer.
+//
+// Returns NotImplemented for any array type that cannot be serialized yet, so
+// that an unsupported column is never silently dropped from the payload.
+Status VisitArray(const Array* arr, std::vector<flatbuf::FieldNode>* field_nodes,
+    std::vector<std::shared_ptr<Buffer>>* buffers) {
+  if (IsPrimitive(arr->type().get())) {
+    const PrimitiveArray* prim_arr = static_cast<const PrimitiveArray*>(arr);
+
+    field_nodes->push_back(
+        flatbuf::FieldNode(prim_arr->length(), prim_arr->null_count()));
+
+    if (prim_arr->null_count() > 0) {
+      buffers->push_back(prim_arr->nulls());
+    } else {
+      // Push a dummy zero-length buffer, not to be copied
+      buffers->push_back(std::make_shared<Buffer>(nullptr, 0));
+    }
+    buffers->push_back(prim_arr->data());
+  } else if (arr->type_enum() == Type::LIST) {
+    // TODO(wesm)
+    return Status::NotImplemented("List type");
+  } else if (arr->type_enum() == Type::STRUCT) {
+    // TODO(wesm)
+    return Status::NotImplemented("Struct type");
+  } else {
+    return Status::NotImplemented("Unsupported array type");
+  }
+
+  return Status::OK();
+}
+
+// Flattens a RowBatch into field nodes and buffers (AssemblePayload) and then
+// writes the buffers followed by the flatbuffer data header (Write)
+class RowBatchWriter {
+ public:
+  explicit RowBatchWriter(const RowBatch* batch) :
+      batch_(batch) {}
+
+  // Collect the field nodes and buffers of every column
+  Status AssemblePayload() {
+    // Perform depth-first traversal of the row-batch
+    for (int i = 0; i < batch_->num_columns(); ++i) {
+      const Array* arr = batch_->column(i).get();
+      RETURN_NOT_OK(VisitArray(arr, &field_nodes_, &buffers_));
+    }
+    return Status::OK();
+  }
+
+  // Write the assembled buffers starting at `position` and the data header
+  // right after them. The absolute location of the data header is returned in
+  // *data_header_offset.
+  Status Write(MemorySource* dst, int64_t position, int64_t* data_header_offset) {
+    // Write out all the buffers contiguously and compute the total size of the
+    // memory payload
+    int64_t offset = 0;
+    for (const std::shared_ptr<Buffer>& buffer : buffers_) {
+      const int64_t size = buffer->size();
+
+      // TODO(wesm): We currently have no notion of shared memory page id's,
+      // but we've included it in the metadata IDL for when we have it in the
+      // future. Use page=0 for now
+      //
+      // Note that page ids are a bespoke notion for Arrow and not a feature we
+      // are using from any OS-level shared memory. The thought is that systems
+      // may (in the future) associate integer page id's with physical memory
+      // pages (according to whatever is the desired shared memory mechanism)
+      buffer_meta_.push_back(flatbuf::Buffer(0, position + offset, size));
+
+      // Zero-length buffers get a metadata entry but occupy no space
+      if (size > 0) {
+        RETURN_NOT_OK(dst->Write(position + offset, buffer->data(), size));
+        offset += size;
+      }
+    }
+
+    // Now that we have computed the locations of all of the buffers in shared
+    // memory, the data header can be converted to a flatbuffer and written out
+    //
+    // Note: The memory written here is prefixed by the size of the flatbuffer
+    // itself as an int32_t. On reading from a MemorySource, you will have to
+    // determine the data header size then request a buffer such that you can
+    // construct the flatbuffer data accessor object (see arrow::ipc::Message)
+    std::shared_ptr<Buffer> data_header;
+    RETURN_NOT_OK(WriteDataHeader(batch_->num_rows(), offset,
+            field_nodes_, buffer_meta_, &data_header));
+
+    // Write the data header at the end
+    RETURN_NOT_OK(dst->Write(position + offset, data_header->data(),
+            data_header->size()));
+
+    *data_header_offset = position + offset;
+    return Status::OK();
+  }
+
+  // This must be called after invoking AssemblePayload
+  int64_t DataHeaderSize() {
+    // TODO(wesm): In case it is needed, compute the upper bound for the size
+    // of the buffer containing the flatbuffer data header.
+    return 0;
+  }
+
+  // Total footprint of buffers. This must be called after invoking
+  // AssemblePayload
+  int64_t TotalBytes() {
+    int64_t total = 0;
+    for (const std::shared_ptr<Buffer>& buffer : buffers_) {
+      total += buffer->size();
+    }
+    return total;
+  }
+
+ private:
+  const RowBatch* batch_;
+
+  std::vector<flatbuf::FieldNode> field_nodes_;
+  std::vector<flatbuf::Buffer> buffer_meta_;
+  std::vector<std::shared_ptr<Buffer>> buffers_;
+};
+
+Status WriteRowBatch(MemorySource* dst, const RowBatch* batch, int64_t position,
+    int64_t* header_offset) {
+  RowBatchWriter serializer(batch);
+  RETURN_NOT_OK(serializer.AssemblePayload());
+  return serializer.Write(dst, position, header_offset);
+}
+// ----------------------------------------------------------------------
+// Row batch read path
+
+// Number of bytes requested up front when reading the data header
+static constexpr int64_t INIT_METADATA_SIZE = 4096;
+
+class RowBatchReader::Impl {
+ public:
+  Impl(MemorySource* source, const std::shared_ptr<RecordBatchMessage>& metadata) :
+      source_(source),
+      metadata_(metadata),
+      field_index_(0),
+      buffer_index_(0) {
+    num_buffers_ = metadata->num_buffers();
+    num_flattened_fields_ = metadata->num_fields();
+  }
+
+  // Rebuild one array per schema field from the flattened metadata
+  Status AssembleBatch(const std::shared_ptr<Schema>& schema,
+      std::shared_ptr<RowBatch>* out) {
+    std::vector<std::shared_ptr<Array>> arrays(schema->num_fields());
+
+    // The field_index and buffer_index are incremented in NextArray based on
+    // how much of the batch is "consumed" (through nested data reconstruction,
+    // for example)
+    field_index_ = 0;
+    buffer_index_ = 0;
+    for (int i = 0; i < schema->num_fields(); ++i) {
+      const Field* field = schema->field(i).get();
+      RETURN_NOT_OK(NextArray(field, &arrays[i]));
+    }
+
+    *out = std::make_shared<RowBatch>(schema, metadata_->length(),
+        arrays);
+    return Status::OK();
+  }
+
+ private:
+  // Traverse the flattened record batch metadata and reassemble the
+  // corresponding array containers
+  Status NextArray(const Field* field, std::shared_ptr<Array>* out) {
+    const std::shared_ptr<DataType>& type = field->type;
+
+    // pop off a field
+    if (field_index_ >= num_flattened_fields_) {
+      return Status::Invalid("Ran out of field metadata, likely malformed");
+    }
+
+    // This only contains the length and null count, which we need to figure
+    // out what to do with the buffers. For example, if null_count == 0, then
+    // we can skip that buffer without reading from shared memory
+    FieldMetadata field_meta = metadata_->field(field_index_++);
+
+    if (IsPrimitive(type.get())) {
+      // The writer always emits two buffer entries per primitive array, so
+      // both slots must be consumed here even when nothing is read
+      std::shared_ptr<Buffer> nulls;
+      std::shared_ptr<Buffer> data;
+      if (field_meta.null_count == 0) {
+        nulls = nullptr;
+        ++buffer_index_;
+      } else {
+        RETURN_NOT_OK(GetBuffer(buffer_index_++, &nulls));
+      }
+      if (field_meta.length > 0) {
+        RETURN_NOT_OK(GetBuffer(buffer_index_++, &data));
+      } else {
+        data.reset(new Buffer(nullptr, 0));
+        // Skip the (empty) data buffer slot so that the next field reads its
+        // own buffers
+        ++buffer_index_;
+      }
+      return MakePrimitiveArray(type, field_meta.length, data,
+          field_meta.null_count, nulls, out);
+    } else {
+      return Status::NotImplemented("Non-primitive types not complete yet");
+    }
+  }
+
+  // Fetch the buffer described by the buffer_index-th metadata entry
+  Status GetBuffer(int buffer_index, std::shared_ptr<Buffer>* out) {
+    if (buffer_index < 0 || buffer_index >= num_buffers_) {
+      return Status::Invalid("Ran out of buffer metadata, likely malformed");
+    }
+    BufferMetadata metadata = metadata_->buffer(buffer_index);
+    return source_->ReadAt(metadata.offset, metadata.length, out);
+  }
+
+  MemorySource* source_;
+  std::shared_ptr<RecordBatchMessage> metadata_;
+
+  int field_index_;
+  int buffer_index_;
+  int num_buffers_;
+  int num_flattened_fields_;
+};
+
+// Read the size-prefixed data header found at `position` and wrap it in a
+// reader. The buffer handed to Message::Open always starts at the int32_t size
+// prefix, whether or not a second read was needed.
+Status RowBatchReader::Open(MemorySource* source, int64_t position,
+    std::shared_ptr<RowBatchReader>* out) {
+  std::shared_ptr<Buffer> metadata;
+  RETURN_NOT_OK(source->ReadAt(position, INIT_METADATA_SIZE, &metadata));
+
+  // A memory source may hand back fewer bytes than requested
+  const int64_t prefix_size = static_cast<int64_t>(sizeof(int32_t));
+  if (metadata->size() < prefix_size) {
+    return Status::Invalid("Not enough data to read the metadata size");
+  }
+
+  // memcpy instead of a pointer cast: position need not be 4-byte aligned
+  int32_t metadata_size = 0;
+  std::memcpy(&metadata_size, metadata->data(), sizeof(int32_t));
+  if (metadata_size < 0) {
+    return Status::Invalid("Metadata size is negative, likely malformed");
+  }
+
+  // We may not need to call source->ReadAt again
+  const int64_t prefixed_size = prefix_size + metadata_size;
+  if (prefixed_size > metadata->size()) {
+    // We don't have enough data, read the indicated metadata size together
+    // with its size prefix
+    RETURN_NOT_OK(source->ReadAt(position, prefixed_size, &metadata));
+    if (metadata->size() < prefixed_size) {
+      return Status::Invalid("Metadata extends past the end of the source");
+    }
+  }
+
+  // TODO(wesm): buffer slicing here would be better in case ReadAt returns
+  // allocated memory
+
+  std::shared_ptr<Message> message;
+  RETURN_NOT_OK(Message::Open(metadata, &message));
+
+  if (message->type() != Message::RECORD_BATCH) {
+    return Status::Invalid("Metadata message is not a record batch");
+  }
+
+  std::shared_ptr<RecordBatchMessage> batch_meta = message->GetRecordBatch();
+
+  std::shared_ptr<RowBatchReader> result(new RowBatchReader());
+  result->impl_.reset(new Impl(source, batch_meta));
+  *out = result;
+
+  return Status::OK();
+}
+
+Status RowBatchReader::GetRowBatch(const std::shared_ptr<Schema>& schema,
+    std::shared_ptr<RowBatch>* out) {
+  return impl_->AssembleBatch(schema, out);
+}
+
+
+} // namespace ipc
+} // namespace arrow
diff --git a/cpp/src/arrow/ipc/adapter.h b/cpp/src/arrow/ipc/adapter.h
new file mode 100644
index 0000000000000..26dea6d04b889
--- /dev/null
+++ b/cpp/src/arrow/ipc/adapter.h
@@ -0,0 +1,86 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Public API for writing and accessing (with zero copy, if possible) Arrow
+// data in shared memory
+
+#ifndef ARROW_IPC_ADAPTER_H
+#define ARROW_IPC_ADAPTER_H
+
+#include <cstdint>
+#include <memory>
+
+namespace arrow {
+
+class Array;
+class RowBatch;
+class Schema;
+class Status;
+
+namespace ipc {
+
+class MemorySource;
+class RecordBatchMessage;
+
+// ----------------------------------------------------------------------
+// Write path
+
+// Write the RowBatch (collection of equal-length Arrow arrays) to the memory
+// source at the indicated position
+//
+// First, each of the memory buffers is written out end-to-end, starting at
+// the indicated position.
+//
+// Then, this function writes the batch metadata as a flatbuffer (see
+// format/Message.fbs -- the RecordBatch message type) like so:
+//
+// [int32_t: size of the flatbuffer] [flatbuffer bytes]
+//
+// Finally, the memory offset to the start of the metadata / data header is
+// returned in an out-variable
+Status WriteRowBatch(MemorySource* dst, const RowBatch* batch, int64_t position,
+    int64_t* header_offset);
+
+// int64_t GetRowBatchMetadata(const RowBatch* batch);
+
+// Compute the precise number of bytes needed in a contiguous memory segment to
+// write the row batch. This involves generating the complete serialized
+// Flatbuffers metadata.
+int64_t GetRowBatchSize(const RowBatch* batch);
+
+// ----------------------------------------------------------------------
+// "Read" path; does not copy data if the MemorySource does not
+
+class RowBatchReader {
+ public:
+  // Read the data header found at the given position of the memory source
+  // (the header offset produced by WriteRowBatch) and create a reader for it
+  static Status Open(MemorySource* source, int64_t position,
+      std::shared_ptr<RowBatchReader>* out);
+
+  // Reassemble the row batch. A Schema is required to be able to construct the
+  // right array containers
+  Status GetRowBatch(const std::shared_ptr<Schema>& schema,
+      std::shared_ptr<RowBatch>* out);
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+} // namespace ipc
+} // namespace arrow
+
+#endif // ARROW_IPC_ADAPTER_H
diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc
new file mode 100644
index 0000000000000..d75998f0a5dd2
--- /dev/null
+++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc
@@ -0,0 +1,112 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "arrow/ipc/adapter.h"
+#include "arrow/ipc/memory.h"
+#include "arrow/ipc/test-common.h"
+
+#include "arrow/test-util.h"
+#include "arrow/types/primitive.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/buffer.h"
+#include "arrow/util/memory-pool.h"
+#include "arrow/util/status.h"
+
+namespace arrow {
+namespace ipc {
+
+// Fixture that provides a memory pool and a read-write memory map backed by a
+// file created through MemoryMapFixture
+class TestWriteRowBatch : public ::testing::Test, public MemoryMapFixture {
+ public:
+  void SetUp() {
+    pool_ = default_memory_pool();
+  }
+  void TearDown() {
+    MemoryMapFixture::TearDown();
+  }
+
+  // Create a file of the given size and map it into mmap_. Uses fatal
+  // assertions, so call it through ASSERT_NO_FATAL_FAILURE.
+  void InitMemoryMap(int64_t size) {
+    std::string path = "test-write-row-batch";
+    MemoryMapFixture::CreateFile(path, size);
+    ASSERT_OK(MemoryMappedSource::Open(path, MemorySource::READ_WRITE, &mmap_));
+  }
+
+ protected:
+  MemoryPool* pool_;
+  std::shared_ptr<MemoryMappedSource> mmap_;
+};
+
+const auto INT32 = std::make_shared<Int32Type>();
+
+// Write a batch of two int32 columns (one without nulls, one with a random
+// null bitmap) to a memory map and check that reading it back yields equal
+// columns
+TEST_F(TestWriteRowBatch, IntegerRoundTrip) {
+  const int length = 1000;
+
+  // Make the schema
+  auto f0 = std::make_shared<Field>("f0", INT32);
+  auto f1 = std::make_shared<Field>("f1", INT32);
+  std::shared_ptr<Schema> schema(new Schema({f0, f1}));
+
+  // Example data
+
+  auto data = std::make_shared<PoolBuffer>(pool_);
+  ASSERT_OK(data->Resize(length * sizeof(int32_t)));
+  test::rand_uniform_int(length, 0, 0, std::numeric_limits<int32_t>::max(),
+      reinterpret_cast<int32_t*>(data->mutable_data()));
+
+  auto nulls = std::make_shared<PoolBuffer>(pool_);
+  int null_bytes = util::bytes_for_bits(length);
+  ASSERT_OK(nulls->Resize(null_bytes));
+  test::random_bytes(null_bytes, 0, nulls->mutable_data());
+
+  auto a0 = std::make_shared<Int32Array>(length, data);
+  auto a1 = std::make_shared<Int32Array>(length, data,
+      test::bitmap_popcount(nulls->data(), length), nulls);
+
+  RowBatch batch(schema, length, {a0, a1});
+
+  // TODO(wesm): computing memory requirements for a row batch
+  // 64k is plenty of space
+  //
+  // A fatal failure inside the helper only returns from the helper, so stop
+  // the test here instead of dereferencing an unset mmap_ below
+  ASSERT_NO_FATAL_FAILURE(InitMemoryMap(1 << 16));
+
+  int64_t header_location;
+  ASSERT_OK(WriteRowBatch(mmap_.get(), &batch, 0, &header_location));
+
+  std::shared_ptr<RowBatchReader> result;
+  ASSERT_OK(RowBatchReader::Open(mmap_.get(), header_location, &result));
+
+  std::shared_ptr<RowBatch> batch_result;
+  ASSERT_OK(result->GetRowBatch(schema, &batch_result));
+  EXPECT_EQ(batch.num_rows(), batch_result->num_rows());
+
+  // Guard the per-column comparison against a result with fewer columns
+  ASSERT_EQ(batch.num_columns(), batch_result->num_columns());
+  for (int i = 0; i < batch.num_columns(); ++i) {
+    EXPECT_TRUE(batch.column(i)->Equals(batch_result->column(i)))
+      << i << batch.column_name(i);
+  }
+}
+
+} // namespace ipc
+} // namespace arrow
diff --git a/cpp/src/arrow/ipc/ipc-memory-test.cc b/cpp/src/arrow/ipc/ipc-memory-test.cc
new file mode 100644
index 0000000000000..332ad2a2b809b
--- /dev/null
+++ b/cpp/src/arrow/ipc/ipc-memory-test.cc
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "arrow/ipc/memory.h"
+#include "arrow/ipc/test-common.h"
+#include "arrow/test-util.h"
+#include "arrow/util/buffer.h"
+#include "arrow/util/status.h"
+
+namespace arrow {
+namespace ipc {
+
+class TestMemoryMappedSource : public ::testing::Test, public MemoryMapFixture {
+ public:
+  void TearDown() {
+    MemoryMapFixture::TearDown();
+  }
+};
+
+// Placeholder: no invalid-usage cases are exercised yet
+TEST_F(TestMemoryMappedSource, InvalidUsages) {
+}
+
+// Write the same random block several times at consecutive positions and
+// check that each block reads back unchanged
+TEST_F(TestMemoryMappedSource, WriteRead) {
+  const int64_t buffer_size = 1024;
+  std::vector<uint8_t> buffer(buffer_size);
+
+  // Fill exactly the buffer that was allocated above
+  test::random_bytes(buffer_size, 0, buffer.data());
+
+  const int reps = 5;
+
+  std::string path = "ipc-write-read-test";
+  CreateFile(path, reps * buffer_size);
+
+  std::shared_ptr<MemoryMappedSource> result;
+  ASSERT_OK(MemoryMappedSource::Open(path, MemorySource::READ_WRITE, &result));
+
+  int64_t position = 0;
+
+  std::shared_ptr<Buffer> out_buffer;
+  for (int i = 0; i < reps; ++i) {
+    ASSERT_OK(result->Write(position, buffer.data(), buffer_size));
+    ASSERT_OK(result->ReadAt(position, buffer_size, &out_buffer));
+
+    ASSERT_EQ(0, memcmp(out_buffer->data(), buffer.data(), buffer_size));
+
+    position += buffer_size;
+  }
+}
+
+// Opening a path that does not exist must fail with an IOError
+TEST_F(TestMemoryMappedSource, InvalidFile) {
+  std::string non_existent_path = "invalid-file-name-asfd";
+
+  std::shared_ptr<MemoryMappedSource> result;
+  ASSERT_RAISES(IOError, MemoryMappedSource::Open(non_existent_path,
+          MemorySource::READ_ONLY, &result));
+}
+
+} // namespace ipc
+} // namespace arrow
diff --git a/cpp/src/arrow/ipc/ipc-metadata-test.cc b/cpp/src/arrow/ipc/ipc-metadata-test.cc
new file mode 100644
index 0000000000000..ceabec0fa7c29
--- /dev/null
+++ b/cpp/src/arrow/ipc/ipc-metadata-test.cc
@@ -0,0 +1,99 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+#include <sstream>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "arrow/ipc/metadata.h"
+#include "arrow/schema.h"
+#include "arrow/test-util.h"
+#include "arrow/type.h"
+#include "arrow/util/status.h"
+
+namespace arrow {
+
+class Buffer;
+
+// Fail the current test with both schemas printed when they are not equal
+static inline void assert_schema_equal(const Schema* lhs, const Schema* rhs) {
+  if (!lhs->Equals(*rhs)) {
+    std::stringstream ss;
+    ss << "left schema: " << lhs->ToString() << std::endl
+       << "right schema: " << rhs->ToString() << std::endl;
+    FAIL() << ss.str();
+  }
+}
+
+class TestSchemaMessage : public ::testing::Test {
+ public:
+  void SetUp() {}
+
+  // Serialize the schema, open the resulting buffer as a message and check
+  // that the schema read back from it equals the input
+  void CheckRoundtrip(const Schema* schema) {
+    std::shared_ptr<Buffer> buffer;
+    ASSERT_OK(ipc::WriteSchema(schema, &buffer));
+
+    std::shared_ptr<ipc::Message> message;
+    ASSERT_OK(ipc::Message::Open(buffer, &message));
+
+    ASSERT_EQ(ipc::Message::SCHEMA, message->type());
+
+    std::shared_ptr<ipc::SchemaMessage> schema_msg = message->GetSchema();
+    ASSERT_EQ(schema->num_fields(), schema_msg->num_fields());
+
+    std::shared_ptr<Schema> schema2;
+    ASSERT_OK(schema_msg->GetSchema(&schema2));
+
+    assert_schema_equal(schema, schema2.get());
+  }
+};
+
+const std::shared_ptr<DataType> INT32 = std::make_shared<Int32Type>();
+
+// One field per primitive type
+TEST_F(TestSchemaMessage, PrimitiveFields) {
+  auto f0 = std::make_shared<Field>("f0", std::make_shared<Int8Type>());
+  auto f1 = std::make_shared<Field>("f1", std::make_shared<Int16Type>());
+  auto f2 = std::make_shared<Field>("f2", std::make_shared<Int32Type>());
+  auto f3 = std::make_shared<Field>("f3", std::make_shared<Int64Type>());
+  auto f4 = std::make_shared<Field>("f4", std::make_shared<UInt8Type>());
+  auto f5 = std::make_shared<Field>("f5", std::make_shared<UInt16Type>());
+  auto f6 = std::make_shared<Field>("f6", std::make_shared<UInt32Type>());
+  auto f7 = std::make_shared<Field>("f7", std::make_shared<UInt64Type>());
+  auto f8 = std::make_shared<Field>("f8", std::make_shared<FloatType>());
+  auto f9 = std::make_shared<Field>("f9", std::make_shared<DoubleType>());
+  auto f10 = std::make_shared<Field>("f10", std::make_shared<BooleanType>());
+
+  Schema schema({f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10});
+  CheckRoundtrip(&schema);
+}
+
+// A list field and a struct field with three int32 children
+TEST_F(TestSchemaMessage, NestedFields) {
+  auto type = std::make_shared<ListType>(std::make_shared<Int32Type>());
+  auto f0 = std::make_shared<Field>("f0", type);
+
+  std::shared_ptr<DataType> type2(new StructType({
+              std::make_shared<Field>("k1", INT32),
+              std::make_shared<Field>("k2", INT32),
+              std::make_shared<Field>("k3", INT32)}));
+  auto f1 = std::make_shared<Field>("f1", type2);
+
+  Schema schema({f0, f1});
+  CheckRoundtrip(&schema);
+}
+
+} // namespace arrow
diff --git a/cpp/src/arrow/ipc/memory.cc b/cpp/src/arrow/ipc/memory.cc
new file mode 100644
index 0000000000000..e630ccd109b77
--- /dev/null
+++ b/cpp/src/arrow/ipc/memory.cc
@@ -0,0 +1,162 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/ipc/memory.h"
+
+#include <sys/mman.h> // For memory-mapping
+#include <algorithm>
+#include <cerrno>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <sstream>
+#include <string>
+
+#include "arrow/util/buffer.h"
+#include "arrow/util/status.h"
+
+namespace arrow {
+namespace ipc {
+
+MemorySource::MemorySource(AccessMode access_mode) :
+    access_mode_(access_mode) {}
+
+MemorySource::~MemorySource() {}
+
+// Implement MemoryMappedSource
+
+// Owns the FILE handle and the memory mapping of a single file. Both are
+// released in the destructor.
+class MemoryMappedSource::Impl {
+ public:
+  Impl() :
+      file_(nullptr),
+      size_(0),
+      is_open_(false),
+      data_(nullptr) {}
+
+  ~Impl() {
+    // is_open_ is only set once both the file and the mapping exist
+    if (is_open_) {
+      munmap(data_, size_);
+      fclose(file_);
+    }
+  }
+
+  // Open the file at path and map all of it. A READ_WRITE source is mapped
+  // readable and writable, a READ_ONLY source readable only. On failure an
+  // IOError is returned and no file handle or mapping is kept.
+  Status Open(const std::string& path, MemorySource::AccessMode mode) {
+    if (is_open_) {
+      return Status::IOError("A file is already open");
+    }
+
+    path_ = path;
+
+    const bool writable = (mode == MemorySource::READ_WRITE);
+    file_ = fopen(path.c_str(), writable ? "r+b" : "rb");
+    if (file_ == nullptr) {
+      std::stringstream ss;
+      ss << "Unable to open file, errno: " << errno;
+      return Status::IOError(ss.str());
+    }
+
+    // Determine the file size by seeking to the end
+    if (fseek(file_, 0L, SEEK_END) != 0 || ferror(file_)) {
+      CloseFile();
+      return Status::IOError("Unable to seek to end of file");
+    }
+    const long file_size = ftell(file_);
+    if (file_size < 0) {
+      CloseFile();
+      return Status::IOError("Unable to determine the file size");
+    }
+    size_ = file_size;
+
+    fseek(file_, 0L, SEEK_SET);
+
+    // A file opened with "rb" cannot back a writable shared mapping, so only
+    // request write access when the source is READ_WRITE
+    const int prot = writable ? (PROT_READ | PROT_WRITE) : PROT_READ;
+    void* result = mmap(nullptr, size_, prot, MAP_SHARED, fileno(file_), 0);
+
+    // mmap signals failure with MAP_FAILED, not with a null pointer
+    if (result == MAP_FAILED) {
+      std::stringstream ss;
+      ss << "Memory mapping file failed, errno: " << errno;
+      CloseFile();
+      return Status::IOError(ss.str());
+    }
+
+    data_ = reinterpret_cast<uint8_t*>(result);
+    is_open_ = true;
+
+    return Status::OK();
+  }
+
+  int64_t size() const {
+    return size_;
+  }
+
+  uint8_t* data() {
+    return data_;
+  }
+
+ private:
+  // Release the file handle after a failed Open
+  void CloseFile() {
+    fclose(file_);
+    file_ = nullptr;
+    size_ = 0;
+  }
+
+  std::string path_;
+  FILE* file_;
+  int64_t size_;
+  bool is_open_;
+
+  // The memory map
+  uint8_t* data_;
+};
+
+MemoryMappedSource::MemoryMappedSource(AccessMode access_mode) :
+    MemorySource(access_mode) {}
+
+Status MemoryMappedSource::Open(const std::string& path, AccessMode access_mode,
+    std::shared_ptr<MemoryMappedSource>* out) {
+  std::shared_ptr<MemoryMappedSource> result(new MemoryMappedSource(access_mode));
+
+  result->impl_.reset(new Impl());
+  RETURN_NOT_OK(result->impl_->Open(path, access_mode));
+
+  *out = result;
+  return Status::OK();
+}
+
+int64_t MemoryMappedSource::Size() const {
+  return impl_->size();
+}
+
+Status MemoryMappedSource::Close() {
+  // munmap handled in ::Impl dtor
+  return Status::OK();
+}
+
+// Return a buffer that points into the mapping (no copy). A read that would
+// run past the end of the mapping is shortened to the bytes available.
+Status MemoryMappedSource::ReadAt(int64_t position, int64_t nbytes,
+    std::shared_ptr<Buffer>* out) {
+  if (position < 0 || position >= impl_->size()) {
+    return Status::Invalid("position is out of bounds");
+  }
+  if (nbytes < 0) {
+    return Status::Invalid("nbytes is negative");
+  }
+
+  nbytes = std::min(nbytes, impl_->size() - position);
+  *out = std::make_shared<Buffer>(impl_->data() + position, nbytes);
+  return Status::OK();
+}
+
+// Copy nbytes from data into the mapping at position. The write is rejected
+// as a whole if the source is read-only or if it would run past the end of
+// the mapping.
+Status MemoryMappedSource::Write(int64_t position, const uint8_t* data,
+    int64_t nbytes) {
+  if (access_mode_ != READ_WRITE) {
+    return Status::IOError("Memory source is read-only");
+  }
+  if (position < 0 || position >= impl_->size()) {
+    return Status::Invalid("position is out of bounds");
+  }
+  if (nbytes < 0 || nbytes > impl_->size() - position) {
+    return Status::Invalid("Cannot write past the end of the memory source");
+  }
+
+  uint8_t* dst = impl_->data() + position;
+  memcpy(dst, data, nbytes);
+
+  return Status::OK();
+}
+
+} // namespace ipc
+} // namespace arrow
diff --git a/cpp/src/arrow/ipc/memory.h b/cpp/src/arrow/ipc/memory.h
new file mode 100644
index 0000000000000..0b4d8347c342f
--- /dev/null
+++ b/cpp/src/arrow/ipc/memory.h
@@ -0,0 +1,131 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Public API for different interprocess memory sharing mechanisms
+
+#ifndef ARROW_IPC_MEMORY_H
+#define ARROW_IPC_MEMORY_H
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "arrow/util/macros.h"
+
+namespace arrow {
+
+class Buffer;
+class MutableBuffer;
+class Status;
+
+namespace ipc {
+
+// Abstract output stream
+class OutputStream {
+ public:
+  virtual ~OutputStream() {}
+  // Close the output stream
+  virtual Status Close() = 0;
+
+  // The current position in the output stream
+  virtual int64_t Tell() const = 0;
+
+  // Write bytes to the stream
+  virtual Status Write(const uint8_t* data, int64_t length) = 0;
+};
+
+// An output stream that writes to a MutableBuffer, such as one obtained from a
+// memory map
+class BufferOutputStream : public OutputStream {
+ public:
+  // NOTE(review): capacity_ and position_ were left uninitialized. They are
+  // zeroed here; capacity_ presumably should come from the buffer, but
+  // MutableBuffer is only forward-declared in this header -- confirm when the
+  // out-of-line methods are implemented.
+  explicit BufferOutputStream(const std::shared_ptr<MutableBuffer>& buffer):
+      buffer_(buffer),
+      capacity_(0),
+      position_(0) {}
+
+  // Implement the OutputStream interface
+  Status Close() override;
+  int64_t Tell() const override;
+  Status Write(const uint8_t* data, int64_t length) override;
+
+  // Returns the number of bytes remaining in the buffer
+  int64_t bytes_remaining() const;
+
+ private:
+  std::shared_ptr<MutableBuffer> buffer_;
+  int64_t capacity_;
+  int64_t position_;
+};
+
+// Abstract interface for a region of memory that supports positional reads
+// and writes
+class MemorySource {
+ public:
+  // Indicates the access permissions of the memory source
+  enum AccessMode {
+    READ_ONLY,
+    READ_WRITE
+  };
+
+  virtual ~MemorySource();
+
+  // Retrieve a buffer of memory from the source of the indicated size and at
+  // the indicated location
+  // @returns: arrow::Status indicating success / failure. The buffer is set
+  // into the *out argument
+  virtual Status ReadAt(int64_t position, int64_t nbytes,
+      std::shared_ptr<Buffer>* out) = 0;
+
+  virtual Status Close() = 0;
+
+  // Write nbytes from data into the source at the indicated location
+  virtual Status Write(int64_t position, const uint8_t* data, int64_t nbytes) = 0;
+
+  // @return: the size in bytes of the memory source
+  virtual int64_t Size() const = 0;
+
+ protected:
+  explicit MemorySource(AccessMode access_mode = AccessMode::READ_WRITE);
+
+  AccessMode access_mode_;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(MemorySource);
+};
+
+// A memory source that uses memory-mapped files for memory interactions
+class MemoryMappedSource : public MemorySource {
+ public:
+  // Open the file at path and memory-map it with the given access mode
+  static Status Open(const std::string& path, AccessMode access_mode,
+      std::shared_ptr<MemoryMappedSource>* out);
+
+  Status Close() override;
+
+  Status ReadAt(int64_t position, int64_t nbytes,
+      std::shared_ptr<Buffer>* out) override;
+
+  Status Write(int64_t position, const uint8_t* data, int64_t nbytes) override;
+
+  // @return: the size in bytes of the memory source
+  int64_t Size() const override;
+
+ private:
+  explicit MemoryMappedSource(AccessMode access_mode);
+  // Hide the internal details of this class for now
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+} // namespace ipc
+} // namespace arrow
+
+#endif // ARROW_IPC_MEMORY_H
diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc
new file mode 100644
index 0000000000000..14b186906c3a0
--- /dev/null
+++ b/cpp/src/arrow/ipc/metadata-internal.cc
@@ -0,0 +1,317 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/ipc/metadata-internal.h"
+
+#include <flatbuffers/flatbuffers.h>
+#include <cstdint>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "arrow/ipc/Message_generated.h"
+#include "arrow/schema.h"
+#include "arrow/type.h"
+#include "arrow/util/buffer.h"
+#include "arrow/util/status.h"
+
+typedef flatbuffers::FlatBufferBuilder FBB;
+typedef flatbuffers::Offset<apache::arrow::flatbuf::Field> FieldOffset;
+typedef flatbuffers::Offset<void> Offset;
+
+namespace arrow {
+
+namespace flatbuf = apache::arrow::flatbuf;
+
+namespace ipc {
+
+// Shared instances of the primitive types handed out when reading metadata
+const std::shared_ptr<DataType> BOOL = std::make_shared<BooleanType>();
+const std::shared_ptr<DataType> INT8 = std::make_shared<Int8Type>();
+const std::shared_ptr<DataType> INT16 = std::make_shared<Int16Type>();
+const std::shared_ptr<DataType> INT32 = std::make_shared<Int32Type>();
+const std::shared_ptr<DataType> INT64 = std::make_shared<Int64Type>();
+const std::shared_ptr<DataType> UINT8 = std::make_shared<UInt8Type>();
+const std::shared_ptr<DataType> UINT16 = std::make_shared<UInt16Type>();
+const std::shared_ptr<DataType> UINT32 = std::make_shared<UInt32Type>();
+const std::shared_ptr<DataType> UINT64 = std::make_shared<UInt64Type>();
+const std::shared_ptr<DataType> FLOAT = std::make_shared<FloatType>();
+const std::shared_ptr<DataType> DOUBLE = std::make_shared<DoubleType>();
+
+// Map a flatbuffer Int (bit width + signedness) to the matching integer type.
+// Only widths of 8, 16, 32 and 64 bits are supported; every other width is
+// reported as NotImplemented so that *out is never left null on success.
+static Status IntFromFlatbuffer(const flatbuf::Int* int_data,
+    std::shared_ptr<DataType>* out) {
+  if (int_data->bitWidth() % 8 != 0) {
+    return Status::NotImplemented("Integers not in cstdint are not implemented");
+  } else if (int_data->bitWidth() > 64) {
+    return Status::NotImplemented("Integers with more than 64 bits not implemented");
+  }
+
+  switch (int_data->bitWidth()) {
+    case 8:
+      *out = int_data->is_signed() ? INT8 : UINT8;
+      break;
+    case 16:
+      *out = int_data->is_signed() ? INT16 : UINT16;
+      break;
+    case 32:
+      *out = int_data->is_signed() ? INT32 : UINT32;
+      break;
+    case 64:
+      *out = int_data->is_signed() ? INT64 : UINT64;
+      break;
+    default:
+      // Multiples of 8 without a cstdint counterpart (0, 24, 40, 48, 56)
+      return Status::NotImplemented("Integers not in cstdint are not implemented");
+  }
+  return Status::OK();
+}
+
+// Map a flatbuffer FloatingPoint to FLOAT (single precision) or DOUBLE (any
+// other precision value)
+static Status FloatFromFlatbuffer(const flatbuf::FloatingPoint* float_data,
+    std::shared_ptr<DataType>* out) {
+  if (float_data->precision() == flatbuf::Precision_SINGLE) {
+    *out = FLOAT;
+  } else {
+    *out = DOUBLE;
+  }
+  return Status::OK();
+}
+
+// Build the DataType described by a flatbuffer type union. `children` holds
+// the already converted child fields and is used by the nested types.
+static Status TypeFromFlatbuffer(flatbuf::Type type,
+    const void* type_data, const std::vector<std::shared_ptr<Field>>& children,
+    std::shared_ptr<DataType>* out) {
+  switch (type) {
+    case flatbuf::Type_NONE:
+      return Status::Invalid("Type metadata cannot be none");
+    case flatbuf::Type_Int:
+      return IntFromFlatbuffer(static_cast<const flatbuf::Int*>(type_data), out);
+    case flatbuf::Type_Bit:
+      return Status::NotImplemented("Type is not implemented");
+    case flatbuf::Type_FloatingPoint:
+      return FloatFromFlatbuffer(
+          static_cast<const flatbuf::FloatingPoint*>(type_data), out);
+    case flatbuf::Type_Binary:
+    case flatbuf::Type_Utf8:
+      return Status::NotImplemented("Type is not implemented");
+    case flatbuf::Type_Bool:
+      *out = BOOL;
+      return Status::OK();
+    case flatbuf::Type_Decimal:
+    case flatbuf::Type_Timestamp:
+      // These must not fall through into the List case below
+      return Status::NotImplemented("Type is not implemented");
+    case flatbuf::Type_List:
+      if (children.size() != 1) {
+        return Status::Invalid("List must have exactly 1 child field");
+      }
+      *out = std::make_shared<ListType>(children[0]);
+      return Status::OK();
+    case flatbuf::Type_Tuple:
+      *out = std::make_shared<StructType>(children);
+      return Status::OK();
+    case flatbuf::Type_Union:
+      return Status::NotImplemented("Type is not implemented");
+    default:
+      return Status::Invalid("Unrecognized type");
+  }
+}
+
+// Forward declaration
+static Status FieldToFlatbuffer(FBB& fbb, const std::shared_ptr<Field>& field,
+    FieldOffset* offset);
+
+static Offset IntToFlatbuffer(FBB& fbb, int bitWidth,
+    bool is_signed) {
+  return flatbuf::CreateInt(fbb, bitWidth, is_signed).Union();
+}
+
+static Offset FloatToFlatbuffer(FBB& fbb,
+    flatbuf::Precision precision) {
+  return flatbuf::CreateFloatingPoint(fbb, precision).Union();
+}
+
+// Serialize the single child field of a list type and the List type table
+static Status ListToFlatbuffer(FBB& fbb, const std::shared_ptr<DataType>& type,
+    std::vector<FieldOffset>* out_children, Offset* offset) {
+  FieldOffset field;
+  RETURN_NOT_OK(FieldToFlatbuffer(fbb, type->child(0), &field));
+  out_children->push_back(field);
+  *offset = flatbuf::CreateList(fbb).Union();
+  return Status::OK();
+}
+
+// Serialize every child field of a struct type and the Tuple type table
+static Status StructToFlatbuffer(FBB& fbb, const std::shared_ptr<DataType>& type,
+    std::vector<FieldOffset>* out_children, Offset* offset) {
+  FieldOffset field;
+  for (int i = 0; i < type->num_children(); ++i) {
+    RETURN_NOT_OK(FieldToFlatbuffer(fbb, type->child(i), &field));
+    out_children->push_back(field);
+  }
+  *offset = flatbuf::CreateTuple(fbb).Union();
+  return Status::OK();
+}
+
+// Body of an integer case in TypeToFlatbuffer; ends the case with a break
+#define INT_TO_FB_CASE(BIT_WIDTH, IS_SIGNED)            \
+  *out_type = flatbuf::Type_Int;                        \
+  *offset = IntToFlatbuffer(fbb, BIT_WIDTH, IS_SIGNED); \
+  break;
+
+
+// Serialize a DataType: sets the type enum, the offset of the type table and,
+// for nested types, appends the serialized child fields to `children`
+static Status TypeToFlatbuffer(FBB& fbb, const std::shared_ptr<DataType>& type,
+    std::vector<FieldOffset>* children,
+    flatbuf::Type* out_type, Offset* offset) {
+  switch (type->type) {
+    case Type::BOOL:
+      *out_type = flatbuf::Type_Bool;
+      *offset = flatbuf::CreateBool(fbb).Union();
+      break;
+    case Type::UINT8:
+      INT_TO_FB_CASE(8, false);
+    case Type::INT8:
+      INT_TO_FB_CASE(8, true);
+    case Type::UINT16:
+      INT_TO_FB_CASE(16, false);
+    case Type::INT16:
+      INT_TO_FB_CASE(16, true);
+    case Type::UINT32:
+      INT_TO_FB_CASE(32, false);
+    case Type::INT32:
+      INT_TO_FB_CASE(32, true);
+    case Type::UINT64:
+      INT_TO_FB_CASE(64, false);
+    case Type::INT64:
+      INT_TO_FB_CASE(64, true);
+    case Type::FLOAT:
+      *out_type = flatbuf::Type_FloatingPoint;
+      *offset = FloatToFlatbuffer(fbb, flatbuf::Precision_SINGLE);
+      break;
+    case Type::DOUBLE:
+      *out_type = flatbuf::Type_FloatingPoint;
+      *offset = FloatToFlatbuffer(fbb, flatbuf::Precision_DOUBLE);
+      break;
+    case Type::LIST:
+      *out_type = flatbuf::Type_List;
+      return ListToFlatbuffer(fbb, type, children, offset);
+    case Type::STRUCT:
+      *out_type = flatbuf::Type_Tuple;
+      return StructToFlatbuffer(fbb, type, children, offset);
+    default:
+      std::stringstream ss;
+      ss << "Unable to convert type: " << type->ToString()
+         << std::endl;
+      return Status::NotImplemented(ss.str());
+  }
+  return Status::OK();
+}
+
+// Serialize a field: name, nullability, type and child fields
+static Status FieldToFlatbuffer(FBB& fbb, const std::shared_ptr<Field>& field,
+    FieldOffset* offset) {
+  auto fb_name = fbb.CreateString(field->name);
+
+  flatbuf::Type type_enum;
+  Offset type_data;
+  std::vector<FieldOffset> children;
+
+  RETURN_NOT_OK(TypeToFlatbuffer(fbb, field->type, &children, &type_enum, &type_data));
+  auto fb_children = fbb.CreateVector(children);
+
+  *offset = flatbuf::CreateField(fbb, fb_name, field->nullable, type_enum,
+      type_data, fb_children);
+
+  return Status::OK();
+}
+
+// Rebuild a Field (recursively, children first) from its flatbuffer form
+Status FieldFromFlatbuffer(const flatbuf::Field* field,
+    std::shared_ptr<Field>* out) {
+  std::shared_ptr<DataType> type;
+
+  auto children = field->children();
+  std::vector<std::shared_ptr<Field>> child_fields(children->size());
+  for (size_t i = 0; i < children->size(); ++i) {
+    RETURN_NOT_OK(FieldFromFlatbuffer(children->Get(i), &child_fields[i]));
+  }
+
+  RETURN_NOT_OK(TypeFromFlatbuffer(field->type_type(),
+          field->type(), child_fields, &type));
+
+  // Carry over the nullable flag that FieldToFlatbuffer serialized
+  *out = std::make_shared<Field>(field->name()->str(), type, field->nullable());
+  return Status::OK();
+}
+
+// Implement MessageBuilder
+
+// Make the message header a Schema holding all fields of the given schema
+Status MessageBuilder::SetSchema(const Schema* schema) {
+  header_type_ = flatbuf::MessageHeader_Schema;
+
+  std::vector<FieldOffset> field_offsets;
+  for (int i = 0; i < schema->num_fields(); ++i) {
+    const std::shared_ptr<Field>& field = schema->field(i);
+    FieldOffset offset;
+    RETURN_NOT_OK(FieldToFlatbuffer(fbb_, field, &offset));
+    field_offsets.push_back(offset);
+  }
+
+  header_ = flatbuf::CreateSchema(fbb_, fbb_.CreateVector(field_offsets)).Union();
+  body_length_ = 0;
+  return Status::OK();
+}
+
+Status MessageBuilder::SetRecordBatch(int32_t length, int64_t body_length,
+    const std::vector<flatbuf::FieldNode>& nodes,
+    const std::vector<flatbuf::Buffer>& buffers) {
+  header_type_ = flatbuf::MessageHeader_RecordBatch;
+  header_ = flatbuf::CreateRecordBatch(fbb_, length,
fbb_.CreateVectorOfStructs(nodes), + fbb_.CreateVectorOfStructs(buffers)).Union(); + body_length_ = body_length; + + return Status::OK(); +} + + +Status WriteDataHeader(int32_t length, int64_t body_length, + const std::vector& nodes, + const std::vector& buffers, + std::shared_ptr* out) { + MessageBuilder message; + RETURN_NOT_OK(message.SetRecordBatch(length, body_length, nodes, buffers)); + RETURN_NOT_OK(message.Finish()); + return message.GetBuffer(out); +} + +Status MessageBuilder::Finish() { + auto message = flatbuf::CreateMessage(fbb_, header_type_, header_, + body_length_); + fbb_.Finish(message); + return Status::OK(); +} + +Status MessageBuilder::GetBuffer(std::shared_ptr* out) { + // The message buffer is prefixed by the size of the complete flatbuffer as + // int32_t + // + int32_t size = fbb_.GetSize(); + + auto result = std::make_shared(); + RETURN_NOT_OK(result->Resize(size + sizeof(int32_t))); + + uint8_t* dst = result->mutable_data(); + memcpy(dst, reinterpret_cast(&size), sizeof(int32_t)); + memcpy(dst + sizeof(int32_t), fbb_.GetBufferPointer(), size); + + *out = result; + return Status::OK(); +} + +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/metadata-internal.h b/cpp/src/arrow/ipc/metadata-internal.h new file mode 100644 index 0000000000000..f7365d2a49f95 --- /dev/null +++ b/cpp/src/arrow/ipc/metadata-internal.h @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_IPC_METADATA_INTERNAL_H +#define ARROW_IPC_METADATA_INTERNAL_H + +#include +#include +#include +#include + +#include "arrow/ipc/Message_generated.h" + +namespace arrow { + +namespace flatbuf = apache::arrow::flatbuf; + +class Buffer; +struct Field; +class Schema; +class Status; + +namespace ipc { + +Status FieldFromFlatbuffer(const flatbuf::Field* field, + std::shared_ptr* out); + +class MessageBuilder { + public: + Status SetSchema(const Schema* schema); + + Status SetRecordBatch(int32_t length, int64_t body_length, + const std::vector& nodes, + const std::vector& buffers); + + Status Finish(); + + Status GetBuffer(std::shared_ptr* out); + + private: + flatbuf::MessageHeader header_type_; + flatbuffers::Offset header_; + int64_t body_length_; + flatbuffers::FlatBufferBuilder fbb_; +}; + +Status WriteDataHeader(int32_t length, int64_t body_length, + const std::vector& nodes, + const std::vector& buffers, + std::shared_ptr* out); + +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_METADATA_INTERNAL_H diff --git a/cpp/src/arrow/ipc/metadata.cc b/cpp/src/arrow/ipc/metadata.cc new file mode 100644 index 0000000000000..642f21a41e640 --- /dev/null +++ b/cpp/src/arrow/ipc/metadata.cc @@ -0,0 +1,238 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/ipc/metadata.h" + +#include +#include +#include +#include + +// Generated C++ flatbuffer IDL +#include "arrow/ipc/Message_generated.h" +#include "arrow/ipc/metadata-internal.h" + +#include "arrow/schema.h" +#include "arrow/util/buffer.h" +#include "arrow/util/status.h" + +namespace arrow { + +namespace flatbuf = apache::arrow::flatbuf; + +namespace ipc { + +Status WriteSchema(const Schema* schema, std::shared_ptr* out) { + MessageBuilder message; + RETURN_NOT_OK(message.SetSchema(schema)); + RETURN_NOT_OK(message.Finish()); + return message.GetBuffer(out); +} + +//---------------------------------------------------------------------- +// Message reader + +class Message::Impl { + public: + explicit Impl(const std::shared_ptr& buffer, + const flatbuf::Message* message) : + buffer_(buffer), + message_(message) {} + + Message::Type type() const { + switch (message_->header_type()) { + case flatbuf::MessageHeader_Schema: + return Message::SCHEMA; + case flatbuf::MessageHeader_DictionaryBatch: + return Message::DICTIONARY_BATCH; + case flatbuf::MessageHeader_RecordBatch: + return Message::RECORD_BATCH; + default: + return Message::NONE; + } + } + + const void* header() const { + return message_->header(); + } + + int64_t body_length() const { + return message_->bodyLength(); + } + + private: + // Owns the memory this message accesses + std::shared_ptr buffer_; + + const 
flatbuf::Message* message_; +}; + +class SchemaMessage::Impl { + public: + explicit Impl(const void* schema) : + schema_(static_cast(schema)) {} + + const flatbuf::Field* field(int i) const { + return schema_->fields()->Get(i); + } + + int num_fields() const { + return schema_->fields()->size(); + } + + private: + const flatbuf::Schema* schema_; +}; + +Message::Message() {} + +Status Message::Open(const std::shared_ptr& buffer, + std::shared_ptr* out) { + std::shared_ptr result(new Message()); + + // The buffer is prefixed by its size as int32_t + const uint8_t* fb_head = buffer->data() + sizeof(int32_t); + const flatbuf::Message* message = flatbuf::GetMessage(fb_head); + + // TODO(wesm): verify message + result->impl_.reset(new Impl(buffer, message)); + *out = result; + + return Status::OK(); +} + +Message::Type Message::type() const { + return impl_->type(); +} + +int64_t Message::body_length() const { + return impl_->body_length(); +} + +std::shared_ptr Message::get_shared_ptr() { + return this->shared_from_this(); +} + +std::shared_ptr Message::GetSchema() { + return std::make_shared(this->shared_from_this(), + impl_->header()); +} + +SchemaMessage::SchemaMessage(const std::shared_ptr& message, + const void* schema) { + message_ = message; + impl_.reset(new Impl(schema)); +} + +int SchemaMessage::num_fields() const { + return impl_->num_fields(); +} + +Status SchemaMessage::GetField(int i, std::shared_ptr* out) const { + const flatbuf::Field* field = impl_->field(i); + return FieldFromFlatbuffer(field, out); +} + +Status SchemaMessage::GetSchema(std::shared_ptr* out) const { + std::vector> fields(num_fields()); + for (int i = 0; i < this->num_fields(); ++i) { + RETURN_NOT_OK(GetField(i, &fields[i])); + } + *out = std::make_shared(fields); + return Status::OK(); +} + +class RecordBatchMessage::Impl { + public: + explicit Impl(const void* batch) : + batch_(static_cast(batch)) { + nodes_ = batch_->nodes(); + buffers_ = batch_->buffers(); + } + + const 
flatbuf::FieldNode* field(int i) const { + return nodes_->Get(i); + } + + const flatbuf::Buffer* buffer(int i) const { + return buffers_->Get(i); + } + + int32_t length() const { + return batch_->length(); + } + + int num_buffers() const { + return batch_->buffers()->size(); + } + + int num_fields() const { + return batch_->nodes()->size(); + } + + private: + const flatbuf::RecordBatch* batch_; + const flatbuffers::Vector* nodes_; + const flatbuffers::Vector* buffers_; +}; + +std::shared_ptr Message::GetRecordBatch() { + return std::make_shared(this->shared_from_this(), + impl_->header()); +} + +RecordBatchMessage::RecordBatchMessage(const std::shared_ptr& message, + const void* batch) { + message_ = message; + impl_.reset(new Impl(batch)); +} + +// TODO(wesm): Copying the flatbuffer data isn't great, but this will do for +// now +FieldMetadata RecordBatchMessage::field(int i) const { + const flatbuf::FieldNode* node = impl_->field(i); + + FieldMetadata result; + result.length = node->length(); + result.null_count = node->null_count(); + return result; +} + +BufferMetadata RecordBatchMessage::buffer(int i) const { + const flatbuf::Buffer* buffer = impl_->buffer(i); + + BufferMetadata result; + result.page = buffer->page(); + result.offset = buffer->offset(); + result.length = buffer->length(); + return result; +} + +int32_t RecordBatchMessage::length() const { + return impl_->length(); +} + +int RecordBatchMessage::num_buffers() const { + return impl_->num_buffers(); +} + +int RecordBatchMessage::num_fields() const { + return impl_->num_fields(); +} + +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/metadata.h b/cpp/src/arrow/ipc/metadata.h new file mode 100644 index 0000000000000..c7288529b9fbd --- /dev/null +++ b/cpp/src/arrow/ipc/metadata.h @@ -0,0 +1,146 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// C++ object model and user API for interprocess schema messaging + +#ifndef ARROW_IPC_METADATA_H +#define ARROW_IPC_METADATA_H + +#include +#include + +namespace arrow { + +class Buffer; +struct Field; +class Schema; +class Status; + +namespace ipc { + +//---------------------------------------------------------------------- +// Message read/write APIs + +// Serialize arrow::Schema as a Flatbuffer +Status WriteSchema(const Schema* schema, std::shared_ptr* out); + +//---------------------------------------------------------------------- + +// Read interface classes. We do not fully deserialize the flatbuffers so that +// individual fields metadata can be retrieved from very large schema without +// + +class Message; + +// Container for serialized Schema metadata contained in an IPC message +class SchemaMessage { + public: + // Accepts an opaque flatbuffer pointer + SchemaMessage(const std::shared_ptr& message, const void* schema); + + int num_fields() const; + + // Construct an arrow::Field for the i-th value in the metadata + Status GetField(int i, std::shared_ptr* out) const; + + // Construct a complete Schema from the message. 
May be expensive for very + // large schemas if you are only interested in a few fields + Status GetSchema(std::shared_ptr* out) const; + + private: + // Parent, owns the flatbuffer data + std::shared_ptr message_; + + class Impl; + std::unique_ptr impl_; +}; + +// Field metadata +struct FieldMetadata { + int32_t length; + int32_t null_count; +}; + +struct BufferMetadata { + int32_t page; + int64_t offset; + int64_t length; +}; + +// Container for serialized record batch metadata contained in an IPC message +class RecordBatchMessage { + public: + // Accepts an opaque flatbuffer pointer + RecordBatchMessage(const std::shared_ptr& message, + const void* batch_meta); + + FieldMetadata field(int i) const; + BufferMetadata buffer(int i) const; + + int32_t length() const; + int num_buffers() const; + int num_fields() const; + + private: + // Parent, owns the flatbuffer data + std::shared_ptr message_; + + class Impl; + std::unique_ptr impl_; +}; + +class DictionaryBatchMessage { + public: + int64_t id() const; + std::unique_ptr data() const; +}; + +class Message : public std::enable_shared_from_this { + public: + enum Type { + NONE, + SCHEMA, + DICTIONARY_BATCH, + RECORD_BATCH + }; + + static Status Open(const std::shared_ptr& buffer, + std::shared_ptr* out); + + std::shared_ptr get_shared_ptr(); + + int64_t body_length() const; + + Type type() const; + + // These methods only to be invoked if you have checked the message type + std::shared_ptr GetSchema(); + std::shared_ptr GetRecordBatch(); + std::shared_ptr GetDictionaryBatch(); + + private: + Message(); + + // Hide serialization details from user API + class Impl; + std::unique_ptr impl_; +}; + +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_METADATA_H diff --git a/cpp/src/arrow/types/floating.h b/cpp/src/arrow/ipc/test-common.h similarity index 59% rename from cpp/src/arrow/types/floating.h rename to cpp/src/arrow/ipc/test-common.h index e7522781d33e3..0fccce941071b 100644 --- 
a/cpp/src/arrow/types/floating.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -15,22 +15,39 @@ // specific language governing permissions and limitations // under the License. -#ifndef ARROW_TYPES_FLOATING_H -#define ARROW_TYPES_FLOATING_H +#ifndef ARROW_IPC_TEST_COMMON_H +#define ARROW_IPC_TEST_COMMON_H +#include +#include #include - -#include "arrow/types/primitive.h" -#include "arrow/type.h" +#include namespace arrow { - -typedef PrimitiveArrayImpl FloatArray; -typedef PrimitiveArrayImpl DoubleArray; - -typedef PrimitiveBuilder FloatBuilder; -typedef PrimitiveBuilder DoubleBuilder; - +namespace ipc { + +class MemoryMapFixture { + public: + void TearDown() { + for (auto path : tmp_files_) { + std::remove(path.c_str()); + } + } + + void CreateFile(const std::string path, int64_t size) { + FILE* file = fopen(path.c_str(), "w"); + if (file != nullptr) { + tmp_files_.push_back(path); + } + ftruncate(fileno(file), size); + fclose(file); + } + + private: + std::vector tmp_files_; +}; + +} // namespace ipc } // namespace arrow -#endif // ARROW_TYPES_FLOATING_H +#endif // ARROW_IPC_TEST_COMMON_H diff --git a/cpp/src/arrow/table/schema-test.cc b/cpp/src/arrow/schema-test.cc similarity index 72% rename from cpp/src/arrow/table/schema-test.cc rename to cpp/src/arrow/schema-test.cc index 9dfade2695311..a1de1dc5ac8a4 100644 --- a/cpp/src/arrow/table/schema-test.cc +++ b/cpp/src/arrow/schema-test.cc @@ -15,14 +15,14 @@ // specific language governing permissions and limitations // under the License. 
-#include #include #include #include -#include "arrow/table/schema.h" +#include "gtest/gtest.h" + +#include "arrow/schema.h" #include "arrow/type.h" -#include "arrow/types/string.h" using std::shared_ptr; using std::vector; @@ -32,25 +32,20 @@ namespace arrow { const auto INT32 = std::make_shared(); TEST(TestField, Basics) { - shared_ptr ftype = INT32; - shared_ptr ftype_nn = std::make_shared(false); - Field f0("f0", ftype); - Field f0_nn("f0", ftype_nn); + Field f0("f0", INT32); + Field f0_nn("f0", INT32, false); ASSERT_EQ(f0.name, "f0"); - ASSERT_EQ(f0.type->ToString(), ftype->ToString()); + ASSERT_EQ(f0.type->ToString(), INT32->ToString()); - ASSERT_TRUE(f0.nullable()); - ASSERT_FALSE(f0_nn.nullable()); + ASSERT_TRUE(f0.nullable); + ASSERT_FALSE(f0_nn.nullable); } TEST(TestField, Equals) { - shared_ptr ftype = INT32; - shared_ptr ftype_nn = std::make_shared(false); - - Field f0("f0", ftype); - Field f0_nn("f0", ftype_nn); - Field f0_other("f0", ftype); + Field f0("f0", INT32); + Field f0_nn("f0", INT32, false); + Field f0_other("f0", INT32); ASSERT_EQ(f0, f0_other); ASSERT_NE(f0, f0_nn); @@ -63,12 +58,12 @@ class TestSchema : public ::testing::Test { TEST_F(TestSchema, Basics) { auto f0 = std::make_shared("f0", INT32); - auto f1 = std::make_shared("f1", std::make_shared(false)); + auto f1 = std::make_shared("f1", std::make_shared(), false); auto f1_optional = std::make_shared("f1", std::make_shared()); auto f2 = std::make_shared("f2", std::make_shared()); - vector > fields = {f0, f1, f2}; + vector> fields = {f0, f1, f2}; auto schema = std::make_shared(fields); ASSERT_EQ(3, schema->num_fields()); @@ -78,7 +73,7 @@ TEST_F(TestSchema, Basics) { auto schema2 = std::make_shared(fields); - vector > fields3 = {f0, f1_optional, f2}; + vector> fields3 = {f0, f1_optional, f2}; auto schema3 = std::make_shared(fields3); ASSERT_TRUE(schema->Equals(schema2)); ASSERT_FALSE(schema->Equals(schema3)); @@ -88,21 +83,20 @@ TEST_F(TestSchema, Basics) { } TEST_F(TestSchema, ToString) 
{ - auto f0 = std::make_shared("f0", std::make_shared()); - auto f1 = std::make_shared("f1", std::make_shared(false)); + auto f0 = std::make_shared("f0", INT32); + auto f1 = std::make_shared("f1", std::make_shared(), false); auto f2 = std::make_shared("f2", std::make_shared()); auto f3 = std::make_shared("f3", std::make_shared(std::make_shared())); - vector > fields = {f0, f1, f2, f3}; + vector> fields = {f0, f1, f2, f3}; auto schema = std::make_shared(fields); std::string result = schema->ToString(); - std::string expected = R"(f0 int32 -f1 uint8 not null -f2 string -f3 list -)"; + std::string expected = R"(f0: int32 +f1: uint8 not null +f2: string +f3: list)"; ASSERT_EQ(expected, result); } diff --git a/cpp/src/arrow/table/schema.cc b/cpp/src/arrow/schema.cc similarity index 88% rename from cpp/src/arrow/table/schema.cc rename to cpp/src/arrow/schema.cc index d49d0a713e7f4..18aad0e806ff2 100644 --- a/cpp/src/arrow/table/schema.cc +++ b/cpp/src/arrow/schema.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-#include "arrow/table/schema.h" +#include "arrow/schema.h" #include #include @@ -26,7 +26,7 @@ namespace arrow { -Schema::Schema(const std::vector >& fields) : +Schema::Schema(const std::vector>& fields) : fields_(fields) {} bool Schema::Equals(const Schema& other) const { @@ -49,8 +49,13 @@ bool Schema::Equals(const std::shared_ptr& other) const { std::string Schema::ToString() const { std::stringstream buffer; + int i = 0; for (auto field : fields_) { - buffer << field->ToString() << std::endl; + if (i > 0) { + buffer << std::endl; + } + buffer << field->ToString(); + ++i; } return buffer.str(); } diff --git a/cpp/src/arrow/table/schema.h b/cpp/src/arrow/schema.h similarity index 91% rename from cpp/src/arrow/table/schema.h rename to cpp/src/arrow/schema.h index 103f01b26e3ca..52f3c1ceae46d 100644 --- a/cpp/src/arrow/table/schema.h +++ b/cpp/src/arrow/schema.h @@ -22,13 +22,13 @@ #include #include -#include "arrow/type.h" - namespace arrow { +struct Field; + class Schema { public: - explicit Schema(const std::vector >& fields); + explicit Schema(const std::vector>& fields); // Returns true if all of the schema fields are equal bool Equals(const Schema& other) const; @@ -47,7 +47,7 @@ class Schema { } private: - std::vector > fields_; + std::vector> fields_; }; } // namespace arrow diff --git a/cpp/src/arrow/table/table-test.cc b/cpp/src/arrow/table-test.cc similarity index 92% rename from cpp/src/arrow/table/table-test.cc rename to cpp/src/arrow/table-test.cc index 8b354e8503c71..4c7b8f80486de 100644 --- a/cpp/src/arrow/table/table-test.cc +++ b/cpp/src/arrow/table-test.cc @@ -15,19 +15,19 @@ // specific language governing permissions and limitations // under the License. 
-#include -#include #include #include #include -#include "arrow/table/column.h" -#include "arrow/table/schema.h" -#include "arrow/table/table.h" -#include "arrow/table/test-common.h" +#include "gtest/gtest.h" + +#include "arrow/column.h" +#include "arrow/schema.h" +#include "arrow/table.h" #include "arrow/test-util.h" #include "arrow/type.h" -#include "arrow/types/integer.h" +#include "arrow/types/primitive.h" +#include "arrow/util/status.h" using std::shared_ptr; using std::vector; @@ -45,7 +45,7 @@ class TestTable : public TestBase { auto f1 = std::make_shared("f1", UINT8); auto f2 = std::make_shared("f2", INT16); - vector > fields = {f0, f1, f2}; + vector> fields = {f0, f1, f2}; schema_ = std::make_shared(fields); columns_ = { @@ -58,7 +58,7 @@ class TestTable : public TestBase { protected: std::unique_ptr table_; shared_ptr schema_; - vector > columns_; + vector> columns_; }; TEST_F(TestTable, EmptySchema) { diff --git a/cpp/src/arrow/table/table.cc b/cpp/src/arrow/table.cc similarity index 69% rename from cpp/src/arrow/table/table.cc rename to cpp/src/arrow/table.cc index 0c788b8fe3ff3..e405c1d508c22 100644 --- a/cpp/src/arrow/table/table.cc +++ b/cpp/src/arrow/table.cc @@ -15,20 +15,30 @@ // specific language governing permissions and limitations // under the License. 
-#include "arrow/table/table.h" +#include "arrow/table.h" +#include #include #include -#include "arrow/table/column.h" -#include "arrow/table/schema.h" -#include "arrow/type.h" +#include "arrow/column.h" +#include "arrow/schema.h" #include "arrow/util/status.h" namespace arrow { +RowBatch::RowBatch(const std::shared_ptr& schema, int num_rows, + const std::vector>& columns) : + schema_(schema), + num_rows_(num_rows), + columns_(columns) {} + +const std::string& RowBatch::column_name(int i) const { + return schema_->field(i)->name; +} + Table::Table(const std::string& name, const std::shared_ptr& schema, - const std::vector >& columns) : + const std::vector>& columns) : name_(name), schema_(schema), columns_(columns) { @@ -40,7 +50,7 @@ Table::Table(const std::string& name, const std::shared_ptr& schema, } Table::Table(const std::string& name, const std::shared_ptr& schema, - const std::vector >& columns, int64_t num_rows) : + const std::vector>& columns, int64_t num_rows) : name_(name), schema_(schema), columns_(columns), @@ -51,16 +61,19 @@ Status Table::ValidateColumns() const { return Status::Invalid("Number of columns did not match schema"); } - if (columns_.size() == 0) { - return Status::OK(); - } - // Make sure columns are all the same length for (size_t i = 0; i < columns_.size(); ++i) { const Column* col = columns_[i].get(); + if (col == nullptr) { + std::stringstream ss; + ss << "Column " << i << " named " << col->name() + << " was null"; + return Status::Invalid(ss.str()); + } if (col->length() != num_rows_) { std::stringstream ss; - ss << "Column " << i << " expected length " + ss << "Column " << i << " named " << col->name() + << " expected length " << num_rows_ << " but got length " << col->length(); diff --git a/cpp/src/arrow/table/table.h b/cpp/src/arrow/table.h similarity index 55% rename from cpp/src/arrow/table/table.h rename to cpp/src/arrow/table.h index b0129387b710c..e2f73a2eeddcb 100644 --- a/cpp/src/arrow/table/table.h +++ 
b/cpp/src/arrow/table.h @@ -15,28 +15,74 @@ // specific language governing permissions and limitations // under the License. -#ifndef ARROW_TABLE_TABLE_H -#define ARROW_TABLE_TABLE_H +#ifndef ARROW_TABLE_H +#define ARROW_TABLE_H +#include #include #include #include namespace arrow { +class Array; class Column; class Schema; class Status; +// A row batch is a simpler and more rigid table data structure intended for +// use primarily in shared memory IPC. It contains a schema (metadata) and a +// corresponding vector of equal-length Arrow arrays +class RowBatch { + public: + // num_rows is a parameter to allow for row batches of a particular size not + // having any materialized columns. Each array should have the same length as + // num_rows + RowBatch(const std::shared_ptr& schema, int num_rows, + const std::vector>& columns); + + // @returns: the table's schema + const std::shared_ptr& schema() const { + return schema_; + } + + // @returns: the i-th column + // Note: Does not boundscheck + const std::shared_ptr& column(int i) const { + return columns_[i]; + } + + const std::string& column_name(int i) const; + + // @returns: the number of columns in the table + int num_columns() const { + return columns_.size(); + } + + // @returns: the number of rows (the corresponding length of each column) + int64_t num_rows() const { + return num_rows_; + } + + private: + std::shared_ptr schema_; + int num_rows_; + std::vector> columns_; +}; + // Immutable container of fixed-length columns conforming to a particular schema class Table { public: // If columns is zero-length, the table's number of rows is zero Table(const std::string& name, const std::shared_ptr& schema, - const std::vector >& columns); + const std::vector>& columns); + // num_rows is a parameter to allow for tables of a particular size not + // having any materialized columns. 
Each column should therefore have the + // same length as num_rows -- you can validate this using + // Table::ValidateColumns Table(const std::string& name, const std::shared_ptr& schema, - const std::vector >& columns, int64_t num_rows); + const std::vector>& columns, int64_t num_rows); // @returns: the table's name, if any (may be length 0) const std::string& name() const { @@ -72,11 +118,11 @@ class Table { std::string name_; std::shared_ptr schema_; - std::vector > columns_; + std::vector> columns_; int64_t num_rows_; }; } // namespace arrow -#endif // ARROW_TABLE_TABLE_H +#endif // ARROW_TABLE_H diff --git a/cpp/src/arrow/table/test-common.h b/cpp/src/arrow/table/test-common.h deleted file mode 100644 index 50a5f6a2f5018..0000000000000 --- a/cpp/src/arrow/table/test-common.h +++ /dev/null @@ -1,54 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include -#include -#include -#include -#include - -#include "arrow/table/column.h" -#include "arrow/table/schema.h" -#include "arrow/table/table.h" -#include "arrow/test-util.h" -#include "arrow/type.h" -#include "arrow/util/bit-util.h" -#include "arrow/util/buffer.h" -#include "arrow/util/memory-pool.h" - -namespace arrow { - -class TestBase : public ::testing::Test { - public: - void SetUp() { - pool_ = GetDefaultMemoryPool(); - } - - template - std::shared_ptr MakePrimitive(int32_t length, int32_t null_count = 0) { - auto data = std::make_shared(pool_); - auto nulls = std::make_shared(pool_); - EXPECT_OK(data->Resize(length * sizeof(typename ArrayType::value_type))); - EXPECT_OK(nulls->Resize(util::bytes_for_bits(length))); - return std::make_shared(length, data, 10, nulls); - } - - protected: - MemoryPool* pool_; -}; - -} // namespace arrow diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 0898c8e3e3aa3..a9fb2a7644ab3 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -18,26 +18,39 @@ #ifndef ARROW_TEST_UTIL_H_ #define ARROW_TEST_UTIL_H_ -#include +#include #include +#include #include #include +#include "gtest/gtest.h" + +#include "arrow/type.h" +#include "arrow/column.h" +#include "arrow/schema.h" +#include "arrow/table.h" #include "arrow/util/bit-util.h" +#include "arrow/util/buffer.h" +#include "arrow/util/memory-pool.h" #include "arrow/util/random.h" #include "arrow/util/status.h" #define ASSERT_RAISES(ENUM, expr) \ do { \ Status s = (expr); \ - ASSERT_TRUE(s.Is##ENUM()); \ + if (!s.Is##ENUM()) { \ + FAIL() << s.ToString(); \ + } \ } while (0) #define ASSERT_OK(expr) \ do { \ Status s = (expr); \ - ASSERT_TRUE(s.ok()); \ + if (!s.ok()) { \ + FAIL() << s.ToString(); \ + } \ } while (0) @@ -50,6 +63,27 @@ namespace arrow { +class TestBase : public ::testing::Test { + public: + void SetUp() { + pool_ = default_memory_pool(); + } + + template + std::shared_ptr MakePrimitive(int32_t length, int32_t null_count = 
0) { + auto data = std::make_shared(pool_); + auto nulls = std::make_shared(pool_); + EXPECT_OK(data->Resize(length * sizeof(typename ArrayType::value_type))); + EXPECT_OK(nulls->Resize(util::bytes_for_bits(length))); + return std::make_shared(length, data, 10, nulls); + } + + protected: + MemoryPool* pool_; +}; + +namespace test { + template void randint(int64_t N, T lower, T upper, std::vector* out) { Random rng(random_seed()); @@ -84,6 +118,33 @@ void random_nulls(int64_t n, double pct_null, std::vector* nulls) { } } +static inline void random_bytes(int n, uint32_t seed, uint8_t* out) { + std::mt19937 gen(seed); + std::uniform_int_distribution d(0, 255); + + for (int i = 0; i < n; ++i) { + out[i] = d(gen) & 0xFF; + } +} + +template +void rand_uniform_int(int n, uint32_t seed, T min_value, T max_value, T* out) { + std::mt19937 gen(seed); + std::uniform_int_distribution d(min_value, max_value); + for (int i = 0; i < n; ++i) { + out[i] = d(gen); + } +} + +static inline int bitmap_popcount(const uint8_t* data, int length) { + int count = 0; + for (int i = 0; i < length; ++i) { + // TODO: accelerate this + if (util::get_bit(data, i)) ++count; + } + return count; +} + static inline int null_count(const std::vector& nulls) { int result = 0; for (size_t i = 0; i < nulls.size(); ++i) { @@ -102,6 +163,7 @@ std::shared_ptr bytes_to_null_buffer(uint8_t* bytes, int length) { return out; } +} // namespace test } // namespace arrow #endif // ARROW_TEST_UTIL_H_ diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 0a2e817ad30c6..f7f835e96a729 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -24,45 +24,37 @@ namespace arrow { std::string Field::ToString() const { std::stringstream ss; - ss << this->name << " " << this->type->ToString(); + ss << this->name << ": " << this->type->ToString(); + if (!this->nullable) { + ss << " not null"; + } return ss.str(); } DataType::~DataType() {} -StringType::StringType(bool nullable) - : 
DataType(LogicalType::STRING, nullable) {} - -StringType::StringType(const StringType& other) - : StringType(other.nullable) {} +StringType::StringType() : DataType(Type::STRING) {} std::string StringType::ToString() const { std::string result(name()); - if (!nullable) { - result.append(" not null"); - } return result; } std::string ListType::ToString() const { std::stringstream s; - s << "list<" << value_type->ToString() << ">"; - if (!this->nullable) { - s << " not null"; - } + s << "list<" << value_field()->ToString() << ">"; return s.str(); } std::string StructType::ToString() const { std::stringstream s; s << "struct<"; - for (size_t i = 0; i < fields_.size(); ++i) { + for (int i = 0; i < this->num_children(); ++i) { if (i > 0) s << ", "; - const std::shared_ptr& field = fields_[i]; + const std::shared_ptr& field = this->child(i); s << field->name << ": " << field->type->ToString(); } s << ">"; - if (!nullable) s << " not null"; return s.str(); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 00b01ea86e8a5..5984b6718ddbe 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -18,62 +18,34 @@ #ifndef ARROW_TYPE_H #define ARROW_TYPE_H +#include #include #include #include namespace arrow { -// Physical data type that describes the memory layout of values. See details -// for each type -enum class LayoutEnum: char { - // A physical type consisting of some non-negative number of bytes - BYTE = 0, - - // A physical type consisting of some non-negative number of bits - BIT = 1, - - // A parametric variable-length value type. Full specification requires a - // child logical type - LIST = 2, - - // A collection of multiple equal-length child arrays. Parametric type taking - // 1 or more child logical types - STRUCT = 3, - - // An array with heterogeneous value types. 
Parametric types taking 1 or more - // child logical types - DENSE_UNION = 4, - SPARSE_UNION = 5 -}; - - -struct LayoutType { - LayoutEnum type; - explicit LayoutType(LayoutEnum type) : type(type) {} -}; - // Data types in this library are all *logical*. They can be expressed as // either a primitive physical type (bytes or bits of some fixed size), a // nested type consisting of other data types, or another data type (e.g. a // timestamp encoded as an int64) -struct LogicalType { +struct Type { enum type { // A degenerate NULL type represented as 0 bytes/bits NA = 0, - // Little-endian integer types - UINT8 = 1, - INT8 = 2, - UINT16 = 3, - INT16 = 4, - UINT32 = 5, - INT32 = 6, - UINT64 = 7, - INT64 = 8, - // A boolean value represented as 1 bit - BOOL = 9, + BOOL = 1, + + // Little-endian integer types + UINT8 = 2, + INT8 = 3, + UINT16 = 4, + INT16 = 5, + UINT32 = 6, + INT32 = 7, + UINT64 = 8, + INT64 = 9, // 4-byte floating point value FLOAT = 10, @@ -131,30 +103,38 @@ struct LogicalType { }; }; +struct Field; + struct DataType { - LogicalType::type type; - bool nullable; + Type::type type; - explicit DataType(LogicalType::type type, bool nullable = true) : - type(type), - nullable(nullable) {} + std::vector> children_; + + explicit DataType(Type::type type) : + type(type) {} virtual ~DataType(); bool Equals(const DataType* other) { // Call with a pointer so more friendly to subclasses - return this == other || (this->type == other->type && - this->nullable == other->nullable); + return this == other || (this->type == other->type); } bool Equals(const std::shared_ptr& other) { return Equals(other.get()); } + const std::shared_ptr& child(int i) const { + return children_[i]; + } + + int num_children() const { + return children_.size(); + } + virtual std::string ToString() const = 0; }; -typedef std::shared_ptr LayoutPtr; typedef std::shared_ptr TypePtr; // A field is a piece of metadata that includes (for now) a name and a data @@ -166,9 +146,13 @@ struct Field { 
// The field's data type TypePtr type; - Field(const std::string& name, const TypePtr& type) : + // Fields can be nullable + bool nullable; + + Field(const std::string& name, const TypePtr& type, bool nullable = true) : name(name), - type(type) {} + type(type), + nullable(nullable) {} bool operator==(const Field& other) const { return this->Equals(other); @@ -180,6 +164,7 @@ struct Field { bool Equals(const Field& other) const { return (this == &other) || (this->name == other.name && + this->nullable == other.nullable && this->type->Equals(other.type.get())); } @@ -187,36 +172,12 @@ struct Field { return Equals(*other.get()); } - bool nullable() const { - return this->type->nullable; - } - std::string ToString() const; }; -struct BytesType : public LayoutType { - int size; - - explicit BytesType(int size) - : LayoutType(LayoutEnum::BYTE), - size(size) {} - - BytesType(const BytesType& other) - : BytesType(other.size) {} -}; - -struct ListLayoutType : public LayoutType { - LayoutPtr value_type; - - explicit ListLayoutType(const LayoutPtr& value_type) - : LayoutType(LayoutEnum::BYTE), - value_type(value_type) {} -}; - template struct PrimitiveType : public DataType { - explicit PrimitiveType(bool nullable = true) - : DataType(Derived::type_enum, nullable) {} + PrimitiveType() : DataType(Derived::type_enum) {} std::string ToString() const override; }; @@ -224,22 +185,19 @@ struct PrimitiveType : public DataType { template inline std::string PrimitiveType::ToString() const { std::string result(static_cast(this)->name()); - if (!nullable) { - result.append(" not null"); - } return result; } -#define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \ - typedef C_TYPE c_type; \ - static constexpr LogicalType::type type_enum = LogicalType::ENUM; \ - static constexpr int size = SIZE; \ - \ - explicit TYPENAME(bool nullable = true) \ - : PrimitiveType(nullable) {} \ - \ - static const char* name() { \ - return NAME; \ +#define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, 
NAME) \ + typedef C_TYPE c_type; \ + static constexpr Type::type type_enum = Type::ENUM; \ + static constexpr int size = SIZE; \ + \ + TYPENAME() \ + : PrimitiveType() {} \ + \ + static const char* name() { \ + return NAME; \ } struct NullType : public PrimitiveType { @@ -292,11 +250,23 @@ struct DoubleType : public PrimitiveType { struct ListType : public DataType { // List can contain any other logical value type - TypePtr value_type; + explicit ListType(const std::shared_ptr& value_type) + : DataType(Type::LIST) { + children_ = {std::make_shared("item", value_type)}; + } + + explicit ListType(const std::shared_ptr& value_field) + : DataType(Type::LIST) { + children_ = {value_field}; + } - explicit ListType(const TypePtr& value_type, bool nullable = true) - : DataType(LogicalType::LIST, nullable), - value_type(value_type) {} + const std::shared_ptr& value_field() const { + return children_[0]; + } + + const std::shared_ptr& value_type() const { + return children_[0]->type; + } static char const *name() { return "list"; @@ -307,9 +277,7 @@ struct ListType : public DataType { // String is a logical type consisting of a physical list of 1-byte values struct StringType : public DataType { - explicit StringType(bool nullable = true); - - StringType(const StringType& other); + StringType(); static char const *name() { return "string"; @@ -319,20 +287,9 @@ struct StringType : public DataType { }; struct StructType : public DataType { - std::vector > fields_; - - explicit StructType(const std::vector >& fields, - bool nullable = true) - : DataType(LogicalType::STRUCT, nullable) { - fields_ = fields; - } - - const std::shared_ptr& field(int i) const { - return fields_[i]; - } - - int num_children() const { - return fields_.size(); + explicit StructType(const std::vector>& fields) + : DataType(Type::STRUCT) { + children_ = fields; } std::string ToString() const override; diff --git a/cpp/src/arrow/types/CMakeLists.txt b/cpp/src/arrow/types/CMakeLists.txt index 
57cabdefd2525..595b3be6e1661 100644 --- a/cpp/src/arrow/types/CMakeLists.txt +++ b/cpp/src/arrow/types/CMakeLists.txt @@ -26,8 +26,6 @@ install(FILES construct.h datetime.h decimal.h - floating.h - integer.h json.h list.h primitive.h diff --git a/cpp/src/arrow/types/boolean.h b/cpp/src/arrow/types/boolean.h index a5023d7b368d2..1cb91f9ba4966 100644 --- a/cpp/src/arrow/types/boolean.h +++ b/cpp/src/arrow/types/boolean.h @@ -22,7 +22,7 @@ namespace arrow { -typedef PrimitiveArrayImpl BooleanArray; +// typedef PrimitiveArrayImpl BooleanArray; class BooleanBuilder : public ArrayBuilder { }; diff --git a/cpp/src/arrow/types/collection.h b/cpp/src/arrow/types/collection.h index 42a9c926bb134..46d84f1f183c8 100644 --- a/cpp/src/arrow/types/collection.h +++ b/cpp/src/arrow/types/collection.h @@ -25,7 +25,7 @@ namespace arrow { -template +template struct CollectionType : public DataType { std::vector child_types_; diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 43f01a3051385..290decd81ff42 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -19,24 +19,26 @@ #include -#include "arrow/types/floating.h" -#include "arrow/types/integer.h" +#include "arrow/type.h" +#include "arrow/types/primitive.h" #include "arrow/types/list.h" #include "arrow/types/string.h" +#include "arrow/util/buffer.h" #include "arrow/util/status.h" namespace arrow { class ArrayBuilder; -// Initially looked at doing this with vtables, but shared pointers makes it -// difficult - #define BUILDER_CASE(ENUM, BuilderType) \ - case LogicalType::ENUM: \ + case Type::ENUM: \ out->reset(new BuilderType(pool, type)); \ return Status::OK(); +// Initially looked at doing this with vtables, but shared pointers makes it +// difficult +// +// TODO(wesm): come up with a less monolithic strategy Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, std::shared_ptr* out) { switch (type->type) { @@ -56,30 +58,41 @@ Status 
MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, BUILDER_CASE(STRING, StringBuilder); - case LogicalType::LIST: + case Type::LIST: { std::shared_ptr value_builder; const std::shared_ptr& value_type = static_cast( - type.get())->value_type; + type.get())->value_type(); RETURN_NOT_OK(MakeBuilder(pool, value_type, &value_builder)); out->reset(new ListBuilder(pool, type, value_builder)); return Status::OK(); } - // BUILDER_CASE(CHAR, CharBuilder); - - // BUILDER_CASE(VARCHAR, VarcharBuilder); - // BUILDER_CASE(BINARY, BinaryBuilder); - - // BUILDER_CASE(DATE, DateBuilder); - // BUILDER_CASE(TIMESTAMP, TimestampBuilder); - // BUILDER_CASE(TIME, TimeBuilder); + default: + return Status::NotImplemented(type->ToString()); + } +} - // BUILDER_CASE(LIST, ListBuilder); - // BUILDER_CASE(STRUCT, StructBuilder); - // BUILDER_CASE(DENSE_UNION, DenseUnionBuilder); - // BUILDER_CASE(SPARSE_UNION, SparseUnionBuilder); +#define MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType) \ + case Type::ENUM: \ + out->reset(new ArrayType(type, length, data, null_count, nulls)); \ + return Status::OK(); +Status MakePrimitiveArray(const std::shared_ptr& type, + int32_t length, const std::shared_ptr& data, + int32_t null_count, const std::shared_ptr& nulls, + std::shared_ptr* out) { + switch (type->type) { + MAKE_PRIMITIVE_ARRAY_CASE(UINT8, UInt8Array); + MAKE_PRIMITIVE_ARRAY_CASE(INT8, Int8Array); + MAKE_PRIMITIVE_ARRAY_CASE(UINT16, UInt16Array); + MAKE_PRIMITIVE_ARRAY_CASE(INT16, Int16Array); + MAKE_PRIMITIVE_ARRAY_CASE(UINT32, UInt32Array); + MAKE_PRIMITIVE_ARRAY_CASE(INT32, Int32Array); + MAKE_PRIMITIVE_ARRAY_CASE(UINT64, UInt64Array); + MAKE_PRIMITIVE_ARRAY_CASE(INT64, Int64Array); + MAKE_PRIMITIVE_ARRAY_CASE(FLOAT, FloatArray); + MAKE_PRIMITIVE_ARRAY_CASE(DOUBLE, DoubleArray); default: return Status::NotImplemented(type->ToString()); } diff --git a/cpp/src/arrow/types/construct.h b/cpp/src/arrow/types/construct.h index 59ebe1acddc98..089c484c58bee 100644 --- 
a/cpp/src/arrow/types/construct.h +++ b/cpp/src/arrow/types/construct.h @@ -18,19 +18,26 @@ #ifndef ARROW_TYPES_CONSTRUCT_H #define ARROW_TYPES_CONSTRUCT_H +#include #include -#include "arrow/type.h" - namespace arrow { +class Array; class ArrayBuilder; +class Buffer; +struct DataType; class MemoryPool; class Status; Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, std::shared_ptr* out); +Status MakePrimitiveArray(const std::shared_ptr& type, + int32_t length, const std::shared_ptr& data, + int32_t null_count, const std::shared_ptr& nulls, + std::shared_ptr* out); + } // namespace arrow #endif // ARROW_BUILDER_H_ diff --git a/cpp/src/arrow/types/datetime.h b/cpp/src/arrow/types/datetime.h index 765fc29dd57ae..e57b66ab46adb 100644 --- a/cpp/src/arrow/types/datetime.h +++ b/cpp/src/arrow/types/datetime.h @@ -31,8 +31,8 @@ struct DateType : public DataType { Unit unit; - explicit DateType(Unit unit = Unit::DAY, bool nullable = true) - : DataType(LogicalType::DATE, nullable), + explicit DateType(Unit unit = Unit::DAY) + : DataType(Type::DATE), unit(unit) {} DateType(const DateType& other) @@ -41,10 +41,6 @@ struct DateType : public DataType { static char const *name() { return "date"; } - - // virtual std::string ToString() { - // return name(); - // } }; @@ -58,8 +54,8 @@ struct TimestampType : public DataType { Unit unit; - explicit TimestampType(Unit unit = Unit::MILLI, bool nullable = true) - : DataType(LogicalType::TIMESTAMP, nullable), + explicit TimestampType(Unit unit = Unit::MILLI) + : DataType(Type::TIMESTAMP), unit(unit) {} TimestampType(const TimestampType& other) @@ -68,10 +64,6 @@ struct TimestampType : public DataType { static char const *name() { return "timestamp"; } - - // virtual std::string ToString() { - // return name(); - // } }; } // namespace arrow diff --git a/cpp/src/arrow/types/floating.cc b/cpp/src/arrow/types/floating.cc deleted file mode 100644 index bde28266e638c..0000000000000 --- a/cpp/src/arrow/types/floating.cc +++ 
/dev/null @@ -1,22 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "arrow/types/floating.h" - -namespace arrow { - -} // namespace arrow diff --git a/cpp/src/arrow/types/integer.cc b/cpp/src/arrow/types/integer.cc deleted file mode 100644 index 4696536616971..0000000000000 --- a/cpp/src/arrow/types/integer.cc +++ /dev/null @@ -1,22 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include "arrow/types/integer.h" - -namespace arrow { - -} // namespace arrow diff --git a/cpp/src/arrow/types/integer.h b/cpp/src/arrow/types/integer.h deleted file mode 100644 index 568419124941f..0000000000000 --- a/cpp/src/arrow/types/integer.h +++ /dev/null @@ -1,57 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_TYPES_INTEGER_H -#define ARROW_TYPES_INTEGER_H - -#include -#include - -#include "arrow/types/primitive.h" -#include "arrow/type.h" - -namespace arrow { - -// Array containers - -typedef PrimitiveArrayImpl UInt8Array; -typedef PrimitiveArrayImpl Int8Array; - -typedef PrimitiveArrayImpl UInt16Array; -typedef PrimitiveArrayImpl Int16Array; - -typedef PrimitiveArrayImpl UInt32Array; -typedef PrimitiveArrayImpl Int32Array; - -typedef PrimitiveArrayImpl UInt64Array; -typedef PrimitiveArrayImpl Int64Array; - -// Builders - -typedef PrimitiveBuilder UInt8Builder; -typedef PrimitiveBuilder UInt16Builder; -typedef PrimitiveBuilder UInt32Builder; -typedef PrimitiveBuilder UInt64Builder; - -typedef PrimitiveBuilder Int8Builder; -typedef PrimitiveBuilder Int16Builder; -typedef PrimitiveBuilder Int32Builder; -typedef PrimitiveBuilder Int64Builder; - -} // namespace arrow - -#endif // ARROW_TYPES_INTEGER_H diff --git a/cpp/src/arrow/types/json.cc b/cpp/src/arrow/types/json.cc index 168e370d51a14..fb731edd6073f 100644 --- a/cpp/src/arrow/types/json.cc +++ b/cpp/src/arrow/types/json.cc @@ -20,7 +20,6 @@ #include #include "arrow/type.h" -#include "arrow/types/string.h" #include "arrow/types/union.h" namespace arrow { diff --git a/cpp/src/arrow/types/json.h b/cpp/src/arrow/types/json.h index b67fb3807aded..9c850afac0af4 100644 --- a/cpp/src/arrow/types/json.h +++ b/cpp/src/arrow/types/json.h @@ -28,8 +28,8 @@ struct JSONScalar : public DataType { static TypePtr dense_type; static TypePtr sparse_type; - explicit JSONScalar(bool dense = true, bool nullable = true) - : DataType(LogicalType::JSON_SCALAR, nullable), + explicit JSONScalar(bool dense = true) + : DataType(Type::JSON_SCALAR), dense(dense) {} }; diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index 02991de2648e7..eb55ca868eeee 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -15,20 +15,21 @@ // specific language governing permissions and 
limitations // under the License. -#include #include #include #include #include #include +#include "gtest/gtest.h" + #include "arrow/array.h" +#include "arrow/builder.h" #include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/types/construct.h" -#include "arrow/types/integer.h" #include "arrow/types/list.h" -#include "arrow/types/string.h" +#include "arrow/types/primitive.h" #include "arrow/types/test-common.h" #include "arrow/util/status.h" @@ -39,27 +40,24 @@ using std::vector; namespace arrow { -class ArrayBuilder; - TEST(TypesTest, TestListType) { std::shared_ptr vt = std::make_shared(); ListType list_type(vt); - ASSERT_EQ(list_type.type, LogicalType::LIST); + ASSERT_EQ(list_type.type, Type::LIST); ASSERT_EQ(list_type.name(), string("list")); - ASSERT_EQ(list_type.ToString(), string("list")); + ASSERT_EQ(list_type.ToString(), string("list")); - ASSERT_EQ(list_type.value_type->type, vt->type); - ASSERT_EQ(list_type.value_type->type, vt->type); + ASSERT_EQ(list_type.value_type()->type, vt->type); + ASSERT_EQ(list_type.value_type()->type, vt->type); - std::shared_ptr st = std::make_shared(false); - std::shared_ptr lt = std::make_shared(st, false); - ASSERT_EQ(lt->ToString(), string("list not null")); + std::shared_ptr st = std::make_shared(); + std::shared_ptr lt = std::make_shared(st); + ASSERT_EQ(lt->ToString(), string("list")); - ListType lt2(lt, false); - ASSERT_EQ(lt2.ToString(), - string("list not null> not null")); + ListType lt2(lt); + ASSERT_EQ(lt2.ToString(), string("list>")); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index 69a79a77fabe0..670ee4da11675 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -19,4 +19,33 @@ namespace arrow { +bool ListArray::EqualsExact(const ListArray& other) const { + if (this == &other) return true; + if (null_count_ != other.null_count_) { + return false; + } + + bool 
equal_offsets = offset_buf_->Equals(*other.offset_buf_, + length_ + 1); + bool equal_nulls = true; + if (null_count_ > 0) { + equal_nulls = nulls_->Equals(*other.nulls_, + util::bytes_for_bits(length_)); + } + + if (!(equal_offsets && equal_nulls)) { + return false; + } + + return values()->Equals(other.values()); +} + +bool ListArray::Equals(const std::shared_ptr& arr) const { + if (this == arr.get()) return true; + if (this->type_enum() != arr->type_enum()) { + return false; + } + return EqualsExact(*static_cast(arr.get())); +} + } // namespace arrow diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 210c76a046c21..141f762458b3b 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -21,12 +21,10 @@ #include #include #include -#include #include "arrow/array.h" #include "arrow/builder.h" #include "arrow/type.h" -#include "arrow/types/integer.h" #include "arrow/types/primitive.h" #include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" @@ -38,29 +36,19 @@ class MemoryPool; class ListArray : public Array { public: - ListArray() : Array(), offset_buf_(nullptr), offsets_(nullptr) {} - ListArray(const TypePtr& type, int32_t length, std::shared_ptr offsets, const ArrayPtr& values, int32_t null_count = 0, - std::shared_ptr nulls = nullptr) { - Init(type, length, offsets, values, null_count, nulls); - } - - virtual ~ListArray() {} - - void Init(const TypePtr& type, int32_t length, std::shared_ptr offsets, - const ArrayPtr& values, - int32_t null_count = 0, - std::shared_ptr nulls = nullptr) { + std::shared_ptr nulls = nullptr) : + Array(type, length, null_count, nulls) { offset_buf_ = offsets; offsets_ = offsets == nullptr? nullptr : reinterpret_cast(offset_buf_->data()); - values_ = values; - Array::Init(type, length, null_count, nulls); } + virtual ~ListArray() {} + // Return a shared pointer in case the requestor desires to share ownership // with this array. 
const std::shared_ptr& values() const {return values_;} @@ -77,6 +65,9 @@ class ListArray : public Array { int32_t value_offset(int i) { return offsets_[i];} int32_t value_length(int i) { return offsets_[i + 1] - offsets_[i];} + bool EqualsExact(const ListArray& other) const; + bool Equals(const std::shared_ptr& arr) const override; + protected: std::shared_ptr offset_buf_; const int32_t* offsets_; @@ -137,8 +128,6 @@ class ListBuilder : public Int32Builder { template std::shared_ptr Transfer() { - auto result = std::make_shared(); - std::shared_ptr items = value_builder_->Finish(); // Add final offset if the length is non-zero @@ -146,8 +135,9 @@ class ListBuilder : public Int32Builder { raw_buffer()[length_] = items->length(); } - result->Init(type_, length_, values_, items, + auto result = std::make_shared(type_, length_, values_, items, null_count_, nulls_); + values_ = nulls_ = nullptr; capacity_ = length_ = null_count_ = 0; diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index f35a258e2cb57..7eae8cda8c488 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -15,21 +15,17 @@ // specific language governing permissions and limitations // under the License. 
-#include - #include #include #include #include -#include "arrow/array.h" +#include "gtest/gtest.h" + #include "arrow/builder.h" #include "arrow/test-util.h" #include "arrow/type.h" -#include "arrow/types/boolean.h" #include "arrow/types/construct.h" -#include "arrow/types/floating.h" -#include "arrow/types/integer.h" #include "arrow/types/primitive.h" #include "arrow/types/test-common.h" #include "arrow/util/bit-util.h" @@ -43,23 +39,17 @@ using std::vector; namespace arrow { -TEST(TypesTest, TestBytesType) { - BytesType t1(3); - - ASSERT_EQ(t1.type, LayoutEnum::BYTE); - ASSERT_EQ(t1.size, 3); -} - +class Array; #define PRIMITIVE_TEST(KLASS, ENUM, NAME) \ TEST(TypesTest, TestPrimitive_##ENUM) { \ KLASS tp; \ \ - ASSERT_EQ(tp.type, LogicalType::ENUM); \ + ASSERT_EQ(tp.type, Type::ENUM); \ ASSERT_EQ(tp.name(), string(NAME)); \ \ KLASS tp_copy = tp; \ - ASSERT_EQ(tp_copy.type, LogicalType::ENUM); \ + ASSERT_EQ(tp_copy.type, Type::ENUM); \ } PRIMITIVE_TEST(Int8Type, INT8, "int8"); @@ -109,22 +99,20 @@ class TestPrimitiveBuilder : public TestBuilder { void RandomData(int N, double pct_null = 0.1) { Attrs::draw(N, &draws_); - random_nulls(N, pct_null, &nulls_); + test::random_nulls(N, pct_null, &nulls_); } void CheckNullable() { - ArrayType expected; int size = builder_->length(); auto ex_data = std::make_shared( reinterpret_cast(draws_.data()), size * sizeof(T)); - auto ex_nulls = bytes_to_null_buffer(nulls_.data(), size); - - int32_t ex_null_count = null_count(nulls_); + auto ex_nulls = test::bytes_to_null_buffer(nulls_.data(), size); + int32_t ex_null_count = test::null_count(nulls_); - expected.Init(size, ex_data, ex_null_count, ex_nulls); + auto expected = std::make_shared(size, ex_data, ex_null_count, ex_nulls); std::shared_ptr result = std::dynamic_pointer_cast( builder_->Finish()); @@ -135,18 +123,17 @@ class TestPrimitiveBuilder : public TestBuilder { ASSERT_EQ(0, builder_->null_count()); ASSERT_EQ(nullptr, builder_->buffer()); - 
ASSERT_TRUE(result->Equals(expected)); + ASSERT_TRUE(result->EqualsExact(*expected.get())); ASSERT_EQ(ex_null_count, result->null_count()); } void CheckNonNullable() { - ArrayType expected; int size = builder_nn_->length(); auto ex_data = std::make_shared(reinterpret_cast(draws_.data()), size * sizeof(T)); - expected.Init(size, ex_data); + auto expected = std::make_shared(size, ex_data); std::shared_ptr result = std::dynamic_pointer_cast( builder_nn_->Finish()); @@ -156,7 +143,7 @@ class TestPrimitiveBuilder : public TestBuilder { ASSERT_EQ(0, builder_nn_->capacity()); ASSERT_EQ(nullptr, builder_nn_->buffer()); - ASSERT_TRUE(result->Equals(expected)); + ASSERT_TRUE(result->EqualsExact(*expected.get())); ASSERT_EQ(0, result->null_count()); } @@ -183,8 +170,8 @@ class TestPrimitiveBuilder : public TestBuilder { #define PINT_DECL(CapType, c_type, LOWER, UPPER) \ struct P##CapType { \ PTYPE_DECL(CapType, c_type); \ - static void draw(int N, vector* draws) { \ - randint(N, LOWER, UPPER, draws); \ + static void draw(int N, vector* draws) { \ + test::randint(N, LOWER, UPPER, draws); \ } \ } diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index c86260b0fc641..32b8bfa7f1bd4 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -26,16 +26,16 @@ namespace arrow { // ---------------------------------------------------------------------- // Primitive array base -void PrimitiveArray::Init(const TypePtr& type, int32_t length, +PrimitiveArray::PrimitiveArray(const TypePtr& type, int32_t length, const std::shared_ptr& data, int32_t null_count, - const std::shared_ptr& nulls) { - Array::Init(type, length, null_count, nulls); + const std::shared_ptr& nulls) : + Array(type, length, null_count, nulls) { data_ = data; raw_data_ = data == nullptr? 
nullptr : data_->data(); } -bool PrimitiveArray::Equals(const PrimitiveArray& other) const { +bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { if (this == &other) return true; if (null_count_ != other.null_count_) { return false; @@ -50,4 +50,12 @@ bool PrimitiveArray::Equals(const PrimitiveArray& other) const { } } +bool PrimitiveArray::Equals(const std::shared_ptr& arr) const { + if (this == arr.get()) return true; + if (this->type_enum() != arr->type_enum()) { + return false; + } + return EqualsExact(*static_cast(arr.get())); +} + } // namespace arrow diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index 22ab59c309a1d..e01027cf55c39 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -21,7 +21,6 @@ #include #include #include -#include #include "arrow/array.h" #include "arrow/builder.h" @@ -38,64 +37,57 @@ class MemoryPool; // Base class for fixed-size logical types class PrimitiveArray : public Array { public: - PrimitiveArray() : Array(), data_(nullptr), raw_data_(nullptr) {} - - virtual ~PrimitiveArray() {} - - void Init(const TypePtr& type, int32_t length, + PrimitiveArray(const TypePtr& type, int32_t length, const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& nulls = nullptr); + virtual ~PrimitiveArray() {} const std::shared_ptr& data() const { return data_;} - bool Equals(const PrimitiveArray& other) const; + bool EqualsExact(const PrimitiveArray& other) const; + bool Equals(const std::shared_ptr& arr) const override; protected: std::shared_ptr data_; const uint8_t* raw_data_; }; - -template -class PrimitiveArrayImpl : public PrimitiveArray { - public: - typedef typename TypeClass::c_type value_type; - - PrimitiveArrayImpl() : PrimitiveArray() {} - - virtual ~PrimitiveArrayImpl() {} - - PrimitiveArrayImpl(int32_t length, const std::shared_ptr& data, - int32_t null_count = 0, - const std::shared_ptr& nulls = nullptr) { - Init(length, data, 
null_count, nulls); - } - - void Init(int32_t length, const std::shared_ptr& data, - int32_t null_count = 0, - const std::shared_ptr& nulls = nullptr) { - TypePtr type(new TypeClass()); - PrimitiveArray::Init(type, length, data, null_count, nulls); - } - - bool Equals(const PrimitiveArrayImpl& other) const { - return PrimitiveArray::Equals(*static_cast(&other)); - } - - const value_type* raw_data() const { - return reinterpret_cast(raw_data_); - } - - value_type Value(int i) const { - return raw_data()[i]; - } - - TypeClass* exact_type() const { - return static_cast(type_); - } +#define NUMERIC_ARRAY_DECL(NAME, TypeClass, T) \ +class NAME : public PrimitiveArray { \ + public: \ + using value_type = T; \ + using PrimitiveArray::PrimitiveArray; \ + NAME(int32_t length, const std::shared_ptr& data, \ + int32_t null_count = 0, \ + const std::shared_ptr& nulls = nullptr) : \ + PrimitiveArray(std::make_shared(), length, data, \ + null_count, nulls) {} \ + \ + bool EqualsExact(const NAME& other) const { \ + return PrimitiveArray::EqualsExact( \ + *static_cast(&other)); \ + } \ + \ + const T* raw_data() const { \ + return reinterpret_cast(raw_data_); \ + } \ + \ + T Value(int i) const { \ + return raw_data()[i]; \ + } \ }; +NUMERIC_ARRAY_DECL(UInt8Array, UInt8Type, uint8_t); +NUMERIC_ARRAY_DECL(Int8Array, Int8Type, int8_t); +NUMERIC_ARRAY_DECL(UInt16Array, UInt16Type, uint16_t); +NUMERIC_ARRAY_DECL(Int16Array, Int16Type, int16_t); +NUMERIC_ARRAY_DECL(UInt32Array, UInt32Type, uint32_t); +NUMERIC_ARRAY_DECL(Int32Array, Int32Type, int32_t); +NUMERIC_ARRAY_DECL(UInt64Array, UInt64Type, uint64_t); +NUMERIC_ARRAY_DECL(Int64Array, Int64Type, int64_t); +NUMERIC_ARRAY_DECL(FloatArray, FloatType, float); +NUMERIC_ARRAY_DECL(DoubleArray, DoubleType, double); template class PrimitiveBuilder : public ArrayBuilder { @@ -202,8 +194,9 @@ class PrimitiveBuilder : public ArrayBuilder { } std::shared_ptr Finish() override { - std::shared_ptr result = std::make_shared(); - 
result->PrimitiveArray::Init(type_, length_, values_, null_count_, nulls_); + std::shared_ptr result = std::make_shared( + type_, length_, values_, null_count_, nulls_); + values_ = nulls_ = nullptr; capacity_ = length_ = null_count_ = 0; return result; @@ -222,6 +215,21 @@ class PrimitiveBuilder : public ArrayBuilder { int elsize_; }; +// Builders + +typedef PrimitiveBuilder UInt8Builder; +typedef PrimitiveBuilder UInt16Builder; +typedef PrimitiveBuilder UInt32Builder; +typedef PrimitiveBuilder UInt64Builder; + +typedef PrimitiveBuilder Int8Builder; +typedef PrimitiveBuilder Int16Builder; +typedef PrimitiveBuilder Int32Builder; +typedef PrimitiveBuilder Int64Builder; + +typedef PrimitiveBuilder FloatBuilder; +typedef PrimitiveBuilder DoubleBuilder; + } // namespace arrow #endif // ARROW_TYPES_PRIMITIVE_H diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index 6381093dcbb45..7dc3d682cdc15 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -15,21 +15,20 @@ // specific language governing permissions and limitations // under the License. 
-#include #include +#include #include #include #include +#include "gtest/gtest.h" + #include "arrow/array.h" -#include "arrow/builder.h" #include "arrow/test-util.h" #include "arrow/type.h" -#include "arrow/types/construct.h" -#include "arrow/types/integer.h" +#include "arrow/types/primitive.h" #include "arrow/types/string.h" #include "arrow/types/test-common.h" -#include "arrow/util/status.h" namespace arrow { @@ -38,14 +37,14 @@ class Buffer; TEST(TypesTest, TestCharType) { CharType t1(5); - ASSERT_EQ(t1.type, LogicalType::CHAR); + ASSERT_EQ(t1.type, Type::CHAR); ASSERT_EQ(t1.size, 5); ASSERT_EQ(t1.ToString(), std::string("char(5)")); // Test copy constructor CharType t2 = t1; - ASSERT_EQ(t2.type, LogicalType::CHAR); + ASSERT_EQ(t2.type, Type::CHAR); ASSERT_EQ(t2.size, 5); } @@ -53,22 +52,20 @@ TEST(TypesTest, TestCharType) { TEST(TypesTest, TestVarcharType) { VarcharType t1(5); - ASSERT_EQ(t1.type, LogicalType::VARCHAR); + ASSERT_EQ(t1.type, Type::VARCHAR); ASSERT_EQ(t1.size, 5); - ASSERT_EQ(t1.physical_type.size, 6); ASSERT_EQ(t1.ToString(), std::string("varchar(5)")); // Test copy constructor VarcharType t2 = t1; - ASSERT_EQ(t2.type, LogicalType::VARCHAR); + ASSERT_EQ(t2.type, Type::VARCHAR); ASSERT_EQ(t2.size, 5); - ASSERT_EQ(t2.physical_type.size, 6); } TEST(TypesTest, TestStringType) { StringType str; - ASSERT_EQ(str.type, LogicalType::STRING); + ASSERT_EQ(str.type, Type::STRING); ASSERT_EQ(str.name(), std::string("string")); } @@ -90,15 +87,16 @@ class TestStringContainer : public ::testing::Test { length_ = offsets_.size() - 1; int nchars = chars_.size(); - value_buf_ = to_buffer(chars_); + value_buf_ = test::to_buffer(chars_); values_ = ArrayPtr(new UInt8Array(nchars, value_buf_)); - offsets_buf_ = to_buffer(offsets_); + offsets_buf_ = test::to_buffer(offsets_); - nulls_buf_ = bytes_to_null_buffer(nulls_.data(), nulls_.size()); - null_count_ = null_count(nulls_); + nulls_buf_ = test::bytes_to_null_buffer(nulls_.data(), nulls_.size()); + null_count_ = 
test::null_count(nulls_); - strings_.Init(length_, offsets_buf_, values_, null_count_, nulls_buf_); + strings_ = std::make_shared(length_, offsets_buf_, values_, + null_count_, nulls_buf_); } protected: @@ -116,28 +114,28 @@ class TestStringContainer : public ::testing::Test { int length_; ArrayPtr values_; - StringArray strings_; + std::shared_ptr strings_; }; TEST_F(TestStringContainer, TestArrayBasics) { - ASSERT_EQ(length_, strings_.length()); - ASSERT_EQ(1, strings_.null_count()); + ASSERT_EQ(length_, strings_->length()); + ASSERT_EQ(1, strings_->null_count()); } TEST_F(TestStringContainer, TestType) { - TypePtr type = strings_.type(); + TypePtr type = strings_->type(); - ASSERT_EQ(LogicalType::STRING, type->type); - ASSERT_EQ(LogicalType::STRING, strings_.logical_type()); + ASSERT_EQ(Type::STRING, type->type); + ASSERT_EQ(Type::STRING, strings_->type_enum()); } TEST_F(TestStringContainer, TestListFunctions) { int pos = 0; for (size_t i = 0; i < expected_.size(); ++i) { - ASSERT_EQ(pos, strings_.value_offset(i)); - ASSERT_EQ(expected_[i].size(), strings_.value_length(i)); + ASSERT_EQ(pos, strings_->value_offset(i)); + ASSERT_EQ(expected_[i].size(), strings_->value_length(i)); pos += expected_[i].size(); } } @@ -151,9 +149,9 @@ TEST_F(TestStringContainer, TestDestructor) { TEST_F(TestStringContainer, TestGetString) { for (size_t i = 0; i < expected_.size(); ++i) { if (nulls_[i]) { - ASSERT_TRUE(strings_.IsNull(i)); + ASSERT_TRUE(strings_->IsNull(i)); } else { - ASSERT_EQ(expected_[i], strings_.GetString(i)); + ASSERT_EQ(expected_[i], strings_->GetString(i)); } } } @@ -199,7 +197,7 @@ TEST_F(TestStringBuilder, TestScalarAppend) { Done(); ASSERT_EQ(reps * N, result_->length()); - ASSERT_EQ(reps * null_count(is_null), result_->null_count()); + ASSERT_EQ(reps * test::null_count(is_null), result_->null_count()); ASSERT_EQ(reps * 6, result_->values()->length()); int32_t length; diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index 
8ccc0a9698a54..2b3fba5ce0932 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -25,25 +25,21 @@ #include "arrow/array.h" #include "arrow/type.h" -#include "arrow/types/integer.h" #include "arrow/types/list.h" +#include "arrow/types/primitive.h" #include "arrow/util/status.h" namespace arrow { -class ArrayBuilder; class Buffer; class MemoryPool; struct CharType : public DataType { int size; - BytesType physical_type; - - explicit CharType(int size, bool nullable = true) - : DataType(LogicalType::CHAR, nullable), - size(size), - physical_type(BytesType(size)) {} + explicit CharType(int size) + : DataType(Type::CHAR), + size(size) {} CharType(const CharType& other) : CharType(other.size) {} @@ -56,54 +52,36 @@ struct CharType : public DataType { struct VarcharType : public DataType { int size; - BytesType physical_type; - - explicit VarcharType(int size, bool nullable = true) - : DataType(LogicalType::VARCHAR, nullable), - size(size), - physical_type(BytesType(size + 1)) {} + explicit VarcharType(int size) + : DataType(Type::VARCHAR), + size(size) {} VarcharType(const VarcharType& other) : VarcharType(other.size) {} virtual std::string ToString() const; }; -static const LayoutPtr byte1(new BytesType(1)); -static const LayoutPtr physical_string = LayoutPtr(new ListLayoutType(byte1)); - // TODO: add a BinaryArray layer in between class StringArray : public ListArray { public: - StringArray() : ListArray(), bytes_(nullptr), raw_bytes_(nullptr) {} - - StringArray(int32_t length, const std::shared_ptr& offsets, - const ArrayPtr& values, - int32_t null_count = 0, - const std::shared_ptr& nulls = nullptr) { - Init(length, offsets, values, null_count, nulls); - } - - void Init(const TypePtr& type, int32_t length, + StringArray(const TypePtr& type, int32_t length, const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count = 0, - const std::shared_ptr& nulls = nullptr) { - ListArray::Init(type, length, offsets, values, null_count, 
nulls); - - // TODO: type validation for values array - + const std::shared_ptr& nulls = nullptr) : + ListArray(type, length, offsets, values, null_count, nulls) { // For convenience bytes_ = static_cast(values.get()); raw_bytes_ = bytes_->raw_data(); } - void Init(int32_t length, const std::shared_ptr& offsets, + StringArray(int32_t length, + const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count = 0, - const std::shared_ptr& nulls = nullptr) { - TypePtr type(new StringType()); - Init(type, length, offsets, values, null_count, nulls); - } + const std::shared_ptr& nulls = nullptr) : + StringArray(std::make_shared(), length, offsets, values, + null_count, nulls) {} // Compute the pointer t const uint8_t* GetValue(int i, int32_t* out_length) const { @@ -125,9 +103,6 @@ class StringArray : public ListArray { }; // Array builder - - - class StringBuilder : public ListBuilder { public: explicit StringBuilder(MemoryPool* pool, const TypePtr& type) : diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc index 9a4777e8b983d..d94396f42c52a 100644 --- a/cpp/src/arrow/types/struct-test.cc +++ b/cpp/src/arrow/types/struct-test.cc @@ -15,16 +15,13 @@ // specific language governing permissions and limitations // under the License. 
-#include - #include #include #include +#include "gtest/gtest.h" + #include "arrow/type.h" -#include "arrow/types/integer.h" -#include "arrow/types/string.h" -#include "arrow/types/struct.h" using std::shared_ptr; using std::string; @@ -42,13 +39,13 @@ TEST(TestStructType, Basics) { TypePtr f2_type = TypePtr(new UInt8Type()); auto f2 = std::make_shared("f2", f2_type); - vector > fields = {f0, f1, f2}; + vector> fields = {f0, f1, f2}; StructType struct_type(fields); - ASSERT_TRUE(struct_type.field(0)->Equals(f0)); - ASSERT_TRUE(struct_type.field(1)->Equals(f1)); - ASSERT_TRUE(struct_type.field(2)->Equals(f2)); + ASSERT_TRUE(struct_type.child(0)->Equals(f0)); + ASSERT_TRUE(struct_type.child(1)->Equals(f1)); + ASSERT_TRUE(struct_type.child(2)->Equals(f2)); ASSERT_EQ(struct_type.ToString(), "struct"); diff --git a/cpp/src/arrow/types/test-common.h b/cpp/src/arrow/types/test-common.h index 1744efce7d631..227aca632ef3c 100644 --- a/cpp/src/arrow/types/test-common.h +++ b/cpp/src/arrow/types/test-common.h @@ -18,11 +18,12 @@ #ifndef ARROW_TYPES_TEST_COMMON_H #define ARROW_TYPES_TEST_COMMON_H -#include #include #include #include +#include "gtest/gtest.h" + #include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/util/memory-pool.h" @@ -34,7 +35,7 @@ namespace arrow { class TestBuilder : public ::testing::Test { public: void SetUp() { - pool_ = GetDefaultMemoryPool(); + pool_ = default_memory_pool(); type_ = TypePtr(new UInt8Type()); builder_.reset(new UInt8Builder(pool_, type_)); builder_nn_.reset(new UInt8Builder(pool_, type_)); diff --git a/cpp/src/arrow/types/union.h b/cpp/src/arrow/types/union.h index 9aff780c6a392..29cda90b972dd 100644 --- a/cpp/src/arrow/types/union.h +++ b/cpp/src/arrow/types/union.h @@ -30,8 +30,8 @@ namespace arrow { class Buffer; -struct DenseUnionType : public CollectionType { - typedef CollectionType Base; +struct DenseUnionType : public CollectionType { + typedef CollectionType Base; explicit DenseUnionType(const std::vector& 
child_types) : Base() { @@ -42,8 +42,8 @@ struct DenseUnionType : public CollectionType { }; -struct SparseUnionType : public CollectionType { - typedef CollectionType Base; +struct SparseUnionType : public CollectionType { + typedef CollectionType Base; explicit SparseUnionType(const std::vector& child_types) : Base() { @@ -55,28 +55,20 @@ struct SparseUnionType : public CollectionType { class UnionArray : public Array { - public: - UnionArray() : Array() {} - protected: // The data are types encoded as int16 Buffer* types_; - std::vector > children_; + std::vector> children_; }; class DenseUnionArray : public UnionArray { - public: - DenseUnionArray() : UnionArray() {} - protected: Buffer* offset_buf_; }; class SparseUnionArray : public UnionArray { - public: - SparseUnionArray() : UnionArray() {} }; } // namespace arrow diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc index 7506ca5b5531c..220bff084fd6e 100644 --- a/cpp/src/arrow/util/bit-util-test.cc +++ b/cpp/src/arrow/util/bit-util-test.cc @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. -#include - #include "arrow/util/bit-util.h" +#include "gtest/gtest.h" + namespace arrow { TEST(UtilTests, TestNextPower2) { diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 5e7197f901222..1d2d1d5f9d7e4 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -19,7 +19,6 @@ #define ARROW_UTIL_BIT_UTIL_H #include -#include #include namespace arrow { diff --git a/cpp/src/arrow/util/buffer-test.cc b/cpp/src/arrow/util/buffer-test.cc index 9f1fd91432b4d..1d58226d84a46 100644 --- a/cpp/src/arrow/util/buffer-test.cc +++ b/cpp/src/arrow/util/buffer-test.cc @@ -15,11 +15,12 @@ // specific language governing permissions and limitations // under the License. 
-#include #include #include #include +#include "gtest/gtest.h" + #include "arrow/test-util.h" #include "arrow/util/buffer.h" #include "arrow/util/status.h" diff --git a/cpp/src/arrow/util/buffer.cc b/cpp/src/arrow/util/buffer.cc index 50f4716769d70..04cdcd75cd41a 100644 --- a/cpp/src/arrow/util/buffer.cc +++ b/cpp/src/arrow/util/buffer.cc @@ -40,7 +40,7 @@ std::shared_ptr MutableBuffer::GetImmutableView() { PoolBuffer::PoolBuffer(MemoryPool* pool) : ResizableBuffer(nullptr, 0) { if (pool == nullptr) { - pool = GetDefaultMemoryPool(); + pool = default_memory_pool(); } pool_ = pool; } diff --git a/cpp/src/arrow/util/memory-pool-test.cc b/cpp/src/arrow/util/memory-pool-test.cc index 954b5f951b558..6ef07a07ada3f 100644 --- a/cpp/src/arrow/util/memory-pool-test.cc +++ b/cpp/src/arrow/util/memory-pool-test.cc @@ -15,10 +15,11 @@ // specific language governing permissions and limitations // under the License. -#include #include #include +#include "gtest/gtest.h" + #include "arrow/test-util.h" #include "arrow/util/memory-pool.h" #include "arrow/util/status.h" @@ -26,7 +27,7 @@ namespace arrow { TEST(DefaultMemoryPool, MemoryTracking) { - MemoryPool* pool = GetDefaultMemoryPool(); + MemoryPool* pool = default_memory_pool(); uint8_t* data; ASSERT_OK(pool->Allocate(100, &data)); @@ -37,7 +38,7 @@ TEST(DefaultMemoryPool, MemoryTracking) { } TEST(DefaultMemoryPool, OOM) { - MemoryPool* pool = GetDefaultMemoryPool(); + MemoryPool* pool = default_memory_pool(); uint8_t* data; int64_t to_alloc = std::numeric_limits::max(); diff --git a/cpp/src/arrow/util/memory-pool.cc b/cpp/src/arrow/util/memory-pool.cc index 5820346e5a739..0b885e9376a62 100644 --- a/cpp/src/arrow/util/memory-pool.cc +++ b/cpp/src/arrow/util/memory-pool.cc @@ -70,9 +70,9 @@ void InternalMemoryPool::Free(uint8_t* buffer, int64_t size) { InternalMemoryPool::~InternalMemoryPool() {} -MemoryPool* GetDefaultMemoryPool() { - static InternalMemoryPool default_memory_pool; - return &default_memory_pool; +MemoryPool* 
default_memory_pool() { + static InternalMemoryPool default_memory_pool_; + return &default_memory_pool_; } } // namespace arrow diff --git a/cpp/src/arrow/util/memory-pool.h b/cpp/src/arrow/util/memory-pool.h index a7cb10dae1703..0d2478686f5a4 100644 --- a/cpp/src/arrow/util/memory-pool.h +++ b/cpp/src/arrow/util/memory-pool.h @@ -34,7 +34,7 @@ class MemoryPool { virtual int64_t bytes_allocated() const = 0; }; -MemoryPool* GetDefaultMemoryPool(); +MemoryPool* default_memory_pool(); } // namespace arrow diff --git a/cpp/src/arrow/util/status.cc b/cpp/src/arrow/util/status.cc index c6e113ebea590..43cb87e1a8c56 100644 --- a/cpp/src/arrow/util/status.cc +++ b/cpp/src/arrow/util/status.cc @@ -54,6 +54,9 @@ std::string Status::CodeAsString() const { case StatusCode::Invalid: type = "Invalid"; break; + case StatusCode::IOError: + type = "IOError"; + break; case StatusCode::NotImplemented: type = "NotImplemented"; break; diff --git a/cpp/src/arrow/util/status.h b/cpp/src/arrow/util/status.h index 47fda40db2596..b5931232dbdcb 100644 --- a/cpp/src/arrow/util/status.h +++ b/cpp/src/arrow/util/status.h @@ -63,6 +63,7 @@ enum class StatusCode: char { OutOfMemory = 1, KeyError = 2, Invalid = 3, + IOError = 4, NotImplemented = 10, }; @@ -97,12 +98,17 @@ class Status { return Status(StatusCode::Invalid, msg, -1); } + static Status IOError(const std::string& msg) { + return Status(StatusCode::IOError, msg, -1); + } + // Returns true iff the status indicates success. bool ok() const { return (state_ == NULL); } bool IsOutOfMemory() const { return code() == StatusCode::OutOfMemory; } bool IsKeyError() const { return code() == StatusCode::KeyError; } bool IsInvalid() const { return code() == StatusCode::Invalid; } + bool IsIOError() const { return code() == StatusCode::IOError; } // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. 
diff --git a/cpp/src/arrow/util/test_main.cc b/cpp/src/arrow/util/test_main.cc index 00139f36742ed..adc8466fb0be9 100644 --- a/cpp/src/arrow/util/test_main.cc +++ b/cpp/src/arrow/util/test_main.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include +#include "gtest/gtest.h" int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/cpp/thirdparty/build_thirdparty.sh b/cpp/thirdparty/build_thirdparty.sh index 294737cc50522..3d5f532b16309 100755 --- a/cpp/thirdparty/build_thirdparty.sh +++ b/cpp/thirdparty/build_thirdparty.sh @@ -17,6 +17,7 @@ else case $arg in "gtest") F_GTEST=1 ;; "gbenchmark") F_GBENCHMARK=1 ;; + "flatbuffers") F_FLATBUFFERS=1 ;; *) echo "Unknown module: $arg"; exit 1 ;; esac done @@ -78,6 +79,14 @@ if [ -n "$F_ALL" -o -n "$F_GBENCHMARK" ]; then make VERBOSE=1 install || { echo "make $GBENCHMARK_ERROR" ; exit 1; } fi +FLATBUFFERS_ERROR="failed for flatbuffers" +if [ -n "$F_ALL" -o -n "$F_FLATBUFFERS" ]; then + cd $TP_DIR/$FLATBUFFERS_BASEDIR + + CXXFLAGS=-fPIC cmake -DCMAKE_INSTALL_PREFIX:PATH=$PREFIX -DFLATBUFFERS_BUILD_TESTS=OFF . || { echo "cmake $FLATBUFFERS_ERROR" ; exit 1; } + make -j$PARALLEL + make install +fi echo "---------------------" echo "Thirdparty dependencies built and installed into $PREFIX successfully" diff --git a/cpp/thirdparty/download_thirdparty.sh b/cpp/thirdparty/download_thirdparty.sh index d22c559b3e3ba..d299afc15222b 100755 --- a/cpp/thirdparty/download_thirdparty.sh +++ b/cpp/thirdparty/download_thirdparty.sh @@ -25,3 +25,8 @@ if [ ! -d ${GBENCHMARK_BASEDIR} ]; then echo "Fetching google benchmark" download_extract_and_cleanup $GBENCHMARK_URL fi + +if [ ! 
-d ${FLATBUFFERS_BASEDIR} ]; then + echo "Fetching flatbuffers" + download_extract_and_cleanup $FLATBUFFERS_URL +fi diff --git a/cpp/thirdparty/versions.sh b/cpp/thirdparty/versions.sh index 9cfc7cd94b58c..cb455b4eadd3b 100755 --- a/cpp/thirdparty/versions.sh +++ b/cpp/thirdparty/versions.sh @@ -5,3 +5,7 @@ GTEST_BASEDIR=googletest-release-$GTEST_VERSION GBENCHMARK_VERSION=1.0.0 GBENCHMARK_URL="https://github.com/google/benchmark/archive/v${GBENCHMARK_VERSION}.tar.gz" GBENCHMARK_BASEDIR=benchmark-$GBENCHMARK_VERSION + +FLATBUFFERS_VERSION=1.3.0 +FLATBUFFERS_URL="https://github.com/google/flatbuffers/archive/v${FLATBUFFERS_VERSION}.tar.gz" +FLATBUFFERS_BASEDIR=flatbuffers-$FLATBUFFERS_VERSION diff --git a/format/Message.fbs b/format/Message.fbs new file mode 100644 index 0000000000000..3ffd20332087a --- /dev/null +++ b/format/Message.fbs @@ -0,0 +1,183 @@ +namespace apache.arrow.flatbuf; + +/// ---------------------------------------------------------------------- +/// Logical types and their metadata (if any) +/// +/// These are stored in the flatbuffer in the Type union below + +/// A Tuple in the flatbuffer metadata is the same as an Arrow Struct +/// (according to the physical memory layout). We used Tuple here as Struct is +/// a reserved word in Flatbuffers +table Tuple { +} + +table List { +} + +enum UnionMode:int { Sparse, Dense } + +table Union { + mode: UnionMode; +} + +table Bit { +} + +table Int { + bitWidth: int; // 1 to 64 + is_signed: bool; +} + +enum Precision:int {SINGLE, DOUBLE} + +table FloatingPoint { + precision: Precision; +} + +table Utf8 { +} + +table Binary { +} + +table Bool { +} + +table Decimal { + precision: int; + scale: int; +} + +table Timestamp { + timezone: string; +} + +table JSONScalar { + dense:bool=true; +} + +/// ---------------------------------------------------------------------- +/// Top-level Type value, enabling extensible type-specific metadata. 
We can +/// add new logical types to Type without breaking backwards compatibility + +union Type { + Int, + Bit, + FloatingPoint, + Binary, + Utf8, + Bool, + Decimal, + Timestamp, + List, + Tuple, + Union, + JSONScalar +} + +/// ---------------------------------------------------------------------- +/// A field represents a named column in a record / row batch or child of a +/// nested type. +/// +/// - children is only for nested Arrow arrays +/// - For primitive types, children will have length 0 +/// - nullable should default to true in general + +table Field { + // Name is not required, in i.e. a List + name: string; + nullable: bool; + type: Type; + children: [Field]; +} + +/// ---------------------------------------------------------------------- +/// A Schema describes the columns in a row batch + +table Schema { + fields: [Field]; +} + +/// ---------------------------------------------------------------------- +/// Data structures for describing a table row batch (a collection of +/// equal-length Arrow arrays) + +/// A Buffer represents a single contiguous memory segment +struct Buffer { + /// The shared memory page id where this buffer is located. Currently this is + /// not used + page: int; + + /// The relative offset into the shared memory page where the bytes for this + /// buffer starts + offset: long; + + /// The absolute length (in bytes) of the memory buffer. The memory is found + /// from offset (inclusive) to offset + length (non-inclusive). + length: long; +} + +/// Metadata about a field at some level of a nested type tree (but not +/// its children). +/// +/// For example, a List with values [[1, 2, 3], null, [4], [5, 6], null] +/// would have {length: 5, null_count: 2} for its List node, and {length: 6, +/// null_count: 0} for its Int16 node, as separate FieldNode structs +struct FieldNode { + /// The number of value slots in the Arrow array at this level of a nested + /// tree + length: int; + + /// The number of observed nulls. 
Fields with null_count == 0 may choose not + /// to write their physical null bitmap out as a materialized buffer, instead + /// setting the length of the null buffer to 0. + null_count: int; +} + +/// A data header describing the shared memory layout of a "record" or "row" +/// batch. Some systems call this a "row batch" internally and others a "record +/// batch". +table RecordBatch { + /// number of records / rows. The arrays in the batch should all have this + /// length + length: int; + + /// Nodes correspond to the pre-ordered flattened logical schema + nodes: [FieldNode]; + + /// Buffers correspond to the pre-ordered flattened buffer tree + /// + /// The number of buffers appended to this list depends on the schema. For + /// example, most primitive arrays will have 2 buffers, 1 for the null bitmap + /// and 1 for the values. For struct arrays, there will only be a single + /// buffer for the null bitmap + buffers: [Buffer]; +} + +/// ---------------------------------------------------------------------- +/// For sending dictionary encoding information. Any Field can be +/// dictionary-encoded, but in this case none of its children may be +/// dictionary-encoded. +/// +/// TODO(wesm): To be documented in more detail + +table DictionaryBatch { + id: long; + data: RecordBatch; +} + +/// ---------------------------------------------------------------------- +/// The root Message type + +/// This union enables us to easily send different message types without +/// redundant storage, and in the future we can easily add new message types. 
+union MessageHeader { + Schema, DictionaryBatch, RecordBatch +} + +table Message { + header: MessageHeader; + bodyLength: long; +} + +root_type Message; diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 8d93a156bcc3d..9a080709bebda 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -35,4 +35,6 @@ uint8, uint16, uint32, uint64, float_, double, string, list_, struct, field, - DataType, Field, Schema) + DataType, Field, Schema, schema) + +from pyarrow.array import RowBatch diff --git a/python/pyarrow/array.pxd b/python/pyarrow/array.pxd index d0d3486c032fe..de3c77419623f 100644 --- a/python/pyarrow/array.pxd +++ b/python/pyarrow/array.pxd @@ -16,7 +16,7 @@ # under the License. from pyarrow.includes.common cimport shared_ptr -from pyarrow.includes.libarrow cimport CArray, LogicalType +from pyarrow.includes.libarrow cimport CArray from pyarrow.scalar import NA diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index bceb333c94ea5..c5d40ddd7a481 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -28,6 +28,9 @@ from pyarrow.error cimport check_status cimport pyarrow.scalar as scalar from pyarrow.scalar import NA +from pyarrow.schema cimport Schema +import pyarrow.schema as schema + def total_allocated_bytes(): cdef MemoryPool* pool = pyarrow.GetMemoryPool() return pool.bytes_allocated() @@ -155,12 +158,12 @@ cdef class StringArray(Array): cdef dict _array_classes = { - LogicalType_NA: NullArray, - LogicalType_BOOL: BooleanArray, - LogicalType_INT64: Int64Array, - LogicalType_DOUBLE: DoubleArray, - LogicalType_LIST: ListArray, - LogicalType_STRING: StringArray, + Type_NA: NullArray, + Type_BOOL: BooleanArray, + Type_INT64: Int64Array, + Type_DOUBLE: DoubleArray, + Type_LIST: ListArray, + Type_STRING: StringArray, } cdef object box_arrow_array(const shared_ptr[CArray]& sp_array): @@ -190,3 +193,35 @@ def from_pylist(object list_obj, DataType type=None): raise NotImplementedError return 
box_arrow_array(sp_array) + +#---------------------------------------------------------------------- +# Table-like data structures + +cdef class RowBatch: + """ + + """ + cdef readonly: + Schema schema + int num_rows + list arrays + + def __cinit__(self, Schema schema, int num_rows, list arrays): + self.schema = schema + self.num_rows = num_rows + self.arrays = arrays + + if len(self.schema) != len(arrays): + raise ValueError('Mismatch number of data arrays and ' + 'schema fields') + + def __len__(self): + return self.num_rows + + property num_columns: + + def __get__(self): + return len(self.arrays) + + def __getitem__(self, i): + return self.arrays[i] diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index baba112833e0d..e6afcbd79b69f 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -21,31 +21,30 @@ from pyarrow.includes.common cimport * cdef extern from "arrow/api.h" namespace "arrow" nogil: - enum LogicalType" arrow::LogicalType::type": - LogicalType_NA" arrow::LogicalType::NA" + enum Type" arrow::Type::type": + Type_NA" arrow::Type::NA" - LogicalType_BOOL" arrow::LogicalType::BOOL" + Type_BOOL" arrow::Type::BOOL" - LogicalType_UINT8" arrow::LogicalType::UINT8" - LogicalType_INT8" arrow::LogicalType::INT8" - LogicalType_UINT16" arrow::LogicalType::UINT16" - LogicalType_INT16" arrow::LogicalType::INT16" - LogicalType_UINT32" arrow::LogicalType::UINT32" - LogicalType_INT32" arrow::LogicalType::INT32" - LogicalType_UINT64" arrow::LogicalType::UINT64" - LogicalType_INT64" arrow::LogicalType::INT64" + Type_UINT8" arrow::Type::UINT8" + Type_INT8" arrow::Type::INT8" + Type_UINT16" arrow::Type::UINT16" + Type_INT16" arrow::Type::INT16" + Type_UINT32" arrow::Type::UINT32" + Type_INT32" arrow::Type::INT32" + Type_UINT64" arrow::Type::UINT64" + Type_INT64" arrow::Type::INT64" - LogicalType_FLOAT" arrow::LogicalType::FLOAT" - LogicalType_DOUBLE" arrow::LogicalType::DOUBLE" + Type_FLOAT" 
arrow::Type::FLOAT" + Type_DOUBLE" arrow::Type::DOUBLE" - LogicalType_STRING" arrow::LogicalType::STRING" + Type_STRING" arrow::Type::STRING" - LogicalType_LIST" arrow::LogicalType::LIST" - LogicalType_STRUCT" arrow::LogicalType::STRUCT" + Type_LIST" arrow::Type::LIST" + Type_STRUCT" arrow::Type::STRUCT" cdef cppclass CDataType" arrow::DataType": - LogicalType type - c_bool nullable + Type type c_bool Equals(const CDataType* other) @@ -55,8 +54,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: int64_t bytes_allocated() cdef cppclass CListType" arrow::ListType"(CDataType): - CListType(const shared_ptr[CDataType]& value_type, - c_bool nullable) + CListType(const shared_ptr[CDataType]& value_type) cdef cppclass CStringType" arrow::StringType"(CDataType): pass @@ -65,21 +63,26 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_string name shared_ptr[CDataType] type - CField(const c_string& name, const shared_ptr[CDataType]& type) + c_bool nullable + + CField(const c_string& name, const shared_ptr[CDataType]& type, + c_bool nullable) cdef cppclass CStructType" arrow::StructType"(CDataType): - CStructType(const vector[shared_ptr[CField]]& fields, - c_bool nullable) + CStructType(const vector[shared_ptr[CField]]& fields) cdef cppclass CSchema" arrow::Schema": - CSchema(const shared_ptr[CField]& fields) + CSchema(const vector[shared_ptr[CField]]& fields) + const shared_ptr[CField]& field(int i) + int num_fields() + c_string ToString() cdef cppclass CArray" arrow::Array": const shared_ptr[CDataType]& type() int32_t length() int32_t null_count() - LogicalType logical_type() + Type type_enum() c_bool IsNull(int i) @@ -122,3 +125,57 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CStringArray" arrow::StringArray"(CListArray): c_string GetString(int i) + + +cdef extern from "arrow/api.h" namespace "arrow" nogil: + # We can later add more of the common status factory methods as needed + cdef CStatus CStatus_OK "Status::OK"() + + cdef 
cppclass CStatus "arrow::Status": + CStatus() + + c_string ToString() + + c_bool ok() + c_bool IsOutOfMemory() + c_bool IsKeyError() + c_bool IsNotImplemented() + c_bool IsInvalid() + + cdef cppclass Buffer: + uint8_t* data() + int64_t size() + + +cdef extern from "arrow/ipc/metadata.h" namespace "arrow::ipc" nogil: + cdef cppclass SchemaMessage: + int num_fields() + CStatus GetField(int i, shared_ptr[CField]* out) + CStatus GetSchema(shared_ptr[CSchema]* out) + + cdef cppclass FieldMetadata: + pass + + cdef cppclass BufferMetadata: + pass + + cdef cppclass RecordBatchMessage: + pass + + cdef cppclass DictionaryBatchMessage: + pass + + enum MessageType" arrow::ipc::Message::Type": + MessageType_SCHEMA" arrow::ipc::Message::SCHEMA" + MessageType_RECORD_BATCH" arrow::ipc::Message::RECORD_BATCH" + MessageType_DICTIONARY_BATCH" arrow::ipc::Message::DICTIONARY_BATCH" + + cdef cppclass Message: + CStatus Open(const shared_ptr[Buffer]& buf, + shared_ptr[Message]* out) + int64_t body_length() + MessageType type() + + shared_ptr[SchemaMessage] GetSchema() + shared_ptr[RecordBatchMessage] GetRecordBatch() + shared_ptr[DictionaryBatchMessage] GetDictionaryBatch() diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd index 9a0c004b7684a..eedfc85446810 100644 --- a/python/pyarrow/includes/pyarrow.pxd +++ b/python/pyarrow/includes/pyarrow.pxd @@ -18,8 +18,7 @@ # distutils: language = c++ from pyarrow.includes.common cimport * -from pyarrow.includes.libarrow cimport (CArray, CDataType, LogicalType, - MemoryPool) +from pyarrow.includes.libarrow cimport CArray, CDataType, Type, MemoryPool cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: # We can later add more of the common status factory methods as needed @@ -39,7 +38,7 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: c_bool IsNotImplemented() c_bool IsArrowError() - shared_ptr[CDataType] GetPrimitiveType(LogicalType type, c_bool nullable) + shared_ptr[CDataType] 
GetPrimitiveType(Type type) Status ConvertPySequence(object obj, shared_ptr[CArray]* out) MemoryPool* GetMemoryPool() diff --git a/python/pyarrow/scalar.pyx b/python/pyarrow/scalar.pyx index 261a38967c495..04f013d6ca706 100644 --- a/python/pyarrow/scalar.pyx +++ b/python/pyarrow/scalar.pyx @@ -172,18 +172,18 @@ cdef class ListValue(ArrayValue): cdef dict _scalar_classes = { - LogicalType_UINT8: Int8Value, - LogicalType_UINT16: Int16Value, - LogicalType_UINT32: Int32Value, - LogicalType_UINT64: Int64Value, - LogicalType_INT8: Int8Value, - LogicalType_INT16: Int16Value, - LogicalType_INT32: Int32Value, - LogicalType_INT64: Int64Value, - LogicalType_FLOAT: FloatValue, - LogicalType_DOUBLE: DoubleValue, - LogicalType_LIST: ListValue, - LogicalType_STRING: StringValue + Type_UINT8: Int8Value, + Type_UINT16: Int16Value, + Type_UINT32: Int32Value, + Type_UINT64: Int64Value, + Type_INT8: Int8Value, + Type_INT16: Int16Value, + Type_INT32: Int32Value, + Type_INT64: Int64Value, + Type_FLOAT: FloatValue, + Type_DOUBLE: DoubleValue, + Type_LIST: ListValue, + Type_STRING: StringValue } cdef object box_arrow_scalar(DataType type, diff --git a/python/pyarrow/schema.pxd b/python/pyarrow/schema.pxd index 07b9bd04da20e..61458b765c742 100644 --- a/python/pyarrow/schema.pxd +++ b/python/pyarrow/schema.pxd @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-from pyarrow.includes.common cimport shared_ptr +from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport CDataType, CField, CSchema cdef class DataType: @@ -33,9 +33,13 @@ cdef class Field: cdef readonly: DataType type + cdef init(self, const shared_ptr[CField]& field) + cdef class Schema: cdef: shared_ptr[CSchema] sp_schema CSchema* schema + cdef init(self, const vector[shared_ptr[CField]]& fields) + cdef DataType box_data_type(const shared_ptr[CDataType]& type) diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx index ea878720d5bb8..b3bf02aad76bb 100644 --- a/python/pyarrow/schema.pyx +++ b/python/pyarrow/schema.pyx @@ -54,94 +54,153 @@ cdef class DataType: cdef class Field: - def __cinit__(self, object name, DataType type): - self.type = type - self.sp_field.reset(new CField(tobytes(name), type.sp_type)) - self.field = self.sp_field.get() + def __cinit__(self): + pass + + cdef init(self, const shared_ptr[CField]& field): + self.sp_field = field + self.field = field.get() + + @classmethod + def from_py(cls, object name, DataType type, bint nullable=True): + cdef Field result = Field() + result.type = type + result.sp_field.reset(new CField(tobytes(name), type.sp_type, + nullable)) + result.field = result.sp_field.get() + + return result def __repr__(self): return 'Field({0!r}, type={1})'.format(self.name, str(self.type)) + property nullable: + + def __get__(self): + return self.field.nullable + property name: def __get__(self): return frombytes(self.field.name) +cdef class Schema: + + def __cinit__(self): + pass + + def __len__(self): + return self.schema.num_fields() + + def __getitem__(self, i): + if i < 0 or i >= len(self): + raise IndexError("{0} is out of bounds".format(i)) + + cdef Field result = Field() + result.init(self.schema.field(i)) + result.type = box_data_type(result.field.type) + + return result + + cdef init(self, const vector[shared_ptr[CField]]& fields): + self.schema = new CSchema(fields) + 
self.sp_schema.reset(self.schema) + + @classmethod + def from_fields(cls, fields): + cdef: + Schema result + Field field + vector[shared_ptr[CField]] c_fields + + c_fields.resize(len(fields)) + + for i in range(len(fields)): + field = fields[i] + c_fields[i] = field.sp_field + + result = Schema() + result.init(c_fields) + + return result + + def __repr__(self): + return frombytes(self.schema.ToString()) + cdef dict _type_cache = {} -cdef DataType primitive_type(LogicalType type, bint nullable=True): - if (type, nullable) in _type_cache: - return _type_cache[type, nullable] +cdef DataType primitive_type(Type type): + if type in _type_cache: + return _type_cache[type] cdef DataType out = DataType() - out.init(pyarrow.GetPrimitiveType(type, nullable)) + out.init(pyarrow.GetPrimitiveType(type)) - _type_cache[type, nullable] = out + _type_cache[type] = out return out #------------------------------------------------------------ # Type factory functions -def field(name, type): - return Field(name, type) +def field(name, type, bint nullable=True): + return Field.from_py(name, type, nullable) cdef set PRIMITIVE_TYPES = set([ - LogicalType_NA, LogicalType_BOOL, - LogicalType_UINT8, LogicalType_INT8, - LogicalType_UINT16, LogicalType_INT16, - LogicalType_UINT32, LogicalType_INT32, - LogicalType_UINT64, LogicalType_INT64, - LogicalType_FLOAT, LogicalType_DOUBLE]) + Type_NA, Type_BOOL, + Type_UINT8, Type_INT8, + Type_UINT16, Type_INT16, + Type_UINT32, Type_INT32, + Type_UINT64, Type_INT64, + Type_FLOAT, Type_DOUBLE]) def null(): - return primitive_type(LogicalType_NA) + return primitive_type(Type_NA) -def bool_(c_bool nullable=True): - return primitive_type(LogicalType_BOOL, nullable) +def bool_(): + return primitive_type(Type_BOOL) -def uint8(c_bool nullable=True): - return primitive_type(LogicalType_UINT8, nullable) +def uint8(): + return primitive_type(Type_UINT8) -def int8(c_bool nullable=True): - return primitive_type(LogicalType_INT8, nullable) +def int8(): + return 
primitive_type(Type_INT8) -def uint16(c_bool nullable=True): - return primitive_type(LogicalType_UINT16, nullable) +def uint16(): + return primitive_type(Type_UINT16) -def int16(c_bool nullable=True): - return primitive_type(LogicalType_INT16, nullable) +def int16(): + return primitive_type(Type_INT16) -def uint32(c_bool nullable=True): - return primitive_type(LogicalType_UINT32, nullable) +def uint32(): + return primitive_type(Type_UINT32) -def int32(c_bool nullable=True): - return primitive_type(LogicalType_INT32, nullable) +def int32(): + return primitive_type(Type_INT32) -def uint64(c_bool nullable=True): - return primitive_type(LogicalType_UINT64, nullable) +def uint64(): + return primitive_type(Type_UINT64) -def int64(c_bool nullable=True): - return primitive_type(LogicalType_INT64, nullable) +def int64(): + return primitive_type(Type_INT64) -def float_(c_bool nullable=True): - return primitive_type(LogicalType_FLOAT, nullable) +def float_(): + return primitive_type(Type_FLOAT) -def double(c_bool nullable=True): - return primitive_type(LogicalType_DOUBLE, nullable) +def double(): + return primitive_type(Type_DOUBLE) -def string(c_bool nullable=True): +def string(): """ UTF8 string """ - return primitive_type(LogicalType_STRING, nullable) + return primitive_type(Type_STRING) -def list_(DataType value_type, c_bool nullable=True): +def list_(DataType value_type): cdef DataType out = DataType() - out.init(shared_ptr[CDataType]( - new CListType(value_type.sp_type, nullable))) + out.init(shared_ptr[CDataType](new CListType(value_type.sp_type))) return out -def struct(fields, c_bool nullable=True): +def struct(fields): """ """ @@ -154,9 +213,11 @@ def struct(fields, c_bool nullable=True): c_fields.push_back(field.sp_field) out.init(shared_ptr[CDataType]( - new CStructType(c_fields, nullable))) + new CStructType(c_fields))) return out +def schema(fields): + return Schema.from_fields(fields) cdef DataType box_data_type(const shared_ptr[CDataType]& type): cdef DataType 
out = DataType() diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index 0235526198f35..2894ea8f84451 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -18,6 +18,8 @@ from pyarrow.compat import unittest import pyarrow as arrow +A = arrow + class TestTypes(unittest.TestCase): @@ -28,15 +30,12 @@ def test_integers(self): for name in dtypes: factory = getattr(arrow, name) t = factory() - t_required = factory(False) - assert str(t) == name - assert str(t_required) == '{0} not null'.format(name) def test_list(self): value_type = arrow.int32() list_type = arrow.list_(value_type) - assert str(list_type) == 'list' + assert str(list_type) == 'list' def test_string(self): t = arrow.string() @@ -47,5 +46,26 @@ def test_field(self): f = arrow.field('foo', t) assert f.name == 'foo' + assert f.nullable assert f.type is t assert repr(f) == "Field('foo', type=string)" + + f = arrow.field('foo', t, False) + assert not f.nullable + + def test_schema(self): + fields = [ + A.field('foo', A.int32()), + A.field('bar', A.string()), + A.field('baz', A.list_(A.int8())) + ] + sch = A.schema(fields) + + assert len(sch) == 3 + assert sch[0].name == 'foo' + assert sch[0].type == fields[0].type + + assert repr(sch) == """\ +foo: int32 +bar: string +baz: list""" diff --git a/cpp/src/arrow/table/CMakeLists.txt b/python/pyarrow/tests/test_table.py similarity index 58% rename from cpp/src/arrow/table/CMakeLists.txt rename to python/pyarrow/tests/test_table.py index d9f00e74a37db..2e24445bd0c22 100644 --- a/cpp/src/arrow/table/CMakeLists.txt +++ b/python/pyarrow/tests/test_table.py @@ -15,19 +15,26 @@ # specific language governing permissions and limitations # under the License. 
-####################################### -# arrow_table -####################################### - -# Headers: top level -install(FILES - column.h - schema.h - table.h - DESTINATION include/arrow/table) - -ADD_ARROW_TEST(column-test) -ADD_ARROW_TEST(schema-test) -ADD_ARROW_TEST(table-test) - -ADD_ARROW_BENCHMARK(column-benchmark) +from pyarrow.compat import unittest +import pyarrow as arrow + +A = arrow + + +class TestRowBatch(unittest.TestCase): + + def test_basics(self): + data = [ + A.from_pylist(range(5)), + A.from_pylist([-10, -5, 0, 5, 10]) + ] + num_rows = 5 + + descr = A.schema([A.field('c0', data[0].type), + A.field('c1', data[1].type)]) + + batch = A.RowBatch(descr, num_rows, data) + + assert len(batch) == num_rows + assert batch.num_rows == num_rows + assert batch.num_columns == len(data) diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index bb7905236c59c..acb13acecaf33 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -27,7 +27,7 @@ using arrow::ArrayBuilder; using arrow::DataType; -using arrow::LogicalType; +using arrow::Type; namespace pyarrow { @@ -356,17 +356,17 @@ class ListConverter : public TypedConverter { // Dynamic constructor for sequence converters std::shared_ptr GetConverter(const std::shared_ptr& type) { switch (type->type) { - case LogicalType::BOOL: + case Type::BOOL: return std::make_shared(); - case LogicalType::INT64: + case Type::INT64: return std::make_shared(); - case LogicalType::DOUBLE: + case Type::DOUBLE: return std::make_shared(); - case LogicalType::STRING: + case Type::STRING: return std::make_shared(); - case LogicalType::LIST: + case Type::LIST: return std::make_shared(); - case LogicalType::STRUCT: + case Type::STRUCT: default: return nullptr; break; @@ -378,7 +378,7 @@ Status ListConverter::Init(const std::shared_ptr& builder) { typed_builder_ = static_cast(builder.get()); value_converter_ = GetConverter(static_cast( - 
builder->type().get())->value_type); + builder->type().get())->value_type()); if (value_converter_ == nullptr) { return Status::NotImplemented("value type not implemented"); } @@ -393,8 +393,8 @@ Status ConvertPySequence(PyObject* obj, std::shared_ptr* out) { PY_RETURN_NOT_OK(InferArrowType(obj, &size, &type)); // Handle NA / NullType case - if (type->type == LogicalType::NA) { - out->reset(new arrow::Array(type, size, size)); + if (type->type == Type::NA) { + out->reset(new arrow::NullArray(type, size)); return Status::OK(); } diff --git a/python/src/pyarrow/helpers.cc b/python/src/pyarrow/helpers.cc index 0921fc4994599..08003aabf9f22 100644 --- a/python/src/pyarrow/helpers.cc +++ b/python/src/pyarrow/helpers.cc @@ -37,19 +37,14 @@ const std::shared_ptr FLOAT = std::make_shared(); const std::shared_ptr DOUBLE = std::make_shared(); const std::shared_ptr STRING = std::make_shared(); -#define GET_PRIMITIVE_TYPE(NAME, Type) \ - case LogicalType::NAME: \ - if (nullable) { \ - return NAME; \ - } else { \ - return std::make_shared(nullable); \ - } \ +#define GET_PRIMITIVE_TYPE(NAME, Class) \ + case Type::NAME: \ + return NAME; \ break; -std::shared_ptr GetPrimitiveType(LogicalType::type type, - bool nullable) { +std::shared_ptr GetPrimitiveType(Type::type type) { switch (type) { - case LogicalType::NA: + case Type::NA: return NA; GET_PRIMITIVE_TYPE(UINT8, UInt8Type); GET_PRIMITIVE_TYPE(INT8, Int8Type); diff --git a/python/src/pyarrow/helpers.h b/python/src/pyarrow/helpers.h index e41568d5881d4..ec42bb31d3b9b 100644 --- a/python/src/pyarrow/helpers.h +++ b/python/src/pyarrow/helpers.h @@ -24,7 +24,7 @@ namespace pyarrow { using arrow::DataType; -using arrow::LogicalType; +using arrow::Type; extern const std::shared_ptr NA; extern const std::shared_ptr BOOL; @@ -40,8 +40,7 @@ extern const std::shared_ptr FLOAT; extern const std::shared_ptr DOUBLE; extern const std::shared_ptr STRING; -std::shared_ptr GetPrimitiveType(LogicalType::type type, - bool nullable); 
+std::shared_ptr GetPrimitiveType(Type::type type); } // namespace pyarrow