From 5704d8d644154f21597125d14d20c9e9ae4379d0 Mon Sep 17 00:00:00 2001
From: Kengo Seki
Date: Sun, 9 Dec 2018 11:30:20 -0600
Subject: [PATCH 01/45] ARROW-3940: [Python/Documentation] Add required
 packages to the development instructions

I mistakenly closed #3102, so I'm submitting this revised PR.
@wesm would you take a look at this?

Author: Kengo Seki

Closes #3126 from sekikn/ARROW-3940-2 and squashes the following commits:

15e369eb0 ARROW-3940: Add required packages to the development instructions
---
 docs/source/python/development.rst | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/docs/source/python/development.rst b/docs/source/python/development.rst
index e86a0be0d04a4..4258feef79f44 100644
--- a/docs/source/python/development.rst
+++ b/docs/source/python/development.rst
@@ -125,9 +125,13 @@ dependencies will be automatically built by Arrow's third-party toolchain.
          libboost-filesystem-dev \
          libboost-system-dev \
          libboost-regex-dev \
+         python-dev \
+         autoconf \
          flex \
          bison
 
+If you are building Arrow for Python 3, install ``python3-dev`` instead of ``python-dev``.
+
 On Arch Linux, you can get these dependencies via pacman.
 
 .. code-block:: shell
@@ -185,6 +189,12 @@ Now build and install the Arrow C++ libraries:
 If you don't want to build and install the Plasma in-memory object store,
 you can omit the ``-DARROW_PLASMA=on`` flag.
 
+Also, if multiple versions of Python are installed in your environment,
+you may have to pass additional parameters to CMake so that
+it can find the right executable, headers, and libraries.
+For example, specifying ``-DPYTHON_EXECUTABLE=$VIRTUAL_ENV/bin/python``
+(assuming that you are working in a virtualenv) lets CMake choose
+the Python executable you are using.
 
 .. note::
@@ -227,6 +237,7 @@ libraries), one can set ``--bundle-arrow-cpp``:
 
 .. code-block:: shell
 
+   pip install wheel  # if not installed
    python setup.py build_ext --build-type=$ARROW_BUILD_TYPE \
           --with-parquet --with-plasma --bundle-arrow-cpp bdist_wheel

From 1dee3f4e794ead69490073cb0e7d99cb6cf1169f Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Sun, 9 Dec 2018 13:28:34 -0600
Subject: [PATCH 02/45] ARROW-3303: [C++] API for creating arrays from simple
 JSON string

Author: Antoine Pitrou

Closes #3084 from pitrou/ARROW-3303-json-values and squashes the following commits:

1b9f4b510 ARROW-3303: API for creating arrays from simple JSON string
---
 cpp/CMakeLists.txt                        |   5 +
 cpp/src/arrow/CMakeLists.txt              |   1 +
 cpp/src/arrow/array-dict-test.cc          | 281 +++-------
 cpp/src/arrow/array-test.cc               |  17 +
 cpp/src/arrow/builder.h                   |   3 +
 cpp/src/arrow/ipc/CMakeLists.txt          |   2 +
 cpp/src/arrow/ipc/ipc-json-simple-test.cc | 594 ++++++++++++++++++++++
 cpp/src/arrow/ipc/json-internal.h         |   1 +
 cpp/src/arrow/ipc/json-simple.cc          | 508 ++++++++++++++++++
 cpp/src/arrow/ipc/json-simple.h           |  56 ++
 cpp/src/arrow/pretty_print-test.cc        |  76 +--
 cpp/src/arrow/test-util.cc                |  13 +-
 cpp/src/arrow/test-util.h                 |  12 +
 cpp/src/arrow/util/decimal.cc             |  14 +-
 cpp/src/arrow/util/decimal.h              |   5 +
 15 files changed, 1299 insertions(+), 289 deletions(-)
 create mode 100644 cpp/src/arrow/ipc/ipc-json-simple-test.cc
 create mode 100644 cpp/src/arrow/ipc/json-simple.cc
 create mode 100644 cpp/src/arrow/ipc/json-simple.h

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 6deb339f4c2f0..68ac84e42dd6a 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -416,6 +416,11 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS)
   set(ARROW_WITH_ZSTD ON)
 endif()
 
+if(ARROW_BUILD_TESTS)
+  # JSON parsing of arrays is required for Arrow unit tests
+  set(ARROW_IPC ON)
+endif()
+
 if(PARQUET_BUILD_EXAMPLES OR PARQUET_BUILD_EXECUTABLES)
   set(ARROW_PARQUET ON)
   set(ARROW_BUILD_STATIC ON)

diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 6858f3c4c4fbe..8e932680de034 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -142,6 +142,7 @@ if (ARROW_IPC)
     ipc/feather.cc
     ipc/json.cc
     ipc/json-internal.cc
+    ipc/json-simple.cc
     ipc/message.cc
     ipc/metadata-internal.cc
     ipc/reader.cc

diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc
index 4c8dcc067b8c5..cc471a3e54066 100644
--- a/cpp/src/arrow/array-dict-test.cc
+++ b/cpp/src/arrow/array-dict-test.cc
@@ -60,54 +60,31 @@ TYPED_TEST(TestDictionaryBuilder, Basic) {
   ASSERT_OK(builder.Finish(&result));
 
   // Build expected data
-  NumericBuilder<TypeParam> dict_builder;
-  ASSERT_OK(dict_builder.Append(static_cast<typename TypeParam::c_type>(1)));
-  ASSERT_OK(dict_builder.Append(static_cast<typename TypeParam::c_type>(2)));
-  std::shared_ptr<Array> dict_array;
-  ASSERT_OK(dict_builder.Finish(&dict_array));
-  auto dtype = std::make_shared<DictionaryType>(int8(), dict_array);
+  auto dict_array = ArrayFromJSON(std::make_shared<TypeParam>(), "[1, 2]");
+  auto dict_type = std::make_shared<DictionaryType>(int8(), dict_array);
 
-  Int8Builder int_builder;
-  ASSERT_OK(int_builder.Append(0));
-  ASSERT_OK(int_builder.Append(1));
-  ASSERT_OK(int_builder.Append(0));
-  std::shared_ptr<Array> int_array;
-  ASSERT_OK(int_builder.Finish(&int_array));
+  auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]");
+  DictionaryArray expected(dict_type, int_array);
 
-  DictionaryArray expected(dtype, int_array);
   ASSERT_TRUE(expected.Equals(result));
 }
 
 TYPED_TEST(TestDictionaryBuilder, ArrayConversion) {
-  NumericBuilder<TypeParam> builder;
-  // DictionaryBuilder<TypeParam> builder;
-  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
-  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
-
ASSERT_OK(builder.Append(static_cast(1))); + auto type = std::make_shared(); - std::shared_ptr intermediate_result; - ASSERT_OK(builder.Finish(&intermediate_result)); + auto intermediate_result = ArrayFromJSON(type, "[1, 2, 1]"); DictionaryBuilder dictionary_builder(default_memory_pool()); ASSERT_OK(dictionary_builder.AppendArray(*intermediate_result)); std::shared_ptr result; ASSERT_OK(dictionary_builder.Finish(&result)); // Build expected data - NumericBuilder dict_builder; - ASSERT_OK(dict_builder.Append(static_cast(1))); - ASSERT_OK(dict_builder.Append(static_cast(2))); - std::shared_ptr dict_array; - ASSERT_OK(dict_builder.Finish(&dict_array)); - auto dtype = std::make_shared(int8(), dict_array); + auto dict_array = ArrayFromJSON(type, "[1, 2]"); + auto dict_type = std::make_shared(int8(), dict_array); - Int8Builder int_builder; - ASSERT_OK(int_builder.Append(0)); - ASSERT_OK(int_builder.Append(1)); - ASSERT_OK(int_builder.Append(0)); - std::shared_ptr int_array; - ASSERT_OK(int_builder.Finish(&int_array)); + auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]"); + DictionaryArray expected(dict_type, int_array); - DictionaryArray expected(dtype, int_array); ASSERT_TRUE(expected.Equals(result)); } @@ -150,120 +127,74 @@ TYPED_TEST(TestDictionaryBuilder, DoubleTableSize) { } TYPED_TEST(TestDictionaryBuilder, DeltaDictionary) { + using c_type = typename TypeParam::c_type; + auto type = std::make_shared(); + DictionaryBuilder builder(default_memory_pool()); - ASSERT_OK(builder.Append(static_cast(1))); - ASSERT_OK(builder.Append(static_cast(2))); - ASSERT_OK(builder.Append(static_cast(1))); - ASSERT_OK(builder.Append(static_cast(2))); + ASSERT_OK(builder.Append(static_cast(1))); + ASSERT_OK(builder.Append(static_cast(2))); + ASSERT_OK(builder.Append(static_cast(1))); + ASSERT_OK(builder.Append(static_cast(2))); std::shared_ptr result; FinishAndCheckPadding(&builder, &result); // Build expected data for the initial dictionary - NumericBuilder dict_builder1; - ASSERT_OK(dict_builder1.Append(static_cast(1))); - ASSERT_OK(dict_builder1.Append(static_cast(2))); - std::shared_ptr dict_array1; - ASSERT_OK(dict_builder1.Finish(&dict_array1)); - auto dtype1 = std::make_shared(int8(), dict_array1); - - Int8Builder int_builder1; - ASSERT_OK(int_builder1.Append(0)); - ASSERT_OK(int_builder1.Append(1)); - ASSERT_OK(int_builder1.Append(0)); - ASSERT_OK(int_builder1.Append(1)); - std::shared_ptr int_array1; - ASSERT_OK(int_builder1.Finish(&int_array1)); + auto dict_type1 = dictionary(int8(), ArrayFromJSON(type, "[1, 2]")); + DictionaryArray expected(dict_type1, ArrayFromJSON(int8(), "[0, 1, 0, 1]")); - DictionaryArray expected(dtype1, int_array1); ASSERT_TRUE(expected.Equals(result)); // extend the dictionary builder with new data - ASSERT_OK(builder.Append(static_cast(2))); - ASSERT_OK(builder.Append(static_cast(3))); - ASSERT_OK(builder.Append(static_cast(3))); - ASSERT_OK(builder.Append(static_cast(1))); - ASSERT_OK(builder.Append(static_cast(3))); + ASSERT_OK(builder.Append(static_cast(2))); + ASSERT_OK(builder.Append(static_cast(3))); + ASSERT_OK(builder.Append(static_cast(3))); + ASSERT_OK(builder.Append(static_cast(1))); + ASSERT_OK(builder.Append(static_cast(3))); std::shared_ptr result_delta; ASSERT_OK(builder.Finish(&result_delta)); // Build expected data for the delta dictionary - NumericBuilder dict_builder2; - ASSERT_OK(dict_builder2.Append(static_cast(3))); - std::shared_ptr dict_array2; - ASSERT_OK(dict_builder2.Finish(&dict_array2)); - auto dtype2 = std::make_shared(int8(), dict_array2); - 
- Int8Builder int_builder2; - ASSERT_OK(int_builder2.Append(1)); - ASSERT_OK(int_builder2.Append(2)); - ASSERT_OK(int_builder2.Append(2)); - ASSERT_OK(int_builder2.Append(0)); - ASSERT_OK(int_builder2.Append(2)); - std::shared_ptr int_array2; - ASSERT_OK(int_builder2.Finish(&int_array2)); + auto dict_type2 = dictionary(int8(), ArrayFromJSON(type, "[3]")); + DictionaryArray expected_delta(dict_type2, ArrayFromJSON(int8(), "[1, 2, 2, 0, 2]")); - DictionaryArray expected_delta(dtype2, int_array2); ASSERT_TRUE(expected_delta.Equals(result_delta)); } TYPED_TEST(TestDictionaryBuilder, DoubleDeltaDictionary) { + using c_type = typename TypeParam::c_type; + auto type = std::make_shared(); + DictionaryBuilder builder(default_memory_pool()); - ASSERT_OK(builder.Append(static_cast(1))); - ASSERT_OK(builder.Append(static_cast(2))); - ASSERT_OK(builder.Append(static_cast(1))); - ASSERT_OK(builder.Append(static_cast(2))); + ASSERT_OK(builder.Append(static_cast(1))); + ASSERT_OK(builder.Append(static_cast(2))); + ASSERT_OK(builder.Append(static_cast(1))); + ASSERT_OK(builder.Append(static_cast(2))); std::shared_ptr result; FinishAndCheckPadding(&builder, &result); // Build expected data for the initial dictionary - NumericBuilder dict_builder1; - ASSERT_OK(dict_builder1.Append(static_cast(1))); - ASSERT_OK(dict_builder1.Append(static_cast(2))); - std::shared_ptr dict_array1; - ASSERT_OK(dict_builder1.Finish(&dict_array1)); - auto dtype1 = std::make_shared(int8(), dict_array1); - - Int8Builder int_builder1; - ASSERT_OK(int_builder1.Append(0)); - ASSERT_OK(int_builder1.Append(1)); - ASSERT_OK(int_builder1.Append(0)); - ASSERT_OK(int_builder1.Append(1)); - std::shared_ptr int_array1; - ASSERT_OK(int_builder1.Finish(&int_array1)); + auto dict_type1 = dictionary(int8(), ArrayFromJSON(type, "[1, 2]")); + DictionaryArray expected(dict_type1, ArrayFromJSON(int8(), "[0, 1, 0, 1]")); - DictionaryArray expected(dtype1, int_array1); ASSERT_TRUE(expected.Equals(result)); // extend the dictionary builder with new data - ASSERT_OK(builder.Append(static_cast(2))); - ASSERT_OK(builder.Append(static_cast(3))); - ASSERT_OK(builder.Append(static_cast(3))); - ASSERT_OK(builder.Append(static_cast(1))); - ASSERT_OK(builder.Append(static_cast(3))); + ASSERT_OK(builder.Append(static_cast(2))); + ASSERT_OK(builder.Append(static_cast(3))); + ASSERT_OK(builder.Append(static_cast(3))); + ASSERT_OK(builder.Append(static_cast(1))); + ASSERT_OK(builder.Append(static_cast(3))); std::shared_ptr result_delta1; ASSERT_OK(builder.Finish(&result_delta1)); // Build expected data for the delta dictionary - NumericBuilder dict_builder2; - ASSERT_OK(dict_builder2.Append(static_cast(3))); - std::shared_ptr dict_array2; - ASSERT_OK(dict_builder2.Finish(&dict_array2)); - auto dtype2 = std::make_shared(int8(), dict_array2); - - Int8Builder int_builder2; - ASSERT_OK(int_builder2.Append(1)); - ASSERT_OK(int_builder2.Append(2)); - ASSERT_OK(int_builder2.Append(2)); - ASSERT_OK(int_builder2.Append(0)); - ASSERT_OK(int_builder2.Append(2)); - std::shared_ptr int_array2; - ASSERT_OK(int_builder2.Finish(&int_array2)); + auto dict_type2 = dictionary(int8(), ArrayFromJSON(type, "[3]")); + DictionaryArray expected_delta1(dict_type2, ArrayFromJSON(int8(), "[1, 2, 2, 0, 2]")); - DictionaryArray expected_delta1(dtype2, int_array2); ASSERT_TRUE(expected_delta1.Equals(result_delta1)); // extend the dictionary builder with new data again @@ -277,23 +208,9 @@ TYPED_TEST(TestDictionaryBuilder, DoubleDeltaDictionary) { ASSERT_OK(builder.Finish(&result_delta2)); // 
Build expected data for the delta dictionary again - NumericBuilder dict_builder3; - ASSERT_OK(dict_builder3.Append(static_cast(4))); - ASSERT_OK(dict_builder3.Append(static_cast(5))); - std::shared_ptr dict_array3; - ASSERT_OK(dict_builder3.Finish(&dict_array3)); - auto dtype3 = std::make_shared(int8(), dict_array3); - - Int8Builder int_builder3; - ASSERT_OK(int_builder3.Append(0)); - ASSERT_OK(int_builder3.Append(1)); - ASSERT_OK(int_builder3.Append(2)); - ASSERT_OK(int_builder3.Append(3)); - ASSERT_OK(int_builder3.Append(4)); - std::shared_ptr int_array3; - ASSERT_OK(int_builder3.Finish(&int_array3)); + auto dict_type3 = dictionary(int8(), ArrayFromJSON(type, "[4, 5]")); + DictionaryArray expected_delta2(dict_type3, ArrayFromJSON(int8(), "[0, 1, 2, 3, 4]")); - DictionaryArray expected_delta2(dtype3, int_array3); ASSERT_TRUE(expected_delta2.Equals(result_delta2)); } @@ -308,21 +225,10 @@ TEST(TestStringDictionaryBuilder, Basic) { ASSERT_OK(builder.Finish(&result)); // Build expected data - StringBuilder str_builder; - ASSERT_OK(str_builder.Append("test")); - ASSERT_OK(str_builder.Append("test2")); - std::shared_ptr str_array; - ASSERT_OK(str_builder.Finish(&str_array)); - auto dtype = std::make_shared(int8(), str_array); - - Int8Builder int_builder; - ASSERT_OK(int_builder.Append(0)); - ASSERT_OK(int_builder.Append(1)); - ASSERT_OK(int_builder.Append(0)); - std::shared_ptr int_array; - ASSERT_OK(int_builder.Finish(&int_array)); - + auto dtype = dictionary(int8(), ArrayFromJSON(utf8(), "[\"test\", \"test2\"]")); + auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]"); DictionaryArray expected(dtype, int_array); + ASSERT_TRUE(expected.Equals(result)); } @@ -373,21 +279,10 @@ TEST(TestStringDictionaryBuilder, DeltaDictionary) { ASSERT_OK(builder.Finish(&result)); // Build expected data - StringBuilder str_builder1; - ASSERT_OK(str_builder1.Append("test")); - ASSERT_OK(str_builder1.Append("test2")); - std::shared_ptr str_array1; - ASSERT_OK(str_builder1.Finish(&str_array1)); - auto dtype1 = std::make_shared(int8(), str_array1); - - Int8Builder int_builder1; - ASSERT_OK(int_builder1.Append(0)); - ASSERT_OK(int_builder1.Append(1)); - ASSERT_OK(int_builder1.Append(0)); - std::shared_ptr int_array1; - ASSERT_OK(int_builder1.Finish(&int_array1)); + auto dtype = dictionary(int8(), ArrayFromJSON(utf8(), "[\"test\", \"test2\"]")); + auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]"); + DictionaryArray expected(dtype, int_array); - DictionaryArray expected(dtype1, int_array1); ASSERT_TRUE(expected.Equals(result)); // build a delta dictionary @@ -399,20 +294,10 @@ TEST(TestStringDictionaryBuilder, DeltaDictionary) { FinishAndCheckPadding(&builder, &result_delta); // Build expected data - StringBuilder str_builder2; - ASSERT_OK(str_builder2.Append("test3")); - std::shared_ptr str_array2; - ASSERT_OK(str_builder2.Finish(&str_array2)); - auto dtype2 = std::make_shared(int8(), str_array2); - - Int8Builder int_builder2; - ASSERT_OK(int_builder2.Append(1)); - ASSERT_OK(int_builder2.Append(2)); - ASSERT_OK(int_builder2.Append(1)); - std::shared_ptr int_array2; - ASSERT_OK(int_builder2.Finish(&int_array2)); - + auto dtype2 = dictionary(int8(), ArrayFromJSON(utf8(), "[\"test3\"]")); + auto int_array2 = ArrayFromJSON(int8(), "[1, 2, 1]"); DictionaryArray expected_delta(dtype2, int_array2); + ASSERT_TRUE(expected_delta.Equals(result_delta)); } @@ -647,7 +532,7 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, InvalidTypeAppend) { TEST(TestDecimalDictionaryBuilder, Basic) { // Build the dictionary Array - const auto& 
decimal_type = arrow::decimal(2, 0); + auto decimal_type = arrow::decimal(2, 0); DictionaryBuilder builder(decimal_type, default_memory_pool()); // Test data @@ -660,20 +545,9 @@ TEST(TestDecimalDictionaryBuilder, Basic) { ASSERT_OK(builder.Finish(&result)); // Build expected data - FixedSizeBinaryBuilder decimal_builder(decimal_type); - ASSERT_OK(decimal_builder.Append(Decimal128(12).ToBytes())); - ASSERT_OK(decimal_builder.Append(Decimal128(11).ToBytes())); + auto dtype = dictionary(int8(), ArrayFromJSON(decimal_type, "[\"12\", \"11\"]")); + DictionaryArray expected(dtype, ArrayFromJSON(int8(), "[0, 0, 1, 0]")); - std::shared_ptr decimal_array; - ASSERT_OK(decimal_builder.Finish(&decimal_array)); - auto dtype = arrow::dictionary(int8(), decimal_array); - - Int8Builder int_builder; - ASSERT_OK(int_builder.AppendValues({0, 0, 1, 0})); - std::shared_ptr int_array; - ASSERT_OK(int_builder.Finish(&int_array)); - - DictionaryArray expected(dtype, int_array); ASSERT_TRUE(expected.Equals(result)); } @@ -758,26 +632,20 @@ TEST(TestDictionary, Basics) { TEST(TestDictionary, Equals) { vector is_valid = {true, true, false, true, true, true}; + std::shared_ptr dict, dict2, indices, indices2, indices3; - std::shared_ptr dict; - vector dict_values = {"foo", "bar", "baz"}; - ArrayFromVector(dict_values, &dict); + dict = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]"); std::shared_ptr dict_type = dictionary(int16(), dict); - std::shared_ptr dict2; - vector dict2_values = {"foo", "bar", "baz", "qux"}; - ArrayFromVector(dict2_values, &dict2); + dict2 = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\", \"qux\"]"); std::shared_ptr dict2_type = dictionary(int16(), dict2); - std::shared_ptr indices; vector indices_values = {1, 2, -1, 0, 2, 0}; ArrayFromVector(is_valid, indices_values, &indices); - std::shared_ptr indices2; vector indices2_values = {1, 2, 0, 0, 2, 0}; ArrayFromVector(is_valid, indices2_values, &indices2); - std::shared_ptr indices3; vector indices3_values = {1, 1, 0, 0, 2, 0}; ArrayFromVector(is_valid, indices3_values, &indices3); @@ -825,17 +693,10 @@ TEST(TestDictionary, Equals) { } TEST(TestDictionary, Validate) { - vector is_valid = {true, true, false, true, true, true}; - - std::shared_ptr dict; - vector dict_values = {"foo", "bar", "baz"}; - ArrayFromVector(dict_values, &dict); + auto dict = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]"); std::shared_ptr dict_type = dictionary(int16(), dict); - std::shared_ptr indices; - vector indices_values = {1, 2, 0, 0, 2, 0}; - ArrayFromVector(is_valid, indices_values, &indices); - + auto indices = ArrayFromJSON(int16(), "[1, 2, null, 0, 2, 0]"); std::shared_ptr arr = std::make_shared(dict_type, indices); // Only checking index type for now @@ -857,28 +718,20 @@ TEST(TestDictionary, Validate) { } TEST(TestDictionary, FromArray) { - std::shared_ptr dict; - vector dict_values = {"foo", "bar", "baz"}; - ArrayFromVector(dict_values, &dict); + auto dict = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]"); std::shared_ptr dict_type = dictionary(int16(), dict); - std::shared_ptr indices1; - vector indices_values1 = {1, 2, 0, 0, 2, 0}; - ArrayFromVector(indices_values1, &indices1); - - std::shared_ptr indices2; - vector indices_values2 = {1, 2, 0, 3, 2, 0}; - ArrayFromVector(indices_values2, &indices2); + auto indices1 = ArrayFromJSON(int16(), "[1, 2, 0, 0, 2, 0]"); + auto indices2 = ArrayFromJSON(int16(), "[1, 2, 0, 3, 2, 0]"); + // Invalid index is masked by null std::shared_ptr indices3; vector is_valid3 = {true, true, false, true, true, 
true}; vector indices_values3 = {1, 2, -1, 0, 2, 0}; ArrayFromVector(is_valid3, indices_values3, &indices3); - std::shared_ptr indices4; - vector is_valid4 = {true, true, false, true, true, true}; - vector indices_values4 = {1, 2, 1, 3, 2, 0}; - ArrayFromVector(is_valid4, indices_values4, &indices4); + // Index out of bounds + auto indices4 = ArrayFromJSON(int16(), "[1, 2, null, 3, 2, 0]"); std::shared_ptr arr1, arr2, arr3, arr4; ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices1, &arr1)); diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 1a88740a4ac08..de0885e6f5f3a 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -246,6 +246,23 @@ TEST_F(TestArray, BuildLargeInMemoryArray) { TEST_F(TestArray, TestCopy) {} +// ---------------------------------------------------------------------- +// Null type tests + +TEST(TestNullBuilder, Basics) { + NullBuilder builder; + std::shared_ptr array; + + ASSERT_OK(builder.AppendNull()); + ASSERT_OK(builder.Append(nullptr)); + ASSERT_OK(builder.AppendNull()); + ASSERT_OK(builder.Finish(&array)); + + const auto& null_array = checked_cast(*array); + ASSERT_EQ(null_array.length(), 3); + ASSERT_EQ(null_array.null_count(), 3); +} + // ---------------------------------------------------------------------- // Primitive type tests diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 34398eebebfb6..607fa1745a5a0 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -20,6 +20,7 @@ #include // IWYU pragma: keep #include +#include #include #include #include @@ -235,6 +236,8 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder { return Status::OK(); } + Status Append(std::nullptr_t value) { return AppendNull(); } + Status FinishInternal(std::shared_ptr* out) override; }; diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 9c384c3e9901c..40cebf1823e2c 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -20,6 +20,7 @@ ADD_ARROW_TEST(feather-test) ADD_ARROW_TEST(ipc-read-write-test) +ADD_ARROW_TEST(ipc-json-simple-test) ADD_ARROW_TEST(ipc-json-test) if (NOT ARROW_BOOST_HEADER_ONLY) @@ -84,6 +85,7 @@ install(FILES dictionary.h feather.h json.h + json-simple.h message.h reader.h writer.h diff --git a/cpp/src/arrow/ipc/ipc-json-simple-test.cc b/cpp/src/arrow/ipc/ipc-json-simple-test.cc new file mode 100644 index 0000000000000..45525212d2f4b --- /dev/null +++ b/cpp/src/arrow/ipc/ipc-json-simple-test.cc @@ -0,0 +1,594 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
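+
+// Unit tests for the "simple" JSON array format (arrow/ipc/json-simple.h):
+// each test builds an array with ArrayFromJSON() and compares it against an
+// expected array constructed by hand.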
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/array.h" +#include "arrow/ipc/json-simple.h" +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" + +#if defined(_MSC_VER) +// "warning C4307: '+': integral constant overflow" +#pragma warning(disable : 4307) +#endif + +namespace arrow { +namespace ipc { +namespace internal { +namespace json { + +using ::arrow::internal::checked_cast; + +// Avoid undefined behaviour on signed overflow +template +Signed SafeSignedAdd(Signed u, Signed v) { + using Unsigned = typename std::make_unsigned::type; + return static_cast(static_cast(u) + static_cast(v)); +} + +// Special case for 8-bit ints (must output their decimal value, not the +// corresponding ASCII character) +void JSONArrayInternal(std::ostream* ss, int8_t value) { + *ss << static_cast(value); +} + +void JSONArrayInternal(std::ostream* ss, uint8_t value) { + *ss << static_cast(value); +} + +template +void JSONArrayInternal(std::ostream* ss, const Value& value) { + *ss << value; +} + +template +void JSONArrayInternal(std::ostream* ss, const Value& value, Tail... tail) { + JSONArrayInternal(ss, value); + *ss << ", "; + JSONArrayInternal(ss, std::forward(tail)...); +} + +template +std::string JSONArray(Args... args) { + std::stringstream ss; + ss << "["; + JSONArrayInternal(&ss, std::forward(args)...); + ss << "]"; + return ss.str(); +} + +template +void AssertJSONArray(const std::shared_ptr& type, const std::string& json, + const std::vector& values) { + std::shared_ptr actual, expected; + + ASSERT_OK(ArrayFromJSON(type, json, &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector(type, values, &expected); + AssertArraysEqual(*expected, *actual); +} + +template +void AssertJSONArray(const std::shared_ptr& type, const std::string& json, + const std::vector& is_valid, + const std::vector& values) { + std::shared_ptr actual, expected; + + ASSERT_OK(ArrayFromJSON(type, json, &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector(type, is_valid, values, &expected); + AssertArraysEqual(*expected, *actual); +} + +TEST(TestHelper, JSONArray) { + // Test the JSONArray helper func + std::string s = + JSONArray(123, -4.5, static_cast(-12), static_cast(34)); + ASSERT_EQ(s, "[123, -4.5, -12, 34]"); + s = JSONArray(9223372036854775807LL, 9223372036854775808ULL, -9223372036854775807LL - 1, + 18446744073709551615ULL); + ASSERT_EQ(s, + "[9223372036854775807, 9223372036854775808, -9223372036854775808, " + "18446744073709551615]"); +} + +TEST(TestHelper, SafeSignedAdd) { + ASSERT_EQ(0, SafeSignedAdd(-128, -128)); + ASSERT_EQ(1, SafeSignedAdd(-128, -127)); + ASSERT_EQ(-128, SafeSignedAdd(1, 127)); + ASSERT_EQ(-2147483648LL, SafeSignedAdd(1, 2147483647)); +} + +template +class TestIntegers : public ::testing::Test {}; + +TYPED_TEST_CASE_P(TestIntegers); + +TYPED_TEST_P(TestIntegers, Basics) { + using T = TypeParam; + using c_type = typename T::c_type; + + std::shared_ptr expected, actual; + std::shared_ptr type = TypeTraits::type_singleton(); + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[4, 0, 5]", {4, 0, 5}); + AssertJSONArray(type, "[4, null, 5]", {true, false, true}, {4, 0, 5}); + + // Test limits + const auto min_val = std::numeric_limits::min(); + const auto max_val = std::numeric_limits::max(); + std::string json_string = JSONArray(0, 1, min_val); + AssertJSONArray(type, json_string, {0, 1, min_val}); + json_string = 
JSONArray(0, 1, max_val); + AssertJSONArray(type, json_string, {0, 1, max_val}); +} + +TYPED_TEST_P(TestIntegers, Errors) { + using T = TypeParam; + + std::shared_ptr array; + std::shared_ptr type = TypeTraits::type_singleton(); + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "0", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "{}", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0.0]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"0\"]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]", &array)); +} + +TYPED_TEST_P(TestIntegers, OutOfBounds) { + using T = TypeParam; + using c_type = typename T::c_type; + + std::shared_ptr array; + std::shared_ptr type = TypeTraits::type_singleton(); + + if (type->id() == Type::UINT64) { + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[18446744073709551616]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[-1]", &array)); + } else if (type->id() == Type::INT64) { + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[9223372036854775808]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[-9223372036854775809]", &array)); + } else if (std::is_signed::value) { + const auto lower = SafeSignedAdd(std::numeric_limits::min(), -1); + const auto upper = SafeSignedAdd(std::numeric_limits::max(), +1); + auto json_string = JSONArray(lower); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, json_string, &array)); + json_string = JSONArray(upper); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, json_string, &array)); + } else { + const auto upper = static_cast(std::numeric_limits::max()) + 1; + auto json_string = JSONArray(upper); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, json_string, &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[-1]", &array)); + } +} + +REGISTER_TYPED_TEST_CASE_P(TestIntegers, Basics, Errors, OutOfBounds); + +INSTANTIATE_TYPED_TEST_CASE_P(TestInt8, TestIntegers, Int8Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt16, TestIntegers, Int16Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt32, TestIntegers, Int32Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt64, TestIntegers, Int64Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt8, TestIntegers, UInt8Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt16, TestIntegers, UInt16Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt32, TestIntegers, UInt32Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt64, TestIntegers, UInt64Type); + +TEST(TestNull, Basics) { + std::shared_ptr type = null(); + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[null, null]", {nullptr, nullptr}); +} + +TEST(TestNull, Errors) { + std::shared_ptr type = null(); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[NaN]", &array)); +} + +TEST(TestBoolean, Basics) { + std::shared_ptr type = boolean(); + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[false, true, false]", {false, true, false}); + AssertJSONArray(type, "[false, true, null]", {true, true, false}, + {false, true, false}); +} + +TEST(TestBoolean, Errors) { + std::shared_ptr type = boolean(); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"true\"]", &array)); +} + +TEST(TestFloat, Basics) { + 
std::shared_ptr type = float32(); + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[1, 2.5, -3e4]", {1.0f, 2.5f, -3.0e4f}); + AssertJSONArray(type, "[-0.0, Inf, -Inf, null]", {true, true, true, false}, + {-0.0f, INFINITY, -INFINITY, 0.0f}); + + // Check NaN separately as AssertArraysEqual simply memcmp's array contents + // and NaNs can have many bit representations. + ASSERT_OK(ArrayFromJSON(type, "[NaN]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + float value = checked_cast(*actual).Value(0); + ASSERT_TRUE(std::isnan(value)); +} + +TEST(TestFloat, Errors) { + std::shared_ptr type = float32(); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[true]", &array)); +} + +TEST(TestDouble, Basics) { + std::shared_ptr type = float64(); + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[1, 2.5, -3e4]", {1.0, 2.5, -3.0e4}); + AssertJSONArray(type, "[-0.0, Inf, -Inf, null]", {true, true, true, false}, + {-0.0, INFINITY, -INFINITY, 0.0}); + + ASSERT_OK(ArrayFromJSON(type, "[NaN]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + double value = checked_cast(*actual).Value(0); + ASSERT_TRUE(std::isnan(value)); +} + +TEST(TestDouble, Errors) { + std::shared_ptr type = float64(); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[true]", &array)); +} + +TEST(TestString, Basics) { + std::shared_ptr type = utf8(); + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[\"\", \"foo\"]", {"", "foo"}); + AssertJSONArray(type, "[\"\", null]", {true, false}, {"", ""}); + // NUL character in string + std::string s = "some"; + s += '\x00'; + s += "char"; + AssertJSONArray(type, "[\"\", \"some\\u0000char\"]", {"", s}); +} + +TEST(TestString, Errors) { + std::shared_ptr type = utf8(); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[]]", &array)); +} + +TEST(TestDecimal, Basics) { + std::shared_ptr type = decimal(10, 4); + std::shared_ptr expected, actual; + + ASSERT_OK(ArrayFromJSON(type, "[]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + { + Decimal128Builder builder(type); + ASSERT_OK(builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[\"123.4567\", \"-78.9000\"]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + { + Decimal128Builder builder(type); + ASSERT_OK(builder.Append(Decimal128(1234567))); + ASSERT_OK(builder.Append(Decimal128(-789000))); + ASSERT_OK(builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[\"123.4567\", null]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + { + Decimal128Builder builder(type); + ASSERT_OK(builder.Append(Decimal128(1234567))); + ASSERT_OK(builder.AppendNull()); + ASSERT_OK(builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); +} + +TEST(TestDecimal, Errors) { + std::shared_ptr type = decimal(10, 4); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[12.3456]", &array)); + // Bad scale + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"12.345\"]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"12.34560\"]", &array)); +} + +TEST(TestList, IntegerList) { + auto pool = default_memory_pool(); + std::shared_ptr type = list(int64()); + std::shared_ptr offsets, values, 
expected, actual; + + ASSERT_OK(ArrayFromJSON(type, "[]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector({0}, &offsets); + ArrayFromVector({}, &values); + ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &expected)); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[[4, 5], [], [6]]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector({0, 2, 2, 3}, &offsets); + ArrayFromVector({4, 5, 6}, &values); + ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &expected)); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[[], [null], [6, null]]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector({0, 0, 1, 3}, &offsets); + auto is_valid = std::vector{false, true, false}; + ArrayFromVector(is_valid, {0, 6, 0}, &values); + ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &expected)); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[null, [], null]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append()); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); +} + +TEST(TestList, IntegerListErrors) { + std::shared_ptr type = list(int64()); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0.0]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[9223372036854775808]]", &array)); +} + +TEST(TestList, NullList) { + auto pool = default_memory_pool(); + std::shared_ptr type = list(null()); + std::shared_ptr offsets, values, expected, actual; + + ASSERT_OK(ArrayFromJSON(type, "[]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector({0}, &offsets); + values = std::make_shared(0); + ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &expected)); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[[], [null], [null, null]]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector({0, 0, 1, 3}, &offsets); + values = std::make_shared(3); + ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &expected)); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[null, [], null]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append()); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); +} + +TEST(TestList, IntegerListList) { + auto pool = default_memory_pool(); + std::shared_ptr type = list(list(uint8())); + std::shared_ptr offsets, values, nested, expected, actual; + + ASSERT_OK(ArrayFromJSON(type, "[[[4], [5, 6]], [[7, 8, 9]]]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector({0, 1, 3, 6}, &offsets); + ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); + ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &nested)); + ArrayFromVector({0, 2, 3}, &offsets); + ASSERT_OK(ListArray::FromArrays(*offsets, *nested, pool, &expected)); + ASSERT_EQ(actual->length(), 2); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[[], 
[[]], [[4], [], [5, 6]], [[7, 8, 9]]]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector({0, 0, 1, 1, 3, 6}, &offsets); + ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); + ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &nested)); + ArrayFromVector({0, 0, 1, 4, 5}, &offsets); + ASSERT_OK(ListArray::FromArrays(*offsets, *nested, pool, &expected)); + ASSERT_EQ(actual->length(), 4); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[null, [null], [[null]]]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + auto& child_builder = checked_cast(*list_builder.value_builder()); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append()); + ASSERT_OK(child_builder.AppendNull()); + ASSERT_OK(list_builder.Append()); + ASSERT_OK(child_builder.Append()); + ASSERT_OK(list_builder.Finish(&expected)); + } +} + +TEST(TestStruct, SimpleStruct) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + std::shared_ptr type = struct_({field_a, field_b}); + std::shared_ptr a, b, expected, actual; + std::shared_ptr null_bitmap; + std::vector is_valid; + std::vector> children; + + // Trivial + ASSERT_OK(ArrayFromJSON(type, "[]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector({}, &a); + ArrayFromVector({}, &b); + children.assign({a, b}); + expected = std::make_shared(type, 0, children); + AssertArraysEqual(*expected, *actual); + + // Non-empty + ArrayFromVector({5, 6}, &a); + ArrayFromVector({true, false}, &b); + children.assign({a, b}); + expected = std::make_shared(type, 2, children); + + ASSERT_OK(ArrayFromJSON(type, "[[5, true], [6, false]]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + AssertArraysEqual(*expected, *actual); + ASSERT_OK(ArrayFromJSON(type, "[{\"a\": 5, \"b\": true}, {\"b\": false, \"a\": 6}]", + &actual)); + ASSERT_OK(ValidateArray(*actual)); + AssertArraysEqual(*expected, *actual); + + // With nulls + is_valid = {false, true, false, false}; + ArrayFromVector(is_valid, {0, 5, 6, 0}, &a); + is_valid = {false, false, true, false}; + ArrayFromVector(is_valid, {false, true, false, false}, &b); + children.assign({a, b}); + BitmapFromVector({false, true, true, true}, &null_bitmap); + expected = std::make_shared(type, 4, children, null_bitmap, 1); + + ASSERT_OK( + ArrayFromJSON(type, "[null, [5, null], [null, false], [null, null]]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + AssertArraysEqual(*expected, *actual); + // When using object notation, null members can be omitted + ASSERT_OK(ArrayFromJSON(type, "[null, {\"a\": 5, \"b\": null}, {\"b\": false}, {}]", + &actual)); + ASSERT_OK(ValidateArray(*actual)); + AssertArraysEqual(*expected, *actual); +} + +TEST(TestStruct, NestedStruct) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + auto field_c = field("c", float64()); + std::shared_ptr nested_type = struct_({field_a, field_b}); + auto field_nested = field("nested", nested_type); + std::shared_ptr type = struct_({field_nested, field_c}); + std::shared_ptr expected, actual; + std::shared_ptr null_bitmap; + std::vector is_valid; + std::vector> children(2); + + ASSERT_OK(ArrayFromJSON(type, "[]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector({}, &children[0]); + ArrayFromVector({}, &children[1]); + children[0] = std::make_shared(nested_type, 0, children); + ArrayFromVector({}, &children[1]); + expected = 
std::make_shared(type, 0, children); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[[[5, true], 1.5], [[6, false], -3e2]]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector({5, 6}, &children[0]); + ArrayFromVector({true, false}, &children[1]); + children[0] = std::make_shared(nested_type, 2, children); + ArrayFromVector({1.5, -300.0}, &children[1]); + expected = std::make_shared(type, 2, children); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[null, [[5, null], null], [null, -3e2]]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + is_valid = {false, true, false}; + ArrayFromVector(is_valid, {0, 5, 0}, &children[0]); + is_valid = {false, false, false}; + ArrayFromVector(is_valid, {false, false, false}, &children[1]); + BitmapFromVector({false, true, false}, &null_bitmap); + children[0] = std::make_shared(nested_type, 3, children, null_bitmap, 2); + is_valid = {false, false, true}; + ArrayFromVector(is_valid, {0.0, 0.0, -300.0}, &children[1]); + BitmapFromVector({false, true, true}, &null_bitmap); + expected = std::make_shared(type, 3, children, null_bitmap, 1); + AssertArraysEqual(*expected, *actual); +} + +TEST(TestStruct, Errors) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + std::shared_ptr type = struct_({field_a, field_b}); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0, true]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0, true, 1]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[true, 0]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[{\"b\": 0, \"a\": true}]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[{\"c\": 0}]", &array)); +} + +} // namespace json +} // namespace internal +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/json-internal.h b/cpp/src/arrow/ipc/json-internal.h index 8807a56551789..5516e2dd72a2e 100644 --- a/cpp/src/arrow/ipc/json-internal.h +++ b/cpp/src/arrow/ipc/json-internal.h @@ -36,6 +36,7 @@ #include "rapidjson/document.h" // IWYU pragma: export #include "rapidjson/encodings.h" // IWYU pragma: export +#include "rapidjson/error/en.h" // IWYU pragma: export #include "rapidjson/stringbuffer.h" // IWYU pragma: export #include "rapidjson/writer.h" // IWYU pragma: export diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc new file mode 100644 index 0000000000000..b69bd76f51611 --- /dev/null +++ b/cpp/src/arrow/ipc/json-simple.cc @@ -0,0 +1,508 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
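+
+// Implementation of ArrayFromJSON(): the JSON text is parsed with rapidjson,
+// then a per-type Converter (chosen by GetConverter() below) appends each
+// JSON value to the matching ArrayBuilder.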
+ +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/ipc/json-internal.h" +#include "arrow/ipc/json-simple.h" +#include "arrow/memory_pool.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/decimal.h" +#include "arrow/util/logging.h" +#include "arrow/util/string_view.h" + +namespace arrow { +namespace ipc { +namespace internal { +namespace json { + +using ::arrow::internal::checked_cast; + +static constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag; + +static Status JSONTypeError(const char* expected_type, rj::Type json_type) { + std::stringstream ss; + ss << "Expected " << expected_type << " or null, got type " << json_type; + return Status::Invalid(ss.str()); +} + +class Converter { + public: + virtual ~Converter() = default; + + virtual Status Init() { return Status::OK(); } + + virtual Status AppendValue(const rj::Value& json_obj) = 0; + + virtual Status AppendNull() = 0; + + virtual Status AppendValues(const rj::Value& json_array) = 0; + + virtual std::shared_ptr builder() = 0; + + virtual Status Finish(std::shared_ptr* out) { + auto builder = this->builder(); + if (builder->length() == 0) { + // Make sure the builder was initialized + RETURN_NOT_OK(builder->Resize(1)); + } + return builder->Finish(out); + } + + protected: + std::shared_ptr type_; +}; + +Status GetConverter(const std::shared_ptr&, std::shared_ptr* out); + +// CRTP +template +class ConcreteConverter : public Converter { + public: + Status AppendValues(const rj::Value& json_array) override { + auto self = static_cast(this); + if (!json_array.IsArray()) { + return JSONTypeError("array", json_array.GetType()); + } + auto size = json_array.Size(); + for (uint32_t i = 0; i < size; ++i) { + RETURN_NOT_OK(self->AppendValue(json_array[i])); + } + return Status::OK(); + } +}; + +// TODO : dates and times? +// TODO : binary / fixed size binary? 
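+
+// Overall flow, a sketch of what ArrayFromJSON() at the bottom of this file
+// does with the classes below:
+//
+//   std::shared_ptr<Converter> converter;
+//   RETURN_NOT_OK(GetConverter(type, &converter));  // pick a typed converter
+//   rj::Document doc;
+//   doc.Parse(json.data(), json.length());          // rapidjson parse
+//   RETURN_NOT_OK(converter->AppendValues(doc));    // top-level JSON array
+//   return converter->Finish(out);                  // finalize the builder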
+ +// ------------------------------------------------------------------------ +// Converter for null arrays + +class NullConverter : public ConcreteConverter { + public: + explicit NullConverter(const std::shared_ptr& type) { + type_ = type; + builder_ = std::make_shared(); + } + + Status AppendNull() override { return builder_->AppendNull(); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return builder_->AppendNull(); + } + return JSONTypeError("null", json_obj.GetType()); + } + + std::shared_ptr builder() override { return builder_; } + + protected: + std::shared_ptr builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for boolean arrays + +class BooleanConverter : public ConcreteConverter { + public: + explicit BooleanConverter(const std::shared_ptr& type) { + type_ = type; + builder_ = std::make_shared(); + } + + Status AppendNull() override { return builder_->AppendNull(); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return builder_->AppendNull(); + } + if (json_obj.IsBool()) { + return builder_->Append(json_obj.GetBool()); + } + return JSONTypeError("boolean", json_obj.GetType()); + } + + std::shared_ptr builder() override { return builder_; } + + protected: + std::shared_ptr builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for int arrays + +template +class IntegerConverter : public ConcreteConverter> { + using c_type = typename Type::c_type; + static constexpr auto is_signed = std::is_signed::value; + + public: + explicit IntegerConverter(const std::shared_ptr& type) { + this->type_ = type; + builder_ = std::make_shared>(); + } + + Status AppendNull() override { return builder_->AppendNull(); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return builder_->AppendNull(); + } + return AppendNumber(json_obj); + } + + std::shared_ptr builder() override { return builder_; } + + protected: + // Append signed integer value + template + typename std::enable_if::value, Status>::type AppendNumber( + const rj::Value& json_obj) { + if (json_obj.IsInt64()) { + int64_t v64 = json_obj.GetInt64(); + c_type v = static_cast(v64); + if (v == v64) { + return builder_->Append(v); + } else { + std::stringstream ss; + ss << "Value " << v64 << " out of bounds for " << this->type_->ToString(); + return Status::Invalid(ss.str()); + } + } else { + return JSONTypeError("signed int", json_obj.GetType()); + } + } + + // Append unsigned integer value + template + typename std::enable_if::value, Status>::type AppendNumber( + const rj::Value& json_obj) { + if (json_obj.IsUint64()) { + uint64_t v64 = json_obj.GetUint64(); + c_type v = static_cast(v64); + if (v == v64) { + return builder_->Append(v); + } else { + std::stringstream ss; + ss << "Value " << v64 << " out of bounds for " << this->type_->ToString(); + return Status::Invalid(ss.str()); + } + return builder_->Append(v); + } else { + return JSONTypeError("unsigned int", json_obj.GetType()); + } + } + + std::shared_ptr> builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for float arrays + +template +class FloatConverter : public ConcreteConverter> { + using c_type = typename Type::c_type; + + public: + explicit FloatConverter(const std::shared_ptr& type) { + this->type_ = type; + builder_ = std::make_shared>(); + } + + Status AppendNull() override { return 
builder_->AppendNull(); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return builder_->AppendNull(); + } + if (json_obj.IsNumber()) { + c_type v = static_cast(json_obj.GetDouble()); + return builder_->Append(v); + } else { + return JSONTypeError("number", json_obj.GetType()); + } + } + + std::shared_ptr builder() override { return builder_; } + + protected: + std::shared_ptr> builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for decimal arrays + +class DecimalConverter : public ConcreteConverter { + public: + explicit DecimalConverter(const std::shared_ptr& type) { + this->type_ = type; + decimal_type_ = checked_cast(type.get()); + builder_ = std::make_shared(type); + } + + Status AppendNull() override { return builder_->AppendNull(); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return builder_->AppendNull(); + } + if (json_obj.IsString()) { + int32_t precision, scale; + Decimal128 d; + auto view = util::string_view(json_obj.GetString(), json_obj.GetStringLength()); + RETURN_NOT_OK(Decimal128::FromString(view, &d, &precision, &scale)); + if (scale != decimal_type_->scale()) { + std::stringstream ss; + ss << "Invalid scale for decimal: expected " << decimal_type_->scale() << ", got " + << scale; + return Status::Invalid(ss.str()); + } + return builder_->Append(d); + } + return JSONTypeError("decimal string", json_obj.GetType()); + } + + std::shared_ptr builder() override { return builder_; } + + protected: + std::shared_ptr builder_; + Decimal128Type* decimal_type_; +}; + +// ------------------------------------------------------------------------ +// Converter for string arrays + +class StringConverter : public ConcreteConverter { + public: + explicit StringConverter(const std::shared_ptr& type) { + this->type_ = type; + builder_ = std::make_shared(type, default_memory_pool()); + } + + Status AppendNull() override { return builder_->AppendNull(); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return builder_->AppendNull(); + } + if (json_obj.IsString()) { + auto view = util::string_view(json_obj.GetString(), json_obj.GetStringLength()); + return builder_->Append(view); + } else { + return JSONTypeError("string", json_obj.GetType()); + } + } + + std::shared_ptr builder() override { return builder_; } + + protected: + std::shared_ptr builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for list arrays + +class ListConverter : public ConcreteConverter { + public: + explicit ListConverter(const std::shared_ptr& type) { type_ = type; } + + Status Init() override { + const auto& list_type = checked_cast(*type_); + RETURN_NOT_OK(GetConverter(list_type.value_type(), &child_converter_)); + auto child_builder = child_converter_->builder(); + builder_ = std::make_shared(default_memory_pool(), child_builder, type_); + return Status::OK(); + } + + Status AppendNull() override { return builder_->AppendNull(); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return builder_->AppendNull(); + } + RETURN_NOT_OK(builder_->Append()); + // Extend the child converter with this JSON array + return child_converter_->AppendValues(json_obj); + } + + std::shared_ptr builder() override { return builder_; } + + protected: + std::shared_ptr builder_; + std::shared_ptr child_converter_; +}; + +// 
------------------------------------------------------------------------ +// Converter for struct arrays + +class StructConverter : public ConcreteConverter { + public: + explicit StructConverter(const std::shared_ptr& type) { type_ = type; } + + Status Init() override { + std::vector> child_builders; + for (const auto& field : type_->children()) { + std::shared_ptr child_converter; + RETURN_NOT_OK(GetConverter(field->type(), &child_converter)); + child_converters_.push_back(child_converter); + child_builders.push_back(child_converter->builder()); + } + builder_ = std::make_shared(type_, default_memory_pool(), + std::move(child_builders)); + return Status::OK(); + } + + Status AppendNull() override { + for (auto& converter : child_converters_) { + RETURN_NOT_OK(converter->AppendNull()); + } + return builder_->AppendNull(); + } + + // Append a JSON value that is either an array of N elements in order + // or an object mapping struct names to values (omitted struct members + // are mapped to null). + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return AppendNull(); + } + if (json_obj.IsArray()) { + auto size = json_obj.Size(); + auto expected_size = static_cast(type_->num_children()); + if (size != expected_size) { + std::stringstream ss; + ss << "Expected array of size " << expected_size << ", got array of size " + << size; + return Status::Invalid(ss.str()); + } + for (uint32_t i = 0; i < size; ++i) { + RETURN_NOT_OK(child_converters_[i]->AppendValue(json_obj[i])); + } + return builder_->Append(); + } + if (json_obj.IsObject()) { + auto remaining = json_obj.MemberCount(); + auto num_children = type_->num_children(); + for (int32_t i = 0; i < num_children; ++i) { + const auto& field = type_->child(i); + auto it = json_obj.FindMember(field->name()); + if (it != json_obj.MemberEnd()) { + --remaining; + RETURN_NOT_OK(child_converters_[i]->AppendValue(it->value)); + } else { + RETURN_NOT_OK(child_converters_[i]->AppendNull()); + } + } + if (remaining > 0) { + std::stringstream ss; + ss << "Unexpected members in JSON object for type " << type_->ToString(); + return Status::Invalid(ss.str()); + } + return builder_->Append(); + } + return JSONTypeError("array or object", json_obj.GetType()); + } + + std::shared_ptr builder() override { return builder_; } + + protected: + std::shared_ptr builder_; + std::vector> child_converters_; +}; + +// ------------------------------------------------------------------------ +// General conversion functions + +Status GetConverter(const std::shared_ptr& type, + std::shared_ptr* out) { + std::shared_ptr res; + +#define SIMPLE_CONVERTER_CASE(ID, CLASS) \ + case ID: \ + res = std::make_shared(type); \ + break; + + switch (type->id()) { + SIMPLE_CONVERTER_CASE(Type::INT8, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::INT16, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::INT32, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::INT64, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::UINT8, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::UINT16, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::UINT32, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::UINT64, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::NA, NullConverter) + SIMPLE_CONVERTER_CASE(Type::BOOL, BooleanConverter) + SIMPLE_CONVERTER_CASE(Type::FLOAT, FloatConverter) + SIMPLE_CONVERTER_CASE(Type::DOUBLE, FloatConverter) + SIMPLE_CONVERTER_CASE(Type::LIST, ListConverter) + SIMPLE_CONVERTER_CASE(Type::STRUCT, StructConverter) + SIMPLE_CONVERTER_CASE(Type::STRING, 
StringConverter) + SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter) + default: { + std::stringstream ss; + ss << "JSON conversion to " << type->ToString() << " not implemented"; + return Status::NotImplemented(ss.str()); + } + } + +#undef SIMPLE_CONVERTER_CASE + + RETURN_NOT_OK(res->Init()); + *out = res; + return Status::OK(); +} + +Status ArrayFromJSON(const std::shared_ptr& type, + const util::string_view& json_string, std::shared_ptr* out) { + std::shared_ptr converter; + RETURN_NOT_OK(GetConverter(type, &converter)); + + rj::Document json_doc; + json_doc.Parse(json_string.data(), json_string.length()); + if (json_doc.HasParseError()) { + std::stringstream ss; + ss << "JSON parse error at offset " << json_doc.GetErrorOffset() << ": " + << GetParseError_En(json_doc.GetParseError()); + return Status::Invalid(ss.str()); + } + + // The JSON document should be an array, append it + RETURN_NOT_OK(converter->AppendValues(json_doc)); + return converter->Finish(out); +} + +Status ArrayFromJSON(const std::shared_ptr& type, + const std::string& json_string, std::shared_ptr* out) { + return ArrayFromJSON(type, util::string_view(json_string), out); +} + +Status ArrayFromJSON(const std::shared_ptr& type, const char* json_string, + std::shared_ptr* out) { + return ArrayFromJSON(type, util::string_view(json_string), out); +} + +} // namespace json +} // namespace internal +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/json-simple.h b/cpp/src/arrow/ipc/json-simple.h new file mode 100644 index 0000000000000..da6483ff1556f --- /dev/null +++ b/cpp/src/arrow/ipc/json-simple.h @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
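+
+// Example usage (a minimal sketch, mirroring this patch's tests):
+//
+//   std::shared_ptr<Array> array;
+//   RETURN_NOT_OK(ArrayFromJSON(int32(), "[1, 2, null, 4]", &array));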
+ +// Implement a simple JSON representation format for arrays + +#ifndef ARROW_IPC_JSON_SIMPLE_H +#define ARROW_IPC_JSON_SIMPLE_H + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/string_view.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class DataType; + +namespace ipc { +namespace internal { +namespace json { + +ARROW_EXPORT +Status ArrayFromJSON(const std::shared_ptr&, const std::string& json, + std::shared_ptr* out); + +ARROW_EXPORT +Status ArrayFromJSON(const std::shared_ptr&, const util::string_view& json, + std::shared_ptr* out); + +ARROW_EXPORT +Status ArrayFromJSON(const std::shared_ptr&, const char* json, + std::shared_ptr* out); + +} // namespace json +} // namespace internal +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_JSON_SIMPLE_H diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc index 482bc4370fdca..8434e59b0ce79 100644 --- a/cpp/src/arrow/pretty_print-test.cc +++ b/cpp/src/arrow/pretty_print-test.cc @@ -163,16 +163,7 @@ TEST_F(TestPrettyPrint, StructTypeBasic) { auto simple_2 = field("two", int32()); auto simple_struct = struct_({simple_1, simple_2}); - auto int_builder_1 = std::make_shared(); - auto int_builder_2 = std::make_shared(); - StructBuilder builder(simple_struct, default_memory_pool(), - {int_builder_1, int_builder_2}); - ASSERT_OK(builder.Append()); - ASSERT_OK(int_builder_1->Append(11)); - ASSERT_OK(int_builder_2->Append(22)); - - std::shared_ptr array; - ASSERT_OK(builder.Finish(&array)); + auto array = ArrayFromJSON(simple_struct, "[[11, 22]]"); static const char* ex = R"expected(-- is_valid: all not null -- child 0 type: int32 @@ -202,22 +193,7 @@ TEST_F(TestPrettyPrint, StructTypeAdvanced) { auto simple_2 = field("two", int32()); auto simple_struct = struct_({simple_1, simple_2}); - auto int_builder_1 = std::make_shared(); - auto int_builder_2 = std::make_shared(); - StructBuilder builder(simple_struct, default_memory_pool(), - {int_builder_1, int_builder_2}); - ASSERT_OK(builder.Append()); - ASSERT_OK(int_builder_1->Append(11)); - ASSERT_OK(int_builder_2->Append(22)); - ASSERT_OK(builder.AppendNull()); - ASSERT_OK(int_builder_1->AppendNull()); - ASSERT_OK(int_builder_2->AppendNull()); - ASSERT_OK(builder.Append()); - ASSERT_OK(int_builder_1->AppendNull()); - ASSERT_OK(int_builder_2->Append(33)); - - std::shared_ptr array; - ASSERT_OK(builder.Finish(&array)); + auto array = ArrayFromJSON(simple_struct, "[[11, 22], null, [null, 33]]"); static const char* ex = R"expected(-- is_valid: [ @@ -251,24 +227,9 @@ TEST_F(TestPrettyPrint, BinaryType) { } TEST_F(TestPrettyPrint, ListType) { - Int64Builder* int_builder = new Int64Builder(); - ListBuilder list_builder(default_memory_pool(), - std::unique_ptr(int_builder)); - - ASSERT_OK(list_builder.Append()); - ASSERT_OK(int_builder->AppendNull()); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(list_builder.Append(false)); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(int_builder->Append(4)); - ASSERT_OK(int_builder->Append(6)); - ASSERT_OK(int_builder->Append(7)); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(int_builder->Append(2)); - ASSERT_OK(int_builder->Append(3)); + auto list_type = list(int64()); + auto array = ArrayFromJSON(list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); - std::shared_ptr array; - ASSERT_OK(list_builder.Finish(&array)); static const char* ex = R"expected([ [ null @@ -340,19 +301,7 @@ TEST_F(TestPrettyPrint, Decimal128Type) { int32_t s = 4; auto type = decimal(p, s); - - 
Decimal128Builder builder(type); - Decimal128 val; - - ASSERT_OK(Decimal128::FromString("123.4567", &val)); - ASSERT_OK(builder.Append(val)); - - ASSERT_OK(Decimal128::FromString("456.7891", &val)); - ASSERT_OK(builder.Append(val)); - ASSERT_OK(builder.AppendNull()); - - std::shared_ptr<Array> array; - ASSERT_OK(builder.Finish(&array)); + auto array = ArrayFromJSON(type, "[\"123.4567\", \"456.7891\", null]"); static const char* ex = "[\n 123.4567,\n 456.7891,\n null\n]"; CheckArray(*array, {0}, ex); @@ -392,10 +341,7 @@ TEST_F(TestPrettyPrint, DictionaryType) { } TEST_F(TestPrettyPrint, ChunkedArrayPrimitiveType) { - std::vector<bool> is_valid = {true, true, false, true, false}; - std::vector<int32_t> values = {0, 1, 2, 3, 4}; - std::shared_ptr<Array> array; - ArrayFromVector<Int32Type, int32_t>(is_valid, values, &array); + auto array = ArrayFromJSON(int32(), "[0, 1, null, 3, null]"); ChunkedArray chunked_array({array}); static const char* expected = R"expected([ @@ -432,11 +378,8 @@ TEST_F(TestPrettyPrint, ChunkedArrayPrimitiveType) { } TEST_F(TestPrettyPrint, ColumnPrimitiveType) { - std::vector<bool> is_valid = {true, true, false, true, false}; - std::vector<int32_t> values = {0, 1, 2, 3, 4}; - std::shared_ptr<Array> array; - ArrayFromVector<Int32Type, int32_t>(is_valid, values, &array); std::shared_ptr<Field> int_field = field("column", int32()); + auto array = ArrayFromJSON(int_field->type(), "[0, 1, null, 3, null]"); Column column(int_field, ArrayVector({array})); static const char* expected = R"expected(column: int32 @@ -475,11 +418,8 @@ TEST_F(TestPrettyPrint, ColumnPrimitiveType) { } TEST_F(TestPrettyPrint, TablePrimitive) { - std::vector<bool> is_valid = {true, true, false, true, false}; - std::vector<int32_t> values = {0, 1, 2, 3, 4}; - std::shared_ptr<Array> array; - ArrayFromVector<Int32Type, int32_t>(is_valid, values, &array); std::shared_ptr<Field> int_field = field("column", int32()); + auto array = ArrayFromJSON(int_field->type(), "[0, 1, null, 3, null]"); std::shared_ptr<Column> column = std::make_shared<Column>(int_field, ArrayVector({array})); std::shared_ptr<Schema> table_schema = schema({int_field}); diff --git a/cpp/src/arrow/test-util.cc b/cpp/src/arrow/test-util.cc index 7fb96cda7af73..38e07dd060ae4 100644 --- a/cpp/src/arrow/test-util.cc +++ b/cpp/src/arrow/test-util.cc @@ -41,6 +41,7 @@ #include "arrow/array.h" #include "arrow/buffer.h" #include "arrow/builder.h" +#include "arrow/ipc/json-simple.h" #include "arrow/memory_pool.h" #include "arrow/pretty_print.h" #include "arrow/status.h" @@ -51,13 +52,15 @@ #include "arrow/util/decimal.h" #include "arrow/util/logging.h" -void sleep_for(double seconds) { - std::this_thread::sleep_for( - std::chrono::nanoseconds(static_cast<int64_t>(seconds * 1e9))); -} - namespace arrow { +std::shared_ptr<Array> ArrayFromJSON(const std::shared_ptr<DataType>& type, + const std::string& json) { + std::shared_ptr<Array> out; + ABORT_NOT_OK(ipc::internal::json::ArrayFromJSON(type, json, &out)); + return out; +} + void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes) { const int random_seed = 0; std::default_random_engine gen(random_seed); diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index a01fd7d84a601..7829ac25678a9 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -169,6 +169,12 @@ static inline Status GetBitmapFromVector(const std::vector<T>& is_valid, return Status::OK(); } +template <typename T> +inline void BitmapFromVector(const std::vector<T>& is_valid, + std::shared_ptr<Buffer>* out) { + ASSERT_OK(GetBitmapFromVector(is_valid, out)); +} + // Sets approximately pct_null of the first n bytes in null_bytes to zero // and the rest to non-zero (true) values. 
ARROW_EXPORT void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes); @@ -247,6 +253,12 @@ Status MakeRandomBuffer(int64_t length, MemoryPool* pool, return Status::OK(); } +// ArrayFromJSON: construct an Array from a simple JSON representation + +ARROW_EXPORT +std::shared_ptr<Array> ArrayFromJSON(const std::shared_ptr<DataType>&, + const std::string& json); + // ArrayFromVector: construct an Array from vectors of C values template <typename TYPE, typename C_TYPE> diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index 9d22e005e7276..fda7746c6b4e0 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -337,8 +337,8 @@ bool ParseDecimalComponents(const char* s, size_t size, DecimalComponents* out) } // namespace -Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* precision, - int32_t* scale) { +Status Decimal128::FromString(const util::string_view& s, Decimal128* out, + int32_t* precision, int32_t* scale) { if (s.empty()) { return Status::Invalid("Empty string cannot be converted to decimal"); } @@ -393,6 +393,16 @@ Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* pr return Status::OK(); } +Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* precision, + int32_t* scale) { + return FromString(util::string_view(s), out, precision, scale); +} + +Status Decimal128::FromString(const char* s, Decimal128* out, int32_t* precision, + int32_t* scale) { + return FromString(util::string_view(s), out, precision, scale); +} + Decimal128& Decimal128::Negate() { low_bits_ = ~low_bits_ + 1; high_bits_ = ~high_bits_; diff --git a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h index 26b82a42f70a7..fe76d25eb41d0 100644 --- a/cpp/src/arrow/util/decimal.h +++ b/cpp/src/arrow/util/decimal.h @@ -27,6 +27,7 @@ #include "arrow/status.h" #include "arrow/util/macros.h" +#include "arrow/util/string_view.h" #include "arrow/util/type_traits.h" #include "arrow/util/visibility.h" @@ -128,6 +129,10 @@ class ARROW_EXPORT Decimal128 { /// precision and scale if they're passed in and not null. static Status FromString(const std::string& s, Decimal128* out, int32_t* precision = NULLPTR, int32_t* scale = NULLPTR); + static Status FromString(const util::string_view& s, Decimal128* out, + int32_t* precision = NULLPTR, int32_t* scale = NULLPTR); + static Status FromString(const char* s, Decimal128* out, int32_t* precision = NULLPTR, + int32_t* scale = NULLPTR); /// \brief Convert from a big endian byte representation. 
The length must be /// between 1 and 16 From d3d7669221ee0c714d3095388d769c99d3e51b2b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 9 Dec 2018 14:45:15 -0600 Subject: [PATCH 03/45] ARROW-3969: [Rust] Format using stable rustfmt Author: Andy Grove Closes #3138 from andygrove/ARROW-3969 and squashes the following commits: 99b6256b6 move rustfmt installation into travis_install_cargo 55ab06fee Be more explicit and don't assume order of stable vs nightly builds 5fed7dbc2 simplify cca7da3ab oops, wrong command 9b2e5b771 Format using stable rustfmt --- ci/travis_install_cargo.sh | 1 + ci/travis_script_rust.sh | 9 ++------- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/ci/travis_install_cargo.sh b/ci/travis_install_cargo.sh index f433033091ce1..e4a6b3b3493f3 100755 --- a/ci/travis_install_cargo.sh +++ b/ci/travis_install_cargo.sh @@ -21,6 +21,7 @@ set -e # ensure that both toolchains are installed rustup install stable +rustup component add rustfmt rustup install nightly pip install 'travis-cargo<0.2' --user diff --git a/ci/travis_script_rust.sh b/ci/travis_script_rust.sh index 02a32cdabe818..55cce8f354e44 100755 --- a/ci/travis_script_rust.sh +++ b/ci/travis_script_rust.sh @@ -26,13 +26,8 @@ pushd $RUST_DIR # show activated toolchain rustup show -# check code formatting only for Rust nightly -if [ $RUSTUP_TOOLCHAIN == "nightly" ] -then - # raises on any formatting errors - rustup component add rustfmt-preview - cargo fmt --all -- --check -fi +# raises on any formatting errors +cargo +stable fmt --all -- --check # raises on any warnings cargo rustc -- -D warnings From cc24218ed8a5abe0a8d35cb6fd7ef1a283384be1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sun, 9 Dec 2018 22:22:08 +0100 Subject: [PATCH 04/45] ARROW-3963: [Packaging/Docker] Nightly test for building sphinx documentations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test is here: https://github.com/kszucs/crossbow/branches/all?utf8=%E2%9C%93&query=docker-docs Author: Krisztián Szűcs Closes #3130 from kszucs/ARROW-3963 and squashes the following commits: 0b5be2cc add docker-docs to docker group 1575909e path corrections 51768fc0 use sphinx-build command instead of setup.py 60635acc error msg a93fcad6 merge _as_type and ensure_type 8d3d58fd nightly test for building cpp and python docs --- ci/docker_build_sphinx.sh | 4 +--- dev/tasks/tests.yml | 15 +++++++++++++++ docker-compose.yml | 2 +- docs/Dockerfile | 1 + python/pyarrow/gandiva.pyx | 20 +++++++++++++------- python/pyarrow/tests/test_csv.py | 2 +- python/pyarrow/types.pxi | 21 ++++++++------------- python/pyarrow/types.py | 3 +-- 8 files changed, 41 insertions(+), 27 deletions(-) diff --git a/ci/docker_build_sphinx.sh b/ci/docker_build_sphinx.sh index 957804325adf1..4a65f8155fb16 100755 --- a/ci/docker_build_sphinx.sh +++ b/ci/docker_build_sphinx.sh @@ -22,9 +22,7 @@ pushd /arrow/cpp/apidoc doxygen popd -pushd /arrow/python -python setup.py build_sphinx -s ../docs/source --build-dir ../docs/_build -popd +sphinx-build -b html /arrow/docs/source /arrow/docs/_build/html mkdir -p /arrow/site/asf-site/docs/latest rsync -r /arrow/docs/_build/html/ /arrow/site/asf-site/docs/latest/ diff --git a/dev/tasks/tests.yml b/dev/tasks/tests.yml index c158481de461e..d51fa7eac7a35 100644 --- a/dev/tasks/tests.yml +++ b/dev/tasks/tests.yml @@ -31,6 +31,7 @@ groups: - docker-python-3.6-alpine - docker-java - docker-js + - docker-docs - docker-lint - docker-iwyu - docker-clang-format @@ -174,6 +175,20 @@ 
tasks: - docker-compose build python-alpine - docker-compose run python-alpine + ###################### Documentation building tests ######################### + + docker-docs: + platform: linux + template: docker-tests/travis.linux.yml + params: + environment: + PYTHON_VERSION: 3.6 + commands: + - docker-compose build cpp + - docker-compose build python + - docker-compose build docs + - docker-compose run docs + ############################## Linter tests ################################# docker-lint: diff --git a/docker-compose.yml b/docker-compose.yml index d6f11004233e5..51f1a49542212 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -196,7 +196,7 @@ services: build: context: . dockerfile: docs/Dockerfile - volumes: *volumes + volumes: *ubuntu-volumes ######################### Integration Tests ################################# diff --git a/docs/Dockerfile b/docs/Dockerfile index 4908110b7fb56..31ad84e17ee48 100644 --- a/docs/Dockerfile +++ b/docs/Dockerfile @@ -21,6 +21,7 @@ ADD ci/conda_env_sphinx.yml /arrow/ci/ RUN conda install -c conda-forge \ --file arrow/ci/conda_env_sphinx.yml && \ conda clean --all + CMD arrow/ci/docker_build_cpp.sh && \ arrow/ci/docker_build_python.sh && \ arrow/ci/docker_build_sphinx.sh diff --git a/python/pyarrow/gandiva.pyx b/python/pyarrow/gandiva.pyx index 418d0d61502b3..76e55d6ba27ef 100644 --- a/python/pyarrow/gandiva.pyx +++ b/python/pyarrow/gandiva.pyx @@ -28,10 +28,9 @@ from libc.stdint cimport int64_t, int32_t, uint8_t, uintptr_t from pyarrow.includes.libarrow cimport * from pyarrow.compat import frombytes -from pyarrow.types import _as_type from pyarrow.lib cimport (Array, DataType, Field, MemoryPool, RecordBatch, Schema, check_status, pyarrow_wrap_array, - pyarrow_wrap_data_type) + pyarrow_wrap_data_type, ensure_type) from pyarrow.includes.libgandiva cimport ( CCondition, CExpression, @@ -173,8 +172,10 @@ cdef class Filter: return self def evaluate(self, RecordBatch batch, MemoryPool pool, dtype='int32'): - cdef shared_ptr[CSelectionVector] selection - cdef DataType type = _as_type(dtype) + cdef: + DataType type = ensure_type(dtype) + shared_ptr[CSelectionVector] selection + if type.id == _Type_INT16: check_status(SelectionVector_MakeInt16( batch.num_rows, pool.pool, &selection)) @@ -187,6 +188,7 @@ cdef class Filter: else: raise ValueError("'dtype' of the selection vector should be " "one of 'int16', 'int32' and 'int64'.") + check_status(self.filter.get().Evaluate( batch.sp_batch.get()[0], selection)) return SelectionVector.create(selection) @@ -195,8 +197,10 @@ cdef class Filter: cdef class TreeExprBuilder: def make_literal(self, value, dtype): - cdef shared_ptr[CNode] r - cdef DataType type = _as_type(dtype) + cdef: + DataType type = ensure_type(dtype) + shared_ptr[CNode] r + if type.id == _Type_BOOL: r = TreeExprBuilder_MakeBoolLiteral(value) elif type.id == _Type_UINT8: @@ -225,6 +229,7 @@ cdef class TreeExprBuilder: r = TreeExprBuilder_MakeBinaryLiteral(value) else: raise TypeError("Didn't recognize dtype " + str(dtype)) + return Node.create(r) def make_expression(self, Node root_node, Field return_field): @@ -353,7 +358,8 @@ cdef class TreeExprBuilder: return Node.create(r) def make_in_expression(self, Node node, values, dtype): - cdef DataType type = _as_type(dtype) + cdef DataType type = ensure_type(dtype) + if type.id == _Type_INT32: return self._make_in_expression_int32(node, values) elif type.id == _Type_INT64: diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index 115595bbb877c..c5816de8a4203 
100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -146,7 +146,7 @@ def test_convert_options(): opts.column_types = [('x', pa.binary())] assert opts.column_types == {'x': pa.binary()} - with pytest.raises(TypeError, match='data type expected'): + with pytest.raises(TypeError, match='DataType expected'): opts.column_types = {'a': None} with pytest.raises(TypeError): opts.column_types = 0 diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index d5d99e4044e23..1ebd196fabf95 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -869,7 +869,7 @@ def field(name, type, bint nullable=True, dict metadata=None): cdef: shared_ptr[CKeyValueMetadata] c_meta Field result = Field.__new__(Field) - DataType _type = _as_type(type) + DataType _type = ensure_type(type, allow_none=False) if metadata is not None: convert_metadata(metadata, &c_meta) @@ -1479,20 +1479,15 @@ def type_for_alias(name): return alias() -def _as_type(typ): - if isinstance(typ, DataType): - return typ - elif isinstance(typ, six.string_types): - return type_for_alias(typ) - else: - raise TypeError("data type expected, got '%r'" % (type(typ),)) - - -cdef DataType ensure_type(object type, c_bool allow_none=False): - if allow_none and type is None: +cdef DataType ensure_type(object ty, c_bool allow_none=False): + if allow_none and ty is None: return None + elif isinstance(ty, DataType): + return ty + elif isinstance(ty, six.string_types): + return type_for_alias(ty) else: - return _as_type(type) + raise TypeError('DataType expected, got {!r}'.format(type(ty))) def schema(fields, dict metadata=None): diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index d07dccaedfb97..2bd70276e7ea1 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -19,8 +19,7 @@ from pyarrow.lib import (is_boolean_value, # noqa is_integer_value, - is_float_value, - _as_type) + is_float_value) import pyarrow.lib as lib From 7a5631dedc2de4c7740cd978949322cefdce8251 Mon Sep 17 00:00:00 2001 From: c-bata Date: Mon, 10 Dec 2018 10:37:30 +0900 Subject: [PATCH 05/45] ARROW-3964: [Go] Refactor examples of csv reader Example of godoc doesn't include input file(testdata/simple.csv). So it's hard to understand the output. This PR refactors it. 
screenshot https://godoc.org/github.com/apache/arrow/go/arrow/csv Author: c-bata Closes #3131 from c-bata/refactor-csv-reader-example and squashes the following commits: eed8e29b Refactor examples of csv reader for Go --- go/arrow/csv/csv_test.go | 48 +++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/go/arrow/csv/csv_test.go b/go/arrow/csv/csv_test.go index aaafb37554b87..97f31cc209c27 100644 --- a/go/arrow/csv/csv_test.go +++ b/go/arrow/csv/csv_test.go @@ -20,8 +20,6 @@ import ( "bytes" "fmt" "io/ioutil" - "log" - "os" "testing" "github.com/apache/arrow/go/arrow" @@ -30,17 +28,24 @@ ) func Example() { - f, err := os.Open("testdata/simple.csv") - if err != nil { - log.Fatal(err) - } - defer f.Close() + f := bytes.NewBufferString(`## a simple set of data: int64;float64;string +0;0;str-0 +1;1;str-1 +2;2;str-2 +3;3;str-3 +4;4;str-4 +5;5;str-5 +6;6;str-6 +7;7;str-7 +8;8;str-8 +9;9;str-9 +`) schema := arrow.NewSchema( []arrow.Field{ - arrow.Field{Name: "i64", Type: arrow.PrimitiveTypes.Int64}, - arrow.Field{Name: "f64", Type: arrow.PrimitiveTypes.Float64}, - arrow.Field{Name: "str", Type: arrow.BinaryTypes.String}, + {Name: "i64", Type: arrow.PrimitiveTypes.Int64}, + {Name: "f64", Type: arrow.PrimitiveTypes.Float64}, + {Name: "str", Type: arrow.BinaryTypes.String}, }, nil, ) @@ -90,17 +95,24 @@ func Example() { } func Example_withChunk() { - f, err := os.Open("testdata/simple.csv") - if err != nil { - log.Fatal(err) - } - defer f.Close() + f := bytes.NewBufferString(`## a simple set of data: int64;float64;string +0;0;str-0 +1;1;str-1 +2;2;str-2 +3;3;str-3 +4;4;str-4 +5;5;str-5 +6;6;str-6 +7;7;str-7 +8;8;str-8 +9;9;str-9 +`) schema := arrow.NewSchema( []arrow.Field{ - arrow.Field{Name: "i64", Type: arrow.PrimitiveTypes.Int64}, - arrow.Field{Name: "f64", Type: arrow.PrimitiveTypes.Float64}, - arrow.Field{Name: "str", Type: arrow.BinaryTypes.String}, + {Name: "i64", Type: arrow.PrimitiveTypes.Int64}, + {Name: "f64", Type: arrow.PrimitiveTypes.Float64}, + {Name: "str", Type: arrow.BinaryTypes.String}, }, nil, ) From a4063edf262caeb57c6cb7365e08756788c736a3 Mon Sep 17 00:00:00 2001 From: Yosuke Shiro Date: Mon, 10 Dec 2018 10:50:03 +0900 Subject: [PATCH 06/45] ARROW-3967: [Gandiva] [C++] Make node.h public Because some methods in node.h are useful in bindings. The C GLib Gandiva bindings want to use LiteralNode::holder() to access raw literal data. 
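For context, a minimal sketch of the same literal-node machinery as driven from Python, via the pyarrow.gandiva bindings shown in an earlier patch in this series (an illustration only; it assumes pyarrow was built with Gandiva enabled):

    import pyarrow as pa
    import pyarrow.gandiva as gandiva

    builder = gandiva.TreeExprBuilder()
    # make_literal() validates the dtype and wraps the raw value in a
    # literal node; LiteralNode::holder() is what hands that raw value
    # back to C++ callers such as the C GLib bindings.
    node = builder.make_literal(True, pa.bool_())
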
Author: Yosuke Shiro Closes #3135 from shiro615/make-gandiva-node-header-public and squashes the following commits: 5950c52b Make node.h public --- cpp/src/gandiva/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index bd497dcb92882..68f02f03cf29b 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -112,9 +112,13 @@ install(FILES expression.h expression_registry.h filter.h + func_descriptor.h function_signature.h gandiva_aliases.h + literal_holder.h logging.h + node.h + node_visitor.h projector.h selection_vector.h tree_expr_builder.h From 1dc906e0fe76d558c13febb02c4c63bc4eeba50b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 10 Dec 2018 12:31:21 +0900 Subject: [PATCH 07/45] ARROW-3885: [Rust] Release prepare step should increment Rust version Author: Andy Grove Closes #3096 from andygrove/ARROW-3885 and squashes the following commits: 7d15ee77 add commit step 0d98c2cf revert to 0.11.0 ready for next prepare step a7f60835 update release prepare step to increment Rust version ac6e5fc0 Set version to 0.11.0 and update prepare script b39b7c4b Update Rust version to 0.12.0 --- dev/release/00-prepare.sh | 9 +++++++++ rust/Cargo.toml | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/dev/release/00-prepare.sh b/dev/release/00-prepare.sh index 9282cbfd2771d..5ff4ddc8f28a6 100755 --- a/dev/release/00-prepare.sh +++ b/dev/release/00-prepare.sh @@ -76,6 +76,15 @@ if [ "$#" -eq 2 ]; then git commit -m "[Release] Update .deb package names for $nextVersion" cd - + echo "prepare release ${version} in Rust crate" + + cd "${SOURCE_DIR}/../../rust" + sed -i.bak -r -e "s/version = \"$version\"/version = \"$nextVersion\"/g" Cargo.toml + rm -f Cargo.toml.bak + git add Cargo.toml + git commit -m "[Release] Update Rust Cargo.toml version for $nextVersion" + cd - + echo "Finish staging binary artifacts by running: sh dev/release/01-perform.sh" else diff --git a/rust/Cargo.toml b/rust/Cargo.toml index b56cd6fb30091..39de50c8a336d 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow" -version = "0.10.0" +version = "0.11.0" description = "Rust implementation of Apache Arrow" homepage = "https://github.com/apache/arrow" repository = "https://github.com/apache/arrow" From 612bdca20c9685911cfa5de6f87993f0544fb7aa Mon Sep 17 00:00:00 2001 From: Praveen Date: Sun, 9 Dec 2018 21:45:10 -0600 Subject: [PATCH 08/45] ARROW-3970: [Gandiva][C++] Remove unnecessary boost dependencies. Removed the dynamic dependencies since we do not need them. Author: Praveen Closes #3137 from praveenbingo/ARROW-3970 and squashes the following commits: 6e3a6bbdc ARROW-3970: Added more time for a benchmark test. fbb551645 ARROW-3970: Remove unnecessary boost dynamic dependencies. 
--- cpp/src/gandiva/CMakeLists.txt | 3 --- .../org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 68f02f03cf29b..1f76f7841590a 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -64,9 +64,6 @@ set(SRC_FILES annotator.cc set(GANDIVA_SHARED_PRIVATE_LINK_LIBS arrow_shared - ${BOOST_REGEX_LIBRARY} - ${BOOST_SYSTEM_LIBRARY} - ${BOOST_FILESYSTEM_LIBRARY} LLVM::LLVM_INTERFACE ${RE2_LIBRARY}) diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java index cd297034df80f..c4d6bd9070613 100644 --- a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java +++ b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java @@ -58,7 +58,7 @@ public void testAdd3() throws Exception { 1 * MILLION, 16 * THOUSAND, 4); System.out.println("Time taken for projecting 1m records of add3 is " + timeTaken + "ms"); - Assert.assertTrue(timeTaken <= 10 * toleranceRatio); + Assert.assertTrue(timeTaken <= 13 * toleranceRatio); } @Test From d6284cf89c75f4767996abe087a8eb203401fb6d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 10 Dec 2018 14:05:14 +0100 Subject: [PATCH 09/45] ARROW-3792: [C++] Writing a list-type chunked column to Parquet fails if any chunk is 0-length Thanks to @tanyaschlusser for providing a minimal reproduction to help find the underlying problem Author: Wes McKinney Closes #3141 from wesm/ARROW-3792 and squashes the following commits: 1ed82a57 Add test case and fix --- cpp/src/parquet/arrow/writer.cc | 5 +++++ python/pyarrow/tests/test_parquet.py | 33 ++++++++++++++++++++++++++-- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index ef5de07d87f16..402cbf0f2027c 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -861,6 +861,11 @@ Status ArrowColumnWriter::TypedWriteBatch( } Status ArrowColumnWriter::Write(const Array& data) { + if (data.length() == 0) { + // Write nothing when length is 0 + return Status::OK(); + } + ::arrow::Type::type values_type; RETURN_NOT_OK(GetLeafType(*data.type(), &values_type)); diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index c14056e8533b8..89d3224580463 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
+from collections import OrderedDict import datetime import decimal import io @@ -2224,6 +2225,34 @@ def test_merging_parquet_tables_with_different_pandas_metadata(tempdir): def test_writing_empty_lists(): # ARROW-2591: [Python] Segmentation fault issue in pq.write_table - arr = pa.array([[], []], pa.list_(pa.int32())) - table = pa.Table.from_arrays([arr], ['test']) + arr1 = pa.array([[], []], pa.list_(pa.int32())) + table = pa.Table.from_arrays([arr1], ['list(int32)']) _check_roundtrip(table) + + +def test_write_nested_zero_length_array_chunk_failure(): + # Bug report in ARROW-3792 + cols = OrderedDict( + int32=pa.int32(), + list_string=pa.list_(pa.string()) + ) + data = [[], [OrderedDict(int32=1, list_string=('G',)), ]] + + # This produces a table with a column like + # <Column name='list_string' type=ListType(list<item: string>)> + # [ + # [], + # [ + # [ + # "G" + # ] + # ] + # ] + # + # Each column is a ChunkedArray with 2 elements + my_arrays = [pa.array(batch, type=pa.struct(cols)).flatten() + for batch in data] + my_batches = [pa.RecordBatch.from_arrays(batch, pa.schema(cols)) + for batch in my_arrays] + tbl = pa.Table.from_batches(my_batches, pa.schema(cols)) + _check_roundtrip(tbl) From e4761e07d6d32e8c3fddac20f0abca0bb89543ad Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 10 Dec 2018 09:12:40 -0600 Subject: [PATCH 10/45] ARROW-3727: [Python] Document use of foreign_buffer() Author: Antoine Pitrou Closes #3146 from pitrou/ARROW-3727-foreign-buffer-doc and squashes the following commits: e81a5f0cf ARROW-3727: Document use of foreign_buffer() --- docs/source/python/memory.rst | 23 ++++++++++++++--------- python/pyarrow/io.pxi | 8 ++++++-- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/docs/source/python/memory.rst b/docs/source/python/memory.rst index 1ee81e754da1c..0d30866d0aa4d 100644 --- a/docs/source/python/memory.rst +++ b/docs/source/python/memory.rst @@ -35,8 +35,8 @@ Referencing and Allocating Memory pyarrow.Buffer -------------- -The :class:`~pyarrow.Buffer` object wraps the C++ ``arrow::Buffer`` type and is -the primary tool for memory management in Apache Arrow in C++. It permits +The :class:`Buffer` object wraps the C++ :cpp:class:`arrow::Buffer` type +which is the primary tool for memory management in Apache Arrow in C++. It permits higher-level array classes to safely interact with memory which they may or may not own. ``arrow::Buffer`` can be zero-copy sliced to permit Buffers to cheaply reference other Buffers, while preserving memory lifetime and clean @@ -46,8 +46,9 @@ There are many implementations of ``arrow::Buffer``, but they all provide a standard interface: a data pointer and length. This is similar to Python's built-in `buffer protocol` and ``memoryview`` objects. -A :class:`~pyarrow.Buffer` can be created from any Python object which -implements the buffer protocol. Let's consider a bytes object: +A :class:`Buffer` can be created from any Python object implementing +the buffer protocol by calling the :func:`py_buffer` function. Let's consider +a bytes object: .. ipython:: python @@ -61,18 +62,22 @@ implements the buffer protocol. Let's consider a bytes object: Creating a Buffer in this way does not allocate any memory; it is a zero-copy view on the memory exported from the ``data`` bytes object. -The Buffer's ``to_pybytes`` method can convert to a Python byte string: +External memory, in the form of a raw pointer and size, can also be +referenced using the :func:`foreign_buffer` function. 
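As a brief, hedged illustration of the API documented here (the owner object is made up for the example; any object that owns addressable memory would do):

    import numpy as np
    import pyarrow as pa

    owner = np.arange(8, dtype=np.uint8)        # an object that owns memory
    buf = pa.foreign_buffer(owner.ctypes.data,  # raw address as an integer
                            owner.nbytes,       # size in bytes
                            base=owner)         # kept alive with the buffer
    assert buf.to_pybytes() == owner.tobytes()
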
+ + Buffers can be used in circumstances where a Python buffer or memoryview is +required, and such conversions are zero-copy: .. ipython:: python - buf.to_pybytes() + memoryview(buf) -Buffers can be used in circumstances where a Python buffer or memoryview is -required, and such conversions are also zero-copy: +The Buffer's :meth:`~Buffer.to_pybytes` method converts the Buffer's data to a +Python bytestring (thus making a copy of the data): .. ipython:: python - memoryview(buf) + buf.to_pybytes() Memory Pools ------------ diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 9f7dc7bc8386f..97abde8f892af 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1173,10 +1173,14 @@ def py_buffer(object obj): return pyarrow_wrap_buffer(buf) -def foreign_buffer(address, size, base): +def foreign_buffer(address, size, base=None): """ Construct an Arrow buffer with the given *address* and *size*, - backed by the Python *base* object. + optionally backed by the Python *base* object. + + The *base* object, if given, will be kept alive as long as this buffer + is alive, including across language boundaries (for example if the + buffer is referenced by C++ code). """ cdef: intptr_t c_addr = address From fa5d5ad98349dddf98b66d67e8737f77bd261d1f Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 10 Dec 2018 09:15:01 -0600 Subject: [PATCH 11/45] ARROW-3980: [C++] Fix CRTP use in json-simple.cc Nudge the compiler into devirtualizing method calls. Author: Antoine Pitrou Closes #3144 from pitrou/ARROW-3980-json-crtp and squashes the following commits: ef96713a2 ARROW-3980: Fix CRTP use in json-simple.cc --- cpp/src/arrow/ipc/json-simple.cc | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc index b69bd76f51611..a8d120036e4f5 100644 --- a/cpp/src/arrow/ipc/json-simple.cc +++ b/cpp/src/arrow/ipc/json-simple.cc @@ -98,7 +98,7 @@ class ConcreteConverter : public Converter { // ------------------------------------------------------------------------ // Converter for null arrays -class NullConverter : public ConcreteConverter<NullConverter> { +class NullConverter final : public ConcreteConverter<NullConverter> { public: explicit NullConverter(const std::shared_ptr<DataType>& type) { type_ = type; @@ -109,7 +109,7 @@ class NullConverter : public ConcreteConverter<NullConverter> { Status AppendValue(const rj::Value& json_obj) override { if (json_obj.IsNull()) { - return builder_->AppendNull(); + return AppendNull(); } return JSONTypeError("null", json_obj.GetType()); } @@ -123,7 +123,7 @@ class NullConverter : public ConcreteConverter<NullConverter> { // ------------------------------------------------------------------------ // Converter for boolean arrays -class BooleanConverter : public ConcreteConverter<BooleanConverter> { +class BooleanConverter final : public ConcreteConverter<BooleanConverter> { public: explicit BooleanConverter(const std::shared_ptr<DataType>& type) { type_ = type; @@ -134,7 +134,7 @@ class BooleanConverter : public ConcreteConverter<BooleanConverter> { Status AppendValue(const rj::Value& json_obj) override { if (json_obj.IsNull()) { - return builder_->AppendNull(); + return AppendNull(); } if (json_obj.IsBool()) { return builder_->Append(json_obj.GetBool()); @@ -152,7 +152,7 @@ class BooleanConverter : public ConcreteConverter<BooleanConverter> { // Converter for int arrays template <typename Type> -class IntegerConverter : public ConcreteConverter<IntegerConverter<Type>> { +class IntegerConverter final : public ConcreteConverter<IntegerConverter<Type>> { using c_type = typename Type::c_type; static constexpr auto is_signed = std::is_signed<c_type>::value; @@ -166,7 
+166,7 @@ class IntegerConverter : public ConcreteConverter<IntegerConverter<Type>> { Status AppendValue(const rj::Value& json_obj) override { if (json_obj.IsNull()) { - return builder_->AppendNull(); + return AppendNull(); } return AppendNumber(json_obj); } @@ -220,7 +220,7 @@ class IntegerConverter : public ConcreteConverter<IntegerConverter<Type>> { // Converter for float arrays template <typename Type> -class FloatConverter : public ConcreteConverter<FloatConverter<Type>> { +class FloatConverter final : public ConcreteConverter<FloatConverter<Type>> { using c_type = typename Type::c_type; public: @@ -233,7 +233,7 @@ class FloatConverter : public ConcreteConverter<FloatConverter<Type>> { Status AppendValue(const rj::Value& json_obj) override { if (json_obj.IsNull()) { - return builder_->AppendNull(); + return AppendNull(); } if (json_obj.IsNumber()) { c_type v = static_cast<c_type>(json_obj.GetDouble()); @@ -252,7 +252,7 @@ class FloatConverter : public ConcreteConverter<FloatConverter<Type>> { // ------------------------------------------------------------------------ // Converter for decimal arrays -class DecimalConverter : public ConcreteConverter<DecimalConverter> { +class DecimalConverter final : public ConcreteConverter<DecimalConverter> { public: explicit DecimalConverter(const std::shared_ptr<DataType>& type) { this->type_ = type; @@ -264,7 +264,7 @@ class DecimalConverter : public ConcreteConverter<DecimalConverter> { Status AppendValue(const rj::Value& json_obj) override { if (json_obj.IsNull()) { - return builder_->AppendNull(); + return AppendNull(); } if (json_obj.IsString()) { int32_t precision, scale; @@ -292,7 +292,7 @@ class DecimalConverter : public ConcreteConverter<DecimalConverter> { // ------------------------------------------------------------------------ // Converter for string arrays -class StringConverter : public ConcreteConverter<StringConverter> { +class StringConverter final : public ConcreteConverter<StringConverter> { public: explicit StringConverter(const std::shared_ptr<DataType>& type) { this->type_ = type; @@ -303,7 +303,7 @@ class StringConverter : public ConcreteConverter<StringConverter> { Status AppendValue(const rj::Value& json_obj) override { if (json_obj.IsNull()) { - return builder_->AppendNull(); + return AppendNull(); } if (json_obj.IsString()) { auto view = util::string_view(json_obj.GetString(), json_obj.GetStringLength()); @@ -322,7 +322,7 @@ class StringConverter : public ConcreteConverter<StringConverter> { // ------------------------------------------------------------------------ // Converter for list arrays -class ListConverter : public ConcreteConverter<ListConverter> { +class ListConverter final : public ConcreteConverter<ListConverter> { public: explicit ListConverter(const std::shared_ptr<DataType>& type) { type_ = type; } @@ -338,7 +338,7 @@ class ListConverter : public ConcreteConverter<ListConverter> { Status AppendValue(const rj::Value& json_obj) override { if (json_obj.IsNull()) { - return builder_->AppendNull(); + return AppendNull(); } RETURN_NOT_OK(builder_->Append()); // Extend the child converter with this JSON array @@ -355,7 +355,7 @@ class ListConverter : public ConcreteConverter<ListConverter> { // ------------------------------------------------------------------------ // Converter for struct arrays -class StructConverter : public ConcreteConverter<StructConverter> { +class StructConverter final : public ConcreteConverter<StructConverter> { public: explicit StructConverter(const std::shared_ptr<DataType>& type) { type_ = type; } From 7a296bd597ab7061ff8f39280d3d6a9a694faf79 Mon Sep 17 00:00:00 2001 From: Pindikura Ravindra Date: Mon, 10 Dec 2018 09:28:47 -0600 Subject: [PATCH 12/45] ARROW-3977: [Gandiva] fix label during ctest invoc Author: Pindikura Ravindra Closes #3139 from pravindra/ci and squashes the following commits: 3372401c3 ARROW-3977: temporary disable valgrind c51b23aff ARROW-3977: fix label during ctest invoc --- 
.travis.yml | 3 ++- ci/travis_script_gandiva_cpp.sh | 5 +---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index b877e205b5bd0..7489d72c80502 100644 --- a/.travis.yml +++ b/.travis.yml @@ -110,7 +110,8 @@ matrix: env: - ARROW_TRAVIS_GANDIVA=1 - ARROW_TRAVIS_USE_TOOLCHAIN=1 - - ARROW_TRAVIS_VALGRIND=1 + # ARROW-3979 temporarily disabled. + - ARROW_TRAVIS_VALGRIND=0 - ARROW_BUILD_WARNING_LEVEL=CHECKIN - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9" before_script: diff --git a/ci/travis_script_gandiva_cpp.sh b/ci/travis_script_gandiva_cpp.sh index 4d0a9b7a6bac4..f3c379393fe14 100755 --- a/ci/travis_script_gandiva_cpp.sh +++ b/ci/travis_script_gandiva_cpp.sh @@ -23,10 +23,7 @@ source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh pushd $CPP_BUILD_DIR -PATH=$ARROW_BUILD_TYPE:$PATH ctest -j2 --output-on-failure -L gandiva,unittest - -# not running in parallel, since some of them are benchmarks -PATH=$ARROW_BUILD_TYPE:$PATH ctest -VV -L gandiva,integ +PATH=$ARROW_BUILD_TYPE:$PATH ctest -j2 --output-on-failure -L gandiva popd From e6b96aa30e68e3bbd020babbaa79eea83b747f0c Mon Sep 17 00:00:00 2001 From: Paddy Horan Date: Mon, 10 Dec 2018 16:30:30 +0100 Subject: [PATCH 13/45] ARROW-3687: [Rust] Anything measuring array slots should be `usize` Author: Paddy Horan Closes #3142 from paddyhoran/ARROW-3687 and squashes the following commits: c0a75e9c Fixed lint issues (outdated stable) 0b39fe87 Updated subtraction to be checked c4c223c5 Fixing lints for stable fmt d6aec71c All values measuring array slots changed to `usize` --- rust/src/array.rs | 81 +++++++++++++++++++-------------------- rust/src/array_data.rs | 61 +++++++++++++++-------------- rust/src/bitmap.rs | 6 +-- rust/src/buffer.rs | 18 ++++----- rust/src/builder.rs | 70 ++++++++++++++++----------------- rust/src/csv/reader.rs | 8 ++-- rust/src/memory.rs | 8 ++-- rust/src/record_batch.rs | 2 +- rust/src/tensor.rs | 72 +++++++++++++++++----------------- rust/src/util/bit_util.rs | 18 ++++----- 10 files changed, 173 insertions(+), 171 deletions(-) diff --git a/rust/src/array.rs b/rust/src/array.rs index ca1d2a5cdb1e7..51bc8d993c19b 100644 --- a/rust/src/array.rs +++ b/rust/src/array.rs @@ -47,27 +47,27 @@ pub trait Array: Send + Sync { } /// Returns the length (i.e., number of elements) of this array - fn len(&self) -> i64 { + fn len(&self) -> usize { self.data().len() } /// Returns the offset of this array - fn offset(&self) -> i64 { + fn offset(&self) -> usize { self.data().offset() } /// Returns whether the element at index `i` is null - fn is_null(&self, i: i64) -> bool { + fn is_null(&self, i: usize) -> bool { self.data().is_null(i) } /// Returns whether the element at index `i` is not null - fn is_valid(&self, i: i64) -> bool { + fn is_valid(&self, i: usize) -> bool { self.data().is_valid(i) } /// Returns the total number of nulls in this array - fn null_count(&self) -> i64 { + fn null_count(&self) -> usize { self.data().null_count() } } @@ -158,7 +158,7 @@ impl Array for PrimitiveArray { /// Implementation for primitive arrays with numeric types. /// Boolean arrays are bit-packed and so implemented separately. 
impl PrimitiveArray { - pub fn new(length: i64, values: Buffer, null_count: i64, offset: i64) -> Self { + pub fn new(length: usize, values: Buffer, null_count: usize, offset: usize) -> Self { let array_data = ArrayData::builder(T::get_data_type()) .len(length) .add_buffer(values) @@ -176,7 +176,7 @@ impl PrimitiveArray { } /// Returns the length of this array - pub fn len(&self) -> i64 { + pub fn len(&self) -> usize { self.data.len() } @@ -188,16 +188,16 @@ impl PrimitiveArray { /// Returns the primitive value at index `i`. /// /// Note this doesn't do any bound checking, for performance reason. - pub fn value(&self, i: i64) -> T::Native { + pub fn value(&self, i: usize) -> T::Native { unsafe { *(self.raw_values().offset(i as isize)) } } /// Returns a slice for the given offset and length /// /// Note this doesn't do any bound checking, for performance reason. - pub fn value_slice(&self, offset: i64, len: i64) -> &[T::Native] { - let raw = unsafe { std::slice::from_raw_parts(self.raw_values(), self.len() as usize) }; - &raw[offset as usize..offset as usize + len as usize] + pub fn value_slice(&self, offset: usize, len: usize) -> &[T::Native] { + let raw = unsafe { std::slice::from_raw_parts(self.raw_values(), self.len()) }; + &raw[offset..offset + len] } /// Returns the minimum value in the array, according to the natural order. @@ -220,7 +220,7 @@ impl PrimitiveArray { if data.is_null(i) { continue; } - let m = self.value(i as i64); + let m = self.value(i); match n { None => n = Some(m), Some(nn) => { @@ -234,14 +234,14 @@ impl PrimitiveArray { } // Returns a new primitive array builder - pub fn builder(capacity: i64) -> PrimitiveArrayBuilder { + pub fn builder(capacity: usize) -> PrimitiveArrayBuilder { PrimitiveArrayBuilder::::new(capacity) } } /// Specific implementation for Boolean arrays due to bit-packing impl PrimitiveArray { - pub fn new(length: i64, values: Buffer, null_count: i64, offset: i64) -> Self { + pub fn new(length: usize, values: Buffer, null_count: usize, offset: usize) -> Self { let array_data = ArrayData::builder(DataType::Boolean) .len(length) .add_buffer(values) @@ -259,14 +259,14 @@ impl PrimitiveArray { } /// Returns the boolean value at index `i`. - pub fn value(&self, i: i64) -> bool { + pub fn value(&self, i: usize) -> bool { let offset = i + self.offset(); assert!(offset < self.data.len()); - unsafe { bit_util::get_bit_raw(self.raw_values.get() as *const u8, offset as usize) } + unsafe { bit_util::get_bit_raw(self.raw_values.get() as *const u8, offset) } } // Returns a new primitive array builder - pub fn builder(capacity: i64) -> BooleanBuilder { + pub fn builder(capacity: usize) -> BooleanBuilder { BooleanBuilder::new(capacity) } } @@ -279,7 +279,7 @@ macro_rules! def_numeric_from_vec { impl From> for PrimitiveArray<$ty> { fn from(data: Vec<$native_ty>) -> Self { let array_data = ArrayData::builder($ty_id) - .len(data.len() as i64) + .len(data.len()) .add_buffer(Buffer::from(data.to_byte_slice())) .build(); PrimitiveArray::from(array_data) @@ -290,7 +290,7 @@ macro_rules! def_numeric_from_vec { impl From>> for PrimitiveArray<$ty> { fn from(data: Vec>) -> Self { let data_len = data.len(); - let num_bytes = bit_util::ceil(data_len as i64, 8) as usize; + let num_bytes = bit_util::ceil(data_len, 8); let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false); let mut val_buf = MutableBuffer::new(data_len * mem::size_of::<$native_ty>()); @@ -310,7 +310,7 @@ macro_rules! 
def_numeric_from_vec { } let array_data = ArrayData::builder($ty_id) - .len(data_len as i64) + .len(data_len) .add_buffer(val_buf.freeze()) .null_bit_buffer(null_buf.freeze()) .build(); @@ -334,7 +334,7 @@ def_numeric_from_vec!(Float64Type, f64, DataType::Float64); /// Constructs a boolean array from a vector. Should only be used for testing. impl From> for BooleanArray { fn from(data: Vec) -> Self { - let num_byte = bit_util::ceil(data.len() as i64, 8) as usize; + let num_byte = bit_util::ceil(data.len(), 8); let mut mut_buf = MutableBuffer::new(num_byte).with_bitset(num_byte, false); { let mut_slice = mut_buf.data_mut(); @@ -345,7 +345,7 @@ impl From> for BooleanArray { } } let array_data = ArrayData::builder(DataType::Boolean) - .len(data.len() as i64) + .len(data.len()) .add_buffer(mut_buf.freeze()) .build(); BooleanArray::from(array_data) @@ -354,8 +354,8 @@ impl From> for BooleanArray { impl From>> for BooleanArray { fn from(data: Vec>) -> Self { - let data_len = data.len() as i64; - let num_byte = bit_util::ceil(data_len, 8) as usize; + let data_len = data.len(); + let num_byte = bit_util::ceil(data_len, 8); let mut null_buf = MutableBuffer::new(num_byte).with_bitset(num_byte, false); let mut val_buf = MutableBuffer::new(num_byte).with_bitset(num_byte, false); @@ -425,7 +425,7 @@ impl ListArray { /// /// Note this doesn't do any bound checking, for performance reason. #[inline] - pub fn value_offset(&self, i: i64) -> i32 { + pub fn value_offset(&self, i: usize) -> i32 { self.value_offset_at(self.data.offset() + i) } @@ -433,13 +433,13 @@ impl ListArray { /// /// Note this doesn't do any bound checking, for performance reason. #[inline] - pub fn value_length(&self, mut i: i64) -> i32 { + pub fn value_length(&self, mut i: usize) -> i32 { i += self.data.offset(); self.value_offset_at(i + 1) - self.value_offset_at(i) } #[inline] - fn value_offset_at(&self, i: i64) -> i32 { + fn value_offset_at(&self, i: usize) -> i32 { unsafe { *self.value_offsets.get().offset(i as isize) } } } @@ -503,11 +503,8 @@ pub struct BinaryArray { impl BinaryArray { /// Returns the element at index `i` as a byte slice. - pub fn get_value(&self, i: i64) -> &[u8] { - assert!( - i >= 0 && i < self.data.len(), - "BinaryArray out of bounds access" - ); + pub fn get_value(&self, i: usize) -> &[u8] { + assert!(i < self.data.len(), "BinaryArray out of bounds access"); let offset = i.checked_add(self.data.offset()).unwrap(); unsafe { let pos = self.value_offset_at(offset); @@ -521,7 +518,7 @@ impl BinaryArray { /// Returns the element at index `i` as a string. /// /// Note this doesn't do any bound checking, for performance reason. - pub fn get_string(&self, i: i64) -> String { + pub fn get_string(&self, i: usize) -> String { let slice = self.get_value(i); unsafe { String::from_utf8_unchecked(Vec::from(slice)) } } @@ -530,7 +527,7 @@ impl BinaryArray { /// /// Note this doesn't do any bound checking, for performance reason. #[inline] - pub fn value_offset(&self, i: i64) -> i32 { + pub fn value_offset(&self, i: usize) -> i32 { self.value_offset_at(self.data.offset() + i) } @@ -538,13 +535,13 @@ impl BinaryArray { /// /// Note this doesn't do any bound checking, for performance reason. 
#[inline] - pub fn value_length(&self, mut i: i64) -> i32 { + pub fn value_length(&self, mut i: usize) -> i32 { i += self.data.offset(); self.value_offset_at(i + 1) - self.value_offset_at(i) } #[inline] - fn value_offset_at(&self, i: i64) -> i32 { + fn value_offset_at(&self, i: usize) -> i32 { unsafe { *self.value_offsets.get().offset(i as isize) } } } @@ -582,7 +579,7 @@ impl<'a> From> for BinaryArray { values.extend_from_slice(s.as_bytes()); } let array_data = ArrayData::builder(DataType::Utf8) - .len(v.len() as i64) + .len(v.len()) .add_buffer(Buffer::from(offsets.to_byte_slice())) .add_buffer(Buffer::from(&values[..])) .build(); @@ -664,7 +661,7 @@ impl Array for StructArray { } /// Returns the length (i.e., number of elements) of this array - fn len(&self) -> i64 { + fn len(&self) -> usize { self.boxed_fields[0].len() } } @@ -876,8 +873,8 @@ mod tests { assert_eq!(6, list_array.value_offset(2)); assert_eq!(2, list_array.value_length(2)); for i in 0..3 { - assert!(list_array.is_valid(i as i64)); - assert!(!list_array.is_null(i as i64)); + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); } // Now test with a non-zero offset @@ -991,8 +988,8 @@ mod tests { assert_eq!(5, binary_array.value_offset(2)); assert_eq!(7, binary_array.value_length(2)); for i in 0..3 { - assert!(binary_array.is_valid(i as i64)); - assert!(!binary_array.is_null(i as i64)); + assert!(binary_array.is_valid(i)); + assert!(!binary_array.is_null(i)); } // Test binary array with offset diff --git a/rust/src/array_data.rs b/rust/src/array_data.rs index b288d4a804535..36a817ee579a0 100644 --- a/rust/src/array_data.rs +++ b/rust/src/array_data.rs @@ -31,13 +31,13 @@ pub struct ArrayData { data_type: DataType, /// The number of elements in this array data - len: i64, + len: usize, /// The number of null elements in this array data - null_count: i64, + null_count: usize, /// The offset into this array data - offset: i64, + offset: usize, /// The buffers for this array data. 
Note that depending on the array types, this /// could hold different kinds of buffers (e.g., value buffer, value offset buffer) @@ -54,25 +54,28 @@ pub struct ArrayData { } pub type ArrayDataRef = Arc; -pub const UNKNOWN_NULL_COUNT: i64 = -1; impl ArrayData { pub fn new( data_type: DataType, - len: i64, - mut null_count: i64, + len: usize, + null_count: Option, null_bit_buffer: Option, - offset: i64, + offset: usize, buffers: Vec, child_data: Vec, ) -> Self { - if null_count < 0 { - null_count = if let Some(ref buf) = null_bit_buffer { - len - bit_util::count_set_bits_offset(buf.data(), offset as usize) - } else { - 0 - }; - } + let null_count = match null_count { + None => { + if let Some(ref buf) = null_bit_buffer { + len.checked_sub(bit_util::count_set_bits_offset(buf.data(), offset)) + .unwrap() + } else { + 0 + } + } + Some(null_count) => null_count, + }; let null_bitmap = null_bit_buffer.map(Bitmap::from); Self { data_type, @@ -106,7 +109,7 @@ impl ArrayData { } /// Returns whether the element at index `i` is null - pub fn is_null(&self, i: i64) -> bool { + pub fn is_null(&self, i: usize) -> bool { if let Some(ref b) = self.null_bitmap { return !b.is_set(i); } @@ -119,7 +122,7 @@ impl ArrayData { } /// Returns whether the element at index `i` is not null - pub fn is_valid(&self, i: i64) -> bool { + pub fn is_valid(&self, i: usize) -> bool { if let Some(ref b) = self.null_bitmap { return b.is_set(i); } @@ -127,17 +130,17 @@ impl ArrayData { } /// Returns the length (i.e., number of elements) of this array - pub fn len(&self) -> i64 { + pub fn len(&self) -> usize { self.len } /// Returns the offset of this array - pub fn offset(&self) -> i64 { + pub fn offset(&self) -> usize { self.offset } /// Returns the total number of nulls in this array - pub fn null_count(&self) -> i64 { + pub fn null_count(&self) -> usize { self.null_count } } @@ -145,10 +148,10 @@ impl ArrayData { /// Builder for `ArrayData` type pub struct ArrayDataBuilder { data_type: DataType, - len: i64, - null_count: i64, + len: usize, + null_count: Option, null_bit_buffer: Option, - offset: i64, + offset: usize, buffers: Vec, child_data: Vec, } @@ -158,7 +161,7 @@ impl ArrayDataBuilder { Self { data_type, len: 0, - null_count: UNKNOWN_NULL_COUNT, + null_count: None, null_bit_buffer: None, offset: 0, buffers: vec![], @@ -166,13 +169,13 @@ impl ArrayDataBuilder { } } - pub fn len(mut self, n: i64) -> Self { + pub fn len(mut self, n: usize) -> Self { self.len = n; self } - pub fn null_count(mut self, n: i64) -> Self { - self.null_count = n; + pub fn null_count(mut self, n: usize) -> Self { + self.null_count = Some(n); self } @@ -181,7 +184,7 @@ impl ArrayDataBuilder { self } - pub fn offset(mut self, n: i64) -> Self { + pub fn offset(mut self, n: usize) -> Self { self.offset = n; self } @@ -230,7 +233,7 @@ mod tests { #[test] fn test_new() { - let arr_data = ArrayData::new(DataType::Boolean, 10, 1, None, 2, vec![], vec![]); + let arr_data = ArrayData::new(DataType::Boolean, 10, Some(1), None, 2, vec![], vec![]); assert_eq!(10, arr_data.len()); assert_eq!(1, arr_data.null_count()); assert_eq!(2, arr_data.offset()); @@ -244,7 +247,7 @@ mod tests { let child_arr_data = Arc::new(ArrayData::new( DataType::Int32, 10, - 0, + Some(0), None, 0, vec![], diff --git a/rust/src/bitmap.rs b/rust/src/bitmap.rs index 742fac5587b3e..3d5a77f78a51e 100644 --- a/rust/src/bitmap.rs +++ b/rust/src/bitmap.rs @@ -45,9 +45,9 @@ impl Bitmap { self.bits.len() } - pub fn is_set(&self, i: i64) -> bool { - assert!(i < (self.bits.len() << 3) as i64); - 
unsafe { bit_util::get_bit_raw(self.bits.raw_data(), i as usize) } + pub fn is_set(&self, i: usize) -> bool { + assert!(i < (self.bits.len() << 3)); + unsafe { bit_util::get_bit_raw(self.bits.raw_data(), i) } } } diff --git a/rust/src/buffer.rs b/rust/src/buffer.rs index 4b7d2a0d3c97e..b9c159f33857a 100644 --- a/rust/src/buffer.rs +++ b/rust/src/buffer.rs @@ -49,7 +49,7 @@ impl PartialEq for BufferData { if self.len != other.len { return false; } - unsafe { memory::memcmp(self.ptr, other.ptr, self.len as usize) == 0 } + unsafe { memory::memcmp(self.ptr, other.ptr, self.len) == 0 } } } @@ -73,7 +73,7 @@ impl Buffer { /// Returns the number of bytes in the buffer pub fn len(&self) -> usize { - self.data.len - self.offset as usize + self.data.len - self.offset } /// Returns whether the buffer is empty. @@ -128,7 +128,7 @@ impl> From for Buffer { // allocate aligned memory buffer let slice = p.as_ref(); let len = slice.len() * mem::size_of::(); - let buffer = memory::allocate_aligned((len) as i64).unwrap(); + let buffer = memory::allocate_aligned(len).unwrap(); unsafe { memory::memcpy(buffer, slice.as_ptr(), len); } @@ -151,12 +151,12 @@ pub struct MutableBuffer { impl MutableBuffer { /// Allocate a new mutable buffer with initial capacity to be `capacity`. pub fn new(capacity: usize) -> Self { - let new_capacity = bit_util::round_upto_multiple_of_64(capacity as i64); + let new_capacity = bit_util::round_upto_multiple_of_64(capacity); let ptr = memory::allocate_aligned(new_capacity).unwrap(); Self { data: ptr, len: 0, - capacity: new_capacity as usize, + capacity: new_capacity, } } @@ -193,8 +193,8 @@ impl MutableBuffer { /// Returns the new capacity for this buffer. pub fn reserve(&mut self, capacity: usize) -> Result { if capacity > self.capacity { - let new_capacity = bit_util::round_upto_multiple_of_64(capacity as i64); - let new_capacity = cmp::max(new_capacity, self.capacity as i64 * 2) as usize; + let new_capacity = bit_util::round_upto_multiple_of_64(capacity); + let new_capacity = cmp::max(new_capacity, self.capacity * 2); let new_data = memory::reallocate(self.capacity, new_capacity, self.data)?; self.data = new_data as *mut u8; self.capacity = new_capacity; @@ -213,7 +213,7 @@ impl MutableBuffer { if new_len > self.len { self.reserve(new_len)?; } else { - let new_capacity = bit_util::round_upto_multiple_of_64(new_len as i64) as usize; + let new_capacity = bit_util::round_upto_multiple_of_64(new_len); if new_capacity < self.capacity { let new_data = memory::reallocate(self.capacity, new_capacity, self.data)?; self.data = new_data as *mut u8; @@ -287,7 +287,7 @@ impl PartialEq for MutableBuffer { if self.len != other.len { return false; } - unsafe { memory::memcmp(self.data, other.data, self.len as usize) == 0 } + unsafe { memory::memcmp(self.data, other.data, self.len) == 0 } } } diff --git a/rust/src/builder.rs b/rust/src/builder.rs index 2cbdce0c8570b..fc781ffa50641 100644 --- a/rust/src/builder.rs +++ b/rust/src/builder.rs @@ -33,7 +33,7 @@ use crate::util::bit_util; /// Buffer builder with zero-copy build method pub struct BufferBuilder { buffer: MutableBuffer, - len: i64, + len: usize, _marker: PhantomData, } @@ -53,11 +53,11 @@ pub type Float64BufferBuilder = BufferBuilder; // numeric types and boolean types, while still be able to call methods on buffer builder // with generic primitive type. 
pub trait BufferBuilderTrait { - fn new(capacity: i64) -> Self; - fn len(&self) -> i64; - fn capacity(&self) -> i64; - fn advance(&mut self, i: i64) -> Result<()>; - fn reserve(&mut self, n: i64) -> Result<()>; + fn new(capacity: usize) -> Self; + fn len(&self) -> usize; + fn capacity(&self) -> usize; + fn advance(&mut self, i: usize) -> Result<()>; + fn reserve(&mut self, n: usize) -> Result<()>; fn push(&mut self, v: T::Native) -> Result<()>; fn push_slice(&mut self, slice: &[T::Native]) -> Result<()>; fn finish(self) -> Buffer; @@ -65,8 +65,8 @@ pub trait BufferBuilderTrait { impl BufferBuilderTrait for BufferBuilder { /// Creates a builder with a fixed initial capacity - default fn new(capacity: i64) -> Self { - let buffer = MutableBuffer::new(capacity as usize * mem::size_of::()); + default fn new(capacity: usize) -> Self { + let buffer = MutableBuffer::new(capacity * mem::size_of::()); Self { buffer, len: 0, @@ -75,28 +75,28 @@ impl BufferBuilderTrait for BufferBuilder { } /// Returns the number of array elements (slots) in the builder - fn len(&self) -> i64 { + fn len(&self) -> usize { self.len } /// Returns the current capacity of the builder (number of elements) - fn capacity(&self) -> i64 { + fn capacity(&self) -> usize { let bit_capacity = self.buffer.capacity() * 8; - (bit_capacity / T::get_bit_width()) as i64 + (bit_capacity / T::get_bit_width()) } // Advances the `len` of the underlying `Buffer` by `i` slots of type T - default fn advance(&mut self, i: i64) -> Result<()> { - let new_buffer_len = (self.len + i) as usize * mem::size_of::(); + default fn advance(&mut self, i: usize) -> Result<()> { + let new_buffer_len = (self.len + i) * mem::size_of::(); self.buffer.resize(new_buffer_len)?; self.len += i; Ok(()) } /// Reserves memory for `n` elements of type `T`. - default fn reserve(&mut self, n: i64) -> Result<()> { + default fn reserve(&mut self, n: usize) -> Result<()> { let new_capacity = self.len + n; - let byte_capacity = mem::size_of::() * new_capacity as usize; + let byte_capacity = mem::size_of::() * new_capacity; self.buffer.reserve(byte_capacity)?; Ok(()) } @@ -109,7 +109,7 @@ impl BufferBuilderTrait for BufferBuilder { /// Pushes a slice of type `T`, growing the internal buffer as needed. default fn push_slice(&mut self, slice: &[T::Native]) -> Result<()> { - let array_slots = slice.len() as i64; + let array_slots = slice.len(); self.reserve(array_slots)?; self.write_bytes(slice.to_byte_slice(), array_slots) } @@ -124,7 +124,7 @@ impl BufferBuilder { /// Writes a byte slice to the underlying buffer and updates the `len`, i.e. the number array /// elements in the builder. Also, converts the `io::Result` required by the `Write` trait /// to the Arrow `Result` type. - fn write_bytes(&mut self, bytes: &[u8], len_added: i64) -> Result<()> { + fn write_bytes(&mut self, bytes: &[u8], len_added: usize) -> Result<()> { let write_result = self.buffer.write(bytes); // `io::Result` has many options one of which we use, so pattern matching is overkill here if write_result.is_err() { @@ -140,9 +140,9 @@ impl BufferBuilder { impl BufferBuilderTrait for BufferBuilder { /// Creates a builder with a fixed initial capacity. 
- fn new(capacity: i64) -> Self { + fn new(capacity: usize) -> Self { let byte_capacity = bit_util::ceil(capacity, 8); - let actual_capacity = bit_util::round_upto_multiple_of_64(byte_capacity) as usize; + let actual_capacity = bit_util::round_upto_multiple_of_64(byte_capacity); let mut buffer = MutableBuffer::new(actual_capacity); buffer.set_null_bits(0, actual_capacity); Self { @@ -153,9 +153,9 @@ impl BufferBuilderTrait for BufferBuilder { } // Advances the `len` of the underlying `Buffer` by `i` slots of type T - fn advance(&mut self, i: i64) -> Result<()> { + fn advance(&mut self, i: usize) -> Result<()> { let new_buffer_len = bit_util::ceil(self.len + i, 8); - self.buffer.resize(new_buffer_len as usize)?; + self.buffer.resize(new_buffer_len)?; self.len += i; Ok(()) } @@ -167,7 +167,7 @@ impl BufferBuilderTrait for BufferBuilder { // For performance the `len` of the buffer is not updated on each push but // is updated in the `freeze` method instead. unsafe { - bit_util::set_bit_raw(self.buffer.raw_data() as *mut u8, (self.len) as usize); + bit_util::set_bit_raw(self.buffer.raw_data() as *mut u8, self.len); } } self.len += 1; @@ -184,10 +184,10 @@ impl BufferBuilderTrait for BufferBuilder { } /// Reserves memory for `n` elements of type `T`. - fn reserve(&mut self, n: i64) -> Result<()> { + fn reserve(&mut self, n: usize) -> Result<()> { let new_capacity = self.len + n; if new_capacity > self.capacity() { - let new_byte_capacity = bit_util::ceil(new_capacity, 8) as usize; + let new_byte_capacity = bit_util::ceil(new_capacity, 8); let existing_capacity = self.buffer.capacity(); let new_capacity = self.buffer.reserve(new_byte_capacity)?; self.buffer @@ -199,7 +199,7 @@ impl BufferBuilderTrait for BufferBuilder { /// Consumes this and returns an immutable `Buffer`. fn finish(mut self) -> Buffer { // `push` does not update the buffer's `len` so do it before `freeze` is called. 
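[Editor's note] The boolean specialization here packs one bit per slot rather than one byte per slot, so it allocates `ceil(len, 8)` bytes and sets individual bits on `push`. The bit addressing, sketched in Python (LSB-first within each byte, as in Arrow validity bitmaps; illustration only):

```python
def ceil(value, divisor):
    # bit_util::ceil -- ceiling division without floating point
    return value // divisor + (1 if value % divisor != 0 else 0)

def set_bit(buf, i):
    # byte i >> 3, bit i & 7 -- same addressing as set_bit_raw
    buf[i >> 3] |= 1 << (i & 7)

bits = bytearray(ceil(10, 8))   # 10 boolean slots fit in 2 bytes
set_bit(bits, 0)
set_bit(bits, 9)
assert bits == bytearray([0b00000001, 0b00000010])
```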
- let new_buffer_len = bit_util::ceil(self.len, 8) as usize; + let new_buffer_len = bit_util::ceil(self.len, 8); debug_assert!(new_buffer_len >= self.buffer.len()); self.buffer.resize(new_buffer_len).unwrap(); self.buffer.freeze() @@ -216,7 +216,7 @@ pub trait ArrayBuilder { fn into_any(self) -> Box; /// Returns the number of array slots in the builder - fn len(&self) -> i64; + fn len(&self) -> usize; /// Builds the array fn finish(self) -> Self::ArrayType; @@ -250,7 +250,7 @@ impl ArrayBuilder for PrimitiveArrayBuilder { } /// Returns the number of array slots in the builder - fn len(&self) -> i64 { + fn len(&self) -> usize { self.values_builder.len } @@ -270,7 +270,7 @@ impl ArrayBuilder for PrimitiveArrayBuilder { impl PrimitiveArrayBuilder { /// Creates a new primitive array builder - pub fn new(capacity: i64) -> Self { + pub fn new(capacity: usize) -> Self { Self { values_builder: BufferBuilder::::new(capacity), bitmap_builder: BooleanBufferBuilder::new(capacity), @@ -278,7 +278,7 @@ impl PrimitiveArrayBuilder { } /// Returns the capacity of this builder measured in slots of type `T` - pub fn capacity(&self) -> i64 { + pub fn capacity(&self) -> usize { self.values_builder.capacity() } @@ -318,7 +318,7 @@ pub struct ListArrayBuilder { offsets_builder: Int32BufferBuilder, bitmap_builder: BooleanBufferBuilder, values_builder: T, - len: i64, + len: usize, } impl ListArrayBuilder { @@ -348,7 +348,7 @@ where } /// Returns the number of array slots in the builder - fn len(&self) -> i64 { + fn len(&self) -> usize { self.len } @@ -410,7 +410,7 @@ impl ArrayBuilder for BinaryArrayBuilder { } /// Returns the number of array slots in the builder - fn len(&self) -> i64 { + fn len(&self) -> usize { self.builder.len() } @@ -422,7 +422,7 @@ impl ArrayBuilder for BinaryArrayBuilder { impl BinaryArrayBuilder { /// Creates a new `BinaryArrayBuilder`, `capacity` is the number of bytes in the values array - pub fn new(capacity: i64) -> Self { + pub fn new(capacity: usize) -> Self { let values_builder = UInt8Builder::new(capacity); Self { builder: ListArrayBuilder::new(values_builder), @@ -736,8 +736,8 @@ mod tests { assert_eq!(6, list_array.value_offset(2)); assert_eq!(2, list_array.value_length(2)); for i in 0..3 { - assert!(list_array.is_valid(i as i64)); - assert!(!list_array.is_null(i as i64)); + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); } } diff --git a/rust/src/csv/reader.rs b/rust/src/csv/reader.rs index 697ace653b691..956408e4a40c3 100644 --- a/rust/src/csv/reader.rs +++ b/rust/src/csv/reader.rs @@ -44,13 +44,15 @@ use std::fs::File; use std::io::BufReader; use std::sync::Arc; +use csv as csv_crate; + use crate::array::{ArrayRef, BinaryArray}; use crate::builder::*; use crate::datatypes::*; use crate::error::{ArrowError, Result}; use crate::record_batch::RecordBatch; -use csv_crate::{StringRecord, StringRecordsIntoIter}; +use self::csv_crate::{StringRecord, StringRecordsIntoIter}; /// CSV file reader pub struct Reader { @@ -91,7 +93,7 @@ fn build_primitive_array( rows: &[StringRecord], col_idx: &usize, ) -> Result { - let mut builder = PrimitiveArrayBuilder::::new(rows.len() as i64); + let mut builder = PrimitiveArrayBuilder::::new(rows.len()); for row_index in 0..rows.len() { match rows[row_index].get(*col_idx) { Some(s) if s.len() > 0 => match s.parse::() { @@ -161,7 +163,7 @@ impl Reader { &DataType::Float32 => build_primitive_array::(rows, i), &DataType::Float64 => build_primitive_array::(rows, i), &DataType::Utf8 => { - let values_builder: UInt8Builder = 
UInt8Builder::new(rows.len() as i64); + let values_builder: UInt8Builder = UInt8Builder::new(rows.len()); let mut list_builder = ListArrayBuilder::new(values_builder); for row_index in 0..rows.len() { match rows[row_index].get(*i) { diff --git a/rust/src/memory.rs b/rust/src/memory.rs index 193eff12d6f6f..763cb48f50f9e 100644 --- a/rust/src/memory.rs +++ b/rust/src/memory.rs @@ -31,7 +31,7 @@ extern "C" { } #[cfg(windows)] -pub fn allocate_aligned(size: i64) -> Result<*mut u8> { +pub fn allocate_aligned(size: usize) -> Result<*mut u8> { let page = unsafe { _aligned_malloc(size as libc::size_t, ALIGNMENT as libc::size_t) }; match page { 0 => Err(ArrowError::MemoryError( @@ -42,10 +42,10 @@ pub fn allocate_aligned(size: i64) -> Result<*mut u8> { } #[cfg(not(windows))] -pub fn allocate_aligned(size: i64) -> Result<*mut u8> { +pub fn allocate_aligned(size: usize) -> Result<*mut u8> { unsafe { let mut page: *mut libc::c_void = mem::uninitialized(); - let result = libc::posix_memalign(&mut page, ALIGNMENT, size as usize); + let result = libc::posix_memalign(&mut page, ALIGNMENT, size); match result { 0 => Ok(mem::transmute::<*mut libc::c_void, *mut u8>(page)), _ => Err(ArrowError::MemoryError( @@ -72,7 +72,7 @@ pub fn free_aligned(p: *const u8) { pub fn reallocate(old_size: usize, new_size: usize, pointer: *const u8) -> Result<*const u8> { unsafe { let old_src = mem::transmute::<*const u8, *mut libc::c_void>(pointer); - let result = allocate_aligned(new_size as i64)?; + let result = allocate_aligned(new_size)?; let dst = mem::transmute::<*const u8, *mut libc::c_void>(result); libc::memcpy(dst, old_src, cmp::min(old_size, new_size)); free_aligned(pointer); diff --git a/rust/src/record_batch.rs b/rust/src/record_batch.rs index 4cb5c8e7db4df..2666770460e84 100644 --- a/rust/src/record_batch.rs +++ b/rust/src/record_batch.rs @@ -52,7 +52,7 @@ impl RecordBatch { self.columns.len() } - pub fn num_rows(&self) -> i64 { + pub fn num_rows(&self) -> usize { self.columns[0].data().len() } diff --git a/rust/src/tensor.rs b/rust/src/tensor.rs index ec56aeb4cccd5..175b68d81f188 100644 --- a/rust/src/tensor.rs +++ b/rust/src/tensor.rs @@ -23,30 +23,30 @@ use crate::buffer::Buffer; use crate::datatypes::*; /// Computes the strides required assuming a row major memory layout -fn compute_row_major_strides(shape: &Vec) -> Vec { +fn compute_row_major_strides(shape: &Vec) -> Vec { let mut remaining_bytes = mem::size_of::(); for i in shape { remaining_bytes = remaining_bytes - .checked_mul(*i as usize) + .checked_mul(*i) .expect("Overflow occurred when computing row major strides."); } - let mut strides = Vec::::new(); + let mut strides = Vec::::new(); for i in shape { - remaining_bytes /= *i as usize; - strides.push(remaining_bytes as i64); + remaining_bytes /= *i; + strides.push(remaining_bytes); } strides } /// Computes the strides required assuming a column major memory layout -fn compute_column_major_strides(shape: &Vec) -> Vec { +fn compute_column_major_strides(shape: &Vec) -> Vec { let mut remaining_bytes = mem::size_of::(); - let mut strides = Vec::::new(); + let mut strides = Vec::::new(); for i in shape { - strides.push(remaining_bytes as i64); + strides.push(remaining_bytes); remaining_bytes = remaining_bytes - .checked_mul(*i as usize) + .checked_mul(*i) .expect("Overflow occurred when computing column major strides."); } strides @@ -56,8 +56,8 @@ fn compute_column_major_strides(shape: &Vec) -> Vec< pub struct Tensor<'a, T: ArrowPrimitiveType> { data_type: DataType, buffer: Buffer, - shape: Option>, - 
strides: Option>, + shape: Option>, + strides: Option>, names: Option>, _marker: PhantomData, } @@ -78,8 +78,8 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { /// Creates a new `Tensor` pub fn new( buffer: Buffer, - shape: Option>, - strides: Option>, + shape: Option>, + strides: Option>, names: Option>, ) -> Self { match &shape { @@ -122,7 +122,7 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { /// Creates a new Tensor using row major memory layout pub fn new_row_major( buffer: Buffer, - shape: Option>, + shape: Option>, names: Option>, ) -> Self { let strides = match &shape { @@ -135,7 +135,7 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { /// Creates a new Tensor using column major memory layout pub fn new_column_major( buffer: Buffer, - shape: Option>, + shape: Option>, names: Option>, ) -> Self { let strides = match &shape { @@ -151,7 +151,7 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { } /// The sizes of the dimensions - pub fn shape(&self) -> Option<&Vec> { + pub fn shape(&self) -> Option<&Vec> { self.shape.as_ref() } @@ -161,7 +161,7 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { } /// The number of bytes between elements in each dimension - pub fn strides(&self) -> Option<&Vec> { + pub fn strides(&self) -> Option<&Vec> { self.strides.as_ref() } @@ -171,24 +171,24 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { } /// The number of dimensions - pub fn ndim(&self) -> i64 { + pub fn ndim(&self) -> usize { match &self.shape { None => 0, - Some(v) => v.len() as i64, + Some(v) => v.len(), } } /// The name of dimension i - pub fn dim_name(&self, i: i64) -> Option<&'a str> { + pub fn dim_name(&self, i: usize) -> Option<&'a str> { match &self.names { None => None, - Some(ref names) => Some(&names[i as usize]), + Some(ref names) => Some(&names[i]), } } /// The total number of elements in the `Tensor` - pub fn size(&self) -> i64 { - (self.buffer.len() / mem::size_of::()) as i64 + pub fn size(&self) -> usize { + (self.buffer.len() / mem::size_of::()) } /// Indicates if the data is laid out contiguously in memory @@ -223,15 +223,15 @@ mod tests { fn test_compute_row_major_strides() { assert_eq!( vec![48, 8], - compute_row_major_strides::(&vec![4_i64, 6]) + compute_row_major_strides::(&vec![4_usize, 6]) ); assert_eq!( vec![24, 4], - compute_row_major_strides::(&vec![4_i64, 6]) + compute_row_major_strides::(&vec![4_usize, 6]) ); assert_eq!( vec![6, 1], - compute_row_major_strides::(&vec![4_i64, 6]) + compute_row_major_strides::(&vec![4_usize, 6]) ); } @@ -239,15 +239,15 @@ mod tests { fn test_compute_column_major_strides() { assert_eq!( vec![8, 32], - compute_column_major_strides::(&vec![4_i64, 6]) + compute_column_major_strides::(&vec![4_usize, 6]) ); assert_eq!( vec![4, 16], - compute_column_major_strides::(&vec![4_i64, 6]) + compute_column_major_strides::(&vec![4_usize, 6]) ); assert_eq!( vec![1, 4], - compute_column_major_strides::(&vec![4_i64, 6]) + compute_column_major_strides::(&vec![4_usize, 6]) ); } @@ -283,7 +283,7 @@ mod tests { let buf = builder.finish(); let tensor = Int32Tensor::new(buf, Some(vec![2, 8]), None, None); assert_eq!(16, tensor.size()); - assert_eq!(Some(vec![2_i64, 8]).as_ref(), tensor.shape()); + assert_eq!(Some(vec![2_usize, 8]).as_ref(), tensor.shape()); assert_eq!(None, tensor.strides()); assert_eq!(2, tensor.ndim()); assert_eq!(None, tensor.names()); @@ -298,8 +298,8 @@ mod tests { let buf = builder.finish(); let tensor = Int32Tensor::new_row_major(buf, Some(vec![2, 8]), None); assert_eq!(16, tensor.size()); - assert_eq!(Some(vec![2_i64, 
8]).as_ref(), tensor.shape()); - assert_eq!(Some(vec![32_i64, 4]).as_ref(), tensor.strides()); + assert_eq!(Some(vec![2_usize, 8]).as_ref(), tensor.shape()); + assert_eq!(Some(vec![32_usize, 4]).as_ref(), tensor.strides()); assert_eq!(None, tensor.names()); assert_eq!(2, tensor.ndim()); assert_eq!(true, tensor.is_row_major()); @@ -316,8 +316,8 @@ mod tests { let buf = builder.finish(); let tensor = Int32Tensor::new_column_major(buf, Some(vec![2, 8]), None); assert_eq!(16, tensor.size()); - assert_eq!(Some(vec![2_i64, 8]).as_ref(), tensor.shape()); - assert_eq!(Some(vec![4_i64, 8]).as_ref(), tensor.strides()); + assert_eq!(Some(vec![2_usize, 8]).as_ref(), tensor.shape()); + assert_eq!(Some(vec![4_usize, 8]).as_ref(), tensor.strides()); assert_eq!(None, tensor.names()); assert_eq!(2, tensor.ndim()); assert_eq!(false, tensor.is_row_major()); @@ -335,8 +335,8 @@ mod tests { let names = vec!["Dim 1", "Dim 2"]; let tensor = Int64Tensor::new_column_major(buf, Some(vec![2, 4]), Some(names)); assert_eq!(8, tensor.size()); - assert_eq!(Some(vec![2_i64, 4]).as_ref(), tensor.shape()); - assert_eq!(Some(vec![8_i64, 16]).as_ref(), tensor.strides()); + assert_eq!(Some(vec![2_usize, 4]).as_ref(), tensor.shape()); + assert_eq!(Some(vec![8_usize, 16]).as_ref(), tensor.strides()); assert_eq!("Dim 1", tensor.dim_name(0).unwrap()); assert_eq!("Dim 2", tensor.dim_name(1).unwrap()); assert_eq!(2, tensor.ndim()); diff --git a/rust/src/util/bit_util.rs b/rust/src/util/bit_util.rs index da6d10d269ca2..3f7f4cb573b49 100644 --- a/rust/src/util/bit_util.rs +++ b/rust/src/util/bit_util.rs @@ -30,13 +30,13 @@ static POPCOUNT_TABLE: [u8; 256] = [ /// Returns the nearest number that is `>=` than `num` and is a multiple of 64 #[inline] -pub fn round_upto_multiple_of_64(num: i64) -> i64 { +pub fn round_upto_multiple_of_64(num: usize) -> usize { round_upto_power_of_2(num, 64) } /// Returns the nearest multiple of `factor` that is `>=` than `num`. Here `factor` must /// be a power of 2. -fn round_upto_power_of_2(num: i64, factor: i64) -> i64 { +fn round_upto_power_of_2(num: usize, factor: usize) -> usize { debug_assert!(factor > 0 && (factor & (factor - 1)) == 0); (num + (factor - 1)) & !(factor - 1) } @@ -73,20 +73,20 @@ pub unsafe fn set_bit_raw(data: *mut u8, i: usize) { /// Returns the number of 1-bits in `data` #[inline] -pub fn count_set_bits(data: &[u8]) -> i64 { - let mut count: i64 = 0; +pub fn count_set_bits(data: &[u8]) -> usize { + let mut count: usize = 0; for u in data { - count += POPCOUNT_TABLE[*u as usize] as i64; + count += POPCOUNT_TABLE[*u as usize] as usize; } count } /// Returns the number of 1-bits in `data`, starting from `offset`. 
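[Editor's note] The expected values in the stride tests above are easy to cross-check against NumPy, which computes strides the same way (bytes between consecutive elements along each dimension). Illustration only; assumes NumPy is available:

```python
import numpy as np

# shape (4, 6) with 8-byte elements: row-major strides are (48, 8),
# matching compute_row_major_strides::<Int64Type> in the tests above
assert np.zeros((4, 6), dtype=np.int64).strides == (48, 8)

# column-major layout gives (8, 32), matching
# compute_column_major_strides::<Int64Type>
assert np.zeros((4, 6), dtype=np.int64, order='F').strides == (8, 32)
```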
#[inline]
-pub fn count_set_bits_offset(data: &[u8], offset: usize) -> i64 {
+pub fn count_set_bits_offset(data: &[u8], offset: usize) -> usize {
     debug_assert!(offset <= (data.len() << 3));

-    let start_byte_pos = (offset >> 3) as usize;
+    let start_byte_pos = offset >> 3;
     let start_bit_pos = offset & 7;

     if start_bit_pos == 0 {
@@ -95,7 +95,7 @@ pub fn count_set_bits_offset(data: &[u8], offset: usize) -> i64 {
         let mut result = 0;
         result += count_set_bits(&data[start_byte_pos + 1..]);
         for i in start_bit_pos..8 {
-            if get_bit(&data[start_byte_pos..start_byte_pos + 1], i as usize) {
+            if get_bit(&data[start_byte_pos..start_byte_pos + 1], i) {
                 result += 1;
             }
         }
@@ -105,7 +105,7 @@

 /// Returns the ceil of `value`/`divisor`
 #[inline]
-pub fn ceil(value: i64, divisor: i64) -> i64 {
+pub fn ceil(value: usize, divisor: usize) -> usize {
     let mut result = value / divisor;
     if value % divisor != 0 {
         result += 1

From 8973cfe4332e4b8e917fb52e47168ccea8b9653d Mon Sep 17 00:00:00 2001
From: Praveen
Date: Mon, 10 Dec 2018 16:39:43 +0100
Subject: [PATCH 14/45] ARROW-3983: [Gandiva][Crossbow] Link Boost statically in JAR packaging scripts

Use static boost libraries while packaging Gandiva.

Author: Praveen

Closes #3145 from praveenbingo/ARROW-3983 and squashes the following commits:

2a704969 ARROW-3983: Use static version of boost.
---
 dev/tasks/gandiva-jars/build-cpp.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dev/tasks/gandiva-jars/build-cpp.sh b/dev/tasks/gandiva-jars/build-cpp.sh
index a0538cf6f3116..21289dee5a6b1 100755
--- a/dev/tasks/gandiva-jars/build-cpp.sh
+++ b/dev/tasks/gandiva-jars/build-cpp.sh
@@ -29,6 +29,7 @@ pushd arrow/cpp
       -DARROW_GANDIVA=ON \
       -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON \
       -DARROW_BUILD_UTILITIES=OFF \
+      -DARROW_BOOST_USE_SHARED=OFF \
       ..
  make -j4
  ctest

From 9da458437162574f3e0d82e4a51dc6c1589b9f94 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?=
Date: Mon, 10 Dec 2018 16:42:53 +0100
Subject: [PATCH 15/45] ARROW-2624: [Python] Random schema generator for Arrow conversion and Parquet testing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- introduced hypothesis to generate pyarrow types, fields and schemas
- test cases to highlight the functionality provided by hypothesis
- hypothesis tests are disabled by default
- represent key-value metadata as OrderedDict on the python side instead of plain dicts (pickling was nondeterministic; found this bug via hypothesis)
- unified multiple metadata conversion paths into a single one (pyarrow_wrap_metadata, pyarrow_unwrap_metadata)

Also resolves: [ARROW-3901: [Python] Make Schema hashable](https://issues.apache.org/jira/browse/ARROW-3901)

Follow-up issue: [ARROW-3903: [Python] Random data generator for
testing](https://issues.apache.org/jira/browse/ARROW-3903) Author: Krisztián Szűcs Closes #3046 from kszucs/ARROW-2624 and squashes the following commits: 3e27ad15 hypo profiles 88b107bb install hypothesis for msvc wheel test 8fb6d0bc make pyarrow_wrap_metadata private 80a276be manylinux 26e6ecd6 manylinux e385d243 manylinux b6fe7576 append in unwrap 0e28e5df ci fixes efeb65ee use conde_env_python.yml in travis 1f7ad6b6 don't validate metadata type pyarrow_wrap_metadata 14e444d9 introduce requirements-test.txt 11b020c0 install hypothesis on appveyor and travis 6bd5b21e license header a8fae546 remove unbox_metadata e8c0f3f5 add hypo as test dependency; hashing test e7bab691 remove box_metadata f1ae290e hypothesis strategies for pyarrow types; deterministic key-value metadata conversions --- ci/appveyor-cpp-build.bat | 2 +- ci/conda_env_python.yml | 2 + ci/cpp-msvc-build-main.bat | 2 +- ci/travis_script_python.sh | 10 +- dev/release/rat_exclude_files.txt | 1 + dev/release/verify-release-candidate.sh | 2 +- python/manylinux1/build_arrow.sh | 5 +- .../manylinux1/scripts/build_virtualenvs.sh | 2 +- python/pyarrow/includes/libarrow.pxd | 8 +- python/pyarrow/lib.pxd | 6 +- python/pyarrow/public-api.pxi | 25 ++++ python/pyarrow/table.pxi | 60 ++++---- python/pyarrow/tests/conftest.py | 34 ++++- python/pyarrow/tests/strategies.py | 138 ++++++++++++++++++ python/pyarrow/tests/test_types.py | 50 +++++++ python/pyarrow/types.pxi | 88 +++++------ python/requirements-test.txt | 5 + python/requirements.txt | 9 +- python/setup.py | 3 +- 19 files changed, 348 insertions(+), 104 deletions(-) create mode 100644 python/pyarrow/tests/strategies.py create mode 100644 python/requirements-test.txt diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index 91212a63fe3ac..b8e431613210a 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -91,7 +91,7 @@ if "%JOB%" == "Build_Debug" ( conda create -n arrow -q -y ^ python=%PYTHON% ^ - six pytest setuptools numpy pandas cython ^ + six pytest setuptools numpy pandas cython hypothesis ^ thrift-cpp=0.11.0 boost-cpp ^ -c conda-forge diff --git a/ci/conda_env_python.yml b/ci/conda_env_python.yml index 429851eb2f5ae..c187155275eaa 100644 --- a/ci/conda_env_python.yml +++ b/ci/conda_env_python.yml @@ -16,6 +16,8 @@ # under the License. 
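[Editor's note] On the "key-value metadata as OrderedDict" item in the commit message above: a plain dict gives no stable iteration order across interpreter runs (hash randomization), so two pickles of otherwise equal metadata could differ byte-for-byte. An insertion-ordered mapping makes the serialization reproducible. A tiny demonstration, not part of the patch:

```python
import pickle
from collections import OrderedDict

m1 = OrderedDict([(b'a', b'1'), (b'b', b'2')])
m2 = OrderedDict([(b'a', b'1'), (b'b', b'2')])

# insertion order is part of the value, so equal metadata built in the
# same order pickles to identical bytes, run after run
assert m1 == m2
assert pickle.dumps(m1) == pickle.dumps(m2)
```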
cython +cloudpickle +hypothesis nomkl numpy pandas diff --git a/ci/cpp-msvc-build-main.bat b/ci/cpp-msvc-build-main.bat index ef961b2e0f26e..7349f8d3aca6b 100644 --- a/ci/cpp-msvc-build-main.bat +++ b/ci/cpp-msvc-build-main.bat @@ -112,6 +112,6 @@ pip install %WHEEL_PATH% || exit /B python -c "import pyarrow" || exit /B python -c "import pyarrow.parquet" || exit /B -pip install pandas pickle5 pytest pytest-faulthandler || exit /B +pip install pandas pickle5 pytest pytest-faulthandler hypothesis || exit /B py.test -r sxX --durations=15 --pyargs pyarrow.tests || exit /B diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index e4290ed8ee026..b316c81f3b6b0 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -51,13 +51,11 @@ if [ $ARROW_TRAVIS_PYTHON_JVM == "1" ]; then CONDA_JVM_DEPS="jpype1" fi -conda install -y -q pip \ - nomkl \ - cloudpickle \ +conda install -y -q \ + --file $TRAVIS_BUILD_DIR/ci/conda_env_python.yml \ + pip \ numpy=1.13.1 \ - ${CONDA_JVM_DEPS} \ - pandas \ - cython + ${CONDA_JVM_DEPS} if [ "$ARROW_TRAVIS_PYTHON_DOCS" == "1" ] && [ "$PYTHON_VERSION" == "3.6" ]; then # Install documentation dependencies diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 0baf29edd83e4..e274d97548068 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -129,6 +129,7 @@ python/MANIFEST.in python/pyarrow/includes/__init__.pxd python/pyarrow/tests/__init__.py python/requirements.txt +python/requirements-test.txt pax_global_header MANIFEST.in __init__.pxd diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 5b666630d17a0..57b1850337067 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -189,7 +189,7 @@ test_and_install_cpp() { test_python() { pushd python - pip install -r requirements.txt + pip install -r requirements-test.txt python setup.py build_ext --inplace --with-parquet --with-plasma py.test pyarrow -v --pdb diff --git a/python/manylinux1/build_arrow.sh b/python/manylinux1/build_arrow.sh index 44816526d2179..904297375ef25 100755 --- a/python/manylinux1/build_arrow.sh +++ b/python/manylinux1/build_arrow.sh @@ -107,7 +107,7 @@ for PYTHON_TUPLE in ${PYTHON_VERSIONS}; do PATH="$PATH:${CPYTHON_PATH}/bin" $PYTHON_INTERPRETER setup.py bdist_wheel PATH="$PATH:${CPYTHON_PATH}/bin" $PYTHON_INTERPRETER setup.py sdist - echo "=== (${PYTHON}) Test the existence of optional modules ===" + echo "=== (${PYTHON}) Ensure the existence of mandatory modules ===" $PIP install -r requirements.txt echo "=== (${PYTHON}) Tag the wheel with manylinux1 ===" @@ -122,6 +122,9 @@ for PYTHON_TUPLE in ${PYTHON_VERSIONS}; do PATH="$PATH:${CPYTHON_PATH}/bin" $PYTHON_INTERPRETER -c "import pyarrow.parquet" PATH="$PATH:${CPYTHON_PATH}/bin" $PYTHON_INTERPRETER -c "import pyarrow.plasma" + echo "=== (${PYTHON}) Install modules required for testing ===" + pip install -r requirements-test.txt + # The TensorFlow test will be skipped here, since TensorFlow is not # manylinux1 compatible; however, the wheels will support TensorFlow on # a TensorFlow compatible system diff --git a/python/manylinux1/scripts/build_virtualenvs.sh b/python/manylinux1/scripts/build_virtualenvs.sh index 18f3b0dd4657e..14100317d974f 100755 --- a/python/manylinux1/scripts/build_virtualenvs.sh +++ b/python/manylinux1/scripts/build_virtualenvs.sh @@ -41,7 +41,7 @@ for PYTHON_TUPLE in ${PYTHON_VERSIONS}; do echo "=== (${PYTHON}, ${U_WIDTH}) Preparing 
virtualenv for tests ===" "$(cpython_path $PYTHON ${U_WIDTH})/bin/virtualenv" -p ${PYTHON_INTERPRETER} --no-download /venv-test-${PYTHON}-${U_WIDTH} source /venv-test-${PYTHON}-${U_WIDTH}/bin/activate - pip install pytest 'numpy==1.14.5' 'pandas==0.23.4' + pip install pytest hypothesis 'numpy==1.14.5' 'pandas==0.23.4' deactivate done diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index c5e745708308f..61517e4f09d21 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -23,9 +23,15 @@ cdef extern from "arrow/util/key_value_metadata.h" namespace "arrow" nogil: cdef cppclass CKeyValueMetadata" arrow::KeyValueMetadata": CKeyValueMetadata() CKeyValueMetadata(const unordered_map[c_string, c_string]&) + CKeyValueMetadata(const vector[c_string]& keys, + const vector[c_string]& values) - c_bool Equals(const CKeyValueMetadata& other) + void reserve(int64_t n) + int64_t size() const + c_string key(int64_t i) const + c_string value(int64_t i) const + c_bool Equals(const CKeyValueMetadata& other) void Append(const c_string& key, const c_string& value) void ToUnorderedMap(unordered_map[c_string, c_string]*) const diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 098ae62c8f492..745a049e32a7c 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -384,11 +384,13 @@ cdef get_reader(object source, c_bool use_memory_map, shared_ptr[RandomAccessFile]* reader) cdef get_writer(object source, shared_ptr[OutputStream]* writer) -cdef dict box_metadata(const CKeyValueMetadata* sp_metadata) - # Default is allow_none=False cdef DataType ensure_type(object type, c_bool allow_none=*) +cdef shared_ptr[CKeyValueMetadata] pyarrow_unwrap_metadata(object meta) +cdef object pyarrow_wrap_metadata( + const shared_ptr[const CKeyValueMetadata]& meta) + # # Public Cython API for 3rd party code # diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index e8798c5edbc7d..ef54c7ab42f74 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -92,6 +92,31 @@ cdef public api object pyarrow_wrap_data_type( return out +cdef object pyarrow_wrap_metadata( + const shared_ptr[const CKeyValueMetadata]& meta): + cdef const CKeyValueMetadata* cmeta = meta.get() + + if cmeta == nullptr: + return None + + result = OrderedDict() + for i in range(cmeta.size()): + result[cmeta.key(i)] = cmeta.value(i) + + return result + + +cdef shared_ptr[CKeyValueMetadata] pyarrow_unwrap_metadata(object meta): + cdef vector[c_string] keys, values + + if isinstance(meta, dict): + keys = map(tobytes, meta.keys()) + values = map(tobytes, meta.values()) + return make_shared[CKeyValueMetadata](keys, values) + + return shared_ptr[CKeyValueMetadata]() + + cdef public api bint pyarrow_is_field(object field): return isinstance(field, Field) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 0d529d3787614..fd565afae5acf 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -634,26 +634,22 @@ cdef class Column: return pyarrow_wrap_chunked_array(self.column.data()) -cdef shared_ptr[const CKeyValueMetadata] unbox_metadata(dict metadata): - if metadata is None: - return nullptr - cdef: - unordered_map[c_string, c_string] unordered_metadata = metadata - return ( - make_shared[CKeyValueMetadata](unordered_metadata)) - - -cdef _schema_from_arrays(arrays, names, dict metadata, - shared_ptr[CSchema]* schema): +cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema): 
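[Editor's note] `pyarrow_wrap_metadata` above materializes the C++ `KeyValueMetadata` as an `OrderedDict`, so key order survives the round trip through a `Field`. A usage sketch, modeled on the tests added later in this patch:

```python
import collections
import pyarrow as pa

meta = collections.OrderedDict([(b'a', b'alpha'), (b'b', b'beta')])
f = pa.field('foo', pa.int32()).add_metadata(meta)

# keys come back in insertion order, not hash order
assert f.metadata == meta
assert list(f.metadata) == [b'a', b'b']
```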
cdef: Column col c_string c_name vector[shared_ptr[CField]] fields shared_ptr[CDataType] type_ Py_ssize_t K = len(arrays) + shared_ptr[CKeyValueMetadata] c_meta + + if metadata is not None: + if not isinstance(metadata, dict): + raise TypeError('Metadata must be an instance of dict') + c_meta = pyarrow_unwrap_metadata(metadata) if K == 0: - schema.reset(new CSchema(fields, unbox_metadata(metadata))) + schema.reset(new CSchema(fields, c_meta)) return fields.resize(K) @@ -684,7 +680,7 @@ cdef _schema_from_arrays(arrays, names, dict metadata, c_name = tobytes(names[i]) fields[i].reset(new CField(c_name, type_, True)) - schema.reset(new CSchema(fields, unbox_metadata(metadata))) + schema.reset(new CSchema(fields, c_meta)) cdef class RecordBatch: @@ -715,7 +711,7 @@ cdef class RecordBatch: def __len__(self): return self.batch.num_rows() - def replace_schema_metadata(self, dict metadata=None): + def replace_schema_metadata(self, metadata=None): """ EXPERIMENTAL: Create shallow copy of record batch by replacing schema key-value metadata with the indicated new metadata (which may be None, @@ -729,15 +725,19 @@ cdef class RecordBatch: ------- shallow_copy : RecordBatch """ - cdef shared_ptr[CKeyValueMetadata] c_meta + cdef: + shared_ptr[CKeyValueMetadata] c_meta + shared_ptr[CRecordBatch] c_batch + if metadata is not None: - convert_metadata(metadata, &c_meta) + if not isinstance(metadata, dict): + raise TypeError('Metadata must be an instance of dict') + c_meta = pyarrow_unwrap_metadata(metadata) - cdef shared_ptr[CRecordBatch] new_batch with nogil: - new_batch = self.batch.ReplaceSchemaMetadata(c_meta) + c_batch = self.batch.ReplaceSchemaMetadata(c_meta) - return pyarrow_wrap_batch(new_batch) + return pyarrow_wrap_batch(c_batch) @property def num_columns(self): @@ -953,7 +953,7 @@ cdef class RecordBatch: return cls.from_arrays(arrays, names, metadata) @staticmethod - def from_arrays(list arrays, names, dict metadata=None): + def from_arrays(list arrays, names, metadata=None): """ Construct a RecordBatch from multiple pyarrow.Arrays @@ -1062,7 +1062,7 @@ cdef class Table: columns = [col.data for col in self.columns] return _reconstruct_table, (columns, self.schema) - def replace_schema_metadata(self, dict metadata=None): + def replace_schema_metadata(self, metadata=None): """ EXPERIMENTAL: Create shallow copy of table by replacing schema key-value metadata with the indicated new metadata (which may be None, @@ -1076,15 +1076,19 @@ cdef class Table: ------- shallow_copy : Table """ - cdef shared_ptr[CKeyValueMetadata] c_meta + cdef: + shared_ptr[CKeyValueMetadata] c_meta + shared_ptr[CTable] c_table + if metadata is not None: - convert_metadata(metadata, &c_meta) + if not isinstance(metadata, dict): + raise TypeError('Metadata must be an instance of dict') + c_meta = pyarrow_unwrap_metadata(metadata) - cdef shared_ptr[CTable] new_table with nogil: - new_table = self.table.ReplaceSchemaMetadata(c_meta) + c_table = self.table.ReplaceSchemaMetadata(c_meta) - return pyarrow_wrap_table(new_table) + return pyarrow_wrap_table(c_table) def flatten(self, MemoryPool memory_pool=None): """ @@ -1225,7 +1229,7 @@ cdef class Table: return cls.from_arrays(arrays, names=names, metadata=metadata) @staticmethod - def from_arrays(arrays, names=None, schema=None, dict metadata=None): + def from_arrays(arrays, names=None, schema=None, metadata=None): """ Construct a Table from Arrow arrays or columns @@ -1236,6 +1240,8 @@ cdef class Table: names: list of str, optional Names for the table columns. 
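[Editor's note] With the typed `dict metadata` parameter gone from the signatures above, `replace_schema_metadata` now validates its argument explicitly and raises `TypeError` for anything that is not a dict. A usage sketch (the metadata values are illustrative):

```python
import pyarrow as pa

batch = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], ['x'])

# shallow copy with new schema-level metadata
batch2 = batch.replace_schema_metadata({'origin': 'example'})
assert batch2.schema.metadata == {b'origin': b'example'}

# non-dict metadata is rejected up front
try:
    batch.replace_schema_metadata([1, 2, 3])
except TypeError:
    pass
```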
If Columns passed, will be inferred. If Arrays passed, this argument is required + schema : Schema, default None + If not passed, will be inferred from the arrays Returns ------- diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 6cdedbbb507cc..69e8e82e2532a 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -15,7 +15,9 @@ # specific language governing permissions and limitations # under the License. +import os import pytest +import hypothesis as h try: import pathlib @@ -23,7 +25,20 @@ import pathlib2 as pathlib # py2 compat +# setup hypothesis profiles +h.settings.register_profile('ci', max_examples=1000) +h.settings.register_profile('dev', max_examples=10) +h.settings.register_profile('debug', max_examples=10, + verbosity=h.Verbosity.verbose) + +# load default hypothesis profile, either set HYPOTHESIS_PROFILE environment +# variable or pass --hypothesis-profile option to pytest, to see the generated +# examples try: pytest pyarrow -sv --only-hypothesis --hypothesis-profile=debug +h.settings.load_profile(os.environ.get('HYPOTHESIS_PROFILE', 'default')) + + groups = [ + 'hypothesis', 'gandiva', 'hdfs', 'large_memory', @@ -36,6 +51,7 @@ defaults = { + 'hypothesis': False, 'gandiva': False, 'hdfs': False, 'large_memory': False, @@ -84,16 +100,15 @@ def pytest_configure(config): def pytest_addoption(parser): for group in groups: - parser.addoption('--{0}'.format(group), action='store_true', - default=defaults[group], - help=('Enable the {0} test group'.format(group))) + for flag in ['--{0}', '--enable-{0}']: + parser.addoption(flag.format(group), action='store_true', + default=defaults[group], + help=('Enable the {0} test group'.format(group))) - for group in groups: parser.addoption('--disable-{0}'.format(group), action='store_true', default=False, help=('Disable the {0} test group'.format(group))) - for group in groups: parser.addoption('--only-{0}'.format(group), action='store_true', default=False, help=('Run only the {0} test group'.format(group))) @@ -115,15 +130,18 @@ def pytest_runtest_setup(item): only_set = False for group in groups: + flag = '--{0}'.format(group) only_flag = '--only-{0}'.format(group) + enable_flag = '--enable-{0}'.format(group) disable_flag = '--disable-{0}'.format(group) - flag = '--{0}'.format(group) if item.config.getoption(only_flag): only_set = True elif getattr(item.obj, group, None): - if (item.config.getoption(disable_flag) or - not item.config.getoption(flag)): + is_enabled = (item.config.getoption(flag) or + item.config.getoption(enable_flag)) + is_disabled = item.config.getoption(disable_flag) + if is_disabled or not is_enabled: pytest.skip('{0} NOT enabled'.format(flag)) if only_set: diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py new file mode 100644 index 0000000000000..bc8ded2e896d0 --- /dev/null +++ b/python/pyarrow/tests/strategies.py @@ -0,0 +1,138 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
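[Editor's note] Between the profiles registered in conftest.py above and the strategies module that follows, a property-based test reduces to a decorator plus an assertion. A sketch of the pattern (it mirrors `test_pickling` further down and assumes pyarrow's test extras are installed):

```python
import pickle

import hypothesis as h
import pyarrow.tests.strategies as past

@h.given(past.all_types)
def test_type_pickling_roundtrip(ty):
    # every generated type -- primitive, nested, or both --
    # must survive a pickle round trip unchanged
    assert pickle.loads(pickle.dumps(ty)) == ty
```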
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pyarrow as pa +import hypothesis.strategies as st + + +# TODO(kszucs): alphanum_text, surrogate_text +custom_text = st.text( + alphabet=st.characters( + min_codepoint=0x41, + max_codepoint=0x7E + ) +) + +null_type = st.just(pa.null()) +bool_type = st.just(pa.bool_()) + +binary_type = st.just(pa.binary()) +string_type = st.just(pa.string()) + +signed_integer_types = st.sampled_from([ + pa.int8(), + pa.int16(), + pa.int32(), + pa.int64() +]) +unsigned_integer_types = st.sampled_from([ + pa.uint8(), + pa.uint16(), + pa.uint32(), + pa.uint64() +]) +integer_types = st.one_of(signed_integer_types, unsigned_integer_types) + +floating_types = st.sampled_from([ + pa.float16(), + pa.float32(), + pa.float64() +]) +decimal_type = st.builds( + pa.decimal128, + precision=st.integers(min_value=0, max_value=38), + scale=st.integers(min_value=0, max_value=38) +) +numeric_types = st.one_of(integer_types, floating_types, decimal_type) + +date_types = st.sampled_from([ + pa.date32(), + pa.date64() +]) +time_types = st.sampled_from([ + pa.time32('s'), + pa.time32('ms'), + pa.time64('us'), + pa.time64('ns') +]) +timestamp_types = st.sampled_from([ + pa.timestamp('s'), + pa.timestamp('ms'), + pa.timestamp('us'), + pa.timestamp('ns') +]) +temporal_types = st.one_of(date_types, time_types, timestamp_types) + +primitive_types = st.one_of( + null_type, + bool_type, + binary_type, + string_type, + numeric_types, + temporal_types +) + +metadata = st.dictionaries(st.text(), st.text()) + + +@st.defines_strategy +def fields(type_strategy=primitive_types): + return st.builds(pa.field, name=custom_text, type=type_strategy, + nullable=st.booleans(), metadata=metadata) + + +@st.defines_strategy +def list_types(item_strategy=primitive_types): + return st.builds(pa.list_, item_strategy) + + +@st.defines_strategy +def struct_types(item_strategy=primitive_types): + return st.builds(pa.struct, st.lists(fields(item_strategy))) + + +@st.defines_strategy +def complex_types(inner_strategy=primitive_types): + return list_types(inner_strategy) | struct_types(inner_strategy) + + +@st.defines_strategy +def nested_list_types(item_strategy=primitive_types): + return st.recursive(item_strategy, list_types) + + +@st.defines_strategy +def nested_struct_types(item_strategy=primitive_types): + return st.recursive(item_strategy, struct_types) + + +@st.defines_strategy +def nested_complex_types(inner_strategy=primitive_types): + return st.recursive(inner_strategy, complex_types) + + +@st.defines_strategy +def schemas(type_strategy=primitive_types): + return st.builds(pa.schema, st.lists(fields(type_strategy))) + + +complex_schemas = schemas(complex_types()) + + +all_types = st.one_of(primitive_types, complex_types(), nested_complex_types()) +all_fields = fields(all_types) +all_schemas = schemas(all_types) diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 176ce8769f488..310656d86fd47 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -19,11 +19,14 @@ import pickle import pytest +import hypothesis as h +import hypothesis.strategies as st import 
pandas as pd import numpy as np import pyarrow as pa import pyarrow.types as types +import pyarrow.tests.strategies as past def get_many_types(): @@ -466,15 +469,27 @@ def test_field_metadata(): def test_field_add_remove_metadata(): + import collections + f0 = pa.field('foo', pa.int32()) assert f0.metadata is None metadata = {b'foo': b'bar', b'pandas': b'badger'} + metadata2 = collections.OrderedDict([ + (b'a', b'alpha'), + (b'b', b'beta') + ]) f1 = f0.add_metadata(metadata) assert f1.metadata == metadata + f2 = f0.add_metadata(metadata2) + assert f2.metadata == metadata2 + + with pytest.raises(TypeError): + f0.add_metadata([1, 2, 3]) + f3 = f1.remove_metadata() assert f3.metadata is None @@ -533,3 +548,38 @@ def test_schema_from_pandas(data): schema = pa.Schema.from_pandas(df) expected = pa.Table.from_pandas(df).schema assert schema == expected + + +@h.given( + past.all_types | + past.all_fields | + past.all_schemas +) +@h.example( + pa.field(name='', type=pa.null(), metadata={'0': '', '': ''}) +) +def test_pickling(field): + data = pickle.dumps(field) + assert pickle.loads(data) == field + + +@h.given( + st.lists(past.all_types) | + st.lists(past.all_fields) | + st.lists(past.all_schemas) +) +def test_hashing(items): + h.assume( + # well, this is still O(n^2), but makes the input unique + all(not a.equals(b) for i, a in enumerate(items) for b in items[:i]) + ) + + container = {} + for i, item in enumerate(items): + assert hash(item) == hash(item) + container[item] = i + + assert len(container) == len(items) + + for i, item in enumerate(items): + assert container[item] == i diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 1ebd196fabf95..f69190c1c2eaa 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -430,11 +430,9 @@ cdef class Field: @property def metadata(self): - cdef shared_ptr[const CKeyValueMetadata] metadata = ( - self.field.metadata()) - return box_metadata(metadata.get()) + return pyarrow_wrap_metadata(self.field.metadata()) - def add_metadata(self, dict metadata): + def add_metadata(self, metadata): """ Add metadata as dict of string keys and values to Field @@ -447,14 +445,18 @@ cdef class Field: ------- field : pyarrow.Field """ - cdef shared_ptr[CKeyValueMetadata] c_meta - convert_metadata(metadata, &c_meta) + cdef: + shared_ptr[CField] c_field + shared_ptr[CKeyValueMetadata] c_meta - cdef shared_ptr[CField] new_field + if not isinstance(metadata, dict): + raise TypeError('Metadata must be an instance of dict') + + c_meta = pyarrow_unwrap_metadata(metadata) with nogil: - new_field = self.field.AddMetadata(c_meta) + c_field = self.field.AddMetadata(c_meta) - return pyarrow_wrap_field(new_field) + return pyarrow_wrap_field(c_field) def remove_metadata(self): """ @@ -515,6 +517,9 @@ cdef class Schema: def __reduce__(self): return schema, (list(self), self.metadata) + def __hash__(self): + return hash((tuple(self), self.metadata)) + @property def names(self): """ @@ -544,9 +549,7 @@ cdef class Schema: @property def metadata(self): - cdef shared_ptr[const CKeyValueMetadata] metadata = ( - self.schema.metadata()) - return box_metadata(metadata.get()) + return pyarrow_wrap_metadata(self.schema.metadata()) def __eq__(self, other): try: @@ -728,7 +731,7 @@ cdef class Schema: return pyarrow_wrap_schema(new_schema) - def add_metadata(self, dict metadata): + def add_metadata(self, metadata): """ Add metadata as dict of string keys and values to Schema @@ -741,14 +744,18 @@ cdef class Schema: ------- schema : pyarrow.Schema """ - cdef 
shared_ptr[CKeyValueMetadata] c_meta - convert_metadata(metadata, &c_meta) + cdef: + shared_ptr[CKeyValueMetadata] c_meta + shared_ptr[CSchema] c_schema - cdef shared_ptr[CSchema] new_schema + if not isinstance(metadata, dict): + raise TypeError('Metadata must be an instance of dict') + + c_meta = pyarrow_unwrap_metadata(metadata) with nogil: - new_schema = self.schema.AddMetadata(c_meta) + c_schema = self.schema.AddMetadata(c_meta) - return pyarrow_wrap_schema(new_schema) + return pyarrow_wrap_schema(c_schema) def serialize(self, memory_pool=None): """ @@ -810,15 +817,6 @@ cdef class Schema: return self.__str__() -cdef dict box_metadata(const CKeyValueMetadata* metadata): - cdef unordered_map[c_string, c_string] result - if metadata != nullptr: - metadata.ToUnorderedMap(&result) - return result - else: - return None - - cdef dict _type_cache = {} @@ -832,25 +830,12 @@ cdef DataType primitive_type(Type type): _type_cache[type] = out return out + # ----------------------------------------------------------- # Type factory functions -cdef int convert_metadata(dict metadata, - shared_ptr[CKeyValueMetadata]* out) except -1: - cdef: - shared_ptr[CKeyValueMetadata] meta = ( - make_shared[CKeyValueMetadata]()) - c_string key, value - - for py_key, py_value in metadata.items(): - key = tobytes(py_key) - value = tobytes(py_value) - meta.get().Append(key, value) - out[0] = meta - return 0 - -def field(name, type, bint nullable=True, dict metadata=None): +def field(name, type, bint nullable=True, metadata=None): """ Create a pyarrow.Field instance @@ -867,17 +852,21 @@ def field(name, type, bint nullable=True, dict metadata=None): field : pyarrow.Field """ cdef: - shared_ptr[CKeyValueMetadata] c_meta Field result = Field.__new__(Field) DataType _type = ensure_type(type, allow_none=False) + shared_ptr[CKeyValueMetadata] c_meta if metadata is not None: - convert_metadata(metadata, &c_meta) + if not isinstance(metadata, dict): + raise TypeError('Metadata must be an instance of dict') + c_meta = pyarrow_unwrap_metadata(metadata) - result.sp_field.reset(new CField(tobytes(name), _type.sp_type, - nullable == 1, c_meta)) + result.sp_field.reset( + new CField(tobytes(name), _type.sp_type, nullable, c_meta) + ) result.field = result.sp_field.get() result.type = _type + return result @@ -1490,7 +1479,7 @@ cdef DataType ensure_type(object ty, c_bool allow_none=False): raise TypeError('DataType expected, got {!r}'.format(type(ty))) -def schema(fields, dict metadata=None): +def schema(fields, metadata=None): """ Construct pyarrow.Schema from collection of fields @@ -1535,11 +1524,14 @@ def schema(fields, dict metadata=None): c_fields.push_back(py_field.sp_field) if metadata is not None: - convert_metadata(metadata, &c_meta) + if not isinstance(metadata, dict): + raise TypeError('Metadata must be an instance of dict') + c_meta = pyarrow_unwrap_metadata(metadata) c_schema.reset(new CSchema(c_fields, c_meta)) result = Schema.__new__(Schema) result.init_schema(c_schema) + return result diff --git a/python/requirements-test.txt b/python/requirements-test.txt new file mode 100644 index 0000000000000..482e88860669a --- /dev/null +++ b/python/requirements-test.txt @@ -0,0 +1,5 @@ +-r requirements.txt +pandas +pytest +hypothesis +pathlib2; python_version < "3.4" diff --git a/python/requirements.txt b/python/requirements.txt index ddedd757da224..3a23d1dacf81e 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,6 +1,3 @@ -six -pytest -cloudpickle>=0.4.0 -numpy>=1.14.0 -futures; python_version < "3" 
-pathlib2; python_version < "3.4" +six>=1.0.0 +numpy>=1.14 +futures; python_version < "3.2" diff --git a/python/setup.py b/python/setup.py index e6a88712c0e09..b8d192ddaec45 100755 --- a/python/setup.py +++ b/python/setup.py @@ -577,7 +577,8 @@ def has_ext_modules(foo): }, setup_requires=['setuptools_scm', 'cython >= 0.27'] + setup_requires, install_requires=install_requires, - tests_require=['pytest', 'pandas', 'pathlib2; python_version < "3.4"'], + tests_require=['pytest', 'pandas', 'hypothesis', + 'pathlib2; python_version < "3.4"'], description="Python library for Apache Arrow", long_description=long_description, long_description_content_type="text/markdown", From 9c8ddae11622ace00a187c46412309af82191b74 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Mon, 10 Dec 2018 09:51:41 -0600 Subject: [PATCH 16/45] ARROW-3942: [R] Feather api fixes Some fixes to follow up open #3043, and added the columns argument to `read_feather` that can be: - character vector - integer vector : 1-based in R - NULL: to get all columns (the default) Also adds `as_tibble` argument to read_feather to switch between data.frame and arrow::Table return value Author: Romain Francois Closes #3106 from romainfrancois/ARROW-3942/feather and squashes the following commits: 13061af4d fixed link in documentation ce414c153 + as_tibble argument to read_feather() d6c30a38b + columns argument to read_feather() 46a6fbb69 Update feather factories --- r/NAMESPACE | 16 ++--- r/R/RcppExports.R | 4 +- r/R/feather.R | 44 ++++++++------ ..._table_reader.Rd => FeatherTableReader.Rd} | 6 +- ..._table_writer.Rd => FeatherTableWriter.Rd} | 6 +- r/man/read_feather.Rd | 10 +++- r/src/RcppExports.cpp | 9 +-- r/src/feather.cpp | 32 +++++++++- r/tests/testthat/test-feather.R | 59 ++++++++++++++++--- 9 files changed, 134 insertions(+), 52 deletions(-) rename r/man/{feather_table_reader.Rd => FeatherTableReader.Rd} (80%) rename r/man/{feather_table_writer.Rd => FeatherTableWriter.Rd} (74%) diff --git a/r/NAMESPACE b/r/NAMESPACE index cc5961e5ba148..65d60d846f4cb 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -8,6 +8,12 @@ S3method("==","arrow::RecordBatch") S3method("==","arrow::ipc::Message") S3method(BufferReader,"arrow::Buffer") S3method(BufferReader,default) +S3method(FeatherTableReader,"arrow::io::RandomAccessFile") +S3method(FeatherTableReader,"arrow::ipc::feather::TableReader") +S3method(FeatherTableReader,character) +S3method(FeatherTableReader,default) +S3method(FeatherTableReader,fs_path) +S3method(FeatherTableWriter,"arrow::io::OutputStream") S3method(FixedSizeBufferWriter,"arrow::Buffer") S3method(FixedSizeBufferWriter,default) S3method(MessageReader,"arrow::io::InputStream") @@ -33,12 +39,6 @@ S3method(buffer,default) S3method(buffer,integer) S3method(buffer,numeric) S3method(buffer,raw) -S3method(feather_table_reader,"arrow::io::RandomAccessFile") -S3method(feather_table_reader,"arrow::ipc::feather::TableReader") -S3method(feather_table_reader,character) -S3method(feather_table_reader,default) -S3method(feather_table_reader,fs_path) -S3method(feather_table_writer,"arrow::io::OutputStream") S3method(length,"arrow::Array") S3method(names,"arrow::RecordBatch") S3method(print,"arrow-enum") @@ -70,6 +70,8 @@ S3method(write_feather_RecordBatch,fs_path) export(BufferOutputStream) export(BufferReader) export(DateUnit) +export(FeatherTableReader) +export(FeatherTableWriter) export(FileMode) export(FileOutputStream) export(FixedSizeBufferWriter) @@ -95,8 +97,6 @@ export(date64) export(decimal) export(default_memory_pool) 
export(dictionary) -export(feather_table_reader) -export(feather_table_writer) export(field) export(float16) export(float32) diff --git a/r/R/RcppExports.R b/r/R/RcppExports.R index ccf854927b76e..0310eab2027b9 100644 --- a/r/R/RcppExports.R +++ b/r/R/RcppExports.R @@ -445,8 +445,8 @@ ipc___feather___TableReader__GetColumn <- function(reader, i) { .Call(`_arrow_ipc___feather___TableReader__GetColumn`, reader, i) } -ipc___feather___TableReader__Read <- function(reader) { - .Call(`_arrow_ipc___feather___TableReader__Read`, reader) +ipc___feather___TableReader__Read <- function(reader, columns) { + .Call(`_arrow_ipc___feather___TableReader__Read`, reader, columns) } ipc___feather___TableReader__Open <- function(stream) { diff --git a/r/R/feather.R b/r/R/feather.R index bae71d31bc1e5..064652145c8e4 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -35,7 +35,9 @@ num_columns = function() ipc___feather___TableReader__num_columns(self), GetColumnName = function(i) ipc___feather___TableReader__GetColumnName(self, i), GetColumn = function(i) shared_ptr(`arrow::Column`, ipc___feather___TableReader__GetColumn(self, i)), - Read = function() shared_ptr(`arrow::Table`, ipc___feather___TableReader__Read(self)) + Read = function(columns) { + shared_ptr(`arrow::Table`, ipc___feather___TableReader__Read(self, columns)) + } ) ) @@ -44,12 +46,12 @@ #' @param stream an OutputStream #' #' @export -feather_table_writer <- function(stream) { - UseMethod("feather_table_writer") +FeatherTableWriter <- function(stream) { + UseMethod("FeatherTableWriter") } #' @export -`feather_table_writer.arrow::io::OutputStream` <- function(stream){ +`FeatherTableWriter.arrow::io::OutputStream` <- function(stream){ unique_ptr(`arrow::ipc::feather::TableWriter`, ipc___feather___TableWriter__Open(stream)) } @@ -107,7 +109,7 @@ write_feather_RecordBatch <- function(data, stream) { #' @export #' @method write_feather_RecordBatch arrow::io::OutputStream `write_feather_RecordBatch.arrow::io::OutputStream` <- function(data, stream) { - ipc___TableWriter__RecordBatch__WriteFeather(feather_table_writer(stream), data) + ipc___TableWriter__RecordBatch__WriteFeather(FeatherTableWriter(stream), data) } #' A arrow::ipc::feather::TableReader to read from a file @@ -117,44 +119,50 @@ write_feather_RecordBatch <- function(data, stream) { #' @param ... extra parameters #' #' @export -feather_table_reader <- function(file, mmap = TRUE, ...){ - UseMethod("feather_table_reader") +FeatherTableReader <- function(file, mmap = TRUE, ...){ + UseMethod("FeatherTableReader") } #' @export -feather_table_reader.default <- function(file, mmap = TRUE, ...) { +FeatherTableReader.default <- function(file, mmap = TRUE, ...) { stop("unsupported") } #' @export -feather_table_reader.character <- function(file, mmap = TRUE, ...) { - feather_table_reader(fs::path_abs(file), mmap = mmap, ...) +FeatherTableReader.character <- function(file, mmap = TRUE, ...) { + FeatherTableReader(fs::path_abs(file), mmap = mmap, ...) } #' @export -feather_table_reader.fs_path <- function(file, mmap = TRUE, ...) { +FeatherTableReader.fs_path <- function(file, mmap = TRUE, ...) { stream <- if(isTRUE(mmap)) mmap_open(file, ...) else ReadableFile(file, ...) 
- feather_table_reader(stream) + FeatherTableReader(stream) } #' @export -`feather_table_reader.arrow::io::RandomAccessFile` <- function(file, mmap = TRUE, ...){ +`FeatherTableReader.arrow::io::RandomAccessFile` <- function(file, mmap = TRUE, ...){ unique_ptr(`arrow::ipc::feather::TableReader`, ipc___feather___TableReader__Open(file)) } #' @export -`feather_table_reader.arrow::ipc::feather::TableReader` <- function(file, mmap = TRUE, ...){ +`FeatherTableReader.arrow::ipc::feather::TableReader` <- function(file, mmap = TRUE, ...){ file } #' Read a feather file #' -#' @param file a arrow::ipc::feather::TableReader or whatever the [feather_table_reader()] function can handle +#' @param file a arrow::ipc::feather::TableReader or whatever the [FeatherTableReader()] function can handle +#' @param columns names if the columns to read. The default `NULL` means all columns +#' @param as_tibble should the [arrow::Table][arrow__Table] be converted to a tibble. #' @param ... additional parameters #' -#' @return an arrow::Table +#' @return a data frame if `as_tibble` is `TRUE` (the default), or a [arrow::Table][arrow__Table] otherwise #' #' @export -read_feather <- function(file, ...){ - feather_table_reader(file, ...)$Read() +read_feather <- function(file, columns = NULL, as_tibble = TRUE, ...){ + out <- FeatherTableReader(file, ...)$Read(columns) + if (isTRUE(as_tibble)) { + out <- as_tibble(out) + } + out } diff --git a/r/man/feather_table_reader.Rd b/r/man/FeatherTableReader.Rd similarity index 80% rename from r/man/feather_table_reader.Rd rename to r/man/FeatherTableReader.Rd index fb1c53429f860..15a260bd57cf6 100644 --- a/r/man/feather_table_reader.Rd +++ b/r/man/FeatherTableReader.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/feather.R -\name{feather_table_reader} -\alias{feather_table_reader} +\name{FeatherTableReader} +\alias{FeatherTableReader} \title{A arrow::ipc::feather::TableReader to read from a file} \usage{ -feather_table_reader(file, mmap = TRUE, ...) +FeatherTableReader(file, mmap = TRUE, ...) } \arguments{ \item{file}{A file path, arrow::io::RandomAccessFile} diff --git a/r/man/feather_table_writer.Rd b/r/man/FeatherTableWriter.Rd similarity index 74% rename from r/man/feather_table_writer.Rd rename to r/man/FeatherTableWriter.Rd index 36035aca12090..3acf5971a71b3 100644 --- a/r/man/feather_table_writer.Rd +++ b/r/man/FeatherTableWriter.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/feather.R -\name{feather_table_writer} -\alias{feather_table_writer} +\name{FeatherTableWriter} +\alias{FeatherTableWriter} \title{Create TableWriter that writes into a stream} \usage{ -feather_table_writer(stream) +FeatherTableWriter(stream) } \arguments{ \item{stream}{an OutputStream} diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd index e86b86b99e9e2..31fd36ab65a26 100644 --- a/r/man/read_feather.Rd +++ b/r/man/read_feather.Rd @@ -4,15 +4,19 @@ \alias{read_feather} \title{Read a feather file} \usage{ -read_feather(file, ...) +read_feather(file, columns = NULL, as_tibble = TRUE, ...) } \arguments{ -\item{file}{a arrow::ipc::feather::TableReader or whatever the \code{\link[=feather_table_reader]{feather_table_reader()}} function can handle} +\item{file}{a arrow::ipc::feather::TableReader or whatever the \code{\link[=FeatherTableReader]{FeatherTableReader()}} function can handle} + +\item{columns}{names if the columns to read. 
The default \code{NULL} means all columns} + +\item{as_tibble}{should the \link[=arrow__Table]{arrow::Table} be converted to a tibble.} \item{...}{additional parameters} } \value{ -an arrow::Table +a data frame if \code{as_tibble} is \code{TRUE} (the default), or a \link[=arrow__Table]{arrow::Table} otherwise } \description{ Read a feather file diff --git a/r/src/RcppExports.cpp b/r/src/RcppExports.cpp index bca4eafdee4ce..e5a784eb70c23 100644 --- a/r/src/RcppExports.cpp +++ b/r/src/RcppExports.cpp @@ -1244,13 +1244,14 @@ BEGIN_RCPP END_RCPP } // ipc___feather___TableReader__Read -std::shared_ptr ipc___feather___TableReader__Read(const std::unique_ptr& reader); -RcppExport SEXP _arrow_ipc___feather___TableReader__Read(SEXP readerSEXP) { +std::shared_ptr ipc___feather___TableReader__Read(const std::unique_ptr& reader, SEXP columns); +RcppExport SEXP _arrow_ipc___feather___TableReader__Read(SEXP readerSEXP, SEXP columnsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const std::unique_ptr& >::type reader(readerSEXP); - rcpp_result_gen = Rcpp::wrap(ipc___feather___TableReader__Read(reader)); + Rcpp::traits::input_parameter< SEXP >::type columns(columnsSEXP); + rcpp_result_gen = Rcpp::wrap(ipc___feather___TableReader__Read(reader, columns)); return rcpp_result_gen; END_RCPP } @@ -2262,7 +2263,7 @@ static const R_CallMethodDef CallEntries[] = { {"_arrow_ipc___feather___TableReader__num_columns", (DL_FUNC) &_arrow_ipc___feather___TableReader__num_columns, 1}, {"_arrow_ipc___feather___TableReader__GetColumnName", (DL_FUNC) &_arrow_ipc___feather___TableReader__GetColumnName, 2}, {"_arrow_ipc___feather___TableReader__GetColumn", (DL_FUNC) &_arrow_ipc___feather___TableReader__GetColumn, 2}, - {"_arrow_ipc___feather___TableReader__Read", (DL_FUNC) &_arrow_ipc___feather___TableReader__Read, 1}, + {"_arrow_ipc___feather___TableReader__Read", (DL_FUNC) &_arrow_ipc___feather___TableReader__Read, 2}, {"_arrow_ipc___feather___TableReader__Open", (DL_FUNC) &_arrow_ipc___feather___TableReader__Open, 1}, {"_arrow_Field__initialize", (DL_FUNC) &_arrow_Field__initialize, 3}, {"_arrow_Field__ToString", (DL_FUNC) &_arrow_Field__ToString, 1}, diff --git a/r/src/feather.cpp b/r/src/feather.cpp index 7b84deefadb9c..8389156c3847b 100644 --- a/r/src/feather.cpp +++ b/r/src/feather.cpp @@ -115,9 +115,37 @@ std::shared_ptr ipc___feather___TableReader__GetColumn( // [[Rcpp::export]] std::shared_ptr ipc___feather___TableReader__Read( - const std::unique_ptr& reader) { + const std::unique_ptr& reader, SEXP columns) { std::shared_ptr table; - STOP_IF_NOT_OK(reader->Read(&table)); + + switch (TYPEOF(columns)) { + case INTSXP: { + R_xlen_t n = XLENGTH(columns); + std::vector indices(n); + int* p_columns = INTEGER(columns); + for (int i = 0; i < n; i++) { + indices[i] = p_columns[i] - 1; + } + STOP_IF_NOT_OK(reader->Read(indices, &table)); + break; + } + case STRSXP: { + R_xlen_t n = XLENGTH(columns); + std::vector names(n); + for (R_xlen_t i = 0; i < n; i++) { + names[i] = CHAR(STRING_ELT(columns, i)); + } + STOP_IF_NOT_OK(reader->Read(names, &table)); + break; + } + case NILSXP: + STOP_IF_NOT_OK(reader->Read(&table)); + break; + default: + Rcpp::stop("incompatible column specification"); + break; + }; + return table; } diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R index 715017fb5865c..23fdc58fd781e 100644 --- a/r/tests/testthat/test-feather.R +++ b/r/tests/testthat/test-feather.R @@ -34,25 +34,66 @@ test_that("feather 
read/write round trip", { expect_true(fs::file_exists(tf3)) tab1 <- read_feather(tf1) - expect_is(tab1, "arrow::Table") + expect_is(tab1, "data.frame") tab2 <- read_feather(tf2) - expect_is(tab2, "arrow::Table") + expect_is(tab2, "data.frame") tab3 <- read_feather(tf3) - expect_is(tab3, "arrow::Table") + expect_is(tab3, "data.frame") # reading directly from arrow::io::MemoryMappedFile tab4 <- read_feather(mmap_open(tf3)) - expect_is(tab4, "arrow::Table") + expect_is(tab4, "data.frame") # reading directly from arrow::io::ReadableFile tab5 <- read_feather(ReadableFile(tf3)) - expect_is(tab5, "arrow::Table") + expect_is(tab5, "data.frame") + + expect_equal(tib, tab1) + expect_equal(tib, tab2) + expect_equal(tib, tab3) + expect_equal(tib, tab4) + expect_equal(tib, tab5) +}) + +test_that("feather handles columns = ", { + tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10]) + + tf1 <- local_tempfile() + write_feather(tib, tf1) + expect_true(fs::file_exists(tf1)) + + tab1 <- read_feather(tf1, columns = c("x", "y")) + expect_is(tab1, "data.frame") + + expect_equal(tib[, c("x", "y")], as_tibble(tab1)) +}) + +test_that("feather handles columns = ", { + tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10]) + + tf1 <- local_tempfile() + write_feather(tib, tf1) + expect_true(fs::file_exists(tf1)) + + tab1 <- read_feather(tf1, columns = 1:2) + expect_is(tab1, "data.frame") + + expect_equal(tib[, c("x", "y")], as_tibble(tab1)) +}) + +test_that("feather read/write round trip", { + tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10]) + + tf1 <- local_tempfile() + write_feather(tib, tf1) + expect_true(fs::file_exists(tf1)) + + tab1 <- read_feather(tf1, as_tibble = FALSE) + expect_is(tab1, "arrow::Table") expect_equal(tib, as_tibble(tab1)) - expect_equal(tib, as_tibble(tab2)) - expect_equal(tib, as_tibble(tab3)) - expect_equal(tib, as_tibble(tab4)) - expect_equal(tib, as_tibble(tab5)) }) + + From 12201841212967c78e31b2d2840b55b1707c4e7b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 10 Dec 2018 14:02:21 -0600 Subject: [PATCH 17/45] ARROW-3641: [Python] Remove unneeded public keyword from pyarrow public C APIs According to https://cython.readthedocs.io/en/latest/src/userguide/external_C_code.html#c-api-declarations it is not necessary to use `public` here. If we want to be able to refer to Cython extension types at the C API level (at some point, this may not be a bad idea), then we must use `public` with those. Author: Wes McKinney Closes #3147 from wesm/ARROW-3641 and squashes the following commits: f09902cb4 Remove unneeded public keyword from pyarrow public APIs --- python/pyarrow/public-api.pxi | 58 +++++++++++++++++------------------ 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index ef54c7ab42f74..7bd9154dfa8d7 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -24,11 +24,11 @@ from pyarrow.includes.libarrow cimport (CArray, CColumn, CDataType, CField, # methods don't use Status to indicate a successful operation. 
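For context on the distinction the commit message above draws, a minimal standalone Cython sketch (the module name `example` and both functions are hypothetical, not part of this patch):

# example.pyx -- hypothetical module
# `api` alone suffices for C-level access: Cython generates an
# "example_api.h" header, and external C code calls import_example()
# once before using the function.
cdef api int answer():
    return 42

# `public` additionally exports the symbol in the generated "example.h"
# for C code compiled together with the module -- only needed when
# symbols (e.g. extension types) must be visible by name at the C level.
cdef public api int answer_public():
    return 42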
-cdef public api bint pyarrow_is_buffer(object buffer): +cdef api bint pyarrow_is_buffer(object buffer): return isinstance(buffer, Buffer) -cdef public api shared_ptr[CBuffer] pyarrow_unwrap_buffer(object buffer): +cdef api shared_ptr[CBuffer] pyarrow_unwrap_buffer(object buffer): cdef Buffer buf if pyarrow_is_buffer(buffer): buf = (buffer) @@ -37,24 +37,24 @@ cdef public api shared_ptr[CBuffer] pyarrow_unwrap_buffer(object buffer): return shared_ptr[CBuffer]() -cdef public api object pyarrow_wrap_buffer(const shared_ptr[CBuffer]& buf): +cdef api object pyarrow_wrap_buffer(const shared_ptr[CBuffer]& buf): cdef Buffer result = Buffer.__new__(Buffer) result.init(buf) return result -cdef public api object pyarrow_wrap_resizable_buffer( +cdef api object pyarrow_wrap_resizable_buffer( const shared_ptr[CResizableBuffer]& buf): cdef ResizableBuffer result = ResizableBuffer.__new__(ResizableBuffer) result.init_rz(buf) return result -cdef public api bint pyarrow_is_data_type(object type_): +cdef api bint pyarrow_is_data_type(object type_): return isinstance(type_, DataType) -cdef public api shared_ptr[CDataType] pyarrow_unwrap_data_type( +cdef api shared_ptr[CDataType] pyarrow_unwrap_data_type( object data_type): cdef DataType type_ if pyarrow_is_data_type(data_type): @@ -64,7 +64,7 @@ cdef public api shared_ptr[CDataType] pyarrow_unwrap_data_type( return shared_ptr[CDataType]() -cdef public api object pyarrow_wrap_data_type( +cdef api object pyarrow_wrap_data_type( const shared_ptr[CDataType]& type): cdef DataType out @@ -117,11 +117,11 @@ cdef shared_ptr[CKeyValueMetadata] pyarrow_unwrap_metadata(object meta): return shared_ptr[CKeyValueMetadata]() -cdef public api bint pyarrow_is_field(object field): +cdef api bint pyarrow_is_field(object field): return isinstance(field, Field) -cdef public api shared_ptr[CField] pyarrow_unwrap_field(object field): +cdef api shared_ptr[CField] pyarrow_unwrap_field(object field): cdef Field field_ if pyarrow_is_field(field): field_ = (field) @@ -130,7 +130,7 @@ cdef public api shared_ptr[CField] pyarrow_unwrap_field(object field): return shared_ptr[CField]() -cdef public api object pyarrow_wrap_field(const shared_ptr[CField]& field): +cdef api object pyarrow_wrap_field(const shared_ptr[CField]& field): if field.get() == NULL: return None cdef Field out = Field.__new__(Field) @@ -138,11 +138,11 @@ cdef public api object pyarrow_wrap_field(const shared_ptr[CField]& field): return out -cdef public api bint pyarrow_is_schema(object schema): +cdef api bint pyarrow_is_schema(object schema): return isinstance(schema, Schema) -cdef public api shared_ptr[CSchema] pyarrow_unwrap_schema(object schema): +cdef api shared_ptr[CSchema] pyarrow_unwrap_schema(object schema): cdef Schema sch if pyarrow_is_schema(schema): sch = (schema) @@ -151,17 +151,17 @@ cdef public api shared_ptr[CSchema] pyarrow_unwrap_schema(object schema): return shared_ptr[CSchema]() -cdef public api object pyarrow_wrap_schema(const shared_ptr[CSchema]& schema): +cdef api object pyarrow_wrap_schema(const shared_ptr[CSchema]& schema): cdef Schema out = Schema.__new__(Schema) out.init_schema(schema) return out -cdef public api bint pyarrow_is_array(object array): +cdef api bint pyarrow_is_array(object array): return isinstance(array, Array) -cdef public api shared_ptr[CArray] pyarrow_unwrap_array(object array): +cdef api shared_ptr[CArray] pyarrow_unwrap_array(object array): cdef Array arr if pyarrow_is_array(array): arr = (array) @@ -170,7 +170,7 @@ cdef public api shared_ptr[CArray] 
pyarrow_unwrap_array(object array): return shared_ptr[CArray]() -cdef public api object pyarrow_wrap_array(const shared_ptr[CArray]& sp_array): +cdef api object pyarrow_wrap_array(const shared_ptr[CArray]& sp_array): if sp_array.get() == NULL: raise ValueError('Array was NULL') @@ -186,7 +186,7 @@ cdef public api object pyarrow_wrap_array(const shared_ptr[CArray]& sp_array): return arr -cdef public api object pyarrow_wrap_chunked_array( +cdef api object pyarrow_wrap_chunked_array( const shared_ptr[CChunkedArray]& sp_array): if sp_array.get() == NULL: raise ValueError('ChunkedArray was NULL') @@ -201,11 +201,11 @@ cdef public api object pyarrow_wrap_chunked_array( return arr -cdef public api bint pyarrow_is_tensor(object tensor): +cdef api bint pyarrow_is_tensor(object tensor): return isinstance(tensor, Tensor) -cdef public api shared_ptr[CTensor] pyarrow_unwrap_tensor(object tensor): +cdef api shared_ptr[CTensor] pyarrow_unwrap_tensor(object tensor): cdef Tensor ten if pyarrow_is_tensor(tensor): ten = (tensor) @@ -214,7 +214,7 @@ cdef public api shared_ptr[CTensor] pyarrow_unwrap_tensor(object tensor): return shared_ptr[CTensor]() -cdef public api object pyarrow_wrap_tensor( +cdef api object pyarrow_wrap_tensor( const shared_ptr[CTensor]& sp_tensor): if sp_tensor.get() == NULL: raise ValueError('Tensor was NULL') @@ -224,11 +224,11 @@ cdef public api object pyarrow_wrap_tensor( return tensor -cdef public api bint pyarrow_is_column(object column): +cdef api bint pyarrow_is_column(object column): return isinstance(column, Column) -cdef public api shared_ptr[CColumn] pyarrow_unwrap_column(object column): +cdef api shared_ptr[CColumn] pyarrow_unwrap_column(object column): cdef Column col if pyarrow_is_column(column): col = (column) @@ -237,17 +237,17 @@ cdef public api shared_ptr[CColumn] pyarrow_unwrap_column(object column): return shared_ptr[CColumn]() -cdef public api object pyarrow_wrap_column(const shared_ptr[CColumn]& ccolumn): +cdef api object pyarrow_wrap_column(const shared_ptr[CColumn]& ccolumn): cdef Column column = Column.__new__(Column) column.init(ccolumn) return column -cdef public api bint pyarrow_is_table(object table): +cdef api bint pyarrow_is_table(object table): return isinstance(table, Table) -cdef public api shared_ptr[CTable] pyarrow_unwrap_table(object table): +cdef api shared_ptr[CTable] pyarrow_unwrap_table(object table): cdef Table tab if pyarrow_is_table(table): tab = (table) @@ -256,17 +256,17 @@ cdef public api shared_ptr[CTable] pyarrow_unwrap_table(object table): return shared_ptr[CTable]() -cdef public api object pyarrow_wrap_table(const shared_ptr[CTable]& ctable): +cdef api object pyarrow_wrap_table(const shared_ptr[CTable]& ctable): cdef Table table = Table.__new__(Table) table.init(ctable) return table -cdef public api bint pyarrow_is_batch(object batch): +cdef api bint pyarrow_is_batch(object batch): return isinstance(batch, RecordBatch) -cdef public api shared_ptr[CRecordBatch] pyarrow_unwrap_batch(object batch): +cdef api shared_ptr[CRecordBatch] pyarrow_unwrap_batch(object batch): cdef RecordBatch bat if pyarrow_is_batch(batch): bat = (batch) @@ -275,7 +275,7 @@ cdef public api shared_ptr[CRecordBatch] pyarrow_unwrap_batch(object batch): return shared_ptr[CRecordBatch]() -cdef public api object pyarrow_wrap_batch( +cdef api object pyarrow_wrap_batch( const shared_ptr[CRecordBatch]& cbatch): cdef RecordBatch batch = RecordBatch.__new__(RecordBatch) batch.init(cbatch) From 24d00c0783e07ba4a7247779f569cd745ae60185 Mon Sep 17 00:00:00 2001 From: Wes 
McKinney Date: Mon, 10 Dec 2018 17:55:23 -0600 Subject: [PATCH 18/45] ARROW-3248: [C++] Add "arrow" prefix to Arrow core unit tests, use PREFIX instead of file name for csv, io, ipc tests. Modular target cleanup I added a section to the cpp/README.md about the modular build targets. Author: Wes McKinney Closes #3152 from wesm/ARROW-3248 and squashes the following commits: ba3a3e58c Need to add arrow- prefix to some Travis scripts 1f3daaf78 Rename io/ipc tests/executables. Add appropriate labels/prefixes to all unit tests/benchmarks. Add labels option to ADD_BENCHMARK --- ci/cpp-msvc-build-main.bat | 2 +- ci/travis_script_python.sh | 4 +- cpp/CMakeLists.txt | 12 ----- cpp/README.md | 15 +++++- cpp/cmake_modules/BuildUtils.cmake | 24 +++++++--- cpp/cmake_modules/ThirdpartyToolchain.cmake | 6 +-- cpp/src/arrow/CMakeLists.txt | 48 +++++++++++++++++++ cpp/src/arrow/csv/CMakeLists.txt | 18 ++++--- .../{csv-chunker-test.cc => chunker-test.cc} | 0 ...builder-test.cc => column-builder-test.cc} | 0 ...er-benchmark.cc => converter-benchmark.cc} | 0 ...sv-converter-test.cc => converter-test.cc} | 0 ...arser-benchmark.cc => parser-benchmark.cc} | 0 .../{csv-parser-test.cc => parser-test.cc} | 0 cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt | 2 +- cpp/src/arrow/io/CMakeLists.txt | 24 ++++++---- .../{io-buffered-test.cc => buffered-test.cc} | 0 ...-compressed-test.cc => compressed-test.cc} | 0 ...io-file-benchmark.cc => file-benchmark.cc} | 0 .../io/{io-file-test.cc => file-test.cc} | 0 .../io/{io-hdfs-test.cc => hdfs-test.cc} | 0 ...emory-benchmark.cc => memory-benchmark.cc} | 0 .../io/{io-memory-test.cc => memory-test.cc} | 0 ...io-readahead-test.cc => readahead-test.cc} | 0 cpp/src/arrow/ipc/CMakeLists.txt | 13 +++-- ...son-simple-test.cc => json-simple-test.cc} | 0 .../ipc/{ipc-json-test.cc => json-test.cc} | 0 ...e-benchmark.cc => read-write-benchmark.cc} | 0 ...-read-write-test.cc => read-write-test.cc} | 0 cpp/src/arrow/util/CMakeLists.txt | 6 +-- cpp/src/gandiva/CMakeLists.txt | 4 +- cpp/src/gandiva/tests/CMakeLists.txt | 2 +- cpp/src/parquet/CMakeLists.txt | 4 +- cpp/src/plasma/CMakeLists.txt | 18 +++++-- 34 files changed, 145 insertions(+), 57 deletions(-) rename cpp/src/arrow/csv/{csv-chunker-test.cc => chunker-test.cc} (100%) rename cpp/src/arrow/csv/{csv-column-builder-test.cc => column-builder-test.cc} (100%) rename cpp/src/arrow/csv/{csv-converter-benchmark.cc => converter-benchmark.cc} (100%) rename cpp/src/arrow/csv/{csv-converter-test.cc => converter-test.cc} (100%) rename cpp/src/arrow/csv/{csv-parser-benchmark.cc => parser-benchmark.cc} (100%) rename cpp/src/arrow/csv/{csv-parser-test.cc => parser-test.cc} (100%) rename cpp/src/arrow/io/{io-buffered-test.cc => buffered-test.cc} (100%) rename cpp/src/arrow/io/{io-compressed-test.cc => compressed-test.cc} (100%) rename cpp/src/arrow/io/{io-file-benchmark.cc => file-benchmark.cc} (100%) rename cpp/src/arrow/io/{io-file-test.cc => file-test.cc} (100%) rename cpp/src/arrow/io/{io-hdfs-test.cc => hdfs-test.cc} (100%) rename cpp/src/arrow/io/{io-memory-benchmark.cc => memory-benchmark.cc} (100%) rename cpp/src/arrow/io/{io-memory-test.cc => memory-test.cc} (100%) rename cpp/src/arrow/io/{io-readahead-test.cc => readahead-test.cc} (100%) rename cpp/src/arrow/ipc/{ipc-json-simple-test.cc => json-simple-test.cc} (100%) rename cpp/src/arrow/ipc/{ipc-json-test.cc => json-test.cc} (100%) rename cpp/src/arrow/ipc/{ipc-read-write-benchmark.cc => read-write-benchmark.cc} (100%) rename cpp/src/arrow/ipc/{ipc-read-write-test.cc => read-write-test.cc} 
(100%) diff --git a/ci/cpp-msvc-build-main.bat b/ci/cpp-msvc-build-main.bat index 7349f8d3aca6b..8703dc9631773 100644 --- a/ci/cpp-msvc-build-main.bat +++ b/ci/cpp-msvc-build-main.bat @@ -55,7 +55,7 @@ cmake -G "%GENERATOR%" %CMAKE_ARGS% ^ .. || exit /B cmake --build . --target install --config %CONFIGURATION% || exit /B -@rem Needed so python-test.exe works +@rem Needed so arrow-python-test.exe works set OLD_PYTHONHOME=%PYTHONHOME% set PYTHONHOME=%CONDA_PREFIX% diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index b316c81f3b6b0..25bec262d861c 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -38,7 +38,7 @@ conda activate $CONDA_ENV_DIR # We should use zlib in the target Python directory to avoid loading # wrong libpython on macOS at run-time. If we use zlib in # $ARROW_BUILD_TOOLCHAIN and libpython3.6m.dylib exists in both -# $ARROW_BUILD_TOOLCHAIN and $CONDA_ENV_DIR, python-test uses +# $ARROW_BUILD_TOOLCHAIN and $CONDA_ENV_DIR, arrow-python-test uses # libpython3.6m.dylib on $ARROW_BUILD_TOOLCHAIN not $CONDA_ENV_DIR. # libpython3.6m.dylib on $ARROW_BUILD_TOOLCHAIN doesn't have NumPy. So # python-test fails. @@ -113,7 +113,7 @@ ninja install popd # python-test isn't run by travis_script_cpp.sh, exercise it here -$ARROW_CPP_BUILD_DIR/$ARROW_BUILD_TYPE/python-test +$ARROW_CPP_BUILD_DIR/$ARROW_BUILD_TYPE/arrow-python-test pushd $ARROW_PYTHON_DIR diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 68ac84e42dd6a..7140d05d577f2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -793,18 +793,6 @@ endif() add_subdirectory(src/arrow) -if(ARROW_FLIGHT) - add_subdirectory(src/arrow/flight) -endif() - -if(ARROW_PYTHON) - add_subdirectory(src/arrow/python) -endif() - -if(ARROW_HIVESERVER2) - add_subdirectory(src/arrow/dbi/hiveserver2) -endif() - if(ARROW_PARQUET) add_subdirectory(src/parquet) add_subdirectory(tools/parquet) diff --git a/cpp/README.md b/cpp/README.md index 394b23d69f8fc..7d0851762c291 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -82,7 +82,18 @@ environment variable (which requires the `locales` package or equivalent): export LC_ALL="en_US.UTF-8" ``` -## Building and Developing Parquet Libraries +## Modular Build Targets + +Since there are several major parts of the C++ project, we have provided +modular CMake targets for building each component along with its dependencies, +unit tests, and benchmarks (if enabled): + +* `make arrow` for Arrow core libraries +* `make parquet` for Parquet libraries +* `make gandiva` for Gandiva (LLVM expression compiler) libraries +* `make plasma` for Plasma libraries, server + +## Parquet Development Notes To build the C++ libraries for Apache Parquet, add the flag `-DARROW_PARQUET=ON` when invoking CMake. The Parquet libraries and unit tests @@ -120,7 +131,7 @@ with the `--ARROW_BUILD_BENCHMARKS` parameter set correctly: cmake -DARROW_BUILD_BENCHMARKS=ON .. and instead of make unittest run either `make; ctest` to run both unit tests -and benchmarks or `make runbenchmark` to run only the benchmark tests. +and benchmarks or `make benchmark` to run only the benchmark tests. Benchmark logs will be placed in the build directory under `build/benchmark-logs`. 
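As a concrete sketch of that workflow (assuming an out-of-source build directory with tests and benchmarks enabled):

cmake -DARROW_BUILD_TESTS=ON -DARROW_BUILD_BENCHMARKS=ON ..
make arrow        # Arrow core libraries plus their tests and benchmarks
make unittest     # build and run only the unit tests
make benchmark    # run only the benchmarks (formerly `make runbenchmark`)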
diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 916b9ebddb88e..bcf672823b424 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -290,7 +290,7 @@ endfunction() ############################################################ # Add a new micro benchmark, with or without an executable that should be built. # If benchmarks are enabled then they will be run along side unit tests with ctest. -# 'make runbenchmark' and 'make unittest' to build/run only benchmark or unittests, +# 'make benchmark' and 'make unittest' to build/run only benchmark or unittests, # respectively. # # REL_BENCHMARK_NAME is the name of the benchmark app. It may be a single component @@ -306,10 +306,10 @@ endfunction() # \arg PREFIX a string to append to the name of the benchmark executable. For # example, if you have src/arrow/foo/bar-benchmark.cc, then PREFIX "foo" will # create test executable foo-bar-benchmark -function(ADD_ARROW_BENCHMARK REL_BENCHMARK_NAME) +function(ADD_BENCHMARK REL_BENCHMARK_NAME) set(options) set(one_value_args) - set(multi_value_args EXTRA_LINK_LIBS DEPENDENCIES PREFIX) + set(multi_value_args EXTRA_LINK_LIBS DEPENDENCIES PREFIX LABELS) cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) if(ARG_UNPARSED_ARGUMENTS) message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") @@ -329,7 +329,7 @@ function(ADD_ARROW_BENCHMARK REL_BENCHMARK_NAME) set(BENCHMARK_PATH "${EXECUTABLE_OUTPUT_PATH}/${BENCHMARK_NAME}") add_executable(${BENCHMARK_NAME} "${REL_BENCHMARK_NAME}.cc") target_link_libraries(${BENCHMARK_NAME} ${ARROW_BENCHMARK_LINK_LIBS}) - add_dependencies(runbenchmark ${BENCHMARK_NAME}) + add_dependencies(benchmark ${BENCHMARK_NAME}) set(NO_COLOR "--color_print=false") if (ARG_EXTRA_LINK_LIBS) @@ -345,9 +345,21 @@ function(ADD_ARROW_BENCHMARK REL_BENCHMARK_NAME) add_dependencies(${BENCHMARK_NAME} ${ARG_DEPENDENCIES}) endif() + if (ARG_LABELS) + set(ARG_LABELS "${ARG_LABELS}") + else() + set(ARG_LABELS benchmark) + endif() + + foreach (TEST_LABEL ${ARG_LABELS}) + add_dependencies(${TEST_LABEL} ${BENCHMARK_NAME}) + endforeach() + add_test(${BENCHMARK_NAME} ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} benchmark ${BENCHMARK_PATH} ${NO_COLOR}) - set_tests_properties(${BENCHMARK_NAME} PROPERTIES LABELS "benchmark") + set_property(TEST ${BENCHMARK_NAME} + APPEND PROPERTY + LABELS ${ARG_LABELS}) endfunction() ############################################################ @@ -377,7 +389,7 @@ endfunction() # multiple unit tests in some subgroup, you can assign a test to multiple # groups using the syntax unittest;GROUP2;GROUP3. 
Custom targets for the group # names must exist -function(ADD_ARROW_TEST REL_TEST_NAME) +function(ADD_TEST_CASE REL_TEST_NAME) set(options NO_VALGRIND ENABLED) set(one_value_args) set(multi_value_args SOURCES STATIC_LINK_LIBS EXTRA_LINK_LIBS EXTRA_INCLUDES diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 9829a4d3fbd80..6850b0bddefc5 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -627,7 +627,7 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) endif() if(ARROW_BUILD_BENCHMARKS) - add_custom_target(runbenchmark ctest -L benchmark) + add_custom_target(benchmark ctest -L benchmark) if("$ENV{GBENCHMARK_HOME}" STREQUAL "") if(NOT MSVC) @@ -664,11 +664,11 @@ if(ARROW_BUILD_BENCHMARKS) message(STATUS "GBenchmark include dir: ${GBENCHMARK_INCLUDE_DIR}") message(STATUS "GBenchmark static library: ${GBENCHMARK_STATIC_LIB}") include_directories(SYSTEM ${GBENCHMARK_INCLUDE_DIR}) - ADD_THIRDPARTY_LIB(benchmark + ADD_THIRDPARTY_LIB(gbenchmark STATIC_LIB ${GBENCHMARK_STATIC_LIB}) if(GBENCHMARK_VENDORED) - add_dependencies(benchmark_static gbenchmark_ep) + add_dependencies(gbenchmark_static gbenchmark_ep) endif() endif() diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 8e932680de034..13aaeab494090 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -15,6 +15,42 @@ # specific language governing permissions and limitations # under the License. +add_custom_target(arrow) + +# Adding unit tests part of the "arrow" portion of the test suite +function(ADD_ARROW_TEST REL_TEST_NAME) + set(options) + set(one_value_args PREFIX) + set(multi_value_args) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + + if (ARG_PREFIX) + set(PREFIX ${ARG_PREFIX}) + else() + set(PREFIX "arrow") + endif() + ADD_TEST_CASE(${REL_TEST_NAME} + PREFIX ${PREFIX} + LABELS "unittest;arrow" + ${ARG_UNPARSED_ARGUMENTS}) +endfunction() + +function(ADD_ARROW_BENCHMARK REL_TEST_NAME) + set(options) + set(one_value_args PREFIX) + set(multi_value_args) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + if (ARG_PREFIX) + set(PREFIX ${ARG_PREFIX}) + else() + set(PREFIX "arrow") + endif() + ADD_BENCHMARK(${REL_TEST_NAME} + PREFIX ${PREFIX} + LABELS "benchmark;arrow" + ${ARG_UNPARSED_ARGUMENTS}) +endfunction() + set(ARROW_SRCS array.cc buffer.cc @@ -263,3 +299,15 @@ ADD_ARROW_BENCHMARK(column-benchmark) add_subdirectory(csv) add_subdirectory(io) add_subdirectory(util) + +if(ARROW_FLIGHT) + add_subdirectory(flight) +endif() + +if(ARROW_PYTHON) + add_subdirectory(python) +endif() + +if(ARROW_HIVESERVER2) + add_subdirectory(dbi/hiveserver2) +endif() diff --git a/cpp/src/arrow/csv/CMakeLists.txt b/cpp/src/arrow/csv/CMakeLists.txt index 84b080b1eef09..db23d6feff111 100644 --- a/cpp/src/arrow/csv/CMakeLists.txt +++ b/cpp/src/arrow/csv/CMakeLists.txt @@ -15,13 +15,19 @@ # specific language governing permissions and limitations # under the License. 
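Because every test and benchmark registered through the wrappers above now carries CTest labels ("unittest;arrow", "benchmark;arrow", and so on), subsets can also be selected directly with ctest from the build directory — a sketch:

ctest -L unittest    # every unit test, across components
ctest -L benchmark   # what the new `benchmark` custom target runs
ctest -L arrow       # only tests labeled as part of the Arrow core group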
-ADD_ARROW_TEST(csv-chunker-test) -ADD_ARROW_TEST(csv-column-builder-test) -ADD_ARROW_TEST(csv-converter-test) -ADD_ARROW_TEST(csv-parser-test) +ADD_ARROW_TEST(chunker-test + PREFIX "arrow-csv") +ADD_ARROW_TEST(column-builder-test + PREFIX "arrow-csv") +ADD_ARROW_TEST(converter-test + PREFIX "arrow-csv") +ADD_ARROW_TEST(parser-test + PREFIX "arrow-csv") -ADD_ARROW_BENCHMARK(csv-converter-benchmark) -ADD_ARROW_BENCHMARK(csv-parser-benchmark) +ADD_ARROW_BENCHMARK(converter-benchmark + PREFIX "arrow-csv") +ADD_ARROW_BENCHMARK(parser-benchmark + PREFIX "arrow-csv") # Headers: top level file(GLOB_RECURSE ARROW_CSV_HEADERS "*.h") diff --git a/cpp/src/arrow/csv/csv-chunker-test.cc b/cpp/src/arrow/csv/chunker-test.cc similarity index 100% rename from cpp/src/arrow/csv/csv-chunker-test.cc rename to cpp/src/arrow/csv/chunker-test.cc diff --git a/cpp/src/arrow/csv/csv-column-builder-test.cc b/cpp/src/arrow/csv/column-builder-test.cc similarity index 100% rename from cpp/src/arrow/csv/csv-column-builder-test.cc rename to cpp/src/arrow/csv/column-builder-test.cc diff --git a/cpp/src/arrow/csv/csv-converter-benchmark.cc b/cpp/src/arrow/csv/converter-benchmark.cc similarity index 100% rename from cpp/src/arrow/csv/csv-converter-benchmark.cc rename to cpp/src/arrow/csv/converter-benchmark.cc diff --git a/cpp/src/arrow/csv/csv-converter-test.cc b/cpp/src/arrow/csv/converter-test.cc similarity index 100% rename from cpp/src/arrow/csv/csv-converter-test.cc rename to cpp/src/arrow/csv/converter-test.cc diff --git a/cpp/src/arrow/csv/csv-parser-benchmark.cc b/cpp/src/arrow/csv/parser-benchmark.cc similarity index 100% rename from cpp/src/arrow/csv/csv-parser-benchmark.cc rename to cpp/src/arrow/csv/parser-benchmark.cc diff --git a/cpp/src/arrow/csv/csv-parser-test.cc b/cpp/src/arrow/csv/parser-test.cc similarity index 100% rename from cpp/src/arrow/csv/csv-parser-test.cc rename to cpp/src/arrow/csv/parser-test.cc diff --git a/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt b/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt index 3a16a7834c3c1..eb4446f05d971 100644 --- a/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt +++ b/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt @@ -115,7 +115,7 @@ if (ARROW_BUILD_TESTS) STATIC_LINK_LIBS "${ARROW_HIVESERVER2_TEST_LINK_LIBS}" LABELS "arrow_hiveserver2" ) - set_property(TARGET hiveserver2-test + set_property(TARGET arrow-hiveserver2-test APPEND_STRING PROPERTY COMPILE_FLAGS " -Wno-shadow-field") endif(ARROW_BUILD_TESTS) diff --git a/cpp/src/arrow/io/CMakeLists.txt b/cpp/src/arrow/io/CMakeLists.txt index d21bb16755271..80d68fb503bb9 100644 --- a/cpp/src/arrow/io/CMakeLists.txt +++ b/cpp/src/arrow/io/CMakeLists.txt @@ -18,19 +18,27 @@ # ---------------------------------------------------------------------- # arrow_io : Arrow IO interfaces -ADD_ARROW_TEST(io-buffered-test) -ADD_ARROW_TEST(io-compressed-test) -ADD_ARROW_TEST(io-file-test) +ADD_ARROW_TEST(buffered-test + PREFIX "arrow-io") +ADD_ARROW_TEST(compressed-test + PREFIX "arrow-io") +ADD_ARROW_TEST(file-test + PREFIX "arrow-io") if (ARROW_HDFS AND NOT ARROW_BOOST_HEADER_ONLY) - ADD_ARROW_TEST(io-hdfs-test NO_VALGRIND) + ADD_ARROW_TEST(hdfs-test NO_VALGRIND + PREFIX "arrow-io") endif() -ADD_ARROW_TEST(io-memory-test) -ADD_ARROW_TEST(io-readahead-test) +ADD_ARROW_TEST(memory-test + PREFIX "arrow-io") +ADD_ARROW_TEST(readahead-test + PREFIX "arrow-io") -ADD_ARROW_BENCHMARK(io-file-benchmark) -ADD_ARROW_BENCHMARK(io-memory-benchmark) +ADD_ARROW_BENCHMARK(file-benchmark + PREFIX "arrow-io") +ADD_ARROW_BENCHMARK(memory-benchmark + 
PREFIX "arrow-io") # Headers: top level install(FILES diff --git a/cpp/src/arrow/io/io-buffered-test.cc b/cpp/src/arrow/io/buffered-test.cc similarity index 100% rename from cpp/src/arrow/io/io-buffered-test.cc rename to cpp/src/arrow/io/buffered-test.cc diff --git a/cpp/src/arrow/io/io-compressed-test.cc b/cpp/src/arrow/io/compressed-test.cc similarity index 100% rename from cpp/src/arrow/io/io-compressed-test.cc rename to cpp/src/arrow/io/compressed-test.cc diff --git a/cpp/src/arrow/io/io-file-benchmark.cc b/cpp/src/arrow/io/file-benchmark.cc similarity index 100% rename from cpp/src/arrow/io/io-file-benchmark.cc rename to cpp/src/arrow/io/file-benchmark.cc diff --git a/cpp/src/arrow/io/io-file-test.cc b/cpp/src/arrow/io/file-test.cc similarity index 100% rename from cpp/src/arrow/io/io-file-test.cc rename to cpp/src/arrow/io/file-test.cc diff --git a/cpp/src/arrow/io/io-hdfs-test.cc b/cpp/src/arrow/io/hdfs-test.cc similarity index 100% rename from cpp/src/arrow/io/io-hdfs-test.cc rename to cpp/src/arrow/io/hdfs-test.cc diff --git a/cpp/src/arrow/io/io-memory-benchmark.cc b/cpp/src/arrow/io/memory-benchmark.cc similarity index 100% rename from cpp/src/arrow/io/io-memory-benchmark.cc rename to cpp/src/arrow/io/memory-benchmark.cc diff --git a/cpp/src/arrow/io/io-memory-test.cc b/cpp/src/arrow/io/memory-test.cc similarity index 100% rename from cpp/src/arrow/io/io-memory-test.cc rename to cpp/src/arrow/io/memory-test.cc diff --git a/cpp/src/arrow/io/io-readahead-test.cc b/cpp/src/arrow/io/readahead-test.cc similarity index 100% rename from cpp/src/arrow/io/io-readahead-test.cc rename to cpp/src/arrow/io/readahead-test.cc diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 40cebf1823e2c..bda4ef3e417d5 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -19,9 +19,12 @@ # Messaging and interprocess communication ADD_ARROW_TEST(feather-test) -ADD_ARROW_TEST(ipc-read-write-test) -ADD_ARROW_TEST(ipc-json-simple-test) -ADD_ARROW_TEST(ipc-json-test) +ADD_ARROW_TEST(read-write-test + PREFIX "arrow-ipc") +ADD_ARROW_TEST(json-simple-test + PREFIX "arrow-ipc") +ADD_ARROW_TEST(json-test + PREFIX "arrow-ipc") if (NOT ARROW_BOOST_HEADER_ONLY) ADD_ARROW_TEST(json-integration-test @@ -116,6 +119,6 @@ if (ARROW_BUILD_UTILITIES) target_link_libraries(stream-to-file ${UTIL_LINK_LIBS}) endif() -ADD_ARROW_BENCHMARK(ipc-read-write-benchmark) - +ADD_ARROW_BENCHMARK(read-write-benchmark + PREFIX "arrow-ipc") ADD_ARROW_FUZZING(ipc-fuzzing-test) diff --git a/cpp/src/arrow/ipc/ipc-json-simple-test.cc b/cpp/src/arrow/ipc/json-simple-test.cc similarity index 100% rename from cpp/src/arrow/ipc/ipc-json-simple-test.cc rename to cpp/src/arrow/ipc/json-simple-test.cc diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/json-test.cc similarity index 100% rename from cpp/src/arrow/ipc/ipc-json-test.cc rename to cpp/src/arrow/ipc/json-test.cc diff --git a/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc b/cpp/src/arrow/ipc/read-write-benchmark.cc similarity index 100% rename from cpp/src/arrow/ipc/ipc-read-write-benchmark.cc rename to cpp/src/arrow/ipc/read-write-benchmark.cc diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/read-write-test.cc similarity index 100% rename from cpp/src/arrow/ipc/ipc-read-write-test.cc rename to cpp/src/arrow/ipc/read-write-test.cc diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 6b9c3590b44dc..4f515b52e8e64 100644 --- 
a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -68,16 +68,16 @@ if (ARROW_BUILD_BENCHMARKS) add_library(arrow_benchmark_main benchmark_main.cc) if (APPLE) target_link_libraries(arrow_benchmark_main - benchmark_static + gbenchmark_static ) elseif(MSVC) target_link_libraries(arrow_benchmark_main - benchmark_static + gbenchmark_static Shlwapi.lib ) else() target_link_libraries(arrow_benchmark_main - benchmark_static + gbenchmark_static pthread ) endif() diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 1f76f7841590a..5d75aa271152b 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -153,11 +153,11 @@ function(ADD_GANDIVA_TEST REL_TEST_NAME) # and uses less disk space, but in some cases we need to force static # linking (see rationale below). if (ARG_USE_STATIC_LINKING) - ADD_ARROW_TEST(${REL_TEST_NAME} + ADD_TEST_CASE(${REL_TEST_NAME} ${TEST_ARGUMENTS} STATIC_LINK_LIBS ${GANDIVA_STATIC_TEST_LINK_LIBS}) else() - ADD_ARROW_TEST(${REL_TEST_NAME} + ADD_TEST_CASE(${REL_TEST_NAME} ${TEST_ARGUMENTS} STATIC_LINK_LIBS ${GANDIVA_SHARED_TEST_LINK_LIBS}) endif() diff --git a/cpp/src/gandiva/tests/CMakeLists.txt b/cpp/src/gandiva/tests/CMakeLists.txt index 1fd30aac495cf..9558fc0757f7b 100644 --- a/cpp/src/gandiva/tests/CMakeLists.txt +++ b/cpp/src/gandiva/tests/CMakeLists.txt @@ -32,6 +32,6 @@ ADD_GANDIVA_TEST(projector_test_static SOURCES projector_test.cc USE_STATIC_LINKING) -ADD_ARROW_BENCHMARK(micro_benchmarks +ADD_BENCHMARK(micro_benchmarks PREFIX "gandiva" EXTRA_LINK_LIBS gandiva_static) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 1538b58164b62..246f69dcc09fa 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -38,12 +38,12 @@ function(ADD_PARQUET_TEST REL_TEST_NAME) # and uses less disk space, but in some cases we need to force static # linking (see rationale below). if (ARG_USE_STATIC_LINKING) - ADD_ARROW_TEST(${REL_TEST_NAME} + ADD_TEST_CASE(${REL_TEST_NAME} STATIC_LINK_LIBS ${PARQUET_STATIC_TEST_LINK_LIBS} PREFIX "parquet" LABELS "unittest;parquet") else() - ADD_ARROW_TEST(${REL_TEST_NAME} + ADD_TEST_CASE(${REL_TEST_NAME} STATIC_LINK_LIBS ${PARQUET_SHARED_TEST_LINK_LIBS} PREFIX "parquet" LABELS "unittest;parquet") diff --git a/cpp/src/plasma/CMakeLists.txt b/cpp/src/plasma/CMakeLists.txt index 0f8916e6c48aa..4ea4b76066cf7 100644 --- a/cpp/src/plasma/CMakeLists.txt +++ b/cpp/src/plasma/CMakeLists.txt @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-cmake_minimum_required(VERSION 3.2) +add_custom_target(plasma) # For the moment, Plasma is versioned like Arrow project(plasma VERSION "${ARROW_BASE_VERSION}") @@ -198,8 +198,20 @@ endif() # Unit tests ####################################### -ADD_ARROW_TEST(test/serialization_tests +# Adding unit tests part of the "arrow" portion of the test suite +function(ADD_PLASMA_TEST REL_TEST_NAME) + set(options) + set(one_value_args) + set(multi_value_args) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + ADD_TEST_CASE(${REL_TEST_NAME} + PREFIX "plasma" + LABELS "unittest;plasma" + ${ARG_UNPARSED_ARGUMENTS}) +endfunction() + +ADD_PLASMA_TEST(test/serialization_tests EXTRA_LINK_LIBS plasma_shared ${PLASMA_LINK_LIBS}) -ADD_ARROW_TEST(test/client_tests +ADD_PLASMA_TEST(test/client_tests EXTRA_LINK_LIBS plasma_shared ${PLASMA_LINK_LIBS} EXTRA_DEPENDENCIES plasma_store_server) From 2428945c0684bed4295f783caaf4a681ef785d90 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 10 Dec 2018 20:03:34 -0700 Subject: [PATCH 19/45] ARROW-3880: [Rust] Implement simple math operations for numeric arrays Author: Andy Grove Closes #3033 from andygrove/ARROW-3880 and squashes the following commits: 17cd418 merge from master afb3518 Move min and max to array_ops 0c77c61 code cleanup f8bfb41 move comparison ops to array_ops 7a5975e Move math ops into new array_ops source file 7946142 Address PR feedback adfe4b0 merge from master and fix conflicts 5ed5f6e add comparison operations 42c68af re-implement with generics 963def6 Merge branch 'master' into ARROW-3880 729cd9a fix formatting 405c63e re-implement using macros 5876fb7 save work a2b87e2 merge from master, comment out new methods 2a43b3f merge from master 06bbc4a improve handling of divide by zero, format for rust nightly 1ea98cf Improve error handling dcad28a cargo fmt 12dc05b Implement simple math operations for numeric arrays --- rust/Cargo.toml | 1 + rust/src/array.rs | 47 ----- rust/src/array_ops.rs | 418 ++++++++++++++++++++++++++++++++++++++++++ rust/src/error.rs | 2 + rust/src/lib.rs | 1 + 5 files changed, 422 insertions(+), 47 deletions(-) create mode 100644 rust/src/array_ops.rs diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 39de50c8a336d..aa23815f74085 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -42,6 +42,7 @@ serde_derive = "1.0.80" serde_json = "1.0.13" rand = "0.5" csv = "1.0.0" +num = "0.2" [dev-dependencies] criterion = "0.2" diff --git a/rust/src/array.rs b/rust/src/array.rs index 51bc8d993c19b..11e732a1267ea 100644 --- a/rust/src/array.rs +++ b/rust/src/array.rs @@ -200,39 +200,6 @@ impl PrimitiveArray { &raw[offset..offset + len] } - /// Returns the minimum value in the array, according to the natural order. - pub fn min(&self) -> Option { - self.min_max_helper(|a, b| a < b) - } - - /// Returns the maximum value in the array, according to the natural order. 
- pub fn max(&self) -> Option { - self.min_max_helper(|a, b| a > b) - } - - fn min_max_helper(&self, cmp: F) -> Option - where - F: Fn(T::Native, T::Native) -> bool, - { - let mut n: Option = None; - let data = self.data(); - for i in 0..data.len() { - if data.is_null(i) { - continue; - } - let m = self.value(i); - match n { - None => n = Some(m), - Some(nn) => { - if cmp(m, nn) { - n = Some(m) - } - } - } - } - n - } - // Returns a new primitive array builder pub fn builder(capacity: usize) -> PrimitiveArrayBuilder { PrimitiveArrayBuilder::::new(capacity) @@ -1218,20 +1185,6 @@ mod tests { BinaryArray::from(array_data); } - #[test] - fn test_buffer_array_min_max() { - let a = Int32Array::from(vec![5, 6, 7, 8, 9]); - assert_eq!(5, a.min().unwrap()); - assert_eq!(9, a.max().unwrap()); - } - - #[test] - fn test_buffer_array_min_max_with_nulls() { - let a = Int32Array::from(vec![Some(5), None, None, Some(8), Some(9)]); - assert_eq!(5, a.min().unwrap()); - assert_eq!(9, a.max().unwrap()); - } - #[test] fn test_access_array_concurrently() { let a = Int32Array::from(vec![5, 6, 7, 8, 9]); diff --git a/rust/src/array_ops.rs b/rust/src/array_ops.rs new file mode 100644 index 0000000000000..e73a858e951b1 --- /dev/null +++ b/rust/src/array_ops.rs @@ -0,0 +1,418 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
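Before the body of the new `array_ops` module below, a usage sketch of the kernels it introduces (test-style; the assertions mirror this patch's own tests, and the `arrow::` paths assume the crate layout in this patch):

extern crate arrow;

use arrow::array::{Array, Int32Array};
use arrow::array_ops;

fn main() {
    let a = Int32Array::from(vec![Some(1), None, Some(3)]);
    let b = Int32Array::from(vec![Some(10), Some(20), Some(30)]);

    // Element-wise math: a null on either side yields a null slot.
    let sum = array_ops::add(&a, &b).unwrap();
    assert_eq!(11, sum.value(0));
    assert!(sum.is_null(1));

    // Aggregates skip nulls.
    assert_eq!(Some(3), array_ops::max(&a));

    // Comparisons treat a null on the left as "less than" (see bool_op).
    let lt = array_ops::lt(&a, &b).unwrap();
    assert_eq!(true, lt.value(1));

    // Division by zero surfaces as ArrowError::DivideByZero.
    let zero = Int32Array::from(vec![0, 0, 0]);
    assert!(array_ops::divide(&b, &zero).is_err());
}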
+ +use std::ops::{Add, Div, Mul, Sub}; + +use num::Zero; + +use crate::array::{Array, BooleanArray, PrimitiveArray}; +use crate::builder::{ArrayBuilder, PrimitiveArrayBuilder}; +use crate::datatypes; +use crate::datatypes::ArrowNumericType; +use crate::error::{ArrowError, Result}; + +pub fn add(left: &PrimitiveArray, right: &PrimitiveArray) -> Result> +where + T: datatypes::ArrowNumericType, + T::Native: Add + + Sub + + Mul + + Div + + Zero, +{ + math_op(left, right, |a, b| Ok(a + b)) +} + +pub fn subtract(left: &PrimitiveArray, right: &PrimitiveArray) -> Result> +where + T: datatypes::ArrowNumericType, + T::Native: Add + + Sub + + Mul + + Div + + Zero, +{ + math_op(left, right, |a, b| Ok(a - b)) +} + +pub fn multiply(left: &PrimitiveArray, right: &PrimitiveArray) -> Result> +where + T: datatypes::ArrowNumericType, + T::Native: Add + + Sub + + Mul + + Div + + Zero, +{ + math_op(left, right, |a, b| Ok(a * b)) +} + +pub fn divide(left: &PrimitiveArray, right: &PrimitiveArray) -> Result> +where + T: datatypes::ArrowNumericType, + T::Native: Add + + Sub + + Mul + + Div + + Zero, +{ + math_op(left, right, |a, b| { + if b.is_zero() { + Err(ArrowError::DivideByZero) + } else { + Ok(a / b) + } + }) +} + +fn math_op( + left: &PrimitiveArray, + right: &PrimitiveArray, + op: F, +) -> Result> +where + T: datatypes::ArrowNumericType, + F: Fn(T::Native, T::Native) -> Result, +{ + if left.len() != right.len() { + return Err(ArrowError::ComputeError( + "Cannot perform math operation on two batches of different length".to_string(), + )); + } + let mut b = PrimitiveArrayBuilder::::new(left.len()); + for i in 0..left.len() { + let index = i; + if left.is_null(i) || right.is_null(i) { + b.push_null().unwrap(); + } else { + b.push(op(left.value(index), right.value(index))?).unwrap(); + } + } + Ok(b.finish()) +} + +/// Returns the minimum value in the array, according to the natural order. +pub fn min(array: &PrimitiveArray) -> Option +where + T: ArrowNumericType, +{ + min_max_helper(array, |a, b| a < b) +} + +/// Returns the maximum value in the array, according to the natural order. 
+pub fn max(array: &PrimitiveArray) -> Option +where + T: ArrowNumericType, +{ + min_max_helper(array, |a, b| a > b) +} + +fn min_max_helper(array: &PrimitiveArray, cmp: F) -> Option +where + T: ArrowNumericType, + F: Fn(T::Native, T::Native) -> bool, +{ + let mut n: Option = None; + let data = array.data(); + for i in 0..data.len() { + if data.is_null(i) { + continue; + } + let m = array.value(i); + match n { + None => n = Some(m), + Some(nn) => { + if cmp(m, nn) { + n = Some(m) + } + } + } + } + n +} + +pub fn eq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result +where + T: ArrowNumericType, +{ + bool_op(left, right, |a, b| a == b) +} + +pub fn neq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result +where + T: ArrowNumericType, +{ + bool_op(left, right, |a, b| a != b) +} + +pub fn lt(left: &PrimitiveArray, right: &PrimitiveArray) -> Result +where + T: ArrowNumericType, +{ + bool_op(left, right, |a, b| match (a, b) { + (None, _) => true, + (_, None) => false, + (Some(aa), Some(bb)) => aa < bb, + }) +} + +pub fn lt_eq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result +where + T: ArrowNumericType, +{ + bool_op(left, right, |a, b| match (a, b) { + (None, _) => true, + (_, None) => false, + (Some(aa), Some(bb)) => aa <= bb, + }) +} + +pub fn gt(left: &PrimitiveArray, right: &PrimitiveArray) -> Result +where + T: ArrowNumericType, +{ + bool_op(left, right, |a, b| match (a, b) { + (None, _) => false, + (_, None) => true, + (Some(aa), Some(bb)) => aa > bb, + }) +} + +pub fn gt_eq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result +where + T: ArrowNumericType, +{ + bool_op(left, right, |a, b| match (a, b) { + (None, _) => false, + (_, None) => true, + (Some(aa), Some(bb)) => aa >= bb, + }) +} + +fn bool_op(left: &PrimitiveArray, right: &PrimitiveArray, op: F) -> Result +where + T: ArrowNumericType, + F: Fn(Option, Option) -> bool, +{ + if left.len() != right.len() { + return Err(ArrowError::ComputeError( + "Cannot perform math operation on two batches of different length".to_string(), + )); + } + let mut b = BooleanArray::builder(left.len()); + for i in 0..left.len() { + let index = i; + let l = if left.is_null(i) { + None + } else { + Some(left.value(index)) + }; + let r = if right.is_null(i) { + None + } else { + Some(right.value(index)) + }; + b.push(op(l, r)).unwrap(); + } + Ok(b.finish()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::array::{Float64Array, Int32Array}; + + #[test] + fn test_primitive_array_add() { + let a = Int32Array::from(vec![5, 6, 7, 8, 9]); + let b = Int32Array::from(vec![6, 7, 8, 9, 8]); + let c = add(&a, &b).unwrap(); + assert_eq!(11, c.value(0)); + assert_eq!(13, c.value(1)); + assert_eq!(15, c.value(2)); + assert_eq!(17, c.value(3)); + assert_eq!(17, c.value(4)); + } + + #[test] + fn test_primitive_array_add_mismatched_length() { + let a = Int32Array::from(vec![5, 6, 7, 8, 9]); + let b = Int32Array::from(vec![6, 7, 8]); + let e = add(&a, &b) + .err() + .expect("should have failed due to different lengths"); + assert_eq!( + "ComputeError(\"Cannot perform math operation on two batches of different length\")", + format!("{:?}", e) + ); + } + + #[test] + fn test_primitive_array_subtract() { + let a = Int32Array::from(vec![1, 2, 3, 4, 5]); + let b = Int32Array::from(vec![5, 4, 3, 2, 1]); + let c = subtract(&a, &b).unwrap(); + assert_eq!(-4, c.value(0)); + assert_eq!(-2, c.value(1)); + assert_eq!(0, c.value(2)); + assert_eq!(2, c.value(3)); + assert_eq!(4, c.value(4)); + } + + #[test] + fn test_primitive_array_multiply() { + let a = 
Int32Array::from(vec![5, 6, 7, 8, 9]); + let b = Int32Array::from(vec![6, 7, 8, 9, 8]); + let c = multiply(&a, &b).unwrap(); + assert_eq!(30, c.value(0)); + assert_eq!(42, c.value(1)); + assert_eq!(56, c.value(2)); + assert_eq!(72, c.value(3)); + assert_eq!(72, c.value(4)); + } + + #[test] + fn test_primitive_array_divide() { + let a = Int32Array::from(vec![15, 15, 8, 1, 9]); + let b = Int32Array::from(vec![5, 6, 8, 9, 1]); + let c = divide(&a, &b).unwrap(); + assert_eq!(3, c.value(0)); + assert_eq!(2, c.value(1)); + assert_eq!(1, c.value(2)); + assert_eq!(0, c.value(3)); + assert_eq!(9, c.value(4)); + } + + #[test] + fn test_primitive_array_divide_by_zero() { + let a = Int32Array::from(vec![15]); + let b = Int32Array::from(vec![0]); + assert_eq!( + ArrowError::DivideByZero, + divide(&a, &b).err().expect("divide by zero should fail") + ); + } + + #[test] + fn test_primitive_array_divide_f64() { + let a = Float64Array::from(vec![15.0, 15.0, 8.0]); + let b = Float64Array::from(vec![5.0, 6.0, 8.0]); + let c = divide(&a, &b).unwrap(); + assert_eq!(3.0, c.value(0)); + assert_eq!(2.5, c.value(1)); + assert_eq!(1.0, c.value(2)); + } + + #[test] + fn test_primitive_array_add_with_nulls() { + let a = Int32Array::from(vec![Some(5), None, Some(7), None]); + let b = Int32Array::from(vec![None, None, Some(6), Some(7)]); + let c = add(&a, &b).unwrap(); + assert_eq!(true, c.is_null(0)); + assert_eq!(true, c.is_null(1)); + assert_eq!(false, c.is_null(2)); + assert_eq!(true, c.is_null(3)); + assert_eq!(13, c.value(2)); + } + + #[test] + fn test_primitive_array_eq() { + let a = Int32Array::from(vec![8, 8, 8, 8, 8]); + let b = Int32Array::from(vec![6, 7, 8, 9, 10]); + let c = eq(&a, &b).unwrap(); + assert_eq!(false, c.value(0)); + assert_eq!(false, c.value(1)); + assert_eq!(true, c.value(2)); + assert_eq!(false, c.value(3)); + assert_eq!(false, c.value(4)); + } + + #[test] + fn test_primitive_array_neq() { + let a = Int32Array::from(vec![8, 8, 8, 8, 8]); + let b = Int32Array::from(vec![6, 7, 8, 9, 10]); + let c = neq(&a, &b).unwrap(); + assert_eq!(true, c.value(0)); + assert_eq!(true, c.value(1)); + assert_eq!(false, c.value(2)); + assert_eq!(true, c.value(3)); + assert_eq!(true, c.value(4)); + } + + #[test] + fn test_primitive_array_lt() { + let a = Int32Array::from(vec![8, 8, 8, 8, 8]); + let b = Int32Array::from(vec![6, 7, 8, 9, 10]); + let c = lt(&a, &b).unwrap(); + assert_eq!(false, c.value(0)); + assert_eq!(false, c.value(1)); + assert_eq!(false, c.value(2)); + assert_eq!(true, c.value(3)); + assert_eq!(true, c.value(4)); + } + + #[test] + fn test_primitive_array_lt_eq() { + let a = Int32Array::from(vec![8, 8, 8, 8, 8]); + let b = Int32Array::from(vec![6, 7, 8, 9, 10]); + let c = lt_eq(&a, &b).unwrap(); + assert_eq!(false, c.value(0)); + assert_eq!(false, c.value(1)); + assert_eq!(true, c.value(2)); + assert_eq!(true, c.value(3)); + assert_eq!(true, c.value(4)); + } + + #[test] + fn test_primitive_array_gt() { + let a = Int32Array::from(vec![8, 8, 8, 8, 8]); + let b = Int32Array::from(vec![6, 7, 8, 9, 10]); + let c = gt(&a, &b).unwrap(); + assert_eq!(true, c.value(0)); + assert_eq!(true, c.value(1)); + assert_eq!(false, c.value(2)); + assert_eq!(false, c.value(3)); + assert_eq!(false, c.value(4)); + } + + #[test] + fn test_primitive_array_gt_eq() { + let a = Int32Array::from(vec![8, 8, 8, 8, 8]); + let b = Int32Array::from(vec![6, 7, 8, 9, 10]); + let c = gt_eq(&a, &b).unwrap(); + assert_eq!(true, c.value(0)); + assert_eq!(true, c.value(1)); + assert_eq!(true, c.value(2)); + assert_eq!(false, 
c.value(3)); + assert_eq!(false, c.value(4)); + } + + #[test] + fn test_buffer_array_min_max() { + let a = Int32Array::from(vec![5, 6, 7, 8, 9]); + assert_eq!(5, min(&a).unwrap()); + assert_eq!(9, max(&a).unwrap()); + } + + #[test] + fn test_buffer_array_min_max_with_nulls() { + let a = Int32Array::from(vec![Some(5), None, None, Some(8), Some(9)]); + assert_eq!(5, min(&a).unwrap()); + assert_eq!(9, max(&a).unwrap()); + } + +} diff --git a/rust/src/error.rs b/rust/src/error.rs index d82ee1190a68c..559b2d7205994 100644 --- a/rust/src/error.rs +++ b/rust/src/error.rs @@ -19,6 +19,8 @@ pub enum ArrowError { MemoryError(String), ParseError(String), + ComputeError(String), + DivideByZero, } pub type Result = ::std::result::Result; diff --git a/rust/src/lib.rs b/rust/src/lib.rs index e1670ff055971..b661c21279d22 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -27,6 +27,7 @@ extern crate serde_json; pub mod array; pub mod array_data; +pub mod array_ops; pub mod bitmap; pub mod buffer; pub mod builder; From bb3fa4b871b26df786c8f67b23208aae719b56e9 Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Mon, 10 Dec 2018 21:30:34 -0600 Subject: [PATCH 20/45] ARROW-3993: [JS] CI Jobs Failing Use `gulp@4.0.0` rather than `gulp@next` Author: Brian Hulette Closes #3153 from TheNeuralBit/gulp-fix and squashes the following commits: e5d2e74c4 gulp@{next->4.0.0} --- integration/integration_test.py | 2 +- js/package.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/integration/integration_test.py b/integration/integration_test.py index 8021aa643263e..3bd37bdd80677 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -1053,7 +1053,7 @@ class CPPTester(Tester): 'ARROW_CPP_EXE_PATH', os.path.join(ARROW_HOME, 'cpp/build/debug')) - CPP_INTEGRATION_EXE = os.path.join(EXE_PATH, 'json-integration-test') + CPP_INTEGRATION_EXE = os.path.join(EXE_PATH, 'arrow-json-integration-test') STREAM_TO_FILE = os.path.join(EXE_PATH, 'stream-to-file') FILE_TO_STREAM = os.path.join(EXE_PATH, 'file-to-stream') diff --git a/js/package.json b/js/package.json index 9f76819c2e1fd..cf49e41dbe2f4 100644 --- a/js/package.json +++ b/js/package.json @@ -72,7 +72,7 @@ "del": "3.0.0", "glob": "7.1.3", "google-closure-compiler": "20181008.0.0", - "gulp": "next", + "gulp": "4.0.0", "gulp-json-transform": "0.4.5", "gulp-rename": "1.4.0", "gulp-sourcemaps": "2.6.4", From e7341356a365711ef2e19f2a4cb3ee59bc55296d Mon Sep 17 00:00:00 2001 From: "Korn, Uwe" Date: Tue, 11 Dec 2018 15:01:03 +0100 Subject: [PATCH 21/45] ARROW-3995: [CI] Use understandable names on Travis Author: Korn, Uwe Closes #3158 from xhochy/travis-names and squashes the following commits: f268f276 ARROW-3995: Use understandable names on Travis --- .travis.yml | 60 ++++++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7489d72c80502..42b1275d1c4bf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -50,8 +50,8 @@ matrix: - jdk: oraclejdk9 - language: r include: - # Lint C++, Python, R - - os: linux + - name: "Lint C++, Python, R" + os: linux language: python python: "3.6" env: @@ -62,8 +62,8 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh script: - $TRAVIS_BUILD_DIR/ci/travis_lint.sh - # C++ & Python w/ gcc 4.9 - - compiler: gcc + - name: "C++ & Python w/ gcc 4.9" + compiler: gcc language: cpp os: linux jdk: openjdk8 @@ -102,8 +102,8 @@ matrix: - export PLASMA_VALGRIND=1 - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 - 
$TRAVIS_BUILD_DIR/ci/travis_upload_cpp_coverage.sh - # Gandiva C++ w/ gcc 4.9 and Java - - compiler: gcc + - name: "Gandiva C++ w/ gcc 4.9 and Java" + compiler: gcc language: cpp os: linux jdk: openjdk8 @@ -123,8 +123,8 @@ matrix: script: - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_cpp.sh - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh - # [OS X] C++ & Python w/ XCode 6.4 - - compiler: clang + - name: "[OS X] C++ & Python w/ XCode 6.4" + compiler: clang language: cpp osx_image: xcode6.4 os: osx @@ -145,8 +145,8 @@ matrix: - if [ $ARROW_CI_CPP_AFFECTED == "1" ]; then $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh; fi - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 - # [OS X] Gandiva C++ w/ XCode 8.3 & Java - - compiler: clang + - name: "[OS X] Gandiva C++ w/ XCode 8.3 & Java" + compiler: clang language: cpp # xcode 7.3 has a bug in strptime. osx_image: xcode8.3 @@ -164,14 +164,14 @@ matrix: script: - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_cpp.sh - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh - # [manylinux1] Python - - language: cpp + - name: "[manylinux1] Python" + language: cpp before_script: - if [ $ARROW_CI_PYTHON_AFFECTED == "1" ]; then docker pull quay.io/xhochy/arrow_manylinux1_x86_64_base:latest; fi script: - if [ $ARROW_CI_PYTHON_AFFECTED == "1" ]; then $TRAVIS_BUILD_DIR/ci/travis_script_manylinux.sh; fi - # Java w/ OpenJDK 8 - - language: java + - name: "Java w/ OpenJDK 8" + language: java os: linux jdk: openjdk8 before_script: @@ -180,8 +180,8 @@ matrix: script: - $TRAVIS_BUILD_DIR/ci/travis_script_java.sh - $TRAVIS_BUILD_DIR/ci/travis_script_javadoc.sh - # Java w/ Oracle JDK 9 - - language: java + - name: "Java w/ Oracle JDK 9" + language: java os: linux jdk: oraclejdk9 before_script: @@ -192,8 +192,8 @@ matrix: apt: packages: - oracle-java9-installer - # Integration w/ OpenJDK 8 - - language: java + - name: "Integration w/ OpenJDK 8" + language: java os: linux env: ARROW_TEST_GROUP=integration jdk: openjdk8 @@ -212,8 +212,8 @@ matrix: script: - $TRAVIS_BUILD_DIR/ci/travis_script_integration.sh - $TRAVIS_BUILD_DIR/ci/travis_script_plasma_java_client.sh - # NodeJS - - language: node_js + - name: "NodeJS" + language: node_js os: linux node_js: - '10.1' @@ -223,8 +223,8 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_before_script_js.sh script: - $TRAVIS_BUILD_DIR/ci/travis_script_js.sh - # C++ & GLib & Ruby w/ gcc 4.9 - - compiler: gcc + - name: "C++ & GLib & Ruby w/ gcc 4.9" + compiler: gcc language: cpp os: linux env: @@ -245,8 +245,8 @@ matrix: script: - if [ $ARROW_CI_C_GLIB_AFFECTED = "1" ]; then $TRAVIS_BUILD_DIR/ci/travis_script_c_glib.sh; fi - $TRAVIS_BUILD_DIR/ci/travis_script_ruby.sh - # [OS X] C++ & GLib & Ruby w/ XCode 8.3 & homebrew - - compiler: clang + - name: "[OS X] C++ & GLib & Ruby w/ XCode 8.3 & homebrew" + compiler: clang osx_image: xcode8.3 os: osx env: @@ -266,8 +266,8 @@ matrix: script: - if [ $ARROW_CI_C_GLIB_AFFECTED = "1" ]; then $TRAVIS_BUILD_DIR/ci/travis_script_c_glib.sh; fi - $TRAVIS_BUILD_DIR/ci/travis_script_ruby.sh - # Rust - - language: rust + - name: Rust + language: rust cache: cargo addons: apt: @@ -289,8 +289,8 @@ matrix: - mkdir -p target/kcov - RUST_BACKTRACE=1 RUSTUP_TOOLCHAIN=stable cargo coverage --verbose - bash <(curl -s https://codecov.io/bash) || echo "Codecov did not collect coverage reports" - # Go - - language: go + - name: Go + language: go go_import_path: github.com/apache/arrow os: linux go: @@ -302,8 +302,8 @@ matrix: after_success: - pushd ${TRAVIS_BUILD_DIR}/go/arrow 
- bash <(curl -s https://codecov.io/bash) || echo "Codecov did not collect coverage reports" - # R - - language: r + - name: R + language: r cache: packages latex: false before_install: From a1eff5f3eee7609ce2a67b051d26aca810961f43 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 11 Dec 2018 15:16:06 +0100 Subject: [PATCH 22/45] ARROW-3986: [C++] Write prose documentation Author: Antoine Pitrou Closes #3149 from pitrou/ARROW-3986-cpp-prose-doc and squashes the following commits: 77e37940 ARROW-3986: Write prose documentation --- .gitignore | 9 ++ cpp/src/arrow/builder.h | 15 +- cpp/src/arrow/status.h | 61 +++++--- cpp/src/arrow/table.h | 7 + cpp/src/arrow/type.h | 16 ++- cpp/src/arrow/type_fwd.h | 7 + docs/source/cpp/api.rst | 8 +- docs/source/cpp/api/array.rst | 23 ++- docs/source/cpp/api/builder.rst | 59 ++++++++ docs/source/cpp/api/datatype.rst | 135 ++++++++++++++++++ docs/source/cpp/api/memory.rst | 4 +- docs/source/cpp/api/support.rst | 29 ++++ docs/source/cpp/arrays.rst | 211 ++++++++++++++++++++++++++++ docs/source/cpp/conventions.rst | 91 ++++++++++++ docs/source/cpp/datatypes.rst | 65 +++++++++ docs/source/cpp/getting_started.rst | 30 ++++ docs/source/cpp/index.rst | 65 +-------- docs/source/cpp/overview.rst | 93 ++++++++++++ docs/source/format/Metadata.rst | 2 + 19 files changed, 822 insertions(+), 108 deletions(-) create mode 100644 docs/source/cpp/api/builder.rst create mode 100644 docs/source/cpp/api/datatype.rst create mode 100644 docs/source/cpp/api/support.rst create mode 100644 docs/source/cpp/arrays.rst create mode 100644 docs/source/cpp/conventions.rst create mode 100644 docs/source/cpp/datatypes.rst create mode 100644 docs/source/cpp/getting_started.rst create mode 100644 docs/source/cpp/overview.rst diff --git a/.gitignore b/.gitignore index 5817efdcac091..61440bb504664 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ apache-rat-*.jar arrow-src.tar +arrow-src.tar.gz # Compiled source *.a @@ -36,10 +37,18 @@ MANIFEST *.sln *.iml +# Linux perf sample data +perf.data +perf.data.old + cpp/.idea/ cpp/apidoc/xml/ +docs/example.gz +docs/example1.dat +docs/example3.dat python/.eggs/ python/doc/ + .vscode .idea/ .pytest_cache/ diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 607fa1745a5a0..180b43a220f30 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -262,6 +262,7 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { return Status::OK(); } + /// \brief Append a single null element Status AppendNull() { ARROW_RETURN_NOT_OK(Reserve(1)); memset(raw_data_ + length_, 0, sizeof(value_type)); @@ -343,12 +344,7 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { return Status::OK(); } - /// \brief Append a sequence of elements in one shot, with a specified nullmap - /// \param[in] values_begin InputIterator to the beginning of the values - /// \param[in] values_end InputIterator pointing to the end of the values - /// \param[in] valid_begin uint8_t* indication valid(1) or null(0) values. - /// nullptr indicates all values are valid. 
- /// \return Status + // Same as above, with a pointer type ValidIter template typename std::enable_if::value, Status>::type AppendValues( ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { @@ -719,12 +715,7 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { return Status::OK(); } - /// \brief Append a sequence of elements in one shot, with a specified nullmap - /// \param[in] values_begin InputIterator to the beginning of the values - /// \param[in] values_end InputIterator pointing to the end of the values - /// \param[in] valid_begin uint8_t* indication valid(1) or null(0) values. - /// nullptr indicates all values are valid. - /// \return Status + // Same as above, for a pointer type ValidIter template typename std::enable_if::value, Status>::type AppendValues( ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h index 7280133a65fb9..ddf3d7ee0e644 100644 --- a/cpp/src/arrow/status.h +++ b/cpp/src/arrow/status.h @@ -29,6 +29,7 @@ #ifdef ARROW_EXTRA_ERROR_CONTEXT +/// \brief Propagate any non-successful Status to the caller #define ARROW_RETURN_NOT_OK(s) \ do { \ ::arrow::Status _s = (s); \ @@ -41,6 +42,7 @@ #else +/// \brief Propagate any non-successful Status to the caller #define ARROW_RETURN_NOT_OK(s) \ do { \ ::arrow::Status _s = (s); \ @@ -107,6 +109,14 @@ enum class StatusCode : char { class ARROW_MUST_USE_RESULT ARROW_EXPORT Status; #endif +/// \brief Status outcome object (success or error) +/// +/// The Status object is an object holding the outcome of an operation. +/// The outcome is represented as a StatusCode, either success +/// (StatusCode::OK) or an error (any other of the StatusCode enumeration values). +/// +/// Additionally, if an error occurred, a specific error message is generally +/// attached. class ARROW_EXPORT Status { public: // Create a success status. @@ -135,45 +145,54 @@ class ARROW_EXPORT Status { Status& operator&=(const Status& s) noexcept; Status& operator&=(Status&& s) noexcept; - // Return a success status. + /// Return a success status static Status OK() { return Status(); } - // Return a success status with extra info + /// Return a success status with a specific message static Status OK(const std::string& msg) { return Status(StatusCode::OK, msg); } - // Return error status of an appropriate type. + /// Return an error status for out-of-memory conditions static Status OutOfMemory(const std::string& msg) { return Status(StatusCode::OutOfMemory, msg); } + /// Return an error status for failed key lookups (e.g. 
column name in a table)
   static Status KeyError(const std::string& msg) {
     return Status(StatusCode::KeyError, msg);
   }
 
+  /// Return an error status for type errors (such as mismatching data types)
   static Status TypeError(const std::string& msg) {
     return Status(StatusCode::TypeError, msg);
   }
 
+  /// Return an error status for unknown errors
   static Status UnknownError(const std::string& msg) {
     return Status(StatusCode::UnknownError, msg);
   }
 
+  /// Return an error status when an operation or a combination of operation and
+  /// data types is unimplemented
   static Status NotImplemented(const std::string& msg) {
     return Status(StatusCode::NotImplemented, msg);
   }
 
+  /// Return an error status for invalid data (for example a string that fails parsing)
   static Status Invalid(const std::string& msg) { return Status(StatusCode::Invalid, msg); }
 
+  /// Return an error status when a container's capacity would exceed its limits
   static Status CapacityError(const std::string& msg) {
     return Status(StatusCode::CapacityError, msg);
   }
 
+  /// Return an error status when some IO-related operation failed
   static Status IOError(const std::string& msg) {
     return Status(StatusCode::IOError, msg);
   }
 
+  /// Return an error status when some (de)serialization operation failed
   static Status SerializationError(const std::string& msg) {
     return Status(StatusCode::SerializationError, msg);
   }
@@ -198,7 +217,6 @@ class ARROW_EXPORT Status {
 
   static Status StillExecuting() { return Status(StatusCode::StillExecuting, ""); }
 
-  // Return error status of an appropriate type.
   static Status CodeGenError(const std::string& msg) {
     return Status(StatusCode::CodeGenError, msg);
   }
@@ -211,34 +229,42 @@ class ARROW_EXPORT Status {
     return Status(StatusCode::ExecutionError, msg);
   }
 
-  // Returns true iff the status indicates success.
+  /// Return true iff the status indicates success.
   bool ok() const { return (state_ == NULL); }
 
+  /// Return true iff the status indicates an out-of-memory error.
   bool IsOutOfMemory() const { return code() == StatusCode::OutOfMemory; }
+  /// Return true iff the status indicates a key lookup error.
   bool IsKeyError() const { return code() == StatusCode::KeyError; }
+  /// Return true iff the status indicates invalid data.
   bool IsInvalid() const { return code() == StatusCode::Invalid; }
+  /// Return true iff the status indicates an IO-related failure.
   bool IsIOError() const { return code() == StatusCode::IOError; }
+  /// Return true iff the status indicates a container reaching capacity limits.
   bool IsCapacityError() const { return code() == StatusCode::CapacityError; }
+  /// Return true iff the status indicates a type error.
   bool IsTypeError() const { return code() == StatusCode::TypeError; }
+  /// Return true iff the status indicates an unknown error.
   bool IsUnknownError() const { return code() == StatusCode::UnknownError; }
+  /// Return true iff the status indicates an unimplemented operation.
   bool IsNotImplemented() const { return code() == StatusCode::NotImplemented; }
 
-  // An object could not be serialized or deserialized.
+  /// Return true iff the status indicates a (de)serialization failure
   bool IsSerializationError() const { return code() == StatusCode::SerializationError; }
 
-  // An error from R
+  /// Return true iff the status indicates an R-originated error.
   bool IsRError() const { return code() == StatusCode::RError; }
 
-  // An error is propagated from a nested Python function.
+  /// Return true iff the status indicates a Python-originated error.
bool IsPythonError() const { return code() == StatusCode::PythonError; }
 
-  // An object with this object ID already exists in the plasma store.
+  /// Return true iff the status indicates an already existing Plasma object.
   bool IsPlasmaObjectExists() const { return code() == StatusCode::PlasmaObjectExists; }
 
-  // An object was requested that doesn't exist in the plasma store.
+  /// Return true iff the status indicates a non-existent Plasma object.
   bool IsPlasmaObjectNonexistent() const {
     return code() == StatusCode::PlasmaObjectNonexistent;
   }
 
-  // An already sealed object is tried to be sealed again.
+  /// Return true iff the status indicates an already sealed Plasma object.
   bool IsPlasmaObjectAlreadySealed() const {
     return code() == StatusCode::PlasmaObjectAlreadySealed;
   }
 
-  // An object is too large to fit into the plasma store.
+  /// Return true iff the status indicates the Plasma store reached its capacity limit.
   bool IsPlasmaStoreFull() const { return code() == StatusCode::PlasmaStoreFull; }
 
   bool IsStillExecuting() const { return code() == StatusCode::StillExecuting; }
@@ -251,16 +277,19 @@ class ARROW_EXPORT Status {
 
   bool IsExecutionError() const { return code() == StatusCode::ExecutionError; }
 
-  // Return a string representation of this status suitable for printing.
-  // Returns the string "OK" for success.
+  /// \brief Return a string representation of this status suitable for printing.
+  ///
+  /// The string "OK" is returned for success.
   std::string ToString() const;
 
-  // Return a string representation of the status code, without the message
-  // text or posix code information.
+  /// \brief Return a string representation of the status code, without the message
+  /// text or POSIX code information.
   std::string CodeAsString() const;
 
+  /// \brief Return the StatusCode value attached to this status.
   StatusCode code() const { return ok() ? StatusCode::OK : state_->code; }
 
+  /// \brief Return the specific error message attached to this status.
   std::string message() const { return ok() ? "" : state_->msg; }
 
  private:
diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h
index 119e4e4491225..9c478485b243c 100644
--- a/cpp/src/arrow/table.h
+++ b/cpp/src/arrow/table.h
@@ -39,7 +39,14 @@ class Status;
 /// as one large array
 class ARROW_EXPORT ChunkedArray {
  public:
+  /// \brief Construct a chunked array from a vector of arrays
+  ///
+  /// The vector should be non-empty and all its elements should have the same
+  /// data type.
   explicit ChunkedArray(const ArrayVector& chunks);
 
+  /// \brief Construct a chunked array from a vector of arrays and a data type
+  ///
+  /// As the data type is passed explicitly, the vector may be empty.
   ChunkedArray(const ArrayVector& chunks, const std::shared_ptr<DataType>& type);
 
   /// \return the total length of the chunked array; computed on construction
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 63f0e2d237242..f187817b53f28 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -40,11 +40,11 @@ namespace arrow {
 class Array;
 class Field;
 
-/// \brief Main data type enumeration
-///
-/// This enumeration provides a quick way to interrogate the category
-/// of a DataType instance.
 struct Type {
+  /// \brief Main data type enumeration
+  ///
+  /// This enumeration provides a quick way to interrogate the category
+  /// of a DataType instance.
   enum type {
     /// A NULL type having no physical storage
     NA,
@@ -143,7 +143,7 @@ struct Type {
 /// nested type consisting of other data types, or another data type (e.g. a
 /// timestamp encoded as an int64).
 ///
-/// Simple datatypes may be entirely described by their Type id, but
+/// Simple datatypes may be entirely described by their Type::type id, but
 /// complex datatypes are usually parametric.
 class ARROW_EXPORT DataType {
  public:
@@ -624,6 +624,7 @@ class ARROW_EXPORT Date64Type : public DateType {
 };
 
 struct TimeUnit {
+  /// The unit for a time or timestamp DataType
   enum type { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 };
 };
 
@@ -837,6 +838,9 @@ class ARROW_EXPORT Schema {
 // Parametric factory functions
 // Other factory functions are in type_fwd.h
 
+/// \addtogroup type-factories
+/// @{
+
 /// \brief Create a FixedSizeBinaryType instance
 ARROW_EXPORT
 std::shared_ptr<DataType> fixed_size_binary(int32_t byte_width);
@@ -890,6 +894,8 @@ std::shared_ptr<DataType> ARROW_EXPORT
 dictionary(const std::shared_ptr<DataType>& index_type,
            const std::shared_ptr<Array>& values, bool ordered = false);
 
+/// @}
+
 /// \brief Create a Field instance
 ///
 /// \param name the field name
diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h
index dbbe7092b4f12..2a83d8a664d80 100644
--- a/cpp/src/arrow/type_fwd.h
+++ b/cpp/src/arrow/type_fwd.h
@@ -137,6 +137,11 @@ using IntervalArray = NumericArray<IntervalType>;
 // (parameter-free) Factory functions
 // Other factory functions are in type.h
 
+/// \defgroup type-factories Factory functions for creating data types
+///
+/// Factory functions for creating data types
+/// @{
+
 /// \brief Return a NullType instance
 std::shared_ptr<DataType> ARROW_EXPORT null();
 /// \brief Return a BooleanType instance
@@ -172,6 +177,8 @@ std::shared_ptr<DataType> ARROW_EXPORT date32();
 /// \brief Return a Date64Type instance
 std::shared_ptr<DataType> ARROW_EXPORT date64();
 
+/// @}
+
 }  // namespace arrow
 
 #endif  // ARROW_TYPE_FWD_H
diff --git a/docs/source/cpp/api.rst b/docs/source/cpp/api.rst
index 894ed1f907f6d..02aa4d62e3b31 100644
--- a/docs/source/cpp/api.rst
+++ b/docs/source/cpp/api.rst
@@ -20,8 +20,10 @@ API Reference
 *************
 
 .. toctree::
-   :maxdepth: 2
-   :caption: Getting Started
+   :maxdepth: 3
 
-   api/array
+   api/support
    api/memory
+   api/datatype
+   api/array
+   api/builder
diff --git a/docs/source/cpp/api/array.rst b/docs/source/cpp/api/array.rst
index aed18763b6ce7..bb981d1a0477d 100644
--- a/docs/source/cpp/api/array.rst
+++ b/docs/source/cpp/api/array.rst
@@ -15,19 +15,23 @@
 .. specific language governing permissions and limitations
 .. under the License.
 
-Array types
-=============
+======
+Arrays
+======
 
 .. doxygenclass:: arrow::Array
    :project: arrow_cpp
    :members:
 
+Concrete array subclasses
+=========================
+
 .. doxygenclass:: arrow::DictionaryArray
    :project: arrow_cpp
    :members:
 
-non-nested array types
-----------------------
+Non-nested
+----------
 
 .. doxygenclass:: arrow::FlatArray
    :project: arrow_cpp
    :members:
@@ -65,8 +69,8 @@ non-nested array types
    :project: arrow_cpp
    :members:
 
-nested array types
-------------------
+Nested
+------
 
 .. doxygenclass:: arrow::UnionArray
    :project: arrow_cpp
    :members:
@@ -79,3 +83,10 @@ nested array types
 .. doxygenclass:: arrow::StructArray
    :project: arrow_cpp
    :members:
+
+Chunked Arrays
+==============
+
+.. doxygenclass:: arrow::ChunkedArray
+   :project: arrow_cpp
+   :members:
diff --git a/docs/source/cpp/api/builder.rst b/docs/source/cpp/api/builder.rst
new file mode 100644
index 0000000000000..0912706ac081c
--- /dev/null
+++ b/docs/source/cpp/api/builder.rst
@@ -0,0 +1,59 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements.  See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership.
The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +============== +Array Builders +============== + +.. doxygenclass:: arrow::ArrayBuilder + :members: + +Concrete builder subclasses +=========================== + +.. doxygenclass:: arrow::NullBuilder + :members: + +.. doxygenclass:: arrow::BooleanBuilder + :members: + +.. doxygenclass:: arrow::PrimitiveBuilder + :members: + +.. doxygenclass:: arrow::NumericBuilder + :members: + +.. doxygenclass:: arrow::BinaryBuilder + :members: + +.. doxygenclass:: arrow::StringBuilder + :members: + +.. doxygenclass:: arrow::FixedSizeBinaryBuilder + :members: + +.. doxygenclass:: arrow::Decimal128Builder + :members: + +.. doxygenclass:: arrow::ListBuilder + :members: + +.. doxygenclass:: arrow::StructBuilder + :members: + +.. doxygenclass:: arrow::DictionaryBuilder + :members: diff --git a/docs/source/cpp/api/datatype.rst b/docs/source/cpp/api/datatype.rst new file mode 100644 index 0000000000000..ee7844277df27 --- /dev/null +++ b/docs/source/cpp/api/datatype.rst @@ -0,0 +1,135 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +========== +Data Types +========== + +.. doxygenenum:: arrow::Type::type + +.. doxygenclass:: arrow::DataType + :members: + +.. _api-type-factories: + +Factory functions +================= + +These functions are recommended for creating data types. They may return +new objects or existing singletons, depending on the type requested. + +.. doxygengroup:: type-factories + :project: arrow_cpp + :content-only: + +Concrete type subclasses +======================== + +Primitive +--------- + +.. doxygenclass:: arrow::NullType + :members: + +.. doxygenclass:: arrow::BooleanType + :members: + +.. doxygenclass:: arrow::Int8Type + :members: + +.. doxygenclass:: arrow::Int16Type + :members: + +.. doxygenclass:: arrow::Int32Type + :members: + +.. doxygenclass:: arrow::Int64Type + :members: + +.. doxygenclass:: arrow::UInt8Type + :members: + +.. doxygenclass:: arrow::UInt16Type + :members: + +.. doxygenclass:: arrow::UInt32Type + :members: + +.. doxygenclass:: arrow::UInt64Type + :members: + +.. doxygenclass:: arrow::HalfFloatType + :members: + +.. doxygenclass:: arrow::FloatType + :members: + +.. 
doxygenclass:: arrow::DoubleType + :members: + +Time-related +------------ + +.. doxygenenum:: arrow::TimeUnit::type + +.. doxygenclass:: arrow::Date32Type + :members: + +.. doxygenclass:: arrow::Date64Type + :members: + +.. doxygenclass:: arrow::Time32Type + :members: + +.. doxygenclass:: arrow::Time64Type + :members: + +.. doxygenclass:: arrow::TimestampType + :members: + +Binary-like +----------- + +.. doxygenclass:: arrow::BinaryType + :members: + +.. doxygenclass:: arrow::StringType + :members: + +.. doxygenclass:: arrow::FixedSizeBinaryType + :members: + +.. doxygenclass:: arrow::Decimal128Type + :members: + +Nested +------ + +.. doxygenclass:: arrow::ListType + :members: + +.. doxygenclass:: arrow::StructType + :members: + +.. doxygenclass:: arrow::UnionType + :members: + +Dictionary-encoded +------------------ + +.. doxygenclass:: arrow::DictionaryType + :members: diff --git a/docs/source/cpp/api/memory.rst b/docs/source/cpp/api/memory.rst index fbb5dc818628c..1dc8e706d3e8d 100644 --- a/docs/source/cpp/api/memory.rst +++ b/docs/source/cpp/api/memory.rst @@ -18,8 +18,8 @@ Memory (management) =================== -Basic containers ----------------- +Buffers +------- .. doxygenclass:: arrow::Buffer :project: arrow_cpp diff --git a/docs/source/cpp/api/support.rst b/docs/source/cpp/api/support.rst new file mode 100644 index 0000000000000..b165a9973b4c1 --- /dev/null +++ b/docs/source/cpp/api/support.rst @@ -0,0 +1,29 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Programming Support +=================== + +Error return and reporting +-------------------------- + +.. doxygenclass:: arrow::Status + :project: arrow_cpp + :members: + +.. doxygendefine:: ARROW_RETURN_NOT_OK + diff --git a/docs/source/cpp/arrays.rst b/docs/source/cpp/arrays.rst new file mode 100644 index 0000000000000..0c5272d2aed5e --- /dev/null +++ b/docs/source/cpp/arrays.rst @@ -0,0 +1,211 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. 
highlight:: cpp
+
+======
+Arrays
+======
+
+The central type in Arrow is the class :class:`arrow::Array`. An array
+represents a known-length sequence of values all having the same type.
+Internally, those values are represented by one or several buffers, the
+number and meaning of which depend on the array's data type, as documented
+in :doc:`the Arrow data layout specification <../format/Layout>`.
+
+Those buffers consist of the value data itself and an optional bitmap buffer
+that indicates which array entries are null values. The bitmap buffer
+can be entirely omitted if the array is known to have zero null values.
+
+There are concrete subclasses of :class:`arrow::Array` for each data type
+that help you access individual values of the array.
+
+Building an array
+=================
+
+As Arrow objects are immutable, there are classes provided that help you
+build these objects incrementally from third-party data. These classes
+are organized in a hierarchy around the :class:`arrow::ArrayBuilder` base class,
+with concrete subclasses tailored for each particular data type.
+
+For example, to build an array of ``int64_t`` elements, we can use the
+:class:`arrow::Int64Builder` class. In the following example, we build an array
+of the range 1 to 8 where the element that should hold the value 4 is nulled::
+
+   arrow::Int64Builder builder;
+   builder.Append(1);
+   builder.Append(2);
+   builder.Append(3);
+   builder.AppendNull();
+   builder.Append(5);
+   builder.Append(6);
+   builder.Append(7);
+   builder.Append(8);
+
+   std::shared_ptr<arrow::Array> array;
+   arrow::Status st = builder.Finish(&array);
+   if (!st.ok()) {
+      // ... do something on array building failure
+   }
+
+The resulting Array (which can be cast to the concrete :class:`arrow::Int64Array`
+subclass if you want to access its values) then consists of two
+:class:`arrow::Buffer`\s.
+The first buffer holds the null bitmap, which consists here of a single byte with
+the bits ``0|0|0|0|1|0|0|0``. As we use `least-significant bit (LSB) numbering`_,
+this indicates that the fourth entry in the array is null. The second
+buffer is simply an ``int64_t`` array containing all the above values.
+As the fourth entry is null, the value at that position in the buffer is
+undefined.
+
+Here is how you could access the concrete array's contents::
+
+   // Cast the Array to its actual type to access its data
+   auto int64_array = std::static_pointer_cast<arrow::Int64Array>(array);
+
+   // Get the pointer to the null bitmap.
+   const uint8_t* null_bitmap = int64_array->null_bitmap_data();
+
+   // Get the pointer to the actual data
+   const int64_t* data = int64_array->raw_values();
+
+   // Alternatively, given an array index, query its null bit and value directly
+   int64_t index = 2;
+   if (!int64_array->IsNull(index)) {
+      int64_t value = int64_array->Value(index);
+   }
+
+.. note::
+   :class:`arrow::Int64Array` (respectively :class:`arrow::Int64Builder`) is
+   just a ``typedef``, provided for convenience, of ``arrow::NumericArray<Int64Type>``
+   (respectively ``arrow::NumericBuilder<Int64Type>``).
+
+.. _least-significant bit (LSB) numbering: https://en.wikipedia.org/wiki/Bit_numbering
+
+Performance
+-----------
+
+While it is possible to build an array value-by-value as in the example above,
+to attain the highest performance it is recommended to use the bulk appending
+methods (usually named ``AppendValues``) in the concrete :class:`arrow::ArrayBuilder`
+subclasses.
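
Each ``Append`` call above also returns a :class:`arrow::Status`; a minimal
sketch of a fully checked value-by-value loop, using only the builder calls
already shown, might look like::

   arrow::Int64Builder builder;
   for (int64_t i = 1; i <= 8; ++i) {
      // Append and AppendNull may fail (for example if a buffer allocation
      // fails), so each returned Status should be checked
      arrow::Status st = (i == 4) ? builder.AppendNull() : builder.Append(i);
      if (!st.ok()) {
         // ... do something on append failure
      }
   }

The bulk APIs below avoid this per-element overhead.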
+
+If you know the number of elements in advance, it is also recommended to
+presize the working area by calling the :func:`~arrow::ArrayBuilder::Resize`
+or :func:`~arrow::ArrayBuilder::Reserve` methods.
+
+Here is how one could rewrite the above example to take advantage of those
+APIs::
+
+   arrow::Int64Builder builder;
+   // Make room for 8 values in total
+   builder.Resize(8);
+   // Bulk append the given values (with a null in 4th place as indicated by the
+   // validity vector)
+   std::vector<bool> validity = {true, true, true, false, true, true, true, true};
+   std::vector<int64_t> values = {1, 2, 3, 0, 5, 6, 7, 8};
+   builder.AppendValues(values, validity);
+
+   std::shared_ptr<arrow::Array> array;
+   arrow::Status st = builder.Finish(&array);
+
+If you still must append values one by one, some concrete builder subclasses
+have methods marked "Unsafe" that assume the working area has been correctly
+presized, and offer higher performance in exchange::
+
+   arrow::Int64Builder builder;
+   // Make room for 8 values in total
+   builder.Resize(8);
+   builder.UnsafeAppend(1);
+   builder.UnsafeAppend(2);
+   builder.UnsafeAppend(3);
+   builder.UnsafeAppendNull();
+   builder.UnsafeAppend(5);
+   builder.UnsafeAppend(6);
+   builder.UnsafeAppend(7);
+   builder.UnsafeAppend(8);
+
+   std::shared_ptr<arrow::Array> array;
+   arrow::Status st = builder.Finish(&array);
+
+
+Size Limitations and Recommendations
+====================================
+
+Some array types are structurally limited to 32-bit sizes. This is the case
+at least for list arrays (which can hold up to 2^31 elements) and for string
+and binary arrays (which can hold up to 2GB of binary data). Some other array
+types can hold up to 2^63 elements in the C++ implementation, but other Arrow
+implementations can have a 32-bit size limitation for those array types as well.
+
+For these reasons, it is recommended that huge data be chunked into subsets of
+more reasonable size.
+
+Chunked Arrays
+==============
+
+A :class:`arrow::ChunkedArray` is, like an array, a logical sequence of values;
+but unlike a simple array, a chunked array does not require the entire sequence
+to be physically contiguous in memory. Also, the constituents of a chunked array
+need not have the same size, but they must all have the same data type.
+
+A chunked array is constructed by aggregating any number of arrays. Here we'll
+build a chunked array with the same logical values as in the example above,
+but in two separate chunks::
+
+   std::vector<std::shared_ptr<arrow::Array>> chunks;
+   std::shared_ptr<arrow::Array> array;
+
+   // Build first chunk
+   arrow::Int64Builder builder;
+   builder.Append(1);
+   builder.Append(2);
+   builder.Append(3);
+   if (!builder.Finish(&array).ok()) {
+      // ... do something on array building failure
+   }
+   chunks.push_back(std::move(array));
+
+   // Build second chunk
+   builder.Reset();
+   builder.AppendNull();
+   builder.Append(5);
+   builder.Append(6);
+   builder.Append(7);
+   builder.Append(8);
+   if (!builder.Finish(&array).ok()) {
+      // ... do something on array building failure
+   }
+   chunks.push_back(std::move(array));
+
+   auto chunked_array = std::make_shared<arrow::ChunkedArray>(std::move(chunks));
+
+   assert(chunked_array->num_chunks() == 2);
+   // Logical length in number of values
+   assert(chunked_array->length() == 8);
+   assert(chunked_array->null_count() == 1);
+
+Slicing
+=======
+
+As with physical memory buffers, it is possible to make zero-copy slices
+of arrays and chunked arrays, to obtain an array or chunked array referring
+to some logical subsequence of the data.
This is done by calling the
+:func:`arrow::Array::Slice` and :func:`arrow::ChunkedArray::Slice` methods,
+respectively.
+
diff --git a/docs/source/cpp/conventions.rst b/docs/source/cpp/conventions.rst
new file mode 100644
index 0000000000000..b0424358901b4
--- /dev/null
+++ b/docs/source/cpp/conventions.rst
@@ -0,0 +1,91 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements.  See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership.  The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License.  You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied.  See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+Conventions
+===========
+
+The Arrow C++ API follows a few simple guidelines. As with many rules,
+there may be exceptions.
+
+Language version
+----------------
+
+Arrow is C++11-compatible. A few backports are used for newer functionality,
+for example the :class:`std::string_view` class.
+
+Namespacing
+-----------
+
+All the Arrow API (except macros) is namespaced inside the ``arrow`` namespace,
+and nested namespaces thereof.
+
+Safe pointers
+-------------
+
+Arrow objects are usually passed and stored using safe pointers -- most of
+the time :class:`std::shared_ptr` but sometimes also :class:`std::unique_ptr`.
+
+Immutability
+------------
+
+Many Arrow objects are immutable: once constructed, their logical properties
+cannot change. This makes it possible to use them in multi-threaded
+scenarios without requiring tedious and error-prone synchronization.
+
+There are obvious exceptions to this, such as IO objects or mutable data buffers.
+
+Error reporting
+---------------
+
+Most APIs indicate a successful or erroneous outcome by returning a
+:class:`arrow::Status` instance. Arrow doesn't throw exceptions of its
+own, but third-party exceptions might propagate through, especially
+:class:`std::bad_alloc` (but Arrow doesn't use the standard allocators for
+large data).
+
+As a consequence, the result value of a function is generally passed as an
+out-pointer parameter, rather than as a function return value.
+
+(However, functions which always deterministically succeed may eschew this
+convention and return their result directly.)
+
+Here is an example of checking the outcome of an operation::
+
+   const int64_t buffer_size = 4096;
+   std::shared_ptr<arrow::Buffer> buffer;
+
+   auto status = arrow::AllocateBuffer(buffer_size, &buffer);
+   if (!status.ok()) {
+      // ... handle error
+   }
+
+If the caller function itself returns a :class:`arrow::Status` and wants
+to propagate any non-successful outcomes, a convenience macro
+:cpp:func:`ARROW_RETURN_NOT_OK` is available::
+
+   arrow::Status DoSomething() {
+      const int64_t buffer_size = 4096;
+      std::shared_ptr<arrow::Buffer> buffer;
+      ARROW_RETURN_NOT_OK(arrow::AllocateBuffer(buffer_size, &buffer));
+      // ...
allocation successful, do something with buffer below
+
+      // return success at the end
+      return arrow::Status::OK();
+   }
diff --git a/docs/source/cpp/datatypes.rst b/docs/source/cpp/datatypes.rst
new file mode 100644
index 0000000000000..117c05b8755e7
--- /dev/null
+++ b/docs/source/cpp/datatypes.rst
@@ -0,0 +1,65 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements.  See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership.  The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License.  You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied.  See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+Data Types
+==========
+
+Data types govern how physical data is interpreted. Their :ref:`specification
+` allows binary interoperability between different Arrow
+implementations, including from different programming languages and runtimes
+(for example it is possible to access the same data, without copying, from
+both Python and Java using the :py:mod:`pyarrow.jvm` bridge module).
+
+Information about a data type in C++ can be represented in three ways:
+
+1. Using a :class:`arrow::DataType` instance (e.g. as a function argument)
+2. Using a :class:`arrow::DataType` concrete subclass (e.g. as a template
+   parameter)
+3. Using a :type:`arrow::Type::type` enum value (e.g. as the condition of
+   a switch statement)
+
+The first form (using a :class:`arrow::DataType` instance) is the most idiomatic
+and flexible. Runtime-parametric types can only be fully represented with
+a DataType instance. For example, a :class:`arrow::TimestampType` needs to be
+constructed at runtime with a :type:`arrow::TimeUnit::type` parameter; a
+:class:`arrow::Decimal128Type` with *scale* and *precision* parameters;
+a :class:`arrow::ListType` with a full child type (itself a
+:class:`arrow::DataType` instance).
+
+The two other forms can be used where performance is critical, in order to
+avoid paying the price of dynamic typing and polymorphism. However, some
+amount of runtime switching can still be required for parametric types.
+It is not possible to reify all possible types at compile time, since Arrow
+data types allow arbitrary nesting.
+
+Creating data types
+-------------------
+
+To instantiate data types, it is recommended to call the provided
+:ref:`factory functions <api-type-factories>`::
+
+   std::shared_ptr<arrow::DataType> type;
+
+   // A 16-bit integer type
+   type = arrow::int16();
+   // A 64-bit timestamp type (with microsecond granularity)
+   type = arrow::timestamp(arrow::TimeUnit::MICRO);
+   // A list type of single-precision floating-point values
+   type = arrow::list(arrow::float32());
diff --git a/docs/source/cpp/getting_started.rst b/docs/source/cpp/getting_started.rst
new file mode 100644
index 0000000000000..8201c2ded0d92
--- /dev/null
+++ b/docs/source/cpp/getting_started.rst
@@ -0,0 +1,30 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements.  See the NOTICE file
+.. distributed with this work for additional information
+..
regarding copyright ownership.  The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License.  You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied.  See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+Getting Started
+===============
+
+.. toctree::
+
+   overview
+   conventions
+   arrays
+   datatypes
+
diff --git a/docs/source/cpp/index.rst b/docs/source/cpp/index.rst
index 4f874bac4fd1e..8c7ced0c2e7b8 100644
--- a/docs/source/cpp/index.rst
+++ b/docs/source/cpp/index.rst
@@ -20,69 +20,6 @@ C++ Implementation
 
 .. toctree::
    :maxdepth: 2
-   :caption: Getting Started
 
+   getting_started
    api
-
-Getting Started
----------------
-
-The most basic structure in Arrow is an :cpp:class:`arrow::Array`. It holds a sequence
-of values with known length all having the same type. It consists of the data
-itself and an additional bitmap that indicates if the corresponding entry of
-array is a null-value. Note that for array with zero null entries, we can omit
-this bitmap.
-
-As Arrow objects are immutable, there are classes provided that should help you
-build these objects. To build an array of ``int64_t`` elements, we can use the
-:cpp:class:`arrow::Int64Builder`. In the following example, we build an array of
-the range 1 to 8 where the element that should hold the number 4 is nulled.
-
-.. code::
-
-    Int64Builder builder;
-    builder.Append(1);
-    builder.Append(2);
-    builder.Append(3);
-    builder.AppendNull();
-    builder.Append(5);
-    builder.Append(6);
-    builder.Append(7);
-    builder.Append(8);
-
-    std::shared_ptr<arrow::Array> array;
-    builder.Finish(&array);
-
-The resulting Array (which can be casted to :cpp:class:`arrow::Int64Array` if you want
-to access its values) then consists of two :cpp:class:`arrow::Buffer`. The first one is
-the null bitmap holding a single byte with the bits ``0|0|0|0|1|0|0|0``.
-As we use `least-significant bit (LSB) numbering`_.
-this indicates that the fourth entry in the array is null. The second
-buffer is simply an ``int64_t`` array containing all the above values.
-As the fourth entry is null, the value at that position in the buffer is
-undefined.
-
-.. code::
-
-    // Cast the Array to its actual type to access its data
-    std::shared_ptr<arrow::Int64Array> int64_array = std::static_pointer_cast<arrow::Int64Array>(array);
-
-    // Get the pointer to the null bitmap.
-    const uint8_t* null_bitmap = int64_array->null_bitmap_data();
-
-    // Get the pointer to the actual data
-    const int64_t* data = int64_array->raw_values();
-
-In the above example, we have yet skipped explaining two things in the code.
-On constructing the builder, we have passed :cpp:func:`arrow::int64()` to it. This is
-the type information with which the resulting array will be annotated. In
-this simple form, it is solely a :cpp:class:`std::shared_ptr`
-instantiation.
-
-Furthermore, we have passed :cpp:func:`arrow::default_memory_pool()` to the constructor.
-This :cpp:class:`arrow::MemoryPool` is used for the allocations of heap memory.
Besides
-tracking the amount of memory allocated, the allocator also ensures that the
-allocated memory regions are 64-byte aligned (as required by the Arrow
-specification).
-
-.. _least-significant bit (LSB) numbering: https://en.wikipedia.org/wiki/Bit_numbering
diff --git a/docs/source/cpp/overview.rst b/docs/source/cpp/overview.rst
new file mode 100644
index 0000000000000..490efc1b7a2c1
--- /dev/null
+++ b/docs/source/cpp/overview.rst
@@ -0,0 +1,93 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements.  See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership.  The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License.  You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied.  See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+High-Level Overview
+===================
+
+The Arrow C++ library is composed of different parts, each of which serves
+a specific purpose.
+
+The physical layer
+------------------
+
+**Memory management** abstractions provide a uniform API over memory that
+may be allocated through various means, such as heap allocation, the memory
+mapping of a file or a static memory area. In particular, the **buffer**
+abstraction represents a contiguous area of physical data.
+
+The one-dimensional layer
+-------------------------
+
+**Data types** govern the *logical* interpretation of *physical* data.
+Many operations in Arrow are parameterized, at compile-time or at runtime,
+by a data type.
+
+**Arrays** assemble one or several buffers with a data type, allowing them
+to be viewed as a logical contiguous sequence of values (possibly nested).
+
+**Chunked arrays** are a generalization of arrays, combining several same-type
+arrays into a longer logical sequence of values.
+
+The two-dimensional layer
+-------------------------
+
+**Schemas** describe a logical collection of several pieces of data,
+each with a distinct name and type, and optional metadata.
+
+**Columns** are like chunked arrays, but with optional metadata.
+
+**Tables** are collections of columns in accordance with a schema. They are
+the most capable dataset-providing abstraction in Arrow.
+
+**Record batches** are collections of contiguous arrays, described
+by a schema. They allow incremental construction or serialization of tables.
+
+The compute layer
+-----------------
+
+**Datums** are flexible dataset references, able to hold, for example, an array
+or table reference.
+
+**Kernels** are specialized computation functions running in a loop over a
+given set of datums representing input and output parameters to the functions.
+
+The IO layer
+------------
+
+**Streams** allow untyped sequential or seekable access over external data
+of various kinds (for example compressed or memory-mapped).
+
+The Inter-Process Communication (IPC) layer
+-------------------------------------------
+
+A **messaging format** allows interchange of Arrow data between processes, using
+as few copies as possible.
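
As a rough sketch only (assuming the stream-oriented writer API in
``arrow/ipc/writer.h`` and the file stream in ``arrow/io/file.h``; the exact
signatures may differ between versions), writing a record batch to a file
with the IPC messaging format could look like::

   #include <arrow/io/file.h>
   #include <arrow/ipc/writer.h>
   #include <arrow/record_batch.h>
   #include <arrow/status.h>

   arrow::Status WriteBatch(const std::shared_ptr<arrow::RecordBatch>& batch,
                            const std::string& path) {
     // Open a file sink, then stream the batch through an IPC stream writer
     std::shared_ptr<arrow::io::FileOutputStream> sink;
     ARROW_RETURN_NOT_OK(arrow::io::FileOutputStream::Open(path, &sink));
     std::shared_ptr<arrow::ipc::RecordBatchWriter> writer;
     ARROW_RETURN_NOT_OK(arrow::ipc::RecordBatchStreamWriter::Open(
         sink.get(), batch->schema(), &writer));
     ARROW_RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
     return writer->Close();
   }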
+
+The file formats layer
+----------------------
+
+Reading and writing Arrow data from/to various file formats is possible, for
+example **Parquet**, **CSV**, **ORC**, or the Arrow-specific **Feather** format.
+
+The devices layer
+-----------------
+
+Basic **CUDA** integration is provided, allowing Arrow data backed by
+GPU-allocated memory to be described.
diff --git a/docs/source/format/Metadata.rst b/docs/source/format/Metadata.rst
index 4ed82e0078e2c..293d0113875a6 100644
--- a/docs/source/format/Metadata.rst
+++ b/docs/source/format/Metadata.rst
@@ -266,6 +266,8 @@ detail for each type below): ::
   buffer 10: field 5 offsets
   buffer 11: field 5 data
 
+.. _spec-logical-types:
+
 Logical types
 -------------
 

From c7e986047a7066a4001227a2901f91bc2f2a17d2 Mon Sep 17 00:00:00 2001
From: Chao Sun
Date: Tue, 11 Dec 2018 17:21:35 -0700
Subject: [PATCH 23/45] ARROW-3960: [Rust] remove extern crate for Rust 2018

This is a trivial change to remove "extern crate" definitions in lib.rs,
to follow the new module system in the Rust 2018 edition.

Author: Chao Sun
Author: Chao Sun

Closes #3125 from sunchao/ARROW-3960 and squashes the following commits:

56a4393 Remove star import
0e5d06c Fixing json_internal error
53c13a9 ARROW-3960: remove extern crate for Rust 2018
---
 rust/src/datatypes.rs | 4 +++-
 rust/src/lib.rs       | 8 --------
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/rust/src/datatypes.rs b/rust/src/datatypes.rs
index f91c75d7bd0c3..36cb818cdfc7a 100644
--- a/rust/src/datatypes.rs
+++ b/rust/src/datatypes.rs
@@ -26,8 +26,10 @@ use std::mem::size_of;
 use std::slice::from_raw_parts;
 use std::str::FromStr;
 
+use serde_derive::{Deserialize, Serialize};
+use serde_json::{json, Value};
+
 use crate::error::{ArrowError, Result};
-use serde_json::Value;
 
 /// The possible relative types that are supported.
 ///
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index b661c21279d22..f41d08f1427a6 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -17,14 +17,6 @@
 
 #![feature(specialization)]
 
-extern crate csv as csv_crate;
-
-#[macro_use]
-extern crate serde_derive;
-
-#[macro_use]
-extern crate serde_json;
-
 pub mod array;
 pub mod array_data;
 pub mod array_ops;

From 28d16c0e5682edeeebb37d3724f17b82c10aa4cf Mon Sep 17 00:00:00 2001
From: kabukawa
Date: Wed, 12 Dec 2018 15:28:54 +0900
Subject: [PATCH 24/45] ARROW-3996: [C++] Add missing packages on Linux

[C++] Add the required build libraries to README.md:

* autoconf
* Jemalloc
* boost-regex

Author: kabukawa
Author: Kouhei Sutou

Closes #3157 from kabukawa/apache-arrow-develop and squashes the following commits:

a9f465a3 Add autoconf
45568fd1 Instration requirement add.(modified)
dcee4855 Instration requirement add.
---
 cpp/README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/cpp/README.md b/cpp/README.md
index 7d0851762c291..1278ca046d432 100644
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -36,9 +36,13 @@ Building Arrow requires:
 On Ubuntu/Debian you can install the requirements with:
 
 ```shell
-sudo apt-get install cmake \
+sudo apt-get install \
+     autoconf \
+     build-essential \
+     cmake \
      libboost-dev \
      libboost-filesystem-dev \
+     libboost-regex-dev \
      libboost-system-dev
 ```

From 527fed672260752e72a6572238d1029063091ef1 Mon Sep 17 00:00:00 2001
From: Yosuke Shiro
Date: Wed, 12 Dec 2018 17:50:13 +0900
Subject: [PATCH 25/45] ARROW-3913: [Gandiva] [GLib] Add GGandivaLiteralNode

Support GGandivaLiteralNode, including the following subclasses.
- GGandivaUint8LiteralNode - GGandivaUint16LiteralNode - GGandivaUint32LiteralNode - GGandivaUint64LiteralNode - GGandivaInt8LiteralNode - GGandivaInt16LiteralNode - GGandivaInt32LiteralNode - GGandivaInt64LiteralNode - GGandivaFloatLiteralNode - GGandivaDoubleLiteralNode - GGandivaStringLiteralNode - GGandivaBinaryLiteralNode Author: Yosuke Shiro Author: Kouhei Sutou Closes #3092 from shiro615/glib-add-ggandiva-literal-node and squashes the following commits: fb49b256 Add a missing ref 91cebe1c Break a long line 3ad1e5d8 Support BooleanLiteralNode#value 28d301eb Fix class orders 7d70c89e Remove binary literal property 783a2868 Use g_bytes_ref in ggandiva_binary_literal_node_get_value() 4162234d Fix class orders 289dfce2 Add ggandiva_binary_literal_node_new_bytes() 77f9eb89 Remove (transfer full) to use return value.c_str() e43d525f Use static_pointer_cast 62a6dd5c Return GBytes in ggandiva_binary_literal_node_get_value() 48d1175d Remove unnecessary static_cast d7ac46b4 Remove (nullable) of size of binary literal 8f6643af Fix documents 3ded5866 Refactor ggandiva_literal_{}_node_new_raw() a54c6f58 Add the original raw value getter bb2f71be Rename Uint to UInt 34422ad1 Remove property 7a3fe325 Use 'const guint8 *value, gsize size' for binary data 138abbf8 Fix a typo ba501d60 Rename is_true to value a45fa752 Use MakeStringLiteral, MakeBinaryLiteral d8775e4a Fix property name in BooleanLiteralNode 62a4eb48 Add test case for LiteralNode 83876ccd Support GGandivaLiteralNode --- c_glib/gandiva-glib/node.cpp | 754 ++++++++++++++++++ c_glib/gandiva-glib/node.h | 236 ++++++ c_glib/gandiva-glib/node.hpp | 3 + .../test/gandiva/test-binary-literal-node.rb | 34 + .../test/gandiva/test-boolean-literal-node.rb | 28 + .../test/gandiva/test-double-literal-node.rb | 28 + .../test/gandiva/test-float-literal-node.rb | 34 + .../test/gandiva/test-int16-literal-node.rb | 28 + .../test/gandiva/test-int32-literal-node.rb | 28 + .../test/gandiva/test-int64-literal-node.rb | 28 + c_glib/test/gandiva/test-int8-literal-node.rb | 28 + .../test/gandiva/test-string-literal-node.rb | 28 + .../test/gandiva/test-uint16-literal-node.rb | 28 + .../test/gandiva/test-uint32-literal-node.rb | 28 + .../test/gandiva/test-uint64-literal-node.rb | 28 + .../test/gandiva/test-uint8-literal-node.rb | 28 + ruby/red-gandiva/lib/gandiva/loader.rb | 14 + .../test/test-boolean-literal-node.rb | 24 + 18 files changed, 1407 insertions(+) create mode 100644 c_glib/test/gandiva/test-binary-literal-node.rb create mode 100644 c_glib/test/gandiva/test-boolean-literal-node.rb create mode 100644 c_glib/test/gandiva/test-double-literal-node.rb create mode 100644 c_glib/test/gandiva/test-float-literal-node.rb create mode 100644 c_glib/test/gandiva/test-int16-literal-node.rb create mode 100644 c_glib/test/gandiva/test-int32-literal-node.rb create mode 100644 c_glib/test/gandiva/test-int64-literal-node.rb create mode 100644 c_glib/test/gandiva/test-int8-literal-node.rb create mode 100644 c_glib/test/gandiva/test-string-literal-node.rb create mode 100644 c_glib/test/gandiva/test-uint16-literal-node.rb create mode 100644 c_glib/test/gandiva/test-uint32-literal-node.rb create mode 100644 c_glib/test/gandiva/test-uint64-literal-node.rb create mode 100644 c_glib/test/gandiva/test-uint8-literal-node.rb create mode 100644 ruby/red-gandiva/test/test-boolean-literal-node.rb diff --git a/c_glib/gandiva-glib/node.cpp b/c_glib/gandiva-glib/node.cpp index 49d1d0b7168df..cdb9724d7ebbf 100644 --- a/c_glib/gandiva-glib/node.cpp +++ b/c_glib/gandiva-glib/node.cpp @@ 
-26,6 +26,15 @@
 #include
 
+template <typename Type>
+Type
+ggandiva_literal_node_get(GGandivaLiteralNode *node)
+{
+  auto gandiva_literal_node =
+    std::static_pointer_cast<gandiva::LiteralNode>(ggandiva_node_get_raw(GGANDIVA_NODE(node)));
+  return boost::get<Type>(gandiva_literal_node->holder());
+}
+
 G_BEGIN_DECLS
 
 /**
@@ -40,6 +49,48 @@ G_BEGIN_DECLS
  *
  * #GGandivaFunctionNode is a class for a node in the expression tree, representing a function.
  *
+ * #GGandivaLiteralNode is a base class for a node in the expression tree,
+ * representing a literal.
+ *
+ * #GGandivaBooleanLiteralNode is a class for a node in the expression tree,
+ * representing a boolean literal.
+ *
+ * #GGandivaInt8LiteralNode is a class for a node in the expression tree,
+ * representing an 8-bit integer literal.
+ *
+ * #GGandivaUInt8LiteralNode is a class for a node in the expression tree,
+ * representing an 8-bit unsigned integer literal.
+ *
+ * #GGandivaInt16LiteralNode is a class for a node in the expression tree,
+ * representing a 16-bit integer literal.
+ *
+ * #GGandivaUInt16LiteralNode is a class for a node in the expression tree,
+ * representing a 16-bit unsigned integer literal.
+ *
+ * #GGandivaInt32LiteralNode is a class for a node in the expression tree,
+ * representing a 32-bit integer literal.
+ *
+ * #GGandivaUInt32LiteralNode is a class for a node in the expression tree,
+ * representing a 32-bit unsigned integer literal.
+ *
+ * #GGandivaInt64LiteralNode is a class for a node in the expression tree,
+ * representing a 64-bit integer literal.
+ *
+ * #GGandivaUInt64LiteralNode is a class for a node in the expression tree,
+ * representing a 64-bit unsigned integer literal.
+ *
+ * #GGandivaFloatLiteralNode is a class for a node in the expression tree,
+ * representing a 32-bit floating point literal.
+ *
+ * #GGandivaDoubleLiteralNode is a class for a node in the expression tree,
+ * representing a 64-bit floating point literal.
+ *
+ * #GGandivaBinaryLiteralNode is a class for a node in the expression tree,
+ * representing a binary literal.
+ *
+ * #GGandivaStringLiteralNode is a class for a node in the expression tree,
+ * representing a UTF-8 encoded string literal.
+ *
 * Since: 0.12.0
 */
@@ -395,6 +446,654 @@ ggandiva_function_node_get_parameters(GGandivaFunctionNode *node)
 
   return priv->parameters;
 }
+
+
+G_DEFINE_TYPE(GGandivaLiteralNode,
+              ggandiva_literal_node,
+              GGANDIVA_TYPE_NODE)
+
+static void
+ggandiva_literal_node_init(GGandivaLiteralNode *literal_node)
+{
+}
+
+static void
+ggandiva_literal_node_class_init(GGandivaLiteralNodeClass *klass)
+{
+}
+
+
+G_DEFINE_TYPE(GGandivaBooleanLiteralNode,
+              ggandiva_boolean_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_boolean_literal_node_init(GGandivaBooleanLiteralNode *boolean_literal_node)
+{
+}
+
+static void
+ggandiva_boolean_literal_node_class_init(GGandivaBooleanLiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_boolean_literal_node_new:
+ * @value: The value of the boolean literal.
+ *
+ * Returns: A newly created #GGandivaBooleanLiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaBooleanLiteralNode *
+ggandiva_boolean_literal_node_new(gboolean value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(static_cast<bool>(value));
+  return GGANDIVA_BOOLEAN_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node));
+}
+
+/**
+ * ggandiva_boolean_literal_node_get_value:
+ * @node: A #GGandivaBooleanLiteralNode.
+ *
+ * Returns: The value of the boolean literal.
+ *
+ * Since: 0.12.0
+ */
+gboolean
+ggandiva_boolean_literal_node_get_value(GGandivaBooleanLiteralNode *node)
+{
+  auto value = ggandiva_literal_node_get<bool>(GGANDIVA_LITERAL_NODE(node));
+  return static_cast<gboolean>(value);
+}
+
+
+G_DEFINE_TYPE(GGandivaInt8LiteralNode,
+              ggandiva_int8_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_int8_literal_node_init(GGandivaInt8LiteralNode *int8_literal_node)
+{
+}
+
+static void
+ggandiva_int8_literal_node_class_init(GGandivaInt8LiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_int8_literal_node_new:
+ * @value: The value of the 8-bit integer literal.
+ *
+ * Returns: A newly created #GGandivaInt8LiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaInt8LiteralNode *
+ggandiva_int8_literal_node_new(gint8 value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value);
+  return GGANDIVA_INT8_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node));
+}
+
+/**
+ * ggandiva_int8_literal_node_get_value:
+ * @node: A #GGandivaInt8LiteralNode.
+ *
+ * Returns: The value of the 8-bit integer literal.
+ *
+ * Since: 0.12.0
+ */
+gint8
+ggandiva_int8_literal_node_get_value(GGandivaInt8LiteralNode *node)
+{
+  return ggandiva_literal_node_get<gint8>(GGANDIVA_LITERAL_NODE(node));
+}
+
+
+G_DEFINE_TYPE(GGandivaUInt8LiteralNode,
+              ggandiva_uint8_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_uint8_literal_node_init(GGandivaUInt8LiteralNode *uint8_literal_node)
+{
+}
+
+static void
+ggandiva_uint8_literal_node_class_init(GGandivaUInt8LiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_uint8_literal_node_new:
+ * @value: The value of the 8-bit unsigned integer literal.
+ *
+ * Returns: A newly created #GGandivaUInt8LiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaUInt8LiteralNode *
+ggandiva_uint8_literal_node_new(guint8 value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value);
+  return GGANDIVA_UINT8_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node));
+}
+
+/**
+ * ggandiva_uint8_literal_node_get_value:
+ * @node: A #GGandivaUInt8LiteralNode.
+ *
+ * Returns: The value of the 8-bit unsigned integer literal.
+ *
+ * Since: 0.12.0
+ */
+guint8
+ggandiva_uint8_literal_node_get_value(GGandivaUInt8LiteralNode *node)
+{
+  return ggandiva_literal_node_get<guint8>(GGANDIVA_LITERAL_NODE(node));
+}
+
+
+G_DEFINE_TYPE(GGandivaInt16LiteralNode,
+              ggandiva_int16_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_int16_literal_node_init(GGandivaInt16LiteralNode *int16_literal_node)
+{
+}
+
+static void
+ggandiva_int16_literal_node_class_init(GGandivaInt16LiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_int16_literal_node_new:
+ * @value: The value of the 16-bit integer literal.
+ *
+ * Returns: A newly created #GGandivaInt16LiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaInt16LiteralNode *
+ggandiva_int16_literal_node_new(gint16 value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value);
+  return GGANDIVA_INT16_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node));
+}
+
+/**
+ * ggandiva_int16_literal_node_get_value:
+ * @node: A #GGandivaInt16LiteralNode.
+ *
+ * Returns: The value of the 16-bit integer literal.
+ *
+ * Since: 0.12.0
+ */
+gint16
+ggandiva_int16_literal_node_get_value(GGandivaInt16LiteralNode *node)
+{
+  return ggandiva_literal_node_get<gint16>(GGANDIVA_LITERAL_NODE(node));
+}
+
+
+G_DEFINE_TYPE(GGandivaUInt16LiteralNode,
+              ggandiva_uint16_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_uint16_literal_node_init(GGandivaUInt16LiteralNode *uint16_literal_node)
+{
+}
+
+static void
+ggandiva_uint16_literal_node_class_init(GGandivaUInt16LiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_uint16_literal_node_new:
+ * @value: The value of the 16-bit unsigned integer literal.
+ *
+ * Returns: A newly created #GGandivaUInt16LiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaUInt16LiteralNode *
+ggandiva_uint16_literal_node_new(guint16 value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value);
+  return GGANDIVA_UINT16_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node));
+}
+
+/**
+ * ggandiva_uint16_literal_node_get_value:
+ * @node: A #GGandivaUInt16LiteralNode.
+ *
+ * Returns: The value of the 16-bit unsigned integer literal.
+ *
+ * Since: 0.12.0
+ */
+guint16
+ggandiva_uint16_literal_node_get_value(GGandivaUInt16LiteralNode *node)
+{
+  return ggandiva_literal_node_get<guint16>(GGANDIVA_LITERAL_NODE(node));
+}
+
+
+G_DEFINE_TYPE(GGandivaInt32LiteralNode,
+              ggandiva_int32_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_int32_literal_node_init(GGandivaInt32LiteralNode *int32_literal_node)
+{
+}
+
+static void
+ggandiva_int32_literal_node_class_init(GGandivaInt32LiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_int32_literal_node_new:
+ * @value: The value of the 32-bit integer literal.
+ *
+ * Returns: A newly created #GGandivaInt32LiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaInt32LiteralNode *
+ggandiva_int32_literal_node_new(gint32 value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value);
+  return GGANDIVA_INT32_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node));
+}
+
+/**
+ * ggandiva_int32_literal_node_get_value:
+ * @node: A #GGandivaInt32LiteralNode.
+ *
+ * Returns: The value of the 32-bit integer literal.
+ *
+ * Since: 0.12.0
+ */
+gint32
+ggandiva_int32_literal_node_get_value(GGandivaInt32LiteralNode *node)
+{
+  return ggandiva_literal_node_get<gint32>(GGANDIVA_LITERAL_NODE(node));
+}
+
+
+G_DEFINE_TYPE(GGandivaUInt32LiteralNode,
+              ggandiva_uint32_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_uint32_literal_node_init(GGandivaUInt32LiteralNode *uint32_literal_node)
+{
+}
+
+static void
+ggandiva_uint32_literal_node_class_init(GGandivaUInt32LiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_uint32_literal_node_new:
+ * @value: The value of the 32-bit unsigned integer literal.
+ *
+ * Returns: A newly created #GGandivaUInt32LiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaUInt32LiteralNode *
+ggandiva_uint32_literal_node_new(guint32 value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value);
+  return GGANDIVA_UINT32_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node));
+}
+
+/**
+ * ggandiva_uint32_literal_node_get_value:
+ * @node: A #GGandivaUInt32LiteralNode.
+ *
+ * Returns: The value of the 32-bit unsigned integer literal.
+ *
+ * Since: 0.12.0
+ */
+guint32
+ggandiva_uint32_literal_node_get_value(GGandivaUInt32LiteralNode *node)
+{
+  return ggandiva_literal_node_get<guint32>(GGANDIVA_LITERAL_NODE(node));
+}
+
+
+G_DEFINE_TYPE(GGandivaInt64LiteralNode,
+              ggandiva_int64_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_int64_literal_node_init(GGandivaInt64LiteralNode *int64_literal_node)
+{
+}
+
+static void
+ggandiva_int64_literal_node_class_init(GGandivaInt64LiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_int64_literal_node_new:
+ * @value: The value of the 64-bit integer literal.
+ *
+ * Returns: A newly created #GGandivaInt64LiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaInt64LiteralNode *
+ggandiva_int64_literal_node_new(gint64 value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value);
+  return GGANDIVA_INT64_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node));
+}
+
+/**
+ * ggandiva_int64_literal_node_get_value:
+ * @node: A #GGandivaInt64LiteralNode.
+ *
+ * Returns: The value of the 64-bit integer literal.
+ *
+ * Since: 0.12.0
+ */
+gint64
+ggandiva_int64_literal_node_get_value(GGandivaInt64LiteralNode *node)
+{
+  return ggandiva_literal_node_get<gint64>(GGANDIVA_LITERAL_NODE(node));
+}
+
+
+G_DEFINE_TYPE(GGandivaUInt64LiteralNode,
+              ggandiva_uint64_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_uint64_literal_node_init(GGandivaUInt64LiteralNode *uint64_literal_node)
+{
+}
+
+static void
+ggandiva_uint64_literal_node_class_init(GGandivaUInt64LiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_uint64_literal_node_new:
+ * @value: The value of the 64-bit unsigned integer literal.
+ *
+ * Returns: A newly created #GGandivaUInt64LiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaUInt64LiteralNode *
+ggandiva_uint64_literal_node_new(guint64 value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value);
+  return GGANDIVA_UINT64_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node));
+}
+
+/**
+ * ggandiva_uint64_literal_node_get_value:
+ * @node: A #GGandivaUInt64LiteralNode.
+ *
+ * Returns: The value of the 64-bit unsigned integer literal.
+ *
+ * Since: 0.12.0
+ */
+guint64
+ggandiva_uint64_literal_node_get_value(GGandivaUInt64LiteralNode *node)
+{
+  return ggandiva_literal_node_get<guint64>(GGANDIVA_LITERAL_NODE(node));
+}
+
+
+G_DEFINE_TYPE(GGandivaFloatLiteralNode,
+              ggandiva_float_literal_node,
+              GGANDIVA_TYPE_LITERAL_NODE)
+
+static void
+ggandiva_float_literal_node_init(GGandivaFloatLiteralNode *float_literal_node)
+{
+}
+
+static void
+ggandiva_float_literal_node_class_init(GGandivaFloatLiteralNodeClass *klass)
+{
+}
+
+/**
+ * ggandiva_float_literal_node_new:
+ * @value: The value of the 32-bit floating point literal.
+ *
+ * Returns: A newly created #GGandivaFloatLiteralNode.
+ *
+ * Since: 0.12.0
+ */
+GGandivaFloatLiteralNode *
+ggandiva_float_literal_node_new(gfloat value)
+{
+  auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value);
+  return GGANDIVA_FLOAT_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node));
+}
+
+/**
+ * ggandiva_float_literal_node_get_value:
+ * @node: A #GGandivaFloatLiteralNode.
+ *
+ * Returns: The value of the 32-bit floating point literal.
+ * + * Since: 0.12.0 + */ +gfloat +ggandiva_float_literal_node_get_value(GGandivaFloatLiteralNode *node) +{ + return ggandiva_literal_node_get(GGANDIVA_LITERAL_NODE(node)); +} + + +G_DEFINE_TYPE(GGandivaDoubleLiteralNode, + ggandiva_double_literal_node, + GGANDIVA_TYPE_LITERAL_NODE) + +static void +ggandiva_double_literal_node_init(GGandivaDoubleLiteralNode *double_literal_node) +{ +} + +static void +ggandiva_double_literal_node_class_init(GGandivaDoubleLiteralNodeClass *klass) +{ +} + +/** + * ggandiva_double_literal_node_new: + * @value: The value of the 64-bit floating point literal. + * + * Returns: A newly created #GGandivaDoubleLiteralNode. + * + * Since: 0.12.0 + */ +GGandivaDoubleLiteralNode * +ggandiva_double_literal_node_new(gdouble value) +{ + auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); + return GGANDIVA_DOUBLE_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); +} + +/** + * ggandiva_double_literal_node_get_value: + * @node: A #GGandivaDoubleLiteralNode. + * + * Returns: The value of the 64-bit floating point literal. + * + * Since: 0.12.0 + */ +gdouble +ggandiva_double_literal_node_get_value(GGandivaDoubleLiteralNode *node) +{ + return ggandiva_literal_node_get(GGANDIVA_LITERAL_NODE(node)); +} + + +typedef struct GGandivaBinaryLiteralNodePrivate_ { + GBytes *value; +} GGandivaBinaryLiteralNodePrivate; + +G_DEFINE_TYPE_WITH_PRIVATE(GGandivaBinaryLiteralNode, + ggandiva_binary_literal_node, + GGANDIVA_TYPE_LITERAL_NODE) + +#define GGANDIVA_BINARY_LITERAL_NODE_GET_PRIVATE(object) \ + static_cast( \ + ggandiva_binary_literal_node_get_instance_private( \ + GGANDIVA_BINARY_LITERAL_NODE(object))) + +static void +ggandiva_binary_literal_node_dispose(GObject *object) +{ + auto priv = GGANDIVA_BINARY_LITERAL_NODE_GET_PRIVATE(object); + + if (priv->value) { + g_bytes_unref(priv->value); + priv->value = nullptr; + } + + G_OBJECT_CLASS(ggandiva_binary_literal_node_parent_class)->dispose(object); +} + +static void +ggandiva_binary_literal_node_init(GGandivaBinaryLiteralNode *binary_literal_node) +{ +} + +static void +ggandiva_binary_literal_node_class_init(GGandivaBinaryLiteralNodeClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = ggandiva_binary_literal_node_dispose; +} + +/** + * ggandiva_binary_literal_node_new: + * @value: (array length=size): The value of the binary literal. + * @size: The number of bytes of the value. + * + * Returns: A newly created #GGandivaBinaryLiteralNode. + * + * Since: 0.12.0 + */ +GGandivaBinaryLiteralNode * +ggandiva_binary_literal_node_new(const guint8 *value, + gsize size) +{ + auto gandiva_node = + gandiva::TreeExprBuilder::MakeBinaryLiteral(std::string(reinterpret_cast(value), + size)); + return GGANDIVA_BINARY_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); +} + +/** + * ggandiva_binary_literal_node_new_bytes: + * @value: The value of the binary literal. + * + * Returns: A newly created #GGandivaBinaryLiteralNode. 
+ * + * Since: 0.12.0 + */ +GGandivaBinaryLiteralNode * +ggandiva_binary_literal_node_new_bytes(GBytes *value) +{ + size_t value_size; + auto raw_value = g_bytes_get_data(value, &value_size); + auto gandiva_node = + gandiva::TreeExprBuilder::MakeBinaryLiteral( + std::string(reinterpret_cast(raw_value), + value_size)); + auto literal_node = ggandiva_literal_node_new_raw(&gandiva_node); + auto priv = GGANDIVA_BINARY_LITERAL_NODE_GET_PRIVATE(literal_node); + priv->value = value; + g_bytes_ref(priv->value); + return GGANDIVA_BINARY_LITERAL_NODE(literal_node); +} + +/** + * ggandiva_binary_literal_node_get_value: + * @node: A #GGandivaBinaryLiteralNode. + * + * Returns: (transfer none): The value of the binary literal. + * + * Since: 0.12.0 + */ +GBytes * +ggandiva_binary_literal_node_get_value(GGandivaBinaryLiteralNode *node) +{ + auto priv = GGANDIVA_BINARY_LITERAL_NODE_GET_PRIVATE(node); + if (!priv->value) { + auto value = ggandiva_literal_node_get(GGANDIVA_LITERAL_NODE(node)); + priv->value = g_bytes_new(value.data(), value.size()); + } + + return priv->value; +} + + +G_DEFINE_TYPE(GGandivaStringLiteralNode, + ggandiva_string_literal_node, + GGANDIVA_TYPE_LITERAL_NODE) + +static void +ggandiva_string_literal_node_init(GGandivaStringLiteralNode *string_literal_node) +{ +} + +static void +ggandiva_string_literal_node_class_init(GGandivaStringLiteralNodeClass *klass) +{ +} + +/** + * ggandiva_string_literal_node_new: + * @value: The value of the UTF-8 encoded string literal. + * + * Returns: A newly created #GGandivaStringLiteralNode. + * + * Since: 0.12.0 + */ +GGandivaStringLiteralNode * +ggandiva_string_literal_node_new(const gchar *value) +{ + auto gandiva_node = gandiva::TreeExprBuilder::MakeStringLiteral(value); + return GGANDIVA_STRING_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); +} + +/** + * ggandiva_string_literal_node_get_value: + * @node: A #GGandivaStringLiteralNode. + * + * Returns: The value of the UTF-8 encoded string literal. 
+ * + * Since: 0.12.0 + */ +const gchar * +ggandiva_string_literal_node_get_value(GGandivaStringLiteralNode *node) +{ + auto value = ggandiva_literal_node_get(GGANDIVA_LITERAL_NODE(node)); + return value.c_str(); +} + G_END_DECLS std::shared_ptr @@ -434,3 +1133,58 @@ ggandiva_function_node_new_raw(std::shared_ptr *gandiva_node, priv->parameters = g_list_reverse(priv->parameters); return GGANDIVA_FUNCTION_NODE(function_node); } + +GGandivaLiteralNode * +ggandiva_literal_node_new_raw(std::shared_ptr *gandiva_node) +{ + GType type; + + switch ((*gandiva_node)->return_type()->id()) { + case arrow::Type::BOOL: + type = GGANDIVA_TYPE_BOOLEAN_LITERAL_NODE; + break; + case arrow::Type::type::UINT8: + type = GGANDIVA_TYPE_UINT8_LITERAL_NODE; + break; + case arrow::Type::type::UINT16: + type = GGANDIVA_TYPE_UINT16_LITERAL_NODE; + break; + case arrow::Type::type::UINT32: + type = GGANDIVA_TYPE_UINT32_LITERAL_NODE; + break; + case arrow::Type::type::UINT64: + type = GGANDIVA_TYPE_UINT64_LITERAL_NODE; + break; + case arrow::Type::type::INT8: + type = GGANDIVA_TYPE_INT8_LITERAL_NODE; + break; + case arrow::Type::type::INT16: + type = GGANDIVA_TYPE_INT16_LITERAL_NODE; + break; + case arrow::Type::type::INT32: + type = GGANDIVA_TYPE_INT32_LITERAL_NODE; + break; + case arrow::Type::type::INT64: + type = GGANDIVA_TYPE_INT64_LITERAL_NODE; + break; + case arrow::Type::type::FLOAT: + type = GGANDIVA_TYPE_FLOAT_LITERAL_NODE; + break; + case arrow::Type::type::DOUBLE: + type = GGANDIVA_TYPE_DOUBLE_LITERAL_NODE; + break; + case arrow::Type::type::STRING: + type = GGANDIVA_TYPE_STRING_LITERAL_NODE; + break; + case arrow::Type::type::BINARY: + type = GGANDIVA_TYPE_BINARY_LITERAL_NODE; + break; + default: + type = GGANDIVA_TYPE_LITERAL_NODE; + break; + } + auto literal_node = GGANDIVA_LITERAL_NODE(g_object_new(type, + "node", gandiva_node, + NULL)); + return literal_node; +} diff --git a/c_glib/gandiva-glib/node.h b/c_glib/gandiva-glib/node.h index 98ab3afb6ae8f..183003fd9f68a 100644 --- a/c_glib/gandiva-glib/node.h +++ b/c_glib/gandiva-glib/node.h @@ -67,4 +67,240 @@ ggandiva_function_node_new(const gchar *name, GList * ggandiva_function_node_get_parameters(GGandivaFunctionNode *node); + +#define GGANDIVA_TYPE_LITERAL_NODE (ggandiva_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaLiteralNode, + ggandiva_literal_node, + GGANDIVA, + LITERAL_NODE, + GGandivaNode) +struct _GGandivaLiteralNodeClass +{ + GGandivaNodeClass parent_class; +}; + + +#define GGANDIVA_TYPE_BOOLEAN_LITERAL_NODE (ggandiva_boolean_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaBooleanLiteralNode, + ggandiva_boolean_literal_node, + GGANDIVA, + BOOLEAN_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaBooleanLiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaBooleanLiteralNode * +ggandiva_boolean_literal_node_new(gboolean value); +gboolean +ggandiva_boolean_literal_node_get_value(GGandivaBooleanLiteralNode *node); + + +#define GGANDIVA_TYPE_INT8_LITERAL_NODE (ggandiva_int8_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaInt8LiteralNode, + ggandiva_int8_literal_node, + GGANDIVA, + INT8_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaInt8LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaInt8LiteralNode * +ggandiva_int8_literal_node_new(gint8 value); +gint8 +ggandiva_int8_literal_node_get_value(GGandivaInt8LiteralNode *node); + + +#define GGANDIVA_TYPE_UINT8_LITERAL_NODE (ggandiva_uint8_literal_node_get_type()) 
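
For readers following these bindings, a minimal usage sketch of the literal-node API introduced in this patch, callable from C or C++ (constructor and accessor names are taken from the declarations in this patch; the umbrella include path is an assumption):

```cpp
#include <gandiva-glib/gandiva-glib.h>  /* assumed umbrella header */

int
main(void)
{
  /* Each _new() constructor wraps gandiva::TreeExprBuilder::MakeLiteral()
   * and the matching _get_value() reads the stored literal back out. */
  GGandivaInt8LiteralNode *node = ggandiva_int8_literal_node_new(-3);
  g_assert(ggandiva_int8_literal_node_get_value(node) == -3);
  g_object_unref(node);
  return 0;
}
```
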
+G_DECLARE_DERIVABLE_TYPE(GGandivaUInt8LiteralNode, + ggandiva_uint8_literal_node, + GGANDIVA, + UINT8_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaUInt8LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaUInt8LiteralNode * +ggandiva_uint8_literal_node_new(guint8 value); +guint8 +ggandiva_uint8_literal_node_get_value(GGandivaUInt8LiteralNode *node); + + +#define GGANDIVA_TYPE_INT16_LITERAL_NODE (ggandiva_int16_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaInt16LiteralNode, + ggandiva_int16_literal_node, + GGANDIVA, + INT16_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaInt16LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaInt16LiteralNode * +ggandiva_int16_literal_node_new(gint16 value); +gint16 +ggandiva_int16_literal_node_get_value(GGandivaInt16LiteralNode *node); + + +#define GGANDIVA_TYPE_UINT16_LITERAL_NODE (ggandiva_uint16_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaUInt16LiteralNode, + ggandiva_uint16_literal_node, + GGANDIVA, + UINT16_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaUInt16LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaUInt16LiteralNode * +ggandiva_uint16_literal_node_new(guint16 value); +guint16 +ggandiva_uint16_literal_node_get_value(GGandivaUInt16LiteralNode *node); + + +#define GGANDIVA_TYPE_INT32_LITERAL_NODE (ggandiva_int32_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaInt32LiteralNode, + ggandiva_int32_literal_node, + GGANDIVA, + INT32_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaInt32LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaInt32LiteralNode * +ggandiva_int32_literal_node_new(gint32 value); +gint32 +ggandiva_int32_literal_node_get_value(GGandivaInt32LiteralNode *node); + + +#define GGANDIVA_TYPE_UINT32_LITERAL_NODE (ggandiva_uint32_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaUInt32LiteralNode, + ggandiva_uint32_literal_node, + GGANDIVA, + UINT32_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaUInt32LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaUInt32LiteralNode * +ggandiva_uint32_literal_node_new(guint32 value); +guint32 +ggandiva_uint32_literal_node_get_value(GGandivaUInt32LiteralNode *node); + + +#define GGANDIVA_TYPE_INT64_LITERAL_NODE (ggandiva_int64_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaInt64LiteralNode, + ggandiva_int64_literal_node, + GGANDIVA, + INT64_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaInt64LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaInt64LiteralNode * +ggandiva_int64_literal_node_new(gint64 value); +gint64 +ggandiva_int64_literal_node_get_value(GGandivaInt64LiteralNode *node); + + +#define GGANDIVA_TYPE_UINT64_LITERAL_NODE (ggandiva_uint64_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaUInt64LiteralNode, + ggandiva_uint64_literal_node, + GGANDIVA, + UINT64_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaUInt64LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaUInt64LiteralNode * +ggandiva_uint64_literal_node_new(guint64 value); +guint64 +ggandiva_uint64_literal_node_get_value(GGandivaUInt64LiteralNode *node); + + +#define GGANDIVA_TYPE_FLOAT_LITERAL_NODE (ggandiva_float_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaFloatLiteralNode, + ggandiva_float_literal_node, + GGANDIVA, + FLOAT_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaFloatLiteralNodeClass +{ + GGandivaLiteralNodeClass 
parent_class; +}; + +GGandivaFloatLiteralNode * +ggandiva_float_literal_node_new(gfloat value); +gfloat +ggandiva_float_literal_node_get_value(GGandivaFloatLiteralNode *node); + + +#define GGANDIVA_TYPE_DOUBLE_LITERAL_NODE (ggandiva_double_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaDoubleLiteralNode, + ggandiva_double_literal_node, + GGANDIVA, + DOUBLE_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaDoubleLiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaDoubleLiteralNode * +ggandiva_double_literal_node_new(gdouble value); +gdouble +ggandiva_double_literal_node_get_value(GGandivaDoubleLiteralNode *node); + + +#define GGANDIVA_TYPE_BINARY_LITERAL_NODE (ggandiva_binary_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaBinaryLiteralNode, + ggandiva_binary_literal_node, + GGANDIVA, + BINARY_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaBinaryLiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaBinaryLiteralNode * +ggandiva_binary_literal_node_new(const guint8 *value, + gsize size); +GGandivaBinaryLiteralNode * +ggandiva_binary_literal_node_new_bytes(GBytes *value); +GBytes * +ggandiva_binary_literal_node_get_value(GGandivaBinaryLiteralNode *node); + + +#define GGANDIVA_TYPE_STRING_LITERAL_NODE (ggandiva_string_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaStringLiteralNode, + ggandiva_string_literal_node, + GGANDIVA, + STRING_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaStringLiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaStringLiteralNode * +ggandiva_string_literal_node_new(const gchar *value); +const gchar * +ggandiva_string_literal_node_get_value(GGandivaStringLiteralNode *node); + G_END_DECLS diff --git a/c_glib/gandiva-glib/node.hpp b/c_glib/gandiva-glib/node.hpp index 953c214beb9d6..7ff136003f174 100644 --- a/c_glib/gandiva-glib/node.hpp +++ b/c_glib/gandiva-glib/node.hpp @@ -21,6 +21,7 @@ #include +#include #include #include @@ -34,3 +35,5 @@ ggandiva_function_node_new_raw(std::shared_ptr *gandiva_node, const gchar *name, GList *parameters, GArrowDataType *return_type); +GGandivaLiteralNode * +ggandiva_literal_node_new_raw(std::shared_ptr *gandiva_node); diff --git a/c_glib/test/gandiva/test-binary-literal-node.rb b/c_glib/test/gandiva/test-binary-literal-node.rb new file mode 100644 index 0000000000000..93a54a361cc82 --- /dev/null +++ b/c_glib/test/gandiva/test-binary-literal-node.rb @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
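
The test below drives the binary-literal constructors through the Ruby bindings. For reference, a hedged C-level sketch of the GBytes ownership contract implemented above: `_new_bytes()` takes its own reference (note the `g_bytes_ref()` in the implementation) and `_get_value()` is annotated `(transfer none)`. The umbrella include path is again an assumption:

```cpp
#include <gandiva-glib/gandiva-glib.h>  /* assumed umbrella header */

void
binary_literal_example(void)
{
  GBytes *bytes = g_bytes_new("\x00\x01\x02\x03\x04", 5);
  /* _new_bytes() keeps an additional reference on @bytes, so the caller
   * still owns, and must eventually drop, its own reference. */
  GGandivaBinaryLiteralNode *node =
      ggandiva_binary_literal_node_new_bytes(bytes);
  g_bytes_unref(bytes);
  /* (transfer none): do not unref the returned GBytes. */
  GBytes *value = ggandiva_binary_literal_node_get_value(node);
  g_assert(g_bytes_get_size(value) == 5);
  g_object_unref(node);
}
```
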
+ +class TestGandivaBinaryLiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + @value = "\x00\x01\x02\x03\x04" + end + + def test_new + literal_node = Gandiva::BinaryLiteralNode.new(@value) + assert_equal(@value, literal_node.value.to_s) + end + + def test_new_bytes + bytes_value = GLib::Bytes.new(@value) + literal_node = Gandiva::BinaryLiteralNode.new(bytes_value) + assert_equal(@value, literal_node.value.to_s) + end +end diff --git a/c_glib/test/gandiva/test-boolean-literal-node.rb b/c_glib/test/gandiva/test-boolean-literal-node.rb new file mode 100644 index 0000000000000..3d1f10c5e81c1 --- /dev/null +++ b/c_glib/test/gandiva/test-boolean-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaBooleanLiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = true + literal_node = Gandiva::BooleanLiteralNode.new(value) + assert_equal(value, literal_node.value?) + end +end diff --git a/c_glib/test/gandiva/test-double-literal-node.rb b/c_glib/test/gandiva/test-double-literal-node.rb new file mode 100644 index 0000000000000..fd4bd08e4c254 --- /dev/null +++ b/c_glib/test/gandiva/test-double-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaDoubleLiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = 1.5 + literal_node = Gandiva::DoubleLiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/c_glib/test/gandiva/test-float-literal-node.rb b/c_glib/test/gandiva/test-float-literal-node.rb new file mode 100644 index 0000000000000..202ec38fc5907 --- /dev/null +++ b/c_glib/test/gandiva/test-float-literal-node.rb @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaFloatLiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_new + assert_nothing_raised do + Gandiva::FloatLiteralNode.new(1.5) + end + end + + def test_value + value = 1.5 + literal_node = Gandiva::FloatLiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/c_glib/test/gandiva/test-int16-literal-node.rb b/c_glib/test/gandiva/test-int16-literal-node.rb new file mode 100644 index 0000000000000..9b5bb6822ebba --- /dev/null +++ b/c_glib/test/gandiva/test-int16-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaInt16LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = -3 + literal_node = Gandiva::Int16LiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/c_glib/test/gandiva/test-int32-literal-node.rb b/c_glib/test/gandiva/test-int32-literal-node.rb new file mode 100644 index 0000000000000..9c94cdef4b125 --- /dev/null +++ b/c_glib/test/gandiva/test-int32-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
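
Because `ggandiva_literal_node_new_raw()` (earlier in this patch) maps each Arrow type id to the most specific wrapper class, a generic literal node such as the ones these tests construct can be discriminated with the GLib type-check macros generated by `G_DECLARE_DERIVABLE_TYPE()`. A sketch under the same include-path assumption as above:

```cpp
#include <gandiva-glib/gandiva-glib.h>  /* assumed umbrella header */

/* Print a literal node's value for two of the possible concrete types. */
void
print_literal(GGandivaLiteralNode *node)
{
  if (GGANDIVA_IS_INT32_LITERAL_NODE(node)) {
    g_print("int32: %d\n",
            ggandiva_int32_literal_node_get_value(
                GGANDIVA_INT32_LITERAL_NODE(node)));
  } else if (GGANDIVA_IS_BOOLEAN_LITERAL_NODE(node)) {
    gboolean value = ggandiva_boolean_literal_node_get_value(
        GGANDIVA_BOOLEAN_LITERAL_NODE(node));
    g_print("boolean: %s\n", value ? "true" : "false");
  }
}
```
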
+ +class TestGandivaInt32LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = -3 + literal_node = Gandiva::Int32LiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/c_glib/test/gandiva/test-int64-literal-node.rb b/c_glib/test/gandiva/test-int64-literal-node.rb new file mode 100644 index 0000000000000..e1b4b91d8c32c --- /dev/null +++ b/c_glib/test/gandiva/test-int64-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaInt64LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = -3 + literal_node = Gandiva::Int64LiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/c_glib/test/gandiva/test-int8-literal-node.rb b/c_glib/test/gandiva/test-int8-literal-node.rb new file mode 100644 index 0000000000000..30f11fc81a60d --- /dev/null +++ b/c_glib/test/gandiva/test-int8-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaInt8LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = -3 + literal_node = Gandiva::Int8LiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/c_glib/test/gandiva/test-string-literal-node.rb b/c_glib/test/gandiva/test-string-literal-node.rb new file mode 100644 index 0000000000000..a231f6111f40f --- /dev/null +++ b/c_glib/test/gandiva/test-string-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaStringLiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = "Hello" + literal_node = Gandiva::StringLiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/c_glib/test/gandiva/test-uint16-literal-node.rb b/c_glib/test/gandiva/test-uint16-literal-node.rb new file mode 100644 index 0000000000000..e8bdd308969bb --- /dev/null +++ b/c_glib/test/gandiva/test-uint16-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaUInt16LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = 3 + literal_node = Gandiva::UInt16LiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/c_glib/test/gandiva/test-uint32-literal-node.rb b/c_glib/test/gandiva/test-uint32-literal-node.rb new file mode 100644 index 0000000000000..9d5995774dd97 --- /dev/null +++ b/c_glib/test/gandiva/test-uint32-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
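
On the C++ side, `gandiva::TreeExprBuilder::MakeLiteral()` is overloaded on the argument's C++ type, which is how each `_new()` wrapper above selects the Arrow type purely through its parameter type (`guint32` resolves to the `uint32_t` overload). A sketch, assuming the public Gandiva headers are available:

```cpp
#include <cstdint>

#include <gandiva/tree_expr_builder.h>

gandiva::NodePtr
make_uint32_literal(void)
{
  /* Picks the uint32_t overload, so the node's return type is
   * arrow::uint32(), matching GGandivaUInt32LiteralNode. */
  return gandiva::TreeExprBuilder::MakeLiteral(static_cast<uint32_t>(3));
}
```
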
+ +class TestGandivaUInt32LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = 3 + literal_node = Gandiva::UInt32LiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/c_glib/test/gandiva/test-uint64-literal-node.rb b/c_glib/test/gandiva/test-uint64-literal-node.rb new file mode 100644 index 0000000000000..56c46db81bd24 --- /dev/null +++ b/c_glib/test/gandiva/test-uint64-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaUInt64LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = 3 + literal_node = Gandiva::UInt64LiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/c_glib/test/gandiva/test-uint8-literal-node.rb b/c_glib/test/gandiva/test-uint8-literal-node.rb new file mode 100644 index 0000000000000..04f76cd76326f --- /dev/null +++ b/c_glib/test/gandiva/test-uint8-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaUInt8LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = 3 + literal_node = Gandiva::UInt8LiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/ruby/red-gandiva/lib/gandiva/loader.rb b/ruby/red-gandiva/lib/gandiva/loader.rb index 5a95897b61730..845275c3e7cbd 100644 --- a/ruby/red-gandiva/lib/gandiva/loader.rb +++ b/ruby/red-gandiva/lib/gandiva/loader.rb @@ -22,5 +22,19 @@ def load super("Gandiva", Gandiva) end end + + private + def load_method_info(info, klass, method_name) + case klass.name + when "Gandiva::BooleanLiteralNode" + case method_name + when "value?" 
+ method_name = "value" + end + super(info, klass, method_name) + else + super + end + end end end diff --git a/ruby/red-gandiva/test/test-boolean-literal-node.rb b/ruby/red-gandiva/test/test-boolean-literal-node.rb new file mode 100644 index 0000000000000..d79f72994b6a0 --- /dev/null +++ b/ruby/red-gandiva/test/test-boolean-literal-node.rb @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestBooleanLiteralNode < Test::Unit::TestCase + def test_value + value = true + literal_node = Gandiva::BooleanLiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end From c0ac97f126c98fb29e81d6544adfea9d4ab74aff Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Wed, 12 Dec 2018 19:57:12 +0900 Subject: [PATCH 26/45] ARROW-4004: [GLib] Replace GPU with CUDA This is a follow-up change for #3088. Author: Kouhei Sutou Closes #3162 from kou/glib-replace-gpu-with-cuda and squashes the following commits: 8891e510 Replace GPU with CUDA --- c_glib/plasma-glib/plasma-glib.pc.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_glib/plasma-glib/plasma-glib.pc.in b/c_glib/plasma-glib/plasma-glib.pc.in index f3a82c237d0b9..c82fe69580f1f 100644 --- a/c_glib/plasma-glib/plasma-glib.pc.in +++ b/c_glib/plasma-glib/plasma-glib.pc.in @@ -25,4 +25,4 @@ Description: C API for Apache Arrow Plasma based on GLib Version: @VERSION@ Libs: -L${libdir} -lplasma-glib Cflags: -I${includedir} -Requires: plasma arrow-glib @ARROW_GPU_GLIB_PACKAGE@ +Requires: plasma arrow-glib @ARROW_CUDA_GLIB_PACKAGE@ From c029b772f35958feb723cdddb67dcf04ae302013 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Wed, 12 Dec 2018 08:13:36 -0600 Subject: [PATCH 27/45] ARROW-3976: [Ruby] Try to upgrade git to avoid errors caused by Homebrew on older git It seems that `brew update` can fail if the git version is too old, see https://github.com/Linuxbrew/brew/issues/820. 
Also adds retry logic Author: Kouhei Sutou Author: Wes McKinney Closes #3155 from wesm/ARROW-3976 and squashes the following commits: 0c7964ba3 Stop to use old Ruby 7dce4f0a3 travis_wait isn't available in custom shell script 6d41e7196 Make brew commands more robust 05044892b Incorporate code review 8c0454fd7 Try to upgrade git to avoid errors caused by Homebrew on older git, where --local argument is missing --- .travis.yml | 1 - ci/travis_install_osx.sh | 24 ++++++++++++++++++------ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index 42b1275d1c4bf..d1fc6dba35dd2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -256,7 +256,6 @@ matrix: - ARROW_TRAVIS_PLASMA=1 cache: addons: - rvm: 2.2 before_script: - if [ $ARROW_CI_RUBY_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_osx.sh diff --git a/ci/travis_install_osx.sh b/ci/travis_install_osx.sh index 47d6a637f7d58..6b6a4b2533d8b 100755 --- a/ci/travis_install_osx.sh +++ b/ci/travis_install_osx.sh @@ -23,13 +23,25 @@ set -e if [ "$ARROW_CI_RUBY_AFFECTED" = "1" ]; then brew_log_path=brew.log function run_brew() { - echo brew "$@" >> ${brew_log_path} - if ! gtimeout --signal=KILL 5m brew "$@" >> ${brew_log_path} 2>&1; then - cat ${brew_log_path} - rm ${brew_log_path} - false - fi + local i=0 + local n_tries=3 + while [[ $((i++)) < ${n_tries} ]]; do + echo "${i}: brew" "$@" >> ${brew_log_path} + if gtimeout --signal=KILL 9m brew "$@" >> ${brew_log_path} 2>&1; then + break + elif [[ ${i} == ${n_tries} ]]; then + cat ${brew_log_path} + rm ${brew_log_path} + false + fi + done } + + # ARROW-3976 Old versions of git can cause failures when Homebrew prints a + # donation solicitation. Attempt to update git + git --version + run_brew upgrade git + run_brew update run_brew upgrade python run_brew uninstall postgis From 67506d94b762d0ea3d26ba0e2df1399e566d145b Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Wed, 12 Dec 2018 08:16:30 -0600 Subject: [PATCH 28/45] ARROW-4002: [C++][Gandiva] Remove needless CMake version check I could build Gandiva with CMake 3.7.2 and LLVM 6.0.0 on Debian stretch. But I disabled Gandiva JNI. Author: Kouhei Sutou Closes #3161 from kou/cpp-gandiva-remove-cmake-version-check and squashes the following commits: 1506c546c Remove needless CMake version check --- cpp/src/gandiva/CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 5d75aa271152b..5ef573875b660 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -15,9 +15,6 @@ # specific language governing permissions and limitations # under the License. -# LLVM/Clang is required by multiple subdirs. -cmake_minimum_required(VERSION 3.11) - project(gandiva) find_package(LLVM) From a3ba1a2b54afdd2a55bd600d644722cf54b9ab5d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 12 Dec 2018 09:19:41 -0600 Subject: [PATCH 29/45] ARROW-3988: [C++] Do not build unit tests by default, fix building Gandiva unit tests when ARROW_BUILD_TESTS=OFF I found while working on this that disabling `ARROW_GANDIVA_BUILD_TESTS` would break the build -- I think this was caused by some other changes I made. We should remove that option and instead use the new modular build targets and invoke unit tests using labels. 
So we would write ``` ninja gandiva # this will build all libraries and unit tests when ARROW_BUILD_TESTS=ON ctest -L gandiva ``` Author: Wes McKinney Closes #3156 from wesm/ARROW-3988 and squashes the following commits: 0420f9ed0 Remove arrow::PrimitiveBuilder from builder.rst for now because of Sphinx warning f8a33a5aa Fix gandiva test flag c4893534c Add ARROW_BUILD_TESTS to appveyor-cpp-test-cmake-script.bat 5c6a33271 Do not build unit tests by default, fix building Gandiva unit tests when ARROW_BUILD_TESTS=OFF --- ci/appveyor-cpp-build.bat | 3 +++ ci/appveyor-cpp-test-cmake-script.bat | 8 ++++++++ ci/cpp-msvc-build-main.bat | 1 + ci/travis_before_script_cpp.sh | 12 ++++++++++-- cpp/CMakeLists.txt | 14 +++++++------- cpp/README.md | 14 +++++++++----- cpp/cmake_modules/ThirdpartyToolchain.cmake | 7 ++++--- cpp/src/arrow/CMakeLists.txt | 3 ++- docs/source/cpp/api/builder.rst | 3 --- 9 files changed, 44 insertions(+), 21 deletions(-) diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index b8e431613210a..d20a0214f532c 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -34,6 +34,7 @@ if "%JOB%" == "Static_Crt_Build" ( -DARROW_USE_STATIC_CRT=ON ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=Debug ^ -DARROW_TEST_LINKAGE=static ^ -DARROW_CXXFLAGS="/MP" ^ @@ -51,6 +52,7 @@ if "%JOB%" == "Static_Crt_Build" ( -DARROW_USE_STATIC_CRT=ON ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=Release ^ -DARROW_TEST_LINKAGE=static ^ -DCMAKE_CXX_FLAGS_RELEASE="/MT %CMAKE_CXX_FLAGS_RELEASE%" ^ @@ -76,6 +78,7 @@ if "%JOB%" == "Build_Debug" ( cmake -G "%GENERATOR%" ^ -DARROW_VERBOSE_THIRDPARTY_BUILD=OFF ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_BUILD_STATIC=OFF ^ -DARROW_CXXFLAGS="/MP" ^ diff --git a/ci/appveyor-cpp-test-cmake-script.bat b/ci/appveyor-cpp-test-cmake-script.bat index 25bf9bddbbf39..8158a44260235 100644 --- a/ci/appveyor-cpp-test-cmake-script.bat +++ b/ci/appveyor-cpp-test-cmake-script.bat @@ -32,6 +32,7 @@ set FLATBUFFERS_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -49,6 +50,7 @@ set GFLAGS_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -66,6 +68,7 @@ set SNAPPY_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -83,6 +86,7 @@ set ZLIB_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -100,6 +104,7 @@ set BROTLI_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -117,6 +122,7 @@ set LZ4_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. 
>nul 2>error.txt @@ -134,6 +140,7 @@ set ZSTD_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -158,6 +165,7 @@ pushd %BUILD_DIR% set ARROW_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. 2>output.txt diff --git a/ci/cpp-msvc-build-main.bat b/ci/cpp-msvc-build-main.bat index 8703dc9631773..560f5045af658 100644 --- a/ci/cpp-msvc-build-main.bat +++ b/ci/cpp-msvc-build-main.bat @@ -48,6 +48,7 @@ cmake -G "%GENERATOR%" %CMAKE_ARGS% ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_BUILD_STATIC=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DARROW_CXXFLAGS="%ARROW_CXXFLAGS%" ^ -DCMAKE_CXX_FLAGS_RELEASE="/MD %CMAKE_CXX_FLAGS_RELEASE%" ^ -DARROW_PARQUET=ON ^ diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index f9e0602a80971..6465f28008006 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -41,7 +41,6 @@ if [ "$only_library_mode" == "no" ]; then fi CMAKE_COMMON_FLAGS="\ --DARROW_BUILD_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL \ -DARROW_NO_DEPRECATED_API=ON \ -DARROW_EXTRA_ERROR_CONTEXT=ON" @@ -61,7 +60,13 @@ pushd $ARROW_CPP_BUILD_DIR if [ $only_library_mode == "yes" ]; then CMAKE_COMMON_FLAGS="\ $CMAKE_COMMON_FLAGS \ --DARROW_BUILD_TESTS=OFF \ +-DARROW_BUILD_UTILITIES=OFF \ +-DARROW_INSTALL_NAME_RPATH=OFF" +else + CMAKE_COMMON_FLAGS="\ +$CMAKE_COMMON_FLAGS \ +-DARROW_BUILD_BENCHMARKS=ON \ +-DARROW_BUILD_TESTS=ON \ -DARROW_BUILD_UTILITIES=OFF \ -DARROW_INSTALL_NAME_RPATH=OFF" fi @@ -92,6 +97,9 @@ fi if [ $ARROW_TRAVIS_GANDIVA == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA=ON" + if [ $only_library_mode == "no" ]; then + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA_BUILD_TESTS=ON" + fi fi if [ $ARROW_TRAVIS_VALGRIND == "1" ]; then diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7140d05d577f2..35707de574648 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -115,8 +115,12 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") OFF) option(ARROW_BUILD_TESTS - "Build the Arrow googletest unit tests" - ON) + "Build the Arrow googletest unit tests, default OFF" + OFF) + + option(ARROW_BUILD_BENCHMARKS + "Build the Arrow micro benchmarks, default OFF" + OFF) set(ARROW_TEST_LINKAGE "shared" CACHE STRING "Linkage of Arrow libraries with unit tests executables. \ @@ -126,10 +130,6 @@ static|shared (default shared)") "Only build unit tests having the indicated label or labels. \ Pass multiple labels by dividing with semicolons") - option(ARROW_BUILD_BENCHMARKS - "Build the Arrow micro benchmarks" - OFF) - option(ARROW_NO_DEPRECATED_API "Exclude deprecated APIs from build" OFF) @@ -322,7 +322,7 @@ Always OFF if building binaries" option(ARROW_GANDIVA_BUILD_TESTS "Build the Gandiva googletest unit tests" - ON) + OFF) endif() diff --git a/cpp/README.md b/cpp/README.md index 1278ca046d432..d1d76c17875d7 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -64,7 +64,7 @@ Simple debug build: cd arrow/cpp mkdir debug cd debug - cmake .. + cmake -DARROW_BUILD_TESTS=ON .. make unittest Simple release build: @@ -73,10 +73,14 @@ Simple release build: cd arrow/cpp mkdir release cd release - cmake .. -DCMAKE_BUILD_TYPE=Release + cmake -DARROW_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release .. 
make unittest -Detailed unit test logs will be placed in the build directory under `build/test-logs`. +If you do not need to build the test suite, you can omit the +`ARROW_BUILD_TESTS` option (the default is not to build the unit tests). + +Detailed unit test logs will be placed in the build directory under +`build/test-logs`. On some Linux distributions, running the test suite might require setting an explicit locale. If you see any locale-related errors, try setting the @@ -132,7 +136,7 @@ not use the macro. Follow the directions for simple build except run cmake with the `--ARROW_BUILD_BENCHMARKS` parameter set correctly: - cmake -DARROW_BUILD_BENCHMARKS=ON .. + cmake -DARROW_BUILD_TESTS=ON -DARROW_BUILD_BENCHMARKS=ON .. and instead of make unittest run either `make; ctest` to run both unit tests and benchmarks or `make benchmark` to run only the benchmark tests. @@ -265,7 +269,7 @@ The optional `gandiva` libraries and tests can be built by passing `-DARROW_GANDIVA=on`. ```shell -cmake .. -DARROW_GANDIVA=on +cmake .. -DARROW_GANDIVA=ON -DARROW_GANDIVA_BUILD_TESTS=ON make ctest -L gandiva ``` diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 6850b0bddefc5..8f3fc2cabe3c2 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -525,9 +525,11 @@ message(STATUS "double-conversion static library: ${DOUBLE_CONVERSION_STATIC_LIB # ---------------------------------------------------------------------- # Google gtest & gflags -if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) - add_custom_target(unittest ctest -L unittest) +add_custom_target(unittest ctest -L unittest) +add_custom_target(benchmark ctest -L benchmark) +if(ARROW_BUILD_TESTS OR ARROW_GANDIVA_BUILD_TESTS + OR ARROW_BUILD_BENCHMARKS) if("${GTEST_HOME}" STREQUAL "") if(APPLE) set(GTEST_CMAKE_CXX_FLAGS "-fPIC -DGTEST_USE_OWN_TR1_TUPLE=1 -Wno-unused-value -Wno-ignored-attributes") @@ -627,7 +629,6 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) endif() if(ARROW_BUILD_BENCHMARKS) - add_custom_target(benchmark ctest -L benchmark) if("$ENV{GBENCHMARK_HOME}" STREQUAL "") if(NOT MSVC) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 13aaeab494090..2d043a9a27627 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -213,7 +213,8 @@ if (ARROW_BUILD_STATIC AND WIN32) target_compile_definitions(arrow_static PUBLIC ARROW_STATIC) endif() -if (ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) +if (ARROW_BUILD_TESTS OR ARROW_GANDIVA_BUILD_TESTS + OR ARROW_BUILD_BENCHMARKS) # that depend on gtest ADD_ARROW_LIB(arrow_testing SOURCES test-util.cc diff --git a/docs/source/cpp/api/builder.rst b/docs/source/cpp/api/builder.rst index 0912706ac081c..9e6540aa557fb 100644 --- a/docs/source/cpp/api/builder.rst +++ b/docs/source/cpp/api/builder.rst @@ -31,9 +31,6 @@ Concrete builder subclasses .. doxygenclass:: arrow::BooleanBuilder :members: -.. doxygenclass:: arrow::PrimitiveBuilder - :members: - .. 
doxygenclass:: arrow::NumericBuilder :members: From aa8bb3cc4bcbee02ed7d7599e5dcf234507e65b4 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 12 Dec 2018 10:58:11 -0600 Subject: [PATCH 30/45] ARROW-3986: [C++] Document memory management and table APIs Author: Antoine Pitrou Closes #3159 from pitrou/ARROW-3986-more-prose-documentation and squashes the following commits: 4e8ff421d ARROW-3986: Document memory management and table APIs --- cpp/src/arrow/allocator.h | 8 ++ cpp/src/arrow/buffer.h | 67 ++++++++++++--- cpp/src/arrow/builder.h | 3 +- cpp/src/arrow/memory_pool.h | 1 + cpp/src/arrow/table.h | 57 ++++++++++--- cpp/src/arrow/type.h | 25 +++++- docs/source/cpp/api.rst | 1 + docs/source/cpp/api/datatype.rst | 13 +++ docs/source/cpp/api/memory.rst | 43 ++++++++-- docs/source/cpp/api/table.rst | 52 ++++++++++++ docs/source/cpp/getting_started.rst | 3 +- docs/source/cpp/index.rst | 6 ++ docs/source/cpp/memory.rst | 127 ++++++++++++++++++++++++++++ docs/source/cpp/tables.rst | 87 +++++++++++++++++++ 14 files changed, 459 insertions(+), 34 deletions(-) create mode 100644 docs/source/cpp/api/table.rst create mode 100644 docs/source/cpp/memory.rst create mode 100644 docs/source/cpp/tables.rst diff --git a/cpp/src/arrow/allocator.h b/cpp/src/arrow/allocator.h index 144ba575063a3..a02b8e64bb05a 100644 --- a/cpp/src/arrow/allocator.h +++ b/cpp/src/arrow/allocator.h @@ -29,6 +29,7 @@ namespace arrow { +/// \brief A STL allocator delegating allocations to a Arrow MemoryPool template class stl_allocator { public: @@ -45,7 +46,9 @@ class stl_allocator { using other = stl_allocator; }; + /// \brief Construct an allocator from the default MemoryPool stl_allocator() noexcept : pool_(default_memory_pool()) {} + /// \brief Construct an allocator from the given MemoryPool explicit stl_allocator(MemoryPool* pool) noexcept : pool_(pool) {} template @@ -86,9 +89,14 @@ class stl_allocator { MemoryPool* pool_; }; +/// \brief A MemoryPool implementation delegating allocations to a STL allocator +/// +/// Note that STL allocators don't provide a resizing operation, and therefore +/// any buffer resizes will do a full reallocation and copy. template > class STLMemoryPool : public MemoryPool { public: + /// \brief Construct a memory pool from the given allocator explicit STLMemoryPool(const Allocator& alloc) : alloc_(alloc) {} Status Allocate(int64_t size, uint8_t** out) override { diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 66c131413c2d3..6b2ad1bbefc7f 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -40,13 +40,15 @@ namespace arrow { /// \class Buffer /// \brief Object containing a pointer to a piece of contiguous memory with a -/// particular size. Base class does not own its memory +/// particular size. /// /// Buffers have two related notions of length: size and capacity. Size is /// the number of bytes that might have valid data. Capacity is the number -/// of bytes that where allocated for the buffer in total. +/// of bytes that were allocated for the buffer in total. /// -/// The following invariant is always true: Size < Capacity +/// The Buffer base class does not own its memory, but subclasses often do. 
+/// +/// The following invariant is always true: Size <= Capacity class ARROW_EXPORT Buffer { public: /// \brief Construct from buffer and size without copying memory @@ -158,9 +160,12 @@ class ARROW_EXPORT Buffer { /// \note Can throw std::bad_alloc if buffer is large std::string ToString() const; - int64_t capacity() const { return capacity_; } + /// \brief Return a pointer to the buffer's data const uint8_t* data() const { return data_; } - + /// \brief Return a writable pointer to the buffer's data + /// + /// The buffer has to be mutable. Otherwise, an assertion may be thrown + /// or a null pointer may be returned. uint8_t* mutable_data() { #ifndef NDEBUG CheckMutable(); @@ -168,8 +173,12 @@ class ARROW_EXPORT Buffer { return mutable_data_; } + /// \brief Return the buffer's size in bytes int64_t size() const { return size_; } + /// \brief Return the buffer's capacity (number of allocated bytes) + int64_t capacity() const { return capacity_; } + std::shared_ptr parent() const { return parent_; } protected: @@ -188,26 +197,38 @@ class ARROW_EXPORT Buffer { ARROW_DISALLOW_COPY_AND_ASSIGN(Buffer); }; -/// Construct a view on passed buffer at the indicated offset and length. This -/// function cannot fail and does not error checking (except in debug builds) +/// \defgroup buffer-slicing-functions Functions for slicing buffers +/// +/// @{ + +/// \brief Construct a view on a buffer at the given offset and length. +/// +/// This function cannot fail and does not check for errors (except in debug builds) static inline std::shared_ptr SliceBuffer(const std::shared_ptr& buffer, const int64_t offset, const int64_t length) { return std::make_shared(buffer, offset, length); } +/// \brief Construct a view on a buffer at the given offset, up to the buffer's end. +/// +/// This function cannot fail and does not check for errors (except in debug builds) static inline std::shared_ptr SliceBuffer(const std::shared_ptr& buffer, const int64_t offset) { int64_t length = buffer->size() - offset; return SliceBuffer(buffer, offset, length); } -/// Construct a mutable buffer slice. If the parent buffer is not mutable, this -/// will abort in debug builds +/// \brief Like SliceBuffer, but construct a mutable buffer slice. +/// +/// If the parent buffer is not mutable, behavior is undefined (it may abort +/// in debug builds). ARROW_EXPORT std::shared_ptr SliceMutableBuffer(const std::shared_ptr& buffer, const int64_t offset, const int64_t length); +/// @} + /// \class MutableBuffer /// \brief A Buffer whose contents can be mutated. May or may not own its data. class ARROW_EXPORT MutableBuffer : public Buffer { @@ -266,6 +287,10 @@ class ARROW_EXPORT ResizableBuffer : public MutableBuffer { ResizableBuffer(uint8_t* data, int64_t size) : MutableBuffer(data, size) {} }; +/// \defgroup buffer-allocation-functions Functions for allocating buffers +/// +/// @{ + /// \brief Allocate a fixed size mutable buffer from a memory pool, zero its padding. 
/// /// \param[in] pool a memory pool @@ -364,6 +389,8 @@ Status AllocateEmptyBitmap(MemoryPool* pool, int64_t length, ARROW_EXPORT Status AllocateEmptyBitmap(int64_t length, std::shared_ptr* out); +/// @} + // ---------------------------------------------------------------------- // Buffer builder classes @@ -374,13 +401,13 @@ class ARROW_EXPORT BufferBuilder { explicit BufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) : pool_(pool), data_(NULLPTR), capacity_(0), size_(0) {} - /// \brief Resizes the buffer to the nearest multiple of 64 bytes + /// \brief Resize the buffer to the nearest multiple of 64 bytes /// /// \param elements the new capacity of the of the builder. Will be rounded /// up to a multiple of 64 bytes for padding - /// \param shrink_to_fit if new capacity smaller than existing size, + /// \param shrink_to_fit if new capacity is smaller than the existing size, /// reallocate internal buffer. Set to false to avoid reallocations when - /// shrinking the builder + /// shrinking the builder. /// \return Status Status Resize(const int64_t elements, bool shrink_to_fit = true) { // Resize(0) is a no-op @@ -409,6 +436,9 @@ class ARROW_EXPORT BufferBuilder { /// \return Status Status Reserve(const int64_t size) { return Resize(size_ + size, false); } + /// \brief Append the given data to the buffer + /// + /// The buffer is automatically expanded if necessary. Status Append(const void* data, int64_t length) { if (capacity_ < length + size_) { int64_t new_capacity = BitUtil::NextPower2(length + size_); @@ -418,6 +448,9 @@ class ARROW_EXPORT BufferBuilder { return Status::OK(); } + /// \brief Append the given data to the buffer + /// + /// The buffer is automatically expanded if necessary. template Status Append(const std::array& data) { constexpr auto nbytes = static_cast(NBYTES); @@ -448,6 +481,15 @@ class ARROW_EXPORT BufferBuilder { size_ += length; } + /// \brief Return result of builder as a Buffer object. + /// + /// The builder is reset and can be reused afterwards. + /// + /// \param[out] out the finalized Buffer object + /// \param shrink_to_fit if the buffer size is smaller than its capacity, + /// reallocate to fit more tightly in memory. Set to false to avoid + /// a reallocation, at the expense of potentially more memory consumption. + /// \return Status Status Finish(std::shared_ptr* out, bool shrink_to_fit = true) { ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit)); *out = buffer_; @@ -472,6 +514,7 @@ class ARROW_EXPORT BufferBuilder { int64_t size_; }; +/// \brief A BufferBuilder subclass with convenience methods to append typed data template class ARROW_EXPORT TypedBufferBuilder : public BufferBuilder { public: diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 180b43a220f30..d0016674215fc 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -118,7 +118,8 @@ class ARROW_EXPORT ArrayBuilder { virtual Status FinishInternal(std::shared_ptr* out) = 0; /// \brief Return result of builder as an Array object. - /// Resets the builder except for DictionaryBuilder + /// + /// The builder is reset except for DictionaryBuilder. /// /// \param[out] out the finalized Array object /// \return Status diff --git a/cpp/src/arrow/memory_pool.h b/cpp/src/arrow/memory_pool.h index 49cd4c7efc3ed..8499b6f35d400 100644 --- a/cpp/src/arrow/memory_pool.h +++ b/cpp/src/arrow/memory_pool.h @@ -142,6 +142,7 @@ class ARROW_EXPORT ProxyMemoryPool : public MemoryPool { std::unique_ptr impl_; }; +/// Return the process-wide default memory pool. 
ARROW_EXPORT MemoryPool* default_memory_pool(); #ifdef ARROW_NO_DEFAULT_MEMORY_POOL diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index 9c478485b243c..6b5733252879b 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -85,7 +85,12 @@ class ARROW_EXPORT ChunkedArray { std::shared_ptr type() const { return type_; } + /// \brief Determine if two chunked arrays are equal. + /// + /// Two chunked arrays can be equal only if they have equal datatypes. + /// However, they may be equal even if they have different chunkings. bool Equals(const ChunkedArray& other) const; + /// \brief Determine if two chunked arrays are equal. bool Equals(const std::shared_ptr& other) const; protected: @@ -103,13 +108,26 @@ class ARROW_EXPORT ChunkedArray { /// metadata) and a chunked data array class ARROW_EXPORT Column { public: + /// \brief Construct a column from a vector of arrays + /// + /// The array chunks' datatype must match the field's datatype. Column(const std::shared_ptr& field, const ArrayVector& chunks); + /// \brief Construct a column from a chunked array + /// + /// The chunked array's datatype must match the field's datatype. Column(const std::shared_ptr& field, const std::shared_ptr& data); - + /// \brief Construct a column from a single array + /// + /// The array's datatype must match the field's datatype. Column(const std::shared_ptr& field, const std::shared_ptr& data); - // Construct from name and array + /// \brief Construct a column from a name and an array + /// + /// A field with the given name and the array's datatype is automatically created. Column(const std::string& name, const std::shared_ptr& data); + /// \brief Construct a column from a name and a chunked array + /// + /// A field with the given name and the array's datatype is automatically created. Column(const std::string& name, const std::shared_ptr& data); int64_t length() const { return data_->length(); } @@ -154,7 +172,12 @@ class ARROW_EXPORT Column { /// \param[out] out The resulting vector of arrays Status Flatten(MemoryPool* pool, std::vector>* out) const; + /// \brief Determine if two columns are equal. + /// + /// Two columns can be equal only if they have equal datatypes. + /// However, they may be equal even if they have different chunkings. bool Equals(const Column& other) const; + /// \brief Determine if the two columns are equal. bool Equals(const std::shared_ptr& other) const; /// \brief Verify that the column's array data is consistent with the passed @@ -214,11 +237,10 @@ class ARROW_EXPORT Table { const std::vector>& batches, std::shared_ptr
* table); - /// \return the table's schema + /// Return the table schema std::shared_ptr schema() const { return schema_; } - /// \param[in] i column index, does not boundscheck - /// \return the i-th column + /// Return a column by index virtual std::shared_ptr column(int i) const = 0; /// \brief Remove column from the table, producing a new Table @@ -250,13 +272,16 @@ class ARROW_EXPORT Table { /// \brief Perform any checks to validate the input arguments virtual Status Validate() const = 0; - /// \return the number of columns in the table + /// \brief Return the number of columns in the table int num_columns() const { return schema_->num_fields(); } - /// \return the number of rows (the corresponding length of each column) + /// \brief Return the number of rows (equal to each column's logical length) int64_t num_rows() const { return num_rows_; } - /// \brief Determine if semantic contents of tables are exactly equal + /// \brief Determine if tables are equal + /// + /// Two tables can be equal only if they have equal schemas. + /// However, they may be equal even if they have different chunkings. bool Equals(const Table& other) const; protected: @@ -269,18 +294,25 @@ class ARROW_EXPORT Table { ARROW_DISALLOW_COPY_AND_ASSIGN(Table); }; -/// \brief Compute a sequence of record batches from a (possibly chunked) Table +/// \brief Compute a stream of record batches from a (possibly chunked) Table +/// +/// The conversion is zero-copy: each record batch is a view over a slice +/// of the table's columns. class ARROW_EXPORT TableBatchReader : public RecordBatchReader { public: ~TableBatchReader() override; - /// \brief Read batches with the maximum possible size + /// \brief Construct a TableBatchReader for the given table explicit TableBatchReader(const Table& table); std::shared_ptr schema() const override; Status ReadNext(std::shared_ptr* out) override; + /// \brief Set the desired maximum chunk size of record batches + /// + /// The actual chunk size of each record batch may be smaller, depending + /// on actual chunking characteristics of each table column. void set_chunksize(int64_t chunksize); private: @@ -289,7 +321,10 @@ class ARROW_EXPORT TableBatchReader : public RecordBatchReader { }; /// \brief Construct table from multiple input tables. -/// \return Status, fails if any schemas are different +/// +/// The tables are concatenated vertically. Therefore, all tables should +/// have the same schema. Each column in the output table is the result +/// of concatenating the corresponding columns in all input tables. ARROW_EXPORT Status ConcatenateTables(const std::vector>& tables, std::shared_ptr
* table);
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index f187817b53f28..9694202b9705c 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -150,11 +150,12 @@ class ARROW_EXPORT DataType {
   explicit DataType(Type::type id) : id_(id) {}
   virtual ~DataType();

-  // Return whether the types are equal
-  //
-  // Types that are logically convertible from one to another (e.g. List
-  // and Binary) are NOT equal.
+  /// \brief Return whether the types are equal
+  ///
+  /// Types that are logically convertible from one to another (e.g. List
+  /// and Binary) are NOT equal.
   virtual bool Equals(const DataType& other) const;
+  /// \brief Return whether the types are equal
   bool Equals(const std::shared_ptr<DataType>& other) const;

   std::shared_ptr<Field> child(int i) const { return children_[i]; }
@@ -174,6 +175,7 @@ class ARROW_EXPORT DataType {
   /// \since 0.7.0
   virtual std::string name() const = 0;

+  /// \brief Return the type category
   Type::type id() const { return id_; }

  protected:
@@ -248,12 +250,16 @@ class ARROW_EXPORT Field {
                const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR)
       : name_(name), type_(type), nullable_(nullable), metadata_(metadata) {}

+  /// \brief Return the field's attached metadata
   std::shared_ptr<const KeyValueMetadata> metadata() const { return metadata_; }

+  /// \brief Return whether the field has non-empty metadata
   bool HasMetadata() const;

+  /// \brief Return a copy of this field with the given metadata attached to it
   std::shared_ptr<Field> AddMetadata(
       const std::shared_ptr<const KeyValueMetadata>& metadata) const;
+  /// \brief Return a copy of this field without any metadata attached to it
   std::shared_ptr<Field> RemoveMetadata() const;

   std::vector<std::shared_ptr<Field>> Flatten() const;
@@ -261,10 +267,14 @@ class ARROW_EXPORT Field {
   bool Equals(const Field& other) const;
   bool Equals(const std::shared_ptr<Field>& other) const;

+  /// \brief Return a string representation of the field
   std::string ToString() const;

+  /// \brief Return the field name
   const std::string& name() const { return name_; }
+  /// \brief Return the field data type
   std::shared_ptr<DataType> type() const { return type_; }
+  /// \brief Return whether the field is nullable
   bool nullable() const { return nullable_; }

  private:
@@ -896,6 +906,11 @@ dictionary(const std::shared_ptr<DataType>& index_type,

 /// @}

+/// \defgroup schema-factories Factory functions for fields and schemas
+///
+/// Factory functions for fields and schemas
+/// @{
+
 /// \brief Create a Field instance
 ///
 /// \param name the field name
@@ -926,6 +941,8 @@ std::shared_ptr<Schema> schema(
     std::vector<std::shared_ptr<Field>>&& fields,
     const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR);

+/// @}
+
 }  // namespace arrow

 #endif  // ARROW_TYPE_H
diff --git a/docs/source/cpp/api.rst b/docs/source/cpp/api.rst
index 02aa4d62e3b31..f6c0418b5c10d 100644
--- a/docs/source/cpp/api.rst
+++ b/docs/source/cpp/api.rst
@@ -27,3 +27,4 @@ API Reference
    api/datatype
    api/array
    api/builder
+   api/table
diff --git a/docs/source/cpp/api/datatype.rst b/docs/source/cpp/api/datatype.rst
index ee7844277df27..adfc6e4171e66 100644
--- a/docs/source/cpp/api/datatype.rst
+++ b/docs/source/cpp/api/datatype.rst
@@ -133,3 +133,16 @@ Dictionary-encoded

 .. doxygenclass:: arrow::DictionaryType
    :members:
+
+Fields and Schemas
+==================
+
+.. doxygengroup:: schema-factories
+   :project: arrow_cpp
+   :content-only:
+
+.. doxygenclass:: arrow::Field
+   :members:
+
+..
doxygenclass:: arrow::Schema + :members: diff --git a/docs/source/cpp/api/memory.rst b/docs/source/cpp/api/memory.rst index 1dc8e706d3e8d..c921229e6cb17 100644 --- a/docs/source/cpp/api/memory.rst +++ b/docs/source/cpp/api/memory.rst @@ -33,16 +33,11 @@ Buffers :project: arrow_cpp :members: -.. doxygenclass:: arrow::BufferBuilder - :project: arrow_cpp - :members: - Memory Pools ------------ .. doxygenfunction:: arrow::default_memory_pool :project: arrow_cpp - :outline: .. doxygenclass:: arrow::MemoryPool :project: arrow_cpp @@ -55,3 +50,41 @@ Memory Pools .. doxygenclass:: arrow::ProxyMemoryPool :project: arrow_cpp :members: + +Allocation Functions +-------------------- + +These functions allocate a buffer from a particular memory pool. + +.. doxygengroup:: buffer-allocation-functions + :project: arrow_cpp + :content-only: + +Slicing +------- + +.. doxygengroup:: buffer-slicing-functions + :project: arrow_cpp + :content-only: + +Buffer Builders +--------------- + +.. doxygenclass:: arrow::BufferBuilder + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::TypedBufferBuilder + :project: arrow_cpp + :members: + +STL Integration +--------------- + +.. doxygenclass:: arrow::stl_allocator + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::STLMemoryPool + :project: arrow_cpp + :members: diff --git a/docs/source/cpp/api/table.rst b/docs/source/cpp/api/table.rst new file mode 100644 index 0000000000000..e8b4f8e066e30 --- /dev/null +++ b/docs/source/cpp/api/table.rst @@ -0,0 +1,52 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +======================== +Two-dimensional Datasets +======================== + +Columns +======= + +.. doxygenclass:: arrow::Column + :project: arrow_cpp + :members: + +Tables +====== + +.. doxygenclass:: arrow::Table + :project: arrow_cpp + :members: + +.. doxygenfunction:: arrow::ConcatenateTables + :project: arrow_cpp + +Record Batches +============== + +.. doxygenclass:: arrow::RecordBatch + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::RecordBatchReader + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::TableBatchReader + :project: arrow_cpp + :members: diff --git a/docs/source/cpp/getting_started.rst b/docs/source/cpp/getting_started.rst index 8201c2ded0d92..7c55b76912d1b 100644 --- a/docs/source/cpp/getting_started.rst +++ b/docs/source/cpp/getting_started.rst @@ -25,6 +25,7 @@ Getting Started overview conventions + memory arrays datatypes - + tables diff --git a/docs/source/cpp/index.rst b/docs/source/cpp/index.rst index 8c7ced0c2e7b8..63290be9ecb42 100644 --- a/docs/source/cpp/index.rst +++ b/docs/source/cpp/index.rst @@ -23,3 +23,9 @@ C++ Implementation getting_started api + +.. TODO add "topics" chapter +.. - nested arrays +.. - dictionary encoding + +.. 
TODO add "building" or "development" chapter
diff --git a/docs/source/cpp/memory.rst b/docs/source/cpp/memory.rst
new file mode 100644
index 0000000000000..23b4725e4b971
--- /dev/null
+++ b/docs/source/cpp/memory.rst
@@ -0,0 +1,127 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at

+..   http://www.apache.org/licenses/LICENSE-2.0

+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+=================
+Memory Management
+=================
+
+Buffers
+=======
+
+To avoid passing around raw data pointers with varying and non-obvious
+lifetime rules, Arrow provides a generic abstraction called :class:`arrow::Buffer`.
+A Buffer encapsulates a pointer and data size, and generally also ties its
+lifetime to that of an underlying provider (in other words, a Buffer should
+*always* point to valid memory until its destruction). Buffers are untyped:
+they simply denote a physical memory area regardless of its intended meaning
+or interpretation.
+
+Buffers may be allocated by Arrow itself, or by third-party routines.
+For example, it is possible to pass the data of a Python bytestring as an Arrow
+buffer, keeping the Python object alive as necessary.
+
+In addition, buffers come in various flavours: mutable or not, resizable or
+not. Generally, you will hold a mutable buffer when building up a piece
+of data, then it will be frozen as an immutable container such as an
+:doc:`array <arrays>`.
+
+.. note::
+   Some buffers may point to non-CPU memory, such as GPU-backed memory
+   provided by a CUDA context. If you're writing a GPU-aware application,
+   you will need to be careful not to interpret a GPU memory pointer as
+   a CPU-reachable pointer, or vice-versa.
+
+Accessing Buffer Memory
+-----------------------
+
+Buffers provide fast access to the underlying memory using the
+:func:`~arrow::Buffer::size` and :func:`~arrow::Buffer::data` accessors
+(or :func:`~arrow::Buffer::mutable_data` for writable access to a mutable
+buffer).
+
+Slicing
+-------
+
+It is possible to make zero-copy slices of buffers, to obtain a buffer
+referring to some contiguous subset of the underlying data. This is done
+by calling the :func:`arrow::SliceBuffer` and :func:`arrow::SliceMutableBuffer`
+functions.
+
+Allocating a Buffer
+-------------------
+
+You can allocate a buffer yourself by calling one of the
+:func:`arrow::AllocateBuffer` or :func:`arrow::AllocateResizableBuffer`
+overloads::
+
+   std::shared_ptr<arrow::Buffer> buffer;
+
+   if (!arrow::AllocateBuffer(4096, &buffer).ok()) {
+      // ... handle allocation error
+   }
+   uint8_t* buffer_data = buffer->mutable_data();
+   memcpy(buffer_data, "hello world", 11);
+
+Allocating a buffer this way ensures it is 64-byte aligned and padded
+as recommended by the :doc:`Arrow memory specification <../format/Layout>`.
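+
+A resizable buffer can also be grown in place after allocation. The following
+sketch assumes the pool-less :func:`arrow::AllocateResizableBuffer` overload
+and the :func:`~arrow::ResizableBuffer::Resize` method::
+
+   std::shared_ptr<arrow::ResizableBuffer> buffer;
+
+   if (!arrow::AllocateResizableBuffer(4096, &buffer).ok()) {
+      // ... handle allocation error
+   }
+   uint8_t* buffer_data = buffer->mutable_data();
+   memcpy(buffer_data, "hello world", 11);
+
+   // Grow the buffer in place; existing bytes are preserved, but the data
+   // pointer may change, so it must be re-fetched after resizing.
+   if (!buffer->Resize(8192).ok()) {
+      // ... handle reallocation error
+   }
+   buffer_data = buffer->mutable_data();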
+
+Building a Buffer
+-----------------
+
+You can also allocate *and* build a Buffer incrementally, using the
+:class:`arrow::BufferBuilder` API::
+
+   BufferBuilder builder;
+   builder.Resize(11);
+   builder.Append("hello ", 6);
+   builder.Append("world", 5);
+
+   std::shared_ptr<arrow::Buffer> buffer;
+   if (!builder.Finish(&buffer).ok()) {
+      // ... handle buffer allocation error
+   }
+
+Memory Pools
+============
+
+When allocating a Buffer using the Arrow C++ API, the buffer's underlying
+memory is allocated by a :class:`arrow::MemoryPool` instance. Usually this
+will be the process-wide *default memory pool*, but many Arrow APIs allow
+you to pass another MemoryPool instance for their internal allocations.
+
+Memory pools are used for large long-lived data such as array buffers.
+Other data, such as small C++ objects and temporary workspaces, usually
+goes through the regular C++ allocators.
+
+Default Memory Pool
+-------------------
+
+Depending on how Arrow was compiled, the default memory pool may use the
+standard C ``malloc`` allocator, or a `jemalloc <http://jemalloc.net/>`_ heap.
+
+STL Integration
+---------------
+
+If you wish to use an Arrow memory pool to allocate the data of STL containers,
+you can do so using the :class:`arrow::stl_allocator` wrapper.
+
+Conversely, you can also use an STL allocator to allocate Arrow memory,
+using the :class:`arrow::STLMemoryPool` class. However, this may be less
+performant, as STL allocators don't provide a resizing operation.
diff --git a/docs/source/cpp/tables.rst b/docs/source/cpp/tables.rst
new file mode 100644
index 0000000000000..d42f0c6c4f53e
--- /dev/null
+++ b/docs/source/cpp/tables.rst
@@ -0,0 +1,87 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at

+..   http://www.apache.org/licenses/LICENSE-2.0

+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+========================
+Two-dimensional Datasets
+========================
+
+While arrays and chunked arrays represent a one-dimensional sequence of
+homogeneous values, data often comes in the form of two-dimensional sets of
+heterogeneous data (such as database tables, CSV files...). Arrow provides
+several abstractions to handle such data conveniently and efficiently.
+
+Fields
+======
+
+Fields are used to denote the particular columns of a table (and also
+the particular members of a nested data type such as :class:`arrow::StructType`).
+A field, i.e. an instance of :class:`arrow::Field`, holds together a data
+type, a field name and some optional metadata.
+
+The recommended way to create a field is to call the :func:`arrow::field`
+factory function.
+
+Schemas
+=======
+
+A schema describes the overall structure of a two-dimensional dataset such
+as a table. It holds a sequence of fields together with some optional
+schema-wide metadata (in addition to per-field metadata).
The recommended
+way to create a schema is to call one of the :func:`arrow::schema` factory
+function overloads::
+
+   // Create a schema describing datasets with two columns:
+   // an int32 column "A" and a utf8-encoded string column "B"
+   std::shared_ptr<arrow::Field> field_a, field_b;
+   std::shared_ptr<arrow::Schema> schema;
+
+   field_a = arrow::field("A", arrow::int32());
+   field_b = arrow::field("B", arrow::utf8());
+   schema = arrow::schema({field_a, field_b});
+
+Columns
+=======
+
+A :class:`arrow::Column` is a chunked array tied together with a field.
+The field describes the column's name (for lookup in a larger dataset)
+and its metadata.
+
+Tables
+======
+
+A :class:`arrow::Table` is a two-dimensional dataset of a number of columns,
+together with a schema. The columns' names and types must match the schema.
+Also, each column must have the same logical length in number of elements
+(although each column can be chunked in a different way).
+
+Record Batches
+==============
+
+A :class:`arrow::RecordBatch` is a two-dimensional dataset of a number of
+contiguous arrays, each the same length. Like a table, a record batch also
+has a schema which must match its arrays' datatypes.
+
+Record batches are a convenient unit of work for various serialization
+and computation functions, possibly incremental.
+
+A table can be streamed as an arbitrary number of record batches using
+a :class:`arrow::TableBatchReader`. Conversely, a logical sequence of
+record batches can be assembled to form a table using one of the
+:func:`arrow::Table::FromRecordBatches` factory function overloads.

From 7ddfba6693db99ec8ea38b6fd244c5d6e2af3295 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?=
Date: Wed, 12 Dec 2018 19:54:45 +0100
Subject: [PATCH 31/45] ARROW-3470: [C++] Fix row-wise example
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Implement the `ADD_EXAMPLE` cmake function with new ctest label
  `example`, also covered by the `runexample` target. This can be toggled
  via the `ARROW_BUILD_EXAMPLES` option which is ON by default.
- Implement fully working `row-wise-conversion-example.cc` and add it to
  the default build.
- Update documentation to embed (manually) the newly created example.
Author: François Saint-Jacques Closes #3078 from fsaintjacques/ARROW-3470-out-of-date-example and squashes the following commits: fab63f6f ARROW-3470: Fix status macro 1eba067d ARROW-3470: Fix row-wise example --- ci/appveyor-cpp-build.bat | 3 + ci/appveyor-cpp-test-cmake-script.bat | 8 + ci/cpp-msvc-build-main.bat | 1 + ci/travis_before_script_cpp.sh | 1 + cpp/CMakeLists.txt | 15 ++ cpp/apidoc/tutorials/row_wise_conversion.md | 194 ------------------ cpp/cmake_modules/BuildUtils.cmake | 60 ++++++ cpp/examples/arrow/CMakeLists.txt | 18 ++ .../arrow/row-wise-conversion-example.cc | 190 +++++++++++++++++ cpp/src/arrow/status.h | 4 +- docs/source/cpp/examples.rst | 30 +++ docs/source/cpp/index.rst | 1 + 12 files changed, 329 insertions(+), 196 deletions(-) delete mode 100644 cpp/apidoc/tutorials/row_wise_conversion.md create mode 100644 cpp/examples/arrow/CMakeLists.txt create mode 100644 cpp/examples/arrow/row-wise-conversion-example.cc create mode 100644 docs/source/cpp/examples.rst diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index d20a0214f532c..387dd55d18545 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -35,6 +35,7 @@ if "%JOB%" == "Static_Crt_Build" ( -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=Debug ^ -DARROW_TEST_LINKAGE=static ^ -DARROW_CXXFLAGS="/MP" ^ @@ -53,6 +54,7 @@ if "%JOB%" == "Static_Crt_Build" ( -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=Release ^ -DARROW_TEST_LINKAGE=static ^ -DCMAKE_CXX_FLAGS_RELEASE="/MT %CMAKE_CXX_FLAGS_RELEASE%" ^ @@ -79,6 +81,7 @@ if "%JOB%" == "Build_Debug" ( -DARROW_VERBOSE_THIRDPARTY_BUILD=OFF ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_BUILD_STATIC=OFF ^ -DARROW_CXXFLAGS="/MP" ^ diff --git a/ci/appveyor-cpp-test-cmake-script.bat b/ci/appveyor-cpp-test-cmake-script.bat index 8158a44260235..415406c4ac366 100644 --- a/ci/appveyor-cpp-test-cmake-script.bat +++ b/ci/appveyor-cpp-test-cmake-script.bat @@ -33,6 +33,7 @@ set FLATBUFFERS_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -51,6 +52,7 @@ set GFLAGS_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -69,6 +71,7 @@ set SNAPPY_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -87,6 +90,7 @@ set ZLIB_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -105,6 +109,7 @@ set BROTLI_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. 
>nul 2>error.txt @@ -123,6 +128,7 @@ set LZ4_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -141,6 +147,7 @@ set ZSTD_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -166,6 +173,7 @@ set ARROW_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. 2>output.txt diff --git a/ci/cpp-msvc-build-main.bat b/ci/cpp-msvc-build-main.bat index 560f5045af658..644170775d568 100644 --- a/ci/cpp-msvc-build-main.bat +++ b/ci/cpp-msvc-build-main.bat @@ -49,6 +49,7 @@ cmake -G "%GENERATOR%" %CMAKE_ARGS% ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_BUILD_STATIC=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DARROW_CXXFLAGS="%ARROW_CXXFLAGS%" ^ -DCMAKE_CXX_FLAGS_RELEASE="/MD %CMAKE_CXX_FLAGS_RELEASE%" ^ -DARROW_PARQUET=ON ^ diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 6465f28008006..a77fcd8749de5 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -67,6 +67,7 @@ else $CMAKE_COMMON_FLAGS \ -DARROW_BUILD_BENCHMARKS=ON \ -DARROW_BUILD_TESTS=ON \ +-DARROW_BUILD_EXAMPLES=ON \ -DARROW_BUILD_UTILITIES=OFF \ -DARROW_INSTALL_NAME_RPATH=OFF" fi diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 35707de574648..a83b9dd6d9409 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -122,6 +122,10 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Build the Arrow micro benchmarks, default OFF" OFF) + option(ARROW_BUILD_EXAMPLES + "Build the Arrow examples, default OFF" + OFF) + set(ARROW_TEST_LINKAGE "shared" CACHE STRING "Linkage of Arrow libraries with unit tests executables. 
\ static|shared (default shared)") @@ -447,6 +451,10 @@ if(NOT ARROW_BUILD_BENCHMARKS) set(NO_BENCHMARKS 1) endif() +if(NOT ARROW_BUILD_EXAMPLES) + set(NO_EXAMPLES 1) +endif() + if (NOT ARROW_FUZZING) set(NO_FUZZING 1) endif() @@ -735,12 +743,14 @@ pass ARROW_BUILD_SHARED=on") endif() # Use shared linking for unit tests if it's available set(ARROW_TEST_LINK_LIBS ${ARROW_TEST_SHARED_LINK_LIBS}) + set(ARROW_EXAMPLE_LINK_LIBS arrow_shared) else() if (NOT ARROW_BUILD_STATIC) message(FATAL_ERROR "If using static linkage for unit tests, must also \ pass ARROW_BUILD_STATIC=on") endif() set(ARROW_TEST_LINK_LIBS ${ARROW_TEST_STATIC_LINK_LIBS}) + set(ARROW_EXAMPLE_LINK_LIBS arrow_static) endif() if (ARROW_BUILD_BENCHMARKS) @@ -805,6 +815,11 @@ if(ARROW_GANDIVA) add_subdirectory(src/gandiva) endif() +if(ARROW_BUILD_EXAMPLES) + add_custom_target(runexample ctest -L example) + add_subdirectory(examples/arrow) +endif() + include(CMakePackageConfigHelpers) # Makes the project importable from the build directory diff --git a/cpp/apidoc/tutorials/row_wise_conversion.md b/cpp/apidoc/tutorials/row_wise_conversion.md deleted file mode 100644 index 750a923c7846b..0000000000000 --- a/cpp/apidoc/tutorials/row_wise_conversion.md +++ /dev/null @@ -1,194 +0,0 @@ - - -Convert a vector of row-wise data into an Arrow table -===================================================== - -While we want to use columnar data structures to build efficient operations, we -often receive data in a row-wise fashion from other systems. In the following, -we want give a brief introduction into the classes provided by Apache Arrow by -showing how to transform row-wise data into a columnar table. - -The data in this example is stored in the following struct: - -``` -struct data_row { - int64_t id; - double cost; - std::vector cost_components; -}; - -std::vector rows; -``` - -The final representation should be an `arrow::Table` which in turn is made up of -an `arrow::Schema` and a list of `arrow::Column`. An `arrow::Column` is again a -named collection of one or more `arrow::Array` instances. As the first step, we -will iterate over the data and build up the arrays incrementally. For this task, -we provide `arrow::ArrayBuilder` classes that help in the construction of the -final `arrow::Array` instances. - -For each type, Arrow has a specially typed builder class. For the primitive -values `id` and `cost` we can use the respective `arrow::Int64Builder` and -`arrow::DoubleBuilder`. For the `cost_components` vector, we need to have two -builders, a top-level `arrow::ListBuilder` that builds the array of offsets and -a nested `arrow::DoubleBuilder` that constructs the underlying values array that -is referenced by the offsets in the former array. - -``` -// The builders are more efficient using -// arrow::jemalloc::MemoryPool::default_pool() as this can increase the size of -// the underlying memory regions in-place. At the moment, arrow::jemalloc is only -// supported on Unix systems, not Windows. - -using arrow::DoubleBuilder; -using arrow::Int64Builder; -using arrow::ListBuilder; - -arrow::MemoryPool* pool = arrow::default_memory_pool(); -Int64Builder id_builder(pool); -DoubleBuilder cost_builder(pool); -std::unique_ptr components_values_builder(new DoubleBuilder(pool)); -ListBuilder components_builder(pool, std::move(components_values_builder)); -``` - -Now we can loop over our existing data and insert it into the builders. The -`Append` calls here may fail (e.g. we cannot allocate enough additional memory). 
-Thus we need to check their return values. For more information on these values, -check the documentation about `arrow::Status`. - -``` -for (const data_row& row : rows) { - ARROW_RETURN_NOT_OK(id_builder.Append(row.id)); - ARROW_RETURN_NOT_OK(cost_builder.Append(row.cost)); - - // Indicate the start of a new list row. This will memorise the current - // offset in the values builder. - ARROW_RETURN_NOT_OK(components_builder.Append()); - // Store the actual values. The final nullptr argument tells the underyling - // builder that all added values are valid, i.e. non-null. - ARROW_RETURN_NOT_OK(components_values_builder->Append( - row.cost_components.data(), row.cost_components.size(), - nullptr); -} -``` - -At the end, we finalise the arrays, declare the (type) schema and combine them - into a single `arrow::Table`: - -``` -std::shared_ptr id_array; -ARROW_RETURN_NOT_OK(id_builder.Finish(&id_array)); -std::shared_ptr cost_array; -ARROW_RETURN_NOT_OK(cost_builder.Finish(&cost_array)); -std::shared_ptr cost_components_array; -ARROW_RETURN_NOT_OK(components_builder.Finish(&cost_components_array)); - -std::vector> schema_vector = { - arrow::field("id", arrow::int64()), - arrow::field("cost", arrow::float64()), - arrow::field("cost_components", arrow::list(arrow::float64())) -}; -auto schema = std::make_shared(schema_vector); - -std::shared_ptr table = arrow::Table::Make(schema, - {id_array, cost_array, cost_components_array}); -``` - -The final `table` variable is the one we then can pass on to other functions -that can consume Apache Arrow memory structures. This object has ownership of -all referenced data, thus we don't have to care about undefined references once -we leave the scope of the function building the table and its underlying arrays. - - - -Converting an Arrow Table back into row-wise representation -=========================================================== - -To convert an Arrow table back into the same row-wise representation as in the -above section, we first will check that the table conforms to our expected -schema and then will build up the vector of rows incrementally. - -For the check if the table is as expected, we can utilise solely its schema. - -``` -// This is our input that was passed in from the outside. -std::shared_ptr table; - -std::vector> schema_vector = { - arrow::field("id", arrow::int64()), - arrow::field("cost", arrow::float64()), - arrow::field("cost_components", arrow::list(arrow::float64())) -}; -auto expected_schema = std::make_shared(schema_vector); - -if (!expected_schema->Equals(*table->schema())) { - // The table doesn't have the expected schema thus we cannot directly - // convert it to our target representation. - // TODO: Implement your custom error handling logic here. -} -``` - -As we have ensured that the table has the expected structure, we can unpack the -underlying arrays. For the primitive columns `id` and `cost` we can use the high -level functions to get the values whereas for the nested column -`cost_components` we need to access the C-pointer to the data to copy its -contents into the resulting `std::vector`. Here we need to be care to -also add the offset to the pointer. This offset is needed to enable zero-copy -slicing operations. While this could be adjusted automatically for double -arrays, this cannot be done for the accompanying bitmap as often the slicing -border would be inside a byte. - -``` -// For simplicity, we assume that all arrays consist of a single chunk here. 
-// In a productive implementation this should either be explicitly check or code
-// added that can treat chunked arrays.
-
-auto ids = std::static_pointer_cast<arrow::Int64Array>(
-    table->column(0)->data()->chunk(0));
-auto costs = std::static_pointer_cast<arrow::DoubleArray>(
-    table->column(1)->data()->chunk(0));
-auto cost_components = std::static_pointer_cast<arrow::ListArray>(
-    table->column(2)->data()->chunk(0));
-auto cost_components_values = std::static_pointer_cast<arrow::DoubleArray>(
-    cost_components->values());
-// To enable zero-copy slices, the native values pointer might need to account
-// for this slicing offset. This is not needed for the higher level functions
-// like Value(…) that already account for this offset internally.
-const double* cost_components_values_ptr = cost_components_values->data()
-    + cost_components_values->offset();
-```
-
-After we have unpacked the arrays from the table, we can iterate over them in a
-row-wise fashion and fill our target, row-wise representation.
-
-```
-std::vector<data_row> rows;
-
-for (int64_t i = 0; i < table->num_rows(); i++) {
-    // Another simplification in this example is that we assume that there are
-    // no null entries, e.g. each row is fill with valid values.
-    int64_t id = ids->Value(i);
-    double cost = costs->Value(i);
-    const double* first = cost_components_values_ptr + cost_components->value_offset(i);
-    const double* last = cost_components_values_ptr + cost_components->value_offset(i + 1);
-    std::vector<double> components_vec(first, last);
-    rows.push_back({id, cost, components_vec});
-}
-```
diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake
index bcf672823b424..d5978e1d215ff 100644
--- a/cpp/cmake_modules/BuildUtils.cmake
+++ b/cpp/cmake_modules/BuildUtils.cmake
@@ -480,6 +480,66 @@ function(ADD_TEST_CASE REL_TEST_NAME)
     LABELS ${ARG_LABELS})
 endfunction()

+############################################################
+# Examples
+############################################################
+# Add a new example, with or without an executable that should be built.
+# If examples are enabled then they will be run alongside unit tests with ctest.
+# 'make runexample' to build/run only examples.
+#
+# REL_EXAMPLE_NAME is the name of the example app. It may be a single component
+# (e.g. monotime-example) or contain additional components (e.g.
+# net/net_util-example). Either way, the last component must be a globally
+# unique name.
+
+# The example will be registered as a unit test with ctest with a label
+# of 'example'.
+#
+# Arguments after the test name will be passed to set_tests_properties().
+#
+# \arg PREFIX a string to prepend to the name of the example executable. For
+# example, if you have src/arrow/foo/bar-example.cc, then PREFIX "foo" will
+# create test executable foo-bar-example
+function(ADD_ARROW_EXAMPLE REL_EXAMPLE_NAME)
+  set(options)
+  set(one_value_args)
+  set(multi_value_args EXTRA_LINK_LIBS DEPENDENCIES PREFIX)
+  cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN})
+  if(ARG_UNPARSED_ARGUMENTS)
+    message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}")
+  endif()
+
+  if(NO_EXAMPLES)
+    return()
+  endif()
+  get_filename_component(EXAMPLE_NAME ${REL_EXAMPLE_NAME} NAME_WE)
+
+  if(ARG_PREFIX)
+    set(EXAMPLE_NAME "${ARG_PREFIX}-${EXAMPLE_NAME}")
+  endif()
+
+  if(EXISTS ${CMAKE_SOURCE_DIR}/examples/arrow/${REL_EXAMPLE_NAME}.cc)
+    # This example has a corresponding .cc file, set it up as an executable.
+    set(EXAMPLE_PATH "${EXECUTABLE_OUTPUT_PATH}/${EXAMPLE_NAME}")
+    add_executable(${EXAMPLE_NAME} "${REL_EXAMPLE_NAME}.cc")
+    target_link_libraries(${EXAMPLE_NAME} ${ARROW_EXAMPLE_LINK_LIBS})
+    add_dependencies(runexample ${EXAMPLE_NAME})
+    set(NO_COLOR "--color_print=false")
+
+    if (ARG_EXTRA_LINK_LIBS)
+      target_link_libraries(${EXAMPLE_NAME} ${ARG_EXTRA_LINK_LIBS})
+    endif()
+  endif()
+
+  if (ARG_DEPENDENCIES)
+    add_dependencies(${EXAMPLE_NAME} ${ARG_DEPENDENCIES})
+  endif()
+
+
+  add_test(${EXAMPLE_NAME} ${EXAMPLE_PATH})
+  set_tests_properties(${EXAMPLE_NAME} PROPERTIES LABELS "example")
+endfunction()
+
 ############################################################
 # Fuzzing
 ############################################################
diff --git a/cpp/examples/arrow/CMakeLists.txt b/cpp/examples/arrow/CMakeLists.txt
new file mode 100644
index 0000000000000..6ecb537ad9787
--- /dev/null
+++ b/cpp/examples/arrow/CMakeLists.txt
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+ADD_ARROW_EXAMPLE(row-wise-conversion-example)
diff --git a/cpp/examples/arrow/row-wise-conversion-example.cc b/cpp/examples/arrow/row-wise-conversion-example.cc
new file mode 100644
index 0000000000000..db8c28753dbe6
--- /dev/null
+++ b/cpp/examples/arrow/row-wise-conversion-example.cc
@@ -0,0 +1,190 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <iostream>
+#include <vector>
+
+#include <arrow/api.h>
+
+using arrow::DoubleBuilder;
+using arrow::Int64Builder;
+using arrow::ListBuilder;
+
+// While we want to use columnar data structures to build efficient operations, we
+// often receive data in a row-wise fashion from other systems. In the following,
+// we want to give a brief introduction into the classes provided by Apache Arrow by
+// showing how to transform row-wise data into a columnar table.
+//
+// The data in this example is stored in the following struct:
+struct data_row {
+  int64_t id;
+  double cost;
+  std::vector<double> cost_components;
+};
+
+// Transforming a vector of structs into a columnar Table.
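+//
+// The conversion function below returns arrow::Status rather than aborting on
+// error, so that allocation failures inside the builders can be propagated to
+// the caller through the ARROW_RETURN_NOT_OK macro.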
+//
+// The final representation should be an `arrow::Table` which in turn is made up of
+// an `arrow::Schema` and a list of `arrow::Column`. An `arrow::Column` is again a
+// named collection of one or more `arrow::Array` instances. As the first step, we
+// will iterate over the data and build up the arrays incrementally. For this task,
+// we provide `arrow::ArrayBuilder` classes that help in the construction of the
+// final `arrow::Array` instances.
+//
+// For each type, Arrow has a specially typed builder class. For the primitive
+// values `id` and `cost` we can use the respective `arrow::Int64Builder` and
+// `arrow::DoubleBuilder`. For the `cost_components` vector, we need to have two
+// builders, a top-level `arrow::ListBuilder` that builds the array of offsets and
+// a nested `arrow::DoubleBuilder` that constructs the underlying values array that
+// is referenced by the offsets in the former array.
+arrow::Status VectorToColumnarTable(const std::vector<data_row>& rows,
+                                    std::shared_ptr<arrow::Table>* table) {
+  // The builders are more efficient using
+  // arrow::jemalloc::MemoryPool::default_pool() as this can increase the size of
+  // the underlying memory regions in-place. At the moment, arrow::jemalloc is only
+  // supported on Unix systems, not Windows.
+  arrow::MemoryPool* pool = arrow::default_memory_pool();
+
+  Int64Builder id_builder(pool);
+  DoubleBuilder cost_builder(pool);
+  ListBuilder components_builder(pool, std::make_shared<DoubleBuilder>(pool));
+  // The following builder is owned by components_builder.
+  DoubleBuilder& cost_components_builder =
+      *(static_cast<DoubleBuilder*>(components_builder.value_builder()));
+
+  // Now we can loop over our existing data and insert it into the builders. The
+  // `Append` calls here may fail (e.g. we cannot allocate enough additional memory).
+  // Thus we need to check their return values. For more information on these values,
+  // check the documentation about `arrow::Status`.
+  for (const data_row& row : rows) {
+    ARROW_RETURN_NOT_OK(id_builder.Append(row.id));
+    ARROW_RETURN_NOT_OK(cost_builder.Append(row.cost));
+
+    // Indicate the start of a new list row. This will memorise the current
+    // offset in the values builder.
+    ARROW_RETURN_NOT_OK(components_builder.Append());
+    // Store the actual values. Since the valid_bytes argument is omitted, the
+    // underlying builder treats all added values as valid, i.e. non-null.
+    ARROW_RETURN_NOT_OK(cost_components_builder.AppendValues(row.cost_components.data(),
+                                                             row.cost_components.size()));
+  }
+
+  // At the end, we finalise the arrays, declare the (type) schema and combine them
+  // into a single `arrow::Table`:
+  std::shared_ptr<arrow::Array> id_array;
+  ARROW_RETURN_NOT_OK(id_builder.Finish(&id_array));
+  std::shared_ptr<arrow::Array> cost_array;
+  ARROW_RETURN_NOT_OK(cost_builder.Finish(&cost_array));
+  // No need to invoke cost_components_builder.Finish because it is implied by
+  // the parent builder's Finish invocation.
+  std::shared_ptr<arrow::Array> cost_components_array;
+  ARROW_RETURN_NOT_OK(components_builder.Finish(&cost_components_array));
+
+  std::vector<std::shared_ptr<arrow::Field>> schema_vector = {
+      arrow::field("id", arrow::int64()), arrow::field("cost", arrow::float64()),
+      arrow::field("cost_components", arrow::list(arrow::float64()))};
+
+  auto schema = std::make_shared<arrow::Schema>(schema_vector);
+
+  // The final `table` variable is the one we then can pass on to other functions
+  // that can consume Apache Arrow memory structures.
+  // This object has ownership of all referenced data, thus we don't have to
+  // care about undefined references once we leave the scope of the function
+  // building the table and its underlying arrays.
+  *table = arrow::Table::Make(schema, {id_array, cost_array, cost_components_array});
+
+  return arrow::Status::OK();
+}
+
+arrow::Status ColumnarTableToVector(const std::shared_ptr<arrow::Table>& table,
+                                    std::vector<data_row>* rows) {
+  // To convert an Arrow table back into the same row-wise representation as in the
+  // above section, we first will check that the table conforms to our expected
+  // schema and then will build up the vector of rows incrementally.
+  //
+  // For the check if the table is as expected, we can utilise solely its schema.
+  std::vector<std::shared_ptr<arrow::Field>> schema_vector = {
+      arrow::field("id", arrow::int64()), arrow::field("cost", arrow::float64()),
+      arrow::field("cost_components", arrow::list(arrow::float64()))};
+  auto expected_schema = std::make_shared<arrow::Schema>(schema_vector);
+
+  if (!expected_schema->Equals(*table->schema())) {
+    // The table doesn't have the expected schema thus we cannot directly
+    // convert it to our target representation.
+    return arrow::Status::Invalid("Schemas are not matching!");
+  }
+
+  // As we have ensured that the table has the expected structure, we can unpack the
+  // underlying arrays. For the primitive columns `id` and `cost` we can use the high
+  // level functions to get the values whereas for the nested column
+  // `cost_components` we need to access the C-pointer to the data to copy its
+  // contents into the resulting `std::vector<double>`. Here we need to be careful to
+  // also add the offset to the pointer. This offset is needed to enable zero-copy
+  // slicing operations. While this could be adjusted automatically for double
+  // arrays, this cannot be done for the accompanying bitmap as often the slicing
+  // border would be inside a byte.
+
+  auto ids =
+      std::static_pointer_cast<arrow::Int64Array>(table->column(0)->data()->chunk(0));
+  auto costs =
+      std::static_pointer_cast<arrow::DoubleArray>(table->column(1)->data()->chunk(0));
+  auto cost_components =
+      std::static_pointer_cast<arrow::ListArray>(table->column(2)->data()->chunk(0));
+  auto cost_components_values =
+      std::static_pointer_cast<arrow::DoubleArray>(cost_components->values());
+  // To enable zero-copy slices, the native values pointer might need to account
+  // for this slicing offset. This is not needed for the higher level functions
+  // like Value(…) that already account for this offset internally.
+  const double* ccv_ptr = cost_components_values->data()->GetValues<double>(1);
+
+  for (int64_t i = 0; i < table->num_rows(); i++) {
+    // Another simplification in this example is that we assume that there are
+    // no null entries, i.e. each row is filled with valid values.
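+    // (In production code one would typically consult the validity bitmaps
+    // first, e.g. via ids->IsNull(i) or costs->IsNull(i), before reading the
+    // values below.)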
+    int64_t id = ids->Value(i);
+    double cost = costs->Value(i);
+    const double* first = ccv_ptr + cost_components->value_offset(i);
+    const double* last = ccv_ptr + cost_components->value_offset(i + 1);
+    std::vector<double> components_vec(first, last);
+    rows->push_back({id, cost, components_vec});
+  }
+
+  return arrow::Status::OK();
+}
+
+#define EXIT_ON_FAILURE(expr)                      \
+  do {                                             \
+    arrow::Status status_ = (expr);                \
+    if (!status_.ok()) {                           \
+      std::cerr << status_.message() << std::endl; \
+      return EXIT_FAILURE;                         \
+    }                                              \
+  } while (0);
+
+int main(int argc, char** argv) {
+  std::vector<data_row> rows = {
+      {1, 1.0, {1.0}}, {2, 2.0, {1.0, 2.0}}, {3, 3.0, {1.0, 2.0, 3.0}}};
+
+  std::shared_ptr<arrow::Table> table;
+  EXIT_ON_FAILURE(VectorToColumnarTable(rows, &table));
+
+  std::vector<data_row> expected_rows;
+  EXIT_ON_FAILURE(ColumnarTableToVector(table, &expected_rows));
+
+  assert(rows.size() == expected_rows.size());
+
+  return EXIT_SUCCESS;
+}
diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h
index ddf3d7ee0e644..e3632a6d5f62e 100644
--- a/cpp/src/arrow/status.h
+++ b/cpp/src/arrow/status.h
@@ -36,7 +36,7 @@
     if (ARROW_PREDICT_FALSE(!_s.ok())) {                                    \
       std::stringstream ss;                                                 \
       ss << __FILE__ << ":" << __LINE__ << " code: " << #s << "\n" << _s.message(); \
-      return Status(_s.code(), ss.str());                                   \
+      return ::arrow::Status(_s.code(), ss.str());                          \
     }                                                                       \
   } while (0)
@@ -69,7 +69,7 @@
       std::stringstream ss;                                                 \
       ss << __FILE__ << ":" << __LINE__ << " code: " << _status.CodeAsString() << " \n " \
          << _status.message();                                              \
-      return Status(_status.code(), ss.str());                              \
+      return ::arrow::Status(_status.code(), ss.str());                     \
     }                                                                       \
   } while (0)
diff --git a/docs/source/cpp/examples.rst b/docs/source/cpp/examples.rst
new file mode 100644
index 0000000000000..5f4372fbba2f2
--- /dev/null
+++ b/docs/source/cpp/examples.rst
@@ -0,0 +1,30 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at

+..   http://www.apache.org/licenses/LICENSE-2.0

+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+Examples
+========
+
+Row to columnar conversion
+--------------------------
+
+The following example converts an array of structs to a :class:`arrow::Table`
+instance, and then converts it back to the original array of structs.
+
+.. literalinclude:: ../../../cpp/examples/arrow/row-wise-conversion-example.cc
diff --git a/docs/source/cpp/index.rst b/docs/source/cpp/index.rst
index 63290be9ecb42..1d70e6acbf0ce 100644
--- a/docs/source/cpp/index.rst
+++ b/docs/source/cpp/index.rst
@@ -22,6 +22,7 @@ C++ Implementation
    :maxdepth: 2

    getting_started
+   examples
    api

 ..
TODO add "topics" chapter From 45940410e6cb88809338a8fb7bf6b50046fe77fe Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 12 Dec 2018 16:11:11 -0600 Subject: [PATCH 32/45] ARROW-4008: [C++] Restore ARROW_BUILD_UTILITIES to fix integration tests In recent refactoring, ARROW_BUILD_UTILITIES got accidentally set to OFF Author: Wes McKinney Closes #3166 from wesm/ARROW-4008 and squashes the following commits: 105651722 Only add json-integration-test dependency when it is built 96bec050d Actually build utilities 02fd08ff6 Add integration target as dependency of arrow target 8c9fcf809 Do not write integration test files to /tmp af6a23b98 Add option to write integration test files to somewhere outside of /tmp. Add integration target to C++ build --- ci/travis_before_script_cpp.sh | 2 +- ci/travis_script_integration.sh | 7 +++++- cpp/src/arrow/ipc/CMakeLists.txt | 22 ++++++++++------ integration/integration_test.py | 43 +++++++++++++++++++------------- 4 files changed, 48 insertions(+), 26 deletions(-) diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index a77fcd8749de5..5f398e8c6e327 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -68,7 +68,7 @@ $CMAKE_COMMON_FLAGS \ -DARROW_BUILD_BENCHMARKS=ON \ -DARROW_BUILD_TESTS=ON \ -DARROW_BUILD_EXAMPLES=ON \ --DARROW_BUILD_UTILITIES=OFF \ +-DARROW_BUILD_UTILITIES=ON \ -DARROW_INSTALL_NAME_RPATH=OFF" fi diff --git a/ci/travis_script_integration.sh b/ci/travis_script_integration.sh index 286acacd74004..9c2786282b08b 100755 --- a/ci/travis_script_integration.sh +++ b/ci/travis_script_integration.sh @@ -52,7 +52,12 @@ conda install -y nomkl # Expensive dependencies install from Continuum package repo conda install -y pip numpy six -python integration_test.py --debug +# ARROW-4008: Create a directory to write temporary files since /tmp can be +# unstable in Travis CI +INTEGRATION_TEMPDIR=$TRAVIS_BUILD_DIR/integration_temp +mkdir -p $INTEGRATION_TEMPDIR + +python integration_test.py --debug --tempdir=$INTEGRATION_TEMPDIR popd diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index bda4ef3e417d5..44c56f033269d 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -15,6 +15,10 @@ # specific language governing permissions and limitations # under the License. 
+# Targets required for protocol integration testing +add_custom_target(integration) +add_dependencies(arrow integration) + ####################################### # Messaging and interprocess communication @@ -31,13 +35,14 @@ if (NOT ARROW_BOOST_HEADER_ONLY) EXTRA_LINK_LIBS gflags_static) # Test is being built - if (TARGET json-integration-test) + if (TARGET arrow-json-integration-test) + add_dependencies(integration arrow-json-integration-test) if (UNIX) if (APPLE) - set_target_properties(json-integration-test + set_target_properties(arrow-json-integration-test PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") else() - target_link_libraries(json-integration-test PRIVATE pthread) + target_link_libraries(arrow-json-integration-test PRIVATE pthread) endif() endif() endif() @@ -113,10 +118,13 @@ if(NOT WIN32) endif() if (ARROW_BUILD_UTILITIES) - add_executable(file-to-stream file-to-stream.cc) - target_link_libraries(file-to-stream ${UTIL_LINK_LIBS}) - add_executable(stream-to-file stream-to-file.cc) - target_link_libraries(stream-to-file ${UTIL_LINK_LIBS}) + add_executable(arrow-file-to-stream file-to-stream.cc) + target_link_libraries(arrow-file-to-stream ${UTIL_LINK_LIBS}) + add_executable(arrow-stream-to-file stream-to-file.cc) + target_link_libraries(arrow-stream-to-file ${UTIL_LINK_LIBS}) + + add_dependencies(integration arrow-file-to-stream) + add_dependencies(integration arrow-stream-to-file) endif() ADD_ARROW_BENCHMARK(read-write-benchmark diff --git a/integration/integration_test.py b/integration/integration_test.py index 3bd37bdd80677..7101af2516ad9 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -893,8 +893,8 @@ def generate_dictionary_case(): dictionaries=[dict1, dict2]) -def get_generated_json_files(): - temp_dir = tempfile.mkdtemp() +def get_generated_json_files(tempdir=None): + tempdir = tempdir or tempfile.mkdtemp() def _temp_path(): return @@ -910,7 +910,7 @@ def _temp_path(): generated_paths = [] for file_obj in file_objs: - out_path = os.path.join(temp_dir, 'generated_' + + out_path = os.path.join(tempdir, 'generated_' + file_obj.name + '.json') file_obj.write(out_path) generated_paths.append(out_path) @@ -924,10 +924,10 @@ def _temp_path(): class IntegrationRunner(object): - def __init__(self, json_files, testers, debug=False): + def __init__(self, json_files, testers, tempdir=None, debug=False): self.json_files = json_files self.testers = testers - self.temp_dir = tempfile.mkdtemp() + self.temp_dir = tempdir or tempfile.mkdtemp() self.debug = debug def run(self): @@ -950,10 +950,12 @@ def _compare_implementations(self, producer, consumer): name = os.path.splitext(os.path.basename(json_path))[0] + file_id = guid()[:8] + # Make the random access file print('-- Creating binary inputs') - producer_file_path = os.path.join(self.temp_dir, guid() + '_' + - name + '.json_to_arrow') + producer_file_path = os.path.join(self.temp_dir, file_id + '_' + + name + '.json_as_file') producer.json_to_file(json_path, producer_file_path) # Validate the file @@ -961,10 +963,12 @@ def _compare_implementations(self, producer, consumer): consumer.validate(json_path, producer_file_path) print('-- Validating stream') - producer_stream_path = os.path.join(self.temp_dir, guid() + '_' + - name + '.arrow_to_stream') - consumer_file_path = os.path.join(self.temp_dir, guid() + '_' + - name + '.stream_to_arrow') + producer_stream_path = os.path.join(self.temp_dir, file_id + '_' + + name + + '.producer_file_as_stream') + consumer_file_path = 
os.path.join(self.temp_dir, file_id + '_' + + name + + '.consumer_stream_as_file') producer.file_to_stream(producer_file_path, producer_stream_path) consumer.stream_to_file(producer_stream_path, @@ -1054,8 +1058,8 @@ class CPPTester(Tester): os.path.join(ARROW_HOME, 'cpp/build/debug')) CPP_INTEGRATION_EXE = os.path.join(EXE_PATH, 'arrow-json-integration-test') - STREAM_TO_FILE = os.path.join(EXE_PATH, 'stream-to-file') - FILE_TO_STREAM = os.path.join(EXE_PATH, 'file-to-stream') + STREAM_TO_FILE = os.path.join(EXE_PATH, 'arrow-stream-to-file') + FILE_TO_STREAM = os.path.join(EXE_PATH, 'arrow-file-to-stream') name = 'C++' @@ -1162,15 +1166,16 @@ def get_static_json_files(): return glob.glob(glob_pattern) -def run_all_tests(debug=False): +def run_all_tests(debug=False, tempdir=None): testers = [CPPTester(debug=debug), JavaTester(debug=debug), JSTester(debug=debug)] static_json_files = get_static_json_files() - generated_json_files = get_generated_json_files() + generated_json_files = get_generated_json_files(tempdir=tempdir) json_files = static_json_files + generated_json_files - runner = IntegrationRunner(json_files, testers, debug=debug) + runner = IntegrationRunner(json_files, testers, + tempdir=tempdir, debug=debug) runner.run() print('-- All tests passed!') @@ -1195,6 +1200,10 @@ def write_js_test_json(directory): parser.add_argument('--debug', dest='debug', action='store_true', default=False, help='Run executables in debug mode as relevant') + parser.add_argument('--tempdir', dest='tempdir', + default=tempfile.mkdtemp(), + help=('Directory to use for writing ' + 'integration test temporary files')) args = parser.parse_args() if args.generated_json_path: try: @@ -1204,4 +1213,4 @@ def write_js_test_json(directory): raise write_js_test_json(args.generated_json_path) else: - run_all_tests(debug=args.debug) + run_all_tests(debug=args.debug, tempdir=args.tempdir) From 0005048b2f2ab1b84908e81c9e0648158ccf639c Mon Sep 17 00:00:00 2001 From: Yosuke Shiro Date: Thu, 13 Dec 2018 17:44:17 +0900 Subject: [PATCH 33/45] ARROW-4005: [Plasma] [GLib] Add gplasma_client_disconnect() Author: Yosuke Shiro Closes #3163 from shiro615/glib-add-disconnect-for-plasma-glib and squashes the following commits: 3d990034 Remove require_gi by using options a8575acd Use bool instead of gboolean b2c9ccf7 Keep disconnected information 789c1dd4 Fix test case 5182beb1 Add Add gplasma_client_disconnect() --- c_glib/plasma-glib/client.cpp | 34 +++++++++++++++++++++--- c_glib/plasma-glib/client.h | 2 ++ c_glib/test/plasma/test-plasma-client.rb | 13 ++++++--- 3 files changed, 42 insertions(+), 7 deletions(-) diff --git a/c_glib/plasma-glib/client.cpp b/c_glib/plasma-glib/client.cpp index e88cb13e83cd0..c05a71085dd2d 100644 --- a/c_glib/plasma-glib/client.cpp +++ b/c_glib/plasma-glib/client.cpp @@ -185,6 +185,7 @@ gplasma_client_create_options_get_metadata(GPlasmaClientCreateOptions *options, typedef struct GPlasmaClientPrivate_ { plasma::PlasmaClient *client; + bool disconnected; } GPlasmaClientPrivate; enum { @@ -205,10 +206,12 @@ gplasma_client_finalize(GObject *object) { auto priv = GPLASMA_CLIENT_GET_PRIVATE(object); - auto status = priv->client->Disconnect(); - if (!status.ok()) { - g_warning("[plasma][client][finalize] Failed to disconnect: %s", - status.ToString().c_str()); + if (!priv->disconnected) { + auto status = priv->client->Disconnect(); + if (!status.ok()) { + g_warning("[plasma][client][finalize] Failed to disconnect: %s", + status.ToString().c_str()); + } } delete priv->client; @@ -431,6 +434,29 @@ 
gplasma_client_refer_object(GPlasmaClient *client, } } +/** + * gplasma_client_disconnect: + * @client: A #GPlasmaClient. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +gplasma_client_disconnect(GPlasmaClient *client, + GError **error) +{ + auto priv = GPLASMA_CLIENT_GET_PRIVATE(client); + auto status = priv->client->Disconnect(); + if (garrow_error_check(error, status, "[plasma][client][disconnect]")) { + priv->disconnected = true; + return TRUE; + } else { + return FALSE; + } +} + G_END_DECLS GPlasmaClient * diff --git a/c_glib/plasma-glib/client.h b/c_glib/plasma-glib/client.h index 6f99f467c83a7..34b0ba22e3188 100644 --- a/c_glib/plasma-glib/client.h +++ b/c_glib/plasma-glib/client.h @@ -71,5 +71,7 @@ gplasma_client_refer_object(GPlasmaClient *client, GPlasmaObjectID *id, gint64 timeout_ms, GError **error); +gboolean gplasma_client_disconnect(GPlasmaClient *client, + GError **error); G_END_DECLS diff --git a/c_glib/test/plasma/test-plasma-client.rb b/c_glib/test/plasma/test-plasma-client.rb index cbdce865f0132..6caf09f02570c 100644 --- a/c_glib/test/plasma/test-plasma-client.rb +++ b/c_glib/test/plasma/test-plasma-client.rb @@ -24,6 +24,9 @@ def setup @store = Helper::PlasmaStore.new @store.start @client = Plasma::Client.new(@store.socket_path) + @id = Plasma::ObjectID.new("Hello") + @data = "World" + @options = Plasma::ClientCreateOptions.new end def teardown @@ -34,10 +37,7 @@ def teardown def setup super - @id = Plasma::ObjectID.new("Hello") - @data = "World" @metadata = "Metadata" - @options = Plasma::ClientCreateOptions.new end test("no options") do @@ -84,4 +84,11 @@ def setup ]) end end + + test("#disconnect") do + @client.disconnect + assert_raise(Arrow::Error::Io) do + @client.create(@id, @data.bytesize, @options) + end + end end From 1882a0727ba275fbced9ed0754c5fe99f841bed4 Mon Sep 17 00:00:00 2001 From: Tanya Schlusser Date: Thu, 13 Dec 2018 13:36:21 +0100 Subject: [PATCH 34/45] ARROW-3866: [Python] Column metadata is not transferred to tables in pyarrow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use columns' existing metadata to create the new fields in `Table.from_arrays()`. Also persists the original `nullable` value. Happy to change things! Thank you for putting a newbie label on it. Author: Tanya Schlusser Author: Krisztián Szűcs Closes #3160 from tanyaschlusser/ARROW-3866 and squashes the following commits: 005940ea Move the test for preserved metadata to a separate function. Add a test that nullable=False is preserved. 
e4256a17 use column.field() 76216eae Arrow-3866: keep field metadata for columns passed to pa.Table.from_arrays() 33950a83 ARROW-3866: test to confirm column metadata is added when calling pa.Table.from_arrays(column_list) --- python/pyarrow/table.pxi | 24 +++++++++++------------- python/pyarrow/tests/test_table.py | 15 +++++++++++++++ 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index fd565afae5acf..cf3411dc03616 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -636,12 +636,12 @@ cdef class Column: cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema): cdef: - Column col - c_string c_name - vector[shared_ptr[CField]] fields - shared_ptr[CDataType] type_ Py_ssize_t K = len(arrays) + c_string c_name + CColumn* c_column + shared_ptr[CDataType] c_type shared_ptr[CKeyValueMetadata] c_meta + vector[shared_ptr[CField]] c_fields if metadata is not None: if not isinstance(metadata, dict): @@ -649,17 +649,15 @@ cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema): c_meta = pyarrow_unwrap_metadata(metadata) if K == 0: - schema.reset(new CSchema(fields, c_meta)) + schema.reset(new CSchema(c_fields, c_meta)) return - fields.resize(K) + c_fields.resize(K) if isinstance(arrays[0], Column): for i in range(K): - col = arrays[i] - type_ = col.sp_column.get().type() - c_name = tobytes(col.name) - fields[i].reset(new CField(c_name, type_, True)) + c_column = (<Column> arrays[i]).column + c_fields[i] = c_column.field() else: if names is None: raise ValueError('Must pass names when constructing ' @@ -670,7 +668,7 @@ cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema): for i in range(K): val = arrays[i] if isinstance(val, (Array, ChunkedArray)): - type_ = (<DataType> val.type).sp_type + c_type = (<DataType> val.type).sp_type else: raise TypeError(type(val)) @@ -678,9 +676,9 @@ cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema): c_name = tobytes(u'None') else: c_name = tobytes(names[i]) - fields[i].reset(new CField(c_name, type_, True)) + c_fields[i].reset(new CField(c_name, c_type, True)) - schema.reset(new CSchema(fields, c_meta)) + schema.reset(new CSchema(c_fields, c_meta)) cdef class RecordBatch: diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 9c9828d8c0764..ecbf93bd3e8b0 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -579,6 +579,21 @@ def test_table_basics(): assert table.columns == columns +def test_table_from_arrays_preserves_column_metadata(): + # Added to test https://issues.apache.org/jira/browse/ARROW-3866 + arr0 = pa.array([1, 2]) + arr1 = pa.array([3, 4]) + field0 = pa.field('field1', pa.int64(), metadata=dict(a="A", b="B")) + field1 = pa.field('field2', pa.int64(), nullable=False) + columns = [ + pa.column(field0, arr0), + pa.column(field1, arr1) + ] + table = pa.Table.from_arrays(columns) + assert b"a" in table.column(0).field.metadata + assert table.column(1).field.nullable is False + + def test_table_from_arrays_invalid_names(): data = [ pa.array(range(5)),
case from switch to make any omission a compile-time error a87c3ba99 Add helper function parquet::ParquetVersionToString and use it in parquet-reader tool --- cpp/src/parquet/metadata.cc | 12 ++++++++++++ cpp/src/parquet/metadata.h | 2 ++ cpp/src/parquet/printer.cc | 2 +- cpp/tools/parquet/parquet-reader.cc | 2 +- 4 files changed, 16 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index cf63b0f662b52..22cfbdb91aa73 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -47,6 +47,18 @@ const ApplicationVersion& ApplicationVersion::PARQUET_CPP_FIXED_STATS_VERSION() return version; } +std::string ParquetVersionToString(ParquetVersion::type ver) { + switch (ver) { + case ParquetVersion::PARQUET_1_0: + return "1.0"; + case ParquetVersion::PARQUET_2_0: + return "2.0"; + } + + // This should be unreachable + return "UNKNOWN"; +} + template static std::shared_ptr MakeTypedColumnStats( const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) { diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 706e980711683..25f4d4cd8cbdf 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -282,6 +282,8 @@ class PARQUET_EXPORT FileMetaDataBuilder { std::unique_ptr impl_; }; +PARQUET_EXPORT std::string ParquetVersionToString(ParquetVersion::type ver); + } // namespace parquet #endif // PARQUET_FILE_METADATA_H diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index 9f26a4180cda1..5be8d9d96467c 100644 --- a/cpp/src/parquet/printer.cc +++ b/cpp/src/parquet/printer.cc @@ -38,7 +38,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list selecte const FileMetaData* file_metadata = fileReader->metadata().get(); stream << "File Name: " << filename << "\n"; - stream << "Version: " << file_metadata->version() << "\n"; + stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n"; stream << "Created By: " << file_metadata->created_by() << "\n"; stream << "Total rows: " << file_metadata->num_rows() << "\n"; diff --git a/cpp/tools/parquet/parquet-reader.cc b/cpp/tools/parquet/parquet-reader.cc index 34bdfc103dcc0..a5b7db1330a97 100644 --- a/cpp/tools/parquet/parquet-reader.cc +++ b/cpp/tools/parquet/parquet-reader.cc @@ -23,7 +23,7 @@ int main(int argc, char** argv) { if (argc > 5 || argc < 2) { - std::cerr << "Usage: parquet_reader [--only-metadata] [--no-memory-map] [--json]" + std::cerr << "Usage: parquet-reader [--only-metadata] [--no-memory-map] [--json]" "[--print-key-value-metadata] [--columns=...] " << std::endl; return -1; From e34057c4b4be8c7abf3537dd4998b5b38919ba73 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 13 Dec 2018 09:52:17 -0600 Subject: [PATCH 36/45] ARROW-4019: [C++] Fix Coverity issues This fixes a number of issues found by Coverity. Other issues are benign, or need to be tackled separately. 
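One recurring class of findings fixed below is reads of members that a default constructor leaves uninitialized (ArrayData, Array, PrimitiveArray, BinaryArray, Table). A minimal self-contained sketch of the initializer-list pattern the patch applies; `BufferView` and its fields are hypothetical names for illustration, not Arrow code:

```
#include <cstdint>
#include <iostream>

// Coverity flags reads of fields that a default-constructed object leaves
// uninitialized. Initializing every scalar member in the constructor's
// initializer list removes both the warning and the undefined behavior.
struct BufferView {  // hypothetical type, for illustration only
  BufferView() : data(nullptr), length(0), offset(0) {}
  const uint8_t* data;
  int64_t length;
  int64_t offset;
};

int main() {
  BufferView view;  // safe to read immediately; all fields are defined
  std::cout << view.length << " " << view.offset << std::endl;
  return 0;
}
```
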
Author: Antoine Pitrou Closes #3168 from pitrou/ARROW-4019-fix-coverity-issues and squashes the following commits: 6311aa99a ARROW-4019: Fix Coverity issues --- cpp/src/arrow/array.h | 8 +-- cpp/src/arrow/compute/kernel.h | 9 ++++ cpp/src/arrow/io/buffered-test.cc | 2 +- cpp/src/arrow/io/file-test.cc | 2 +- cpp/src/arrow/io/test-common.h | 2 +- cpp/src/arrow/ipc/json-integration-test.cc | 2 +- cpp/src/arrow/ipc/json.cc | 2 +- cpp/src/arrow/ipc/writer.cc | 5 +- cpp/src/arrow/table.cc | 2 +- cpp/src/arrow/util/decimal.cc | 57 +++++++++++++--------- cpp/src/arrow/util/logging.h | 3 ++ cpp/src/arrow/util/rle-encoding-test.cc | 2 +- cpp/src/arrow/util/rle-encoding.h | 1 + 13 files changed, 61 insertions(+), 36 deletions(-) diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index b34b53933314f..37fa5aedfc2d0 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -87,7 +87,7 @@ class Status; /// input array and replace them with newly-allocated data, changing the output /// data type as well. struct ARROW_EXPORT ArrayData { - ArrayData() : length(0) {} + ArrayData() : length(0), null_count(0), offset(0) {} ArrayData(const std::shared_ptr& type, int64_t length, int64_t null_count = kUnknownNullCount, int64_t offset = 0) @@ -311,7 +311,7 @@ class ARROW_EXPORT Array { std::string ToString() const; protected: - Array() {} + Array() : null_bitmap_data_(NULLPTR) {} std::shared_ptr data_; const uint8_t* null_bitmap_data_; @@ -382,7 +382,7 @@ class ARROW_EXPORT PrimitiveArray : public FlatArray { std::shared_ptr values() const { return data_->buffers[1]; } protected: - PrimitiveArray() {} + PrimitiveArray() : raw_values_(NULLPTR) {} inline void SetData(const std::shared_ptr& data) { auto values = data->buffers[1]; @@ -565,7 +565,7 @@ class ARROW_EXPORT BinaryArray : public FlatArray { protected: // For subclasses - BinaryArray() {} + BinaryArray() : raw_value_offsets_(NULLPTR), raw_data_(NULLPTR) {} /// Protected method for constructors void SetData(const std::shared_ptr& data); diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index 8048fff75bc29..bef2b9af21cff 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -19,6 +19,7 @@ #define ARROW_COMPUTE_KERNEL_H #include +#include #include #include "arrow/array.h" @@ -78,6 +79,14 @@ struct ARROW_EXPORT Datum { Datum(const Datum& other) noexcept { this->value = other.value; } + // Define move constructor and move assignment, for better performance + Datum(Datum&& other) noexcept : value(std::move(other.value)) {} + + Datum& operator=(Datum&& other) noexcept { + value = std::move(other.value); + return *this; + } + Datum::type kind() const { switch (this->value.which()) { case 0: diff --git a/cpp/src/arrow/io/buffered-test.cc b/cpp/src/arrow/io/buffered-test.cc index 7fc4c520d148b..074833d4bf7b7 100644 --- a/cpp/src/arrow/io/buffered-test.cc +++ b/cpp/src/arrow/io/buffered-test.cc @@ -67,7 +67,7 @@ class FileTestFixture : public ::testing::Test { void EnsureFileDeleted() { if (FileExists(path_)) { - std::remove(path_.c_str()); + ARROW_UNUSED(std::remove(path_.c_str())); } } diff --git a/cpp/src/arrow/io/file-test.cc b/cpp/src/arrow/io/file-test.cc index 6081005a8f6e1..4d710d3470f5c 100644 --- a/cpp/src/arrow/io/file-test.cc +++ b/cpp/src/arrow/io/file-test.cc @@ -56,7 +56,7 @@ class FileTestFixture : public ::testing::Test { void EnsureFileDeleted() { if (FileExists(path_)) { - std::remove(path_.c_str()); + ARROW_UNUSED(std::remove(path_.c_str())); } } diff --git 
a/cpp/src/arrow/io/test-common.h b/cpp/src/arrow/io/test-common.h index fa9145259b182..a091b01d32c79 100644 --- a/cpp/src/arrow/io/test-common.h +++ b/cpp/src/arrow/io/test-common.h @@ -118,7 +118,7 @@ class MemoryMapFixture { public: void TearDown() { for (auto path : tmp_files_) { - std::remove(path.c_str()); + ARROW_UNUSED(std::remove(path.c_str())); } } diff --git a/cpp/src/arrow/ipc/json-integration-test.cc b/cpp/src/arrow/ipc/json-integration-test.cc index 3e71415c69654..914cdb66599f4 100644 --- a/cpp/src/arrow/ipc/json-integration-test.cc +++ b/cpp/src/arrow/ipc/json-integration-test.cc @@ -262,7 +262,7 @@ class TestJSONIntegration : public ::testing::Test { void TearDown() { for (const std::string path : tmp_paths_) { - std::remove(path.c_str()); + ARROW_UNUSED(std::remove(path.c_str())); } } diff --git a/cpp/src/arrow/ipc/json.cc b/cpp/src/arrow/ipc/json.cc index 394563c53c09d..61c242ca2dbbb 100644 --- a/cpp/src/arrow/ipc/json.cc +++ b/cpp/src/arrow/ipc/json.cc @@ -99,7 +99,7 @@ Status JsonWriter::WriteRecordBatch(const RecordBatch& batch) { class JsonReader::JsonReaderImpl { public: JsonReaderImpl(MemoryPool* pool, const std::shared_ptr& data) - : pool_(pool), data_(data) {} + : pool_(pool), data_(data), record_batches_(nullptr) {} Status ParseAndReadSchema() { doc_.Parse(reinterpret_cast(data_->data()), diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 3d3355dfe17fd..6ce72e070e7b3 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -772,7 +772,10 @@ class SchemaWriter : public StreamBookKeeper { public: SchemaWriter(const Schema& schema, DictionaryMemo* dictionary_memo, MemoryPool* pool, io::OutputStream* sink) - : StreamBookKeeper(sink), schema_(schema), dictionary_memo_(dictionary_memo) {} + : StreamBookKeeper(sink), + pool_(pool), + schema_(schema), + dictionary_memo_(dictionary_memo) {} Status WriteSchema() { #ifndef NDEBUG diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 04af4d9741c71..1f3d927ddd62b 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -392,7 +392,7 @@ class SimpleTable : public Table { std::vector> columns_; }; -Table::Table() {} +Table::Table() : num_rows_(0) {} std::shared_ptr
Table::Make(const std::shared_ptr& schema, const std::vector>& columns, diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index fda7746c6b4e0..c47ac82e8ce3c 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -889,7 +889,7 @@ Status Decimal128::Rescale(int32_t original_scale, int32_t new_scale, } // Helper function used by Decimal128::FromBigEndian -static inline uint64_t FromBigEndian(const uint8_t* bytes, int32_t length) { +static inline uint64_t UInt64FromBigEndian(const uint8_t* bytes, int32_t length) { // We don't bounds check the length here because this is called by // FromBigEndian that has a Decimal128 as its out parameters and // that function is already checking the length of the bytes and only @@ -906,8 +906,7 @@ Status Decimal128::FromBigEndian(const uint8_t* bytes, int32_t length, Decimal12 static constexpr int32_t kMinDecimalBytes = 1; static constexpr int32_t kMaxDecimalBytes = 16; - int64_t high; - uint64_t low; + int64_t high, low; if (length < kMinDecimalBytes || length > kMaxDecimalBytes) { std::ostringstream stream; @@ -917,35 +916,45 @@ Status Decimal128::FromBigEndian(const uint8_t* bytes, int32_t length, Decimal12 return Status::Invalid(stream.str()); } - /// Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the - /// sign bit. + // Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the + // sign bit. const bool is_negative = static_cast(bytes[0]) < 0; - /// Sign extend the low bits if necessary - low = UINT64_MAX * (is_negative && length < 8); - high = -1 * (is_negative && length < kMaxDecimalBytes); - - /// Stop byte of the high bytes + // 1. Extract the high bytes + // Stop byte of the high bytes const int32_t high_bits_offset = std::max(0, length - 8); + const auto high_bits = UInt64FromBigEndian(bytes, high_bits_offset); - /// Shift left enough bits to make room for the incoming int64_t - high <<= high_bits_offset * CHAR_BIT; - - /// Preserve the upper bits by inplace OR-ing the int64_t - uint64_t value = arrow::FromBigEndian(bytes, high_bits_offset); - high |= value; + if (high_bits_offset == 8) { + // Avoid undefined shift by 64 below + high = high_bits; + } else { + high = -1 * (is_negative && length < kMaxDecimalBytes); + // Shift left enough bits to make room for the incoming int64_t + high <<= high_bits_offset * CHAR_BIT; + // Preserve the upper bits by inplace OR-ing the int64_t + high |= high_bits; + } - /// Stop byte of the low bytes + // 2. 
Extract the low bytes + // Stop byte of the low bytes const int32_t low_bits_offset = std::min(length, 8); + const auto low_bits = + UInt64FromBigEndian(bytes + high_bits_offset, length - high_bits_offset); - /// Shift left enough bits to make room for the incoming uint64_t - low <<= low_bits_offset * CHAR_BIT; - - /// Preserve the upper bits by inplace OR-ing the uint64_t - value = arrow::FromBigEndian(bytes + high_bits_offset, length - high_bits_offset); - low |= value; + if (low_bits_offset == 8) { + // Avoid undefined shift by 64 below + low = low_bits; + } else { + // Sign extend the low bits if necessary + low = -1 * (is_negative && length < 8); + // Shift left enough bits to make room for the incoming int64_t + low <<= low_bits_offset * CHAR_BIT; + // Preserve the upper bits by inplace OR-ing the int64_t + low |= low_bits; + } - *out = Decimal128(high, low); + *out = Decimal128(high, static_cast<uint64_t>(low)); return Status::OK(); } diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index 4cce700db970b..42ab18e9e96d3 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -22,6 +22,7 @@ #include #include +#include "arrow/util/macros.h" #include "arrow/util/visibility.h" namespace arrow { @@ -155,6 +156,8 @@ class ARROW_EXPORT ArrowLog : public ArrowLogBase { static void InstallFailureSignalHandler(); private: + ARROW_DISALLOW_COPY_AND_ASSIGN(ArrowLog); + // Hide the implementation of log provider by void *. // Otherwise, lib user may define the same macro to use the correct header file. void* logging_provider_; diff --git a/cpp/src/arrow/util/rle-encoding-test.cc b/cpp/src/arrow/util/rle-encoding-test.cc index 88382618653e9..aac1b1523990c 100644 --- a/cpp/src/arrow/util/rle-encoding-test.cc +++ b/cpp/src/arrow/util/rle-encoding-test.cc @@ -193,7 +193,7 @@ void ValidateRle(const vector<int>& values, int bit_width, uint8_t* expected_enc EXPECT_EQ(encoded_len, expected_len); } if (expected_encoding != NULL) { - EXPECT_EQ(memcmp(buffer, expected_encoding, expected_len), 0); + EXPECT_EQ(memcmp(buffer, expected_encoding, encoded_len), 0); } // Verify read diff --git a/cpp/src/arrow/util/rle-encoding.h b/cpp/src/arrow/util/rle-encoding.h index a97543d5be799..acefc8e3f7583 100644 --- a/cpp/src/arrow/util/rle-encoding.h +++ b/cpp/src/arrow/util/rle-encoding.h @@ -436,6 +436,7 @@ bool RleDecoder::NextCounts() { literal_count_ = (indicator_value >> 1) * 8; } else { repeat_count_ = indicator_value >> 1; + // XXX (ARROW-4018) this is not big-endian compatible bool result = bit_reader_.GetAligned(static_cast<int>(BitUtil::CeilDiv(bit_width_, 8)), reinterpret_cast(&current_value_)); From b3bc3384f3068edebe69f1084518ccfb85a368f8 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 13 Dec 2018 15:09:27 -0800 Subject: [PATCH 37/45] ARROW-3958: [Plasma] Reduce number of IPCs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR also removes the client unmap, which is no longer necessary since the introduction of malloc (there are only a few memory-mapped files and they typically stay around for the lifetime of the application). The PR also gets rid of a bunch of code that is no longer needed (the release buffer, yay!). Benchmarks: ``` import pyarrow.plasma as plasma client = plasma.connect("/tmp/plasma", "", 0) # Put performance def f(): for i in range(10000): client.put(1) %timeit f() # without optimization: # 1.51 s ± 2.22 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 1.52 s ± 9.68 ms per loop (mean ± std.
dev. of 7 runs, 1 loop each) # 1.53 s ± 19 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # with optimizations: # 1.27 s ± 10.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 1.31 s ± 8.18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 1.31 s ± 17.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # Create/seal performance def f(): for i in range(10000): object_id = plasma.ObjectID.from_random() client.create(object_id, 0) client.seal(object_id) %timeit f() # without optimizations: # 571 ms ± 2.28 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 583 ms ± 22.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 588 ms ± 14.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # with optimizations: # 531 ms ± 3.24 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 541 ms ± 9.99 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 542 ms ± 19.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # Get performance objects = [client.put(1) for i in range(10000)] def g(): for i in range(10000): client.get(objects[i]) %timeit g() # without optimizations # 1.11 s ± 6.17 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 1.12 s ± 1.49 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 1.19 s ± 24.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # with optimizations # 776 ms ± 11.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 792 ms ± 3.06 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 778 ms ± 9.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) ``` Author: Philipp Moritz Author: Robert Nishihara Closes #3124 from pcmoritz/plasma-send-fd and squashes the following commits: f899f459 Update client.cc a0384040 Update _plasma.pyx af150c14 comments and fixes 71c4c5c1 don't close fd twice 0d572823 linting f60dcbed fix tests 502aeda4 linting 2887b170 clean up some code cfff7e32 lint e5ccbbac fixes 5f091993 introduce method 24beb277 working version --- cpp/src/plasma/client.cc | 184 +++++----------------- cpp/src/plasma/client.h | 16 +- cpp/src/plasma/store.cc | 13 +- cpp/src/plasma/store.h | 3 + cpp/src/plasma/test/client_tests.cc | 26 +-- docs/source/python/plasma.rst | 10 +- python/pyarrow/_plasma.pyx | 11 +- python/pyarrow/tensorflow/plasma_op.cc | 4 +- python/pyarrow/tests/test_plasma.py | 8 +- python/pyarrow/tests/test_plasma_tf_op.py | 2 +- 10 files changed, 83 insertions(+), 194 deletions(-) diff --git a/cpp/src/plasma/client.cc b/cpp/src/plasma/client.cc index 99cf00cab80fd..2dbe2b41478ea 100644 --- a/cpp/src/plasma/client.cc +++ b/cpp/src/plasma/client.cc @@ -83,9 +83,6 @@ typedef struct XXH64_state_s XXH64_state_t; constexpr int64_t kHashingConcurrency = 8; constexpr int64_t kBytesInMB = 1 << 20; -// Use 100MB as an overestimate of the L3 cache size. -constexpr int64_t kL3CacheSizeBytes = 100000000; - // ---------------------------------------------------------------------- // GPU support @@ -143,22 +140,13 @@ struct ObjectInUseEntry { bool is_sealed; }; -/// Configuration options for the plasma client. -struct PlasmaClientConfig { - /// Number of release calls we wait until the object is actually released. - /// This allows us to avoid invalidating the cpu cache on workers if objects - /// are reused accross tasks. - size_t release_delay; -}; - struct ClientMmapTableEntry { + /// The associated file descriptor on the client. + int fd; /// The result of mmap for this file descriptor. uint8_t* pointer; /// The length of the memory-mapped file. 
size_t length; - /// The number of objects in this memory-mapped file that are currently being - /// used by the client. When this count reaches zeros, we unmap the file. - int count; }; class PlasmaClient::Impl : public std::enable_shared_from_this { @@ -169,7 +157,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this> objects_in_use_; - /// Object IDs of the last few release calls. This is a deque and - /// is used to delay releasing objects to see if they can be reused by - /// subsequent tasks so we do not unneccessarily invalidate cpu caches. - /// TODO(pcm): replace this with a proper lru cache using the size of the L3 - /// cache. - std::deque release_history_; - /// The number of bytes in the combined objects that are held in the release - /// history doubly-linked list. If this is too large then the client starts - /// releasing objects. - int64_t in_use_object_bytes_; - /// Configuration options for the plasma client. - PlasmaClientConfig config_; /// The amount of memory available to the Plasma store. The client needs this /// information to make sure that it does not delay in releasing so much /// memory that the store is unable to evict enough objects to free up space. @@ -308,7 +288,6 @@ PlasmaClient::Impl::~Impl() {} uint8_t* PlasmaClient::Impl::LookupOrMmap(int fd, int store_fd_val, int64_t map_size) { auto entry = mmap_table_.find(store_fd_val); if (entry != mmap_table_.end()) { - close(fd); return entry->second.pointer; } else { // We subtract kMmapRegionsGap from the length that was added @@ -322,9 +301,9 @@ uint8_t* PlasmaClient::Impl::LookupOrMmap(int fd, int store_fd_val, int64_t map_ close(fd); // Closing this fd has an effect on performance. ClientMmapTableEntry& entry = mmap_table_[store_fd_val]; + entry.fd = fd; entry.pointer = result; entry.length = map_size; - entry.count = 0; return result; } } @@ -342,6 +321,17 @@ bool PlasmaClient::Impl::IsInUse(const ObjectID& object_id) { return (elem != objects_in_use_.end()); } +int PlasmaClient::Impl::GetStoreFd(int store_fd) { + auto entry = mmap_table_.find(store_fd); + if (entry == mmap_table_.end()) { + int fd = recv_fd(store_conn_); + ARROW_CHECK(fd >= 0) << "recv not successful"; + return fd; + } else { + return entry->second.fd; + } +} + void PlasmaClient::Impl::IncrementObjectCount(const ObjectID& object_id, PlasmaObject* object, bool is_sealed) { // Increment the count of the object to track the fact that it is being used. @@ -357,18 +347,6 @@ void PlasmaClient::Impl::IncrementObjectCount(const ObjectID& object_id, objects_in_use_[object_id]->count = 0; objects_in_use_[object_id]->is_sealed = is_sealed; object_entry = objects_in_use_[object_id].get(); - if (object->device_num == 0) { - // Increment the count of the number of objects in the memory-mapped file - // that are being used. The corresponding decrement should happen in - // PlasmaClient::Release. - auto entry = mmap_table_.find(object->store_fd); - ARROW_CHECK(entry != mmap_table_.end()); - ARROW_CHECK(entry->second.count >= 0); - // Update the in_use_object_bytes_. - in_use_object_bytes_ += - (object_entry->object.data_size + object_entry->object.metadata_size); - entry->second.count += 1; - } } else { object_entry = elem->second.get(); ARROW_CHECK(object_entry->count > 0); @@ -397,8 +375,7 @@ Status PlasmaClient::Impl::Create(const ObjectID& object_id, int64_t data_size, // If the CreateReply included an error, then the store will not send a file // descriptor. 
if (device_num == 0) { - int fd = recv_fd(store_conn_); - ARROW_CHECK(fd >= 0) << "recv not successful"; + int fd = GetStoreFd(store_fd); ARROW_CHECK(object.data_size == data_size); ARROW_CHECK(object.metadata_size == metadata_size); // The metadata should come right after the data. @@ -535,8 +512,7 @@ Status PlasmaClient::Impl::GetBuffers( // in the subsequent loop based on just the store file descriptor and without // having to know the relevant file descriptor received from recv_fd. for (size_t i = 0; i < store_fds.size(); i++) { - int fd = recv_fd(store_conn_); - ARROW_CHECK(fd >= 0); + int fd = GetStoreFd(store_fds[i]); LookupOrMmap(fd, store_fds[i], mmap_sizes[i]); } @@ -615,54 +591,21 @@ Status PlasmaClient::Impl::Get(const ObjectID* object_ids, int64_t num_objects, return GetBuffers(object_ids, num_objects, timeout_ms, wrap_buffer, out); } -Status PlasmaClient::Impl::UnmapObject(const ObjectID& object_id) { +Status PlasmaClient::Impl::MarkObjectUnused(const ObjectID& object_id) { auto object_entry = objects_in_use_.find(object_id); ARROW_CHECK(object_entry != objects_in_use_.end()); ARROW_CHECK(object_entry->second->count == 0); - // Decrement the count of the number of objects in this memory-mapped file - // that the client is using. The corresponding increment should have - // happened in plasma_get. - int fd = object_entry->second->object.store_fd; - auto entry = mmap_table_.find(fd); - ARROW_CHECK(entry != mmap_table_.end()); - ARROW_CHECK(entry->second.count >= 1); - if (entry->second.count == 1) { - // If no other objects are being used, then unmap the file. - // We subtract kMmapRegionsGap from the length that was added - // in fake_mmap in malloc.h, to make the size page-aligned again. - int err = munmap(entry->second.pointer, entry->second.length - kMmapRegionsGap); - if (err == -1) { - return Status::IOError("Error during munmap"); - } - // Remove the corresponding entry from the hash table. - mmap_table_.erase(fd); - } else { - // If there are other objects being used, decrement the reference count. - entry->second.count -= 1; - } - // Update the in_use_object_bytes_. - in_use_object_bytes_ -= (object_entry->second->object.data_size + - object_entry->second->object.metadata_size); - DCHECK_GE(in_use_object_bytes_, 0); // Remove the entry from the hash table of objects currently in use. objects_in_use_.erase(object_id); return Status::OK(); } -/// This is a helper method for implementing plasma_release. We maintain a -/// buffer -/// of release calls and only perform them once the buffer becomes full (as -/// judged by the aggregate sizes of the objects). There may be multiple release -/// calls for the same object ID in the buffer. In this case, the first release -/// calls will not do anything. The client will only send a message to the store -/// releasing the object when the client is truly done with the object. -/// -/// @param object_id The object ID to attempt to release. -Status PlasmaClient::Impl::PerformRelease(const ObjectID& object_id) { - // Decrement the count of the number of instances of this object that are - // being used by this client. The corresponding increment should have happened - // in PlasmaClient::Get. +Status PlasmaClient::Impl::Release(const ObjectID& object_id) { + // If the client is already disconnected, ignore release requests. 
+ if (store_conn_ < 0) { + return Status::OK(); + } auto object_entry = objects_in_use_.find(object_id); ARROW_CHECK(object_entry != objects_in_use_.end()); object_entry->second->count -= 1; @@ -670,7 +613,7 @@ Status PlasmaClient::Impl::PerformRelease(const ObjectID& object_id) { // Check if the client is no longer using this object. if (object_entry->second->count == 0) { // Tell the store that the client no longer needs the object. - RETURN_NOT_OK(UnmapObject(object_id)); + RETURN_NOT_OK(MarkObjectUnused(object_id)); RETURN_NOT_OK(SendReleaseRequest(store_conn_, object_id)); auto iter = deletion_cache_.find(object_id); if (iter != deletion_cache_.end()) { @@ -681,50 +624,6 @@ Status PlasmaClient::Impl::PerformRelease(const ObjectID& object_id) { return Status::OK(); } -Status PlasmaClient::Impl::Release(const ObjectID& object_id) { - // If the client is already disconnected, ignore release requests. - if (store_conn_ < 0) { - return Status::OK(); - } - // If an object is in the deletion cache, handle it directly without waiting. - auto iter = deletion_cache_.find(object_id); - if (iter != deletion_cache_.end()) { - RETURN_NOT_OK(PerformRelease(object_id)); - return Status::OK(); - } - // Add the new object to the release history. - release_history_.push_front(object_id); - // If there are too many bytes in use by the client or if there are too many - // pending release calls, and there are at least some pending release calls in - // the release_history list, then release some objects. - - // TODO(wap): Eviction policy only works on host memory, and thus objects on - // the GPU cannot be released currently. - while ((in_use_object_bytes_ > std::min(kL3CacheSizeBytes, store_capacity_ / 100) || - release_history_.size() > config_.release_delay) && - release_history_.size() > 0) { - // Perform a release for the object ID for the first pending release. - RETURN_NOT_OK(PerformRelease(release_history_.back())); - // Remove the last entry from the release history. - release_history_.pop_back(); - } - return Status::OK(); -} - -Status PlasmaClient::Impl::FlushReleaseHistory() { - // If the client is already disconnected, ignore the flush. - if (store_conn_ < 0) { - return Status::OK(); - } - while (release_history_.size() > 0) { - // Perform a release for the object ID for the first pending release. - RETURN_NOT_OK(PerformRelease(release_history_.back())); - // Remove the last entry from the release history. - release_history_.pop_back(); - } - return Status::OK(); -} - // This method is used to query whether the plasma store contains an object. Status PlasmaClient::Impl::Contains(const ObjectID& object_id, bool* has_object) { // Check if we already have a reference to the object. @@ -855,8 +754,6 @@ Status PlasmaClient::Impl::Abort(const ObjectID& object_id) { ARROW_CHECK(!object_entry->second->is_sealed) << "Plasma client called abort on a sealed object"; - // Flush the release history. - RETURN_NOT_OK(FlushReleaseHistory()); // Make sure that the Plasma client only has one reference to the object. If // it has more, then the client needs to release the buffer before calling // abort. @@ -868,7 +765,7 @@ Status PlasmaClient::Impl::Abort(const ObjectID& object_id) { RETURN_NOT_OK(SendAbortRequest(store_conn_, object_id)); // Decrease the reference count to zero, then remove the object. 
object_entry->second->count--; - RETURN_NOT_OK(UnmapObject(object_id)); + RETURN_NOT_OK(MarkObjectUnused(object_id)); std::vector buffer; ObjectID id; @@ -878,7 +775,6 @@ Status PlasmaClient::Impl::Abort(const ObjectID& object_id) { } Status PlasmaClient::Impl::Delete(const std::vector& object_ids) { - RETURN_NOT_OK(FlushReleaseHistory()); std::vector not_in_use_ids; for (auto& object_id : object_ids) { // If the object is in used, skip it. @@ -981,8 +877,10 @@ Status PlasmaClient::Impl::Connect(const std::string& store_socket_name, } else { manager_conn_ = -1; } - config_.release_delay = release_delay; - in_use_object_bytes_ = 0; + if (release_delay != 0) { + ARROW_LOG(WARNING) << "The release_delay parameter in PlasmaClient::Connect " + << "is deprecated"; + } // Send a ConnectRequest to the store to get its memory capacity. RETURN_NOT_OK(SendConnectRequest(store_conn_)); std::vector buffer; @@ -1175,8 +1073,6 @@ Status PlasmaClient::Info(const ObjectID& object_id, int* object_status) { int PlasmaClient::get_manager_fd() const { return impl_->get_manager_fd(); } -Status PlasmaClient::FlushReleaseHistory() { return impl_->FlushReleaseHistory(); } - bool PlasmaClient::IsInUse(const ObjectID& object_id) { return impl_->IsInUse(object_id); } diff --git a/cpp/src/plasma/client.h b/cpp/src/plasma/client.h index 9e080b7760dc8..514d2bd0d6d06 100644 --- a/cpp/src/plasma/client.h +++ b/cpp/src/plasma/client.h @@ -34,11 +34,6 @@ using arrow::Status; namespace plasma { -/// We keep a queue of unreleased objects cached in the client until we start -/// sending release requests to the store. This is to avoid frequently mapping -/// and unmapping objects and evicting data from processor caches. -constexpr int64_t kPlasmaDefaultReleaseDelay = 64; - /// Object buffer data structure. struct ObjectBuffer { /// The data buffer. @@ -62,13 +57,12 @@ class ARROW_EXPORT PlasmaClient { /// \param manager_socket_name The name of the UNIX domain socket to use to /// connect to the local Plasma manager. If this is "", then this /// function will not connect to a manager. - /// \param release_delay Number of released objects that are kept around - /// and not evicted to avoid too many munmaps. + /// \param release_delay Deprecated (not used). /// \param num_retries number of attempts to connect to IPC socket, default 50 /// \return The return status. Status Connect(const std::string& store_socket_name, - const std::string& manager_socket_name, - int release_delay = kPlasmaDefaultReleaseDelay, int num_retries = -1); + const std::string& manager_socket_name, int release_delay = 0, + int num_retries = -1); /// Create an object in the Plasma Store. Any metadata for this object must be /// be passed in when the object is created. @@ -354,10 +348,6 @@ class ARROW_EXPORT PlasmaClient { FRIEND_TEST(TestPlasmaStore, LegacyGetTest); FRIEND_TEST(TestPlasmaStore, AbortTest); - /// This is a helper method that flushes all pending release calls to the - /// store. - Status FlushReleaseHistory(); - bool IsInUse(const ObjectID& object_id); class ARROW_NO_EXPORT Impl; diff --git a/cpp/src/plasma/store.cc b/cpp/src/plasma/store.cc index ae658d757c185..f6326ccf588de 100644 --- a/cpp/src/plasma/store.cc +++ b/cpp/src/plasma/store.cc @@ -327,7 +327,12 @@ void PlasmaStore::ReturnFromGet(GetRequest* get_req) { if (s.ok()) { // Send all of the file descriptors for the present objects. 
for (int store_fd : store_fds) { - WarnIfSigpipe(send_fd(get_req->client->fd, store_fd), get_req->client->fd); + // Only send the file descriptor if it hasn't been sent (see analogous + // logic in GetStoreFd in client.cc). + if (get_req->client->used_fds.find(store_fd) == get_req->client->used_fds.end()) { + WarnIfSigpipe(send_fd(get_req->client->fd, store_fd), get_req->client->fd); + get_req->client->used_fds.insert(store_fd); + } } } @@ -783,8 +788,12 @@ Status PlasmaStore::ProcessMessage(Client* client) { HANDLE_SIGPIPE( SendCreateReply(client->fd, object_id, &object, error_code, mmap_size), client->fd); - if (error_code == PlasmaError::OK && device_num == 0) { + // Only send the file descriptor if it hasn't been sent (see analogous + // logic in GetStoreFd in client.cc). Similar in ReturnFromGet. + if (error_code == PlasmaError::OK && device_num == 0 && + client->used_fds.find(object.store_fd) == client->used_fds.end()) { WarnIfSigpipe(send_fd(client->fd, object.store_fd), client->fd); + client->used_fds.insert(object.store_fd); } } break; case fb::MessageType::PlasmaCreateAndSealRequest: { diff --git a/cpp/src/plasma/store.h b/cpp/src/plasma/store.h index 8d3facd733f1c..0e0eb8323f3bb 100644 --- a/cpp/src/plasma/store.h +++ b/cpp/src/plasma/store.h @@ -54,6 +54,9 @@ struct Client { /// Object ids that are used by this client. std::unordered_set object_ids; + /// File descriptors that are used by this client. + std::unordered_set used_fds; + /// The file descriptor used to push notifications to client. This is only valid /// if client subscribes to plasma store. -1 indicates invalid. int notification_fd; diff --git a/cpp/src/plasma/test/client_tests.cc b/cpp/src/plasma/test/client_tests.cc index f820303aba42b..65a9b71b7f251 100644 --- a/cpp/src/plasma/test/client_tests.cc +++ b/cpp/src/plasma/test/client_tests.cc @@ -82,7 +82,7 @@ class TestPlasmaStore : public ::testing::Test { void CreateObject(PlasmaClient& client, const ObjectID& object_id, const std::vector& metadata, - const std::vector& data) { + const std::vector& data, bool release = true) { std::shared_ptr data_buffer; ARROW_CHECK_OK(client.Create(object_id, data.size(), &metadata[0], metadata.size(), &data_buffer)); @@ -90,7 +90,9 @@ class TestPlasmaStore : public ::testing::Test { data_buffer->mutable_data()[i] = data[i]; } ARROW_CHECK_OK(client.Seal(object_id)); - ARROW_CHECK_OK(client.Release(object_id)); + if (release) { + ARROW_CHECK_OK(client.Release(object_id)); + } } const std::string& GetStoreSocketName() const { return store_socket_name_; } @@ -155,11 +157,12 @@ TEST_F(TestPlasmaStore, SealErrorsTest) { // Create object. std::vector data(100, 0); - CreateObject(client_, object_id, {42}, data); + CreateObject(client_, object_id, {42}, data, false); // Trying to seal it again. result = client_.Seal(object_id); ASSERT_TRUE(result.IsPlasmaObjectAlreadySealed()); + ARROW_CHECK_OK(client_.Release(object_id)); } TEST_F(TestPlasmaStore, DeleteTest) { @@ -228,13 +231,7 @@ TEST_F(TestPlasmaStore, DeleteObjectsTest) { // client2_ won't send the release request immediately because the trigger // condition is not reached. The release is only added to release cache. object_buffers.clear(); - // The reference count went to zero, but the objects are still in the release - // cache. - ARROW_CHECK_OK(client_.Contains(object_id1, &has_object)); - ASSERT_TRUE(has_object); - ARROW_CHECK_OK(client_.Contains(object_id2, &has_object)); - ASSERT_TRUE(has_object); - // The Delete call will flush release cache and send the Delete request. 
+ // Delete the objects. result = client2_.Delete(std::vector{object_id1, object_id2}); ARROW_CHECK_OK(client_.Contains(object_id1, &has_object)); ASSERT_FALSE(has_object); @@ -277,7 +274,6 @@ TEST_F(TestPlasmaStore, GetTest) { // First create object. std::vector data = {3, 5, 6, 7, 9}; CreateObject(client_, object_id, {42}, data); - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_FALSE(client_.IsInUse(object_id)); object_buffers.clear(); @@ -291,11 +287,9 @@ TEST_F(TestPlasmaStore, GetTest) { auto metadata = object_buffers[0].metadata; object_buffers.clear(); ::arrow::AssertBufferEqual(*metadata, std::string{42}); - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_TRUE(client_.IsInUse(object_id)); } // Object is automatically released - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_FALSE(client_.IsInUse(object_id)); } @@ -314,17 +308,14 @@ TEST_F(TestPlasmaStore, LegacyGetTest) { // First create object. std::vector data = {3, 5, 6, 7, 9}; CreateObject(client_, object_id, {42}, data); - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_FALSE(client_.IsInUse(object_id)); ARROW_CHECK_OK(client_.Get(&object_id, 1, -1, &object_buffer)); AssertObjectBufferEqual(object_buffer, {42}, {3, 5, 6, 7, 9}); } // Object needs releasing manually - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_TRUE(client_.IsInUse(object_id)); ARROW_CHECK_OK(client_.Release(object_id)); - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_FALSE(client_.IsInUse(object_id)); } @@ -377,11 +368,9 @@ TEST_F(TestPlasmaStore, AbortTest) { ASSERT_TRUE(status.IsInvalid()); // Release, then abort. ARROW_CHECK_OK(client_.Release(object_id)); - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_TRUE(client_.IsInUse(object_id)); ARROW_CHECK_OK(client_.Abort(object_id)); - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_FALSE(client_.IsInUse(object_id)); // Test for object non-existence after the abort. @@ -394,7 +383,6 @@ TEST_F(TestPlasmaStore, AbortTest) { // Test that we can get the object. ARROW_CHECK_OK(client_.Get({object_id}, -1, &object_buffers)); AssertObjectBufferEqual(object_buffers[0], {42, 43}, {1, 2, 3, 4, 5}); - ARROW_CHECK_OK(client_.Release(object_id)); } TEST_F(TestPlasmaStore, MultipleClientTest) { diff --git a/docs/source/python/plasma.rst b/docs/source/python/plasma.rst index 09837cf6e9ef9..3df68eff59e00 100644 --- a/docs/source/python/plasma.rst +++ b/docs/source/python/plasma.rst @@ -60,7 +60,7 @@ socket name: .. code-block:: python import pyarrow.plasma as plasma - client = plasma.connect("/tmp/plasma", "", 0) + client = plasma.connect("/tmp/plasma", "") If the following error occurs from running the above Python code, that means that either the socket given is incorrect, or the ``./plasma_store`` is @@ -68,7 +68,7 @@ not currently running. Check to see if the Plasma store is still running. .. code-block:: shell - >>> client = plasma.connect("/tmp/plasma", "", 0) + >>> client = plasma.connect("/tmp/plasma", "") Connection to socket failed for pathname /tmp/plasma Could not connect to socket /tmp/plasma @@ -179,7 +179,7 @@ the object buffer. # Create a different client. Note that this second client could be # created in the same or in a separate, concurrent Python session. - client2 = plasma.connect("/tmp/plasma", "", 0) + client2 = plasma.connect("/tmp/plasma", "") # Get the object in the second client. This blocks until the object has been sealed. 
object_id2 = plasma.ObjectID(20 * b"a") @@ -221,7 +221,7 @@ of the object info might change in the future): import pyarrow.plasma as plasma import time - client = plasma.connect("/tmp/plasma", "", 0) + client = plasma.connect("/tmp/plasma", "") client.put("hello, world") # Sleep a little so we get different creation times @@ -452,7 +452,7 @@ You can test this with the following script: import pyarrow.plasma as plasma import time - client = plasma.connect("/tmp/plasma", "", 0) + client = plasma.connect("/tmp/plasma", "") data = np.random.randn(100000000) tensor = pa.Tensor.from_numpy(data) diff --git a/python/pyarrow/_plasma.pyx b/python/pyarrow/_plasma.pyx index 2fad09c0549c2..f7db3b4e0fec3 100644 --- a/python/pyarrow/_plasma.pyx +++ b/python/pyarrow/_plasma.pyx @@ -30,10 +30,11 @@ from cython.operator cimport dereference as deref, preincrement as inc from cpython.pycapsule cimport * import collections -import pyarrow import random import socket +import warnings +import pyarrow from pyarrow.lib cimport Buffer, NativeFile, check_status, pyarrow_wrap_buffer from pyarrow.includes.libarrow cimport (CBuffer, CMutableBuffer, CFixedSizeBufferWriter, CStatus) @@ -872,7 +873,7 @@ cdef class PlasmaClient: return result -def connect(store_socket_name, manager_socket_name, int release_delay, +def connect(store_socket_name, manager_socket_name, int release_delay=0, int num_retries=-1): """ Return a new PlasmaClient that is connected a plasma store and @@ -885,8 +886,7 @@ def connect(store_socket_name, manager_socket_name, int release_delay, manager_socket_name : str Name of the socket the plasma manager is listening at. release_delay : int - The maximum number of objects that the client will keep and - delay releasing (for caching reasons). + This parameter is deprecated and has no effect. num_retries : int, default -1 Number of times to try to connect to plasma store. 
Default value of -1 uses the default (50) @@ -894,6 +894,9 @@ def connect(store_socket_name, manager_socket_name, int release_delay, cdef PlasmaClient result = PlasmaClient() result.store_socket_name = store_socket_name.encode() result.manager_socket_name = manager_socket_name.encode() + if release_delay != 0: + warnings.warn("release_delay in PlasmaClient.connect is deprecated", + FutureWarning) with nogil: check_status(result.client.get() .Connect(result.store_socket_name, diff --git a/python/pyarrow/tensorflow/plasma_op.cc b/python/pyarrow/tensorflow/plasma_op.cc index a341d5a53988f..4e6449adfc85c 100644 --- a/python/pyarrow/tensorflow/plasma_op.cc +++ b/python/pyarrow/tensorflow/plasma_op.cc @@ -77,7 +77,7 @@ class TensorToPlasmaOp : public tf::AsyncOpKernel { if (!connected_) { VLOG(1) << "Connecting to Plasma..."; ARROW_CHECK_OK(client_.Connect(plasma_store_socket_name_, - plasma_manager_socket_name_, 0)); + plasma_manager_socket_name_)); VLOG(1) << "Connected!"; connected_ = true; } @@ -249,7 +249,7 @@ class PlasmaToTensorOp : public tf::AsyncOpKernel { if (!connected_) { VLOG(1) << "Connecting to Plasma..."; ARROW_CHECK_OK(client_.Connect(plasma_store_socket_name_, - plasma_manager_socket_name_, 0)); + plasma_manager_socket_name_)); VLOG(1) << "Connected!"; connected_ = true; } diff --git a/python/pyarrow/tests/test_plasma.py b/python/pyarrow/tests/test_plasma.py index e3d31b7de1990..66449e6dba9a3 100644 --- a/python/pyarrow/tests/test_plasma.py +++ b/python/pyarrow/tests/test_plasma.py @@ -121,8 +121,8 @@ def setup_method(self, test_method): use_one_memory_mapped_file=use_one_memory_mapped_file) self.plasma_store_name, self.p = self.plasma_store_ctx.__enter__() # Connect to Plasma. - self.plasma_client = plasma.connect(self.plasma_store_name, "", 64) - self.plasma_client2 = plasma.connect(self.plasma_store_name, "", 0) + self.plasma_client = plasma.connect(self.plasma_store_name, "") + self.plasma_client2 = plasma.connect(self.plasma_store_name, "") def teardown_method(self, test_method): try: @@ -948,7 +948,7 @@ def test_use_huge_pages(): plasma_store_memory=2*10**9, plasma_directory="/mnt/hugepages", use_hugepages=True) as (plasma_store_name, p): - plasma_client = plasma.connect(plasma_store_name, "", 64) + plasma_client = plasma.connect(plasma_store_name, "") create_object(plasma_client, 10**8) @@ -962,7 +962,7 @@ def test_plasma_client_sharing(): with plasma.start_plasma_store( plasma_store_memory=DEFAULT_PLASMA_STORE_MEMORY) \ as (plasma_store_name, p): - plasma_client = plasma.connect(plasma_store_name, "", 64) + plasma_client = plasma.connect(plasma_store_name, "") object_id = plasma_client.put(np.zeros(3)) buf = plasma_client.get(object_id) del plasma_client diff --git a/python/pyarrow/tests/test_plasma_tf_op.py b/python/pyarrow/tests/test_plasma_tf_op.py index d9bf915d663aa..51e8b283e0a1d 100644 --- a/python/pyarrow/tests/test_plasma_tf_op.py +++ b/python/pyarrow/tests/test_plasma_tf_op.py @@ -94,7 +94,7 @@ def test_plasma_tf_op(use_gpu=False): pytest.skip("TensorFlow Op not found") with plasma.start_plasma_store(10**8) as (plasma_store_name, p): - client = plasma.connect(plasma_store_name, "", 0) + client = plasma.connect(plasma_store_name, "") for dtype in [np.float32, np.float64, np.int8, np.int16, np.int32, np.int64]: run_tensorflow_test_with_dtype(tf, plasma, plasma_store_name, From 8c413036775796d9bcc52be56373bbb45de8c0ae Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Fri, 14 Dec 2018 07:27:08 -0800 Subject: [PATCH 38/45] ARROW-4015: [Plasma] remove unused 
interfaces for plasma manager https://github.com/apache/arrow/issues/3154 This removes unused plasma interfaces Fetch(), Wait(), Transfer() and Info(), which depend on plasma manager which has already been removed from ray. Author: Philipp Moritz Author: Zhijun Fu Author: Robert Nishihara Closes #3167 from zhijunfu/remove-legacy-interfaces and squashes the following commits: 0efb5005f fix tensorflow op be92e9085 fix java client 9da2cd38b Update _plasma.pyx 16ec63e9a More updates e7413f739 Update _plasma.pyx 21398b5e7 merge bcb320400 address comments 7967aea09 Merge branch 'master' into remove-legacy-interfaces 583cd97c4 ARROW-4015: remove unused interfaces for plasma manager --- c_glib/plasma-glib/client.cpp | 3 +- cpp/apidoc/tutorials/plasma.md | 8 +- cpp/apidoc/tutorials/tensor_to_py.md | 2 +- cpp/src/plasma/client.cc | 111 +------------- cpp/src/plasma/client.h | 100 +----------- cpp/src/plasma/common.cc | 3 - cpp/src/plasma/common.h | 24 --- cpp/src/plasma/format/plasma.fbs | 74 --------- ...org_apache_arrow_plasma_PlasmaClientJNI.cc | 73 --------- cpp/src/plasma/plasma.h | 3 - cpp/src/plasma/protocol.cc | 143 ------------------ cpp/src/plasma/protocol.h | 35 ----- cpp/src/plasma/test/client_tests.cc | 2 - cpp/src/plasma/test/serialization_tests.cc | 116 -------------- docs/source/python/plasma.rst | 10 +- .../apache/arrow/plasma/ObjectStoreLink.java | 27 ---- .../org/apache/arrow/plasma/PlasmaClient.java | 23 --- python/benchmarks/plasma.py | 4 +- python/examples/plasma/sorting/sort_df.py | 2 +- python/pyarrow/_plasma.pyx | 130 +--------------- python/pyarrow/tensorflow/plasma_op.cc | 18 +-- python/pyarrow/tests/test_plasma.py | 16 +- python/pyarrow/tests/test_plasma_tf_op.py | 8 +- 23 files changed, 41 insertions(+), 894 deletions(-) diff --git a/c_glib/plasma-glib/client.cpp b/c_glib/plasma-glib/client.cpp index c05a71085dd2d..9591a0a714f27 100644 --- a/c_glib/plasma-glib/client.cpp +++ b/c_glib/plasma-glib/client.cpp @@ -41,8 +41,7 @@ G_BEGIN_DECLS * * #GPlasmaClientCreateOptions is a class for customizing object creation. * - * #GPlasmaClient is a class for an interface with a plasma store - * and a plasma manager. + * #GPlasmaClient is a class for an interface with a plasma store. * * Since: 0.12.0 */ diff --git a/cpp/apidoc/tutorials/plasma.md b/cpp/apidoc/tutorials/plasma.md index 472d479c4b2f9..b9046d50bc922 100644 --- a/cpp/apidoc/tutorials/plasma.md +++ b/cpp/apidoc/tutorials/plasma.md @@ -80,7 +80,7 @@ using namespace plasma; int main(int argc, char** argv) { // Start up and connect a Plasma client. PlasmaClient client; - ARROW_CHECK_OK(client.Connect("/tmp/plasma", "")); + ARROW_CHECK_OK(client.Connect("/tmp/plasma")); // Disconnect the Plasma client. ARROW_CHECK_OK(client.Disconnect()); } @@ -226,7 +226,7 @@ using namespace plasma; int main(int argc, char** argv) { // Start up and connect a Plasma client. PlasmaClient client; - ARROW_CHECK_OK(client.Connect("/tmp/plasma", "")); + ARROW_CHECK_OK(client.Connect("/tmp/plasma")); // Create an object with a fixed ObjectID. ObjectID object_id = ObjectID::from_binary("00000000000000000000"); int64_t data_size = 1000; @@ -332,7 +332,7 @@ using namespace plasma; int main(int argc, char** argv) { // Start up and connect a Plasma client. 
PlasmaClient client; - ARROW_CHECK_OK(client.Connect("/tmp/plasma", "")); + ARROW_CHECK_OK(client.Connect("/tmp/plasma")); ObjectID object_id = ObjectID::from_binary("00000000000000000000"); ObjectBuffer object_buffer; ARROW_CHECK_OK(client.Get(&object_id, 1, -1, &object_buffer)); @@ -421,7 +421,7 @@ using namespace plasma; int main(int argc, char** argv) { // Start up and connect a Plasma client. PlasmaClient client; - ARROW_CHECK_OK(client.Connect("/tmp/plasma", "")); + ARROW_CHECK_OK(client.Connect("/tmp/plasma")); int fd; ARROW_CHECK_OK(client.Subscribe(&fd)); diff --git a/cpp/apidoc/tutorials/tensor_to_py.md b/cpp/apidoc/tutorials/tensor_to_py.md index 0be973a4f3df9..cd191fea07d09 100644 --- a/cpp/apidoc/tutorials/tensor_to_py.md +++ b/cpp/apidoc/tutorials/tensor_to_py.md @@ -105,7 +105,7 @@ The `inputs` variable will be a list of Object IDs in their raw byte string form import pyarrow as pa import pyarrow.plasma as plasma -plasma_client = plasma.connect('/tmp/plasma', '', 0) +plasma_client = plasma.connect('/tmp/plasma') # inputs: a list of object ids inputs = [20 * b'1'] diff --git a/cpp/src/plasma/client.cc b/cpp/src/plasma/client.cc index 2dbe2b41478ea..4215399c0b009 100644 --- a/cpp/src/plasma/client.cc +++ b/cpp/src/plasma/client.cc @@ -198,17 +198,6 @@ class PlasmaClient::Impl : public std::enable_shared_from_this= 0) { - close(manager_conn_); - manager_conn_ = -1; - } - return Status::OK(); -} - -Status PlasmaClient::Impl::Transfer(const char* address, int port, - const ObjectID& object_id) { - return SendDataRequest(manager_conn_, object_id, address, port); -} - -Status PlasmaClient::Impl::Fetch(int num_object_ids, const ObjectID* object_ids) { - ARROW_CHECK(manager_conn_ >= 0); - return SendFetchRequest(manager_conn_, object_ids, num_object_ids); -} - -int PlasmaClient::Impl::get_manager_fd() const { return manager_conn_; } - -Status PlasmaClient::Impl::Info(const ObjectID& object_id, int* object_status) { - ARROW_CHECK(manager_conn_ >= 0); - - RETURN_NOT_OK(SendStatusRequest(manager_conn_, &object_id, 1)); - std::vector buffer; - RETURN_NOT_OK(PlasmaReceive(manager_conn_, MessageType::PlasmaStatusReply, &buffer)); - ObjectID id; - RETURN_NOT_OK(ReadStatusReply(buffer.data(), buffer.size(), &id, object_status, 1)); - ARROW_CHECK(object_id == id); - return Status::OK(); -} - -Status PlasmaClient::Impl::Wait(int64_t num_object_requests, - ObjectRequest* object_requests, int num_ready_objects, - int64_t timeout_ms, int* num_objects_ready) { - ARROW_CHECK(manager_conn_ >= 0); - ARROW_CHECK(num_object_requests > 0); - ARROW_CHECK(num_ready_objects > 0); - ARROW_CHECK(num_ready_objects <= num_object_requests); - - for (int i = 0; i < num_object_requests; ++i) { - ARROW_CHECK(object_requests[i].type == ObjectRequestType::PLASMA_QUERY_LOCAL || - object_requests[i].type == ObjectRequestType::PLASMA_QUERY_ANYWHERE); - } - - RETURN_NOT_OK(SendWaitRequest(manager_conn_, object_requests, num_object_requests, - num_ready_objects, timeout_ms)); - std::vector buffer; - RETURN_NOT_OK(PlasmaReceive(manager_conn_, MessageType::PlasmaWaitReply, &buffer)); - RETURN_NOT_OK( - ReadWaitReply(buffer.data(), buffer.size(), object_requests, &num_ready_objects)); - - *num_objects_ready = 0; - for (int i = 0; i < num_object_requests; ++i) { - ObjectRequestType type = object_requests[i].type; - auto status = static_cast(object_requests[i].location); - switch (type) { - case ObjectRequestType::PLASMA_QUERY_LOCAL: - if (status == fb::ObjectStatus::Local) { - *num_objects_ready += 1; - } - break; - case 
ObjectRequestType::PLASMA_QUERY_ANYWHERE: - if (status == fb::ObjectStatus::Local || status == fb::ObjectStatus::Remote) { - *num_objects_ready += 1; - } else { - ARROW_CHECK(status == fb::ObjectStatus::Nonexistent); - } - break; - default: - ARROW_LOG(FATAL) << "This code should be unreachable."; - } - } return Status::OK(); } @@ -1052,27 +964,6 @@ Status PlasmaClient::DecodeNotification(const uint8_t* buffer, ObjectID* object_ Status PlasmaClient::Disconnect() { return impl_->Disconnect(); } -Status PlasmaClient::Fetch(int num_object_ids, const ObjectID* object_ids) { - return impl_->Fetch(num_object_ids, object_ids); -} - -Status PlasmaClient::Wait(int64_t num_object_requests, ObjectRequest* object_requests, - int num_ready_objects, int64_t timeout_ms, - int* num_objects_ready) { - return impl_->Wait(num_object_requests, object_requests, num_ready_objects, timeout_ms, - num_objects_ready); -} - -Status PlasmaClient::Transfer(const char* addr, int port, const ObjectID& object_id) { - return impl_->Transfer(addr, port, object_id); -} - -Status PlasmaClient::Info(const ObjectID& object_id, int* object_status) { - return impl_->Info(object_id, object_status); -} - -int PlasmaClient::get_manager_fd() const { return impl_->get_manager_fd(); } - bool PlasmaClient::IsInUse(const ObjectID& object_id) { return impl_->IsInUse(object_id); } diff --git a/cpp/src/plasma/client.h b/cpp/src/plasma/client.h index 514d2bd0d6d06..ac9e8eb0fe9c9 100644 --- a/cpp/src/plasma/client.h +++ b/cpp/src/plasma/client.h @@ -49,19 +49,20 @@ class ARROW_EXPORT PlasmaClient { PlasmaClient(); ~PlasmaClient(); - /// Connect to the local plasma store and plasma manager. Return - /// the resulting connection. + /// Connect to the local plasma store. Return the resulting connection. /// /// \param store_socket_name The name of the UNIX domain socket to use to /// connect to the Plasma store. /// \param manager_socket_name The name of the UNIX domain socket to use to /// connect to the local Plasma manager. If this is "", then this /// function will not connect to a manager. + /// Note that plasma manager is no longer supported, this function + /// will return failure if this is not "". /// \param release_delay Deprecated (not used). /// \param num_retries number of attempts to connect to IPC socket, default 50 /// \return The return status. Status Connect(const std::string& store_socket_name, - const std::string& manager_socket_name, int release_delay = 0, + const std::string& manager_socket_name = "", int release_delay = 0, int num_retries = -1); /// Create an object in the Plasma Store. Any metadata for this object must be @@ -249,99 +250,6 @@ class ARROW_EXPORT PlasmaClient { /// \return The return status. Status Disconnect(); - /// Attempt to initiate the transfer of some objects from remote Plasma - /// Stores. - /// This method does not guarantee that the fetched objects will arrive - /// locally. - /// - /// For an object that is available in the local Plasma Store, this method - /// will - /// not do anything. For an object that is not available locally, it will - /// check - /// if the object are already being fetched. If so, it will not do anything. - /// If - /// not, it will query the object table for a list of Plasma Managers that - /// have - /// the object. The object table will return a non-empty list, and this Plasma - /// Manager will attempt to initiate transfers from one of those Plasma - /// Managers. - /// - /// This function is non-blocking. 
- /// - /// This method is idempotent in the sense that it is ok to call it multiple - /// times. - /// - /// \param num_object_ids The number of object IDs fetch is being called on. - /// \param object_ids The IDs of the objects that fetch is being called on. - /// \return The return status. - Status Fetch(int num_object_ids, const ObjectID* object_ids); - - /// Wait for (1) a specified number of objects to be available (sealed) in the - /// local Plasma Store or in a remote Plasma Store, or (2) for a timeout to - /// expire. This is a blocking call. - /// - /// \param num_object_requests Size of the object_requests array. - /// \param object_requests Object event array. Each element contains a request - /// for a particular object_id. The type of request is specified in the - /// "type" field. - /// - A PLASMA_QUERY_LOCAL request is satisfied when object_id becomes - /// available in the local Plasma Store. In this case, this function - /// sets the "status" field to ObjectStatus::Local. Note, if the - /// status - /// is not ObjectStatus::Local, it will be ObjectStatus::Nonexistent, - /// but it may exist elsewhere in the system. - /// - A PLASMA_QUERY_ANYWHERE request is satisfied when object_id - /// becomes - /// available either at the local Plasma Store or on a remote Plasma - /// Store. In this case, the functions sets the "status" field to - /// ObjectStatus::Local or ObjectStatus::Remote. - /// \param num_ready_objects The number of requests in object_requests array - /// that - /// must be satisfied before the function returns, unless it timeouts. - /// The num_ready_objects should be no larger than num_object_requests. - /// \param timeout_ms Timeout value in milliseconds. If this timeout expires - /// before min_num_ready_objects of requests are satisfied, the - /// function - /// returns. - /// \param num_objects_ready Out parameter for number of satisfied requests in - /// the object_requests list. If the returned number is less than - /// min_num_ready_objects this means that timeout expired. - /// \return The return status. - Status Wait(int64_t num_object_requests, ObjectRequest* object_requests, - int num_ready_objects, int64_t timeout_ms, int* num_objects_ready); - - /// Transfer local object to a different plasma manager. - /// - /// \param addr IP address of the plasma manager we are transfering to. - /// \param port Port of the plasma manager we are transfering to. - /// \param object_id ObjectID of the object we are transfering. - /// \return The return status. - Status Transfer(const char* addr, int port, const ObjectID& object_id); - - /// Return the status of a given object. This method may query the object - /// table. - /// - /// \param object_id The ID of the object whose status we query. - /// \param object_status Out parameter for object status. Can take the - /// following values. - /// - PLASMA_CLIENT_LOCAL, if object is stored in the local Plasma - /// Store. - /// has been already scheduled by the Plasma Manager. - /// - PLASMA_CLIENT_TRANSFER, if the object is either currently being - /// transferred or just scheduled. - /// - PLASMA_CLIENT_REMOTE, if the object is stored at a remote - /// Plasma Store. - /// - PLASMA_CLIENT_DOES_NOT_EXIST, if the object doesn’t exist in the - /// system. - /// \return The return status. - Status Info(const ObjectID& object_id, int* object_status); - - /// Get the file descriptor for the socket connection to the plasma manager. - /// - /// \return The file descriptor for the manager connection. 
If there is no - /// connection to the manager, this is -1. - int get_manager_fd() const; - private: friend class PlasmaBuffer; FRIEND_TEST(TestPlasmaStore, GetTest); diff --git a/cpp/src/plasma/common.cc b/cpp/src/plasma/common.cc index 0ca17cf814f8a..1b86fd80b4920 100644 --- a/cpp/src/plasma/common.cc +++ b/cpp/src/plasma/common.cc @@ -107,9 +107,6 @@ bool UniqueID::operator==(const UniqueID& rhs) const { return std::memcmp(data(), rhs.data(), kUniqueIDSize) == 0; } -ARROW_EXPORT fb::ObjectStatus ObjectStatusLocal = fb::ObjectStatus::Local; -ARROW_EXPORT fb::ObjectStatus ObjectStatusRemote = fb::ObjectStatus::Remote; - const PlasmaStoreInfo* plasma_config; } // namespace plasma diff --git a/cpp/src/plasma/common.h b/cpp/src/plasma/common.h index 7090428ff41c9..38925fef929e4 100644 --- a/cpp/src/plasma/common.h +++ b/cpp/src/plasma/common.h @@ -66,30 +66,6 @@ typedef UniqueID ObjectID; /// Size of object hash digests. constexpr int64_t kDigestSize = sizeof(uint64_t); -enum class ObjectRequestType : int { - /// Query for object in the local plasma store. - PLASMA_QUERY_LOCAL = 1, - /// Query for object in the local plasma store or in a remote plasma store. - PLASMA_QUERY_ANYWHERE -}; - -/// Object request data structure. Used for Wait. -struct ObjectRequest { - /// The ID of the requested object. If ID_NIL request any object. - ObjectID object_id; - /// Request associated to the object. It can take one of the following values: - /// - PLASMA_QUERY_LOCAL: return if or when the object is available in the - /// local Plasma Store. - /// - PLASMA_QUERY_ANYWHERE: return if or when the object is available in - /// the system (i.e., either in the local or a remote Plasma Store). - ObjectRequestType type; - /// Object location. This can be - /// - ObjectLocation::Local: object is ready at the local Plasma Store. - /// - ObjectLocation::Remote: object is ready at a remote Plasma Store. - /// - ObjectLocation::Nonexistent: object does not exist in the system. - ObjectLocation location; -}; - enum class ObjectState : int { /// Object was created but not sealed in the local Plasma Store. PLASMA_CREATED = 1, diff --git a/cpp/src/plasma/format/plasma.fbs b/cpp/src/plasma/format/plasma.fbs index ef934fbd81ed2..b3c890391887e 100644 --- a/cpp/src/plasma/format/plasma.fbs +++ b/cpp/src/plasma/format/plasma.fbs @@ -42,9 +42,6 @@ enum MessageType:long { // Delete an object. PlasmaDeleteRequest, PlasmaDeleteReply, - // Get status of an object. - PlasmaStatusRequest, - PlasmaStatusReply, // See if the store contains an object (will be deprecated). PlasmaContainsRequest, PlasmaContainsReply, @@ -57,11 +54,6 @@ enum MessageType:long { // Make room for new objects in the plasma store. PlasmaEvictRequest, PlasmaEvictReply, - // Fetch objects from remote Plasma stores. - PlasmaFetchRequest, - // Wait for objects to be ready either from local or remote Plasma stores. - PlasmaWaitRequest, - PlasmaWaitReply, // Subscribe to a list of objects or to all objects. PlasmaSubscribeRequest, // Unsubscribe. @@ -239,35 +231,6 @@ table PlasmaDeleteReply { errors: [PlasmaError]; } -table PlasmaStatusRequest { - // IDs of the objects stored at local Plasma store we request the status of. - object_ids: [string]; -} - -enum ObjectStatus:int { - // Object is stored in the local Plasma Store. - Local, - // Object is stored on a remote Plasma store, and it is not stored on the - // local Plasma Store. - Remote, - // Object is not stored in the system. 
- Nonexistent, - // Object is currently transferred from a remote Plasma store the local - // Plasma Store. - Transfer -} - -table PlasmaStatusReply { - // IDs of the objects being returned. - object_ids: [string]; - // Status of the object. - status: [ObjectStatus]; -} - -// PlasmaContains is a subset of PlasmaStatus which does not -// involve the plasma manager, only the store. We should consider -// unifying them in the future and deprecating PlasmaContains. - table PlasmaContainsRequest { // ID of the object we are querying. object_id: string; @@ -309,43 +272,6 @@ table PlasmaEvictReply { num_bytes: ulong; } -table PlasmaFetchRequest { - // IDs of objects to be gotten. - object_ids: [string]; -} - -table ObjectRequestSpec { - // ID of the object. - object_id: string; - // The type of the object. This specifies whether we - // will be waiting for an object store in the local or - // global Plasma store. - type: int; -} - -table PlasmaWaitRequest { - // Array of object requests whose status we are asking for. - object_requests: [ObjectRequestSpec]; - // Number of objects expected to be returned, if available. - num_ready_objects: int; - // timeout - timeout: long; -} - -table ObjectReply { - // ID of the object. - object_id: string; - // The object status. This specifies where the object is stored. - status: ObjectStatus; -} - -table PlasmaWaitReply { - // Array of object requests being returned. - object_requests: [ObjectReply]; - // Number of objects expected to be returned, if available. - num_ready_objects: int; -} - table PlasmaSubscribeRequest { } diff --git a/cpp/src/plasma/lib/java/org_apache_arrow_plasma_PlasmaClientJNI.cc b/cpp/src/plasma/lib/java/org_apache_arrow_plasma_PlasmaClientJNI.cc index 7cd2f3574423c..fa376ec43ce13 100644 --- a/cpp/src/plasma/lib/java/org_apache_arrow_plasma_PlasmaClientJNI.cc +++ b/cpp/src/plasma/lib/java/org_apache_arrow_plasma_PlasmaClientJNI.cc @@ -220,79 +220,6 @@ JNIEXPORT jboolean JNICALL Java_org_apache_arrow_plasma_PlasmaClientJNI_contains return has_object; } -JNIEXPORT void JNICALL Java_org_apache_arrow_plasma_PlasmaClientJNI_fetch( - JNIEnv* env, jclass cls, jlong conn, jobjectArray object_ids) { - plasma::PlasmaClient* client = reinterpret_cast(conn); - jsize num_oids = env->GetArrayLength(object_ids); - - std::vector oids(num_oids); - for (int i = 0; i < num_oids; ++i) { - jbyteArray_to_object_id( - env, reinterpret_cast(env->GetObjectArrayElement(object_ids, i)), - &oids[i]); - } - - ARROW_CHECK_OK(client->Fetch(static_cast(num_oids), oids.data())); - - return; -} - -JNIEXPORT jobjectArray JNICALL Java_org_apache_arrow_plasma_PlasmaClientJNI_wait( - JNIEnv* env, jclass cls, jlong conn, jobjectArray object_ids, jint timeout_ms, - jint num_returns) { - plasma::PlasmaClient* client = reinterpret_cast(conn); - jsize num_oids = env->GetArrayLength(object_ids); - - if (num_returns < 0) { - jclass Exception = env->FindClass("java/lang/RuntimeException"); - env->ThrowNew(Exception, "The argument num_returns cannot be less than zero."); - return nullptr; - } - if (num_returns > num_oids) { - jclass Exception = env->FindClass("java/lang/RuntimeException"); - env->ThrowNew(Exception, - "The argument num_returns cannot be greater than len(object_ids)."); - return nullptr; - } - - std::vector oreqs(num_oids); - - for (int i = 0; i < num_oids; ++i) { - jbyteArray_to_object_id( - env, reinterpret_cast(env->GetObjectArrayElement(object_ids, i)), - &oreqs[i].object_id); - oreqs[i].type = plasma::ObjectRequestType::PLASMA_QUERY_ANYWHERE; - } - - int 
num_return_objects; - // TODO: may be blocked. consider to add the thread support - ARROW_CHECK_OK(client->Wait(static_cast(num_oids), oreqs.data(), num_returns, - static_cast(timeout_ms), &num_return_objects)); - - int num_to_return = std::min(num_return_objects, num_returns); - jclass clsByteArray = env->FindClass("[B"); - jobjectArray ret = env->NewObjectArray(num_to_return, clsByteArray, nullptr); - - int num_returned = 0; - jbyteArray oid = nullptr; - for (int i = 0; i < num_oids; ++i) { - if (num_returned >= num_to_return) { - break; - } - - if (oreqs[i].location == plasma::ObjectLocation::Local || - oreqs[i].location == plasma::ObjectLocation::Remote) { - oid = env->NewByteArray(OBJECT_ID_SIZE); - object_id_to_jbyteArray(env, oid, &oreqs[i].object_id); - env->SetObjectArrayElement(ret, num_returned, oid); - num_returned++; - } - } - ARROW_CHECK(num_returned == num_to_return); - - return ret; -} - JNIEXPORT jlong JNICALL Java_org_apache_arrow_plasma_PlasmaClientJNI_evict( JNIEnv* env, jclass cls, jlong conn, jlong num_bytes) { plasma::PlasmaClient* client = reinterpret_cast(conn); diff --git a/cpp/src/plasma/plasma.h b/cpp/src/plasma/plasma.h index 83caec7ee4958..aafe527466913 100644 --- a/cpp/src/plasma/plasma.h +++ b/cpp/src/plasma/plasma.h @@ -68,9 +68,6 @@ constexpr int64_t kBlockSize = 64; struct Client; -/// Mapping from object IDs to type and status of the request. -typedef std::unordered_map ObjectRequestMap; - // TODO(pcm): Replace this by the flatbuffers message PlasmaObjectSpec. struct PlasmaObject { #ifdef PLASMA_CUDA diff --git a/cpp/src/plasma/protocol.cc b/cpp/src/plasma/protocol.cc index c437840874538..a878647718264 100644 --- a/cpp/src/plasma/protocol.cc +++ b/cpp/src/plasma/protocol.cc @@ -42,10 +42,6 @@ using flatbuffers::uoffset_t; #define PLASMA_CHECK_ENUM(x, y) \ static_assert(static_cast(x) == static_cast(y), "protocol mismatch") -PLASMA_CHECK_ENUM(ObjectLocation::Local, fb::ObjectStatus::Local); -PLASMA_CHECK_ENUM(ObjectLocation::Remote, fb::ObjectStatus::Remote); -PLASMA_CHECK_ENUM(ObjectLocation::Nonexistent, fb::ObjectStatus::Nonexistent); - flatbuffers::Offset>> ToFlatbuffer(flatbuffers::FlatBufferBuilder* fbb, const ObjectID* object_ids, int64_t num_objects) { @@ -367,56 +363,6 @@ Status ReadDeleteReply(uint8_t* data, size_t size, std::vector* object return Status::OK(); } -// Satus messages. 
- -Status SendStatusRequest(int sock, const ObjectID* object_ids, int64_t num_objects) { - flatbuffers::FlatBufferBuilder fbb; - auto message = - fb::CreatePlasmaStatusRequest(fbb, ToFlatbuffer(&fbb, object_ids, num_objects)); - return PlasmaSend(sock, MessageType::PlasmaStatusRequest, &fbb, message); -} - -Status ReadStatusRequest(uint8_t* data, size_t size, ObjectID object_ids[], - int64_t num_objects) { - DCHECK(data); - auto message = flatbuffers::GetRoot(data); - DCHECK(VerifyFlatbuffer(message, data, size)); - for (uoffset_t i = 0; i < num_objects; ++i) { - object_ids[i] = ObjectID::from_binary(message->object_ids()->Get(i)->str()); - } - return Status::OK(); -} - -Status SendStatusReply(int sock, ObjectID object_ids[], int object_status[], - int64_t num_objects) { - flatbuffers::FlatBufferBuilder fbb; - auto message = - fb::CreatePlasmaStatusReply(fbb, ToFlatbuffer(&fbb, object_ids, num_objects), - fbb.CreateVector(object_status, num_objects)); - return PlasmaSend(sock, MessageType::PlasmaStatusReply, &fbb, message); -} - -int64_t ReadStatusReply_num_objects(uint8_t* data, size_t size) { - DCHECK(data); - auto message = flatbuffers::GetRoot(data); - DCHECK(VerifyFlatbuffer(message, data, size)); - return message->object_ids()->size(); -} - -Status ReadStatusReply(uint8_t* data, size_t size, ObjectID object_ids[], - int object_status[], int64_t num_objects) { - DCHECK(data); - auto message = flatbuffers::GetRoot(data); - DCHECK(VerifyFlatbuffer(message, data, size)); - for (uoffset_t i = 0; i < num_objects; ++i) { - object_ids[i] = ObjectID::from_binary(message->object_ids()->Get(i)->str()); - } - for (uoffset_t i = 0; i < num_objects; ++i) { - object_status[i] = message->status()->data()[i]; - } - return Status::OK(); -} - // Contains messages. Status SendContainsRequest(int sock, ObjectID object_id) { @@ -640,95 +586,6 @@ Status ReadGetReply(uint8_t* data, size_t size, ObjectID object_ids[], } return Status::OK(); } -// Fetch messages. - -Status SendFetchRequest(int sock, const ObjectID* object_ids, int64_t num_objects) { - flatbuffers::FlatBufferBuilder fbb; - auto message = - fb::CreatePlasmaFetchRequest(fbb, ToFlatbuffer(&fbb, object_ids, num_objects)); - return PlasmaSend(sock, MessageType::PlasmaFetchRequest, &fbb, message); -} - -Status ReadFetchRequest(uint8_t* data, size_t size, std::vector& object_ids) { - DCHECK(data); - auto message = flatbuffers::GetRoot(data); - DCHECK(VerifyFlatbuffer(message, data, size)); - for (uoffset_t i = 0; i < message->object_ids()->size(); ++i) { - object_ids.push_back(ObjectID::from_binary(message->object_ids()->Get(i)->str())); - } - return Status::OK(); -} - -// Wait messages. 
- -Status SendWaitRequest(int sock, ObjectRequest object_requests[], int64_t num_requests, - int num_ready_objects, int64_t timeout_ms) { - flatbuffers::FlatBufferBuilder fbb; - - std::vector> object_request_specs; - for (int i = 0; i < num_requests; i++) { - object_request_specs.push_back(fb::CreateObjectRequestSpec( - fbb, fbb.CreateString(object_requests[i].object_id.binary()), - static_cast(object_requests[i].type))); - } - - auto message = fb::CreatePlasmaWaitRequest(fbb, fbb.CreateVector(object_request_specs), - num_ready_objects, timeout_ms); - return PlasmaSend(sock, MessageType::PlasmaWaitRequest, &fbb, message); -} - -Status ReadWaitRequest(uint8_t* data, size_t size, ObjectRequestMap& object_requests, - int64_t* timeout_ms, int* num_ready_objects) { - DCHECK(data); - auto message = flatbuffers::GetRoot(data); - DCHECK(VerifyFlatbuffer(message, data, size)); - *num_ready_objects = message->num_ready_objects(); - *timeout_ms = message->timeout(); - - for (uoffset_t i = 0; i < message->object_requests()->size(); i++) { - ObjectID object_id = - ObjectID::from_binary(message->object_requests()->Get(i)->object_id()->str()); - ObjectRequest object_request( - {object_id, - static_cast(message->object_requests()->Get(i)->type()), - ObjectLocation::Nonexistent}); - object_requests[object_id] = object_request; - } - return Status::OK(); -} - -Status SendWaitReply(int sock, const ObjectRequestMap& object_requests, - int num_ready_objects) { - flatbuffers::FlatBufferBuilder fbb; - - std::vector> object_replies; - for (const auto& entry : object_requests) { - const auto& object_request = entry.second; - object_replies.push_back( - fb::CreateObjectReply(fbb, fbb.CreateString(object_request.object_id.binary()), - static_cast(object_request.location))); - } - - auto message = fb::CreatePlasmaWaitReply( - fbb, fbb.CreateVector(object_replies.data(), num_ready_objects), num_ready_objects); - return PlasmaSend(sock, MessageType::PlasmaWaitReply, &fbb, message); -} - -Status ReadWaitReply(uint8_t* data, size_t size, ObjectRequest object_requests[], - int* num_ready_objects) { - DCHECK(data); - - auto message = flatbuffers::GetRoot(data); - DCHECK(VerifyFlatbuffer(message, data, size)); - *num_ready_objects = message->num_ready_objects(); - for (int i = 0; i < *num_ready_objects; i++) { - object_requests[i].object_id = - ObjectID::from_binary(message->object_requests()->Get(i)->object_id()->str()); - object_requests[i].location = - static_cast(message->object_requests()->Get(i)->status()); - } - return Status::OK(); -} // Subscribe messages. diff --git a/cpp/src/plasma/protocol.h b/cpp/src/plasma/protocol.h index c8204584b8adb..0362bd47797d4 100644 --- a/cpp/src/plasma/protocol.h +++ b/cpp/src/plasma/protocol.h @@ -128,21 +128,6 @@ Status SendDeleteReply(int sock, const std::vector& object_ids, Status ReadDeleteReply(uint8_t* data, size_t size, std::vector* object_ids, std::vector* errors); -/* Satus messages. */ - -Status SendStatusRequest(int sock, const ObjectID* object_ids, int64_t num_objects); - -Status ReadStatusRequest(uint8_t* data, size_t size, ObjectID object_ids[], - int64_t num_objects); - -Status SendStatusReply(int sock, ObjectID object_ids[], int object_status[], - int64_t num_objects); - -int64_t ReadStatusReply_num_objects(uint8_t* data, size_t size); - -Status ReadStatusReply(uint8_t* data, size_t size, ObjectID object_ids[], - int object_status[], int64_t num_objects); - /* Plasma Constains message functions. 
*/ Status SendContainsRequest(int sock, ObjectID object_id); @@ -184,26 +169,6 @@ Status SendEvictReply(int sock, int64_t num_bytes); Status ReadEvictReply(uint8_t* data, size_t size, int64_t& num_bytes); -/* Plasma Fetch Remote message functions. */ - -Status SendFetchRequest(int sock, const ObjectID* object_ids, int64_t num_objects); - -Status ReadFetchRequest(uint8_t* data, size_t size, std::vector& object_ids); - -/* Plasma Wait message functions. */ - -Status SendWaitRequest(int sock, ObjectRequest object_requests[], int64_t num_requests, - int num_ready_objects, int64_t timeout_ms); - -Status ReadWaitRequest(uint8_t* data, size_t size, ObjectRequestMap& object_requests, - int64_t* timeout_ms, int* num_ready_objects); - -Status SendWaitReply(int sock, const ObjectRequestMap& object_requests, - int num_ready_objects); - -Status ReadWaitReply(uint8_t* data, size_t size, ObjectRequest object_requests[], - int* num_ready_objects); - /* Plasma Subscribe message functions. */ Status SendSubscribeRequest(int sock); diff --git a/cpp/src/plasma/test/client_tests.cc b/cpp/src/plasma/test/client_tests.cc index 65a9b71b7f251..30dc6850cd068 100644 --- a/cpp/src/plasma/test/client_tests.cc +++ b/cpp/src/plasma/test/client_tests.cc @@ -187,7 +187,6 @@ TEST_F(TestPlasmaStore, DeleteTest) { ARROW_CHECK_OK(client_.Contains(object_id, &has_object)); ASSERT_TRUE(has_object); - // Avoid race condition of Plasma Manager waiting for notification. ARROW_CHECK_OK(client_.Release(object_id)); // object_id is marked as to-be-deleted, when it is not in use, it will be deleted. ARROW_CHECK_OK(client_.Contains(object_id, &has_object)); @@ -251,7 +250,6 @@ TEST_F(TestPlasmaStore, ContainsTest) { // First create object. std::vector data(100, 0); CreateObject(client_, object_id, {42}, data); - // Avoid race condition of Plasma Manager waiting for notification. 
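// Get() with a timeout of -1 blocks until the object has been sealed, so the
// Contains() check below deterministically sees the object.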
std::vector object_buffers; ARROW_CHECK_OK(client_.Get({object_id}, -1, &object_buffers)); ARROW_CHECK_OK(client_.Contains(object_id, &has_object)); diff --git a/cpp/src/plasma/test/serialization_tests.cc b/cpp/src/plasma/test/serialization_tests.cc index 085ae97db980f..66d651d2923bf 100644 --- a/cpp/src/plasma/test/serialization_tests.cc +++ b/cpp/src/plasma/test/serialization_tests.cc @@ -254,44 +254,6 @@ TEST(PlasmaSerialization, DeleteReply) { close(fd); } -TEST(PlasmaSerialization, StatusRequest) { - int fd = create_temp_file(); - constexpr int64_t num_objects = 2; - ObjectID object_ids[num_objects]; - object_ids[0] = random_object_id(); - object_ids[1] = random_object_id(); - ARROW_CHECK_OK(SendStatusRequest(fd, object_ids, num_objects)); - std::vector data = - read_message_from_file(fd, MessageType::PlasmaStatusRequest); - ObjectID object_ids_read[num_objects]; - ARROW_CHECK_OK( - ReadStatusRequest(data.data(), data.size(), object_ids_read, num_objects)); - ASSERT_EQ(object_ids[0], object_ids_read[0]); - ASSERT_EQ(object_ids[1], object_ids_read[1]); - close(fd); -} - -TEST(PlasmaSerialization, StatusReply) { - int fd = create_temp_file(); - ObjectID object_ids[2]; - object_ids[0] = random_object_id(); - object_ids[1] = random_object_id(); - int object_statuses[2] = {42, 43}; - ARROW_CHECK_OK(SendStatusReply(fd, object_ids, object_statuses, 2)); - std::vector data = read_message_from_file(fd, MessageType::PlasmaStatusReply); - int64_t num_objects = ReadStatusReply_num_objects(data.data(), data.size()); - - std::vector object_ids_read(num_objects); - std::vector object_statuses_read(num_objects); - ARROW_CHECK_OK(ReadStatusReply(data.data(), data.size(), object_ids_read.data(), - object_statuses_read.data(), num_objects)); - ASSERT_EQ(object_ids[0], object_ids_read[0]); - ASSERT_EQ(object_ids[1], object_ids_read[1]); - ASSERT_EQ(object_statuses[0], object_statuses_read[0]); - ASSERT_EQ(object_statuses[1], object_statuses_read[1]); - close(fd); -} - TEST(PlasmaSerialization, EvictRequest) { int fd = create_temp_file(); int64_t num_bytes = 111; @@ -314,84 +276,6 @@ TEST(PlasmaSerialization, EvictReply) { close(fd); } -TEST(PlasmaSerialization, FetchRequest) { - int fd = create_temp_file(); - ObjectID object_ids[2]; - object_ids[0] = random_object_id(); - object_ids[1] = random_object_id(); - ARROW_CHECK_OK(SendFetchRequest(fd, object_ids, 2)); - std::vector data = read_message_from_file(fd, MessageType::PlasmaFetchRequest); - std::vector object_ids_read; - ARROW_CHECK_OK(ReadFetchRequest(data.data(), data.size(), object_ids_read)); - ASSERT_EQ(object_ids[0], object_ids_read[0]); - ASSERT_EQ(object_ids[1], object_ids_read[1]); - close(fd); -} - -TEST(PlasmaSerialization, WaitRequest) { - int fd = create_temp_file(); - const int num_objects_in = 2; - ObjectRequest object_requests_in[num_objects_in] = { - ObjectRequest({random_object_id(), ObjectRequestType::PLASMA_QUERY_ANYWHERE, - ObjectLocation::Local}), - ObjectRequest({random_object_id(), ObjectRequestType::PLASMA_QUERY_LOCAL, - ObjectLocation::Local})}; - const int num_ready_objects_in = 1; - int64_t timeout_ms = 1000; - - ARROW_CHECK_OK(SendWaitRequest(fd, &object_requests_in[0], num_objects_in, - num_ready_objects_in, timeout_ms)); - /* Read message back. 
*/ - std::vector data = read_message_from_file(fd, MessageType::PlasmaWaitRequest); - int num_ready_objects_out; - int64_t timeout_ms_read; - ObjectRequestMap object_requests_out; - ARROW_CHECK_OK(ReadWaitRequest(data.data(), data.size(), object_requests_out, - &timeout_ms_read, &num_ready_objects_out)); - ASSERT_EQ(num_objects_in, object_requests_out.size()); - ASSERT_EQ(num_ready_objects_out, num_ready_objects_in); - for (int i = 0; i < num_objects_in; i++) { - const ObjectID& object_id = object_requests_in[i].object_id; - ASSERT_EQ(1, object_requests_out.count(object_id)); - const auto& entry = object_requests_out.find(object_id); - ASSERT_TRUE(entry != object_requests_out.end()); - ASSERT_EQ(entry->second.object_id, object_requests_in[i].object_id); - ASSERT_EQ(entry->second.type, object_requests_in[i].type); - } - close(fd); -} - -TEST(PlasmaSerialization, WaitReply) { - int fd = create_temp_file(); - const int num_objects_in = 2; - /* Create a map with two ObjectRequests in it. */ - ObjectRequestMap objects_in(num_objects_in); - ObjectID id1 = random_object_id(); - objects_in[id1] = - ObjectRequest({id1, ObjectRequestType::PLASMA_QUERY_LOCAL, ObjectLocation::Local}); - ObjectID id2 = random_object_id(); - objects_in[id2] = ObjectRequest( - {id2, ObjectRequestType::PLASMA_QUERY_LOCAL, ObjectLocation::Nonexistent}); - - ARROW_CHECK_OK(SendWaitReply(fd, objects_in, num_objects_in)); - /* Read message back. */ - std::vector data = read_message_from_file(fd, MessageType::PlasmaWaitReply); - ObjectRequest objects_out[2]; - int num_objects_out; - ARROW_CHECK_OK( - ReadWaitReply(data.data(), data.size(), &objects_out[0], &num_objects_out)); - ASSERT_EQ(num_objects_in, num_objects_out); - for (int i = 0; i < num_objects_out; i++) { - /* Each object request must appear exactly once. */ - ASSERT_EQ(objects_in.count(objects_out[i].object_id), 1); - const auto& entry = objects_in.find(objects_out[i].object_id); - ASSERT_TRUE(entry != objects_in.end()); - ASSERT_EQ(entry->second.object_id, objects_out[i].object_id); - ASSERT_EQ(entry->second.location, objects_out[i].location); - } - close(fd); -} - TEST(PlasmaSerialization, DataRequest) { int fd = create_temp_file(); ObjectID object_id1 = random_object_id(); diff --git a/docs/source/python/plasma.rst b/docs/source/python/plasma.rst index 3df68eff59e00..660c5fbba7918 100644 --- a/docs/source/python/plasma.rst +++ b/docs/source/python/plasma.rst @@ -60,7 +60,7 @@ socket name: .. code-block:: python import pyarrow.plasma as plasma - client = plasma.connect("/tmp/plasma", "") + client = plasma.connect("/tmp/plasma") If the following error occurs from running the above Python code, that means that either the socket given is incorrect, or the ``./plasma_store`` is @@ -68,7 +68,7 @@ not currently running. Check to see if the Plasma store is still running. .. code-block:: shell - >>> client = plasma.connect("/tmp/plasma", "") + >>> client = plasma.connect("/tmp/plasma") Connection to socket failed for pathname /tmp/plasma Could not connect to socket /tmp/plasma @@ -179,7 +179,7 @@ the object buffer. # Create a different client. Note that this second client could be # created in the same or in a separate, concurrent Python session. - client2 = plasma.connect("/tmp/plasma", "") + client2 = plasma.connect("/tmp/plasma") # Get the object in the second client. This blocks until the object has been sealed. 
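# A timeout in milliseconds (timeout_ms) can be passed to bound the wait;
# the default of -1 blocks until the object has been sealed.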
object_id2 = plasma.ObjectID(20 * b"a") @@ -221,7 +221,7 @@ of the object info might change in the future): import pyarrow.plasma as plasma import time - client = plasma.connect("/tmp/plasma", "") + client = plasma.connect("/tmp/plasma") client.put("hello, world") # Sleep a little so we get different creation times @@ -452,7 +452,7 @@ You can test this with the following script: import pyarrow.plasma as plasma import time - client = plasma.connect("/tmp/plasma", "") + client = plasma.connect("/tmp/plasma") data = np.random.randn(100000000) tensor = pa.Tensor.from_numpy(data) diff --git a/java/plasma/src/main/java/org/apache/arrow/plasma/ObjectStoreLink.java b/java/plasma/src/main/java/org/apache/arrow/plasma/ObjectStoreLink.java index 3b67bc08ecfdc..8d6eec02e75a4 100644 --- a/java/plasma/src/main/java/org/apache/arrow/plasma/ObjectStoreLink.java +++ b/java/plasma/src/main/java/org/apache/arrow/plasma/ObjectStoreLink.java @@ -79,16 +79,6 @@ default byte[] get(byte[] objectId, int timeoutMs, boolean isMetadata) { */ List get(byte[][] objectIds, int timeoutMs); - /** - * Wait until numReturns objects in objectIds are ready. - * - * @param objectIds List of object IDs to wait for. - * @param timeoutMs Return to the caller after timeoutMs milliseconds. - * @param numReturns We are waiting for this number of objects to be ready. - * @return List of object IDs that are ready - */ - List wait(byte[][] objectIds, int timeoutMs, int numReturns); - /** * Compute the hash of an object in the object store. * @@ -98,23 +88,6 @@ default byte[] get(byte[] objectId, int timeoutMs, boolean isMetadata) { */ byte[] hash(byte[] objectId); - /** - * Fetch the object with the given ID from other plasma manager instances. - * - * @param objectId The object ID used to identify the object. - */ - default void fetch(byte[] objectId) { - byte[][] objectIds = {objectId}; - fetch(objectIds); - } - - /** - * Fetch the objects with the given IDs from other plasma manager instances. - * - * @param objectIds List of object IDs used to identify the objects. - */ - void fetch(byte[][] objectIds); - /** * Evict some objects to recover given count of bytes. 
* diff --git a/java/plasma/src/main/java/org/apache/arrow/plasma/PlasmaClient.java b/java/plasma/src/main/java/org/apache/arrow/plasma/PlasmaClient.java index db1f35e1641bb..d69b54df05ed1 100644 --- a/java/plasma/src/main/java/org/apache/arrow/plasma/PlasmaClient.java +++ b/java/plasma/src/main/java/org/apache/arrow/plasma/PlasmaClient.java @@ -81,34 +81,11 @@ public List get(byte[][] objectIds, int timeoutMs, boolean isMetadata) { return ret; } - @Override - public List wait(byte[][] objectIds, int timeoutMs, int numReturns) { - byte[][] readys = PlasmaClientJNI.wait(conn, objectIds, timeoutMs, numReturns); - - List ret = new ArrayList<>(); - for (byte[] ready : readys) { - for (byte[] id : objectIds) { - if (Arrays.equals(ready, id)) { - ret.add(id); - break; - } - } - } - - assert (ret.size() == readys.length); - return ret; - } - @Override public byte[] hash(byte[] objectId) { return PlasmaClientJNI.hash(conn, objectId); } - @Override - public void fetch(byte[][] objectIds) { - PlasmaClientJNI.fetch(conn, objectIds); - } - @Override public List get(byte[][] objectIds, int timeoutMs) { ByteBuffer[][] bufs = PlasmaClientJNI.get(conn, objectIds, timeoutMs); diff --git a/python/benchmarks/plasma.py b/python/benchmarks/plasma.py index 7cefcdffad2c6..398ec72561255 100644 --- a/python/benchmarks/plasma.py +++ b/python/benchmarks/plasma.py @@ -32,7 +32,7 @@ def setup(self, size): self.plasma_store_ctx = plasma.start_plasma_store( plasma_store_memory=10**9) plasma_store_name, p = self.plasma_store_ctx.__enter__() - self.plasma_client = plasma.connect(plasma_store_name, "", 64) + self.plasma_client = plasma.connect(plasma_store_name) self.data = np.random.randn(size // 8) @@ -52,7 +52,7 @@ def setup(self): self.plasma_store_ctx = plasma.start_plasma_store( plasma_store_memory=10**9) plasma_store_name, p = self.plasma_store_ctx.__enter__() - self.plasma_client = plasma.connect(plasma_store_name, "", 64) + self.plasma_client = plasma.connect(plasma_store_name) def teardown(self): self.plasma_store_ctx.__exit__(None, None, None) diff --git a/python/examples/plasma/sorting/sort_df.py b/python/examples/plasma/sorting/sort_df.py index 2e4df589ee38c..2a51759a67b89 100644 --- a/python/examples/plasma/sorting/sort_df.py +++ b/python/examples/plasma/sorting/sort_df.py @@ -49,7 +49,7 @@ # Connect to clients def connect(): global client - client = plasma.connect('/tmp/store', '', 0) + client = plasma.connect('/tmp/store') np.random.seed(int(time.time() * 10e7) % 10000000) diff --git a/python/pyarrow/_plasma.pyx b/python/pyarrow/_plasma.pyx index f7db3b4e0fec3..cfaa39c96ea5d 100644 --- a/python/pyarrow/_plasma.pyx +++ b/python/pyarrow/_plasma.pyx @@ -63,11 +63,6 @@ cdef extern from "plasma/common.h" nogil: @staticmethod int64_t size() - cdef struct CObjectRequest" plasma::ObjectRequest": - CUniqueID object_id - int type - int location - cdef enum CObjectState" plasma::ObjectState": PLASMA_CREATED" plasma::ObjectState::PLASMA_CREATED" PLASMA_SEALED" plasma::ObjectState::PLASMA_SEALED" @@ -92,14 +87,6 @@ cdef extern from "plasma/common.h" nogil: cdef extern from "plasma/common.h": cdef int64_t kDigestSize" plasma::kDigestSize" - cdef enum ObjectRequestType: - PLASMA_QUERY_LOCAL"plasma::ObjectRequestType::PLASMA_QUERY_LOCAL", - PLASMA_QUERY_ANYWHERE"plasma::ObjectRequestType::PLASMA_QUERY_ANYWHERE" - - cdef enum ObjectLocation: - ObjectStatusLocal"plasma::ObjectLocation::Local" - ObjectStatusRemote"plasma::ObjectLocation::Remote" - cdef extern from "plasma/client.h" nogil: cdef cppclass CPlasmaClient" 
plasma::PlasmaClient": @@ -143,16 +130,6 @@ cdef extern from "plasma/client.h" nogil: CStatus Disconnect() - CStatus Fetch(int num_object_ids, const CUniqueID* object_ids) - - CStatus Wait(int64_t num_object_requests, - CObjectRequest* object_requests, - int num_ready_objects, int64_t timeout_ms, - int* num_objects_ready) - - CStatus Transfer(const char* addr, int port, - const CUniqueID& object_id) - CStatus Delete(const c_vector[CUniqueID] object_ids) cdef extern from "plasma/client.h" nogil: @@ -285,13 +262,11 @@ cdef class PlasmaClient: shared_ptr[CPlasmaClient] client int notification_fd c_string store_socket_name - c_string manager_socket_name def __cinit__(self): self.client.reset(new CPlasmaClient()) self.notification_fd = -1 self.store_socket_name = b"" - self.manager_socket_name = b"" cdef _get_object_buffers(self, object_ids, int64_t timeout_ms, c_vector[CObjectBuffer]* result): @@ -315,10 +290,6 @@ cdef class PlasmaClient: def store_socket_name(self): return self.store_socket_name.decode() - @property - def manager_socket_name(self): - return self.manager_socket_name.decode() - def create(self, ObjectID object_id, int64_t data_size, c_string metadata=b""): """ @@ -642,95 +613,6 @@ cdef class PlasmaClient: check_status(self.client.get().Evict(num_bytes, num_bytes_evicted)) return num_bytes_evicted - def transfer(self, address, int port, ObjectID object_id): - """ - Transfer local object with id object_id to another plasma instance - - Parameters - ---------- - addr : str - IPv4 address of the plasma instance the object is sent to. - port : int - Port number of the plasma instance the object is sent to. - object_id : str - A string used to identify an object. - """ - cdef c_string addr = address.encode() - with nogil: - check_status(self.client.get() - .Transfer(addr.c_str(), port, object_id.data)) - - def fetch(self, object_ids): - """ - Fetch the objects with the given IDs from other plasma managers. - - Parameters - ---------- - object_ids : list - A list of strings used to identify the objects. - """ - cdef c_vector[CUniqueID] ids - cdef ObjectID object_id - for object_id in object_ids: - ids.push_back(object_id.data) - with nogil: - check_status(self.client.get().Fetch(ids.size(), ids.data())) - - def wait(self, object_ids, int64_t timeout=PLASMA_WAIT_TIMEOUT, - int num_returns=1): - """ - Wait until num_returns objects in object_ids are ready. - Currently, the object ID arguments to wait must be unique. - - Parameters - ---------- - object_ids : list - List of object IDs to wait for. - timeout :int - Return to the caller after timeout milliseconds. - num_returns : int - We are waiting for this number of objects to be ready. - - Returns - ------- - list - List of object IDs that are ready. - list - List of object IDs we might still wait on. - """ - # Check that the object ID arguments are unique. The plasma manager - # currently crashes if given duplicate object IDs. 
- if len(object_ids) != len(set(object_ids)): - raise Exception("Wait requires a list of unique object IDs.") - cdef int64_t num_object_requests = len(object_ids) - cdef c_vector[CObjectRequest] object_requests = ( - c_vector[CObjectRequest](num_object_requests)) - cdef int num_objects_ready = 0 - cdef ObjectID object_id - for i, object_id in enumerate(object_ids): - object_requests[i].object_id = object_id.data - object_requests[i].type = PLASMA_QUERY_ANYWHERE - with nogil: - check_status(self.client.get().Wait(num_object_requests, - object_requests.data(), - num_returns, timeout, - &num_objects_ready)) - cdef int num_to_return = min(num_objects_ready, num_returns) - ready_ids = [] - waiting_ids = set(object_ids) - cdef int num_returned = 0 - for i in range(len(object_ids)): - if num_returned == num_to_return: - break - if (object_requests[i].location == ObjectStatusLocal or - object_requests[i].location == ObjectStatusRemote): - ready_ids.append( - ObjectID(object_requests[i].object_id.binary())) - waiting_ids.discard( - ObjectID(object_requests[i].object_id.binary())) - num_returned += 1 - return ready_ids, list(waiting_ids) - def subscribe(self): """Subscribe to notifications about sealed objects.""" with nogil: @@ -873,7 +755,7 @@ cdef class PlasmaClient: return result -def connect(store_socket_name, manager_socket_name, int release_delay=0, +def connect(store_socket_name, manager_socket_name=None, int release_delay=0, int num_retries=-1): """ Return a new PlasmaClient that is connected a plasma store and @@ -884,22 +766,24 @@ def connect(store_socket_name, manager_socket_name, int release_delay=0, store_socket_name : str Name of the socket the plasma store is listening at. manager_socket_name : str - Name of the socket the plasma manager is listening at. + This parameter is deprecated and has no effect. release_delay : int This parameter is deprecated and has no effect. num_retries : int, default -1 Number of times to try to connect to plasma store. 
Default value of -1 uses the default (50) """ + if manager_socket_name is not None: + warnings.warn( + "manager_socket_name in PlasmaClient.connect is deprecated", + FutureWarning) cdef PlasmaClient result = PlasmaClient() result.store_socket_name = store_socket_name.encode() - result.manager_socket_name = manager_socket_name.encode() if release_delay != 0: warnings.warn("release_delay in PlasmaClient.connect is deprecated", FutureWarning) with nogil: check_status(result.client.get() - .Connect(result.store_socket_name, - result.manager_socket_name, + .Connect(result.store_socket_name, b"", release_delay, num_retries)) return result diff --git a/python/pyarrow/tensorflow/plasma_op.cc b/python/pyarrow/tensorflow/plasma_op.cc index 4e6449adfc85c..852be339389e7 100644 --- a/python/pyarrow/tensorflow/plasma_op.cc +++ b/python/pyarrow/tensorflow/plasma_op.cc @@ -71,13 +71,10 @@ class TensorToPlasmaOp : public tf::AsyncOpKernel { explicit TensorToPlasmaOp(tf::OpKernelConstruction* context) : tf::AsyncOpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("plasma_store_socket_name", &plasma_store_socket_name_)); - OP_REQUIRES_OK(context, context->GetAttr("plasma_manager_socket_name", - &plasma_manager_socket_name_)); tf::mutex_lock lock(mu_); if (!connected_) { VLOG(1) << "Connecting to Plasma..."; - ARROW_CHECK_OK(client_.Connect(plasma_store_socket_name_, - plasma_manager_socket_name_)); + ARROW_CHECK_OK(client_.Connect(plasma_store_socket_name_)); VLOG(1) << "Connected!"; connected_ = true; } @@ -226,7 +223,6 @@ class TensorToPlasmaOp : public tf::AsyncOpKernel { private: std::string plasma_store_socket_name_; - std::string plasma_manager_socket_name_; tf::mutex mu_; bool connected_ = false; @@ -243,13 +239,10 @@ class PlasmaToTensorOp : public tf::AsyncOpKernel { explicit PlasmaToTensorOp(tf::OpKernelConstruction* context) : tf::AsyncOpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("plasma_store_socket_name", &plasma_store_socket_name_)); - OP_REQUIRES_OK(context, context->GetAttr("plasma_manager_socket_name", - &plasma_manager_socket_name_)); tf::mutex_lock lock(mu_); if (!connected_) { VLOG(1) << "Connecting to Plasma..."; - ARROW_CHECK_OK(client_.Connect(plasma_store_socket_name_, - plasma_manager_socket_name_)); + ARROW_CHECK_OK(client_.Connect(plasma_store_socket_name_)); VLOG(1) << "Connected!"; connected_ = true; } @@ -364,7 +357,6 @@ class PlasmaToTensorOp : public tf::AsyncOpKernel { private: std::string plasma_store_socket_name_; - std::string plasma_manager_socket_name_; tf::mutex mu_; bool connected_ = false; @@ -375,8 +367,7 @@ REGISTER_OP("TensorToPlasma") .Input("input_tensor: dtypes") .Input("plasma_object_id: string") .Attr("dtypes: list(type)") - .Attr("plasma_store_socket_name: string") - .Attr("plasma_manager_socket_name: string"); + .Attr("plasma_store_socket_name: string"); REGISTER_KERNEL_BUILDER(Name("TensorToPlasma").Device(tf::DEVICE_CPU), TensorToPlasmaOp); @@ -389,8 +380,7 @@ REGISTER_OP("PlasmaToTensor") .Input("plasma_object_id: string") .Output("tensor: dtype") .Attr("dtype: type") - .Attr("plasma_store_socket_name: string") - .Attr("plasma_manager_socket_name: string"); + .Attr("plasma_store_socket_name: string"); REGISTER_KERNEL_BUILDER(Name("PlasmaToTensor").Device(tf::DEVICE_CPU), PlasmaToTensorOp); diff --git a/python/pyarrow/tests/test_plasma.py b/python/pyarrow/tests/test_plasma.py index 66449e6dba9a3..05375d7b65aee 100644 --- a/python/pyarrow/tests/test_plasma.py +++ b/python/pyarrow/tests/test_plasma.py @@ -121,8 +121,8 @@ def 
setup_method(self, test_method): use_one_memory_mapped_file=use_one_memory_mapped_file) self.plasma_store_name, self.p = self.plasma_store_ctx.__enter__() # Connect to Plasma. - self.plasma_client = plasma.connect(self.plasma_store_name, "") - self.plasma_client2 = plasma.connect(self.plasma_store_name, "") + self.plasma_client = plasma.connect(self.plasma_store_name) + self.plasma_client2 = plasma.connect(self.plasma_store_name) def teardown_method(self, test_method): try: @@ -147,7 +147,7 @@ def test_connection_failure_raises_exception(self): import pyarrow.plasma as plasma # ARROW-1264 with pytest.raises(IOError): - plasma.connect('unknown-store-name', '', 0, 1) + plasma.connect('unknown-store-name', num_retries=1) def test_create(self): # Create an object id string. @@ -860,7 +860,7 @@ def test_client_death_during_get(self): object_id = random_object_id() def client_blocked_in_get(plasma_store_name): - client = plasma.connect(self.plasma_store_name, "", 0) + client = plasma.connect(self.plasma_store_name) # Try to get an object ID that doesn't exist. This should block. client.get([object_id]) @@ -889,7 +889,7 @@ def test_client_getting_multiple_objects(self): object_ids = [random_object_id() for _ in range(10)] def client_get_multiple(plasma_store_name): - client = plasma.connect(self.plasma_store_name, "", 0) + client = plasma.connect(self.plasma_store_name) # Try to get an object ID that doesn't exist. This should block. client.get(object_ids) @@ -948,7 +948,7 @@ def test_use_huge_pages(): plasma_store_memory=2*10**9, plasma_directory="/mnt/hugepages", use_hugepages=True) as (plasma_store_name, p): - plasma_client = plasma.connect(plasma_store_name, "") + plasma_client = plasma.connect(plasma_store_name) create_object(plasma_client, 10**8) @@ -962,7 +962,7 @@ def test_plasma_client_sharing(): with plasma.start_plasma_store( plasma_store_memory=DEFAULT_PLASMA_STORE_MEMORY) \ as (plasma_store_name, p): - plasma_client = plasma.connect(plasma_store_name, "") + plasma_client = plasma.connect(plasma_store_name) object_id = plasma_client.put(np.zeros(3)) buf = plasma_client.get(object_id) del plasma_client @@ -977,7 +977,7 @@ def test_plasma_list(): with plasma.start_plasma_store( plasma_store_memory=DEFAULT_PLASMA_STORE_MEMORY) \ as (plasma_store_name, p): - plasma_client = plasma.connect(plasma_store_name, "", 0) + plasma_client = plasma.connect(plasma_store_name) # Test sizes u, _, _ = create_object(plasma_client, 11, metadata_size=7, seal=False) diff --git a/python/pyarrow/tests/test_plasma_tf_op.py b/python/pyarrow/tests/test_plasma_tf_op.py index 51e8b283e0a1d..e239055209f00 100644 --- a/python/pyarrow/tests/test_plasma_tf_op.py +++ b/python/pyarrow/tests/test_plasma_tf_op.py @@ -37,15 +37,13 @@ def ToPlasma(): return plasma.tf_plasma_op.tensor_to_plasma( [data_tensor, ones_tensor], object_id, - plasma_store_socket_name=plasma_store_name, - plasma_manager_socket_name="") + plasma_store_socket_name=plasma_store_name) def FromPlasma(): return plasma.tf_plasma_op.plasma_to_tensor( object_id, dtype=tf.as_dtype(dtype), - plasma_store_socket_name=plasma_store_name, - plasma_manager_socket_name="") + plasma_store_socket_name=plasma_store_name) with tf.device(FORCE_DEVICE): to_plasma = ToPlasma() @@ -94,7 +92,7 @@ def test_plasma_tf_op(use_gpu=False): pytest.skip("TensorFlow Op not found") with plasma.start_plasma_store(10**8) as (plasma_store_name, p): - client = plasma.connect(plasma_store_name, "") + client = plasma.connect(plasma_store_name) for dtype in [np.float32, np.float64, np.int8, 
np.int16, np.int32, np.int64]: run_tensorflow_test_with_dtype(tf, plasma, plasma_store_name, From 804502f941f808583e9f7043e203533de738d577 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Dec 2018 13:05:50 -0600 Subject: [PATCH 39/45] ARROW-3184: [C++] Enable modular builds and installs with ARROW_OPTIONAL_INSTALL option. Remove ARROW_GANDIVA_BUILD_TESTS Apparently CMake really does not want you to do `make $TARGET install` out of the box; I searched various threads about this and there are no great solutions. For expert users, this provides the option of installing only targets that have been built, while others will be ignored. Because the possibility of users shooting themselves in the foot is so high with this, it doesn't make sense to enable it by default. In the hands of an expert, though, this can significantly reduce build times and make it possible to build libraries and unit tests for only a part of the project, then install only those libraries. This will install all header files regardless of what libraries are built; I didn't see any easy way to avoid that, since you would have to know which headers are used by which library. Resolves ARROW-3994 Author: Wes McKinney Closes #3172 from wesm/ARROW-3184 and squashes the following commits: 583a916e0 plasma_store_server requires static libraries 3c2a21ea1 Add plasma_store_server to 'plasma' target 85fda6419 Build plasma again in Python build for now 1b3ac57dc Fix multiline comment in CMake d3ce84c4e More option reorg b6630605c Reorganize CMake options a bit more logically. Add more explicit warning about ARROW_OPTIONAL_INSTALL 262058b2f Do not build Gandiva JNI bindings by default 918fdb371 Fix ARROW_TEST_INCLUDE_LABELS option to actually work 578bc58f5 Use GLOB instead of GLOB_RECURSE daaafa214 Misc fixes a84643d6e Fix header install option f899bdd99 Work around ARROW-4026 via environment variable for now 001a3ad57 Pass in ARROW_TEST_INCLUDE_LABELS via environment variable a1df9ab3d Clarify documentation 2eca8a740 Enable modular builds and install with ARROW_OPTIONAL_INSTALL option. Remove ARROW_GANDIVA_BUILD_TESTS. Add helper function for installing header files.
Build fewer targets using these options in some Travis CI jobs --- .travis.yml | 12 ++ ci/travis_before_script_cpp.sh | 18 +- ci/travis_script_python.sh | 10 +- cpp/CMakeLists.txt | 163 ++++++++++-------- cpp/README.md | 15 +- cpp/cmake_modules/BuildUtils.cmake | 20 ++- cpp/src/arrow/CMakeLists.txt | 26 +-- .../arrow/adapters/tensorflow/CMakeLists.txt | 5 +- cpp/src/arrow/compute/CMakeLists.txt | 7 +- cpp/src/arrow/csv/CMakeLists.txt | 7 +- cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt | 18 +- cpp/src/arrow/gpu/CMakeLists.txt | 7 +- cpp/src/arrow/ipc/CMakeLists.txt | 11 +- cpp/src/arrow/python/CMakeLists.txt | 24 +-- cpp/src/arrow/util/CMakeLists.txt | 40 +---- cpp/src/gandiva/CMakeLists.txt | 79 +++------ cpp/src/gandiva/precompiled/CMakeLists.txt | 2 +- cpp/src/parquet/CMakeLists.txt | 21 +-- cpp/src/parquet/api/CMakeLists.txt | 7 +- cpp/src/parquet/arrow/CMakeLists.txt | 7 +- cpp/src/parquet/util/CMakeLists.txt | 7 +- cpp/src/plasma/CMakeLists.txt | 6 +- cpp/tools/parquet/CMakeLists.txt | 4 +- dev/tasks/gandiva-jars/build-cpp.sh | 1 + 24 files changed, 215 insertions(+), 302 deletions(-) diff --git a/.travis.yml b/.travis.yml index d1fc6dba35dd2..d22a4e7df0fea 100644 --- a/.travis.yml +++ b/.travis.yml @@ -109,6 +109,12 @@ matrix: jdk: openjdk8 env: - ARROW_TRAVIS_GANDIVA=1 + - ARROW_TRAVIS_GANDIVA_JAVA=1 + - ARROW_TRAVIS_GANDIVA_TESTS=1 + - ARROW_TRAVIS_OPTIONAL_INSTALL=1 + - ARROW_CPP_BUILD_TARGETS="gandiva" + # TODO(wesm): Remove this after ARROW-4026 + - ARROW_TRAVIS_CPP_TEST_INCLUDE_LABELS="gandiva" - ARROW_TRAVIS_USE_TOOLCHAIN=1 # ARROW-3979 temporarily disabled. - ARROW_TRAVIS_VALGRIND=0 @@ -155,6 +161,12 @@ matrix: addons: env: - ARROW_TRAVIS_GANDIVA=1 + - ARROW_TRAVIS_GANDIVA_JAVA=1 + - ARROW_TRAVIS_GANDIVA_TESTS=1 + - ARROW_TRAVIS_OPTIONAL_INSTALL=1 + - ARROW_CPP_BUILD_TARGETS="gandiva" + # TODO(wesm): Remove this after ARROW-4026 + - ARROW_TRAVIS_CPP_TEST_INCLUDE_LABELS="gandiva" - ARROW_TRAVIS_USE_TOOLCHAIN=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN before_script: diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 5f398e8c6e327..6cb7d6074f230 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -42,6 +42,7 @@ fi CMAKE_COMMON_FLAGS="\ -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL \ +-DARROW_TEST_INCLUDE_LABELS=$ARROW_TRAVIS_CPP_TEST_INCLUDE_LABELS \ -DARROW_NO_DEPRECATED_API=ON \ -DARROW_EXTRA_ERROR_CONTEXT=ON" CMAKE_LINUX_FLAGS="" @@ -98,8 +99,11 @@ fi if [ $ARROW_TRAVIS_GANDIVA == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA=ON" - if [ $only_library_mode == "no" ]; then - CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA_BUILD_TESTS=ON" + if [ $ARROW_TRAVIS_GANDIVA_JAVA == "1" ]; then + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA_JAVA=ON" + fi + if [ $ARROW_TRAVIS_GANDIVA_TESTS == "1" ]; then + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_BUILD_TESTS=ON" fi fi @@ -119,6 +123,10 @@ if [ $ARROW_TRAVIS_USE_VENDORED_BOOST == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_BOOST_VENDORED=ON" fi +if [ $ARROW_TRAVIS_OPTIONAL_INSTALL == "1" ]; then + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_OPTIONAL_INSTALL=ON" +fi + if [ $TRAVIS_OS_NAME == "linux" ]; then cmake $CMAKE_COMMON_FLAGS \ $CMAKE_LINUX_FLAGS \ @@ -139,8 +147,10 @@ else $ARROW_CPP_DIR fi -# Build and install libraries -$TRAVIS_MAKE -j4 +# Build and install libraries. Configure ARROW_CPP_BUILD_TARGETS environment +# variable to only build certain targets. 
If you use this, you must also set +# the environment variable ARROW_TRAVIS_OPTIONAL_INSTALL=1 +$TRAVIS_MAKE -j4 $ARROW_CPP_BUILD_TARGETS $TRAVIS_MAKE install popd diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 25bec262d861c..6d96ebe2dfb0b 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -86,19 +86,23 @@ rm -rf * # XXX Can we simply reuse CMAKE_COMMON_FLAGS from travis_before_script_cpp.sh? CMAKE_COMMON_FLAGS="-DARROW_EXTRA_ERROR_CONTEXT=ON" +PYTHON_CPP_BUILD_TARGETS="arrow_python plasma" + if [ $ARROW_TRAVIS_COVERAGE == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GENERATE_COVERAGE=ON" fi if [ $ARROW_TRAVIS_PYTHON_GANDIVA == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA=ON -DARROW_GANDIVA_BUILD_TESTS=OFF" + PYTHON_CPP_BUILD_TARGETS="$PYTHON_CPP_BUILD_TARGETS gandiva" fi cmake -GNinja \ $CMAKE_COMMON_FLAGS \ - -DARROW_BUILD_TESTS=on \ + -DARROW_BUILD_TESTS=ON \ -DARROW_TEST_INCLUDE_LABELS=python \ - -DARROW_BUILD_UTILITIES=off \ + -DARROW_BUILD_UTILITIES=OFF \ + -DARROW_OPTIONAL_INSTALL=ON \ -DARROW_PLASMA=on \ -DARROW_TENSORFLOW=on \ -DARROW_PYTHON=on \ @@ -107,7 +111,7 @@ cmake -GNinja \ -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ $ARROW_CPP_DIR -ninja +ninja $PYTHON_CPP_BUILD_TARGETS ninja install popd diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a83b9dd6d9409..54daaf96e8eb6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -130,26 +130,62 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Linkage of Arrow libraries with unit tests executables. \ static|shared (default shared)") - set(ARROW_TEST_INCLUDE_LABELS "" CACHE STRING - "Only build unit tests having the indicated label or labels. \ -Pass multiple labels by dividing with semicolons") - option(ARROW_NO_DEPRECATED_API "Exclude deprecated APIs from build" OFF) - option(ARROW_COMPUTE - "Build the Arrow Compute Modules" + option(ARROW_FUZZING + "Build Arrow Fuzzing executables" + OFF) + + # Disable this option to exercise non-SIMD fallbacks + option(ARROW_USE_SIMD + "Build with SIMD optimizations" ON) - option(ARROW_EXTRA_ERROR_CONTEXT - "Compile with extra error context (line numbers, code)" + option(ARROW_ALTIVEC + "Build Arrow with Altivec" + ON) + + option(ARROW_BUILD_UTILITIES + "Build Arrow commandline utilities" + ON) + + option(ARROW_RPATH_ORIGIN + "Build Arrow libraries with RATH set to \$ORIGIN" + OFF) + + option(ARROW_INSTALL_NAME_RPATH + "Build Arrow libraries with install_name set to @rpath" + ON) + + option(ARROW_GENERATE_COVERAGE + "Build with C++ code coverage enabled" + OFF) + + option(ARROW_VERBOSE_LINT + "If off, 'quiet' flags will be passed to linting tools" OFF) + #---------------------------------------------------------------------- + # Project components to enable / disable building + + option(ARROW_COMPUTE + "Build the Arrow Compute Modules" + ON) + option(ARROW_FLIGHT "Build the Arrow Flight RPC System (requires GRPC, Protocol Buffers)" OFF) + option(ARROW_GANDIVA + "Build the Gandiva libraries" + OFF) + + option(ARROW_PARQUET + "Build the Parquet libraries" + OFF) + option(ARROW_IPC "Build the Arrow IPC extensions" ON) @@ -174,58 +210,44 @@ Pass multiple labels by dividing with semicolons") "Build the Arrow HDFS bridge" ON) - option(ARROW_BOOST_USE_SHARED - "Rely on boost shared libraries where relevant" - ON) - - option(ARROW_BOOST_VENDORED - "Use vendored Boost instead of existing Boost" - OFF) - - option(ARROW_PROTOBUF_USE_SHARED - "Rely on Protocol Buffers shared libraries where 
relevant" - OFF) - option(ARROW_PYTHON "Build the Arrow CPython extensions" OFF) - option(ARROW_FUZZING - "Build Arrow Fuzzing executables" + option(ARROW_HIVESERVER2 + "Build the HiveServer2 client and Arrow adapter" OFF) - # Disable this option to exercise non-SIMD fallbacks - option(ARROW_USE_SIMD - "Build with SIMD optimizations" - ON) + option(ARROW_PLASMA + "Build the plasma object store along with Arrow" + OFF) - option(ARROW_ALTIVEC - "Build Arrow with Altivec" - ON) + option(ARROW_PLASMA_JAVA_CLIENT + "Build the plasma object store java client" + OFF) - option(ARROW_BUILD_UTILITIES - "Build Arrow commandline utilities" - ON) + #---------------------------------------------------------------------- + # Thirdparty toolchain options - option(ARROW_RPATH_ORIGIN - "Build Arrow libraries with RATH set to \$ORIGIN" + option(ARROW_VERBOSE_THIRDPARTY_BUILD + "If off, output from ExternalProjects will be logged to files rather than shown" OFF) - option(ARROW_INSTALL_NAME_RPATH - "Build Arrow libraries with install_name set to @rpath" + option(ARROW_BOOST_USE_SHARED + "Rely on boost shared libraries where relevant" ON) - option(ARROW_HIVESERVER2 - "Build the HiveServer2 client and Arrow adapter" + option(ARROW_BOOST_VENDORED + "Use vendored Boost instead of existing Boost" OFF) - option(ARROW_PLASMA - "Build the plasma object store along with Arrow" + option(ARROW_PROTOBUF_USE_SHARED + "Rely on Protocol Buffers shared libraries where relevant" OFF) - option(ARROW_PLASMA_JAVA_CLIENT - "Build the plasma object store java client" - OFF) + option(ARROW_USE_GLOG + "Build libraries with glog support for pluggable logging" + ON) option(ARROW_WITH_BROTLI "Build with Brotli compression" @@ -257,21 +279,8 @@ Pass multiple labels by dividing with semicolons") "Build with zstd compression" ${ARROW_WITH_ZSTD_DEFAULT}) - option(ARROW_GENERATE_COVERAGE - "Build with C++ code coverage enabled" - OFF) - - option(ARROW_VERBOSE_THIRDPARTY_BUILD - "If off, output from ExternalProjects will be logged to files rather than shown" - OFF) - - option(ARROW_VERBOSE_LINT - "If off, 'quiet' flags will be passed to linting tools" - OFF) - - option(ARROW_USE_GLOG - "Build libraries with glog support for pluggable logging" - ON) + #---------------------------------------------------------------------- + # Windows options if (MSVC) option(ARROW_USE_CLCACHE @@ -292,10 +301,8 @@ Pass multiple labels by dividing with semicolons") OFF) endif() - # Parquet-related build options - option(ARROW_PARQUET - "Build the Parquet libraries" - OFF) + #---------------------------------------------------------------------- + # Parquet build options option(PARQUET_MINIMAL_DEPENDENCY "Depend only on Thirdparty headers to build libparquet. \ @@ -310,9 +317,11 @@ Always OFF if building binaries" "Build the Parquet examples. Requires static libraries to be built." 
OFF) - # Gandiva related build options - option(ARROW_GANDIVA - "Build the Gandiva libraries" + #---------------------------------------------------------------------- + # Gandiva build options + + option(ARROW_GANDIVA_JAVA + "Build the Gandiva JNI wrappers" OFF) # ARROW-3860: Temporary workaround @@ -320,16 +329,30 @@ Always OFF if building binaries" "Include -static-libstdc++ -static-libgcc when linking with Gandiva static libraries" OFF) - option(ARROW_GANDIVA_JAVA - "Build the Gandiva JNI wrappers" - ON) + #---------------------------------------------------------------------- + # Advanced developer options - option(ARROW_GANDIVA_BUILD_TESTS - "Build the Gandiva googletest unit tests" + set(ARROW_TEST_INCLUDE_LABELS "" CACHE STRING + "Only build unit tests having the indicated label or labels. \ +Pass multiple labels by dividing with semicolons") + + option(ARROW_EXTRA_ERROR_CONTEXT + "Compile with extra error context (line numbers, code)" OFF) + option(ARROW_OPTIONAL_INSTALL + "If enabled install ONLY targets that have already been built. Please be \ +advised that if this is enabled 'install' will fail silently on components \ +that have not been built" + OFF) endif() +if (ARROW_OPTIONAL_INSTALL) + # Don't make the "install" target depend on the "all" target + set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY true) + + set(INSTALL_IS_OPTIONAL OPTIONAL) +endif() ############################################################ # "make lint" target diff --git a/cpp/README.md b/cpp/README.md index d1d76c17875d7..1f12117e8d01e 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -101,6 +101,19 @@ unit tests, and benchmarks (if enabled): * `make gandiva` for Gandiva (LLVM expression compiler) libraries * `make plasma` for Plasma libraries, server +If you wish to only build and install one or more project subcomponents, we +have provided the CMake option `ARROW_OPTIONAL_INSTALL` to only install targets +that have been built. For example, if you only wish to build the Parquet +libraries, its tests, and its dependencies, you can run: + +``` +cmake .. -DARROW_PARQUET=ON -DARROW_OPTIONAL_INSTALL=ON -DARROW_BUILD_TESTS=ON +make parquet +make install +``` + +If you omit an explicit target when invoking `make`, all targets will be built. + ## Parquet Development Notes To build the C++ libraries for Apache Parquet, add the flag @@ -269,7 +282,7 @@ The optional `gandiva` libraries and tests can be built by passing `-DARROW_GANDIVA=on`. ```shell -cmake .. -DARROW_GANDIVA=ON -DARROW_GANDIVA_BUILD_TESTS=ON +cmake .. 
-DARROW_GANDIVA=ON -DARROW_BUILD_TESTS=ON make ctest -L gandiva ``` diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index d5978e1d215ff..1abe97eecc59f 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -226,6 +226,7 @@ function(ADD_ARROW_LIB LIB_NAME) endif() install(TARGETS ${LIB_NAME}_shared + ${INSTALL_IS_OPTIONAL} EXPORT ${PROJECT_NAME}-targets RUNTIME DESTINATION ${RUNTIME_INSTALL_DIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} @@ -272,6 +273,7 @@ function(ADD_ARROW_LIB LIB_NAME) LINK_PUBLIC ${ARG_STATIC_LINK_LIBS}) install(TARGETS ${LIB_NAME}_static + ${INSTALL_IS_OPTIONAL} EXPORT ${PROJECT_NAME}-targets RUNTIME DESTINATION ${RUNTIME_INSTALL_DIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} @@ -401,7 +403,7 @@ function(ADD_TEST_CASE REL_TEST_NAME) if (NOT "${ARROW_TEST_INCLUDE_LABELS}" STREQUAL "") set(_SKIP_TEST TRUE) - foreach (_INCLUDED_LABEL ${ARG_LABELS}) + foreach (_INCLUDED_LABEL ${ARROW_TEST_INCLUDE_LABELS}) if ("${ARG_LABELS}" MATCHES "${_INCLUDED_LABEL}") set(_SKIP_TEST FALSE) endif() @@ -569,3 +571,19 @@ function(ADD_ARROW_FUZZING REL_FUZZING_NAME) PROPERTIES LINK_FLAGS "-fsanitize=fuzzer") endfunction() + +################################################### + +function(ARROW_INSTALL_ALL_HEADERS PATH) + set(options) + set(one_value_args) + set(multi_value_args PATTERN) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + if (NOT ARG_PATTERN) + set(ARG_PATTERN "*.h") + endif() + file(GLOB CURRENT_DIRECTORY_HEADERS ${ARG_PATTERN}) + install(FILES + ${CURRENT_DIRECTORY_HEADERS} + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${PATH}") +endfunction() diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 2d043a9a27627..e12d2d2ee2958 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -213,8 +213,7 @@ if (ARROW_BUILD_STATIC AND WIN32) target_compile_definitions(arrow_static PUBLIC ARROW_STATIC) endif() -if (ARROW_BUILD_TESTS OR ARROW_GANDIVA_BUILD_TESTS - OR ARROW_BUILD_BENCHMARKS) +if (ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) # that depend on gtest ADD_ARROW_LIB(arrow_testing SOURCES test-util.cc @@ -244,28 +243,7 @@ foreach(LIB_TARGET ${ARROW_LIBRARIES}) endforeach() # Headers: top level -install(FILES - allocator.h - api.h - array.h - buffer.h - builder.h - compare.h - memory_pool.h - pretty_print.h - record_batch.h - status.h - stl.h - table.h - table_builder.h - tensor.h - type.h - type_fwd.h - type_traits.h - test-util.h - visitor.h - visitor_inline.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow") +ARROW_INSTALL_ALL_HEADERS("arrow") # pkg-config support configure_file(arrow.pc.in diff --git a/cpp/src/arrow/adapters/tensorflow/CMakeLists.txt b/cpp/src/arrow/adapters/tensorflow/CMakeLists.txt index db4264b59ab63..5bb5b725910e3 100644 --- a/cpp/src/arrow/adapters/tensorflow/CMakeLists.txt +++ b/cpp/src/arrow/adapters/tensorflow/CMakeLists.txt @@ -15,7 +15,4 @@ # specific language governing permissions and limitations # under the License. 
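The ARROW_INSTALL_ALL_HEADERS helper defined above globs the headers of the current source directory, replacing the hand-maintained install(FILES ...) lists that the hunks below delete one by one. A minimal sketch of how a component CMakeLists.txt calls it (the destination path and custom pattern are illustrative, not from this patch):

```cmake
# Installs every *.h in the current source directory under
# ${CMAKE_INSTALL_INCLUDEDIR}/arrow/example; PATTERN defaults to "*.h".
ARROW_INSTALL_ALL_HEADERS("arrow/example")

# An explicit glob can be passed instead, e.g. to install only
# hypothetical *api.h public headers:
ARROW_INSTALL_ALL_HEADERS("arrow/example" PATTERN "*api.h")
```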
-# Headers: top level -install(FILES - convert.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/adapters/tensorflow") +ARROW_INSTALL_ALL_HEADERS("arrow/adapters/tensorflow") diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index d4369ed27b7c4..242937005cf9c 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -15,12 +15,7 @@ # specific language governing permissions and limitations # under the License. -# Headers: top level -install(FILES - api.h - context.h - kernel.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/compute") +ARROW_INSTALL_ALL_HEADERS("arrow/compute") # pkg-config support configure_file(arrow-compute.pc.in diff --git a/cpp/src/arrow/csv/CMakeLists.txt b/cpp/src/arrow/csv/CMakeLists.txt index db23d6feff111..2a72dceadad16 100644 --- a/cpp/src/arrow/csv/CMakeLists.txt +++ b/cpp/src/arrow/csv/CMakeLists.txt @@ -29,9 +29,4 @@ ADD_ARROW_BENCHMARK(converter-benchmark ADD_ARROW_BENCHMARK(parser-benchmark PREFIX "arrow-csv") -# Headers: top level -file(GLOB_RECURSE ARROW_CSV_HEADERS "*.h") - -install(FILES - ${ARROW_CSV_HEADERS} - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/csv") +ARROW_INSTALL_ALL_HEADERS("arrow/csv") diff --git a/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt b/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt index eb4446f05d971..9fd7f924d3a69 100644 --- a/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt +++ b/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt @@ -18,15 +18,7 @@ add_custom_target(arrow_hiveserver2) # Headers: top level -install(FILES - api.h - columnar-row-set.h - operation.h - service.h - session.h - types.h - util.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/dbi/hiveserver2") +ARROW_INSTALL_ALL_HEADERS("arrow/dbi/hiveserver2") set(ARROW_HIVESERVER2_SRCS columnar-row-set.cc @@ -115,7 +107,9 @@ if (ARROW_BUILD_TESTS) STATIC_LINK_LIBS "${ARROW_HIVESERVER2_TEST_LINK_LIBS}" LABELS "arrow_hiveserver2" ) - set_property(TARGET arrow-hiveserver2-test - APPEND_STRING PROPERTY COMPILE_FLAGS - " -Wno-shadow-field") + if (TARGET arrow-hiveserver2-test) + set_property(TARGET arrow-hiveserver2-test + APPEND_STRING PROPERTY COMPILE_FLAGS + " -Wno-shadow-field") + endif() endif(ARROW_BUILD_TESTS) diff --git a/cpp/src/arrow/gpu/CMakeLists.txt b/cpp/src/arrow/gpu/CMakeLists.txt index 60407acb0a1ec..c37779aefa9aa 100644 --- a/cpp/src/arrow/gpu/CMakeLists.txt +++ b/cpp/src/arrow/gpu/CMakeLists.txt @@ -63,12 +63,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cuda_version.h" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/gpu") -install(FILES - cuda_api.h - cuda_arrow_ipc.h - cuda_context.h - cuda_memory.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/gpu") +ARROW_INSTALL_ALL_HEADERS("arrow/gpu") # pkg-config support configure_file(arrow-cuda.pc.in diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 44c56f033269d..c44f7b9fe1bfe 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -88,16 +88,7 @@ add_custom_command( add_custom_target(metadata_fbs DEPENDS ${FBS_OUTPUT_FILES}) # Headers: top level -install(FILES - api.h - dictionary.h - feather.h - json.h - json-simple.h - message.h - reader.h - writer.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/ipc") +ARROW_INSTALL_ALL_HEADERS("arrow/ipc") if (ARROW_BUILD_STATIC) set(ARROW_UTIL_LIB arrow_static) diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index 7f4603ae5dfaf..4913083537340 100644 --- 
a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -91,29 +91,7 @@ if ("${COMPILER_FAMILY}" STREQUAL "clang") COMPILE_FLAGS -Wno-parentheses-equality) endif() -install(FILES - api.h - arrow_to_pandas.h - benchmark.h - common.h - config.h - decimal.h - deserialize.h - helpers.h - inference.h - init.h - io.h - iterators.h - numpy_convert.h - numpy_interop.h - numpy_to_arrow.h - python_to_arrow.h - platform.h - pyarrow.h - serialize.h - type_traits.h - visibility.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/python") +ARROW_INSTALL_ALL_HEADERS("arrow/python") # pkg-config support configure_file(arrow-python.pc.in diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 4f515b52e8e64..a09797183212f 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -20,45 +20,7 @@ ####################################### # Headers: top level -install(FILES - bit-stream-utils.h - bit-util.h - bpacking.h - checked_cast.h - compiler-util.h - compression.h - compression_brotli.h - compression_bz2.h - compression_lz4.h - compression_snappy.h - compression_zlib.h - compression_zstd.h - cpu-info.h - date.h - decimal.h - hash-util.h - hashing.h - io-util.h - key_value_metadata.h - lazy.h - logging.h - macros.h - memory.h - neon-util.h - parallel.h - rle-encoding.h - sse-util.h - stl.h - stopwatch.h - string.h - string_view.h - thread-pool.h - type_traits.h - utf8.h - variant.h - visibility.h - windows_compatibility.h - DESTINATION include/arrow/util) +ARROW_INSTALL_ALL_HEADERS("arrow/util") ####################################### # arrow_test_main diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 5ef573875b660..9763f297b0b8b 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -92,31 +92,8 @@ add_dependencies(gandiva ${GANDIVA_LIBRARIES}) # install for gandiva include(GNUInstallDirs) -# install libgandiva -install( - TARGETS gandiva_shared gandiva_static - DESTINATION ${CMAKE_INSTALL_LIBDIR} -) - # install the header files. -install(FILES - arrow.h - condition.h - configuration.h - expression.h - expression_registry.h - filter.h - func_descriptor.h - function_signature.h - gandiva_aliases.h - literal_holder.h - logging.h - node.h - node_visitor.h - projector.h - selection_vector.h - tree_expr_builder.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/gandiva") +ARROW_INSTALL_ALL_HEADERS("gandiva") # pkg-config support configure_file(gandiva.pc.in @@ -141,6 +118,10 @@ function(ADD_GANDIVA_TEST REL_TEST_NAME) set(multi_value_args) cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + if (NO_TESTS) + return() + endif() + set(TEST_ARGUMENTS ENABLED PREFIX "gandiva" @@ -159,39 +140,35 @@ function(ADD_GANDIVA_TEST REL_TEST_NAME) STATIC_LINK_LIBS ${GANDIVA_SHARED_TEST_LINK_LIBS}) endif() - if(${REL_TEST_NAME} MATCHES "llvm" OR - ${REL_TEST_NAME} MATCHES "expression_registry") + set(TARGET_NAME gandiva-${REL_TEST_NAME}) + + if((TARGET ${TARGET_NAME}) AND + (${REL_TEST_NAME} MATCHES "llvm" OR + ${REL_TEST_NAME} MATCHES "expression_registry")) # If the unit test has llvm in its name, include llvm. 
- add_dependencies(gandiva-${REL_TEST_NAME} LLVM::LLVM_INTERFACE) - target_link_libraries(gandiva-${REL_TEST_NAME} PRIVATE LLVM::LLVM_INTERFACE) + add_dependencies(${TARGET_NAME} LLVM::LLVM_INTERFACE) + target_link_libraries(${TARGET_NAME} PRIVATE LLVM::LLVM_INTERFACE) endif() endfunction() -if (ARROW_GANDIVA_BUILD_TESTS) - ADD_GANDIVA_TEST(bitmap_accumulator_test) - ADD_GANDIVA_TEST(engine_llvm_test) - ADD_GANDIVA_TEST(function_signature_test) - ADD_GANDIVA_TEST(function_registry_test) - ADD_GANDIVA_TEST(llvm_types_test) - ADD_GANDIVA_TEST(llvm_generator_test) - ADD_GANDIVA_TEST(annotator_test) - ADD_GANDIVA_TEST(tree_expr_test) - ADD_GANDIVA_TEST(expr_decomposer_test) - ADD_GANDIVA_TEST(expression_registry_test) - ADD_GANDIVA_TEST(selection_vector_test) - ADD_GANDIVA_TEST(lru_cache_test) - ADD_GANDIVA_TEST(to_date_holder_test) - ADD_GANDIVA_TEST(simple_arena_test) -endif() +ADD_GANDIVA_TEST(bitmap_accumulator_test) +ADD_GANDIVA_TEST(engine_llvm_test) +ADD_GANDIVA_TEST(function_signature_test) +ADD_GANDIVA_TEST(function_registry_test) +ADD_GANDIVA_TEST(llvm_types_test) +ADD_GANDIVA_TEST(llvm_generator_test) +ADD_GANDIVA_TEST(annotator_test) +ADD_GANDIVA_TEST(tree_expr_test) +ADD_GANDIVA_TEST(expr_decomposer_test) +ADD_GANDIVA_TEST(expression_registry_test) +ADD_GANDIVA_TEST(selection_vector_test) +ADD_GANDIVA_TEST(lru_cache_test) +ADD_GANDIVA_TEST(to_date_holder_test) +ADD_GANDIVA_TEST(simple_arena_test) if (ARROW_GANDIVA_JAVA) add_subdirectory(jni) endif() -add_subdirectory(precompiled) - -if (ARROW_GANDIVA_BUILD_TESTS) - include(CTest) - enable_testing() - add_subdirectory(tests) -endif() +add_subdirectory(precompiled) +add_subdirectory(tests) diff --git a/cpp/src/gandiva/precompiled/CMakeLists.txt b/cpp/src/gandiva/precompiled/CMakeLists.txt index 886fdced887ff..0792fd6421d65 100644 --- a/cpp/src/gandiva/precompiled/CMakeLists.txt +++ b/cpp/src/gandiva/precompiled/CMakeLists.txt @@ -69,7 +69,7 @@ function(add_precompiled_unit_test REL_TEST_NAME) endfunction(add_precompiled_unit_test REL_TEST_NAME) # testing -if (ARROW_GANDIVA_BUILD_TESTS) +if (ARROW_BUILD_TESTS) add_precompiled_unit_test(bitmap_test.cc bitmap.cc) add_precompiled_unit_test(epoch_time_point_test.cc) add_precompiled_unit_test(time_test.cc time.cc timestamp_arithmetic.cc ../context_helper.cc) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 246f69dcc09fa..6b7846b709d0b 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -238,26 +238,7 @@ add_subdirectory(api) add_subdirectory(arrow) add_subdirectory(util) -# Headers: top level -install(FILES - bloom_filter.h - column_reader.h - column_page.h - column_scanner.h - column_writer.h - encoding.h - exception.h - file_reader.h - file_writer.h - hasher.h - metadata.h - murmur3.h - printer.h - properties.h - schema.h - statistics.h - types.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet") +ARROW_INSTALL_ALL_HEADERS("parquet") configure_file(parquet_version.h.in "${CMAKE_CURRENT_BINARY_DIR}/parquet_version.h" diff --git a/cpp/src/parquet/api/CMakeLists.txt b/cpp/src/parquet/api/CMakeLists.txt index 79fc716952a16..48fddb9d61ddf 100644 --- a/cpp/src/parquet/api/CMakeLists.txt +++ b/cpp/src/parquet/api/CMakeLists.txt @@ -16,9 +16,4 @@ # under the License. 
# Headers: public api -install(FILES - io.h - reader.h - writer.h - schema.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet/api") +ARROW_INSTALL_ALL_HEADERS("parquet/api") diff --git a/cpp/src/parquet/arrow/CMakeLists.txt b/cpp/src/parquet/arrow/CMakeLists.txt index 429dadcd37e5e..9372c3110a3af 100644 --- a/cpp/src/parquet/arrow/CMakeLists.txt +++ b/cpp/src/parquet/arrow/CMakeLists.txt @@ -22,9 +22,4 @@ ADD_ARROW_BENCHMARK(reader-writer-benchmark PREFIX "parquet-arrow" EXTRA_LINK_LIBS ${PARQUET_BENCHMARK_LINK_LIBRARIES}) -# Headers: top level -install(FILES - reader.h - schema.h - writer.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet/arrow") +ARROW_INSTALL_ALL_HEADERS("parquet/arrow") diff --git a/cpp/src/parquet/util/CMakeLists.txt b/cpp/src/parquet/util/CMakeLists.txt index 72d4ca28f9b83..b5718b1601ee0 100644 --- a/cpp/src/parquet/util/CMakeLists.txt +++ b/cpp/src/parquet/util/CMakeLists.txt @@ -16,12 +16,7 @@ # under the License. # Headers: util -install(FILES - comparison.h - macros.h - memory.h - visibility.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet/util") +ARROW_INSTALL_ALL_HEADERS("parquet/util") ADD_PARQUET_TEST(comparison-test) ADD_PARQUET_TEST(memory-test) diff --git a/cpp/src/plasma/CMakeLists.txt b/cpp/src/plasma/CMakeLists.txt index 4ea4b76066cf7..317835bb7ac44 100644 --- a/cpp/src/plasma/CMakeLists.txt +++ b/cpp/src/plasma/CMakeLists.txt @@ -127,6 +127,7 @@ endif() # be copied around and used in different locations. add_executable(plasma_store_server store.cc) target_link_libraries(plasma_store_server plasma_static ${PLASMA_STATIC_LINK_LIBS}) +add_dependencies(plasma plasma_store_server) if (ARROW_RPATH_ORIGIN) if (APPLE) @@ -138,7 +139,6 @@ if (ARROW_RPATH_ORIGIN) INSTALL_RPATH ${_lib_install_rpath}) endif() -# Headers: top level install(FILES common.h compat.h @@ -149,7 +149,9 @@ install(FILES # Plasma store set_target_properties(plasma_store_server PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE) -install(TARGETS plasma_store_server DESTINATION ${CMAKE_INSTALL_BINDIR}) +install(TARGETS plasma_store_server + ${INSTALL_IS_OPTIONAL} + DESTINATION ${CMAKE_INSTALL_BINDIR}) # pkg-config support configure_file(plasma.pc.in diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt index 47aea28ff6828..bbbec29c13009 100644 --- a/cpp/tools/parquet/CMakeLists.txt +++ b/cpp/tools/parquet/CMakeLists.txt @@ -26,7 +26,9 @@ if (PARQUET_BUILD_EXECUTABLES) target_link_libraries(${TOOL} parquet_static) # Avoid unsetting RPATH when installing set_target_properties(${TOOL} PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE) - install(TARGETS ${TOOL} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + install(TARGETS ${TOOL} + ${INSTALL_IS_OPTIONAL} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endforeach(TOOL) add_dependencies(parquet ${PARQUET_TOOLS}) diff --git a/dev/tasks/gandiva-jars/build-cpp.sh b/dev/tasks/gandiva-jars/build-cpp.sh index 21289dee5a6b1..ae13f9c0193ce 100755 --- a/dev/tasks/gandiva-jars/build-cpp.sh +++ b/dev/tasks/gandiva-jars/build-cpp.sh @@ -27,6 +27,7 @@ pushd arrow/cpp pushd build cmake -DCMAKE_BUILD_TYPE=Release \ -DARROW_GANDIVA=ON \ + -DARROW_GANDIVA_JAVA=ON \ -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON \ -DARROW_BUILD_UTILITIES=OFF \ -DARROW_BOOST_USE_SHARED=OFF \ From 73f94c93d7eee25a43415dfa7a806b887942abd1 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Dec 2018 14:49:12 -0600 Subject: [PATCH 40/45] ARROW-3762: [C++/Python] Support reading Parquet BYTE_ARRAY columns containing over 2GB of data This patch ended up being a bit more 
of a bloodbath than I planned: please accept my apologies. Associated changes in this patch: * Split up builder.h/builder.cc into a new arrow/array directory. Public arrow/builder.h API preserved. I think this code is going to keep growing more specialized components, so we should get out ahead of it by having a subdirectory to contain files related to implementation details * Implement ChunkedBinaryBuilder, ChunkedStringBuilder classes, add tests and benchmarks * Deprecate parquet::arrow methods returning Array * Allow implicit construction of Datum from its variant types (makes for a lot nicer syntax) As far as what code to review, focus efforts on * src/parquet/arrow * src/arrow/array/builder_binary.h/cc, array-binary-test.cc, builder-benchmark * src/arrow/compute changes * Python changes I'm going to tackle ARROW-2970, which should not be complicated after this patch; I will submit that as a PR after this is reviewed and merged. Author: Wes McKinney Closes #3171 from wesm/ARROW-3762 and squashes the following commits: 822451280 Fix int conversion warning on Windows 695ffc9df Remove unimplemented and unused ChunkedBinaryBuilder ctor 5a525115c Use strnlen to compute string length. Inline BinaryBuilder::AppendNextOffset b90eb4b71 Restore sstream include to pretty_print.cc 3669201be Fix deprecated API use 5fdbbb261 Rename columnar/ directory to array/ 8ffaec1ef Address preliminary code comments. Check in missing files 81e787c69 Fix up Python bindings, unit test 2efae064c Finish scaffolding. Get fully compiling again and original parquet-arrow test suite passing 3d075e4aa Additional refactoring to make things chunked. Allow implicit construction of arrow::compute::Datum 922811278 More refactoring 716322377 Split up builder.h, builder.cc into smaller headers, compilation units. add failing test case for ARROW-3762.
Add ChunkedBinaryBuilder, make BinaryBuilder Append methods inline --- cpp/cmake_modules/SetupCxxFlags.cmake | 3 + cpp/examples/parquet/CMakeLists.txt | 2 +- .../parquet/parquet-arrow/CMakeLists.txt | 2 +- .../parquet-arrow/{src => }/reader-writer.cc | 4 +- cpp/src/arrow/CMakeLists.txt | 15 +- cpp/src/arrow/allocator-test.cc | 1 + cpp/src/arrow/array-binary-test.cc | 114 +- cpp/src/arrow/array-dict-test.cc | 8 +- cpp/src/arrow/array-list-test.cc | 4 +- cpp/src/arrow/array-struct-test.cc | 4 +- cpp/src/arrow/array-test.cc | 2 - cpp/src/arrow/array.cc | 1 + cpp/src/arrow/array.h | 1 - cpp/src/arrow/array/CMakeLists.txt | 27 + cpp/src/arrow/array/README.md | 20 + .../builder_adaptive.cc} | 4 +- cpp/src/arrow/array/builder_adaptive.h | 174 +++ cpp/src/arrow/array/builder_base.cc | 176 +++ cpp/src/arrow/array/builder_base.h | 227 ++++ .../builder_binary.cc} | 78 +- cpp/src/arrow/array/builder_binary.h | 304 +++++ cpp/src/arrow/array/builder_decimal.cc | 64 + cpp/src/arrow/array/builder_decimal.h | 45 + .../builder_dict.cc} | 4 +- cpp/src/arrow/array/builder_dict.h | 167 +++ cpp/src/arrow/array/builder_nested.cc | 156 +++ cpp/src/arrow/array/builder_nested.h | 121 ++ cpp/src/arrow/array/builder_primitive.cc | 272 ++++ cpp/src/arrow/array/builder_primitive.h | 401 ++++++ cpp/src/arrow/builder-benchmark.cc | 30 +- cpp/src/arrow/builder.cc | 503 +------ cpp/src/arrow/builder.h | 1177 +---------------- cpp/src/arrow/compute/compute-test.cc | 61 +- cpp/src/arrow/compute/kernel.h | 35 +- cpp/src/arrow/csv/column-builder.h | 21 +- cpp/src/arrow/csv/converter.cc | 1 + cpp/src/arrow/csv/parser.h | 1 + cpp/src/arrow/csv/reader.cc | 2 + cpp/src/arrow/io/buffered.cc | 2 +- cpp/src/arrow/io/buffered.h | 1 + cpp/src/arrow/ipc/feather-test.cc | 1 + cpp/src/arrow/ipc/json-simple-test.cc | 1 + cpp/src/arrow/memory_pool-test.h | 1 + cpp/src/arrow/memory_pool.cc | 12 +- cpp/src/arrow/pretty_print-test.cc | 4 +- cpp/src/arrow/pretty_print.cc | 2 +- cpp/src/arrow/pretty_print.h | 5 +- cpp/src/arrow/python/numpy_to_arrow.cc | 27 +- cpp/src/arrow/python/python-test.cc | 1 + cpp/src/arrow/record_batch.h | 1 + cpp/src/arrow/table.h | 5 + cpp/src/arrow/tensor.cc | 1 + cpp/src/arrow/test-util.cc | 13 +- cpp/src/arrow/test-util.h | 18 +- cpp/src/arrow/util/compression_lz4.cc | 1 + cpp/src/arrow/util/int-util-test.cc | 2 - cpp/src/arrow/util/string_view.h | 2 +- cpp/src/parquet/arrow/CMakeLists.txt | 5 +- .../parquet/arrow/arrow-reader-writer-test.cc | 15 +- cpp/src/parquet/arrow/reader.cc | 567 ++++---- cpp/src/parquet/arrow/reader.h | 38 +- cpp/src/parquet/arrow/record_reader.cc | 103 +- cpp/src/parquet/arrow/record_reader.h | 7 +- python/pyarrow/_parquet.pxd | 6 +- python/pyarrow/_parquet.pyx | 23 +- python/pyarrow/lib.pxd | 2 + python/pyarrow/tests/test_parquet.py | 27 + 67 files changed, 2985 insertions(+), 2140 deletions(-) rename cpp/examples/parquet/parquet-arrow/{src => }/reader-writer.cc (98%) create mode 100644 cpp/src/arrow/array/CMakeLists.txt create mode 100644 cpp/src/arrow/array/README.md rename cpp/src/arrow/{builder-adaptive.cc => array/builder_adaptive.cc} (99%) create mode 100644 cpp/src/arrow/array/builder_adaptive.h create mode 100644 cpp/src/arrow/array/builder_base.cc create mode 100644 cpp/src/arrow/array/builder_base.h rename cpp/src/arrow/{builder-binary.cc => array/builder_binary.cc} (86%) create mode 100644 cpp/src/arrow/array/builder_binary.h create mode 100644 cpp/src/arrow/array/builder_decimal.cc create mode 100644 cpp/src/arrow/array/builder_decimal.h rename cpp/src/arrow/{builder-dict.cc 
=> array/builder_dict.cc} (99%) create mode 100644 cpp/src/arrow/array/builder_dict.h create mode 100644 cpp/src/arrow/array/builder_nested.cc create mode 100644 cpp/src/arrow/array/builder_nested.h create mode 100644 cpp/src/arrow/array/builder_primitive.cc create mode 100644 cpp/src/arrow/array/builder_primitive.h diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 893ec360d3e55..61fd14ca2cf46 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -25,6 +25,9 @@ CHECK_CXX_COMPILER_FLAG("-maltivec" CXX_SUPPORTS_ALTIVEC) # Arm64 compiler flags CHECK_CXX_COMPILER_FLAG("-march=armv8-a+crc" CXX_SUPPORTS_ARMCRC) +# Support C11 +set(CMAKE_C_STANDARD 11) + # This ensures that things like gnu++11 get passed correctly set(CMAKE_CXX_STANDARD 11) diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/CMakeLists.txt index 98c5cd9402bb7..db172a2534f37 100644 --- a/cpp/examples/parquet/CMakeLists.txt +++ b/cpp/examples/parquet/CMakeLists.txt @@ -22,7 +22,7 @@ target_include_directories(parquet-low-level-example2 PRIVATE low-level-api/) target_link_libraries(parquet-low-level-example parquet_static) target_link_libraries(parquet-low-level-example2 parquet_static) -add_executable(parquet-arrow-example parquet-arrow/src/reader-writer.cc) +add_executable(parquet-arrow-example parquet-arrow/reader-writer.cc) target_link_libraries(parquet-arrow-example parquet_shared) add_dependencies(parquet diff --git a/cpp/examples/parquet/parquet-arrow/CMakeLists.txt b/cpp/examples/parquet/parquet-arrow/CMakeLists.txt index d9e01acd3eea3..915930ec228e1 100644 --- a/cpp/examples/parquet/parquet-arrow/CMakeLists.txt +++ b/cpp/examples/parquet/parquet-arrow/CMakeLists.txt @@ -38,5 +38,5 @@ find_package(Parquet) include_directories(SYSTEM ${ARROW_INCLUDE_DIR} ${PARQUET_INCLUDE_DIR}) -add_executable(parquet-arrow-example src/reader-writer.cc) +add_executable(parquet-arrow-example reader-writer.cc) target_link_libraries(parquet-arrow-example ${PARQUET_SHARED_LIB} ${ARROW_SHARED_LIB}) diff --git a/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc b/cpp/examples/parquet/parquet-arrow/reader-writer.cc similarity index 98% rename from cpp/examples/parquet/parquet-arrow/src/reader-writer.cc rename to cpp/examples/parquet/parquet-arrow/reader-writer.cc index 8d474486e7413..a5f928b6d4f69 100644 --- a/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc +++ b/cpp/examples/parquet/parquet-arrow/reader-writer.cc @@ -100,7 +100,7 @@ void read_single_column() { std::unique_ptr<parquet::arrow::FileReader> reader; PARQUET_THROW_NOT_OK( parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); - std::shared_ptr<arrow::Array> array; + std::shared_ptr<arrow::ChunkedArray> array; PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array)); PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); std::cout << std::endl; @@ -119,7 +119,7 @@ void read_single_column_chunk() { std::unique_ptr<parquet::arrow::FileReader> reader; PARQUET_THROW_NOT_OK( parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); - std::shared_ptr<arrow::Array> array; + std::shared_ptr<arrow::ChunkedArray> array; PARQUET_THROW_NOT_OK(reader->RowGroup(0)->Column(0)->Read(&array)); PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); std::cout << std::endl; diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index e12d2d2ee2958..b13c9b66ac48d 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -53,11 +53,17 @@ endfunction() set(ARROW_SRCS array.cc - buffer.cc + builder.cc - builder-adaptive.cc -
builder-binary.cc - builder-dict.cc + array/builder_adaptive.cc + array/builder_base.cc + array/builder_binary.cc + array/builder_decimal.cc + array/builder_dict.cc + array/builder_nested.cc + array/builder_primitive.cc + + buffer.cc compare.cc memory_pool.cc pretty_print.cc @@ -275,6 +281,7 @@ ADD_ARROW_TEST(tensor-test) ADD_ARROW_BENCHMARK(builder-benchmark) ADD_ARROW_BENCHMARK(column-benchmark) +add_subdirectory(array) add_subdirectory(csv) add_subdirectory(io) add_subdirectory(util) diff --git a/cpp/src/arrow/allocator-test.cc b/cpp/src/arrow/allocator-test.cc index cdffbd7e8494f..1a94467281dbc 100644 --- a/cpp/src/arrow/allocator-test.cc +++ b/cpp/src/arrow/allocator-test.cc @@ -17,6 +17,7 @@ #include #include +#include #include #include diff --git a/cpp/src/arrow/array-binary-test.cc b/cpp/src/arrow/array-binary-test.cc index 4376695c68cba..6f938c82bfd0a 100644 --- a/cpp/src/arrow/array-binary-test.cc +++ b/cpp/src/arrow/array-binary-test.cc @@ -15,10 +15,8 @@ // specific language governing permissions and limitations // under the License. -#include #include #include -#include #include #include #include @@ -28,10 +26,14 @@ #include "arrow/array.h" #include "arrow/buffer.h" #include "arrow/builder.h" +#include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/checked_cast.h" namespace arrow { @@ -676,4 +678,112 @@ TEST_F(TestStringArray, TestSliceEquality) { CheckSliceEquality(); } TEST_F(TestBinaryArray, LengthZeroCtor) { BinaryArray array(0, nullptr, nullptr); } +// ---------------------------------------------------------------------- +// ChunkedBinaryBuilder tests + +class TestChunkedBinaryBuilder : public ::testing::Test { + public: + void SetUp() {} + + void Init(int32_t chunksize) { + builder_.reset(new internal::ChunkedBinaryBuilder(chunksize)); + } + + protected: + std::unique_ptr<internal::ChunkedBinaryBuilder> builder_; +}; + +TEST_F(TestChunkedBinaryBuilder, BasicOperation) { + const int32_t chunksize = 1000; + Init(chunksize); + + const int elem_size = 10; + uint8_t buf[elem_size]; + + BinaryBuilder unchunked_builder; + + const int iterations = 1000; + for (int i = 0; i < iterations; ++i) { + random_bytes(elem_size, i, buf); + + ASSERT_OK(unchunked_builder.Append(buf, elem_size)); + ASSERT_OK(builder_->Append(buf, elem_size)); + } + + std::shared_ptr<Array> unchunked; + ASSERT_OK(unchunked_builder.Finish(&unchunked)); + + ArrayVector chunks; + ASSERT_OK(builder_->Finish(&chunks)); + + // This assumes that everything is evenly divisible + ArrayVector expected_chunks; + const int elems_per_chunk = chunksize / elem_size; + for (int i = 0; i < iterations / elems_per_chunk; ++i) { + expected_chunks.emplace_back(unchunked->Slice(i * elems_per_chunk, elems_per_chunk)); + } + + ASSERT_EQ(expected_chunks.size(), chunks.size()); + for (size_t i = 0; i < chunks.size(); ++i) { + AssertArraysEqual(*expected_chunks[i], *chunks[i]); + } +} + +TEST_F(TestChunkedBinaryBuilder, NoData) { + Init(1000); + + ArrayVector chunks; + ASSERT_OK(builder_->Finish(&chunks)); + + ASSERT_EQ(1, chunks.size()); + ASSERT_EQ(0, chunks[0]->length()); +} + +TEST_F(TestChunkedBinaryBuilder, LargeElements) { + Init(100); + + const int bufsize = 101; + uint8_t buf[bufsize]; + + const int iterations = 100; + for (int i = 0; i < iterations; ++i) { + random_bytes(bufsize, i, buf); + ASSERT_OK(builder_->Append(buf, bufsize)); + } + + ArrayVector chunks;
ASSERT_OK(builder_->Finish(&chunks)); + ASSERT_EQ(iterations, static_cast<int>(chunks.size())); + + int64_t total_data_size = 0; + for (auto chunk : chunks) { + ASSERT_EQ(1, chunk->length()); + total_data_size += + static_cast<int64_t>(static_cast<const BinaryArray&>(*chunk).GetView(0).size()); + } + ASSERT_EQ(iterations * bufsize, total_data_size); +} + +TEST(TestChunkedStringBuilder, BasicOperation) { + const int chunksize = 100; + internal::ChunkedStringBuilder builder(chunksize); + + std::string value = "0123456789"; + + const int iterations = 100; + for (int i = 0; i < iterations; ++i) { + ASSERT_OK(builder.Append(value)); + } + + ArrayVector chunks; + ASSERT_OK(builder.Finish(&chunks)); + + ASSERT_EQ(10, chunks.size()); + + // Type is correct + for (auto chunk : chunks) { + ASSERT_TRUE(chunk->type()->Equals(*::arrow::utf8())); + } +} + } // namespace arrow diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc index cc471a3e54066..87cb2290a7bf9 100644 --- a/cpp/src/arrow/array-dict-test.cc +++ b/cpp/src/arrow/array-dict-test.cc @@ -15,23 +15,23 @@ // specific language governing permissions and limitations // under the License. -#include +#include #include -#include -#include #include +#include #include #include #include #include "arrow/array.h" -#include "arrow/buffer.h" #include "arrow/builder.h" +#include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/util/decimal.h" namespace arrow { diff --git a/cpp/src/arrow/array-list-test.cc b/cpp/src/arrow/array-list-test.cc index 207acd4cf65d7..c49c5e3097058 100644 --- a/cpp/src/arrow/array-list-test.cc +++ b/cpp/src/arrow/array-list-test.cc @@ -15,10 +15,8 @@ // specific language governing permissions and limitations // under the License. -#include #include #include -#include #include #include #include @@ -32,6 +30,8 @@ #include "arrow/array.h" #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/checked_cast.h" namespace arrow { diff --git a/cpp/src/arrow/array-struct-test.cc b/cpp/src/arrow/array-struct-test.cc index dc8bafd4c0071..68c35f57116a8 100644 --- a/cpp/src/arrow/array-struct-test.cc +++ b/cpp/src/arrow/array-struct-test.cc @@ -15,10 +15,8 @@ // specific language governing permissions and limitations // under the License.
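Beyond the unit tests above, the intended use of the chunked builders is straightforward: append values as with an ordinary BinaryBuilder and collect a vector of arrays at the end. A minimal sketch based on those tests (the chunk size and payload are illustrative):

```cpp
#include <cstdint>

#include "arrow/array.h"
#include "arrow/array/builder_binary.h"
#include "arrow/status.h"

// Appends values through the chunked builder; once roughly max_chunk_size
// bytes have accumulated, the current chunk is finalized and a new one is
// started, so no single BinaryArray approaches the 2GB offset limit.
arrow::Status BuildChunked(arrow::ArrayVector* chunks) {
  arrow::internal::ChunkedBinaryBuilder builder(/*max_chunk_size=*/1 << 20);
  const uint8_t value[] = "0123456789";
  for (int i = 0; i < 1000000; ++i) {
    ARROW_RETURN_NOT_OK(builder.Append(value, sizeof(value) - 1));
  }
  return builder.Finish(chunks);  // one Array per accumulated chunk
}
```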
-#include #include #include -#include #include #include #include @@ -26,12 +24,12 @@ #include #include "arrow/array.h" -#include "arrow/buffer.h" #include "arrow/builder.h" #include "arrow/status.h" #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/util/checked_cast.h" namespace arrow { diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index de0885e6f5f3a..bdb7eda118d51 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -40,7 +39,6 @@ #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" -#include "arrow/type_traits.h" #include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 05d66d5cffdb2..ff94aa2a1e6fe 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -18,6 +18,7 @@ #include "arrow/array.h" #include +#include #include #include #include diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 37fa5aedfc2d0..52c5207d8dddc 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -18,7 +18,6 @@ #ifndef ARROW_ARRAY_H #define ARROW_ARRAY_H -#include #include #include #include diff --git a/cpp/src/arrow/array/CMakeLists.txt b/cpp/src/arrow/array/CMakeLists.txt new file mode 100644 index 0000000000000..a789c88dd9d31 --- /dev/null +++ b/cpp/src/arrow/array/CMakeLists.txt @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Headers: top level +install(FILES + builder_adaptive.h + builder_base.h + builder_binary.h + builder_decimal.h + builder_dict.h + builder_nested.h + builder_primitive.h + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/array") diff --git a/cpp/src/arrow/array/README.md b/cpp/src/arrow/array/README.md new file mode 100644 index 0000000000000..09580193aad28 --- /dev/null +++ b/cpp/src/arrow/array/README.md @@ -0,0 +1,20 @@ + + +## Implementation details related to columnar (array) data structures diff --git a/cpp/src/arrow/builder-adaptive.cc b/cpp/src/arrow/array/builder_adaptive.cc similarity index 99% rename from cpp/src/arrow/builder-adaptive.cc rename to cpp/src/arrow/array/builder_adaptive.cc index a715f469c7aa1..599e9e1c38d76 100644 --- a/cpp/src/arrow/builder-adaptive.cc +++ b/cpp/src/arrow/array/builder_adaptive.cc @@ -15,13 +15,15 @@ // specific language governing permissions and limitations // under the License.
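As the commit message notes, the split into arrow/array/ is an implementation detail and the public arrow/builder.h API is preserved, so existing user code keeps compiling unchanged. A minimal sketch of that unchanged surface (values illustrative):

```cpp
#include <iostream>
#include <memory>

#include "arrow/api.h"  // still re-exports all builders via arrow/builder.h

int main() {
  arrow::Int64Builder builder;  // uses arrow::default_memory_pool()
  if (!builder.Append(1).ok() || !builder.AppendNull().ok() ||
      !builder.Append(3).ok()) {
    return 1;
  }
  std::shared_ptr<arrow::Array> array;
  if (!builder.Finish(&array).ok()) {
    return 1;
  }
  std::cout << array->ToString() << std::endl;  // [1, null, 3]
  return 0;
}
```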
+#include "arrow/array/builder_adaptive.h" + +#include #include #include #include #include "arrow/array.h" #include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" diff --git a/cpp/src/arrow/array/builder_adaptive.h b/cpp/src/arrow/array/builder_adaptive.h new file mode 100644 index 0000000000000..6523de41622e4 --- /dev/null +++ b/cpp/src/arrow/array/builder_adaptive.h @@ -0,0 +1,174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/array/builder_base.h" + +namespace arrow { + +namespace internal { + +class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { + public: + explicit AdaptiveIntBuilderBase(MemoryPool* pool); + + /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory + Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { + ARROW_RETURN_NOT_OK(CommitPendingData()); + ARROW_RETURN_NOT_OK(Reserve(length)); + memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length); + UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); + } + + Status AppendNull() { + pending_data_[pending_pos_] = 0; + pending_valid_[pending_pos_] = 0; + pending_has_nulls_ = true; + ++pending_pos_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + + void Reset() override; + Status Resize(int64_t capacity) override; + + protected: + virtual Status CommitPendingData() = 0; + + std::shared_ptr data_; + uint8_t* raw_data_; + uint8_t int_size_; + + static constexpr int32_t pending_size_ = 1024; + uint8_t pending_valid_[pending_size_]; + uint64_t pending_data_[pending_size_]; + int32_t pending_pos_; + bool pending_has_nulls_; +}; + +} // namespace internal + +class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase { + public: + explicit AdaptiveUIntBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + using ArrayBuilder::Advance; + using internal::AdaptiveIntBuilderBase::Reset; + + /// Scalar append + Status Append(const uint64_t val) { + pending_data_[pending_pos_] = val; + pending_valid_[pending_pos_] = 1; + ++pending_pos_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const uint64_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + Status 
FinishInternal(std::shared_ptr<ArrayData>* out) override; + + protected: + Status CommitPendingData() override; + Status ExpandIntSize(uint8_t new_int_size); + + Status AppendValuesInternal(const uint64_t* values, int64_t length, + const uint8_t* valid_bytes); + + template <typename new_type, typename old_type> + typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type + ExpandIntSizeInternal(); +#define __LESS(a, b) (a) < (b) + template <typename new_type, typename old_type> + typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type + ExpandIntSizeInternal(); +#undef __LESS + + template <typename new_type> + Status ExpandIntSizeN(); +}; + +class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase { + public: + explicit AdaptiveIntBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + using ArrayBuilder::Advance; + using internal::AdaptiveIntBuilderBase::Reset; + + /// Scalar append + Status Append(const int64_t val) { + auto v = static_cast<uint64_t>(val); + + pending_data_[pending_pos_] = v; + pending_valid_[pending_pos_] = 1; + ++pending_pos_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const int64_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + Status FinishInternal(std::shared_ptr<ArrayData>* out) override; + + protected: + Status CommitPendingData() override; + Status ExpandIntSize(uint8_t new_int_size); + + Status AppendValuesInternal(const int64_t* values, int64_t length, + const uint8_t* valid_bytes); + + template <typename new_type, typename old_type> + typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type + ExpandIntSizeInternal(); +#define __LESS(a, b) (a) < (b) + template <typename new_type, typename old_type> + typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type + ExpandIntSizeInternal(); +#undef __LESS + + template <typename new_type> + Status ExpandIntSizeN(); +}; + +} // namespace arrow diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc new file mode 100644 index 0000000000000..321aa44dab5e3 --- /dev/null +++ b/cpp/src/arrow/array/builder_base.cc @@ -0,0 +1,176 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
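+// builder_base.cc: the type-agnostic core of ArrayBuilder split out of the
+// old monolithic builder.cc -- validity-bitmap growth and appends,
+// Reserve/Resize capacity management, and the Finish/FinishInternal plumbing.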
+ +#include "arrow/array/builder_base.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/int-util.h" +#include "arrow/util/logging.h" + +namespace arrow { + +Status ArrayBuilder::TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer) { + if (buffer) { + if (bytes_filled < buffer->size()) { + // Trim buffer + RETURN_NOT_OK(buffer->Resize(bytes_filled)); + } + // zero the padding + buffer->ZeroPadding(); + } else { + // Null buffers are allowed in place of 0-byte buffers + DCHECK_EQ(bytes_filled, 0); + } + return Status::OK(); +} + +Status ArrayBuilder::AppendToBitmap(bool is_valid) { + if (length_ == capacity_) { + // If the capacity was not already a multiple of 2, do so here + // TODO(emkornfield) doubling isn't great default allocation practice + // see https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md + // fo discussion + RETURN_NOT_OK(Resize(BitUtil::NextPower2(capacity_ + 1))); + } + UnsafeAppendToBitmap(is_valid); + return Status::OK(); +} + +Status ArrayBuilder::AppendToBitmap(const uint8_t* valid_bytes, int64_t length) { + RETURN_NOT_OK(Reserve(length)); + + UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); +} + +Status ArrayBuilder::Resize(int64_t capacity) { + // Target size of validity (null) bitmap data + const int64_t new_bitmap_size = BitUtil::BytesForBits(capacity); + RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + + if (capacity_ == 0) { + RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_bitmap_size, &null_bitmap_)); + null_bitmap_data_ = null_bitmap_->mutable_data(); + + // Padding is zeroed by AllocateResizableBuffer + memset(null_bitmap_data_, 0, static_cast(new_bitmap_size)); + } else { + const int64_t old_bitmap_capacity = null_bitmap_->capacity(); + RETURN_NOT_OK(null_bitmap_->Resize(new_bitmap_size)); + + const int64_t new_bitmap_capacity = null_bitmap_->capacity(); + null_bitmap_data_ = null_bitmap_->mutable_data(); + + // Zero the region between the original capacity and the new capacity, + // including padding, which has not been zeroed, unlike + // AllocateResizableBuffer + if (old_bitmap_capacity < new_bitmap_capacity) { + memset(null_bitmap_data_ + old_bitmap_capacity, 0, + static_cast(new_bitmap_capacity - old_bitmap_capacity)); + } + } + capacity_ = capacity; + return Status::OK(); +} + +Status ArrayBuilder::Advance(int64_t elements) { + if (length_ + elements > capacity_) { + return Status::Invalid("Builder must be expanded"); + } + length_ += elements; + return Status::OK(); +} + +Status ArrayBuilder::Finish(std::shared_ptr* out) { + std::shared_ptr internal_data; + RETURN_NOT_OK(FinishInternal(&internal_data)); + *out = MakeArray(internal_data); + return Status::OK(); +} + +Status ArrayBuilder::Reserve(int64_t additional_elements) { + if (length_ + additional_elements > capacity_) { + // TODO(emkornfield) power of 2 growth is potentially suboptimal + int64_t new_size = BitUtil::NextPower2(length_ + additional_elements); + return Resize(new_size); + } + return Status::OK(); +} + +void ArrayBuilder::Reset() { + capacity_ = length_ = null_count_ = 0; + null_bitmap_ = nullptr; +} + +Status ArrayBuilder::SetNotNull(int64_t length) { + RETURN_NOT_OK(Reserve(length)); + UnsafeSetNotNull(length); + return Status::OK(); +} + +void ArrayBuilder::UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t 
length) { + if (valid_bytes == nullptr) { + UnsafeSetNotNull(length); + return; + } + UnsafeAppendToBitmap(valid_bytes, valid_bytes + length); +} + +void ArrayBuilder::UnsafeAppendToBitmap(const std::vector<bool>& is_valid) { + UnsafeAppendToBitmap(is_valid.begin(), is_valid.end()); +} + +void ArrayBuilder::UnsafeSetNotNull(int64_t length) { + const int64_t new_length = length + length_; + + // Fill up the bytes until we have a byte alignment + int64_t pad_to_byte = std::min(8 - (length_ % 8), length); + + if (pad_to_byte == 8) { + pad_to_byte = 0; + } + for (int64_t i = length_; i < length_ + pad_to_byte; ++i) { + BitUtil::SetBit(null_bitmap_data_, i); + } + + // Fast bitsetting + int64_t fast_length = (length - pad_to_byte) / 8; + memset(null_bitmap_data_ + ((length_ + pad_to_byte) / 8), 0xFF, + static_cast<size_t>(fast_length)); + + // Trailing bits + for (int64_t i = length_ + pad_to_byte + (fast_length * 8); i < new_length; ++i) { + BitUtil::SetBit(null_bitmap_data_, i); + } + + length_ = new_length; +} + +} // namespace arrow diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h new file mode 100644 index 0000000000000..ae400fc463810 --- /dev/null +++ b/cpp/src/arrow/array/builder_base.h @@ -0,0 +1,227 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/array/builder_base.h" + +#include // IWYU pragma: keep +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/macros.h" +#include "arrow/util/string_view.h" +#include "arrow/util/type_traits.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +struct ArrayData; + +constexpr int64_t kMinBuilderCapacity = 1 << 5; +constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1; + +/// Base class for all data array builders. +/// +/// This class provides a facilities for incrementally building the null bitmap +/// (see Append methods) and as a side effect the current number of slots and +/// the null count. +/// +/// \note Users are expected to use builders as one of the concrete types below. +/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use. +class ARROW_EXPORT ArrayBuilder { + public: + explicit ArrayBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool) + : type_(type), + pool_(pool), + null_bitmap_(NULLPTR), + null_count_(0), + null_bitmap_data_(NULLPTR), + length_(0), + capacity_(0) {} + + virtual ~ArrayBuilder() = default; + + /// For nested types.
Since the objects are owned by this class instance, we + /// skip shared pointers and just return a raw pointer + ArrayBuilder* child(int i) { return children_[i].get(); } + + int num_children() const { return static_cast<int>(children_.size()); } + + int64_t length() const { return length_; } + int64_t null_count() const { return null_count_; } + int64_t capacity() const { return capacity_; } + + /// \brief Ensure that enough memory has been allocated to fit the indicated + /// number of total elements in the builder, including any that have already + /// been appended. Does not account for reallocations that may be due to + /// variable size data, like binary values. To make space for incremental + /// appends, use Reserve instead. + /// + /// \param[in] capacity the minimum number of total array values to + /// accommodate. Must be greater than the current capacity. + /// \return Status + virtual Status Resize(int64_t capacity); + + /// \brief Ensure that there is enough space allocated to add the indicated + /// number of elements without any further calls to Resize. The memory + /// allocated is rounded up to the next highest power of 2 similar to memory + /// allocations in STL containers like std::vector + /// \param[in] additional_capacity the number of additional array values + /// \return Status + Status Reserve(int64_t additional_capacity); + + /// Reset the builder. + virtual void Reset(); + + /// For cases where raw data was memcpy'd into the internal buffers, allows us + /// to advance the length of the builder. It is your responsibility to use + /// this function responsibly. + Status Advance(int64_t elements); + + /// \brief Return result of builder as an internal generic ArrayData + /// object. Resets builder except for dictionary builder + /// + /// \param[out] out the finalized ArrayData object + /// \return Status + virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0; + + /// \brief Return result of builder as an Array object. + /// + /// The builder is reset except for DictionaryBuilder. + /// + /// \param[out] out the finalized Array object + /// \return Status + Status Finish(std::shared_ptr<Array>* out); + + std::shared_ptr<DataType> type() const { return type_; } + + protected: + ArrayBuilder() {} + + /// Append to null bitmap + Status AppendToBitmap(bool is_valid); + + /// Vector append. Treat each zero byte as a null. If valid_bytes is null + /// assume all of length bits are valid. + Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length); + + /// Set the next length bits to not null (i.e. valid).
+ Status SetNotNull(int64_t length); + + // Unsafe operations (don't check capacity/don't resize) + + void UnsafeAppendNull() { UnsafeAppendToBitmap(false); } + + // Append to null bitmap, update the length + void UnsafeAppendToBitmap(bool is_valid) { + if (is_valid) { + BitUtil::SetBit(null_bitmap_data_, length_); + } else { + ++null_count_; + } + ++length_; + } + + template <typename IterType> + void UnsafeAppendToBitmap(const IterType& begin, const IterType& end) { + int64_t byte_offset = length_ / 8; + int64_t bit_offset = length_ % 8; + uint8_t bitset = null_bitmap_data_[byte_offset]; + + for (auto iter = begin; iter != end; ++iter) { + if (bit_offset == 8) { + bit_offset = 0; + null_bitmap_data_[byte_offset] = bitset; + byte_offset++; + // TODO: Except for the last byte, this shouldn't be needed + bitset = null_bitmap_data_[byte_offset]; + } + + if (*iter) { + bitset |= BitUtil::kBitmask[bit_offset]; + } else { + bitset &= BitUtil::kFlippedBitmask[bit_offset]; + ++null_count_; + } + + bit_offset++; + } + + if (bit_offset != 0) { + null_bitmap_data_[byte_offset] = bitset; + } + + length_ += std::distance(begin, end); + } + + // Vector append. Treat each zero byte as a null. If valid_bytes is null + // assume all of length bits are valid. + void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length); + + void UnsafeAppendToBitmap(const std::vector<bool>& is_valid); + + // Set the next length bits to not null (i.e. valid). + void UnsafeSetNotNull(int64_t length); + + static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer); + + static Status CheckCapacity(int64_t new_capacity, int64_t old_capacity) { + if (new_capacity < 0) { + return Status::Invalid("Resize capacity must be positive"); + } + if (new_capacity < old_capacity) { + return Status::Invalid("Resize cannot downsize"); + } + return Status::OK(); + } + + std::shared_ptr<DataType> type_; + MemoryPool* pool_; + + // When nulls are first appended to the builder, the null bitmap is allocated + std::shared_ptr<ResizableBuffer> null_bitmap_; + int64_t null_count_; + uint8_t* null_bitmap_data_; + + // Array length, so far. Also, the index of the next element to be added + int64_t length_; + int64_t capacity_; + + // Child value array builders. These are owned by this class + std::vector<std::unique_ptr<ArrayBuilder>> children_; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); +}; + +} // namespace arrow diff --git a/cpp/src/arrow/builder-binary.cc b/cpp/src/arrow/array/builder_binary.cc similarity index 86% rename from cpp/src/arrow/builder-binary.cc rename to cpp/src/arrow/array/builder_binary.cc index c250837b4a3fa..ad6ba11a484d1 100644 --- a/cpp/src/arrow/builder-binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License.
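A small, self-contained illustration of the bitmap machinery declared above: AppendValues with a valid_bytes array flows through UnsafeAppendToBitmap, which treats each zero byte as a null slot (values illustrative):

```cpp
#include <cstdint>
#include <iostream>
#include <memory>
#include <vector>

#include "arrow/api.h"

int main() {
  arrow::Int64Builder builder;
  std::vector<int64_t> values = {1, 2, 3, 4};
  std::vector<uint8_t> valid_bytes = {1, 0, 1, 1};  // second slot is null
  if (!builder.AppendValues(values.data(), 4, valid_bytes.data()).ok()) {
    return 1;
  }
  std::shared_ptr<arrow::Array> array;
  if (!builder.Finish(&array).ok()) {
    return 1;
  }
  // null_count() reflects the bits cleared by UnsafeAppendToBitmap.
  std::cout << array->null_count() << std::endl;  // 1
  return 0;
}
```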
+#include "arrow/array/builder_binary.h" + #include #include #include @@ -27,7 +29,6 @@ #include "arrow/array.h" #include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" @@ -68,32 +69,11 @@ Status BinaryBuilder::ReserveData(int64_t elements) { return Status::OK(); } -Status BinaryBuilder::AppendNextOffset() { - const int64_t num_bytes = value_data_builder_.length(); - if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) { - std::stringstream ss; - ss << "BinaryArray cannot contain more than " << kBinaryMemoryLimit << " bytes, have " - << num_bytes; - return Status::CapacityError(ss.str()); - } - return offsets_builder_.Append(static_cast(num_bytes)); -} - -Status BinaryBuilder::Append(const uint8_t* value, int32_t length) { - RETURN_NOT_OK(Reserve(1)); - RETURN_NOT_OK(AppendNextOffset()); - RETURN_NOT_OK(value_data_builder_.Append(value, length)); - - UnsafeAppendToBitmap(true); - return Status::OK(); -} - -Status BinaryBuilder::AppendNull() { - RETURN_NOT_OK(AppendNextOffset()); - RETURN_NOT_OK(Reserve(1)); - - UnsafeAppendToBitmap(false); - return Status::OK(); +Status BinaryBuilder::AppendOverflow(int64_t num_bytes) { + std::stringstream ss; + ss << "BinaryArray cannot contain more than " << kBinaryMemoryLimit << " bytes, have " + << num_bytes; + return Status::CapacityError(ss.str()); } Status BinaryBuilder::FinishInternal(std::shared_ptr* out) { @@ -292,24 +272,46 @@ util::string_view FixedSizeBinaryBuilder::GetView(int64_t i) const { } // ---------------------------------------------------------------------- -// Decimal128Builder +// ChunkedArray builders -Decimal128Builder::Decimal128Builder(const std::shared_ptr& type, - MemoryPool* pool) - : FixedSizeBinaryBuilder(type, pool) {} +namespace internal { -Status Decimal128Builder::Append(const Decimal128& value) { - RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1)); - return FixedSizeBinaryBuilder::Append(value.ToBytes()); +ChunkedBinaryBuilder::ChunkedBinaryBuilder(int32_t max_chunk_size, MemoryPool* pool) + : max_chunk_size_(max_chunk_size), + chunk_data_size_(0), + builder_(new BinaryBuilder(pool)) {} + +Status ChunkedBinaryBuilder::Finish(ArrayVector* out) { + if (builder_->length() > 0 || chunks_.size() == 0) { + std::shared_ptr chunk; + RETURN_NOT_OK(builder_->Finish(&chunk)); + chunks_.emplace_back(std::move(chunk)); + } + *out = std::move(chunks_); + return Status::OK(); } -Status Decimal128Builder::FinishInternal(std::shared_ptr* out) { - std::shared_ptr data; - RETURN_NOT_OK(byte_builder_.Finish(&data)); +Status ChunkedBinaryBuilder::NextChunk() { + std::shared_ptr chunk; + RETURN_NOT_OK(builder_->Finish(&chunk)); + chunks_.emplace_back(std::move(chunk)); - *out = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_); + chunk_data_size_ = 0; + return Status::OK(); +} +Status ChunkedStringBuilder::Finish(ArrayVector* out) { + RETURN_NOT_OK(ChunkedBinaryBuilder::Finish(out)); + + // Change data type to string/utf8 + for (size_t i = 0; i < out->size(); ++i) { + std::shared_ptr data = (*out)[i]->data(); + data->type = ::arrow::utf8(); + (*out)[i] = std::make_shared(data); + } return Status::OK(); } +} // namespace internal + } // namespace arrow diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h new file mode 100644 index 0000000000000..7c101bdffc5e4 --- /dev/null +++ b/cpp/src/arrow/array/builder_binary.h @@ -0,0 +1,304 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more 
contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/array/builder_base.h" +#include "arrow/status.h" +#include "arrow/type_traits.h" +#include "arrow/util/macros.h" +#include "arrow/util/string_view.h" + +namespace arrow { + +constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; + +// ---------------------------------------------------------------------- +// Binary and String + +/// \class BinaryBuilder +/// \brief Builder class for variable-length binary data +class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { + public: + explicit BinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + BinaryBuilder(const std::shared_ptr& type, MemoryPool* pool); + + Status Append(const uint8_t* value, int32_t length) { + ARROW_RETURN_NOT_OK(Reserve(1)); + ARROW_RETURN_NOT_OK(AppendNextOffset()); + ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length)); + + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + Status AppendNull() { + ARROW_RETURN_NOT_OK(AppendNextOffset()); + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + Status Append(const char* value, int32_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(util::string_view value) { + return Append(value.data(), static_cast(value.size())); + } + + /// \brief Append without checking capacity + /// + /// Offsets and data should have been presized using Reserve() and + /// ReserveData(), respectively. + void UnsafeAppend(const uint8_t* value, int32_t length) { + UnsafeAppendNextOffset(); + value_data_builder_.UnsafeAppend(value, length); + UnsafeAppendToBitmap(true); + } + + void UnsafeAppend(const char* value, int32_t length) { + UnsafeAppend(reinterpret_cast(value), length); + } + + void UnsafeAppend(const std::string& value) { + UnsafeAppend(value.c_str(), static_cast(value.size())); + } + + void UnsafeAppendNull() { + const int64_t num_bytes = value_data_builder_.length(); + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + UnsafeAppendToBitmap(false); + } + + void Reset() override; + Status Resize(int64_t capacity) override; + + /// \brief Ensures there is enough allocated capacity to append the indicated + /// number of bytes to the value data buffer without additional allocations + Status ReserveData(int64_t elements); + + Status FinishInternal(std::shared_ptr* out) override; + + /// \return size of values buffer so far + int64_t value_data_length() const { return value_data_builder_.length(); } + /// \return capacity of values buffer + int64_t value_data_capacity() const { return value_data_builder_.capacity(); } + + /// Temporary access to a value. + /// + /// This pointer becomes invalid on the next modifying operation. 
+ const uint8_t* GetValue(int64_t i, int32_t* out_length) const; + + /// Temporary access to a value. + /// + /// This view becomes invalid on the next modifying operation. + util::string_view GetView(int64_t i) const; + + protected: + TypedBufferBuilder offsets_builder_; + TypedBufferBuilder value_data_builder_; + + Status AppendOverflow(int64_t num_bytes); + + Status AppendNextOffset() { + const int64_t num_bytes = value_data_builder_.length(); + if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) { + return AppendOverflow(num_bytes); + } + return offsets_builder_.Append(static_cast(num_bytes)); + } + + void UnsafeAppendNextOffset() { + const int64_t num_bytes = value_data_builder_.length(); + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + } +}; + +/// \class StringBuilder +/// \brief Builder class for UTF8 strings +class ARROW_EXPORT StringBuilder : public BinaryBuilder { + public: + using BinaryBuilder::BinaryBuilder; + explicit StringBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + using BinaryBuilder::Append; + using BinaryBuilder::Reset; + using BinaryBuilder::UnsafeAppend; + + /// \brief Append a sequence of strings in one shot. + /// + /// \param[in] values a vector of strings + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const std::vector& values, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Append a sequence of nul-terminated strings in one shot. + /// If one of the values is NULL, it is processed as a null + /// value even if the corresponding valid_bytes entry is 1. + /// + /// \param[in] values a contiguous C array of nul-terminated char * + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const char** values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); +}; + +// ---------------------------------------------------------------------- +// FixedSizeBinaryBuilder + +class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { + public: + FixedSizeBinaryBuilder(const std::shared_ptr& type, + MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + Status Append(const uint8_t* value) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(true); + return byte_builder_.Append(value, byte_width_); + } + + Status Append(const char* value) { + return Append(reinterpret_cast(value)); + } + + Status Append(const util::string_view& view) { +#ifndef NDEBUG + CheckValueSize(static_cast(view.size())); +#endif + return Append(reinterpret_cast(view.data())); + } + + Status Append(const std::string& s) { +#ifndef NDEBUG + CheckValueSize(static_cast(s.size())); +#endif + return Append(reinterpret_cast(s.data())); + } + + template + Status Append(const std::array& value) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(true); + return byte_builder_.Append(value); + } + + Status AppendValues(const uint8_t* data, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + Status AppendNull(); + + void Reset() override; + Status Resize(int64_t capacity) override; + Status FinishInternal(std::shared_ptr* out) override; + + /// \return size of values buffer so far + int64_t value_data_length() const { return byte_builder_.length(); } + + int32_t byte_width() const { return byte_width_; } + + /// Temporary access to a value. 
+ /// + /// This pointer becomes invalid on the next modifying operation. + const uint8_t* GetValue(int64_t i) const; + + /// Temporary access to a value. + /// + /// This view becomes invalid on the next modifying operation. + util::string_view GetView(int64_t i) const; + + protected: + int32_t byte_width_; + BufferBuilder byte_builder_; + +#ifndef NDEBUG + void CheckValueSize(int64_t size); +#endif +}; + +// ---------------------------------------------------------------------- +// Chunked builders: build a sequence of BinaryArray or StringArray that are +// limited to a particular size (to the upper limit of 2GB) + +namespace internal { + +class ARROW_EXPORT ChunkedBinaryBuilder { + public: + ChunkedBinaryBuilder(int32_t max_chunk_size, + MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + virtual ~ChunkedBinaryBuilder() = default; + + Status Append(const uint8_t* value, int32_t length) { + if (ARROW_PREDICT_FALSE(length + chunk_data_size_ > max_chunk_size_)) { + // Move onto next chunk, unless the builder length is currently 0, which + // means that max_chunk_size_ is less than the item length + if (builder_->length() > 0) { + ARROW_RETURN_NOT_OK(NextChunk()); + } + // else fall through + } + + chunk_data_size_ += length; + return builder_->Append(value, length); + } + + Status Append(const util::string_view& value) { + return Append(reinterpret_cast(value.data()), + static_cast(value.size())); + } + + Status AppendNull() { + if (ARROW_PREDICT_FALSE(builder_->length() == std::numeric_limits::max())) { + ARROW_RETURN_NOT_OK(NextChunk()); + } + return builder_->AppendNull(); + } + + virtual Status Finish(ArrayVector* out); + + protected: + Status NextChunk(); + + int32_t max_chunk_size_; + int32_t chunk_data_size_; + + std::unique_ptr builder_; + std::vector> chunks_; +}; + +class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder { + public: + using ChunkedBinaryBuilder::ChunkedBinaryBuilder; + + Status Finish(ArrayVector* out) override; +}; + +} // namespace internal + +} // namespace arrow diff --git a/cpp/src/arrow/array/builder_decimal.cc b/cpp/src/arrow/array/builder_decimal.cc new file mode 100644 index 0000000000000..d64c4db6f0c30 --- /dev/null +++ b/cpp/src/arrow/array/builder_decimal.cc @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
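Because ChunkedBinaryBuilder rolls over to a new chunk whenever appending would exceed max_chunk_size, callers get back a vector of arrays rather than a single array. A rough usage sketch of this internal API (the 1 MB chunk size and the helper name BuildChunked are arbitrary choices for illustration):

#include <string>

#include "arrow/array/builder_binary.h"
#include "arrow/status.h"

// Sketch: build many small binary values while keeping each chunk under
// ~1MB; Finish() yields one BinaryArray per chunk.
arrow::Status BuildChunked(arrow::ArrayVector* chunks) {
  arrow::internal::ChunkedBinaryBuilder builder(1 << 20 /* max_chunk_size */);
  const std::string value = "some binary payload";
  for (int i = 0; i < 1000; ++i) {
    ARROW_RETURN_NOT_OK(
        builder.Append(reinterpret_cast<const uint8_t*>(value.data()),
                       static_cast<int32_t>(value.size())));
  }
  return builder.Finish(chunks);
}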
+ +#include "arrow/array/builder_decimal.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/decimal.h" +#include "arrow/util/logging.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// Decimal128Builder + +Decimal128Builder::Decimal128Builder(const std::shared_ptr& type, + MemoryPool* pool) + : FixedSizeBinaryBuilder(type, pool) {} + +Status Decimal128Builder::Append(const Decimal128& value) { + RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1)); + return FixedSizeBinaryBuilder::Append(value.ToBytes()); +} + +Status Decimal128Builder::FinishInternal(std::shared_ptr* out) { + std::shared_ptr data; + RETURN_NOT_OK(byte_builder_.Finish(&data)); + + *out = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_); + + return Status::OK(); +} + +} // namespace arrow diff --git a/cpp/src/arrow/array/builder_decimal.h b/cpp/src/arrow/array/builder_decimal.h new file mode 100644 index 0000000000000..fb40a7950abbd --- /dev/null +++ b/cpp/src/arrow/array/builder_decimal.h @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_binary.h" + +namespace arrow { + +class Decimal128; + +class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder { + public: + explicit Decimal128Builder(const std::shared_ptr& type, + MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + using FixedSizeBinaryBuilder::Append; + using FixedSizeBinaryBuilder::AppendValues; + using FixedSizeBinaryBuilder::Reset; + + Status Append(const Decimal128& val); + + Status FinishInternal(std::shared_ptr* out) override; +}; + +using DecimalBuilder = Decimal128Builder; + +} // namespace arrow diff --git a/cpp/src/arrow/builder-dict.cc b/cpp/src/arrow/array/builder_dict.cc similarity index 99% rename from cpp/src/arrow/builder-dict.cc rename to cpp/src/arrow/array/builder_dict.cc index b021c3a9d37cc..0891e4c0829f4 100644 --- a/cpp/src/arrow/builder-dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -15,13 +15,15 @@ // specific language governing permissions and limitations // under the License. 
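Decimal128Builder reduces to FixedSizeBinaryBuilder: each Append reserves one fixed-width slot and stores the 16-byte representation of the value. A small sketch, assuming a decimal(precision, scale) type and that Decimal128 can be constructed from a decimal string (the string constructor is not part of this patch, so treat it as an assumption):

#include "arrow/array/builder_decimal.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/decimal.h"

// Sketch: append two decimal values with precision 10, scale 2.
arrow::Status BuildDecimals(std::shared_ptr<arrow::Array>* out) {
  auto type = arrow::decimal(10, 2);
  arrow::Decimal128Builder builder(type);  // uses the default memory pool
  ARROW_RETURN_NOT_OK(builder.Append(arrow::Decimal128("123.45")));  // assumed ctor
  ARROW_RETURN_NOT_OK(builder.Append(arrow::Decimal128("-67.89")));
  return builder.Finish(out);
}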
+#include "arrow/array/builder_dict.h" + +#include #include #include #include #include "arrow/array.h" #include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h new file mode 100644 index 0000000000000..6f0271683aea2 --- /dev/null +++ b/cpp/src/arrow/array/builder_dict.h @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/array/builder_adaptive.h" // IWYU pragma: export +#include "arrow/array/builder_base.h" // IWYU pragma: export + +namespace arrow { + +// ---------------------------------------------------------------------- +// Dictionary builder + +namespace internal { + +template +struct DictionaryScalar { + using type = typename T::c_type; +}; + +template <> +struct DictionaryScalar { + using type = util::string_view; +}; + +template <> +struct DictionaryScalar { + using type = util::string_view; +}; + +template <> +struct DictionaryScalar { + using type = util::string_view; +}; + +} // namespace internal + +/// \brief Array builder for created encoded DictionaryArray from dense array +/// +/// Unlike other builders, dictionary builder does not completely reset the state +/// on Finish calls. The arrays built after the initial Finish call will reuse +/// the previously created encoding and build a delta dictionary when new terms +/// occur. +/// +/// data +template +class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { + public: + using Scalar = typename internal::DictionaryScalar::type; + + // WARNING: the type given below is the value type, not the DictionaryType. + // The DictionaryType is instantiated on the Finish() call. 
+ DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool); + + template + explicit DictionaryBuilder( + typename std::enable_if::is_parameter_free, MemoryPool*>::type pool) + : DictionaryBuilder(TypeTraits::type_singleton(), pool) {} + + ~DictionaryBuilder() override; + + /// \brief Append a scalar value + Status Append(const Scalar& value); + + /// \brief Append a fixed-width string (only for FixedSizeBinaryType) + template + Status Append(typename std::enable_if::value, + const uint8_t*>::type value) { + return Append(util::string_view(reinterpret_cast(value), byte_width_)); + } + + /// \brief Append a fixed-width string (only for FixedSizeBinaryType) + template + Status Append(typename std::enable_if::value, + const char*>::type value) { + return Append(util::string_view(value, byte_width_)); + } + + /// \brief Append a scalar null value + Status AppendNull(); + + /// \brief Append a whole dense array to the builder + Status AppendArray(const Array& array); + + void Reset() override; + Status Resize(int64_t capacity) override; + Status FinishInternal(std::shared_ptr* out) override; + + /// is the dictionary builder in the delta building mode + bool is_building_delta() { return delta_offset_ > 0; } + + protected: + class MemoTableImpl; + std::unique_ptr memo_table_; + + int32_t delta_offset_; + // Only used for FixedSizeBinaryType + int32_t byte_width_; + + AdaptiveIntBuilder values_builder_; +}; + +template <> +class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { + public: + DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool); + explicit DictionaryBuilder(MemoryPool* pool); + + /// \brief Append a scalar null value + Status AppendNull(); + + /// \brief Append a whole dense array to the builder + Status AppendArray(const Array& array); + + Status Resize(int64_t capacity) override; + Status FinishInternal(std::shared_ptr* out) override; + + protected: + AdaptiveIntBuilder values_builder_; +}; + +class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder { + public: + using DictionaryBuilder::Append; + using DictionaryBuilder::DictionaryBuilder; + + Status Append(const uint8_t* value, int32_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(const char* value, int32_t length) { + return Append(util::string_view(value, length)); + } +}; + +/// \brief Dictionary array builder with convenience methods for strings +class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder { + public: + using DictionaryBuilder::Append; + using DictionaryBuilder::DictionaryBuilder; + + Status Append(const uint8_t* value, int32_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(const char* value, int32_t length) { + return Append(util::string_view(value, length)); + } +}; + +} // namespace arrow diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc new file mode 100644 index 0000000000000..e73324323af3d --- /dev/null +++ b/cpp/src/arrow/array/builder_nested.cc @@ -0,0 +1,156 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
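StringDictionaryBuilder is the convenience entry point for the common case of dictionary-encoding strings: repeated values are stored once in the dictionary while the array itself holds small integer indices. A minimal sketch (the helper name BuildDictionary is illustrative):

#include "arrow/builder.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"

// Sketch: "a" and "b" are stored once in the dictionary; the resulting
// DictionaryArray holds the indices [0, 1, 0].
arrow::Status BuildDictionary(std::shared_ptr<arrow::Array>* out) {
  arrow::StringDictionaryBuilder builder(arrow::default_memory_pool());
  ARROW_RETURN_NOT_OK(builder.Append("a", 1));
  ARROW_RETURN_NOT_OK(builder.Append("b", 1));
  ARROW_RETURN_NOT_OK(builder.Append("a", 1));
  return builder.Finish(out);  // instantiates the DictionaryType here
}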
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/builder_nested.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/int-util.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// ListBuilder
+
+ListBuilder::ListBuilder(MemoryPool* pool,
+                         std::shared_ptr<ArrayBuilder> const& value_builder,
+                         const std::shared_ptr<DataType>& type)
+    : ArrayBuilder(type ? type
+                        : std::static_pointer_cast<DataType>(
+                              std::make_shared<ListType>(value_builder->type())),
+                   pool),
+      offsets_builder_(pool),
+      value_builder_(value_builder) {}
+
+Status ListBuilder::AppendValues(const int32_t* offsets, int64_t length,
+                                 const uint8_t* valid_bytes) {
+  RETURN_NOT_OK(Reserve(length));
+  UnsafeAppendToBitmap(valid_bytes, length);
+  offsets_builder_.UnsafeAppend(offsets, length);
+  return Status::OK();
+}
+
+Status ListBuilder::AppendNextOffset() {
+  int64_t num_values = value_builder_->length();
+  if (ARROW_PREDICT_FALSE(num_values > kListMaximumElements)) {
+    std::stringstream ss;
+    ss << "ListArray cannot contain more than INT32_MAX - 1 child elements,"
+       << " have " << num_values;
+    return Status::CapacityError(ss.str());
+  }
+  return offsets_builder_.Append(static_cast<int32_t>(num_values));
+}
+
+Status ListBuilder::Append(bool is_valid) {
+  RETURN_NOT_OK(Reserve(1));
+  UnsafeAppendToBitmap(is_valid);
+  return AppendNextOffset();
+}
+
+Status ListBuilder::Resize(int64_t capacity) {
+  DCHECK_LE(capacity, kListMaximumElements);
+  RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
+
+  // one more than requested, for the offsets
+  RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
+  return ArrayBuilder::Resize(capacity);
+}
+
+Status ListBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  RETURN_NOT_OK(AppendNextOffset());
+
+  // Offset padding zeroed by BufferBuilder
+  std::shared_ptr<Buffer> offsets;
+  RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
+
+  std::shared_ptr<ArrayData> items;
+  if (values_) {
+    items = values_->data();
+  } else {
+    if (value_builder_->length() == 0) {
+      // Try to make sure we get a non-null values buffer (ARROW-2744)
+      RETURN_NOT_OK(value_builder_->Resize(0));
+    }
+    RETURN_NOT_OK(value_builder_->FinishInternal(&items));
+  }
+
+  *out = ArrayData::Make(type_, length_, {null_bitmap_, offsets}, null_count_);
+  (*out)->child_data.emplace_back(std::move(items));
+  Reset();
+  return Status::OK();
+}
+
+void ListBuilder::Reset() {
+  ArrayBuilder::Reset();
+  values_.reset();
+  offsets_builder_.Reset();
+  value_builder_->Reset();
+}
+
+ArrayBuilder* ListBuilder::value_builder() const {
+  DCHECK(!values_) << "Using value builder is pointless when values_ is set";
+  return value_builder_.get();
+}
+
+// ----------------------------------------------------------------------
+// Struct
+
+StructBuilder::StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
+                             std::vector<std::unique_ptr<ArrayBuilder>>&& field_builders)
+    : ArrayBuilder(type, pool) {
+  children_ = std::move(field_builders);
+}
+
+void StructBuilder::Reset() {
+  ArrayBuilder::Reset();
+  for (const auto& field_builder : children_) {
+    field_builder->Reset();
+  }
+}
+
+Status StructBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get()));
+  *out = ArrayData::Make(type_, length_, {null_bitmap_}, null_count_);
+
+  (*out)->child_data.resize(children_.size());
+  for (size_t i = 0; i < children_.size(); ++i) {
+    if (length_ == 0) {
+      // Try to make sure the child buffers are initialized
+      RETURN_NOT_OK(children_[i]->Resize(0));
+    }
+    RETURN_NOT_OK(children_[i]->FinishInternal(&(*out)->child_data[i]));
+  }
+
+  null_bitmap_ = nullptr;
+  capacity_ = length_ = null_count_ = 0;
+  return Status::OK();
+}
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h
new file mode 100644
index 0000000000000..863e6fef06f7d
--- /dev/null
+++ b/cpp/src/arrow/array/builder_nested.h
@@ -0,0 +1,121 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include
+#include
+
+#include "arrow/array/builder_base.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// List builder
+
+/// \class ListBuilder
+/// \brief Builder class for variable-length list array value types
+///
+/// To use this class, you must append values to the child array builder and
+/// use the Append function to delimit each distinct list value (once the
+/// values have been appended to the child array) or use the bulk API to
+/// append a sequence of offsets and null values.
+///
+/// A note on types: per arrow/type.h, all types in the C++ implementation
+/// are logical, so even though this class always builds a list array, it can
+/// represent multiple different logical types. If no logical type is
+/// provided at construction time, the class defaults to List<T>, where T is
+/// taken from the value_builder/values that the object is constructed with.
+class ARROW_EXPORT ListBuilder : public ArrayBuilder {
+ public:
+  /// Use this constructor to incrementally build the value array along with
+  /// offsets and null bitmap.
+  ListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder,
+              const std::shared_ptr<DataType>& type = NULLPTR);
+
+  Status Resize(int64_t capacity) override;
+  void Reset() override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// \brief Vector append
+  ///
+  /// If passed, valid_bytes is of equal length to values, and any zero byte
+  /// will be considered as a null for that slot
+  Status AppendValues(const int32_t* offsets, int64_t length,
+                      const uint8_t* valid_bytes = NULLPTR);
+
+  /// \brief Start a new variable-length list slot
+  ///
+  /// This function should be called before beginning to append elements to the
+  /// value builder
+  Status Append(bool is_valid = true);
+
+  Status AppendNull() { return Append(false); }
+
+  ArrayBuilder* value_builder() const;
+
+ protected:
+  TypedBufferBuilder<int32_t> offsets_builder_;
+  std::shared_ptr<ArrayBuilder> value_builder_;
+  std::shared_ptr<Array> values_;
+
+  Status AppendNextOffset();
+};
+
+// ----------------------------------------------------------------------
+// Struct
+
+// ---------------------------------------------------------------------------------
+// StructArray builder
+/// The Append, Resize and Reserve methods act on the StructBuilder itself.
+/// Make sure to call them and the corresponding methods of all child builders
+/// consistently to maintain data-structure consistency.
class ARROW_EXPORT StructBuilder : public ArrayBuilder {
+ public:
+  StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
+                std::vector<std::unique_ptr<ArrayBuilder>>&& field_builders);
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// The null bitmap is of equal length to every child field, and any zero
+  /// byte will be considered a null for that field. Users must call the
+  /// append or advance methods of the child builders independently to insert
+  /// the data.
+  Status AppendValues(int64_t length, const uint8_t* valid_bytes) {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    UnsafeAppendToBitmap(valid_bytes, length);
+    return Status::OK();
+  }
+
+  /// Append an element to the Struct. The Append method of each child builder
+  /// must be called independently to maintain data-structure consistency.
+  Status Append(bool is_valid = true) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    UnsafeAppendToBitmap(is_valid);
+    return Status::OK();
+  }
+
+  Status AppendNull() { return Append(false); }
+
+  void Reset() override;
+
+  ArrayBuilder* field_builder(int i) const { return children_[i].get(); }
+
+  int num_fields() const { return static_cast<int>(children_.size()); }
+};
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/array/builder_primitive.cc b/cpp/src/arrow/array/builder_primitive.cc
new file mode 100644
index 0000000000000..bc14000c3e10d
--- /dev/null
+++ b/cpp/src/arrow/array/builder_primitive.cc
@@ -0,0 +1,272 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
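Putting ListBuilder together with the child-builder discipline described above: Append() on the list builder delimits a slot, while the elements themselves go into the child value builder. A small sketch building the value [[1, 2], null, [3]] (the helper name BuildListOfInt64 is illustrative; Int64Builder is declared later in this patch):

#include <memory>

#include "arrow/builder.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"

// Sketch: build [[1, 2], null, [3]] with a ListBuilder over an Int64Builder.
arrow::Status BuildListOfInt64(std::shared_ptr<arrow::Array>* out) {
  auto pool = arrow::default_memory_pool();
  auto value_builder = std::make_shared<arrow::Int64Builder>(pool);
  arrow::ListBuilder list_builder(pool, value_builder);

  ARROW_RETURN_NOT_OK(list_builder.Append());      // start slot [1, 2]
  ARROW_RETURN_NOT_OK(value_builder->Append(1));
  ARROW_RETURN_NOT_OK(value_builder->Append(2));
  ARROW_RETURN_NOT_OK(list_builder.AppendNull());  // null slot
  ARROW_RETURN_NOT_OK(list_builder.Append());      // start slot [3]
  ARROW_RETURN_NOT_OK(value_builder->Append(3));
  return list_builder.Finish(out);
}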
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/builder_primitive.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/int-util.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// Null builder
+
+Status NullBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  *out = ArrayData::Make(null(), length_, {nullptr}, length_);
+  length_ = null_count_ = 0;
+  return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+
+template <typename T>
+void PrimitiveBuilder<T>::Reset() {
+  data_.reset();
+  raw_data_ = nullptr;
+}
+
+template <typename T>
+Status PrimitiveBuilder<T>::Resize(int64_t capacity) {
+  RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
+  capacity = std::max(capacity, kMinBuilderCapacity);
+
+  int64_t nbytes = TypeTraits<T>::bytes_required(capacity);
+  if (capacity_ == 0) {
+    RETURN_NOT_OK(AllocateResizableBuffer(pool_, nbytes, &data_));
+  } else {
+    RETURN_NOT_OK(data_->Resize(nbytes));
+  }
+
+  raw_data_ = reinterpret_cast<value_type*>(data_->mutable_data());
+  return ArrayBuilder::Resize(capacity);
+}
+
+template <typename T>
+Status PrimitiveBuilder<T>::AppendValues(const value_type* values, int64_t length,
+                                         const uint8_t* valid_bytes) {
+  RETURN_NOT_OK(Reserve(length));
+
+  if (length > 0) {
+    std::memcpy(raw_data_ + length_, values,
+                static_cast<std::size_t>(TypeTraits<T>::bytes_required(length)));
+  }
+
+  // length_ is updated by this call
+  ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length);
+  return Status::OK();
+}
+
+template <typename T>
+Status PrimitiveBuilder<T>::AppendValues(const value_type* values, int64_t length,
+                                         const std::vector<bool>& is_valid) {
+  RETURN_NOT_OK(Reserve(length));
+  DCHECK_EQ(length, static_cast<int64_t>(is_valid.size()));
+
+  if (length > 0) {
+    std::memcpy(raw_data_ + length_, values,
+                static_cast<std::size_t>(TypeTraits<T>::bytes_required(length)));
+  }
+
+  // length_ is updated by this call
+  ArrayBuilder::UnsafeAppendToBitmap(is_valid);
+  return Status::OK();
+}
+
+template <typename T>
+Status PrimitiveBuilder<T>::AppendValues(const std::vector<value_type>& values,
+                                         const std::vector<bool>& is_valid) {
+  return AppendValues(values.data(), static_cast<int64_t>(values.size()), is_valid);
+}
+
+template <typename T>
+Status PrimitiveBuilder<T>::AppendValues(const std::vector<value_type>& values) {
+  return AppendValues(values.data(), static_cast<int64_t>(values.size()));
+}
+
+template <typename T>
+Status PrimitiveBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get()));
+  RETURN_NOT_OK(TrimBuffer(TypeTraits<T>::bytes_required(length_), data_.get()));
+
+  *out = ArrayData::Make(type_, length_, {null_bitmap_, data_}, null_count_);
+
+  data_ = null_bitmap_ = nullptr;
+  capacity_ = length_ = null_count_ = 0;
+
+  return Status::OK();
+}
+
+template class PrimitiveBuilder<UInt8Type>;
+template class PrimitiveBuilder<UInt16Type>;
+template class PrimitiveBuilder<UInt32Type>;
+template class PrimitiveBuilder<UInt64Type>;
+template class PrimitiveBuilder<Int8Type>;
+template class PrimitiveBuilder<Int16Type>;
+template class PrimitiveBuilder<Int32Type>;
+template class PrimitiveBuilder<Int64Type>;
+template class PrimitiveBuilder<TimestampType>;
+template class PrimitiveBuilder<Time32Type>;
+template class PrimitiveBuilder<Time64Type>;
+template class PrimitiveBuilder<Date32Type>;
+template class PrimitiveBuilder<Date64Type>;
+template class PrimitiveBuilder<HalfFloatType>;
+template class PrimitiveBuilder<FloatType>;
+template class PrimitiveBuilder<DoubleType>;
+
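The AppendValues overloads above all follow the same shape: reserve, memcpy the values, then update the validity bitmap in a single pass. A short sketch of the bulk API from the caller's side (the helper name BuildDoubles is illustrative; the zero byte in valid_bytes marks the middle slot null):

#include "arrow/builder.h"
#include "arrow/status.h"

// Sketch: bulk-append three doubles where the middle value is null.
arrow::Status BuildDoubles(std::shared_ptr<arrow::Array>* out) {
  arrow::DoubleBuilder builder;
  const double values[] = {1.5, 0.0, 2.5};
  const uint8_t valid_bytes[] = {1, 0, 1};  // zero byte => null slot
  ARROW_RETURN_NOT_OK(builder.AppendValues(values, 3, valid_bytes));
  return builder.Finish(out);
}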
+BooleanBuilder::BooleanBuilder(MemoryPool* pool) + : ArrayBuilder(boolean(), pool), data_(nullptr), raw_data_(nullptr) {} + +BooleanBuilder::BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool) + : BooleanBuilder(pool) { + DCHECK_EQ(Type::BOOL, type->id()); +} + +void BooleanBuilder::Reset() { + ArrayBuilder::Reset(); + data_.reset(); + raw_data_ = nullptr; +} + +Status BooleanBuilder::Resize(int64_t capacity) { + RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + capacity = std::max(capacity, kMinBuilderCapacity); + + const int64_t new_bitmap_size = BitUtil::BytesForBits(capacity); + if (capacity_ == 0) { + RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_bitmap_size, &data_)); + raw_data_ = reinterpret_cast(data_->mutable_data()); + + // We zero the memory for booleans to keep things simple; for some reason if + // we do not, even though we may write every bit (through in-place | or &), + // valgrind will still show a warning. If we do not zero the bytes here, we + // will have to be careful to zero them in AppendNull and AppendNulls. Also, + // zeroing the bits results in deterministic bits when each byte may have a + // mix of nulls and not nulls. + // + // We only zero up to new_bitmap_size because the padding was zeroed by + // AllocateResizableBuffer + memset(raw_data_, 0, static_cast(new_bitmap_size)); + } else { + const int64_t old_bitmap_capacity = data_->capacity(); + RETURN_NOT_OK(data_->Resize(new_bitmap_size)); + const int64_t new_bitmap_capacity = data_->capacity(); + raw_data_ = reinterpret_cast(data_->mutable_data()); + + // See comment above about why we zero memory for booleans + memset(raw_data_ + old_bitmap_capacity, 0, + static_cast(new_bitmap_capacity - old_bitmap_capacity)); + } + + return ArrayBuilder::Resize(capacity); +} + +Status BooleanBuilder::FinishInternal(std::shared_ptr* out) { + int64_t bit_offset = length_ % 8; + if (bit_offset > 0) { + // Adjust last byte + data_->mutable_data()[length_ / 8] &= BitUtil::kPrecedingBitmask[bit_offset]; + } + + RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get())); + RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), data_.get())); + + *out = ArrayData::Make(boolean(), length_, {null_bitmap_, data_}, null_count_); + + data_ = null_bitmap_ = nullptr; + capacity_ = length_ = null_count_ = 0; + return Status::OK(); +} + +Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, + const uint8_t* valid_bytes) { + RETURN_NOT_OK(Reserve(length)); + + int64_t i = 0; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [values, &i]() -> bool { return values[i++] != 0; }); + + // this updates length_ + ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); +} + +Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, + const std::vector& is_valid) { + RETURN_NOT_OK(Reserve(length)); + DCHECK_EQ(length, static_cast(is_valid.size())); + + int64_t i = 0; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [values, &i]() -> bool { return values[i++]; }); + + // this updates length_ + ArrayBuilder::UnsafeAppendToBitmap(is_valid); + return Status::OK(); +} + +Status BooleanBuilder::AppendValues(const std::vector& values, + const std::vector& is_valid) { + return AppendValues(values.data(), static_cast(values.size()), is_valid); +} + +Status BooleanBuilder::AppendValues(const std::vector& values) { + return AppendValues(values.data(), static_cast(values.size())); +} + +Status BooleanBuilder::AppendValues(const 
std::vector& values, + const std::vector& is_valid) { + const int64_t length = static_cast(values.size()); + RETURN_NOT_OK(Reserve(length)); + DCHECK_EQ(length, static_cast(is_valid.size())); + + int64_t i = 0; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [&values, &i]() -> bool { return values[i++]; }); + + // this updates length_ + ArrayBuilder::UnsafeAppendToBitmap(is_valid); + return Status::OK(); +} + +Status BooleanBuilder::AppendValues(const std::vector& values) { + const int64_t length = static_cast(values.size()); + RETURN_NOT_OK(Reserve(length)); + + int64_t i = 0; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [&values, &i]() -> bool { return values[i++]; }); + + // this updates length_ + ArrayBuilder::UnsafeSetNotNull(length); + return Status::OK(); +} + +} // namespace arrow diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h new file mode 100644 index 0000000000000..13f6c229b2a23 --- /dev/null +++ b/cpp/src/arrow/array/builder_primitive.h @@ -0,0 +1,401 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
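The boolean builder writes values bit by bit through GenerateBitsUnrolled rather than a memcpy, and FinishInternal zeroes the trailing bits of the last byte. A usage sketch mirroring the primitive case, assuming the BooleanBuilder API declared later in this header (the helper name BuildBooleans is illustrative):

#include "arrow/builder.h"
#include "arrow/status.h"

// Sketch: append [true, null, false] one value at a time.
arrow::Status BuildBooleans(std::shared_ptr<arrow::Array>* out) {
  arrow::BooleanBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Append(true));
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  ARROW_RETURN_NOT_OK(builder.Append(false));
  return builder.Finish(out);  // trailing bits in the last byte are zeroed
}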
+ +#pragma once + +#include +#include +#include + +#include "arrow/array/builder_base.h" +#include "arrow/type.h" + +namespace arrow { + +class ARROW_EXPORT NullBuilder : public ArrayBuilder { + public: + explicit NullBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) + : ArrayBuilder(null(), pool) {} + + Status AppendNull() { + ++null_count_; + ++length_; + return Status::OK(); + } + + Status Append(std::nullptr_t value) { return AppendNull(); } + + Status FinishInternal(std::shared_ptr* out) override; +}; + +template +class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { + public: + using value_type = typename Type::c_type; + + explicit PrimitiveBuilder(const std::shared_ptr& type, MemoryPool* pool) + : ArrayBuilder(type, pool), data_(NULLPTR), raw_data_(NULLPTR) {} + + using ArrayBuilder::Advance; + + /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory + /// The memory at the corresponding data slot is set to 0 to prevent uninitialized + /// memory access + Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { + ARROW_RETURN_NOT_OK(Reserve(length)); + memset(raw_data_ + length_, 0, + static_cast(TypeTraits::bytes_required(length))); + UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); + } + + /// \brief Append a single null element + Status AppendNull() { + ARROW_RETURN_NOT_OK(Reserve(1)); + memset(raw_data_ + length_, 0, sizeof(value_type)); + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + value_type GetValue(int64_t index) const { + return reinterpret_cast(data_->data())[index]; + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const value_type* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const value_type* values, int64_t length, + const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of values + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). 
Equal in length to values + /// \return Status + Status AppendValues(const std::vector& values, + const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of values + /// \return Status + Status AppendValues(const std::vector& values); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// \return Status + + template + Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + + std::copy(values_begin, values_end, raw_data_ + length_); + + // this updates the length_ + UnsafeSetNotNull(length); + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot, with a specified nullmap + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// \param[in] valid_begin InputIterator with elements indication valid(1) + /// or null(0) values. + /// \return Status + template + typename std::enable_if::value, Status>::type AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + static_assert(!internal::is_null_pointer::value, + "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " + "version instead"); + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + + std::copy(values_begin, values_end, raw_data_ + length_); + + // this updates the length_ + UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); + return Status::OK(); + } + + // Same as above, with a pointer type ValidIter + template + typename std::enable_if::value, Status>::type AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + + std::copy(values_begin, values_end, raw_data_ + length_); + + // this updates the length_ + if (valid_begin == NULLPTR) { + UnsafeSetNotNull(length); + } else { + UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); + } + + return Status::OK(); + } + + Status FinishInternal(std::shared_ptr* out) override; + void Reset() override; + + Status Resize(int64_t capacity) override; + + protected: + std::shared_ptr data_; + value_type* raw_data_; +}; + +/// Base class for all Builders that emit an Array of a scalar numerical type. +template +class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder { + public: + using typename PrimitiveBuilder::value_type; + using PrimitiveBuilder::PrimitiveBuilder; + + template + explicit NumericBuilder( + typename std::enable_if::is_parameter_free, MemoryPool*>::type pool + ARROW_MEMORY_POOL_DEFAULT) + : PrimitiveBuilder(TypeTraits::type_singleton(), pool) {} + + using ArrayBuilder::UnsafeAppendNull; + using PrimitiveBuilder::AppendValues; + using PrimitiveBuilder::Resize; + using PrimitiveBuilder::Reserve; + + /// Append a single scalar and increase the size if necessary. + Status Append(const value_type val) { + ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1)); + UnsafeAppend(val); + return Status::OK(); + } + + /// Append a single scalar under the assumption that the underlying Buffer is + /// large enough. 
+ /// + /// This method does not capacity-check; make sure to call Reserve + /// beforehand. + void UnsafeAppend(const value_type val) { + BitUtil::SetBit(null_bitmap_data_, length_); + raw_data_[length_++] = val; + } + + protected: + using PrimitiveBuilder::length_; + using PrimitiveBuilder::null_bitmap_data_; + using PrimitiveBuilder::raw_data_; +}; + +// Builders + +using UInt8Builder = NumericBuilder; +using UInt16Builder = NumericBuilder; +using UInt32Builder = NumericBuilder; +using UInt64Builder = NumericBuilder; + +using Int8Builder = NumericBuilder; +using Int16Builder = NumericBuilder; +using Int32Builder = NumericBuilder; +using Int64Builder = NumericBuilder; +using TimestampBuilder = NumericBuilder; +using Time32Builder = NumericBuilder; +using Time64Builder = NumericBuilder; +using Date32Builder = NumericBuilder; +using Date64Builder = NumericBuilder; + +using HalfFloatBuilder = NumericBuilder; +using FloatBuilder = NumericBuilder; +using DoubleBuilder = NumericBuilder; + +class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { + public: + using value_type = bool; + explicit BooleanBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + explicit BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool); + + using ArrayBuilder::Advance; + using ArrayBuilder::UnsafeAppendNull; + + /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory + Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(valid_bytes, length); + + return Status::OK(); + } + + Status AppendNull() { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(false); + + return Status::OK(); + } + + /// Scalar append + Status Append(const bool val) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(val); + return Status::OK(); + } + + Status Append(const uint8_t val) { return Append(val != 0); } + + /// Scalar append, without checking for capacity + void UnsafeAppend(const bool val) { + BitUtil::SetBit(null_bitmap_data_, length_); + if (val) { + BitUtil::SetBit(raw_data_, length_); + } else { + BitUtil::ClearBit(raw_data_, length_); + } + ++length_; + } + + void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous array of bytes (non-zero is 1) + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const uint8_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const uint8_t* values, int64_t length, + const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of bytes + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). 
Equal in length to values + /// \return Status + Status AppendValues(const std::vector& values, + const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of bytes + /// \return Status + Status AppendValues(const std::vector& values); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values an std::vector indicating true (1) or false + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const std::vector& values, const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values an std::vector indicating true (1) or false + /// \return Status + Status AppendValues(const std::vector& values); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// or null(0) values + /// \return Status + template + Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + auto iter = values_begin; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [&iter]() -> bool { return *(iter++); }); + + // this updates length_ + UnsafeSetNotNull(length); + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot, with a specified nullmap + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// \param[in] valid_begin InputIterator with elements indication valid(1) + /// or null(0) values + /// \return Status + template + typename std::enable_if::value, Status>::type AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + static_assert(!internal::is_null_pointer::value, + "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " + "version instead"); + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + + auto iter = values_begin; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [&iter]() -> bool { return *(iter++); }); + + // this updates length_ + ArrayBuilder::UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); + return Status::OK(); + } + + // Same as above, for a pointer type ValidIter + template + typename std::enable_if::value, Status>::type AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + + auto iter = values_begin; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [&iter]() -> bool { return *(iter++); }); + + // this updates the length_ + if (valid_begin == NULLPTR) { + UnsafeSetNotNull(length); + } else { + UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); + } + + return Status::OK(); + } + + Status FinishInternal(std::shared_ptr* out) override; + void Reset() override; + Status Resize(int64_t capacity) override; + + protected: + std::shared_ptr data_; + uint8_t* raw_data_; +}; + +} // namespace arrow diff --git a/cpp/src/arrow/builder-benchmark.cc b/cpp/src/arrow/builder-benchmark.cc index f96728dcd4fdf..fae9c89a14fdf 100644 --- 
a/cpp/src/arrow/builder-benchmark.cc +++ b/cpp/src/arrow/builder-benchmark.cc @@ -163,10 +163,11 @@ static void BM_BuildBooleanArrayNoNulls( } static void BM_BuildBinaryArray(benchmark::State& state) { // NOLINT non-const reference - const int64_t iterations = 1 << 20; - + // About 160MB + const int64_t iterations = 1 << 24; std::string value = "1234567890"; - while (state.KeepRunning()) { + + for (auto _ : state) { BinaryBuilder builder; for (int64_t i = 0; i < iterations; i++) { ABORT_NOT_OK(builder.Append(value)); @@ -177,6 +178,26 @@ static void BM_BuildBinaryArray(benchmark::State& state) { // NOLINT non-const state.SetBytesProcessed(state.iterations() * iterations * value.size()); } +static void BM_BuildChunkedBinaryArray( + benchmark::State& state) { // NOLINT non-const reference + // About 160MB + const int64_t iterations = 1 << 24; + std::string value = "1234567890"; + + for (auto _ : state) { + // 1MB chunks + const int32_t chunksize = 1 << 20; + internal::ChunkedBinaryBuilder builder(chunksize); + for (int64_t i = 0; i < iterations; i++) { + ABORT_NOT_OK(builder.Append(reinterpret_cast(value.data()), + static_cast(value.size()))); + } + ArrayVector out; + ABORT_NOT_OK(builder.Finish(&out)); + } + state.SetBytesProcessed(state.iterations() * iterations * value.size()); +} + static void BM_BuildFixedSizeBinaryArray( benchmark::State& state) { // NOLINT non-const reference const int64_t iterations = 1 << 20; @@ -371,7 +392,8 @@ BENCHMARK(BM_BuildAdaptiveUIntNoNullsScalarAppend) ->Repetitions(kRepetitions) ->Unit(benchmark::kMicrosecond); -BENCHMARK(BM_BuildBinaryArray)->Repetitions(kRepetitions)->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_BuildBinaryArray)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_BuildChunkedBinaryArray)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(BM_BuildFixedSizeBinaryArray) ->Repetitions(kRepetitions) ->Unit(benchmark::kMicrosecond); diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index aef4df05108b7..ff2b453bb4494 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -15,513 +15,20 @@ // specific language governing permissions and limitations // under the License. 
-#include -#include -#include -#include +#include "arrow/builder.h" + #include +#include #include #include -#include "arrow/array.h" -#include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/status.h" #include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/int-util.h" -#include "arrow/util/logging.h" namespace arrow { -using internal::checked_cast; - -Status ArrayBuilder::TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer) { - if (buffer) { - if (bytes_filled < buffer->size()) { - // Trim buffer - RETURN_NOT_OK(buffer->Resize(bytes_filled)); - } - // zero the padding - buffer->ZeroPadding(); - } else { - // Null buffers are allowed in place of 0-byte buffers - DCHECK_EQ(bytes_filled, 0); - } - return Status::OK(); -} - -Status ArrayBuilder::AppendToBitmap(bool is_valid) { - if (length_ == capacity_) { - // If the capacity was not already a multiple of 2, do so here - // TODO(emkornfield) doubling isn't great default allocation practice - // see https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md - // fo discussion - RETURN_NOT_OK(Resize(BitUtil::NextPower2(capacity_ + 1))); - } - UnsafeAppendToBitmap(is_valid); - return Status::OK(); -} - -Status ArrayBuilder::AppendToBitmap(const uint8_t* valid_bytes, int64_t length) { - RETURN_NOT_OK(Reserve(length)); - - UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); -} - -Status ArrayBuilder::Resize(int64_t capacity) { - // Target size of validity (null) bitmap data - const int64_t new_bitmap_size = BitUtil::BytesForBits(capacity); - RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); - - if (capacity_ == 0) { - RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_bitmap_size, &null_bitmap_)); - null_bitmap_data_ = null_bitmap_->mutable_data(); - - // Padding is zeroed by AllocateResizableBuffer - memset(null_bitmap_data_, 0, static_cast(new_bitmap_size)); - } else { - const int64_t old_bitmap_capacity = null_bitmap_->capacity(); - RETURN_NOT_OK(null_bitmap_->Resize(new_bitmap_size)); - - const int64_t new_bitmap_capacity = null_bitmap_->capacity(); - null_bitmap_data_ = null_bitmap_->mutable_data(); - - // Zero the region between the original capacity and the new capacity, - // including padding, which has not been zeroed, unlike - // AllocateResizableBuffer - if (old_bitmap_capacity < new_bitmap_capacity) { - memset(null_bitmap_data_ + old_bitmap_capacity, 0, - static_cast(new_bitmap_capacity - old_bitmap_capacity)); - } - } - capacity_ = capacity; - return Status::OK(); -} - -Status ArrayBuilder::Advance(int64_t elements) { - if (length_ + elements > capacity_) { - return Status::Invalid("Builder must be expanded"); - } - length_ += elements; - return Status::OK(); -} - -Status ArrayBuilder::Finish(std::shared_ptr* out) { - std::shared_ptr internal_data; - RETURN_NOT_OK(FinishInternal(&internal_data)); - *out = MakeArray(internal_data); - return Status::OK(); -} - -Status ArrayBuilder::Reserve(int64_t additional_elements) { - if (length_ + additional_elements > capacity_) { - // TODO(emkornfield) power of 2 growth is potentially suboptimal - int64_t new_size = BitUtil::NextPower2(length_ + additional_elements); - return Resize(new_size); - } - return Status::OK(); -} - -void ArrayBuilder::Reset() { - capacity_ = length_ = null_count_ = 0; - null_bitmap_ = nullptr; -} - -Status ArrayBuilder::SetNotNull(int64_t length) { - RETURN_NOT_OK(Reserve(length)); - UnsafeSetNotNull(length); - return Status::OK(); 
-
-void ArrayBuilder::UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) {
-  if (valid_bytes == nullptr) {
-    UnsafeSetNotNull(length);
-    return;
-  }
-  UnsafeAppendToBitmap(valid_bytes, valid_bytes + length);
-}
-
-void ArrayBuilder::UnsafeAppendToBitmap(const std::vector<bool>& is_valid) {
-  UnsafeAppendToBitmap(is_valid.begin(), is_valid.end());
-}
-
-void ArrayBuilder::UnsafeSetNotNull(int64_t length) {
-  const int64_t new_length = length + length_;
-
-  // Fill up the bytes until we have a byte alignment
-  int64_t pad_to_byte = std::min(8 - (length_ % 8), length);
-
-  if (pad_to_byte == 8) {
-    pad_to_byte = 0;
-  }
-  for (int64_t i = length_; i < length_ + pad_to_byte; ++i) {
-    BitUtil::SetBit(null_bitmap_data_, i);
-  }
-
-  // Fast bitsetting
-  int64_t fast_length = (length - pad_to_byte) / 8;
-  memset(null_bitmap_data_ + ((length_ + pad_to_byte) / 8), 0xFF,
-         static_cast<size_t>(fast_length));
-
-  // Trailing bits
-  for (int64_t i = length_ + pad_to_byte + (fast_length * 8); i < new_length; ++i) {
-    BitUtil::SetBit(null_bitmap_data_, i);
-  }
-
-  length_ = new_length;
-}
-
-// ----------------------------------------------------------------------
-// Null builder
-
-Status NullBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  *out = ArrayData::Make(null(), length_, {nullptr}, length_);
-  length_ = null_count_ = 0;
-  return Status::OK();
-}
-
-// ----------------------------------------------------------------------
-
-template <typename T>
-void PrimitiveBuilder<T>::Reset() {
-  data_.reset();
-  raw_data_ = nullptr;
-}
-
-template <typename T>
-Status PrimitiveBuilder<T>::Resize(int64_t capacity) {
-  RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
-  capacity = std::max(capacity, kMinBuilderCapacity);
-
-  int64_t nbytes = TypeTraits<T>::bytes_required(capacity);
-  if (capacity_ == 0) {
-    RETURN_NOT_OK(AllocateResizableBuffer(pool_, nbytes, &data_));
-  } else {
-    RETURN_NOT_OK(data_->Resize(nbytes));
-  }
-
-  raw_data_ = reinterpret_cast<value_type*>(data_->mutable_data());
-  return ArrayBuilder::Resize(capacity);
-}
-
-template <typename T>
-Status PrimitiveBuilder<T>::AppendValues(const value_type* values, int64_t length,
-                                         const uint8_t* valid_bytes) {
-  RETURN_NOT_OK(Reserve(length));
-
-  if (length > 0) {
-    std::memcpy(raw_data_ + length_, values,
-                static_cast<std::size_t>(TypeTraits<T>::bytes_required(length)));
-  }
-
-  // length_ is updated by these
-  ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length);
-  return Status::OK();
-}
-
-template <typename T>
-Status PrimitiveBuilder<T>::AppendValues(const value_type* values, int64_t length,
-                                         const std::vector<bool>& is_valid) {
-  RETURN_NOT_OK(Reserve(length));
-  DCHECK_EQ(length, static_cast<int64_t>(is_valid.size()));
-
-  if (length > 0) {
-    std::memcpy(raw_data_ + length_, values,
-                static_cast<std::size_t>(TypeTraits<T>::bytes_required(length)));
-  }
-
-  // length_ is updated by these
-  ArrayBuilder::UnsafeAppendToBitmap(is_valid);
-  return Status::OK();
-}
-
-template <typename T>
-Status PrimitiveBuilder<T>::AppendValues(const std::vector<value_type>& values,
-                                         const std::vector<bool>& is_valid) {
-  return AppendValues(values.data(), static_cast<int64_t>(values.size()), is_valid);
-}
-
-template <typename T>
-Status PrimitiveBuilder<T>::AppendValues(const std::vector<value_type>& values) {
-  return AppendValues(values.data(), static_cast<int64_t>(values.size()));
-}
-
-template <typename T>
-Status PrimitiveBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get()));
-  RETURN_NOT_OK(TrimBuffer(TypeTraits<T>::bytes_required(length_), data_.get()));
-
-  *out = ArrayData::Make(type_, length_, {null_bitmap_, data_}, null_count_);
-
-  data_ = null_bitmap_ = nullptr;
-  capacity_ = length_ = null_count_ = 0;
-
-  return Status::OK();
-}
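The AppendValues overloads deleted above memcpy the raw values buffer and then splice the validity bytes into the bitmap in one pass. A short sketch of that bulk path, with hypothetical data and assuming this era's API:

    #include "arrow/builder.h"
    #include "arrow/status.h"

    // Append three doubles in one shot; a zero valid byte marks a null slot.
    arrow::Status BulkAppend(arrow::DoubleBuilder* builder) {
      const double values[] = {1.5, 0.0, 3.25};
      const uint8_t valid_bytes[] = {1, 0, 1};  // middle slot becomes null
      return builder->AppendValues(values, 3, valid_bytes);
    }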
-
-template class PrimitiveBuilder<UInt8Type>;
-template class PrimitiveBuilder<UInt16Type>;
-template class PrimitiveBuilder<UInt32Type>;
-template class PrimitiveBuilder<UInt64Type>;
-template class PrimitiveBuilder<Int8Type>;
-template class PrimitiveBuilder<Int16Type>;
-template class PrimitiveBuilder<Int32Type>;
-template class PrimitiveBuilder<Int64Type>;
-template class PrimitiveBuilder<Date32Type>;
-template class PrimitiveBuilder<Date64Type>;
-template class PrimitiveBuilder<Time32Type>;
-template class PrimitiveBuilder<Time64Type>;
-template class PrimitiveBuilder<TimestampType>;
-template class PrimitiveBuilder<HalfFloatType>;
-template class PrimitiveBuilder<FloatType>;
-template class PrimitiveBuilder<DoubleType>;
-
-BooleanBuilder::BooleanBuilder(MemoryPool* pool)
-    : ArrayBuilder(boolean(), pool), data_(nullptr), raw_data_(nullptr) {}
-
-BooleanBuilder::BooleanBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
-    : BooleanBuilder(pool) {
-  DCHECK_EQ(Type::BOOL, type->id());
-}
-
-void BooleanBuilder::Reset() {
-  ArrayBuilder::Reset();
-  data_.reset();
-  raw_data_ = nullptr;
-}
-
-Status BooleanBuilder::Resize(int64_t capacity) {
-  RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
-  capacity = std::max(capacity, kMinBuilderCapacity);
-
-  const int64_t new_bitmap_size = BitUtil::BytesForBits(capacity);
-  if (capacity_ == 0) {
-    RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_bitmap_size, &data_));
-    raw_data_ = reinterpret_cast<uint8_t*>(data_->mutable_data());
-
-    // We zero the memory for booleans to keep things simple; for some reason if
-    // we do not, even though we may write every bit (through in-place | or &),
-    // valgrind will still show a warning. If we do not zero the bytes here, we
-    // will have to be careful to zero them in AppendNull and AppendNulls. Also,
-    // zeroing the bits results in deterministic bits when each byte may have a
-    // mix of nulls and not nulls.
- // - // We only zero up to new_bitmap_size because the padding was zeroed by - // AllocateResizableBuffer - memset(raw_data_, 0, static_cast(new_bitmap_size)); - } else { - const int64_t old_bitmap_capacity = data_->capacity(); - RETURN_NOT_OK(data_->Resize(new_bitmap_size)); - const int64_t new_bitmap_capacity = data_->capacity(); - raw_data_ = reinterpret_cast(data_->mutable_data()); - - // See comment above about why we zero memory for booleans - memset(raw_data_ + old_bitmap_capacity, 0, - static_cast(new_bitmap_capacity - old_bitmap_capacity)); - } - - return ArrayBuilder::Resize(capacity); -} - -Status BooleanBuilder::FinishInternal(std::shared_ptr* out) { - int64_t bit_offset = length_ % 8; - if (bit_offset > 0) { - // Adjust last byte - data_->mutable_data()[length_ / 8] &= BitUtil::kPrecedingBitmask[bit_offset]; - } - - RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get())); - RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), data_.get())); - - *out = ArrayData::Make(boolean(), length_, {null_bitmap_, data_}, null_count_); - - data_ = null_bitmap_ = nullptr; - capacity_ = length_ = null_count_ = 0; - return Status::OK(); -} - -Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, - const uint8_t* valid_bytes) { - RETURN_NOT_OK(Reserve(length)); - - int64_t i = 0; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [values, &i]() -> bool { return values[i++] != 0; }); - - // this updates length_ - ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); -} - -Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, - const std::vector& is_valid) { - RETURN_NOT_OK(Reserve(length)); - DCHECK_EQ(length, static_cast(is_valid.size())); - - int64_t i = 0; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [values, &i]() -> bool { return values[i++]; }); - - // this updates length_ - ArrayBuilder::UnsafeAppendToBitmap(is_valid); - return Status::OK(); -} - -Status BooleanBuilder::AppendValues(const std::vector& values, - const std::vector& is_valid) { - return AppendValues(values.data(), static_cast(values.size()), is_valid); -} - -Status BooleanBuilder::AppendValues(const std::vector& values) { - return AppendValues(values.data(), static_cast(values.size())); -} - -Status BooleanBuilder::AppendValues(const std::vector& values, - const std::vector& is_valid) { - const int64_t length = static_cast(values.size()); - RETURN_NOT_OK(Reserve(length)); - DCHECK_EQ(length, static_cast(is_valid.size())); - - int64_t i = 0; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [&values, &i]() -> bool { return values[i++]; }); - - // this updates length_ - ArrayBuilder::UnsafeAppendToBitmap(is_valid); - return Status::OK(); -} - -Status BooleanBuilder::AppendValues(const std::vector& values) { - const int64_t length = static_cast(values.size()); - RETURN_NOT_OK(Reserve(length)); - - int64_t i = 0; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [&values, &i]() -> bool { return values[i++]; }); - - // this updates length_ - ArrayBuilder::UnsafeSetNotNull(length); - return Status::OK(); -} - -// ---------------------------------------------------------------------- -// ListBuilder - -ListBuilder::ListBuilder(MemoryPool* pool, - std::shared_ptr const& value_builder, - const std::shared_ptr& type) - : ArrayBuilder(type ? 
type
-                        : std::static_pointer_cast<DataType>(
-                              std::make_shared<ListType>(value_builder->type())),
-                   pool),
-      offsets_builder_(pool),
-      value_builder_(value_builder) {}
-
-Status ListBuilder::AppendValues(const int32_t* offsets, int64_t length,
-                                 const uint8_t* valid_bytes) {
-  RETURN_NOT_OK(Reserve(length));
-  UnsafeAppendToBitmap(valid_bytes, length);
-  offsets_builder_.UnsafeAppend(offsets, length);
-  return Status::OK();
-}
-
-Status ListBuilder::AppendNextOffset() {
-  int64_t num_values = value_builder_->length();
-  if (ARROW_PREDICT_FALSE(num_values > kListMaximumElements)) {
-    std::stringstream ss;
-    ss << "ListArray cannot contain more than INT32_MAX - 1 child elements,"
-       << " have " << num_values;
-    return Status::CapacityError(ss.str());
-  }
-  return offsets_builder_.Append(static_cast<int32_t>(num_values));
-}
-
-Status ListBuilder::Append(bool is_valid) {
-  RETURN_NOT_OK(Reserve(1));
-  UnsafeAppendToBitmap(is_valid);
-  return AppendNextOffset();
-}
-
-Status ListBuilder::Resize(int64_t capacity) {
-  DCHECK_LE(capacity, kListMaximumElements);
-  RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
-
-  // one more than requested for offsets
-  RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
-  return ArrayBuilder::Resize(capacity);
-}
-
-Status ListBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  RETURN_NOT_OK(AppendNextOffset());
-
-  // Offset padding zeroed by BufferBuilder
-  std::shared_ptr<Buffer> offsets;
-  RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
-
-  std::shared_ptr<ArrayData> items;
-  if (values_) {
-    items = values_->data();
-  } else {
-    if (value_builder_->length() == 0) {
-      // Try to make sure we get a non-null values buffer (ARROW-2744)
-      RETURN_NOT_OK(value_builder_->Resize(0));
-    }
-    RETURN_NOT_OK(value_builder_->FinishInternal(&items));
-  }
-
-  *out = ArrayData::Make(type_, length_, {null_bitmap_, offsets}, null_count_);
-  (*out)->child_data.emplace_back(std::move(items));
-  Reset();
-  return Status::OK();
-}
-
-void ListBuilder::Reset() {
-  ArrayBuilder::Reset();
-  values_.reset();
-  offsets_builder_.Reset();
-  value_builder_->Reset();
-}
-
-ArrayBuilder* ListBuilder::value_builder() const {
-  DCHECK(!values_) << "Using value builder is pointless when values_ is set";
-  return value_builder_.get();
-}
-
-// ----------------------------------------------------------------------
-// Struct
-
-StructBuilder::StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
-                             std::vector<std::shared_ptr<ArrayBuilder>>&& field_builders)
-    : ArrayBuilder(type, pool), field_builders_(std::move(field_builders)) {}
-
-void StructBuilder::Reset() {
-  ArrayBuilder::Reset();
-  for (const auto& field_builder : field_builders_) {
-    field_builder->Reset();
-  }
-}
-
-Status StructBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get()));
-  *out = ArrayData::Make(type_, length_, {null_bitmap_}, null_count_);
-
-  (*out)->child_data.resize(field_builders_.size());
-  for (size_t i = 0; i < field_builders_.size(); ++i) {
-    if (length_ == 0) {
-      // Try to make sure the child buffers are initialized
-      RETURN_NOT_OK(field_builders_[i]->Resize(0));
-    }
-    RETURN_NOT_OK(field_builders_[i]->FinishInternal(&(*out)->child_data[i]));
-  }
-
-  null_bitmap_ = nullptr;
-  capacity_ = length_ = null_count_ = 0;
-  return Status::OK();
-}
+class MemoryPool;

 // ----------------------------------------------------------------------
 // Helper functions
@@ -566,7 +73,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
     case Type::LIST: {
       std::unique_ptr<ArrayBuilder> value_builder;
       std::shared_ptr<DataType> value_type =
-          checked_cast<const ListType&>(*type).value_type();
+          internal::checked_cast<const ListType&>(*type).value_type();
       RETURN_NOT_OK(MakeBuilder(pool, value_type, &value_builder));
       out->reset(new ListBuilder(pool, std::move(value_builder)));
       return Status::OK();
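MakeBuilder is the one routine builder.cc keeps visible after the split into the arrow/array/builder_*.h headers below; for nested types it recurses on the child type, as the LIST case shows. A usage sketch (list(int32()) is an arbitrary example, not from the patch):

    #include <memory>

    #include "arrow/builder.h"
    #include "arrow/memory_pool.h"
    #include "arrow/type.h"

    // Resolves list<int32> to a ListBuilder wrapping an Int32Builder.
    arrow::Status MakeListOfInt32Builder(std::unique_ptr<arrow::ArrayBuilder>* out) {
      return arrow::MakeBuilder(arrow::default_memory_pool(),
                                arrow::list(arrow::int32()), out);
    }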
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index d0016674215fc..a7ab22c1beedb 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -15,1184 +15,27 @@
 // specific language governing permissions and limitations
 // under the License.

-#ifndef ARROW_BUILDER_H
-#define ARROW_BUILDER_H
+#pragma once

-#include   // IWYU pragma: keep
-#include
-#include
-#include
-#include
-#include
-#include
 #include
-#include
-#include
-#include
-
-#include "arrow/buffer.h"
-#include "arrow/memory_pool.h"
+#include "arrow/array/builder_adaptive.h"   // IWYU pragma: export
+#include "arrow/array/builder_base.h"       // IWYU pragma: export
+#include "arrow/array/builder_binary.h"     // IWYU pragma: export
+#include "arrow/array/builder_decimal.h"    // IWYU pragma: export
+#include "arrow/array/builder_dict.h"       // IWYU pragma: export
+#include "arrow/array/builder_nested.h"     // IWYU pragma: export
+#include "arrow/array/builder_primitive.h"  // IWYU pragma: export
 #include "arrow/status.h"
-#include "arrow/type.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/bit-util.h"
-#include "arrow/util/macros.h"
-#include "arrow/util/string_view.h"
-#include "arrow/util/type_traits.h"
 #include "arrow/util/visibility.h"

 namespace arrow {

-class Array;
-struct ArrayData;
-class Decimal128;
-
-constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max() - 1;
-constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1;
-
-constexpr int64_t kMinBuilderCapacity = 1 << 5;
-
-/// Base class for all data array builders.
-///
-/// This class provides facilities for incrementally building the null bitmap
-/// (see Append methods) and as a side effect the current number of slots and
-/// the null count.
-///
-/// \note Users are expected to use builders as one of the concrete types below.
-/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use.
-class ARROW_EXPORT ArrayBuilder {
- public:
-  explicit ArrayBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
-      : type_(type),
-        pool_(pool),
-        null_bitmap_(NULLPTR),
-        null_count_(0),
-        null_bitmap_data_(NULLPTR),
-        length_(0),
-        capacity_(0) {}
-
-  virtual ~ArrayBuilder() = default;
-
-  /// For nested types. Since the objects are owned by this class instance, we
-  /// skip shared pointers and just return a raw pointer
-  ArrayBuilder* child(int i) { return children_[i].get(); }
-
-  int num_children() const { return static_cast<int>(children_.size()); }
-
-  int64_t length() const { return length_; }
-  int64_t null_count() const { return null_count_; }
-  int64_t capacity() const { return capacity_; }
-
-  /// \brief Ensure that enough memory has been allocated to fit the indicated
-  /// number of total elements in the builder, including any that have already
-  /// been appended. Does not account for reallocations that may be due to
-  /// variable size data, like binary values. To make space for incremental
-  /// appends, use Reserve instead.
-  ///
-  /// \param[in] capacity the minimum number of total array values to
-  ///            accommodate. Must be greater than the current capacity.
-  /// \return Status
-  virtual Status Resize(int64_t capacity);
-
-  /// \brief Ensure that there is enough space allocated to add the indicated
-  /// number of elements without any further calls to Resize. The memory
-  /// allocated is rounded up to the next highest power of 2 similar to memory
-  /// allocations in STL containers like std::vector
-  /// \param[in] additional_capacity the number of additional array values
-  /// \return Status
-  Status Reserve(int64_t additional_capacity);
-
-  /// Reset the builder.
-  virtual void Reset();
-
-  /// For cases where raw data was memcpy'd into the internal buffers, allows us
-  /// to advance the length of the builder. It is your responsibility to use
-  /// this function responsibly.
-  Status Advance(int64_t elements);
-
-  /// \brief Return result of builder as an internal generic ArrayData
-  /// object. Resets builder except for dictionary builder
-  ///
-  /// \param[out] out the finalized ArrayData object
-  /// \return Status
-  virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0;
-
-  /// \brief Return result of builder as an Array object.
-  ///
-  /// The builder is reset except for DictionaryBuilder.
-  ///
-  /// \param[out] out the finalized Array object
-  /// \return Status
-  Status Finish(std::shared_ptr<Array>* out);
-
-  std::shared_ptr<DataType> type() const { return type_; }
-
- protected:
-  ArrayBuilder() {}
-
-  /// Append to null bitmap
-  Status AppendToBitmap(bool is_valid);
-
-  /// Vector append. Treat each zero byte as a null. If valid_bytes is null
-  /// assume all of length bits are valid.
-  Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length);
-
-  /// Set the next length bits to not null (i.e. valid).
-  Status SetNotNull(int64_t length);
-
-  // Unsafe operations (don't check capacity/don't resize)
-
-  void UnsafeAppendNull() { UnsafeAppendToBitmap(false); }
-
-  // Append to null bitmap, update the length
-  void UnsafeAppendToBitmap(bool is_valid) {
-    if (is_valid) {
-      BitUtil::SetBit(null_bitmap_data_, length_);
-    } else {
-      ++null_count_;
-    }
-    ++length_;
-  }
-
-  template <typename IterType>
-  void UnsafeAppendToBitmap(const IterType& begin, const IterType& end) {
-    int64_t byte_offset = length_ / 8;
-    int64_t bit_offset = length_ % 8;
-    uint8_t bitset = null_bitmap_data_[byte_offset];
-
-    for (auto iter = begin; iter != end; ++iter) {
-      if (bit_offset == 8) {
-        bit_offset = 0;
-        null_bitmap_data_[byte_offset] = bitset;
-        byte_offset++;
-        // TODO: Except for the last byte, this shouldn't be needed
-        bitset = null_bitmap_data_[byte_offset];
-      }
-
-      if (*iter) {
-        bitset |= BitUtil::kBitmask[bit_offset];
-      } else {
-        bitset &= BitUtil::kFlippedBitmask[bit_offset];
-        ++null_count_;
-      }
-
-      bit_offset++;
-    }
-
-    if (bit_offset != 0) {
-      null_bitmap_data_[byte_offset] = bitset;
-    }
-
-    length_ += std::distance(begin, end);
-  }
-
-  // Vector append. Treat each zero byte as a null. If valid_bytes is null
-  // assume all of length bits are valid.
-  void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length);
-
-  void UnsafeAppendToBitmap(const std::vector<bool>& is_valid);
-
-  // Set the next length bits to not null (i.e. valid).
- void UnsafeSetNotNull(int64_t length); - - static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer); - - static Status CheckCapacity(int64_t new_capacity, int64_t old_capacity) { - if (new_capacity < 0) { - return Status::Invalid("Resize capacity must be positive"); - } - if (new_capacity < old_capacity) { - return Status::Invalid("Resize cannot downsize"); - } - return Status::OK(); - } - - std::shared_ptr type_; - MemoryPool* pool_; - - // When null_bitmap are first appended to the builder, the null bitmap is allocated - std::shared_ptr null_bitmap_; - int64_t null_count_; - uint8_t* null_bitmap_data_; - - // Array length, so far. Also, the index of the next element to be added - int64_t length_; - int64_t capacity_; - - // Child value array builders. These are owned by this class - std::vector> children_; - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); -}; - -class ARROW_EXPORT NullBuilder : public ArrayBuilder { - public: - explicit NullBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) - : ArrayBuilder(null(), pool) {} - - Status AppendNull() { - ++null_count_; - ++length_; - return Status::OK(); - } - - Status Append(std::nullptr_t value) { return AppendNull(); } - - Status FinishInternal(std::shared_ptr* out) override; -}; - -template -class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { - public: - using value_type = typename Type::c_type; - - explicit PrimitiveBuilder(const std::shared_ptr& type, MemoryPool* pool) - : ArrayBuilder(type, pool), data_(NULLPTR), raw_data_(NULLPTR) {} - - using ArrayBuilder::Advance; - - /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory - /// The memory at the corresponding data slot is set to 0 to prevent uninitialized - /// memory access - Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { - ARROW_RETURN_NOT_OK(Reserve(length)); - memset(raw_data_ + length_, 0, - static_cast(TypeTraits::bytes_required(length))); - UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); - } - - /// \brief Append a single null element - Status AppendNull() { - ARROW_RETURN_NOT_OK(Reserve(1)); - memset(raw_data_ + length_, 0, sizeof(value_type)); - UnsafeAppendToBitmap(false); - return Status::OK(); - } - - value_type GetValue(int64_t index) const { - return reinterpret_cast(data_->data())[index]; - } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const value_type* values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] is_valid an std::vector indicating valid (1) or null - /// (0). Equal in length to values - /// \return Status - Status AppendValues(const value_type* values, int64_t length, - const std::vector& is_valid); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a std::vector of values - /// \param[in] is_valid an std::vector indicating valid (1) or null - /// (0). 
Equal in length to values - /// \return Status - Status AppendValues(const std::vector& values, - const std::vector& is_valid); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a std::vector of values - /// \return Status - Status AppendValues(const std::vector& values); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values_begin InputIterator to the beginning of the values - /// \param[in] values_end InputIterator pointing to the end of the values - /// \return Status - - template - Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - - std::copy(values_begin, values_end, raw_data_ + length_); - - // this updates the length_ - UnsafeSetNotNull(length); - return Status::OK(); - } - - /// \brief Append a sequence of elements in one shot, with a specified nullmap - /// \param[in] values_begin InputIterator to the beginning of the values - /// \param[in] values_end InputIterator pointing to the end of the values - /// \param[in] valid_begin InputIterator with elements indication valid(1) - /// or null(0) values. - /// \return Status - template - typename std::enable_if::value, Status>::type AppendValues( - ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { - static_assert(!internal::is_null_pointer::value, - "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " - "version instead"); - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - - std::copy(values_begin, values_end, raw_data_ + length_); - - // this updates the length_ - UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); - return Status::OK(); - } - - // Same as above, with a pointer type ValidIter - template - typename std::enable_if::value, Status>::type AppendValues( - ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - - std::copy(values_begin, values_end, raw_data_ + length_); - - // this updates the length_ - if (valid_begin == NULLPTR) { - UnsafeSetNotNull(length); - } else { - UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); - } - - return Status::OK(); - } - - Status FinishInternal(std::shared_ptr* out) override; - void Reset() override; - - Status Resize(int64_t capacity) override; - - protected: - std::shared_ptr data_; - value_type* raw_data_; -}; - -/// Base class for all Builders that emit an Array of a scalar numerical type. -template -class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder { - public: - using typename PrimitiveBuilder::value_type; - using PrimitiveBuilder::PrimitiveBuilder; - - template - explicit NumericBuilder( - typename std::enable_if::is_parameter_free, MemoryPool*>::type pool - ARROW_MEMORY_POOL_DEFAULT) - : PrimitiveBuilder(TypeTraits::type_singleton(), pool) {} - - using ArrayBuilder::UnsafeAppendNull; - using PrimitiveBuilder::AppendValues; - using PrimitiveBuilder::Resize; - using PrimitiveBuilder::Reserve; - - /// Append a single scalar and increase the size if necessary. - Status Append(const value_type val) { - ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1)); - UnsafeAppend(val); - return Status::OK(); - } - - /// Append a single scalar under the assumption that the underlying Buffer is - /// large enough. 
- /// - /// This method does not capacity-check; make sure to call Reserve - /// beforehand. - void UnsafeAppend(const value_type val) { - BitUtil::SetBit(null_bitmap_data_, length_); - raw_data_[length_++] = val; - } - - protected: - using PrimitiveBuilder::length_; - using PrimitiveBuilder::null_bitmap_data_; - using PrimitiveBuilder::raw_data_; -}; - -// Builders - -using UInt8Builder = NumericBuilder; -using UInt16Builder = NumericBuilder; -using UInt32Builder = NumericBuilder; -using UInt64Builder = NumericBuilder; - -using Int8Builder = NumericBuilder; -using Int16Builder = NumericBuilder; -using Int32Builder = NumericBuilder; -using Int64Builder = NumericBuilder; -using TimestampBuilder = NumericBuilder; -using Time32Builder = NumericBuilder; -using Time64Builder = NumericBuilder; -using Date32Builder = NumericBuilder; -using Date64Builder = NumericBuilder; - -using HalfFloatBuilder = NumericBuilder; -using FloatBuilder = NumericBuilder; -using DoubleBuilder = NumericBuilder; - -namespace internal { - -class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { - public: - explicit AdaptiveIntBuilderBase(MemoryPool* pool); - - /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory - Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { - ARROW_RETURN_NOT_OK(CommitPendingData()); - ARROW_RETURN_NOT_OK(Reserve(length)); - memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length); - UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); - } - - Status AppendNull() { - pending_data_[pending_pos_] = 0; - pending_valid_[pending_pos_] = 0; - pending_has_nulls_ = true; - ++pending_pos_; - - if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { - return CommitPendingData(); - } - return Status::OK(); - } - - void Reset() override; - Status Resize(int64_t capacity) override; - - protected: - virtual Status CommitPendingData() = 0; - - std::shared_ptr data_; - uint8_t* raw_data_; - uint8_t int_size_; - - static constexpr int32_t pending_size_ = 1024; - uint8_t pending_valid_[pending_size_]; - uint64_t pending_data_[pending_size_]; - int32_t pending_pos_; - bool pending_has_nulls_; -}; - -} // namespace internal - -class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase { - public: - explicit AdaptiveUIntBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - using ArrayBuilder::Advance; - using internal::AdaptiveIntBuilderBase::Reset; - - /// Scalar append - Status Append(const uint64_t val) { - pending_data_[pending_pos_] = val; - pending_valid_[pending_pos_] = 1; - ++pending_pos_; - - if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { - return CommitPendingData(); - } - return Status::OK(); - } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const uint64_t* values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - Status FinishInternal(std::shared_ptr* out) override; - - protected: - Status CommitPendingData() override; - Status ExpandIntSize(uint8_t new_int_size); - - Status AppendValuesInternal(const uint64_t* values, int64_t length, - const uint8_t* valid_bytes); - - template - typename std::enable_if= sizeof(new_type), Status>::type - ExpandIntSizeInternal(); -#define __LESS(a, b) (a) < (b) - template 
- typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type - ExpandIntSizeInternal(); -#undef __LESS - - template - Status ExpandIntSizeN(); -}; - -class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase { - public: - explicit AdaptiveIntBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - using ArrayBuilder::Advance; - using internal::AdaptiveIntBuilderBase::Reset; - - /// Scalar append - Status Append(const int64_t val) { - auto v = static_cast(val); - - pending_data_[pending_pos_] = v; - pending_valid_[pending_pos_] = 1; - ++pending_pos_; - - if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { - return CommitPendingData(); - } - return Status::OK(); - } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const int64_t* values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - Status FinishInternal(std::shared_ptr* out) override; - - protected: - Status CommitPendingData() override; - Status ExpandIntSize(uint8_t new_int_size); - - Status AppendValuesInternal(const int64_t* values, int64_t length, - const uint8_t* valid_bytes); - - template - typename std::enable_if= sizeof(new_type), Status>::type - ExpandIntSizeInternal(); -#define __LESS(a, b) (a) < (b) - template - typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type - ExpandIntSizeInternal(); -#undef __LESS - - template - Status ExpandIntSizeN(); -}; - -class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { - public: - using value_type = bool; - explicit BooleanBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - explicit BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool); - - using ArrayBuilder::Advance; - using ArrayBuilder::UnsafeAppendNull; - - /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory - Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { - ARROW_RETURN_NOT_OK(Reserve(length)); - UnsafeAppendToBitmap(valid_bytes, length); - - return Status::OK(); - } - - Status AppendNull() { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(false); - - return Status::OK(); - } - - /// Scalar append - Status Append(const bool val) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppend(val); - return Status::OK(); - } - - Status Append(const uint8_t val) { return Append(val != 0); } - - /// Scalar append, without checking for capacity - void UnsafeAppend(const bool val) { - BitUtil::SetBit(null_bitmap_data_, length_); - if (val) { - BitUtil::SetBit(raw_data_, length_); - } else { - BitUtil::ClearBit(raw_data_, length_); - } - ++length_; - } - - void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous array of bytes (non-zero is 1) - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const uint8_t* values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] is_valid an 
std::vector<bool> indicating valid (1) or null
-  /// (0). Equal in length to values
-  /// \return Status
-  Status AppendValues(const uint8_t* values, int64_t length,
-                      const std::vector<bool>& is_valid);
-
-  /// \brief Append a sequence of elements in one shot
-  /// \param[in] values a std::vector of bytes
-  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
-  /// (0). Equal in length to values
-  /// \return Status
-  Status AppendValues(const std::vector<uint8_t>& values,
-                      const std::vector<bool>& is_valid);
-
-  /// \brief Append a sequence of elements in one shot
-  /// \param[in] values a std::vector of bytes
-  /// \return Status
-  Status AppendValues(const std::vector<uint8_t>& values);
-
-  /// \brief Append a sequence of elements in one shot
-  /// \param[in] values an std::vector<bool> indicating true (1) or false
-  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
-  /// (0). Equal in length to values
-  /// \return Status
-  Status AppendValues(const std::vector<bool>& values, const std::vector<bool>& is_valid);
-
-  /// \brief Append a sequence of elements in one shot
-  /// \param[in] values an std::vector<bool> indicating true (1) or false
-  /// \return Status
-  Status AppendValues(const std::vector<bool>& values);
-
-  /// \brief Append a sequence of elements in one shot
-  /// \param[in] values_begin InputIterator to the beginning of the values
-  /// \param[in] values_end InputIterator pointing to the end of the values
-  /// or null(0) values
-  /// \return Status
-  template <typename ValuesIter>
-  Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
-    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
-    ARROW_RETURN_NOT_OK(Reserve(length));
-    auto iter = values_begin;
-    internal::GenerateBitsUnrolled(raw_data_, length_, length,
-                                   [&iter]() -> bool { return *(iter++); });
-
-    // this updates length_
-    UnsafeSetNotNull(length);
-    return Status::OK();
-  }
-
-  /// \brief Append a sequence of elements in one shot, with a specified nullmap
-  /// \param[in] values_begin InputIterator to the beginning of the values
-  /// \param[in] values_end InputIterator pointing to the end of the values
-  /// \param[in] valid_begin InputIterator with elements indicating valid (1)
-  /// or null (0) values
-  /// \return Status
-  template <typename ValuesIter, typename ValidIter>
-  typename std::enable_if<!std::is_pointer<ValidIter>::value, Status>::type AppendValues(
-      ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
-    static_assert(!internal::is_null_pointer<ValidIter>::value,
-                  "Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
-                  "version instead");
-    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
-    ARROW_RETURN_NOT_OK(Reserve(length));
-
-    auto iter = values_begin;
-    internal::GenerateBitsUnrolled(raw_data_, length_, length,
-                                   [&iter]() -> bool { return *(iter++); });
-
-    // this updates length_
-    ArrayBuilder::UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length));
-    return Status::OK();
-  }
-
-  // Same as above, for a pointer type ValidIter
-  template <typename ValuesIter, typename ValidIter>
-  typename std::enable_if<std::is_pointer<ValidIter>::value, Status>::type AppendValues(
-      ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
-    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
-    ARROW_RETURN_NOT_OK(Reserve(length));
-
-    auto iter = values_begin;
-    internal::GenerateBitsUnrolled(raw_data_, length_, length,
-                                   [&iter]() -> bool { return *(iter++); });
-
-    // this updates the length_
-    if (valid_begin == NULLPTR) {
-      UnsafeSetNotNull(length);
-    } else {
-      UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length));
-    }
-
-    return Status::OK();
-  }
-
-  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
-  void Reset() override;
-  Status Resize(int64_t capacity) override;
-
- protected:
-  std::shared_ptr<ResizableBuffer> data_;
-  uint8_t* raw_data_;
-};
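BooleanBuilder packs values and validity into two bitmaps via GenerateBitsUnrolled rather than a memcpy. A sketch of the vector overload declared above, with hypothetical data and assuming this era's API:

    #include <memory>
    #include <vector>

    #include "arrow/array.h"
    #include "arrow/builder.h"

    // Builds [true, null, true]: the zero entry in is_valid nulls the middle slot.
    arrow::Status BuildBools(std::shared_ptr<arrow::Array>* out) {
      arrow::BooleanBuilder builder;
      std::vector<bool> values = {true, false, true};
      std::vector<bool> is_valid = {true, false, true};
      ARROW_RETURN_NOT_OK(builder.AppendValues(values, is_valid));
      return builder.Finish(out);
    }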
-
-// ----------------------------------------------------------------------
-// List builder
-
-/// \class ListBuilder
-/// \brief Builder class for variable-length list array value types
-///
-/// To use this class, you must append values to the child array builder and use
-/// the Append function to delimit each distinct list value (once the values
-/// have been appended to the child array) or use the bulk API to append
-/// a sequence of offsets and null values.
-///
-/// A note on types. Per arrow/type.h all types in the c++ implementation are
-/// logical so even though this class always builds list array, this can
-/// represent multiple different logical types. If no logical type is provided
-/// at construction time, the class defaults to List<T> where T is taken from the
-/// value_builder/values that the object is constructed with.
-class ARROW_EXPORT ListBuilder : public ArrayBuilder {
- public:
-  /// Use this constructor to incrementally build the value array along with offsets and
-  /// null bitmap.
-  ListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder,
-              const std::shared_ptr<DataType>& type = NULLPTR);
-
-  Status Resize(int64_t capacity) override;
-  void Reset() override;
-  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
-
-  /// \brief Vector append
-  ///
-  /// If passed, valid_bytes is of equal length to values, and any zero byte
-  /// will be considered as a null for that slot
-  Status AppendValues(const int32_t* offsets, int64_t length,
-                      const uint8_t* valid_bytes = NULLPTR);
-
-  /// \brief Start a new variable-length list slot
-  ///
-  /// This function should be called before beginning to append elements to the
-  /// value builder
-  Status Append(bool is_valid = true);
-
-  Status AppendNull() { return Append(false); }
-
-  ArrayBuilder* value_builder() const;
-
- protected:
-  TypedBufferBuilder<int32_t> offsets_builder_;
-  std::shared_ptr<ArrayBuilder> value_builder_;
-  std::shared_ptr<Array> values_;
-
-  Status AppendNextOffset();
-};
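The ListBuilder contract in the class comment above — Append() delimits a list slot, child values go through the value builder — is easiest to see in a sketch. This builds [[1, 2], null, [3]]; it assumes this era's API and is not part of the patch:

    #include <memory>

    #include "arrow/array.h"
    #include "arrow/builder.h"
    #include "arrow/memory_pool.h"

    arrow::Status BuildListOfInt32(std::shared_ptr<arrow::Array>* out) {
      auto values = std::make_shared<arrow::Int32Builder>();
      arrow::ListBuilder builder(arrow::default_memory_pool(), values);

      ARROW_RETURN_NOT_OK(builder.Append());      // open slot for [1, 2]
      ARROW_RETURN_NOT_OK(values->Append(1));
      ARROW_RETURN_NOT_OK(values->Append(2));
      ARROW_RETURN_NOT_OK(builder.AppendNull());  // second slot is null
      ARROW_RETURN_NOT_OK(builder.Append());      // open slot for [3]
      ARROW_RETURN_NOT_OK(values->Append(3));
      return builder.Finish(out);
    }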
-
-// ----------------------------------------------------------------------
-// Binary and String
-
-/// \class BinaryBuilder
-/// \brief Builder class for variable-length binary data
-class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
- public:
-  explicit BinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
-
-  BinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
-
-  Status Append(const uint8_t* value, int32_t length);
-
-  Status Append(const char* value, int32_t length) {
-    return Append(reinterpret_cast<const uint8_t*>(value), length);
-  }
-
-  Status Append(util::string_view value) {
-    return Append(value.data(), static_cast<int32_t>(value.size()));
-  }
-
-  Status AppendNull();
-
-  /// \brief Append without checking capacity
-  ///
-  /// Offsets and data should have been presized using Reserve() and
-  /// ReserveData(), respectively.
-  void UnsafeAppend(const uint8_t* value, int32_t length) {
-    UnsafeAppendNextOffset();
-    value_data_builder_.UnsafeAppend(value, length);
-    UnsafeAppendToBitmap(true);
-  }
-
-  void UnsafeAppend(const char* value, int32_t length) {
-    UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
-  }
-
-  void UnsafeAppend(const std::string& value) {
-    UnsafeAppend(value.c_str(), static_cast<int32_t>(value.size()));
-  }
-
-  void UnsafeAppendNull() {
-    const int64_t num_bytes = value_data_builder_.length();
-    offsets_builder_.UnsafeAppend(static_cast<int32_t>(num_bytes));
-    UnsafeAppendToBitmap(false);
-  }
-
-  void Reset() override;
-  Status Resize(int64_t capacity) override;
-
-  /// \brief Ensures there is enough allocated capacity to append the indicated
-  /// number of bytes to the value data buffer without additional allocations
-  Status ReserveData(int64_t elements);
-
-  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
-
-  /// \return size of values buffer so far
-  int64_t value_data_length() const { return value_data_builder_.length(); }
-  /// \return capacity of values buffer
-  int64_t value_data_capacity() const { return value_data_builder_.capacity(); }
-
-  /// Temporary access to a value.
-  ///
-  /// This pointer becomes invalid on the next modifying operation.
-  const uint8_t* GetValue(int64_t i, int32_t* out_length) const;
-
-  /// Temporary access to a value.
-  ///
-  /// This view becomes invalid on the next modifying operation.
-  util::string_view GetView(int64_t i) const;
-
- protected:
-  TypedBufferBuilder<int32_t> offsets_builder_;
-  TypedBufferBuilder<uint8_t> value_data_builder_;
-
-  Status AppendNextOffset();
-
-  void UnsafeAppendNextOffset() {
-    const int64_t num_bytes = value_data_builder_.length();
-    offsets_builder_.UnsafeAppend(static_cast<int32_t>(num_bytes));
-  }
-};
-
-/// \class StringBuilder
-/// \brief Builder class for UTF8 strings
-class ARROW_EXPORT StringBuilder : public BinaryBuilder {
- public:
-  using BinaryBuilder::BinaryBuilder;
-  explicit StringBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
-
-  using BinaryBuilder::Append;
-  using BinaryBuilder::Reset;
-  using BinaryBuilder::UnsafeAppend;
-
-  /// \brief Append a sequence of strings in one shot.
-  ///
-  /// \param[in] values a vector of strings
-  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
-  /// indicates a valid (non-null) value
-  /// \return Status
-  Status AppendValues(const std::vector<std::string>& values,
-                      const uint8_t* valid_bytes = NULLPTR);
-
-  /// \brief Append a sequence of nul-terminated strings in one shot.
-  /// If one of the values is NULL, it is processed as a null
-  /// value even if the corresponding valid_bytes entry is 1.
- /// - /// \param[in] values a contiguous C array of nul-terminated char * - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const char** values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); -}; - -// ---------------------------------------------------------------------- -// FixedSizeBinaryBuilder - -class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { - public: - FixedSizeBinaryBuilder(const std::shared_ptr& type, - MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - Status Append(const uint8_t* value) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(true); - return byte_builder_.Append(value, byte_width_); - } - - Status Append(const char* value) { - return Append(reinterpret_cast(value)); - } - - Status Append(const util::string_view& view) { -#ifndef NDEBUG - CheckValueSize(static_cast(view.size())); -#endif - return Append(reinterpret_cast(view.data())); - } - - Status Append(const std::string& s) { -#ifndef NDEBUG - CheckValueSize(static_cast(s.size())); -#endif - return Append(reinterpret_cast(s.data())); - } - - template - Status Append(const std::array& value) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(true); - return byte_builder_.Append(value); - } - - Status AppendValues(const uint8_t* data, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - Status AppendNull(); - - void Reset() override; - Status Resize(int64_t capacity) override; - Status FinishInternal(std::shared_ptr* out) override; - - /// \return size of values buffer so far - int64_t value_data_length() const { return byte_builder_.length(); } - - int32_t byte_width() const { return byte_width_; } - - /// Temporary access to a value. - /// - /// This pointer becomes invalid on the next modifying operation. - const uint8_t* GetValue(int64_t i) const; - - /// Temporary access to a value. - /// - /// This view becomes invalid on the next modifying operation. - util::string_view GetView(int64_t i) const; - - protected: - int32_t byte_width_; - BufferBuilder byte_builder_; - -#ifndef NDEBUG - void CheckValueSize(int64_t size); -#endif -}; - -class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder { - public: - explicit Decimal128Builder(const std::shared_ptr& type, - MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - using FixedSizeBinaryBuilder::Append; - using FixedSizeBinaryBuilder::AppendValues; - using FixedSizeBinaryBuilder::Reset; - - Status Append(const Decimal128& val); - - Status FinishInternal(std::shared_ptr* out) override; -}; - -using DecimalBuilder = Decimal128Builder; - -// ---------------------------------------------------------------------- -// Struct - -// --------------------------------------------------------------------------------- -// StructArray builder -/// Append, Resize and Reserve methods are acting on StructBuilder. -/// Please make sure all these methods of all child-builders' are consistently -/// called to maintain data-structure consistency. 
-class ARROW_EXPORT StructBuilder : public ArrayBuilder {
- public:
-  StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
-                std::vector<std::shared_ptr<ArrayBuilder>>&& field_builders);
-
-  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
-
-  /// Null bitmap is of equal length to every child field, and any zero byte
-  /// will be considered as a null for that field, but users must use the
-  /// append or advance methods of the child builders independently to
-  /// insert data.
-  Status AppendValues(int64_t length, const uint8_t* valid_bytes) {
-    ARROW_RETURN_NOT_OK(Reserve(length));
-    UnsafeAppendToBitmap(valid_bytes, length);
-    return Status::OK();
-  }
-
-  /// Append an element to the Struct. All child-builders' Append method must
-  /// be called independently to maintain data-structure consistency.
-  Status Append(bool is_valid = true) {
-    ARROW_RETURN_NOT_OK(Reserve(1));
-    UnsafeAppendToBitmap(is_valid);
-    return Status::OK();
-  }
-
-  Status AppendNull() { return Append(false); }
-
-  void Reset() override;
-
-  ArrayBuilder* field_builder(int i) const { return field_builders_[i].get(); }
-
-  int num_fields() const { return static_cast<int>(field_builders_.size()); }
-
- protected:
-  std::vector<std::shared_ptr<ArrayBuilder>> field_builders_;
-};
-
-// ----------------------------------------------------------------------
-// Dictionary builder
-
-namespace internal {
-
-template <typename T>
-struct DictionaryScalar {
-  using type = typename T::c_type;
-};
-
-template <>
-struct DictionaryScalar<BinaryType> {
-  using type = util::string_view;
-};
-
-template <>
-struct DictionaryScalar<StringType> {
-  using type = util::string_view;
-};
-
-template <>
-struct DictionaryScalar<FixedSizeBinaryType> {
-  using type = util::string_view;
-};
-
-}  // namespace internal
-
-/// \brief Array builder for creating an encoded DictionaryArray from dense array
-/// data
-///
-/// Unlike other builders, dictionary builder does not completely reset the state
-/// on Finish calls. The arrays built after the initial Finish call will reuse
-/// the previously created encoding and build a delta dictionary when new terms
-/// occur.
-template <typename T>
-class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
- public:
-  using Scalar = typename internal::DictionaryScalar<T>::type;
-
-  // WARNING: the type given below is the value type, not the DictionaryType.
-  // The DictionaryType is instantiated on the Finish() call.
- DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool); - - template - explicit DictionaryBuilder( - typename std::enable_if::is_parameter_free, MemoryPool*>::type pool) - : DictionaryBuilder(TypeTraits::type_singleton(), pool) {} - - ~DictionaryBuilder() override; - - /// \brief Append a scalar value - Status Append(const Scalar& value); - - /// \brief Append a fixed-width string (only for FixedSizeBinaryType) - template - Status Append(typename std::enable_if::value, - const uint8_t*>::type value) { - return Append(util::string_view(reinterpret_cast(value), byte_width_)); - } - - /// \brief Append a fixed-width string (only for FixedSizeBinaryType) - template - Status Append(typename std::enable_if::value, - const char*>::type value) { - return Append(util::string_view(value, byte_width_)); - } - - /// \brief Append a scalar null value - Status AppendNull(); - - /// \brief Append a whole dense array to the builder - Status AppendArray(const Array& array); - - void Reset() override; - Status Resize(int64_t capacity) override; - Status FinishInternal(std::shared_ptr* out) override; - - /// is the dictionary builder in the delta building mode - bool is_building_delta() { return delta_offset_ > 0; } - - protected: - class MemoTableImpl; - std::unique_ptr memo_table_; - - int32_t delta_offset_; - // Only used for FixedSizeBinaryType - int32_t byte_width_; - - AdaptiveIntBuilder values_builder_; -}; - -template <> -class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { - public: - DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool); - explicit DictionaryBuilder(MemoryPool* pool); - - /// \brief Append a scalar null value - Status AppendNull(); - - /// \brief Append a whole dense array to the builder - Status AppendArray(const Array& array); - - Status Resize(int64_t capacity) override; - Status FinishInternal(std::shared_ptr* out) override; - - protected: - AdaptiveIntBuilder values_builder_; -}; - -class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder { - public: - using DictionaryBuilder::Append; - using DictionaryBuilder::DictionaryBuilder; - - Status Append(const uint8_t* value, int32_t length) { - return Append(reinterpret_cast(value), length); - } - - Status Append(const char* value, int32_t length) { - return Append(util::string_view(value, length)); - } -}; - -/// \brief Dictionary array builder with convenience methods for strings -class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder { - public: - using DictionaryBuilder::Append; - using DictionaryBuilder::DictionaryBuilder; - - Status Append(const uint8_t* value, int32_t length) { - return Append(reinterpret_cast(value), length); - } - - Status Append(const char* value, int32_t length) { - return Append(util::string_view(value, length)); - } -}; - -// ---------------------------------------------------------------------- -// Helper functions +class DataType; +class MemoryPool; ARROW_EXPORT Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, std::unique_ptr* out); } // namespace arrow - -#endif // ARROW_BUILDER_H_ diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc index 52fc58809604c..e34a086d8e2d9 100644 --- a/cpp/src/arrow/compute/compute-test.cc +++ b/cpp/src/arrow/compute/compute-test.cc @@ -70,6 +70,27 @@ shared_ptr _MakeArray(const shared_ptr& type, const vector& return result; } +// ---------------------------------------------------------------------- +// Datum + +template +void CheckImplicitConstructor(enum 
Datum::type expected_kind) { + std::shared_ptr value; + Datum datum = value; + ASSERT_EQ(expected_kind, datum.kind()); +} + +TEST(TestDatum, ImplicitConstructors) { + CheckImplicitConstructor(Datum::ARRAY); + + // Instantiate from array subclass + CheckImplicitConstructor(Datum::ARRAY); + + CheckImplicitConstructor(Datum::CHUNKED_ARRAY); + CheckImplicitConstructor(Datum::RECORD_BATCH); + CheckImplicitConstructor
(Datum::TABLE); +} + // ---------------------------------------------------------------------- // Cast @@ -781,7 +802,7 @@ TEST_F(TestCast, ChunkedArray) { CastOptions options; Datum out; - ASSERT_OK(Cast(&this->ctx_, Datum(carr), out_type, options, &out)); + ASSERT_OK(Cast(&this->ctx_, carr, out_type, options, &out)); ASSERT_EQ(Datum::CHUNKED_ARRAY, out.kind()); auto out_carr = out.chunked_array(); @@ -869,7 +890,7 @@ TEST_F(TestCast, PreallocatedMemory) { out_data->buffers.push_back(out_values); Datum out(out_data); - ASSERT_OK(kernel->Call(&this->ctx_, Datum(arr), &out)); + ASSERT_OK(kernel->Call(&this->ctx_, arr, &out)); // Buffer address unchanged ASSERT_EQ(out_values.get(), out_data->buffers[1].get()); @@ -912,8 +933,8 @@ void CheckOffsetOutputCase(FunctionContext* ctx, const std::shared_ptr Datum out_second(out_second_data); // Cast each bit - ASSERT_OK(kernel->Call(ctx, Datum(arr->Slice(0, first_half)), &out_first)); - ASSERT_OK(kernel->Call(ctx, Datum(arr->Slice(first_half)), &out_second)); + ASSERT_OK(kernel->Call(ctx, arr->Slice(0, first_half), &out_first)); + ASSERT_OK(kernel->Call(ctx, arr->Slice(first_half), &out_second)); shared_ptr result = MakeArray(out_data); @@ -1105,7 +1126,7 @@ TYPED_TEST(TestDictionaryCast, Basic) { TestBase::MakeRandomArray::ArrayType>(10, 2); Datum out; - ASSERT_OK(DictionaryEncode(&this->ctx_, Datum(plain_array->data()), &out)); + ASSERT_OK(DictionaryEncode(&this->ctx_, plain_array->data(), &out)); this->CheckPass(*MakeArray(out.array()), *plain_array, plain_array->type(), options); } @@ -1201,7 +1222,7 @@ void CheckUnique(FunctionContext* ctx, const shared_ptr& type, shared_ptr expected = _MakeArray(type, out_values, out_is_valid); shared_ptr result; - ASSERT_OK(Unique(ctx, Datum(input), &result)); + ASSERT_OK(Unique(ctx, input, &result)); ASSERT_ARRAYS_EQUAL(*expected, *result); } @@ -1218,7 +1239,7 @@ void CheckDictEncode(FunctionContext* ctx, const shared_ptr& type, DictionaryArray expected(dictionary(int32(), ex_dict), ex_indices); Datum datum_out; - ASSERT_OK(DictionaryEncode(ctx, Datum(input), &datum_out)); + ASSERT_OK(DictionaryEncode(ctx, input, &datum_out)); shared_ptr result = MakeArray(datum_out.array()); ASSERT_ARRAYS_EQUAL(expected, *result); @@ -1461,7 +1482,7 @@ TEST_F(TestHashKernel, ChunkedArrayInvoke) { // Unique shared_ptr result; - ASSERT_OK(Unique(&this->ctx_, Datum(carr), &result)); + ASSERT_OK(Unique(&this->ctx_, carr, &result)); ASSERT_ARRAYS_EQUAL(*ex_dict, *result); // Dictionary encode @@ -1475,7 +1496,7 @@ TEST_F(TestHashKernel, ChunkedArrayInvoke) { auto dict_carr = std::make_shared(dict_arrays); Datum encoded_out; - ASSERT_OK(DictionaryEncode(&this->ctx_, Datum(carr), &encoded_out)); + ASSERT_OK(DictionaryEncode(&this->ctx_, carr, &encoded_out)); ASSERT_EQ(Datum::CHUNKED_ARRAY, encoded_out.kind()); AssertChunkedEqual(*dict_carr, *encoded_out.chunked_array()); @@ -1490,7 +1511,7 @@ class TestBooleanKernel : public ComputeFixture, public TestBase { const std::shared_ptr& right, const std::shared_ptr& expected) { Datum result; - ASSERT_OK(kernel(&this->ctx_, Datum(left), Datum(right), &result)); + ASSERT_OK(kernel(&this->ctx_, left, right, &result)); ASSERT_EQ(Datum::ARRAY, result.kind()); std::shared_ptr result_array = result.make_array(); ASSERT_TRUE(result_array->Equals(expected)); @@ -1502,7 +1523,7 @@ class TestBooleanKernel : public ComputeFixture, public TestBase { const std::shared_ptr& expected) { Datum result; std::shared_ptr result_array; - ASSERT_OK(kernel(&this->ctx_, Datum(left), Datum(right), &result)); + 
ASSERT_OK(kernel(&this->ctx_, left, right, &result)); ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); std::shared_ptr result_ca = result.chunked_array(); ASSERT_TRUE(result_ca->Equals(expected)); @@ -1552,13 +1573,13 @@ TEST_F(TestBooleanKernel, Invert) { // Plain array Datum result; - ASSERT_OK(Invert(&this->ctx_, Datum(a1), &result)); + ASSERT_OK(Invert(&this->ctx_, a1, &result)); ASSERT_EQ(Datum::ARRAY, result.kind()); std::shared_ptr result_array = result.make_array(); ASSERT_TRUE(result_array->Equals(a2)); // Array with offset - ASSERT_OK(Invert(&this->ctx_, Datum(a1->Slice(1)), &result)); + ASSERT_OK(Invert(&this->ctx_, a1->Slice(1), &result)); ASSERT_EQ(Datum::ARRAY, result.kind()); result_array = result.make_array(); ASSERT_TRUE(result_array->Equals(a2->Slice(1))); @@ -1568,7 +1589,7 @@ TEST_F(TestBooleanKernel, Invert) { auto ca1 = std::make_shared(ca1_arrs); std::vector> ca2_arrs = {a2, a2->Slice(1)}; auto ca2 = std::make_shared(ca2_arrs); - ASSERT_OK(Invert(&this->ctx_, Datum(ca1), &result)); + ASSERT_OK(Invert(&this->ctx_, ca1, &result)); ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); std::shared_ptr result_ca = result.chunked_array(); ASSERT_TRUE(result_ca->Equals(ca2)); @@ -1618,14 +1639,14 @@ TEST_F(TestInvokeBinaryKernel, Exceptions) { auto a2 = _MakeArray(type, values2, {}); // Left is not an array-like - ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel( - &this->ctx_, &kernel, Datum(table), Datum(a2), &outputs)); + ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, table, a2, + &outputs)); // Right is not an array-like - ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, Datum(a1), - Datum(table), &outputs)); + ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, a1, table, + &outputs)); // Different sized inputs - ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, Datum(a1), - Datum(a1->Slice(1)), &outputs)); + ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, a1, + a1->Slice(1), &outputs)); } } // namespace compute diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index bef2b9af21cff..87080b1000d5f 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -61,19 +61,28 @@ struct ARROW_EXPORT Datum { /// \brief Empty datum, to be populated elsewhere Datum() : value(NULLPTR) {} - explicit Datum(const std::shared_ptr& value) : value(value) {} - - explicit Datum(const std::shared_ptr& value) : value(value) {} - - explicit Datum(const std::shared_ptr& value) : Datum(value->data()) {} - - explicit Datum(const std::shared_ptr& value) : value(value) {} - - explicit Datum(const std::shared_ptr& value) : value(value) {} - - explicit Datum(const std::shared_ptr
& value) : value(value) {} - - explicit Datum(const std::vector& value) : value(value) {} + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : value(value) {} + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : value(value) {} + + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : Datum(value ? value->data() : NULLPTR) {} + + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : value(value) {} + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : value(value) {} + Datum(const std::shared_ptr
& value) // NOLINT implicit conversion + : value(value) {} + Datum(const std::vector& value) // NOLINT implicit conversion + : value(value) {} + + // Cast from subtypes of Array to Datum + template ::value>::type> + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : Datum(std::shared_ptr(value)) {} ~Datum() {} diff --git a/cpp/src/arrow/csv/column-builder.h b/cpp/src/arrow/csv/column-builder.h index b21cff76be5c6..054a642295cb5 100644 --- a/cpp/src/arrow/csv/column-builder.h +++ b/cpp/src/arrow/csv/column-builder.h @@ -18,22 +18,29 @@ #ifndef ARROW_CSV_COLUMN_BUILDER_H #define ARROW_CSV_COLUMN_BUILDER_H +#include #include -#include #include "arrow/array.h" -#include "arrow/csv/converter.h" -#include "arrow/csv/options.h" -#include "arrow/memory_pool.h" #include "arrow/status.h" -#include "arrow/table.h" -#include "arrow/type.h" -#include "arrow/util/task-group.h" #include "arrow/util/visibility.h" namespace arrow { + +class ChunkedArray; +class DataType; + +namespace internal { + +class TaskGroup; + +} // namespace internal + namespace csv { +class BlockParser; +struct ConvertOptions; + class ARROW_EXPORT ColumnBuilder { public: virtual ~ColumnBuilder() = default; diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc index 7d8bff870ba84..8a249a68c07ec 100644 --- a/cpp/src/arrow/csv/converter.cc +++ b/cpp/src/arrow/csv/converter.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include "arrow/builder.h" #include "arrow/csv/parser.h" diff --git a/cpp/src/arrow/csv/parser.h b/cpp/src/arrow/csv/parser.h index 8a515744ee2d9..fdddc37a2c0fb 100644 --- a/cpp/src/arrow/csv/parser.h +++ b/cpp/src/arrow/csv/parser.h @@ -18,6 +18,7 @@ #ifndef ARROW_CSV_PARSER_H #define ARROW_CSV_PARSER_H +#include #include #include #include diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc index 8cf74d6b99901..b2a6b7b430ad0 100644 --- a/cpp/src/arrow/csv/reader.cc +++ b/cpp/src/arrow/csv/reader.cc @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include #include "arrow/buffer.h" diff --git a/cpp/src/arrow/io/buffered.cc b/cpp/src/arrow/io/buffered.cc index 0c04ac21c208e..f3eae39c8e62e 100644 --- a/cpp/src/arrow/io/buffered.cc +++ b/cpp/src/arrow/io/buffered.cc @@ -21,10 +21,10 @@ #include #include #include -#include #include #include "arrow/buffer.h" +#include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/util/logging.h" #include "arrow/util/string_view.h" diff --git a/cpp/src/arrow/io/buffered.h b/cpp/src/arrow/io/buffered.h index e4374ba8079d3..d5079556c7cfc 100644 --- a/cpp/src/arrow/io/buffered.h +++ b/cpp/src/arrow/io/buffered.h @@ -29,6 +29,7 @@ namespace arrow { +class Buffer; class MemoryPool; class Status; diff --git a/cpp/src/arrow/ipc/feather-test.cc b/cpp/src/arrow/ipc/feather-test.cc index b0be28925cf23..8139c47e09fca 100644 --- a/cpp/src/arrow/ipc/feather-test.cc +++ b/cpp/src/arrow/ipc/feather-test.cc @@ -30,6 +30,7 @@ #include "arrow/pretty_print.h" #include "arrow/record_batch.h" #include "arrow/status.h" +#include "arrow/table.h" #include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" diff --git a/cpp/src/arrow/ipc/json-simple-test.cc b/cpp/src/arrow/ipc/json-simple-test.cc index 45525212d2f4b..84a2210157f53 100644 --- a/cpp/src/arrow/ipc/json-simple-test.cc +++ b/cpp/src/arrow/ipc/json-simple-test.cc @@ -34,6 +34,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/decimal.h" #if 
defined(_MSC_VER) // "warning C4307: '+': integral constant overflow" diff --git a/cpp/src/arrow/memory_pool-test.h b/cpp/src/arrow/memory_pool-test.h index 34523a181ba1e..fc86d943ec116 100644 --- a/cpp/src/arrow/memory_pool-test.h +++ b/cpp/src/arrow/memory_pool-test.h @@ -16,6 +16,7 @@ // under the License. #include +#include #include #include diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc index 0a27141b447f7..d62db32b062ac 100644 --- a/cpp/src/arrow/memory_pool.cc +++ b/cpp/src/arrow/memory_pool.cc @@ -17,18 +17,16 @@ #include "arrow/memory_pool.h" -#include -#include -#include -#include -#include -#include +#include // IWYU pragma: keep +#include // IWYU pragma: keep +#include // IWYU pragma: keep +#include // IWYU pragma: keep #include #include #include // IWYU pragma: keep #include "arrow/status.h" -#include "arrow/util/logging.h" +#include "arrow/util/logging.h" // IWYU pragma: keep #ifdef ARROW_JEMALLOC // Needed to support jemalloc 3 and 4 diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc index 8434e59b0ce79..a1acfb81aeff1 100644 --- a/cpp/src/arrow/pretty_print-test.cc +++ b/cpp/src/arrow/pretty_print-test.cc @@ -26,12 +26,10 @@ #include "arrow/array.h" #include "arrow/builder.h" -#include "arrow/memory_pool.h" #include "arrow/pretty_print.h" #include "arrow/table.h" #include "arrow/test-util.h" #include "arrow/type.h" -#include "arrow/util/decimal.h" namespace arrow { @@ -342,7 +340,7 @@ TEST_F(TestPrettyPrint, DictionaryType) { TEST_F(TestPrettyPrint, ChunkedArrayPrimitiveType) { auto array = ArrayFromJSON(int32(), "[0, 1, null, 3, null]"); - ChunkedArray chunked_array({array}); + ChunkedArray chunked_array(array); static const char* expected = R"expected([ [ diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index ec23bfb00fcde..c524039c3e86a 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -19,7 +19,7 @@ #include #include #include -#include +#include // IWYU pragma: keep #include #include #include diff --git a/cpp/src/arrow/pretty_print.h b/cpp/src/arrow/pretty_print.h index fde6c293f9b68..ca50bc0bc993c 100644 --- a/cpp/src/arrow/pretty_print.h +++ b/cpp/src/arrow/pretty_print.h @@ -21,14 +21,17 @@ #include #include -#include "arrow/type_fwd.h" #include "arrow/util/visibility.h" namespace arrow { class Array; +class Column; class ChunkedArray; +class RecordBatch; +class Schema; class Status; +class Table; struct PrettyPrintOptions { PrettyPrintOptions(int indent_arg, int window_arg = 10, int indent_size_arg = 2, diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index f9a5ea1b0d67e..da288d3c6868e 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -539,33 +540,27 @@ Status NumPyConverter::Visit(const BinaryType& type) { auto data = reinterpret_cast(PyArray_DATA(arr_)); - int item_length = 0; + auto AppendNotNull = [&builder, this](const uint8_t* data) { + // This is annoying. NumPy allows strings to have nul-terminators, so + // we must check for them here + const size_t item_size = + strnlen(reinterpret_cast(data), static_cast(itemsize_)); + return builder.Append(data, static_cast(item_size)); + }; + if (mask_ != nullptr) { Ndarray1DIndexer mask_values(mask_); for (int64_t i = 0; i < length_; ++i) { if (mask_values[i]) { RETURN_NOT_OK(builder.AppendNull()); } else { - // This is annoying. 
NumPy allows strings to have nul-terminators, so - // we must check for them here - for (item_length = 0; item_length < itemsize_; ++item_length) { - if (data[item_length] == 0) { - break; - } - } - RETURN_NOT_OK(builder.Append(data, item_length)); + RETURN_NOT_OK(AppendNotNull(data)); } data += stride_; } } else { for (int64_t i = 0; i < length_; ++i) { - for (item_length = 0; item_length < itemsize_; ++item_length) { - // Look for nul-terminator - if (data[item_length] == 0) { - break; - } - } - RETURN_NOT_OK(builder.Append(data, item_length)); + RETURN_NOT_OK(AppendNotNull(data)); data += stride_; } } diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc index 2d15ce45b3b7f..7443c54845630 100644 --- a/cpp/src/arrow/python/python-test.cc +++ b/cpp/src/arrow/python/python-test.cc @@ -25,6 +25,7 @@ #include "arrow/builder.h" #include "arrow/table.h" #include "arrow/test-util.h" +#include "arrow/util/decimal.h" #include "arrow/python/arrow_to_pandas.h" #include "arrow/python/decimal.h" diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index 674b68b40fa6e..ceb6885da621e 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -32,6 +32,7 @@ namespace arrow { class Array; struct ArrayData; class Status; +class Table; /// \class RecordBatch /// \brief Collection of equal-length arrays matching a particular Schema diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index 6b5733252879b..2ac34b4cde57d 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -44,6 +44,11 @@ class ARROW_EXPORT ChunkedArray { /// The vector should be non-empty and all its elements should have the same /// data type. explicit ChunkedArray(const ArrayVector& chunks); + + /// \brief Construct a chunked array from a single Array + explicit ChunkedArray(const std::shared_ptr& chunk) + : ChunkedArray(ArrayVector({chunk})) {} + /// \brief Construct a chunked array from a vector of arrays and a data type /// /// As the data type is passed explicitly, the vector may be empty. 
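A minimal usage sketch of the single-chunk convenience constructor added to table.h above (ArrayFromJSON is the test-only helper from arrow/test-util.h; error handling elided):

    #include <cassert>
    #include <memory>
    #include "arrow/table.h"
    #include "arrow/test-util.h"

    void SingleChunkExample() {
      std::shared_ptr<arrow::Array> array =
          arrow::ArrayFromJSON(arrow::int32(), "[0, 1, null, 3, null]");
      // Equivalent to ChunkedArray(ArrayVector({array})), without the
      // wrapping boilerplate at call sites such as the pretty-print test.
      arrow::ChunkedArray chunked(array);
      assert(chunked.num_chunks() == 1);
      assert(chunked.length() == 5);
    }

Keeping the constructor explicit preserves the distinction between an Array and a ChunkedArray at call sites while removing the ArrayVector noise.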
diff --git a/cpp/src/arrow/tensor.cc b/cpp/src/arrow/tensor.cc index 589ee995e2181..792945b1740f3 100644 --- a/cpp/src/arrow/tensor.cc +++ b/cpp/src/arrow/tensor.cc @@ -17,6 +17,7 @@ #include "arrow/tensor.h" +#include #include #include #include diff --git a/cpp/src/arrow/test-util.cc b/cpp/src/arrow/test-util.cc index 38e07dd060ae4..8c5f36417f881 100644 --- a/cpp/src/arrow/test-util.cc +++ b/cpp/src/arrow/test-util.cc @@ -18,13 +18,12 @@ #include "arrow/test-util.h" #ifndef _WIN32 -#include -#include -#include +#include // IWYU pragma: keep +#include // IWYU pragma: keep +#include // IWYU pragma: keep #endif #include -#include #include #include #include @@ -33,23 +32,17 @@ #include #include #include -#include #include #include #include "arrow/array.h" #include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/ipc/json-simple.h" -#include "arrow/memory_pool.h" #include "arrow/pretty_print.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/bit-util.h" -#include "arrow/util/decimal.h" #include "arrow/util/logging.h" namespace arrow { diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 7829ac25678a9..7fe7685f5a39f 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -17,23 +17,17 @@ #pragma once -#ifndef _WIN32 -#include -#include -#include -#endif - #include -#include #include #include +#include #include #include #include #include #include #include -#include +#include #include #include @@ -43,13 +37,13 @@ #include "arrow/builder.h" #include "arrow/memory_pool.h" #include "arrow/pretty_print.h" +#include "arrow/record_batch.h" #include "arrow/status.h" -#include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" -#include "arrow/util/decimal.h" #include "arrow/util/logging.h" +#include "arrow/util/macros.h" #include "arrow/util/visibility.h" #define STRINGIFY(x) #x @@ -102,6 +96,10 @@ namespace arrow { +class ChunkedArray; +class Column; +class Table; + using ArrayVector = std::vector>; #define ASSERT_ARRAYS_EQUAL(LEFT, RIGHT) \ diff --git a/cpp/src/arrow/util/compression_lz4.cc b/cpp/src/arrow/util/compression_lz4.cc index 0acd54d057218..97fd46ab6c587 100644 --- a/cpp/src/arrow/util/compression_lz4.cc +++ b/cpp/src/arrow/util/compression_lz4.cc @@ -18,6 +18,7 @@ #include "arrow/util/compression_lz4.h" #include +#include #include #include diff --git a/cpp/src/arrow/util/int-util-test.cc b/cpp/src/arrow/util/int-util-test.cc index 51fd96e4ea25a..018eeda7248a3 100644 --- a/cpp/src/arrow/util/int-util-test.cc +++ b/cpp/src/arrow/util/int-util-test.cc @@ -17,14 +17,12 @@ #include #include -#include #include #include #include #include -#include "arrow/test-util.h" #include "arrow/util/int-util.h" namespace arrow { diff --git a/cpp/src/arrow/util/string_view.h b/cpp/src/arrow/util/string_view.h index 2ee594a9e9ad3..0f35483e3738e 100644 --- a/cpp/src/arrow/util/string_view.h +++ b/cpp/src/arrow/util/string_view.h @@ -18,7 +18,7 @@ #ifndef ARROW_UTIL_STRING_VIEW_H #define ARROW_UTIL_STRING_VIEW_H -#include "arrow/util/string_view/string_view.hpp" +#include "arrow/util/string_view/string_view.hpp" // IWYU pragma: export namespace arrow { namespace util { diff --git a/cpp/src/parquet/arrow/CMakeLists.txt b/cpp/src/parquet/arrow/CMakeLists.txt index 9372c3110a3af..89afc39a23376 100644 --- a/cpp/src/parquet/arrow/CMakeLists.txt +++ b/cpp/src/parquet/arrow/CMakeLists.txt @@ -18,8 +18,11 @@ 
ADD_PARQUET_TEST(arrow-schema-test) ADD_PARQUET_TEST(arrow-reader-writer-test) -ADD_ARROW_BENCHMARK(reader-writer-benchmark +ADD_BENCHMARK(reader-writer-benchmark PREFIX "parquet-arrow" EXTRA_LINK_LIBS ${PARQUET_BENCHMARK_LINK_LIBRARIES}) +if (TARGET parquet-arrow-reader-writer-benchmark) + add_dependencies(parquet parquet-arrow-reader-writer-benchmark) +endif() ARROW_INSTALL_ALL_HEADERS("parquet/arrow") diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index 24ec0dd24eec3..07124ebb3057a 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -464,7 +464,11 @@ class TestParquetIO : public ::testing::Test { ASSERT_OK_NO_THROW(file_reader->GetColumn(0, &column_reader)); ASSERT_NE(nullptr, column_reader.get()); - ASSERT_OK(column_reader->NextBatch(SMALL_SIZE, out)); + std::shared_ptr chunked_out; + ASSERT_OK(column_reader->NextBatch(SMALL_SIZE, &chunked_out)); + + ASSERT_EQ(1, chunked_out->num_chunks()); + *out = chunked_out->chunk(0); ASSERT_NE(nullptr, out->get()); } @@ -1745,10 +1749,11 @@ TEST(TestArrowReadWrite, ListLargeRecords) { std::vector> pieces; for (int i = 0; i < num_rows; ++i) { - std::shared_ptr piece; - ASSERT_OK(col_reader->NextBatch(1, &piece)); - ASSERT_EQ(1, piece->length()); - pieces.push_back(piece); + std::shared_ptr chunked_piece; + ASSERT_OK(col_reader->NextBatch(1, &chunked_piece)); + ASSERT_EQ(1, chunked_piece->length()); + ASSERT_EQ(1, chunked_piece->num_chunks()); + pieces.push_back(chunked_piece->chunk(0)); } auto chunked = std::make_shared<::arrow::ChunkedArray>(pieces); diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 6273fda464025..2a7730d42ad23 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -32,6 +32,9 @@ #include "arrow/util/logging.h" #include "arrow/util/thread-pool.h" +// For arrow::compute::Datum. This should perhaps be promoted. 
See ARROW-4022 +#include "arrow/compute/kernel.h" + #include "parquet/arrow/record_reader.h" #include "parquet/arrow/schema.h" #include "parquet/column_reader.h" @@ -46,6 +49,7 @@ using arrow::Array; using arrow::BooleanArray; +using arrow::ChunkedArray; using arrow::Column; using arrow::Field; using arrow::Int32Array; @@ -57,6 +61,9 @@ using arrow::StructArray; using arrow::Table; using arrow::TimestampArray; +// For Array/ChunkedArray variant +using arrow::compute::Datum; + using parquet::schema::Node; // Help reduce verbosity @@ -85,6 +92,19 @@ static inline int64_t impala_timestamp_to_nanoseconds(const Int96& impala_timest template using ArrayType = typename ::arrow::TypeTraits::ArrayType; +namespace { + +Status GetSingleChunk(const ChunkedArray& chunked, std::shared_ptr* out) { + DCHECK_GT(chunked.num_chunks(), 0); + if (chunked.num_chunks() > 1) { + return Status::Invalid("Function call returned a chunked array"); + } + *out = chunked.chunk(0); + return Status::OK(); +} + +} // namespace + // ---------------------------------------------------------------------- // Iteration utilities @@ -223,15 +243,18 @@ class FileReader::Impl { virtual ~Impl() {} Status GetColumn(int i, std::unique_ptr* out); - Status ReadSchemaField(int i, std::shared_ptr* out); + + Status ReadSchemaField(int i, std::shared_ptr* out); Status ReadSchemaField(int i, const std::vector& indices, - std::shared_ptr* out); + std::shared_ptr* out); + Status ReadColumn(int i, std::shared_ptr* out); + Status ReadColumnChunk(int column_index, int row_group_index, + std::shared_ptr* out); + Status GetReaderForNode(int index, const Node* node, const std::vector& indices, int16_t def_level, std::unique_ptr* out); - Status ReadColumn(int i, std::shared_ptr* out); - Status ReadColumnChunk(int column_index, int row_group_index, - std::shared_ptr* out); + Status GetSchema(std::shared_ptr<::arrow::Schema>* out); Status GetSchema(const std::vector& indices, std::shared_ptr<::arrow::Schema>* out); @@ -267,7 +290,8 @@ class FileReader::Impl { class ColumnReader::ColumnReaderImpl { public: virtual ~ColumnReaderImpl() {} - virtual Status NextBatch(int64_t records_to_read, std::shared_ptr* out) = 0; + virtual Status NextBatch(int64_t records_to_read, + std::shared_ptr* out) = 0; virtual Status GetDefLevels(const int16_t** data, size_t* length) = 0; virtual Status GetRepLevels(const int16_t** data, size_t* length) = 0; virtual const std::shared_ptr field() = 0; @@ -283,10 +307,10 @@ class PARQUET_NO_EXPORT PrimitiveImpl : public ColumnReader::ColumnReaderImpl { NextRowGroup(); } - Status NextBatch(int64_t records_to_read, std::shared_ptr* out) override; + Status NextBatch(int64_t records_to_read, std::shared_ptr* out) override; template - Status WrapIntoListArray(std::shared_ptr* array); + Status WrapIntoListArray(Datum* inout_array); Status GetDefLevels(const int16_t** data, size_t* length) override; Status GetRepLevels(const int16_t** data, size_t* length) override; @@ -314,7 +338,7 @@ class PARQUET_NO_EXPORT StructImpl : public ColumnReader::ColumnReaderImpl { InitField(node, children); } - Status NextBatch(int64_t records_to_read, std::shared_ptr* out) override; + Status NextBatch(int64_t records_to_read, std::shared_ptr* out) override; Status GetDefLevels(const int16_t** data, size_t* length) override; Status GetRepLevels(const int16_t** data, size_t* length) override; const std::shared_ptr field() override { return field_; } @@ -395,7 +419,7 @@ Status FileReader::Impl::GetReaderForNode( return Status::OK(); } -Status 
FileReader::Impl::ReadSchemaField(int i, std::shared_ptr* out) { +Status FileReader::Impl::ReadSchemaField(int i, std::shared_ptr* out) { std::vector indices(reader_->metadata()->num_columns()); for (size_t j = 0; j < indices.size(); ++j) { @@ -406,7 +430,7 @@ Status FileReader::Impl::ReadSchemaField(int i, std::shared_ptr* out) { } Status FileReader::Impl::ReadSchemaField(int i, const std::vector& indices, - std::shared_ptr* out) { + std::shared_ptr* out) { auto parquet_schema = reader_->metadata()->schema(); auto node = parquet_schema->group_node()->field(i).get(); @@ -432,7 +456,7 @@ Status FileReader::Impl::ReadSchemaField(int i, const std::vector& indices, return reader->NextBatch(records_to_read, out); } -Status FileReader::Impl::ReadColumn(int i, std::shared_ptr* out) { +Status FileReader::Impl::ReadColumn(int i, std::shared_ptr* out) { std::unique_ptr flat_column_reader; RETURN_NOT_OK(GetColumn(i, &flat_column_reader)); @@ -452,7 +476,7 @@ Status FileReader::Impl::GetSchema(const std::vector& indices, } Status FileReader::Impl::ReadColumnChunk(int column_index, int row_group_index, - std::shared_ptr* out) { + std::shared_ptr* out) { auto rg_metadata = reader_->metadata()->RowGroup(row_group_index); int64_t records_to_read = rg_metadata->ColumnChunk(column_index)->num_values(); @@ -463,10 +487,7 @@ Status FileReader::Impl::ReadColumnChunk(int column_index, int row_group_index, new PrimitiveImpl(pool_, std::move(input))); ColumnReader flat_column_reader(std::move(impl)); - std::shared_ptr array; - RETURN_NOT_OK(flat_column_reader.NextBatch(records_to_read, &array)); - *out = array; - return Status::OK(); + return flat_column_reader.NextBatch(records_to_read, out); } Status FileReader::Impl::ReadRowGroup(int row_group_index, @@ -485,7 +506,7 @@ Status FileReader::Impl::ReadRowGroup(int row_group_index, auto ReadColumnFunc = [&indices, &row_group_index, &schema, &columns, this](int i) { int column_index = indices[i]; - std::shared_ptr array; + std::shared_ptr array; RETURN_NOT_OK(ReadColumnChunk(column_index, row_group_index, &array)); columns[i] = std::make_shared(schema->field(i), array); return Status::OK(); @@ -532,7 +553,7 @@ Status FileReader::Impl::ReadTable(const std::vector& indices, std::vector> columns(num_fields); auto ReadColumnFunc = [&indices, &field_indices, &schema, &columns, this](int i) { - std::shared_ptr array; + std::shared_ptr array; RETURN_NOT_OK(ReadSchemaField(field_indices[i], indices, &array)); columns[i] = std::make_shared(schema->field(i), array); return Status::OK(); @@ -576,8 +597,6 @@ Status FileReader::Impl::ReadTable(std::shared_ptr
<Table>* table) { Status FileReader::Impl::ReadRowGroups(const std::vector<int>& row_groups, const std::vector<int>& indices, std::shared_ptr<Table>
* table) { - // TODO(PARQUET-1393): Modify the record readers to already read this into a single, - // continuous array. std::vector> tables(row_groups.size(), nullptr); for (size_t i = 0; i < row_groups.size(); ++i) { @@ -633,7 +652,7 @@ Status FileReader::GetSchema(const std::vector& indices, return impl_->GetSchema(indices, out); } -Status FileReader::ReadColumn(int i, std::shared_ptr* out) { +Status FileReader::ReadColumn(int i, std::shared_ptr* out) { try { return impl_->ReadColumn(i, out); } catch (const ::parquet::ParquetException& e) { @@ -641,7 +660,7 @@ Status FileReader::ReadColumn(int i, std::shared_ptr* out) { } } -Status FileReader::ReadSchemaField(int i, std::shared_ptr* out) { +Status FileReader::ReadSchemaField(int i, std::shared_ptr* out) { try { return impl_->ReadSchemaField(i, out); } catch (const ::parquet::ParquetException& e) { @@ -649,6 +668,18 @@ Status FileReader::ReadSchemaField(int i, std::shared_ptr* out) { } } +Status FileReader::ReadColumn(int i, std::shared_ptr* out) { + std::shared_ptr chunked_out; + RETURN_NOT_OK(ReadColumn(i, &chunked_out)); + return GetSingleChunk(*chunked_out, out); +} + +Status FileReader::ReadSchemaField(int i, std::shared_ptr* out) { + std::shared_ptr chunked_out; + RETURN_NOT_OK(ReadSchemaField(i, &chunked_out)); + return GetSingleChunk(*chunked_out, out); +} + Status FileReader::GetRecordBatchReader(const std::vector& row_group_indices, std::shared_ptr* out) { std::vector indices(impl_->num_columns()); @@ -764,7 +795,28 @@ const ParquetFileReader* FileReader::parquet_reader() const { } template -Status PrimitiveImpl::WrapIntoListArray(std::shared_ptr* array) { +Status PrimitiveImpl::WrapIntoListArray(Datum* inout_array) { + if (descr_->max_repetition_level() == 0) { + // Flat, no action + return Status::OK(); + } + + std::shared_ptr flat_array; + + // ARROW-3762(wesm): If inout_array is a chunked array, we reject as this is + // not yet implemented + if (inout_array->kind() == Datum::CHUNKED_ARRAY) { + if (inout_array->chunked_array()->num_chunks() > 1) { + return Status::NotImplemented( + "Nested data conversions not implemented for " + "chunked array outputs"); + } + flat_array = inout_array->chunked_array()->chunk(0); + } else { + DCHECK_EQ(Datum::ARRAY, inout_array->kind()); + flat_array = inout_array->make_array(); + } + const int16_t* def_levels = record_reader_->def_levels(); const int16_t* rep_levels = record_reader_->rep_levels(); const int64_t total_levels_read = record_reader_->levels_position(); @@ -775,110 +827,106 @@ Status PrimitiveImpl::WrapIntoListArray(std::shared_ptr* array) { &arrow_schema)); std::shared_ptr current_field = arrow_schema->field(0); - if (descr_->max_repetition_level() > 0) { - // Walk downwards to extract nullability - std::vector nullable; - std::vector> offset_builders; - std::vector> valid_bits_builders; - nullable.push_back(current_field->nullable()); - while (current_field->type()->num_children() > 0) { - if (current_field->type()->num_children() > 1) { - return Status::NotImplemented( - "Fields with more than one child are not supported."); - } else { - if (current_field->type()->id() != ::arrow::Type::LIST) { - return Status::NotImplemented( - "Currently only nesting with Lists is supported."); - } - current_field = current_field->type()->child(0); + // Walk downwards to extract nullability + std::vector nullable; + std::vector> offset_builders; + std::vector> valid_bits_builders; + nullable.push_back(current_field->nullable()); + while (current_field->type()->num_children() > 0) { + if 
(current_field->type()->num_children() > 1) { + return Status::NotImplemented("Fields with more than one child are not supported."); + } else { + if (current_field->type()->id() != ::arrow::Type::LIST) { + return Status::NotImplemented("Currently only nesting with Lists is supported."); } - offset_builders.emplace_back( - std::make_shared<::arrow::Int32Builder>(::arrow::int32(), pool_)); - valid_bits_builders.emplace_back( - std::make_shared<::arrow::BooleanBuilder>(::arrow::boolean(), pool_)); - nullable.push_back(current_field->nullable()); + current_field = current_field->type()->child(0); } + offset_builders.emplace_back( + std::make_shared<::arrow::Int32Builder>(::arrow::int32(), pool_)); + valid_bits_builders.emplace_back( + std::make_shared<::arrow::BooleanBuilder>(::arrow::boolean(), pool_)); + nullable.push_back(current_field->nullable()); + } - int64_t list_depth = offset_builders.size(); - // This describes the minimal definition that describes a level that - // reflects a value in the primitive values array. - int16_t values_def_level = descr_->max_definition_level(); - if (nullable[nullable.size() - 1]) { - values_def_level--; - } + int64_t list_depth = offset_builders.size(); + // This describes the minimal definition that describes a level that + // reflects a value in the primitive values array. + int16_t values_def_level = descr_->max_definition_level(); + if (nullable[nullable.size() - 1]) { + values_def_level--; + } - // The definition levels that are needed so that a list is declared - // as empty and not null. - std::vector empty_def_level(list_depth); - int def_level = 0; - for (int i = 0; i < list_depth; i++) { - if (nullable[i]) { - def_level++; - } - empty_def_level[i] = static_cast(def_level); + // The definition levels that are needed so that a list is declared + // as empty and not null. 
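For readers following the level arithmetic in this hunk, a worked example under the standard Dremel encoding (illustrative only, not taken from this patch):

    // An optional list<optional int32> column has max_definition_level = 3
    // and max_repetition_level = 1. For the rows [[1, null], [], null]:
    //
    //   def_levels: 3, 2, 1, 0      rep_levels: 0, 1, 0, 0
    //
    // def 3 = present value, def 2 = null value slot, def 1 = empty list,
    // def 0 = null list. The loop here computes empty_def_level[0] = 1 and
    // values_def_level = 2, so both def 3 and def 2 advance values_offset
    // (null leaves still occupy a slot in the spaced values array), while
    // def 1 and def 0 only close out a list entry.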
+ std::vector empty_def_level(list_depth); + int def_level = 0; + for (int i = 0; i < list_depth; i++) { + if (nullable[i]) { def_level++; } + empty_def_level[i] = static_cast(def_level); + def_level++; + } - int32_t values_offset = 0; - std::vector null_counts(list_depth, 0); - for (int64_t i = 0; i < total_levels_read; i++) { - int16_t rep_level = rep_levels[i]; - if (rep_level < descr_->max_repetition_level()) { - for (int64_t j = rep_level; j < list_depth; j++) { - if (j == (list_depth - 1)) { - RETURN_NOT_OK(offset_builders[j]->Append(values_offset)); - } else { - RETURN_NOT_OK(offset_builders[j]->Append( - static_cast(offset_builders[j + 1]->length()))); - } + int32_t values_offset = 0; + std::vector null_counts(list_depth, 0); + for (int64_t i = 0; i < total_levels_read; i++) { + int16_t rep_level = rep_levels[i]; + if (rep_level < descr_->max_repetition_level()) { + for (int64_t j = rep_level; j < list_depth; j++) { + if (j == (list_depth - 1)) { + RETURN_NOT_OK(offset_builders[j]->Append(values_offset)); + } else { + RETURN_NOT_OK(offset_builders[j]->Append( + static_cast(offset_builders[j + 1]->length()))); + } - if (((empty_def_level[j] - 1) == def_levels[i]) && (nullable[j])) { - RETURN_NOT_OK(valid_bits_builders[j]->Append(false)); - null_counts[j]++; + if (((empty_def_level[j] - 1) == def_levels[i]) && (nullable[j])) { + RETURN_NOT_OK(valid_bits_builders[j]->Append(false)); + null_counts[j]++; + break; + } else { + RETURN_NOT_OK(valid_bits_builders[j]->Append(true)); + if (empty_def_level[j] == def_levels[i]) { break; - } else { - RETURN_NOT_OK(valid_bits_builders[j]->Append(true)); - if (empty_def_level[j] == def_levels[i]) { - break; - } } } } - if (def_levels[i] >= values_def_level) { - values_offset++; - } } - // Add the final offset to all lists - for (int64_t j = 0; j < list_depth; j++) { - if (j == (list_depth - 1)) { - RETURN_NOT_OK(offset_builders[j]->Append(values_offset)); - } else { - RETURN_NOT_OK(offset_builders[j]->Append( - static_cast(offset_builders[j + 1]->length()))); - } + if (def_levels[i] >= values_def_level) { + values_offset++; } - - std::vector> offsets; - std::vector> valid_bits; - std::vector list_lengths; - for (int64_t j = 0; j < list_depth; j++) { - list_lengths.push_back(offset_builders[j]->length() - 1); - std::shared_ptr array; - RETURN_NOT_OK(offset_builders[j]->Finish(&array)); - offsets.emplace_back(std::static_pointer_cast(array)->values()); - RETURN_NOT_OK(valid_bits_builders[j]->Finish(&array)); - valid_bits.emplace_back(std::static_pointer_cast(array)->values()); + } + // Add the final offset to all lists + for (int64_t j = 0; j < list_depth; j++) { + if (j == (list_depth - 1)) { + RETURN_NOT_OK(offset_builders[j]->Append(values_offset)); + } else { + RETURN_NOT_OK(offset_builders[j]->Append( + static_cast(offset_builders[j + 1]->length()))); } + } - std::shared_ptr output(*array); - for (int64_t j = list_depth - 1; j >= 0; j--) { - auto list_type = - ::arrow::list(::arrow::field("item", output->type(), nullable[j + 1])); - output = std::make_shared<::arrow::ListArray>( - list_type, list_lengths[j], offsets[j], output, valid_bits[j], null_counts[j]); - } - *array = output; + std::vector> offsets; + std::vector> valid_bits; + std::vector list_lengths; + for (int64_t j = 0; j < list_depth; j++) { + list_lengths.push_back(offset_builders[j]->length() - 1); + std::shared_ptr array; + RETURN_NOT_OK(offset_builders[j]->Finish(&array)); + offsets.emplace_back(std::static_pointer_cast(array)->values()); + 
RETURN_NOT_OK(valid_bits_builders[j]->Finish(&array)); + valid_bits.emplace_back(std::static_pointer_cast(array)->values()); + } + + std::shared_ptr output = flat_array; + for (int64_t j = list_depth - 1; j >= 0; j--) { + auto list_type = + ::arrow::list(::arrow::field("item", output->type(), nullable[j + 1])); + output = std::make_shared<::arrow::ListArray>(list_type, list_lengths[j], offsets[j], + output, valid_bits[j], null_counts[j]); } + *inout_array = output; return Status::OK(); } @@ -909,8 +957,7 @@ struct TransferFunctor { using ParquetCType = typename ParquetType::c_type; Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { static_assert(!std::is_same::value, "The fast path transfer functor should be used " "for primitive values"); @@ -938,8 +985,7 @@ template struct TransferFunctor> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { int64_t length = reader->values_written(); std::shared_ptr values = reader->ReleaseValues(); @@ -957,8 +1003,7 @@ struct TransferFunctor struct TransferFunctor<::arrow::BooleanType, BooleanType> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { int64_t length = reader->values_written(); std::shared_ptr data; @@ -991,8 +1036,7 @@ struct TransferFunctor<::arrow::BooleanType, BooleanType> { template <> struct TransferFunctor<::arrow::TimestampType, Int96Type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { int64_t length = reader->values_written(); auto values = reinterpret_cast(reader->values()); @@ -1019,8 +1063,7 @@ struct TransferFunctor<::arrow::TimestampType, Int96Type> { template <> struct TransferFunctor<::arrow::Date64Type, Int32Type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { int64_t length = reader->values_written(); auto values = reinterpret_cast(reader->values()); @@ -1046,19 +1089,24 @@ struct TransferFunctor<::arrow::Date64Type, Int32Type> { template struct TransferFunctor< ArrowType, ParquetType, - typename std::enable_if::value || - std::is_same::value>::type> { + typename std::enable_if< + (std::is_base_of<::arrow::BinaryType, ArrowType>::value || + std::is_same<::arrow::FixedSizeBinaryType, ArrowType>::value) && + (std::is_same::value || + std::is_same::value)>::type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { - RETURN_NOT_OK(reader->builder()->Finish(out)); + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { + std::vector> chunks = reader->GetBuilderChunks(); if (type->id() == ::arrow::Type::STRING) { // Convert from BINARY type to STRING - auto new_data = (*out)->data()->Copy(); - new_data->type = type; - *out = ::arrow::MakeArray(new_data); + for (size_t i = 0; i < chunks.size(); ++i) { + auto new_data = chunks[i]->data()->Copy(); + new_data->type = type; + chunks[i] = 
::arrow::MakeArray(new_data); + } } + *out = std::make_shared(chunks); return Status::OK(); } }; @@ -1166,121 +1214,133 @@ static inline void RawBytesToDecimalBytes(const uint8_t* value, int32_t byte_wid BytesToIntegerPair(value, byte_width, high, low); } -/// \brief Convert an array of FixedLenByteArrays to an arrow::Decimal128Array -/// We do this by: -/// 1. Creating a arrow::FixedSizeBinaryArray from the RecordReader's builder -/// 2. Allocating a buffer for the arrow::Decimal128Array -/// 3. Converting the big-endian bytes in the FixedSizeBinaryArray to two integers -/// representing the high and low bits of each decimal value. +// ---------------------------------------------------------------------- +// BYTE_ARRAY / FIXED_LEN_BYTE_ARRAY -> Decimal128 + +template +Status ConvertToDecimal128(const Array& array, const std::shared_ptr<::arrow::DataType>&, + MemoryPool* pool, std::shared_ptr*) { + return Status::NotImplemented("not implemented"); +} + template <> -struct TransferFunctor<::arrow::Decimal128Type, FLBAType> { - Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { - DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); +Status ConvertToDecimal128(const Array& array, + const std::shared_ptr<::arrow::DataType>& type, + MemoryPool* pool, std::shared_ptr* out) { + const auto& fixed_size_binary_array = + static_cast(array); - // Finish the built data into a temporary array - std::shared_ptr array; - RETURN_NOT_OK(reader->builder()->Finish(&array)); - const auto& fixed_size_binary_array = - static_cast(*array); + // The byte width of each decimal value + const int32_t type_length = + static_cast(*type).byte_width(); - // Get the byte width of the values in the FixedSizeBinaryArray. Most of the time - // this will be different from the decimal array width because we write the minimum - // number of bytes necessary to represent a given precision - const int32_t byte_width = - static_cast(*fixed_size_binary_array.type()) - .byte_width(); + // number of elements in the entire array + const int64_t length = fixed_size_binary_array.length(); - // The byte width of each decimal value - const int32_t type_length = - static_cast(*type).byte_width(); + // Get the byte width of the values in the FixedSizeBinaryArray. 
Most of the time + // this will be different from the decimal array width because we write the minimum + // number of bytes necessary to represent a given precision + const int32_t byte_width = + static_cast(*fixed_size_binary_array.type()) + .byte_width(); - // number of elements in the entire array - const int64_t length = fixed_size_binary_array.length(); + // allocate memory for the decimal array + std::shared_ptr data; + RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data)); - // allocate memory for the decimal array - std::shared_ptr data; - RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data)); - - // raw bytes that we can write to - uint8_t* out_ptr = data->mutable_data(); - - // convert each FixedSizeBinary value to valid decimal bytes - const int64_t null_count = fixed_size_binary_array.null_count(); - if (null_count > 0) { - for (int64_t i = 0; i < length; ++i, out_ptr += type_length) { - if (!fixed_size_binary_array.IsNull(i)) { - RawBytesToDecimalBytes(fixed_size_binary_array.GetValue(i), byte_width, - out_ptr); - } - } - } else { - for (int64_t i = 0; i < length; ++i, out_ptr += type_length) { + // raw bytes that we can write to + uint8_t* out_ptr = data->mutable_data(); + + // convert each FixedSizeBinary value to valid decimal bytes + const int64_t null_count = fixed_size_binary_array.null_count(); + if (null_count > 0) { + for (int64_t i = 0; i < length; ++i, out_ptr += type_length) { + if (!fixed_size_binary_array.IsNull(i)) { RawBytesToDecimalBytes(fixed_size_binary_array.GetValue(i), byte_width, out_ptr); } } - - *out = std::make_shared<::arrow::Decimal128Array>( - type, length, data, fixed_size_binary_array.null_bitmap(), null_count); - return Status::OK(); + } else { + for (int64_t i = 0; i < length; ++i, out_ptr += type_length) { + RawBytesToDecimalBytes(fixed_size_binary_array.GetValue(i), byte_width, out_ptr); + } } -}; -/// \brief Convert an arrow::BinaryArray to an arrow::Decimal128Array -/// We do this by: -/// 1. Creating an arrow::BinaryArray from the RecordReader's builder -/// 2. Allocating a buffer for the arrow::Decimal128Array -/// 3. Converting the big-endian bytes in each BinaryArray entry to two integers -/// representing the high and low bits of each decimal value. 
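The byte-level step both conversion paths share can be illustrated with a short sketch; this stands in for the BytesToIntegerPair/RawBytesToDecimalBytes helpers used above, and the names and layout are illustrative rather than the library code:

    #include <cstdint>
    #include <cstring>

    // Sign-extend a minimal-width big-endian two's-complement value (as
    // Parquet stores decimals) into the high/low halves of a 16-byte decimal.
    void BigEndianToInt128(const uint8_t* bytes, int32_t width,
                           int64_t* high, uint64_t* low) {
      const bool negative = (bytes[0] & 0x80) != 0;
      uint8_t buf[16];
      std::memset(buf, negative ? 0xFF : 0x00, sizeof(buf));
      std::memcpy(buf + (16 - width), bytes, width);  // right-align the input
      uint64_t hi = 0, lo = 0;
      for (int i = 0; i < 8; ++i) hi = (hi << 8) | buf[i];
      for (int i = 8; i < 16; ++i) lo = (lo << 8) | buf[i];
      *high = static_cast<int64_t>(hi);
      *low = lo;
      // e.g. width = 2, bytes = {0xFF, 0x85} (-123) sign-extends to
      // high = -1, low = 0xFFFFFFFFFFFFFF85.
    }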
-template <> -struct TransferFunctor<::arrow::Decimal128Type, ByteArrayType> { - Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { - DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); + *out = std::make_shared<::arrow::Decimal128Array>( + type, length, data, fixed_size_binary_array.null_bitmap(), null_count); - // Finish the built data into a temporary array - std::shared_ptr array; - RETURN_NOT_OK(reader->builder()->Finish(&array)); - const auto& binary_array = static_cast(*array); + return Status::OK(); +} - const int64_t length = binary_array.length(); +template <> +Status ConvertToDecimal128(const Array& array, + const std::shared_ptr<::arrow::DataType>& type, + MemoryPool* pool, std::shared_ptr* out) { + const auto& binary_array = static_cast(array); + const int64_t length = binary_array.length(); - const auto& decimal_type = static_cast(*type); - const int64_t type_length = decimal_type.byte_width(); + const auto& decimal_type = static_cast(*type); + const int64_t type_length = decimal_type.byte_width(); - std::shared_ptr data; - RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data)); + std::shared_ptr data; + RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data)); - // raw bytes that we can write to - uint8_t* out_ptr = data->mutable_data(); + // raw bytes that we can write to + uint8_t* out_ptr = data->mutable_data(); - const int64_t null_count = binary_array.null_count(); + const int64_t null_count = binary_array.null_count(); - // convert each BinaryArray value to valid decimal bytes - for (int64_t i = 0; i < length; i++, out_ptr += type_length) { - int32_t record_len = 0; - const uint8_t* record_loc = binary_array.GetValue(i, &record_len); + // convert each BinaryArray value to valid decimal bytes + for (int64_t i = 0; i < length; i++, out_ptr += type_length) { + int32_t record_len = 0; + const uint8_t* record_loc = binary_array.GetValue(i, &record_len); - if ((record_len < 0) || (record_len > type_length)) { - return Status::Invalid("Invalid BYTE_ARRAY size"); - } + if ((record_len < 0) || (record_len > type_length)) { + return Status::Invalid("Invalid BYTE_ARRAY size"); + } - auto out_ptr_view = reinterpret_cast(out_ptr); - out_ptr_view[0] = 0; - out_ptr_view[1] = 0; + auto out_ptr_view = reinterpret_cast(out_ptr); + out_ptr_view[0] = 0; + out_ptr_view[1] = 0; - // only convert rows that are not null if there are nulls, or - // all rows, if there are not - if (((null_count > 0) && !binary_array.IsNull(i)) || (null_count <= 0)) { - RawBytesToDecimalBytes(record_loc, record_len, out_ptr); - } + // only convert rows that are not null if there are nulls, or + // all rows, if there are not + if (((null_count > 0) && !binary_array.IsNull(i)) || (null_count <= 0)) { + RawBytesToDecimalBytes(record_loc, record_len, out_ptr); } + } + + *out = std::make_shared<::arrow::Decimal128Array>( + type, length, data, binary_array.null_bitmap(), null_count); + return Status::OK(); +} + +/// \brief Convert an arrow::BinaryArray to an arrow::Decimal128Array +/// We do this by: +/// 1. Creating an arrow::BinaryArray from the RecordReader's builder +/// 2. Allocating a buffer for the arrow::Decimal128Array +/// 3. Converting the big-endian bytes in each BinaryArray entry to two integers +/// representing the high and low bits of each decimal value. 
+template +struct TransferFunctor< + ArrowType, ParquetType, + typename std::enable_if::value && + (std::is_same::value || + std::is_same::value)>::type> { + Status operator()(RecordReader* reader, MemoryPool* pool, + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { + DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); - *out = std::make_shared<::arrow::Decimal128Array>( - type, length, data, binary_array.null_bitmap(), null_count); + ::arrow::ArrayVector chunks = reader->GetBuilderChunks(); + for (size_t i = 0; i < chunks.size(); ++i) { + std::shared_ptr chunk_as_decimal; + RETURN_NOT_OK( + ConvertToDecimal128(*chunks[i], type, pool, &chunk_as_decimal)); + + // Replace the chunk, which will hopefully also free memory as we go + chunks[i] = chunk_as_decimal; + } + *out = std::make_shared(chunks); return Status::OK(); } }; @@ -1295,7 +1355,7 @@ template ::value>::type> static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + Datum* out) { DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); const int64_t length = reader->values_written(); @@ -1342,8 +1402,7 @@ static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool, template <> struct TransferFunctor<::arrow::Decimal128Type, Int32Type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { return DecimalIntegerTransfer(reader, pool, type, out); } }; @@ -1351,23 +1410,23 @@ struct TransferFunctor<::arrow::Decimal128Type, Int32Type> { template <> struct TransferFunctor<::arrow::Decimal128Type, Int64Type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { return DecimalIntegerTransfer(reader, pool, type, out); } }; -#define TRANSFER_DATA(ArrowType, ParquetType) \ - TransferFunctor func; \ - RETURN_NOT_OK(func(record_reader_.get(), pool_, field_->type(), out)); \ - RETURN_NOT_OK(WrapIntoListArray(out)) +#define TRANSFER_DATA(ArrowType, ParquetType) \ + TransferFunctor func; \ + RETURN_NOT_OK(func(record_reader_.get(), pool_, field_->type(), &result)); \ + RETURN_NOT_OK(WrapIntoListArray(&result)) #define TRANSFER_CASE(ENUM, ArrowType, ParquetType) \ case ::arrow::Type::ENUM: { \ TRANSFER_DATA(ArrowType, ParquetType); \ } break; -Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr* out) { +Status PrimitiveImpl::NextBatch(int64_t records_to_read, + std::shared_ptr* out) { try { // Pre-allocation gives much better performance for flat columns record_reader_->Reserve(records_to_read); @@ -1387,6 +1446,7 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr* return ::arrow::Status::IOError(e.what()); } + Datum result; switch (field_->type()->id()) { TRANSFER_CASE(BOOL, ::arrow::BooleanType, BooleanType) TRANSFER_CASE(UINT8, ::arrow::UInt8Type, Int32Type) @@ -1405,8 +1465,8 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr* TRANSFER_CASE(DATE64, ::arrow::Date64Type, Int32Type) TRANSFER_CASE(FIXED_SIZE_BINARY, ::arrow::FixedSizeBinaryType, FLBAType) case ::arrow::Type::NA: { - *out = std::make_shared<::arrow::NullArray>(record_reader_->values_written()); - RETURN_NOT_OK(WrapIntoListArray(out)); + result = std::make_shared<::arrow::NullArray>(record_reader_->values_written()); + 
RETURN_NOT_OK(WrapIntoListArray(&result)); break; } case ::arrow::Type::DECIMAL: { @@ -1452,6 +1512,15 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr* return Status::NotImplemented(ss.str()); } + DCHECK_NE(result.kind(), Datum::NONE); + + if (result.kind() == Datum::ARRAY) { + *out = std::make_shared(result.make_array()); + } else if (result.kind() == Datum::CHUNKED_ARRAY) { + *out = result.chunked_array(); + } else { + DCHECK(false) << "Should be impossible"; + } return Status::OK(); } @@ -1477,10 +1546,17 @@ ColumnReader::ColumnReader(std::unique_ptr impl) ColumnReader::~ColumnReader() {} -Status ColumnReader::NextBatch(int64_t records_to_read, std::shared_ptr* out) { +Status ColumnReader::NextBatch(int64_t records_to_read, + std::shared_ptr* out) { return impl_->NextBatch(records_to_read, out); } +Status ColumnReader::NextBatch(int64_t records_to_read, std::shared_ptr* out) { + std::shared_ptr chunked_out; + RETURN_NOT_OK(impl_->NextBatch(records_to_read, &chunked_out)); + return GetSingleChunk(*chunked_out, out); +} + // StructImpl methods Status StructImpl::DefLevelsToNullArray(std::shared_ptr* null_bitmap_out, @@ -1565,17 +1641,21 @@ Status StructImpl::GetRepLevels(const int16_t** data, size_t* length) { return Status::NotImplemented("GetRepLevels is not implemented for struct"); } -Status StructImpl::NextBatch(int64_t records_to_read, std::shared_ptr* out) { +Status StructImpl::NextBatch(int64_t records_to_read, + std::shared_ptr* out) { std::vector> children_arrays; std::shared_ptr null_bitmap; int64_t null_count; // Gather children arrays and def levels for (auto& child : children_) { - std::shared_ptr child_array; + std::shared_ptr field; + RETURN_NOT_OK(child->NextBatch(records_to_read, &field)); - RETURN_NOT_OK(child->NextBatch(records_to_read, &child_array)); - children_arrays.push_back(child_array); + if (field->num_chunks() > 1) { + return Status::Invalid("Chunked field reads not yet supported with StructArray"); + } + children_arrays.push_back(field->chunk(0)); } RETURN_NOT_OK(DefLevelsToNullArray(&null_bitmap, &null_count)); @@ -1589,8 +1669,9 @@ Status StructImpl::NextBatch(int64_t records_to_read, std::shared_ptr* ou } } - *out = std::make_shared(field()->type(), struct_length, children_arrays, - null_bitmap, null_count); + auto result = std::make_shared(field()->type(), struct_length, + children_arrays, null_bitmap, null_count); + *out = std::make_shared(result); return Status::OK(); } @@ -1613,10 +1694,16 @@ RowGroupReader::~RowGroupReader() {} RowGroupReader::RowGroupReader(FileReader::Impl* impl, int row_group_index) : impl_(impl), row_group_index_(row_group_index) {} -Status ColumnChunkReader::Read(std::shared_ptr<::arrow::Array>* out) { +Status ColumnChunkReader::Read(std::shared_ptr<::arrow::ChunkedArray>* out) { return impl_->ReadColumnChunk(column_index_, row_group_index_, out); } +Status ColumnChunkReader::Read(std::shared_ptr<::arrow::Array>* out) { + std::shared_ptr chunked_out; + RETURN_NOT_OK(impl_->ReadColumnChunk(column_index_, row_group_index_, &chunked_out)); + return GetSingleChunk(*chunked_out, out); +} + ColumnChunkReader::~ColumnChunkReader() {} ColumnChunkReader::ColumnChunkReader(FileReader::Impl* impl, int row_group_index, diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 2cd94ca28fdcb..5286e742b08c1 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -30,6 +30,7 @@ namespace arrow { class Array; +class ChunkedArray; class MemoryPool; class 
RecordBatchReader; class Schema; @@ -125,6 +126,10 @@ class PARQUET_EXPORT FileReader { std::shared_ptr<::arrow::Schema>* out); // Read column as a whole into an Array. + ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::ChunkedArray>* out); + + /// \note Deprecated since 0.12 + ARROW_DEPRECATED("Use version with ChunkedArray output") ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::Array>* out); // NOTE: Experimental API @@ -139,27 +144,11 @@ class PARQUET_EXPORT FileReader { // 2 foo3 // // i=0 will read the entire foo struct, i=1 the foo2 primitive column etc - ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::Array>* out); + ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::ChunkedArray>* out); - // NOTE: Experimental API - // Reads a specific top level schema field into an Array, while keeping only chosen - // leaf columns. - // The index i refers the index of the top level schema field, which may - // be nested or flat, and indices vector refers to the leaf column indices - e.g. - // - // i indices - // 0 0 foo.bar - // 0 1 foo.bar.baz - // 0 2 foo.qux - // 1 3 foo2 - // 2 4 foo3 - // - // i=0 indices={0,2} will read a partial struct with foo.bar and foo.quox columns - // i=1 indices={3} will read foo2 column - // i=1 indices={2} will result in out=nullptr - // leaf indices which are unrelated to the schema field are ignored - ::arrow::Status ReadSchemaField(int i, const std::vector& indices, - std::shared_ptr<::arrow::Array>* out); + /// \note Deprecated since 0.12 + ARROW_DEPRECATED("Use version with ChunkedArray output") + ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::Array>* out); /// \brief Return a RecordBatchReader of row groups selected from row_group_indices, the /// ordering in row_group_indices matters. @@ -248,6 +237,10 @@ class PARQUET_EXPORT RowGroupReader { class PARQUET_EXPORT ColumnChunkReader { public: + ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out); + + /// \note Deprecated since 0.12 + ARROW_DEPRECATED("Use version with ChunkedArray output") ::arrow::Status Read(std::shared_ptr<::arrow::Array>* out); virtual ~ColumnChunkReader(); @@ -281,6 +274,11 @@ class PARQUET_EXPORT ColumnReader { // // Returns Status::OK on a successful read, including if you have exhausted // the data available in the file. 
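A hedged usage sketch of the ChunkedArray-returning overload declared just below; the deprecated Array overloads funnel through GetSingleChunk and now fail with Status::Invalid once a column no longer fits in a single chunk:

    #include <memory>
    #include "arrow/status.h"
    #include "arrow/table.h"
    #include "parquet/arrow/reader.h"

    arrow::Status ReadWholeColumn(parquet::arrow::FileReader* reader, int i) {
      std::shared_ptr<arrow::ChunkedArray> chunked;
      RETURN_NOT_OK(reader->ReadColumn(i, &chunked));
      // Large binary columns may now come back as several smaller chunks
      // (capped at 16MB each by the chunked binary builder).
      for (int c = 0; c < chunked->num_chunks(); ++c) {
        std::shared_ptr<arrow::Array> chunk = chunked->chunk(c);
        // ... process chunk ...
      }
      return arrow::Status::OK();
    }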
+ ::arrow::Status NextBatch(int64_t batch_size, + std::shared_ptr<::arrow::ChunkedArray>* out); + + /// \note Deprecated since 0.12 + ARROW_DEPRECATED("Use version with ChunkedArray output") ::arrow::Status NextBatch(int64_t batch_size, std::shared_ptr<::arrow::Array>* out); private: diff --git a/cpp/src/parquet/arrow/record_reader.cc b/cpp/src/parquet/arrow/record_reader.cc index 4a3cd526b118a..d1bf2c5cdfdc6 100644 --- a/cpp/src/parquet/arrow/record_reader.cc +++ b/cpp/src/parquet/arrow/record_reader.cc @@ -86,14 +86,6 @@ class RecordReader::RecordReaderImpl { valid_bits_ = AllocateBuffer(pool); def_levels_ = AllocateBuffer(pool); rep_levels_ = AllocateBuffer(pool); - - if (descr->physical_type() == Type::BYTE_ARRAY) { - builder_.reset(new ::arrow::BinaryBuilder(pool)); - } else if (descr->physical_type() == Type::FIXED_LEN_BYTE_ARRAY) { - int byte_width = descr->type_length(); - std::shared_ptr<::arrow::DataType> type = ::arrow::fixed_size_binary(byte_width); - builder_.reset(new ::arrow::FixedSizeBinaryBuilder(type, pool)); - } Reset(); } @@ -229,8 +221,6 @@ class RecordReader::RecordReaderImpl { return result; } - ::arrow::ArrayBuilder* builder() { return builder_.get(); } - // Process written repetition/definition levels to reach the end of // records. Process no more levels than necessary to delimit the indicated // number of logical records. Updates internal state of RecordReader @@ -375,7 +365,7 @@ class RecordReader::RecordReaderImpl { records_read_ = 0; - // Calling Finish on the builders also resets them + // Call Finish on the binary builders to reset them } void ResetValues() { @@ -391,6 +381,8 @@ class RecordReader::RecordReaderImpl { virtual void DebugPrintState() = 0; + virtual std::vector> GetBuilderChunks() = 0; + protected: virtual bool ReadNewPage() = 0; @@ -434,9 +426,6 @@ class RecordReader::RecordReaderImpl { int64_t levels_position_; int64_t levels_capacity_; - // TODO(wesm): ByteArray / FixedLenByteArray types - std::unique_ptr<::arrow::ArrayBuilder> builder_; - std::shared_ptr<::arrow::ResizableBuffer> values_; template @@ -449,13 +438,32 @@ class RecordReader::RecordReaderImpl { std::shared_ptr<::arrow::ResizableBuffer> rep_levels_; }; +template +struct RecordReaderTraits { + using BuilderType = ::arrow::ArrayBuilder; +}; + +template <> +struct RecordReaderTraits { + using BuilderType = ::arrow::internal::ChunkedBinaryBuilder; +}; + +template <> +struct RecordReaderTraits { + using BuilderType = ::arrow::FixedSizeBinaryBuilder; +}; + template class TypedRecordReader : public RecordReader::RecordReaderImpl { public: - typedef typename DType::c_type T; + using T = typename DType::c_type; - TypedRecordReader(const ColumnDescriptor* schema, ::arrow::MemoryPool* pool) - : RecordReader::RecordReaderImpl(schema, pool), current_decoder_(nullptr) {} + using BuilderType = typename RecordReaderTraits::BuilderType; + + TypedRecordReader(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool) + : RecordReader::RecordReaderImpl(descr, pool), current_decoder_(nullptr) { + InitializeBuilder(); + } void ResetDecoders() override { decoders_.clear(); } @@ -546,6 +554,10 @@ class TypedRecordReader : public RecordReader::RecordReaderImpl { std::cout << std::endl; } + std::vector> GetBuilderChunks() override { + throw ParquetException("GetChunks only implemented for binary types"); + } + private: typedef Decoder DecoderType; @@ -554,11 +566,15 @@ class TypedRecordReader : public RecordReader::RecordReaderImpl { // plain-encoded data. 
std::unordered_map> decoders_; + std::unique_ptr builder_; + DecoderType* current_decoder_; // Advance to the next data page bool ReadNewPage() override; + void InitializeBuilder() {} + void ConfigureDictionary(const DictionaryPage* page); }; @@ -572,6 +588,36 @@ void TypedRecordReader::DebugPrintState() {} template <> void TypedRecordReader::DebugPrintState() {} +template <> +void TypedRecordReader::InitializeBuilder() { + // Maximum of 16MB chunks + constexpr int32_t kBinaryChunksize = 1 << 24; + DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); + builder_.reset(new ::arrow::internal::ChunkedBinaryBuilder(kBinaryChunksize, pool_)); +} + +template <> +void TypedRecordReader::InitializeBuilder() { + DCHECK_EQ(descr_->physical_type(), Type::FIXED_LEN_BYTE_ARRAY); + int byte_width = descr_->type_length(); + std::shared_ptr<::arrow::DataType> type = ::arrow::fixed_size_binary(byte_width); + builder_.reset(new ::arrow::FixedSizeBinaryBuilder(type, pool_)); +} + +template <> +::arrow::ArrayVector TypedRecordReader::GetBuilderChunks() { + ::arrow::ArrayVector chunks; + PARQUET_THROW_NOT_OK(builder_->Finish(&chunks)); + return chunks; +} + +template <> +::arrow::ArrayVector TypedRecordReader::GetBuilderChunks() { + std::shared_ptr<::arrow::Array> chunk; + PARQUET_THROW_NOT_OK(builder_->Finish(&chunk)); + return ::arrow::ArrayVector({chunk}); +} + template <> inline void TypedRecordReader::ReadValuesDense(int64_t values_to_read) { auto values = ValuesHead(); @@ -579,10 +625,9 @@ inline void TypedRecordReader::ReadValuesDense(int64_t values_to_ current_decoder_->Decode(values, static_cast(values_to_read)); DCHECK_EQ(num_decoded, values_to_read); - auto builder = static_cast<::arrow::BinaryBuilder*>(builder_.get()); for (int64_t i = 0; i < num_decoded; i++) { PARQUET_THROW_NOT_OK( - builder->Append(values[i].ptr, static_cast(values[i].len))); + builder_->Append(values[i].ptr, static_cast(values[i].len))); } ResetValues(); } @@ -594,9 +639,8 @@ inline void TypedRecordReader::ReadValuesDense(int64_t values_to_read) current_decoder_->Decode(values, static_cast(values_to_read)); DCHECK_EQ(num_decoded, values_to_read); - auto builder = static_cast<::arrow::FixedSizeBinaryBuilder*>(builder_.get()); for (int64_t i = 0; i < num_decoded; i++) { - PARQUET_THROW_NOT_OK(builder->Append(values[i].ptr)); + PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr)); } ResetValues(); } @@ -613,14 +657,12 @@ inline void TypedRecordReader::ReadValuesSpaced(int64_t values_to valid_bits_offset); DCHECK_EQ(num_decoded, values_to_read); - auto builder = static_cast<::arrow::BinaryBuilder*>(builder_.get()); - for (int64_t i = 0; i < num_decoded; i++) { if (::arrow::BitUtil::GetBit(valid_bits, valid_bits_offset + i)) { PARQUET_THROW_NOT_OK( - builder->Append(values[i].ptr, static_cast(values[i].len))); + builder_->Append(values[i].ptr, static_cast(values[i].len))); } else { - PARQUET_THROW_NOT_OK(builder->AppendNull()); + PARQUET_THROW_NOT_OK(builder_->AppendNull()); } } ResetValues(); @@ -638,12 +680,11 @@ inline void TypedRecordReader::ReadValuesSpaced(int64_t values_to_read valid_bits_offset); DCHECK_EQ(num_decoded, values_to_read); - auto builder = static_cast<::arrow::FixedSizeBinaryBuilder*>(builder_.get()); for (int64_t i = 0; i < num_decoded; i++) { if (::arrow::BitUtil::GetBit(valid_bits, valid_bits_offset + i)) { - PARQUET_THROW_NOT_OK(builder->Append(values[i].ptr)); + PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr)); } else { - PARQUET_THROW_NOT_OK(builder->AppendNull()); + 
diff --git a/cpp/src/parquet/arrow/record_reader.h b/cpp/src/parquet/arrow/record_reader.h
index 7efd0d54899fe..0f62b744f323a 100644
--- a/cpp/src/parquet/arrow/record_reader.h
+++ b/cpp/src/parquet/arrow/record_reader.h
@@ -20,6 +20,7 @@
 #include <cstdint>
 #include <memory>
+#include <vector>

 #include "arrow/memory_pool.h"

@@ -28,7 +29,7 @@
 namespace arrow {

-class ArrayBuilder;
+class Array;

 }  // namespace arrow

@@ -77,7 +78,6 @@ class RecordReader {
   std::shared_ptr<ResizableBuffer> ReleaseValues();
   std::shared_ptr<ResizableBuffer> ReleaseIsValid();
-  ::arrow::ArrayBuilder* builder();

   /// \brief Number of values written including nulls (if any)
   int64_t values_written() const;
@@ -106,6 +106,9 @@ class RecordReader {

   void DebugPrintState();

+  // For BYTE_ARRAY, FIXED_LEN_BYTE_ARRAY types that may have chunked output
+  std::vector<std::shared_ptr<::arrow::Array>> GetBuilderChunks();
+
  private:
   std::unique_ptr<RecordReaderImpl> impl_;
   explicit RecordReader(RecordReaderImpl* impl);
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index 9e1a24961af0e..b63e72c57cfa8 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -19,7 +19,7 @@
 # cython: language_level = 3

 from pyarrow.includes.common cimport *
-from pyarrow.includes.libarrow cimport (CArray, CSchema, CStatus,
+from pyarrow.includes.libarrow cimport (CChunkedArray, CSchema, CStatus,
                                         CTable, CMemoryPool,
                                         CKeyValueMetadata,
                                         RandomAccessFile, OutputStream,
@@ -272,8 +272,8 @@ cdef extern from "parquet/arrow/reader.h" namespace "parquet::arrow" nogil:
     cdef cppclass FileReader:
         FileReader(CMemoryPool* pool, unique_ptr[ParquetFileReader] reader)

-        CStatus ReadColumn(int i, shared_ptr[CArray]* out)
-        CStatus ReadSchemaField(int i, shared_ptr[CArray]* out)
+        CStatus ReadColumn(int i, shared_ptr[CChunkedArray]* out)
+        CStatus ReadSchemaField(int i, shared_ptr[CChunkedArray]* out)

         int num_row_groups()
         CStatus ReadRowGroup(int i, shared_ptr[CTable]* out)
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 8112504e9e403..36a4d345c6a3d 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -26,6 +26,7 @@ from pyarrow.lib cimport (Array, Schema,
                           check_status,
                           MemoryPool, maybe_unbox_memory_pool,
                           Table,
+                          pyarrow_wrap_chunked_array,
                           pyarrow_wrap_schema,
                           pyarrow_wrap_table,
                           NativeFile, get_reader, get_writer)
@@ -770,28 +771,18 @@ cdef class ParquetReader:
             return self._column_idx_map[tobytes(column_name)]

     def read_column(self, int column_index):
-        cdef:
-            Array array = Array()
-            shared_ptr[CArray] carray
-
+        cdef shared_ptr[CChunkedArray] out
         with nogil:
             check_status(self.reader.get()
-                         .ReadColumn(column_index, &carray))
-
-        array.init(carray)
-        return array
+                         .ReadColumn(column_index, &out))
+        return pyarrow_wrap_chunked_array(out)

     def read_schema_field(self, int field_index):
-        cdef:
-            Array array = Array()
-            shared_ptr[CArray] carray
-
+        cdef shared_ptr[CChunkedArray] out
         with nogil:
             check_status(self.reader.get()
-                         .ReadSchemaField(field_index, &carray))
-
-        array.init(carray)
-        return array
+                         .ReadSchemaField(field_index, &out))
+        return pyarrow_wrap_chunked_array(out)


 cdef class ParquetWriter:
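With these changes, ParquetReader.read_column and read_schema_field hand back a
pyarrow.ChunkedArray rather than a single Array, so a binary column larger than 2 GB no
longer has to collapse into one contiguous array. A small round-trip sketch of the
user-visible behavior, using only public pyarrow APIs (the in-memory buffer and tiny
table are illustrative; the `table[0].data` spelling matches the regression test below):

    import io
    import pyarrow as pa
    import pyarrow.parquet as pq

    table = pa.Table.from_arrays([pa.array([b'a', b'bb', b'ccc'])], ['col'])
    buf = io.BytesIO()
    pq.write_table(table, buf)
    buf.seek(0)

    col = pq.read_table(buf)[0].data   # a pa.ChunkedArray after this patch
    for chunk in col.chunks:           # the underlying Array chunks
        print(len(chunk))
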
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 745a049e32a7c..3e628263ba36f 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -396,6 +396,8 @@ cdef object pyarrow_wrap_metadata(
 #
 cdef public object pyarrow_wrap_array(const shared_ptr[CArray]& sp_array)
+cdef public object pyarrow_wrap_chunked_array(
+    const shared_ptr[CChunkedArray]& sp_array)
 # XXX pyarrow.h calls it `wrap_record_batch`
 cdef public object pyarrow_wrap_batch(const shared_ptr[CRecordBatch]& cbatch)
 cdef public object pyarrow_wrap_buffer(const shared_ptr[CBuffer]& buf)
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 89d3224580463..5c27a9b86a369 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -1959,6 +1959,33 @@ def test_large_table_int32_overflow():
         _write_table(table, f)


+@pytest.mark.large_memory
+def test_binary_array_overflow_to_chunked():
+    # ARROW-3762
+
+    # 2^31 + 1 bytes
+    values = [b'x'] + [
+        b'x' * (1 << 20)
+    ] * 2 * (1 << 10)
+    df = pd.DataFrame({'byte_col': values})
+
+    tbl = pa.Table.from_pandas(df, preserve_index=False)
+
+    buf = io.BytesIO()
+    _write_table(tbl, buf)
+    buf.seek(0)
+    read_tbl = _read_table(buf)
+    buf = None
+
+    col0_data = read_tbl[0].data
+    assert isinstance(col0_data, pa.ChunkedArray)
+
+    # Values are split into 16 MB chunks: 2048 MB fills 128 chunks exactly,
+    # and the extra byte spills into a 129th
+    assert col0_data.num_chunks == 129
+
+    assert tbl.equals(read_tbl)
+
+
 def test_index_column_name_duplicate(tempdir):
     data = {
         'close': {

From ce12fb55107e2ee5439267fe1a17ded8d2210849 Mon Sep 17 00:00:00 2001
From: Pindikura Ravindra
Date: Sat, 15 Dec 2018 05:48:01 +0530
Subject: [PATCH 41/45] ARROW-1807: [Java] consolidate bufs to reduce heap
 (#3121)

- for fixed-len vectors, alloc a combined arrow buf for value and
  validity.
- Remove the read-write locks in AllocationMgr; they contribute about
  150 bytes to the heap and aren't very useful, since there isn't much
  contention.
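The layout change is easiest to picture outside of Java: the vector makes one backing
allocation and carves it into an aligned value region plus a validity region, so each
fixed-width vector costs a single allocation instead of two. A schematic sketch in
Python (illustration only — split_combined_buffer is an invented name, and the 8-byte
alignment rule mirrors the align() helper added below):

    def split_combined_buffer(value_size, validity_size):
        # One allocation backs both regions; the value region is padded
        # out to an 8-byte boundary, like align() in the Java code below.
        aligned = (value_size + 7) // 8 * 8
        backing = bytearray(aligned + validity_size)
        view = memoryview(backing)
        return view[:value_size], view[aligned:aligned + validity_size]

    # 16 four-byte values -> 64 value bytes plus 2 validity bytes (16 bits)
    values, validity = split_combined_buffer(16 * 4, (16 + 7) // 8)
    assert len(values) == 64 and len(validity) == 2
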
---
 .../arrow/memory/AllocationManager.java       | 34 ++-----
 .../arrow/vector/BaseFixedWidthVector.java    | 94 ++++++++++++-------
 .../vector/TestBufferOwnershipTransfer.java   |  5 +-
 .../apache/arrow/vector/TestListVector.java   | 10 +-
 4 files changed, 73 insertions(+), 70 deletions(-)

diff --git a/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java
index aaa1f506fb5c2..687674f951b89 100644
--- a/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java
+++ b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java
@@ -22,11 +22,8 @@
 import java.util.IdentityHashMap;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
-import java.util.concurrent.locks.ReadWriteLock;
-import java.util.concurrent.locks.ReentrantReadWriteLock;

 import org.apache.arrow.memory.BaseAllocator.Verbosity;
-import org.apache.arrow.memory.util.AutoCloseableLock;
 import org.apache.arrow.memory.util.HistoricalLog;
 import org.apache.arrow.util.Preconditions;

@@ -73,9 +70,6 @@ public class AllocationManager {
   // ARROW-1627 Trying to minimize memory overhead caused by previously used IdentityHashMap
   // see JIRA for details
   private final LowCostIdentityHashMap<BaseAllocator, BufferLedger> map = new LowCostIdentityHashMap<>();
-  private final ReadWriteLock lock = new ReentrantReadWriteLock();
-  private final AutoCloseableLock readLock = new AutoCloseableLock(lock.readLock());
-  private final AutoCloseableLock writeLock = new AutoCloseableLock(lock.writeLock());
   private final long amCreationTime = System.nanoTime();

   private volatile BufferLedger owningLedger;
@@ -115,9 +109,8 @@ private BufferLedger associate(final BaseAllocator allocator, final boolean retain) {
           "A buffer can only be associated between two allocators that share the same root.");
     }

-    try (AutoCloseableLock read = readLock.open()) {
-
-      final BufferLedger ledger = map.get(allocator);
+    synchronized (this) {
+      BufferLedger ledger = map.get(allocator);
       if (ledger != null) {
         if (retain) {
           ledger.inc();
@@ -125,20 +118,7 @@ private BufferLedger associate(final BaseAllocator allocator, final boolean retain) {
         return ledger;
       }
-    }
-    try (AutoCloseableLock write = writeLock.open()) {
-      // we have to recheck existing ledger since a second reader => writer could be competing
-      // with us.
-
-      final BufferLedger existingLedger = map.get(allocator);
-      if (existingLedger != null) {
-        if (retain) {
-          existingLedger.inc();
-        }
-        return existingLedger;
-      }
-
-      final BufferLedger ledger = new BufferLedger(allocator);
+      ledger = new BufferLedger(allocator);
       if (retain) {
         ledger.inc();
       }
@@ -153,7 +133,7 @@ private BufferLedger associate(final BaseAllocator allocator, final boolean retain) {
    * The way that a particular BufferLedger communicates back to the AllocationManager that it
    * no longer needs to hold
    * a reference to a particular piece of memory.
-   * Can only be called when you already hold the writeLock.
+   * Can only be called when you already hold the lock.
    */
   private void release(final BufferLedger ledger) {
     final BaseAllocator allocator = ledger.getAllocator();
@@ -250,7 +230,7 @@ public boolean transferBalance(final BufferLedger target) {
       // since two balance transfers out from the allocator manager could cause incorrect
       // accounting, we need to ensure
       // that this won't happen by synchronizing on the allocator manager instance.
-      try (AutoCloseableLock write = writeLock.open()) {
+      synchronized (this) {
         if (owningLedger != this) {
           return true;
         }
@@ -330,7 +310,7 @@ public int decrement(int decrement) {
       allocator.assertOpen();

       final int outcome;
-      try (AutoCloseableLock write = writeLock.open()) {
+      synchronized (this) {
         outcome = bufRefCnt.addAndGet(-decrement);
         if (outcome == 0) {
           lDestructionTime = System.nanoTime();
@@ -431,7 +411,7 @@ public int getSize() {
      * @return Amount of accounted(owned) memory associated with this ledger.
      */
     public int getAccountedSize() {
-      try (AutoCloseableLock read = readLock.open()) {
+      synchronized (this) {
         if (owningLedger == this) {
           return size;
         } else {
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java
index bc0b77a0aeb0a..f69a9d1754ac7 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java
@@ -270,7 +270,7 @@ public boolean allocateNewSafe() {
     long curAllocationSizeValue = valueAllocationSizeInBytes;
     long curAllocationSizeValidity = validityAllocationSizeInBytes;

-    if (curAllocationSizeValue > MAX_ALLOCATION_SIZE) {
+    if (align(curAllocationSizeValue) + curAllocationSizeValidity > MAX_ALLOCATION_SIZE) {
       throw new OversizedAllocationException("Requested amount of memory exceeds limit");
     }

@@ -302,7 +302,7 @@ public void allocateNew(int valueCount) {
       valueBufferSize = validityBufferSize;
     }

-    if (valueBufferSize > MAX_ALLOCATION_SIZE) {
+    if (align(valueBufferSize) + validityBufferSize > MAX_ALLOCATION_SIZE) {
       throw new OversizedAllocationException("Requested amount of memory is more than max allowed");
     }

@@ -317,6 +317,13 @@ public void allocateNew(int valueCount) {
     }
   }

+  /*
+   * Align to an 8-byte boundary.
+   */
+  private long align(long size) {
+    return ((size + 7) / 8) * 8;
+  }
+
   /**
    * Actual memory allocation is done by this function. All the calculations
    * and knowledge about what size to allocate is up to the callers of this
@@ -327,14 +334,24 @@ public void allocateNew(int valueCount) {
    * conditions.
    */
   private void allocateBytes(final long valueBufferSize, final long validityBufferSize) {
-    /* allocate data buffer */
-    int curSize = (int) valueBufferSize;
-    valueBuffer = allocator.buffer(curSize);
+    int valueBufferSlice = (int)align(valueBufferSize);
+    int validityBufferSlice = (int)validityBufferSize;
+
+    /* allocate combined buffer */
+    ArrowBuf buffer = allocator.buffer(valueBufferSlice + validityBufferSlice);
+
+    valueAllocationSizeInBytes = valueBufferSlice;
+    valueBuffer = buffer.slice(0, valueBufferSlice);
+    valueBuffer.retain();
     valueBuffer.readerIndex(0);
-    valueAllocationSizeInBytes = curSize;
-    /* allocate validity buffer */
-    allocateValidityBuffer((int) validityBufferSize);
+
+    validityAllocationSizeInBytes = validityBufferSlice;
+    validityBuffer = buffer.slice(valueBufferSlice, validityBufferSlice);
+    validityBuffer.retain();
+    validityBuffer.readerIndex(0);
     zeroVector();
+
+    buffer.release();
   }

   /**
@@ -422,43 +439,50 @@ public ArrowBuf[] getBuffers(boolean clear) {
    */
   @Override
   public void reAlloc() {
-    valueBuffer = reallocBufferHelper(valueBuffer, true);
-    validityBuffer = reallocBufferHelper(validityBuffer, false);
-  }
-
-  /**
-   * Helper method for reallocating a particular internal buffer
-   * Returns the new buffer.
- */ - private ArrowBuf reallocBufferHelper(ArrowBuf buffer, final boolean dataBuffer) { - final int currentBufferCapacity = buffer.capacity(); - long baseSize = (dataBuffer ? valueAllocationSizeInBytes - : validityAllocationSizeInBytes); + int valueBaseSize = Integer.max(valueBuffer.capacity(), valueAllocationSizeInBytes); + long newValueBufferSlice = align(valueBaseSize * 2L); + long newValidityBufferSlice; + if (typeWidth > 0) { + long targetValueBufferSize = align(BaseAllocator.nextPowerOfTwo(newValueBufferSlice)); + long targetValueCount = targetValueBufferSize / typeWidth; + targetValueBufferSize -= getValidityBufferSizeFromCount((int) targetValueCount); + if (newValueBufferSlice < targetValueBufferSize) { + newValueBufferSlice = targetValueBufferSize; + } - if (baseSize < (long) currentBufferCapacity) { - baseSize = (long) currentBufferCapacity; + newValidityBufferSlice = getValidityBufferSizeFromCount((int)(newValueBufferSlice / typeWidth)); + } else { + newValidityBufferSlice = newValueBufferSlice; } - long newAllocationSize = baseSize * 2L; - newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); + long newAllocationSize = newValueBufferSlice + newValidityBufferSlice; assert newAllocationSize >= 1; if (newAllocationSize > MAX_ALLOCATION_SIZE) { throw new OversizedAllocationException("Unable to expand the buffer"); } - final ArrowBuf newBuf = allocator.buffer((int) newAllocationSize); - newBuf.setBytes(0, buffer, 0, currentBufferCapacity); - newBuf.setZero(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); - buffer.release(1); - buffer = newBuf; - if (dataBuffer) { - valueAllocationSizeInBytes = (int) newAllocationSize; - } else { - validityAllocationSizeInBytes = (int) newAllocationSize; - } + final ArrowBuf newBuffer = allocator.buffer((int) newAllocationSize); + final ArrowBuf newValueBuffer = newBuffer.slice(0, (int)newValueBufferSlice); + newValueBuffer.setBytes(0, valueBuffer, 0, valueBuffer.capacity()); + newValueBuffer.setZero(valueBuffer.capacity(), (int)newValueBufferSlice - valueBuffer.capacity()); + newValueBuffer.retain(); + newValueBuffer.readerIndex(0); + valueBuffer.release(); + valueBuffer = newValueBuffer; + valueAllocationSizeInBytes = (int)newValueBufferSlice; + + final ArrowBuf newValidityBuffer = newBuffer.slice((int)newValueBufferSlice, + (int)newValidityBufferSlice); + newValidityBuffer.setBytes(0, validityBuffer, 0, validityBuffer.capacity()); + newValidityBuffer.setZero(validityBuffer.capacity(), (int)newValidityBufferSlice - validityBuffer.capacity()); + newValidityBuffer.retain(); + newValidityBuffer.readerIndex(0); + validityBuffer.release(); + validityBuffer = newValidityBuffer; + validityAllocationSizeInBytes = (int)newValidityBufferSlice; - return buffer; + newBuffer.release(); } @Override diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java b/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java index 48bc8936d9fbe..9165343bfdc2b 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java @@ -40,15 +40,14 @@ public void testTransferFixedWidth() { IntVector v1 = new IntVector("v1", childAllocator1); v1.allocateNew(); v1.setValueCount(4095); + long totalAllocatedMemory = childAllocator1.getAllocatedMemory(); IntVector v2 = new IntVector("v2", childAllocator2); v1.makeTransferPair(v2).transfer(); assertEquals(0, 
childAllocator1.getAllocatedMemory()); - int expectedBitVector = 512; - int expectedValueVector = 4096 * 4; - assertEquals(expectedBitVector + expectedValueVector, childAllocator2.getAllocatedMemory()); + assertEquals(totalAllocatedMemory, childAllocator2.getAllocatedMemory()); } @Test diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java index 4e8d8f0f39944..68102b1c32a46 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java @@ -774,13 +774,13 @@ public void testSetInitialCapacity() { vector.setInitialCapacity(512); vector.allocateNew(); assertEquals(512, vector.getValueCapacity()); - assertEquals(4096, vector.getDataVector().getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 512 * 5); /* use density as 4 */ vector.setInitialCapacity(512, 4); vector.allocateNew(); assertEquals(512, vector.getValueCapacity()); - assertEquals(512 * 4, vector.getDataVector().getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 512 * 4); /** * inner value capacity we pass to data vector is 512 * 0.1 => 51 @@ -793,7 +793,7 @@ public void testSetInitialCapacity() { vector.setInitialCapacity(512, 0.1); vector.allocateNew(); assertEquals(512, vector.getValueCapacity()); - assertEquals(64, vector.getDataVector().getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 51); /** * inner value capacity we pass to data vector is 512 * 0.01 => 5 @@ -806,7 +806,7 @@ public void testSetInitialCapacity() { vector.setInitialCapacity(512, 0.01); vector.allocateNew(); assertEquals(512, vector.getValueCapacity()); - assertEquals(8, vector.getDataVector().getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 5); /** * inner value capacity we pass to data vector is 5 * 0.1 => 0 @@ -822,7 +822,7 @@ public void testSetInitialCapacity() { vector.setInitialCapacity(5, 0.1); vector.allocateNew(); assertEquals(7, vector.getValueCapacity()); - assertEquals(1, vector.getDataVector().getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 1); } } From e098651a12f8199936f48f523e4f062a411969f7 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 15 Dec 2018 17:17:09 +0100 Subject: [PATCH 42/45] ARROW-3971: [Python] Remove deprecations in 0.11 and prior Author: Wes McKinney Closes #3180 from wesm/ARROW-3971 and squashes the following commits: 2a367f5d Remove Python deprecations in 0.11 and prior --- python/pyarrow/_parquet.pyx | 2 +- python/pyarrow/feather.py | 6 ----- python/pyarrow/filesystem.py | 5 +---- python/pyarrow/formatting.py | 43 ------------------------------------ python/pyarrow/parquet.py | 15 +++++-------- python/pyarrow/util.py | 16 +++++--------- 6 files changed, 14 insertions(+), 73 deletions(-) delete mode 100644 python/pyarrow/formatting.py diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 36a4d345c6a3d..2e92bac9a74d8 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -32,8 +32,8 @@ from pyarrow.lib cimport (Array, Schema, NativeFile, get_reader, get_writer) from pyarrow.compat import tobytes, frombytes -from pyarrow.formatting import indent from pyarrow.lib import ArrowException, NativeFile, _stringify_path +from pyarrow.util import indent import six import warnings diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index 
930e999a56116..faa2f7d892ee0 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -20,7 +20,6 @@ import six import pandas as pd -import warnings from pyarrow.compat import pdapi from pyarrow.lib import FeatherError # noqa @@ -44,11 +43,6 @@ def __init__(self, source): self.source = source self.open(source) - def read(self, *args, **kwargs): - warnings.warn("read has been deprecated. Use read_pandas instead.", - FutureWarning, stacklevel=2) - return self.read_pandas(*args, **kwargs) - def read_table(self, columns=None): if columns is None: return self._read() diff --git a/python/pyarrow/filesystem.py b/python/pyarrow/filesystem.py index f1d0eec3f8df5..8188a2607e21a 100644 --- a/python/pyarrow/filesystem.py +++ b/python/pyarrow/filesystem.py @@ -148,8 +148,7 @@ def _isfilestore(self): raise NotImplementedError def read_parquet(self, path, columns=None, metadata=None, schema=None, - use_threads=True, nthreads=None, - use_pandas_metadata=False): + use_threads=True, use_pandas_metadata=False): """ Read Parquet data from path in file system. Can read from a single file or a directory of files @@ -176,8 +175,6 @@ def read_parquet(self, path, columns=None, metadata=None, schema=None, table : pyarrow.Table """ from pyarrow.parquet import ParquetDataset - from pyarrow.util import _deprecate_nthreads - use_threads = _deprecate_nthreads(use_threads, nthreads) dataset = ParquetDataset(path, schema=schema, metadata=metadata, filesystem=self) return dataset.read(columns=columns, use_threads=use_threads, diff --git a/python/pyarrow/formatting.py b/python/pyarrow/formatting.py deleted file mode 100644 index 5ef9482ed144c..0000000000000 --- a/python/pyarrow/formatting.py +++ /dev/null @@ -1,43 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -# Pretty-printing and other formatting utilities for Arrow data structures - -import pyarrow.lib as lib -import warnings - -try: - from textwrap import indent -except ImportError: - def indent(text, prefix): - return ''.join(prefix + line for line in text.splitlines(True)) - - -def array_format(arr, window=10): - warnings.warn("array_format is deprecated, use Array.format() instead", - FutureWarning) - return arr.format(window=window) - - -def value_format(x, indent_level=0): - warnings.warn("value_format is deprecated", - FutureWarning) - if isinstance(x, lib.ListValue): - contents = ',\n'.join(value_format(item) for item in x) - return '[{0}]'.format(indent(contents, ' ').strip()) - else: - return repr(x) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 3ebfc8c0517ce..b89145adc4433 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -35,7 +35,7 @@ from pyarrow.compat import guid from pyarrow.filesystem import (LocalFileSystem, _ensure_filesystem, _get_fs_from_path) -from pyarrow.util import _is_path_like, _stringify_path, _deprecate_nthreads +from pyarrow.util import _is_path_like, _stringify_path def _check_contains_null(val): @@ -135,8 +135,8 @@ def schema(self): def num_row_groups(self): return self.reader.num_row_groups - def read_row_group(self, i, columns=None, nthreads=None, - use_threads=True, use_pandas_metadata=False): + def read_row_group(self, i, columns=None, use_threads=True, + use_pandas_metadata=False): """ Read a single row group from a Parquet file @@ -157,7 +157,6 @@ def read_row_group(self, i, columns=None, nthreads=None, pyarrow.table.Table Content of the row group as a table (of columns) """ - use_threads = _deprecate_nthreads(use_threads, nthreads) column_indices = self._get_column_indices( columns, use_pandas_metadata=use_pandas_metadata) return self.reader.read_row_group(i, column_indices=column_indices, @@ -1071,9 +1070,7 @@ def _make_manifest(path_or_paths, fs, pathsep='/', metadata_nthreads=1): def read_table(source, columns=None, use_threads=True, metadata=None, - use_pandas_metadata=False, memory_map=True, - nthreads=None): - use_threads = _deprecate_nthreads(use_threads, nthreads) + use_pandas_metadata=False, memory_map=True): if _is_path_like(source): fs = _get_fs_from_path(source) return fs.read_parquet(source, columns=columns, @@ -1094,8 +1091,8 @@ def read_table(source, columns=None, use_threads=True, metadata=None, Content of the file as a table (of columns)""") -def read_pandas(source, columns=None, use_threads=True, - memory_map=True, nthreads=None, metadata=None): +def read_pandas(source, columns=None, use_threads=True, memory_map=True, + metadata=None): return read_table(source, columns=columns, use_threads=use_threads, metadata=metadata, memory_map=True, diff --git a/python/pyarrow/util.py b/python/pyarrow/util.py index 1c26ee5e22f73..7cf57d88380e9 100644 --- a/python/pyarrow/util.py +++ b/python/pyarrow/util.py @@ -20,6 +20,12 @@ import six import warnings +try: + from textwrap import indent +except ImportError: + def indent(text, prefix): + return ''.join(prefix + line for line in text.splitlines(True)) + try: # pathlib might not be available try: @@ -72,13 +78,3 @@ def _stringify_path(path): return str(path) raise TypeError("not a path-like object") - - -def _deprecate_nthreads(use_threads, nthreads): - if nthreads is not None: - warnings.warn("`nthreads` argument is deprecated, " - "pass `use_threads` instead", FutureWarning, - stacklevel=3) - if nthreads > 1: - use_threads = True - return 
use_threads From 537aa2fabaad04455dbffceb77d5589230db3cea Mon Sep 17 00:00:00 2001 From: Yosuke Shiro Date: Sat, 15 Dec 2018 17:22:16 +0100 Subject: [PATCH 43/45] ARROW-4037: [Packaging] Remove workaround to verify 0.11.0 Author: Yosuke Shiro Closes #3182 from shiro615/packaging-remove-workaround and squashes the following commits: 60ae617b Remove workaround for 0.11.0 --- dev/release/verify-release-candidate.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 57b1850337067..45404b03dfb8a 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -211,8 +211,6 @@ test_glib() { gem install bundler fi - # Workaround for 0.11.0. 0.11.0 doesn't include c_glib/Gemfile. - wget https://raw.githubusercontent.com/apache/arrow/master/c_glib/Gemfile bundle install --path vendor/bundle bundle exec ruby test/run-test.rb From 23dfc1c5b1e303aa4ed699970c68235e319aa3d8 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 15 Dec 2018 17:27:28 +0100 Subject: [PATCH 44/45] ARROW-4006: Add CODE_OF_CONDUCT.md Many people are not aware that The ASF has a code of conduct. Having this document in the root directory will increase awareness of the type of professional behavior we expect from members of our community. Author: Wes McKinney Closes #3179 from wesm/ARROW-4006 and squashes the following commits: 6e88d8ab Add CODE_OF_CONDUCT.md --- CODE_OF_CONDUCT.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 CODE_OF_CONDUCT.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000..2efe740b77c50 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,24 @@ + + +# Code of Conduct + +* [Code of Conduct for The Apache Software Foundation][1] + +[1]: https://www.apache.org/foundation/policies/conduct.html \ No newline at end of file From 0936938e875c77c80f34b92884a30ff7fceeddcb Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Sat, 15 Dec 2018 20:19:31 +0100 Subject: [PATCH 45/45] ARROW-4039: [Python] Update link to 'development.rst' page from Python README.md Author: Benjamin Kietzman Closes #3185 from bkietz/ARROW-4039-update-development-link and squashes the following commits: a18596e7 Update README.md --- python/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/README.md b/python/README.md index 7d66dddd87c77..ce696939929f9 100644 --- a/python/README.md +++ b/python/README.md @@ -83,6 +83,6 @@ pip install -r ../docs/requirements.txt python setup.py build_sphinx -s ../docs/source ``` -[2]: https://github.com/apache/arrow/blob/master/python/doc/source/development.rst +[2]: https://github.com/apache/arrow/blob/master/docs/source/python/development.rst [3]: https://github.com/pandas-dev/pandas [4]: https://docs.pytest.org/en/latest/