From 5704d8d644154f21597125d14d20c9e9ae4379d0 Mon Sep 17 00:00:00 2001 From: Kengo Seki Date: Sun, 9 Dec 2018 11:30:20 -0600 Subject: [PATCH 001/328] ARROW-3940: [Python/Documentation] Add required packages to the development instruction I mistakenly closed #3102 so I'll submit the revised PR. @wesm would you take a look at this? Author: Kengo Seki Closes #3126 from sekikn/ARROW-3940-2 and squashes the following commits: 15e369eb0 ARROW-3940: Add required packages to the development instruction --- docs/source/python/development.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/source/python/development.rst b/docs/source/python/development.rst index e86a0be0d04a4..4258feef79f44 100644 --- a/docs/source/python/development.rst +++ b/docs/source/python/development.rst @@ -125,9 +125,13 @@ dependencies will be automatically built by Arrow's third-party toolchain. libboost-filesystem-dev \ libboost-system-dev \ libboost-regex-dev \ + python-dev \ + autoconf \ flex \ bison +If you are building Arrow for Python 3, install ``python3-dev`` instead of ``python-dev``. + On Arch Linux, you can get these dependencies via pacman. .. code-block:: shell @@ -185,6 +189,12 @@ Now build and install the Arrow C++ libraries: If you don't want to build and install the Plasma in-memory object store, you can omit the ``-DARROW_PLASMA=on`` flag. +Also, if multiple versions of Python are installed in your environment, +you may have to pass additional parameters to cmake so that +it can find the right executable, headers and libraries. +For example, specifying `-DPYTHON_EXECUTABLE=$VIRTUAL_ENV/bin/python` +(assuming that you're in virtualenv) enables cmake to choose +the python executable which you are using. .. note:: @@ -227,6 +237,7 @@ libraries), one can set ``--bundle-arrow-cpp``: .. 
code-block:: shell + pip install wheel # if not installed python setup.py build_ext --build-type=$ARROW_BUILD_TYPE \ --with-parquet --with-plasma --bundle-arrow-cpp bdist_wheel From 1dee3f4e794ead69490073cb0e7d99cb6cf1169f Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Sun, 9 Dec 2018 13:28:34 -0600 Subject: [PATCH 002/328] ARROW-3303: [C++] API for creating arrays from simple JSON string Author: Antoine Pitrou Closes #3084 from pitrou/ARROW-3303-json-values and squashes the following commits: 1b9f4b510 ARROW-3303: API for creating arrays from simple JSON string --- cpp/CMakeLists.txt | 5 + cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/array-dict-test.cc | 281 +++------- cpp/src/arrow/array-test.cc | 17 + cpp/src/arrow/builder.h | 3 + cpp/src/arrow/ipc/CMakeLists.txt | 2 + cpp/src/arrow/ipc/ipc-json-simple-test.cc | 594 ++++++++++++++++++++++ cpp/src/arrow/ipc/json-internal.h | 1 + cpp/src/arrow/ipc/json-simple.cc | 508 ++++++++++++++++++ cpp/src/arrow/ipc/json-simple.h | 56 ++ cpp/src/arrow/pretty_print-test.cc | 76 +-- cpp/src/arrow/test-util.cc | 13 +- cpp/src/arrow/test-util.h | 12 + cpp/src/arrow/util/decimal.cc | 14 +- cpp/src/arrow/util/decimal.h | 5 + 15 files changed, 1299 insertions(+), 289 deletions(-) create mode 100644 cpp/src/arrow/ipc/ipc-json-simple-test.cc create mode 100644 cpp/src/arrow/ipc/json-simple.cc create mode 100644 cpp/src/arrow/ipc/json-simple.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6deb339f4c2f0..68ac84e42dd6a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -416,6 +416,11 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) set(ARROW_WITH_ZSTD ON) endif() +if(ARROW_BUILD_TESTS) + # JSON parsing of arrays is required for Arrow unit tests + set(ARROW_IPC ON) +endif() + if(PARQUET_BUILD_EXAMPLES OR PARQUET_BUILD_EXECUTABLES) set(ARROW_PARQUET ON) set(ARROW_BUILD_STATIC ON) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 6858f3c4c4fbe..8e932680de034 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -142,6 +142,7 @@ if (ARROW_IPC) ipc/feather.cc ipc/json.cc ipc/json-internal.cc + ipc/json-simple.cc ipc/message.cc ipc/metadata-internal.cc ipc/reader.cc diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc index 4c8dcc067b8c5..cc471a3e54066 100644 --- a/cpp/src/arrow/array-dict-test.cc +++ b/cpp/src/arrow/array-dict-test.cc @@ -60,54 +60,31 @@ TYPED_TEST(TestDictionaryBuilder, Basic) { ASSERT_OK(builder.Finish(&result)); // Build expected data - NumericBuilder dict_builder; - ASSERT_OK(dict_builder.Append(static_cast(1))); - ASSERT_OK(dict_builder.Append(static_cast(2))); - std::shared_ptr dict_array; - ASSERT_OK(dict_builder.Finish(&dict_array)); - auto dtype = std::make_shared(int8(), dict_array); + auto dict_array = ArrayFromJSON(std::make_shared(), "[1, 2]"); + auto dict_type = std::make_shared(int8(), dict_array); - Int8Builder int_builder; - ASSERT_OK(int_builder.Append(0)); - ASSERT_OK(int_builder.Append(1)); - ASSERT_OK(int_builder.Append(0)); - std::shared_ptr int_array; - ASSERT_OK(int_builder.Finish(&int_array)); + auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]"); + DictionaryArray expected(dict_type, int_array); - DictionaryArray expected(dtype, int_array); ASSERT_TRUE(expected.Equals(result)); } TYPED_TEST(TestDictionaryBuilder, ArrayConversion) { - NumericBuilder builder; - // DictionaryBuilder builder; - ASSERT_OK(builder.Append(static_cast(1))); - ASSERT_OK(builder.Append(static_cast(2))); - 
ASSERT_OK(builder.Append(static_cast(1))); + auto type = std::make_shared(); - std::shared_ptr intermediate_result; - ASSERT_OK(builder.Finish(&intermediate_result)); + auto intermediate_result = ArrayFromJSON(type, "[1, 2, 1]"); DictionaryBuilder dictionary_builder(default_memory_pool()); ASSERT_OK(dictionary_builder.AppendArray(*intermediate_result)); std::shared_ptr result; ASSERT_OK(dictionary_builder.Finish(&result)); // Build expected data - NumericBuilder dict_builder; - ASSERT_OK(dict_builder.Append(static_cast(1))); - ASSERT_OK(dict_builder.Append(static_cast(2))); - std::shared_ptr dict_array; - ASSERT_OK(dict_builder.Finish(&dict_array)); - auto dtype = std::make_shared(int8(), dict_array); + auto dict_array = ArrayFromJSON(type, "[1, 2]"); + auto dict_type = std::make_shared(int8(), dict_array); - Int8Builder int_builder; - ASSERT_OK(int_builder.Append(0)); - ASSERT_OK(int_builder.Append(1)); - ASSERT_OK(int_builder.Append(0)); - std::shared_ptr int_array; - ASSERT_OK(int_builder.Finish(&int_array)); + auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]"); + DictionaryArray expected(dict_type, int_array); - DictionaryArray expected(dtype, int_array); ASSERT_TRUE(expected.Equals(result)); } @@ -150,120 +127,74 @@ TYPED_TEST(TestDictionaryBuilder, DoubleTableSize) { } TYPED_TEST(TestDictionaryBuilder, DeltaDictionary) { + using c_type = typename TypeParam::c_type; + auto type = std::make_shared(); + DictionaryBuilder builder(default_memory_pool()); - ASSERT_OK(builder.Append(static_cast(1))); - ASSERT_OK(builder.Append(static_cast(2))); - ASSERT_OK(builder.Append(static_cast(1))); - ASSERT_OK(builder.Append(static_cast(2))); + ASSERT_OK(builder.Append(static_cast(1))); + ASSERT_OK(builder.Append(static_cast(2))); + ASSERT_OK(builder.Append(static_cast(1))); + ASSERT_OK(builder.Append(static_cast(2))); std::shared_ptr result; FinishAndCheckPadding(&builder, &result); // Build expected data for the initial dictionary - NumericBuilder dict_builder1; - ASSERT_OK(dict_builder1.Append(static_cast(1))); - ASSERT_OK(dict_builder1.Append(static_cast(2))); - std::shared_ptr dict_array1; - ASSERT_OK(dict_builder1.Finish(&dict_array1)); - auto dtype1 = std::make_shared(int8(), dict_array1); - - Int8Builder int_builder1; - ASSERT_OK(int_builder1.Append(0)); - ASSERT_OK(int_builder1.Append(1)); - ASSERT_OK(int_builder1.Append(0)); - ASSERT_OK(int_builder1.Append(1)); - std::shared_ptr int_array1; - ASSERT_OK(int_builder1.Finish(&int_array1)); + auto dict_type1 = dictionary(int8(), ArrayFromJSON(type, "[1, 2]")); + DictionaryArray expected(dict_type1, ArrayFromJSON(int8(), "[0, 1, 0, 1]")); - DictionaryArray expected(dtype1, int_array1); ASSERT_TRUE(expected.Equals(result)); // extend the dictionary builder with new data - ASSERT_OK(builder.Append(static_cast(2))); - ASSERT_OK(builder.Append(static_cast(3))); - ASSERT_OK(builder.Append(static_cast(3))); - ASSERT_OK(builder.Append(static_cast(1))); - ASSERT_OK(builder.Append(static_cast(3))); + ASSERT_OK(builder.Append(static_cast(2))); + ASSERT_OK(builder.Append(static_cast(3))); + ASSERT_OK(builder.Append(static_cast(3))); + ASSERT_OK(builder.Append(static_cast(1))); + ASSERT_OK(builder.Append(static_cast(3))); std::shared_ptr result_delta; ASSERT_OK(builder.Finish(&result_delta)); // Build expected data for the delta dictionary - NumericBuilder dict_builder2; - ASSERT_OK(dict_builder2.Append(static_cast(3))); - std::shared_ptr dict_array2; - ASSERT_OK(dict_builder2.Finish(&dict_array2)); - auto dtype2 = std::make_shared(int8(), dict_array2); - 
- Int8Builder int_builder2; - ASSERT_OK(int_builder2.Append(1)); - ASSERT_OK(int_builder2.Append(2)); - ASSERT_OK(int_builder2.Append(2)); - ASSERT_OK(int_builder2.Append(0)); - ASSERT_OK(int_builder2.Append(2)); - std::shared_ptr int_array2; - ASSERT_OK(int_builder2.Finish(&int_array2)); + auto dict_type2 = dictionary(int8(), ArrayFromJSON(type, "[3]")); + DictionaryArray expected_delta(dict_type2, ArrayFromJSON(int8(), "[1, 2, 2, 0, 2]")); - DictionaryArray expected_delta(dtype2, int_array2); ASSERT_TRUE(expected_delta.Equals(result_delta)); } TYPED_TEST(TestDictionaryBuilder, DoubleDeltaDictionary) { + using c_type = typename TypeParam::c_type; + auto type = std::make_shared(); + DictionaryBuilder builder(default_memory_pool()); - ASSERT_OK(builder.Append(static_cast(1))); - ASSERT_OK(builder.Append(static_cast(2))); - ASSERT_OK(builder.Append(static_cast(1))); - ASSERT_OK(builder.Append(static_cast(2))); + ASSERT_OK(builder.Append(static_cast(1))); + ASSERT_OK(builder.Append(static_cast(2))); + ASSERT_OK(builder.Append(static_cast(1))); + ASSERT_OK(builder.Append(static_cast(2))); std::shared_ptr result; FinishAndCheckPadding(&builder, &result); // Build expected data for the initial dictionary - NumericBuilder dict_builder1; - ASSERT_OK(dict_builder1.Append(static_cast(1))); - ASSERT_OK(dict_builder1.Append(static_cast(2))); - std::shared_ptr dict_array1; - ASSERT_OK(dict_builder1.Finish(&dict_array1)); - auto dtype1 = std::make_shared(int8(), dict_array1); - - Int8Builder int_builder1; - ASSERT_OK(int_builder1.Append(0)); - ASSERT_OK(int_builder1.Append(1)); - ASSERT_OK(int_builder1.Append(0)); - ASSERT_OK(int_builder1.Append(1)); - std::shared_ptr int_array1; - ASSERT_OK(int_builder1.Finish(&int_array1)); + auto dict_type1 = dictionary(int8(), ArrayFromJSON(type, "[1, 2]")); + DictionaryArray expected(dict_type1, ArrayFromJSON(int8(), "[0, 1, 0, 1]")); - DictionaryArray expected(dtype1, int_array1); ASSERT_TRUE(expected.Equals(result)); // extend the dictionary builder with new data - ASSERT_OK(builder.Append(static_cast(2))); - ASSERT_OK(builder.Append(static_cast(3))); - ASSERT_OK(builder.Append(static_cast(3))); - ASSERT_OK(builder.Append(static_cast(1))); - ASSERT_OK(builder.Append(static_cast(3))); + ASSERT_OK(builder.Append(static_cast(2))); + ASSERT_OK(builder.Append(static_cast(3))); + ASSERT_OK(builder.Append(static_cast(3))); + ASSERT_OK(builder.Append(static_cast(1))); + ASSERT_OK(builder.Append(static_cast(3))); std::shared_ptr result_delta1; ASSERT_OK(builder.Finish(&result_delta1)); // Build expected data for the delta dictionary - NumericBuilder dict_builder2; - ASSERT_OK(dict_builder2.Append(static_cast(3))); - std::shared_ptr dict_array2; - ASSERT_OK(dict_builder2.Finish(&dict_array2)); - auto dtype2 = std::make_shared(int8(), dict_array2); - - Int8Builder int_builder2; - ASSERT_OK(int_builder2.Append(1)); - ASSERT_OK(int_builder2.Append(2)); - ASSERT_OK(int_builder2.Append(2)); - ASSERT_OK(int_builder2.Append(0)); - ASSERT_OK(int_builder2.Append(2)); - std::shared_ptr int_array2; - ASSERT_OK(int_builder2.Finish(&int_array2)); + auto dict_type2 = dictionary(int8(), ArrayFromJSON(type, "[3]")); + DictionaryArray expected_delta1(dict_type2, ArrayFromJSON(int8(), "[1, 2, 2, 0, 2]")); - DictionaryArray expected_delta1(dtype2, int_array2); ASSERT_TRUE(expected_delta1.Equals(result_delta1)); // extend the dictionary builder with new data again @@ -277,23 +208,9 @@ TYPED_TEST(TestDictionaryBuilder, DoubleDeltaDictionary) { ASSERT_OK(builder.Finish(&result_delta2)); // 
Build expected data for the delta dictionary again - NumericBuilder dict_builder3; - ASSERT_OK(dict_builder3.Append(static_cast(4))); - ASSERT_OK(dict_builder3.Append(static_cast(5))); - std::shared_ptr dict_array3; - ASSERT_OK(dict_builder3.Finish(&dict_array3)); - auto dtype3 = std::make_shared(int8(), dict_array3); - - Int8Builder int_builder3; - ASSERT_OK(int_builder3.Append(0)); - ASSERT_OK(int_builder3.Append(1)); - ASSERT_OK(int_builder3.Append(2)); - ASSERT_OK(int_builder3.Append(3)); - ASSERT_OK(int_builder3.Append(4)); - std::shared_ptr int_array3; - ASSERT_OK(int_builder3.Finish(&int_array3)); + auto dict_type3 = dictionary(int8(), ArrayFromJSON(type, "[4, 5]")); + DictionaryArray expected_delta2(dict_type3, ArrayFromJSON(int8(), "[0, 1, 2, 3, 4]")); - DictionaryArray expected_delta2(dtype3, int_array3); ASSERT_TRUE(expected_delta2.Equals(result_delta2)); } @@ -308,21 +225,10 @@ TEST(TestStringDictionaryBuilder, Basic) { ASSERT_OK(builder.Finish(&result)); // Build expected data - StringBuilder str_builder; - ASSERT_OK(str_builder.Append("test")); - ASSERT_OK(str_builder.Append("test2")); - std::shared_ptr str_array; - ASSERT_OK(str_builder.Finish(&str_array)); - auto dtype = std::make_shared(int8(), str_array); - - Int8Builder int_builder; - ASSERT_OK(int_builder.Append(0)); - ASSERT_OK(int_builder.Append(1)); - ASSERT_OK(int_builder.Append(0)); - std::shared_ptr int_array; - ASSERT_OK(int_builder.Finish(&int_array)); - + auto dtype = dictionary(int8(), ArrayFromJSON(utf8(), "[\"test\", \"test2\"]")); + auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]"); DictionaryArray expected(dtype, int_array); + ASSERT_TRUE(expected.Equals(result)); } @@ -373,21 +279,10 @@ TEST(TestStringDictionaryBuilder, DeltaDictionary) { ASSERT_OK(builder.Finish(&result)); // Build expected data - StringBuilder str_builder1; - ASSERT_OK(str_builder1.Append("test")); - ASSERT_OK(str_builder1.Append("test2")); - std::shared_ptr str_array1; - ASSERT_OK(str_builder1.Finish(&str_array1)); - auto dtype1 = std::make_shared(int8(), str_array1); - - Int8Builder int_builder1; - ASSERT_OK(int_builder1.Append(0)); - ASSERT_OK(int_builder1.Append(1)); - ASSERT_OK(int_builder1.Append(0)); - std::shared_ptr int_array1; - ASSERT_OK(int_builder1.Finish(&int_array1)); + auto dtype = dictionary(int8(), ArrayFromJSON(utf8(), "[\"test\", \"test2\"]")); + auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]"); + DictionaryArray expected(dtype, int_array); - DictionaryArray expected(dtype1, int_array1); ASSERT_TRUE(expected.Equals(result)); // build a delta dictionary @@ -399,20 +294,10 @@ TEST(TestStringDictionaryBuilder, DeltaDictionary) { FinishAndCheckPadding(&builder, &result_delta); // Build expected data - StringBuilder str_builder2; - ASSERT_OK(str_builder2.Append("test3")); - std::shared_ptr str_array2; - ASSERT_OK(str_builder2.Finish(&str_array2)); - auto dtype2 = std::make_shared(int8(), str_array2); - - Int8Builder int_builder2; - ASSERT_OK(int_builder2.Append(1)); - ASSERT_OK(int_builder2.Append(2)); - ASSERT_OK(int_builder2.Append(1)); - std::shared_ptr int_array2; - ASSERT_OK(int_builder2.Finish(&int_array2)); - + auto dtype2 = dictionary(int8(), ArrayFromJSON(utf8(), "[\"test3\"]")); + auto int_array2 = ArrayFromJSON(int8(), "[1, 2, 1]"); DictionaryArray expected_delta(dtype2, int_array2); + ASSERT_TRUE(expected_delta.Equals(result_delta)); } @@ -647,7 +532,7 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, InvalidTypeAppend) { TEST(TestDecimalDictionaryBuilder, Basic) { // Build the dictionary Array - const auto& 
decimal_type = arrow::decimal(2, 0); + auto decimal_type = arrow::decimal(2, 0); DictionaryBuilder builder(decimal_type, default_memory_pool()); // Test data @@ -660,20 +545,9 @@ TEST(TestDecimalDictionaryBuilder, Basic) { ASSERT_OK(builder.Finish(&result)); // Build expected data - FixedSizeBinaryBuilder decimal_builder(decimal_type); - ASSERT_OK(decimal_builder.Append(Decimal128(12).ToBytes())); - ASSERT_OK(decimal_builder.Append(Decimal128(11).ToBytes())); + auto dtype = dictionary(int8(), ArrayFromJSON(decimal_type, "[\"12\", \"11\"]")); + DictionaryArray expected(dtype, ArrayFromJSON(int8(), "[0, 0, 1, 0]")); - std::shared_ptr decimal_array; - ASSERT_OK(decimal_builder.Finish(&decimal_array)); - auto dtype = arrow::dictionary(int8(), decimal_array); - - Int8Builder int_builder; - ASSERT_OK(int_builder.AppendValues({0, 0, 1, 0})); - std::shared_ptr int_array; - ASSERT_OK(int_builder.Finish(&int_array)); - - DictionaryArray expected(dtype, int_array); ASSERT_TRUE(expected.Equals(result)); } @@ -758,26 +632,20 @@ TEST(TestDictionary, Basics) { TEST(TestDictionary, Equals) { vector is_valid = {true, true, false, true, true, true}; + std::shared_ptr dict, dict2, indices, indices2, indices3; - std::shared_ptr dict; - vector dict_values = {"foo", "bar", "baz"}; - ArrayFromVector(dict_values, &dict); + dict = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]"); std::shared_ptr dict_type = dictionary(int16(), dict); - std::shared_ptr dict2; - vector dict2_values = {"foo", "bar", "baz", "qux"}; - ArrayFromVector(dict2_values, &dict2); + dict2 = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\", \"qux\"]"); std::shared_ptr dict2_type = dictionary(int16(), dict2); - std::shared_ptr indices; vector indices_values = {1, 2, -1, 0, 2, 0}; ArrayFromVector(is_valid, indices_values, &indices); - std::shared_ptr indices2; vector indices2_values = {1, 2, 0, 0, 2, 0}; ArrayFromVector(is_valid, indices2_values, &indices2); - std::shared_ptr indices3; vector indices3_values = {1, 1, 0, 0, 2, 0}; ArrayFromVector(is_valid, indices3_values, &indices3); @@ -825,17 +693,10 @@ TEST(TestDictionary, Equals) { } TEST(TestDictionary, Validate) { - vector is_valid = {true, true, false, true, true, true}; - - std::shared_ptr dict; - vector dict_values = {"foo", "bar", "baz"}; - ArrayFromVector(dict_values, &dict); + auto dict = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]"); std::shared_ptr dict_type = dictionary(int16(), dict); - std::shared_ptr indices; - vector indices_values = {1, 2, 0, 0, 2, 0}; - ArrayFromVector(is_valid, indices_values, &indices); - + auto indices = ArrayFromJSON(int16(), "[1, 2, null, 0, 2, 0]"); std::shared_ptr arr = std::make_shared(dict_type, indices); // Only checking index type for now @@ -857,28 +718,20 @@ TEST(TestDictionary, Validate) { } TEST(TestDictionary, FromArray) { - std::shared_ptr dict; - vector dict_values = {"foo", "bar", "baz"}; - ArrayFromVector(dict_values, &dict); + auto dict = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]"); std::shared_ptr dict_type = dictionary(int16(), dict); - std::shared_ptr indices1; - vector indices_values1 = {1, 2, 0, 0, 2, 0}; - ArrayFromVector(indices_values1, &indices1); - - std::shared_ptr indices2; - vector indices_values2 = {1, 2, 0, 3, 2, 0}; - ArrayFromVector(indices_values2, &indices2); + auto indices1 = ArrayFromJSON(int16(), "[1, 2, 0, 0, 2, 0]"); + auto indices2 = ArrayFromJSON(int16(), "[1, 2, 0, 3, 2, 0]"); + // Invalid index is masked by null std::shared_ptr indices3; vector is_valid3 = {true, true, false, true, true, 
true}; vector indices_values3 = {1, 2, -1, 0, 2, 0}; ArrayFromVector(is_valid3, indices_values3, &indices3); - std::shared_ptr indices4; - vector is_valid4 = {true, true, false, true, true, true}; - vector indices_values4 = {1, 2, 1, 3, 2, 0}; - ArrayFromVector(is_valid4, indices_values4, &indices4); + // Index out of bounds + auto indices4 = ArrayFromJSON(int16(), "[1, 2, null, 3, 2, 0]"); std::shared_ptr arr1, arr2, arr3, arr4; ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices1, &arr1)); diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 1a88740a4ac08..de0885e6f5f3a 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -246,6 +246,23 @@ TEST_F(TestArray, BuildLargeInMemoryArray) { TEST_F(TestArray, TestCopy) {} +// ---------------------------------------------------------------------- +// Null type tests + +TEST(TestNullBuilder, Basics) { + NullBuilder builder; + std::shared_ptr array; + + ASSERT_OK(builder.AppendNull()); + ASSERT_OK(builder.Append(nullptr)); + ASSERT_OK(builder.AppendNull()); + ASSERT_OK(builder.Finish(&array)); + + const auto& null_array = checked_cast(*array); + ASSERT_EQ(null_array.length(), 3); + ASSERT_EQ(null_array.null_count(), 3); +} + // ---------------------------------------------------------------------- // Primitive type tests diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 34398eebebfb6..607fa1745a5a0 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -20,6 +20,7 @@ #include // IWYU pragma: keep #include +#include #include #include #include @@ -235,6 +236,8 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder { return Status::OK(); } + Status Append(std::nullptr_t value) { return AppendNull(); } + Status FinishInternal(std::shared_ptr* out) override; }; diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 9c384c3e9901c..40cebf1823e2c 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -20,6 +20,7 @@ ADD_ARROW_TEST(feather-test) ADD_ARROW_TEST(ipc-read-write-test) +ADD_ARROW_TEST(ipc-json-simple-test) ADD_ARROW_TEST(ipc-json-test) if (NOT ARROW_BOOST_HEADER_ONLY) @@ -84,6 +85,7 @@ install(FILES dictionary.h feather.h json.h + json-simple.h message.h reader.h writer.h diff --git a/cpp/src/arrow/ipc/ipc-json-simple-test.cc b/cpp/src/arrow/ipc/ipc-json-simple-test.cc new file mode 100644 index 0000000000000..45525212d2f4b --- /dev/null +++ b/cpp/src/arrow/ipc/ipc-json-simple-test.cc @@ -0,0 +1,594 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
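// A brief usage sketch of the API these tests exercise (the helper name is
// illustrative; it assumes the headers included just below): a target Arrow
// type plus a JSON string yield a ready-made Array, with JSON nulls mapped to
// null slots.
static arrow::Status MakeExampleInt32Array(std::shared_ptr<arrow::Array>* out) {
  return arrow::ipc::internal::json::ArrayFromJSON(arrow::int32(), "[1, 2, null, 4]",
                                                   out);
}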
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/array.h" +#include "arrow/ipc/json-simple.h" +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" + +#if defined(_MSC_VER) +// "warning C4307: '+': integral constant overflow" +#pragma warning(disable : 4307) +#endif + +namespace arrow { +namespace ipc { +namespace internal { +namespace json { + +using ::arrow::internal::checked_cast; + +// Avoid undefined behaviour on signed overflow +template +Signed SafeSignedAdd(Signed u, Signed v) { + using Unsigned = typename std::make_unsigned::type; + return static_cast(static_cast(u) + static_cast(v)); +} + +// Special case for 8-bit ints (must output their decimal value, not the +// corresponding ASCII character) +void JSONArrayInternal(std::ostream* ss, int8_t value) { + *ss << static_cast(value); +} + +void JSONArrayInternal(std::ostream* ss, uint8_t value) { + *ss << static_cast(value); +} + +template +void JSONArrayInternal(std::ostream* ss, const Value& value) { + *ss << value; +} + +template +void JSONArrayInternal(std::ostream* ss, const Value& value, Tail... tail) { + JSONArrayInternal(ss, value); + *ss << ", "; + JSONArrayInternal(ss, std::forward(tail)...); +} + +template +std::string JSONArray(Args... args) { + std::stringstream ss; + ss << "["; + JSONArrayInternal(&ss, std::forward(args)...); + ss << "]"; + return ss.str(); +} + +template +void AssertJSONArray(const std::shared_ptr& type, const std::string& json, + const std::vector& values) { + std::shared_ptr actual, expected; + + ASSERT_OK(ArrayFromJSON(type, json, &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector(type, values, &expected); + AssertArraysEqual(*expected, *actual); +} + +template +void AssertJSONArray(const std::shared_ptr& type, const std::string& json, + const std::vector& is_valid, + const std::vector& values) { + std::shared_ptr actual, expected; + + ASSERT_OK(ArrayFromJSON(type, json, &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector(type, is_valid, values, &expected); + AssertArraysEqual(*expected, *actual); +} + +TEST(TestHelper, JSONArray) { + // Test the JSONArray helper func + std::string s = + JSONArray(123, -4.5, static_cast(-12), static_cast(34)); + ASSERT_EQ(s, "[123, -4.5, -12, 34]"); + s = JSONArray(9223372036854775807LL, 9223372036854775808ULL, -9223372036854775807LL - 1, + 18446744073709551615ULL); + ASSERT_EQ(s, + "[9223372036854775807, 9223372036854775808, -9223372036854775808, " + "18446744073709551615]"); +} + +TEST(TestHelper, SafeSignedAdd) { + ASSERT_EQ(0, SafeSignedAdd(-128, -128)); + ASSERT_EQ(1, SafeSignedAdd(-128, -127)); + ASSERT_EQ(-128, SafeSignedAdd(1, 127)); + ASSERT_EQ(-2147483648LL, SafeSignedAdd(1, 2147483647)); +} + +template +class TestIntegers : public ::testing::Test {}; + +TYPED_TEST_CASE_P(TestIntegers); + +TYPED_TEST_P(TestIntegers, Basics) { + using T = TypeParam; + using c_type = typename T::c_type; + + std::shared_ptr expected, actual; + std::shared_ptr type = TypeTraits::type_singleton(); + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[4, 0, 5]", {4, 0, 5}); + AssertJSONArray(type, "[4, null, 5]", {true, false, true}, {4, 0, 5}); + + // Test limits + const auto min_val = std::numeric_limits::min(); + const auto max_val = std::numeric_limits::max(); + std::string json_string = JSONArray(0, 1, min_val); + AssertJSONArray(type, json_string, {0, 1, min_val}); + json_string = 
JSONArray(0, 1, max_val); + AssertJSONArray(type, json_string, {0, 1, max_val}); +} + +TYPED_TEST_P(TestIntegers, Errors) { + using T = TypeParam; + + std::shared_ptr array; + std::shared_ptr type = TypeTraits::type_singleton(); + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "0", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "{}", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0.0]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"0\"]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]", &array)); +} + +TYPED_TEST_P(TestIntegers, OutOfBounds) { + using T = TypeParam; + using c_type = typename T::c_type; + + std::shared_ptr array; + std::shared_ptr type = TypeTraits::type_singleton(); + + if (type->id() == Type::UINT64) { + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[18446744073709551616]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[-1]", &array)); + } else if (type->id() == Type::INT64) { + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[9223372036854775808]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[-9223372036854775809]", &array)); + } else if (std::is_signed::value) { + const auto lower = SafeSignedAdd(std::numeric_limits::min(), -1); + const auto upper = SafeSignedAdd(std::numeric_limits::max(), +1); + auto json_string = JSONArray(lower); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, json_string, &array)); + json_string = JSONArray(upper); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, json_string, &array)); + } else { + const auto upper = static_cast(std::numeric_limits::max()) + 1; + auto json_string = JSONArray(upper); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, json_string, &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[-1]", &array)); + } +} + +REGISTER_TYPED_TEST_CASE_P(TestIntegers, Basics, Errors, OutOfBounds); + +INSTANTIATE_TYPED_TEST_CASE_P(TestInt8, TestIntegers, Int8Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt16, TestIntegers, Int16Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt32, TestIntegers, Int32Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestInt64, TestIntegers, Int64Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt8, TestIntegers, UInt8Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt16, TestIntegers, UInt16Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt32, TestIntegers, UInt32Type); +INSTANTIATE_TYPED_TEST_CASE_P(TestUInt64, TestIntegers, UInt64Type); + +TEST(TestNull, Basics) { + std::shared_ptr type = null(); + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[null, null]", {nullptr, nullptr}); +} + +TEST(TestNull, Errors) { + std::shared_ptr type = null(); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[NaN]", &array)); +} + +TEST(TestBoolean, Basics) { + std::shared_ptr type = boolean(); + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[false, true, false]", {false, true, false}); + AssertJSONArray(type, "[false, true, null]", {true, true, false}, + {false, true, false}); +} + +TEST(TestBoolean, Errors) { + std::shared_ptr type = boolean(); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"true\"]", &array)); +} + +TEST(TestFloat, Basics) { + 
std::shared_ptr type = float32(); + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[1, 2.5, -3e4]", {1.0f, 2.5f, -3.0e4f}); + AssertJSONArray(type, "[-0.0, Inf, -Inf, null]", {true, true, true, false}, + {-0.0f, INFINITY, -INFINITY, 0.0f}); + + // Check NaN separately as AssertArraysEqual simply memcmp's array contents + // and NaNs can have many bit representations. + ASSERT_OK(ArrayFromJSON(type, "[NaN]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + float value = checked_cast(*actual).Value(0); + ASSERT_TRUE(std::isnan(value)); +} + +TEST(TestFloat, Errors) { + std::shared_ptr type = float32(); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[true]", &array)); +} + +TEST(TestDouble, Basics) { + std::shared_ptr type = float64(); + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[1, 2.5, -3e4]", {1.0, 2.5, -3.0e4}); + AssertJSONArray(type, "[-0.0, Inf, -Inf, null]", {true, true, true, false}, + {-0.0, INFINITY, -INFINITY, 0.0}); + + ASSERT_OK(ArrayFromJSON(type, "[NaN]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + double value = checked_cast(*actual).Value(0); + ASSERT_TRUE(std::isnan(value)); +} + +TEST(TestDouble, Errors) { + std::shared_ptr type = float64(); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[true]", &array)); +} + +TEST(TestString, Basics) { + std::shared_ptr type = utf8(); + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[\"\", \"foo\"]", {"", "foo"}); + AssertJSONArray(type, "[\"\", null]", {true, false}, {"", ""}); + // NUL character in string + std::string s = "some"; + s += '\x00'; + s += "char"; + AssertJSONArray(type, "[\"\", \"some\\u0000char\"]", {"", s}); +} + +TEST(TestString, Errors) { + std::shared_ptr type = utf8(); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[]]", &array)); +} + +TEST(TestDecimal, Basics) { + std::shared_ptr type = decimal(10, 4); + std::shared_ptr expected, actual; + + ASSERT_OK(ArrayFromJSON(type, "[]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + { + Decimal128Builder builder(type); + ASSERT_OK(builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[\"123.4567\", \"-78.9000\"]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + { + Decimal128Builder builder(type); + ASSERT_OK(builder.Append(Decimal128(1234567))); + ASSERT_OK(builder.Append(Decimal128(-789000))); + ASSERT_OK(builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[\"123.4567\", null]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + { + Decimal128Builder builder(type); + ASSERT_OK(builder.Append(Decimal128(1234567))); + ASSERT_OK(builder.AppendNull()); + ASSERT_OK(builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); +} + +TEST(TestDecimal, Errors) { + std::shared_ptr type = decimal(10, 4); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[12.3456]", &array)); + // Bad scale + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"12.345\"]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"12.34560\"]", &array)); +} + +TEST(TestList, IntegerList) { + auto pool = default_memory_pool(); + std::shared_ptr type = list(int64()); + std::shared_ptr offsets, values, 
expected, actual; + + ASSERT_OK(ArrayFromJSON(type, "[]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector({0}, &offsets); + ArrayFromVector({}, &values); + ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &expected)); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[[4, 5], [], [6]]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector({0, 2, 2, 3}, &offsets); + ArrayFromVector({4, 5, 6}, &values); + ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &expected)); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[[], [null], [6, null]]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector({0, 0, 1, 3}, &offsets); + auto is_valid = std::vector{false, true, false}; + ArrayFromVector(is_valid, {0, 6, 0}, &values); + ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &expected)); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[null, [], null]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append()); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); +} + +TEST(TestList, IntegerListErrors) { + std::shared_ptr type = list(int64()); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0.0]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[9223372036854775808]]", &array)); +} + +TEST(TestList, NullList) { + auto pool = default_memory_pool(); + std::shared_ptr type = list(null()); + std::shared_ptr offsets, values, expected, actual; + + ASSERT_OK(ArrayFromJSON(type, "[]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector({0}, &offsets); + values = std::make_shared(0); + ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &expected)); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[[], [null], [null, null]]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector({0, 0, 1, 3}, &offsets); + values = std::make_shared(3); + ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &expected)); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[null, [], null]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append()); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); +} + +TEST(TestList, IntegerListList) { + auto pool = default_memory_pool(); + std::shared_ptr type = list(list(uint8())); + std::shared_ptr offsets, values, nested, expected, actual; + + ASSERT_OK(ArrayFromJSON(type, "[[[4], [5, 6]], [[7, 8, 9]]]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector({0, 1, 3, 6}, &offsets); + ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); + ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &nested)); + ArrayFromVector({0, 2, 3}, &offsets); + ASSERT_OK(ListArray::FromArrays(*offsets, *nested, pool, &expected)); + ASSERT_EQ(actual->length(), 2); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[[], 
[[]], [[4], [], [5, 6]], [[7, 8, 9]]]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector({0, 0, 1, 1, 3, 6}, &offsets); + ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); + ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool, &nested)); + ArrayFromVector({0, 0, 1, 4, 5}, &offsets); + ASSERT_OK(ListArray::FromArrays(*offsets, *nested, pool, &expected)); + ASSERT_EQ(actual->length(), 4); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[null, [null], [[null]]]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + auto& child_builder = checked_cast(*list_builder.value_builder()); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append()); + ASSERT_OK(child_builder.AppendNull()); + ASSERT_OK(list_builder.Append()); + ASSERT_OK(child_builder.Append()); + ASSERT_OK(list_builder.Finish(&expected)); + } +} + +TEST(TestStruct, SimpleStruct) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + std::shared_ptr type = struct_({field_a, field_b}); + std::shared_ptr a, b, expected, actual; + std::shared_ptr null_bitmap; + std::vector is_valid; + std::vector> children; + + // Trivial + ASSERT_OK(ArrayFromJSON(type, "[]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector({}, &a); + ArrayFromVector({}, &b); + children.assign({a, b}); + expected = std::make_shared(type, 0, children); + AssertArraysEqual(*expected, *actual); + + // Non-empty + ArrayFromVector({5, 6}, &a); + ArrayFromVector({true, false}, &b); + children.assign({a, b}); + expected = std::make_shared(type, 2, children); + + ASSERT_OK(ArrayFromJSON(type, "[[5, true], [6, false]]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + AssertArraysEqual(*expected, *actual); + ASSERT_OK(ArrayFromJSON(type, "[{\"a\": 5, \"b\": true}, {\"b\": false, \"a\": 6}]", + &actual)); + ASSERT_OK(ValidateArray(*actual)); + AssertArraysEqual(*expected, *actual); + + // With nulls + is_valid = {false, true, false, false}; + ArrayFromVector(is_valid, {0, 5, 6, 0}, &a); + is_valid = {false, false, true, false}; + ArrayFromVector(is_valid, {false, true, false, false}, &b); + children.assign({a, b}); + BitmapFromVector({false, true, true, true}, &null_bitmap); + expected = std::make_shared(type, 4, children, null_bitmap, 1); + + ASSERT_OK( + ArrayFromJSON(type, "[null, [5, null], [null, false], [null, null]]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + AssertArraysEqual(*expected, *actual); + // When using object notation, null members can be omitted + ASSERT_OK(ArrayFromJSON(type, "[null, {\"a\": 5, \"b\": null}, {\"b\": false}, {}]", + &actual)); + ASSERT_OK(ValidateArray(*actual)); + AssertArraysEqual(*expected, *actual); +} + +TEST(TestStruct, NestedStruct) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + auto field_c = field("c", float64()); + std::shared_ptr nested_type = struct_({field_a, field_b}); + auto field_nested = field("nested", nested_type); + std::shared_ptr type = struct_({field_nested, field_c}); + std::shared_ptr expected, actual; + std::shared_ptr null_bitmap; + std::vector is_valid; + std::vector> children(2); + + ASSERT_OK(ArrayFromJSON(type, "[]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector({}, &children[0]); + ArrayFromVector({}, &children[1]); + children[0] = std::make_shared(nested_type, 0, children); + ArrayFromVector({}, &children[1]); + expected = 
std::make_shared(type, 0, children); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[[[5, true], 1.5], [[6, false], -3e2]]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + ArrayFromVector({5, 6}, &children[0]); + ArrayFromVector({true, false}, &children[1]); + children[0] = std::make_shared(nested_type, 2, children); + ArrayFromVector({1.5, -300.0}, &children[1]); + expected = std::make_shared(type, 2, children); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK(ArrayFromJSON(type, "[null, [[5, null], null], [null, -3e2]]", &actual)); + ASSERT_OK(ValidateArray(*actual)); + is_valid = {false, true, false}; + ArrayFromVector(is_valid, {0, 5, 0}, &children[0]); + is_valid = {false, false, false}; + ArrayFromVector(is_valid, {false, false, false}, &children[1]); + BitmapFromVector({false, true, false}, &null_bitmap); + children[0] = std::make_shared(nested_type, 3, children, null_bitmap, 2); + is_valid = {false, false, true}; + ArrayFromVector(is_valid, {0.0, 0.0, -300.0}, &children[1]); + BitmapFromVector({false, true, true}, &null_bitmap); + expected = std::make_shared(type, 3, children, null_bitmap, 1); + AssertArraysEqual(*expected, *actual); +} + +TEST(TestStruct, Errors) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + std::shared_ptr type = struct_({field_a, field_b}); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0, true]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0, true, 1]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[true, 0]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[{\"b\": 0, \"a\": true}]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[{\"c\": 0}]", &array)); +} + +} // namespace json +} // namespace internal +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/json-internal.h b/cpp/src/arrow/ipc/json-internal.h index 8807a56551789..5516e2dd72a2e 100644 --- a/cpp/src/arrow/ipc/json-internal.h +++ b/cpp/src/arrow/ipc/json-internal.h @@ -36,6 +36,7 @@ #include "rapidjson/document.h" // IWYU pragma: export #include "rapidjson/encodings.h" // IWYU pragma: export +#include "rapidjson/error/en.h" // IWYU pragma: export #include "rapidjson/stringbuffer.h" // IWYU pragma: export #include "rapidjson/writer.h" // IWYU pragma: export diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc new file mode 100644 index 0000000000000..b69bd76f51611 --- /dev/null +++ b/cpp/src/arrow/ipc/json-simple.cc @@ -0,0 +1,508 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
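// Implementation overview: ArrayFromJSON() parses the input with RapidJSON
// (full-precision numbers, NaN/Inf literals allowed), GetConverter() picks a
// per-type Converter wrapping the matching ArrayBuilder, AppendValues() walks
// the top-level JSON array (list and struct converters recurse into their
// children), and Converter::Finish() hands back the built Array.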
+ +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/ipc/json-internal.h" +#include "arrow/ipc/json-simple.h" +#include "arrow/memory_pool.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/decimal.h" +#include "arrow/util/logging.h" +#include "arrow/util/string_view.h" + +namespace arrow { +namespace ipc { +namespace internal { +namespace json { + +using ::arrow::internal::checked_cast; + +static constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag; + +static Status JSONTypeError(const char* expected_type, rj::Type json_type) { + std::stringstream ss; + ss << "Expected " << expected_type << " or null, got type " << json_type; + return Status::Invalid(ss.str()); +} + +class Converter { + public: + virtual ~Converter() = default; + + virtual Status Init() { return Status::OK(); } + + virtual Status AppendValue(const rj::Value& json_obj) = 0; + + virtual Status AppendNull() = 0; + + virtual Status AppendValues(const rj::Value& json_array) = 0; + + virtual std::shared_ptr builder() = 0; + + virtual Status Finish(std::shared_ptr* out) { + auto builder = this->builder(); + if (builder->length() == 0) { + // Make sure the builder was initialized + RETURN_NOT_OK(builder->Resize(1)); + } + return builder->Finish(out); + } + + protected: + std::shared_ptr type_; +}; + +Status GetConverter(const std::shared_ptr&, std::shared_ptr* out); + +// CRTP +template +class ConcreteConverter : public Converter { + public: + Status AppendValues(const rj::Value& json_array) override { + auto self = static_cast(this); + if (!json_array.IsArray()) { + return JSONTypeError("array", json_array.GetType()); + } + auto size = json_array.Size(); + for (uint32_t i = 0; i < size; ++i) { + RETURN_NOT_OK(self->AppendValue(json_array[i])); + } + return Status::OK(); + } +}; + +// TODO : dates and times? +// TODO : binary / fixed size binary? 
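// Note: each concrete converter below derives from ConcreteConverter<T>
// (CRTP), which supplies the shared AppendValues() loop -- it checks that the
// JSON value is an array and forwards every element to the derived class's
// AppendValue() -- so a converter essentially only implements AppendValue(),
// AppendNull() and builder().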
+ +// ------------------------------------------------------------------------ +// Converter for null arrays + +class NullConverter : public ConcreteConverter { + public: + explicit NullConverter(const std::shared_ptr& type) { + type_ = type; + builder_ = std::make_shared(); + } + + Status AppendNull() override { return builder_->AppendNull(); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return builder_->AppendNull(); + } + return JSONTypeError("null", json_obj.GetType()); + } + + std::shared_ptr builder() override { return builder_; } + + protected: + std::shared_ptr builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for boolean arrays + +class BooleanConverter : public ConcreteConverter { + public: + explicit BooleanConverter(const std::shared_ptr& type) { + type_ = type; + builder_ = std::make_shared(); + } + + Status AppendNull() override { return builder_->AppendNull(); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return builder_->AppendNull(); + } + if (json_obj.IsBool()) { + return builder_->Append(json_obj.GetBool()); + } + return JSONTypeError("boolean", json_obj.GetType()); + } + + std::shared_ptr builder() override { return builder_; } + + protected: + std::shared_ptr builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for int arrays + +template +class IntegerConverter : public ConcreteConverter> { + using c_type = typename Type::c_type; + static constexpr auto is_signed = std::is_signed::value; + + public: + explicit IntegerConverter(const std::shared_ptr& type) { + this->type_ = type; + builder_ = std::make_shared>(); + } + + Status AppendNull() override { return builder_->AppendNull(); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return builder_->AppendNull(); + } + return AppendNumber(json_obj); + } + + std::shared_ptr builder() override { return builder_; } + + protected: + // Append signed integer value + template + typename std::enable_if::value, Status>::type AppendNumber( + const rj::Value& json_obj) { + if (json_obj.IsInt64()) { + int64_t v64 = json_obj.GetInt64(); + c_type v = static_cast(v64); + if (v == v64) { + return builder_->Append(v); + } else { + std::stringstream ss; + ss << "Value " << v64 << " out of bounds for " << this->type_->ToString(); + return Status::Invalid(ss.str()); + } + } else { + return JSONTypeError("signed int", json_obj.GetType()); + } + } + + // Append unsigned integer value + template + typename std::enable_if::value, Status>::type AppendNumber( + const rj::Value& json_obj) { + if (json_obj.IsUint64()) { + uint64_t v64 = json_obj.GetUint64(); + c_type v = static_cast(v64); + if (v == v64) { + return builder_->Append(v); + } else { + std::stringstream ss; + ss << "Value " << v64 << " out of bounds for " << this->type_->ToString(); + return Status::Invalid(ss.str()); + } + return builder_->Append(v); + } else { + return JSONTypeError("unsigned int", json_obj.GetType()); + } + } + + std::shared_ptr> builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for float arrays + +template +class FloatConverter : public ConcreteConverter> { + using c_type = typename Type::c_type; + + public: + explicit FloatConverter(const std::shared_ptr& type) { + this->type_ = type; + builder_ = std::make_shared>(); + } + + Status AppendNull() override { return 
builder_->AppendNull(); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return builder_->AppendNull(); + } + if (json_obj.IsNumber()) { + c_type v = static_cast(json_obj.GetDouble()); + return builder_->Append(v); + } else { + return JSONTypeError("number", json_obj.GetType()); + } + } + + std::shared_ptr builder() override { return builder_; } + + protected: + std::shared_ptr> builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for decimal arrays + +class DecimalConverter : public ConcreteConverter { + public: + explicit DecimalConverter(const std::shared_ptr& type) { + this->type_ = type; + decimal_type_ = checked_cast(type.get()); + builder_ = std::make_shared(type); + } + + Status AppendNull() override { return builder_->AppendNull(); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return builder_->AppendNull(); + } + if (json_obj.IsString()) { + int32_t precision, scale; + Decimal128 d; + auto view = util::string_view(json_obj.GetString(), json_obj.GetStringLength()); + RETURN_NOT_OK(Decimal128::FromString(view, &d, &precision, &scale)); + if (scale != decimal_type_->scale()) { + std::stringstream ss; + ss << "Invalid scale for decimal: expected " << decimal_type_->scale() << ", got " + << scale; + return Status::Invalid(ss.str()); + } + return builder_->Append(d); + } + return JSONTypeError("decimal string", json_obj.GetType()); + } + + std::shared_ptr builder() override { return builder_; } + + protected: + std::shared_ptr builder_; + Decimal128Type* decimal_type_; +}; + +// ------------------------------------------------------------------------ +// Converter for string arrays + +class StringConverter : public ConcreteConverter { + public: + explicit StringConverter(const std::shared_ptr& type) { + this->type_ = type; + builder_ = std::make_shared(type, default_memory_pool()); + } + + Status AppendNull() override { return builder_->AppendNull(); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return builder_->AppendNull(); + } + if (json_obj.IsString()) { + auto view = util::string_view(json_obj.GetString(), json_obj.GetStringLength()); + return builder_->Append(view); + } else { + return JSONTypeError("string", json_obj.GetType()); + } + } + + std::shared_ptr builder() override { return builder_; } + + protected: + std::shared_ptr builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for list arrays + +class ListConverter : public ConcreteConverter { + public: + explicit ListConverter(const std::shared_ptr& type) { type_ = type; } + + Status Init() override { + const auto& list_type = checked_cast(*type_); + RETURN_NOT_OK(GetConverter(list_type.value_type(), &child_converter_)); + auto child_builder = child_converter_->builder(); + builder_ = std::make_shared(default_memory_pool(), child_builder, type_); + return Status::OK(); + } + + Status AppendNull() override { return builder_->AppendNull(); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return builder_->AppendNull(); + } + RETURN_NOT_OK(builder_->Append()); + // Extend the child converter with this JSON array + return child_converter_->AppendValues(json_obj); + } + + std::shared_ptr builder() override { return builder_; } + + protected: + std::shared_ptr builder_; + std::shared_ptr child_converter_; +}; + +// 
------------------------------------------------------------------------ +// Converter for struct arrays + +class StructConverter : public ConcreteConverter { + public: + explicit StructConverter(const std::shared_ptr& type) { type_ = type; } + + Status Init() override { + std::vector> child_builders; + for (const auto& field : type_->children()) { + std::shared_ptr child_converter; + RETURN_NOT_OK(GetConverter(field->type(), &child_converter)); + child_converters_.push_back(child_converter); + child_builders.push_back(child_converter->builder()); + } + builder_ = std::make_shared(type_, default_memory_pool(), + std::move(child_builders)); + return Status::OK(); + } + + Status AppendNull() override { + for (auto& converter : child_converters_) { + RETURN_NOT_OK(converter->AppendNull()); + } + return builder_->AppendNull(); + } + + // Append a JSON value that is either an array of N elements in order + // or an object mapping struct names to values (omitted struct members + // are mapped to null). + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return AppendNull(); + } + if (json_obj.IsArray()) { + auto size = json_obj.Size(); + auto expected_size = static_cast(type_->num_children()); + if (size != expected_size) { + std::stringstream ss; + ss << "Expected array of size " << expected_size << ", got array of size " + << size; + return Status::Invalid(ss.str()); + } + for (uint32_t i = 0; i < size; ++i) { + RETURN_NOT_OK(child_converters_[i]->AppendValue(json_obj[i])); + } + return builder_->Append(); + } + if (json_obj.IsObject()) { + auto remaining = json_obj.MemberCount(); + auto num_children = type_->num_children(); + for (int32_t i = 0; i < num_children; ++i) { + const auto& field = type_->child(i); + auto it = json_obj.FindMember(field->name()); + if (it != json_obj.MemberEnd()) { + --remaining; + RETURN_NOT_OK(child_converters_[i]->AppendValue(it->value)); + } else { + RETURN_NOT_OK(child_converters_[i]->AppendNull()); + } + } + if (remaining > 0) { + std::stringstream ss; + ss << "Unexpected members in JSON object for type " << type_->ToString(); + return Status::Invalid(ss.str()); + } + return builder_->Append(); + } + return JSONTypeError("array or object", json_obj.GetType()); + } + + std::shared_ptr builder() override { return builder_; } + + protected: + std::shared_ptr builder_; + std::vector> child_converters_; +}; + +// ------------------------------------------------------------------------ +// General conversion functions + +Status GetConverter(const std::shared_ptr& type, + std::shared_ptr* out) { + std::shared_ptr res; + +#define SIMPLE_CONVERTER_CASE(ID, CLASS) \ + case ID: \ + res = std::make_shared(type); \ + break; + + switch (type->id()) { + SIMPLE_CONVERTER_CASE(Type::INT8, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::INT16, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::INT32, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::INT64, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::UINT8, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::UINT16, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::UINT32, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::UINT64, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::NA, NullConverter) + SIMPLE_CONVERTER_CASE(Type::BOOL, BooleanConverter) + SIMPLE_CONVERTER_CASE(Type::FLOAT, FloatConverter) + SIMPLE_CONVERTER_CASE(Type::DOUBLE, FloatConverter) + SIMPLE_CONVERTER_CASE(Type::LIST, ListConverter) + SIMPLE_CONVERTER_CASE(Type::STRUCT, StructConverter) + SIMPLE_CONVERTER_CASE(Type::STRING, 
StringConverter) + SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter) + default: { + std::stringstream ss; + ss << "JSON conversion to " << type->ToString() << " not implemented"; + return Status::NotImplemented(ss.str()); + } + } + +#undef SIMPLE_CONVERTER_CASE + + RETURN_NOT_OK(res->Init()); + *out = res; + return Status::OK(); +} + +Status ArrayFromJSON(const std::shared_ptr& type, + const util::string_view& json_string, std::shared_ptr* out) { + std::shared_ptr converter; + RETURN_NOT_OK(GetConverter(type, &converter)); + + rj::Document json_doc; + json_doc.Parse(json_string.data(), json_string.length()); + if (json_doc.HasParseError()) { + std::stringstream ss; + ss << "JSON parse error at offset " << json_doc.GetErrorOffset() << ": " + << GetParseError_En(json_doc.GetParseError()); + return Status::Invalid(ss.str()); + } + + // The JSON document should be an array, append it + RETURN_NOT_OK(converter->AppendValues(json_doc)); + return converter->Finish(out); +} + +Status ArrayFromJSON(const std::shared_ptr& type, + const std::string& json_string, std::shared_ptr* out) { + return ArrayFromJSON(type, util::string_view(json_string), out); +} + +Status ArrayFromJSON(const std::shared_ptr& type, const char* json_string, + std::shared_ptr* out) { + return ArrayFromJSON(type, util::string_view(json_string), out); +} + +} // namespace json +} // namespace internal +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/json-simple.h b/cpp/src/arrow/ipc/json-simple.h new file mode 100644 index 0000000000000..da6483ff1556f --- /dev/null +++ b/cpp/src/arrow/ipc/json-simple.h @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
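// The three ArrayFromJSON() overloads declared below take the JSON payload as
// std::string, util::string_view or a null-terminated C string; the
// std::string and char* forms simply forward to the string_view implementation
// in json-simple.cc.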
+ +// Implement a simple JSON representation format for arrays + +#ifndef ARROW_IPC_JSON_SIMPLE_H +#define ARROW_IPC_JSON_SIMPLE_H + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/string_view.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class DataType; + +namespace ipc { +namespace internal { +namespace json { + +ARROW_EXPORT +Status ArrayFromJSON(const std::shared_ptr&, const std::string& json, + std::shared_ptr* out); + +ARROW_EXPORT +Status ArrayFromJSON(const std::shared_ptr&, const util::string_view& json, + std::shared_ptr* out); + +ARROW_EXPORT +Status ArrayFromJSON(const std::shared_ptr&, const char* json, + std::shared_ptr* out); + +} // namespace json +} // namespace internal +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_JSON_SIMPLE_H diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc index 482bc4370fdca..8434e59b0ce79 100644 --- a/cpp/src/arrow/pretty_print-test.cc +++ b/cpp/src/arrow/pretty_print-test.cc @@ -163,16 +163,7 @@ TEST_F(TestPrettyPrint, StructTypeBasic) { auto simple_2 = field("two", int32()); auto simple_struct = struct_({simple_1, simple_2}); - auto int_builder_1 = std::make_shared(); - auto int_builder_2 = std::make_shared(); - StructBuilder builder(simple_struct, default_memory_pool(), - {int_builder_1, int_builder_2}); - ASSERT_OK(builder.Append()); - ASSERT_OK(int_builder_1->Append(11)); - ASSERT_OK(int_builder_2->Append(22)); - - std::shared_ptr array; - ASSERT_OK(builder.Finish(&array)); + auto array = ArrayFromJSON(simple_struct, "[[11, 22]]"); static const char* ex = R"expected(-- is_valid: all not null -- child 0 type: int32 @@ -202,22 +193,7 @@ TEST_F(TestPrettyPrint, StructTypeAdvanced) { auto simple_2 = field("two", int32()); auto simple_struct = struct_({simple_1, simple_2}); - auto int_builder_1 = std::make_shared(); - auto int_builder_2 = std::make_shared(); - StructBuilder builder(simple_struct, default_memory_pool(), - {int_builder_1, int_builder_2}); - ASSERT_OK(builder.Append()); - ASSERT_OK(int_builder_1->Append(11)); - ASSERT_OK(int_builder_2->Append(22)); - ASSERT_OK(builder.AppendNull()); - ASSERT_OK(int_builder_1->AppendNull()); - ASSERT_OK(int_builder_2->AppendNull()); - ASSERT_OK(builder.Append()); - ASSERT_OK(int_builder_1->AppendNull()); - ASSERT_OK(int_builder_2->Append(33)); - - std::shared_ptr array; - ASSERT_OK(builder.Finish(&array)); + auto array = ArrayFromJSON(simple_struct, "[[11, 22], null, [null, 33]]"); static const char* ex = R"expected(-- is_valid: [ @@ -251,24 +227,9 @@ TEST_F(TestPrettyPrint, BinaryType) { } TEST_F(TestPrettyPrint, ListType) { - Int64Builder* int_builder = new Int64Builder(); - ListBuilder list_builder(default_memory_pool(), - std::unique_ptr(int_builder)); - - ASSERT_OK(list_builder.Append()); - ASSERT_OK(int_builder->AppendNull()); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(list_builder.Append(false)); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(int_builder->Append(4)); - ASSERT_OK(int_builder->Append(6)); - ASSERT_OK(int_builder->Append(7)); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(int_builder->Append(2)); - ASSERT_OK(int_builder->Append(3)); + auto list_type = list(int64()); + auto array = ArrayFromJSON(list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); - std::shared_ptr array; - ASSERT_OK(list_builder.Finish(&array)); static const char* ex = R"expected([ [ null @@ -340,19 +301,7 @@ TEST_F(TestPrettyPrint, Decimal128Type) { int32_t s = 4; auto type = decimal(p, s); - - 
Decimal128Builder builder(type); - Decimal128 val; - - ASSERT_OK(Decimal128::FromString("123.4567", &val)); - ASSERT_OK(builder.Append(val)); - - ASSERT_OK(Decimal128::FromString("456.7891", &val)); - ASSERT_OK(builder.Append(val)); - ASSERT_OK(builder.AppendNull()); - - std::shared_ptr array; - ASSERT_OK(builder.Finish(&array)); + auto array = ArrayFromJSON(type, "[\"123.4567\", \"456.7891\", null]"); static const char* ex = "[\n 123.4567,\n 456.7891,\n null\n]"; CheckArray(*array, {0}, ex); @@ -392,10 +341,7 @@ TEST_F(TestPrettyPrint, DictionaryType) { } TEST_F(TestPrettyPrint, ChunkedArrayPrimitiveType) { - std::vector is_valid = {true, true, false, true, false}; - std::vector values = {0, 1, 2, 3, 4}; - std::shared_ptr array; - ArrayFromVector(is_valid, values, &array); + auto array = ArrayFromJSON(int32(), "[0, 1, null, 3, null]"); ChunkedArray chunked_array({array}); static const char* expected = R"expected([ @@ -432,11 +378,8 @@ TEST_F(TestPrettyPrint, ChunkedArrayPrimitiveType) { } TEST_F(TestPrettyPrint, ColumnPrimitiveType) { - std::vector is_valid = {true, true, false, true, false}; - std::vector values = {0, 1, 2, 3, 4}; - std::shared_ptr array; - ArrayFromVector(is_valid, values, &array); std::shared_ptr int_field = field("column", int32()); + auto array = ArrayFromJSON(int_field->type(), "[0, 1, null, 3, null]"); Column column(int_field, ArrayVector({array})); static const char* expected = R"expected(column: int32 @@ -475,11 +418,8 @@ TEST_F(TestPrettyPrint, ColumnPrimitiveType) { } TEST_F(TestPrettyPrint, TablePrimitive) { - std::vector is_valid = {true, true, false, true, false}; - std::vector values = {0, 1, 2, 3, 4}; - std::shared_ptr array; - ArrayFromVector(is_valid, values, &array); std::shared_ptr int_field = field("column", int32()); + auto array = ArrayFromJSON(int_field->type(), "[0, 1, null, 3, null]"); std::shared_ptr column = std::make_shared(int_field, ArrayVector({array})); std::shared_ptr table_schema = schema({int_field}); diff --git a/cpp/src/arrow/test-util.cc b/cpp/src/arrow/test-util.cc index 7fb96cda7af73..38e07dd060ae4 100644 --- a/cpp/src/arrow/test-util.cc +++ b/cpp/src/arrow/test-util.cc @@ -41,6 +41,7 @@ #include "arrow/array.h" #include "arrow/buffer.h" #include "arrow/builder.h" +#include "arrow/ipc/json-simple.h" #include "arrow/memory_pool.h" #include "arrow/pretty_print.h" #include "arrow/status.h" @@ -51,13 +52,15 @@ #include "arrow/util/decimal.h" #include "arrow/util/logging.h" -void sleep_for(double seconds) { - std::this_thread::sleep_for( - std::chrono::nanoseconds(static_cast(seconds * 1e9))); -} - namespace arrow { +std::shared_ptr ArrayFromJSON(const std::shared_ptr& type, + const std::string& json) { + std::shared_ptr out; + ABORT_NOT_OK(ipc::internal::json::ArrayFromJSON(type, json, &out)); + return out; +} + void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes) { const int random_seed = 0; std::default_random_engine gen(random_seed); diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index a01fd7d84a601..7829ac25678a9 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -169,6 +169,12 @@ static inline Status GetBitmapFromVector(const std::vector& is_valid, return Status::OK(); } +template +inline void BitmapFromVector(const std::vector& is_valid, + std::shared_ptr* out) { + ASSERT_OK(GetBitmapFromVector(is_valid, out)); +} + // Sets approximately pct_null of the first n bytes in null_bytes to zero // and the rest to non-zero (true) values. 
ARROW_EXPORT void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes); @@ -247,6 +253,12 @@ Status MakeRandomBuffer(int64_t length, MemoryPool* pool, return Status::OK(); } +// ArrayFromJSON: construct an Array from a simple JSON representation + +ARROW_EXPORT +std::shared_ptr ArrayFromJSON(const std::shared_ptr&, + const std::string& json); + // ArrayFromVector: construct an Array from vectors of C values template diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index 9d22e005e7276..fda7746c6b4e0 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -337,8 +337,8 @@ bool ParseDecimalComponents(const char* s, size_t size, DecimalComponents* out) } // namespace -Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* precision, - int32_t* scale) { +Status Decimal128::FromString(const util::string_view& s, Decimal128* out, + int32_t* precision, int32_t* scale) { if (s.empty()) { return Status::Invalid("Empty string cannot be converted to decimal"); } @@ -393,6 +393,16 @@ Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* pr return Status::OK(); } +Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* precision, + int32_t* scale) { + return FromString(util::string_view(s), out, precision, scale); +} + +Status Decimal128::FromString(const char* s, Decimal128* out, int32_t* precision, + int32_t* scale) { + return FromString(util::string_view(s), out, precision, scale); +} + Decimal128& Decimal128::Negate() { low_bits_ = ~low_bits_ + 1; high_bits_ = ~high_bits_; diff --git a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h index 26b82a42f70a7..fe76d25eb41d0 100644 --- a/cpp/src/arrow/util/decimal.h +++ b/cpp/src/arrow/util/decimal.h @@ -27,6 +27,7 @@ #include "arrow/status.h" #include "arrow/util/macros.h" +#include "arrow/util/string_view.h" #include "arrow/util/type_traits.h" #include "arrow/util/visibility.h" @@ -128,6 +129,10 @@ class ARROW_EXPORT Decimal128 { /// precision and scale if they're passed in and not null. static Status FromString(const std::string& s, Decimal128* out, int32_t* precision = NULLPTR, int32_t* scale = NULLPTR); + static Status FromString(const util::string_view& s, Decimal128* out, + int32_t* precision = NULLPTR, int32_t* scale = NULLPTR); + static Status FromString(const char* s, Decimal128* out, int32_t* precision = NULLPTR, + int32_t* scale = NULLPTR); /// \brief Convert from a big endian byte representation. 
The length must be /// between 1 and 16 From d3d7669221ee0c714d3095388d769c99d3e51b2b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 9 Dec 2018 14:45:15 -0600 Subject: [PATCH 003/328] ARROW-3969: [Rust] Format using stable rustfmt Author: Andy Grove Closes #3138 from andygrove/ARROW-3969 and squashes the following commits: 99b6256b6 move rustfmt installation into travis_install_cargo 55ab06fee Be more explicit and don't assume order of stable vs nightly builds 5fed7dbc2 simplify cca7da3ab oops, wrong command 9b2e5b771 Format using stable rustfmt --- ci/travis_install_cargo.sh | 1 + ci/travis_script_rust.sh | 9 ++------- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/ci/travis_install_cargo.sh b/ci/travis_install_cargo.sh index f433033091ce1..e4a6b3b3493f3 100755 --- a/ci/travis_install_cargo.sh +++ b/ci/travis_install_cargo.sh @@ -21,6 +21,7 @@ set -e # ensure that both toolchains are installed rustup install stable +rustup component add rustfmt rustup install nightly pip install 'travis-cargo<0.2' --user diff --git a/ci/travis_script_rust.sh b/ci/travis_script_rust.sh index 02a32cdabe818..55cce8f354e44 100755 --- a/ci/travis_script_rust.sh +++ b/ci/travis_script_rust.sh @@ -26,13 +26,8 @@ pushd $RUST_DIR # show activated toolchain rustup show -# check code formatting only for Rust nightly -if [ $RUSTUP_TOOLCHAIN == "nightly" ] -then - # raises on any formatting errors - rustup component add rustfmt-preview - cargo fmt --all -- --check -fi +# raises on any formatting errors +cargo +stable fmt --all -- --check # raises on any warnings cargo rustc -- -D warnings From cc24218ed8a5abe0a8d35cb6fd7ef1a283384be1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sun, 9 Dec 2018 22:22:08 +0100 Subject: [PATCH 004/328] ARROW-3963: [Packaging/Docker] Nightly test for building sphinx documentations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test is here: https://github.com/kszucs/crossbow/branches/all?utf8=%E2%9C%93&query=docker-docs Author: Krisztián Szűcs Closes #3130 from kszucs/ARROW-3963 and squashes the following commits: 0b5be2cc add docker-docs to docker group 1575909e path corrections 51768fc0 use sphinx-build command instead of setup.py 60635acc error msg a93fcad6 merge _as_type and ensure_type 8d3d58fd nightly test for building cpp and python docs --- ci/docker_build_sphinx.sh | 4 +--- dev/tasks/tests.yml | 15 +++++++++++++++ docker-compose.yml | 2 +- docs/Dockerfile | 1 + python/pyarrow/gandiva.pyx | 20 +++++++++++++------- python/pyarrow/tests/test_csv.py | 2 +- python/pyarrow/types.pxi | 21 ++++++++------------- python/pyarrow/types.py | 3 +-- 8 files changed, 41 insertions(+), 27 deletions(-) diff --git a/ci/docker_build_sphinx.sh b/ci/docker_build_sphinx.sh index 957804325adf1..4a65f8155fb16 100755 --- a/ci/docker_build_sphinx.sh +++ b/ci/docker_build_sphinx.sh @@ -22,9 +22,7 @@ pushd /arrow/cpp/apidoc doxygen popd -pushd /arrow/python -python setup.py build_sphinx -s ../docs/source --build-dir ../docs/_build -popd +sphinx-build -b html /arrow/docs/source /arrow/docs/_build/html mkdir -p /arrow/site/asf-site/docs/latest rsync -r /arrow/docs/_build/html/ /arrow/site/asf-site/docs/latest/ diff --git a/dev/tasks/tests.yml b/dev/tasks/tests.yml index c158481de461e..d51fa7eac7a35 100644 --- a/dev/tasks/tests.yml +++ b/dev/tasks/tests.yml @@ -31,6 +31,7 @@ groups: - docker-python-3.6-alpine - docker-java - docker-js + - docker-docs - docker-lint - docker-iwyu - docker-clang-format @@ -174,6 +175,20 
@@ tasks: - docker-compose build python-alpine - docker-compose run python-alpine + ###################### Documentation building tests ######################### + + docker-docs: + platform: linux + template: docker-tests/travis.linux.yml + params: + environment: + PYTHON_VERSION: 3.6 + commands: + - docker-compose build cpp + - docker-compose build python + - docker-compose build docs + - docker-compose run docs + ############################## Linter tests ################################# docker-lint: diff --git a/docker-compose.yml b/docker-compose.yml index d6f11004233e5..51f1a49542212 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -196,7 +196,7 @@ services: build: context: . dockerfile: docs/Dockerfile - volumes: *volumes + volumes: *ubuntu-volumes ######################### Integration Tests ################################# diff --git a/docs/Dockerfile b/docs/Dockerfile index 4908110b7fb56..31ad84e17ee48 100644 --- a/docs/Dockerfile +++ b/docs/Dockerfile @@ -21,6 +21,7 @@ ADD ci/conda_env_sphinx.yml /arrow/ci/ RUN conda install -c conda-forge \ --file arrow/ci/conda_env_sphinx.yml && \ conda clean --all + CMD arrow/ci/docker_build_cpp.sh && \ arrow/ci/docker_build_python.sh && \ arrow/ci/docker_build_sphinx.sh diff --git a/python/pyarrow/gandiva.pyx b/python/pyarrow/gandiva.pyx index 418d0d61502b3..76e55d6ba27ef 100644 --- a/python/pyarrow/gandiva.pyx +++ b/python/pyarrow/gandiva.pyx @@ -28,10 +28,9 @@ from libc.stdint cimport int64_t, int32_t, uint8_t, uintptr_t from pyarrow.includes.libarrow cimport * from pyarrow.compat import frombytes -from pyarrow.types import _as_type from pyarrow.lib cimport (Array, DataType, Field, MemoryPool, RecordBatch, Schema, check_status, pyarrow_wrap_array, - pyarrow_wrap_data_type) + pyarrow_wrap_data_type, ensure_type) from pyarrow.includes.libgandiva cimport ( CCondition, CExpression, @@ -173,8 +172,10 @@ cdef class Filter: return self def evaluate(self, RecordBatch batch, MemoryPool pool, dtype='int32'): - cdef shared_ptr[CSelectionVector] selection - cdef DataType type = _as_type(dtype) + cdef: + DataType type = ensure_type(dtype) + shared_ptr[CSelectionVector] selection + if type.id == _Type_INT16: check_status(SelectionVector_MakeInt16( batch.num_rows, pool.pool, &selection)) @@ -187,6 +188,7 @@ cdef class Filter: else: raise ValueError("'dtype' of the selection vector should be " "one of 'int16', 'int32' and 'int64'.") + check_status(self.filter.get().Evaluate( batch.sp_batch.get()[0], selection)) return SelectionVector.create(selection) @@ -195,8 +197,10 @@ cdef class Filter: cdef class TreeExprBuilder: def make_literal(self, value, dtype): - cdef shared_ptr[CNode] r - cdef DataType type = _as_type(dtype) + cdef: + DataType type = ensure_type(dtype) + shared_ptr[CNode] r + if type.id == _Type_BOOL: r = TreeExprBuilder_MakeBoolLiteral(value) elif type.id == _Type_UINT8: @@ -225,6 +229,7 @@ cdef class TreeExprBuilder: r = TreeExprBuilder_MakeBinaryLiteral(value) else: raise TypeError("Didn't recognize dtype " + str(dtype)) + return Node.create(r) def make_expression(self, Node root_node, Field return_field): @@ -353,7 +358,8 @@ cdef class TreeExprBuilder: return Node.create(r) def make_in_expression(self, Node node, values, dtype): - cdef DataType type = _as_type(dtype) + cdef DataType type = ensure_type(dtype) + if type.id == _Type_INT32: return self._make_in_expression_int32(node, values) elif type.id == _Type_INT64: diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index 
115595bbb877c..c5816de8a4203 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -146,7 +146,7 @@ def test_convert_options(): opts.column_types = [('x', pa.binary())] assert opts.column_types == {'x': pa.binary()} - with pytest.raises(TypeError, match='data type expected'): + with pytest.raises(TypeError, match='DataType expected'): opts.column_types = {'a': None} with pytest.raises(TypeError): opts.column_types = 0 diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index d5d99e4044e23..1ebd196fabf95 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -869,7 +869,7 @@ def field(name, type, bint nullable=True, dict metadata=None): cdef: shared_ptr[CKeyValueMetadata] c_meta Field result = Field.__new__(Field) - DataType _type = _as_type(type) + DataType _type = ensure_type(type, allow_none=False) if metadata is not None: convert_metadata(metadata, &c_meta) @@ -1479,20 +1479,15 @@ def type_for_alias(name): return alias() -def _as_type(typ): - if isinstance(typ, DataType): - return typ - elif isinstance(typ, six.string_types): - return type_for_alias(typ) - else: - raise TypeError("data type expected, got '%r'" % (type(typ),)) - - -cdef DataType ensure_type(object type, c_bool allow_none=False): - if allow_none and type is None: +cdef DataType ensure_type(object ty, c_bool allow_none=False): + if allow_none and ty is None: return None + elif isinstance(ty, DataType): + return ty + elif isinstance(ty, six.string_types): + return type_for_alias(ty) else: - return _as_type(type) + raise TypeError('DataType expected, got {!r}'.format(type(ty))) def schema(fields, dict metadata=None): diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index d07dccaedfb97..2bd70276e7ea1 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -19,8 +19,7 @@ from pyarrow.lib import (is_boolean_value, # noqa is_integer_value, - is_float_value, - _as_type) + is_float_value) import pyarrow.lib as lib From 7a5631dedc2de4c7740cd978949322cefdce8251 Mon Sep 17 00:00:00 2001 From: c-bata Date: Mon, 10 Dec 2018 10:37:30 +0900 Subject: [PATCH 005/328] ARROW-3964: [Go] Refactor examples of csv reader Example of godoc doesn't include input file(testdata/simple.csv). So it's hard to understand the output. This PR refactors it. 
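For context (not part of this commit), here is a minimal, self-contained sketch of how such an inline buffer is consumed, assuming the go/arrow/csv API of this period (csv.NewReader plus the WithComma/WithComment options); the data and column names below are illustrative only:

    package main

    import (
        "bytes"
        "fmt"

        "github.com/apache/arrow/go/arrow"
        "github.com/apache/arrow/go/arrow/csv"
    )

    func main() {
        // Inline data replaces testdata/simple.csv, so the example output is
        // understandable without opening a separate file.
        data := bytes.NewBufferString("# a comment\n0;0;str-0\n1;1;str-1\n")

        schema := arrow.NewSchema([]arrow.Field{
            {Name: "i64", Type: arrow.PrimitiveTypes.Int64},
            {Name: "f64", Type: arrow.PrimitiveTypes.Float64},
            {Name: "str", Type: arrow.BinaryTypes.String},
        }, nil)

        // ';'-separated fields, '#'-prefixed comment lines, matching the data above.
        r := csv.NewReader(data, schema, csv.WithComment('#'), csv.WithComma(';'))
        defer r.Release()

        for r.Next() {
            rec := r.Record()
            for i, col := range rec.Columns() {
                fmt.Printf("%q: %v\n", rec.ColumnName(i), col)
            }
        }
    }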
screenshot https://godoc.org/github.com/apache/arrow/go/arrow/csv Author: c-bata Closes #3131 from c-bata/refactor-csv-reader-example and squashes the following commits: eed8e29b Refactor examples of csv reader for Go --- go/arrow/csv/csv_test.go | 48 +++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/go/arrow/csv/csv_test.go b/go/arrow/csv/csv_test.go index aaafb37554b87..97f31cc209c27 100644 --- a/go/arrow/csv/csv_test.go +++ b/go/arrow/csv/csv_test.go @@ -20,8 +20,6 @@ import ( "bytes" "fmt" "io/ioutil" - "log" - "os" "testing" "github.com/apache/arrow/go/arrow" @@ -30,17 +28,24 @@ import ( ) func Example() { - f, err := os.Open("testdata/simple.csv") - if err != nil { - log.Fatal(err) - } - defer f.Close() + f := bytes.NewBufferString(`## a simple set of data: int64;float64;string +0;0;str-0 +1;1;str-1 +2;2;str-2 +3;3;str-3 +4;4;str-4 +5;5;str-5 +6;6;str-6 +7;7;str-7 +8;8;str-8 +9;9;str-9 +`) schema := arrow.NewSchema( []arrow.Field{ - arrow.Field{Name: "i64", Type: arrow.PrimitiveTypes.Int64}, - arrow.Field{Name: "f64", Type: arrow.PrimitiveTypes.Float64}, - arrow.Field{Name: "str", Type: arrow.BinaryTypes.String}, + {Name: "i64", Type: arrow.PrimitiveTypes.Int64}, + {Name: "f64", Type: arrow.PrimitiveTypes.Float64}, + {Name: "str", Type: arrow.BinaryTypes.String}, }, nil, ) @@ -90,17 +95,24 @@ func Example() { } func Example_withChunk() { - f, err := os.Open("testdata/simple.csv") - if err != nil { - log.Fatal(err) - } - defer f.Close() + f := bytes.NewBufferString(`## a simple set of data: int64;float64;string +0;0;str-0 +1;1;str-1 +2;2;str-2 +3;3;str-3 +4;4;str-4 +5;5;str-5 +6;6;str-6 +7;7;str-7 +8;8;str-8 +9;9;str-9 +`) schema := arrow.NewSchema( []arrow.Field{ - arrow.Field{Name: "i64", Type: arrow.PrimitiveTypes.Int64}, - arrow.Field{Name: "f64", Type: arrow.PrimitiveTypes.Float64}, - arrow.Field{Name: "str", Type: arrow.BinaryTypes.String}, + {Name: "i64", Type: arrow.PrimitiveTypes.Int64}, + {Name: "f64", Type: arrow.PrimitiveTypes.Float64}, + {Name: "str", Type: arrow.BinaryTypes.String}, }, nil, ) From a4063edf262caeb57c6cb7365e08756788c736a3 Mon Sep 17 00:00:00 2001 From: Yosuke Shiro Date: Mon, 10 Dec 2018 10:50:03 +0900 Subject: [PATCH 006/328] ARROW-3967: [Gandiva] [C++] Make node.h public Because some methods in node.h is useful in bindings. C GLib Gandiva bindings want to use LiteralNode::holder() to access raw literal data. 
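To illustrate the use case (this snippet is not part of the commit), a minimal sketch of what a binding can do once these headers are installed; it assumes the usual TreeExprBuilder literal factory and that LiteralNode::holder() returns the literal value as a variant:

    #include <iostream>
    #include <memory>

    #include "gandiva/node.h"               // installed by this change
    #include "gandiva/tree_expr_builder.h"

    int main() {
      // Build a literal node and inspect it through the now-public LiteralNode API.
      auto node = gandiva::TreeExprBuilder::MakeLiteral(static_cast<int32_t>(42));
      auto literal = std::dynamic_pointer_cast<gandiva::LiteralNode>(node);
      if (literal != nullptr) {
        // holder() exposes the raw literal data; a binding can unpack it as needed.
        const gandiva::LiteralHolder& holder = literal->holder();
        (void)holder;
        std::cout << literal->ToString() << std::endl;
      }
      return 0;
    }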
Author: Yosuke Shiro Closes #3135 from shiro615/make-gandiva-node-header-public and squashes the following commits: 5950c52b Make node.h public --- cpp/src/gandiva/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index bd497dcb92882..68f02f03cf29b 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -112,9 +112,13 @@ install(FILES expression.h expression_registry.h filter.h + func_descriptor.h function_signature.h gandiva_aliases.h + literal_holder.h logging.h + node.h + node_visitor.h projector.h selection_vector.h tree_expr_builder.h From 1dc906e0fe76d558c13febb02c4c63bc4eeba50b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 10 Dec 2018 12:31:21 +0900 Subject: [PATCH 007/328] ARROW-3885: [Rust] Release prepare step should increment Rust version Author: Andy Grove Closes #3096 from andygrove/ARROW-3885 and squashes the following commits: 7d15ee77 add commit step 0d98c2cf revert to 0.11.0 ready for next prepare step a7f60835 update release prepare step to increment Rust version ac6e5fc0 Set version to 0.11.0 and update prepare script b39b7c4b Update Rust version to 0.12.0 --- dev/release/00-prepare.sh | 9 +++++++++ rust/Cargo.toml | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/dev/release/00-prepare.sh b/dev/release/00-prepare.sh index 9282cbfd2771d..5ff4ddc8f28a6 100755 --- a/dev/release/00-prepare.sh +++ b/dev/release/00-prepare.sh @@ -76,6 +76,15 @@ if [ "$#" -eq 2 ]; then git commit -m "[Release] Update .deb package names for $nextVersion" cd - + echo "prepare release ${version} in Rust crate" + + cd "${SOURCE_DIR}/../../rust" + sed -i.bak -r -e "s/version = \"$version\"/version = \"$nextVersion\"/g" Cargo.toml + rm -f Cargo.toml.bak + git add Cargo.toml + git commit -m "[Release] Update Rust Cargo.toml version for $nextVersion" + cd - + echo "Finish staging binary artifacts by running: sh dev/release/01-perform.sh" else diff --git a/rust/Cargo.toml b/rust/Cargo.toml index b56cd6fb30091..39de50c8a336d 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow" -version = "0.10.0" +version = "0.11.0" description = "Rust implementation of Apache Arrow" homepage = "https://github.com/apache/arrow" repository = "https://github.com/apache/arrow" From 612bdca20c9685911cfa5de6f87993f0544fb7aa Mon Sep 17 00:00:00 2001 From: Praveen Date: Sun, 9 Dec 2018 21:45:10 -0600 Subject: [PATCH 008/328] ARROW-3970: [Gandiva][C++] Remove unnecessary boost dependencies. Removed the dynamic dependencies since we do not need them. Author: Praveen Closes #3137 from praveenbingo/ARROW-3970 and squashes the following commits: 6e3a6bbdc ARROW-3970: Added more time for a benchmark test. fbb551645 ARROW-3970: Remove unnecessary boost dynamic dependencies. 
--- cpp/src/gandiva/CMakeLists.txt | 3 --- .../org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 68f02f03cf29b..1f76f7841590a 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -64,9 +64,6 @@ set(SRC_FILES annotator.cc set(GANDIVA_SHARED_PRIVATE_LINK_LIBS arrow_shared - ${BOOST_REGEX_LIBRARY} - ${BOOST_SYSTEM_LIBRARY} - ${BOOST_FILESYSTEM_LIBRARY} LLVM::LLVM_INTERFACE ${RE2_LIBRARY}) diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java index cd297034df80f..c4d6bd9070613 100644 --- a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java +++ b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java @@ -58,7 +58,7 @@ public void testAdd3() throws Exception { 1 * MILLION, 16 * THOUSAND, 4); System.out.println("Time taken for projecting 1m records of add3 is " + timeTaken + "ms"); - Assert.assertTrue(timeTaken <= 10 * toleranceRatio); + Assert.assertTrue(timeTaken <= 13 * toleranceRatio); } @Test From d6284cf89c75f4767996abe087a8eb203401fb6d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 10 Dec 2018 14:05:14 +0100 Subject: [PATCH 009/328] ARROW-3792: [C++] Writing a list-type chunked column to Parquet fails if any chunk is 0-length Thanks to @tanyaschlusser to providing a minimal reproduction to help find the underlying problem Author: Wes McKinney Closes #3141 from wesm/ARROW-3792 and squashes the following commits: 1ed82a57 Add test case and fix --- cpp/src/parquet/arrow/writer.cc | 5 +++++ python/pyarrow/tests/test_parquet.py | 33 ++++++++++++++++++++++++++-- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index ef5de07d87f16..402cbf0f2027c 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -861,6 +861,11 @@ Status ArrowColumnWriter::TypedWriteBatch( } Status ArrowColumnWriter::Write(const Array& data) { + if (data.length() == 0) { + // Write nothing when length is 0 + return Status::OK(); + } + ::arrow::Type::type values_type; RETURN_NOT_OK(GetLeafType(*data.type(), &values_type)); diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index c14056e8533b8..89d3224580463 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
+from collections import OrderedDict import datetime import decimal import io @@ -2224,6 +2225,34 @@ def test_merging_parquet_tables_with_different_pandas_metadata(tempdir): def test_writing_empty_lists(): # ARROW-2591: [Python] Segmentation fault issue in pq.write_table - arr = pa.array([[], []], pa.list_(pa.int32())) - table = pa.Table.from_arrays([arr], ['test']) + arr1 = pa.array([[], []], pa.list_(pa.int32())) + table = pa.Table.from_arrays([arr1], ['list(int32)']) _check_roundtrip(table) + + +def test_write_nested_zero_length_array_chunk_failure(): + # Bug report in ARROW-3792 + cols = OrderedDict( + int32=pa.int32(), + list_string=pa.list_(pa.string()) + ) + data = [[], [OrderedDict(int32=1, list_string=('G',)), ]] + + # This produces a table with a column like + # )> + # [ + # [], + # [ + # [ + # "G" + # ] + # ] + # ] + # + # Each column is a ChunkedArray with 2 elements + my_arrays = [pa.array(batch, type=pa.struct(cols)).flatten() + for batch in data] + my_batches = [pa.RecordBatch.from_arrays(batch, pa.schema(cols)) + for batch in my_arrays] + tbl = pa.Table.from_batches(my_batches, pa.schema(cols)) + _check_roundtrip(tbl) From e4761e07d6d32e8c3fddac20f0abca0bb89543ad Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 10 Dec 2018 09:12:40 -0600 Subject: [PATCH 010/328] ARROW-3727: [Python] Document use of foreign_buffer() Author: Antoine Pitrou Closes #3146 from pitrou/ARROW-3727-foreign-buffer-doc and squashes the following commits: e81a5f0cf ARROW-3727: Document use of foreign_buffer() --- docs/source/python/memory.rst | 23 ++++++++++++++--------- python/pyarrow/io.pxi | 8 ++++++-- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/docs/source/python/memory.rst b/docs/source/python/memory.rst index 1ee81e754da1c..0d30866d0aa4d 100644 --- a/docs/source/python/memory.rst +++ b/docs/source/python/memory.rst @@ -35,8 +35,8 @@ Referencing and Allocating Memory pyarrow.Buffer -------------- -The :class:`~pyarrow.Buffer` object wraps the C++ ``arrow::Buffer`` type and is -the primary tool for memory management in Apache Arrow in C++. It permits +The :class:`Buffer` object wraps the C++ :cpp:class:`arrow::Buffer` type +which is the primary tool for memory management in Apache Arrow in C++. It permits higher-level array classes to safely interact with memory which they may or may not own. ``arrow::Buffer`` can be zero-copy sliced to permit Buffers to cheaply reference other Buffers, while preserving memory lifetime and clean @@ -46,8 +46,9 @@ There are many implementations of ``arrow::Buffer``, but they all provide a standard interface: a data pointer and length. This is similar to Python's built-in `buffer protocol` and ``memoryview`` objects. -A :class:`~pyarrow.Buffer` can be created from any Python object which -implements the buffer protocol. Let's consider a bytes object: +A :class:`Buffer` can be created from any Python object implementing +the buffer protocol by calling the :func:`py_buffer` function. Let's consider +a bytes object: .. ipython:: python @@ -61,18 +62,22 @@ implements the buffer protocol. Let's consider a bytes object: Creating a Buffer in this way does not allocate any memory; it is a zero-copy view on the memory exported from the ``data`` bytes object. -The Buffer's ``to_pybytes`` method can convert to a Python byte string: +External memory, under the form of a raw pointer and size, can also be +referenced using the :func:`foreign_buffer` function. 
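As a quick illustration (a sketch, not part of this patch), wrapping memory owned by another Python object with ``foreign_buffer``, assuming the ``foreign_buffer(address, size, base=None)`` signature documented further below:

    import numpy as np
    import pyarrow as pa

    # Memory owned by some other object; here a NumPy array plays the role of *base*.
    arr = np.arange(64, dtype=np.uint8)

    # Zero-copy view; passing *base* keeps the NumPy array alive as long as the buffer.
    buf = pa.foreign_buffer(arr.ctypes.data, arr.nbytes, base=arr)

    assert buf.size == arr.nbytes
    assert buf.to_pybytes() == arr.tobytes()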
+ +Buffers can be used in circumstances where a Python buffer or memoryview is +required, and such conversions are zero-copy: .. ipython:: python - buf.to_pybytes() + memoryview(buf) -Buffers can be used in circumstances where a Python buffer or memoryview is -required, and such conversions are also zero-copy: +The Buffer's :meth:`~Buffer.to_pybytes` method converts the Buffer's data to a +Python bytestring (thus making a copy of the data): .. ipython:: python - memoryview(buf) + buf.to_pybytes() Memory Pools ------------ diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 9f7dc7bc8386f..97abde8f892af 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1173,10 +1173,14 @@ def py_buffer(object obj): return pyarrow_wrap_buffer(buf) -def foreign_buffer(address, size, base): +def foreign_buffer(address, size, base=None): """ Construct an Arrow buffer with the given *address* and *size*, - backed by the Python *base* object. + optionally backed by the Python *base* object. + + The *base* object, if given, will be kept alive as long as this buffer + is alive, including accross language boundaries (for example if the + buffer is referenced by C++ code). """ cdef: intptr_t c_addr = address From fa5d5ad98349dddf98b66d67e8737f77bd261d1f Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 10 Dec 2018 09:15:01 -0600 Subject: [PATCH 011/328] ARROW-3980: [C++] Fix CRTP use in json-simple.cc Nudge the compiler into devirtualizing method calls. Author: Antoine Pitrou Closes #3144 from pitrou/ARROW-3980-json-crtp and squashes the following commits: ef96713a2 ARROW-3980: Fix CRTP use in json-simple.cc --- cpp/src/arrow/ipc/json-simple.cc | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc index b69bd76f51611..a8d120036e4f5 100644 --- a/cpp/src/arrow/ipc/json-simple.cc +++ b/cpp/src/arrow/ipc/json-simple.cc @@ -98,7 +98,7 @@ class ConcreteConverter : public Converter { // ------------------------------------------------------------------------ // Converter for null arrays -class NullConverter : public ConcreteConverter { +class NullConverter final : public ConcreteConverter { public: explicit NullConverter(const std::shared_ptr& type) { type_ = type; @@ -109,7 +109,7 @@ class NullConverter : public ConcreteConverter { Status AppendValue(const rj::Value& json_obj) override { if (json_obj.IsNull()) { - return builder_->AppendNull(); + return AppendNull(); } return JSONTypeError("null", json_obj.GetType()); } @@ -123,7 +123,7 @@ class NullConverter : public ConcreteConverter { // ------------------------------------------------------------------------ // Converter for boolean arrays -class BooleanConverter : public ConcreteConverter { +class BooleanConverter final : public ConcreteConverter { public: explicit BooleanConverter(const std::shared_ptr& type) { type_ = type; @@ -134,7 +134,7 @@ class BooleanConverter : public ConcreteConverter { Status AppendValue(const rj::Value& json_obj) override { if (json_obj.IsNull()) { - return builder_->AppendNull(); + return AppendNull(); } if (json_obj.IsBool()) { return builder_->Append(json_obj.GetBool()); @@ -152,7 +152,7 @@ class BooleanConverter : public ConcreteConverter { // Converter for int arrays template -class IntegerConverter : public ConcreteConverter> { +class IntegerConverter final : public ConcreteConverter> { using c_type = typename Type::c_type; static constexpr auto is_signed = std::is_signed::value; @@ 
-166,7 +166,7 @@ class IntegerConverter : public ConcreteConverter> { Status AppendValue(const rj::Value& json_obj) override { if (json_obj.IsNull()) { - return builder_->AppendNull(); + return AppendNull(); } return AppendNumber(json_obj); } @@ -220,7 +220,7 @@ class IntegerConverter : public ConcreteConverter> { // Converter for float arrays template -class FloatConverter : public ConcreteConverter> { +class FloatConverter final : public ConcreteConverter> { using c_type = typename Type::c_type; public: @@ -233,7 +233,7 @@ class FloatConverter : public ConcreteConverter> { Status AppendValue(const rj::Value& json_obj) override { if (json_obj.IsNull()) { - return builder_->AppendNull(); + return AppendNull(); } if (json_obj.IsNumber()) { c_type v = static_cast(json_obj.GetDouble()); @@ -252,7 +252,7 @@ class FloatConverter : public ConcreteConverter> { // ------------------------------------------------------------------------ // Converter for decimal arrays -class DecimalConverter : public ConcreteConverter { +class DecimalConverter final : public ConcreteConverter { public: explicit DecimalConverter(const std::shared_ptr& type) { this->type_ = type; @@ -264,7 +264,7 @@ class DecimalConverter : public ConcreteConverter { Status AppendValue(const rj::Value& json_obj) override { if (json_obj.IsNull()) { - return builder_->AppendNull(); + return AppendNull(); } if (json_obj.IsString()) { int32_t precision, scale; @@ -292,7 +292,7 @@ class DecimalConverter : public ConcreteConverter { // ------------------------------------------------------------------------ // Converter for string arrays -class StringConverter : public ConcreteConverter { +class StringConverter final : public ConcreteConverter { public: explicit StringConverter(const std::shared_ptr& type) { this->type_ = type; @@ -303,7 +303,7 @@ class StringConverter : public ConcreteConverter { Status AppendValue(const rj::Value& json_obj) override { if (json_obj.IsNull()) { - return builder_->AppendNull(); + return AppendNull(); } if (json_obj.IsString()) { auto view = util::string_view(json_obj.GetString(), json_obj.GetStringLength()); @@ -322,7 +322,7 @@ class StringConverter : public ConcreteConverter { // ------------------------------------------------------------------------ // Converter for list arrays -class ListConverter : public ConcreteConverter { +class ListConverter final : public ConcreteConverter { public: explicit ListConverter(const std::shared_ptr& type) { type_ = type; } @@ -338,7 +338,7 @@ class ListConverter : public ConcreteConverter { Status AppendValue(const rj::Value& json_obj) override { if (json_obj.IsNull()) { - return builder_->AppendNull(); + return AppendNull(); } RETURN_NOT_OK(builder_->Append()); // Extend the child converter with this JSON array @@ -355,7 +355,7 @@ class ListConverter : public ConcreteConverter { // ------------------------------------------------------------------------ // Converter for struct arrays -class StructConverter : public ConcreteConverter { +class StructConverter final : public ConcreteConverter { public: explicit StructConverter(const std::shared_ptr& type) { type_ = type; } From 7a296bd597ab7061ff8f39280d3d6a9a694faf79 Mon Sep 17 00:00:00 2001 From: Pindikura Ravindra Date: Mon, 10 Dec 2018 09:28:47 -0600 Subject: [PATCH 012/328] ARROW-3977: [Gandiva] fix label during ctest invoc Author: Pindikura Ravindra Closes #3139 from pravindra/ci and squashes the following commits: 3372401c3 ARROW-3977: temporary disable valgrind c51b23aff ARROW-3977: fix label during ctest invoc 
--- .travis.yml | 3 ++- ci/travis_script_gandiva_cpp.sh | 5 +---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index b877e205b5bd0..7489d72c80502 100644 --- a/.travis.yml +++ b/.travis.yml @@ -110,7 +110,8 @@ matrix: env: - ARROW_TRAVIS_GANDIVA=1 - ARROW_TRAVIS_USE_TOOLCHAIN=1 - - ARROW_TRAVIS_VALGRIND=1 + # ARROW-3979 temporarily disabled. + - ARROW_TRAVIS_VALGRIND=0 - ARROW_BUILD_WARNING_LEVEL=CHECKIN - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9" before_script: diff --git a/ci/travis_script_gandiva_cpp.sh b/ci/travis_script_gandiva_cpp.sh index 4d0a9b7a6bac4..f3c379393fe14 100755 --- a/ci/travis_script_gandiva_cpp.sh +++ b/ci/travis_script_gandiva_cpp.sh @@ -23,10 +23,7 @@ source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh pushd $CPP_BUILD_DIR -PATH=$ARROW_BUILD_TYPE:$PATH ctest -j2 --output-on-failure -L gandiva,unittest - -# not running in parallel, since some of them are benchmarks -PATH=$ARROW_BUILD_TYPE:$PATH ctest -VV -L gandiva,integ +PATH=$ARROW_BUILD_TYPE:$PATH ctest -j2 --output-on-failure -L gandiva popd From e6b96aa30e68e3bbd020babbaa79eea83b747f0c Mon Sep 17 00:00:00 2001 From: Paddy Horan Date: Mon, 10 Dec 2018 16:30:30 +0100 Subject: [PATCH 013/328] ARROW-3687: [Rust] Anything measuring array slots should be `usize` Author: Paddy Horan Closes #3142 from paddyhoran/ARROW-3687 and squashes the following commits: c0a75e9c Fixed lint issues (outdated stable) 0b39fe87 Updated subtraction to be checked c4c223c5 Fixing lints for stable fmt d6aec71c All values measuring array slots changed to `usize` --- rust/src/array.rs | 81 +++++++++++++++++++-------------------- rust/src/array_data.rs | 61 +++++++++++++++-------------- rust/src/bitmap.rs | 6 +-- rust/src/buffer.rs | 18 ++++----- rust/src/builder.rs | 70 ++++++++++++++++----------------- rust/src/csv/reader.rs | 8 ++-- rust/src/memory.rs | 8 ++-- rust/src/record_batch.rs | 2 +- rust/src/tensor.rs | 72 +++++++++++++++++----------------- rust/src/util/bit_util.rs | 18 ++++----- 10 files changed, 173 insertions(+), 171 deletions(-) diff --git a/rust/src/array.rs b/rust/src/array.rs index ca1d2a5cdb1e7..51bc8d993c19b 100644 --- a/rust/src/array.rs +++ b/rust/src/array.rs @@ -47,27 +47,27 @@ pub trait Array: Send + Sync { } /// Returns the length (i.e., number of elements) of this array - fn len(&self) -> i64 { + fn len(&self) -> usize { self.data().len() } /// Returns the offset of this array - fn offset(&self) -> i64 { + fn offset(&self) -> usize { self.data().offset() } /// Returns whether the element at index `i` is null - fn is_null(&self, i: i64) -> bool { + fn is_null(&self, i: usize) -> bool { self.data().is_null(i) } /// Returns whether the element at index `i` is not null - fn is_valid(&self, i: i64) -> bool { + fn is_valid(&self, i: usize) -> bool { self.data().is_valid(i) } /// Returns the total number of nulls in this array - fn null_count(&self) -> i64 { + fn null_count(&self) -> usize { self.data().null_count() } } @@ -158,7 +158,7 @@ impl Array for PrimitiveArray { /// Implementation for primitive arrays with numeric types. /// Boolean arrays are bit-packed and so implemented separately. 
impl PrimitiveArray { - pub fn new(length: i64, values: Buffer, null_count: i64, offset: i64) -> Self { + pub fn new(length: usize, values: Buffer, null_count: usize, offset: usize) -> Self { let array_data = ArrayData::builder(T::get_data_type()) .len(length) .add_buffer(values) @@ -176,7 +176,7 @@ impl PrimitiveArray { } /// Returns the length of this array - pub fn len(&self) -> i64 { + pub fn len(&self) -> usize { self.data.len() } @@ -188,16 +188,16 @@ impl PrimitiveArray { /// Returns the primitive value at index `i`. /// /// Note this doesn't do any bound checking, for performance reason. - pub fn value(&self, i: i64) -> T::Native { + pub fn value(&self, i: usize) -> T::Native { unsafe { *(self.raw_values().offset(i as isize)) } } /// Returns a slice for the given offset and length /// /// Note this doesn't do any bound checking, for performance reason. - pub fn value_slice(&self, offset: i64, len: i64) -> &[T::Native] { - let raw = unsafe { std::slice::from_raw_parts(self.raw_values(), self.len() as usize) }; - &raw[offset as usize..offset as usize + len as usize] + pub fn value_slice(&self, offset: usize, len: usize) -> &[T::Native] { + let raw = unsafe { std::slice::from_raw_parts(self.raw_values(), self.len()) }; + &raw[offset..offset + len] } /// Returns the minimum value in the array, according to the natural order. @@ -220,7 +220,7 @@ impl PrimitiveArray { if data.is_null(i) { continue; } - let m = self.value(i as i64); + let m = self.value(i); match n { None => n = Some(m), Some(nn) => { @@ -234,14 +234,14 @@ impl PrimitiveArray { } // Returns a new primitive array builder - pub fn builder(capacity: i64) -> PrimitiveArrayBuilder { + pub fn builder(capacity: usize) -> PrimitiveArrayBuilder { PrimitiveArrayBuilder::::new(capacity) } } /// Specific implementation for Boolean arrays due to bit-packing impl PrimitiveArray { - pub fn new(length: i64, values: Buffer, null_count: i64, offset: i64) -> Self { + pub fn new(length: usize, values: Buffer, null_count: usize, offset: usize) -> Self { let array_data = ArrayData::builder(DataType::Boolean) .len(length) .add_buffer(values) @@ -259,14 +259,14 @@ impl PrimitiveArray { } /// Returns the boolean value at index `i`. - pub fn value(&self, i: i64) -> bool { + pub fn value(&self, i: usize) -> bool { let offset = i + self.offset(); assert!(offset < self.data.len()); - unsafe { bit_util::get_bit_raw(self.raw_values.get() as *const u8, offset as usize) } + unsafe { bit_util::get_bit_raw(self.raw_values.get() as *const u8, offset) } } // Returns a new primitive array builder - pub fn builder(capacity: i64) -> BooleanBuilder { + pub fn builder(capacity: usize) -> BooleanBuilder { BooleanBuilder::new(capacity) } } @@ -279,7 +279,7 @@ macro_rules! def_numeric_from_vec { impl From> for PrimitiveArray<$ty> { fn from(data: Vec<$native_ty>) -> Self { let array_data = ArrayData::builder($ty_id) - .len(data.len() as i64) + .len(data.len()) .add_buffer(Buffer::from(data.to_byte_slice())) .build(); PrimitiveArray::from(array_data) @@ -290,7 +290,7 @@ macro_rules! def_numeric_from_vec { impl From>> for PrimitiveArray<$ty> { fn from(data: Vec>) -> Self { let data_len = data.len(); - let num_bytes = bit_util::ceil(data_len as i64, 8) as usize; + let num_bytes = bit_util::ceil(data_len, 8); let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false); let mut val_buf = MutableBuffer::new(data_len * mem::size_of::<$native_ty>()); @@ -310,7 +310,7 @@ macro_rules! 
def_numeric_from_vec { } let array_data = ArrayData::builder($ty_id) - .len(data_len as i64) + .len(data_len) .add_buffer(val_buf.freeze()) .null_bit_buffer(null_buf.freeze()) .build(); @@ -334,7 +334,7 @@ def_numeric_from_vec!(Float64Type, f64, DataType::Float64); /// Constructs a boolean array from a vector. Should only be used for testing. impl From> for BooleanArray { fn from(data: Vec) -> Self { - let num_byte = bit_util::ceil(data.len() as i64, 8) as usize; + let num_byte = bit_util::ceil(data.len(), 8); let mut mut_buf = MutableBuffer::new(num_byte).with_bitset(num_byte, false); { let mut_slice = mut_buf.data_mut(); @@ -345,7 +345,7 @@ impl From> for BooleanArray { } } let array_data = ArrayData::builder(DataType::Boolean) - .len(data.len() as i64) + .len(data.len()) .add_buffer(mut_buf.freeze()) .build(); BooleanArray::from(array_data) @@ -354,8 +354,8 @@ impl From> for BooleanArray { impl From>> for BooleanArray { fn from(data: Vec>) -> Self { - let data_len = data.len() as i64; - let num_byte = bit_util::ceil(data_len, 8) as usize; + let data_len = data.len(); + let num_byte = bit_util::ceil(data_len, 8); let mut null_buf = MutableBuffer::new(num_byte).with_bitset(num_byte, false); let mut val_buf = MutableBuffer::new(num_byte).with_bitset(num_byte, false); @@ -425,7 +425,7 @@ impl ListArray { /// /// Note this doesn't do any bound checking, for performance reason. #[inline] - pub fn value_offset(&self, i: i64) -> i32 { + pub fn value_offset(&self, i: usize) -> i32 { self.value_offset_at(self.data.offset() + i) } @@ -433,13 +433,13 @@ impl ListArray { /// /// Note this doesn't do any bound checking, for performance reason. #[inline] - pub fn value_length(&self, mut i: i64) -> i32 { + pub fn value_length(&self, mut i: usize) -> i32 { i += self.data.offset(); self.value_offset_at(i + 1) - self.value_offset_at(i) } #[inline] - fn value_offset_at(&self, i: i64) -> i32 { + fn value_offset_at(&self, i: usize) -> i32 { unsafe { *self.value_offsets.get().offset(i as isize) } } } @@ -503,11 +503,8 @@ pub struct BinaryArray { impl BinaryArray { /// Returns the element at index `i` as a byte slice. - pub fn get_value(&self, i: i64) -> &[u8] { - assert!( - i >= 0 && i < self.data.len(), - "BinaryArray out of bounds access" - ); + pub fn get_value(&self, i: usize) -> &[u8] { + assert!(i < self.data.len(), "BinaryArray out of bounds access"); let offset = i.checked_add(self.data.offset()).unwrap(); unsafe { let pos = self.value_offset_at(offset); @@ -521,7 +518,7 @@ impl BinaryArray { /// Returns the element at index `i` as a string. /// /// Note this doesn't do any bound checking, for performance reason. - pub fn get_string(&self, i: i64) -> String { + pub fn get_string(&self, i: usize) -> String { let slice = self.get_value(i); unsafe { String::from_utf8_unchecked(Vec::from(slice)) } } @@ -530,7 +527,7 @@ impl BinaryArray { /// /// Note this doesn't do any bound checking, for performance reason. #[inline] - pub fn value_offset(&self, i: i64) -> i32 { + pub fn value_offset(&self, i: usize) -> i32 { self.value_offset_at(self.data.offset() + i) } @@ -538,13 +535,13 @@ impl BinaryArray { /// /// Note this doesn't do any bound checking, for performance reason. 
#[inline] - pub fn value_length(&self, mut i: i64) -> i32 { + pub fn value_length(&self, mut i: usize) -> i32 { i += self.data.offset(); self.value_offset_at(i + 1) - self.value_offset_at(i) } #[inline] - fn value_offset_at(&self, i: i64) -> i32 { + fn value_offset_at(&self, i: usize) -> i32 { unsafe { *self.value_offsets.get().offset(i as isize) } } } @@ -582,7 +579,7 @@ impl<'a> From> for BinaryArray { values.extend_from_slice(s.as_bytes()); } let array_data = ArrayData::builder(DataType::Utf8) - .len(v.len() as i64) + .len(v.len()) .add_buffer(Buffer::from(offsets.to_byte_slice())) .add_buffer(Buffer::from(&values[..])) .build(); @@ -664,7 +661,7 @@ impl Array for StructArray { } /// Returns the length (i.e., number of elements) of this array - fn len(&self) -> i64 { + fn len(&self) -> usize { self.boxed_fields[0].len() } } @@ -876,8 +873,8 @@ mod tests { assert_eq!(6, list_array.value_offset(2)); assert_eq!(2, list_array.value_length(2)); for i in 0..3 { - assert!(list_array.is_valid(i as i64)); - assert!(!list_array.is_null(i as i64)); + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); } // Now test with a non-zero offset @@ -991,8 +988,8 @@ mod tests { assert_eq!(5, binary_array.value_offset(2)); assert_eq!(7, binary_array.value_length(2)); for i in 0..3 { - assert!(binary_array.is_valid(i as i64)); - assert!(!binary_array.is_null(i as i64)); + assert!(binary_array.is_valid(i)); + assert!(!binary_array.is_null(i)); } // Test binary array with offset diff --git a/rust/src/array_data.rs b/rust/src/array_data.rs index b288d4a804535..36a817ee579a0 100644 --- a/rust/src/array_data.rs +++ b/rust/src/array_data.rs @@ -31,13 +31,13 @@ pub struct ArrayData { data_type: DataType, /// The number of elements in this array data - len: i64, + len: usize, /// The number of null elements in this array data - null_count: i64, + null_count: usize, /// The offset into this array data - offset: i64, + offset: usize, /// The buffers for this array data. 
Note that depending on the array types, this /// could hold different kinds of buffers (e.g., value buffer, value offset buffer) @@ -54,25 +54,28 @@ pub struct ArrayData { } pub type ArrayDataRef = Arc; -pub const UNKNOWN_NULL_COUNT: i64 = -1; impl ArrayData { pub fn new( data_type: DataType, - len: i64, - mut null_count: i64, + len: usize, + null_count: Option, null_bit_buffer: Option, - offset: i64, + offset: usize, buffers: Vec, child_data: Vec, ) -> Self { - if null_count < 0 { - null_count = if let Some(ref buf) = null_bit_buffer { - len - bit_util::count_set_bits_offset(buf.data(), offset as usize) - } else { - 0 - }; - } + let null_count = match null_count { + None => { + if let Some(ref buf) = null_bit_buffer { + len.checked_sub(bit_util::count_set_bits_offset(buf.data(), offset)) + .unwrap() + } else { + 0 + } + } + Some(null_count) => null_count, + }; let null_bitmap = null_bit_buffer.map(Bitmap::from); Self { data_type, @@ -106,7 +109,7 @@ impl ArrayData { } /// Returns whether the element at index `i` is null - pub fn is_null(&self, i: i64) -> bool { + pub fn is_null(&self, i: usize) -> bool { if let Some(ref b) = self.null_bitmap { return !b.is_set(i); } @@ -119,7 +122,7 @@ impl ArrayData { } /// Returns whether the element at index `i` is not null - pub fn is_valid(&self, i: i64) -> bool { + pub fn is_valid(&self, i: usize) -> bool { if let Some(ref b) = self.null_bitmap { return b.is_set(i); } @@ -127,17 +130,17 @@ impl ArrayData { } /// Returns the length (i.e., number of elements) of this array - pub fn len(&self) -> i64 { + pub fn len(&self) -> usize { self.len } /// Returns the offset of this array - pub fn offset(&self) -> i64 { + pub fn offset(&self) -> usize { self.offset } /// Returns the total number of nulls in this array - pub fn null_count(&self) -> i64 { + pub fn null_count(&self) -> usize { self.null_count } } @@ -145,10 +148,10 @@ impl ArrayData { /// Builder for `ArrayData` type pub struct ArrayDataBuilder { data_type: DataType, - len: i64, - null_count: i64, + len: usize, + null_count: Option, null_bit_buffer: Option, - offset: i64, + offset: usize, buffers: Vec, child_data: Vec, } @@ -158,7 +161,7 @@ impl ArrayDataBuilder { Self { data_type, len: 0, - null_count: UNKNOWN_NULL_COUNT, + null_count: None, null_bit_buffer: None, offset: 0, buffers: vec![], @@ -166,13 +169,13 @@ impl ArrayDataBuilder { } } - pub fn len(mut self, n: i64) -> Self { + pub fn len(mut self, n: usize) -> Self { self.len = n; self } - pub fn null_count(mut self, n: i64) -> Self { - self.null_count = n; + pub fn null_count(mut self, n: usize) -> Self { + self.null_count = Some(n); self } @@ -181,7 +184,7 @@ impl ArrayDataBuilder { self } - pub fn offset(mut self, n: i64) -> Self { + pub fn offset(mut self, n: usize) -> Self { self.offset = n; self } @@ -230,7 +233,7 @@ mod tests { #[test] fn test_new() { - let arr_data = ArrayData::new(DataType::Boolean, 10, 1, None, 2, vec![], vec![]); + let arr_data = ArrayData::new(DataType::Boolean, 10, Some(1), None, 2, vec![], vec![]); assert_eq!(10, arr_data.len()); assert_eq!(1, arr_data.null_count()); assert_eq!(2, arr_data.offset()); @@ -244,7 +247,7 @@ mod tests { let child_arr_data = Arc::new(ArrayData::new( DataType::Int32, 10, - 0, + Some(0), None, 0, vec![], diff --git a/rust/src/bitmap.rs b/rust/src/bitmap.rs index 742fac5587b3e..3d5a77f78a51e 100644 --- a/rust/src/bitmap.rs +++ b/rust/src/bitmap.rs @@ -45,9 +45,9 @@ impl Bitmap { self.bits.len() } - pub fn is_set(&self, i: i64) -> bool { - assert!(i < (self.bits.len() << 3) as i64); - 
unsafe { bit_util::get_bit_raw(self.bits.raw_data(), i as usize) } + pub fn is_set(&self, i: usize) -> bool { + assert!(i < (self.bits.len() << 3)); + unsafe { bit_util::get_bit_raw(self.bits.raw_data(), i) } } } diff --git a/rust/src/buffer.rs b/rust/src/buffer.rs index 4b7d2a0d3c97e..b9c159f33857a 100644 --- a/rust/src/buffer.rs +++ b/rust/src/buffer.rs @@ -49,7 +49,7 @@ impl PartialEq for BufferData { if self.len != other.len { return false; } - unsafe { memory::memcmp(self.ptr, other.ptr, self.len as usize) == 0 } + unsafe { memory::memcmp(self.ptr, other.ptr, self.len) == 0 } } } @@ -73,7 +73,7 @@ impl Buffer { /// Returns the number of bytes in the buffer pub fn len(&self) -> usize { - self.data.len - self.offset as usize + self.data.len - self.offset } /// Returns whether the buffer is empty. @@ -128,7 +128,7 @@ impl> From for Buffer { // allocate aligned memory buffer let slice = p.as_ref(); let len = slice.len() * mem::size_of::(); - let buffer = memory::allocate_aligned((len) as i64).unwrap(); + let buffer = memory::allocate_aligned(len).unwrap(); unsafe { memory::memcpy(buffer, slice.as_ptr(), len); } @@ -151,12 +151,12 @@ pub struct MutableBuffer { impl MutableBuffer { /// Allocate a new mutable buffer with initial capacity to be `capacity`. pub fn new(capacity: usize) -> Self { - let new_capacity = bit_util::round_upto_multiple_of_64(capacity as i64); + let new_capacity = bit_util::round_upto_multiple_of_64(capacity); let ptr = memory::allocate_aligned(new_capacity).unwrap(); Self { data: ptr, len: 0, - capacity: new_capacity as usize, + capacity: new_capacity, } } @@ -193,8 +193,8 @@ impl MutableBuffer { /// Returns the new capacity for this buffer. pub fn reserve(&mut self, capacity: usize) -> Result { if capacity > self.capacity { - let new_capacity = bit_util::round_upto_multiple_of_64(capacity as i64); - let new_capacity = cmp::max(new_capacity, self.capacity as i64 * 2) as usize; + let new_capacity = bit_util::round_upto_multiple_of_64(capacity); + let new_capacity = cmp::max(new_capacity, self.capacity * 2); let new_data = memory::reallocate(self.capacity, new_capacity, self.data)?; self.data = new_data as *mut u8; self.capacity = new_capacity; @@ -213,7 +213,7 @@ impl MutableBuffer { if new_len > self.len { self.reserve(new_len)?; } else { - let new_capacity = bit_util::round_upto_multiple_of_64(new_len as i64) as usize; + let new_capacity = bit_util::round_upto_multiple_of_64(new_len); if new_capacity < self.capacity { let new_data = memory::reallocate(self.capacity, new_capacity, self.data)?; self.data = new_data as *mut u8; @@ -287,7 +287,7 @@ impl PartialEq for MutableBuffer { if self.len != other.len { return false; } - unsafe { memory::memcmp(self.data, other.data, self.len as usize) == 0 } + unsafe { memory::memcmp(self.data, other.data, self.len) == 0 } } } diff --git a/rust/src/builder.rs b/rust/src/builder.rs index 2cbdce0c8570b..fc781ffa50641 100644 --- a/rust/src/builder.rs +++ b/rust/src/builder.rs @@ -33,7 +33,7 @@ use crate::util::bit_util; /// Buffer builder with zero-copy build method pub struct BufferBuilder { buffer: MutableBuffer, - len: i64, + len: usize, _marker: PhantomData, } @@ -53,11 +53,11 @@ pub type Float64BufferBuilder = BufferBuilder; // numeric types and boolean types, while still be able to call methods on buffer builder // with generic primitive type. 
pub trait BufferBuilderTrait { - fn new(capacity: i64) -> Self; - fn len(&self) -> i64; - fn capacity(&self) -> i64; - fn advance(&mut self, i: i64) -> Result<()>; - fn reserve(&mut self, n: i64) -> Result<()>; + fn new(capacity: usize) -> Self; + fn len(&self) -> usize; + fn capacity(&self) -> usize; + fn advance(&mut self, i: usize) -> Result<()>; + fn reserve(&mut self, n: usize) -> Result<()>; fn push(&mut self, v: T::Native) -> Result<()>; fn push_slice(&mut self, slice: &[T::Native]) -> Result<()>; fn finish(self) -> Buffer; @@ -65,8 +65,8 @@ pub trait BufferBuilderTrait { impl BufferBuilderTrait for BufferBuilder { /// Creates a builder with a fixed initial capacity - default fn new(capacity: i64) -> Self { - let buffer = MutableBuffer::new(capacity as usize * mem::size_of::()); + default fn new(capacity: usize) -> Self { + let buffer = MutableBuffer::new(capacity * mem::size_of::()); Self { buffer, len: 0, @@ -75,28 +75,28 @@ impl BufferBuilderTrait for BufferBuilder { } /// Returns the number of array elements (slots) in the builder - fn len(&self) -> i64 { + fn len(&self) -> usize { self.len } /// Returns the current capacity of the builder (number of elements) - fn capacity(&self) -> i64 { + fn capacity(&self) -> usize { let bit_capacity = self.buffer.capacity() * 8; - (bit_capacity / T::get_bit_width()) as i64 + (bit_capacity / T::get_bit_width()) } // Advances the `len` of the underlying `Buffer` by `i` slots of type T - default fn advance(&mut self, i: i64) -> Result<()> { - let new_buffer_len = (self.len + i) as usize * mem::size_of::(); + default fn advance(&mut self, i: usize) -> Result<()> { + let new_buffer_len = (self.len + i) * mem::size_of::(); self.buffer.resize(new_buffer_len)?; self.len += i; Ok(()) } /// Reserves memory for `n` elements of type `T`. - default fn reserve(&mut self, n: i64) -> Result<()> { + default fn reserve(&mut self, n: usize) -> Result<()> { let new_capacity = self.len + n; - let byte_capacity = mem::size_of::() * new_capacity as usize; + let byte_capacity = mem::size_of::() * new_capacity; self.buffer.reserve(byte_capacity)?; Ok(()) } @@ -109,7 +109,7 @@ impl BufferBuilderTrait for BufferBuilder { /// Pushes a slice of type `T`, growing the internal buffer as needed. default fn push_slice(&mut self, slice: &[T::Native]) -> Result<()> { - let array_slots = slice.len() as i64; + let array_slots = slice.len(); self.reserve(array_slots)?; self.write_bytes(slice.to_byte_slice(), array_slots) } @@ -124,7 +124,7 @@ impl BufferBuilder { /// Writes a byte slice to the underlying buffer and updates the `len`, i.e. the number array /// elements in the builder. Also, converts the `io::Result` required by the `Write` trait /// to the Arrow `Result` type. - fn write_bytes(&mut self, bytes: &[u8], len_added: i64) -> Result<()> { + fn write_bytes(&mut self, bytes: &[u8], len_added: usize) -> Result<()> { let write_result = self.buffer.write(bytes); // `io::Result` has many options one of which we use, so pattern matching is overkill here if write_result.is_err() { @@ -140,9 +140,9 @@ impl BufferBuilder { impl BufferBuilderTrait for BufferBuilder { /// Creates a builder with a fixed initial capacity. 
- fn new(capacity: i64) -> Self { + fn new(capacity: usize) -> Self { let byte_capacity = bit_util::ceil(capacity, 8); - let actual_capacity = bit_util::round_upto_multiple_of_64(byte_capacity) as usize; + let actual_capacity = bit_util::round_upto_multiple_of_64(byte_capacity); let mut buffer = MutableBuffer::new(actual_capacity); buffer.set_null_bits(0, actual_capacity); Self { @@ -153,9 +153,9 @@ impl BufferBuilderTrait for BufferBuilder { } // Advances the `len` of the underlying `Buffer` by `i` slots of type T - fn advance(&mut self, i: i64) -> Result<()> { + fn advance(&mut self, i: usize) -> Result<()> { let new_buffer_len = bit_util::ceil(self.len + i, 8); - self.buffer.resize(new_buffer_len as usize)?; + self.buffer.resize(new_buffer_len)?; self.len += i; Ok(()) } @@ -167,7 +167,7 @@ impl BufferBuilderTrait for BufferBuilder { // For performance the `len` of the buffer is not updated on each push but // is updated in the `freeze` method instead. unsafe { - bit_util::set_bit_raw(self.buffer.raw_data() as *mut u8, (self.len) as usize); + bit_util::set_bit_raw(self.buffer.raw_data() as *mut u8, self.len); } } self.len += 1; @@ -184,10 +184,10 @@ impl BufferBuilderTrait for BufferBuilder { } /// Reserves memory for `n` elements of type `T`. - fn reserve(&mut self, n: i64) -> Result<()> { + fn reserve(&mut self, n: usize) -> Result<()> { let new_capacity = self.len + n; if new_capacity > self.capacity() { - let new_byte_capacity = bit_util::ceil(new_capacity, 8) as usize; + let new_byte_capacity = bit_util::ceil(new_capacity, 8); let existing_capacity = self.buffer.capacity(); let new_capacity = self.buffer.reserve(new_byte_capacity)?; self.buffer @@ -199,7 +199,7 @@ impl BufferBuilderTrait for BufferBuilder { /// Consumes this and returns an immutable `Buffer`. fn finish(mut self) -> Buffer { // `push` does not update the buffer's `len` so do it before `freeze` is called. 
- let new_buffer_len = bit_util::ceil(self.len, 8) as usize; + let new_buffer_len = bit_util::ceil(self.len, 8); debug_assert!(new_buffer_len >= self.buffer.len()); self.buffer.resize(new_buffer_len).unwrap(); self.buffer.freeze() @@ -216,7 +216,7 @@ pub trait ArrayBuilder { fn into_any(self) -> Box; /// Returns the number of array slots in the builder - fn len(&self) -> i64; + fn len(&self) -> usize; /// Builds the array fn finish(self) -> Self::ArrayType; @@ -250,7 +250,7 @@ impl ArrayBuilder for PrimitiveArrayBuilder { } /// Returns the number of array slots in the builder - fn len(&self) -> i64 { + fn len(&self) -> usize { self.values_builder.len } @@ -270,7 +270,7 @@ impl ArrayBuilder for PrimitiveArrayBuilder { impl PrimitiveArrayBuilder { /// Creates a new primitive array builder - pub fn new(capacity: i64) -> Self { + pub fn new(capacity: usize) -> Self { Self { values_builder: BufferBuilder::::new(capacity), bitmap_builder: BooleanBufferBuilder::new(capacity), @@ -278,7 +278,7 @@ impl PrimitiveArrayBuilder { } /// Returns the capacity of this builder measured in slots of type `T` - pub fn capacity(&self) -> i64 { + pub fn capacity(&self) -> usize { self.values_builder.capacity() } @@ -318,7 +318,7 @@ pub struct ListArrayBuilder { offsets_builder: Int32BufferBuilder, bitmap_builder: BooleanBufferBuilder, values_builder: T, - len: i64, + len: usize, } impl ListArrayBuilder { @@ -348,7 +348,7 @@ where } /// Returns the number of array slots in the builder - fn len(&self) -> i64 { + fn len(&self) -> usize { self.len } @@ -410,7 +410,7 @@ impl ArrayBuilder for BinaryArrayBuilder { } /// Returns the number of array slots in the builder - fn len(&self) -> i64 { + fn len(&self) -> usize { self.builder.len() } @@ -422,7 +422,7 @@ impl ArrayBuilder for BinaryArrayBuilder { impl BinaryArrayBuilder { /// Creates a new `BinaryArrayBuilder`, `capacity` is the number of bytes in the values array - pub fn new(capacity: i64) -> Self { + pub fn new(capacity: usize) -> Self { let values_builder = UInt8Builder::new(capacity); Self { builder: ListArrayBuilder::new(values_builder), @@ -736,8 +736,8 @@ mod tests { assert_eq!(6, list_array.value_offset(2)); assert_eq!(2, list_array.value_length(2)); for i in 0..3 { - assert!(list_array.is_valid(i as i64)); - assert!(!list_array.is_null(i as i64)); + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); } } diff --git a/rust/src/csv/reader.rs b/rust/src/csv/reader.rs index 697ace653b691..956408e4a40c3 100644 --- a/rust/src/csv/reader.rs +++ b/rust/src/csv/reader.rs @@ -44,13 +44,15 @@ use std::fs::File; use std::io::BufReader; use std::sync::Arc; +use csv as csv_crate; + use crate::array::{ArrayRef, BinaryArray}; use crate::builder::*; use crate::datatypes::*; use crate::error::{ArrowError, Result}; use crate::record_batch::RecordBatch; -use csv_crate::{StringRecord, StringRecordsIntoIter}; +use self::csv_crate::{StringRecord, StringRecordsIntoIter}; /// CSV file reader pub struct Reader { @@ -91,7 +93,7 @@ fn build_primitive_array( rows: &[StringRecord], col_idx: &usize, ) -> Result { - let mut builder = PrimitiveArrayBuilder::::new(rows.len() as i64); + let mut builder = PrimitiveArrayBuilder::::new(rows.len()); for row_index in 0..rows.len() { match rows[row_index].get(*col_idx) { Some(s) if s.len() > 0 => match s.parse::() { @@ -161,7 +163,7 @@ impl Reader { &DataType::Float32 => build_primitive_array::(rows, i), &DataType::Float64 => build_primitive_array::(rows, i), &DataType::Utf8 => { - let values_builder: UInt8Builder = 
UInt8Builder::new(rows.len() as i64); + let values_builder: UInt8Builder = UInt8Builder::new(rows.len()); let mut list_builder = ListArrayBuilder::new(values_builder); for row_index in 0..rows.len() { match rows[row_index].get(*i) { diff --git a/rust/src/memory.rs b/rust/src/memory.rs index 193eff12d6f6f..763cb48f50f9e 100644 --- a/rust/src/memory.rs +++ b/rust/src/memory.rs @@ -31,7 +31,7 @@ extern "C" { } #[cfg(windows)] -pub fn allocate_aligned(size: i64) -> Result<*mut u8> { +pub fn allocate_aligned(size: usize) -> Result<*mut u8> { let page = unsafe { _aligned_malloc(size as libc::size_t, ALIGNMENT as libc::size_t) }; match page { 0 => Err(ArrowError::MemoryError( @@ -42,10 +42,10 @@ pub fn allocate_aligned(size: i64) -> Result<*mut u8> { } #[cfg(not(windows))] -pub fn allocate_aligned(size: i64) -> Result<*mut u8> { +pub fn allocate_aligned(size: usize) -> Result<*mut u8> { unsafe { let mut page: *mut libc::c_void = mem::uninitialized(); - let result = libc::posix_memalign(&mut page, ALIGNMENT, size as usize); + let result = libc::posix_memalign(&mut page, ALIGNMENT, size); match result { 0 => Ok(mem::transmute::<*mut libc::c_void, *mut u8>(page)), _ => Err(ArrowError::MemoryError( @@ -72,7 +72,7 @@ pub fn free_aligned(p: *const u8) { pub fn reallocate(old_size: usize, new_size: usize, pointer: *const u8) -> Result<*const u8> { unsafe { let old_src = mem::transmute::<*const u8, *mut libc::c_void>(pointer); - let result = allocate_aligned(new_size as i64)?; + let result = allocate_aligned(new_size)?; let dst = mem::transmute::<*const u8, *mut libc::c_void>(result); libc::memcpy(dst, old_src, cmp::min(old_size, new_size)); free_aligned(pointer); diff --git a/rust/src/record_batch.rs b/rust/src/record_batch.rs index 4cb5c8e7db4df..2666770460e84 100644 --- a/rust/src/record_batch.rs +++ b/rust/src/record_batch.rs @@ -52,7 +52,7 @@ impl RecordBatch { self.columns.len() } - pub fn num_rows(&self) -> i64 { + pub fn num_rows(&self) -> usize { self.columns[0].data().len() } diff --git a/rust/src/tensor.rs b/rust/src/tensor.rs index ec56aeb4cccd5..175b68d81f188 100644 --- a/rust/src/tensor.rs +++ b/rust/src/tensor.rs @@ -23,30 +23,30 @@ use crate::buffer::Buffer; use crate::datatypes::*; /// Computes the strides required assuming a row major memory layout -fn compute_row_major_strides(shape: &Vec) -> Vec { +fn compute_row_major_strides(shape: &Vec) -> Vec { let mut remaining_bytes = mem::size_of::(); for i in shape { remaining_bytes = remaining_bytes - .checked_mul(*i as usize) + .checked_mul(*i) .expect("Overflow occurred when computing row major strides."); } - let mut strides = Vec::::new(); + let mut strides = Vec::::new(); for i in shape { - remaining_bytes /= *i as usize; - strides.push(remaining_bytes as i64); + remaining_bytes /= *i; + strides.push(remaining_bytes); } strides } /// Computes the strides required assuming a column major memory layout -fn compute_column_major_strides(shape: &Vec) -> Vec { +fn compute_column_major_strides(shape: &Vec) -> Vec { let mut remaining_bytes = mem::size_of::(); - let mut strides = Vec::::new(); + let mut strides = Vec::::new(); for i in shape { - strides.push(remaining_bytes as i64); + strides.push(remaining_bytes); remaining_bytes = remaining_bytes - .checked_mul(*i as usize) + .checked_mul(*i) .expect("Overflow occurred when computing column major strides."); } strides @@ -56,8 +56,8 @@ fn compute_column_major_strides(shape: &Vec) -> Vec< pub struct Tensor<'a, T: ArrowPrimitiveType> { data_type: DataType, buffer: Buffer, - shape: Option>, - 
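    // Worked example for the stride helpers above (illustration only, not part
    // of this patch): an Int32 tensor with shape [2, 8] has 4-byte elements, so
    //   compute_row_major_strides::<Int32Type>(&vec![2, 8])    == vec![32, 4]
    //   compute_column_major_strides::<Int32Type>(&vec![2, 8]) == vec![4, 8]
    // in bytes, matching the values asserted by the tests further down in this file.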
strides: Option>, + shape: Option>, + strides: Option>, names: Option>, _marker: PhantomData, } @@ -78,8 +78,8 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { /// Creates a new `Tensor` pub fn new( buffer: Buffer, - shape: Option>, - strides: Option>, + shape: Option>, + strides: Option>, names: Option>, ) -> Self { match &shape { @@ -122,7 +122,7 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { /// Creates a new Tensor using row major memory layout pub fn new_row_major( buffer: Buffer, - shape: Option>, + shape: Option>, names: Option>, ) -> Self { let strides = match &shape { @@ -135,7 +135,7 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { /// Creates a new Tensor using column major memory layout pub fn new_column_major( buffer: Buffer, - shape: Option>, + shape: Option>, names: Option>, ) -> Self { let strides = match &shape { @@ -151,7 +151,7 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { } /// The sizes of the dimensions - pub fn shape(&self) -> Option<&Vec> { + pub fn shape(&self) -> Option<&Vec> { self.shape.as_ref() } @@ -161,7 +161,7 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { } /// The number of bytes between elements in each dimension - pub fn strides(&self) -> Option<&Vec> { + pub fn strides(&self) -> Option<&Vec> { self.strides.as_ref() } @@ -171,24 +171,24 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { } /// The number of dimensions - pub fn ndim(&self) -> i64 { + pub fn ndim(&self) -> usize { match &self.shape { None => 0, - Some(v) => v.len() as i64, + Some(v) => v.len(), } } /// The name of dimension i - pub fn dim_name(&self, i: i64) -> Option<&'a str> { + pub fn dim_name(&self, i: usize) -> Option<&'a str> { match &self.names { None => None, - Some(ref names) => Some(&names[i as usize]), + Some(ref names) => Some(&names[i]), } } /// The total number of elements in the `Tensor` - pub fn size(&self) -> i64 { - (self.buffer.len() / mem::size_of::()) as i64 + pub fn size(&self) -> usize { + (self.buffer.len() / mem::size_of::()) } /// Indicates if the data is laid out contiguously in memory @@ -223,15 +223,15 @@ mod tests { fn test_compute_row_major_strides() { assert_eq!( vec![48, 8], - compute_row_major_strides::(&vec![4_i64, 6]) + compute_row_major_strides::(&vec![4_usize, 6]) ); assert_eq!( vec![24, 4], - compute_row_major_strides::(&vec![4_i64, 6]) + compute_row_major_strides::(&vec![4_usize, 6]) ); assert_eq!( vec![6, 1], - compute_row_major_strides::(&vec![4_i64, 6]) + compute_row_major_strides::(&vec![4_usize, 6]) ); } @@ -239,15 +239,15 @@ mod tests { fn test_compute_column_major_strides() { assert_eq!( vec![8, 32], - compute_column_major_strides::(&vec![4_i64, 6]) + compute_column_major_strides::(&vec![4_usize, 6]) ); assert_eq!( vec![4, 16], - compute_column_major_strides::(&vec![4_i64, 6]) + compute_column_major_strides::(&vec![4_usize, 6]) ); assert_eq!( vec![1, 4], - compute_column_major_strides::(&vec![4_i64, 6]) + compute_column_major_strides::(&vec![4_usize, 6]) ); } @@ -283,7 +283,7 @@ mod tests { let buf = builder.finish(); let tensor = Int32Tensor::new(buf, Some(vec![2, 8]), None, None); assert_eq!(16, tensor.size()); - assert_eq!(Some(vec![2_i64, 8]).as_ref(), tensor.shape()); + assert_eq!(Some(vec![2_usize, 8]).as_ref(), tensor.shape()); assert_eq!(None, tensor.strides()); assert_eq!(2, tensor.ndim()); assert_eq!(None, tensor.names()); @@ -298,8 +298,8 @@ mod tests { let buf = builder.finish(); let tensor = Int32Tensor::new_row_major(buf, Some(vec![2, 8]), None); assert_eq!(16, tensor.size()); - assert_eq!(Some(vec![2_i64, 
8]).as_ref(), tensor.shape()); - assert_eq!(Some(vec![32_i64, 4]).as_ref(), tensor.strides()); + assert_eq!(Some(vec![2_usize, 8]).as_ref(), tensor.shape()); + assert_eq!(Some(vec![32_usize, 4]).as_ref(), tensor.strides()); assert_eq!(None, tensor.names()); assert_eq!(2, tensor.ndim()); assert_eq!(true, tensor.is_row_major()); @@ -316,8 +316,8 @@ mod tests { let buf = builder.finish(); let tensor = Int32Tensor::new_column_major(buf, Some(vec![2, 8]), None); assert_eq!(16, tensor.size()); - assert_eq!(Some(vec![2_i64, 8]).as_ref(), tensor.shape()); - assert_eq!(Some(vec![4_i64, 8]).as_ref(), tensor.strides()); + assert_eq!(Some(vec![2_usize, 8]).as_ref(), tensor.shape()); + assert_eq!(Some(vec![4_usize, 8]).as_ref(), tensor.strides()); assert_eq!(None, tensor.names()); assert_eq!(2, tensor.ndim()); assert_eq!(false, tensor.is_row_major()); @@ -335,8 +335,8 @@ mod tests { let names = vec!["Dim 1", "Dim 2"]; let tensor = Int64Tensor::new_column_major(buf, Some(vec![2, 4]), Some(names)); assert_eq!(8, tensor.size()); - assert_eq!(Some(vec![2_i64, 4]).as_ref(), tensor.shape()); - assert_eq!(Some(vec![8_i64, 16]).as_ref(), tensor.strides()); + assert_eq!(Some(vec![2_usize, 4]).as_ref(), tensor.shape()); + assert_eq!(Some(vec![8_usize, 16]).as_ref(), tensor.strides()); assert_eq!("Dim 1", tensor.dim_name(0).unwrap()); assert_eq!("Dim 2", tensor.dim_name(1).unwrap()); assert_eq!(2, tensor.ndim()); diff --git a/rust/src/util/bit_util.rs b/rust/src/util/bit_util.rs index da6d10d269ca2..3f7f4cb573b49 100644 --- a/rust/src/util/bit_util.rs +++ b/rust/src/util/bit_util.rs @@ -30,13 +30,13 @@ static POPCOUNT_TABLE: [u8; 256] = [ /// Returns the nearest number that is `>=` than `num` and is a multiple of 64 #[inline] -pub fn round_upto_multiple_of_64(num: i64) -> i64 { +pub fn round_upto_multiple_of_64(num: usize) -> usize { round_upto_power_of_2(num, 64) } /// Returns the nearest multiple of `factor` that is `>=` than `num`. Here `factor` must /// be a power of 2. -fn round_upto_power_of_2(num: i64, factor: i64) -> i64 { +fn round_upto_power_of_2(num: usize, factor: usize) -> usize { debug_assert!(factor > 0 && (factor & (factor - 1)) == 0); (num + (factor - 1)) & !(factor - 1) } @@ -73,20 +73,20 @@ pub unsafe fn set_bit_raw(data: *mut u8, i: usize) { /// Returns the number of 1-bits in `data` #[inline] -pub fn count_set_bits(data: &[u8]) -> i64 { - let mut count: i64 = 0; +pub fn count_set_bits(data: &[u8]) -> usize { + let mut count: usize = 0; for u in data { - count += POPCOUNT_TABLE[*u as usize] as i64; + count += POPCOUNT_TABLE[*u as usize] as usize; } count } /// Returns the number of 1-bits in `data`, starting from `offset`. 
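    // Worked example for the rounding helpers above (illustration only, not part
    // of this patch): round_upto_multiple_of_64(10) evaluates
    //   (10 + 63) & !63 == 64
    // and, now that both argument and result are usize, the value can be passed
    // straight to MutableBuffer::new or reserve without i64/usize casts.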
#[inline] -pub fn count_set_bits_offset(data: &[u8], offset: usize) -> i64 { +pub fn count_set_bits_offset(data: &[u8], offset: usize) -> usize { debug_assert!(offset <= (data.len() << 3)); - let start_byte_pos = (offset >> 3) as usize; + let start_byte_pos = offset >> 3; let start_bit_pos = offset & 7; if start_bit_pos == 0 { @@ -95,7 +95,7 @@ pub fn count_set_bits_offset(data: &[u8], offset: usize) -> i64 { let mut result = 0; result += count_set_bits(&data[start_byte_pos + 1..]); for i in start_bit_pos..8 { - if get_bit(&data[start_byte_pos..start_byte_pos + 1], i as usize) { + if get_bit(&data[start_byte_pos..start_byte_pos + 1], i) { result += 1; } } @@ -105,7 +105,7 @@ pub fn count_set_bits_offset(data: &[u8], offset: usize) -> i64 { /// Returns the ceil of `value`/`divisor` #[inline] -pub fn ceil(value: i64, divisor: i64) -> i64 { +pub fn ceil(value: usize, divisor: usize) -> usize { let mut result = value / divisor; if value % divisor != 0 { result += 1 From 8973cfe4332e4b8e917fb52e47168ccea8b9653d Mon Sep 17 00:00:00 2001 From: Praveen Date: Mon, 10 Dec 2018 16:39:43 +0100 Subject: [PATCH 014/328] ARROW-3983: [Gandiva][Crossbow] Link Boost statically in JAR packaging scripts Use static boost libraries while packaging Gandiva. Author: Praveen Closes #3145 from praveenbingo/ARROW-3983 and squashes the following commits: 2a704969 ARROW-3983: Use static version of boost. --- dev/tasks/gandiva-jars/build-cpp.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/tasks/gandiva-jars/build-cpp.sh b/dev/tasks/gandiva-jars/build-cpp.sh index a0538cf6f3116..21289dee5a6b1 100755 --- a/dev/tasks/gandiva-jars/build-cpp.sh +++ b/dev/tasks/gandiva-jars/build-cpp.sh @@ -29,6 +29,7 @@ pushd arrow/cpp -DARROW_GANDIVA=ON \ -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON \ -DARROW_BUILD_UTILITIES=OFF \ + -DARROW_BOOST_USE_SHARED=OFF \ .. make -j4 ctest From 9da458437162574f3e0d82e4a51dc6c1589b9f94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 10 Dec 2018 16:42:53 +0100 Subject: [PATCH 015/328] ARROW-2624: [Python] Random schema generator for Arrow conversion and Parquet testing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - introduced hypothesis to generate pyarrow types, fields and schemas - test cases to highlight the functionality provided by hypothesis - hypothesis tests are disabled by default - represent kev-value metadata as OrderedDict on python side instead of plain dicts (pickling was indeterministic, found this bug by hypo) - unified multiple metadata conversion paths to a single one (pyarrow_wrap_metadata, pyarrow_unwrap_metadata) Also resolves: [ARROW-3901: [Python] Make Schema hashable](https://issues.apache.org/jira/browse/ARROW-3901) Follow-up issue: [ARROW-3903: [Python] Random data generator for ... 
testing](https://issues.apache.org/jira/browse/ARROW-3903) Author: Krisztián Szűcs Closes #3046 from kszucs/ARROW-2624 and squashes the following commits: 3e27ad15 hypo profiles 88b107bb install hypothesis for msvc wheel test 8fb6d0bc make pyarrow_wrap_metadata private 80a276be manylinux 26e6ecd6 manylinux e385d243 manylinux b6fe7576 append in unwrap 0e28e5df ci fixes efeb65ee use conde_env_python.yml in travis 1f7ad6b6 don't validate metadata type pyarrow_wrap_metadata 14e444d9 introduce requirements-test.txt 11b020c0 install hypothesis on appveyor and travis 6bd5b21e license header a8fae546 remove unbox_metadata e8c0f3f5 add hypo as test dependency; hashing test e7bab691 remove box_metadata f1ae290e hypothesis strategies for pyarrow types; deterministic key-value metadata conversions --- ci/appveyor-cpp-build.bat | 2 +- ci/conda_env_python.yml | 2 + ci/cpp-msvc-build-main.bat | 2 +- ci/travis_script_python.sh | 10 +- dev/release/rat_exclude_files.txt | 1 + dev/release/verify-release-candidate.sh | 2 +- python/manylinux1/build_arrow.sh | 5 +- .../manylinux1/scripts/build_virtualenvs.sh | 2 +- python/pyarrow/includes/libarrow.pxd | 8 +- python/pyarrow/lib.pxd | 6 +- python/pyarrow/public-api.pxi | 25 ++++ python/pyarrow/table.pxi | 60 ++++---- python/pyarrow/tests/conftest.py | 34 ++++- python/pyarrow/tests/strategies.py | 138 ++++++++++++++++++ python/pyarrow/tests/test_types.py | 50 +++++++ python/pyarrow/types.pxi | 88 +++++------ python/requirements-test.txt | 5 + python/requirements.txt | 9 +- python/setup.py | 3 +- 19 files changed, 348 insertions(+), 104 deletions(-) create mode 100644 python/pyarrow/tests/strategies.py create mode 100644 python/requirements-test.txt diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index 91212a63fe3ac..b8e431613210a 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -91,7 +91,7 @@ if "%JOB%" == "Build_Debug" ( conda create -n arrow -q -y ^ python=%PYTHON% ^ - six pytest setuptools numpy pandas cython ^ + six pytest setuptools numpy pandas cython hypothesis ^ thrift-cpp=0.11.0 boost-cpp ^ -c conda-forge diff --git a/ci/conda_env_python.yml b/ci/conda_env_python.yml index 429851eb2f5ae..c187155275eaa 100644 --- a/ci/conda_env_python.yml +++ b/ci/conda_env_python.yml @@ -16,6 +16,8 @@ # under the License. 
cython +cloudpickle +hypothesis nomkl numpy pandas diff --git a/ci/cpp-msvc-build-main.bat b/ci/cpp-msvc-build-main.bat index ef961b2e0f26e..7349f8d3aca6b 100644 --- a/ci/cpp-msvc-build-main.bat +++ b/ci/cpp-msvc-build-main.bat @@ -112,6 +112,6 @@ pip install %WHEEL_PATH% || exit /B python -c "import pyarrow" || exit /B python -c "import pyarrow.parquet" || exit /B -pip install pandas pickle5 pytest pytest-faulthandler || exit /B +pip install pandas pickle5 pytest pytest-faulthandler hypothesis || exit /B py.test -r sxX --durations=15 --pyargs pyarrow.tests || exit /B diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index e4290ed8ee026..b316c81f3b6b0 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -51,13 +51,11 @@ if [ $ARROW_TRAVIS_PYTHON_JVM == "1" ]; then CONDA_JVM_DEPS="jpype1" fi -conda install -y -q pip \ - nomkl \ - cloudpickle \ +conda install -y -q \ + --file $TRAVIS_BUILD_DIR/ci/conda_env_python.yml \ + pip \ numpy=1.13.1 \ - ${CONDA_JVM_DEPS} \ - pandas \ - cython + ${CONDA_JVM_DEPS} if [ "$ARROW_TRAVIS_PYTHON_DOCS" == "1" ] && [ "$PYTHON_VERSION" == "3.6" ]; then # Install documentation dependencies diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 0baf29edd83e4..e274d97548068 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -129,6 +129,7 @@ python/MANIFEST.in python/pyarrow/includes/__init__.pxd python/pyarrow/tests/__init__.py python/requirements.txt +python/requirements-test.txt pax_global_header MANIFEST.in __init__.pxd diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 5b666630d17a0..57b1850337067 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -189,7 +189,7 @@ test_and_install_cpp() { test_python() { pushd python - pip install -r requirements.txt + pip install -r requirements-test.txt python setup.py build_ext --inplace --with-parquet --with-plasma py.test pyarrow -v --pdb diff --git a/python/manylinux1/build_arrow.sh b/python/manylinux1/build_arrow.sh index 44816526d2179..904297375ef25 100755 --- a/python/manylinux1/build_arrow.sh +++ b/python/manylinux1/build_arrow.sh @@ -107,7 +107,7 @@ for PYTHON_TUPLE in ${PYTHON_VERSIONS}; do PATH="$PATH:${CPYTHON_PATH}/bin" $PYTHON_INTERPRETER setup.py bdist_wheel PATH="$PATH:${CPYTHON_PATH}/bin" $PYTHON_INTERPRETER setup.py sdist - echo "=== (${PYTHON}) Test the existence of optional modules ===" + echo "=== (${PYTHON}) Ensure the existence of mandatory modules ===" $PIP install -r requirements.txt echo "=== (${PYTHON}) Tag the wheel with manylinux1 ===" @@ -122,6 +122,9 @@ for PYTHON_TUPLE in ${PYTHON_VERSIONS}; do PATH="$PATH:${CPYTHON_PATH}/bin" $PYTHON_INTERPRETER -c "import pyarrow.parquet" PATH="$PATH:${CPYTHON_PATH}/bin" $PYTHON_INTERPRETER -c "import pyarrow.plasma" + echo "=== (${PYTHON}) Install modules required for testing ===" + pip install -r requirements-test.txt + # The TensorFlow test will be skipped here, since TensorFlow is not # manylinux1 compatible; however, the wheels will support TensorFlow on # a TensorFlow compatible system diff --git a/python/manylinux1/scripts/build_virtualenvs.sh b/python/manylinux1/scripts/build_virtualenvs.sh index 18f3b0dd4657e..14100317d974f 100755 --- a/python/manylinux1/scripts/build_virtualenvs.sh +++ b/python/manylinux1/scripts/build_virtualenvs.sh @@ -41,7 +41,7 @@ for PYTHON_TUPLE in ${PYTHON_VERSIONS}; do echo "=== (${PYTHON}, ${U_WIDTH}) Preparing 
virtualenv for tests ===" "$(cpython_path $PYTHON ${U_WIDTH})/bin/virtualenv" -p ${PYTHON_INTERPRETER} --no-download /venv-test-${PYTHON}-${U_WIDTH} source /venv-test-${PYTHON}-${U_WIDTH}/bin/activate - pip install pytest 'numpy==1.14.5' 'pandas==0.23.4' + pip install pytest hypothesis 'numpy==1.14.5' 'pandas==0.23.4' deactivate done diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index c5e745708308f..61517e4f09d21 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -23,9 +23,15 @@ cdef extern from "arrow/util/key_value_metadata.h" namespace "arrow" nogil: cdef cppclass CKeyValueMetadata" arrow::KeyValueMetadata": CKeyValueMetadata() CKeyValueMetadata(const unordered_map[c_string, c_string]&) + CKeyValueMetadata(const vector[c_string]& keys, + const vector[c_string]& values) - c_bool Equals(const CKeyValueMetadata& other) + void reserve(int64_t n) + int64_t size() const + c_string key(int64_t i) const + c_string value(int64_t i) const + c_bool Equals(const CKeyValueMetadata& other) void Append(const c_string& key, const c_string& value) void ToUnorderedMap(unordered_map[c_string, c_string]*) const diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 098ae62c8f492..745a049e32a7c 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -384,11 +384,13 @@ cdef get_reader(object source, c_bool use_memory_map, shared_ptr[RandomAccessFile]* reader) cdef get_writer(object source, shared_ptr[OutputStream]* writer) -cdef dict box_metadata(const CKeyValueMetadata* sp_metadata) - # Default is allow_none=False cdef DataType ensure_type(object type, c_bool allow_none=*) +cdef shared_ptr[CKeyValueMetadata] pyarrow_unwrap_metadata(object meta) +cdef object pyarrow_wrap_metadata( + const shared_ptr[const CKeyValueMetadata]& meta) + # # Public Cython API for 3rd party code # diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index e8798c5edbc7d..ef54c7ab42f74 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -92,6 +92,31 @@ cdef public api object pyarrow_wrap_data_type( return out +cdef object pyarrow_wrap_metadata( + const shared_ptr[const CKeyValueMetadata]& meta): + cdef const CKeyValueMetadata* cmeta = meta.get() + + if cmeta == nullptr: + return None + + result = OrderedDict() + for i in range(cmeta.size()): + result[cmeta.key(i)] = cmeta.value(i) + + return result + + +cdef shared_ptr[CKeyValueMetadata] pyarrow_unwrap_metadata(object meta): + cdef vector[c_string] keys, values + + if isinstance(meta, dict): + keys = map(tobytes, meta.keys()) + values = map(tobytes, meta.values()) + return make_shared[CKeyValueMetadata](keys, values) + + return shared_ptr[CKeyValueMetadata]() + + cdef public api bint pyarrow_is_field(object field): return isinstance(field, Field) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 0d529d3787614..fd565afae5acf 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -634,26 +634,22 @@ cdef class Column: return pyarrow_wrap_chunked_array(self.column.data()) -cdef shared_ptr[const CKeyValueMetadata] unbox_metadata(dict metadata): - if metadata is None: - return nullptr - cdef: - unordered_map[c_string, c_string] unordered_metadata = metadata - return ( - make_shared[CKeyValueMetadata](unordered_metadata)) - - -cdef _schema_from_arrays(arrays, names, dict metadata, - shared_ptr[CSchema]* schema): +cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema): 
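Illustration (not part of this patch): a minimal sketch of the user-visible effect of the unified pyarrow_wrap_metadata / pyarrow_unwrap_metadata pair; the key and value strings are made up. Metadata passed in as a dict comes back as an OrderedDict of byte strings, and non-dict input is rejected early.

    import pyarrow as pa

    f = pa.field('x', pa.int32(), metadata={'origin': 'csv'})
    f.metadata                                    # OrderedDict([(b'origin', b'csv')])
    pa.field('x', pa.int32(), metadata=['oops'])  # raises TypeError after this change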
cdef: Column col c_string c_name vector[shared_ptr[CField]] fields shared_ptr[CDataType] type_ Py_ssize_t K = len(arrays) + shared_ptr[CKeyValueMetadata] c_meta + + if metadata is not None: + if not isinstance(metadata, dict): + raise TypeError('Metadata must be an instance of dict') + c_meta = pyarrow_unwrap_metadata(metadata) if K == 0: - schema.reset(new CSchema(fields, unbox_metadata(metadata))) + schema.reset(new CSchema(fields, c_meta)) return fields.resize(K) @@ -684,7 +680,7 @@ cdef _schema_from_arrays(arrays, names, dict metadata, c_name = tobytes(names[i]) fields[i].reset(new CField(c_name, type_, True)) - schema.reset(new CSchema(fields, unbox_metadata(metadata))) + schema.reset(new CSchema(fields, c_meta)) cdef class RecordBatch: @@ -715,7 +711,7 @@ cdef class RecordBatch: def __len__(self): return self.batch.num_rows() - def replace_schema_metadata(self, dict metadata=None): + def replace_schema_metadata(self, metadata=None): """ EXPERIMENTAL: Create shallow copy of record batch by replacing schema key-value metadata with the indicated new metadata (which may be None, @@ -729,15 +725,19 @@ cdef class RecordBatch: ------- shallow_copy : RecordBatch """ - cdef shared_ptr[CKeyValueMetadata] c_meta + cdef: + shared_ptr[CKeyValueMetadata] c_meta + shared_ptr[CRecordBatch] c_batch + if metadata is not None: - convert_metadata(metadata, &c_meta) + if not isinstance(metadata, dict): + raise TypeError('Metadata must be an instance of dict') + c_meta = pyarrow_unwrap_metadata(metadata) - cdef shared_ptr[CRecordBatch] new_batch with nogil: - new_batch = self.batch.ReplaceSchemaMetadata(c_meta) + c_batch = self.batch.ReplaceSchemaMetadata(c_meta) - return pyarrow_wrap_batch(new_batch) + return pyarrow_wrap_batch(c_batch) @property def num_columns(self): @@ -953,7 +953,7 @@ cdef class RecordBatch: return cls.from_arrays(arrays, names, metadata) @staticmethod - def from_arrays(list arrays, names, dict metadata=None): + def from_arrays(list arrays, names, metadata=None): """ Construct a RecordBatch from multiple pyarrow.Arrays @@ -1062,7 +1062,7 @@ cdef class Table: columns = [col.data for col in self.columns] return _reconstruct_table, (columns, self.schema) - def replace_schema_metadata(self, dict metadata=None): + def replace_schema_metadata(self, metadata=None): """ EXPERIMENTAL: Create shallow copy of table by replacing schema key-value metadata with the indicated new metadata (which may be None, @@ -1076,15 +1076,19 @@ cdef class Table: ------- shallow_copy : Table """ - cdef shared_ptr[CKeyValueMetadata] c_meta + cdef: + shared_ptr[CKeyValueMetadata] c_meta + shared_ptr[CTable] c_table + if metadata is not None: - convert_metadata(metadata, &c_meta) + if not isinstance(metadata, dict): + raise TypeError('Metadata must be an instance of dict') + c_meta = pyarrow_unwrap_metadata(metadata) - cdef shared_ptr[CTable] new_table with nogil: - new_table = self.table.ReplaceSchemaMetadata(c_meta) + c_table = self.table.ReplaceSchemaMetadata(c_meta) - return pyarrow_wrap_table(new_table) + return pyarrow_wrap_table(c_table) def flatten(self, MemoryPool memory_pool=None): """ @@ -1225,7 +1229,7 @@ cdef class Table: return cls.from_arrays(arrays, names=names, metadata=metadata) @staticmethod - def from_arrays(arrays, names=None, schema=None, dict metadata=None): + def from_arrays(arrays, names=None, schema=None, metadata=None): """ Construct a Table from Arrow arrays or columns @@ -1236,6 +1240,8 @@ cdef class Table: names: list of str, optional Names for the table columns. 
If Columns passed, will be inferred. If Arrays passed, this argument is required + schema : Schema, default None + If not passed, will be inferred from the arrays Returns ------- diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 6cdedbbb507cc..69e8e82e2532a 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -15,7 +15,9 @@ # specific language governing permissions and limitations # under the License. +import os import pytest +import hypothesis as h try: import pathlib @@ -23,7 +25,20 @@ import pathlib2 as pathlib # py2 compat +# setup hypothesis profiles +h.settings.register_profile('ci', max_examples=1000) +h.settings.register_profile('dev', max_examples=10) +h.settings.register_profile('debug', max_examples=10, + verbosity=h.Verbosity.verbose) + +# load default hypothesis profile, either set HYPOTHESIS_PROFILE environment +# variable or pass --hypothesis-profile option to pytest, to see the generated +# examples try: pytest pyarrow -sv --only-hypothesis --hypothesis-profile=debug +h.settings.load_profile(os.environ.get('HYPOTHESIS_PROFILE', 'default')) + + groups = [ + 'hypothesis', 'gandiva', 'hdfs', 'large_memory', @@ -36,6 +51,7 @@ defaults = { + 'hypothesis': False, 'gandiva': False, 'hdfs': False, 'large_memory': False, @@ -84,16 +100,15 @@ def pytest_configure(config): def pytest_addoption(parser): for group in groups: - parser.addoption('--{0}'.format(group), action='store_true', - default=defaults[group], - help=('Enable the {0} test group'.format(group))) + for flag in ['--{0}', '--enable-{0}']: + parser.addoption(flag.format(group), action='store_true', + default=defaults[group], + help=('Enable the {0} test group'.format(group))) - for group in groups: parser.addoption('--disable-{0}'.format(group), action='store_true', default=False, help=('Disable the {0} test group'.format(group))) - for group in groups: parser.addoption('--only-{0}'.format(group), action='store_true', default=False, help=('Run only the {0} test group'.format(group))) @@ -115,15 +130,18 @@ def pytest_runtest_setup(item): only_set = False for group in groups: + flag = '--{0}'.format(group) only_flag = '--only-{0}'.format(group) + enable_flag = '--enable-{0}'.format(group) disable_flag = '--disable-{0}'.format(group) - flag = '--{0}'.format(group) if item.config.getoption(only_flag): only_set = True elif getattr(item.obj, group, None): - if (item.config.getoption(disable_flag) or - not item.config.getoption(flag)): + is_enabled = (item.config.getoption(flag) or + item.config.getoption(enable_flag)) + is_disabled = item.config.getoption(disable_flag) + if is_disabled or not is_enabled: pytest.skip('{0} NOT enabled'.format(flag)) if only_set: diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py new file mode 100644 index 0000000000000..bc8ded2e896d0 --- /dev/null +++ b/python/pyarrow/tests/strategies.py @@ -0,0 +1,138 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pyarrow as pa +import hypothesis.strategies as st + + +# TODO(kszucs): alphanum_text, surrogate_text +custom_text = st.text( + alphabet=st.characters( + min_codepoint=0x41, + max_codepoint=0x7E + ) +) + +null_type = st.just(pa.null()) +bool_type = st.just(pa.bool_()) + +binary_type = st.just(pa.binary()) +string_type = st.just(pa.string()) + +signed_integer_types = st.sampled_from([ + pa.int8(), + pa.int16(), + pa.int32(), + pa.int64() +]) +unsigned_integer_types = st.sampled_from([ + pa.uint8(), + pa.uint16(), + pa.uint32(), + pa.uint64() +]) +integer_types = st.one_of(signed_integer_types, unsigned_integer_types) + +floating_types = st.sampled_from([ + pa.float16(), + pa.float32(), + pa.float64() +]) +decimal_type = st.builds( + pa.decimal128, + precision=st.integers(min_value=0, max_value=38), + scale=st.integers(min_value=0, max_value=38) +) +numeric_types = st.one_of(integer_types, floating_types, decimal_type) + +date_types = st.sampled_from([ + pa.date32(), + pa.date64() +]) +time_types = st.sampled_from([ + pa.time32('s'), + pa.time32('ms'), + pa.time64('us'), + pa.time64('ns') +]) +timestamp_types = st.sampled_from([ + pa.timestamp('s'), + pa.timestamp('ms'), + pa.timestamp('us'), + pa.timestamp('ns') +]) +temporal_types = st.one_of(date_types, time_types, timestamp_types) + +primitive_types = st.one_of( + null_type, + bool_type, + binary_type, + string_type, + numeric_types, + temporal_types +) + +metadata = st.dictionaries(st.text(), st.text()) + + +@st.defines_strategy +def fields(type_strategy=primitive_types): + return st.builds(pa.field, name=custom_text, type=type_strategy, + nullable=st.booleans(), metadata=metadata) + + +@st.defines_strategy +def list_types(item_strategy=primitive_types): + return st.builds(pa.list_, item_strategy) + + +@st.defines_strategy +def struct_types(item_strategy=primitive_types): + return st.builds(pa.struct, st.lists(fields(item_strategy))) + + +@st.defines_strategy +def complex_types(inner_strategy=primitive_types): + return list_types(inner_strategy) | struct_types(inner_strategy) + + +@st.defines_strategy +def nested_list_types(item_strategy=primitive_types): + return st.recursive(item_strategy, list_types) + + +@st.defines_strategy +def nested_struct_types(item_strategy=primitive_types): + return st.recursive(item_strategy, struct_types) + + +@st.defines_strategy +def nested_complex_types(inner_strategy=primitive_types): + return st.recursive(inner_strategy, complex_types) + + +@st.defines_strategy +def schemas(type_strategy=primitive_types): + return st.builds(pa.schema, st.lists(fields(type_strategy))) + + +complex_schemas = schemas(complex_types()) + + +all_types = st.one_of(primitive_types, complex_types(), nested_complex_types()) +all_fields = fields(all_types) +all_schemas = schemas(all_types) diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 176ce8769f488..310656d86fd47 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -19,11 +19,14 @@ import pickle import pytest +import hypothesis as h +import hypothesis.strategies as st import 
pandas as pd import numpy as np import pyarrow as pa import pyarrow.types as types +import pyarrow.tests.strategies as past def get_many_types(): @@ -466,15 +469,27 @@ def test_field_metadata(): def test_field_add_remove_metadata(): + import collections + f0 = pa.field('foo', pa.int32()) assert f0.metadata is None metadata = {b'foo': b'bar', b'pandas': b'badger'} + metadata2 = collections.OrderedDict([ + (b'a', b'alpha'), + (b'b', b'beta') + ]) f1 = f0.add_metadata(metadata) assert f1.metadata == metadata + f2 = f0.add_metadata(metadata2) + assert f2.metadata == metadata2 + + with pytest.raises(TypeError): + f0.add_metadata([1, 2, 3]) + f3 = f1.remove_metadata() assert f3.metadata is None @@ -533,3 +548,38 @@ def test_schema_from_pandas(data): schema = pa.Schema.from_pandas(df) expected = pa.Table.from_pandas(df).schema assert schema == expected + + +@h.given( + past.all_types | + past.all_fields | + past.all_schemas +) +@h.example( + pa.field(name='', type=pa.null(), metadata={'0': '', '': ''}) +) +def test_pickling(field): + data = pickle.dumps(field) + assert pickle.loads(data) == field + + +@h.given( + st.lists(past.all_types) | + st.lists(past.all_fields) | + st.lists(past.all_schemas) +) +def test_hashing(items): + h.assume( + # well, this is still O(n^2), but makes the input unique + all(not a.equals(b) for i, a in enumerate(items) for b in items[:i]) + ) + + container = {} + for i, item in enumerate(items): + assert hash(item) == hash(item) + container[item] = i + + assert len(container) == len(items) + + for i, item in enumerate(items): + assert container[item] == i diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 1ebd196fabf95..f69190c1c2eaa 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -430,11 +430,9 @@ cdef class Field: @property def metadata(self): - cdef shared_ptr[const CKeyValueMetadata] metadata = ( - self.field.metadata()) - return box_metadata(metadata.get()) + return pyarrow_wrap_metadata(self.field.metadata()) - def add_metadata(self, dict metadata): + def add_metadata(self, metadata): """ Add metadata as dict of string keys and values to Field @@ -447,14 +445,18 @@ cdef class Field: ------- field : pyarrow.Field """ - cdef shared_ptr[CKeyValueMetadata] c_meta - convert_metadata(metadata, &c_meta) + cdef: + shared_ptr[CField] c_field + shared_ptr[CKeyValueMetadata] c_meta - cdef shared_ptr[CField] new_field + if not isinstance(metadata, dict): + raise TypeError('Metadata must be an instance of dict') + + c_meta = pyarrow_unwrap_metadata(metadata) with nogil: - new_field = self.field.AddMetadata(c_meta) + c_field = self.field.AddMetadata(c_meta) - return pyarrow_wrap_field(new_field) + return pyarrow_wrap_field(c_field) def remove_metadata(self): """ @@ -515,6 +517,9 @@ cdef class Schema: def __reduce__(self): return schema, (list(self), self.metadata) + def __hash__(self): + return hash((tuple(self), self.metadata)) + @property def names(self): """ @@ -544,9 +549,7 @@ cdef class Schema: @property def metadata(self): - cdef shared_ptr[const CKeyValueMetadata] metadata = ( - self.schema.metadata()) - return box_metadata(metadata.get()) + return pyarrow_wrap_metadata(self.schema.metadata()) def __eq__(self, other): try: @@ -728,7 +731,7 @@ cdef class Schema: return pyarrow_wrap_schema(new_schema) - def add_metadata(self, dict metadata): + def add_metadata(self, metadata): """ Add metadata as dict of string keys and values to Schema @@ -741,14 +744,18 @@ cdef class Schema: ------- schema : pyarrow.Schema """ - cdef 
shared_ptr[CKeyValueMetadata] c_meta - convert_metadata(metadata, &c_meta) + cdef: + shared_ptr[CKeyValueMetadata] c_meta + shared_ptr[CSchema] c_schema - cdef shared_ptr[CSchema] new_schema + if not isinstance(metadata, dict): + raise TypeError('Metadata must be an instance of dict') + + c_meta = pyarrow_unwrap_metadata(metadata) with nogil: - new_schema = self.schema.AddMetadata(c_meta) + c_schema = self.schema.AddMetadata(c_meta) - return pyarrow_wrap_schema(new_schema) + return pyarrow_wrap_schema(c_schema) def serialize(self, memory_pool=None): """ @@ -810,15 +817,6 @@ cdef class Schema: return self.__str__() -cdef dict box_metadata(const CKeyValueMetadata* metadata): - cdef unordered_map[c_string, c_string] result - if metadata != nullptr: - metadata.ToUnorderedMap(&result) - return result - else: - return None - - cdef dict _type_cache = {} @@ -832,25 +830,12 @@ cdef DataType primitive_type(Type type): _type_cache[type] = out return out + # ----------------------------------------------------------- # Type factory functions -cdef int convert_metadata(dict metadata, - shared_ptr[CKeyValueMetadata]* out) except -1: - cdef: - shared_ptr[CKeyValueMetadata] meta = ( - make_shared[CKeyValueMetadata]()) - c_string key, value - - for py_key, py_value in metadata.items(): - key = tobytes(py_key) - value = tobytes(py_value) - meta.get().Append(key, value) - out[0] = meta - return 0 - -def field(name, type, bint nullable=True, dict metadata=None): +def field(name, type, bint nullable=True, metadata=None): """ Create a pyarrow.Field instance @@ -867,17 +852,21 @@ def field(name, type, bint nullable=True, dict metadata=None): field : pyarrow.Field """ cdef: - shared_ptr[CKeyValueMetadata] c_meta Field result = Field.__new__(Field) DataType _type = ensure_type(type, allow_none=False) + shared_ptr[CKeyValueMetadata] c_meta if metadata is not None: - convert_metadata(metadata, &c_meta) + if not isinstance(metadata, dict): + raise TypeError('Metadata must be an instance of dict') + c_meta = pyarrow_unwrap_metadata(metadata) - result.sp_field.reset(new CField(tobytes(name), _type.sp_type, - nullable == 1, c_meta)) + result.sp_field.reset( + new CField(tobytes(name), _type.sp_type, nullable, c_meta) + ) result.field = result.sp_field.get() result.type = _type + return result @@ -1490,7 +1479,7 @@ cdef DataType ensure_type(object ty, c_bool allow_none=False): raise TypeError('DataType expected, got {!r}'.format(type(ty))) -def schema(fields, dict metadata=None): +def schema(fields, metadata=None): """ Construct pyarrow.Schema from collection of fields @@ -1535,11 +1524,14 @@ def schema(fields, dict metadata=None): c_fields.push_back(py_field.sp_field) if metadata is not None: - convert_metadata(metadata, &c_meta) + if not isinstance(metadata, dict): + raise TypeError('Metadata must be an instance of dict') + c_meta = pyarrow_unwrap_metadata(metadata) c_schema.reset(new CSchema(c_fields, c_meta)) result = Schema.__new__(Schema) result.init_schema(c_schema) + return result diff --git a/python/requirements-test.txt b/python/requirements-test.txt new file mode 100644 index 0000000000000..482e88860669a --- /dev/null +++ b/python/requirements-test.txt @@ -0,0 +1,5 @@ +-r requirements.txt +pandas +pytest +hypothesis +pathlib2; python_version < "3.4" diff --git a/python/requirements.txt b/python/requirements.txt index ddedd757da224..3a23d1dacf81e 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,6 +1,3 @@ -six -pytest -cloudpickle>=0.4.0 -numpy>=1.14.0 -futures; python_version < "3" 
-pathlib2; python_version < "3.4" +six>=1.0.0 +numpy>=1.14 +futures; python_version < "3.2" diff --git a/python/setup.py b/python/setup.py index e6a88712c0e09..b8d192ddaec45 100755 --- a/python/setup.py +++ b/python/setup.py @@ -577,7 +577,8 @@ def has_ext_modules(foo): }, setup_requires=['setuptools_scm', 'cython >= 0.27'] + setup_requires, install_requires=install_requires, - tests_require=['pytest', 'pandas', 'pathlib2; python_version < "3.4"'], + tests_require=['pytest', 'pandas', 'hypothesis', + 'pathlib2; python_version < "3.4"'], description="Python library for Apache Arrow", long_description=long_description, long_description_content_type="text/markdown", From 9c8ddae11622ace00a187c46412309af82191b74 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Mon, 10 Dec 2018 09:51:41 -0600 Subject: [PATCH 016/328] ARROW-3942: [R] Feather api fixes Some fixes to follow up open #3043, and added the columns argument to `read_feather` that can be: - character vector - integer vector : 1-based in R - NULL: to get all columns (the default) Also adds `as_tibble` argument to read_feather to switch between data.frame and arrow::Table return value Author: Romain Francois Closes #3106 from romainfrancois/ARROW-3942/feather and squashes the following commits: 13061af4d fixed link in documentation ce414c153 + as_tibble argument to read_feather() d6c30a38b + columns argument to read_feather() 46a6fbb69 Update feather factories --- r/NAMESPACE | 16 ++--- r/R/RcppExports.R | 4 +- r/R/feather.R | 44 ++++++++------ ..._table_reader.Rd => FeatherTableReader.Rd} | 6 +- ..._table_writer.Rd => FeatherTableWriter.Rd} | 6 +- r/man/read_feather.Rd | 10 +++- r/src/RcppExports.cpp | 9 +-- r/src/feather.cpp | 32 +++++++++- r/tests/testthat/test-feather.R | 59 ++++++++++++++++--- 9 files changed, 134 insertions(+), 52 deletions(-) rename r/man/{feather_table_reader.Rd => FeatherTableReader.Rd} (80%) rename r/man/{feather_table_writer.Rd => FeatherTableWriter.Rd} (74%) diff --git a/r/NAMESPACE b/r/NAMESPACE index cc5961e5ba148..65d60d846f4cb 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -8,6 +8,12 @@ S3method("==","arrow::RecordBatch") S3method("==","arrow::ipc::Message") S3method(BufferReader,"arrow::Buffer") S3method(BufferReader,default) +S3method(FeatherTableReader,"arrow::io::RandomAccessFile") +S3method(FeatherTableReader,"arrow::ipc::feather::TableReader") +S3method(FeatherTableReader,character) +S3method(FeatherTableReader,default) +S3method(FeatherTableReader,fs_path) +S3method(FeatherTableWriter,"arrow::io::OutputStream") S3method(FixedSizeBufferWriter,"arrow::Buffer") S3method(FixedSizeBufferWriter,default) S3method(MessageReader,"arrow::io::InputStream") @@ -33,12 +39,6 @@ S3method(buffer,default) S3method(buffer,integer) S3method(buffer,numeric) S3method(buffer,raw) -S3method(feather_table_reader,"arrow::io::RandomAccessFile") -S3method(feather_table_reader,"arrow::ipc::feather::TableReader") -S3method(feather_table_reader,character) -S3method(feather_table_reader,default) -S3method(feather_table_reader,fs_path) -S3method(feather_table_writer,"arrow::io::OutputStream") S3method(length,"arrow::Array") S3method(names,"arrow::RecordBatch") S3method(print,"arrow-enum") @@ -70,6 +70,8 @@ S3method(write_feather_RecordBatch,fs_path) export(BufferOutputStream) export(BufferReader) export(DateUnit) +export(FeatherTableReader) +export(FeatherTableWriter) export(FileMode) export(FileOutputStream) export(FixedSizeBufferWriter) @@ -95,8 +97,6 @@ export(date64) export(decimal) export(default_memory_pool) 
export(dictionary) -export(feather_table_reader) -export(feather_table_writer) export(field) export(float16) export(float32) diff --git a/r/R/RcppExports.R b/r/R/RcppExports.R index ccf854927b76e..0310eab2027b9 100644 --- a/r/R/RcppExports.R +++ b/r/R/RcppExports.R @@ -445,8 +445,8 @@ ipc___feather___TableReader__GetColumn <- function(reader, i) { .Call(`_arrow_ipc___feather___TableReader__GetColumn`, reader, i) } -ipc___feather___TableReader__Read <- function(reader) { - .Call(`_arrow_ipc___feather___TableReader__Read`, reader) +ipc___feather___TableReader__Read <- function(reader, columns) { + .Call(`_arrow_ipc___feather___TableReader__Read`, reader, columns) } ipc___feather___TableReader__Open <- function(stream) { diff --git a/r/R/feather.R b/r/R/feather.R index bae71d31bc1e5..064652145c8e4 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -35,7 +35,9 @@ num_columns = function() ipc___feather___TableReader__num_columns(self), GetColumnName = function(i) ipc___feather___TableReader__GetColumnName(self, i), GetColumn = function(i) shared_ptr(`arrow::Column`, ipc___feather___TableReader__GetColumn(self, i)), - Read = function() shared_ptr(`arrow::Table`, ipc___feather___TableReader__Read(self)) + Read = function(columns) { + shared_ptr(`arrow::Table`, ipc___feather___TableReader__Read(self, columns)) + } ) ) @@ -44,12 +46,12 @@ #' @param stream an OutputStream #' #' @export -feather_table_writer <- function(stream) { - UseMethod("feather_table_writer") +FeatherTableWriter <- function(stream) { + UseMethod("FeatherTableWriter") } #' @export -`feather_table_writer.arrow::io::OutputStream` <- function(stream){ +`FeatherTableWriter.arrow::io::OutputStream` <- function(stream){ unique_ptr(`arrow::ipc::feather::TableWriter`, ipc___feather___TableWriter__Open(stream)) } @@ -107,7 +109,7 @@ write_feather_RecordBatch <- function(data, stream) { #' @export #' @method write_feather_RecordBatch arrow::io::OutputStream `write_feather_RecordBatch.arrow::io::OutputStream` <- function(data, stream) { - ipc___TableWriter__RecordBatch__WriteFeather(feather_table_writer(stream), data) + ipc___TableWriter__RecordBatch__WriteFeather(FeatherTableWriter(stream), data) } #' A arrow::ipc::feather::TableReader to read from a file @@ -117,44 +119,50 @@ write_feather_RecordBatch <- function(data, stream) { #' @param ... extra parameters #' #' @export -feather_table_reader <- function(file, mmap = TRUE, ...){ - UseMethod("feather_table_reader") +FeatherTableReader <- function(file, mmap = TRUE, ...){ + UseMethod("FeatherTableReader") } #' @export -feather_table_reader.default <- function(file, mmap = TRUE, ...) { +FeatherTableReader.default <- function(file, mmap = TRUE, ...) { stop("unsupported") } #' @export -feather_table_reader.character <- function(file, mmap = TRUE, ...) { - feather_table_reader(fs::path_abs(file), mmap = mmap, ...) +FeatherTableReader.character <- function(file, mmap = TRUE, ...) { + FeatherTableReader(fs::path_abs(file), mmap = mmap, ...) } #' @export -feather_table_reader.fs_path <- function(file, mmap = TRUE, ...) { +FeatherTableReader.fs_path <- function(file, mmap = TRUE, ...) { stream <- if(isTRUE(mmap)) mmap_open(file, ...) else ReadableFile(file, ...) 
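Illustration (not part of this patch): a minimal sketch of the new read_feather() arguments described in the commit message above; the file name is hypothetical.

    # `columns` may be a character vector, a 1-based integer vector, or NULL (all columns)
    df  <- read_feather("data.feather", columns = c("x", "y"))
    df2 <- read_feather("data.feather", columns = 1:2)            # same columns by position
    tab <- read_feather("data.feather", as_tibble = FALSE)        # returns an arrow::Table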
- feather_table_reader(stream) + FeatherTableReader(stream) } #' @export -`feather_table_reader.arrow::io::RandomAccessFile` <- function(file, mmap = TRUE, ...){ +`FeatherTableReader.arrow::io::RandomAccessFile` <- function(file, mmap = TRUE, ...){ unique_ptr(`arrow::ipc::feather::TableReader`, ipc___feather___TableReader__Open(file)) } #' @export -`feather_table_reader.arrow::ipc::feather::TableReader` <- function(file, mmap = TRUE, ...){ +`FeatherTableReader.arrow::ipc::feather::TableReader` <- function(file, mmap = TRUE, ...){ file } #' Read a feather file #' -#' @param file a arrow::ipc::feather::TableReader or whatever the [feather_table_reader()] function can handle +#' @param file a arrow::ipc::feather::TableReader or whatever the [FeatherTableReader()] function can handle +#' @param columns names if the columns to read. The default `NULL` means all columns +#' @param as_tibble should the [arrow::Table][arrow__Table] be converted to a tibble. #' @param ... additional parameters #' -#' @return an arrow::Table +#' @return a data frame if `as_tibble` is `TRUE` (the default), or a [arrow::Table][arrow__Table] otherwise #' #' @export -read_feather <- function(file, ...){ - feather_table_reader(file, ...)$Read() +read_feather <- function(file, columns = NULL, as_tibble = TRUE, ...){ + out <- FeatherTableReader(file, ...)$Read(columns) + if (isTRUE(as_tibble)) { + out <- as_tibble(out) + } + out } diff --git a/r/man/feather_table_reader.Rd b/r/man/FeatherTableReader.Rd similarity index 80% rename from r/man/feather_table_reader.Rd rename to r/man/FeatherTableReader.Rd index fb1c53429f860..15a260bd57cf6 100644 --- a/r/man/feather_table_reader.Rd +++ b/r/man/FeatherTableReader.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/feather.R -\name{feather_table_reader} -\alias{feather_table_reader} +\name{FeatherTableReader} +\alias{FeatherTableReader} \title{A arrow::ipc::feather::TableReader to read from a file} \usage{ -feather_table_reader(file, mmap = TRUE, ...) +FeatherTableReader(file, mmap = TRUE, ...) } \arguments{ \item{file}{A file path, arrow::io::RandomAccessFile} diff --git a/r/man/feather_table_writer.Rd b/r/man/FeatherTableWriter.Rd similarity index 74% rename from r/man/feather_table_writer.Rd rename to r/man/FeatherTableWriter.Rd index 36035aca12090..3acf5971a71b3 100644 --- a/r/man/feather_table_writer.Rd +++ b/r/man/FeatherTableWriter.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/feather.R -\name{feather_table_writer} -\alias{feather_table_writer} +\name{FeatherTableWriter} +\alias{FeatherTableWriter} \title{Create TableWriter that writes into a stream} \usage{ -feather_table_writer(stream) +FeatherTableWriter(stream) } \arguments{ \item{stream}{an OutputStream} diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd index e86b86b99e9e2..31fd36ab65a26 100644 --- a/r/man/read_feather.Rd +++ b/r/man/read_feather.Rd @@ -4,15 +4,19 @@ \alias{read_feather} \title{Read a feather file} \usage{ -read_feather(file, ...) +read_feather(file, columns = NULL, as_tibble = TRUE, ...) } \arguments{ -\item{file}{a arrow::ipc::feather::TableReader or whatever the \code{\link[=feather_table_reader]{feather_table_reader()}} function can handle} +\item{file}{a arrow::ipc::feather::TableReader or whatever the \code{\link[=FeatherTableReader]{FeatherTableReader()}} function can handle} + +\item{columns}{names if the columns to read. 
The default \code{NULL} means all columns} + +\item{as_tibble}{should the \link[=arrow__Table]{arrow::Table} be converted to a tibble.} \item{...}{additional parameters} } \value{ -an arrow::Table +a data frame if \code{as_tibble} is \code{TRUE} (the default), or a \link[=arrow__Table]{arrow::Table} otherwise } \description{ Read a feather file diff --git a/r/src/RcppExports.cpp b/r/src/RcppExports.cpp index bca4eafdee4ce..e5a784eb70c23 100644 --- a/r/src/RcppExports.cpp +++ b/r/src/RcppExports.cpp @@ -1244,13 +1244,14 @@ BEGIN_RCPP END_RCPP } // ipc___feather___TableReader__Read -std::shared_ptr ipc___feather___TableReader__Read(const std::unique_ptr& reader); -RcppExport SEXP _arrow_ipc___feather___TableReader__Read(SEXP readerSEXP) { +std::shared_ptr ipc___feather___TableReader__Read(const std::unique_ptr& reader, SEXP columns); +RcppExport SEXP _arrow_ipc___feather___TableReader__Read(SEXP readerSEXP, SEXP columnsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const std::unique_ptr& >::type reader(readerSEXP); - rcpp_result_gen = Rcpp::wrap(ipc___feather___TableReader__Read(reader)); + Rcpp::traits::input_parameter< SEXP >::type columns(columnsSEXP); + rcpp_result_gen = Rcpp::wrap(ipc___feather___TableReader__Read(reader, columns)); return rcpp_result_gen; END_RCPP } @@ -2262,7 +2263,7 @@ static const R_CallMethodDef CallEntries[] = { {"_arrow_ipc___feather___TableReader__num_columns", (DL_FUNC) &_arrow_ipc___feather___TableReader__num_columns, 1}, {"_arrow_ipc___feather___TableReader__GetColumnName", (DL_FUNC) &_arrow_ipc___feather___TableReader__GetColumnName, 2}, {"_arrow_ipc___feather___TableReader__GetColumn", (DL_FUNC) &_arrow_ipc___feather___TableReader__GetColumn, 2}, - {"_arrow_ipc___feather___TableReader__Read", (DL_FUNC) &_arrow_ipc___feather___TableReader__Read, 1}, + {"_arrow_ipc___feather___TableReader__Read", (DL_FUNC) &_arrow_ipc___feather___TableReader__Read, 2}, {"_arrow_ipc___feather___TableReader__Open", (DL_FUNC) &_arrow_ipc___feather___TableReader__Open, 1}, {"_arrow_Field__initialize", (DL_FUNC) &_arrow_Field__initialize, 3}, {"_arrow_Field__ToString", (DL_FUNC) &_arrow_Field__ToString, 1}, diff --git a/r/src/feather.cpp b/r/src/feather.cpp index 7b84deefadb9c..8389156c3847b 100644 --- a/r/src/feather.cpp +++ b/r/src/feather.cpp @@ -115,9 +115,37 @@ std::shared_ptr ipc___feather___TableReader__GetColumn( // [[Rcpp::export]] std::shared_ptr ipc___feather___TableReader__Read( - const std::unique_ptr& reader) { + const std::unique_ptr& reader, SEXP columns) { std::shared_ptr table; - STOP_IF_NOT_OK(reader->Read(&table)); + + switch (TYPEOF(columns)) { + case INTSXP: { + R_xlen_t n = XLENGTH(columns); + std::vector indices(n); + int* p_columns = INTEGER(columns); + for (int i = 0; i < n; i++) { + indices[i] = p_columns[i] - 1; + } + STOP_IF_NOT_OK(reader->Read(indices, &table)); + break; + } + case STRSXP: { + R_xlen_t n = XLENGTH(columns); + std::vector names(n); + for (R_xlen_t i = 0; i < n; i++) { + names[i] = CHAR(STRING_ELT(columns, i)); + } + STOP_IF_NOT_OK(reader->Read(names, &table)); + break; + } + case NILSXP: + STOP_IF_NOT_OK(reader->Read(&table)); + break; + default: + Rcpp::stop("incompatible column specification"); + break; + }; + return table; } diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R index 715017fb5865c..23fdc58fd781e 100644 --- a/r/tests/testthat/test-feather.R +++ b/r/tests/testthat/test-feather.R @@ -34,25 +34,66 @@ test_that("feather 
read/write round trip", { expect_true(fs::file_exists(tf3)) tab1 <- read_feather(tf1) - expect_is(tab1, "arrow::Table") + expect_is(tab1, "data.frame") tab2 <- read_feather(tf2) - expect_is(tab2, "arrow::Table") + expect_is(tab2, "data.frame") tab3 <- read_feather(tf3) - expect_is(tab3, "arrow::Table") + expect_is(tab3, "data.frame") # reading directly from arrow::io::MemoryMappedFile tab4 <- read_feather(mmap_open(tf3)) - expect_is(tab4, "arrow::Table") + expect_is(tab4, "data.frame") # reading directly from arrow::io::ReadableFile tab5 <- read_feather(ReadableFile(tf3)) - expect_is(tab5, "arrow::Table") + expect_is(tab5, "data.frame") + + expect_equal(tib, tab1) + expect_equal(tib, tab2) + expect_equal(tib, tab3) + expect_equal(tib, tab4) + expect_equal(tib, tab5) +}) + +test_that("feather handles columns = ", { + tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10]) + + tf1 <- local_tempfile() + write_feather(tib, tf1) + expect_true(fs::file_exists(tf1)) + + tab1 <- read_feather(tf1, columns = c("x", "y")) + expect_is(tab1, "data.frame") + + expect_equal(tib[, c("x", "y")], as_tibble(tab1)) +}) + +test_that("feather handles columns = ", { + tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10]) + + tf1 <- local_tempfile() + write_feather(tib, tf1) + expect_true(fs::file_exists(tf1)) + + tab1 <- read_feather(tf1, columns = 1:2) + expect_is(tab1, "data.frame") + + expect_equal(tib[, c("x", "y")], as_tibble(tab1)) +}) + +test_that("feather read/write round trip", { + tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10]) + + tf1 <- local_tempfile() + write_feather(tib, tf1) + expect_true(fs::file_exists(tf1)) + + tab1 <- read_feather(tf1, as_tibble = FALSE) + expect_is(tab1, "arrow::Table") expect_equal(tib, as_tibble(tab1)) - expect_equal(tib, as_tibble(tab2)) - expect_equal(tib, as_tibble(tab3)) - expect_equal(tib, as_tibble(tab4)) - expect_equal(tib, as_tibble(tab5)) }) + + From 12201841212967c78e31b2d2840b55b1707c4e7b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 10 Dec 2018 14:02:21 -0600 Subject: [PATCH 017/328] ARROW-3641: [Python] Remove unneeded public keyword from pyarrow public C APIs According to https://cython.readthedocs.io/en/latest/src/userguide/external_C_code.html#c-api-declarations it is not necessary to use `public` here. If we want to be able to refer to Cython extension types at the C API level (at some point, this may not be a bad idea), then we must use `public` with those. Author: Wes McKinney Closes #3147 from wesm/ARROW-3641 and squashes the following commits: f09902cb4 Remove unneeded public keyword from pyarrow public APIs --- python/pyarrow/public-api.pxi | 58 +++++++++++++++++------------------ 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index ef54c7ab42f74..7bd9154dfa8d7 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -24,11 +24,11 @@ from pyarrow.includes.libarrow cimport (CArray, CColumn, CDataType, CField, # methods don't use Status to indicate a successful operation. 
-cdef public api bint pyarrow_is_buffer(object buffer): +cdef api bint pyarrow_is_buffer(object buffer): return isinstance(buffer, Buffer) -cdef public api shared_ptr[CBuffer] pyarrow_unwrap_buffer(object buffer): +cdef api shared_ptr[CBuffer] pyarrow_unwrap_buffer(object buffer): cdef Buffer buf if pyarrow_is_buffer(buffer): buf = (buffer) @@ -37,24 +37,24 @@ cdef public api shared_ptr[CBuffer] pyarrow_unwrap_buffer(object buffer): return shared_ptr[CBuffer]() -cdef public api object pyarrow_wrap_buffer(const shared_ptr[CBuffer]& buf): +cdef api object pyarrow_wrap_buffer(const shared_ptr[CBuffer]& buf): cdef Buffer result = Buffer.__new__(Buffer) result.init(buf) return result -cdef public api object pyarrow_wrap_resizable_buffer( +cdef api object pyarrow_wrap_resizable_buffer( const shared_ptr[CResizableBuffer]& buf): cdef ResizableBuffer result = ResizableBuffer.__new__(ResizableBuffer) result.init_rz(buf) return result -cdef public api bint pyarrow_is_data_type(object type_): +cdef api bint pyarrow_is_data_type(object type_): return isinstance(type_, DataType) -cdef public api shared_ptr[CDataType] pyarrow_unwrap_data_type( +cdef api shared_ptr[CDataType] pyarrow_unwrap_data_type( object data_type): cdef DataType type_ if pyarrow_is_data_type(data_type): @@ -64,7 +64,7 @@ cdef public api shared_ptr[CDataType] pyarrow_unwrap_data_type( return shared_ptr[CDataType]() -cdef public api object pyarrow_wrap_data_type( +cdef api object pyarrow_wrap_data_type( const shared_ptr[CDataType]& type): cdef DataType out @@ -117,11 +117,11 @@ cdef shared_ptr[CKeyValueMetadata] pyarrow_unwrap_metadata(object meta): return shared_ptr[CKeyValueMetadata]() -cdef public api bint pyarrow_is_field(object field): +cdef api bint pyarrow_is_field(object field): return isinstance(field, Field) -cdef public api shared_ptr[CField] pyarrow_unwrap_field(object field): +cdef api shared_ptr[CField] pyarrow_unwrap_field(object field): cdef Field field_ if pyarrow_is_field(field): field_ = (field) @@ -130,7 +130,7 @@ cdef public api shared_ptr[CField] pyarrow_unwrap_field(object field): return shared_ptr[CField]() -cdef public api object pyarrow_wrap_field(const shared_ptr[CField]& field): +cdef api object pyarrow_wrap_field(const shared_ptr[CField]& field): if field.get() == NULL: return None cdef Field out = Field.__new__(Field) @@ -138,11 +138,11 @@ cdef public api object pyarrow_wrap_field(const shared_ptr[CField]& field): return out -cdef public api bint pyarrow_is_schema(object schema): +cdef api bint pyarrow_is_schema(object schema): return isinstance(schema, Schema) -cdef public api shared_ptr[CSchema] pyarrow_unwrap_schema(object schema): +cdef api shared_ptr[CSchema] pyarrow_unwrap_schema(object schema): cdef Schema sch if pyarrow_is_schema(schema): sch = (schema) @@ -151,17 +151,17 @@ cdef public api shared_ptr[CSchema] pyarrow_unwrap_schema(object schema): return shared_ptr[CSchema]() -cdef public api object pyarrow_wrap_schema(const shared_ptr[CSchema]& schema): +cdef api object pyarrow_wrap_schema(const shared_ptr[CSchema]& schema): cdef Schema out = Schema.__new__(Schema) out.init_schema(schema) return out -cdef public api bint pyarrow_is_array(object array): +cdef api bint pyarrow_is_array(object array): return isinstance(array, Array) -cdef public api shared_ptr[CArray] pyarrow_unwrap_array(object array): +cdef api shared_ptr[CArray] pyarrow_unwrap_array(object array): cdef Array arr if pyarrow_is_array(array): arr = (array) @@ -170,7 +170,7 @@ cdef public api shared_ptr[CArray] 
pyarrow_unwrap_array(object array): return shared_ptr[CArray]() -cdef public api object pyarrow_wrap_array(const shared_ptr[CArray]& sp_array): +cdef api object pyarrow_wrap_array(const shared_ptr[CArray]& sp_array): if sp_array.get() == NULL: raise ValueError('Array was NULL') @@ -186,7 +186,7 @@ cdef public api object pyarrow_wrap_array(const shared_ptr[CArray]& sp_array): return arr -cdef public api object pyarrow_wrap_chunked_array( +cdef api object pyarrow_wrap_chunked_array( const shared_ptr[CChunkedArray]& sp_array): if sp_array.get() == NULL: raise ValueError('ChunkedArray was NULL') @@ -201,11 +201,11 @@ cdef public api object pyarrow_wrap_chunked_array( return arr -cdef public api bint pyarrow_is_tensor(object tensor): +cdef api bint pyarrow_is_tensor(object tensor): return isinstance(tensor, Tensor) -cdef public api shared_ptr[CTensor] pyarrow_unwrap_tensor(object tensor): +cdef api shared_ptr[CTensor] pyarrow_unwrap_tensor(object tensor): cdef Tensor ten if pyarrow_is_tensor(tensor): ten = (tensor) @@ -214,7 +214,7 @@ cdef public api shared_ptr[CTensor] pyarrow_unwrap_tensor(object tensor): return shared_ptr[CTensor]() -cdef public api object pyarrow_wrap_tensor( +cdef api object pyarrow_wrap_tensor( const shared_ptr[CTensor]& sp_tensor): if sp_tensor.get() == NULL: raise ValueError('Tensor was NULL') @@ -224,11 +224,11 @@ cdef public api object pyarrow_wrap_tensor( return tensor -cdef public api bint pyarrow_is_column(object column): +cdef api bint pyarrow_is_column(object column): return isinstance(column, Column) -cdef public api shared_ptr[CColumn] pyarrow_unwrap_column(object column): +cdef api shared_ptr[CColumn] pyarrow_unwrap_column(object column): cdef Column col if pyarrow_is_column(column): col = (column) @@ -237,17 +237,17 @@ cdef public api shared_ptr[CColumn] pyarrow_unwrap_column(object column): return shared_ptr[CColumn]() -cdef public api object pyarrow_wrap_column(const shared_ptr[CColumn]& ccolumn): +cdef api object pyarrow_wrap_column(const shared_ptr[CColumn]& ccolumn): cdef Column column = Column.__new__(Column) column.init(ccolumn) return column -cdef public api bint pyarrow_is_table(object table): +cdef api bint pyarrow_is_table(object table): return isinstance(table, Table) -cdef public api shared_ptr[CTable] pyarrow_unwrap_table(object table): +cdef api shared_ptr[CTable] pyarrow_unwrap_table(object table): cdef Table tab if pyarrow_is_table(table): tab = (table) @@ -256,17 +256,17 @@ cdef public api shared_ptr[CTable] pyarrow_unwrap_table(object table): return shared_ptr[CTable]() -cdef public api object pyarrow_wrap_table(const shared_ptr[CTable]& ctable): +cdef api object pyarrow_wrap_table(const shared_ptr[CTable]& ctable): cdef Table table = Table.__new__(Table) table.init(ctable) return table -cdef public api bint pyarrow_is_batch(object batch): +cdef api bint pyarrow_is_batch(object batch): return isinstance(batch, RecordBatch) -cdef public api shared_ptr[CRecordBatch] pyarrow_unwrap_batch(object batch): +cdef api shared_ptr[CRecordBatch] pyarrow_unwrap_batch(object batch): cdef RecordBatch bat if pyarrow_is_batch(batch): bat = (batch) @@ -275,7 +275,7 @@ cdef public api shared_ptr[CRecordBatch] pyarrow_unwrap_batch(object batch): return shared_ptr[CRecordBatch]() -cdef public api object pyarrow_wrap_batch( +cdef api object pyarrow_wrap_batch( const shared_ptr[CRecordBatch]& cbatch): cdef RecordBatch batch = RecordBatch.__new__(RecordBatch) batch.init(cbatch) From 24d00c0783e07ba4a7247779f569cd745ae60185 Mon Sep 17 00:00:00 2001 From: Wes 
McKinney Date: Mon, 10 Dec 2018 17:55:23 -0600 Subject: [PATCH 018/328] ARROW-3248: [C++] Add "arrow" prefix to Arrow core unit tests, use PREFIX instead of file name for csv, io, ipc tests. Modular target cleanup I added a section to the cpp/README.md about the modular build targets. Author: Wes McKinney Closes #3152 from wesm/ARROW-3248 and squashes the following commits: ba3a3e58c Need to add arrow- prefix to some Travis scripts 1f3daaf78 Rename io/ipc tests/executables. Add appropriate labels/prefixes to all unit tests/benchmarks. Add labels option to ADD_BENCHMARK --- ci/cpp-msvc-build-main.bat | 2 +- ci/travis_script_python.sh | 4 +- cpp/CMakeLists.txt | 12 ----- cpp/README.md | 15 +++++- cpp/cmake_modules/BuildUtils.cmake | 24 +++++++--- cpp/cmake_modules/ThirdpartyToolchain.cmake | 6 +-- cpp/src/arrow/CMakeLists.txt | 48 +++++++++++++++++++ cpp/src/arrow/csv/CMakeLists.txt | 18 ++++--- .../{csv-chunker-test.cc => chunker-test.cc} | 0 ...builder-test.cc => column-builder-test.cc} | 0 ...er-benchmark.cc => converter-benchmark.cc} | 0 ...sv-converter-test.cc => converter-test.cc} | 0 ...arser-benchmark.cc => parser-benchmark.cc} | 0 .../{csv-parser-test.cc => parser-test.cc} | 0 cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt | 2 +- cpp/src/arrow/io/CMakeLists.txt | 24 ++++++---- .../{io-buffered-test.cc => buffered-test.cc} | 0 ...-compressed-test.cc => compressed-test.cc} | 0 ...io-file-benchmark.cc => file-benchmark.cc} | 0 .../io/{io-file-test.cc => file-test.cc} | 0 .../io/{io-hdfs-test.cc => hdfs-test.cc} | 0 ...emory-benchmark.cc => memory-benchmark.cc} | 0 .../io/{io-memory-test.cc => memory-test.cc} | 0 ...io-readahead-test.cc => readahead-test.cc} | 0 cpp/src/arrow/ipc/CMakeLists.txt | 13 +++-- ...son-simple-test.cc => json-simple-test.cc} | 0 .../ipc/{ipc-json-test.cc => json-test.cc} | 0 ...e-benchmark.cc => read-write-benchmark.cc} | 0 ...-read-write-test.cc => read-write-test.cc} | 0 cpp/src/arrow/util/CMakeLists.txt | 6 +-- cpp/src/gandiva/CMakeLists.txt | 4 +- cpp/src/gandiva/tests/CMakeLists.txt | 2 +- cpp/src/parquet/CMakeLists.txt | 4 +- cpp/src/plasma/CMakeLists.txt | 18 +++++-- 34 files changed, 145 insertions(+), 57 deletions(-) rename cpp/src/arrow/csv/{csv-chunker-test.cc => chunker-test.cc} (100%) rename cpp/src/arrow/csv/{csv-column-builder-test.cc => column-builder-test.cc} (100%) rename cpp/src/arrow/csv/{csv-converter-benchmark.cc => converter-benchmark.cc} (100%) rename cpp/src/arrow/csv/{csv-converter-test.cc => converter-test.cc} (100%) rename cpp/src/arrow/csv/{csv-parser-benchmark.cc => parser-benchmark.cc} (100%) rename cpp/src/arrow/csv/{csv-parser-test.cc => parser-test.cc} (100%) rename cpp/src/arrow/io/{io-buffered-test.cc => buffered-test.cc} (100%) rename cpp/src/arrow/io/{io-compressed-test.cc => compressed-test.cc} (100%) rename cpp/src/arrow/io/{io-file-benchmark.cc => file-benchmark.cc} (100%) rename cpp/src/arrow/io/{io-file-test.cc => file-test.cc} (100%) rename cpp/src/arrow/io/{io-hdfs-test.cc => hdfs-test.cc} (100%) rename cpp/src/arrow/io/{io-memory-benchmark.cc => memory-benchmark.cc} (100%) rename cpp/src/arrow/io/{io-memory-test.cc => memory-test.cc} (100%) rename cpp/src/arrow/io/{io-readahead-test.cc => readahead-test.cc} (100%) rename cpp/src/arrow/ipc/{ipc-json-simple-test.cc => json-simple-test.cc} (100%) rename cpp/src/arrow/ipc/{ipc-json-test.cc => json-test.cc} (100%) rename cpp/src/arrow/ipc/{ipc-read-write-benchmark.cc => read-write-benchmark.cc} (100%) rename cpp/src/arrow/ipc/{ipc-read-write-test.cc => read-write-test.cc} 
(100%) diff --git a/ci/cpp-msvc-build-main.bat b/ci/cpp-msvc-build-main.bat index 7349f8d3aca6b..8703dc9631773 100644 --- a/ci/cpp-msvc-build-main.bat +++ b/ci/cpp-msvc-build-main.bat @@ -55,7 +55,7 @@ cmake -G "%GENERATOR%" %CMAKE_ARGS% ^ .. || exit /B cmake --build . --target install --config %CONFIGURATION% || exit /B -@rem Needed so python-test.exe works +@rem Needed so arrow-python-test.exe works set OLD_PYTHONHOME=%PYTHONHOME% set PYTHONHOME=%CONDA_PREFIX% diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index b316c81f3b6b0..25bec262d861c 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -38,7 +38,7 @@ conda activate $CONDA_ENV_DIR # We should use zlib in the target Python directory to avoid loading # wrong libpython on macOS at run-time. If we use zlib in # $ARROW_BUILD_TOOLCHAIN and libpython3.6m.dylib exists in both -# $ARROW_BUILD_TOOLCHAIN and $CONDA_ENV_DIR, python-test uses +# $ARROW_BUILD_TOOLCHAIN and $CONDA_ENV_DIR, arrow-python-test uses # libpython3.6m.dylib on $ARROW_BUILD_TOOLCHAIN not $CONDA_ENV_DIR. # libpython3.6m.dylib on $ARROW_BUILD_TOOLCHAIN doesn't have NumPy. So # python-test fails. @@ -113,7 +113,7 @@ ninja install popd # python-test isn't run by travis_script_cpp.sh, exercise it here -$ARROW_CPP_BUILD_DIR/$ARROW_BUILD_TYPE/python-test +$ARROW_CPP_BUILD_DIR/$ARROW_BUILD_TYPE/arrow-python-test pushd $ARROW_PYTHON_DIR diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 68ac84e42dd6a..7140d05d577f2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -793,18 +793,6 @@ endif() add_subdirectory(src/arrow) -if(ARROW_FLIGHT) - add_subdirectory(src/arrow/flight) -endif() - -if(ARROW_PYTHON) - add_subdirectory(src/arrow/python) -endif() - -if(ARROW_HIVESERVER2) - add_subdirectory(src/arrow/dbi/hiveserver2) -endif() - if(ARROW_PARQUET) add_subdirectory(src/parquet) add_subdirectory(tools/parquet) diff --git a/cpp/README.md b/cpp/README.md index 394b23d69f8fc..7d0851762c291 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -82,7 +82,18 @@ environment variable (which requires the `locales` package or equivalent): export LC_ALL="en_US.UTF-8" ``` -## Building and Developing Parquet Libraries +## Modular Build Targets + +Since there are several major parts of the C++ project, we have provided +modular CMake targets for building each component along with its dependencies, +unit tests, and benchmarks (if enabled): + +* `make arrow` for Arrow core libraries +* `make parquet` for Parquet libraries +* `make gandiva` for Gandiva (LLVM expression compiler) libraries +* `make plasma` for Plasma libraries, server + +## Parquet Development Notes To build the C++ libraries for Apache Parquet, add the flag `-DARROW_PARQUET=ON` when invoking CMake. The Parquet libraries and unit tests @@ -120,7 +131,7 @@ with the `--ARROW_BUILD_BENCHMARKS` parameter set correctly: cmake -DARROW_BUILD_BENCHMARKS=ON .. and instead of make unittest run either `make; ctest` to run both unit tests -and benchmarks or `make runbenchmark` to run only the benchmark tests. +and benchmarks or `make benchmark` to run only the benchmark tests. Benchmark logs will be placed in the build directory under `build/benchmark-logs`. 
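[Editor's note] To make the modular-target workflow described in the README text above concrete, here is a minimal sketch of the commands involved. The target names (`arrow`, `benchmark`) and the `-DARROW_BUILD_TESTS=ON` / `-DARROW_BUILD_BENCHMARKS=ON` options come from this patch itself; the out-of-source `cpp/build` directory is an assumption for illustration, not something the patch prescribes.

```shell
# Configure an out-of-source build with unit tests and benchmarks enabled
# (the build directory name is illustrative).
mkdir -p cpp/build && cd cpp/build
cmake -DARROW_BUILD_TESTS=ON -DARROW_BUILD_BENCHMARKS=ON ..

# Build only the Arrow core libraries plus their tests and benchmarks
make arrow

# Run only the benchmark tests (equivalent to `ctest -L benchmark`)
make benchmark
```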
diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 916b9ebddb88e..bcf672823b424 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -290,7 +290,7 @@ endfunction() ############################################################ # Add a new micro benchmark, with or without an executable that should be built. # If benchmarks are enabled then they will be run along side unit tests with ctest. -# 'make runbenchmark' and 'make unittest' to build/run only benchmark or unittests, +# 'make benchmark' and 'make unittest' to build/run only benchmark or unittests, # respectively. # # REL_BENCHMARK_NAME is the name of the benchmark app. It may be a single component @@ -306,10 +306,10 @@ endfunction() # \arg PREFIX a string to append to the name of the benchmark executable. For # example, if you have src/arrow/foo/bar-benchmark.cc, then PREFIX "foo" will # create test executable foo-bar-benchmark -function(ADD_ARROW_BENCHMARK REL_BENCHMARK_NAME) +function(ADD_BENCHMARK REL_BENCHMARK_NAME) set(options) set(one_value_args) - set(multi_value_args EXTRA_LINK_LIBS DEPENDENCIES PREFIX) + set(multi_value_args EXTRA_LINK_LIBS DEPENDENCIES PREFIX LABELS) cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) if(ARG_UNPARSED_ARGUMENTS) message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") @@ -329,7 +329,7 @@ function(ADD_ARROW_BENCHMARK REL_BENCHMARK_NAME) set(BENCHMARK_PATH "${EXECUTABLE_OUTPUT_PATH}/${BENCHMARK_NAME}") add_executable(${BENCHMARK_NAME} "${REL_BENCHMARK_NAME}.cc") target_link_libraries(${BENCHMARK_NAME} ${ARROW_BENCHMARK_LINK_LIBS}) - add_dependencies(runbenchmark ${BENCHMARK_NAME}) + add_dependencies(benchmark ${BENCHMARK_NAME}) set(NO_COLOR "--color_print=false") if (ARG_EXTRA_LINK_LIBS) @@ -345,9 +345,21 @@ function(ADD_ARROW_BENCHMARK REL_BENCHMARK_NAME) add_dependencies(${BENCHMARK_NAME} ${ARG_DEPENDENCIES}) endif() + if (ARG_LABELS) + set(ARG_LABELS "${ARG_LABELS}") + else() + set(ARG_LABELS benchmark) + endif() + + foreach (TEST_LABEL ${ARG_LABELS}) + add_dependencies(${TEST_LABEL} ${BENCHMARK_NAME}) + endforeach() + add_test(${BENCHMARK_NAME} ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} benchmark ${BENCHMARK_PATH} ${NO_COLOR}) - set_tests_properties(${BENCHMARK_NAME} PROPERTIES LABELS "benchmark") + set_property(TEST ${BENCHMARK_NAME} + APPEND PROPERTY + LABELS ${ARG_LABELS}) endfunction() ############################################################ @@ -377,7 +389,7 @@ endfunction() # multiple unit tests in some subgroup, you can assign a test to multiple # groups using the syntax unittest;GROUP2;GROUP3. 
Custom targets for the group # names must exist -function(ADD_ARROW_TEST REL_TEST_NAME) +function(ADD_TEST_CASE REL_TEST_NAME) set(options NO_VALGRIND ENABLED) set(one_value_args) set(multi_value_args SOURCES STATIC_LINK_LIBS EXTRA_LINK_LIBS EXTRA_INCLUDES diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 9829a4d3fbd80..6850b0bddefc5 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -627,7 +627,7 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) endif() if(ARROW_BUILD_BENCHMARKS) - add_custom_target(runbenchmark ctest -L benchmark) + add_custom_target(benchmark ctest -L benchmark) if("$ENV{GBENCHMARK_HOME}" STREQUAL "") if(NOT MSVC) @@ -664,11 +664,11 @@ if(ARROW_BUILD_BENCHMARKS) message(STATUS "GBenchmark include dir: ${GBENCHMARK_INCLUDE_DIR}") message(STATUS "GBenchmark static library: ${GBENCHMARK_STATIC_LIB}") include_directories(SYSTEM ${GBENCHMARK_INCLUDE_DIR}) - ADD_THIRDPARTY_LIB(benchmark + ADD_THIRDPARTY_LIB(gbenchmark STATIC_LIB ${GBENCHMARK_STATIC_LIB}) if(GBENCHMARK_VENDORED) - add_dependencies(benchmark_static gbenchmark_ep) + add_dependencies(gbenchmark_static gbenchmark_ep) endif() endif() diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 8e932680de034..13aaeab494090 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -15,6 +15,42 @@ # specific language governing permissions and limitations # under the License. +add_custom_target(arrow) + +# Adding unit tests part of the "arrow" portion of the test suite +function(ADD_ARROW_TEST REL_TEST_NAME) + set(options) + set(one_value_args PREFIX) + set(multi_value_args) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + + if (ARG_PREFIX) + set(PREFIX ${ARG_PREFIX}) + else() + set(PREFIX "arrow") + endif() + ADD_TEST_CASE(${REL_TEST_NAME} + PREFIX ${PREFIX} + LABELS "unittest;arrow" + ${ARG_UNPARSED_ARGUMENTS}) +endfunction() + +function(ADD_ARROW_BENCHMARK REL_TEST_NAME) + set(options) + set(one_value_args PREFIX) + set(multi_value_args) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + if (ARG_PREFIX) + set(PREFIX ${ARG_PREFIX}) + else() + set(PREFIX "arrow") + endif() + ADD_BENCHMARK(${REL_TEST_NAME} + PREFIX ${PREFIX} + LABELS "benchmark;arrow" + ${ARG_UNPARSED_ARGUMENTS}) +endfunction() + set(ARROW_SRCS array.cc buffer.cc @@ -263,3 +299,15 @@ ADD_ARROW_BENCHMARK(column-benchmark) add_subdirectory(csv) add_subdirectory(io) add_subdirectory(util) + +if(ARROW_FLIGHT) + add_subdirectory(flight) +endif() + +if(ARROW_PYTHON) + add_subdirectory(python) +endif() + +if(ARROW_HIVESERVER2) + add_subdirectory(dbi/hiveserver2) +endif() diff --git a/cpp/src/arrow/csv/CMakeLists.txt b/cpp/src/arrow/csv/CMakeLists.txt index 84b080b1eef09..db23d6feff111 100644 --- a/cpp/src/arrow/csv/CMakeLists.txt +++ b/cpp/src/arrow/csv/CMakeLists.txt @@ -15,13 +15,19 @@ # specific language governing permissions and limitations # under the License. 
-ADD_ARROW_TEST(csv-chunker-test) -ADD_ARROW_TEST(csv-column-builder-test) -ADD_ARROW_TEST(csv-converter-test) -ADD_ARROW_TEST(csv-parser-test) +ADD_ARROW_TEST(chunker-test + PREFIX "arrow-csv") +ADD_ARROW_TEST(column-builder-test + PREFIX "arrow-csv") +ADD_ARROW_TEST(converter-test + PREFIX "arrow-csv") +ADD_ARROW_TEST(parser-test + PREFIX "arrow-csv") -ADD_ARROW_BENCHMARK(csv-converter-benchmark) -ADD_ARROW_BENCHMARK(csv-parser-benchmark) +ADD_ARROW_BENCHMARK(converter-benchmark + PREFIX "arrow-csv") +ADD_ARROW_BENCHMARK(parser-benchmark + PREFIX "arrow-csv") # Headers: top level file(GLOB_RECURSE ARROW_CSV_HEADERS "*.h") diff --git a/cpp/src/arrow/csv/csv-chunker-test.cc b/cpp/src/arrow/csv/chunker-test.cc similarity index 100% rename from cpp/src/arrow/csv/csv-chunker-test.cc rename to cpp/src/arrow/csv/chunker-test.cc diff --git a/cpp/src/arrow/csv/csv-column-builder-test.cc b/cpp/src/arrow/csv/column-builder-test.cc similarity index 100% rename from cpp/src/arrow/csv/csv-column-builder-test.cc rename to cpp/src/arrow/csv/column-builder-test.cc diff --git a/cpp/src/arrow/csv/csv-converter-benchmark.cc b/cpp/src/arrow/csv/converter-benchmark.cc similarity index 100% rename from cpp/src/arrow/csv/csv-converter-benchmark.cc rename to cpp/src/arrow/csv/converter-benchmark.cc diff --git a/cpp/src/arrow/csv/csv-converter-test.cc b/cpp/src/arrow/csv/converter-test.cc similarity index 100% rename from cpp/src/arrow/csv/csv-converter-test.cc rename to cpp/src/arrow/csv/converter-test.cc diff --git a/cpp/src/arrow/csv/csv-parser-benchmark.cc b/cpp/src/arrow/csv/parser-benchmark.cc similarity index 100% rename from cpp/src/arrow/csv/csv-parser-benchmark.cc rename to cpp/src/arrow/csv/parser-benchmark.cc diff --git a/cpp/src/arrow/csv/csv-parser-test.cc b/cpp/src/arrow/csv/parser-test.cc similarity index 100% rename from cpp/src/arrow/csv/csv-parser-test.cc rename to cpp/src/arrow/csv/parser-test.cc diff --git a/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt b/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt index 3a16a7834c3c1..eb4446f05d971 100644 --- a/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt +++ b/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt @@ -115,7 +115,7 @@ if (ARROW_BUILD_TESTS) STATIC_LINK_LIBS "${ARROW_HIVESERVER2_TEST_LINK_LIBS}" LABELS "arrow_hiveserver2" ) - set_property(TARGET hiveserver2-test + set_property(TARGET arrow-hiveserver2-test APPEND_STRING PROPERTY COMPILE_FLAGS " -Wno-shadow-field") endif(ARROW_BUILD_TESTS) diff --git a/cpp/src/arrow/io/CMakeLists.txt b/cpp/src/arrow/io/CMakeLists.txt index d21bb16755271..80d68fb503bb9 100644 --- a/cpp/src/arrow/io/CMakeLists.txt +++ b/cpp/src/arrow/io/CMakeLists.txt @@ -18,19 +18,27 @@ # ---------------------------------------------------------------------- # arrow_io : Arrow IO interfaces -ADD_ARROW_TEST(io-buffered-test) -ADD_ARROW_TEST(io-compressed-test) -ADD_ARROW_TEST(io-file-test) +ADD_ARROW_TEST(buffered-test + PREFIX "arrow-io") +ADD_ARROW_TEST(compressed-test + PREFIX "arrow-io") +ADD_ARROW_TEST(file-test + PREFIX "arrow-io") if (ARROW_HDFS AND NOT ARROW_BOOST_HEADER_ONLY) - ADD_ARROW_TEST(io-hdfs-test NO_VALGRIND) + ADD_ARROW_TEST(hdfs-test NO_VALGRIND + PREFIX "arrow-io") endif() -ADD_ARROW_TEST(io-memory-test) -ADD_ARROW_TEST(io-readahead-test) +ADD_ARROW_TEST(memory-test + PREFIX "arrow-io") +ADD_ARROW_TEST(readahead-test + PREFIX "arrow-io") -ADD_ARROW_BENCHMARK(io-file-benchmark) -ADD_ARROW_BENCHMARK(io-memory-benchmark) +ADD_ARROW_BENCHMARK(file-benchmark + PREFIX "arrow-io") +ADD_ARROW_BENCHMARK(memory-benchmark + 
PREFIX "arrow-io") # Headers: top level install(FILES diff --git a/cpp/src/arrow/io/io-buffered-test.cc b/cpp/src/arrow/io/buffered-test.cc similarity index 100% rename from cpp/src/arrow/io/io-buffered-test.cc rename to cpp/src/arrow/io/buffered-test.cc diff --git a/cpp/src/arrow/io/io-compressed-test.cc b/cpp/src/arrow/io/compressed-test.cc similarity index 100% rename from cpp/src/arrow/io/io-compressed-test.cc rename to cpp/src/arrow/io/compressed-test.cc diff --git a/cpp/src/arrow/io/io-file-benchmark.cc b/cpp/src/arrow/io/file-benchmark.cc similarity index 100% rename from cpp/src/arrow/io/io-file-benchmark.cc rename to cpp/src/arrow/io/file-benchmark.cc diff --git a/cpp/src/arrow/io/io-file-test.cc b/cpp/src/arrow/io/file-test.cc similarity index 100% rename from cpp/src/arrow/io/io-file-test.cc rename to cpp/src/arrow/io/file-test.cc diff --git a/cpp/src/arrow/io/io-hdfs-test.cc b/cpp/src/arrow/io/hdfs-test.cc similarity index 100% rename from cpp/src/arrow/io/io-hdfs-test.cc rename to cpp/src/arrow/io/hdfs-test.cc diff --git a/cpp/src/arrow/io/io-memory-benchmark.cc b/cpp/src/arrow/io/memory-benchmark.cc similarity index 100% rename from cpp/src/arrow/io/io-memory-benchmark.cc rename to cpp/src/arrow/io/memory-benchmark.cc diff --git a/cpp/src/arrow/io/io-memory-test.cc b/cpp/src/arrow/io/memory-test.cc similarity index 100% rename from cpp/src/arrow/io/io-memory-test.cc rename to cpp/src/arrow/io/memory-test.cc diff --git a/cpp/src/arrow/io/io-readahead-test.cc b/cpp/src/arrow/io/readahead-test.cc similarity index 100% rename from cpp/src/arrow/io/io-readahead-test.cc rename to cpp/src/arrow/io/readahead-test.cc diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 40cebf1823e2c..bda4ef3e417d5 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -19,9 +19,12 @@ # Messaging and interprocess communication ADD_ARROW_TEST(feather-test) -ADD_ARROW_TEST(ipc-read-write-test) -ADD_ARROW_TEST(ipc-json-simple-test) -ADD_ARROW_TEST(ipc-json-test) +ADD_ARROW_TEST(read-write-test + PREFIX "arrow-ipc") +ADD_ARROW_TEST(json-simple-test + PREFIX "arrow-ipc") +ADD_ARROW_TEST(json-test + PREFIX "arrow-ipc") if (NOT ARROW_BOOST_HEADER_ONLY) ADD_ARROW_TEST(json-integration-test @@ -116,6 +119,6 @@ if (ARROW_BUILD_UTILITIES) target_link_libraries(stream-to-file ${UTIL_LINK_LIBS}) endif() -ADD_ARROW_BENCHMARK(ipc-read-write-benchmark) - +ADD_ARROW_BENCHMARK(read-write-benchmark + PREFIX "arrow-ipc") ADD_ARROW_FUZZING(ipc-fuzzing-test) diff --git a/cpp/src/arrow/ipc/ipc-json-simple-test.cc b/cpp/src/arrow/ipc/json-simple-test.cc similarity index 100% rename from cpp/src/arrow/ipc/ipc-json-simple-test.cc rename to cpp/src/arrow/ipc/json-simple-test.cc diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/json-test.cc similarity index 100% rename from cpp/src/arrow/ipc/ipc-json-test.cc rename to cpp/src/arrow/ipc/json-test.cc diff --git a/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc b/cpp/src/arrow/ipc/read-write-benchmark.cc similarity index 100% rename from cpp/src/arrow/ipc/ipc-read-write-benchmark.cc rename to cpp/src/arrow/ipc/read-write-benchmark.cc diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/read-write-test.cc similarity index 100% rename from cpp/src/arrow/ipc/ipc-read-write-test.cc rename to cpp/src/arrow/ipc/read-write-test.cc diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 6b9c3590b44dc..4f515b52e8e64 100644 --- 
a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -68,16 +68,16 @@ if (ARROW_BUILD_BENCHMARKS) add_library(arrow_benchmark_main benchmark_main.cc) if (APPLE) target_link_libraries(arrow_benchmark_main - benchmark_static + gbenchmark_static ) elseif(MSVC) target_link_libraries(arrow_benchmark_main - benchmark_static + gbenchmark_static Shlwapi.lib ) else() target_link_libraries(arrow_benchmark_main - benchmark_static + gbenchmark_static pthread ) endif() diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 1f76f7841590a..5d75aa271152b 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -153,11 +153,11 @@ function(ADD_GANDIVA_TEST REL_TEST_NAME) # and uses less disk space, but in some cases we need to force static # linking (see rationale below). if (ARG_USE_STATIC_LINKING) - ADD_ARROW_TEST(${REL_TEST_NAME} + ADD_TEST_CASE(${REL_TEST_NAME} ${TEST_ARGUMENTS} STATIC_LINK_LIBS ${GANDIVA_STATIC_TEST_LINK_LIBS}) else() - ADD_ARROW_TEST(${REL_TEST_NAME} + ADD_TEST_CASE(${REL_TEST_NAME} ${TEST_ARGUMENTS} STATIC_LINK_LIBS ${GANDIVA_SHARED_TEST_LINK_LIBS}) endif() diff --git a/cpp/src/gandiva/tests/CMakeLists.txt b/cpp/src/gandiva/tests/CMakeLists.txt index 1fd30aac495cf..9558fc0757f7b 100644 --- a/cpp/src/gandiva/tests/CMakeLists.txt +++ b/cpp/src/gandiva/tests/CMakeLists.txt @@ -32,6 +32,6 @@ ADD_GANDIVA_TEST(projector_test_static SOURCES projector_test.cc USE_STATIC_LINKING) -ADD_ARROW_BENCHMARK(micro_benchmarks +ADD_BENCHMARK(micro_benchmarks PREFIX "gandiva" EXTRA_LINK_LIBS gandiva_static) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 1538b58164b62..246f69dcc09fa 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -38,12 +38,12 @@ function(ADD_PARQUET_TEST REL_TEST_NAME) # and uses less disk space, but in some cases we need to force static # linking (see rationale below). if (ARG_USE_STATIC_LINKING) - ADD_ARROW_TEST(${REL_TEST_NAME} + ADD_TEST_CASE(${REL_TEST_NAME} STATIC_LINK_LIBS ${PARQUET_STATIC_TEST_LINK_LIBS} PREFIX "parquet" LABELS "unittest;parquet") else() - ADD_ARROW_TEST(${REL_TEST_NAME} + ADD_TEST_CASE(${REL_TEST_NAME} STATIC_LINK_LIBS ${PARQUET_SHARED_TEST_LINK_LIBS} PREFIX "parquet" LABELS "unittest;parquet") diff --git a/cpp/src/plasma/CMakeLists.txt b/cpp/src/plasma/CMakeLists.txt index 0f8916e6c48aa..4ea4b76066cf7 100644 --- a/cpp/src/plasma/CMakeLists.txt +++ b/cpp/src/plasma/CMakeLists.txt @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-cmake_minimum_required(VERSION 3.2) +add_custom_target(plasma) # For the moment, Plasma is versioned like Arrow project(plasma VERSION "${ARROW_BASE_VERSION}") @@ -198,8 +198,20 @@ endif() # Unit tests ####################################### -ADD_ARROW_TEST(test/serialization_tests +# Adding unit tests part of the "arrow" portion of the test suite +function(ADD_PLASMA_TEST REL_TEST_NAME) + set(options) + set(one_value_args) + set(multi_value_args) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + ADD_TEST_CASE(${REL_TEST_NAME} + PREFIX "plasma" + LABELS "unittest;plasma" + ${ARG_UNPARSED_ARGUMENTS}) +endfunction() + +ADD_PLASMA_TEST(test/serialization_tests EXTRA_LINK_LIBS plasma_shared ${PLASMA_LINK_LIBS}) -ADD_ARROW_TEST(test/client_tests +ADD_PLASMA_TEST(test/client_tests EXTRA_LINK_LIBS plasma_shared ${PLASMA_LINK_LIBS} EXTRA_DEPENDENCIES plasma_store_server) From 2428945c0684bed4295f783caaf4a681ef785d90 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 10 Dec 2018 20:03:34 -0700 Subject: [PATCH 019/328] ARROW-3880: [Rust] Implement simple math operations for numeric arrays Author: Andy Grove Closes #3033 from andygrove/ARROW-3880 and squashes the following commits: 17cd418 merge from master afb3518 Move min and max to array_ops 0c77c61 code cleanup f8bfb41 move comparison ops to array_ops 7a5975e Move math ops into new array_ops source file 7946142 Address PR feedback adfe4b0 merge from master and fix conflicts 5ed5f6e add comparison operations 42c68af re-implement with generics 963def6 Merge branch 'master' into ARROW-3880 729cd9a fix formatting 405c63e re-implement using macros 5876fb7 save work a2b87e2 merge from master, comment out new methods 2a43b3f merge from master 06bbc4a improve handling of divide by zero, format for rust nightly 1ea98cf Improve error handling dcad28a cargo fmt 12dc05b Implement simple math operations for numeric arrays --- rust/Cargo.toml | 1 + rust/src/array.rs | 47 ----- rust/src/array_ops.rs | 418 ++++++++++++++++++++++++++++++++++++++++++ rust/src/error.rs | 2 + rust/src/lib.rs | 1 + 5 files changed, 422 insertions(+), 47 deletions(-) create mode 100644 rust/src/array_ops.rs diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 39de50c8a336d..aa23815f74085 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -42,6 +42,7 @@ serde_derive = "1.0.80" serde_json = "1.0.13" rand = "0.5" csv = "1.0.0" +num = "0.2" [dev-dependencies] criterion = "0.2" diff --git a/rust/src/array.rs b/rust/src/array.rs index 51bc8d993c19b..11e732a1267ea 100644 --- a/rust/src/array.rs +++ b/rust/src/array.rs @@ -200,39 +200,6 @@ impl PrimitiveArray { &raw[offset..offset + len] } - /// Returns the minimum value in the array, according to the natural order. - pub fn min(&self) -> Option { - self.min_max_helper(|a, b| a < b) - } - - /// Returns the maximum value in the array, according to the natural order. 
- pub fn max(&self) -> Option { - self.min_max_helper(|a, b| a > b) - } - - fn min_max_helper(&self, cmp: F) -> Option - where - F: Fn(T::Native, T::Native) -> bool, - { - let mut n: Option = None; - let data = self.data(); - for i in 0..data.len() { - if data.is_null(i) { - continue; - } - let m = self.value(i); - match n { - None => n = Some(m), - Some(nn) => { - if cmp(m, nn) { - n = Some(m) - } - } - } - } - n - } - // Returns a new primitive array builder pub fn builder(capacity: usize) -> PrimitiveArrayBuilder { PrimitiveArrayBuilder::::new(capacity) @@ -1218,20 +1185,6 @@ mod tests { BinaryArray::from(array_data); } - #[test] - fn test_buffer_array_min_max() { - let a = Int32Array::from(vec![5, 6, 7, 8, 9]); - assert_eq!(5, a.min().unwrap()); - assert_eq!(9, a.max().unwrap()); - } - - #[test] - fn test_buffer_array_min_max_with_nulls() { - let a = Int32Array::from(vec![Some(5), None, None, Some(8), Some(9)]); - assert_eq!(5, a.min().unwrap()); - assert_eq!(9, a.max().unwrap()); - } - #[test] fn test_access_array_concurrently() { let a = Int32Array::from(vec![5, 6, 7, 8, 9]); diff --git a/rust/src/array_ops.rs b/rust/src/array_ops.rs new file mode 100644 index 0000000000000..e73a858e951b1 --- /dev/null +++ b/rust/src/array_ops.rs @@ -0,0 +1,418 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::ops::{Add, Div, Mul, Sub}; + +use num::Zero; + +use crate::array::{Array, BooleanArray, PrimitiveArray}; +use crate::builder::{ArrayBuilder, PrimitiveArrayBuilder}; +use crate::datatypes; +use crate::datatypes::ArrowNumericType; +use crate::error::{ArrowError, Result}; + +pub fn add(left: &PrimitiveArray, right: &PrimitiveArray) -> Result> +where + T: datatypes::ArrowNumericType, + T::Native: Add + + Sub + + Mul + + Div + + Zero, +{ + math_op(left, right, |a, b| Ok(a + b)) +} + +pub fn subtract(left: &PrimitiveArray, right: &PrimitiveArray) -> Result> +where + T: datatypes::ArrowNumericType, + T::Native: Add + + Sub + + Mul + + Div + + Zero, +{ + math_op(left, right, |a, b| Ok(a - b)) +} + +pub fn multiply(left: &PrimitiveArray, right: &PrimitiveArray) -> Result> +where + T: datatypes::ArrowNumericType, + T::Native: Add + + Sub + + Mul + + Div + + Zero, +{ + math_op(left, right, |a, b| Ok(a * b)) +} + +pub fn divide(left: &PrimitiveArray, right: &PrimitiveArray) -> Result> +where + T: datatypes::ArrowNumericType, + T::Native: Add + + Sub + + Mul + + Div + + Zero, +{ + math_op(left, right, |a, b| { + if b.is_zero() { + Err(ArrowError::DivideByZero) + } else { + Ok(a / b) + } + }) +} + +fn math_op( + left: &PrimitiveArray, + right: &PrimitiveArray, + op: F, +) -> Result> +where + T: datatypes::ArrowNumericType, + F: Fn(T::Native, T::Native) -> Result, +{ + if left.len() != right.len() { + return Err(ArrowError::ComputeError( + "Cannot perform math operation on two batches of different length".to_string(), + )); + } + let mut b = PrimitiveArrayBuilder::::new(left.len()); + for i in 0..left.len() { + let index = i; + if left.is_null(i) || right.is_null(i) { + b.push_null().unwrap(); + } else { + b.push(op(left.value(index), right.value(index))?).unwrap(); + } + } + Ok(b.finish()) +} + +/// Returns the minimum value in the array, according to the natural order. +pub fn min(array: &PrimitiveArray) -> Option +where + T: ArrowNumericType, +{ + min_max_helper(array, |a, b| a < b) +} + +/// Returns the maximum value in the array, according to the natural order. 
+pub fn max(array: &PrimitiveArray) -> Option +where + T: ArrowNumericType, +{ + min_max_helper(array, |a, b| a > b) +} + +fn min_max_helper(array: &PrimitiveArray, cmp: F) -> Option +where + T: ArrowNumericType, + F: Fn(T::Native, T::Native) -> bool, +{ + let mut n: Option = None; + let data = array.data(); + for i in 0..data.len() { + if data.is_null(i) { + continue; + } + let m = array.value(i); + match n { + None => n = Some(m), + Some(nn) => { + if cmp(m, nn) { + n = Some(m) + } + } + } + } + n +} + +pub fn eq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result +where + T: ArrowNumericType, +{ + bool_op(left, right, |a, b| a == b) +} + +pub fn neq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result +where + T: ArrowNumericType, +{ + bool_op(left, right, |a, b| a != b) +} + +pub fn lt(left: &PrimitiveArray, right: &PrimitiveArray) -> Result +where + T: ArrowNumericType, +{ + bool_op(left, right, |a, b| match (a, b) { + (None, _) => true, + (_, None) => false, + (Some(aa), Some(bb)) => aa < bb, + }) +} + +pub fn lt_eq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result +where + T: ArrowNumericType, +{ + bool_op(left, right, |a, b| match (a, b) { + (None, _) => true, + (_, None) => false, + (Some(aa), Some(bb)) => aa <= bb, + }) +} + +pub fn gt(left: &PrimitiveArray, right: &PrimitiveArray) -> Result +where + T: ArrowNumericType, +{ + bool_op(left, right, |a, b| match (a, b) { + (None, _) => false, + (_, None) => true, + (Some(aa), Some(bb)) => aa > bb, + }) +} + +pub fn gt_eq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result +where + T: ArrowNumericType, +{ + bool_op(left, right, |a, b| match (a, b) { + (None, _) => false, + (_, None) => true, + (Some(aa), Some(bb)) => aa >= bb, + }) +} + +fn bool_op(left: &PrimitiveArray, right: &PrimitiveArray, op: F) -> Result +where + T: ArrowNumericType, + F: Fn(Option, Option) -> bool, +{ + if left.len() != right.len() { + return Err(ArrowError::ComputeError( + "Cannot perform math operation on two batches of different length".to_string(), + )); + } + let mut b = BooleanArray::builder(left.len()); + for i in 0..left.len() { + let index = i; + let l = if left.is_null(i) { + None + } else { + Some(left.value(index)) + }; + let r = if right.is_null(i) { + None + } else { + Some(right.value(index)) + }; + b.push(op(l, r)).unwrap(); + } + Ok(b.finish()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::array::{Float64Array, Int32Array}; + + #[test] + fn test_primitive_array_add() { + let a = Int32Array::from(vec![5, 6, 7, 8, 9]); + let b = Int32Array::from(vec![6, 7, 8, 9, 8]); + let c = add(&a, &b).unwrap(); + assert_eq!(11, c.value(0)); + assert_eq!(13, c.value(1)); + assert_eq!(15, c.value(2)); + assert_eq!(17, c.value(3)); + assert_eq!(17, c.value(4)); + } + + #[test] + fn test_primitive_array_add_mismatched_length() { + let a = Int32Array::from(vec![5, 6, 7, 8, 9]); + let b = Int32Array::from(vec![6, 7, 8]); + let e = add(&a, &b) + .err() + .expect("should have failed due to different lengths"); + assert_eq!( + "ComputeError(\"Cannot perform math operation on two batches of different length\")", + format!("{:?}", e) + ); + } + + #[test] + fn test_primitive_array_subtract() { + let a = Int32Array::from(vec![1, 2, 3, 4, 5]); + let b = Int32Array::from(vec![5, 4, 3, 2, 1]); + let c = subtract(&a, &b).unwrap(); + assert_eq!(-4, c.value(0)); + assert_eq!(-2, c.value(1)); + assert_eq!(0, c.value(2)); + assert_eq!(2, c.value(3)); + assert_eq!(4, c.value(4)); + } + + #[test] + fn test_primitive_array_multiply() { + let a = 
Int32Array::from(vec![5, 6, 7, 8, 9]); + let b = Int32Array::from(vec![6, 7, 8, 9, 8]); + let c = multiply(&a, &b).unwrap(); + assert_eq!(30, c.value(0)); + assert_eq!(42, c.value(1)); + assert_eq!(56, c.value(2)); + assert_eq!(72, c.value(3)); + assert_eq!(72, c.value(4)); + } + + #[test] + fn test_primitive_array_divide() { + let a = Int32Array::from(vec![15, 15, 8, 1, 9]); + let b = Int32Array::from(vec![5, 6, 8, 9, 1]); + let c = divide(&a, &b).unwrap(); + assert_eq!(3, c.value(0)); + assert_eq!(2, c.value(1)); + assert_eq!(1, c.value(2)); + assert_eq!(0, c.value(3)); + assert_eq!(9, c.value(4)); + } + + #[test] + fn test_primitive_array_divide_by_zero() { + let a = Int32Array::from(vec![15]); + let b = Int32Array::from(vec![0]); + assert_eq!( + ArrowError::DivideByZero, + divide(&a, &b).err().expect("divide by zero should fail") + ); + } + + #[test] + fn test_primitive_array_divide_f64() { + let a = Float64Array::from(vec![15.0, 15.0, 8.0]); + let b = Float64Array::from(vec![5.0, 6.0, 8.0]); + let c = divide(&a, &b).unwrap(); + assert_eq!(3.0, c.value(0)); + assert_eq!(2.5, c.value(1)); + assert_eq!(1.0, c.value(2)); + } + + #[test] + fn test_primitive_array_add_with_nulls() { + let a = Int32Array::from(vec![Some(5), None, Some(7), None]); + let b = Int32Array::from(vec![None, None, Some(6), Some(7)]); + let c = add(&a, &b).unwrap(); + assert_eq!(true, c.is_null(0)); + assert_eq!(true, c.is_null(1)); + assert_eq!(false, c.is_null(2)); + assert_eq!(true, c.is_null(3)); + assert_eq!(13, c.value(2)); + } + + #[test] + fn test_primitive_array_eq() { + let a = Int32Array::from(vec![8, 8, 8, 8, 8]); + let b = Int32Array::from(vec![6, 7, 8, 9, 10]); + let c = eq(&a, &b).unwrap(); + assert_eq!(false, c.value(0)); + assert_eq!(false, c.value(1)); + assert_eq!(true, c.value(2)); + assert_eq!(false, c.value(3)); + assert_eq!(false, c.value(4)); + } + + #[test] + fn test_primitive_array_neq() { + let a = Int32Array::from(vec![8, 8, 8, 8, 8]); + let b = Int32Array::from(vec![6, 7, 8, 9, 10]); + let c = neq(&a, &b).unwrap(); + assert_eq!(true, c.value(0)); + assert_eq!(true, c.value(1)); + assert_eq!(false, c.value(2)); + assert_eq!(true, c.value(3)); + assert_eq!(true, c.value(4)); + } + + #[test] + fn test_primitive_array_lt() { + let a = Int32Array::from(vec![8, 8, 8, 8, 8]); + let b = Int32Array::from(vec![6, 7, 8, 9, 10]); + let c = lt(&a, &b).unwrap(); + assert_eq!(false, c.value(0)); + assert_eq!(false, c.value(1)); + assert_eq!(false, c.value(2)); + assert_eq!(true, c.value(3)); + assert_eq!(true, c.value(4)); + } + + #[test] + fn test_primitive_array_lt_eq() { + let a = Int32Array::from(vec![8, 8, 8, 8, 8]); + let b = Int32Array::from(vec![6, 7, 8, 9, 10]); + let c = lt_eq(&a, &b).unwrap(); + assert_eq!(false, c.value(0)); + assert_eq!(false, c.value(1)); + assert_eq!(true, c.value(2)); + assert_eq!(true, c.value(3)); + assert_eq!(true, c.value(4)); + } + + #[test] + fn test_primitive_array_gt() { + let a = Int32Array::from(vec![8, 8, 8, 8, 8]); + let b = Int32Array::from(vec![6, 7, 8, 9, 10]); + let c = gt(&a, &b).unwrap(); + assert_eq!(true, c.value(0)); + assert_eq!(true, c.value(1)); + assert_eq!(false, c.value(2)); + assert_eq!(false, c.value(3)); + assert_eq!(false, c.value(4)); + } + + #[test] + fn test_primitive_array_gt_eq() { + let a = Int32Array::from(vec![8, 8, 8, 8, 8]); + let b = Int32Array::from(vec![6, 7, 8, 9, 10]); + let c = gt_eq(&a, &b).unwrap(); + assert_eq!(true, c.value(0)); + assert_eq!(true, c.value(1)); + assert_eq!(true, c.value(2)); + assert_eq!(false, 
c.value(3)); + assert_eq!(false, c.value(4)); + } + + #[test] + fn test_buffer_array_min_max() { + let a = Int32Array::from(vec![5, 6, 7, 8, 9]); + assert_eq!(5, min(&a).unwrap()); + assert_eq!(9, max(&a).unwrap()); + } + + #[test] + fn test_buffer_array_min_max_with_nulls() { + let a = Int32Array::from(vec![Some(5), None, None, Some(8), Some(9)]); + assert_eq!(5, min(&a).unwrap()); + assert_eq!(9, max(&a).unwrap()); + } + +} diff --git a/rust/src/error.rs b/rust/src/error.rs index d82ee1190a68c..559b2d7205994 100644 --- a/rust/src/error.rs +++ b/rust/src/error.rs @@ -19,6 +19,8 @@ pub enum ArrowError { MemoryError(String), ParseError(String), + ComputeError(String), + DivideByZero, } pub type Result = ::std::result::Result; diff --git a/rust/src/lib.rs b/rust/src/lib.rs index e1670ff055971..b661c21279d22 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -27,6 +27,7 @@ extern crate serde_json; pub mod array; pub mod array_data; +pub mod array_ops; pub mod bitmap; pub mod buffer; pub mod builder; From bb3fa4b871b26df786c8f67b23208aae719b56e9 Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Mon, 10 Dec 2018 21:30:34 -0600 Subject: [PATCH 020/328] ARROW-3993: [JS] CI Jobs Failing Use `gulp@4.0.0` rather than `gulp@next` Author: Brian Hulette Closes #3153 from TheNeuralBit/gulp-fix and squashes the following commits: e5d2e74c4 gulp@{next->4.0.0} --- integration/integration_test.py | 2 +- js/package.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/integration/integration_test.py b/integration/integration_test.py index 8021aa643263e..3bd37bdd80677 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -1053,7 +1053,7 @@ class CPPTester(Tester): 'ARROW_CPP_EXE_PATH', os.path.join(ARROW_HOME, 'cpp/build/debug')) - CPP_INTEGRATION_EXE = os.path.join(EXE_PATH, 'json-integration-test') + CPP_INTEGRATION_EXE = os.path.join(EXE_PATH, 'arrow-json-integration-test') STREAM_TO_FILE = os.path.join(EXE_PATH, 'stream-to-file') FILE_TO_STREAM = os.path.join(EXE_PATH, 'file-to-stream') diff --git a/js/package.json b/js/package.json index 9f76819c2e1fd..cf49e41dbe2f4 100644 --- a/js/package.json +++ b/js/package.json @@ -72,7 +72,7 @@ "del": "3.0.0", "glob": "7.1.3", "google-closure-compiler": "20181008.0.0", - "gulp": "next", + "gulp": "4.0.0", "gulp-json-transform": "0.4.5", "gulp-rename": "1.4.0", "gulp-sourcemaps": "2.6.4", From e7341356a365711ef2e19f2a4cb3ee59bc55296d Mon Sep 17 00:00:00 2001 From: "Korn, Uwe" Date: Tue, 11 Dec 2018 15:01:03 +0100 Subject: [PATCH 021/328] ARROW-3995: [CI] Use understandable names on Travis Author: Korn, Uwe Closes #3158 from xhochy/travis-names and squashes the following commits: f268f276 ARROW-3995: Use understandable names on Travis --- .travis.yml | 60 ++++++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7489d72c80502..42b1275d1c4bf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -50,8 +50,8 @@ matrix: - jdk: oraclejdk9 - language: r include: - # Lint C++, Python, R - - os: linux + - name: "Lint C++, Python, R" + os: linux language: python python: "3.6" env: @@ -62,8 +62,8 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh script: - $TRAVIS_BUILD_DIR/ci/travis_lint.sh - # C++ & Python w/ gcc 4.9 - - compiler: gcc + - name: "C++ & Python w/ gcc 4.9" + compiler: gcc language: cpp os: linux jdk: openjdk8 @@ -102,8 +102,8 @@ matrix: - export PLASMA_VALGRIND=1 - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 
3.6 - $TRAVIS_BUILD_DIR/ci/travis_upload_cpp_coverage.sh - # Gandiva C++ w/ gcc 4.9 and Java - - compiler: gcc + - name: "Gandiva C++ w/ gcc 4.9 and Java" + compiler: gcc language: cpp os: linux jdk: openjdk8 @@ -123,8 +123,8 @@ matrix: script: - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_cpp.sh - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh - # [OS X] C++ & Python w/ XCode 6.4 - - compiler: clang + - name: "[OS X] C++ & Python w/ XCode 6.4" + compiler: clang language: cpp osx_image: xcode6.4 os: osx @@ -145,8 +145,8 @@ matrix: - if [ $ARROW_CI_CPP_AFFECTED == "1" ]; then $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh; fi - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 - # [OS X] Gandiva C++ w/ XCode 8.3 & Java - - compiler: clang + - name: "[OS X] Gandiva C++ w/ XCode 8.3 & Java" + compiler: clang language: cpp # xcode 7.3 has a bug in strptime. osx_image: xcode8.3 @@ -164,14 +164,14 @@ matrix: script: - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_cpp.sh - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh - # [manylinux1] Python - - language: cpp + - name: "[manylinux1] Python" + language: cpp before_script: - if [ $ARROW_CI_PYTHON_AFFECTED == "1" ]; then docker pull quay.io/xhochy/arrow_manylinux1_x86_64_base:latest; fi script: - if [ $ARROW_CI_PYTHON_AFFECTED == "1" ]; then $TRAVIS_BUILD_DIR/ci/travis_script_manylinux.sh; fi - # Java w/ OpenJDK 8 - - language: java + - name: "Java w/ OpenJDK 8" + language: java os: linux jdk: openjdk8 before_script: @@ -180,8 +180,8 @@ matrix: script: - $TRAVIS_BUILD_DIR/ci/travis_script_java.sh - $TRAVIS_BUILD_DIR/ci/travis_script_javadoc.sh - # Java w/ Oracle JDK 9 - - language: java + - name: "Java w/ Oracle JDK 9" + language: java os: linux jdk: oraclejdk9 before_script: @@ -192,8 +192,8 @@ matrix: apt: packages: - oracle-java9-installer - # Integration w/ OpenJDK 8 - - language: java + - name: "Integration w/ OpenJDK 8" + language: java os: linux env: ARROW_TEST_GROUP=integration jdk: openjdk8 @@ -212,8 +212,8 @@ matrix: script: - $TRAVIS_BUILD_DIR/ci/travis_script_integration.sh - $TRAVIS_BUILD_DIR/ci/travis_script_plasma_java_client.sh - # NodeJS - - language: node_js + - name: "NodeJS" + language: node_js os: linux node_js: - '10.1' @@ -223,8 +223,8 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_before_script_js.sh script: - $TRAVIS_BUILD_DIR/ci/travis_script_js.sh - # C++ & GLib & Ruby w/ gcc 4.9 - - compiler: gcc + - name: "C++ & GLib & Ruby w/ gcc 4.9" + compiler: gcc language: cpp os: linux env: @@ -245,8 +245,8 @@ matrix: script: - if [ $ARROW_CI_C_GLIB_AFFECTED = "1" ]; then $TRAVIS_BUILD_DIR/ci/travis_script_c_glib.sh; fi - $TRAVIS_BUILD_DIR/ci/travis_script_ruby.sh - # [OS X] C++ & GLib & Ruby w/ XCode 8.3 & homebrew - - compiler: clang + - name: "[OS X] C++ & GLib & Ruby w/ XCode 8.3 & homebrew" + compiler: clang osx_image: xcode8.3 os: osx env: @@ -266,8 +266,8 @@ matrix: script: - if [ $ARROW_CI_C_GLIB_AFFECTED = "1" ]; then $TRAVIS_BUILD_DIR/ci/travis_script_c_glib.sh; fi - $TRAVIS_BUILD_DIR/ci/travis_script_ruby.sh - # Rust - - language: rust + - name: Rust + language: rust cache: cargo addons: apt: @@ -289,8 +289,8 @@ matrix: - mkdir -p target/kcov - RUST_BACKTRACE=1 RUSTUP_TOOLCHAIN=stable cargo coverage --verbose - bash <(curl -s https://codecov.io/bash) || echo "Codecov did not collect coverage reports" - # Go - - language: go + - name: Go + language: go go_import_path: github.com/apache/arrow os: linux go: @@ -302,8 +302,8 @@ matrix: after_success: - pushd 
${TRAVIS_BUILD_DIR}/go/arrow - bash <(curl -s https://codecov.io/bash) || echo "Codecov did not collect coverage reports" - # R - - language: r + - name: R + language: r cache: packages latex: false before_install: From a1eff5f3eee7609ce2a67b051d26aca810961f43 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 11 Dec 2018 15:16:06 +0100 Subject: [PATCH 022/328] ARROW-3986: [C++] Write prose documentation Author: Antoine Pitrou Closes #3149 from pitrou/ARROW-3986-cpp-prose-doc and squashes the following commits: 77e37940 ARROW-3986: Write prose documentation --- .gitignore | 9 ++ cpp/src/arrow/builder.h | 15 +- cpp/src/arrow/status.h | 61 +++++--- cpp/src/arrow/table.h | 7 + cpp/src/arrow/type.h | 16 ++- cpp/src/arrow/type_fwd.h | 7 + docs/source/cpp/api.rst | 8 +- docs/source/cpp/api/array.rst | 23 ++- docs/source/cpp/api/builder.rst | 59 ++++++++ docs/source/cpp/api/datatype.rst | 135 ++++++++++++++++++ docs/source/cpp/api/memory.rst | 4 +- docs/source/cpp/api/support.rst | 29 ++++ docs/source/cpp/arrays.rst | 211 ++++++++++++++++++++++++++++ docs/source/cpp/conventions.rst | 91 ++++++++++++ docs/source/cpp/datatypes.rst | 65 +++++++++ docs/source/cpp/getting_started.rst | 30 ++++ docs/source/cpp/index.rst | 65 +-------- docs/source/cpp/overview.rst | 93 ++++++++++++ docs/source/format/Metadata.rst | 2 + 19 files changed, 822 insertions(+), 108 deletions(-) create mode 100644 docs/source/cpp/api/builder.rst create mode 100644 docs/source/cpp/api/datatype.rst create mode 100644 docs/source/cpp/api/support.rst create mode 100644 docs/source/cpp/arrays.rst create mode 100644 docs/source/cpp/conventions.rst create mode 100644 docs/source/cpp/datatypes.rst create mode 100644 docs/source/cpp/getting_started.rst create mode 100644 docs/source/cpp/overview.rst diff --git a/.gitignore b/.gitignore index 5817efdcac091..61440bb504664 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ apache-rat-*.jar arrow-src.tar +arrow-src.tar.gz # Compiled source *.a @@ -36,10 +37,18 @@ MANIFEST *.sln *.iml +# Linux perf sample data +perf.data +perf.data.old + cpp/.idea/ cpp/apidoc/xml/ +docs/example.gz +docs/example1.dat +docs/example3.dat python/.eggs/ python/doc/ + .vscode .idea/ .pytest_cache/ diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 607fa1745a5a0..180b43a220f30 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -262,6 +262,7 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { return Status::OK(); } + /// \brief Append a single null element Status AppendNull() { ARROW_RETURN_NOT_OK(Reserve(1)); memset(raw_data_ + length_, 0, sizeof(value_type)); @@ -343,12 +344,7 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { return Status::OK(); } - /// \brief Append a sequence of elements in one shot, with a specified nullmap - /// \param[in] values_begin InputIterator to the beginning of the values - /// \param[in] values_end InputIterator pointing to the end of the values - /// \param[in] valid_begin uint8_t* indication valid(1) or null(0) values. - /// nullptr indicates all values are valid. 
- /// \return Status + // Same as above, with a pointer type ValidIter template typename std::enable_if::value, Status>::type AppendValues( ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { @@ -719,12 +715,7 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { return Status::OK(); } - /// \brief Append a sequence of elements in one shot, with a specified nullmap - /// \param[in] values_begin InputIterator to the beginning of the values - /// \param[in] values_end InputIterator pointing to the end of the values - /// \param[in] valid_begin uint8_t* indication valid(1) or null(0) values. - /// nullptr indicates all values are valid. - /// \return Status + // Same as above, for a pointer type ValidIter template typename std::enable_if::value, Status>::type AppendValues( ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h index 7280133a65fb9..ddf3d7ee0e644 100644 --- a/cpp/src/arrow/status.h +++ b/cpp/src/arrow/status.h @@ -29,6 +29,7 @@ #ifdef ARROW_EXTRA_ERROR_CONTEXT +/// \brief Propagate any non-successful Status to the caller #define ARROW_RETURN_NOT_OK(s) \ do { \ ::arrow::Status _s = (s); \ @@ -41,6 +42,7 @@ #else +/// \brief Propagate any non-successful Status to the caller #define ARROW_RETURN_NOT_OK(s) \ do { \ ::arrow::Status _s = (s); \ @@ -107,6 +109,14 @@ enum class StatusCode : char { class ARROW_MUST_USE_RESULT ARROW_EXPORT Status; #endif +/// \brief Status outcome object (success or error) +/// +/// The Status object is an object holding the outcome of an operation. +/// The outcome is represented as a StatusCode, either success +/// (StatusCode::OK) or an error (any other of the StatusCode enumeration values). +/// +/// Additionally, if an error occurred, a specific error message is generally +/// attached. class ARROW_EXPORT Status { public: // Create a success status. @@ -135,45 +145,54 @@ class ARROW_EXPORT Status { Status& operator&=(const Status& s) noexcept; Status& operator&=(Status&& s) noexcept; - // Return a success status. + /// Return a success status static Status OK() { return Status(); } - // Return a success status with extra info + /// Return a success status with a specific message static Status OK(const std::string& msg) { return Status(StatusCode::OK, msg); } - // Return error status of an appropriate type. + /// Return an error status for out-of-memory conditions static Status OutOfMemory(const std::string& msg) { return Status(StatusCode::OutOfMemory, msg); } + /// Return an error status for failed key lookups (e.g. 
column name in a table) static Status KeyError(const std::string& msg) { return Status(StatusCode::KeyError, msg); } + /// Return an error status for type errors (such as mismatching data types) static Status TypeError(const std::string& msg) { return Status(StatusCode::TypeError, msg); } + /// Return an error status for unknown errors static Status UnknownError(const std::string& msg) { return Status(StatusCode::UnknownError, msg); } + /// Return an error status when an operation or a combination of operation and + /// data types is unimplemented static Status NotImplemented(const std::string& msg) { return Status(StatusCode::NotImplemented, msg); } + /// Return an error status for invalid data (for example a string that fails parsing) static Status Invalid(const std::string& msg) { return Status(StatusCode::Invalid, msg); } + /// Return an error status when a container's capacity would exceed its limits static Status CapacityError(const std::string& msg) { return Status(StatusCode::CapacityError, msg); } + /// Return an error status when some IO-related operation failed static Status IOError(const std::string& msg) { return Status(StatusCode::IOError, msg); } + /// Return an error status when some (de)serialization operation failed static Status SerializationError(const std::string& msg) { return Status(StatusCode::SerializationError, msg); } @@ -198,7 +217,6 @@ class ARROW_EXPORT Status { static Status StillExecuting() { return Status(StatusCode::StillExecuting, ""); } - // Return error status of an appropriate type. static Status CodeGenError(const std::string& msg) { return Status(StatusCode::CodeGenError, msg); } @@ -211,34 +229,42 @@ class ARROW_EXPORT Status { return Status(StatusCode::ExecutionError, msg); } - // Returns true iff the status indicates success. + /// Return true iff the status indicates success. bool ok() const { return (state_ == NULL); } + /// Return true iff the status indicates an out-of-memory error. bool IsOutOfMemory() const { return code() == StatusCode::OutOfMemory; } + /// Return true iff the status indicates a key lookup error. bool IsKeyError() const { return code() == StatusCode::KeyError; } + /// Return true iff the status indicates invalid data. bool IsInvalid() const { return code() == StatusCode::Invalid; } + /// Return true iff the status indicates an IO-related failure. bool IsIOError() const { return code() == StatusCode::IOError; } + /// Return true iff the status indicates a container reaching capacity limits. bool IsCapacityError() const { return code() == StatusCode::CapacityError; } + /// Return true iff the status indicates a type error. bool IsTypeError() const { return code() == StatusCode::TypeError; } + /// Return true iff the status indicates an unknown error. bool IsUnknownError() const { return code() == StatusCode::UnknownError; } + /// Return true iff the status indicates an unimplemented operation. bool IsNotImplemented() const { return code() == StatusCode::NotImplemented; } - // An object could not be serialized or deserialized. + /// Return true iff the status indicates a (de)serialization failure bool IsSerializationError() const { return code() == StatusCode::SerializationError; } - // An error from R + /// Return true iff the status indicates a R-originated error. bool IsRError() const { return code() == StatusCode::RError; } - // An error is propagated from a nested Python function. + /// Return true iff the status indicates a Python-originated error. 
bool IsPythonError() const { return code() == StatusCode::PythonError; } - // An object with this object ID already exists in the plasma store. + /// Return true iff the status indicates an already existing Plasma object. bool IsPlasmaObjectExists() const { return code() == StatusCode::PlasmaObjectExists; } - // An object was requested that doesn't exist in the plasma store. + /// Return true iff the status indicates a non-existent Plasma object. bool IsPlasmaObjectNonexistent() const { return code() == StatusCode::PlasmaObjectNonexistent; } - // An already sealed object is tried to be sealed again. + /// Return true iff the status indicates an already sealed Plasma object. bool IsPlasmaObjectAlreadySealed() const { return code() == StatusCode::PlasmaObjectAlreadySealed; } - // An object is too large to fit into the plasma store. + /// Return true iff the status indicates the Plasma store reached its capacity limit. bool IsPlasmaStoreFull() const { return code() == StatusCode::PlasmaStoreFull; } bool IsStillExecuting() const { return code() == StatusCode::StillExecuting; } @@ -251,16 +277,19 @@ class ARROW_EXPORT Status { bool IsExecutionError() const { return code() == StatusCode::ExecutionError; } - // Return a string representation of this status suitable for printing. - // Returns the string "OK" for success. + /// \brief Return a string representation of this status suitable for printing. + /// + /// The string "OK" is returned for success. std::string ToString() const; - // Return a string representation of the status code, without the message - // text or posix code information. + /// \brief Return a string representation of the status code, without the message + /// text or POSIX code information. std::string CodeAsString() const; + /// \brief Return the StatusCode value attached to this status. StatusCode code() const { return ok() ? StatusCode::OK : state_->code; } + /// \brief Return the specific error message attached to this status. std::string message() const { return ok() ? "" : state_->msg; } private: diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index 119e4e4491225..9c478485b243c 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -39,7 +39,14 @@ class Status; /// as one large array class ARROW_EXPORT ChunkedArray { public: + /// \brief Construct a chunked array from a vector of arrays + /// + /// The vector should be non-empty and all its elements should have the same + /// data type. explicit ChunkedArray(const ArrayVector& chunks); + /// \brief Construct a chunked array from a vector of arrays and a data type + /// + /// As the data type is passed explicitly, the vector may be empty. ChunkedArray(const ArrayVector& chunks, const std::shared_ptr& type); /// \return the total length of the chunked array; computed on construction diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 63f0e2d237242..f187817b53f28 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -40,11 +40,11 @@ namespace arrow { class Array; class Field; -/// \brief Main data type enumeration -/// -/// This enumeration provides a quick way to interrogate the category -/// of a DataType instance. struct Type { + /// \brief Main data type enumeration + /// + /// This enumeration provides a quick way to interrogate the category + /// of a DataType instance. enum type { /// A NULL type having no physical storage NA, @@ -143,7 +143,7 @@ struct Type { /// nested type consisting of other data types, or another data type (e.g. a /// timestamp encoded as an int64). 
/// -/// Simple datatypes may be entirely described by their Type id, but +/// Simple datatypes may be entirely described by their Type::type id, but /// complex datatypes are usually parametric. class ARROW_EXPORT DataType { public: @@ -624,6 +624,7 @@ class ARROW_EXPORT Date64Type : public DateType { }; struct TimeUnit { + /// The unit for a time or timestamp DataType enum type { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 }; }; @@ -837,6 +838,9 @@ class ARROW_EXPORT Schema { // Parametric factory functions // Other factory functions are in type_fwd.h +/// \addtogroup type-factories +/// @{ + /// \brief Create a FixedSizeBinaryType instance ARROW_EXPORT std::shared_ptr fixed_size_binary(int32_t byte_width); @@ -890,6 +894,8 @@ std::shared_ptr ARROW_EXPORT dictionary(const std::shared_ptr& index_type, const std::shared_ptr& values, bool ordered = false); +/// @} + /// \brief Create a Field instance /// /// \param name the field name diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index dbbe7092b4f12..2a83d8a664d80 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -137,6 +137,11 @@ using IntervalArray = NumericArray; // (parameter-free) Factory functions // Other factory functions are in type.h +/// \defgroup type-factories Factory functions for creating data types +/// +/// Factory functions for creating data types +/// @{ + /// \brief Return a NullType instance std::shared_ptr ARROW_EXPORT null(); /// \brief Return a BooleanType instance @@ -172,6 +177,8 @@ std::shared_ptr ARROW_EXPORT date32(); /// \brief Return a Date64Type instance std::shared_ptr ARROW_EXPORT date64(); +/// @} + } // namespace arrow #endif // ARROW_TYPE_FWD_H diff --git a/docs/source/cpp/api.rst b/docs/source/cpp/api.rst index 894ed1f907f6d..02aa4d62e3b31 100644 --- a/docs/source/cpp/api.rst +++ b/docs/source/cpp/api.rst @@ -20,8 +20,10 @@ API Reference ************* .. toctree:: - :maxdepth: 2 - :caption: Getting Started + :maxdepth: 3 - api/array + api/support api/memory + api/datatype + api/array + api/builder diff --git a/docs/source/cpp/api/array.rst b/docs/source/cpp/api/array.rst index aed18763b6ce7..bb981d1a0477d 100644 --- a/docs/source/cpp/api/array.rst +++ b/docs/source/cpp/api/array.rst @@ -15,19 +15,23 @@ .. specific language governing permissions and limitations .. under the License. -Array types -============= +====== +Arrays +====== .. doxygenclass:: arrow::Array :project: arrow_cpp :members: +Concrete array subclasses +========================= + .. doxygenclass:: arrow::DictionaryArray :project: arrow_cpp :members: -non-nested array types ----------------------- +Non-nested +---------- .. doxygenclass:: arrow::FlatArray :project: arrow_cpp @@ -65,8 +69,8 @@ non-nested array types :project: arrow_cpp :members: -nested array types ------------------- +Nested +------ .. doxygenclass:: arrow::UnionArray :project: arrow_cpp @@ -79,3 +83,10 @@ nested array types .. doxygenclass:: arrow::StructArray :project: arrow_cpp :members: + +Chunked Arrays +============== + +.. doxygenclass:: arrow::ChunkedArray + :project: arrow_cpp + :members: diff --git a/docs/source/cpp/api/builder.rst b/docs/source/cpp/api/builder.rst new file mode 100644 index 0000000000000..0912706ac081c --- /dev/null +++ b/docs/source/cpp/api/builder.rst @@ -0,0 +1,59 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. 
The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +============== +Array Builders +============== + +.. doxygenclass:: arrow::ArrayBuilder + :members: + +Concrete builder subclasses +=========================== + +.. doxygenclass:: arrow::NullBuilder + :members: + +.. doxygenclass:: arrow::BooleanBuilder + :members: + +.. doxygenclass:: arrow::PrimitiveBuilder + :members: + +.. doxygenclass:: arrow::NumericBuilder + :members: + +.. doxygenclass:: arrow::BinaryBuilder + :members: + +.. doxygenclass:: arrow::StringBuilder + :members: + +.. doxygenclass:: arrow::FixedSizeBinaryBuilder + :members: + +.. doxygenclass:: arrow::Decimal128Builder + :members: + +.. doxygenclass:: arrow::ListBuilder + :members: + +.. doxygenclass:: arrow::StructBuilder + :members: + +.. doxygenclass:: arrow::DictionaryBuilder + :members: diff --git a/docs/source/cpp/api/datatype.rst b/docs/source/cpp/api/datatype.rst new file mode 100644 index 0000000000000..ee7844277df27 --- /dev/null +++ b/docs/source/cpp/api/datatype.rst @@ -0,0 +1,135 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +========== +Data Types +========== + +.. doxygenenum:: arrow::Type::type + +.. doxygenclass:: arrow::DataType + :members: + +.. _api-type-factories: + +Factory functions +================= + +These functions are recommended for creating data types. They may return +new objects or existing singletons, depending on the type requested. + +.. doxygengroup:: type-factories + :project: arrow_cpp + :content-only: + +Concrete type subclasses +======================== + +Primitive +--------- + +.. doxygenclass:: arrow::NullType + :members: + +.. doxygenclass:: arrow::BooleanType + :members: + +.. doxygenclass:: arrow::Int8Type + :members: + +.. doxygenclass:: arrow::Int16Type + :members: + +.. doxygenclass:: arrow::Int32Type + :members: + +.. doxygenclass:: arrow::Int64Type + :members: + +.. doxygenclass:: arrow::UInt8Type + :members: + +.. doxygenclass:: arrow::UInt16Type + :members: + +.. doxygenclass:: arrow::UInt32Type + :members: + +.. doxygenclass:: arrow::UInt64Type + :members: + +.. doxygenclass:: arrow::HalfFloatType + :members: + +.. doxygenclass:: arrow::FloatType + :members: + +.. 
doxygenclass:: arrow::DoubleType + :members: + +Time-related +------------ + +.. doxygenenum:: arrow::TimeUnit::type + +.. doxygenclass:: arrow::Date32Type + :members: + +.. doxygenclass:: arrow::Date64Type + :members: + +.. doxygenclass:: arrow::Time32Type + :members: + +.. doxygenclass:: arrow::Time64Type + :members: + +.. doxygenclass:: arrow::TimestampType + :members: + +Binary-like +----------- + +.. doxygenclass:: arrow::BinaryType + :members: + +.. doxygenclass:: arrow::StringType + :members: + +.. doxygenclass:: arrow::FixedSizeBinaryType + :members: + +.. doxygenclass:: arrow::Decimal128Type + :members: + +Nested +------ + +.. doxygenclass:: arrow::ListType + :members: + +.. doxygenclass:: arrow::StructType + :members: + +.. doxygenclass:: arrow::UnionType + :members: + +Dictionary-encoded +------------------ + +.. doxygenclass:: arrow::DictionaryType + :members: diff --git a/docs/source/cpp/api/memory.rst b/docs/source/cpp/api/memory.rst index fbb5dc818628c..1dc8e706d3e8d 100644 --- a/docs/source/cpp/api/memory.rst +++ b/docs/source/cpp/api/memory.rst @@ -18,8 +18,8 @@ Memory (management) =================== -Basic containers ----------------- +Buffers +------- .. doxygenclass:: arrow::Buffer :project: arrow_cpp diff --git a/docs/source/cpp/api/support.rst b/docs/source/cpp/api/support.rst new file mode 100644 index 0000000000000..b165a9973b4c1 --- /dev/null +++ b/docs/source/cpp/api/support.rst @@ -0,0 +1,29 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Programming Support +=================== + +Error return and reporting +-------------------------- + +.. doxygenclass:: arrow::Status + :project: arrow_cpp + :members: + +.. doxygendefine:: ARROW_RETURN_NOT_OK + diff --git a/docs/source/cpp/arrays.rst b/docs/source/cpp/arrays.rst new file mode 100644 index 0000000000000..0c5272d2aed5e --- /dev/null +++ b/docs/source/cpp/arrays.rst @@ -0,0 +1,211 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. 
highlight:: cpp
+
+======
+Arrays
+======
+
+The central type in Arrow is the class :class:`arrow::Array`. An array
+represents a known-length sequence of values all having the same type.
+Internally, those values are represented by one or several buffers, the
+number and meaning of which depend on the array's data type, as documented
+in :doc:`the Arrow data layout specification <../format/Layout>`.
+
+Those buffers consist of the value data itself and an optional bitmap buffer
+that indicates which array entries are null values. The bitmap buffer
+can be entirely omitted if the array is known to have zero null values.
+
+There are concrete subclasses of :class:`arrow::Array` for each data type
+that help you access individual values of the array.
+
+Building an array
+=================
+
+As Arrow objects are immutable, there are classes provided that help you
+build these objects incrementally from third-party data. These classes
+are organized in a hierarchy around the :class:`arrow::ArrayBuilder` base class,
+with concrete subclasses tailored for each particular data type.
+
+For example, to build an array of ``int64_t`` elements, we can use the
+:class:`arrow::Int64Builder` class. In the following example, we build an array
+of the range 1 to 8 where the element that should hold the value 4 is nulled::
+
+    arrow::Int64Builder builder;
+    builder.Append(1);
+    builder.Append(2);
+    builder.Append(3);
+    builder.AppendNull();
+    builder.Append(5);
+    builder.Append(6);
+    builder.Append(7);
+    builder.Append(8);
+
+    std::shared_ptr<arrow::Array> array;
+    arrow::Status st = builder.Finish(&array);
+    if (!st.ok()) {
+      // ... do something on array building failure
+    }
+
+The resulting Array (which can be cast to the concrete :class:`arrow::Int64Array`
+subclass if you want to access its values) then consists of two
+:class:`arrow::Buffer`\s.
+The first buffer holds the null bitmap, which consists here of a single byte with
+the bits ``0|0|0|0|1|0|0|0``. As we use `least-significant bit (LSB) numbering`_,
+this indicates that the fourth entry in the array is null. The second
+buffer is simply an ``int64_t`` array containing all the above values.
+As the fourth entry is null, the value at that position in the buffer is
+undefined.
+
+Here is how you could access the concrete array's contents::
+
+    // Cast the Array to its actual type to access its data
+    auto int64_array = std::static_pointer_cast<arrow::Int64Array>(array);
+
+    // Get the pointer to the null bitmap.
+    const uint8_t* null_bitmap = int64_array->null_bitmap_data();
+
+    // Get the pointer to the actual data
+    const int64_t* data = int64_array->raw_values();
+
+    // Alternatively, given an array index, query its null bit and value directly
+    int64_t index = 2;
+    if (!int64_array->IsNull(index)) {
+      int64_t value = int64_array->Value(index);
+    }
+
+.. note::
+   :class:`arrow::Int64Array` (respectively :class:`arrow::Int64Builder`) is
+   just a ``typedef``, provided for convenience, of ``arrow::NumericArray<Int64Type>``
+   (respectively ``arrow::NumericBuilder<Int64Type>``).
+
+.. _least-significant bit (LSB) numbering: https://en.wikipedia.org/wiki/Bit_numbering
+
+Performance
+-----------
+
+While it is possible to build an array value-by-value as in the example above,
+to attain highest performance it is recommended to use the bulk appending
+methods (usually named ``AppendValues``) in the concrete :class:`arrow::ArrayBuilder`
+subclasses.
+
+If you know the number of elements in advance, it is also recommended to
+presize the working area by calling the :func:`~arrow::ArrayBuilder::Resize`
+or :func:`~arrow::ArrayBuilder::Reserve` methods.
+
+Here is how one could rewrite the above example to take advantage of those
+APIs::
+
+    arrow::Int64Builder builder;
+    // Make place for 8 values in total
+    builder.Resize(8);
+    // Bulk append the given values (with a null in 4th place as indicated by the
+    // validity vector)
+    std::vector<bool> validity = {true, true, true, false, true, true, true, true};
+    std::vector<int64_t> values = {1, 2, 3, 0, 5, 6, 7, 8};
+    builder.AppendValues(values, validity);
+
+    std::shared_ptr<arrow::Array> array;
+    arrow::Status st = builder.Finish(&array);
+
+If you still must append values one by one, some concrete builder subclasses
+have methods marked "Unsafe" that assume the working area has been correctly
+presized, and offer higher performance in exchange::
+
+    arrow::Int64Builder builder;
+    // Make place for 8 values in total
+    builder.Resize(8);
+    builder.UnsafeAppend(1);
+    builder.UnsafeAppend(2);
+    builder.UnsafeAppend(3);
+    builder.UnsafeAppendNull();
+    builder.UnsafeAppend(5);
+    builder.UnsafeAppend(6);
+    builder.UnsafeAppend(7);
+    builder.UnsafeAppend(8);
+
+    std::shared_ptr<arrow::Array> array;
+    arrow::Status st = builder.Finish(&array);
+
+
+Size Limitations and Recommendations
+====================================
+
+Some array types are structurally limited to 32-bit sizes. This is the case
+for list arrays (which can hold up to 2^31 elements), string arrays and binary
+arrays (which can hold up to 2GB of binary data), at least. Some other array
+types can hold up to 2^63 elements in the C++ implementation, but other Arrow
+implementations can have a 32-bit size limitation for those array types as well.
+
+For these reasons, it is recommended that huge data be chunked in subsets of
+more reasonable size.
+
+Chunked Arrays
+==============
+
+A :class:`arrow::ChunkedArray` is, like an array, a logical sequence of values;
+but unlike a simple array, a chunked array does not require the entire sequence
+to be physically contiguous in memory. Also, the constituents of a chunked array
+need not have the same size, but they must all have the same data type.
+
+A chunked array is constructed by aggregating any number of arrays. Here we'll
+build a chunked array with the same logical values as in the example above,
+but in two separate chunks::
+
+    std::vector<std::shared_ptr<arrow::Array>> chunks;
+    std::shared_ptr<arrow::Array> array;
+
+    // Build first chunk
+    arrow::Int64Builder builder;
+    builder.Append(1);
+    builder.Append(2);
+    builder.Append(3);
+    if (!builder.Finish(&array).ok()) {
+      // ... do something on array building failure
+    }
+    chunks.push_back(std::move(array));
+
+    // Build second chunk
+    builder.Reset();
+    builder.AppendNull();
+    builder.Append(5);
+    builder.Append(6);
+    builder.Append(7);
+    builder.Append(8);
+    if (!builder.Finish(&array).ok()) {
+      // ... do something on array building failure
+    }
+    chunks.push_back(std::move(array));
+
+    auto chunked_array = std::make_shared<arrow::ChunkedArray>(std::move(chunks));
+
+    assert(chunked_array->num_chunks() == 2);
+    // Logical length in number of values
+    assert(chunked_array->length() == 8);
+    assert(chunked_array->null_count() == 1);
+
+Slicing
+=======
+
+Like for physical memory buffers, it is possible to make zero-copy slices
+of arrays and chunked arrays, to obtain an array or chunked array referring
+to some logical subsequence of the data.
This is done by calling the
+:func:`arrow::Array::Slice` and :func:`arrow::ChunkedArray::Slice` methods,
+respectively.
+
diff --git a/docs/source/cpp/conventions.rst b/docs/source/cpp/conventions.rst
new file mode 100644
index 0000000000000..b0424358901b4
--- /dev/null
+++ b/docs/source/cpp/conventions.rst
@@ -0,0 +1,91 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+Conventions
+===========
+
+The Arrow C++ API follows a few simple guidelines. As with many rules,
+there may be exceptions.
+
+Language version
+----------------
+
+Arrow is C++11-compatible. A few backports are used for newer functionality,
+for example the :class:`std::string_view` class.
+
+Namespacing
+-----------
+
+All the Arrow API (except macros) is namespaced inside an ``arrow`` namespace,
+and nested namespaces thereof.
+
+Safe pointers
+-------------
+
+Arrow objects are usually passed and stored using safe pointers -- most of
+the time :class:`std::shared_ptr` but sometimes also :class:`std::unique_ptr`.
+
+Immutability
+------------
+
+Many Arrow objects are immutable: once constructed, their logical properties
+cannot change anymore. This makes it possible to use them in multi-threaded
+scenarios without requiring tedious and error-prone synchronization.
+
+There are obvious exceptions to this, such as IO objects or mutable data buffers.
+
+Error reporting
+---------------
+
+Most APIs indicate a successful or erroneous outcome by returning a
+:class:`arrow::Status` instance. Arrow doesn't throw exceptions of its
+own, but third-party exceptions might propagate through, especially
+:class:`std::bad_alloc` (but Arrow doesn't use the standard allocators for
+large data).
+
+As a consequence, the result value of a function is generally passed as an
+out-pointer parameter, rather than as a function return value.
+
+(However, functions which always deterministically succeed may eschew this
+convention and return their result directly.)
+
+Here is an example of checking the outcome of an operation::
+
+    const int64_t buffer_size = 4096;
+    std::shared_ptr<arrow::Buffer> buffer;
+
+    auto status = arrow::AllocateBuffer(buffer_size, &buffer);
+    if (!status.ok()) {
+      // ... handle error
+    }
+
+If the caller function itself returns a :class:`arrow::Status` and wants
+to propagate any non-successful outcomes, a convenience macro
+:cpp:func:`ARROW_RETURN_NOT_OK` is available::
+
+    arrow::Status DoSomething() {
+      const int64_t buffer_size = 4096;
+      std::shared_ptr<arrow::Buffer> buffer;
+      ARROW_RETURN_NOT_OK(arrow::AllocateBuffer(buffer_size, &buffer));
+      // ...
allocation successful, do something with buffer below
+
+      // return success at the end
+      return Status::OK();
+    }
diff --git a/docs/source/cpp/datatypes.rst b/docs/source/cpp/datatypes.rst
new file mode 100644
index 0000000000000..117c05b8755e7
--- /dev/null
+++ b/docs/source/cpp/datatypes.rst
@@ -0,0 +1,65 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+Data Types
+==========
+
+Data types govern how physical data is interpreted. Their :ref:`specification
+<spec-logical-types>` allows binary interoperability between different Arrow
+implementations, including from different programming languages and runtimes
+(for example it is possible to access the same data, without copying, from
+both Python and Java using the :py:mod:`pyarrow.jvm` bridge module).
+
+Information about a data type in C++ can be represented in three ways:
+
+1. Using a :class:`arrow::DataType` instance (e.g. as a function argument)
+2. Using a :class:`arrow::DataType` concrete subclass (e.g. as a template
+   parameter)
+3. Using a :type:`arrow::Type::type` enum value (e.g. as the condition of
+   a switch statement)
+
+The first form (using a :class:`arrow::DataType` instance) is the most idiomatic
+and flexible. Runtime-parametric types can only be fully represented with
+a DataType instance. For example, a :class:`arrow::TimestampType` needs to be
+constructed at runtime with a :type:`arrow::TimeUnit::type` parameter; a
+:class:`arrow::Decimal128Type` with *scale* and *precision* parameters;
+a :class:`arrow::ListType` with a full child type (itself a
+:class:`arrow::DataType` instance).
+
+The two other forms can be used where performance is critical, in order to
+avoid paying the price of dynamic typing and polymorphism. However, some
+amount of runtime switching can still be required for parametric types.
+It is not possible to reify all possible types at compile time, since Arrow
+data types allow arbitrary nesting.
+
+Creating data types
+-------------------
+
+To instantiate data types, it is recommended to call the provided
+:ref:`factory functions <api-type-factories>`::
+
+    std::shared_ptr<arrow::DataType> type;
+
+    // A 16-bit integer type
+    type = arrow::int16();
+    // A 64-bit timestamp type (with microsecond granularity)
+    type = arrow::timestamp(arrow::TimeUnit::MICRO);
+    // A list type of single-precision floating-point values
+    type = arrow::list(arrow::float32());
diff --git a/docs/source/cpp/getting_started.rst b/docs/source/cpp/getting_started.rst
new file mode 100644
index 0000000000000..8201c2ded0d92
--- /dev/null
+++ b/docs/source/cpp/getting_started.rst
@@ -0,0 +1,30 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+..
regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +Getting Started +=============== + +.. toctree:: + + overview + conventions + arrays + datatypes + diff --git a/docs/source/cpp/index.rst b/docs/source/cpp/index.rst index 4f874bac4fd1e..8c7ced0c2e7b8 100644 --- a/docs/source/cpp/index.rst +++ b/docs/source/cpp/index.rst @@ -20,69 +20,6 @@ C++ Implementation .. toctree:: :maxdepth: 2 - :caption: Getting Started + getting_started api - -Getting Started ---------------- - -The most basic structure in Arrow is an :cpp:class:`arrow::Array`. It holds a sequence -of values with known length all having the same type. It consists of the data -itself and an additional bitmap that indicates if the corresponding entry of -array is a null-value. Note that for array with zero null entries, we can omit -this bitmap. - -As Arrow objects are immutable, there are classes provided that should help you -build these objects. To build an array of ``int64_t`` elements, we can use the -:cpp:class:`arrow::Int64Builder`. In the following example, we build an array of -the range 1 to 8 where the element that should hold the number 4 is nulled. - -.. code:: - - Int64Builder builder; - builder.Append(1); - builder.Append(2); - builder.Append(3); - builder.AppendNull(); - builder.Append(5); - builder.Append(6); - builder.Append(7); - builder.Append(8); - - std::shared_ptr array; - builder.Finish(&array); - -The resulting Array (which can be casted to :cpp:class:`arrow::Int64Array` if you want -to access its values) then consists of two :cpp:class:`arrow::Buffer`. The first one is -the null bitmap holding a single byte with the bits ``0|0|0|0|1|0|0|0``. -As we use `least-significant bit (LSB) numbering`_. -this indicates that the fourth entry in the array is null. The second -buffer is simply an ``int64_t`` array containing all the above values. -As the fourth entry is null, the value at that position in the buffer is -undefined. - -.. code:: - - // Cast the Array to its actual type to access its data - std::shared_ptr int64_array = std::static_pointer_cast(array); - - // Get the pointer to the null bitmap. - const uint8_t* null_bitmap = int64_array->null_bitmap_data(); - - // Get the pointer to the actual data - const int64_t* data = int64_array->raw_values(); - -In the above example, we have yet skipped explaining two things in the code. -On constructing the builder, we have passed :cpp:func:`arrow::int64()` to it. This is -the type information with which the resulting array will be annotated. In -this simple form, it is solely a :cpp:class:`std::shared_ptr` -instantiation. - -Furthermore, we have passed :cpp:func:`arrow::default_memory_pool()` to the constructor. -This :cpp:class:`arrow::MemoryPool` is used for the allocations of heap memory. 
Besides
-tracking the amount of memory allocated, the allocator also ensures that the
-allocated memory regions are 64-byte aligned (as required by the Arrow
-specification).
-
-.. _least-significant bit (LSB) numbering: https://en.wikipedia.org/wiki/Bit_numbering
diff --git a/docs/source/cpp/overview.rst b/docs/source/cpp/overview.rst
new file mode 100644
index 0000000000000..490efc1b7a2c1
--- /dev/null
+++ b/docs/source/cpp/overview.rst
@@ -0,0 +1,93 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+High-Level Overview
+===================
+
+The Arrow C++ library is composed of different parts, each of which serves
+a specific purpose.
+
+The physical layer
+------------------
+
+**Memory management** abstractions provide a uniform API over memory that
+may be allocated through various means, such as heap allocation, the memory
+mapping of a file or a static memory area. In particular, the **buffer**
+abstraction represents a contiguous area of physical data.
+
+The one-dimensional layer
+-------------------------
+
+**Data types** govern the *logical* interpretation of *physical* data.
+Many operations in Arrow are parameterized, at compile-time or at runtime,
+by a data type.
+
+**Arrays** assemble one or several buffers with a data type, allowing them
+to be viewed as a logical contiguous sequence of values (possibly nested).
+
+**Chunked arrays** are a generalization of arrays, combining several same-type
+arrays into a longer logical sequence of values.
+
+The two-dimensional layer
+-------------------------
+
+**Schemas** describe a logical collection of several pieces of data,
+each with a distinct name and type, and optional metadata.
+
+**Columns** are like chunked arrays, but with optional metadata.
+
+**Tables** are collections of columns in accordance with a schema. They are
+the most capable dataset-providing abstraction in Arrow.
+
+**Record batches** are collections of contiguous arrays, described
+by a schema. They allow incremental construction or serialization of tables.
+
+The compute layer
+-----------------
+
+**Datums** are flexible dataset references, able to hold for example an array or table
+reference.
+
+**Kernels** are specialized computation functions running in a loop over a
+given set of datums representing input and output parameters to the functions.
+
+The IO layer
+------------
+
+**Streams** allow untyped sequential or seekable access over external data
+of various kinds (for example compressed or memory-mapped).
+
+The Inter-Process Communication (IPC) layer
+-------------------------------------------
+
+A **messaging format** allows interchange of Arrow data between processes, using
+as few copies as possible.
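To make the messaging format more concrete, here is a minimal sketch (an editorial illustration, not part of this patch) of round-tripping a single record batch through the IPC stream format using Arrow's in-memory IO classes; the ``RoundTripBatch`` helper name is hypothetical and the input batch is assumed to be already built::

    #include <arrow/api.h>
    #include <arrow/io/memory.h>
    #include <arrow/ipc/reader.h>
    #include <arrow/ipc/writer.h>

    // Sketch only: serialize one record batch to the IPC stream format and read it back.
    arrow::Status RoundTripBatch(const std::shared_ptr<arrow::RecordBatch>& batch) {
      // Write the schema message followed by the record batch message to an in-memory sink
      std::shared_ptr<arrow::io::BufferOutputStream> sink;
      ARROW_RETURN_NOT_OK(arrow::io::BufferOutputStream::Create(
          4096, arrow::default_memory_pool(), &sink));
      std::shared_ptr<arrow::ipc::RecordBatchWriter> writer;
      ARROW_RETURN_NOT_OK(arrow::ipc::RecordBatchStreamWriter::Open(
          sink.get(), batch->schema(), &writer));
      ARROW_RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
      ARROW_RETURN_NOT_OK(writer->Close());

      std::shared_ptr<arrow::Buffer> buffer;
      ARROW_RETURN_NOT_OK(sink->Finish(&buffer));

      // Read the batch back; the reader references the buffer rather than copying it
      arrow::io::BufferReader source(buffer);
      std::shared_ptr<arrow::RecordBatchReader> reader;
      ARROW_RETURN_NOT_OK(arrow::ipc::RecordBatchStreamReader::Open(&source, &reader));
      std::shared_ptr<arrow::RecordBatch> roundtripped;
      ARROW_RETURN_NOT_OK(reader->ReadNext(&roundtripped));
      return arrow::Status::OK();
    }

The same byte stream could just as well be written to a socket or a file, which is what makes this format suitable for inter-process communication.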
+ +The file formats layer +---------------------- + +Reading and writing Arrow data from/to various file formats is possible, for +example **Parquet**, **CSV**, **Orc** or the Arrow-specific **Feather** format. + +The devices layer +----------------- + +Basic **CUDA** integration is provided, allowing to describe Arrow data backed +by GPU-allocated memory. diff --git a/docs/source/format/Metadata.rst b/docs/source/format/Metadata.rst index 4ed82e0078e2c..293d0113875a6 100644 --- a/docs/source/format/Metadata.rst +++ b/docs/source/format/Metadata.rst @@ -266,6 +266,8 @@ detail for each type below): :: buffer 10: field 5 offsets buffer 11: field 5 data +.. _spec-logical-types: + Logical types ------------- From c7e986047a7066a4001227a2901f91bc2f2a17d2 Mon Sep 17 00:00:00 2001 From: Chao Sun Date: Tue, 11 Dec 2018 17:21:35 -0700 Subject: [PATCH 023/328] ARROW-3960: [Rust] remove extern crate for Rust 2018 This is a trivial change to remove "extern crate" definitions in lib.rs, to follow the new module system in Rust 2018 edition. Author: Chao Sun Author: Chao Sun Closes #3125 from sunchao/ARROW-3960 and squashes the following commits: 56a4393 Remove star import 0e5d06c Fixing json_internal error 53c13a9 ARROW-3960: remove extern crate for Rust 2018 --- rust/src/datatypes.rs | 4 +++- rust/src/lib.rs | 8 -------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/rust/src/datatypes.rs b/rust/src/datatypes.rs index f91c75d7bd0c3..36cb818cdfc7a 100644 --- a/rust/src/datatypes.rs +++ b/rust/src/datatypes.rs @@ -26,8 +26,10 @@ use std::mem::size_of; use std::slice::from_raw_parts; use std::str::FromStr; +use serde_derive::{Deserialize, Serialize}; +use serde_json::{json, Value}; + use crate::error::{ArrowError, Result}; -use serde_json::Value; /// The possible relative types that are supported. /// diff --git a/rust/src/lib.rs b/rust/src/lib.rs index b661c21279d22..f41d08f1427a6 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -17,14 +17,6 @@ #![feature(specialization)] -extern crate csv as csv_crate; - -#[macro_use] -extern crate serde_derive; - -#[macro_use] -extern crate serde_json; - pub mod array; pub mod array_data; pub mod array_ops; From 28d16c0e5682edeeebb37d3724f17b82c10aa4cf Mon Sep 17 00:00:00 2001 From: kabukawa Date: Wed, 12 Dec 2018 15:28:54 +0900 Subject: [PATCH 024/328] ARROW-3996: [C++] Add missing packages on Linux [C++] Build requirement libraries add to README.md. * autoconf * Jemalloc * boost-regex Author: kabukawa Author: Kouhei Sutou Closes #3157 from kabukawa/apache-arrow-develop and squashes the following commits: a9f465a3 Add autoconf 45568fd1 Instration requirement add.(modified) dcee4855 Instration requirement add. --- cpp/README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/README.md b/cpp/README.md index 7d0851762c291..1278ca046d432 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -36,9 +36,13 @@ Building Arrow requires: On Ubuntu/Debian you can install the requirements with: ```shell -sudo apt-get install cmake \ +sudo apt-get install \ + autoconf \ + build-essential \ + cmake \ libboost-dev \ libboost-filesystem-dev \ + libboost-regex-dev \ libboost-system-dev ``` From 527fed672260752e72a6572238d1029063091ef1 Mon Sep 17 00:00:00 2001 From: Yosuke Shiro Date: Wed, 12 Dec 2018 17:50:13 +0900 Subject: [PATCH 025/328] ARROW-3913: [Gandiva] [GLib] Add GGandivaLiteralNode Support GGandivaLiteralNode including the following sub classes. 
- GGandivaUint8LiteralNode - GGandivaUint16LiteralNode - GGandivaUint32LiteralNode - GGandivaUint64LiteralNode - GGandivaInt8LiteralNode - GGandivaInt16LiteralNode - GGandivaInt32LiteralNode - GGandivaInt64LiteralNode - GGandivaFloatLiteralNode - GGandivaDoubleLiteralNode - GGandivaStringLiteralNode - GGandivaBinaryLiteralNode Author: Yosuke Shiro Author: Kouhei Sutou Closes #3092 from shiro615/glib-add-ggandiva-literal-node and squashes the following commits: fb49b256 Add a missing ref 91cebe1c Break a long line 3ad1e5d8 Support BooleanLiteralNode#value 28d301eb Fix class orders 7d70c89e Remove binary literal property 783a2868 Use g_bytes_ref in ggandiva_binary_literal_node_get_value() 4162234d Fix class orders 289dfce2 Add ggandiva_binary_literal_node_new_bytes() 77f9eb89 Remove (transfer full) to use return value.c_str() e43d525f Use static_pointer_cast 62a6dd5c Return GBytes in ggandiva_binary_literal_node_get_value() 48d1175d Remove unnecessary static_cast d7ac46b4 Remove (nullable) of size of binary literal 8f6643af Fix documents 3ded5866 Refactor ggandiva_literal_{}_node_new_raw() a54c6f58 Add the original raw value getter bb2f71be Rename Uint to UInt 34422ad1 Remove property 7a3fe325 Use 'const guint8 *value, gsize size' for binary data 138abbf8 Fix a typo ba501d60 Rename is_true to value a45fa752 Use MakeStringLiteral, MakeBinaryLiteral d8775e4a Fix property name in BooleanLiteralNode 62a4eb48 Add test case for LiteralNode 83876ccd Support GGandivaLiteralNode --- c_glib/gandiva-glib/node.cpp | 754 ++++++++++++++++++ c_glib/gandiva-glib/node.h | 236 ++++++ c_glib/gandiva-glib/node.hpp | 3 + .../test/gandiva/test-binary-literal-node.rb | 34 + .../test/gandiva/test-boolean-literal-node.rb | 28 + .../test/gandiva/test-double-literal-node.rb | 28 + .../test/gandiva/test-float-literal-node.rb | 34 + .../test/gandiva/test-int16-literal-node.rb | 28 + .../test/gandiva/test-int32-literal-node.rb | 28 + .../test/gandiva/test-int64-literal-node.rb | 28 + c_glib/test/gandiva/test-int8-literal-node.rb | 28 + .../test/gandiva/test-string-literal-node.rb | 28 + .../test/gandiva/test-uint16-literal-node.rb | 28 + .../test/gandiva/test-uint32-literal-node.rb | 28 + .../test/gandiva/test-uint64-literal-node.rb | 28 + .../test/gandiva/test-uint8-literal-node.rb | 28 + ruby/red-gandiva/lib/gandiva/loader.rb | 14 + .../test/test-boolean-literal-node.rb | 24 + 18 files changed, 1407 insertions(+) create mode 100644 c_glib/test/gandiva/test-binary-literal-node.rb create mode 100644 c_glib/test/gandiva/test-boolean-literal-node.rb create mode 100644 c_glib/test/gandiva/test-double-literal-node.rb create mode 100644 c_glib/test/gandiva/test-float-literal-node.rb create mode 100644 c_glib/test/gandiva/test-int16-literal-node.rb create mode 100644 c_glib/test/gandiva/test-int32-literal-node.rb create mode 100644 c_glib/test/gandiva/test-int64-literal-node.rb create mode 100644 c_glib/test/gandiva/test-int8-literal-node.rb create mode 100644 c_glib/test/gandiva/test-string-literal-node.rb create mode 100644 c_glib/test/gandiva/test-uint16-literal-node.rb create mode 100644 c_glib/test/gandiva/test-uint32-literal-node.rb create mode 100644 c_glib/test/gandiva/test-uint64-literal-node.rb create mode 100644 c_glib/test/gandiva/test-uint8-literal-node.rb create mode 100644 ruby/red-gandiva/test/test-boolean-literal-node.rb diff --git a/c_glib/gandiva-glib/node.cpp b/c_glib/gandiva-glib/node.cpp index 49d1d0b7168df..cdb9724d7ebbf 100644 --- a/c_glib/gandiva-glib/node.cpp +++ b/c_glib/gandiva-glib/node.cpp @@ 
-26,6 +26,15 @@ #include +template +Type +ggandiva_literal_node_get(GGandivaLiteralNode *node) +{ + auto gandiva_literal_node = + std::static_pointer_cast(ggandiva_node_get_raw(GGANDIVA_NODE(node))); + return boost::get(gandiva_literal_node->holder()); +} + G_BEGIN_DECLS /** @@ -40,6 +49,48 @@ G_BEGIN_DECLS * * #GGandivaFunctionNode is a class for a node in the expression tree, representing a function. * + * #GGandivaLiteralNode is a base class for a node in the expression tree, + * representing a literal. + * + * #GGandivaBooleanLiteralNode is a class for a node in the expression tree, + * representing a boolean literal. + * + * #GGandivaInt8LiteralNode is a class for a node in the expression tree, + * representing a 8-bit integer literal. + * + * #GGandivaUInt8LiteralNode is a class for a node in the expression tree, + * representing a 8-bit unsigned integer literal. + * + * #GGandivaInt16LiteralNode is a class for a node in the expression tree, + * representing a 16-bit integer literal. + * + * #GGandivaUInt16LiteralNode is a class for a node in the expression tree, + * representing a 16-bit unsigned integer literal. + * + * #GGandivaInt32LiteralNode is a class for a node in the expression tree, + * representing a 32-bit integer literal. + * + * #GGandivaUInt32LiteralNode is a class for a node in the expression tree, + * representing a 32-bit unsigned integer literal. + * + * #GGandivaInt64LiteralNode is a class for a node in the expression tree, + * representing a 64-bit integer literal. + * + * #GGandivaUInt64LiteralNode is a class for a node in the expression tree, + * representing a 64-bit unsigned integer literal. + * + * #GGandivaFloatLiteralNode is a class for a node in the expression tree, + * representing a 32-bit floating point literal. + * + * #GGandivaDoubleLiteralNode is a class for a node in the expression tree, + * representing a 64-bit floating point literal. + * + * #GGandivaBinaryLiteralNode is a class for a node in the expression tree, + * representing a binary literal. + * + * #GGandivaStringLiteralNode is a class for a node in the expression tree, + * representing an UTF-8 encoded string literal. + * * Since: 0.12.0 */ @@ -395,6 +446,654 @@ ggandiva_function_node_get_parameters(GGandivaFunctionNode *node) return priv->parameters; } + +G_DEFINE_TYPE(GGandivaLiteralNode, + ggandiva_literal_node, + GGANDIVA_TYPE_NODE) + +static void +ggandiva_literal_node_init(GGandivaLiteralNode *literal_node) +{ +} + +static void +ggandiva_literal_node_class_init(GGandivaLiteralNodeClass *klass) +{ +} + + +G_DEFINE_TYPE(GGandivaBooleanLiteralNode, + ggandiva_boolean_literal_node, + GGANDIVA_TYPE_LITERAL_NODE) + +static void +ggandiva_boolean_literal_node_init(GGandivaBooleanLiteralNode *boolean_literal_node) +{ +} + +static void +ggandiva_boolean_literal_node_class_init(GGandivaBooleanLiteralNodeClass *klass) +{ +} + +/** + * ggandiva_boolean_literal_node_new: + * @value: The value of the boolean literal. + * + * Returns: A newly created #GGandivaBooleanLiteralNode. + * + * Since: 0.12.0 + */ +GGandivaBooleanLiteralNode * +ggandiva_boolean_literal_node_new(gboolean value) +{ + auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(static_cast(value)); + return GGANDIVA_BOOLEAN_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); +} + +/** + * ggandiva_boolean_literal_node_get_value: + * @node: A #GGandivaBooleanLiteralNode. + * + * Returns: The value of the boolean literal. 
+ * + * Since: 0.12.0 + */ +gboolean +ggandiva_boolean_literal_node_get_value(GGandivaBooleanLiteralNode *node) +{ + auto value = ggandiva_literal_node_get(GGANDIVA_LITERAL_NODE(node)); + return static_cast(value); +} + + +G_DEFINE_TYPE(GGandivaInt8LiteralNode, + ggandiva_int8_literal_node, + GGANDIVA_TYPE_LITERAL_NODE) + +static void +ggandiva_int8_literal_node_init(GGandivaInt8LiteralNode *int8_literal_node) +{ +} + +static void +ggandiva_int8_literal_node_class_init(GGandivaInt8LiteralNodeClass *klass) +{ +} + +/** + * ggandiva_int8_literal_node_new: + * @value: The value of the 8-bit integer literal. + * + * Returns: A newly created #GGandivaInt8LiteralNode. + * + * Since: 0.12.0 + */ +GGandivaInt8LiteralNode * +ggandiva_int8_literal_node_new(gint8 value) +{ + auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); + return GGANDIVA_INT8_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); +} + +/** + * ggandiva_int8_literal_node_get_value: + * @node: A #GGandivaInt8LiteralNode. + * + * Returns: The value of the 8-bit integer literal. + * + * Since: 0.12.0 + */ +gint8 +ggandiva_int8_literal_node_get_value(GGandivaInt8LiteralNode *node) +{ + return ggandiva_literal_node_get(GGANDIVA_LITERAL_NODE(node)); +} + + +G_DEFINE_TYPE(GGandivaUInt8LiteralNode, + ggandiva_uint8_literal_node, + GGANDIVA_TYPE_LITERAL_NODE) + +static void +ggandiva_uint8_literal_node_init(GGandivaUInt8LiteralNode *uint8_literal_node) +{ +} + +static void +ggandiva_uint8_literal_node_class_init(GGandivaUInt8LiteralNodeClass *klass) +{ +} + +/** + * ggandiva_uint8_literal_node_new: + * @value: The value of the 8-bit unsigned integer literal. + * + * Returns: A newly created #GGandivaUInt8LiteralNode. + * + * Since: 0.12.0 + */ +GGandivaUInt8LiteralNode * +ggandiva_uint8_literal_node_new(guint8 value) +{ + auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); + return GGANDIVA_UINT8_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); +} + +/** + * ggandiva_uint8_literal_node_get_value: + * @node: A #GGandivaUInt8LiteralNode. + * + * Returns: The value of the 8-bit unsigned integer literal. + * + * Since: 0.12.0 + */ +guint8 +ggandiva_uint8_literal_node_get_value(GGandivaUInt8LiteralNode *node) +{ + return ggandiva_literal_node_get(GGANDIVA_LITERAL_NODE(node)); +} + + +G_DEFINE_TYPE(GGandivaInt16LiteralNode, + ggandiva_int16_literal_node, + GGANDIVA_TYPE_LITERAL_NODE) + +static void +ggandiva_int16_literal_node_init(GGandivaInt16LiteralNode *int16_literal_node) +{ +} + +static void +ggandiva_int16_literal_node_class_init(GGandivaInt16LiteralNodeClass *klass) +{ +} + +/** + * ggandiva_int16_literal_node_new: + * @value: The value of the 16-bit integer literal. + * + * Returns: A newly created #GGandivaInt16LiteralNode. + * + * Since: 0.12.0 + */ +GGandivaInt16LiteralNode * +ggandiva_int16_literal_node_new(gint16 value) +{ + auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); + return GGANDIVA_INT16_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); +} + +/** + * ggandiva_int16_literal_node_get_value: + * @node: A #GGandivaInt16LiteralNode. + * + * Returns: The value of the 16-bit integer literal. 
+ * + * Since: 0.12.0 + */ +gint16 +ggandiva_int16_literal_node_get_value(GGandivaInt16LiteralNode *node) +{ + return ggandiva_literal_node_get(GGANDIVA_LITERAL_NODE(node)); +} + + +G_DEFINE_TYPE(GGandivaUInt16LiteralNode, + ggandiva_uint16_literal_node, + GGANDIVA_TYPE_LITERAL_NODE) + +static void +ggandiva_uint16_literal_node_init(GGandivaUInt16LiteralNode *uint16_literal_node) +{ +} + +static void +ggandiva_uint16_literal_node_class_init(GGandivaUInt16LiteralNodeClass *klass) +{ +} + +/** + * ggandiva_uint16_literal_node_new: + * @value: The value of the 16-bit unsigned integer literal. + * + * Returns: A newly created #GGandivaUInt16LiteralNode. + * + * Since: 0.12.0 + */ +GGandivaUInt16LiteralNode * +ggandiva_uint16_literal_node_new(guint16 value) +{ + auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); + return GGANDIVA_UINT16_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); +} + +/** + * ggandiva_uint16_literal_node_get_value: + * @node: A #GGandivaUInt16LiteralNode. + * + * Returns: The value of the 16-bit unsigned integer literal. + * + * Since: 0.12.0 + */ +guint16 +ggandiva_uint16_literal_node_get_value(GGandivaUInt16LiteralNode *node) +{ + return ggandiva_literal_node_get(GGANDIVA_LITERAL_NODE(node)); +} + + +G_DEFINE_TYPE(GGandivaInt32LiteralNode, + ggandiva_int32_literal_node, + GGANDIVA_TYPE_LITERAL_NODE) + +static void +ggandiva_int32_literal_node_init(GGandivaInt32LiteralNode *int32_literal_node) +{ +} + +static void +ggandiva_int32_literal_node_class_init(GGandivaInt32LiteralNodeClass *klass) +{ +} + +/** + * ggandiva_int32_literal_node_new: + * @value: The value of the 32-bit integer literal. + * + * Returns: A newly created #GGandivaInt32LiteralNode. + * + * Since: 0.12.0 + */ +GGandivaInt32LiteralNode * +ggandiva_int32_literal_node_new(gint32 value) +{ + auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); + return GGANDIVA_INT32_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); +} + +/** + * ggandiva_int32_literal_node_get_value: + * @node: A #GGandivaInt32LiteralNode. + * + * Returns: The value of the 32-bit integer literal. + * + * Since: 0.12.0 + */ +gint32 +ggandiva_int32_literal_node_get_value(GGandivaInt32LiteralNode *node) +{ + return ggandiva_literal_node_get(GGANDIVA_LITERAL_NODE(node)); +} + + +G_DEFINE_TYPE(GGandivaUInt32LiteralNode, + ggandiva_uint32_literal_node, + GGANDIVA_TYPE_LITERAL_NODE) + +static void +ggandiva_uint32_literal_node_init(GGandivaUInt32LiteralNode *uint32_literal_node) +{ +} + +static void +ggandiva_uint32_literal_node_class_init(GGandivaUInt32LiteralNodeClass *klass) +{ +} + +/** + * ggandiva_uint32_literal_node_new: + * @value: The value of the 32-bit unsigned integer literal. + * + * Returns: A newly created #GGandivaUInt32LiteralNode. + * + * Since: 0.12.0 + */ +GGandivaUInt32LiteralNode * +ggandiva_uint32_literal_node_new(guint32 value) +{ + auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); + return GGANDIVA_UINT32_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); +} + +/** + * ggandiva_uint32_literal_node_get_value: + * @node: A #GGandivaUInt32LiteralNode. + * + * Returns: The value of the 32-bit unsigned integer literal. 
+ * + * Since: 0.12.0 + */ +guint32 +ggandiva_uint32_literal_node_get_value(GGandivaUInt32LiteralNode *node) +{ + return ggandiva_literal_node_get(GGANDIVA_LITERAL_NODE(node)); +} + + +G_DEFINE_TYPE(GGandivaInt64LiteralNode, + ggandiva_int64_literal_node, + GGANDIVA_TYPE_LITERAL_NODE) + +static void +ggandiva_int64_literal_node_init(GGandivaInt64LiteralNode *int64_literal_node) +{ +} + +static void +ggandiva_int64_literal_node_class_init(GGandivaInt64LiteralNodeClass *klass) +{ +} + +/** + * ggandiva_int64_literal_node_new: + * @value: The value of the 64-bit integer literal. + * + * Returns: A newly created #GGandivaInt64LiteralNode. + * + * Since: 0.12.0 + */ +GGandivaInt64LiteralNode * +ggandiva_int64_literal_node_new(gint64 value) +{ + auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); + return GGANDIVA_INT64_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); +} + +/** + * ggandiva_int64_literal_node_get_value: + * @node: A #GGandivaInt64LiteralNode. + * + * Returns: The value of the 64-bit integer literal. + * + * Since: 0.12.0 + */ +gint64 +ggandiva_int64_literal_node_get_value(GGandivaInt64LiteralNode *node) +{ + return ggandiva_literal_node_get(GGANDIVA_LITERAL_NODE(node)); +} + + +G_DEFINE_TYPE(GGandivaUInt64LiteralNode, + ggandiva_uint64_literal_node, + GGANDIVA_TYPE_LITERAL_NODE) + +static void +ggandiva_uint64_literal_node_init(GGandivaUInt64LiteralNode *uint64_literal_node) +{ +} + +static void +ggandiva_uint64_literal_node_class_init(GGandivaUInt64LiteralNodeClass *klass) +{ +} + +/** + * ggandiva_uint64_literal_node_new: + * @value: The value of the 64-bit unsigned integer literal. + * + * Returns: A newly created #GGandivaUInt64LiteralNode. + * + * Since: 0.12.0 + */ +GGandivaUInt64LiteralNode * +ggandiva_uint64_literal_node_new(guint64 value) +{ + auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); + return GGANDIVA_UINT64_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); +} + +/** + * ggandiva_uint64_literal_node_get_value: + * @node: A #GGandivaUInt64LiteralNode. + * + * Returns: The value of the 64-bit unsigned integer literal. + * + * Since: 0.12.0 + */ +guint64 +ggandiva_uint64_literal_node_get_value(GGandivaUInt64LiteralNode *node) +{ + return ggandiva_literal_node_get(GGANDIVA_LITERAL_NODE(node)); +} + + +G_DEFINE_TYPE(GGandivaFloatLiteralNode, + ggandiva_float_literal_node, + GGANDIVA_TYPE_LITERAL_NODE) + +static void +ggandiva_float_literal_node_init(GGandivaFloatLiteralNode *float_literal_node) +{ +} + +static void +ggandiva_float_literal_node_class_init(GGandivaFloatLiteralNodeClass *klass) +{ +} + +/** + * ggandiva_float_literal_node_new: + * @value: The value of the 32-bit floating point literal. + * + * Returns: A newly created #GGandivaFloatLiteralNode. + * + * Since: 0.12.0 + */ +GGandivaFloatLiteralNode * +ggandiva_float_literal_node_new(gfloat value) +{ + auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); + return GGANDIVA_FLOAT_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); +} + +/** + * ggandiva_float_literal_node_get_value: + * @node: A #GGandivaFloatLiteralNode. + * + * Returns: The value of the 32-bit floating point literal. 
+ * + * Since: 0.12.0 + */ +gfloat +ggandiva_float_literal_node_get_value(GGandivaFloatLiteralNode *node) +{ + return ggandiva_literal_node_get(GGANDIVA_LITERAL_NODE(node)); +} + + +G_DEFINE_TYPE(GGandivaDoubleLiteralNode, + ggandiva_double_literal_node, + GGANDIVA_TYPE_LITERAL_NODE) + +static void +ggandiva_double_literal_node_init(GGandivaDoubleLiteralNode *double_literal_node) +{ +} + +static void +ggandiva_double_literal_node_class_init(GGandivaDoubleLiteralNodeClass *klass) +{ +} + +/** + * ggandiva_double_literal_node_new: + * @value: The value of the 64-bit floating point literal. + * + * Returns: A newly created #GGandivaDoubleLiteralNode. + * + * Since: 0.12.0 + */ +GGandivaDoubleLiteralNode * +ggandiva_double_literal_node_new(gdouble value) +{ + auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); + return GGANDIVA_DOUBLE_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); +} + +/** + * ggandiva_double_literal_node_get_value: + * @node: A #GGandivaDoubleLiteralNode. + * + * Returns: The value of the 64-bit floating point literal. + * + * Since: 0.12.0 + */ +gdouble +ggandiva_double_literal_node_get_value(GGandivaDoubleLiteralNode *node) +{ + return ggandiva_literal_node_get(GGANDIVA_LITERAL_NODE(node)); +} + + +typedef struct GGandivaBinaryLiteralNodePrivate_ { + GBytes *value; +} GGandivaBinaryLiteralNodePrivate; + +G_DEFINE_TYPE_WITH_PRIVATE(GGandivaBinaryLiteralNode, + ggandiva_binary_literal_node, + GGANDIVA_TYPE_LITERAL_NODE) + +#define GGANDIVA_BINARY_LITERAL_NODE_GET_PRIVATE(object) \ + static_cast( \ + ggandiva_binary_literal_node_get_instance_private( \ + GGANDIVA_BINARY_LITERAL_NODE(object))) + +static void +ggandiva_binary_literal_node_dispose(GObject *object) +{ + auto priv = GGANDIVA_BINARY_LITERAL_NODE_GET_PRIVATE(object); + + if (priv->value) { + g_bytes_unref(priv->value); + priv->value = nullptr; + } + + G_OBJECT_CLASS(ggandiva_binary_literal_node_parent_class)->dispose(object); +} + +static void +ggandiva_binary_literal_node_init(GGandivaBinaryLiteralNode *binary_literal_node) +{ +} + +static void +ggandiva_binary_literal_node_class_init(GGandivaBinaryLiteralNodeClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = ggandiva_binary_literal_node_dispose; +} + +/** + * ggandiva_binary_literal_node_new: + * @value: (array length=size): The value of the binary literal. + * @size: The number of bytes of the value. + * + * Returns: A newly created #GGandivaBinaryLiteralNode. + * + * Since: 0.12.0 + */ +GGandivaBinaryLiteralNode * +ggandiva_binary_literal_node_new(const guint8 *value, + gsize size) +{ + auto gandiva_node = + gandiva::TreeExprBuilder::MakeBinaryLiteral(std::string(reinterpret_cast(value), + size)); + return GGANDIVA_BINARY_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); +} + +/** + * ggandiva_binary_literal_node_new_bytes: + * @value: The value of the binary literal. + * + * Returns: A newly created #GGandivaBinaryLiteralNode. 
+ * + * Since: 0.12.0 + */ +GGandivaBinaryLiteralNode * +ggandiva_binary_literal_node_new_bytes(GBytes *value) +{ + size_t value_size; + auto raw_value = g_bytes_get_data(value, &value_size); + auto gandiva_node = + gandiva::TreeExprBuilder::MakeBinaryLiteral( + std::string(reinterpret_cast(raw_value), + value_size)); + auto literal_node = ggandiva_literal_node_new_raw(&gandiva_node); + auto priv = GGANDIVA_BINARY_LITERAL_NODE_GET_PRIVATE(literal_node); + priv->value = value; + g_bytes_ref(priv->value); + return GGANDIVA_BINARY_LITERAL_NODE(literal_node); +} + +/** + * ggandiva_binary_literal_node_get_value: + * @node: A #GGandivaBinaryLiteralNode. + * + * Returns: (transfer none): The value of the binary literal. + * + * Since: 0.12.0 + */ +GBytes * +ggandiva_binary_literal_node_get_value(GGandivaBinaryLiteralNode *node) +{ + auto priv = GGANDIVA_BINARY_LITERAL_NODE_GET_PRIVATE(node); + if (!priv->value) { + auto value = ggandiva_literal_node_get(GGANDIVA_LITERAL_NODE(node)); + priv->value = g_bytes_new(value.data(), value.size()); + } + + return priv->value; +} + + +G_DEFINE_TYPE(GGandivaStringLiteralNode, + ggandiva_string_literal_node, + GGANDIVA_TYPE_LITERAL_NODE) + +static void +ggandiva_string_literal_node_init(GGandivaStringLiteralNode *string_literal_node) +{ +} + +static void +ggandiva_string_literal_node_class_init(GGandivaStringLiteralNodeClass *klass) +{ +} + +/** + * ggandiva_string_literal_node_new: + * @value: The value of the UTF-8 encoded string literal. + * + * Returns: A newly created #GGandivaStringLiteralNode. + * + * Since: 0.12.0 + */ +GGandivaStringLiteralNode * +ggandiva_string_literal_node_new(const gchar *value) +{ + auto gandiva_node = gandiva::TreeExprBuilder::MakeStringLiteral(value); + return GGANDIVA_STRING_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); +} + +/** + * ggandiva_string_literal_node_get_value: + * @node: A #GGandivaStringLiteralNode. + * + * Returns: The value of the UTF-8 encoded string literal. 
+ * + * Since: 0.12.0 + */ +const gchar * +ggandiva_string_literal_node_get_value(GGandivaStringLiteralNode *node) +{ + auto value = ggandiva_literal_node_get(GGANDIVA_LITERAL_NODE(node)); + return value.c_str(); +} + G_END_DECLS std::shared_ptr @@ -434,3 +1133,58 @@ ggandiva_function_node_new_raw(std::shared_ptr *gandiva_node, priv->parameters = g_list_reverse(priv->parameters); return GGANDIVA_FUNCTION_NODE(function_node); } + +GGandivaLiteralNode * +ggandiva_literal_node_new_raw(std::shared_ptr *gandiva_node) +{ + GType type; + + switch ((*gandiva_node)->return_type()->id()) { + case arrow::Type::BOOL: + type = GGANDIVA_TYPE_BOOLEAN_LITERAL_NODE; + break; + case arrow::Type::type::UINT8: + type = GGANDIVA_TYPE_UINT8_LITERAL_NODE; + break; + case arrow::Type::type::UINT16: + type = GGANDIVA_TYPE_UINT16_LITERAL_NODE; + break; + case arrow::Type::type::UINT32: + type = GGANDIVA_TYPE_UINT32_LITERAL_NODE; + break; + case arrow::Type::type::UINT64: + type = GGANDIVA_TYPE_UINT64_LITERAL_NODE; + break; + case arrow::Type::type::INT8: + type = GGANDIVA_TYPE_INT8_LITERAL_NODE; + break; + case arrow::Type::type::INT16: + type = GGANDIVA_TYPE_INT16_LITERAL_NODE; + break; + case arrow::Type::type::INT32: + type = GGANDIVA_TYPE_INT32_LITERAL_NODE; + break; + case arrow::Type::type::INT64: + type = GGANDIVA_TYPE_INT64_LITERAL_NODE; + break; + case arrow::Type::type::FLOAT: + type = GGANDIVA_TYPE_FLOAT_LITERAL_NODE; + break; + case arrow::Type::type::DOUBLE: + type = GGANDIVA_TYPE_DOUBLE_LITERAL_NODE; + break; + case arrow::Type::type::STRING: + type = GGANDIVA_TYPE_STRING_LITERAL_NODE; + break; + case arrow::Type::type::BINARY: + type = GGANDIVA_TYPE_BINARY_LITERAL_NODE; + break; + default: + type = GGANDIVA_TYPE_LITERAL_NODE; + break; + } + auto literal_node = GGANDIVA_LITERAL_NODE(g_object_new(type, + "node", gandiva_node, + NULL)); + return literal_node; +} diff --git a/c_glib/gandiva-glib/node.h b/c_glib/gandiva-glib/node.h index 98ab3afb6ae8f..183003fd9f68a 100644 --- a/c_glib/gandiva-glib/node.h +++ b/c_glib/gandiva-glib/node.h @@ -67,4 +67,240 @@ ggandiva_function_node_new(const gchar *name, GList * ggandiva_function_node_get_parameters(GGandivaFunctionNode *node); + +#define GGANDIVA_TYPE_LITERAL_NODE (ggandiva_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaLiteralNode, + ggandiva_literal_node, + GGANDIVA, + LITERAL_NODE, + GGandivaNode) +struct _GGandivaLiteralNodeClass +{ + GGandivaNodeClass parent_class; +}; + + +#define GGANDIVA_TYPE_BOOLEAN_LITERAL_NODE (ggandiva_boolean_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaBooleanLiteralNode, + ggandiva_boolean_literal_node, + GGANDIVA, + BOOLEAN_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaBooleanLiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaBooleanLiteralNode * +ggandiva_boolean_literal_node_new(gboolean value); +gboolean +ggandiva_boolean_literal_node_get_value(GGandivaBooleanLiteralNode *node); + + +#define GGANDIVA_TYPE_INT8_LITERAL_NODE (ggandiva_int8_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaInt8LiteralNode, + ggandiva_int8_literal_node, + GGANDIVA, + INT8_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaInt8LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaInt8LiteralNode * +ggandiva_int8_literal_node_new(gint8 value); +gint8 +ggandiva_int8_literal_node_get_value(GGandivaInt8LiteralNode *node); + + +#define GGANDIVA_TYPE_UINT8_LITERAL_NODE (ggandiva_uint8_literal_node_get_type()) 
+G_DECLARE_DERIVABLE_TYPE(GGandivaUInt8LiteralNode, + ggandiva_uint8_literal_node, + GGANDIVA, + UINT8_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaUInt8LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaUInt8LiteralNode * +ggandiva_uint8_literal_node_new(guint8 value); +guint8 +ggandiva_uint8_literal_node_get_value(GGandivaUInt8LiteralNode *node); + + +#define GGANDIVA_TYPE_INT16_LITERAL_NODE (ggandiva_int16_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaInt16LiteralNode, + ggandiva_int16_literal_node, + GGANDIVA, + INT16_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaInt16LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaInt16LiteralNode * +ggandiva_int16_literal_node_new(gint16 value); +gint16 +ggandiva_int16_literal_node_get_value(GGandivaInt16LiteralNode *node); + + +#define GGANDIVA_TYPE_UINT16_LITERAL_NODE (ggandiva_uint16_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaUInt16LiteralNode, + ggandiva_uint16_literal_node, + GGANDIVA, + UINT16_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaUInt16LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaUInt16LiteralNode * +ggandiva_uint16_literal_node_new(guint16 value); +guint16 +ggandiva_uint16_literal_node_get_value(GGandivaUInt16LiteralNode *node); + + +#define GGANDIVA_TYPE_INT32_LITERAL_NODE (ggandiva_int32_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaInt32LiteralNode, + ggandiva_int32_literal_node, + GGANDIVA, + INT32_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaInt32LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaInt32LiteralNode * +ggandiva_int32_literal_node_new(gint32 value); +gint32 +ggandiva_int32_literal_node_get_value(GGandivaInt32LiteralNode *node); + + +#define GGANDIVA_TYPE_UINT32_LITERAL_NODE (ggandiva_uint32_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaUInt32LiteralNode, + ggandiva_uint32_literal_node, + GGANDIVA, + UINT32_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaUInt32LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaUInt32LiteralNode * +ggandiva_uint32_literal_node_new(guint32 value); +guint32 +ggandiva_uint32_literal_node_get_value(GGandivaUInt32LiteralNode *node); + + +#define GGANDIVA_TYPE_INT64_LITERAL_NODE (ggandiva_int64_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaInt64LiteralNode, + ggandiva_int64_literal_node, + GGANDIVA, + INT64_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaInt64LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaInt64LiteralNode * +ggandiva_int64_literal_node_new(gint64 value); +gint64 +ggandiva_int64_literal_node_get_value(GGandivaInt64LiteralNode *node); + + +#define GGANDIVA_TYPE_UINT64_LITERAL_NODE (ggandiva_uint64_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaUInt64LiteralNode, + ggandiva_uint64_literal_node, + GGANDIVA, + UINT64_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaUInt64LiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaUInt64LiteralNode * +ggandiva_uint64_literal_node_new(guint64 value); +guint64 +ggandiva_uint64_literal_node_get_value(GGandivaUInt64LiteralNode *node); + + +#define GGANDIVA_TYPE_FLOAT_LITERAL_NODE (ggandiva_float_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaFloatLiteralNode, + ggandiva_float_literal_node, + GGANDIVA, + FLOAT_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaFloatLiteralNodeClass +{ + GGandivaLiteralNodeClass 
parent_class; +}; + +GGandivaFloatLiteralNode * +ggandiva_float_literal_node_new(gfloat value); +gfloat +ggandiva_float_literal_node_get_value(GGandivaFloatLiteralNode *node); + + +#define GGANDIVA_TYPE_DOUBLE_LITERAL_NODE (ggandiva_double_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaDoubleLiteralNode, + ggandiva_double_literal_node, + GGANDIVA, + DOUBLE_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaDoubleLiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaDoubleLiteralNode * +ggandiva_double_literal_node_new(gdouble value); +gdouble +ggandiva_double_literal_node_get_value(GGandivaDoubleLiteralNode *node); + + +#define GGANDIVA_TYPE_BINARY_LITERAL_NODE (ggandiva_binary_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaBinaryLiteralNode, + ggandiva_binary_literal_node, + GGANDIVA, + BINARY_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaBinaryLiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaBinaryLiteralNode * +ggandiva_binary_literal_node_new(const guint8 *value, + gsize size); +GGandivaBinaryLiteralNode * +ggandiva_binary_literal_node_new_bytes(GBytes *value); +GBytes * +ggandiva_binary_literal_node_get_value(GGandivaBinaryLiteralNode *node); + + +#define GGANDIVA_TYPE_STRING_LITERAL_NODE (ggandiva_string_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaStringLiteralNode, + ggandiva_string_literal_node, + GGANDIVA, + STRING_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaStringLiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaStringLiteralNode * +ggandiva_string_literal_node_new(const gchar *value); +const gchar * +ggandiva_string_literal_node_get_value(GGandivaStringLiteralNode *node); + G_END_DECLS diff --git a/c_glib/gandiva-glib/node.hpp b/c_glib/gandiva-glib/node.hpp index 953c214beb9d6..7ff136003f174 100644 --- a/c_glib/gandiva-glib/node.hpp +++ b/c_glib/gandiva-glib/node.hpp @@ -21,6 +21,7 @@ #include +#include #include #include @@ -34,3 +35,5 @@ ggandiva_function_node_new_raw(std::shared_ptr *gandiva_node, const gchar *name, GList *parameters, GArrowDataType *return_type); +GGandivaLiteralNode * +ggandiva_literal_node_new_raw(std::shared_ptr *gandiva_node); diff --git a/c_glib/test/gandiva/test-binary-literal-node.rb b/c_glib/test/gandiva/test-binary-literal-node.rb new file mode 100644 index 0000000000000..93a54a361cc82 --- /dev/null +++ b/c_glib/test/gandiva/test-binary-literal-node.rb @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestGandivaBinaryLiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + @value = "\x00\x01\x02\x03\x04" + end + + def test_new + literal_node = Gandiva::BinaryLiteralNode.new(@value) + assert_equal(@value, literal_node.value.to_s) + end + + def test_new_bytes + bytes_value = GLib::Bytes.new(@value) + literal_node = Gandiva::BinaryLiteralNode.new(bytes_value) + assert_equal(@value, literal_node.value.to_s) + end +end diff --git a/c_glib/test/gandiva/test-boolean-literal-node.rb b/c_glib/test/gandiva/test-boolean-literal-node.rb new file mode 100644 index 0000000000000..3d1f10c5e81c1 --- /dev/null +++ b/c_glib/test/gandiva/test-boolean-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaBooleanLiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = true + literal_node = Gandiva::BooleanLiteralNode.new(value) + assert_equal(value, literal_node.value?) + end +end diff --git a/c_glib/test/gandiva/test-double-literal-node.rb b/c_glib/test/gandiva/test-double-literal-node.rb new file mode 100644 index 0000000000000..fd4bd08e4c254 --- /dev/null +++ b/c_glib/test/gandiva/test-double-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaDoubleLiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = 1.5 + literal_node = Gandiva::DoubleLiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/c_glib/test/gandiva/test-float-literal-node.rb b/c_glib/test/gandiva/test-float-literal-node.rb new file mode 100644 index 0000000000000..202ec38fc5907 --- /dev/null +++ b/c_glib/test/gandiva/test-float-literal-node.rb @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaFloatLiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_new + assert_nothing_raised do + Gandiva::FloatLiteralNode.new(1.5) + end + end + + def test_value + value = 1.5 + literal_node = Gandiva::FloatLiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/c_glib/test/gandiva/test-int16-literal-node.rb b/c_glib/test/gandiva/test-int16-literal-node.rb new file mode 100644 index 0000000000000..9b5bb6822ebba --- /dev/null +++ b/c_glib/test/gandiva/test-int16-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaInt16LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = -3 + literal_node = Gandiva::Int16LiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/c_glib/test/gandiva/test-int32-literal-node.rb b/c_glib/test/gandiva/test-int32-literal-node.rb new file mode 100644 index 0000000000000..9c94cdef4b125 --- /dev/null +++ b/c_glib/test/gandiva/test-int32-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestGandivaInt32LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = -3 + literal_node = Gandiva::Int32LiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/c_glib/test/gandiva/test-int64-literal-node.rb b/c_glib/test/gandiva/test-int64-literal-node.rb new file mode 100644 index 0000000000000..e1b4b91d8c32c --- /dev/null +++ b/c_glib/test/gandiva/test-int64-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaInt64LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = -3 + literal_node = Gandiva::Int64LiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/c_glib/test/gandiva/test-int8-literal-node.rb b/c_glib/test/gandiva/test-int8-literal-node.rb new file mode 100644 index 0000000000000..30f11fc81a60d --- /dev/null +++ b/c_glib/test/gandiva/test-int8-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaInt8LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = -3 + literal_node = Gandiva::Int8LiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/c_glib/test/gandiva/test-string-literal-node.rb b/c_glib/test/gandiva/test-string-literal-node.rb new file mode 100644 index 0000000000000..a231f6111f40f --- /dev/null +++ b/c_glib/test/gandiva/test-string-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaStringLiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = "Hello" + literal_node = Gandiva::StringLiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/c_glib/test/gandiva/test-uint16-literal-node.rb b/c_glib/test/gandiva/test-uint16-literal-node.rb new file mode 100644 index 0000000000000..e8bdd308969bb --- /dev/null +++ b/c_glib/test/gandiva/test-uint16-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaUInt16LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = 3 + literal_node = Gandiva::UInt16LiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/c_glib/test/gandiva/test-uint32-literal-node.rb b/c_glib/test/gandiva/test-uint32-literal-node.rb new file mode 100644 index 0000000000000..9d5995774dd97 --- /dev/null +++ b/c_glib/test/gandiva/test-uint32-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestGandivaUInt32LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = 3 + literal_node = Gandiva::UInt32LiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/c_glib/test/gandiva/test-uint64-literal-node.rb b/c_glib/test/gandiva/test-uint64-literal-node.rb new file mode 100644 index 0000000000000..56c46db81bd24 --- /dev/null +++ b/c_glib/test/gandiva/test-uint64-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaUInt64LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = 3 + literal_node = Gandiva::UInt64LiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/c_glib/test/gandiva/test-uint8-literal-node.rb b/c_glib/test/gandiva/test-uint8-literal-node.rb new file mode 100644 index 0000000000000..04f76cd76326f --- /dev/null +++ b/c_glib/test/gandiva/test-uint8-literal-node.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaUInt8LiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_value + value = 3 + literal_node = Gandiva::UInt8LiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end diff --git a/ruby/red-gandiva/lib/gandiva/loader.rb b/ruby/red-gandiva/lib/gandiva/loader.rb index 5a95897b61730..845275c3e7cbd 100644 --- a/ruby/red-gandiva/lib/gandiva/loader.rb +++ b/ruby/red-gandiva/lib/gandiva/loader.rb @@ -22,5 +22,19 @@ def load super("Gandiva", Gandiva) end end + + private + def load_method_info(info, klass, method_name) + case klass.name + when "Gandiva::BooleanLiteralNode" + case method_name + when "value?" 
+ method_name = "value" + end + super(info, klass, method_name) + else + super + end + end end end diff --git a/ruby/red-gandiva/test/test-boolean-literal-node.rb b/ruby/red-gandiva/test/test-boolean-literal-node.rb new file mode 100644 index 0000000000000..d79f72994b6a0 --- /dev/null +++ b/ruby/red-gandiva/test/test-boolean-literal-node.rb @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestBooleanLiteralNode < Test::Unit::TestCase + def test_value + value = true + literal_node = Gandiva::BooleanLiteralNode.new(value) + assert_equal(value, literal_node.value) + end +end From c0ac97f126c98fb29e81d6544adfea9d4ab74aff Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Wed, 12 Dec 2018 19:57:12 +0900 Subject: [PATCH 026/328] ARROW-4004: [GLib] Replace GPU with CUDA This is a follow-up change for #3088. Author: Kouhei Sutou Closes #3162 from kou/glib-replace-gpu-with-cuda and squashes the following commits: 8891e510 Replace GPU with CUDA --- c_glib/plasma-glib/plasma-glib.pc.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_glib/plasma-glib/plasma-glib.pc.in b/c_glib/plasma-glib/plasma-glib.pc.in index f3a82c237d0b9..c82fe69580f1f 100644 --- a/c_glib/plasma-glib/plasma-glib.pc.in +++ b/c_glib/plasma-glib/plasma-glib.pc.in @@ -25,4 +25,4 @@ Description: C API for Apache Arrow Plasma based on GLib Version: @VERSION@ Libs: -L${libdir} -lplasma-glib Cflags: -I${includedir} -Requires: plasma arrow-glib @ARROW_GPU_GLIB_PACKAGE@ +Requires: plasma arrow-glib @ARROW_CUDA_GLIB_PACKAGE@ From c029b772f35958feb723cdddb67dcf04ae302013 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Wed, 12 Dec 2018 08:13:36 -0600 Subject: [PATCH 027/328] ARROW-3976: [Ruby] Try to upgrade git to avoid errors caused by Homebrew on older git It seems that `brew update` can fail if the git version is too old, see https://github.com/Linuxbrew/brew/issues/820. 
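The workaround taken here is to try upgrading git through Homebrew before any other brew command runs, roughly along these lines (a sketch only; the actual logic is in the ci/travis_install_osx.sh changes below):

```
# Sketch -- see the ci/travis_install_osx.sh diff below for the real script.
# Upgrading git first avoids the Homebrew failure triggered by very old git.
git --version
brew upgrade git
brew update
```
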
Also adds retry logic Author: Kouhei Sutou Author: Wes McKinney Closes #3155 from wesm/ARROW-3976 and squashes the following commits: 0c7964ba3 Stop to use old Ruby 7dce4f0a3 travis_wait isn't available in custom shell script 6d41e7196 Make brew commands more robust 05044892b Incorporate code review 8c0454fd7 Try to upgrade git to avoid errors caused by Homebrew on older git, where --local argument is missing --- .travis.yml | 1 - ci/travis_install_osx.sh | 24 ++++++++++++++++++------ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index 42b1275d1c4bf..d1fc6dba35dd2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -256,7 +256,6 @@ matrix: - ARROW_TRAVIS_PLASMA=1 cache: addons: - rvm: 2.2 before_script: - if [ $ARROW_CI_RUBY_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_osx.sh diff --git a/ci/travis_install_osx.sh b/ci/travis_install_osx.sh index 47d6a637f7d58..6b6a4b2533d8b 100755 --- a/ci/travis_install_osx.sh +++ b/ci/travis_install_osx.sh @@ -23,13 +23,25 @@ set -e if [ "$ARROW_CI_RUBY_AFFECTED" = "1" ]; then brew_log_path=brew.log function run_brew() { - echo brew "$@" >> ${brew_log_path} - if ! gtimeout --signal=KILL 5m brew "$@" >> ${brew_log_path} 2>&1; then - cat ${brew_log_path} - rm ${brew_log_path} - false - fi + local i=0 + local n_tries=3 + while [[ $((i++)) < ${n_tries} ]]; do + echo "${i}: brew" "$@" >> ${brew_log_path} + if gtimeout --signal=KILL 9m brew "$@" >> ${brew_log_path} 2>&1; then + break + elif [[ ${i} == ${n_tries} ]]; then + cat ${brew_log_path} + rm ${brew_log_path} + false + fi + done } + + # ARROW-3976 Old versions of git can cause failures when Homebrew prints a + # donation solicitation. Attempt to update git + git --version + run_brew upgrade git + run_brew update run_brew upgrade python run_brew uninstall postgis From 67506d94b762d0ea3d26ba0e2df1399e566d145b Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Wed, 12 Dec 2018 08:16:30 -0600 Subject: [PATCH 028/328] ARROW-4002: [C++][Gandiva] Remove needless CMake version check I could build Gandiva with CMake 3.7.2 and LLVM 6.0.0 on Debian stretch. But I disabled Gandiva JNI. Author: Kouhei Sutou Closes #3161 from kou/cpp-gandiva-remove-cmake-version-check and squashes the following commits: 1506c546c Remove needless CMake version check --- cpp/src/gandiva/CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 5d75aa271152b..5ef573875b660 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -15,9 +15,6 @@ # specific language governing permissions and limitations # under the License. -# LLVM/Clang is required by multiple subdirs. -cmake_minimum_required(VERSION 3.11) - project(gandiva) find_package(LLVM) From a3ba1a2b54afdd2a55bd600d644722cf54b9ab5d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 12 Dec 2018 09:19:41 -0600 Subject: [PATCH 029/328] ARROW-3988: [C++] Do not build unit tests by default, fix building Gandiva unit tests when ARROW_BUILD_TESTS=OFF I found while working on this that disabling `ARROW_GANDIVA_BUILD_TESTS` would break the build -- I think this was caused by some other changes I made. We should remove that option and instead use the new modular build targets and invoke unit tests using labels. 
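This assumes tests were enabled at configure time, since they are now opt-in; for example (flags as used in the README changes below):

```
cmake -DARROW_BUILD_TESTS=ON -DARROW_GANDIVA=ON -DARROW_GANDIVA_BUILD_TESTS=ON ..
```
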
So we would write ``` ninja gandiva # this will build all libraries and unit tests when ARROW_BUILD_TESTS=ON ctest -L gandiva ``` Author: Wes McKinney Closes #3156 from wesm/ARROW-3988 and squashes the following commits: 0420f9ed0 Remove arrow::PrimitiveBuilder from builder.rst for now because of Sphinx warning f8a33a5aa Fix gandiva test flag c4893534c Add ARROW_BUILD_TESTS to appveyor-cpp-test-cmake-script.bat 5c6a33271 Do not build unit tests by default, fix building Gandiva unit tests when ARROW_BUILD_TESTS=OFF --- ci/appveyor-cpp-build.bat | 3 +++ ci/appveyor-cpp-test-cmake-script.bat | 8 ++++++++ ci/cpp-msvc-build-main.bat | 1 + ci/travis_before_script_cpp.sh | 12 ++++++++++-- cpp/CMakeLists.txt | 14 +++++++------- cpp/README.md | 14 +++++++++----- cpp/cmake_modules/ThirdpartyToolchain.cmake | 7 ++++--- cpp/src/arrow/CMakeLists.txt | 3 ++- docs/source/cpp/api/builder.rst | 3 --- 9 files changed, 44 insertions(+), 21 deletions(-) diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index b8e431613210a..d20a0214f532c 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -34,6 +34,7 @@ if "%JOB%" == "Static_Crt_Build" ( -DARROW_USE_STATIC_CRT=ON ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=Debug ^ -DARROW_TEST_LINKAGE=static ^ -DARROW_CXXFLAGS="/MP" ^ @@ -51,6 +52,7 @@ if "%JOB%" == "Static_Crt_Build" ( -DARROW_USE_STATIC_CRT=ON ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=Release ^ -DARROW_TEST_LINKAGE=static ^ -DCMAKE_CXX_FLAGS_RELEASE="/MT %CMAKE_CXX_FLAGS_RELEASE%" ^ @@ -76,6 +78,7 @@ if "%JOB%" == "Build_Debug" ( cmake -G "%GENERATOR%" ^ -DARROW_VERBOSE_THIRDPARTY_BUILD=OFF ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_BUILD_STATIC=OFF ^ -DARROW_CXXFLAGS="/MP" ^ diff --git a/ci/appveyor-cpp-test-cmake-script.bat b/ci/appveyor-cpp-test-cmake-script.bat index 25bf9bddbbf39..8158a44260235 100644 --- a/ci/appveyor-cpp-test-cmake-script.bat +++ b/ci/appveyor-cpp-test-cmake-script.bat @@ -32,6 +32,7 @@ set FLATBUFFERS_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -49,6 +50,7 @@ set GFLAGS_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -66,6 +68,7 @@ set SNAPPY_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -83,6 +86,7 @@ set ZLIB_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -100,6 +104,7 @@ set BROTLI_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -117,6 +122,7 @@ set LZ4_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. 
>nul 2>error.txt @@ -134,6 +140,7 @@ set ZSTD_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -158,6 +165,7 @@ pushd %BUILD_DIR% set ARROW_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. 2>output.txt diff --git a/ci/cpp-msvc-build-main.bat b/ci/cpp-msvc-build-main.bat index 8703dc9631773..560f5045af658 100644 --- a/ci/cpp-msvc-build-main.bat +++ b/ci/cpp-msvc-build-main.bat @@ -48,6 +48,7 @@ cmake -G "%GENERATOR%" %CMAKE_ARGS% ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_BUILD_STATIC=OFF ^ + -DARROW_BUILD_TESTS=ON ^ -DARROW_CXXFLAGS="%ARROW_CXXFLAGS%" ^ -DCMAKE_CXX_FLAGS_RELEASE="/MD %CMAKE_CXX_FLAGS_RELEASE%" ^ -DARROW_PARQUET=ON ^ diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index f9e0602a80971..6465f28008006 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -41,7 +41,6 @@ if [ "$only_library_mode" == "no" ]; then fi CMAKE_COMMON_FLAGS="\ --DARROW_BUILD_BENCHMARKS=ON \ -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL \ -DARROW_NO_DEPRECATED_API=ON \ -DARROW_EXTRA_ERROR_CONTEXT=ON" @@ -61,7 +60,13 @@ pushd $ARROW_CPP_BUILD_DIR if [ $only_library_mode == "yes" ]; then CMAKE_COMMON_FLAGS="\ $CMAKE_COMMON_FLAGS \ --DARROW_BUILD_TESTS=OFF \ +-DARROW_BUILD_UTILITIES=OFF \ +-DARROW_INSTALL_NAME_RPATH=OFF" +else + CMAKE_COMMON_FLAGS="\ +$CMAKE_COMMON_FLAGS \ +-DARROW_BUILD_BENCHMARKS=ON \ +-DARROW_BUILD_TESTS=ON \ -DARROW_BUILD_UTILITIES=OFF \ -DARROW_INSTALL_NAME_RPATH=OFF" fi @@ -92,6 +97,9 @@ fi if [ $ARROW_TRAVIS_GANDIVA == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA=ON" + if [ $only_library_mode == "no" ]; then + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA_BUILD_TESTS=ON" + fi fi if [ $ARROW_TRAVIS_VALGRIND == "1" ]; then diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7140d05d577f2..35707de574648 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -115,8 +115,12 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") OFF) option(ARROW_BUILD_TESTS - "Build the Arrow googletest unit tests" - ON) + "Build the Arrow googletest unit tests, default OFF" + OFF) + + option(ARROW_BUILD_BENCHMARKS + "Build the Arrow micro benchmarks, default OFF" + OFF) set(ARROW_TEST_LINKAGE "shared" CACHE STRING "Linkage of Arrow libraries with unit tests executables. \ @@ -126,10 +130,6 @@ static|shared (default shared)") "Only build unit tests having the indicated label or labels. \ Pass multiple labels by dividing with semicolons") - option(ARROW_BUILD_BENCHMARKS - "Build the Arrow micro benchmarks" - OFF) - option(ARROW_NO_DEPRECATED_API "Exclude deprecated APIs from build" OFF) @@ -322,7 +322,7 @@ Always OFF if building binaries" option(ARROW_GANDIVA_BUILD_TESTS "Build the Gandiva googletest unit tests" - ON) + OFF) endif() diff --git a/cpp/README.md b/cpp/README.md index 1278ca046d432..d1d76c17875d7 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -64,7 +64,7 @@ Simple debug build: cd arrow/cpp mkdir debug cd debug - cmake .. + cmake -DARROW_BUILD_TESTS=ON .. make unittest Simple release build: @@ -73,10 +73,14 @@ Simple release build: cd arrow/cpp mkdir release cd release - cmake .. -DCMAKE_BUILD_TYPE=Release + cmake -DARROW_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release .. 
make unittest -Detailed unit test logs will be placed in the build directory under `build/test-logs`. +If you do not need to build the test suite, you can omit the +`ARROW_BUILD_TESTS` option (the default is not to build the unit tests). + +Detailed unit test logs will be placed in the build directory under +`build/test-logs`. On some Linux distributions, running the test suite might require setting an explicit locale. If you see any locale-related errors, try setting the @@ -132,7 +136,7 @@ not use the macro. Follow the directions for simple build except run cmake with the `--ARROW_BUILD_BENCHMARKS` parameter set correctly: - cmake -DARROW_BUILD_BENCHMARKS=ON .. + cmake -DARROW_BUILD_TESTS=ON -DARROW_BUILD_BENCHMARKS=ON .. and instead of make unittest run either `make; ctest` to run both unit tests and benchmarks or `make benchmark` to run only the benchmark tests. @@ -265,7 +269,7 @@ The optional `gandiva` libraries and tests can be built by passing `-DARROW_GANDIVA=on`. ```shell -cmake .. -DARROW_GANDIVA=on +cmake .. -DARROW_GANDIVA=ON -DARROW_GANDIVA_BUILD_TESTS=ON make ctest -L gandiva ``` diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 6850b0bddefc5..8f3fc2cabe3c2 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -525,9 +525,11 @@ message(STATUS "double-conversion static library: ${DOUBLE_CONVERSION_STATIC_LIB # ---------------------------------------------------------------------- # Google gtest & gflags -if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) - add_custom_target(unittest ctest -L unittest) +add_custom_target(unittest ctest -L unittest) +add_custom_target(benchmark ctest -L benchmark) +if(ARROW_BUILD_TESTS OR ARROW_GANDIVA_BUILD_TESTS + OR ARROW_BUILD_BENCHMARKS) if("${GTEST_HOME}" STREQUAL "") if(APPLE) set(GTEST_CMAKE_CXX_FLAGS "-fPIC -DGTEST_USE_OWN_TR1_TUPLE=1 -Wno-unused-value -Wno-ignored-attributes") @@ -627,7 +629,6 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) endif() if(ARROW_BUILD_BENCHMARKS) - add_custom_target(benchmark ctest -L benchmark) if("$ENV{GBENCHMARK_HOME}" STREQUAL "") if(NOT MSVC) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 13aaeab494090..2d043a9a27627 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -213,7 +213,8 @@ if (ARROW_BUILD_STATIC AND WIN32) target_compile_definitions(arrow_static PUBLIC ARROW_STATIC) endif() -if (ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) +if (ARROW_BUILD_TESTS OR ARROW_GANDIVA_BUILD_TESTS + OR ARROW_BUILD_BENCHMARKS) # that depend on gtest ADD_ARROW_LIB(arrow_testing SOURCES test-util.cc diff --git a/docs/source/cpp/api/builder.rst b/docs/source/cpp/api/builder.rst index 0912706ac081c..9e6540aa557fb 100644 --- a/docs/source/cpp/api/builder.rst +++ b/docs/source/cpp/api/builder.rst @@ -31,9 +31,6 @@ Concrete builder subclasses .. doxygenclass:: arrow::BooleanBuilder :members: -.. doxygenclass:: arrow::PrimitiveBuilder - :members: - .. 
doxygenclass:: arrow::NumericBuilder :members: From aa8bb3cc4bcbee02ed7d7599e5dcf234507e65b4 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 12 Dec 2018 10:58:11 -0600 Subject: [PATCH 030/328] ARROW-3986: [C++] Document memory management and table APIs Author: Antoine Pitrou Closes #3159 from pitrou/ARROW-3986-more-prose-documentation and squashes the following commits: 4e8ff421d ARROW-3986: Document memory management and table APIs --- cpp/src/arrow/allocator.h | 8 ++ cpp/src/arrow/buffer.h | 67 ++++++++++++--- cpp/src/arrow/builder.h | 3 +- cpp/src/arrow/memory_pool.h | 1 + cpp/src/arrow/table.h | 57 ++++++++++--- cpp/src/arrow/type.h | 25 +++++- docs/source/cpp/api.rst | 1 + docs/source/cpp/api/datatype.rst | 13 +++ docs/source/cpp/api/memory.rst | 43 ++++++++-- docs/source/cpp/api/table.rst | 52 ++++++++++++ docs/source/cpp/getting_started.rst | 3 +- docs/source/cpp/index.rst | 6 ++ docs/source/cpp/memory.rst | 127 ++++++++++++++++++++++++++++ docs/source/cpp/tables.rst | 87 +++++++++++++++++++ 14 files changed, 459 insertions(+), 34 deletions(-) create mode 100644 docs/source/cpp/api/table.rst create mode 100644 docs/source/cpp/memory.rst create mode 100644 docs/source/cpp/tables.rst diff --git a/cpp/src/arrow/allocator.h b/cpp/src/arrow/allocator.h index 144ba575063a3..a02b8e64bb05a 100644 --- a/cpp/src/arrow/allocator.h +++ b/cpp/src/arrow/allocator.h @@ -29,6 +29,7 @@ namespace arrow { +/// \brief A STL allocator delegating allocations to a Arrow MemoryPool template class stl_allocator { public: @@ -45,7 +46,9 @@ class stl_allocator { using other = stl_allocator; }; + /// \brief Construct an allocator from the default MemoryPool stl_allocator() noexcept : pool_(default_memory_pool()) {} + /// \brief Construct an allocator from the given MemoryPool explicit stl_allocator(MemoryPool* pool) noexcept : pool_(pool) {} template @@ -86,9 +89,14 @@ class stl_allocator { MemoryPool* pool_; }; +/// \brief A MemoryPool implementation delegating allocations to a STL allocator +/// +/// Note that STL allocators don't provide a resizing operation, and therefore +/// any buffer resizes will do a full reallocation and copy. template > class STLMemoryPool : public MemoryPool { public: + /// \brief Construct a memory pool from the given allocator explicit STLMemoryPool(const Allocator& alloc) : alloc_(alloc) {} Status Allocate(int64_t size, uint8_t** out) override { diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 66c131413c2d3..6b2ad1bbefc7f 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -40,13 +40,15 @@ namespace arrow { /// \class Buffer /// \brief Object containing a pointer to a piece of contiguous memory with a -/// particular size. Base class does not own its memory +/// particular size. /// /// Buffers have two related notions of length: size and capacity. Size is /// the number of bytes that might have valid data. Capacity is the number -/// of bytes that where allocated for the buffer in total. +/// of bytes that were allocated for the buffer in total. /// -/// The following invariant is always true: Size < Capacity +/// The Buffer base class does not own its memory, but subclasses often do. 
+/// +/// The following invariant is always true: Size <= Capacity class ARROW_EXPORT Buffer { public: /// \brief Construct from buffer and size without copying memory @@ -158,9 +160,12 @@ class ARROW_EXPORT Buffer { /// \note Can throw std::bad_alloc if buffer is large std::string ToString() const; - int64_t capacity() const { return capacity_; } + /// \brief Return a pointer to the buffer's data const uint8_t* data() const { return data_; } - + /// \brief Return a writable pointer to the buffer's data + /// + /// The buffer has to be mutable. Otherwise, an assertion may be thrown + /// or a null pointer may be returned. uint8_t* mutable_data() { #ifndef NDEBUG CheckMutable(); @@ -168,8 +173,12 @@ class ARROW_EXPORT Buffer { return mutable_data_; } + /// \brief Return the buffer's size in bytes int64_t size() const { return size_; } + /// \brief Return the buffer's capacity (number of allocated bytes) + int64_t capacity() const { return capacity_; } + std::shared_ptr parent() const { return parent_; } protected: @@ -188,26 +197,38 @@ class ARROW_EXPORT Buffer { ARROW_DISALLOW_COPY_AND_ASSIGN(Buffer); }; -/// Construct a view on passed buffer at the indicated offset and length. This -/// function cannot fail and does not error checking (except in debug builds) +/// \defgroup buffer-slicing-functions Functions for slicing buffers +/// +/// @{ + +/// \brief Construct a view on a buffer at the given offset and length. +/// +/// This function cannot fail and does not check for errors (except in debug builds) static inline std::shared_ptr SliceBuffer(const std::shared_ptr& buffer, const int64_t offset, const int64_t length) { return std::make_shared(buffer, offset, length); } +/// \brief Construct a view on a buffer at the given offset, up to the buffer's end. +/// +/// This function cannot fail and does not check for errors (except in debug builds) static inline std::shared_ptr SliceBuffer(const std::shared_ptr& buffer, const int64_t offset) { int64_t length = buffer->size() - offset; return SliceBuffer(buffer, offset, length); } -/// Construct a mutable buffer slice. If the parent buffer is not mutable, this -/// will abort in debug builds +/// \brief Like SliceBuffer, but construct a mutable buffer slice. +/// +/// If the parent buffer is not mutable, behavior is undefined (it may abort +/// in debug builds). ARROW_EXPORT std::shared_ptr SliceMutableBuffer(const std::shared_ptr& buffer, const int64_t offset, const int64_t length); +/// @} + /// \class MutableBuffer /// \brief A Buffer whose contents can be mutated. May or may not own its data. class ARROW_EXPORT MutableBuffer : public Buffer { @@ -266,6 +287,10 @@ class ARROW_EXPORT ResizableBuffer : public MutableBuffer { ResizableBuffer(uint8_t* data, int64_t size) : MutableBuffer(data, size) {} }; +/// \defgroup buffer-allocation-functions Functions for allocating buffers +/// +/// @{ + /// \brief Allocate a fixed size mutable buffer from a memory pool, zero its padding. 
/// /// \param[in] pool a memory pool @@ -364,6 +389,8 @@ Status AllocateEmptyBitmap(MemoryPool* pool, int64_t length, ARROW_EXPORT Status AllocateEmptyBitmap(int64_t length, std::shared_ptr* out); +/// @} + // ---------------------------------------------------------------------- // Buffer builder classes @@ -374,13 +401,13 @@ class ARROW_EXPORT BufferBuilder { explicit BufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) : pool_(pool), data_(NULLPTR), capacity_(0), size_(0) {} - /// \brief Resizes the buffer to the nearest multiple of 64 bytes + /// \brief Resize the buffer to the nearest multiple of 64 bytes /// /// \param elements the new capacity of the of the builder. Will be rounded /// up to a multiple of 64 bytes for padding - /// \param shrink_to_fit if new capacity smaller than existing size, + /// \param shrink_to_fit if new capacity is smaller than the existing size, /// reallocate internal buffer. Set to false to avoid reallocations when - /// shrinking the builder + /// shrinking the builder. /// \return Status Status Resize(const int64_t elements, bool shrink_to_fit = true) { // Resize(0) is a no-op @@ -409,6 +436,9 @@ class ARROW_EXPORT BufferBuilder { /// \return Status Status Reserve(const int64_t size) { return Resize(size_ + size, false); } + /// \brief Append the given data to the buffer + /// + /// The buffer is automatically expanded if necessary. Status Append(const void* data, int64_t length) { if (capacity_ < length + size_) { int64_t new_capacity = BitUtil::NextPower2(length + size_); @@ -418,6 +448,9 @@ class ARROW_EXPORT BufferBuilder { return Status::OK(); } + /// \brief Append the given data to the buffer + /// + /// The buffer is automatically expanded if necessary. template Status Append(const std::array& data) { constexpr auto nbytes = static_cast(NBYTES); @@ -448,6 +481,15 @@ class ARROW_EXPORT BufferBuilder { size_ += length; } + /// \brief Return result of builder as a Buffer object. + /// + /// The builder is reset and can be reused afterwards. + /// + /// \param[out] out the finalized Buffer object + /// \param shrink_to_fit if the buffer size is smaller than its capacity, + /// reallocate to fit more tightly in memory. Set to false to avoid + /// a reallocation, at the expense of potentially more memory consumption. + /// \return Status Status Finish(std::shared_ptr* out, bool shrink_to_fit = true) { ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit)); *out = buffer_; @@ -472,6 +514,7 @@ class ARROW_EXPORT BufferBuilder { int64_t size_; }; +/// \brief A BufferBuilder subclass with convenience methods to append typed data template class ARROW_EXPORT TypedBufferBuilder : public BufferBuilder { public: diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 180b43a220f30..d0016674215fc 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -118,7 +118,8 @@ class ARROW_EXPORT ArrayBuilder { virtual Status FinishInternal(std::shared_ptr* out) = 0; /// \brief Return result of builder as an Array object. - /// Resets the builder except for DictionaryBuilder + /// + /// The builder is reset except for DictionaryBuilder. /// /// \param[out] out the finalized Array object /// \return Status diff --git a/cpp/src/arrow/memory_pool.h b/cpp/src/arrow/memory_pool.h index 49cd4c7efc3ed..8499b6f35d400 100644 --- a/cpp/src/arrow/memory_pool.h +++ b/cpp/src/arrow/memory_pool.h @@ -142,6 +142,7 @@ class ARROW_EXPORT ProxyMemoryPool : public MemoryPool { std::unique_ptr impl_; }; +/// Return the process-wide default memory pool. 
ARROW_EXPORT MemoryPool* default_memory_pool(); #ifdef ARROW_NO_DEFAULT_MEMORY_POOL diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index 9c478485b243c..6b5733252879b 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -85,7 +85,12 @@ class ARROW_EXPORT ChunkedArray { std::shared_ptr type() const { return type_; } + /// \brief Determine if two chunked arrays are equal. + /// + /// Two chunked arrays can be equal only if they have equal datatypes. + /// However, they may be equal even if they have different chunkings. bool Equals(const ChunkedArray& other) const; + /// \brief Determine if two chunked arrays are equal. bool Equals(const std::shared_ptr& other) const; protected: @@ -103,13 +108,26 @@ class ARROW_EXPORT ChunkedArray { /// metadata) and a chunked data array class ARROW_EXPORT Column { public: + /// \brief Construct a column from a vector of arrays + /// + /// The array chunks' datatype must match the field's datatype. Column(const std::shared_ptr& field, const ArrayVector& chunks); + /// \brief Construct a column from a chunked array + /// + /// The chunked array's datatype must match the field's datatype. Column(const std::shared_ptr& field, const std::shared_ptr& data); - + /// \brief Construct a column from a single array + /// + /// The array's datatype must match the field's datatype. Column(const std::shared_ptr& field, const std::shared_ptr& data); - // Construct from name and array + /// \brief Construct a column from a name and an array + /// + /// A field with the given name and the array's datatype is automatically created. Column(const std::string& name, const std::shared_ptr& data); + /// \brief Construct a column from a name and a chunked array + /// + /// A field with the given name and the array's datatype is automatically created. Column(const std::string& name, const std::shared_ptr& data); int64_t length() const { return data_->length(); } @@ -154,7 +172,12 @@ class ARROW_EXPORT Column { /// \param[out] out The resulting vector of arrays Status Flatten(MemoryPool* pool, std::vector>* out) const; + /// \brief Determine if two columns are equal. + /// + /// Two columns can be equal only if they have equal datatypes. + /// However, they may be equal even if they have different chunkings. bool Equals(const Column& other) const; + /// \brief Determine if the two columns are equal. bool Equals(const std::shared_ptr& other) const; /// \brief Verify that the column's array data is consistent with the passed @@ -214,11 +237,10 @@ class ARROW_EXPORT Table { const std::vector>& batches, std::shared_ptr
* table); - /// \return the table's schema + /// Return the table schema std::shared_ptr schema() const { return schema_; } - /// \param[in] i column index, does not boundscheck - /// \return the i-th column + /// Return a column by index virtual std::shared_ptr column(int i) const = 0; /// \brief Remove column from the table, producing a new Table @@ -250,13 +272,16 @@ class ARROW_EXPORT Table { /// \brief Perform any checks to validate the input arguments virtual Status Validate() const = 0; - /// \return the number of columns in the table + /// \brief Return the number of columns in the table int num_columns() const { return schema_->num_fields(); } - /// \return the number of rows (the corresponding length of each column) + /// \brief Return the number of rows (equal to each column's logical length) int64_t num_rows() const { return num_rows_; } - /// \brief Determine if semantic contents of tables are exactly equal + /// \brief Determine if tables are equal + /// + /// Two tables can be equal only if they have equal schemas. + /// However, they may be equal even if they have different chunkings. bool Equals(const Table& other) const; protected: @@ -269,18 +294,25 @@ class ARROW_EXPORT Table { ARROW_DISALLOW_COPY_AND_ASSIGN(Table); }; -/// \brief Compute a sequence of record batches from a (possibly chunked) Table +/// \brief Compute a stream of record batches from a (possibly chunked) Table +/// +/// The conversion is zero-copy: each record batch is a view over a slice +/// of the table's columns. class ARROW_EXPORT TableBatchReader : public RecordBatchReader { public: ~TableBatchReader() override; - /// \brief Read batches with the maximum possible size + /// \brief Construct a TableBatchReader for the given table explicit TableBatchReader(const Table& table); std::shared_ptr schema() const override; Status ReadNext(std::shared_ptr* out) override; + /// \brief Set the desired maximum chunk size of record batches + /// + /// The actual chunk size of each record batch may be smaller, depending + /// on actual chunking characteristics of each table column. void set_chunksize(int64_t chunksize); private: @@ -289,7 +321,10 @@ class ARROW_EXPORT TableBatchReader : public RecordBatchReader { }; /// \brief Construct table from multiple input tables. -/// \return Status, fails if any schemas are different +/// +/// The tables are concatenated vertically. Therefore, all tables should +/// have the same schema. Each column in the output table is the result +/// of concatenating the corresponding columns in all input tables. ARROW_EXPORT Status ConcatenateTables(const std::vector>& tables, std::shared_ptr
* table); diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index f187817b53f28..9694202b9705c 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -150,11 +150,12 @@ class ARROW_EXPORT DataType { explicit DataType(Type::type id) : id_(id) {} virtual ~DataType(); - // Return whether the types are equal - // - // Types that are logically convertible from one to another (e.g. List - // and Binary) are NOT equal. + /// \brief Return whether the types are equal + /// + /// Types that are logically convertible from one to another (e.g. List + /// and Binary) are NOT equal. virtual bool Equals(const DataType& other) const; + /// \brief Return whether the types are equal bool Equals(const std::shared_ptr& other) const; std::shared_ptr child(int i) const { return children_[i]; } @@ -174,6 +175,7 @@ class ARROW_EXPORT DataType { /// \since 0.7.0 virtual std::string name() const = 0; + /// \brief Return the type category Type::type id() const { return id_; } protected: @@ -248,12 +250,16 @@ class ARROW_EXPORT Field { const std::shared_ptr& metadata = NULLPTR) : name_(name), type_(type), nullable_(nullable), metadata_(metadata) {} + /// \brief Return the field's attached metadata std::shared_ptr metadata() const { return metadata_; } + /// \brief Return whether the field has non-empty metadata bool HasMetadata() const; + /// \brief Return a copy of this field with the given metadata attached to it std::shared_ptr AddMetadata( const std::shared_ptr& metadata) const; + /// \brief Return a copy of this field without any metadata attached to it std::shared_ptr RemoveMetadata() const; std::vector> Flatten() const; @@ -261,10 +267,14 @@ class ARROW_EXPORT Field { bool Equals(const Field& other) const; bool Equals(const std::shared_ptr& other) const; + /// \brief Return a string representation ot the field std::string ToString() const; + /// \brief Return the field name const std::string& name() const { return name_; } + /// \brief Return the field data type std::shared_ptr type() const { return type_; } + /// \brief Return whether the field is nullable bool nullable() const { return nullable_; } private: @@ -896,6 +906,11 @@ dictionary(const std::shared_ptr& index_type, /// @} +/// \defgroup schema-factories Factory functions for fields and schemas +/// +/// Factory functions for fields and schemas +/// @{ + /// \brief Create a Field instance /// /// \param name the field name @@ -926,6 +941,8 @@ std::shared_ptr schema( std::vector>&& fields, const std::shared_ptr& metadata = NULLPTR); +/// @} + } // namespace arrow #endif // ARROW_TYPE_H diff --git a/docs/source/cpp/api.rst b/docs/source/cpp/api.rst index 02aa4d62e3b31..f6c0418b5c10d 100644 --- a/docs/source/cpp/api.rst +++ b/docs/source/cpp/api.rst @@ -27,3 +27,4 @@ API Reference api/datatype api/array api/builder + api/table diff --git a/docs/source/cpp/api/datatype.rst b/docs/source/cpp/api/datatype.rst index ee7844277df27..adfc6e4171e66 100644 --- a/docs/source/cpp/api/datatype.rst +++ b/docs/source/cpp/api/datatype.rst @@ -133,3 +133,16 @@ Dictionary-encoded .. doxygenclass:: arrow::DictionaryType :members: + +Fields and Schemas +================== + +.. doxygengroup:: schema-factories + :project: arrow_cpp + :content-only: + +.. doxygenclass:: arrow::Field + :members: + +.. 
doxygenclass:: arrow::Schema + :members: diff --git a/docs/source/cpp/api/memory.rst b/docs/source/cpp/api/memory.rst index 1dc8e706d3e8d..c921229e6cb17 100644 --- a/docs/source/cpp/api/memory.rst +++ b/docs/source/cpp/api/memory.rst @@ -33,16 +33,11 @@ Buffers :project: arrow_cpp :members: -.. doxygenclass:: arrow::BufferBuilder - :project: arrow_cpp - :members: - Memory Pools ------------ .. doxygenfunction:: arrow::default_memory_pool :project: arrow_cpp - :outline: .. doxygenclass:: arrow::MemoryPool :project: arrow_cpp @@ -55,3 +50,41 @@ Memory Pools .. doxygenclass:: arrow::ProxyMemoryPool :project: arrow_cpp :members: + +Allocation Functions +-------------------- + +These functions allocate a buffer from a particular memory pool. + +.. doxygengroup:: buffer-allocation-functions + :project: arrow_cpp + :content-only: + +Slicing +------- + +.. doxygengroup:: buffer-slicing-functions + :project: arrow_cpp + :content-only: + +Buffer Builders +--------------- + +.. doxygenclass:: arrow::BufferBuilder + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::TypedBufferBuilder + :project: arrow_cpp + :members: + +STL Integration +--------------- + +.. doxygenclass:: arrow::stl_allocator + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::STLMemoryPool + :project: arrow_cpp + :members: diff --git a/docs/source/cpp/api/table.rst b/docs/source/cpp/api/table.rst new file mode 100644 index 0000000000000..e8b4f8e066e30 --- /dev/null +++ b/docs/source/cpp/api/table.rst @@ -0,0 +1,52 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +======================== +Two-dimensional Datasets +======================== + +Columns +======= + +.. doxygenclass:: arrow::Column + :project: arrow_cpp + :members: + +Tables +====== + +.. doxygenclass:: arrow::Table + :project: arrow_cpp + :members: + +.. doxygenfunction:: arrow::ConcatenateTables + :project: arrow_cpp + +Record Batches +============== + +.. doxygenclass:: arrow::RecordBatch + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::RecordBatchReader + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::TableBatchReader + :project: arrow_cpp + :members: diff --git a/docs/source/cpp/getting_started.rst b/docs/source/cpp/getting_started.rst index 8201c2ded0d92..7c55b76912d1b 100644 --- a/docs/source/cpp/getting_started.rst +++ b/docs/source/cpp/getting_started.rst @@ -25,6 +25,7 @@ Getting Started overview conventions + memory arrays datatypes - + tables diff --git a/docs/source/cpp/index.rst b/docs/source/cpp/index.rst index 8c7ced0c2e7b8..63290be9ecb42 100644 --- a/docs/source/cpp/index.rst +++ b/docs/source/cpp/index.rst @@ -23,3 +23,9 @@ C++ Implementation getting_started api + +.. TODO add "topics" chapter +.. - nested arrays +.. - dictionary encoding + +.. 
TODO add "building" or "development" chapter diff --git a/docs/source/cpp/memory.rst b/docs/source/cpp/memory.rst new file mode 100644 index 0000000000000..23b4725e4b971 --- /dev/null +++ b/docs/source/cpp/memory.rst @@ -0,0 +1,127 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +================= +Memory Management +================= + +Buffers +======= + +To avoid passing around raw data pointers with varying and non-obvious +lifetime rules, Arrow provides a generic abstraction called :class:`arrow::Buffer`. +A Buffer encapsulates a pointer and data size, and generally also ties its +lifetime to that of an underlying provider (in other words, a Buffer should +*always* point to valid memory till its destruction). Buffers are untyped: +they simply denote a physical memory area regardless of its intended meaning +or interpretation. + +Buffers may be allocated by Arrow itself , or by third-party routines. +For example, it is possible to pass the data of a Python bytestring as a Arrow +buffer, keeping the Python object alive as necessary. + +In addition, buffers come in various flavours: mutable or not, resizable or +not. Generally, you will hold a mutable buffer when building up a piece +of data, then it will be frozen as an immutable container such as an +:doc:`array `. + +.. note:: + Some buffers may point to non-CPU memory, such as GPU-backed memory + provided by a CUDA context. If you're writing a GPU-aware application, + you will need to be careful not to interpret a GPU memory pointer as + a CPU-reachable pointer, or vice-versa. + +Accessing Buffer Memory +----------------------- + +Buffers provide fast access to the underlying memory using the +:func:`~arrow::Buffer::size` and :func:`~arrow::Buffer::data` accessors +(or :func:`~arrow::Buffer::mutable_data` for writable access to a mutable +buffer). + +Slicing +------- + +It is possible to make zero-copy slices of buffers, to obtain a buffer +referring to some contiguous subset of the underlying data. This is done +by calling the :func:`arrow::SliceBuffer` and :func:`arrow::SliceMutableBuffer` +functions. + +Allocating a Buffer +------------------- + +You can allocate a buffer yourself by calling one of the +:func:`arrow::AllocateBuffer` or :func:`arrow::AllocateResizableBuffer` +overloads:: + + std::shared_ptr buffer; + + if (!arrow::AllocateBuffer(4096, &buffer).ok()) { + // ... handle allocation error + } + uint8_t* buffer_data = buffer->mutable_data(); + memcpy(buffer_data, "hello world", 11); + +Allocating a buffer this way ensures it is 64-bytes aligned and padded +as recommended by the :doc:`Arrow memory specification <../format/Layout>`. 
+ +Building a Buffer +----------------- + +You can also allocate *and* build a Buffer incrementally, using the +:class:`arrow::BufferBuilder` API:: + + BufferBuilder builder; + builder.Resize(11); + builder.Append("hello ", 6); + builder.Append("world", 5); + + std::shared_ptr buffer; + if (!builder.Finish(&buffer).ok()) { + // ... handle buffer allocation error + } + +Memory Pools +============ + +When allocating a Buffer using the Arrow C++ API, the buffer's underlying +memory is allocated by a :class:`arrow::MemoryPool` instance. Usually this +will be the process-wide *default memory pool*, but many Arrow APIs allow +you to pass another MemoryPool instance for their internal allocations. + +Memory pools are used for large long-lived data such as array buffers. +Other data, such as small C++ objects and temporary workspaces, usually +goes through the regular C++ allocators. + +Default Memory Pool +------------------- + +Depending on how Arrow was compiled, the default memory pool may use the +standard C ``malloc`` allocator, or a `jemalloc `_ heap. + +STL Integration +--------------- + +If you wish to use a Arrow memory pool to allocate the data of STL containers, +you can do so using the :class:`arrow::stl_allocator` wrapper. + +Conversely, you can also use a STL allocator to allocate Arrow memory, +using the :class:`arrow::STLMemoryPool` class. However, this may be less +performant, as STL allocators don't provide a resizing operation. diff --git a/docs/source/cpp/tables.rst b/docs/source/cpp/tables.rst new file mode 100644 index 0000000000000..d42f0c6c4f53e --- /dev/null +++ b/docs/source/cpp/tables.rst @@ -0,0 +1,87 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +======================== +Two-dimensional Datasets +======================== + +While arrays and chunked arrays represent a one-dimensional sequence of +homogenous values, data often comes in the form of two-dimensional sets of +heterogenous data (such as database tables, CSV files...). Arrow provides +several abstractions to handle such data conveniently and efficiently. + +Fields +====== + +Fields are used to denote the particular columns of a table (and also +the particular members of a nested data type such as :class:`arrow::StructType`). +A field, i.e. an instance of :class:`arrow::Field`, holds together a data +type, a field name and some optional metadata. + +The recommended way to create a field is to call the :func:`arrow::field` +factory function. + +Schemas +======= + +A schema describes the overall structure of a two-dimensional dataset such +as a table. It holds a sequence of fields together with some optional +schema-wide metadata (in addition to per-field metadata). 
The recommended +way to create a schema is to call one the :func:`arrow::schema` factory +function overloads:: + + // Create a schema describing datasets with two columns: + // a int32 column "A" and a utf8-encoded string column "B" + std::shared_ptr field_a, field_b; + std::shared_ptr schema; + + field_a = arrow::field("A", arrow::int32()); + field_b = arrow::field("B", arrow::utf8()); + schema = arrow::schema({field_a, field_b}); + +Columns +======= + +A :class:`arrow::Column` is a chunked array tied together with a field. +The field describes the column's name (for lookup in a larger dataset) +and its metadata. + +Tables +====== + +A :class:`arrow::Table` is a two-dimensional dataset of a number of columns, +together with a schema. The columns' names and types must match the schema. +Also, each column must have the same logical length in number of elements +(although each column can be chunked in a different way). + +Record Batches +============== + +A :class:`arrow::RecordBatch` is a two-dimensional dataset of a number of +contiguous arrays, each the same length. Like a table, a record batch also +has a schema which must match its arrays' datatypes. + +Record batches are a convenient unit of work for various serialization +and computation functions, possibly incremental. + +A table can be streamed as an arbitrary number of record batches using +a :class:`arrow::TableBatchReader`. Conversely, a logical sequence of +record batches can be assembled to form a table using one of the +:func:`arrow::Table::FromRecordBatches` factory function overloads. From 7ddfba6693db99ec8ea38b6fd244c5d6e2af3295 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Wed, 12 Dec 2018 19:54:45 +0100 Subject: [PATCH 031/328] ARROW-3470: [C++] Fix row-wise example MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Implement the `ADD_EXAMPLE` cmake function with new ctest label `example`, also covered by the `runexample` target. This can be toggled via the `ARROW_BUILD_EXAMPLES` option which is ON by default. - Implement fully working `row-wise-conversion-example.cc` and add it to the default build. - Update documentation to embed (manually) the newly created example. 
Author: François Saint-Jacques Closes #3078 from fsaintjacques/ARROW-3470-out-of-date-example and squashes the following commits: fab63f6f ARROW-3470: Fix status macro 1eba067d ARROW-3470: Fix row-wise example --- ci/appveyor-cpp-build.bat | 3 + ci/appveyor-cpp-test-cmake-script.bat | 8 + ci/cpp-msvc-build-main.bat | 1 + ci/travis_before_script_cpp.sh | 1 + cpp/CMakeLists.txt | 15 ++ cpp/apidoc/tutorials/row_wise_conversion.md | 194 ------------------ cpp/cmake_modules/BuildUtils.cmake | 60 ++++++ cpp/examples/arrow/CMakeLists.txt | 18 ++ .../arrow/row-wise-conversion-example.cc | 190 +++++++++++++++++ cpp/src/arrow/status.h | 4 +- docs/source/cpp/examples.rst | 30 +++ docs/source/cpp/index.rst | 1 + 12 files changed, 329 insertions(+), 196 deletions(-) delete mode 100644 cpp/apidoc/tutorials/row_wise_conversion.md create mode 100644 cpp/examples/arrow/CMakeLists.txt create mode 100644 cpp/examples/arrow/row-wise-conversion-example.cc create mode 100644 docs/source/cpp/examples.rst diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index d20a0214f532c..387dd55d18545 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -35,6 +35,7 @@ if "%JOB%" == "Static_Crt_Build" ( -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=Debug ^ -DARROW_TEST_LINKAGE=static ^ -DARROW_CXXFLAGS="/MP" ^ @@ -53,6 +54,7 @@ if "%JOB%" == "Static_Crt_Build" ( -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=Release ^ -DARROW_TEST_LINKAGE=static ^ -DCMAKE_CXX_FLAGS_RELEASE="/MT %CMAKE_CXX_FLAGS_RELEASE%" ^ @@ -79,6 +81,7 @@ if "%JOB%" == "Build_Debug" ( -DARROW_VERBOSE_THIRDPARTY_BUILD=OFF ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_BUILD_STATIC=OFF ^ -DARROW_CXXFLAGS="/MP" ^ diff --git a/ci/appveyor-cpp-test-cmake-script.bat b/ci/appveyor-cpp-test-cmake-script.bat index 8158a44260235..415406c4ac366 100644 --- a/ci/appveyor-cpp-test-cmake-script.bat +++ b/ci/appveyor-cpp-test-cmake-script.bat @@ -33,6 +33,7 @@ set FLATBUFFERS_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -51,6 +52,7 @@ set GFLAGS_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -69,6 +71,7 @@ set SNAPPY_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -87,6 +90,7 @@ set ZLIB_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -105,6 +109,7 @@ set BROTLI_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. 
>nul 2>error.txt @@ -123,6 +128,7 @@ set LZ4_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -141,6 +147,7 @@ set ZSTD_HOME=WrongPath cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. >nul 2>error.txt @@ -166,6 +173,7 @@ set ARROW_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/MP" ^ .. 2>output.txt diff --git a/ci/cpp-msvc-build-main.bat b/ci/cpp-msvc-build-main.bat index 560f5045af658..644170775d568 100644 --- a/ci/cpp-msvc-build-main.bat +++ b/ci/cpp-msvc-build-main.bat @@ -49,6 +49,7 @@ cmake -G "%GENERATOR%" %CMAKE_ARGS% ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_BUILD_STATIC=OFF ^ -DARROW_BUILD_TESTS=ON ^ + -DARROW_BUILD_EXAMPLES=ON ^ -DARROW_CXXFLAGS="%ARROW_CXXFLAGS%" ^ -DCMAKE_CXX_FLAGS_RELEASE="/MD %CMAKE_CXX_FLAGS_RELEASE%" ^ -DARROW_PARQUET=ON ^ diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 6465f28008006..a77fcd8749de5 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -67,6 +67,7 @@ else $CMAKE_COMMON_FLAGS \ -DARROW_BUILD_BENCHMARKS=ON \ -DARROW_BUILD_TESTS=ON \ +-DARROW_BUILD_EXAMPLES=ON \ -DARROW_BUILD_UTILITIES=OFF \ -DARROW_INSTALL_NAME_RPATH=OFF" fi diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 35707de574648..a83b9dd6d9409 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -122,6 +122,10 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Build the Arrow micro benchmarks, default OFF" OFF) + option(ARROW_BUILD_EXAMPLES + "Build the Arrow examples, default OFF" + OFF) + set(ARROW_TEST_LINKAGE "shared" CACHE STRING "Linkage of Arrow libraries with unit tests executables. 
\ static|shared (default shared)") @@ -447,6 +451,10 @@ if(NOT ARROW_BUILD_BENCHMARKS) set(NO_BENCHMARKS 1) endif() +if(NOT ARROW_BUILD_EXAMPLES) + set(NO_EXAMPLES 1) +endif() + if (NOT ARROW_FUZZING) set(NO_FUZZING 1) endif() @@ -735,12 +743,14 @@ pass ARROW_BUILD_SHARED=on") endif() # Use shared linking for unit tests if it's available set(ARROW_TEST_LINK_LIBS ${ARROW_TEST_SHARED_LINK_LIBS}) + set(ARROW_EXAMPLE_LINK_LIBS arrow_shared) else() if (NOT ARROW_BUILD_STATIC) message(FATAL_ERROR "If using static linkage for unit tests, must also \ pass ARROW_BUILD_STATIC=on") endif() set(ARROW_TEST_LINK_LIBS ${ARROW_TEST_STATIC_LINK_LIBS}) + set(ARROW_EXAMPLE_LINK_LIBS arrow_static) endif() if (ARROW_BUILD_BENCHMARKS) @@ -805,6 +815,11 @@ if(ARROW_GANDIVA) add_subdirectory(src/gandiva) endif() +if(ARROW_BUILD_EXAMPLES) + add_custom_target(runexample ctest -L example) + add_subdirectory(examples/arrow) +endif() + include(CMakePackageConfigHelpers) # Makes the project importable from the build directory diff --git a/cpp/apidoc/tutorials/row_wise_conversion.md b/cpp/apidoc/tutorials/row_wise_conversion.md deleted file mode 100644 index 750a923c7846b..0000000000000 --- a/cpp/apidoc/tutorials/row_wise_conversion.md +++ /dev/null @@ -1,194 +0,0 @@ - - -Convert a vector of row-wise data into an Arrow table -===================================================== - -While we want to use columnar data structures to build efficient operations, we -often receive data in a row-wise fashion from other systems. In the following, -we want give a brief introduction into the classes provided by Apache Arrow by -showing how to transform row-wise data into a columnar table. - -The data in this example is stored in the following struct: - -``` -struct data_row { - int64_t id; - double cost; - std::vector cost_components; -}; - -std::vector rows; -``` - -The final representation should be an `arrow::Table` which in turn is made up of -an `arrow::Schema` and a list of `arrow::Column`. An `arrow::Column` is again a -named collection of one or more `arrow::Array` instances. As the first step, we -will iterate over the data and build up the arrays incrementally. For this task, -we provide `arrow::ArrayBuilder` classes that help in the construction of the -final `arrow::Array` instances. - -For each type, Arrow has a specially typed builder class. For the primitive -values `id` and `cost` we can use the respective `arrow::Int64Builder` and -`arrow::DoubleBuilder`. For the `cost_components` vector, we need to have two -builders, a top-level `arrow::ListBuilder` that builds the array of offsets and -a nested `arrow::DoubleBuilder` that constructs the underlying values array that -is referenced by the offsets in the former array. - -``` -// The builders are more efficient using -// arrow::jemalloc::MemoryPool::default_pool() as this can increase the size of -// the underlying memory regions in-place. At the moment, arrow::jemalloc is only -// supported on Unix systems, not Windows. - -using arrow::DoubleBuilder; -using arrow::Int64Builder; -using arrow::ListBuilder; - -arrow::MemoryPool* pool = arrow::default_memory_pool(); -Int64Builder id_builder(pool); -DoubleBuilder cost_builder(pool); -std::unique_ptr components_values_builder(new DoubleBuilder(pool)); -ListBuilder components_builder(pool, std::move(components_values_builder)); -``` - -Now we can loop over our existing data and insert it into the builders. The -`Append` calls here may fail (e.g. we cannot allocate enough additional memory). 
-Thus we need to check their return values. For more information on these values, -check the documentation about `arrow::Status`. - -``` -for (const data_row& row : rows) { - ARROW_RETURN_NOT_OK(id_builder.Append(row.id)); - ARROW_RETURN_NOT_OK(cost_builder.Append(row.cost)); - - // Indicate the start of a new list row. This will memorise the current - // offset in the values builder. - ARROW_RETURN_NOT_OK(components_builder.Append()); - // Store the actual values. The final nullptr argument tells the underyling - // builder that all added values are valid, i.e. non-null. - ARROW_RETURN_NOT_OK(components_values_builder->Append( - row.cost_components.data(), row.cost_components.size(), - nullptr); -} -``` - -At the end, we finalise the arrays, declare the (type) schema and combine them - into a single `arrow::Table`: - -``` -std::shared_ptr id_array; -ARROW_RETURN_NOT_OK(id_builder.Finish(&id_array)); -std::shared_ptr cost_array; -ARROW_RETURN_NOT_OK(cost_builder.Finish(&cost_array)); -std::shared_ptr cost_components_array; -ARROW_RETURN_NOT_OK(components_builder.Finish(&cost_components_array)); - -std::vector> schema_vector = { - arrow::field("id", arrow::int64()), - arrow::field("cost", arrow::float64()), - arrow::field("cost_components", arrow::list(arrow::float64())) -}; -auto schema = std::make_shared(schema_vector); - -std::shared_ptr table = arrow::Table::Make(schema, - {id_array, cost_array, cost_components_array}); -``` - -The final `table` variable is the one we then can pass on to other functions -that can consume Apache Arrow memory structures. This object has ownership of -all referenced data, thus we don't have to care about undefined references once -we leave the scope of the function building the table and its underlying arrays. - - - -Converting an Arrow Table back into row-wise representation -=========================================================== - -To convert an Arrow table back into the same row-wise representation as in the -above section, we first will check that the table conforms to our expected -schema and then will build up the vector of rows incrementally. - -For the check if the table is as expected, we can utilise solely its schema. - -``` -// This is our input that was passed in from the outside. -std::shared_ptr table; - -std::vector> schema_vector = { - arrow::field("id", arrow::int64()), - arrow::field("cost", arrow::float64()), - arrow::field("cost_components", arrow::list(arrow::float64())) -}; -auto expected_schema = std::make_shared(schema_vector); - -if (!expected_schema->Equals(*table->schema())) { - // The table doesn't have the expected schema thus we cannot directly - // convert it to our target representation. - // TODO: Implement your custom error handling logic here. -} -``` - -As we have ensured that the table has the expected structure, we can unpack the -underlying arrays. For the primitive columns `id` and `cost` we can use the high -level functions to get the values whereas for the nested column -`cost_components` we need to access the C-pointer to the data to copy its -contents into the resulting `std::vector`. Here we need to be care to -also add the offset to the pointer. This offset is needed to enable zero-copy -slicing operations. While this could be adjusted automatically for double -arrays, this cannot be done for the accompanying bitmap as often the slicing -border would be inside a byte. - -``` -// For simplicity, we assume that all arrays consist of a single chunk here. 
-// In a productive implementation this should either be explicitly check or code -// added that can treat chunked arrays. - -auto ids = std::static_pointer_cast( - table->column(0)->data()->chunk(0)); -auto costs = std::static_pointer_castcolumn(1)->data()->chunk(0)); -auto cost_components = std::static_pointer_castcolumn(2)->data()->chunk(0)); -auto cost_components_values = std::static_pointer_cast( - cost_components->values()); -// To enable zero-copy slices, the native values pointer might need to account -// for this slicing offset. This is not needed for the higher level functions -// like Value(…) that already account for this offset internally. -const double* cost_components_values_ptr = cost_components_values->data() - + cost_components_values->offset(); -``` - -After we have unpacked the arrays from the table, we can iterate over them in a -row-wise fashion and fill our target, row-wise representation. - -``` -std::vector rows; - -for (int64_t i = 0; i < table->num_rows(); i++) { - // Another simplification in this example is that we assume that there are - // no null entries, e.g. each row is fill with valid values. - int64_t id = ids->Value(i); - double cost = costs->Value(i); - const double* first = cost_components_values_ptr + cost_components->value_offset(i); - const double* last = cost_components_values_ptr + cost_components->value_offset(i + 1); - std::vector components_vec(first, last); - rows.push_back({id, cost, components_vec}); -} -``` diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index bcf672823b424..d5978e1d215ff 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -480,6 +480,66 @@ function(ADD_TEST_CASE REL_TEST_NAME) LABELS ${ARG_LABELS}) endfunction() +############################################################ +# Examples +############################################################ +# Add a new example, with or without an executable that should be built. +# If examples are enabled then they will be run along side unit tests with ctest. +# 'make runexample' to build/run only examples. +# +# REL_EXAMPLE_NAME is the name of the example app. It may be a single component +# (e.g. monotime-example) or contain additional components (e.g. +# net/net_util-example). Either way, the last component must be a globally +# unique name. + +# The example will registered as unit test with ctest with a label +# of 'example'. +# +# Arguments after the test name will be passed to set_tests_properties(). +# +# \arg PREFIX a string to append to the name of the example executable. For +# example, if you have src/arrow/foo/bar-example.cc, then PREFIX "foo" will +# create test executable foo-bar-example +function(ADD_ARROW_EXAMPLE REL_EXAMPLE_NAME) + set(options) + set(one_value_args) + set(multi_value_args EXTRA_LINK_LIBS DEPENDENCIES PREFIX) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + if(ARG_UNPARSED_ARGUMENTS) + message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") + endif() + + if(NO_EXAMPLES) + return() + endif() + get_filename_component(EXAMPLE_NAME ${REL_EXAMPLE_NAME} NAME_WE) + + if(ARG_PREFIX) + set(EXAMPLE_NAME "${ARG_PREFIX}-${EXAMPLE_NAME}") + endif() + + if(EXISTS ${CMAKE_SOURCE_DIR}/examples/arrow/${REL_EXAMPLE_NAME}.cc) + # This example has a corresponding .cc file, set it up as an executable. 
+ set(EXAMPLE_PATH "${EXECUTABLE_OUTPUT_PATH}/${EXAMPLE_NAME}") + add_executable(${EXAMPLE_NAME} "${REL_EXAMPLE_NAME}.cc") + target_link_libraries(${EXAMPLE_NAME} ${ARROW_EXAMPLE_LINK_LIBS}) + add_dependencies(runexample ${EXAMPLE_NAME}) + set(NO_COLOR "--color_print=false") + + if (ARG_EXTRA_LINK_LIBS) + target_link_libraries(${EXAMPLE_NAME} ${ARG_EXTRA_LINK_LIBS}) + endif() + endif() + + if (ARG_DEPENDENCIES) + add_dependencies(${EXAMPLE_NAME} ${ARG_DEPENDENCIES}) + endif() + + + add_test(${EXAMPLE_NAME} ${EXAMPLE_PATH}) + set_tests_properties(${EXAMPLE_NAME} PROPERTIES LABELS "example") +endfunction() + ############################################################ # Fuzzing ############################################################ diff --git a/cpp/examples/arrow/CMakeLists.txt b/cpp/examples/arrow/CMakeLists.txt new file mode 100644 index 0000000000000..6ecb537ad9787 --- /dev/null +++ b/cpp/examples/arrow/CMakeLists.txt @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ADD_ARROW_EXAMPLE(row-wise-conversion-example) diff --git a/cpp/examples/arrow/row-wise-conversion-example.cc b/cpp/examples/arrow/row-wise-conversion-example.cc new file mode 100644 index 0000000000000..db8c28753dbe6 --- /dev/null +++ b/cpp/examples/arrow/row-wise-conversion-example.cc @@ -0,0 +1,190 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include + +using arrow::DoubleBuilder; +using arrow::Int64Builder; +using arrow::ListBuilder; + +// While we want to use columnar data structures to build efficient operations, we +// often receive data in a row-wise fashion from other systems. In the following, +// we want give a brief introduction into the classes provided by Apache Arrow by +// showing how to transform row-wise data into a columnar table. +// +// The data in this example is stored in the following struct: +struct data_row { + int64_t id; + double cost; + std::vector cost_components; +}; + +// Transforming a vector of structs into a columnar Table. 
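// For illustration only, a small input could look like the rows that main()
// below constructs (this is a sketch of the data shape, not additional API):
//
//   std::vector<data_row> rows = {
//       {1, 1.0, {1.0}}, {2, 2.0, {1.0, 2.0}}, {3, 3.0, {1.0, 2.0, 3.0}}};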
+// +// The final representation should be an `arrow::Table` which in turn is made up of +// an `arrow::Schema` and a list of `arrow::Column`. An `arrow::Column` is again a +// named collection of one or more `arrow::Array` instances. As the first step, we +// will iterate over the data and build up the arrays incrementally. For this task, +// we provide `arrow::ArrayBuilder` classes that help in the construction of the +// final `arrow::Array` instances. +// +// For each type, Arrow has a specially typed builder class. For the primitive +// values `id` and `cost` we can use the respective `arrow::Int64Builder` and +// `arrow::DoubleBuilder`. For the `cost_components` vector, we need to have two +// builders, a top-level `arrow::ListBuilder` that builds the array of offsets and +// a nested `arrow::DoubleBuilder` that constructs the underlying values array that +// is referenced by the offsets in the former array. +arrow::Status VectorToColumnarTable(const std::vector& rows, + std::shared_ptr* table) { + // The builders are more efficient using + // arrow::jemalloc::MemoryPool::default_pool() as this can increase the size of + // the underlying memory regions in-place. At the moment, arrow::jemalloc is only + // supported on Unix systems, not Windows. + arrow::MemoryPool* pool = arrow::default_memory_pool(); + + Int64Builder id_builder(pool); + DoubleBuilder cost_builder(pool); + ListBuilder components_builder(pool, std::make_shared(pool)); + // The following builder is owned by components_builder. + DoubleBuilder& cost_components_builder = + *(static_cast(components_builder.value_builder())); + + // Now we can loop over our existing data and insert it into the builders. The + // `Append` calls here may fail (e.g. we cannot allocate enough additional memory). + // Thus we need to check their return values. For more information on these values, + // check the documentation about `arrow::Status`. + for (const data_row& row : rows) { + ARROW_RETURN_NOT_OK(id_builder.Append(row.id)); + ARROW_RETURN_NOT_OK(cost_builder.Append(row.cost)); + + // Indicate the start of a new list row. This will memorise the current + // offset in the values builder. + ARROW_RETURN_NOT_OK(components_builder.Append()); + // Store the actual values. The final nullptr argument tells the underyling + // builder that all added values are valid, i.e. non-null. + ARROW_RETURN_NOT_OK(cost_components_builder.AppendValues(row.cost_components.data(), + row.cost_components.size())); + } + + // At the end, we finalise the arrays, declare the (type) schema and combine them + // into a single `arrow::Table`: + std::shared_ptr id_array; + ARROW_RETURN_NOT_OK(id_builder.Finish(&id_array)); + std::shared_ptr cost_array; + ARROW_RETURN_NOT_OK(cost_builder.Finish(&cost_array)); + // No need to invoke cost_components_builder.Finish because it is implied by + // the parent builder's Finish invocation. + std::shared_ptr cost_components_array; + ARROW_RETURN_NOT_OK(components_builder.Finish(&cost_components_array)); + + std::vector> schema_vector = { + arrow::field("id", arrow::int64()), arrow::field("cost", arrow::float64()), + arrow::field("cost_components", arrow::list(arrow::float64()))}; + + auto schema = std::make_shared(schema_vector); + + // The final `table` variable is the one we then can pass on to other functions + // that can consume Apache Arrow memory structures. 
This object has ownership of + // all referenced data, thus we don't have to care about undefined references once + // we leave the scope of the function building the table and its underlying arrays. + *table = arrow::Table::Make(schema, {id_array, cost_array, cost_components_array}); + + return arrow::Status::OK(); +} + +arrow::Status ColumnarTableToVector(const std::shared_ptr& table, + std::vector* rows) { + // To convert an Arrow table back into the same row-wise representation as in the + // above section, we first will check that the table conforms to our expected + // schema and then will build up the vector of rows incrementally. + // + // For the check if the table is as expected, we can utilise solely its schema. + std::vector> schema_vector = { + arrow::field("id", arrow::int64()), arrow::field("cost", arrow::float64()), + arrow::field("cost_components", arrow::list(arrow::float64()))}; + auto expected_schema = std::make_shared(schema_vector); + + if (!expected_schema->Equals(*table->schema())) { + // The table doesn't have the expected schema thus we cannot directly + // convert it to our target representation. + return arrow::Status::Invalid("Schemas are not matching!"); + } + + // As we have ensured that the table has the expected structure, we can unpack the + // underlying arrays. For the primitive columns `id` and `cost` we can use the high + // level functions to get the values whereas for the nested column + // `cost_components` we need to access the C-pointer to the data to copy its + // contents into the resulting `std::vector`. Here we need to be care to + // also add the offset to the pointer. This offset is needed to enable zero-copy + // slicing operations. While this could be adjusted automatically for double + // arrays, this cannot be done for the accompanying bitmap as often the slicing + // border would be inside a byte. + + auto ids = + std::static_pointer_cast(table->column(0)->data()->chunk(0)); + auto costs = + std::static_pointer_cast(table->column(1)->data()->chunk(0)); + auto cost_components = + std::static_pointer_cast(table->column(2)->data()->chunk(0)); + auto cost_components_values = + std::static_pointer_cast(cost_components->values()); + // To enable zero-copy slices, the native values pointer might need to account + // for this slicing offset. This is not needed for the higher level functions + // like Value(…) that already account for this offset internally. + const double* ccv_ptr = cost_components_values->data()->GetValues(1); + + for (int64_t i = 0; i < table->num_rows(); i++) { + // Another simplification in this example is that we assume that there are + // no null entries, e.g. each row is fill with valid values. 
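    // If null entries had to be supported, a minimal guard at this point
    // could look like the following sketch (using arrow::Array::IsNull):
    //
    //   if (ids->IsNull(i) || costs->IsNull(i) || cost_components->IsNull(i)) {
    //     return arrow::Status::Invalid("unexpected null entry");
    //   }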
+ int64_t id = ids->Value(i); + double cost = costs->Value(i); + const double* first = ccv_ptr + cost_components->value_offset(i); + const double* last = ccv_ptr + cost_components->value_offset(i + 1); + std::vector components_vec(first, last); + rows->push_back({id, cost, components_vec}); + } + + return arrow::Status::OK(); +} + +#define EXIT_ON_FAILURE(expr) \ + do { \ + arrow::Status status_ = (expr); \ + if (!status_.ok()) { \ + std::cerr << status_.message() << std::endl; \ + return EXIT_FAILURE; \ + } \ + } while (0); + +int main(int argc, char** argv) { + std::vector rows = { + {1, 1.0, {1.0}}, {2, 2.0, {1.0, 2.0}}, {3, 3.0, {1.0, 2.0, 3.0}}}; + + std::shared_ptr table; + EXIT_ON_FAILURE(VectorToColumnarTable(rows, &table)); + + std::vector expected_rows; + EXIT_ON_FAILURE(ColumnarTableToVector(table, &expected_rows)); + + assert(rows.size() == expected_rows.size()); + + return EXIT_SUCCESS; +} diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h index ddf3d7ee0e644..e3632a6d5f62e 100644 --- a/cpp/src/arrow/status.h +++ b/cpp/src/arrow/status.h @@ -36,7 +36,7 @@ if (ARROW_PREDICT_FALSE(!_s.ok())) { \ std::stringstream ss; \ ss << __FILE__ << ":" << __LINE__ << " code: " << #s << "\n" << _s.message(); \ - return Status(_s.code(), ss.str()); \ + return ::arrow::Status(_s.code(), ss.str()); \ } \ } while (0) @@ -69,7 +69,7 @@ std::stringstream ss; \ ss << __FILE__ << ":" << __LINE__ << " code: " << _status.CodeAsString() << " \n " \ << _status.message(); \ - return Status(_status.code(), ss.str()); \ + return ::arrow::Status(_status.code(), ss.str()); \ } \ } while (0) diff --git a/docs/source/cpp/examples.rst b/docs/source/cpp/examples.rst new file mode 100644 index 0000000000000..5f4372fbba2f2 --- /dev/null +++ b/docs/source/cpp/examples.rst @@ -0,0 +1,30 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +Examples +======== + +Row to columnar conversion +-------------------------- + +The following example converts an array of structs to a :class:`arrow::Table` +instance, and then converts it back to the original array of structs. + +.. literalinclude:: ../../../cpp/examples/arrow/row-wise-conversion-example.cc diff --git a/docs/source/cpp/index.rst b/docs/source/cpp/index.rst index 63290be9ecb42..1d70e6acbf0ce 100644 --- a/docs/source/cpp/index.rst +++ b/docs/source/cpp/index.rst @@ -22,6 +22,7 @@ C++ Implementation :maxdepth: 2 getting_started + examples api .. 
TODO add "topics" chapter From 45940410e6cb88809338a8fb7bf6b50046fe77fe Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 12 Dec 2018 16:11:11 -0600 Subject: [PATCH 032/328] ARROW-4008: [C++] Restore ARROW_BUILD_UTILITIES to fix integration tests In recent refactoring, ARROW_BUILD_UTILITIES got accidentally set to OFF Author: Wes McKinney Closes #3166 from wesm/ARROW-4008 and squashes the following commits: 105651722 Only add json-integration-test dependency when it is built 96bec050d Actually build utilities 02fd08ff6 Add integration target as dependency of arrow target 8c9fcf809 Do not write integration test files to /tmp af6a23b98 Add option to write integration test files to somewhere outside of /tmp. Add integration target to C++ build --- ci/travis_before_script_cpp.sh | 2 +- ci/travis_script_integration.sh | 7 +++++- cpp/src/arrow/ipc/CMakeLists.txt | 22 ++++++++++------ integration/integration_test.py | 43 +++++++++++++++++++------------- 4 files changed, 48 insertions(+), 26 deletions(-) diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index a77fcd8749de5..5f398e8c6e327 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -68,7 +68,7 @@ $CMAKE_COMMON_FLAGS \ -DARROW_BUILD_BENCHMARKS=ON \ -DARROW_BUILD_TESTS=ON \ -DARROW_BUILD_EXAMPLES=ON \ --DARROW_BUILD_UTILITIES=OFF \ +-DARROW_BUILD_UTILITIES=ON \ -DARROW_INSTALL_NAME_RPATH=OFF" fi diff --git a/ci/travis_script_integration.sh b/ci/travis_script_integration.sh index 286acacd74004..9c2786282b08b 100755 --- a/ci/travis_script_integration.sh +++ b/ci/travis_script_integration.sh @@ -52,7 +52,12 @@ conda install -y nomkl # Expensive dependencies install from Continuum package repo conda install -y pip numpy six -python integration_test.py --debug +# ARROW-4008: Create a directory to write temporary files since /tmp can be +# unstable in Travis CI +INTEGRATION_TEMPDIR=$TRAVIS_BUILD_DIR/integration_temp +mkdir -p $INTEGRATION_TEMPDIR + +python integration_test.py --debug --tempdir=$INTEGRATION_TEMPDIR popd diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index bda4ef3e417d5..44c56f033269d 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -15,6 +15,10 @@ # specific language governing permissions and limitations # under the License. 
+# Targets required for protocol integration testing +add_custom_target(integration) +add_dependencies(arrow integration) + ####################################### # Messaging and interprocess communication @@ -31,13 +35,14 @@ if (NOT ARROW_BOOST_HEADER_ONLY) EXTRA_LINK_LIBS gflags_static) # Test is being built - if (TARGET json-integration-test) + if (TARGET arrow-json-integration-test) + add_dependencies(integration arrow-json-integration-test) if (UNIX) if (APPLE) - set_target_properties(json-integration-test + set_target_properties(arrow-json-integration-test PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") else() - target_link_libraries(json-integration-test PRIVATE pthread) + target_link_libraries(arrow-json-integration-test PRIVATE pthread) endif() endif() endif() @@ -113,10 +118,13 @@ if(NOT WIN32) endif() if (ARROW_BUILD_UTILITIES) - add_executable(file-to-stream file-to-stream.cc) - target_link_libraries(file-to-stream ${UTIL_LINK_LIBS}) - add_executable(stream-to-file stream-to-file.cc) - target_link_libraries(stream-to-file ${UTIL_LINK_LIBS}) + add_executable(arrow-file-to-stream file-to-stream.cc) + target_link_libraries(arrow-file-to-stream ${UTIL_LINK_LIBS}) + add_executable(arrow-stream-to-file stream-to-file.cc) + target_link_libraries(arrow-stream-to-file ${UTIL_LINK_LIBS}) + + add_dependencies(integration arrow-file-to-stream) + add_dependencies(integration arrow-stream-to-file) endif() ADD_ARROW_BENCHMARK(read-write-benchmark diff --git a/integration/integration_test.py b/integration/integration_test.py index 3bd37bdd80677..7101af2516ad9 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -893,8 +893,8 @@ def generate_dictionary_case(): dictionaries=[dict1, dict2]) -def get_generated_json_files(): - temp_dir = tempfile.mkdtemp() +def get_generated_json_files(tempdir=None): + tempdir = tempdir or tempfile.mkdtemp() def _temp_path(): return @@ -910,7 +910,7 @@ def _temp_path(): generated_paths = [] for file_obj in file_objs: - out_path = os.path.join(temp_dir, 'generated_' + + out_path = os.path.join(tempdir, 'generated_' + file_obj.name + '.json') file_obj.write(out_path) generated_paths.append(out_path) @@ -924,10 +924,10 @@ def _temp_path(): class IntegrationRunner(object): - def __init__(self, json_files, testers, debug=False): + def __init__(self, json_files, testers, tempdir=None, debug=False): self.json_files = json_files self.testers = testers - self.temp_dir = tempfile.mkdtemp() + self.temp_dir = tempdir or tempfile.mkdtemp() self.debug = debug def run(self): @@ -950,10 +950,12 @@ def _compare_implementations(self, producer, consumer): name = os.path.splitext(os.path.basename(json_path))[0] + file_id = guid()[:8] + # Make the random access file print('-- Creating binary inputs') - producer_file_path = os.path.join(self.temp_dir, guid() + '_' + - name + '.json_to_arrow') + producer_file_path = os.path.join(self.temp_dir, file_id + '_' + + name + '.json_as_file') producer.json_to_file(json_path, producer_file_path) # Validate the file @@ -961,10 +963,12 @@ def _compare_implementations(self, producer, consumer): consumer.validate(json_path, producer_file_path) print('-- Validating stream') - producer_stream_path = os.path.join(self.temp_dir, guid() + '_' + - name + '.arrow_to_stream') - consumer_file_path = os.path.join(self.temp_dir, guid() + '_' + - name + '.stream_to_arrow') + producer_stream_path = os.path.join(self.temp_dir, file_id + '_' + + name + + '.producer_file_as_stream') + consumer_file_path = 
os.path.join(self.temp_dir, file_id + '_' + + name + + '.consumer_stream_as_file') producer.file_to_stream(producer_file_path, producer_stream_path) consumer.stream_to_file(producer_stream_path, @@ -1054,8 +1058,8 @@ class CPPTester(Tester): os.path.join(ARROW_HOME, 'cpp/build/debug')) CPP_INTEGRATION_EXE = os.path.join(EXE_PATH, 'arrow-json-integration-test') - STREAM_TO_FILE = os.path.join(EXE_PATH, 'stream-to-file') - FILE_TO_STREAM = os.path.join(EXE_PATH, 'file-to-stream') + STREAM_TO_FILE = os.path.join(EXE_PATH, 'arrow-stream-to-file') + FILE_TO_STREAM = os.path.join(EXE_PATH, 'arrow-file-to-stream') name = 'C++' @@ -1162,15 +1166,16 @@ def get_static_json_files(): return glob.glob(glob_pattern) -def run_all_tests(debug=False): +def run_all_tests(debug=False, tempdir=None): testers = [CPPTester(debug=debug), JavaTester(debug=debug), JSTester(debug=debug)] static_json_files = get_static_json_files() - generated_json_files = get_generated_json_files() + generated_json_files = get_generated_json_files(tempdir=tempdir) json_files = static_json_files + generated_json_files - runner = IntegrationRunner(json_files, testers, debug=debug) + runner = IntegrationRunner(json_files, testers, + tempdir=tempdir, debug=debug) runner.run() print('-- All tests passed!') @@ -1195,6 +1200,10 @@ def write_js_test_json(directory): parser.add_argument('--debug', dest='debug', action='store_true', default=False, help='Run executables in debug mode as relevant') + parser.add_argument('--tempdir', dest='tempdir', + default=tempfile.mkdtemp(), + help=('Directory to use for writing ' + 'integration test temporary files')) args = parser.parse_args() if args.generated_json_path: try: @@ -1204,4 +1213,4 @@ def write_js_test_json(directory): raise write_js_test_json(args.generated_json_path) else: - run_all_tests(debug=args.debug) + run_all_tests(debug=args.debug, tempdir=args.tempdir) From 0005048b2f2ab1b84908e81c9e0648158ccf639c Mon Sep 17 00:00:00 2001 From: Yosuke Shiro Date: Thu, 13 Dec 2018 17:44:17 +0900 Subject: [PATCH 033/328] ARROW-4005: [Plasma] [GLib] Add gplasma_client_disconnect() Author: Yosuke Shiro Closes #3163 from shiro615/glib-add-disconnect-for-plasma-glib and squashes the following commits: 3d990034 Remove require_gi by using options a8575acd Use bool instead of gboolean b2c9ccf7 Keep disconnected information 789c1dd4 Fix test case 5182beb1 Add Add gplasma_client_disconnect() --- c_glib/plasma-glib/client.cpp | 34 +++++++++++++++++++++--- c_glib/plasma-glib/client.h | 2 ++ c_glib/test/plasma/test-plasma-client.rb | 13 ++++++--- 3 files changed, 42 insertions(+), 7 deletions(-) diff --git a/c_glib/plasma-glib/client.cpp b/c_glib/plasma-glib/client.cpp index e88cb13e83cd0..c05a71085dd2d 100644 --- a/c_glib/plasma-glib/client.cpp +++ b/c_glib/plasma-glib/client.cpp @@ -185,6 +185,7 @@ gplasma_client_create_options_get_metadata(GPlasmaClientCreateOptions *options, typedef struct GPlasmaClientPrivate_ { plasma::PlasmaClient *client; + bool disconnected; } GPlasmaClientPrivate; enum { @@ -205,10 +206,12 @@ gplasma_client_finalize(GObject *object) { auto priv = GPLASMA_CLIENT_GET_PRIVATE(object); - auto status = priv->client->Disconnect(); - if (!status.ok()) { - g_warning("[plasma][client][finalize] Failed to disconnect: %s", - status.ToString().c_str()); + if (!priv->disconnected) { + auto status = priv->client->Disconnect(); + if (!status.ok()) { + g_warning("[plasma][client][finalize] Failed to disconnect: %s", + status.ToString().c_str()); + } } delete priv->client; @@ -431,6 +434,29 @@ 
gplasma_client_refer_object(GPlasmaClient *client, } } +/** + * gplasma_client_disconnect: + * @client: A #GPlasmaClient. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +gplasma_client_disconnect(GPlasmaClient *client, + GError **error) +{ + auto priv = GPLASMA_CLIENT_GET_PRIVATE(client); + auto status = priv->client->Disconnect(); + if (garrow_error_check(error, status, "[plasma][client][disconnect]")) { + priv->disconnected = true; + return TRUE; + } else { + return FALSE; + } +} + G_END_DECLS GPlasmaClient * diff --git a/c_glib/plasma-glib/client.h b/c_glib/plasma-glib/client.h index 6f99f467c83a7..34b0ba22e3188 100644 --- a/c_glib/plasma-glib/client.h +++ b/c_glib/plasma-glib/client.h @@ -71,5 +71,7 @@ gplasma_client_refer_object(GPlasmaClient *client, GPlasmaObjectID *id, gint64 timeout_ms, GError **error); +gboolean gplasma_client_disconnect(GPlasmaClient *client, + GError **error); G_END_DECLS diff --git a/c_glib/test/plasma/test-plasma-client.rb b/c_glib/test/plasma/test-plasma-client.rb index cbdce865f0132..6caf09f02570c 100644 --- a/c_glib/test/plasma/test-plasma-client.rb +++ b/c_glib/test/plasma/test-plasma-client.rb @@ -24,6 +24,9 @@ def setup @store = Helper::PlasmaStore.new @store.start @client = Plasma::Client.new(@store.socket_path) + @id = Plasma::ObjectID.new("Hello") + @data = "World" + @options = Plasma::ClientCreateOptions.new end def teardown @@ -34,10 +37,7 @@ def teardown def setup super - @id = Plasma::ObjectID.new("Hello") - @data = "World" @metadata = "Metadata" - @options = Plasma::ClientCreateOptions.new end test("no options") do @@ -84,4 +84,11 @@ def setup ]) end end + + test("#disconnect") do + @client.disconnect + assert_raise(Arrow::Error::Io) do + @client.create(@id, @data.bytesize, @options) + end + end end From 1882a0727ba275fbced9ed0754c5fe99f841bed4 Mon Sep 17 00:00:00 2001 From: Tanya Schlusser Date: Thu, 13 Dec 2018 13:36:21 +0100 Subject: [PATCH 034/328] ARROW-3866: [Python] Column metadata is not transferred to tables in pyarrow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use columns' existing metadata to create the new fields in `Table.from_arrays()`. Also persists the original `nullable` value. Happy to change things! Thank you for putting a newbie label on it. Author: Tanya Schlusser Author: Krisztián Szűcs Closes #3160 from tanyaschlusser/ARROW-3866 and squashes the following commits: 005940ea Move the test for preserved metadata to a separate function. Add a test that nullable=False is preserved. 
e4256a17 use column.field() 76216eae Arrow-3866: keep field matadata for columns passed to pa.Table.from_arrays() 33950a83 ARROW-3866: test to confirm column metadata is added when calling pa.Table.from_arrays(column_list) --- python/pyarrow/table.pxi | 24 +++++++++++------------- python/pyarrow/tests/test_table.py | 15 +++++++++++++++ 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index fd565afae5acf..cf3411dc03616 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -636,12 +636,12 @@ cdef class Column: cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema): cdef: - Column col - c_string c_name - vector[shared_ptr[CField]] fields - shared_ptr[CDataType] type_ Py_ssize_t K = len(arrays) + c_string c_name + CColumn* c_column + shared_ptr[CDataType] c_type shared_ptr[CKeyValueMetadata] c_meta + vector[shared_ptr[CField]] c_fields if metadata is not None: if not isinstance(metadata, dict): @@ -649,17 +649,15 @@ cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema): c_meta = pyarrow_unwrap_metadata(metadata) if K == 0: - schema.reset(new CSchema(fields, c_meta)) + schema.reset(new CSchema(c_fields, c_meta)) return - fields.resize(K) + c_fields.resize(K) if isinstance(arrays[0], Column): for i in range(K): - col = arrays[i] - type_ = col.sp_column.get().type() - c_name = tobytes(col.name) - fields[i].reset(new CField(c_name, type_, True)) + c_column = (arrays[i]).column + c_fields[i] = c_column.field() else: if names is None: raise ValueError('Must pass names when constructing ' @@ -670,7 +668,7 @@ cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema): for i in range(K): val = arrays[i] if isinstance(val, (Array, ChunkedArray)): - type_ = ( val.type).sp_type + c_type = ( val.type).sp_type else: raise TypeError(type(val)) @@ -678,9 +676,9 @@ cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema): c_name = tobytes(u'None') else: c_name = tobytes(names[i]) - fields[i].reset(new CField(c_name, type_, True)) + c_fields[i].reset(new CField(c_name, c_type, True)) - schema.reset(new CSchema(fields, c_meta)) + schema.reset(new CSchema(c_fields, c_meta)) cdef class RecordBatch: diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 9c9828d8c0764..ecbf93bd3e8b0 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -579,6 +579,21 @@ def test_table_basics(): assert table.columns == columns +def test_table_from_arrays_preserves_column_metadata(): + # Added to test https://issues.apache.org/jira/browse/ARROW-3866 + arr0 = pa.array([1, 2]) + arr1 = pa.array([3, 4]) + field0 = pa.field('field1', pa.int64(), metadata=dict(a="A", b="B")) + field1 = pa.field('field2', pa.int64(), nullable=False) + columns = [ + pa.column(field0, arr0), + pa.column(field1, arr1) + ] + table = pa.Table.from_arrays(columns) + assert b"a" in table.column(0).field.metadata + assert table.column(1).field.nullable is False + + def test_table_from_arrays_invalid_names(): data = [ pa.array(range(5)), From 2a726c179fdb794a9fbc2025aced0dbab3c5c362 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Thu, 13 Dec 2018 08:56:16 -0600 Subject: [PATCH 035/328] PARQUET-1473: [C++] Add helper function that converts ParquetVersion to human-friendly string Author: Hatem Helal Author: Hatem Helal Closes #3148 from hatemhelal/parquet-1473-wip and squashes the following commits: 8983ae629 remove 
default case from switch to make any omission a compile-time error a87c3ba99 Add helper function parquet::ParquetVersionToString and use it in parquet-reader tool --- cpp/src/parquet/metadata.cc | 12 ++++++++++++ cpp/src/parquet/metadata.h | 2 ++ cpp/src/parquet/printer.cc | 2 +- cpp/tools/parquet/parquet-reader.cc | 2 +- 4 files changed, 16 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index cf63b0f662b52..22cfbdb91aa73 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -47,6 +47,18 @@ const ApplicationVersion& ApplicationVersion::PARQUET_CPP_FIXED_STATS_VERSION() return version; } +std::string ParquetVersionToString(ParquetVersion::type ver) { + switch (ver) { + case ParquetVersion::PARQUET_1_0: + return "1.0"; + case ParquetVersion::PARQUET_2_0: + return "2.0"; + } + + // This should be unreachable + return "UNKNOWN"; +} + template static std::shared_ptr MakeTypedColumnStats( const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) { diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 706e980711683..25f4d4cd8cbdf 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -282,6 +282,8 @@ class PARQUET_EXPORT FileMetaDataBuilder { std::unique_ptr impl_; }; +PARQUET_EXPORT std::string ParquetVersionToString(ParquetVersion::type ver); + } // namespace parquet #endif // PARQUET_FILE_METADATA_H diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index 9f26a4180cda1..5be8d9d96467c 100644 --- a/cpp/src/parquet/printer.cc +++ b/cpp/src/parquet/printer.cc @@ -38,7 +38,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list selecte const FileMetaData* file_metadata = fileReader->metadata().get(); stream << "File Name: " << filename << "\n"; - stream << "Version: " << file_metadata->version() << "\n"; + stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n"; stream << "Created By: " << file_metadata->created_by() << "\n"; stream << "Total rows: " << file_metadata->num_rows() << "\n"; diff --git a/cpp/tools/parquet/parquet-reader.cc b/cpp/tools/parquet/parquet-reader.cc index 34bdfc103dcc0..a5b7db1330a97 100644 --- a/cpp/tools/parquet/parquet-reader.cc +++ b/cpp/tools/parquet/parquet-reader.cc @@ -23,7 +23,7 @@ int main(int argc, char** argv) { if (argc > 5 || argc < 2) { - std::cerr << "Usage: parquet_reader [--only-metadata] [--no-memory-map] [--json]" + std::cerr << "Usage: parquet-reader [--only-metadata] [--no-memory-map] [--json]" "[--print-key-value-metadata] [--columns=...] " << std::endl; return -1; From e34057c4b4be8c7abf3537dd4998b5b38919ba73 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 13 Dec 2018 09:52:17 -0600 Subject: [PATCH 036/328] ARROW-4019: [C++] Fix Coverity issues This fixes a number of issues found by Coverity. Other issues are benign, or need to be tackled separately. 
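As context for the diff that follows: many of the changes default-initialize pointer and integer members (e.g. `ArrayData`, `Array`, `Table`) so that static analysis cannot flag a default-constructed object as partially initialized, and one adds explicit move operations to `Datum` to avoid needless copies. A minimal, self-contained sketch of these two patterns is shown below; the type names are simplified stand-ins for illustration only, not the actual Arrow declarations.

```
#include <cstdint>
#include <memory>
#include <utility>

// Pattern 1: give every scalar member a well-defined default so a
// default-constructed instance is never partially initialized.
struct ArrayDataSketch {
  ArrayDataSketch() : length(0), null_count(0), offset(0) {}
  int64_t length;
  int64_t null_count;
  int64_t offset;
};

// Pattern 2: a value wrapper holding a potentially expensive member gets
// explicit move construction and move assignment so it is not silently copied.
class DatumSketch {
 public:
  DatumSketch() = default;
  explicit DatumSketch(std::shared_ptr<ArrayDataSketch> value)
      : value_(std::move(value)) {}

  DatumSketch(DatumSketch&& other) noexcept : value_(std::move(other.value_)) {}
  DatumSketch& operator=(DatumSketch&& other) noexcept {
    value_ = std::move(other.value_);
    return *this;
  }

 private:
  std::shared_ptr<ArrayDataSketch> value_;
};

int main() {
  DatumSketch a{std::make_shared<ArrayDataSketch>()};
  DatumSketch b = std::move(a);  // uses the explicit move constructor
  (void)b;
  return 0;
}
```

The actual patch applies the same ideas to `array.h` and `compute/kernel.h`, among the other Coverity-motivated cleanups listed in the diff.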
Author: Antoine Pitrou Closes #3168 from pitrou/ARROW-4019-fix-coverity-issues and squashes the following commits: 6311aa99a ARROW-4019: Fix Coverity issues --- cpp/src/arrow/array.h | 8 +-- cpp/src/arrow/compute/kernel.h | 9 ++++ cpp/src/arrow/io/buffered-test.cc | 2 +- cpp/src/arrow/io/file-test.cc | 2 +- cpp/src/arrow/io/test-common.h | 2 +- cpp/src/arrow/ipc/json-integration-test.cc | 2 +- cpp/src/arrow/ipc/json.cc | 2 +- cpp/src/arrow/ipc/writer.cc | 5 +- cpp/src/arrow/table.cc | 2 +- cpp/src/arrow/util/decimal.cc | 57 +++++++++++++--------- cpp/src/arrow/util/logging.h | 3 ++ cpp/src/arrow/util/rle-encoding-test.cc | 2 +- cpp/src/arrow/util/rle-encoding.h | 1 + 13 files changed, 61 insertions(+), 36 deletions(-) diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index b34b53933314f..37fa5aedfc2d0 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -87,7 +87,7 @@ class Status; /// input array and replace them with newly-allocated data, changing the output /// data type as well. struct ARROW_EXPORT ArrayData { - ArrayData() : length(0) {} + ArrayData() : length(0), null_count(0), offset(0) {} ArrayData(const std::shared_ptr& type, int64_t length, int64_t null_count = kUnknownNullCount, int64_t offset = 0) @@ -311,7 +311,7 @@ class ARROW_EXPORT Array { std::string ToString() const; protected: - Array() {} + Array() : null_bitmap_data_(NULLPTR) {} std::shared_ptr data_; const uint8_t* null_bitmap_data_; @@ -382,7 +382,7 @@ class ARROW_EXPORT PrimitiveArray : public FlatArray { std::shared_ptr values() const { return data_->buffers[1]; } protected: - PrimitiveArray() {} + PrimitiveArray() : raw_values_(NULLPTR) {} inline void SetData(const std::shared_ptr& data) { auto values = data->buffers[1]; @@ -565,7 +565,7 @@ class ARROW_EXPORT BinaryArray : public FlatArray { protected: // For subclasses - BinaryArray() {} + BinaryArray() : raw_value_offsets_(NULLPTR), raw_data_(NULLPTR) {} /// Protected method for constructors void SetData(const std::shared_ptr& data); diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index 8048fff75bc29..bef2b9af21cff 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -19,6 +19,7 @@ #define ARROW_COMPUTE_KERNEL_H #include +#include #include #include "arrow/array.h" @@ -78,6 +79,14 @@ struct ARROW_EXPORT Datum { Datum(const Datum& other) noexcept { this->value = other.value; } + // Define move constructor and move assignment, for better performance + Datum(Datum&& other) noexcept : value(std::move(other.value)) {} + + Datum& operator=(Datum&& other) noexcept { + value = std::move(other.value); + return *this; + } + Datum::type kind() const { switch (this->value.which()) { case 0: diff --git a/cpp/src/arrow/io/buffered-test.cc b/cpp/src/arrow/io/buffered-test.cc index 7fc4c520d148b..074833d4bf7b7 100644 --- a/cpp/src/arrow/io/buffered-test.cc +++ b/cpp/src/arrow/io/buffered-test.cc @@ -67,7 +67,7 @@ class FileTestFixture : public ::testing::Test { void EnsureFileDeleted() { if (FileExists(path_)) { - std::remove(path_.c_str()); + ARROW_UNUSED(std::remove(path_.c_str())); } } diff --git a/cpp/src/arrow/io/file-test.cc b/cpp/src/arrow/io/file-test.cc index 6081005a8f6e1..4d710d3470f5c 100644 --- a/cpp/src/arrow/io/file-test.cc +++ b/cpp/src/arrow/io/file-test.cc @@ -56,7 +56,7 @@ class FileTestFixture : public ::testing::Test { void EnsureFileDeleted() { if (FileExists(path_)) { - std::remove(path_.c_str()); + ARROW_UNUSED(std::remove(path_.c_str())); } } diff --git 
a/cpp/src/arrow/io/test-common.h b/cpp/src/arrow/io/test-common.h index fa9145259b182..a091b01d32c79 100644 --- a/cpp/src/arrow/io/test-common.h +++ b/cpp/src/arrow/io/test-common.h @@ -118,7 +118,7 @@ class MemoryMapFixture { public: void TearDown() { for (auto path : tmp_files_) { - std::remove(path.c_str()); + ARROW_UNUSED(std::remove(path.c_str())); } } diff --git a/cpp/src/arrow/ipc/json-integration-test.cc b/cpp/src/arrow/ipc/json-integration-test.cc index 3e71415c69654..914cdb66599f4 100644 --- a/cpp/src/arrow/ipc/json-integration-test.cc +++ b/cpp/src/arrow/ipc/json-integration-test.cc @@ -262,7 +262,7 @@ class TestJSONIntegration : public ::testing::Test { void TearDown() { for (const std::string path : tmp_paths_) { - std::remove(path.c_str()); + ARROW_UNUSED(std::remove(path.c_str())); } } diff --git a/cpp/src/arrow/ipc/json.cc b/cpp/src/arrow/ipc/json.cc index 394563c53c09d..61c242ca2dbbb 100644 --- a/cpp/src/arrow/ipc/json.cc +++ b/cpp/src/arrow/ipc/json.cc @@ -99,7 +99,7 @@ Status JsonWriter::WriteRecordBatch(const RecordBatch& batch) { class JsonReader::JsonReaderImpl { public: JsonReaderImpl(MemoryPool* pool, const std::shared_ptr& data) - : pool_(pool), data_(data) {} + : pool_(pool), data_(data), record_batches_(nullptr) {} Status ParseAndReadSchema() { doc_.Parse(reinterpret_cast(data_->data()), diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 3d3355dfe17fd..6ce72e070e7b3 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -772,7 +772,10 @@ class SchemaWriter : public StreamBookKeeper { public: SchemaWriter(const Schema& schema, DictionaryMemo* dictionary_memo, MemoryPool* pool, io::OutputStream* sink) - : StreamBookKeeper(sink), schema_(schema), dictionary_memo_(dictionary_memo) {} + : StreamBookKeeper(sink), + pool_(pool), + schema_(schema), + dictionary_memo_(dictionary_memo) {} Status WriteSchema() { #ifndef NDEBUG diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 04af4d9741c71..1f3d927ddd62b 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -392,7 +392,7 @@ class SimpleTable : public Table { std::vector> columns_; }; -Table::Table() {} +Table::Table() : num_rows_(0) {} std::shared_ptr
Table::Make(const std::shared_ptr& schema, const std::vector>& columns, diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index fda7746c6b4e0..c47ac82e8ce3c 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -889,7 +889,7 @@ Status Decimal128::Rescale(int32_t original_scale, int32_t new_scale, } // Helper function used by Decimal128::FromBigEndian -static inline uint64_t FromBigEndian(const uint8_t* bytes, int32_t length) { +static inline uint64_t UInt64FromBigEndian(const uint8_t* bytes, int32_t length) { // We don't bounds check the length here because this is called by // FromBigEndian that has a Decimal128 as its out parameters and // that function is already checking the length of the bytes and only @@ -906,8 +906,7 @@ Status Decimal128::FromBigEndian(const uint8_t* bytes, int32_t length, Decimal12 static constexpr int32_t kMinDecimalBytes = 1; static constexpr int32_t kMaxDecimalBytes = 16; - int64_t high; - uint64_t low; + int64_t high, low; if (length < kMinDecimalBytes || length > kMaxDecimalBytes) { std::ostringstream stream; @@ -917,35 +916,45 @@ Status Decimal128::FromBigEndian(const uint8_t* bytes, int32_t length, Decimal12 return Status::Invalid(stream.str()); } - /// Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the - /// sign bit. + // Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the + // sign bit. const bool is_negative = static_cast(bytes[0]) < 0; - /// Sign extend the low bits if necessary - low = UINT64_MAX * (is_negative && length < 8); - high = -1 * (is_negative && length < kMaxDecimalBytes); - - /// Stop byte of the high bytes + // 1. Extract the high bytes + // Stop byte of the high bytes const int32_t high_bits_offset = std::max(0, length - 8); + const auto high_bits = UInt64FromBigEndian(bytes, high_bits_offset); - /// Shift left enough bits to make room for the incoming int64_t - high <<= high_bits_offset * CHAR_BIT; - - /// Preserve the upper bits by inplace OR-ing the int64_t - uint64_t value = arrow::FromBigEndian(bytes, high_bits_offset); - high |= value; + if (high_bits_offset == 8) { + // Avoid undefined shift by 64 below + high = high_bits; + } else { + high = -1 * (is_negative && length < kMaxDecimalBytes); + // Shift left enough bits to make room for the incoming int64_t + high <<= high_bits_offset * CHAR_BIT; + // Preserve the upper bits by inplace OR-ing the int64_t + high |= high_bits; + } - /// Stop byte of the low bytes + // 2. 
Extract the low bytes + // Stop byte of the low bytes const int32_t low_bits_offset = std::min(length, 8); + const auto low_bits = + UInt64FromBigEndian(bytes + high_bits_offset, length - high_bits_offset); - /// Shift left enough bits to make room for the incoming uint64_t - low <<= low_bits_offset * CHAR_BIT; - - /// Preserve the upper bits by inplace OR-ing the uint64_t - value = arrow::FromBigEndian(bytes + high_bits_offset, length - high_bits_offset); - low |= value; + if (low_bits_offset == 8) { + // Avoid undefined shift by 64 below + low = low_bits; + } else { + // Sign extend the low bits if necessary + low = -1 * (is_negative && length < 8); + // Shift left enough bits to make room for the incoming int64_t + low <<= low_bits_offset * CHAR_BIT; + // Preserve the upper bits by inplace OR-ing the int64_t + low |= low_bits; + } - *out = Decimal128(high, low); + *out = Decimal128(high, static_cast(low)); return Status::OK(); } diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index 4cce700db970b..42ab18e9e96d3 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -22,6 +22,7 @@ #include #include +#include "arrow/util/macros.h" #include "arrow/util/visibility.h" namespace arrow { @@ -155,6 +156,8 @@ class ARROW_EXPORT ArrowLog : public ArrowLogBase { static void InstallFailureSignalHandler(); private: + ARROW_DISALLOW_COPY_AND_ASSIGN(ArrowLog); + // Hide the implementation of log provider by void *. // Otherwise, lib user may define the same macro to use the correct header file. void* logging_provider_; diff --git a/cpp/src/arrow/util/rle-encoding-test.cc b/cpp/src/arrow/util/rle-encoding-test.cc index 88382618653e9..aac1b1523990c 100644 --- a/cpp/src/arrow/util/rle-encoding-test.cc +++ b/cpp/src/arrow/util/rle-encoding-test.cc @@ -193,7 +193,7 @@ void ValidateRle(const vector& values, int bit_width, uint8_t* expected_enc EXPECT_EQ(encoded_len, expected_len); } if (expected_encoding != NULL) { - EXPECT_EQ(memcmp(buffer, expected_encoding, expected_len), 0); + EXPECT_EQ(memcmp(buffer, expected_encoding, encoded_len), 0); } // Verify read diff --git a/cpp/src/arrow/util/rle-encoding.h b/cpp/src/arrow/util/rle-encoding.h index a97543d5be799..acefc8e3f7583 100644 --- a/cpp/src/arrow/util/rle-encoding.h +++ b/cpp/src/arrow/util/rle-encoding.h @@ -436,6 +436,7 @@ bool RleDecoder::NextCounts() { literal_count_ = (indicator_value >> 1) * 8; } else { repeat_count_ = indicator_value >> 1; + // XXX (ARROW-4018) this is not big-endian compatible bool result = bit_reader_.GetAligned(static_cast(BitUtil::CeilDiv(bit_width_, 8)), reinterpret_cast(¤t_value_)); From b3bc3384f3068edebe69f1084518ccfb85a368f8 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 13 Dec 2018 15:09:27 -0800 Subject: [PATCH 037/328] ARROW-3958: [Plasma] Reduce number of IPCs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR also removes the client unmap, which is not necessary any more since the introduction of malloc (since there is only few memory mapped files and they typically stay around for the lifetime of the application). The PR also gets rid of a bunch of code that is not needed any more now (the release buffer, yay!). Benchmarks: ``` import pyarrow.plasma as plasma client = plasma.connect("/tmp/plasma", "", 0) # Put performance def f(): for i in range(10000): client.put(1) %timeit f() # without optimization: # 1.51 s ± 2.22 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 1.52 s ± 9.68 ms per loop (mean ± std. 
dev. of 7 runs, 1 loop each) # 1.53 s ± 19 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # with optimizations: # 1.27 s ± 10.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 1.31 s ± 8.18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 1.31 s ± 17.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # Create/seal performance def f(): for i in range(10000): object_id = plasma.ObjectID.from_random() client.create(object_id, 0) client.seal(object_id) %timeit f() # without optimizations: # 571 ms ± 2.28 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 583 ms ± 22.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 588 ms ± 14.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # with optimizations: # 531 ms ± 3.24 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 541 ms ± 9.99 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 542 ms ± 19.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # Get performance objects = [client.put(1) for i in range(10000)] def g(): for i in range(10000): client.get(objects[i]) %timeit g() # without optimizations # 1.11 s ± 6.17 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 1.12 s ± 1.49 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 1.19 s ± 24.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # with optimizations # 776 ms ± 11.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 792 ms ± 3.06 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # 778 ms ± 9.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) ``` Author: Philipp Moritz Author: Robert Nishihara Closes #3124 from pcmoritz/plasma-send-fd and squashes the following commits: f899f459 Update client.cc a0384040 Update _plasma.pyx af150c14 comments and fixes 71c4c5c1 don't close fd twice 0d572823 linting f60dcbed fix tests 502aeda4 linting 2887b170 clean up some code cfff7e32 lint e5ccbbac fixes 5f091993 introduce method 24beb277 working version --- cpp/src/plasma/client.cc | 184 +++++----------------- cpp/src/plasma/client.h | 16 +- cpp/src/plasma/store.cc | 13 +- cpp/src/plasma/store.h | 3 + cpp/src/plasma/test/client_tests.cc | 26 +-- docs/source/python/plasma.rst | 10 +- python/pyarrow/_plasma.pyx | 11 +- python/pyarrow/tensorflow/plasma_op.cc | 4 +- python/pyarrow/tests/test_plasma.py | 8 +- python/pyarrow/tests/test_plasma_tf_op.py | 2 +- 10 files changed, 83 insertions(+), 194 deletions(-) diff --git a/cpp/src/plasma/client.cc b/cpp/src/plasma/client.cc index 99cf00cab80fd..2dbe2b41478ea 100644 --- a/cpp/src/plasma/client.cc +++ b/cpp/src/plasma/client.cc @@ -83,9 +83,6 @@ typedef struct XXH64_state_s XXH64_state_t; constexpr int64_t kHashingConcurrency = 8; constexpr int64_t kBytesInMB = 1 << 20; -// Use 100MB as an overestimate of the L3 cache size. -constexpr int64_t kL3CacheSizeBytes = 100000000; - // ---------------------------------------------------------------------- // GPU support @@ -143,22 +140,13 @@ struct ObjectInUseEntry { bool is_sealed; }; -/// Configuration options for the plasma client. -struct PlasmaClientConfig { - /// Number of release calls we wait until the object is actually released. - /// This allows us to avoid invalidating the cpu cache on workers if objects - /// are reused accross tasks. - size_t release_delay; -}; - struct ClientMmapTableEntry { + /// The associated file descriptor on the client. + int fd; /// The result of mmap for this file descriptor. uint8_t* pointer; /// The length of the memory-mapped file. 
size_t length; - /// The number of objects in this memory-mapped file that are currently being - /// used by the client. When this count reaches zeros, we unmap the file. - int count; }; class PlasmaClient::Impl : public std::enable_shared_from_this { @@ -169,7 +157,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_this> objects_in_use_; - /// Object IDs of the last few release calls. This is a deque and - /// is used to delay releasing objects to see if they can be reused by - /// subsequent tasks so we do not unneccessarily invalidate cpu caches. - /// TODO(pcm): replace this with a proper lru cache using the size of the L3 - /// cache. - std::deque release_history_; - /// The number of bytes in the combined objects that are held in the release - /// history doubly-linked list. If this is too large then the client starts - /// releasing objects. - int64_t in_use_object_bytes_; - /// Configuration options for the plasma client. - PlasmaClientConfig config_; /// The amount of memory available to the Plasma store. The client needs this /// information to make sure that it does not delay in releasing so much /// memory that the store is unable to evict enough objects to free up space. @@ -308,7 +288,6 @@ PlasmaClient::Impl::~Impl() {} uint8_t* PlasmaClient::Impl::LookupOrMmap(int fd, int store_fd_val, int64_t map_size) { auto entry = mmap_table_.find(store_fd_val); if (entry != mmap_table_.end()) { - close(fd); return entry->second.pointer; } else { // We subtract kMmapRegionsGap from the length that was added @@ -322,9 +301,9 @@ uint8_t* PlasmaClient::Impl::LookupOrMmap(int fd, int store_fd_val, int64_t map_ close(fd); // Closing this fd has an effect on performance. ClientMmapTableEntry& entry = mmap_table_[store_fd_val]; + entry.fd = fd; entry.pointer = result; entry.length = map_size; - entry.count = 0; return result; } } @@ -342,6 +321,17 @@ bool PlasmaClient::Impl::IsInUse(const ObjectID& object_id) { return (elem != objects_in_use_.end()); } +int PlasmaClient::Impl::GetStoreFd(int store_fd) { + auto entry = mmap_table_.find(store_fd); + if (entry == mmap_table_.end()) { + int fd = recv_fd(store_conn_); + ARROW_CHECK(fd >= 0) << "recv not successful"; + return fd; + } else { + return entry->second.fd; + } +} + void PlasmaClient::Impl::IncrementObjectCount(const ObjectID& object_id, PlasmaObject* object, bool is_sealed) { // Increment the count of the object to track the fact that it is being used. @@ -357,18 +347,6 @@ void PlasmaClient::Impl::IncrementObjectCount(const ObjectID& object_id, objects_in_use_[object_id]->count = 0; objects_in_use_[object_id]->is_sealed = is_sealed; object_entry = objects_in_use_[object_id].get(); - if (object->device_num == 0) { - // Increment the count of the number of objects in the memory-mapped file - // that are being used. The corresponding decrement should happen in - // PlasmaClient::Release. - auto entry = mmap_table_.find(object->store_fd); - ARROW_CHECK(entry != mmap_table_.end()); - ARROW_CHECK(entry->second.count >= 0); - // Update the in_use_object_bytes_. - in_use_object_bytes_ += - (object_entry->object.data_size + object_entry->object.metadata_size); - entry->second.count += 1; - } } else { object_entry = elem->second.get(); ARROW_CHECK(object_entry->count > 0); @@ -397,8 +375,7 @@ Status PlasmaClient::Impl::Create(const ObjectID& object_id, int64_t data_size, // If the CreateReply included an error, then the store will not send a file // descriptor. 
if (device_num == 0) { - int fd = recv_fd(store_conn_); - ARROW_CHECK(fd >= 0) << "recv not successful"; + int fd = GetStoreFd(store_fd); ARROW_CHECK(object.data_size == data_size); ARROW_CHECK(object.metadata_size == metadata_size); // The metadata should come right after the data. @@ -535,8 +512,7 @@ Status PlasmaClient::Impl::GetBuffers( // in the subsequent loop based on just the store file descriptor and without // having to know the relevant file descriptor received from recv_fd. for (size_t i = 0; i < store_fds.size(); i++) { - int fd = recv_fd(store_conn_); - ARROW_CHECK(fd >= 0); + int fd = GetStoreFd(store_fds[i]); LookupOrMmap(fd, store_fds[i], mmap_sizes[i]); } @@ -615,54 +591,21 @@ Status PlasmaClient::Impl::Get(const ObjectID* object_ids, int64_t num_objects, return GetBuffers(object_ids, num_objects, timeout_ms, wrap_buffer, out); } -Status PlasmaClient::Impl::UnmapObject(const ObjectID& object_id) { +Status PlasmaClient::Impl::MarkObjectUnused(const ObjectID& object_id) { auto object_entry = objects_in_use_.find(object_id); ARROW_CHECK(object_entry != objects_in_use_.end()); ARROW_CHECK(object_entry->second->count == 0); - // Decrement the count of the number of objects in this memory-mapped file - // that the client is using. The corresponding increment should have - // happened in plasma_get. - int fd = object_entry->second->object.store_fd; - auto entry = mmap_table_.find(fd); - ARROW_CHECK(entry != mmap_table_.end()); - ARROW_CHECK(entry->second.count >= 1); - if (entry->second.count == 1) { - // If no other objects are being used, then unmap the file. - // We subtract kMmapRegionsGap from the length that was added - // in fake_mmap in malloc.h, to make the size page-aligned again. - int err = munmap(entry->second.pointer, entry->second.length - kMmapRegionsGap); - if (err == -1) { - return Status::IOError("Error during munmap"); - } - // Remove the corresponding entry from the hash table. - mmap_table_.erase(fd); - } else { - // If there are other objects being used, decrement the reference count. - entry->second.count -= 1; - } - // Update the in_use_object_bytes_. - in_use_object_bytes_ -= (object_entry->second->object.data_size + - object_entry->second->object.metadata_size); - DCHECK_GE(in_use_object_bytes_, 0); // Remove the entry from the hash table of objects currently in use. objects_in_use_.erase(object_id); return Status::OK(); } -/// This is a helper method for implementing plasma_release. We maintain a -/// buffer -/// of release calls and only perform them once the buffer becomes full (as -/// judged by the aggregate sizes of the objects). There may be multiple release -/// calls for the same object ID in the buffer. In this case, the first release -/// calls will not do anything. The client will only send a message to the store -/// releasing the object when the client is truly done with the object. -/// -/// @param object_id The object ID to attempt to release. -Status PlasmaClient::Impl::PerformRelease(const ObjectID& object_id) { - // Decrement the count of the number of instances of this object that are - // being used by this client. The corresponding increment should have happened - // in PlasmaClient::Get. +Status PlasmaClient::Impl::Release(const ObjectID& object_id) { + // If the client is already disconnected, ignore release requests. 
+ if (store_conn_ < 0) { + return Status::OK(); + } auto object_entry = objects_in_use_.find(object_id); ARROW_CHECK(object_entry != objects_in_use_.end()); object_entry->second->count -= 1; @@ -670,7 +613,7 @@ Status PlasmaClient::Impl::PerformRelease(const ObjectID& object_id) { // Check if the client is no longer using this object. if (object_entry->second->count == 0) { // Tell the store that the client no longer needs the object. - RETURN_NOT_OK(UnmapObject(object_id)); + RETURN_NOT_OK(MarkObjectUnused(object_id)); RETURN_NOT_OK(SendReleaseRequest(store_conn_, object_id)); auto iter = deletion_cache_.find(object_id); if (iter != deletion_cache_.end()) { @@ -681,50 +624,6 @@ Status PlasmaClient::Impl::PerformRelease(const ObjectID& object_id) { return Status::OK(); } -Status PlasmaClient::Impl::Release(const ObjectID& object_id) { - // If the client is already disconnected, ignore release requests. - if (store_conn_ < 0) { - return Status::OK(); - } - // If an object is in the deletion cache, handle it directly without waiting. - auto iter = deletion_cache_.find(object_id); - if (iter != deletion_cache_.end()) { - RETURN_NOT_OK(PerformRelease(object_id)); - return Status::OK(); - } - // Add the new object to the release history. - release_history_.push_front(object_id); - // If there are too many bytes in use by the client or if there are too many - // pending release calls, and there are at least some pending release calls in - // the release_history list, then release some objects. - - // TODO(wap): Eviction policy only works on host memory, and thus objects on - // the GPU cannot be released currently. - while ((in_use_object_bytes_ > std::min(kL3CacheSizeBytes, store_capacity_ / 100) || - release_history_.size() > config_.release_delay) && - release_history_.size() > 0) { - // Perform a release for the object ID for the first pending release. - RETURN_NOT_OK(PerformRelease(release_history_.back())); - // Remove the last entry from the release history. - release_history_.pop_back(); - } - return Status::OK(); -} - -Status PlasmaClient::Impl::FlushReleaseHistory() { - // If the client is already disconnected, ignore the flush. - if (store_conn_ < 0) { - return Status::OK(); - } - while (release_history_.size() > 0) { - // Perform a release for the object ID for the first pending release. - RETURN_NOT_OK(PerformRelease(release_history_.back())); - // Remove the last entry from the release history. - release_history_.pop_back(); - } - return Status::OK(); -} - // This method is used to query whether the plasma store contains an object. Status PlasmaClient::Impl::Contains(const ObjectID& object_id, bool* has_object) { // Check if we already have a reference to the object. @@ -855,8 +754,6 @@ Status PlasmaClient::Impl::Abort(const ObjectID& object_id) { ARROW_CHECK(!object_entry->second->is_sealed) << "Plasma client called abort on a sealed object"; - // Flush the release history. - RETURN_NOT_OK(FlushReleaseHistory()); // Make sure that the Plasma client only has one reference to the object. If // it has more, then the client needs to release the buffer before calling // abort. @@ -868,7 +765,7 @@ Status PlasmaClient::Impl::Abort(const ObjectID& object_id) { RETURN_NOT_OK(SendAbortRequest(store_conn_, object_id)); // Decrease the reference count to zero, then remove the object. 
object_entry->second->count--; - RETURN_NOT_OK(UnmapObject(object_id)); + RETURN_NOT_OK(MarkObjectUnused(object_id)); std::vector buffer; ObjectID id; @@ -878,7 +775,6 @@ Status PlasmaClient::Impl::Abort(const ObjectID& object_id) { } Status PlasmaClient::Impl::Delete(const std::vector& object_ids) { - RETURN_NOT_OK(FlushReleaseHistory()); std::vector not_in_use_ids; for (auto& object_id : object_ids) { // If the object is in used, skip it. @@ -981,8 +877,10 @@ Status PlasmaClient::Impl::Connect(const std::string& store_socket_name, } else { manager_conn_ = -1; } - config_.release_delay = release_delay; - in_use_object_bytes_ = 0; + if (release_delay != 0) { + ARROW_LOG(WARNING) << "The release_delay parameter in PlasmaClient::Connect " + << "is deprecated"; + } // Send a ConnectRequest to the store to get its memory capacity. RETURN_NOT_OK(SendConnectRequest(store_conn_)); std::vector buffer; @@ -1175,8 +1073,6 @@ Status PlasmaClient::Info(const ObjectID& object_id, int* object_status) { int PlasmaClient::get_manager_fd() const { return impl_->get_manager_fd(); } -Status PlasmaClient::FlushReleaseHistory() { return impl_->FlushReleaseHistory(); } - bool PlasmaClient::IsInUse(const ObjectID& object_id) { return impl_->IsInUse(object_id); } diff --git a/cpp/src/plasma/client.h b/cpp/src/plasma/client.h index 9e080b7760dc8..514d2bd0d6d06 100644 --- a/cpp/src/plasma/client.h +++ b/cpp/src/plasma/client.h @@ -34,11 +34,6 @@ using arrow::Status; namespace plasma { -/// We keep a queue of unreleased objects cached in the client until we start -/// sending release requests to the store. This is to avoid frequently mapping -/// and unmapping objects and evicting data from processor caches. -constexpr int64_t kPlasmaDefaultReleaseDelay = 64; - /// Object buffer data structure. struct ObjectBuffer { /// The data buffer. @@ -62,13 +57,12 @@ class ARROW_EXPORT PlasmaClient { /// \param manager_socket_name The name of the UNIX domain socket to use to /// connect to the local Plasma manager. If this is "", then this /// function will not connect to a manager. - /// \param release_delay Number of released objects that are kept around - /// and not evicted to avoid too many munmaps. + /// \param release_delay Deprecated (not used). /// \param num_retries number of attempts to connect to IPC socket, default 50 /// \return The return status. Status Connect(const std::string& store_socket_name, - const std::string& manager_socket_name, - int release_delay = kPlasmaDefaultReleaseDelay, int num_retries = -1); + const std::string& manager_socket_name, int release_delay = 0, + int num_retries = -1); /// Create an object in the Plasma Store. Any metadata for this object must be /// be passed in when the object is created. @@ -354,10 +348,6 @@ class ARROW_EXPORT PlasmaClient { FRIEND_TEST(TestPlasmaStore, LegacyGetTest); FRIEND_TEST(TestPlasmaStore, AbortTest); - /// This is a helper method that flushes all pending release calls to the - /// store. - Status FlushReleaseHistory(); - bool IsInUse(const ObjectID& object_id); class ARROW_NO_EXPORT Impl; diff --git a/cpp/src/plasma/store.cc b/cpp/src/plasma/store.cc index ae658d757c185..f6326ccf588de 100644 --- a/cpp/src/plasma/store.cc +++ b/cpp/src/plasma/store.cc @@ -327,7 +327,12 @@ void PlasmaStore::ReturnFromGet(GetRequest* get_req) { if (s.ok()) { // Send all of the file descriptors for the present objects. 
for (int store_fd : store_fds) { - WarnIfSigpipe(send_fd(get_req->client->fd, store_fd), get_req->client->fd); + // Only send the file descriptor if it hasn't been sent (see analogous + // logic in GetStoreFd in client.cc). + if (get_req->client->used_fds.find(store_fd) == get_req->client->used_fds.end()) { + WarnIfSigpipe(send_fd(get_req->client->fd, store_fd), get_req->client->fd); + get_req->client->used_fds.insert(store_fd); + } } } @@ -783,8 +788,12 @@ Status PlasmaStore::ProcessMessage(Client* client) { HANDLE_SIGPIPE( SendCreateReply(client->fd, object_id, &object, error_code, mmap_size), client->fd); - if (error_code == PlasmaError::OK && device_num == 0) { + // Only send the file descriptor if it hasn't been sent (see analogous + // logic in GetStoreFd in client.cc). Similar in ReturnFromGet. + if (error_code == PlasmaError::OK && device_num == 0 && + client->used_fds.find(object.store_fd) == client->used_fds.end()) { WarnIfSigpipe(send_fd(client->fd, object.store_fd), client->fd); + client->used_fds.insert(object.store_fd); } } break; case fb::MessageType::PlasmaCreateAndSealRequest: { diff --git a/cpp/src/plasma/store.h b/cpp/src/plasma/store.h index 8d3facd733f1c..0e0eb8323f3bb 100644 --- a/cpp/src/plasma/store.h +++ b/cpp/src/plasma/store.h @@ -54,6 +54,9 @@ struct Client { /// Object ids that are used by this client. std::unordered_set object_ids; + /// File descriptors that are used by this client. + std::unordered_set used_fds; + /// The file descriptor used to push notifications to client. This is only valid /// if client subscribes to plasma store. -1 indicates invalid. int notification_fd; diff --git a/cpp/src/plasma/test/client_tests.cc b/cpp/src/plasma/test/client_tests.cc index f820303aba42b..65a9b71b7f251 100644 --- a/cpp/src/plasma/test/client_tests.cc +++ b/cpp/src/plasma/test/client_tests.cc @@ -82,7 +82,7 @@ class TestPlasmaStore : public ::testing::Test { void CreateObject(PlasmaClient& client, const ObjectID& object_id, const std::vector& metadata, - const std::vector& data) { + const std::vector& data, bool release = true) { std::shared_ptr data_buffer; ARROW_CHECK_OK(client.Create(object_id, data.size(), &metadata[0], metadata.size(), &data_buffer)); @@ -90,7 +90,9 @@ class TestPlasmaStore : public ::testing::Test { data_buffer->mutable_data()[i] = data[i]; } ARROW_CHECK_OK(client.Seal(object_id)); - ARROW_CHECK_OK(client.Release(object_id)); + if (release) { + ARROW_CHECK_OK(client.Release(object_id)); + } } const std::string& GetStoreSocketName() const { return store_socket_name_; } @@ -155,11 +157,12 @@ TEST_F(TestPlasmaStore, SealErrorsTest) { // Create object. std::vector data(100, 0); - CreateObject(client_, object_id, {42}, data); + CreateObject(client_, object_id, {42}, data, false); // Trying to seal it again. result = client_.Seal(object_id); ASSERT_TRUE(result.IsPlasmaObjectAlreadySealed()); + ARROW_CHECK_OK(client_.Release(object_id)); } TEST_F(TestPlasmaStore, DeleteTest) { @@ -228,13 +231,7 @@ TEST_F(TestPlasmaStore, DeleteObjectsTest) { // client2_ won't send the release request immediately because the trigger // condition is not reached. The release is only added to release cache. object_buffers.clear(); - // The reference count went to zero, but the objects are still in the release - // cache. - ARROW_CHECK_OK(client_.Contains(object_id1, &has_object)); - ASSERT_TRUE(has_object); - ARROW_CHECK_OK(client_.Contains(object_id2, &has_object)); - ASSERT_TRUE(has_object); - // The Delete call will flush release cache and send the Delete request. 
+ // Delete the objects. result = client2_.Delete(std::vector{object_id1, object_id2}); ARROW_CHECK_OK(client_.Contains(object_id1, &has_object)); ASSERT_FALSE(has_object); @@ -277,7 +274,6 @@ TEST_F(TestPlasmaStore, GetTest) { // First create object. std::vector data = {3, 5, 6, 7, 9}; CreateObject(client_, object_id, {42}, data); - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_FALSE(client_.IsInUse(object_id)); object_buffers.clear(); @@ -291,11 +287,9 @@ TEST_F(TestPlasmaStore, GetTest) { auto metadata = object_buffers[0].metadata; object_buffers.clear(); ::arrow::AssertBufferEqual(*metadata, std::string{42}); - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_TRUE(client_.IsInUse(object_id)); } // Object is automatically released - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_FALSE(client_.IsInUse(object_id)); } @@ -314,17 +308,14 @@ TEST_F(TestPlasmaStore, LegacyGetTest) { // First create object. std::vector data = {3, 5, 6, 7, 9}; CreateObject(client_, object_id, {42}, data); - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_FALSE(client_.IsInUse(object_id)); ARROW_CHECK_OK(client_.Get(&object_id, 1, -1, &object_buffer)); AssertObjectBufferEqual(object_buffer, {42}, {3, 5, 6, 7, 9}); } // Object needs releasing manually - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_TRUE(client_.IsInUse(object_id)); ARROW_CHECK_OK(client_.Release(object_id)); - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_FALSE(client_.IsInUse(object_id)); } @@ -377,11 +368,9 @@ TEST_F(TestPlasmaStore, AbortTest) { ASSERT_TRUE(status.IsInvalid()); // Release, then abort. ARROW_CHECK_OK(client_.Release(object_id)); - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_TRUE(client_.IsInUse(object_id)); ARROW_CHECK_OK(client_.Abort(object_id)); - ARROW_CHECK_OK(client_.FlushReleaseHistory()); EXPECT_FALSE(client_.IsInUse(object_id)); // Test for object non-existence after the abort. @@ -394,7 +383,6 @@ TEST_F(TestPlasmaStore, AbortTest) { // Test that we can get the object. ARROW_CHECK_OK(client_.Get({object_id}, -1, &object_buffers)); AssertObjectBufferEqual(object_buffers[0], {42, 43}, {1, 2, 3, 4, 5}); - ARROW_CHECK_OK(client_.Release(object_id)); } TEST_F(TestPlasmaStore, MultipleClientTest) { diff --git a/docs/source/python/plasma.rst b/docs/source/python/plasma.rst index 09837cf6e9ef9..3df68eff59e00 100644 --- a/docs/source/python/plasma.rst +++ b/docs/source/python/plasma.rst @@ -60,7 +60,7 @@ socket name: .. code-block:: python import pyarrow.plasma as plasma - client = plasma.connect("/tmp/plasma", "", 0) + client = plasma.connect("/tmp/plasma", "") If the following error occurs from running the above Python code, that means that either the socket given is incorrect, or the ``./plasma_store`` is @@ -68,7 +68,7 @@ not currently running. Check to see if the Plasma store is still running. .. code-block:: shell - >>> client = plasma.connect("/tmp/plasma", "", 0) + >>> client = plasma.connect("/tmp/plasma", "") Connection to socket failed for pathname /tmp/plasma Could not connect to socket /tmp/plasma @@ -179,7 +179,7 @@ the object buffer. # Create a different client. Note that this second client could be # created in the same or in a separate, concurrent Python session. - client2 = plasma.connect("/tmp/plasma", "", 0) + client2 = plasma.connect("/tmp/plasma", "") # Get the object in the second client. This blocks until the object has been sealed. 
object_id2 = plasma.ObjectID(20 * b"a") @@ -221,7 +221,7 @@ of the object info might change in the future): import pyarrow.plasma as plasma import time - client = plasma.connect("/tmp/plasma", "", 0) + client = plasma.connect("/tmp/plasma", "") client.put("hello, world") # Sleep a little so we get different creation times @@ -452,7 +452,7 @@ You can test this with the following script: import pyarrow.plasma as plasma import time - client = plasma.connect("/tmp/plasma", "", 0) + client = plasma.connect("/tmp/plasma", "") data = np.random.randn(100000000) tensor = pa.Tensor.from_numpy(data) diff --git a/python/pyarrow/_plasma.pyx b/python/pyarrow/_plasma.pyx index 2fad09c0549c2..f7db3b4e0fec3 100644 --- a/python/pyarrow/_plasma.pyx +++ b/python/pyarrow/_plasma.pyx @@ -30,10 +30,11 @@ from cython.operator cimport dereference as deref, preincrement as inc from cpython.pycapsule cimport * import collections -import pyarrow import random import socket +import warnings +import pyarrow from pyarrow.lib cimport Buffer, NativeFile, check_status, pyarrow_wrap_buffer from pyarrow.includes.libarrow cimport (CBuffer, CMutableBuffer, CFixedSizeBufferWriter, CStatus) @@ -872,7 +873,7 @@ cdef class PlasmaClient: return result -def connect(store_socket_name, manager_socket_name, int release_delay, +def connect(store_socket_name, manager_socket_name, int release_delay=0, int num_retries=-1): """ Return a new PlasmaClient that is connected a plasma store and @@ -885,8 +886,7 @@ def connect(store_socket_name, manager_socket_name, int release_delay, manager_socket_name : str Name of the socket the plasma manager is listening at. release_delay : int - The maximum number of objects that the client will keep and - delay releasing (for caching reasons). + This parameter is deprecated and has no effect. num_retries : int, default -1 Number of times to try to connect to plasma store. 
Default value of -1 uses the default (50) @@ -894,6 +894,9 @@ def connect(store_socket_name, manager_socket_name, int release_delay, cdef PlasmaClient result = PlasmaClient() result.store_socket_name = store_socket_name.encode() result.manager_socket_name = manager_socket_name.encode() + if release_delay != 0: + warnings.warn("release_delay in PlasmaClient.connect is deprecated", + FutureWarning) with nogil: check_status(result.client.get() .Connect(result.store_socket_name, diff --git a/python/pyarrow/tensorflow/plasma_op.cc b/python/pyarrow/tensorflow/plasma_op.cc index a341d5a53988f..4e6449adfc85c 100644 --- a/python/pyarrow/tensorflow/plasma_op.cc +++ b/python/pyarrow/tensorflow/plasma_op.cc @@ -77,7 +77,7 @@ class TensorToPlasmaOp : public tf::AsyncOpKernel { if (!connected_) { VLOG(1) << "Connecting to Plasma..."; ARROW_CHECK_OK(client_.Connect(plasma_store_socket_name_, - plasma_manager_socket_name_, 0)); + plasma_manager_socket_name_)); VLOG(1) << "Connected!"; connected_ = true; } @@ -249,7 +249,7 @@ class PlasmaToTensorOp : public tf::AsyncOpKernel { if (!connected_) { VLOG(1) << "Connecting to Plasma..."; ARROW_CHECK_OK(client_.Connect(plasma_store_socket_name_, - plasma_manager_socket_name_, 0)); + plasma_manager_socket_name_)); VLOG(1) << "Connected!"; connected_ = true; } diff --git a/python/pyarrow/tests/test_plasma.py b/python/pyarrow/tests/test_plasma.py index e3d31b7de1990..66449e6dba9a3 100644 --- a/python/pyarrow/tests/test_plasma.py +++ b/python/pyarrow/tests/test_plasma.py @@ -121,8 +121,8 @@ def setup_method(self, test_method): use_one_memory_mapped_file=use_one_memory_mapped_file) self.plasma_store_name, self.p = self.plasma_store_ctx.__enter__() # Connect to Plasma. - self.plasma_client = plasma.connect(self.plasma_store_name, "", 64) - self.plasma_client2 = plasma.connect(self.plasma_store_name, "", 0) + self.plasma_client = plasma.connect(self.plasma_store_name, "") + self.plasma_client2 = plasma.connect(self.plasma_store_name, "") def teardown_method(self, test_method): try: @@ -948,7 +948,7 @@ def test_use_huge_pages(): plasma_store_memory=2*10**9, plasma_directory="/mnt/hugepages", use_hugepages=True) as (plasma_store_name, p): - plasma_client = plasma.connect(plasma_store_name, "", 64) + plasma_client = plasma.connect(plasma_store_name, "") create_object(plasma_client, 10**8) @@ -962,7 +962,7 @@ def test_plasma_client_sharing(): with plasma.start_plasma_store( plasma_store_memory=DEFAULT_PLASMA_STORE_MEMORY) \ as (plasma_store_name, p): - plasma_client = plasma.connect(plasma_store_name, "", 64) + plasma_client = plasma.connect(plasma_store_name, "") object_id = plasma_client.put(np.zeros(3)) buf = plasma_client.get(object_id) del plasma_client diff --git a/python/pyarrow/tests/test_plasma_tf_op.py b/python/pyarrow/tests/test_plasma_tf_op.py index d9bf915d663aa..51e8b283e0a1d 100644 --- a/python/pyarrow/tests/test_plasma_tf_op.py +++ b/python/pyarrow/tests/test_plasma_tf_op.py @@ -94,7 +94,7 @@ def test_plasma_tf_op(use_gpu=False): pytest.skip("TensorFlow Op not found") with plasma.start_plasma_store(10**8) as (plasma_store_name, p): - client = plasma.connect(plasma_store_name, "", 0) + client = plasma.connect(plasma_store_name, "") for dtype in [np.float32, np.float64, np.int8, np.int16, np.int32, np.int64]: run_tensorflow_test_with_dtype(tf, plasma, plasma_store_name, From 8c413036775796d9bcc52be56373bbb45de8c0ae Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Fri, 14 Dec 2018 07:27:08 -0800 Subject: [PATCH 038/328] ARROW-4015: [Plasma] remove unused 
interfaces for plasma manager https://github.com/apache/arrow/issues/3154 This removes unused plasma interfaces Fetch(), Wait(), Transfer() and Info(), which depend on plasma manager which has already been removed from ray. Author: Philipp Moritz Author: Zhijun Fu Author: Robert Nishihara Closes #3167 from zhijunfu/remove-legacy-interfaces and squashes the following commits: 0efb5005f fix tensorflow op be92e9085 fix java client 9da2cd38b Update _plasma.pyx 16ec63e9a More updates e7413f739 Update _plasma.pyx 21398b5e7 merge bcb320400 address comments 7967aea09 Merge branch 'master' into remove-legacy-interfaces 583cd97c4 ARROW-4015: remove unused interfaces for plasma manager --- c_glib/plasma-glib/client.cpp | 3 +- cpp/apidoc/tutorials/plasma.md | 8 +- cpp/apidoc/tutorials/tensor_to_py.md | 2 +- cpp/src/plasma/client.cc | 111 +------------- cpp/src/plasma/client.h | 100 +----------- cpp/src/plasma/common.cc | 3 - cpp/src/plasma/common.h | 24 --- cpp/src/plasma/format/plasma.fbs | 74 --------- ...org_apache_arrow_plasma_PlasmaClientJNI.cc | 73 --------- cpp/src/plasma/plasma.h | 3 - cpp/src/plasma/protocol.cc | 143 ------------------ cpp/src/plasma/protocol.h | 35 ----- cpp/src/plasma/test/client_tests.cc | 2 - cpp/src/plasma/test/serialization_tests.cc | 116 -------------- docs/source/python/plasma.rst | 10 +- .../apache/arrow/plasma/ObjectStoreLink.java | 27 ---- .../org/apache/arrow/plasma/PlasmaClient.java | 23 --- python/benchmarks/plasma.py | 4 +- python/examples/plasma/sorting/sort_df.py | 2 +- python/pyarrow/_plasma.pyx | 130 +--------------- python/pyarrow/tensorflow/plasma_op.cc | 18 +-- python/pyarrow/tests/test_plasma.py | 16 +- python/pyarrow/tests/test_plasma_tf_op.py | 8 +- 23 files changed, 41 insertions(+), 894 deletions(-) diff --git a/c_glib/plasma-glib/client.cpp b/c_glib/plasma-glib/client.cpp index c05a71085dd2d..9591a0a714f27 100644 --- a/c_glib/plasma-glib/client.cpp +++ b/c_glib/plasma-glib/client.cpp @@ -41,8 +41,7 @@ G_BEGIN_DECLS * * #GPlasmaClientCreateOptions is a class for customizing object creation. * - * #GPlasmaClient is a class for an interface with a plasma store - * and a plasma manager. + * #GPlasmaClient is a class for an interface with a plasma store. * * Since: 0.12.0 */ diff --git a/cpp/apidoc/tutorials/plasma.md b/cpp/apidoc/tutorials/plasma.md index 472d479c4b2f9..b9046d50bc922 100644 --- a/cpp/apidoc/tutorials/plasma.md +++ b/cpp/apidoc/tutorials/plasma.md @@ -80,7 +80,7 @@ using namespace plasma; int main(int argc, char** argv) { // Start up and connect a Plasma client. PlasmaClient client; - ARROW_CHECK_OK(client.Connect("/tmp/plasma", "")); + ARROW_CHECK_OK(client.Connect("/tmp/plasma")); // Disconnect the Plasma client. ARROW_CHECK_OK(client.Disconnect()); } @@ -226,7 +226,7 @@ using namespace plasma; int main(int argc, char** argv) { // Start up and connect a Plasma client. PlasmaClient client; - ARROW_CHECK_OK(client.Connect("/tmp/plasma", "")); + ARROW_CHECK_OK(client.Connect("/tmp/plasma")); // Create an object with a fixed ObjectID. ObjectID object_id = ObjectID::from_binary("00000000000000000000"); int64_t data_size = 1000; @@ -332,7 +332,7 @@ using namespace plasma; int main(int argc, char** argv) { // Start up and connect a Plasma client. 
PlasmaClient client; - ARROW_CHECK_OK(client.Connect("/tmp/plasma", "")); + ARROW_CHECK_OK(client.Connect("/tmp/plasma")); ObjectID object_id = ObjectID::from_binary("00000000000000000000"); ObjectBuffer object_buffer; ARROW_CHECK_OK(client.Get(&object_id, 1, -1, &object_buffer)); @@ -421,7 +421,7 @@ using namespace plasma; int main(int argc, char** argv) { // Start up and connect a Plasma client. PlasmaClient client; - ARROW_CHECK_OK(client.Connect("/tmp/plasma", "")); + ARROW_CHECK_OK(client.Connect("/tmp/plasma")); int fd; ARROW_CHECK_OK(client.Subscribe(&fd)); diff --git a/cpp/apidoc/tutorials/tensor_to_py.md b/cpp/apidoc/tutorials/tensor_to_py.md index 0be973a4f3df9..cd191fea07d09 100644 --- a/cpp/apidoc/tutorials/tensor_to_py.md +++ b/cpp/apidoc/tutorials/tensor_to_py.md @@ -105,7 +105,7 @@ The `inputs` variable will be a list of Object IDs in their raw byte string form import pyarrow as pa import pyarrow.plasma as plasma -plasma_client = plasma.connect('/tmp/plasma', '', 0) +plasma_client = plasma.connect('/tmp/plasma') # inputs: a list of object ids inputs = [20 * b'1'] diff --git a/cpp/src/plasma/client.cc b/cpp/src/plasma/client.cc index 2dbe2b41478ea..4215399c0b009 100644 --- a/cpp/src/plasma/client.cc +++ b/cpp/src/plasma/client.cc @@ -198,17 +198,6 @@ class PlasmaClient::Impl : public std::enable_shared_from_this= 0) { - close(manager_conn_); - manager_conn_ = -1; - } - return Status::OK(); -} - -Status PlasmaClient::Impl::Transfer(const char* address, int port, - const ObjectID& object_id) { - return SendDataRequest(manager_conn_, object_id, address, port); -} - -Status PlasmaClient::Impl::Fetch(int num_object_ids, const ObjectID* object_ids) { - ARROW_CHECK(manager_conn_ >= 0); - return SendFetchRequest(manager_conn_, object_ids, num_object_ids); -} - -int PlasmaClient::Impl::get_manager_fd() const { return manager_conn_; } - -Status PlasmaClient::Impl::Info(const ObjectID& object_id, int* object_status) { - ARROW_CHECK(manager_conn_ >= 0); - - RETURN_NOT_OK(SendStatusRequest(manager_conn_, &object_id, 1)); - std::vector buffer; - RETURN_NOT_OK(PlasmaReceive(manager_conn_, MessageType::PlasmaStatusReply, &buffer)); - ObjectID id; - RETURN_NOT_OK(ReadStatusReply(buffer.data(), buffer.size(), &id, object_status, 1)); - ARROW_CHECK(object_id == id); - return Status::OK(); -} - -Status PlasmaClient::Impl::Wait(int64_t num_object_requests, - ObjectRequest* object_requests, int num_ready_objects, - int64_t timeout_ms, int* num_objects_ready) { - ARROW_CHECK(manager_conn_ >= 0); - ARROW_CHECK(num_object_requests > 0); - ARROW_CHECK(num_ready_objects > 0); - ARROW_CHECK(num_ready_objects <= num_object_requests); - - for (int i = 0; i < num_object_requests; ++i) { - ARROW_CHECK(object_requests[i].type == ObjectRequestType::PLASMA_QUERY_LOCAL || - object_requests[i].type == ObjectRequestType::PLASMA_QUERY_ANYWHERE); - } - - RETURN_NOT_OK(SendWaitRequest(manager_conn_, object_requests, num_object_requests, - num_ready_objects, timeout_ms)); - std::vector buffer; - RETURN_NOT_OK(PlasmaReceive(manager_conn_, MessageType::PlasmaWaitReply, &buffer)); - RETURN_NOT_OK( - ReadWaitReply(buffer.data(), buffer.size(), object_requests, &num_ready_objects)); - - *num_objects_ready = 0; - for (int i = 0; i < num_object_requests; ++i) { - ObjectRequestType type = object_requests[i].type; - auto status = static_cast(object_requests[i].location); - switch (type) { - case ObjectRequestType::PLASMA_QUERY_LOCAL: - if (status == fb::ObjectStatus::Local) { - *num_objects_ready += 1; - } - break; - case 
ObjectRequestType::PLASMA_QUERY_ANYWHERE: - if (status == fb::ObjectStatus::Local || status == fb::ObjectStatus::Remote) { - *num_objects_ready += 1; - } else { - ARROW_CHECK(status == fb::ObjectStatus::Nonexistent); - } - break; - default: - ARROW_LOG(FATAL) << "This code should be unreachable."; - } - } return Status::OK(); } @@ -1052,27 +964,6 @@ Status PlasmaClient::DecodeNotification(const uint8_t* buffer, ObjectID* object_ Status PlasmaClient::Disconnect() { return impl_->Disconnect(); } -Status PlasmaClient::Fetch(int num_object_ids, const ObjectID* object_ids) { - return impl_->Fetch(num_object_ids, object_ids); -} - -Status PlasmaClient::Wait(int64_t num_object_requests, ObjectRequest* object_requests, - int num_ready_objects, int64_t timeout_ms, - int* num_objects_ready) { - return impl_->Wait(num_object_requests, object_requests, num_ready_objects, timeout_ms, - num_objects_ready); -} - -Status PlasmaClient::Transfer(const char* addr, int port, const ObjectID& object_id) { - return impl_->Transfer(addr, port, object_id); -} - -Status PlasmaClient::Info(const ObjectID& object_id, int* object_status) { - return impl_->Info(object_id, object_status); -} - -int PlasmaClient::get_manager_fd() const { return impl_->get_manager_fd(); } - bool PlasmaClient::IsInUse(const ObjectID& object_id) { return impl_->IsInUse(object_id); } diff --git a/cpp/src/plasma/client.h b/cpp/src/plasma/client.h index 514d2bd0d6d06..ac9e8eb0fe9c9 100644 --- a/cpp/src/plasma/client.h +++ b/cpp/src/plasma/client.h @@ -49,19 +49,20 @@ class ARROW_EXPORT PlasmaClient { PlasmaClient(); ~PlasmaClient(); - /// Connect to the local plasma store and plasma manager. Return - /// the resulting connection. + /// Connect to the local plasma store. Return the resulting connection. /// /// \param store_socket_name The name of the UNIX domain socket to use to /// connect to the Plasma store. /// \param manager_socket_name The name of the UNIX domain socket to use to /// connect to the local Plasma manager. If this is "", then this /// function will not connect to a manager. + /// Note that plasma manager is no longer supported, this function + /// will return failure if this is not "". /// \param release_delay Deprecated (not used). /// \param num_retries number of attempts to connect to IPC socket, default 50 /// \return The return status. Status Connect(const std::string& store_socket_name, - const std::string& manager_socket_name, int release_delay = 0, + const std::string& manager_socket_name = "", int release_delay = 0, int num_retries = -1); /// Create an object in the Plasma Store. Any metadata for this object must be @@ -249,99 +250,6 @@ class ARROW_EXPORT PlasmaClient { /// \return The return status. Status Disconnect(); - /// Attempt to initiate the transfer of some objects from remote Plasma - /// Stores. - /// This method does not guarantee that the fetched objects will arrive - /// locally. - /// - /// For an object that is available in the local Plasma Store, this method - /// will - /// not do anything. For an object that is not available locally, it will - /// check - /// if the object are already being fetched. If so, it will not do anything. - /// If - /// not, it will query the object table for a list of Plasma Managers that - /// have - /// the object. The object table will return a non-empty list, and this Plasma - /// Manager will attempt to initiate transfers from one of those Plasma - /// Managers. - /// - /// This function is non-blocking. 
- /// - /// This method is idempotent in the sense that it is ok to call it multiple - /// times. - /// - /// \param num_object_ids The number of object IDs fetch is being called on. - /// \param object_ids The IDs of the objects that fetch is being called on. - /// \return The return status. - Status Fetch(int num_object_ids, const ObjectID* object_ids); - - /// Wait for (1) a specified number of objects to be available (sealed) in the - /// local Plasma Store or in a remote Plasma Store, or (2) for a timeout to - /// expire. This is a blocking call. - /// - /// \param num_object_requests Size of the object_requests array. - /// \param object_requests Object event array. Each element contains a request - /// for a particular object_id. The type of request is specified in the - /// "type" field. - /// - A PLASMA_QUERY_LOCAL request is satisfied when object_id becomes - /// available in the local Plasma Store. In this case, this function - /// sets the "status" field to ObjectStatus::Local. Note, if the - /// status - /// is not ObjectStatus::Local, it will be ObjectStatus::Nonexistent, - /// but it may exist elsewhere in the system. - /// - A PLASMA_QUERY_ANYWHERE request is satisfied when object_id - /// becomes - /// available either at the local Plasma Store or on a remote Plasma - /// Store. In this case, the functions sets the "status" field to - /// ObjectStatus::Local or ObjectStatus::Remote. - /// \param num_ready_objects The number of requests in object_requests array - /// that - /// must be satisfied before the function returns, unless it timeouts. - /// The num_ready_objects should be no larger than num_object_requests. - /// \param timeout_ms Timeout value in milliseconds. If this timeout expires - /// before min_num_ready_objects of requests are satisfied, the - /// function - /// returns. - /// \param num_objects_ready Out parameter for number of satisfied requests in - /// the object_requests list. If the returned number is less than - /// min_num_ready_objects this means that timeout expired. - /// \return The return status. - Status Wait(int64_t num_object_requests, ObjectRequest* object_requests, - int num_ready_objects, int64_t timeout_ms, int* num_objects_ready); - - /// Transfer local object to a different plasma manager. - /// - /// \param addr IP address of the plasma manager we are transfering to. - /// \param port Port of the plasma manager we are transfering to. - /// \param object_id ObjectID of the object we are transfering. - /// \return The return status. - Status Transfer(const char* addr, int port, const ObjectID& object_id); - - /// Return the status of a given object. This method may query the object - /// table. - /// - /// \param object_id The ID of the object whose status we query. - /// \param object_status Out parameter for object status. Can take the - /// following values. - /// - PLASMA_CLIENT_LOCAL, if object is stored in the local Plasma - /// Store. - /// has been already scheduled by the Plasma Manager. - /// - PLASMA_CLIENT_TRANSFER, if the object is either currently being - /// transferred or just scheduled. - /// - PLASMA_CLIENT_REMOTE, if the object is stored at a remote - /// Plasma Store. - /// - PLASMA_CLIENT_DOES_NOT_EXIST, if the object doesn’t exist in the - /// system. - /// \return The return status. - Status Info(const ObjectID& object_id, int* object_status); - - /// Get the file descriptor for the socket connection to the plasma manager. - /// - /// \return The file descriptor for the manager connection. 
If there is no - /// connection to the manager, this is -1. - int get_manager_fd() const; - private: friend class PlasmaBuffer; FRIEND_TEST(TestPlasmaStore, GetTest); diff --git a/cpp/src/plasma/common.cc b/cpp/src/plasma/common.cc index 0ca17cf814f8a..1b86fd80b4920 100644 --- a/cpp/src/plasma/common.cc +++ b/cpp/src/plasma/common.cc @@ -107,9 +107,6 @@ bool UniqueID::operator==(const UniqueID& rhs) const { return std::memcmp(data(), rhs.data(), kUniqueIDSize) == 0; } -ARROW_EXPORT fb::ObjectStatus ObjectStatusLocal = fb::ObjectStatus::Local; -ARROW_EXPORT fb::ObjectStatus ObjectStatusRemote = fb::ObjectStatus::Remote; - const PlasmaStoreInfo* plasma_config; } // namespace plasma diff --git a/cpp/src/plasma/common.h b/cpp/src/plasma/common.h index 7090428ff41c9..38925fef929e4 100644 --- a/cpp/src/plasma/common.h +++ b/cpp/src/plasma/common.h @@ -66,30 +66,6 @@ typedef UniqueID ObjectID; /// Size of object hash digests. constexpr int64_t kDigestSize = sizeof(uint64_t); -enum class ObjectRequestType : int { - /// Query for object in the local plasma store. - PLASMA_QUERY_LOCAL = 1, - /// Query for object in the local plasma store or in a remote plasma store. - PLASMA_QUERY_ANYWHERE -}; - -/// Object request data structure. Used for Wait. -struct ObjectRequest { - /// The ID of the requested object. If ID_NIL request any object. - ObjectID object_id; - /// Request associated to the object. It can take one of the following values: - /// - PLASMA_QUERY_LOCAL: return if or when the object is available in the - /// local Plasma Store. - /// - PLASMA_QUERY_ANYWHERE: return if or when the object is available in - /// the system (i.e., either in the local or a remote Plasma Store). - ObjectRequestType type; - /// Object location. This can be - /// - ObjectLocation::Local: object is ready at the local Plasma Store. - /// - ObjectLocation::Remote: object is ready at a remote Plasma Store. - /// - ObjectLocation::Nonexistent: object does not exist in the system. - ObjectLocation location; -}; - enum class ObjectState : int { /// Object was created but not sealed in the local Plasma Store. PLASMA_CREATED = 1, diff --git a/cpp/src/plasma/format/plasma.fbs b/cpp/src/plasma/format/plasma.fbs index ef934fbd81ed2..b3c890391887e 100644 --- a/cpp/src/plasma/format/plasma.fbs +++ b/cpp/src/plasma/format/plasma.fbs @@ -42,9 +42,6 @@ enum MessageType:long { // Delete an object. PlasmaDeleteRequest, PlasmaDeleteReply, - // Get status of an object. - PlasmaStatusRequest, - PlasmaStatusReply, // See if the store contains an object (will be deprecated). PlasmaContainsRequest, PlasmaContainsReply, @@ -57,11 +54,6 @@ enum MessageType:long { // Make room for new objects in the plasma store. PlasmaEvictRequest, PlasmaEvictReply, - // Fetch objects from remote Plasma stores. - PlasmaFetchRequest, - // Wait for objects to be ready either from local or remote Plasma stores. - PlasmaWaitRequest, - PlasmaWaitReply, // Subscribe to a list of objects or to all objects. PlasmaSubscribeRequest, // Unsubscribe. @@ -239,35 +231,6 @@ table PlasmaDeleteReply { errors: [PlasmaError]; } -table PlasmaStatusRequest { - // IDs of the objects stored at local Plasma store we request the status of. - object_ids: [string]; -} - -enum ObjectStatus:int { - // Object is stored in the local Plasma Store. - Local, - // Object is stored on a remote Plasma store, and it is not stored on the - // local Plasma Store. - Remote, - // Object is not stored in the system. 
- Nonexistent, - // Object is currently transferred from a remote Plasma store the local - // Plasma Store. - Transfer -} - -table PlasmaStatusReply { - // IDs of the objects being returned. - object_ids: [string]; - // Status of the object. - status: [ObjectStatus]; -} - -// PlasmaContains is a subset of PlasmaStatus which does not -// involve the plasma manager, only the store. We should consider -// unifying them in the future and deprecating PlasmaContains. - table PlasmaContainsRequest { // ID of the object we are querying. object_id: string; @@ -309,43 +272,6 @@ table PlasmaEvictReply { num_bytes: ulong; } -table PlasmaFetchRequest { - // IDs of objects to be gotten. - object_ids: [string]; -} - -table ObjectRequestSpec { - // ID of the object. - object_id: string; - // The type of the object. This specifies whether we - // will be waiting for an object store in the local or - // global Plasma store. - type: int; -} - -table PlasmaWaitRequest { - // Array of object requests whose status we are asking for. - object_requests: [ObjectRequestSpec]; - // Number of objects expected to be returned, if available. - num_ready_objects: int; - // timeout - timeout: long; -} - -table ObjectReply { - // ID of the object. - object_id: string; - // The object status. This specifies where the object is stored. - status: ObjectStatus; -} - -table PlasmaWaitReply { - // Array of object requests being returned. - object_requests: [ObjectReply]; - // Number of objects expected to be returned, if available. - num_ready_objects: int; -} - table PlasmaSubscribeRequest { } diff --git a/cpp/src/plasma/lib/java/org_apache_arrow_plasma_PlasmaClientJNI.cc b/cpp/src/plasma/lib/java/org_apache_arrow_plasma_PlasmaClientJNI.cc index 7cd2f3574423c..fa376ec43ce13 100644 --- a/cpp/src/plasma/lib/java/org_apache_arrow_plasma_PlasmaClientJNI.cc +++ b/cpp/src/plasma/lib/java/org_apache_arrow_plasma_PlasmaClientJNI.cc @@ -220,79 +220,6 @@ JNIEXPORT jboolean JNICALL Java_org_apache_arrow_plasma_PlasmaClientJNI_contains return has_object; } -JNIEXPORT void JNICALL Java_org_apache_arrow_plasma_PlasmaClientJNI_fetch( - JNIEnv* env, jclass cls, jlong conn, jobjectArray object_ids) { - plasma::PlasmaClient* client = reinterpret_cast(conn); - jsize num_oids = env->GetArrayLength(object_ids); - - std::vector oids(num_oids); - for (int i = 0; i < num_oids; ++i) { - jbyteArray_to_object_id( - env, reinterpret_cast(env->GetObjectArrayElement(object_ids, i)), - &oids[i]); - } - - ARROW_CHECK_OK(client->Fetch(static_cast(num_oids), oids.data())); - - return; -} - -JNIEXPORT jobjectArray JNICALL Java_org_apache_arrow_plasma_PlasmaClientJNI_wait( - JNIEnv* env, jclass cls, jlong conn, jobjectArray object_ids, jint timeout_ms, - jint num_returns) { - plasma::PlasmaClient* client = reinterpret_cast(conn); - jsize num_oids = env->GetArrayLength(object_ids); - - if (num_returns < 0) { - jclass Exception = env->FindClass("java/lang/RuntimeException"); - env->ThrowNew(Exception, "The argument num_returns cannot be less than zero."); - return nullptr; - } - if (num_returns > num_oids) { - jclass Exception = env->FindClass("java/lang/RuntimeException"); - env->ThrowNew(Exception, - "The argument num_returns cannot be greater than len(object_ids)."); - return nullptr; - } - - std::vector oreqs(num_oids); - - for (int i = 0; i < num_oids; ++i) { - jbyteArray_to_object_id( - env, reinterpret_cast(env->GetObjectArrayElement(object_ids, i)), - &oreqs[i].object_id); - oreqs[i].type = plasma::ObjectRequestType::PLASMA_QUERY_ANYWHERE; - } - - int 
num_return_objects; - // TODO: may be blocked. consider to add the thread support - ARROW_CHECK_OK(client->Wait(static_cast(num_oids), oreqs.data(), num_returns, - static_cast(timeout_ms), &num_return_objects)); - - int num_to_return = std::min(num_return_objects, num_returns); - jclass clsByteArray = env->FindClass("[B"); - jobjectArray ret = env->NewObjectArray(num_to_return, clsByteArray, nullptr); - - int num_returned = 0; - jbyteArray oid = nullptr; - for (int i = 0; i < num_oids; ++i) { - if (num_returned >= num_to_return) { - break; - } - - if (oreqs[i].location == plasma::ObjectLocation::Local || - oreqs[i].location == plasma::ObjectLocation::Remote) { - oid = env->NewByteArray(OBJECT_ID_SIZE); - object_id_to_jbyteArray(env, oid, &oreqs[i].object_id); - env->SetObjectArrayElement(ret, num_returned, oid); - num_returned++; - } - } - ARROW_CHECK(num_returned == num_to_return); - - return ret; -} - JNIEXPORT jlong JNICALL Java_org_apache_arrow_plasma_PlasmaClientJNI_evict( JNIEnv* env, jclass cls, jlong conn, jlong num_bytes) { plasma::PlasmaClient* client = reinterpret_cast(conn); diff --git a/cpp/src/plasma/plasma.h b/cpp/src/plasma/plasma.h index 83caec7ee4958..aafe527466913 100644 --- a/cpp/src/plasma/plasma.h +++ b/cpp/src/plasma/plasma.h @@ -68,9 +68,6 @@ constexpr int64_t kBlockSize = 64; struct Client; -/// Mapping from object IDs to type and status of the request. -typedef std::unordered_map ObjectRequestMap; - // TODO(pcm): Replace this by the flatbuffers message PlasmaObjectSpec. struct PlasmaObject { #ifdef PLASMA_CUDA diff --git a/cpp/src/plasma/protocol.cc b/cpp/src/plasma/protocol.cc index c437840874538..a878647718264 100644 --- a/cpp/src/plasma/protocol.cc +++ b/cpp/src/plasma/protocol.cc @@ -42,10 +42,6 @@ using flatbuffers::uoffset_t; #define PLASMA_CHECK_ENUM(x, y) \ static_assert(static_cast(x) == static_cast(y), "protocol mismatch") -PLASMA_CHECK_ENUM(ObjectLocation::Local, fb::ObjectStatus::Local); -PLASMA_CHECK_ENUM(ObjectLocation::Remote, fb::ObjectStatus::Remote); -PLASMA_CHECK_ENUM(ObjectLocation::Nonexistent, fb::ObjectStatus::Nonexistent); - flatbuffers::Offset>> ToFlatbuffer(flatbuffers::FlatBufferBuilder* fbb, const ObjectID* object_ids, int64_t num_objects) { @@ -367,56 +363,6 @@ Status ReadDeleteReply(uint8_t* data, size_t size, std::vector* object return Status::OK(); } -// Satus messages. 
- -Status SendStatusRequest(int sock, const ObjectID* object_ids, int64_t num_objects) { - flatbuffers::FlatBufferBuilder fbb; - auto message = - fb::CreatePlasmaStatusRequest(fbb, ToFlatbuffer(&fbb, object_ids, num_objects)); - return PlasmaSend(sock, MessageType::PlasmaStatusRequest, &fbb, message); -} - -Status ReadStatusRequest(uint8_t* data, size_t size, ObjectID object_ids[], - int64_t num_objects) { - DCHECK(data); - auto message = flatbuffers::GetRoot(data); - DCHECK(VerifyFlatbuffer(message, data, size)); - for (uoffset_t i = 0; i < num_objects; ++i) { - object_ids[i] = ObjectID::from_binary(message->object_ids()->Get(i)->str()); - } - return Status::OK(); -} - -Status SendStatusReply(int sock, ObjectID object_ids[], int object_status[], - int64_t num_objects) { - flatbuffers::FlatBufferBuilder fbb; - auto message = - fb::CreatePlasmaStatusReply(fbb, ToFlatbuffer(&fbb, object_ids, num_objects), - fbb.CreateVector(object_status, num_objects)); - return PlasmaSend(sock, MessageType::PlasmaStatusReply, &fbb, message); -} - -int64_t ReadStatusReply_num_objects(uint8_t* data, size_t size) { - DCHECK(data); - auto message = flatbuffers::GetRoot(data); - DCHECK(VerifyFlatbuffer(message, data, size)); - return message->object_ids()->size(); -} - -Status ReadStatusReply(uint8_t* data, size_t size, ObjectID object_ids[], - int object_status[], int64_t num_objects) { - DCHECK(data); - auto message = flatbuffers::GetRoot(data); - DCHECK(VerifyFlatbuffer(message, data, size)); - for (uoffset_t i = 0; i < num_objects; ++i) { - object_ids[i] = ObjectID::from_binary(message->object_ids()->Get(i)->str()); - } - for (uoffset_t i = 0; i < num_objects; ++i) { - object_status[i] = message->status()->data()[i]; - } - return Status::OK(); -} - // Contains messages. Status SendContainsRequest(int sock, ObjectID object_id) { @@ -640,95 +586,6 @@ Status ReadGetReply(uint8_t* data, size_t size, ObjectID object_ids[], } return Status::OK(); } -// Fetch messages. - -Status SendFetchRequest(int sock, const ObjectID* object_ids, int64_t num_objects) { - flatbuffers::FlatBufferBuilder fbb; - auto message = - fb::CreatePlasmaFetchRequest(fbb, ToFlatbuffer(&fbb, object_ids, num_objects)); - return PlasmaSend(sock, MessageType::PlasmaFetchRequest, &fbb, message); -} - -Status ReadFetchRequest(uint8_t* data, size_t size, std::vector& object_ids) { - DCHECK(data); - auto message = flatbuffers::GetRoot(data); - DCHECK(VerifyFlatbuffer(message, data, size)); - for (uoffset_t i = 0; i < message->object_ids()->size(); ++i) { - object_ids.push_back(ObjectID::from_binary(message->object_ids()->Get(i)->str())); - } - return Status::OK(); -} - -// Wait messages. 
- -Status SendWaitRequest(int sock, ObjectRequest object_requests[], int64_t num_requests, - int num_ready_objects, int64_t timeout_ms) { - flatbuffers::FlatBufferBuilder fbb; - - std::vector> object_request_specs; - for (int i = 0; i < num_requests; i++) { - object_request_specs.push_back(fb::CreateObjectRequestSpec( - fbb, fbb.CreateString(object_requests[i].object_id.binary()), - static_cast(object_requests[i].type))); - } - - auto message = fb::CreatePlasmaWaitRequest(fbb, fbb.CreateVector(object_request_specs), - num_ready_objects, timeout_ms); - return PlasmaSend(sock, MessageType::PlasmaWaitRequest, &fbb, message); -} - -Status ReadWaitRequest(uint8_t* data, size_t size, ObjectRequestMap& object_requests, - int64_t* timeout_ms, int* num_ready_objects) { - DCHECK(data); - auto message = flatbuffers::GetRoot(data); - DCHECK(VerifyFlatbuffer(message, data, size)); - *num_ready_objects = message->num_ready_objects(); - *timeout_ms = message->timeout(); - - for (uoffset_t i = 0; i < message->object_requests()->size(); i++) { - ObjectID object_id = - ObjectID::from_binary(message->object_requests()->Get(i)->object_id()->str()); - ObjectRequest object_request( - {object_id, - static_cast(message->object_requests()->Get(i)->type()), - ObjectLocation::Nonexistent}); - object_requests[object_id] = object_request; - } - return Status::OK(); -} - -Status SendWaitReply(int sock, const ObjectRequestMap& object_requests, - int num_ready_objects) { - flatbuffers::FlatBufferBuilder fbb; - - std::vector> object_replies; - for (const auto& entry : object_requests) { - const auto& object_request = entry.second; - object_replies.push_back( - fb::CreateObjectReply(fbb, fbb.CreateString(object_request.object_id.binary()), - static_cast(object_request.location))); - } - - auto message = fb::CreatePlasmaWaitReply( - fbb, fbb.CreateVector(object_replies.data(), num_ready_objects), num_ready_objects); - return PlasmaSend(sock, MessageType::PlasmaWaitReply, &fbb, message); -} - -Status ReadWaitReply(uint8_t* data, size_t size, ObjectRequest object_requests[], - int* num_ready_objects) { - DCHECK(data); - - auto message = flatbuffers::GetRoot(data); - DCHECK(VerifyFlatbuffer(message, data, size)); - *num_ready_objects = message->num_ready_objects(); - for (int i = 0; i < *num_ready_objects; i++) { - object_requests[i].object_id = - ObjectID::from_binary(message->object_requests()->Get(i)->object_id()->str()); - object_requests[i].location = - static_cast(message->object_requests()->Get(i)->status()); - } - return Status::OK(); -} // Subscribe messages. diff --git a/cpp/src/plasma/protocol.h b/cpp/src/plasma/protocol.h index c8204584b8adb..0362bd47797d4 100644 --- a/cpp/src/plasma/protocol.h +++ b/cpp/src/plasma/protocol.h @@ -128,21 +128,6 @@ Status SendDeleteReply(int sock, const std::vector& object_ids, Status ReadDeleteReply(uint8_t* data, size_t size, std::vector* object_ids, std::vector* errors); -/* Satus messages. */ - -Status SendStatusRequest(int sock, const ObjectID* object_ids, int64_t num_objects); - -Status ReadStatusRequest(uint8_t* data, size_t size, ObjectID object_ids[], - int64_t num_objects); - -Status SendStatusReply(int sock, ObjectID object_ids[], int object_status[], - int64_t num_objects); - -int64_t ReadStatusReply_num_objects(uint8_t* data, size_t size); - -Status ReadStatusReply(uint8_t* data, size_t size, ObjectID object_ids[], - int object_status[], int64_t num_objects); - /* Plasma Constains message functions. 
*/ Status SendContainsRequest(int sock, ObjectID object_id); @@ -184,26 +169,6 @@ Status SendEvictReply(int sock, int64_t num_bytes); Status ReadEvictReply(uint8_t* data, size_t size, int64_t& num_bytes); -/* Plasma Fetch Remote message functions. */ - -Status SendFetchRequest(int sock, const ObjectID* object_ids, int64_t num_objects); - -Status ReadFetchRequest(uint8_t* data, size_t size, std::vector& object_ids); - -/* Plasma Wait message functions. */ - -Status SendWaitRequest(int sock, ObjectRequest object_requests[], int64_t num_requests, - int num_ready_objects, int64_t timeout_ms); - -Status ReadWaitRequest(uint8_t* data, size_t size, ObjectRequestMap& object_requests, - int64_t* timeout_ms, int* num_ready_objects); - -Status SendWaitReply(int sock, const ObjectRequestMap& object_requests, - int num_ready_objects); - -Status ReadWaitReply(uint8_t* data, size_t size, ObjectRequest object_requests[], - int* num_ready_objects); - /* Plasma Subscribe message functions. */ Status SendSubscribeRequest(int sock); diff --git a/cpp/src/plasma/test/client_tests.cc b/cpp/src/plasma/test/client_tests.cc index 65a9b71b7f251..30dc6850cd068 100644 --- a/cpp/src/plasma/test/client_tests.cc +++ b/cpp/src/plasma/test/client_tests.cc @@ -187,7 +187,6 @@ TEST_F(TestPlasmaStore, DeleteTest) { ARROW_CHECK_OK(client_.Contains(object_id, &has_object)); ASSERT_TRUE(has_object); - // Avoid race condition of Plasma Manager waiting for notification. ARROW_CHECK_OK(client_.Release(object_id)); // object_id is marked as to-be-deleted, when it is not in use, it will be deleted. ARROW_CHECK_OK(client_.Contains(object_id, &has_object)); @@ -251,7 +250,6 @@ TEST_F(TestPlasmaStore, ContainsTest) { // First create object. std::vector data(100, 0); CreateObject(client_, object_id, {42}, data); - // Avoid race condition of Plasma Manager waiting for notification. 
std::vector object_buffers; ARROW_CHECK_OK(client_.Get({object_id}, -1, &object_buffers)); ARROW_CHECK_OK(client_.Contains(object_id, &has_object)); diff --git a/cpp/src/plasma/test/serialization_tests.cc b/cpp/src/plasma/test/serialization_tests.cc index 085ae97db980f..66d651d2923bf 100644 --- a/cpp/src/plasma/test/serialization_tests.cc +++ b/cpp/src/plasma/test/serialization_tests.cc @@ -254,44 +254,6 @@ TEST(PlasmaSerialization, DeleteReply) { close(fd); } -TEST(PlasmaSerialization, StatusRequest) { - int fd = create_temp_file(); - constexpr int64_t num_objects = 2; - ObjectID object_ids[num_objects]; - object_ids[0] = random_object_id(); - object_ids[1] = random_object_id(); - ARROW_CHECK_OK(SendStatusRequest(fd, object_ids, num_objects)); - std::vector data = - read_message_from_file(fd, MessageType::PlasmaStatusRequest); - ObjectID object_ids_read[num_objects]; - ARROW_CHECK_OK( - ReadStatusRequest(data.data(), data.size(), object_ids_read, num_objects)); - ASSERT_EQ(object_ids[0], object_ids_read[0]); - ASSERT_EQ(object_ids[1], object_ids_read[1]); - close(fd); -} - -TEST(PlasmaSerialization, StatusReply) { - int fd = create_temp_file(); - ObjectID object_ids[2]; - object_ids[0] = random_object_id(); - object_ids[1] = random_object_id(); - int object_statuses[2] = {42, 43}; - ARROW_CHECK_OK(SendStatusReply(fd, object_ids, object_statuses, 2)); - std::vector data = read_message_from_file(fd, MessageType::PlasmaStatusReply); - int64_t num_objects = ReadStatusReply_num_objects(data.data(), data.size()); - - std::vector object_ids_read(num_objects); - std::vector object_statuses_read(num_objects); - ARROW_CHECK_OK(ReadStatusReply(data.data(), data.size(), object_ids_read.data(), - object_statuses_read.data(), num_objects)); - ASSERT_EQ(object_ids[0], object_ids_read[0]); - ASSERT_EQ(object_ids[1], object_ids_read[1]); - ASSERT_EQ(object_statuses[0], object_statuses_read[0]); - ASSERT_EQ(object_statuses[1], object_statuses_read[1]); - close(fd); -} - TEST(PlasmaSerialization, EvictRequest) { int fd = create_temp_file(); int64_t num_bytes = 111; @@ -314,84 +276,6 @@ TEST(PlasmaSerialization, EvictReply) { close(fd); } -TEST(PlasmaSerialization, FetchRequest) { - int fd = create_temp_file(); - ObjectID object_ids[2]; - object_ids[0] = random_object_id(); - object_ids[1] = random_object_id(); - ARROW_CHECK_OK(SendFetchRequest(fd, object_ids, 2)); - std::vector data = read_message_from_file(fd, MessageType::PlasmaFetchRequest); - std::vector object_ids_read; - ARROW_CHECK_OK(ReadFetchRequest(data.data(), data.size(), object_ids_read)); - ASSERT_EQ(object_ids[0], object_ids_read[0]); - ASSERT_EQ(object_ids[1], object_ids_read[1]); - close(fd); -} - -TEST(PlasmaSerialization, WaitRequest) { - int fd = create_temp_file(); - const int num_objects_in = 2; - ObjectRequest object_requests_in[num_objects_in] = { - ObjectRequest({random_object_id(), ObjectRequestType::PLASMA_QUERY_ANYWHERE, - ObjectLocation::Local}), - ObjectRequest({random_object_id(), ObjectRequestType::PLASMA_QUERY_LOCAL, - ObjectLocation::Local})}; - const int num_ready_objects_in = 1; - int64_t timeout_ms = 1000; - - ARROW_CHECK_OK(SendWaitRequest(fd, &object_requests_in[0], num_objects_in, - num_ready_objects_in, timeout_ms)); - /* Read message back. 
*/ - std::vector data = read_message_from_file(fd, MessageType::PlasmaWaitRequest); - int num_ready_objects_out; - int64_t timeout_ms_read; - ObjectRequestMap object_requests_out; - ARROW_CHECK_OK(ReadWaitRequest(data.data(), data.size(), object_requests_out, - &timeout_ms_read, &num_ready_objects_out)); - ASSERT_EQ(num_objects_in, object_requests_out.size()); - ASSERT_EQ(num_ready_objects_out, num_ready_objects_in); - for (int i = 0; i < num_objects_in; i++) { - const ObjectID& object_id = object_requests_in[i].object_id; - ASSERT_EQ(1, object_requests_out.count(object_id)); - const auto& entry = object_requests_out.find(object_id); - ASSERT_TRUE(entry != object_requests_out.end()); - ASSERT_EQ(entry->second.object_id, object_requests_in[i].object_id); - ASSERT_EQ(entry->second.type, object_requests_in[i].type); - } - close(fd); -} - -TEST(PlasmaSerialization, WaitReply) { - int fd = create_temp_file(); - const int num_objects_in = 2; - /* Create a map with two ObjectRequests in it. */ - ObjectRequestMap objects_in(num_objects_in); - ObjectID id1 = random_object_id(); - objects_in[id1] = - ObjectRequest({id1, ObjectRequestType::PLASMA_QUERY_LOCAL, ObjectLocation::Local}); - ObjectID id2 = random_object_id(); - objects_in[id2] = ObjectRequest( - {id2, ObjectRequestType::PLASMA_QUERY_LOCAL, ObjectLocation::Nonexistent}); - - ARROW_CHECK_OK(SendWaitReply(fd, objects_in, num_objects_in)); - /* Read message back. */ - std::vector data = read_message_from_file(fd, MessageType::PlasmaWaitReply); - ObjectRequest objects_out[2]; - int num_objects_out; - ARROW_CHECK_OK( - ReadWaitReply(data.data(), data.size(), &objects_out[0], &num_objects_out)); - ASSERT_EQ(num_objects_in, num_objects_out); - for (int i = 0; i < num_objects_out; i++) { - /* Each object request must appear exactly once. */ - ASSERT_EQ(objects_in.count(objects_out[i].object_id), 1); - const auto& entry = objects_in.find(objects_out[i].object_id); - ASSERT_TRUE(entry != objects_in.end()); - ASSERT_EQ(entry->second.object_id, objects_out[i].object_id); - ASSERT_EQ(entry->second.location, objects_out[i].location); - } - close(fd); -} - TEST(PlasmaSerialization, DataRequest) { int fd = create_temp_file(); ObjectID object_id1 = random_object_id(); diff --git a/docs/source/python/plasma.rst b/docs/source/python/plasma.rst index 3df68eff59e00..660c5fbba7918 100644 --- a/docs/source/python/plasma.rst +++ b/docs/source/python/plasma.rst @@ -60,7 +60,7 @@ socket name: .. code-block:: python import pyarrow.plasma as plasma - client = plasma.connect("/tmp/plasma", "") + client = plasma.connect("/tmp/plasma") If the following error occurs from running the above Python code, that means that either the socket given is incorrect, or the ``./plasma_store`` is @@ -68,7 +68,7 @@ not currently running. Check to see if the Plasma store is still running. .. code-block:: shell - >>> client = plasma.connect("/tmp/plasma", "") + >>> client = plasma.connect("/tmp/plasma") Connection to socket failed for pathname /tmp/plasma Could not connect to socket /tmp/plasma @@ -179,7 +179,7 @@ the object buffer. # Create a different client. Note that this second client could be # created in the same or in a separate, concurrent Python session. - client2 = plasma.connect("/tmp/plasma", "") + client2 = plasma.connect("/tmp/plasma") # Get the object in the second client. This blocks until the object has been sealed. 
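(Editor's illustration, not part of the patch: with the manager socket argument removed, a complete single-client round trip through the store reduces to the sketch below. It assumes a ``plasma_store`` process is already listening on ``/tmp/plasma``, as in the surrounding documentation example, and uses only ``connect``/``put``/``get``/``disconnect``, which this patch leaves in place.)

```python
import numpy as np
import pyarrow.plasma as plasma

# Assumes a plasma_store process is already listening on /tmp/plasma.
client = plasma.connect("/tmp/plasma")  # no manager socket argument anymore

# Round-trip a value through the store.
object_id = client.put(np.arange(10))
print(client.get(object_id))

client.disconnect()
```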
object_id2 = plasma.ObjectID(20 * b"a") @@ -221,7 +221,7 @@ of the object info might change in the future): import pyarrow.plasma as plasma import time - client = plasma.connect("/tmp/plasma", "") + client = plasma.connect("/tmp/plasma") client.put("hello, world") # Sleep a little so we get different creation times @@ -452,7 +452,7 @@ You can test this with the following script: import pyarrow.plasma as plasma import time - client = plasma.connect("/tmp/plasma", "") + client = plasma.connect("/tmp/plasma") data = np.random.randn(100000000) tensor = pa.Tensor.from_numpy(data) diff --git a/java/plasma/src/main/java/org/apache/arrow/plasma/ObjectStoreLink.java b/java/plasma/src/main/java/org/apache/arrow/plasma/ObjectStoreLink.java index 3b67bc08ecfdc..8d6eec02e75a4 100644 --- a/java/plasma/src/main/java/org/apache/arrow/plasma/ObjectStoreLink.java +++ b/java/plasma/src/main/java/org/apache/arrow/plasma/ObjectStoreLink.java @@ -79,16 +79,6 @@ default byte[] get(byte[] objectId, int timeoutMs, boolean isMetadata) { */ List get(byte[][] objectIds, int timeoutMs); - /** - * Wait until numReturns objects in objectIds are ready. - * - * @param objectIds List of object IDs to wait for. - * @param timeoutMs Return to the caller after timeoutMs milliseconds. - * @param numReturns We are waiting for this number of objects to be ready. - * @return List of object IDs that are ready - */ - List wait(byte[][] objectIds, int timeoutMs, int numReturns); - /** * Compute the hash of an object in the object store. * @@ -98,23 +88,6 @@ default byte[] get(byte[] objectId, int timeoutMs, boolean isMetadata) { */ byte[] hash(byte[] objectId); - /** - * Fetch the object with the given ID from other plasma manager instances. - * - * @param objectId The object ID used to identify the object. - */ - default void fetch(byte[] objectId) { - byte[][] objectIds = {objectId}; - fetch(objectIds); - } - - /** - * Fetch the objects with the given IDs from other plasma manager instances. - * - * @param objectIds List of object IDs used to identify the objects. - */ - void fetch(byte[][] objectIds); - /** * Evict some objects to recover given count of bytes. 
* diff --git a/java/plasma/src/main/java/org/apache/arrow/plasma/PlasmaClient.java b/java/plasma/src/main/java/org/apache/arrow/plasma/PlasmaClient.java index db1f35e1641bb..d69b54df05ed1 100644 --- a/java/plasma/src/main/java/org/apache/arrow/plasma/PlasmaClient.java +++ b/java/plasma/src/main/java/org/apache/arrow/plasma/PlasmaClient.java @@ -81,34 +81,11 @@ public List get(byte[][] objectIds, int timeoutMs, boolean isMetadata) { return ret; } - @Override - public List wait(byte[][] objectIds, int timeoutMs, int numReturns) { - byte[][] readys = PlasmaClientJNI.wait(conn, objectIds, timeoutMs, numReturns); - - List ret = new ArrayList<>(); - for (byte[] ready : readys) { - for (byte[] id : objectIds) { - if (Arrays.equals(ready, id)) { - ret.add(id); - break; - } - } - } - - assert (ret.size() == readys.length); - return ret; - } - @Override public byte[] hash(byte[] objectId) { return PlasmaClientJNI.hash(conn, objectId); } - @Override - public void fetch(byte[][] objectIds) { - PlasmaClientJNI.fetch(conn, objectIds); - } - @Override public List get(byte[][] objectIds, int timeoutMs) { ByteBuffer[][] bufs = PlasmaClientJNI.get(conn, objectIds, timeoutMs); diff --git a/python/benchmarks/plasma.py b/python/benchmarks/plasma.py index 7cefcdffad2c6..398ec72561255 100644 --- a/python/benchmarks/plasma.py +++ b/python/benchmarks/plasma.py @@ -32,7 +32,7 @@ def setup(self, size): self.plasma_store_ctx = plasma.start_plasma_store( plasma_store_memory=10**9) plasma_store_name, p = self.plasma_store_ctx.__enter__() - self.plasma_client = plasma.connect(plasma_store_name, "", 64) + self.plasma_client = plasma.connect(plasma_store_name) self.data = np.random.randn(size // 8) @@ -52,7 +52,7 @@ def setup(self): self.plasma_store_ctx = plasma.start_plasma_store( plasma_store_memory=10**9) plasma_store_name, p = self.plasma_store_ctx.__enter__() - self.plasma_client = plasma.connect(plasma_store_name, "", 64) + self.plasma_client = plasma.connect(plasma_store_name) def teardown(self): self.plasma_store_ctx.__exit__(None, None, None) diff --git a/python/examples/plasma/sorting/sort_df.py b/python/examples/plasma/sorting/sort_df.py index 2e4df589ee38c..2a51759a67b89 100644 --- a/python/examples/plasma/sorting/sort_df.py +++ b/python/examples/plasma/sorting/sort_df.py @@ -49,7 +49,7 @@ # Connect to clients def connect(): global client - client = plasma.connect('/tmp/store', '', 0) + client = plasma.connect('/tmp/store') np.random.seed(int(time.time() * 10e7) % 10000000) diff --git a/python/pyarrow/_plasma.pyx b/python/pyarrow/_plasma.pyx index f7db3b4e0fec3..cfaa39c96ea5d 100644 --- a/python/pyarrow/_plasma.pyx +++ b/python/pyarrow/_plasma.pyx @@ -63,11 +63,6 @@ cdef extern from "plasma/common.h" nogil: @staticmethod int64_t size() - cdef struct CObjectRequest" plasma::ObjectRequest": - CUniqueID object_id - int type - int location - cdef enum CObjectState" plasma::ObjectState": PLASMA_CREATED" plasma::ObjectState::PLASMA_CREATED" PLASMA_SEALED" plasma::ObjectState::PLASMA_SEALED" @@ -92,14 +87,6 @@ cdef extern from "plasma/common.h" nogil: cdef extern from "plasma/common.h": cdef int64_t kDigestSize" plasma::kDigestSize" - cdef enum ObjectRequestType: - PLASMA_QUERY_LOCAL"plasma::ObjectRequestType::PLASMA_QUERY_LOCAL", - PLASMA_QUERY_ANYWHERE"plasma::ObjectRequestType::PLASMA_QUERY_ANYWHERE" - - cdef enum ObjectLocation: - ObjectStatusLocal"plasma::ObjectLocation::Local" - ObjectStatusRemote"plasma::ObjectLocation::Remote" - cdef extern from "plasma/client.h" nogil: cdef cppclass CPlasmaClient" 
plasma::PlasmaClient": @@ -143,16 +130,6 @@ cdef extern from "plasma/client.h" nogil: CStatus Disconnect() - CStatus Fetch(int num_object_ids, const CUniqueID* object_ids) - - CStatus Wait(int64_t num_object_requests, - CObjectRequest* object_requests, - int num_ready_objects, int64_t timeout_ms, - int* num_objects_ready) - - CStatus Transfer(const char* addr, int port, - const CUniqueID& object_id) - CStatus Delete(const c_vector[CUniqueID] object_ids) cdef extern from "plasma/client.h" nogil: @@ -285,13 +262,11 @@ cdef class PlasmaClient: shared_ptr[CPlasmaClient] client int notification_fd c_string store_socket_name - c_string manager_socket_name def __cinit__(self): self.client.reset(new CPlasmaClient()) self.notification_fd = -1 self.store_socket_name = b"" - self.manager_socket_name = b"" cdef _get_object_buffers(self, object_ids, int64_t timeout_ms, c_vector[CObjectBuffer]* result): @@ -315,10 +290,6 @@ cdef class PlasmaClient: def store_socket_name(self): return self.store_socket_name.decode() - @property - def manager_socket_name(self): - return self.manager_socket_name.decode() - def create(self, ObjectID object_id, int64_t data_size, c_string metadata=b""): """ @@ -642,95 +613,6 @@ cdef class PlasmaClient: check_status(self.client.get().Evict(num_bytes, num_bytes_evicted)) return num_bytes_evicted - def transfer(self, address, int port, ObjectID object_id): - """ - Transfer local object with id object_id to another plasma instance - - Parameters - ---------- - addr : str - IPv4 address of the plasma instance the object is sent to. - port : int - Port number of the plasma instance the object is sent to. - object_id : str - A string used to identify an object. - """ - cdef c_string addr = address.encode() - with nogil: - check_status(self.client.get() - .Transfer(addr.c_str(), port, object_id.data)) - - def fetch(self, object_ids): - """ - Fetch the objects with the given IDs from other plasma managers. - - Parameters - ---------- - object_ids : list - A list of strings used to identify the objects. - """ - cdef c_vector[CUniqueID] ids - cdef ObjectID object_id - for object_id in object_ids: - ids.push_back(object_id.data) - with nogil: - check_status(self.client.get().Fetch(ids.size(), ids.data())) - - def wait(self, object_ids, int64_t timeout=PLASMA_WAIT_TIMEOUT, - int num_returns=1): - """ - Wait until num_returns objects in object_ids are ready. - Currently, the object ID arguments to wait must be unique. - - Parameters - ---------- - object_ids : list - List of object IDs to wait for. - timeout :int - Return to the caller after timeout milliseconds. - num_returns : int - We are waiting for this number of objects to be ready. - - Returns - ------- - list - List of object IDs that are ready. - list - List of object IDs we might still wait on. - """ - # Check that the object ID arguments are unique. The plasma manager - # currently crashes if given duplicate object IDs. 
- if len(object_ids) != len(set(object_ids)): - raise Exception("Wait requires a list of unique object IDs.") - cdef int64_t num_object_requests = len(object_ids) - cdef c_vector[CObjectRequest] object_requests = ( - c_vector[CObjectRequest](num_object_requests)) - cdef int num_objects_ready = 0 - cdef ObjectID object_id - for i, object_id in enumerate(object_ids): - object_requests[i].object_id = object_id.data - object_requests[i].type = PLASMA_QUERY_ANYWHERE - with nogil: - check_status(self.client.get().Wait(num_object_requests, - object_requests.data(), - num_returns, timeout, - &num_objects_ready)) - cdef int num_to_return = min(num_objects_ready, num_returns) - ready_ids = [] - waiting_ids = set(object_ids) - cdef int num_returned = 0 - for i in range(len(object_ids)): - if num_returned == num_to_return: - break - if (object_requests[i].location == ObjectStatusLocal or - object_requests[i].location == ObjectStatusRemote): - ready_ids.append( - ObjectID(object_requests[i].object_id.binary())) - waiting_ids.discard( - ObjectID(object_requests[i].object_id.binary())) - num_returned += 1 - return ready_ids, list(waiting_ids) - def subscribe(self): """Subscribe to notifications about sealed objects.""" with nogil: @@ -873,7 +755,7 @@ cdef class PlasmaClient: return result -def connect(store_socket_name, manager_socket_name, int release_delay=0, +def connect(store_socket_name, manager_socket_name=None, int release_delay=0, int num_retries=-1): """ Return a new PlasmaClient that is connected a plasma store and @@ -884,22 +766,24 @@ def connect(store_socket_name, manager_socket_name, int release_delay=0, store_socket_name : str Name of the socket the plasma store is listening at. manager_socket_name : str - Name of the socket the plasma manager is listening at. + This parameter is deprecated and has no effect. release_delay : int This parameter is deprecated and has no effect. num_retries : int, default -1 Number of times to try to connect to plasma store. 
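(Editor's illustration, not part of the patch: the docstring change above marks ``manager_socket_name`` as deprecated and ignored, and the new ``connect`` body emits a ``FutureWarning`` when it is still supplied. A minimal sketch of that deprecation path, assuming a ``plasma_store`` is running at ``/tmp/plasma``, might look like this.)

```python
import warnings
import pyarrow.plasma as plasma

# Assumes a plasma_store process is listening on /tmp/plasma.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Legacy call sites that still pass a manager socket name keep working,
    # but the argument is ignored and a FutureWarning is emitted.
    client = plasma.connect("/tmp/plasma", "")

assert any(issubclass(w.category, FutureWarning) for w in caught)
client.disconnect()
```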
Default value of -1 uses the default (50) """ + if manager_socket_name is not None: + warnings.warn( + "manager_socket_name in PlasmaClient.connect is deprecated", + FutureWarning) cdef PlasmaClient result = PlasmaClient() result.store_socket_name = store_socket_name.encode() - result.manager_socket_name = manager_socket_name.encode() if release_delay != 0: warnings.warn("release_delay in PlasmaClient.connect is deprecated", FutureWarning) with nogil: check_status(result.client.get() - .Connect(result.store_socket_name, - result.manager_socket_name, + .Connect(result.store_socket_name, b"", release_delay, num_retries)) return result diff --git a/python/pyarrow/tensorflow/plasma_op.cc b/python/pyarrow/tensorflow/plasma_op.cc index 4e6449adfc85c..852be339389e7 100644 --- a/python/pyarrow/tensorflow/plasma_op.cc +++ b/python/pyarrow/tensorflow/plasma_op.cc @@ -71,13 +71,10 @@ class TensorToPlasmaOp : public tf::AsyncOpKernel { explicit TensorToPlasmaOp(tf::OpKernelConstruction* context) : tf::AsyncOpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("plasma_store_socket_name", &plasma_store_socket_name_)); - OP_REQUIRES_OK(context, context->GetAttr("plasma_manager_socket_name", - &plasma_manager_socket_name_)); tf::mutex_lock lock(mu_); if (!connected_) { VLOG(1) << "Connecting to Plasma..."; - ARROW_CHECK_OK(client_.Connect(plasma_store_socket_name_, - plasma_manager_socket_name_)); + ARROW_CHECK_OK(client_.Connect(plasma_store_socket_name_)); VLOG(1) << "Connected!"; connected_ = true; } @@ -226,7 +223,6 @@ class TensorToPlasmaOp : public tf::AsyncOpKernel { private: std::string plasma_store_socket_name_; - std::string plasma_manager_socket_name_; tf::mutex mu_; bool connected_ = false; @@ -243,13 +239,10 @@ class PlasmaToTensorOp : public tf::AsyncOpKernel { explicit PlasmaToTensorOp(tf::OpKernelConstruction* context) : tf::AsyncOpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("plasma_store_socket_name", &plasma_store_socket_name_)); - OP_REQUIRES_OK(context, context->GetAttr("plasma_manager_socket_name", - &plasma_manager_socket_name_)); tf::mutex_lock lock(mu_); if (!connected_) { VLOG(1) << "Connecting to Plasma..."; - ARROW_CHECK_OK(client_.Connect(plasma_store_socket_name_, - plasma_manager_socket_name_)); + ARROW_CHECK_OK(client_.Connect(plasma_store_socket_name_)); VLOG(1) << "Connected!"; connected_ = true; } @@ -364,7 +357,6 @@ class PlasmaToTensorOp : public tf::AsyncOpKernel { private: std::string plasma_store_socket_name_; - std::string plasma_manager_socket_name_; tf::mutex mu_; bool connected_ = false; @@ -375,8 +367,7 @@ REGISTER_OP("TensorToPlasma") .Input("input_tensor: dtypes") .Input("plasma_object_id: string") .Attr("dtypes: list(type)") - .Attr("plasma_store_socket_name: string") - .Attr("plasma_manager_socket_name: string"); + .Attr("plasma_store_socket_name: string"); REGISTER_KERNEL_BUILDER(Name("TensorToPlasma").Device(tf::DEVICE_CPU), TensorToPlasmaOp); @@ -389,8 +380,7 @@ REGISTER_OP("PlasmaToTensor") .Input("plasma_object_id: string") .Output("tensor: dtype") .Attr("dtype: type") - .Attr("plasma_store_socket_name: string") - .Attr("plasma_manager_socket_name: string"); + .Attr("plasma_store_socket_name: string"); REGISTER_KERNEL_BUILDER(Name("PlasmaToTensor").Device(tf::DEVICE_CPU), PlasmaToTensorOp); diff --git a/python/pyarrow/tests/test_plasma.py b/python/pyarrow/tests/test_plasma.py index 66449e6dba9a3..05375d7b65aee 100644 --- a/python/pyarrow/tests/test_plasma.py +++ b/python/pyarrow/tests/test_plasma.py @@ -121,8 +121,8 @@ def 
setup_method(self, test_method): use_one_memory_mapped_file=use_one_memory_mapped_file) self.plasma_store_name, self.p = self.plasma_store_ctx.__enter__() # Connect to Plasma. - self.plasma_client = plasma.connect(self.plasma_store_name, "") - self.plasma_client2 = plasma.connect(self.plasma_store_name, "") + self.plasma_client = plasma.connect(self.plasma_store_name) + self.plasma_client2 = plasma.connect(self.plasma_store_name) def teardown_method(self, test_method): try: @@ -147,7 +147,7 @@ def test_connection_failure_raises_exception(self): import pyarrow.plasma as plasma # ARROW-1264 with pytest.raises(IOError): - plasma.connect('unknown-store-name', '', 0, 1) + plasma.connect('unknown-store-name', num_retries=1) def test_create(self): # Create an object id string. @@ -860,7 +860,7 @@ def test_client_death_during_get(self): object_id = random_object_id() def client_blocked_in_get(plasma_store_name): - client = plasma.connect(self.plasma_store_name, "", 0) + client = plasma.connect(self.plasma_store_name) # Try to get an object ID that doesn't exist. This should block. client.get([object_id]) @@ -889,7 +889,7 @@ def test_client_getting_multiple_objects(self): object_ids = [random_object_id() for _ in range(10)] def client_get_multiple(plasma_store_name): - client = plasma.connect(self.plasma_store_name, "", 0) + client = plasma.connect(self.plasma_store_name) # Try to get an object ID that doesn't exist. This should block. client.get(object_ids) @@ -948,7 +948,7 @@ def test_use_huge_pages(): plasma_store_memory=2*10**9, plasma_directory="/mnt/hugepages", use_hugepages=True) as (plasma_store_name, p): - plasma_client = plasma.connect(plasma_store_name, "") + plasma_client = plasma.connect(plasma_store_name) create_object(plasma_client, 10**8) @@ -962,7 +962,7 @@ def test_plasma_client_sharing(): with plasma.start_plasma_store( plasma_store_memory=DEFAULT_PLASMA_STORE_MEMORY) \ as (plasma_store_name, p): - plasma_client = plasma.connect(plasma_store_name, "") + plasma_client = plasma.connect(plasma_store_name) object_id = plasma_client.put(np.zeros(3)) buf = plasma_client.get(object_id) del plasma_client @@ -977,7 +977,7 @@ def test_plasma_list(): with plasma.start_plasma_store( plasma_store_memory=DEFAULT_PLASMA_STORE_MEMORY) \ as (plasma_store_name, p): - plasma_client = plasma.connect(plasma_store_name, "", 0) + plasma_client = plasma.connect(plasma_store_name) # Test sizes u, _, _ = create_object(plasma_client, 11, metadata_size=7, seal=False) diff --git a/python/pyarrow/tests/test_plasma_tf_op.py b/python/pyarrow/tests/test_plasma_tf_op.py index 51e8b283e0a1d..e239055209f00 100644 --- a/python/pyarrow/tests/test_plasma_tf_op.py +++ b/python/pyarrow/tests/test_plasma_tf_op.py @@ -37,15 +37,13 @@ def ToPlasma(): return plasma.tf_plasma_op.tensor_to_plasma( [data_tensor, ones_tensor], object_id, - plasma_store_socket_name=plasma_store_name, - plasma_manager_socket_name="") + plasma_store_socket_name=plasma_store_name) def FromPlasma(): return plasma.tf_plasma_op.plasma_to_tensor( object_id, dtype=tf.as_dtype(dtype), - plasma_store_socket_name=plasma_store_name, - plasma_manager_socket_name="") + plasma_store_socket_name=plasma_store_name) with tf.device(FORCE_DEVICE): to_plasma = ToPlasma() @@ -94,7 +92,7 @@ def test_plasma_tf_op(use_gpu=False): pytest.skip("TensorFlow Op not found") with plasma.start_plasma_store(10**8) as (plasma_store_name, p): - client = plasma.connect(plasma_store_name, "") + client = plasma.connect(plasma_store_name) for dtype in [np.float32, np.float64, np.int8, 
np.int16, np.int32, np.int64]: run_tensorflow_test_with_dtype(tf, plasma, plasma_store_name, From 804502f941f808583e9f7043e203533de738d577 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Dec 2018 13:05:50 -0600 Subject: [PATCH 039/328] ARROW-3184: [C++] Enable modular builds and installs with ARROW_OPTIONAL_INSTALL option. Remove ARROW_GANDIVA_BUILD_TESTS Apparently CMake really does not want you to do `make $TARGET install` out of the box; I searched various threads about this and there are no great solutions. For expert users, this provides the option of installing only targets that have been built, while others will be ignored. Because the possibility of users shooting themselves in the foot is so high with this, it doesn't make sense to enable it by default. In the hands of an expert, though, this can significantly reduce build times and make it possible to build libraries and unit tests for only a part of the project, then install only those libraries. This will install all header files regardless of which libraries are built; I didn't see any easy way to work that out, since you have to have knowledge of which headers are used by which library. Resolves ARROW-3994 Author: Wes McKinney Closes #3172 from wesm/ARROW-3184 and squashes the following commits: 583a916e0 plasma_store_server requires static libraries 3c2a21ea1 Add plasma_store_server to 'plasma' target 85fda6419 Build plasma again in Python build for now 1b3ac57dc Fix multiline comment in CMake d3ce84c4e More option reorg b6630605c Reorganize CMake options a bit more logically. Add more explicit warning about ARROW_OPTIONAL_INSTALL 262058b2f Do not build Gandiva JNI bindings by default 918fdb371 Fix ARROW_TEST_INCLUDE_LABELS option to actually work 578bc58f5 Use GLOB instead of GLOB_RECURSE daaafa214 Misc fixes a84643d6e Fix header install option f899bdd99 Work around ARROW-4026 via environment variable for now 001a3ad57 Pass in ARROW_TEST_INCLUDE_LABELS via environment variable a1df9ab3d Clarify documentation 2eca8a740 Enable modular builds and install with ARROW_OPTIONAL_INSTALL option. Remove ARROW_GANDIVA_BUILD_TESTS. Add helper function for installing header files.
Build fewer targets using these options in some Travis CI jobs --- .travis.yml | 12 ++ ci/travis_before_script_cpp.sh | 18 +- ci/travis_script_python.sh | 10 +- cpp/CMakeLists.txt | 163 ++++++++++-------- cpp/README.md | 15 +- cpp/cmake_modules/BuildUtils.cmake | 20 ++- cpp/src/arrow/CMakeLists.txt | 26 +-- .../arrow/adapters/tensorflow/CMakeLists.txt | 5 +- cpp/src/arrow/compute/CMakeLists.txt | 7 +- cpp/src/arrow/csv/CMakeLists.txt | 7 +- cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt | 18 +- cpp/src/arrow/gpu/CMakeLists.txt | 7 +- cpp/src/arrow/ipc/CMakeLists.txt | 11 +- cpp/src/arrow/python/CMakeLists.txt | 24 +-- cpp/src/arrow/util/CMakeLists.txt | 40 +---- cpp/src/gandiva/CMakeLists.txt | 79 +++------ cpp/src/gandiva/precompiled/CMakeLists.txt | 2 +- cpp/src/parquet/CMakeLists.txt | 21 +-- cpp/src/parquet/api/CMakeLists.txt | 7 +- cpp/src/parquet/arrow/CMakeLists.txt | 7 +- cpp/src/parquet/util/CMakeLists.txt | 7 +- cpp/src/plasma/CMakeLists.txt | 6 +- cpp/tools/parquet/CMakeLists.txt | 4 +- dev/tasks/gandiva-jars/build-cpp.sh | 1 + 24 files changed, 215 insertions(+), 302 deletions(-) diff --git a/.travis.yml b/.travis.yml index d1fc6dba35dd2..d22a4e7df0fea 100644 --- a/.travis.yml +++ b/.travis.yml @@ -109,6 +109,12 @@ matrix: jdk: openjdk8 env: - ARROW_TRAVIS_GANDIVA=1 + - ARROW_TRAVIS_GANDIVA_JAVA=1 + - ARROW_TRAVIS_GANDIVA_TESTS=1 + - ARROW_TRAVIS_OPTIONAL_INSTALL=1 + - ARROW_CPP_BUILD_TARGETS="gandiva" + # TODO(wesm): Remove this after ARROW-4026 + - ARROW_TRAVIS_CPP_TEST_INCLUDE_LABELS="gandiva" - ARROW_TRAVIS_USE_TOOLCHAIN=1 # ARROW-3979 temporarily disabled. - ARROW_TRAVIS_VALGRIND=0 @@ -155,6 +161,12 @@ matrix: addons: env: - ARROW_TRAVIS_GANDIVA=1 + - ARROW_TRAVIS_GANDIVA_JAVA=1 + - ARROW_TRAVIS_GANDIVA_TESTS=1 + - ARROW_TRAVIS_OPTIONAL_INSTALL=1 + - ARROW_CPP_BUILD_TARGETS="gandiva" + # TODO(wesm): Remove this after ARROW-4026 + - ARROW_TRAVIS_CPP_TEST_INCLUDE_LABELS="gandiva" - ARROW_TRAVIS_USE_TOOLCHAIN=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN before_script: diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 5f398e8c6e327..6cb7d6074f230 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -42,6 +42,7 @@ fi CMAKE_COMMON_FLAGS="\ -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL \ +-DARROW_TEST_INCLUDE_LABELS=$ARROW_TRAVIS_CPP_TEST_INCLUDE_LABELS \ -DARROW_NO_DEPRECATED_API=ON \ -DARROW_EXTRA_ERROR_CONTEXT=ON" CMAKE_LINUX_FLAGS="" @@ -98,8 +99,11 @@ fi if [ $ARROW_TRAVIS_GANDIVA == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA=ON" - if [ $only_library_mode == "no" ]; then - CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA_BUILD_TESTS=ON" + if [ $ARROW_TRAVIS_GANDIVA_JAVA == "1" ]; then + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA_JAVA=ON" + fi + if [ $ARROW_TRAVIS_GANDIVA_TESTS == "1" ]; then + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_BUILD_TESTS=ON" fi fi @@ -119,6 +123,10 @@ if [ $ARROW_TRAVIS_USE_VENDORED_BOOST == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_BOOST_VENDORED=ON" fi +if [ $ARROW_TRAVIS_OPTIONAL_INSTALL == "1" ]; then + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_OPTIONAL_INSTALL=ON" +fi + if [ $TRAVIS_OS_NAME == "linux" ]; then cmake $CMAKE_COMMON_FLAGS \ $CMAKE_LINUX_FLAGS \ @@ -139,8 +147,10 @@ else $ARROW_CPP_DIR fi -# Build and install libraries -$TRAVIS_MAKE -j4 +# Build and install libraries. Configure ARROW_CPP_BUILD_TARGETS environment +# variable to only build certain targets. 
If you use this, you must also set +# the environment variable ARROW_TRAVIS_OPTIONAL_INSTALL=1 +$TRAVIS_MAKE -j4 $ARROW_CPP_BUILD_TARGETS $TRAVIS_MAKE install popd diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 25bec262d861c..6d96ebe2dfb0b 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -86,19 +86,23 @@ rm -rf * # XXX Can we simply reuse CMAKE_COMMON_FLAGS from travis_before_script_cpp.sh? CMAKE_COMMON_FLAGS="-DARROW_EXTRA_ERROR_CONTEXT=ON" +PYTHON_CPP_BUILD_TARGETS="arrow_python plasma" + if [ $ARROW_TRAVIS_COVERAGE == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GENERATE_COVERAGE=ON" fi if [ $ARROW_TRAVIS_PYTHON_GANDIVA == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA=ON -DARROW_GANDIVA_BUILD_TESTS=OFF" + PYTHON_CPP_BUILD_TARGETS="$PYTHON_CPP_BUILD_TARGETS gandiva" fi cmake -GNinja \ $CMAKE_COMMON_FLAGS \ - -DARROW_BUILD_TESTS=on \ + -DARROW_BUILD_TESTS=ON \ -DARROW_TEST_INCLUDE_LABELS=python \ - -DARROW_BUILD_UTILITIES=off \ + -DARROW_BUILD_UTILITIES=OFF \ + -DARROW_OPTIONAL_INSTALL=ON \ -DARROW_PLASMA=on \ -DARROW_TENSORFLOW=on \ -DARROW_PYTHON=on \ @@ -107,7 +111,7 @@ cmake -GNinja \ -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ $ARROW_CPP_DIR -ninja +ninja $PYTHON_CPP_BUILD_TARGETS ninja install popd diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a83b9dd6d9409..54daaf96e8eb6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -130,26 +130,62 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Linkage of Arrow libraries with unit tests executables. \ static|shared (default shared)") - set(ARROW_TEST_INCLUDE_LABELS "" CACHE STRING - "Only build unit tests having the indicated label or labels. \ -Pass multiple labels by dividing with semicolons") - option(ARROW_NO_DEPRECATED_API "Exclude deprecated APIs from build" OFF) - option(ARROW_COMPUTE - "Build the Arrow Compute Modules" + option(ARROW_FUZZING + "Build Arrow Fuzzing executables" + OFF) + + # Disable this option to exercise non-SIMD fallbacks + option(ARROW_USE_SIMD + "Build with SIMD optimizations" ON) - option(ARROW_EXTRA_ERROR_CONTEXT - "Compile with extra error context (line numbers, code)" + option(ARROW_ALTIVEC + "Build Arrow with Altivec" + ON) + + option(ARROW_BUILD_UTILITIES + "Build Arrow commandline utilities" + ON) + + option(ARROW_RPATH_ORIGIN + "Build Arrow libraries with RATH set to \$ORIGIN" + OFF) + + option(ARROW_INSTALL_NAME_RPATH + "Build Arrow libraries with install_name set to @rpath" + ON) + + option(ARROW_GENERATE_COVERAGE + "Build with C++ code coverage enabled" + OFF) + + option(ARROW_VERBOSE_LINT + "If off, 'quiet' flags will be passed to linting tools" OFF) + #---------------------------------------------------------------------- + # Project components to enable / disable building + + option(ARROW_COMPUTE + "Build the Arrow Compute Modules" + ON) + option(ARROW_FLIGHT "Build the Arrow Flight RPC System (requires GRPC, Protocol Buffers)" OFF) + option(ARROW_GANDIVA + "Build the Gandiva libraries" + OFF) + + option(ARROW_PARQUET + "Build the Parquet libraries" + OFF) + option(ARROW_IPC "Build the Arrow IPC extensions" ON) @@ -174,58 +210,44 @@ Pass multiple labels by dividing with semicolons") "Build the Arrow HDFS bridge" ON) - option(ARROW_BOOST_USE_SHARED - "Rely on boost shared libraries where relevant" - ON) - - option(ARROW_BOOST_VENDORED - "Use vendored Boost instead of existing Boost" - OFF) - - option(ARROW_PROTOBUF_USE_SHARED - "Rely on Protocol Buffers shared libraries where 
relevant" - OFF) - option(ARROW_PYTHON "Build the Arrow CPython extensions" OFF) - option(ARROW_FUZZING - "Build Arrow Fuzzing executables" + option(ARROW_HIVESERVER2 + "Build the HiveServer2 client and Arrow adapter" OFF) - # Disable this option to exercise non-SIMD fallbacks - option(ARROW_USE_SIMD - "Build with SIMD optimizations" - ON) + option(ARROW_PLASMA + "Build the plasma object store along with Arrow" + OFF) - option(ARROW_ALTIVEC - "Build Arrow with Altivec" - ON) + option(ARROW_PLASMA_JAVA_CLIENT + "Build the plasma object store java client" + OFF) - option(ARROW_BUILD_UTILITIES - "Build Arrow commandline utilities" - ON) + #---------------------------------------------------------------------- + # Thirdparty toolchain options - option(ARROW_RPATH_ORIGIN - "Build Arrow libraries with RATH set to \$ORIGIN" + option(ARROW_VERBOSE_THIRDPARTY_BUILD + "If off, output from ExternalProjects will be logged to files rather than shown" OFF) - option(ARROW_INSTALL_NAME_RPATH - "Build Arrow libraries with install_name set to @rpath" + option(ARROW_BOOST_USE_SHARED + "Rely on boost shared libraries where relevant" ON) - option(ARROW_HIVESERVER2 - "Build the HiveServer2 client and Arrow adapter" + option(ARROW_BOOST_VENDORED + "Use vendored Boost instead of existing Boost" OFF) - option(ARROW_PLASMA - "Build the plasma object store along with Arrow" + option(ARROW_PROTOBUF_USE_SHARED + "Rely on Protocol Buffers shared libraries where relevant" OFF) - option(ARROW_PLASMA_JAVA_CLIENT - "Build the plasma object store java client" - OFF) + option(ARROW_USE_GLOG + "Build libraries with glog support for pluggable logging" + ON) option(ARROW_WITH_BROTLI "Build with Brotli compression" @@ -257,21 +279,8 @@ Pass multiple labels by dividing with semicolons") "Build with zstd compression" ${ARROW_WITH_ZSTD_DEFAULT}) - option(ARROW_GENERATE_COVERAGE - "Build with C++ code coverage enabled" - OFF) - - option(ARROW_VERBOSE_THIRDPARTY_BUILD - "If off, output from ExternalProjects will be logged to files rather than shown" - OFF) - - option(ARROW_VERBOSE_LINT - "If off, 'quiet' flags will be passed to linting tools" - OFF) - - option(ARROW_USE_GLOG - "Build libraries with glog support for pluggable logging" - ON) + #---------------------------------------------------------------------- + # Windows options if (MSVC) option(ARROW_USE_CLCACHE @@ -292,10 +301,8 @@ Pass multiple labels by dividing with semicolons") OFF) endif() - # Parquet-related build options - option(ARROW_PARQUET - "Build the Parquet libraries" - OFF) + #---------------------------------------------------------------------- + # Parquet build options option(PARQUET_MINIMAL_DEPENDENCY "Depend only on Thirdparty headers to build libparquet. \ @@ -310,9 +317,11 @@ Always OFF if building binaries" "Build the Parquet examples. Requires static libraries to be built." 
OFF) - # Gandiva related build options - option(ARROW_GANDIVA - "Build the Gandiva libraries" + #---------------------------------------------------------------------- + # Gandiva build options + + option(ARROW_GANDIVA_JAVA + "Build the Gandiva JNI wrappers" OFF) # ARROW-3860: Temporary workaround @@ -320,16 +329,30 @@ Always OFF if building binaries" "Include -static-libstdc++ -static-libgcc when linking with Gandiva static libraries" OFF) - option(ARROW_GANDIVA_JAVA - "Build the Gandiva JNI wrappers" - ON) + #---------------------------------------------------------------------- + # Advanced developer options - option(ARROW_GANDIVA_BUILD_TESTS - "Build the Gandiva googletest unit tests" + set(ARROW_TEST_INCLUDE_LABELS "" CACHE STRING + "Only build unit tests having the indicated label or labels. \ +Pass multiple labels by dividing with semicolons") + + option(ARROW_EXTRA_ERROR_CONTEXT + "Compile with extra error context (line numbers, code)" OFF) + option(ARROW_OPTIONAL_INSTALL + "If enabled install ONLY targets that have already been built. Please be \ +advised that if this is enabled 'install' will fail silently on components \ +that have not been built" + OFF) endif() +if (ARROW_OPTIONAL_INSTALL) + # Don't make the "install" target depend on the "all" target + set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY true) + + set(INSTALL_IS_OPTIONAL OPTIONAL) +endif() ############################################################ # "make lint" target diff --git a/cpp/README.md b/cpp/README.md index d1d76c17875d7..1f12117e8d01e 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -101,6 +101,19 @@ unit tests, and benchmarks (if enabled): * `make gandiva` for Gandiva (LLVM expression compiler) libraries * `make plasma` for Plasma libraries, server +If you wish to only build and install one or more project subcomponents, we +have provided the CMake option `ARROW_OPTIONAL_INSTALL` to only install targets +that have been built. For example, if you only wish to build the Parquet +libraries, its tests, and its dependencies, you can run: + +``` +cmake .. -DARROW_PARQUET=ON -DARROW_OPTIONAL_INSTALL=ON -DARROW_BUILD_TESTS=ON +make parquet +make install +``` + +If you omit an explicit target when invoking `make`, all targets will be built. + ## Parquet Development Notes To build the C++ libraries for Apache Parquet, add the flag @@ -269,7 +282,7 @@ The optional `gandiva` libraries and tests can be built by passing `-DARROW_GANDIVA=on`. ```shell -cmake .. -DARROW_GANDIVA=ON -DARROW_GANDIVA_BUILD_TESTS=ON +cmake .. 
-DARROW_GANDIVA=ON -DARROW_BUILD_TESTS=ON make ctest -L gandiva ``` diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index d5978e1d215ff..1abe97eecc59f 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -226,6 +226,7 @@ function(ADD_ARROW_LIB LIB_NAME) endif() install(TARGETS ${LIB_NAME}_shared + ${INSTALL_IS_OPTIONAL} EXPORT ${PROJECT_NAME}-targets RUNTIME DESTINATION ${RUNTIME_INSTALL_DIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} @@ -272,6 +273,7 @@ function(ADD_ARROW_LIB LIB_NAME) LINK_PUBLIC ${ARG_STATIC_LINK_LIBS}) install(TARGETS ${LIB_NAME}_static + ${INSTALL_IS_OPTIONAL} EXPORT ${PROJECT_NAME}-targets RUNTIME DESTINATION ${RUNTIME_INSTALL_DIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} @@ -401,7 +403,7 @@ function(ADD_TEST_CASE REL_TEST_NAME) if (NOT "${ARROW_TEST_INCLUDE_LABELS}" STREQUAL "") set(_SKIP_TEST TRUE) - foreach (_INCLUDED_LABEL ${ARG_LABELS}) + foreach (_INCLUDED_LABEL ${ARROW_TEST_INCLUDE_LABELS}) if ("${ARG_LABELS}" MATCHES "${_INCLUDED_LABEL}") set(_SKIP_TEST FALSE) endif() @@ -569,3 +571,19 @@ function(ADD_ARROW_FUZZING REL_FUZZING_NAME) PROPERTIES LINK_FLAGS "-fsanitize=fuzzer") endfunction() + +################################################### + +function(ARROW_INSTALL_ALL_HEADERS PATH) + set(options) + set(one_value_args) + set(multi_value_args PATTERN) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + if (NOT ARG_PATTERN) + set(ARG_PATTERN "*.h") + endif() + file(GLOB CURRENT_DIRECTORY_HEADERS ${ARG_PATTERN}) + install(FILES + ${CURRENT_DIRECTORY_HEADERS} + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${PATH}") +endfunction() diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 2d043a9a27627..e12d2d2ee2958 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -213,8 +213,7 @@ if (ARROW_BUILD_STATIC AND WIN32) target_compile_definitions(arrow_static PUBLIC ARROW_STATIC) endif() -if (ARROW_BUILD_TESTS OR ARROW_GANDIVA_BUILD_TESTS - OR ARROW_BUILD_BENCHMARKS) +if (ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) # that depend on gtest ADD_ARROW_LIB(arrow_testing SOURCES test-util.cc @@ -244,28 +243,7 @@ foreach(LIB_TARGET ${ARROW_LIBRARIES}) endforeach() # Headers: top level -install(FILES - allocator.h - api.h - array.h - buffer.h - builder.h - compare.h - memory_pool.h - pretty_print.h - record_batch.h - status.h - stl.h - table.h - table_builder.h - tensor.h - type.h - type_fwd.h - type_traits.h - test-util.h - visitor.h - visitor_inline.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow") +ARROW_INSTALL_ALL_HEADERS("arrow") # pkg-config support configure_file(arrow.pc.in diff --git a/cpp/src/arrow/adapters/tensorflow/CMakeLists.txt b/cpp/src/arrow/adapters/tensorflow/CMakeLists.txt index db4264b59ab63..5bb5b725910e3 100644 --- a/cpp/src/arrow/adapters/tensorflow/CMakeLists.txt +++ b/cpp/src/arrow/adapters/tensorflow/CMakeLists.txt @@ -15,7 +15,4 @@ # specific language governing permissions and limitations # under the License. 
-# Headers: top level -install(FILES - convert.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/adapters/tensorflow") +ARROW_INSTALL_ALL_HEADERS("arrow/adapters/tensorflow") diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index d4369ed27b7c4..242937005cf9c 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -15,12 +15,7 @@ # specific language governing permissions and limitations # under the License. -# Headers: top level -install(FILES - api.h - context.h - kernel.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/compute") +ARROW_INSTALL_ALL_HEADERS("arrow/compute") # pkg-config support configure_file(arrow-compute.pc.in diff --git a/cpp/src/arrow/csv/CMakeLists.txt b/cpp/src/arrow/csv/CMakeLists.txt index db23d6feff111..2a72dceadad16 100644 --- a/cpp/src/arrow/csv/CMakeLists.txt +++ b/cpp/src/arrow/csv/CMakeLists.txt @@ -29,9 +29,4 @@ ADD_ARROW_BENCHMARK(converter-benchmark ADD_ARROW_BENCHMARK(parser-benchmark PREFIX "arrow-csv") -# Headers: top level -file(GLOB_RECURSE ARROW_CSV_HEADERS "*.h") - -install(FILES - ${ARROW_CSV_HEADERS} - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/csv") +ARROW_INSTALL_ALL_HEADERS("arrow/csv") diff --git a/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt b/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt index eb4446f05d971..9fd7f924d3a69 100644 --- a/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt +++ b/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt @@ -18,15 +18,7 @@ add_custom_target(arrow_hiveserver2) # Headers: top level -install(FILES - api.h - columnar-row-set.h - operation.h - service.h - session.h - types.h - util.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/dbi/hiveserver2") +ARROW_INSTALL_ALL_HEADERS("arrow/dbi/hiveserver2") set(ARROW_HIVESERVER2_SRCS columnar-row-set.cc @@ -115,7 +107,9 @@ if (ARROW_BUILD_TESTS) STATIC_LINK_LIBS "${ARROW_HIVESERVER2_TEST_LINK_LIBS}" LABELS "arrow_hiveserver2" ) - set_property(TARGET arrow-hiveserver2-test - APPEND_STRING PROPERTY COMPILE_FLAGS - " -Wno-shadow-field") + if (TARGET arrow-hiveserver2-test) + set_property(TARGET arrow-hiveserver2-test + APPEND_STRING PROPERTY COMPILE_FLAGS + " -Wno-shadow-field") + endif() endif(ARROW_BUILD_TESTS) diff --git a/cpp/src/arrow/gpu/CMakeLists.txt b/cpp/src/arrow/gpu/CMakeLists.txt index 60407acb0a1ec..c37779aefa9aa 100644 --- a/cpp/src/arrow/gpu/CMakeLists.txt +++ b/cpp/src/arrow/gpu/CMakeLists.txt @@ -63,12 +63,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cuda_version.h" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/gpu") -install(FILES - cuda_api.h - cuda_arrow_ipc.h - cuda_context.h - cuda_memory.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/gpu") +ARROW_INSTALL_ALL_HEADERS("arrow/gpu") # pkg-config support configure_file(arrow-cuda.pc.in diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 44c56f033269d..c44f7b9fe1bfe 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -88,16 +88,7 @@ add_custom_command( add_custom_target(metadata_fbs DEPENDS ${FBS_OUTPUT_FILES}) # Headers: top level -install(FILES - api.h - dictionary.h - feather.h - json.h - json-simple.h - message.h - reader.h - writer.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/ipc") +ARROW_INSTALL_ALL_HEADERS("arrow/ipc") if (ARROW_BUILD_STATIC) set(ARROW_UTIL_LIB arrow_static) diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index 7f4603ae5dfaf..4913083537340 100644 --- 
a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -91,29 +91,7 @@ if ("${COMPILER_FAMILY}" STREQUAL "clang") COMPILE_FLAGS -Wno-parentheses-equality) endif() -install(FILES - api.h - arrow_to_pandas.h - benchmark.h - common.h - config.h - decimal.h - deserialize.h - helpers.h - inference.h - init.h - io.h - iterators.h - numpy_convert.h - numpy_interop.h - numpy_to_arrow.h - python_to_arrow.h - platform.h - pyarrow.h - serialize.h - type_traits.h - visibility.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/python") +ARROW_INSTALL_ALL_HEADERS("arrow/python") # pkg-config support configure_file(arrow-python.pc.in diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 4f515b52e8e64..a09797183212f 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -20,45 +20,7 @@ ####################################### # Headers: top level -install(FILES - bit-stream-utils.h - bit-util.h - bpacking.h - checked_cast.h - compiler-util.h - compression.h - compression_brotli.h - compression_bz2.h - compression_lz4.h - compression_snappy.h - compression_zlib.h - compression_zstd.h - cpu-info.h - date.h - decimal.h - hash-util.h - hashing.h - io-util.h - key_value_metadata.h - lazy.h - logging.h - macros.h - memory.h - neon-util.h - parallel.h - rle-encoding.h - sse-util.h - stl.h - stopwatch.h - string.h - string_view.h - thread-pool.h - type_traits.h - utf8.h - variant.h - visibility.h - windows_compatibility.h - DESTINATION include/arrow/util) +ARROW_INSTALL_ALL_HEADERS("arrow/util") ####################################### # arrow_test_main diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 5ef573875b660..9763f297b0b8b 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -92,31 +92,8 @@ add_dependencies(gandiva ${GANDIVA_LIBRARIES}) # install for gandiva include(GNUInstallDirs) -# install libgandiva -install( - TARGETS gandiva_shared gandiva_static - DESTINATION ${CMAKE_INSTALL_LIBDIR} -) - # install the header files. -install(FILES - arrow.h - condition.h - configuration.h - expression.h - expression_registry.h - filter.h - func_descriptor.h - function_signature.h - gandiva_aliases.h - literal_holder.h - logging.h - node.h - node_visitor.h - projector.h - selection_vector.h - tree_expr_builder.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/gandiva") +ARROW_INSTALL_ALL_HEADERS("gandiva") # pkg-config support configure_file(gandiva.pc.in @@ -141,6 +118,10 @@ function(ADD_GANDIVA_TEST REL_TEST_NAME) set(multi_value_args) cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + if (NO_TESTS) + return() + endif() + set(TEST_ARGUMENTS ENABLED PREFIX "gandiva" @@ -159,39 +140,35 @@ function(ADD_GANDIVA_TEST REL_TEST_NAME) STATIC_LINK_LIBS ${GANDIVA_SHARED_TEST_LINK_LIBS}) endif() - if(${REL_TEST_NAME} MATCHES "llvm" OR - ${REL_TEST_NAME} MATCHES "expression_registry") + set(TARGET_NAME gandiva-${REL_TEST_NAME}) + + if((TARGET ${TARGET_NAME}) AND + (${REL_TEST_NAME} MATCHES "llvm" OR + ${REL_TEST_NAME} MATCHES "expression_registry")) # If the unit test has llvm in its name, include llvm. 
- add_dependencies(gandiva-${REL_TEST_NAME} LLVM::LLVM_INTERFACE) - target_link_libraries(gandiva-${REL_TEST_NAME} PRIVATE LLVM::LLVM_INTERFACE) + add_dependencies(${TARGET_NAME} LLVM::LLVM_INTERFACE) + target_link_libraries(${TARGET_NAME} PRIVATE LLVM::LLVM_INTERFACE) endif() endfunction() -if (ARROW_GANDIVA_BUILD_TESTS) - ADD_GANDIVA_TEST(bitmap_accumulator_test) - ADD_GANDIVA_TEST(engine_llvm_test) - ADD_GANDIVA_TEST(function_signature_test) - ADD_GANDIVA_TEST(function_registry_test) - ADD_GANDIVA_TEST(llvm_types_test) - ADD_GANDIVA_TEST(llvm_generator_test) - ADD_GANDIVA_TEST(annotator_test) - ADD_GANDIVA_TEST(tree_expr_test) - ADD_GANDIVA_TEST(expr_decomposer_test) - ADD_GANDIVA_TEST(expression_registry_test) - ADD_GANDIVA_TEST(selection_vector_test) - ADD_GANDIVA_TEST(lru_cache_test) - ADD_GANDIVA_TEST(to_date_holder_test) - ADD_GANDIVA_TEST(simple_arena_test) -endif() +ADD_GANDIVA_TEST(bitmap_accumulator_test) +ADD_GANDIVA_TEST(engine_llvm_test) +ADD_GANDIVA_TEST(function_signature_test) +ADD_GANDIVA_TEST(function_registry_test) +ADD_GANDIVA_TEST(llvm_types_test) +ADD_GANDIVA_TEST(llvm_generator_test) +ADD_GANDIVA_TEST(annotator_test) +ADD_GANDIVA_TEST(tree_expr_test) +ADD_GANDIVA_TEST(expr_decomposer_test) +ADD_GANDIVA_TEST(expression_registry_test) +ADD_GANDIVA_TEST(selection_vector_test) +ADD_GANDIVA_TEST(lru_cache_test) +ADD_GANDIVA_TEST(to_date_holder_test) +ADD_GANDIVA_TEST(simple_arena_test) if (ARROW_GANDIVA_JAVA) add_subdirectory(jni) endif() -add_subdirectory(precompiled) - -if (ARROW_GANDIVA_BUILD_TESTS) - include(CTest) - enable_testing() - add_subdirectory(tests) -endif() +add_subdirectory(precompiled) +add_subdirectory(tests) diff --git a/cpp/src/gandiva/precompiled/CMakeLists.txt b/cpp/src/gandiva/precompiled/CMakeLists.txt index 886fdced887ff..0792fd6421d65 100644 --- a/cpp/src/gandiva/precompiled/CMakeLists.txt +++ b/cpp/src/gandiva/precompiled/CMakeLists.txt @@ -69,7 +69,7 @@ function(add_precompiled_unit_test REL_TEST_NAME) endfunction(add_precompiled_unit_test REL_TEST_NAME) # testing -if (ARROW_GANDIVA_BUILD_TESTS) +if (ARROW_BUILD_TESTS) add_precompiled_unit_test(bitmap_test.cc bitmap.cc) add_precompiled_unit_test(epoch_time_point_test.cc) add_precompiled_unit_test(time_test.cc time.cc timestamp_arithmetic.cc ../context_helper.cc) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 246f69dcc09fa..6b7846b709d0b 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -238,26 +238,7 @@ add_subdirectory(api) add_subdirectory(arrow) add_subdirectory(util) -# Headers: top level -install(FILES - bloom_filter.h - column_reader.h - column_page.h - column_scanner.h - column_writer.h - encoding.h - exception.h - file_reader.h - file_writer.h - hasher.h - metadata.h - murmur3.h - printer.h - properties.h - schema.h - statistics.h - types.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet") +ARROW_INSTALL_ALL_HEADERS("parquet") configure_file(parquet_version.h.in "${CMAKE_CURRENT_BINARY_DIR}/parquet_version.h" diff --git a/cpp/src/parquet/api/CMakeLists.txt b/cpp/src/parquet/api/CMakeLists.txt index 79fc716952a16..48fddb9d61ddf 100644 --- a/cpp/src/parquet/api/CMakeLists.txt +++ b/cpp/src/parquet/api/CMakeLists.txt @@ -16,9 +16,4 @@ # under the License. 
# Headers: public api -install(FILES - io.h - reader.h - writer.h - schema.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet/api") +ARROW_INSTALL_ALL_HEADERS("parquet/api") diff --git a/cpp/src/parquet/arrow/CMakeLists.txt b/cpp/src/parquet/arrow/CMakeLists.txt index 429dadcd37e5e..9372c3110a3af 100644 --- a/cpp/src/parquet/arrow/CMakeLists.txt +++ b/cpp/src/parquet/arrow/CMakeLists.txt @@ -22,9 +22,4 @@ ADD_ARROW_BENCHMARK(reader-writer-benchmark PREFIX "parquet-arrow" EXTRA_LINK_LIBS ${PARQUET_BENCHMARK_LINK_LIBRARIES}) -# Headers: top level -install(FILES - reader.h - schema.h - writer.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet/arrow") +ARROW_INSTALL_ALL_HEADERS("parquet/arrow") diff --git a/cpp/src/parquet/util/CMakeLists.txt b/cpp/src/parquet/util/CMakeLists.txt index 72d4ca28f9b83..b5718b1601ee0 100644 --- a/cpp/src/parquet/util/CMakeLists.txt +++ b/cpp/src/parquet/util/CMakeLists.txt @@ -16,12 +16,7 @@ # under the License. # Headers: util -install(FILES - comparison.h - macros.h - memory.h - visibility.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet/util") +ARROW_INSTALL_ALL_HEADERS("parquet/util") ADD_PARQUET_TEST(comparison-test) ADD_PARQUET_TEST(memory-test) diff --git a/cpp/src/plasma/CMakeLists.txt b/cpp/src/plasma/CMakeLists.txt index 4ea4b76066cf7..317835bb7ac44 100644 --- a/cpp/src/plasma/CMakeLists.txt +++ b/cpp/src/plasma/CMakeLists.txt @@ -127,6 +127,7 @@ endif() # be copied around and used in different locations. add_executable(plasma_store_server store.cc) target_link_libraries(plasma_store_server plasma_static ${PLASMA_STATIC_LINK_LIBS}) +add_dependencies(plasma plasma_store_server) if (ARROW_RPATH_ORIGIN) if (APPLE) @@ -138,7 +139,6 @@ if (ARROW_RPATH_ORIGIN) INSTALL_RPATH ${_lib_install_rpath}) endif() -# Headers: top level install(FILES common.h compat.h @@ -149,7 +149,9 @@ install(FILES # Plasma store set_target_properties(plasma_store_server PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE) -install(TARGETS plasma_store_server DESTINATION ${CMAKE_INSTALL_BINDIR}) +install(TARGETS plasma_store_server + ${INSTALL_IS_OPTIONAL} + DESTINATION ${CMAKE_INSTALL_BINDIR}) # pkg-config support configure_file(plasma.pc.in diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt index 47aea28ff6828..bbbec29c13009 100644 --- a/cpp/tools/parquet/CMakeLists.txt +++ b/cpp/tools/parquet/CMakeLists.txt @@ -26,7 +26,9 @@ if (PARQUET_BUILD_EXECUTABLES) target_link_libraries(${TOOL} parquet_static) # Avoid unsetting RPATH when installing set_target_properties(${TOOL} PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE) - install(TARGETS ${TOOL} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + install(TARGETS ${TOOL} + ${INSTALL_IS_OPTIONAL} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endforeach(TOOL) add_dependencies(parquet ${PARQUET_TOOLS}) diff --git a/dev/tasks/gandiva-jars/build-cpp.sh b/dev/tasks/gandiva-jars/build-cpp.sh index 21289dee5a6b1..ae13f9c0193ce 100755 --- a/dev/tasks/gandiva-jars/build-cpp.sh +++ b/dev/tasks/gandiva-jars/build-cpp.sh @@ -27,6 +27,7 @@ pushd arrow/cpp pushd build cmake -DCMAKE_BUILD_TYPE=Release \ -DARROW_GANDIVA=ON \ + -DARROW_GANDIVA_JAVA=ON \ -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON \ -DARROW_BUILD_UTILITIES=OFF \ -DARROW_BOOST_USE_SHARED=OFF \ From 73f94c93d7eee25a43415dfa7a806b887942abd1 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Dec 2018 14:49:12 -0600 Subject: [PATCH 040/328] ARROW-3762: [C++/Python] Support reading Parquet BYTE_ARRAY columns containing over 2GB of data This patch ended up being a bit more 
of a bloodbath than I planned: please accept my apologies. Associated changes in this patch: * Split up builder.h/builder.cc into a new arrow/array directory. Public arrow/builder.h API preserved. I think this code is going to keep growing more specialized components, so I think we should get out ahead of it by having a subdirectory to contain files related to implementation details * Implement ChunkedBinaryBuilder, ChunkedStringBuilder classes, add tests and benchmarks * Deprecate parquet::arrow methods returning Array * Allow implicit construction of Datum from its variant types (makes for a lot nicer syntax) As far as what code to review, focus efforts on * src/parquet/arrow * src/arrow/array/builder_binary.h/cc, array-binary-test.cc, builder-benchmark * src/arrow/compute changes * Python changes I'm going to tackle ARROW-2970 which should not be complicated after this patch; I will submit that as a PR after this is reviews and merged. Author: Wes McKinney Closes #3171 from wesm/ARROW-3762 and squashes the following commits: 822451280 Fix int conversion warning on Windows 695ffc9df Remove unimplemented and unused ChunkedBinaryBuilder ctor 5a525115c Use strnlen to compute string length. Inline BinaryBuilder::AppendNextOffset b90eb4b71 Restore sstream include to pretty_print.cc 3669201be Fix deprecated API use 5fdbbb261 Rename columnar/ directory to array/ 8ffaec1ef Address preliminary code comments. Check in missing files 81e787c69 Fix up Python bindings, unit test 2efae064c Finish scaffolding. Get fully compiling again and original parquet-arrow test suite passing 3d075e4aa Additional refactoring to make things chunked. Allow implicit construction of arrow::compute::Datum 922811278 More refactoring 716322377 Split up builder.h, builder.cc into smaller headers, compilation units. add failing test case for ARROW-3762. 
Add ChunkedBinaryBuilder, make BinaryBuilder Append methods inline --- cpp/cmake_modules/SetupCxxFlags.cmake | 3 + cpp/examples/parquet/CMakeLists.txt | 2 +- .../parquet/parquet-arrow/CMakeLists.txt | 2 +- .../parquet-arrow/{src => }/reader-writer.cc | 4 +- cpp/src/arrow/CMakeLists.txt | 15 +- cpp/src/arrow/allocator-test.cc | 1 + cpp/src/arrow/array-binary-test.cc | 114 +- cpp/src/arrow/array-dict-test.cc | 8 +- cpp/src/arrow/array-list-test.cc | 4 +- cpp/src/arrow/array-struct-test.cc | 4 +- cpp/src/arrow/array-test.cc | 2 - cpp/src/arrow/array.cc | 1 + cpp/src/arrow/array.h | 1 - cpp/src/arrow/array/CMakeLists.txt | 27 + cpp/src/arrow/array/README.md | 20 + .../builder_adaptive.cc} | 4 +- cpp/src/arrow/array/builder_adaptive.h | 174 +++ cpp/src/arrow/array/builder_base.cc | 176 +++ cpp/src/arrow/array/builder_base.h | 227 ++++ .../builder_binary.cc} | 78 +- cpp/src/arrow/array/builder_binary.h | 304 +++++ cpp/src/arrow/array/builder_decimal.cc | 64 + cpp/src/arrow/array/builder_decimal.h | 45 + .../builder_dict.cc} | 4 +- cpp/src/arrow/array/builder_dict.h | 167 +++ cpp/src/arrow/array/builder_nested.cc | 156 +++ cpp/src/arrow/array/builder_nested.h | 121 ++ cpp/src/arrow/array/builder_primitive.cc | 272 ++++ cpp/src/arrow/array/builder_primitive.h | 401 ++++++ cpp/src/arrow/builder-benchmark.cc | 30 +- cpp/src/arrow/builder.cc | 503 +------ cpp/src/arrow/builder.h | 1177 +---------------- cpp/src/arrow/compute/compute-test.cc | 61 +- cpp/src/arrow/compute/kernel.h | 35 +- cpp/src/arrow/csv/column-builder.h | 21 +- cpp/src/arrow/csv/converter.cc | 1 + cpp/src/arrow/csv/parser.h | 1 + cpp/src/arrow/csv/reader.cc | 2 + cpp/src/arrow/io/buffered.cc | 2 +- cpp/src/arrow/io/buffered.h | 1 + cpp/src/arrow/ipc/feather-test.cc | 1 + cpp/src/arrow/ipc/json-simple-test.cc | 1 + cpp/src/arrow/memory_pool-test.h | 1 + cpp/src/arrow/memory_pool.cc | 12 +- cpp/src/arrow/pretty_print-test.cc | 4 +- cpp/src/arrow/pretty_print.cc | 2 +- cpp/src/arrow/pretty_print.h | 5 +- cpp/src/arrow/python/numpy_to_arrow.cc | 27 +- cpp/src/arrow/python/python-test.cc | 1 + cpp/src/arrow/record_batch.h | 1 + cpp/src/arrow/table.h | 5 + cpp/src/arrow/tensor.cc | 1 + cpp/src/arrow/test-util.cc | 13 +- cpp/src/arrow/test-util.h | 18 +- cpp/src/arrow/util/compression_lz4.cc | 1 + cpp/src/arrow/util/int-util-test.cc | 2 - cpp/src/arrow/util/string_view.h | 2 +- cpp/src/parquet/arrow/CMakeLists.txt | 5 +- .../parquet/arrow/arrow-reader-writer-test.cc | 15 +- cpp/src/parquet/arrow/reader.cc | 567 ++++---- cpp/src/parquet/arrow/reader.h | 38 +- cpp/src/parquet/arrow/record_reader.cc | 103 +- cpp/src/parquet/arrow/record_reader.h | 7 +- python/pyarrow/_parquet.pxd | 6 +- python/pyarrow/_parquet.pyx | 23 +- python/pyarrow/lib.pxd | 2 + python/pyarrow/tests/test_parquet.py | 27 + 67 files changed, 2985 insertions(+), 2140 deletions(-) rename cpp/examples/parquet/parquet-arrow/{src => }/reader-writer.cc (98%) create mode 100644 cpp/src/arrow/array/CMakeLists.txt create mode 100644 cpp/src/arrow/array/README.md rename cpp/src/arrow/{builder-adaptive.cc => array/builder_adaptive.cc} (99%) create mode 100644 cpp/src/arrow/array/builder_adaptive.h create mode 100644 cpp/src/arrow/array/builder_base.cc create mode 100644 cpp/src/arrow/array/builder_base.h rename cpp/src/arrow/{builder-binary.cc => array/builder_binary.cc} (86%) create mode 100644 cpp/src/arrow/array/builder_binary.h create mode 100644 cpp/src/arrow/array/builder_decimal.cc create mode 100644 cpp/src/arrow/array/builder_decimal.h rename cpp/src/arrow/{builder-dict.cc 
=> array/builder_dict.cc} (99%) create mode 100644 cpp/src/arrow/array/builder_dict.h create mode 100644 cpp/src/arrow/array/builder_nested.cc create mode 100644 cpp/src/arrow/array/builder_nested.h create mode 100644 cpp/src/arrow/array/builder_primitive.cc create mode 100644 cpp/src/arrow/array/builder_primitive.h diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 893ec360d3e55..61fd14ca2cf46 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -25,6 +25,9 @@ CHECK_CXX_COMPILER_FLAG("-maltivec" CXX_SUPPORTS_ALTIVEC) # Arm64 compiler flags CHECK_CXX_COMPILER_FLAG("-march=armv8-a+crc" CXX_SUPPORTS_ARMCRC) +# Support C11 +set(CMAKE_C_STANDARD 11) + # This ensures that things like gnu++11 get passed correctly set(CMAKE_CXX_STANDARD 11) diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/CMakeLists.txt index 98c5cd9402bb7..db172a2534f37 100644 --- a/cpp/examples/parquet/CMakeLists.txt +++ b/cpp/examples/parquet/CMakeLists.txt @@ -22,7 +22,7 @@ target_include_directories(parquet-low-level-example2 PRIVATE low-level-api/) target_link_libraries(parquet-low-level-example parquet_static) target_link_libraries(parquet-low-level-example2 parquet_static) -add_executable(parquet-arrow-example parquet-arrow/src/reader-writer.cc) +add_executable(parquet-arrow-example parquet-arrow/reader-writer.cc) target_link_libraries(parquet-arrow-example parquet_shared) add_dependencies(parquet diff --git a/cpp/examples/parquet/parquet-arrow/CMakeLists.txt b/cpp/examples/parquet/parquet-arrow/CMakeLists.txt index d9e01acd3eea3..915930ec228e1 100644 --- a/cpp/examples/parquet/parquet-arrow/CMakeLists.txt +++ b/cpp/examples/parquet/parquet-arrow/CMakeLists.txt @@ -38,5 +38,5 @@ find_package(Parquet) include_directories(SYSTEM ${ARROW_INCLUDE_DIR} ${PARQUET_INCLUDE_DIR}) -add_executable(parquet-arrow-example src/reader-writer.cc) +add_executable(parquet-arrow-example reader-writer.cc) target_link_libraries(parquet-arrow-example ${PARQUET_SHARED_LIB} ${ARROW_SHARED_LIB}) diff --git a/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc b/cpp/examples/parquet/parquet-arrow/reader-writer.cc similarity index 98% rename from cpp/examples/parquet/parquet-arrow/src/reader-writer.cc rename to cpp/examples/parquet/parquet-arrow/reader-writer.cc index 8d474486e7413..a5f928b6d4f69 100644 --- a/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc +++ b/cpp/examples/parquet/parquet-arrow/reader-writer.cc @@ -100,7 +100,7 @@ void read_single_column() { std::unique_ptr reader; PARQUET_THROW_NOT_OK( parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); - std::shared_ptr array; + std::shared_ptr array; PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array)); PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); std::cout << std::endl; @@ -119,7 +119,7 @@ void read_single_column_chunk() { std::unique_ptr reader; PARQUET_THROW_NOT_OK( parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); - std::shared_ptr array; + std::shared_ptr array; PARQUET_THROW_NOT_OK(reader->RowGroup(0)->Column(0)->Read(&array)); PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); std::cout << std::endl; diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index e12d2d2ee2958..b13c9b66ac48d 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -53,11 +53,17 @@ endfunction() set(ARROW_SRCS array.cc - buffer.cc + builder.cc - builder-adaptive.cc - 
builder-binary.cc - builder-dict.cc + array/builder_adaptive.cc + array/builder_base.cc + array/builder_binary.cc + array/builder_decimal.cc + array/builder_dict.cc + array/builder_nested.cc + array/builder_primitive.cc + + buffer.cc compare.cc memory_pool.cc pretty_print.cc @@ -275,6 +281,7 @@ ADD_ARROW_TEST(tensor-test) ADD_ARROW_BENCHMARK(builder-benchmark) ADD_ARROW_BENCHMARK(column-benchmark) +add_subdirectory(array) add_subdirectory(csv) add_subdirectory(io) add_subdirectory(util) diff --git a/cpp/src/arrow/allocator-test.cc b/cpp/src/arrow/allocator-test.cc index cdffbd7e8494f..1a94467281dbc 100644 --- a/cpp/src/arrow/allocator-test.cc +++ b/cpp/src/arrow/allocator-test.cc @@ -17,6 +17,7 @@ #include #include +#include #include #include diff --git a/cpp/src/arrow/array-binary-test.cc b/cpp/src/arrow/array-binary-test.cc index 4376695c68cba..6f938c82bfd0a 100644 --- a/cpp/src/arrow/array-binary-test.cc +++ b/cpp/src/arrow/array-binary-test.cc @@ -15,10 +15,8 @@ // specific language governing permissions and limitations // under the License. -#include #include #include -#include #include #include #include @@ -28,10 +26,14 @@ #include "arrow/array.h" #include "arrow/buffer.h" #include "arrow/builder.h" +#include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/checked_cast.h" namespace arrow { @@ -676,4 +678,112 @@ TEST_F(TestStringArray, TestSliceEquality) { CheckSliceEquality(); } TEST_F(TestBinaryArray, LengthZeroCtor) { BinaryArray array(0, nullptr, nullptr); } +// ---------------------------------------------------------------------- +// ChunkedBinaryBuilder tests + +class TestChunkedBinaryBuilder : public ::testing::Test { + public: + void SetUp() {} + + void Init(int32_t chunksize) { + builder_.reset(new internal::ChunkedBinaryBuilder(chunksize)); + } + + protected: + std::unique_ptr builder_; +}; + +TEST_F(TestChunkedBinaryBuilder, BasicOperation) { + const int32_t chunksize = 1000; + Init(chunksize); + + const int elem_size = 10; + uint8_t buf[elem_size]; + + BinaryBuilder unchunked_builder; + + const int iterations = 1000; + for (int i = 0; i < iterations; ++i) { + random_bytes(elem_size, i, buf); + + ASSERT_OK(unchunked_builder.Append(buf, elem_size)); + ASSERT_OK(builder_->Append(buf, elem_size)); + } + + std::shared_ptr unchunked; + ASSERT_OK(unchunked_builder.Finish(&unchunked)); + + ArrayVector chunks; + ASSERT_OK(builder_->Finish(&chunks)); + + // This assumes that everything is evenly divisible + ArrayVector expected_chunks; + const int elems_per_chunk = chunksize / elem_size; + for (int i = 0; i < iterations / elems_per_chunk; ++i) { + expected_chunks.emplace_back(unchunked->Slice(i * elems_per_chunk, elems_per_chunk)); + } + + ASSERT_EQ(expected_chunks.size(), chunks.size()); + for (size_t i = 0; i < chunks.size(); ++i) { + AssertArraysEqual(*expected_chunks[i], *chunks[i]); + } +} + +TEST_F(TestChunkedBinaryBuilder, NoData) { + Init(1000); + + ArrayVector chunks; + ASSERT_OK(builder_->Finish(&chunks)); + + ASSERT_EQ(1, chunks.size()); + ASSERT_EQ(0, chunks[0]->length()); +} + +TEST_F(TestChunkedBinaryBuilder, LargeElements) { + Init(100); + + const int bufsize = 101; + uint8_t buf[bufsize]; + + const int iterations = 100; + for (int i = 0; i < iterations; ++i) { + random_bytes(bufsize, i, buf); + ASSERT_OK(builder_->Append(buf, bufsize)); + } + + ArrayVector chunks; + 
ASSERT_OK(builder_->Finish(&chunks)); + ASSERT_EQ(iterations, static_cast(chunks.size())); + + int64_t total_data_size = 0; + for (auto chunk : chunks) { + ASSERT_EQ(1, chunk->length()); + total_data_size += + static_cast(static_cast(*chunk).GetView(0).size()); + } + ASSERT_EQ(iterations * bufsize, total_data_size); +} + +TEST(TestChunkedStringBuilder, BasicOperation) { + const int chunksize = 100; + internal::ChunkedStringBuilder builder(chunksize); + + std::string value = "0123456789"; + + const int iterations = 100; + for (int i = 0; i < iterations; ++i) { + ASSERT_OK(builder.Append(value)); + } + + ArrayVector chunks; + ASSERT_OK(builder.Finish(&chunks)); + + ASSERT_EQ(10, chunks.size()); + + // Type is correct + for (auto chunk : chunks) { + ASSERT_TRUE(chunk->type()->Equals(*::arrow::utf8())); + } +} + } // namespace arrow diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc index cc471a3e54066..87cb2290a7bf9 100644 --- a/cpp/src/arrow/array-dict-test.cc +++ b/cpp/src/arrow/array-dict-test.cc @@ -15,23 +15,23 @@ // specific language governing permissions and limitations // under the License. -#include +#include #include -#include -#include #include +#include #include #include #include #include "arrow/array.h" -#include "arrow/buffer.h" #include "arrow/builder.h" +#include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/util/decimal.h" namespace arrow { diff --git a/cpp/src/arrow/array-list-test.cc b/cpp/src/arrow/array-list-test.cc index 207acd4cf65d7..c49c5e3097058 100644 --- a/cpp/src/arrow/array-list-test.cc +++ b/cpp/src/arrow/array-list-test.cc @@ -15,10 +15,8 @@ // specific language governing permissions and limitations // under the License. -#include #include #include -#include #include #include #include @@ -32,6 +30,8 @@ #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/checked_cast.h" namespace arrow { diff --git a/cpp/src/arrow/array-struct-test.cc b/cpp/src/arrow/array-struct-test.cc index dc8bafd4c0071..68c35f57116a8 100644 --- a/cpp/src/arrow/array-struct-test.cc +++ b/cpp/src/arrow/array-struct-test.cc @@ -15,10 +15,8 @@ // specific language governing permissions and limitations // under the License. 
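The `TestChunkedBinaryBuilder` and `TestChunkedStringBuilder` tests above illustrate the intended usage of the new `arrow::internal::ChunkedBinaryBuilder`. As a minimal caller-side sketch, assuming only the API exercised by those tests (`Append` and `Finish(ArrayVector*)`); the `BuildChunks` helper name and the 1 MB chunk size below are illustrative, not part of the patch:

```cpp
// Sketch only: builds binary data in bounded chunks using the API shown in
// the tests above. Helper name and chunk size are illustrative assumptions.
#include <cstdint>
#include <string>

#include "arrow/array.h"
#include "arrow/array/builder_binary.h"
#include "arrow/status.h"

arrow::Status BuildChunks(arrow::ArrayVector* out) {
  // Cap each chunk's value data at ~1 MB so no single BinaryArray grows
  // toward the 32-bit offset limit.
  arrow::internal::ChunkedBinaryBuilder builder(1 << 20);
  const std::string value = "some binary payload";
  for (int i = 0; i < 1000; ++i) {
    ARROW_RETURN_NOT_OK(builder.Append(
        reinterpret_cast<const uint8_t*>(value.data()),
        static_cast<int32_t>(value.size())));
  }
  return builder.Finish(out);  // produces one or more BinaryArray chunks
}
```

Chunking in the builder is what lets Parquet BYTE_ARRAY columns with more than 2 GB of data be surfaced as a chunked Arrow column instead of failing on the single-array capacity limit.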
-#include #include #include -#include #include #include #include @@ -26,12 +24,12 @@ #include #include "arrow/array.h" -#include "arrow/buffer.h" #include "arrow/builder.h" #include "arrow/status.h" #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/util/checked_cast.h" namespace arrow { diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index de0885e6f5f3a..bdb7eda118d51 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -40,7 +39,6 @@ #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" -#include "arrow/type_traits.h" #include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 05d66d5cffdb2..ff94aa2a1e6fe 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -18,6 +18,7 @@ #include "arrow/array.h" #include +#include #include #include #include diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 37fa5aedfc2d0..52c5207d8dddc 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -18,7 +18,6 @@ #ifndef ARROW_ARRAY_H #define ARROW_ARRAY_H -#include #include #include #include diff --git a/cpp/src/arrow/array/CMakeLists.txt b/cpp/src/arrow/array/CMakeLists.txt new file mode 100644 index 0000000000000..a789c88dd9d31 --- /dev/null +++ b/cpp/src/arrow/array/CMakeLists.txt @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Headers: top level +install(FILES + builder_adaptive.h + builder_base.h + builder_binary.h + builder_decimal.h + builder_dict.h + builder_nested.h + builder_primitive.h + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/array") diff --git a/cpp/src/arrow/array/README.md b/cpp/src/arrow/array/README.md new file mode 100644 index 0000000000000..09580193aad28 --- /dev/null +++ b/cpp/src/arrow/array/README.md @@ -0,0 +1,20 @@ + + +## Implementation details related to columnnar (array) data structures diff --git a/cpp/src/arrow/builder-adaptive.cc b/cpp/src/arrow/array/builder_adaptive.cc similarity index 99% rename from cpp/src/arrow/builder-adaptive.cc rename to cpp/src/arrow/array/builder_adaptive.cc index a715f469c7aa1..599e9e1c38d76 100644 --- a/cpp/src/arrow/builder-adaptive.cc +++ b/cpp/src/arrow/array/builder_adaptive.cc @@ -15,13 +15,15 @@ // specific language governing permissions and limitations // under the License. 
+#include "arrow/array/builder_adaptive.h" + +#include #include #include #include #include "arrow/array.h" #include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" diff --git a/cpp/src/arrow/array/builder_adaptive.h b/cpp/src/arrow/array/builder_adaptive.h new file mode 100644 index 0000000000000..6523de41622e4 --- /dev/null +++ b/cpp/src/arrow/array/builder_adaptive.h @@ -0,0 +1,174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/array/builder_base.h" + +namespace arrow { + +namespace internal { + +class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { + public: + explicit AdaptiveIntBuilderBase(MemoryPool* pool); + + /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory + Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { + ARROW_RETURN_NOT_OK(CommitPendingData()); + ARROW_RETURN_NOT_OK(Reserve(length)); + memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length); + UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); + } + + Status AppendNull() { + pending_data_[pending_pos_] = 0; + pending_valid_[pending_pos_] = 0; + pending_has_nulls_ = true; + ++pending_pos_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + + void Reset() override; + Status Resize(int64_t capacity) override; + + protected: + virtual Status CommitPendingData() = 0; + + std::shared_ptr data_; + uint8_t* raw_data_; + uint8_t int_size_; + + static constexpr int32_t pending_size_ = 1024; + uint8_t pending_valid_[pending_size_]; + uint64_t pending_data_[pending_size_]; + int32_t pending_pos_; + bool pending_has_nulls_; +}; + +} // namespace internal + +class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase { + public: + explicit AdaptiveUIntBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + using ArrayBuilder::Advance; + using internal::AdaptiveIntBuilderBase::Reset; + + /// Scalar append + Status Append(const uint64_t val) { + pending_data_[pending_pos_] = val; + pending_valid_[pending_pos_] = 1; + ++pending_pos_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const uint64_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + Status 
FinishInternal(std::shared_ptr* out) override; + + protected: + Status CommitPendingData() override; + Status ExpandIntSize(uint8_t new_int_size); + + Status AppendValuesInternal(const uint64_t* values, int64_t length, + const uint8_t* valid_bytes); + + template + typename std::enable_if= sizeof(new_type), Status>::type + ExpandIntSizeInternal(); +#define __LESS(a, b) (a) < (b) + template + typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type + ExpandIntSizeInternal(); +#undef __LESS + + template + Status ExpandIntSizeN(); +}; + +class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase { + public: + explicit AdaptiveIntBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + using ArrayBuilder::Advance; + using internal::AdaptiveIntBuilderBase::Reset; + + /// Scalar append + Status Append(const int64_t val) { + auto v = static_cast(val); + + pending_data_[pending_pos_] = v; + pending_valid_[pending_pos_] = 1; + ++pending_pos_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const int64_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + Status FinishInternal(std::shared_ptr* out) override; + + protected: + Status CommitPendingData() override; + Status ExpandIntSize(uint8_t new_int_size); + + Status AppendValuesInternal(const int64_t* values, int64_t length, + const uint8_t* valid_bytes); + + template + typename std::enable_if= sizeof(new_type), Status>::type + ExpandIntSizeInternal(); +#define __LESS(a, b) (a) < (b) + template + typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type + ExpandIntSizeInternal(); +#undef __LESS + + template + Status ExpandIntSizeN(); +}; + +} // namespace arrow diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc new file mode 100644 index 0000000000000..321aa44dab5e3 --- /dev/null +++ b/cpp/src/arrow/array/builder_base.cc @@ -0,0 +1,176 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
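The adaptive builders declared above buffer pending values and only widen their storage integer size when a value requires it. A minimal caller-side sketch, assuming the `Append`/`Finish` signatures shown in the header; the `BuildAdaptive` helper name is illustrative:

```cpp
// Sketch only: appends small values to an AdaptiveIntBuilder, which should
// select the narrowest integer width that fits them. Helper name is
// illustrative, not part of the patch.
#include <memory>

#include "arrow/array.h"
#include "arrow/array/builder_adaptive.h"
#include "arrow/status.h"

arrow::Status BuildAdaptive(std::shared_ptr<arrow::Array>* out) {
  arrow::AdaptiveIntBuilder builder;
  for (int64_t v = 1; v <= 3; ++v) {
    ARROW_RETURN_NOT_OK(builder.Append(v));  // all values fit in 8 bits
  }
  // The finished array typically comes out with a narrow type (int8 here)
  // rather than int64.
  return builder.Finish(out);
}
```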
+ +#include "arrow/array/builder_base.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/int-util.h" +#include "arrow/util/logging.h" + +namespace arrow { + +Status ArrayBuilder::TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer) { + if (buffer) { + if (bytes_filled < buffer->size()) { + // Trim buffer + RETURN_NOT_OK(buffer->Resize(bytes_filled)); + } + // zero the padding + buffer->ZeroPadding(); + } else { + // Null buffers are allowed in place of 0-byte buffers + DCHECK_EQ(bytes_filled, 0); + } + return Status::OK(); +} + +Status ArrayBuilder::AppendToBitmap(bool is_valid) { + if (length_ == capacity_) { + // If the capacity was not already a multiple of 2, do so here + // TODO(emkornfield) doubling isn't great default allocation practice + // see https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md + // fo discussion + RETURN_NOT_OK(Resize(BitUtil::NextPower2(capacity_ + 1))); + } + UnsafeAppendToBitmap(is_valid); + return Status::OK(); +} + +Status ArrayBuilder::AppendToBitmap(const uint8_t* valid_bytes, int64_t length) { + RETURN_NOT_OK(Reserve(length)); + + UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); +} + +Status ArrayBuilder::Resize(int64_t capacity) { + // Target size of validity (null) bitmap data + const int64_t new_bitmap_size = BitUtil::BytesForBits(capacity); + RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + + if (capacity_ == 0) { + RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_bitmap_size, &null_bitmap_)); + null_bitmap_data_ = null_bitmap_->mutable_data(); + + // Padding is zeroed by AllocateResizableBuffer + memset(null_bitmap_data_, 0, static_cast(new_bitmap_size)); + } else { + const int64_t old_bitmap_capacity = null_bitmap_->capacity(); + RETURN_NOT_OK(null_bitmap_->Resize(new_bitmap_size)); + + const int64_t new_bitmap_capacity = null_bitmap_->capacity(); + null_bitmap_data_ = null_bitmap_->mutable_data(); + + // Zero the region between the original capacity and the new capacity, + // including padding, which has not been zeroed, unlike + // AllocateResizableBuffer + if (old_bitmap_capacity < new_bitmap_capacity) { + memset(null_bitmap_data_ + old_bitmap_capacity, 0, + static_cast(new_bitmap_capacity - old_bitmap_capacity)); + } + } + capacity_ = capacity; + return Status::OK(); +} + +Status ArrayBuilder::Advance(int64_t elements) { + if (length_ + elements > capacity_) { + return Status::Invalid("Builder must be expanded"); + } + length_ += elements; + return Status::OK(); +} + +Status ArrayBuilder::Finish(std::shared_ptr* out) { + std::shared_ptr internal_data; + RETURN_NOT_OK(FinishInternal(&internal_data)); + *out = MakeArray(internal_data); + return Status::OK(); +} + +Status ArrayBuilder::Reserve(int64_t additional_elements) { + if (length_ + additional_elements > capacity_) { + // TODO(emkornfield) power of 2 growth is potentially suboptimal + int64_t new_size = BitUtil::NextPower2(length_ + additional_elements); + return Resize(new_size); + } + return Status::OK(); +} + +void ArrayBuilder::Reset() { + capacity_ = length_ = null_count_ = 0; + null_bitmap_ = nullptr; +} + +Status ArrayBuilder::SetNotNull(int64_t length) { + RETURN_NOT_OK(Reserve(length)); + UnsafeSetNotNull(length); + return Status::OK(); +} + +void ArrayBuilder::UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t 
length) { + if (valid_bytes == nullptr) { + UnsafeSetNotNull(length); + return; + } + UnsafeAppendToBitmap(valid_bytes, valid_bytes + length); +} + +void ArrayBuilder::UnsafeAppendToBitmap(const std::vector& is_valid) { + UnsafeAppendToBitmap(is_valid.begin(), is_valid.end()); +} + +void ArrayBuilder::UnsafeSetNotNull(int64_t length) { + const int64_t new_length = length + length_; + + // Fill up the bytes until we have a byte alignment + int64_t pad_to_byte = std::min(8 - (length_ % 8), length); + + if (pad_to_byte == 8) { + pad_to_byte = 0; + } + for (int64_t i = length_; i < length_ + pad_to_byte; ++i) { + BitUtil::SetBit(null_bitmap_data_, i); + } + + // Fast bitsetting + int64_t fast_length = (length - pad_to_byte) / 8; + memset(null_bitmap_data_ + ((length_ + pad_to_byte) / 8), 0xFF, + static_cast(fast_length)); + + // Trailing bits + for (int64_t i = length_ + pad_to_byte + (fast_length * 8); i < new_length; ++i) { + BitUtil::SetBit(null_bitmap_data_, i); + } + + length_ = new_length; +} + +} // namespace arrow diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h new file mode 100644 index 0000000000000..ae400fc463810 --- /dev/null +++ b/cpp/src/arrow/array/builder_base.h @@ -0,0 +1,227 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/array/builder_base.h" + +#include // IWYU pragma: keep +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/macros.h" +#include "arrow/util/string_view.h" +#include "arrow/util/type_traits.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +struct ArrayData; + +constexpr int64_t kMinBuilderCapacity = 1 << 5; +constexpr int64_t kListMaximumElements = std::numeric_limits::max() - 1; + +/// Base class for all data array builders. +/// +/// This class provides a facilities for incrementally building the null bitmap +/// (see Append methods) and as a side effect the current number of slots and +/// the null count. +/// +/// \note Users are expected to use builders as one of the concrete types below. +/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use. +class ARROW_EXPORT ArrayBuilder { + public: + explicit ArrayBuilder(const std::shared_ptr& type, MemoryPool* pool) + : type_(type), + pool_(pool), + null_bitmap_(NULLPTR), + null_count_(0), + null_bitmap_data_(NULLPTR), + length_(0), + capacity_(0) {} + + virtual ~ArrayBuilder() = default; + + /// For nested types. 
Since the objects are owned by this class instance, we + /// skip shared pointers and just return a raw pointer + ArrayBuilder* child(int i) { return children_[i].get(); } + + int num_children() const { return static_cast(children_.size()); } + + int64_t length() const { return length_; } + int64_t null_count() const { return null_count_; } + int64_t capacity() const { return capacity_; } + + /// \brief Ensure that enough memory has been allocated to fit the indicated + /// number of total elements in the builder, including any that have already + /// been appended. Does not account for reallocations that may be due to + /// variable size data, like binary values. To make space for incremental + /// appends, use Reserve instead. + /// + /// \param[in] capacity the minimum number of total array values to + /// accommodate. Must be greater than the current capacity. + /// \return Status + virtual Status Resize(int64_t capacity); + + /// \brief Ensure that there is enough space allocated to add the indicated + /// number of elements without any further calls to Resize. The memory + /// allocated is rounded up to the next highest power of 2 similar to memory + /// allocations in STL containers like std::vector + /// \param[in] additional_capacity the number of additional array values + /// \return Status + Status Reserve(int64_t additional_capacity); + + /// Reset the builder. + virtual void Reset(); + + /// For cases where raw data was memcpy'd into the internal buffers, allows us + /// to advance the length of the builder. It is your responsibility to use + /// this function responsibly. + Status Advance(int64_t elements); + + /// \brief Return result of builder as an internal generic ArrayData + /// object. Resets builder except for dictionary builder + /// + /// \param[out] out the finalized ArrayData object + /// \return Status + virtual Status FinishInternal(std::shared_ptr* out) = 0; + + /// \brief Return result of builder as an Array object. + /// + /// The builder is reset except for DictionaryBuilder. + /// + /// \param[out] out the finalized Array object + /// \return Status + Status Finish(std::shared_ptr* out); + + std::shared_ptr type() const { return type_; } + + protected: + ArrayBuilder() {} + + /// Append to null bitmap + Status AppendToBitmap(bool is_valid); + + /// Vector append. Treat each zero byte as a null. If valid_bytes is null + /// assume all of length bits are valid. + Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length); + + /// Set the next length bits to not null (i.e. valid). 
+ Status SetNotNull(int64_t length); + + // Unsafe operations (don't check capacity/don't resize) + + void UnsafeAppendNull() { UnsafeAppendToBitmap(false); } + + // Append to null bitmap, update the length + void UnsafeAppendToBitmap(bool is_valid) { + if (is_valid) { + BitUtil::SetBit(null_bitmap_data_, length_); + } else { + ++null_count_; + } + ++length_; + } + + template + void UnsafeAppendToBitmap(const IterType& begin, const IterType& end) { + int64_t byte_offset = length_ / 8; + int64_t bit_offset = length_ % 8; + uint8_t bitset = null_bitmap_data_[byte_offset]; + + for (auto iter = begin; iter != end; ++iter) { + if (bit_offset == 8) { + bit_offset = 0; + null_bitmap_data_[byte_offset] = bitset; + byte_offset++; + // TODO: Except for the last byte, this shouldn't be needed + bitset = null_bitmap_data_[byte_offset]; + } + + if (*iter) { + bitset |= BitUtil::kBitmask[bit_offset]; + } else { + bitset &= BitUtil::kFlippedBitmask[bit_offset]; + ++null_count_; + } + + bit_offset++; + } + + if (bit_offset != 0) { + null_bitmap_data_[byte_offset] = bitset; + } + + length_ += std::distance(begin, end); + } + + // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null + // assume all of length bits are valid. + void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length); + + void UnsafeAppendToBitmap(const std::vector& is_valid); + + // Set the next length bits to not null (i.e. valid). + void UnsafeSetNotNull(int64_t length); + + static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer); + + static Status CheckCapacity(int64_t new_capacity, int64_t old_capacity) { + if (new_capacity < 0) { + return Status::Invalid("Resize capacity must be positive"); + } + if (new_capacity < old_capacity) { + return Status::Invalid("Resize cannot downsize"); + } + return Status::OK(); + } + + std::shared_ptr type_; + MemoryPool* pool_; + + // When null_bitmap are first appended to the builder, the null bitmap is allocated + std::shared_ptr null_bitmap_; + int64_t null_count_; + uint8_t* null_bitmap_data_; + + // Array length, so far. Also, the index of the next element to be added + int64_t length_; + int64_t capacity_; + + // Child value array builders. These are owned by this class + std::vector> children_; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); +}; + +} // namespace arrow diff --git a/cpp/src/arrow/builder-binary.cc b/cpp/src/arrow/array/builder_binary.cc similarity index 86% rename from cpp/src/arrow/builder-binary.cc rename to cpp/src/arrow/array/builder_binary.cc index c250837b4a3fa..ad6ba11a484d1 100644 --- a/cpp/src/arrow/builder-binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. 
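As the `ArrayBuilder` documentation above notes, users work through the concrete builder types rather than the base class directly. A minimal sketch of the Reserve/Append/Finish lifecycle using `Int64Builder` (a long-standing concrete builder, not code from this patch; the `BuildInt64Array` helper name is illustrative):

```cpp
// Sketch only: typical lifecycle of a concrete builder derived from
// ArrayBuilder (Reserve -> Append -> Finish). Helper name is illustrative.
#include <memory>

#include "arrow/array.h"
#include "arrow/builder.h"
#include "arrow/status.h"

arrow::Status BuildInt64Array(std::shared_ptr<arrow::Array>* out) {
  arrow::Int64Builder builder;               // concrete ArrayBuilder subclass
  ARROW_RETURN_NOT_OK(builder.Reserve(3));   // pre-allocate three slots
  ARROW_RETURN_NOT_OK(builder.Append(1));
  ARROW_RETURN_NOT_OK(builder.AppendNull()); // recorded in the null bitmap
  ARROW_RETURN_NOT_OK(builder.Append(3));
  return builder.Finish(out);                // resets the builder afterwards
}
```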
+#include "arrow/array/builder_binary.h" + #include #include #include @@ -27,7 +29,6 @@ #include "arrow/array.h" #include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" @@ -68,32 +69,11 @@ Status BinaryBuilder::ReserveData(int64_t elements) { return Status::OK(); } -Status BinaryBuilder::AppendNextOffset() { - const int64_t num_bytes = value_data_builder_.length(); - if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) { - std::stringstream ss; - ss << "BinaryArray cannot contain more than " << kBinaryMemoryLimit << " bytes, have " - << num_bytes; - return Status::CapacityError(ss.str()); - } - return offsets_builder_.Append(static_cast(num_bytes)); -} - -Status BinaryBuilder::Append(const uint8_t* value, int32_t length) { - RETURN_NOT_OK(Reserve(1)); - RETURN_NOT_OK(AppendNextOffset()); - RETURN_NOT_OK(value_data_builder_.Append(value, length)); - - UnsafeAppendToBitmap(true); - return Status::OK(); -} - -Status BinaryBuilder::AppendNull() { - RETURN_NOT_OK(AppendNextOffset()); - RETURN_NOT_OK(Reserve(1)); - - UnsafeAppendToBitmap(false); - return Status::OK(); +Status BinaryBuilder::AppendOverflow(int64_t num_bytes) { + std::stringstream ss; + ss << "BinaryArray cannot contain more than " << kBinaryMemoryLimit << " bytes, have " + << num_bytes; + return Status::CapacityError(ss.str()); } Status BinaryBuilder::FinishInternal(std::shared_ptr* out) { @@ -292,24 +272,46 @@ util::string_view FixedSizeBinaryBuilder::GetView(int64_t i) const { } // ---------------------------------------------------------------------- -// Decimal128Builder +// ChunkedArray builders -Decimal128Builder::Decimal128Builder(const std::shared_ptr& type, - MemoryPool* pool) - : FixedSizeBinaryBuilder(type, pool) {} +namespace internal { -Status Decimal128Builder::Append(const Decimal128& value) { - RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1)); - return FixedSizeBinaryBuilder::Append(value.ToBytes()); +ChunkedBinaryBuilder::ChunkedBinaryBuilder(int32_t max_chunk_size, MemoryPool* pool) + : max_chunk_size_(max_chunk_size), + chunk_data_size_(0), + builder_(new BinaryBuilder(pool)) {} + +Status ChunkedBinaryBuilder::Finish(ArrayVector* out) { + if (builder_->length() > 0 || chunks_.size() == 0) { + std::shared_ptr chunk; + RETURN_NOT_OK(builder_->Finish(&chunk)); + chunks_.emplace_back(std::move(chunk)); + } + *out = std::move(chunks_); + return Status::OK(); } -Status Decimal128Builder::FinishInternal(std::shared_ptr* out) { - std::shared_ptr data; - RETURN_NOT_OK(byte_builder_.Finish(&data)); +Status ChunkedBinaryBuilder::NextChunk() { + std::shared_ptr chunk; + RETURN_NOT_OK(builder_->Finish(&chunk)); + chunks_.emplace_back(std::move(chunk)); - *out = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_); + chunk_data_size_ = 0; + return Status::OK(); +} +Status ChunkedStringBuilder::Finish(ArrayVector* out) { + RETURN_NOT_OK(ChunkedBinaryBuilder::Finish(out)); + + // Change data type to string/utf8 + for (size_t i = 0; i < out->size(); ++i) { + std::shared_ptr data = (*out)[i]->data(); + data->type = ::arrow::utf8(); + (*out)[i] = std::make_shared(data); + } return Status::OK(); } +} // namespace internal + } // namespace arrow diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h new file mode 100644 index 0000000000000..7c101bdffc5e4 --- /dev/null +++ b/cpp/src/arrow/array/builder_binary.h @@ -0,0 +1,304 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more 
contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/array/builder_base.h" +#include "arrow/status.h" +#include "arrow/type_traits.h" +#include "arrow/util/macros.h" +#include "arrow/util/string_view.h" + +namespace arrow { + +constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; + +// ---------------------------------------------------------------------- +// Binary and String + +/// \class BinaryBuilder +/// \brief Builder class for variable-length binary data +class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { + public: + explicit BinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + BinaryBuilder(const std::shared_ptr& type, MemoryPool* pool); + + Status Append(const uint8_t* value, int32_t length) { + ARROW_RETURN_NOT_OK(Reserve(1)); + ARROW_RETURN_NOT_OK(AppendNextOffset()); + ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length)); + + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + Status AppendNull() { + ARROW_RETURN_NOT_OK(AppendNextOffset()); + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + Status Append(const char* value, int32_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(util::string_view value) { + return Append(value.data(), static_cast(value.size())); + } + + /// \brief Append without checking capacity + /// + /// Offsets and data should have been presized using Reserve() and + /// ReserveData(), respectively. + void UnsafeAppend(const uint8_t* value, int32_t length) { + UnsafeAppendNextOffset(); + value_data_builder_.UnsafeAppend(value, length); + UnsafeAppendToBitmap(true); + } + + void UnsafeAppend(const char* value, int32_t length) { + UnsafeAppend(reinterpret_cast(value), length); + } + + void UnsafeAppend(const std::string& value) { + UnsafeAppend(value.c_str(), static_cast(value.size())); + } + + void UnsafeAppendNull() { + const int64_t num_bytes = value_data_builder_.length(); + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + UnsafeAppendToBitmap(false); + } + + void Reset() override; + Status Resize(int64_t capacity) override; + + /// \brief Ensures there is enough allocated capacity to append the indicated + /// number of bytes to the value data buffer without additional allocations + Status ReserveData(int64_t elements); + + Status FinishInternal(std::shared_ptr* out) override; + + /// \return size of values buffer so far + int64_t value_data_length() const { return value_data_builder_.length(); } + /// \return capacity of values buffer + int64_t value_data_capacity() const { return value_data_builder_.capacity(); } + + /// Temporary access to a value. + /// + /// This pointer becomes invalid on the next modifying operation. 
+ const uint8_t* GetValue(int64_t i, int32_t* out_length) const; + + /// Temporary access to a value. + /// + /// This view becomes invalid on the next modifying operation. + util::string_view GetView(int64_t i) const; + + protected: + TypedBufferBuilder offsets_builder_; + TypedBufferBuilder value_data_builder_; + + Status AppendOverflow(int64_t num_bytes); + + Status AppendNextOffset() { + const int64_t num_bytes = value_data_builder_.length(); + if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) { + return AppendOverflow(num_bytes); + } + return offsets_builder_.Append(static_cast(num_bytes)); + } + + void UnsafeAppendNextOffset() { + const int64_t num_bytes = value_data_builder_.length(); + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + } +}; + +/// \class StringBuilder +/// \brief Builder class for UTF8 strings +class ARROW_EXPORT StringBuilder : public BinaryBuilder { + public: + using BinaryBuilder::BinaryBuilder; + explicit StringBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + using BinaryBuilder::Append; + using BinaryBuilder::Reset; + using BinaryBuilder::UnsafeAppend; + + /// \brief Append a sequence of strings in one shot. + /// + /// \param[in] values a vector of strings + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const std::vector& values, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Append a sequence of nul-terminated strings in one shot. + /// If one of the values is NULL, it is processed as a null + /// value even if the corresponding valid_bytes entry is 1. + /// + /// \param[in] values a contiguous C array of nul-terminated char * + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const char** values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); +}; + +// ---------------------------------------------------------------------- +// FixedSizeBinaryBuilder + +class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { + public: + FixedSizeBinaryBuilder(const std::shared_ptr& type, + MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + Status Append(const uint8_t* value) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(true); + return byte_builder_.Append(value, byte_width_); + } + + Status Append(const char* value) { + return Append(reinterpret_cast(value)); + } + + Status Append(const util::string_view& view) { +#ifndef NDEBUG + CheckValueSize(static_cast(view.size())); +#endif + return Append(reinterpret_cast(view.data())); + } + + Status Append(const std::string& s) { +#ifndef NDEBUG + CheckValueSize(static_cast(s.size())); +#endif + return Append(reinterpret_cast(s.data())); + } + + template + Status Append(const std::array& value) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(true); + return byte_builder_.Append(value); + } + + Status AppendValues(const uint8_t* data, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + Status AppendNull(); + + void Reset() override; + Status Resize(int64_t capacity) override; + Status FinishInternal(std::shared_ptr* out) override; + + /// \return size of values buffer so far + int64_t value_data_length() const { return byte_builder_.length(); } + + int32_t byte_width() const { return byte_width_; } + + /// Temporary access to a value. 
+ /// + /// This pointer becomes invalid on the next modifying operation. + const uint8_t* GetValue(int64_t i) const; + + /// Temporary access to a value. + /// + /// This view becomes invalid on the next modifying operation. + util::string_view GetView(int64_t i) const; + + protected: + int32_t byte_width_; + BufferBuilder byte_builder_; + +#ifndef NDEBUG + void CheckValueSize(int64_t size); +#endif +}; + +// ---------------------------------------------------------------------- +// Chunked builders: build a sequence of BinaryArray or StringArray that are +// limited to a particular size (to the upper limit of 2GB) + +namespace internal { + +class ARROW_EXPORT ChunkedBinaryBuilder { + public: + ChunkedBinaryBuilder(int32_t max_chunk_size, + MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + virtual ~ChunkedBinaryBuilder() = default; + + Status Append(const uint8_t* value, int32_t length) { + if (ARROW_PREDICT_FALSE(length + chunk_data_size_ > max_chunk_size_)) { + // Move onto next chunk, unless the builder length is currently 0, which + // means that max_chunk_size_ is less than the item length + if (builder_->length() > 0) { + ARROW_RETURN_NOT_OK(NextChunk()); + } + // else fall through + } + + chunk_data_size_ += length; + return builder_->Append(value, length); + } + + Status Append(const util::string_view& value) { + return Append(reinterpret_cast(value.data()), + static_cast(value.size())); + } + + Status AppendNull() { + if (ARROW_PREDICT_FALSE(builder_->length() == std::numeric_limits::max())) { + ARROW_RETURN_NOT_OK(NextChunk()); + } + return builder_->AppendNull(); + } + + virtual Status Finish(ArrayVector* out); + + protected: + Status NextChunk(); + + int32_t max_chunk_size_; + int32_t chunk_data_size_; + + std::unique_ptr builder_; + std::vector> chunks_; +}; + +class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder { + public: + using ChunkedBinaryBuilder::ChunkedBinaryBuilder; + + Status Finish(ArrayVector* out) override; +}; + +} // namespace internal + +} // namespace arrow diff --git a/cpp/src/arrow/array/builder_decimal.cc b/cpp/src/arrow/array/builder_decimal.cc new file mode 100644 index 0000000000000..d64c4db6f0c30 --- /dev/null +++ b/cpp/src/arrow/array/builder_decimal.cc @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
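For the chunked builders declared above, Finish() returns one array per size-limited chunk instead of a single array. A rough usage sketch, assuming the ChunkedStringBuilder from this header; the chunk size, loop count and value are placeholder numbers, not part of the patch.

#include "arrow/array.h"
#include "arrow/array/builder_binary.h"
#include "arrow/status.h"

// Sketch only: each chunk's value data stays under max_chunk_size bytes, so no
// single StringArray gets near the 2GB offset limit mentioned above.
arrow::Status BuildChunkedExample(arrow::ArrayVector* chunks) {
  arrow::internal::ChunkedStringBuilder builder(1 << 20);  // 1MB chunks (example value)
  for (int i = 0; i < 1000; ++i) {
    ARROW_RETURN_NOT_OK(builder.Append("variable-length value"));
  }
  return builder.Finish(chunks);  // one StringArray per chunk
}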
+ +#include "arrow/array/builder_decimal.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/decimal.h" +#include "arrow/util/logging.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// Decimal128Builder + +Decimal128Builder::Decimal128Builder(const std::shared_ptr& type, + MemoryPool* pool) + : FixedSizeBinaryBuilder(type, pool) {} + +Status Decimal128Builder::Append(const Decimal128& value) { + RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1)); + return FixedSizeBinaryBuilder::Append(value.ToBytes()); +} + +Status Decimal128Builder::FinishInternal(std::shared_ptr* out) { + std::shared_ptr data; + RETURN_NOT_OK(byte_builder_.Finish(&data)); + + *out = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_); + + return Status::OK(); +} + +} // namespace arrow diff --git a/cpp/src/arrow/array/builder_decimal.h b/cpp/src/arrow/array/builder_decimal.h new file mode 100644 index 0000000000000..fb40a7950abbd --- /dev/null +++ b/cpp/src/arrow/array/builder_decimal.h @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_binary.h" + +namespace arrow { + +class Decimal128; + +class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder { + public: + explicit Decimal128Builder(const std::shared_ptr& type, + MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + using FixedSizeBinaryBuilder::Append; + using FixedSizeBinaryBuilder::AppendValues; + using FixedSizeBinaryBuilder::Reset; + + Status Append(const Decimal128& val); + + Status FinishInternal(std::shared_ptr* out) override; +}; + +using DecimalBuilder = Decimal128Builder; + +} // namespace arrow diff --git a/cpp/src/arrow/builder-dict.cc b/cpp/src/arrow/array/builder_dict.cc similarity index 99% rename from cpp/src/arrow/builder-dict.cc rename to cpp/src/arrow/array/builder_dict.cc index b021c3a9d37cc..0891e4c0829f4 100644 --- a/cpp/src/arrow/builder-dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -15,13 +15,15 @@ // specific language governing permissions and limitations // under the License. 
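Decimal128Builder above keeps the FixedSizeBinaryBuilder storage but accepts Decimal128 values directly. A small sketch, assuming the decimal() type factory from type.h; the precision/scale pair and the unscaled value are examples, not part of the patch.

#include <memory>

#include "arrow/array.h"
#include "arrow/array/builder_decimal.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/decimal.h"

// Sketch only: with scale 2, the unscaled integer 12345 reads back as 123.45.
arrow::Status BuildDecimalExample(std::shared_ptr<arrow::Array>* out) {
  auto type = arrow::decimal(10, 2);       // precision 10, scale 2 (example values)
  arrow::Decimal128Builder builder(type);  // default memory pool
  ARROW_RETURN_NOT_OK(builder.Append(arrow::Decimal128(12345)));
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  return builder.Finish(out);
}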
+#include "arrow/array/builder_dict.h" + +#include #include #include #include #include "arrow/array.h" #include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h new file mode 100644 index 0000000000000..6f0271683aea2 --- /dev/null +++ b/cpp/src/arrow/array/builder_dict.h @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/array/builder_adaptive.h" // IWYU pragma: export +#include "arrow/array/builder_base.h" // IWYU pragma: export + +namespace arrow { + +// ---------------------------------------------------------------------- +// Dictionary builder + +namespace internal { + +template +struct DictionaryScalar { + using type = typename T::c_type; +}; + +template <> +struct DictionaryScalar { + using type = util::string_view; +}; + +template <> +struct DictionaryScalar { + using type = util::string_view; +}; + +template <> +struct DictionaryScalar { + using type = util::string_view; +}; + +} // namespace internal + +/// \brief Array builder for created encoded DictionaryArray from dense array +/// +/// Unlike other builders, dictionary builder does not completely reset the state +/// on Finish calls. The arrays built after the initial Finish call will reuse +/// the previously created encoding and build a delta dictionary when new terms +/// occur. +/// +/// data +template +class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { + public: + using Scalar = typename internal::DictionaryScalar::type; + + // WARNING: the type given below is the value type, not the DictionaryType. + // The DictionaryType is instantiated on the Finish() call. 
+ DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool); + + template + explicit DictionaryBuilder( + typename std::enable_if::is_parameter_free, MemoryPool*>::type pool) + : DictionaryBuilder(TypeTraits::type_singleton(), pool) {} + + ~DictionaryBuilder() override; + + /// \brief Append a scalar value + Status Append(const Scalar& value); + + /// \brief Append a fixed-width string (only for FixedSizeBinaryType) + template + Status Append(typename std::enable_if::value, + const uint8_t*>::type value) { + return Append(util::string_view(reinterpret_cast(value), byte_width_)); + } + + /// \brief Append a fixed-width string (only for FixedSizeBinaryType) + template + Status Append(typename std::enable_if::value, + const char*>::type value) { + return Append(util::string_view(value, byte_width_)); + } + + /// \brief Append a scalar null value + Status AppendNull(); + + /// \brief Append a whole dense array to the builder + Status AppendArray(const Array& array); + + void Reset() override; + Status Resize(int64_t capacity) override; + Status FinishInternal(std::shared_ptr* out) override; + + /// is the dictionary builder in the delta building mode + bool is_building_delta() { return delta_offset_ > 0; } + + protected: + class MemoTableImpl; + std::unique_ptr memo_table_; + + int32_t delta_offset_; + // Only used for FixedSizeBinaryType + int32_t byte_width_; + + AdaptiveIntBuilder values_builder_; +}; + +template <> +class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { + public: + DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool); + explicit DictionaryBuilder(MemoryPool* pool); + + /// \brief Append a scalar null value + Status AppendNull(); + + /// \brief Append a whole dense array to the builder + Status AppendArray(const Array& array); + + Status Resize(int64_t capacity) override; + Status FinishInternal(std::shared_ptr* out) override; + + protected: + AdaptiveIntBuilder values_builder_; +}; + +class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder { + public: + using DictionaryBuilder::Append; + using DictionaryBuilder::DictionaryBuilder; + + Status Append(const uint8_t* value, int32_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(const char* value, int32_t length) { + return Append(util::string_view(value, length)); + } +}; + +/// \brief Dictionary array builder with convenience methods for strings +class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder { + public: + using DictionaryBuilder::Append; + using DictionaryBuilder::DictionaryBuilder; + + Status Append(const uint8_t* value, int32_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(const char* value, int32_t length) { + return Append(util::string_view(value, length)); + } +}; + +} // namespace arrow diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc new file mode 100644 index 0000000000000..e73324323af3d --- /dev/null +++ b/cpp/src/arrow/array/builder_nested.cc @@ -0,0 +1,156 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/array/builder_nested.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/int-util.h" +#include "arrow/util/logging.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// ListBuilder + +ListBuilder::ListBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + const std::shared_ptr& type) + : ArrayBuilder(type ? type + : std::static_pointer_cast( + std::make_shared(value_builder->type())), + pool), + offsets_builder_(pool), + value_builder_(value_builder) {} + +Status ListBuilder::AppendValues(const int32_t* offsets, int64_t length, + const uint8_t* valid_bytes) { + RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(valid_bytes, length); + offsets_builder_.UnsafeAppend(offsets, length); + return Status::OK(); +} + +Status ListBuilder::AppendNextOffset() { + int64_t num_values = value_builder_->length(); + if (ARROW_PREDICT_FALSE(num_values > kListMaximumElements)) { + std::stringstream ss; + ss << "ListArray cannot contain more then INT32_MAX - 1 child elements," + << " have " << num_values; + return Status::CapacityError(ss.str()); + } + return offsets_builder_.Append(static_cast(num_values)); +} + +Status ListBuilder::Append(bool is_valid) { + RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(is_valid); + return AppendNextOffset(); +} + +Status ListBuilder::Resize(int64_t capacity) { + DCHECK_LE(capacity, kListMaximumElements); + RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + + // one more then requested for offsets + RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t))); + return ArrayBuilder::Resize(capacity); +} + +Status ListBuilder::FinishInternal(std::shared_ptr* out) { + RETURN_NOT_OK(AppendNextOffset()); + + // Offset padding zeroed by BufferBuilder + std::shared_ptr offsets; + RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); + + std::shared_ptr items; + if (values_) { + items = values_->data(); + } else { + if (value_builder_->length() == 0) { + // Try to make sure we get a non-null values buffer (ARROW-2744) + RETURN_NOT_OK(value_builder_->Resize(0)); + } + RETURN_NOT_OK(value_builder_->FinishInternal(&items)); + } + + *out = ArrayData::Make(type_, length_, {null_bitmap_, offsets}, null_count_); + (*out)->child_data.emplace_back(std::move(items)); + Reset(); + return Status::OK(); +} + +void ListBuilder::Reset() { + ArrayBuilder::Reset(); + values_.reset(); + offsets_builder_.Reset(); + value_builder_->Reset(); +} + +ArrayBuilder* ListBuilder::value_builder() const { + DCHECK(!values_) << "Using value builder is pointless when values_ is set"; + return value_builder_.get(); +} + +// ---------------------------------------------------------------------- +// Struct + +StructBuilder::StructBuilder(const std::shared_ptr& type, MemoryPool* pool, + std::vector>&& field_builders) + : ArrayBuilder(type, pool) { + children_ = 
std::move(field_builders); +} + +void StructBuilder::Reset() { + ArrayBuilder::Reset(); + for (const auto& field_builder : children_) { + field_builder->Reset(); + } +} + +Status StructBuilder::FinishInternal(std::shared_ptr* out) { + RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get())); + *out = ArrayData::Make(type_, length_, {null_bitmap_}, null_count_); + + (*out)->child_data.resize(children_.size()); + for (size_t i = 0; i < children_.size(); ++i) { + if (length_ == 0) { + // Try to make sure the child buffers are initialized + RETURN_NOT_OK(children_[i]->Resize(0)); + } + RETURN_NOT_OK(children_[i]->FinishInternal(&(*out)->child_data[i])); + } + + null_bitmap_ = nullptr; + capacity_ = length_ = null_count_ = 0; + return Status::OK(); +} + +} // namespace arrow diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h new file mode 100644 index 0000000000000..863e6fef06f7d --- /dev/null +++ b/cpp/src/arrow/array/builder_nested.h @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/array/builder_base.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// List builder + +/// \class ListBuilder +/// \brief Builder class for variable-length list array value types +/// +/// To use this class, you must append values to the child array builder and use +/// the Append function to delimit each distinct list value (once the values +/// have been appended to the child array) or use the bulk API to append +/// a sequence of offests and null values. +/// +/// A note on types. Per arrow/type.h all types in the c++ implementation are +/// logical so even though this class always builds list array, this can +/// represent multiple different logical types. If no logical type is provided +/// at construction time, the class defaults to List where t is taken from the +/// value_builder/values that the object is constructed with. +class ARROW_EXPORT ListBuilder : public ArrayBuilder { + public: + /// Use this constructor to incrementally build the value array along with offsets and + /// null bitmap. 
+ ListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, + const std::shared_ptr& type = NULLPTR); + + Status Resize(int64_t capacity) override; + void Reset() override; + Status FinishInternal(std::shared_ptr* out) override; + + /// \brief Vector append + /// + /// If passed, valid_bytes is of equal length to values, and any zero byte + /// will be considered as a null for that slot + Status AppendValues(const int32_t* offsets, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Start a new variable-length list slot + /// + /// This function should be called before beginning to append elements to the + /// value builder + Status Append(bool is_valid = true); + + Status AppendNull() { return Append(false); } + + ArrayBuilder* value_builder() const; + + protected: + TypedBufferBuilder offsets_builder_; + std::shared_ptr value_builder_; + std::shared_ptr values_; + + Status AppendNextOffset(); +}; + +// ---------------------------------------------------------------------- +// Struct + +// --------------------------------------------------------------------------------- +// StructArray builder +/// Append, Resize and Reserve methods are acting on StructBuilder. +/// Please make sure all these methods of all child-builders' are consistently +/// called to maintain data-structure consistency. +class ARROW_EXPORT StructBuilder : public ArrayBuilder { + public: + StructBuilder(const std::shared_ptr& type, MemoryPool* pool, + std::vector>&& field_builders); + + Status FinishInternal(std::shared_ptr* out) override; + + /// Null bitmap is of equal length to every child field, and any zero byte + /// will be considered as a null for that field, but users must using app- + /// end methods or advance methods of the child builders' independently to + /// insert data. + Status AppendValues(int64_t length, const uint8_t* valid_bytes) { + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); + } + + /// Append an element to the Struct. All child-builders' Append method must + /// be called independently to maintain data-structure consistency. + Status Append(bool is_valid = true) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(is_valid); + return Status::OK(); + } + + Status AppendNull() { return Append(false); } + + void Reset() override; + + ArrayBuilder* field_builder(int i) const { return children_[i].get(); } + + int num_fields() const { return static_cast(children_.size()); } +}; + +} // namespace arrow diff --git a/cpp/src/arrow/array/builder_primitive.cc b/cpp/src/arrow/array/builder_primitive.cc new file mode 100644 index 0000000000000..bc14000c3e10d --- /dev/null +++ b/cpp/src/arrow/array/builder_primitive.cc @@ -0,0 +1,272 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/array/builder_primitive.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/int-util.h" +#include "arrow/util/logging.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// Null builder + +Status NullBuilder::FinishInternal(std::shared_ptr* out) { + *out = ArrayData::Make(null(), length_, {nullptr}, length_); + length_ = null_count_ = 0; + return Status::OK(); +} + +// ---------------------------------------------------------------------- + +template +void PrimitiveBuilder::Reset() { + data_.reset(); + raw_data_ = nullptr; +} + +template +Status PrimitiveBuilder::Resize(int64_t capacity) { + RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + capacity = std::max(capacity, kMinBuilderCapacity); + + int64_t nbytes = TypeTraits::bytes_required(capacity); + if (capacity_ == 0) { + RETURN_NOT_OK(AllocateResizableBuffer(pool_, nbytes, &data_)); + } else { + RETURN_NOT_OK(data_->Resize(nbytes)); + } + + raw_data_ = reinterpret_cast(data_->mutable_data()); + return ArrayBuilder::Resize(capacity); +} + +template +Status PrimitiveBuilder::AppendValues(const value_type* values, int64_t length, + const uint8_t* valid_bytes) { + RETURN_NOT_OK(Reserve(length)); + + if (length > 0) { + std::memcpy(raw_data_ + length_, values, + static_cast(TypeTraits::bytes_required(length))); + } + + // length_ is update by these + ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); +} + +template +Status PrimitiveBuilder::AppendValues(const value_type* values, int64_t length, + const std::vector& is_valid) { + RETURN_NOT_OK(Reserve(length)); + DCHECK_EQ(length, static_cast(is_valid.size())); + + if (length > 0) { + std::memcpy(raw_data_ + length_, values, + static_cast(TypeTraits::bytes_required(length))); + } + + // length_ is update by these + ArrayBuilder::UnsafeAppendToBitmap(is_valid); + return Status::OK(); +} + +template +Status PrimitiveBuilder::AppendValues(const std::vector& values, + const std::vector& is_valid) { + return AppendValues(values.data(), static_cast(values.size()), is_valid); +} + +template +Status PrimitiveBuilder::AppendValues(const std::vector& values) { + return AppendValues(values.data(), static_cast(values.size())); +} + +template +Status PrimitiveBuilder::FinishInternal(std::shared_ptr* out) { + RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get())); + RETURN_NOT_OK(TrimBuffer(TypeTraits::bytes_required(length_), data_.get())); + + *out = ArrayData::Make(type_, length_, {null_bitmap_, data_}, null_count_); + + data_ = null_bitmap_ = nullptr; + capacity_ = length_ = null_count_ = 0; + + return Status::OK(); +} + +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; + 
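The bulk AppendValues() overloads implemented above reduce to one Reserve(), one memcpy of the value data, and a single pass over the validity information. A short sketch of the vector-based overload, assuming the DoubleBuilder alias declared in builder_primitive.h; names and numbers are illustrative, not part of the patch.

#include <memory>
#include <vector>

#include "arrow/array.h"
#include "arrow/array/builder_primitive.h"
#include "arrow/status.h"

// Sketch only: the second slot is marked null through the is_valid vector.
arrow::Status BuildDoubleExample(std::shared_ptr<arrow::Array>* out) {
  arrow::DoubleBuilder builder;
  std::vector<double> values = {1.5, 0.0, 3.25};
  std::vector<bool> is_valid = {true, false, true};
  ARROW_RETURN_NOT_OK(builder.AppendValues(values, is_valid));
  return builder.Finish(out);
}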
+BooleanBuilder::BooleanBuilder(MemoryPool* pool) + : ArrayBuilder(boolean(), pool), data_(nullptr), raw_data_(nullptr) {} + +BooleanBuilder::BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool) + : BooleanBuilder(pool) { + DCHECK_EQ(Type::BOOL, type->id()); +} + +void BooleanBuilder::Reset() { + ArrayBuilder::Reset(); + data_.reset(); + raw_data_ = nullptr; +} + +Status BooleanBuilder::Resize(int64_t capacity) { + RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + capacity = std::max(capacity, kMinBuilderCapacity); + + const int64_t new_bitmap_size = BitUtil::BytesForBits(capacity); + if (capacity_ == 0) { + RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_bitmap_size, &data_)); + raw_data_ = reinterpret_cast(data_->mutable_data()); + + // We zero the memory for booleans to keep things simple; for some reason if + // we do not, even though we may write every bit (through in-place | or &), + // valgrind will still show a warning. If we do not zero the bytes here, we + // will have to be careful to zero them in AppendNull and AppendNulls. Also, + // zeroing the bits results in deterministic bits when each byte may have a + // mix of nulls and not nulls. + // + // We only zero up to new_bitmap_size because the padding was zeroed by + // AllocateResizableBuffer + memset(raw_data_, 0, static_cast(new_bitmap_size)); + } else { + const int64_t old_bitmap_capacity = data_->capacity(); + RETURN_NOT_OK(data_->Resize(new_bitmap_size)); + const int64_t new_bitmap_capacity = data_->capacity(); + raw_data_ = reinterpret_cast(data_->mutable_data()); + + // See comment above about why we zero memory for booleans + memset(raw_data_ + old_bitmap_capacity, 0, + static_cast(new_bitmap_capacity - old_bitmap_capacity)); + } + + return ArrayBuilder::Resize(capacity); +} + +Status BooleanBuilder::FinishInternal(std::shared_ptr* out) { + int64_t bit_offset = length_ % 8; + if (bit_offset > 0) { + // Adjust last byte + data_->mutable_data()[length_ / 8] &= BitUtil::kPrecedingBitmask[bit_offset]; + } + + RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get())); + RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), data_.get())); + + *out = ArrayData::Make(boolean(), length_, {null_bitmap_, data_}, null_count_); + + data_ = null_bitmap_ = nullptr; + capacity_ = length_ = null_count_ = 0; + return Status::OK(); +} + +Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, + const uint8_t* valid_bytes) { + RETURN_NOT_OK(Reserve(length)); + + int64_t i = 0; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [values, &i]() -> bool { return values[i++] != 0; }); + + // this updates length_ + ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); +} + +Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, + const std::vector& is_valid) { + RETURN_NOT_OK(Reserve(length)); + DCHECK_EQ(length, static_cast(is_valid.size())); + + int64_t i = 0; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [values, &i]() -> bool { return values[i++]; }); + + // this updates length_ + ArrayBuilder::UnsafeAppendToBitmap(is_valid); + return Status::OK(); +} + +Status BooleanBuilder::AppendValues(const std::vector& values, + const std::vector& is_valid) { + return AppendValues(values.data(), static_cast(values.size()), is_valid); +} + +Status BooleanBuilder::AppendValues(const std::vector& values) { + return AppendValues(values.data(), static_cast(values.size())); +} + +Status BooleanBuilder::AppendValues(const 
std::vector& values, + const std::vector& is_valid) { + const int64_t length = static_cast(values.size()); + RETURN_NOT_OK(Reserve(length)); + DCHECK_EQ(length, static_cast(is_valid.size())); + + int64_t i = 0; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [&values, &i]() -> bool { return values[i++]; }); + + // this updates length_ + ArrayBuilder::UnsafeAppendToBitmap(is_valid); + return Status::OK(); +} + +Status BooleanBuilder::AppendValues(const std::vector& values) { + const int64_t length = static_cast(values.size()); + RETURN_NOT_OK(Reserve(length)); + + int64_t i = 0; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [&values, &i]() -> bool { return values[i++]; }); + + // this updates length_ + ArrayBuilder::UnsafeSetNotNull(length); + return Status::OK(); +} + +} // namespace arrow diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h new file mode 100644 index 0000000000000..13f6c229b2a23 --- /dev/null +++ b/cpp/src/arrow/array/builder_primitive.h @@ -0,0 +1,401 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
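BooleanBuilder, implemented just above, packs both the values and the validity bitmap one bit at a time, so scalar and bulk appends can be mixed freely. A minimal sketch, assuming the declarations later in this header; the counts and values are illustrative, not part of the patch.

#include <memory>
#include <vector>

#include "arrow/array.h"
#include "arrow/array/builder_primitive.h"
#include "arrow/status.h"

// Sketch only: finishes with five elements, one of which is null.
arrow::Status BuildBooleanExample(std::shared_ptr<arrow::Array>* out) {
  arrow::BooleanBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Append(true));        // scalar append
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  std::vector<bool> bulk = {false, true, true};
  ARROW_RETURN_NOT_OK(builder.AppendValues(bulk));  // bulk append, all valid
  return builder.Finish(out);
}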
+ +#pragma once + +#include +#include +#include + +#include "arrow/array/builder_base.h" +#include "arrow/type.h" + +namespace arrow { + +class ARROW_EXPORT NullBuilder : public ArrayBuilder { + public: + explicit NullBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) + : ArrayBuilder(null(), pool) {} + + Status AppendNull() { + ++null_count_; + ++length_; + return Status::OK(); + } + + Status Append(std::nullptr_t value) { return AppendNull(); } + + Status FinishInternal(std::shared_ptr* out) override; +}; + +template +class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { + public: + using value_type = typename Type::c_type; + + explicit PrimitiveBuilder(const std::shared_ptr& type, MemoryPool* pool) + : ArrayBuilder(type, pool), data_(NULLPTR), raw_data_(NULLPTR) {} + + using ArrayBuilder::Advance; + + /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory + /// The memory at the corresponding data slot is set to 0 to prevent uninitialized + /// memory access + Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { + ARROW_RETURN_NOT_OK(Reserve(length)); + memset(raw_data_ + length_, 0, + static_cast(TypeTraits::bytes_required(length))); + UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); + } + + /// \brief Append a single null element + Status AppendNull() { + ARROW_RETURN_NOT_OK(Reserve(1)); + memset(raw_data_ + length_, 0, sizeof(value_type)); + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + value_type GetValue(int64_t index) const { + return reinterpret_cast(data_->data())[index]; + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const value_type* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const value_type* values, int64_t length, + const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of values + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). 
Equal in length to values + /// \return Status + Status AppendValues(const std::vector& values, + const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of values + /// \return Status + Status AppendValues(const std::vector& values); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// \return Status + + template + Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + + std::copy(values_begin, values_end, raw_data_ + length_); + + // this updates the length_ + UnsafeSetNotNull(length); + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot, with a specified nullmap + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// \param[in] valid_begin InputIterator with elements indication valid(1) + /// or null(0) values. + /// \return Status + template + typename std::enable_if::value, Status>::type AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + static_assert(!internal::is_null_pointer::value, + "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " + "version instead"); + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + + std::copy(values_begin, values_end, raw_data_ + length_); + + // this updates the length_ + UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); + return Status::OK(); + } + + // Same as above, with a pointer type ValidIter + template + typename std::enable_if::value, Status>::type AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + + std::copy(values_begin, values_end, raw_data_ + length_); + + // this updates the length_ + if (valid_begin == NULLPTR) { + UnsafeSetNotNull(length); + } else { + UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); + } + + return Status::OK(); + } + + Status FinishInternal(std::shared_ptr* out) override; + void Reset() override; + + Status Resize(int64_t capacity) override; + + protected: + std::shared_ptr data_; + value_type* raw_data_; +}; + +/// Base class for all Builders that emit an Array of a scalar numerical type. +template +class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder { + public: + using typename PrimitiveBuilder::value_type; + using PrimitiveBuilder::PrimitiveBuilder; + + template + explicit NumericBuilder( + typename std::enable_if::is_parameter_free, MemoryPool*>::type pool + ARROW_MEMORY_POOL_DEFAULT) + : PrimitiveBuilder(TypeTraits::type_singleton(), pool) {} + + using ArrayBuilder::UnsafeAppendNull; + using PrimitiveBuilder::AppendValues; + using PrimitiveBuilder::Resize; + using PrimitiveBuilder::Reserve; + + /// Append a single scalar and increase the size if necessary. + Status Append(const value_type val) { + ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1)); + UnsafeAppend(val); + return Status::OK(); + } + + /// Append a single scalar under the assumption that the underlying Buffer is + /// large enough. 
+ /// + /// This method does not capacity-check; make sure to call Reserve + /// beforehand. + void UnsafeAppend(const value_type val) { + BitUtil::SetBit(null_bitmap_data_, length_); + raw_data_[length_++] = val; + } + + protected: + using PrimitiveBuilder::length_; + using PrimitiveBuilder::null_bitmap_data_; + using PrimitiveBuilder::raw_data_; +}; + +// Builders + +using UInt8Builder = NumericBuilder; +using UInt16Builder = NumericBuilder; +using UInt32Builder = NumericBuilder; +using UInt64Builder = NumericBuilder; + +using Int8Builder = NumericBuilder; +using Int16Builder = NumericBuilder; +using Int32Builder = NumericBuilder; +using Int64Builder = NumericBuilder; +using TimestampBuilder = NumericBuilder; +using Time32Builder = NumericBuilder; +using Time64Builder = NumericBuilder; +using Date32Builder = NumericBuilder; +using Date64Builder = NumericBuilder; + +using HalfFloatBuilder = NumericBuilder; +using FloatBuilder = NumericBuilder; +using DoubleBuilder = NumericBuilder; + +class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { + public: + using value_type = bool; + explicit BooleanBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + explicit BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool); + + using ArrayBuilder::Advance; + using ArrayBuilder::UnsafeAppendNull; + + /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory + Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(valid_bytes, length); + + return Status::OK(); + } + + Status AppendNull() { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(false); + + return Status::OK(); + } + + /// Scalar append + Status Append(const bool val) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(val); + return Status::OK(); + } + + Status Append(const uint8_t val) { return Append(val != 0); } + + /// Scalar append, without checking for capacity + void UnsafeAppend(const bool val) { + BitUtil::SetBit(null_bitmap_data_, length_); + if (val) { + BitUtil::SetBit(raw_data_, length_); + } else { + BitUtil::ClearBit(raw_data_, length_); + } + ++length_; + } + + void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous array of bytes (non-zero is 1) + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const uint8_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const uint8_t* values, int64_t length, + const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of bytes + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). 
Equal in length to values + /// \return Status + Status AppendValues(const std::vector& values, + const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of bytes + /// \return Status + Status AppendValues(const std::vector& values); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values an std::vector indicating true (1) or false + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const std::vector& values, const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values an std::vector indicating true (1) or false + /// \return Status + Status AppendValues(const std::vector& values); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// or null(0) values + /// \return Status + template + Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + auto iter = values_begin; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [&iter]() -> bool { return *(iter++); }); + + // this updates length_ + UnsafeSetNotNull(length); + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot, with a specified nullmap + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// \param[in] valid_begin InputIterator with elements indication valid(1) + /// or null(0) values + /// \return Status + template + typename std::enable_if::value, Status>::type AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + static_assert(!internal::is_null_pointer::value, + "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " + "version instead"); + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + + auto iter = values_begin; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [&iter]() -> bool { return *(iter++); }); + + // this updates length_ + ArrayBuilder::UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); + return Status::OK(); + } + + // Same as above, for a pointer type ValidIter + template + typename std::enable_if::value, Status>::type AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + + auto iter = values_begin; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [&iter]() -> bool { return *(iter++); }); + + // this updates the length_ + if (valid_begin == NULLPTR) { + UnsafeSetNotNull(length); + } else { + UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); + } + + return Status::OK(); + } + + Status FinishInternal(std::shared_ptr* out) override; + void Reset() override; + Status Resize(int64_t capacity) override; + + protected: + std::shared_ptr data_; + uint8_t* raw_data_; +}; + +} // namespace arrow diff --git a/cpp/src/arrow/builder-benchmark.cc b/cpp/src/arrow/builder-benchmark.cc index f96728dcd4fdf..fae9c89a14fdf 100644 --- 
a/cpp/src/arrow/builder-benchmark.cc +++ b/cpp/src/arrow/builder-benchmark.cc @@ -163,10 +163,11 @@ static void BM_BuildBooleanArrayNoNulls( } static void BM_BuildBinaryArray(benchmark::State& state) { // NOLINT non-const reference - const int64_t iterations = 1 << 20; - + // About 160MB + const int64_t iterations = 1 << 24; std::string value = "1234567890"; - while (state.KeepRunning()) { + + for (auto _ : state) { BinaryBuilder builder; for (int64_t i = 0; i < iterations; i++) { ABORT_NOT_OK(builder.Append(value)); @@ -177,6 +178,26 @@ static void BM_BuildBinaryArray(benchmark::State& state) { // NOLINT non-const state.SetBytesProcessed(state.iterations() * iterations * value.size()); } +static void BM_BuildChunkedBinaryArray( + benchmark::State& state) { // NOLINT non-const reference + // About 160MB + const int64_t iterations = 1 << 24; + std::string value = "1234567890"; + + for (auto _ : state) { + // 1MB chunks + const int32_t chunksize = 1 << 20; + internal::ChunkedBinaryBuilder builder(chunksize); + for (int64_t i = 0; i < iterations; i++) { + ABORT_NOT_OK(builder.Append(reinterpret_cast(value.data()), + static_cast(value.size()))); + } + ArrayVector out; + ABORT_NOT_OK(builder.Finish(&out)); + } + state.SetBytesProcessed(state.iterations() * iterations * value.size()); +} + static void BM_BuildFixedSizeBinaryArray( benchmark::State& state) { // NOLINT non-const reference const int64_t iterations = 1 << 20; @@ -371,7 +392,8 @@ BENCHMARK(BM_BuildAdaptiveUIntNoNullsScalarAppend) ->Repetitions(kRepetitions) ->Unit(benchmark::kMicrosecond); -BENCHMARK(BM_BuildBinaryArray)->Repetitions(kRepetitions)->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_BuildBinaryArray)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_BuildChunkedBinaryArray)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(BM_BuildFixedSizeBinaryArray) ->Repetitions(kRepetitions) ->Unit(benchmark::kMicrosecond); diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index aef4df05108b7..ff2b453bb4494 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -15,513 +15,20 @@ // specific language governing permissions and limitations // under the License. 
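The benchmark changes above switch to the range-based for (auto _ : state) loop and report throughput through SetBytesProcessed(); with 1 << 24 appends of a 10-byte value, each pass writes roughly 160 MB, which is where the reported bytes/second figure comes from. A stripped-down sketch of that google-benchmark pattern; the workload is a stand-in, not the Arrow builder from the patch.

#include <cstdint>
#include <string>

#include "benchmark/benchmark.h"

// Sketch only: mirrors the shape of BM_BuildChunkedBinaryArray without the
// Arrow dependency. The timed body runs once per measured iteration.
static void BM_AppendTenBytes(benchmark::State& state) {  // NOLINT non-const reference
  const int64_t iterations = 1 << 24;
  std::string value = "1234567890";
  for (auto _ : state) {
    std::string sink;
    for (int64_t i = 0; i < iterations; i++) {
      sink.append(value);
    }
    benchmark::DoNotOptimize(sink);
  }
  state.SetBytesProcessed(state.iterations() * iterations * value.size());
}
BENCHMARK(BM_AppendTenBytes)->MinTime(1.0)->Unit(benchmark::kMicrosecond);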
-#include -#include -#include -#include +#include "arrow/builder.h" + #include +#include #include #include -#include "arrow/array.h" -#include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/status.h" #include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/int-util.h" -#include "arrow/util/logging.h" namespace arrow { -using internal::checked_cast; - -Status ArrayBuilder::TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer) { - if (buffer) { - if (bytes_filled < buffer->size()) { - // Trim buffer - RETURN_NOT_OK(buffer->Resize(bytes_filled)); - } - // zero the padding - buffer->ZeroPadding(); - } else { - // Null buffers are allowed in place of 0-byte buffers - DCHECK_EQ(bytes_filled, 0); - } - return Status::OK(); -} - -Status ArrayBuilder::AppendToBitmap(bool is_valid) { - if (length_ == capacity_) { - // If the capacity was not already a multiple of 2, do so here - // TODO(emkornfield) doubling isn't great default allocation practice - // see https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md - // fo discussion - RETURN_NOT_OK(Resize(BitUtil::NextPower2(capacity_ + 1))); - } - UnsafeAppendToBitmap(is_valid); - return Status::OK(); -} - -Status ArrayBuilder::AppendToBitmap(const uint8_t* valid_bytes, int64_t length) { - RETURN_NOT_OK(Reserve(length)); - - UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); -} - -Status ArrayBuilder::Resize(int64_t capacity) { - // Target size of validity (null) bitmap data - const int64_t new_bitmap_size = BitUtil::BytesForBits(capacity); - RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); - - if (capacity_ == 0) { - RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_bitmap_size, &null_bitmap_)); - null_bitmap_data_ = null_bitmap_->mutable_data(); - - // Padding is zeroed by AllocateResizableBuffer - memset(null_bitmap_data_, 0, static_cast(new_bitmap_size)); - } else { - const int64_t old_bitmap_capacity = null_bitmap_->capacity(); - RETURN_NOT_OK(null_bitmap_->Resize(new_bitmap_size)); - - const int64_t new_bitmap_capacity = null_bitmap_->capacity(); - null_bitmap_data_ = null_bitmap_->mutable_data(); - - // Zero the region between the original capacity and the new capacity, - // including padding, which has not been zeroed, unlike - // AllocateResizableBuffer - if (old_bitmap_capacity < new_bitmap_capacity) { - memset(null_bitmap_data_ + old_bitmap_capacity, 0, - static_cast(new_bitmap_capacity - old_bitmap_capacity)); - } - } - capacity_ = capacity; - return Status::OK(); -} - -Status ArrayBuilder::Advance(int64_t elements) { - if (length_ + elements > capacity_) { - return Status::Invalid("Builder must be expanded"); - } - length_ += elements; - return Status::OK(); -} - -Status ArrayBuilder::Finish(std::shared_ptr* out) { - std::shared_ptr internal_data; - RETURN_NOT_OK(FinishInternal(&internal_data)); - *out = MakeArray(internal_data); - return Status::OK(); -} - -Status ArrayBuilder::Reserve(int64_t additional_elements) { - if (length_ + additional_elements > capacity_) { - // TODO(emkornfield) power of 2 growth is potentially suboptimal - int64_t new_size = BitUtil::NextPower2(length_ + additional_elements); - return Resize(new_size); - } - return Status::OK(); -} - -void ArrayBuilder::Reset() { - capacity_ = length_ = null_count_ = 0; - null_bitmap_ = nullptr; -} - -Status ArrayBuilder::SetNotNull(int64_t length) { - RETURN_NOT_OK(Reserve(length)); - UnsafeSetNotNull(length); - return Status::OK(); 
-} - -void ArrayBuilder::UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) { - if (valid_bytes == nullptr) { - UnsafeSetNotNull(length); - return; - } - UnsafeAppendToBitmap(valid_bytes, valid_bytes + length); -} - -void ArrayBuilder::UnsafeAppendToBitmap(const std::vector& is_valid) { - UnsafeAppendToBitmap(is_valid.begin(), is_valid.end()); -} - -void ArrayBuilder::UnsafeSetNotNull(int64_t length) { - const int64_t new_length = length + length_; - - // Fill up the bytes until we have a byte alignment - int64_t pad_to_byte = std::min(8 - (length_ % 8), length); - - if (pad_to_byte == 8) { - pad_to_byte = 0; - } - for (int64_t i = length_; i < length_ + pad_to_byte; ++i) { - BitUtil::SetBit(null_bitmap_data_, i); - } - - // Fast bitsetting - int64_t fast_length = (length - pad_to_byte) / 8; - memset(null_bitmap_data_ + ((length_ + pad_to_byte) / 8), 0xFF, - static_cast(fast_length)); - - // Trailing bits - for (int64_t i = length_ + pad_to_byte + (fast_length * 8); i < new_length; ++i) { - BitUtil::SetBit(null_bitmap_data_, i); - } - - length_ = new_length; -} - -// ---------------------------------------------------------------------- -// Null builder - -Status NullBuilder::FinishInternal(std::shared_ptr* out) { - *out = ArrayData::Make(null(), length_, {nullptr}, length_); - length_ = null_count_ = 0; - return Status::OK(); -} - -// ---------------------------------------------------------------------- - -template -void PrimitiveBuilder::Reset() { - data_.reset(); - raw_data_ = nullptr; -} - -template -Status PrimitiveBuilder::Resize(int64_t capacity) { - RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); - capacity = std::max(capacity, kMinBuilderCapacity); - - int64_t nbytes = TypeTraits::bytes_required(capacity); - if (capacity_ == 0) { - RETURN_NOT_OK(AllocateResizableBuffer(pool_, nbytes, &data_)); - } else { - RETURN_NOT_OK(data_->Resize(nbytes)); - } - - raw_data_ = reinterpret_cast(data_->mutable_data()); - return ArrayBuilder::Resize(capacity); -} - -template -Status PrimitiveBuilder::AppendValues(const value_type* values, int64_t length, - const uint8_t* valid_bytes) { - RETURN_NOT_OK(Reserve(length)); - - if (length > 0) { - std::memcpy(raw_data_ + length_, values, - static_cast(TypeTraits::bytes_required(length))); - } - - // length_ is update by these - ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); -} - -template -Status PrimitiveBuilder::AppendValues(const value_type* values, int64_t length, - const std::vector& is_valid) { - RETURN_NOT_OK(Reserve(length)); - DCHECK_EQ(length, static_cast(is_valid.size())); - - if (length > 0) { - std::memcpy(raw_data_ + length_, values, - static_cast(TypeTraits::bytes_required(length))); - } - - // length_ is update by these - ArrayBuilder::UnsafeAppendToBitmap(is_valid); - return Status::OK(); -} - -template -Status PrimitiveBuilder::AppendValues(const std::vector& values, - const std::vector& is_valid) { - return AppendValues(values.data(), static_cast(values.size()), is_valid); -} - -template -Status PrimitiveBuilder::AppendValues(const std::vector& values) { - return AppendValues(values.data(), static_cast(values.size())); -} - -template -Status PrimitiveBuilder::FinishInternal(std::shared_ptr* out) { - RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get())); - RETURN_NOT_OK(TrimBuffer(TypeTraits::bytes_required(length_), data_.get())); - - *out = ArrayData::Make(type_, length_, {null_bitmap_, data_}, null_count_); - - data_ = null_bitmap_ = nullptr; - capacity_ = 
length_ = null_count_ = 0; - - return Status::OK(); -} - -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; - -BooleanBuilder::BooleanBuilder(MemoryPool* pool) - : ArrayBuilder(boolean(), pool), data_(nullptr), raw_data_(nullptr) {} - -BooleanBuilder::BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool) - : BooleanBuilder(pool) { - DCHECK_EQ(Type::BOOL, type->id()); -} - -void BooleanBuilder::Reset() { - ArrayBuilder::Reset(); - data_.reset(); - raw_data_ = nullptr; -} - -Status BooleanBuilder::Resize(int64_t capacity) { - RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); - capacity = std::max(capacity, kMinBuilderCapacity); - - const int64_t new_bitmap_size = BitUtil::BytesForBits(capacity); - if (capacity_ == 0) { - RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_bitmap_size, &data_)); - raw_data_ = reinterpret_cast(data_->mutable_data()); - - // We zero the memory for booleans to keep things simple; for some reason if - // we do not, even though we may write every bit (through in-place | or &), - // valgrind will still show a warning. If we do not zero the bytes here, we - // will have to be careful to zero them in AppendNull and AppendNulls. Also, - // zeroing the bits results in deterministic bits when each byte may have a - // mix of nulls and not nulls. 
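A short sketch of the bulk AppendValues path implemented for these primitive builders (assumed API of this era; the helper name is illustrative): values plus a parallel validity vector, then Finish.

#include <memory>
#include <vector>
#include "arrow/array.h"
#include "arrow/builder.h"

arrow::Status BulkAppendDoubles(std::shared_ptr<arrow::Array>* out) {
  arrow::DoubleBuilder builder;
  std::vector<double> values = {1.5, 2.5, 3.5};
  std::vector<bool> is_valid = {true, false, true};  // middle slot becomes null
  ARROW_RETURN_NOT_OK(builder.AppendValues(values, is_valid));
  return builder.Finish(out);
}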
- // - // We only zero up to new_bitmap_size because the padding was zeroed by - // AllocateResizableBuffer - memset(raw_data_, 0, static_cast(new_bitmap_size)); - } else { - const int64_t old_bitmap_capacity = data_->capacity(); - RETURN_NOT_OK(data_->Resize(new_bitmap_size)); - const int64_t new_bitmap_capacity = data_->capacity(); - raw_data_ = reinterpret_cast(data_->mutable_data()); - - // See comment above about why we zero memory for booleans - memset(raw_data_ + old_bitmap_capacity, 0, - static_cast(new_bitmap_capacity - old_bitmap_capacity)); - } - - return ArrayBuilder::Resize(capacity); -} - -Status BooleanBuilder::FinishInternal(std::shared_ptr* out) { - int64_t bit_offset = length_ % 8; - if (bit_offset > 0) { - // Adjust last byte - data_->mutable_data()[length_ / 8] &= BitUtil::kPrecedingBitmask[bit_offset]; - } - - RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get())); - RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), data_.get())); - - *out = ArrayData::Make(boolean(), length_, {null_bitmap_, data_}, null_count_); - - data_ = null_bitmap_ = nullptr; - capacity_ = length_ = null_count_ = 0; - return Status::OK(); -} - -Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, - const uint8_t* valid_bytes) { - RETURN_NOT_OK(Reserve(length)); - - int64_t i = 0; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [values, &i]() -> bool { return values[i++] != 0; }); - - // this updates length_ - ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); -} - -Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, - const std::vector& is_valid) { - RETURN_NOT_OK(Reserve(length)); - DCHECK_EQ(length, static_cast(is_valid.size())); - - int64_t i = 0; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [values, &i]() -> bool { return values[i++]; }); - - // this updates length_ - ArrayBuilder::UnsafeAppendToBitmap(is_valid); - return Status::OK(); -} - -Status BooleanBuilder::AppendValues(const std::vector& values, - const std::vector& is_valid) { - return AppendValues(values.data(), static_cast(values.size()), is_valid); -} - -Status BooleanBuilder::AppendValues(const std::vector& values) { - return AppendValues(values.data(), static_cast(values.size())); -} - -Status BooleanBuilder::AppendValues(const std::vector& values, - const std::vector& is_valid) { - const int64_t length = static_cast(values.size()); - RETURN_NOT_OK(Reserve(length)); - DCHECK_EQ(length, static_cast(is_valid.size())); - - int64_t i = 0; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [&values, &i]() -> bool { return values[i++]; }); - - // this updates length_ - ArrayBuilder::UnsafeAppendToBitmap(is_valid); - return Status::OK(); -} - -Status BooleanBuilder::AppendValues(const std::vector& values) { - const int64_t length = static_cast(values.size()); - RETURN_NOT_OK(Reserve(length)); - - int64_t i = 0; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [&values, &i]() -> bool { return values[i++]; }); - - // this updates length_ - ArrayBuilder::UnsafeSetNotNull(length); - return Status::OK(); -} - -// ---------------------------------------------------------------------- -// ListBuilder - -ListBuilder::ListBuilder(MemoryPool* pool, - std::shared_ptr const& value_builder, - const std::shared_ptr& type) - : ArrayBuilder(type ? 
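For the BooleanBuilder bulk appends above, a minimal usage sketch (assumed API; the function name is illustrative):

#include <memory>
#include <vector>
#include "arrow/array.h"
#include "arrow/builder.h"

arrow::Status BuildBooleans(std::shared_ptr<arrow::Array>* out) {
  arrow::BooleanBuilder builder;
  std::vector<bool> values = {true, false, true};
  std::vector<bool> is_valid = {true, true, false};  // last slot is null
  ARROW_RETURN_NOT_OK(builder.AppendValues(values, is_valid));
  return builder.Finish(out);
}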
type - : std::static_pointer_cast( - std::make_shared(value_builder->type())), - pool), - offsets_builder_(pool), - value_builder_(value_builder) {} - -Status ListBuilder::AppendValues(const int32_t* offsets, int64_t length, - const uint8_t* valid_bytes) { - RETURN_NOT_OK(Reserve(length)); - UnsafeAppendToBitmap(valid_bytes, length); - offsets_builder_.UnsafeAppend(offsets, length); - return Status::OK(); -} - -Status ListBuilder::AppendNextOffset() { - int64_t num_values = value_builder_->length(); - if (ARROW_PREDICT_FALSE(num_values > kListMaximumElements)) { - std::stringstream ss; - ss << "ListArray cannot contain more then INT32_MAX - 1 child elements," - << " have " << num_values; - return Status::CapacityError(ss.str()); - } - return offsets_builder_.Append(static_cast(num_values)); -} - -Status ListBuilder::Append(bool is_valid) { - RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(is_valid); - return AppendNextOffset(); -} - -Status ListBuilder::Resize(int64_t capacity) { - DCHECK_LE(capacity, kListMaximumElements); - RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); - - // one more then requested for offsets - RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t))); - return ArrayBuilder::Resize(capacity); -} - -Status ListBuilder::FinishInternal(std::shared_ptr* out) { - RETURN_NOT_OK(AppendNextOffset()); - - // Offset padding zeroed by BufferBuilder - std::shared_ptr offsets; - RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); - - std::shared_ptr items; - if (values_) { - items = values_->data(); - } else { - if (value_builder_->length() == 0) { - // Try to make sure we get a non-null values buffer (ARROW-2744) - RETURN_NOT_OK(value_builder_->Resize(0)); - } - RETURN_NOT_OK(value_builder_->FinishInternal(&items)); - } - - *out = ArrayData::Make(type_, length_, {null_bitmap_, offsets}, null_count_); - (*out)->child_data.emplace_back(std::move(items)); - Reset(); - return Status::OK(); -} - -void ListBuilder::Reset() { - ArrayBuilder::Reset(); - values_.reset(); - offsets_builder_.Reset(); - value_builder_->Reset(); -} - -ArrayBuilder* ListBuilder::value_builder() const { - DCHECK(!values_) << "Using value builder is pointless when values_ is set"; - return value_builder_.get(); -} - -// ---------------------------------------------------------------------- -// Struct - -StructBuilder::StructBuilder(const std::shared_ptr& type, MemoryPool* pool, - std::vector>&& field_builders) - : ArrayBuilder(type, pool), field_builders_(std::move(field_builders)) {} - -void StructBuilder::Reset() { - ArrayBuilder::Reset(); - for (const auto& field_builder : field_builders_) { - field_builder->Reset(); - } -} - -Status StructBuilder::FinishInternal(std::shared_ptr* out) { - RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get())); - *out = ArrayData::Make(type_, length_, {null_bitmap_}, null_count_); - - (*out)->child_data.resize(field_builders_.size()); - for (size_t i = 0; i < field_builders_.size(); ++i) { - if (length_ == 0) { - // Try to make sure the child buffers are initialized - RETURN_NOT_OK(field_builders_[i]->Resize(0)); - } - RETURN_NOT_OK(field_builders_[i]->FinishInternal(&(*out)->child_data[i])); - } - - null_bitmap_ = nullptr; - capacity_ = length_ = null_count_ = 0; - return Status::OK(); -} +class MemoryPool; // ---------------------------------------------------------------------- // Helper functions @@ -566,7 +73,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, case Type::LIST: { std::unique_ptr value_builder; 
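The offset bookkeeping above is easiest to see from the caller's side. A hand-written sketch (assuming the contemporary API): each Append() opens a new list slot, and values appended to the child builder afterwards belong to that slot.

#include <memory>
#include "arrow/array.h"
#include "arrow/builder.h"
#include "arrow/memory_pool.h"

arrow::Status BuildListOfInt64(std::shared_ptr<arrow::Array>* out) {
  auto pool = arrow::default_memory_pool();
  auto value_builder = std::make_shared<arrow::Int64Builder>(pool);
  arrow::ListBuilder list_builder(pool, value_builder);

  ARROW_RETURN_NOT_OK(list_builder.Append());      // start slot [1, 2]
  ARROW_RETURN_NOT_OK(value_builder->Append(1));
  ARROW_RETURN_NOT_OK(value_builder->Append(2));
  ARROW_RETURN_NOT_OK(list_builder.AppendNull());  // null list slot
  ARROW_RETURN_NOT_OK(list_builder.Append());      // start slot [3]
  ARROW_RETURN_NOT_OK(value_builder->Append(3));
  return list_builder.Finish(out);
}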
std::shared_ptr value_type = - checked_cast(*type).value_type(); + internal::checked_cast(*type).value_type(); RETURN_NOT_OK(MakeBuilder(pool, value_type, &value_builder)); out->reset(new ListBuilder(pool, std::move(value_builder))); return Status::OK(); diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index d0016674215fc..a7ab22c1beedb 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -15,1184 +15,27 @@ // specific language governing permissions and limitations // under the License. -#ifndef ARROW_BUILDER_H -#define ARROW_BUILDER_H +#pragma once -#include // IWYU pragma: keep -#include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include "arrow/buffer.h" -#include "arrow/memory_pool.h" +#include "arrow/array/builder_adaptive.h" // IWYU pragma: export +#include "arrow/array/builder_base.h" // IWYU pragma: export +#include "arrow/array/builder_binary.h" // IWYU pragma: export +#include "arrow/array/builder_decimal.h" // IWYU pragma: export +#include "arrow/array/builder_dict.h" // IWYU pragma: export +#include "arrow/array/builder_nested.h" // IWYU pragma: export +#include "arrow/array/builder_primitive.h" // IWYU pragma: export #include "arrow/status.h" -#include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/bit-util.h" -#include "arrow/util/macros.h" -#include "arrow/util/string_view.h" -#include "arrow/util/type_traits.h" #include "arrow/util/visibility.h" namespace arrow { -class Array; -struct ArrayData; -class Decimal128; - -constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; -constexpr int64_t kListMaximumElements = std::numeric_limits::max() - 1; - -constexpr int64_t kMinBuilderCapacity = 1 << 5; - -/// Base class for all data array builders. -/// -/// This class provides a facilities for incrementally building the null bitmap -/// (see Append methods) and as a side effect the current number of slots and -/// the null count. -/// -/// \note Users are expected to use builders as one of the concrete types below. -/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use. -class ARROW_EXPORT ArrayBuilder { - public: - explicit ArrayBuilder(const std::shared_ptr& type, MemoryPool* pool) - : type_(type), - pool_(pool), - null_bitmap_(NULLPTR), - null_count_(0), - null_bitmap_data_(NULLPTR), - length_(0), - capacity_(0) {} - - virtual ~ArrayBuilder() = default; - - /// For nested types. Since the objects are owned by this class instance, we - /// skip shared pointers and just return a raw pointer - ArrayBuilder* child(int i) { return children_[i].get(); } - - int num_children() const { return static_cast(children_.size()); } - - int64_t length() const { return length_; } - int64_t null_count() const { return null_count_; } - int64_t capacity() const { return capacity_; } - - /// \brief Ensure that enough memory has been allocated to fit the indicated - /// number of total elements in the builder, including any that have already - /// been appended. Does not account for reallocations that may be due to - /// variable size data, like binary values. To make space for incremental - /// appends, use Reserve instead. - /// - /// \param[in] capacity the minimum number of total array values to - /// accommodate. Must be greater than the current capacity. 
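A small sketch of the MakeBuilder() helper being touched here (assumed behavior): given a DataType it instantiates the matching builder, recursing into child types for lists.

#include <memory>
#include "arrow/builder.h"
#include "arrow/memory_pool.h"
#include "arrow/type.h"

arrow::Status MakeListOfInt64Builder(std::unique_ptr<arrow::ArrayBuilder>* out) {
  // For list(int64()), this also creates the child Int64Builder internally.
  return arrow::MakeBuilder(arrow::default_memory_pool(),
                            arrow::list(arrow::int64()), out);
}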
- /// \return Status - virtual Status Resize(int64_t capacity); - - /// \brief Ensure that there is enough space allocated to add the indicated - /// number of elements without any further calls to Resize. The memory - /// allocated is rounded up to the next highest power of 2 similar to memory - /// allocations in STL containers like std::vector - /// \param[in] additional_capacity the number of additional array values - /// \return Status - Status Reserve(int64_t additional_capacity); - - /// Reset the builder. - virtual void Reset(); - - /// For cases where raw data was memcpy'd into the internal buffers, allows us - /// to advance the length of the builder. It is your responsibility to use - /// this function responsibly. - Status Advance(int64_t elements); - - /// \brief Return result of builder as an internal generic ArrayData - /// object. Resets builder except for dictionary builder - /// - /// \param[out] out the finalized ArrayData object - /// \return Status - virtual Status FinishInternal(std::shared_ptr* out) = 0; - - /// \brief Return result of builder as an Array object. - /// - /// The builder is reset except for DictionaryBuilder. - /// - /// \param[out] out the finalized Array object - /// \return Status - Status Finish(std::shared_ptr* out); - - std::shared_ptr type() const { return type_; } - - protected: - ArrayBuilder() {} - - /// Append to null bitmap - Status AppendToBitmap(bool is_valid); - - /// Vector append. Treat each zero byte as a null. If valid_bytes is null - /// assume all of length bits are valid. - Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length); - - /// Set the next length bits to not null (i.e. valid). - Status SetNotNull(int64_t length); - - // Unsafe operations (don't check capacity/don't resize) - - void UnsafeAppendNull() { UnsafeAppendToBitmap(false); } - - // Append to null bitmap, update the length - void UnsafeAppendToBitmap(bool is_valid) { - if (is_valid) { - BitUtil::SetBit(null_bitmap_data_, length_); - } else { - ++null_count_; - } - ++length_; - } - - template - void UnsafeAppendToBitmap(const IterType& begin, const IterType& end) { - int64_t byte_offset = length_ / 8; - int64_t bit_offset = length_ % 8; - uint8_t bitset = null_bitmap_data_[byte_offset]; - - for (auto iter = begin; iter != end; ++iter) { - if (bit_offset == 8) { - bit_offset = 0; - null_bitmap_data_[byte_offset] = bitset; - byte_offset++; - // TODO: Except for the last byte, this shouldn't be needed - bitset = null_bitmap_data_[byte_offset]; - } - - if (*iter) { - bitset |= BitUtil::kBitmask[bit_offset]; - } else { - bitset &= BitUtil::kFlippedBitmask[bit_offset]; - ++null_count_; - } - - bit_offset++; - } - - if (bit_offset != 0) { - null_bitmap_data_[byte_offset] = bitset; - } - - length_ += std::distance(begin, end); - } - - // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null - // assume all of length bits are valid. - void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length); - - void UnsafeAppendToBitmap(const std::vector& is_valid); - - // Set the next length bits to not null (i.e. valid). 
- void UnsafeSetNotNull(int64_t length); - - static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer); - - static Status CheckCapacity(int64_t new_capacity, int64_t old_capacity) { - if (new_capacity < 0) { - return Status::Invalid("Resize capacity must be positive"); - } - if (new_capacity < old_capacity) { - return Status::Invalid("Resize cannot downsize"); - } - return Status::OK(); - } - - std::shared_ptr type_; - MemoryPool* pool_; - - // When null_bitmap are first appended to the builder, the null bitmap is allocated - std::shared_ptr null_bitmap_; - int64_t null_count_; - uint8_t* null_bitmap_data_; - - // Array length, so far. Also, the index of the next element to be added - int64_t length_; - int64_t capacity_; - - // Child value array builders. These are owned by this class - std::vector> children_; - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); -}; - -class ARROW_EXPORT NullBuilder : public ArrayBuilder { - public: - explicit NullBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) - : ArrayBuilder(null(), pool) {} - - Status AppendNull() { - ++null_count_; - ++length_; - return Status::OK(); - } - - Status Append(std::nullptr_t value) { return AppendNull(); } - - Status FinishInternal(std::shared_ptr* out) override; -}; - -template -class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { - public: - using value_type = typename Type::c_type; - - explicit PrimitiveBuilder(const std::shared_ptr& type, MemoryPool* pool) - : ArrayBuilder(type, pool), data_(NULLPTR), raw_data_(NULLPTR) {} - - using ArrayBuilder::Advance; - - /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory - /// The memory at the corresponding data slot is set to 0 to prevent uninitialized - /// memory access - Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { - ARROW_RETURN_NOT_OK(Reserve(length)); - memset(raw_data_ + length_, 0, - static_cast(TypeTraits::bytes_required(length))); - UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); - } - - /// \brief Append a single null element - Status AppendNull() { - ARROW_RETURN_NOT_OK(Reserve(1)); - memset(raw_data_ + length_, 0, sizeof(value_type)); - UnsafeAppendToBitmap(false); - return Status::OK(); - } - - value_type GetValue(int64_t index) const { - return reinterpret_cast(data_->data())[index]; - } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const value_type* values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] is_valid an std::vector indicating valid (1) or null - /// (0). Equal in length to values - /// \return Status - Status AppendValues(const value_type* values, int64_t length, - const std::vector& is_valid); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a std::vector of values - /// \param[in] is_valid an std::vector indicating valid (1) or null - /// (0). 
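A tiny NullBuilder sketch matching the declarations above (assumed API): only a length and null count are tracked, so the finished NullArray carries no value buffers.

#include <memory>
#include "arrow/array.h"
#include "arrow/builder.h"

arrow::Status BuildNulls(std::shared_ptr<arrow::Array>* out) {
  arrow::NullBuilder builder;
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  ARROW_RETURN_NOT_OK(builder.Append(nullptr));  // equivalent overload
  return builder.Finish(out);
}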
Equal in length to values - /// \return Status - Status AppendValues(const std::vector& values, - const std::vector& is_valid); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a std::vector of values - /// \return Status - Status AppendValues(const std::vector& values); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values_begin InputIterator to the beginning of the values - /// \param[in] values_end InputIterator pointing to the end of the values - /// \return Status - - template - Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - - std::copy(values_begin, values_end, raw_data_ + length_); - - // this updates the length_ - UnsafeSetNotNull(length); - return Status::OK(); - } - - /// \brief Append a sequence of elements in one shot, with a specified nullmap - /// \param[in] values_begin InputIterator to the beginning of the values - /// \param[in] values_end InputIterator pointing to the end of the values - /// \param[in] valid_begin InputIterator with elements indication valid(1) - /// or null(0) values. - /// \return Status - template - typename std::enable_if::value, Status>::type AppendValues( - ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { - static_assert(!internal::is_null_pointer::value, - "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " - "version instead"); - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - - std::copy(values_begin, values_end, raw_data_ + length_); - - // this updates the length_ - UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); - return Status::OK(); - } - - // Same as above, with a pointer type ValidIter - template - typename std::enable_if::value, Status>::type AppendValues( - ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - - std::copy(values_begin, values_end, raw_data_ + length_); - - // this updates the length_ - if (valid_begin == NULLPTR) { - UnsafeSetNotNull(length); - } else { - UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); - } - - return Status::OK(); - } - - Status FinishInternal(std::shared_ptr* out) override; - void Reset() override; - - Status Resize(int64_t capacity) override; - - protected: - std::shared_ptr data_; - value_type* raw_data_; -}; - -/// Base class for all Builders that emit an Array of a scalar numerical type. -template -class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder { - public: - using typename PrimitiveBuilder::value_type; - using PrimitiveBuilder::PrimitiveBuilder; - - template - explicit NumericBuilder( - typename std::enable_if::is_parameter_free, MemoryPool*>::type pool - ARROW_MEMORY_POOL_DEFAULT) - : PrimitiveBuilder(TypeTraits::type_singleton(), pool) {} - - using ArrayBuilder::UnsafeAppendNull; - using PrimitiveBuilder::AppendValues; - using PrimitiveBuilder::Resize; - using PrimitiveBuilder::Reserve; - - /// Append a single scalar and increase the size if necessary. - Status Append(const value_type val) { - ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1)); - UnsafeAppend(val); - return Status::OK(); - } - - /// Append a single scalar under the assumption that the underlying Buffer is - /// large enough. 
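The iterator-based overloads above accept any InputIterator pair; a sketch under the assumed API:

#include <cstdint>
#include <memory>
#include <vector>
#include "arrow/array.h"
#include "arrow/builder.h"

arrow::Status AppendFromIterators(std::shared_ptr<arrow::Array>* out) {
  arrow::Int32Builder builder;
  std::vector<int32_t> values = {10, 20, 30, 40};
  // All values are marked valid; a validity iterator may be passed as a third argument.
  ARROW_RETURN_NOT_OK(builder.AppendValues(values.begin(), values.end()));
  return builder.Finish(out);
}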
- /// - /// This method does not capacity-check; make sure to call Reserve - /// beforehand. - void UnsafeAppend(const value_type val) { - BitUtil::SetBit(null_bitmap_data_, length_); - raw_data_[length_++] = val; - } - - protected: - using PrimitiveBuilder::length_; - using PrimitiveBuilder::null_bitmap_data_; - using PrimitiveBuilder::raw_data_; -}; - -// Builders - -using UInt8Builder = NumericBuilder; -using UInt16Builder = NumericBuilder; -using UInt32Builder = NumericBuilder; -using UInt64Builder = NumericBuilder; - -using Int8Builder = NumericBuilder; -using Int16Builder = NumericBuilder; -using Int32Builder = NumericBuilder; -using Int64Builder = NumericBuilder; -using TimestampBuilder = NumericBuilder; -using Time32Builder = NumericBuilder; -using Time64Builder = NumericBuilder; -using Date32Builder = NumericBuilder; -using Date64Builder = NumericBuilder; - -using HalfFloatBuilder = NumericBuilder; -using FloatBuilder = NumericBuilder; -using DoubleBuilder = NumericBuilder; - -namespace internal { - -class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { - public: - explicit AdaptiveIntBuilderBase(MemoryPool* pool); - - /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory - Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { - ARROW_RETURN_NOT_OK(CommitPendingData()); - ARROW_RETURN_NOT_OK(Reserve(length)); - memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length); - UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); - } - - Status AppendNull() { - pending_data_[pending_pos_] = 0; - pending_valid_[pending_pos_] = 0; - pending_has_nulls_ = true; - ++pending_pos_; - - if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { - return CommitPendingData(); - } - return Status::OK(); - } - - void Reset() override; - Status Resize(int64_t capacity) override; - - protected: - virtual Status CommitPendingData() = 0; - - std::shared_ptr data_; - uint8_t* raw_data_; - uint8_t int_size_; - - static constexpr int32_t pending_size_ = 1024; - uint8_t pending_valid_[pending_size_]; - uint64_t pending_data_[pending_size_]; - int32_t pending_pos_; - bool pending_has_nulls_; -}; - -} // namespace internal - -class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase { - public: - explicit AdaptiveUIntBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - using ArrayBuilder::Advance; - using internal::AdaptiveIntBuilderBase::Reset; - - /// Scalar append - Status Append(const uint64_t val) { - pending_data_[pending_pos_] = val; - pending_valid_[pending_pos_] = 1; - ++pending_pos_; - - if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { - return CommitPendingData(); - } - return Status::OK(); - } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const uint64_t* values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - Status FinishInternal(std::shared_ptr* out) override; - - protected: - Status CommitPendingData() override; - Status ExpandIntSize(uint8_t new_int_size); - - Status AppendValuesInternal(const uint64_t* values, int64_t length, - const uint8_t* valid_bytes); - - template - typename std::enable_if= sizeof(new_type), Status>::type - ExpandIntSizeInternal(); -#define __LESS(a, b) (a) < (b) - template 
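For the adaptive integer builders declared above, a usage sketch (assumed API): storage starts narrow and widens as larger values arrive, so the physical type of the finished array depends on the data.

#include <memory>
#include "arrow/array.h"
#include "arrow/builder.h"

arrow::Status BuildAdaptiveInts(std::shared_ptr<arrow::Array>* out) {
  arrow::AdaptiveIntBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Append(1));        // fits in one byte
  ARROW_RETURN_NOT_OK(builder.Append(1 << 20));  // forces wider internal storage
  return builder.Finish(out);  // resulting type depends on the widest value seen
}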
- typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type - ExpandIntSizeInternal(); -#undef __LESS - - template - Status ExpandIntSizeN(); -}; - -class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase { - public: - explicit AdaptiveIntBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - using ArrayBuilder::Advance; - using internal::AdaptiveIntBuilderBase::Reset; - - /// Scalar append - Status Append(const int64_t val) { - auto v = static_cast(val); - - pending_data_[pending_pos_] = v; - pending_valid_[pending_pos_] = 1; - ++pending_pos_; - - if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { - return CommitPendingData(); - } - return Status::OK(); - } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const int64_t* values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - Status FinishInternal(std::shared_ptr* out) override; - - protected: - Status CommitPendingData() override; - Status ExpandIntSize(uint8_t new_int_size); - - Status AppendValuesInternal(const int64_t* values, int64_t length, - const uint8_t* valid_bytes); - - template - typename std::enable_if= sizeof(new_type), Status>::type - ExpandIntSizeInternal(); -#define __LESS(a, b) (a) < (b) - template - typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type - ExpandIntSizeInternal(); -#undef __LESS - - template - Status ExpandIntSizeN(); -}; - -class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { - public: - using value_type = bool; - explicit BooleanBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - explicit BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool); - - using ArrayBuilder::Advance; - using ArrayBuilder::UnsafeAppendNull; - - /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory - Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { - ARROW_RETURN_NOT_OK(Reserve(length)); - UnsafeAppendToBitmap(valid_bytes, length); - - return Status::OK(); - } - - Status AppendNull() { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(false); - - return Status::OK(); - } - - /// Scalar append - Status Append(const bool val) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppend(val); - return Status::OK(); - } - - Status Append(const uint8_t val) { return Append(val != 0); } - - /// Scalar append, without checking for capacity - void UnsafeAppend(const bool val) { - BitUtil::SetBit(null_bitmap_data_, length_); - if (val) { - BitUtil::SetBit(raw_data_, length_); - } else { - BitUtil::ClearBit(raw_data_, length_); - } - ++length_; - } - - void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous array of bytes (non-zero is 1) - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const uint8_t* values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] is_valid an 
std::vector indicating valid (1) or null - /// (0). Equal in length to values - /// \return Status - Status AppendValues(const uint8_t* values, int64_t length, - const std::vector& is_valid); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a std::vector of bytes - /// \param[in] is_valid an std::vector indicating valid (1) or null - /// (0). Equal in length to values - /// \return Status - Status AppendValues(const std::vector& values, - const std::vector& is_valid); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a std::vector of bytes - /// \return Status - Status AppendValues(const std::vector& values); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values an std::vector indicating true (1) or false - /// \param[in] is_valid an std::vector indicating valid (1) or null - /// (0). Equal in length to values - /// \return Status - Status AppendValues(const std::vector& values, const std::vector& is_valid); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values an std::vector indicating true (1) or false - /// \return Status - Status AppendValues(const std::vector& values); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values_begin InputIterator to the beginning of the values - /// \param[in] values_end InputIterator pointing to the end of the values - /// or null(0) values - /// \return Status - template - Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - auto iter = values_begin; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [&iter]() -> bool { return *(iter++); }); - - // this updates length_ - UnsafeSetNotNull(length); - return Status::OK(); - } - - /// \brief Append a sequence of elements in one shot, with a specified nullmap - /// \param[in] values_begin InputIterator to the beginning of the values - /// \param[in] values_end InputIterator pointing to the end of the values - /// \param[in] valid_begin InputIterator with elements indication valid(1) - /// or null(0) values - /// \return Status - template - typename std::enable_if::value, Status>::type AppendValues( - ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { - static_assert(!internal::is_null_pointer::value, - "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " - "version instead"); - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - - auto iter = values_begin; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [&iter]() -> bool { return *(iter++); }); - - // this updates length_ - ArrayBuilder::UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); - return Status::OK(); - } - - // Same as above, for a pointer type ValidIter - template - typename std::enable_if::value, Status>::type AppendValues( - ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - - auto iter = values_begin; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [&iter]() -> bool { return *(iter++); }); - - // this updates the length_ - if (valid_begin == NULLPTR) { - UnsafeSetNotNull(length); - } else { - UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); - } - - return Status::OK(); 
- } - - Status FinishInternal(std::shared_ptr* out) override; - void Reset() override; - Status Resize(int64_t capacity) override; - - protected: - std::shared_ptr data_; - uint8_t* raw_data_; -}; - -// ---------------------------------------------------------------------- -// List builder - -/// \class ListBuilder -/// \brief Builder class for variable-length list array value types -/// -/// To use this class, you must append values to the child array builder and use -/// the Append function to delimit each distinct list value (once the values -/// have been appended to the child array) or use the bulk API to append -/// a sequence of offests and null values. -/// -/// A note on types. Per arrow/type.h all types in the c++ implementation are -/// logical so even though this class always builds list array, this can -/// represent multiple different logical types. If no logical type is provided -/// at construction time, the class defaults to List where t is taken from the -/// value_builder/values that the object is constructed with. -class ARROW_EXPORT ListBuilder : public ArrayBuilder { - public: - /// Use this constructor to incrementally build the value array along with offsets and - /// null bitmap. - ListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, - const std::shared_ptr& type = NULLPTR); - - Status Resize(int64_t capacity) override; - void Reset() override; - Status FinishInternal(std::shared_ptr* out) override; - - /// \brief Vector append - /// - /// If passed, valid_bytes is of equal length to values, and any zero byte - /// will be considered as a null for that slot - Status AppendValues(const int32_t* offsets, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - /// \brief Start a new variable-length list slot - /// - /// This function should be called before beginning to append elements to the - /// value builder - Status Append(bool is_valid = true); - - Status AppendNull() { return Append(false); } - - ArrayBuilder* value_builder() const; - - protected: - TypedBufferBuilder offsets_builder_; - std::shared_ptr value_builder_; - std::shared_ptr values_; - - Status AppendNextOffset(); -}; - -// ---------------------------------------------------------------------- -// Binary and String - -/// \class BinaryBuilder -/// \brief Builder class for variable-length binary data -class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { - public: - explicit BinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - BinaryBuilder(const std::shared_ptr& type, MemoryPool* pool); - - Status Append(const uint8_t* value, int32_t length); - - Status Append(const char* value, int32_t length) { - return Append(reinterpret_cast(value), length); - } - - Status Append(util::string_view value) { - return Append(value.data(), static_cast(value.size())); - } - - Status AppendNull(); - - /// \brief Append without checking capacity - /// - /// Offsets and data should have been presized using Reserve() and - /// ReserveData(), respectively. 
- void UnsafeAppend(const uint8_t* value, int32_t length) { - UnsafeAppendNextOffset(); - value_data_builder_.UnsafeAppend(value, length); - UnsafeAppendToBitmap(true); - } - - void UnsafeAppend(const char* value, int32_t length) { - UnsafeAppend(reinterpret_cast(value), length); - } - - void UnsafeAppend(const std::string& value) { - UnsafeAppend(value.c_str(), static_cast(value.size())); - } - - void UnsafeAppendNull() { - const int64_t num_bytes = value_data_builder_.length(); - offsets_builder_.UnsafeAppend(static_cast(num_bytes)); - UnsafeAppendToBitmap(false); - } - - void Reset() override; - Status Resize(int64_t capacity) override; - - /// \brief Ensures there is enough allocated capacity to append the indicated - /// number of bytes to the value data buffer without additional allocations - Status ReserveData(int64_t elements); - - Status FinishInternal(std::shared_ptr* out) override; - - /// \return size of values buffer so far - int64_t value_data_length() const { return value_data_builder_.length(); } - /// \return capacity of values buffer - int64_t value_data_capacity() const { return value_data_builder_.capacity(); } - - /// Temporary access to a value. - /// - /// This pointer becomes invalid on the next modifying operation. - const uint8_t* GetValue(int64_t i, int32_t* out_length) const; - - /// Temporary access to a value. - /// - /// This view becomes invalid on the next modifying operation. - util::string_view GetView(int64_t i) const; - - protected: - TypedBufferBuilder offsets_builder_; - TypedBufferBuilder value_data_builder_; - - Status AppendNextOffset(); - - void UnsafeAppendNextOffset() { - const int64_t num_bytes = value_data_builder_.length(); - offsets_builder_.UnsafeAppend(static_cast(num_bytes)); - } -}; - -/// \class StringBuilder -/// \brief Builder class for UTF8 strings -class ARROW_EXPORT StringBuilder : public BinaryBuilder { - public: - using BinaryBuilder::BinaryBuilder; - explicit StringBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - using BinaryBuilder::Append; - using BinaryBuilder::Reset; - using BinaryBuilder::UnsafeAppend; - - /// \brief Append a sequence of strings in one shot. - /// - /// \param[in] values a vector of strings - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const std::vector& values, - const uint8_t* valid_bytes = NULLPTR); - - /// \brief Append a sequence of nul-terminated strings in one shot. - /// If one of the values is NULL, it is processed as a null - /// value even if the corresponding valid_bytes entry is 1. 
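The Reserve()/ReserveData() split above pre-sizes offsets and value bytes separately before the unchecked appends; a sketch assuming this API:

#include <memory>
#include <string>
#include "arrow/array.h"
#include "arrow/builder.h"

arrow::Status BuildStrings(std::shared_ptr<arrow::Array>* out) {
  arrow::StringBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Reserve(2));       // two offsets / validity bits
  ARROW_RETURN_NOT_OK(builder.ReserveData(10));  // ten bytes of string payload
  std::string a = "hello", b = "world";
  builder.UnsafeAppend(a);
  builder.UnsafeAppend(b);
  return builder.Finish(out);
}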
- /// - /// \param[in] values a contiguous C array of nul-terminated char * - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const char** values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); -}; - -// ---------------------------------------------------------------------- -// FixedSizeBinaryBuilder - -class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { - public: - FixedSizeBinaryBuilder(const std::shared_ptr& type, - MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - Status Append(const uint8_t* value) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(true); - return byte_builder_.Append(value, byte_width_); - } - - Status Append(const char* value) { - return Append(reinterpret_cast(value)); - } - - Status Append(const util::string_view& view) { -#ifndef NDEBUG - CheckValueSize(static_cast(view.size())); -#endif - return Append(reinterpret_cast(view.data())); - } - - Status Append(const std::string& s) { -#ifndef NDEBUG - CheckValueSize(static_cast(s.size())); -#endif - return Append(reinterpret_cast(s.data())); - } - - template - Status Append(const std::array& value) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(true); - return byte_builder_.Append(value); - } - - Status AppendValues(const uint8_t* data, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - Status AppendNull(); - - void Reset() override; - Status Resize(int64_t capacity) override; - Status FinishInternal(std::shared_ptr* out) override; - - /// \return size of values buffer so far - int64_t value_data_length() const { return byte_builder_.length(); } - - int32_t byte_width() const { return byte_width_; } - - /// Temporary access to a value. - /// - /// This pointer becomes invalid on the next modifying operation. - const uint8_t* GetValue(int64_t i) const; - - /// Temporary access to a value. - /// - /// This view becomes invalid on the next modifying operation. - util::string_view GetView(int64_t i) const; - - protected: - int32_t byte_width_; - BufferBuilder byte_builder_; - -#ifndef NDEBUG - void CheckValueSize(int64_t size); -#endif -}; - -class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder { - public: - explicit Decimal128Builder(const std::shared_ptr& type, - MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - using FixedSizeBinaryBuilder::Append; - using FixedSizeBinaryBuilder::AppendValues; - using FixedSizeBinaryBuilder::Reset; - - Status Append(const Decimal128& val); - - Status FinishInternal(std::shared_ptr* out) override; -}; - -using DecimalBuilder = Decimal128Builder; - -// ---------------------------------------------------------------------- -// Struct - -// --------------------------------------------------------------------------------- -// StructArray builder -/// Append, Resize and Reserve methods are acting on StructBuilder. -/// Please make sure all these methods of all child-builders' are consistently -/// called to maintain data-structure consistency. 
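A Decimal128Builder sketch for the declarations above (assumed API): the builder is parameterized by a decimal(precision, scale) type and appends raw 128-bit values that the scale interprets.

#include <memory>
#include "arrow/array.h"
#include "arrow/builder.h"
#include "arrow/type.h"
#include "arrow/util/decimal.h"

arrow::Status BuildDecimals(std::shared_ptr<arrow::Array>* out) {
  auto type = arrow::decimal(10, 2);  // precision 10, scale 2
  arrow::Decimal128Builder builder(type);
  arrow::Decimal128 value(12345);     // unscaled: represents 123.45 at scale 2
  ARROW_RETURN_NOT_OK(builder.Append(value));
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  return builder.Finish(out);
}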
-class ARROW_EXPORT StructBuilder : public ArrayBuilder { - public: - StructBuilder(const std::shared_ptr& type, MemoryPool* pool, - std::vector>&& field_builders); - - Status FinishInternal(std::shared_ptr* out) override; - - /// Null bitmap is of equal length to every child field, and any zero byte - /// will be considered as a null for that field, but users must using app- - /// end methods or advance methods of the child builders' independently to - /// insert data. - Status AppendValues(int64_t length, const uint8_t* valid_bytes) { - ARROW_RETURN_NOT_OK(Reserve(length)); - UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); - } - - /// Append an element to the Struct. All child-builders' Append method must - /// be called independently to maintain data-structure consistency. - Status Append(bool is_valid = true) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(is_valid); - return Status::OK(); - } - - Status AppendNull() { return Append(false); } - - void Reset() override; - - ArrayBuilder* field_builder(int i) const { return field_builders_[i].get(); } - - int num_fields() const { return static_cast(field_builders_.size()); } - - protected: - std::vector> field_builders_; -}; - -// ---------------------------------------------------------------------- -// Dictionary builder - -namespace internal { - -template -struct DictionaryScalar { - using type = typename T::c_type; -}; - -template <> -struct DictionaryScalar { - using type = util::string_view; -}; - -template <> -struct DictionaryScalar { - using type = util::string_view; -}; - -template <> -struct DictionaryScalar { - using type = util::string_view; -}; - -} // namespace internal - -/// \brief Array builder for created encoded DictionaryArray from dense array -/// -/// Unlike other builders, dictionary builder does not completely reset the state -/// on Finish calls. The arrays built after the initial Finish call will reuse -/// the previously created encoding and build a delta dictionary when new terms -/// occur. -/// -/// data -template -class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { - public: - using Scalar = typename internal::DictionaryScalar::type; - - // WARNING: the type given below is the value type, not the DictionaryType. - // The DictionaryType is instantiated on the Finish() call. 
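The consistency rule above (parent validity via Append(), child values appended independently) in a short sketch, assuming this API:

#include <memory>
#include <utility>
#include <vector>
#include "arrow/array.h"
#include "arrow/builder.h"
#include "arrow/memory_pool.h"
#include "arrow/type.h"

arrow::Status BuildStructs(std::shared_ptr<arrow::Array>* out) {
  auto pool = arrow::default_memory_pool();
  auto type = arrow::struct_({arrow::field("x", arrow::int64()),
                              arrow::field("s", arrow::utf8())});
  std::vector<std::shared_ptr<arrow::ArrayBuilder>> children = {
      std::make_shared<arrow::Int64Builder>(pool),
      std::make_shared<arrow::StringBuilder>(pool)};
  arrow::StructBuilder builder(type, pool, std::move(children));

  ARROW_RETURN_NOT_OK(builder.Append());  // one struct slot; now fill each child
  ARROW_RETURN_NOT_OK(
      static_cast<arrow::Int64Builder*>(builder.field_builder(0))->Append(7));
  ARROW_RETURN_NOT_OK(
      static_cast<arrow::StringBuilder*>(builder.field_builder(1))->Append("abc"));
  return builder.Finish(out);
}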
- DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool); - - template - explicit DictionaryBuilder( - typename std::enable_if::is_parameter_free, MemoryPool*>::type pool) - : DictionaryBuilder(TypeTraits::type_singleton(), pool) {} - - ~DictionaryBuilder() override; - - /// \brief Append a scalar value - Status Append(const Scalar& value); - - /// \brief Append a fixed-width string (only for FixedSizeBinaryType) - template - Status Append(typename std::enable_if::value, - const uint8_t*>::type value) { - return Append(util::string_view(reinterpret_cast(value), byte_width_)); - } - - /// \brief Append a fixed-width string (only for FixedSizeBinaryType) - template - Status Append(typename std::enable_if::value, - const char*>::type value) { - return Append(util::string_view(value, byte_width_)); - } - - /// \brief Append a scalar null value - Status AppendNull(); - - /// \brief Append a whole dense array to the builder - Status AppendArray(const Array& array); - - void Reset() override; - Status Resize(int64_t capacity) override; - Status FinishInternal(std::shared_ptr* out) override; - - /// is the dictionary builder in the delta building mode - bool is_building_delta() { return delta_offset_ > 0; } - - protected: - class MemoTableImpl; - std::unique_ptr memo_table_; - - int32_t delta_offset_; - // Only used for FixedSizeBinaryType - int32_t byte_width_; - - AdaptiveIntBuilder values_builder_; -}; - -template <> -class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { - public: - DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool); - explicit DictionaryBuilder(MemoryPool* pool); - - /// \brief Append a scalar null value - Status AppendNull(); - - /// \brief Append a whole dense array to the builder - Status AppendArray(const Array& array); - - Status Resize(int64_t capacity) override; - Status FinishInternal(std::shared_ptr* out) override; - - protected: - AdaptiveIntBuilder values_builder_; -}; - -class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder { - public: - using DictionaryBuilder::Append; - using DictionaryBuilder::DictionaryBuilder; - - Status Append(const uint8_t* value, int32_t length) { - return Append(reinterpret_cast(value), length); - } - - Status Append(const char* value, int32_t length) { - return Append(util::string_view(value, length)); - } -}; - -/// \brief Dictionary array builder with convenience methods for strings -class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder { - public: - using DictionaryBuilder::Append; - using DictionaryBuilder::DictionaryBuilder; - - Status Append(const uint8_t* value, int32_t length) { - return Append(reinterpret_cast(value), length); - } - - Status Append(const char* value, int32_t length) { - return Append(util::string_view(value, length)); - } -}; - -// ---------------------------------------------------------------------- -// Helper functions +class DataType; +class MemoryPool; ARROW_EXPORT Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, std::unique_ptr* out); } // namespace arrow - -#endif // ARROW_BUILDER_H_ diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc index 52fc58809604c..e34a086d8e2d9 100644 --- a/cpp/src/arrow/compute/compute-test.cc +++ b/cpp/src/arrow/compute/compute-test.cc @@ -70,6 +70,27 @@ shared_ptr _MakeArray(const shared_ptr& type, const vector& return result; } +// ---------------------------------------------------------------------- +// Datum + +template +void CheckImplicitConstructor(enum 
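A StringDictionaryBuilder sketch (assumed API): repeated values hash into the memo table so only small indices are stored, and later Finish() calls can emit delta dictionaries as described above.

#include <memory>
#include "arrow/array.h"
#include "arrow/builder.h"
#include "arrow/memory_pool.h"

arrow::Status BuildDictionary(std::shared_ptr<arrow::Array>* out) {
  arrow::StringDictionaryBuilder builder(arrow::default_memory_pool());
  ARROW_RETURN_NOT_OK(builder.Append("apple"));
  ARROW_RETURN_NOT_OK(builder.Append("banana"));
  ARROW_RETURN_NOT_OK(builder.Append("apple"));  // reuses the first dictionary entry
  return builder.Finish(out);
}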
Datum::type expected_kind) {
+  std::shared_ptr<T> value;
+  Datum datum = value;
+  ASSERT_EQ(expected_kind, datum.kind());
+}
+
+TEST(TestDatum, ImplicitConstructors) {
+  CheckImplicitConstructor<Array>(Datum::ARRAY);
+
+  // Instantiate from array subclass
+  CheckImplicitConstructor<BinaryArray>(Datum::ARRAY);
+
+  CheckImplicitConstructor<ChunkedArray>(Datum::CHUNKED_ARRAY);
+  CheckImplicitConstructor<RecordBatch>(Datum::RECORD_BATCH);
+  CheckImplicitConstructor<Table>
(Datum::TABLE); +} + // ---------------------------------------------------------------------- // Cast @@ -781,7 +802,7 @@ TEST_F(TestCast, ChunkedArray) { CastOptions options; Datum out; - ASSERT_OK(Cast(&this->ctx_, Datum(carr), out_type, options, &out)); + ASSERT_OK(Cast(&this->ctx_, carr, out_type, options, &out)); ASSERT_EQ(Datum::CHUNKED_ARRAY, out.kind()); auto out_carr = out.chunked_array(); @@ -869,7 +890,7 @@ TEST_F(TestCast, PreallocatedMemory) { out_data->buffers.push_back(out_values); Datum out(out_data); - ASSERT_OK(kernel->Call(&this->ctx_, Datum(arr), &out)); + ASSERT_OK(kernel->Call(&this->ctx_, arr, &out)); // Buffer address unchanged ASSERT_EQ(out_values.get(), out_data->buffers[1].get()); @@ -912,8 +933,8 @@ void CheckOffsetOutputCase(FunctionContext* ctx, const std::shared_ptr Datum out_second(out_second_data); // Cast each bit - ASSERT_OK(kernel->Call(ctx, Datum(arr->Slice(0, first_half)), &out_first)); - ASSERT_OK(kernel->Call(ctx, Datum(arr->Slice(first_half)), &out_second)); + ASSERT_OK(kernel->Call(ctx, arr->Slice(0, first_half), &out_first)); + ASSERT_OK(kernel->Call(ctx, arr->Slice(first_half), &out_second)); shared_ptr result = MakeArray(out_data); @@ -1105,7 +1126,7 @@ TYPED_TEST(TestDictionaryCast, Basic) { TestBase::MakeRandomArray::ArrayType>(10, 2); Datum out; - ASSERT_OK(DictionaryEncode(&this->ctx_, Datum(plain_array->data()), &out)); + ASSERT_OK(DictionaryEncode(&this->ctx_, plain_array->data(), &out)); this->CheckPass(*MakeArray(out.array()), *plain_array, plain_array->type(), options); } @@ -1201,7 +1222,7 @@ void CheckUnique(FunctionContext* ctx, const shared_ptr& type, shared_ptr expected = _MakeArray(type, out_values, out_is_valid); shared_ptr result; - ASSERT_OK(Unique(ctx, Datum(input), &result)); + ASSERT_OK(Unique(ctx, input, &result)); ASSERT_ARRAYS_EQUAL(*expected, *result); } @@ -1218,7 +1239,7 @@ void CheckDictEncode(FunctionContext* ctx, const shared_ptr& type, DictionaryArray expected(dictionary(int32(), ex_dict), ex_indices); Datum datum_out; - ASSERT_OK(DictionaryEncode(ctx, Datum(input), &datum_out)); + ASSERT_OK(DictionaryEncode(ctx, input, &datum_out)); shared_ptr result = MakeArray(datum_out.array()); ASSERT_ARRAYS_EQUAL(expected, *result); @@ -1461,7 +1482,7 @@ TEST_F(TestHashKernel, ChunkedArrayInvoke) { // Unique shared_ptr result; - ASSERT_OK(Unique(&this->ctx_, Datum(carr), &result)); + ASSERT_OK(Unique(&this->ctx_, carr, &result)); ASSERT_ARRAYS_EQUAL(*ex_dict, *result); // Dictionary encode @@ -1475,7 +1496,7 @@ TEST_F(TestHashKernel, ChunkedArrayInvoke) { auto dict_carr = std::make_shared(dict_arrays); Datum encoded_out; - ASSERT_OK(DictionaryEncode(&this->ctx_, Datum(carr), &encoded_out)); + ASSERT_OK(DictionaryEncode(&this->ctx_, carr, &encoded_out)); ASSERT_EQ(Datum::CHUNKED_ARRAY, encoded_out.kind()); AssertChunkedEqual(*dict_carr, *encoded_out.chunked_array()); @@ -1490,7 +1511,7 @@ class TestBooleanKernel : public ComputeFixture, public TestBase { const std::shared_ptr& right, const std::shared_ptr& expected) { Datum result; - ASSERT_OK(kernel(&this->ctx_, Datum(left), Datum(right), &result)); + ASSERT_OK(kernel(&this->ctx_, left, right, &result)); ASSERT_EQ(Datum::ARRAY, result.kind()); std::shared_ptr result_array = result.make_array(); ASSERT_TRUE(result_array->Equals(expected)); @@ -1502,7 +1523,7 @@ class TestBooleanKernel : public ComputeFixture, public TestBase { const std::shared_ptr& expected) { Datum result; std::shared_ptr result_array; - ASSERT_OK(kernel(&this->ctx_, Datum(left), Datum(right), &result)); + 
ASSERT_OK(kernel(&this->ctx_, left, right, &result)); ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); std::shared_ptr result_ca = result.chunked_array(); ASSERT_TRUE(result_ca->Equals(expected)); @@ -1552,13 +1573,13 @@ TEST_F(TestBooleanKernel, Invert) { // Plain array Datum result; - ASSERT_OK(Invert(&this->ctx_, Datum(a1), &result)); + ASSERT_OK(Invert(&this->ctx_, a1, &result)); ASSERT_EQ(Datum::ARRAY, result.kind()); std::shared_ptr result_array = result.make_array(); ASSERT_TRUE(result_array->Equals(a2)); // Array with offset - ASSERT_OK(Invert(&this->ctx_, Datum(a1->Slice(1)), &result)); + ASSERT_OK(Invert(&this->ctx_, a1->Slice(1), &result)); ASSERT_EQ(Datum::ARRAY, result.kind()); result_array = result.make_array(); ASSERT_TRUE(result_array->Equals(a2->Slice(1))); @@ -1568,7 +1589,7 @@ TEST_F(TestBooleanKernel, Invert) { auto ca1 = std::make_shared(ca1_arrs); std::vector> ca2_arrs = {a2, a2->Slice(1)}; auto ca2 = std::make_shared(ca2_arrs); - ASSERT_OK(Invert(&this->ctx_, Datum(ca1), &result)); + ASSERT_OK(Invert(&this->ctx_, ca1, &result)); ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); std::shared_ptr result_ca = result.chunked_array(); ASSERT_TRUE(result_ca->Equals(ca2)); @@ -1618,14 +1639,14 @@ TEST_F(TestInvokeBinaryKernel, Exceptions) { auto a2 = _MakeArray(type, values2, {}); // Left is not an array-like - ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel( - &this->ctx_, &kernel, Datum(table), Datum(a2), &outputs)); + ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, table, a2, + &outputs)); // Right is not an array-like - ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, Datum(a1), - Datum(table), &outputs)); + ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, a1, table, + &outputs)); // Different sized inputs - ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, Datum(a1), - Datum(a1->Slice(1)), &outputs)); + ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, a1, + a1->Slice(1), &outputs)); } } // namespace compute diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index bef2b9af21cff..87080b1000d5f 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -61,19 +61,28 @@ struct ARROW_EXPORT Datum { /// \brief Empty datum, to be populated elsewhere Datum() : value(NULLPTR) {} - explicit Datum(const std::shared_ptr& value) : value(value) {} - - explicit Datum(const std::shared_ptr& value) : value(value) {} - - explicit Datum(const std::shared_ptr& value) : Datum(value->data()) {} - - explicit Datum(const std::shared_ptr& value) : value(value) {} - - explicit Datum(const std::shared_ptr& value) : value(value) {} - - explicit Datum(const std::shared_ptr
& value) : value(value) {} - - explicit Datum(const std::vector& value) : value(value) {} + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : value(value) {} + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : value(value) {} + + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : Datum(value ? value->data() : NULLPTR) {} + + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : value(value) {} + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : value(value) {} + Datum(const std::shared_ptr
& value) // NOLINT implicit conversion + : value(value) {} + Datum(const std::vector& value) // NOLINT implicit conversion + : value(value) {} + + // Cast from subtypes of Array to Datum + template ::value>::type> + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : Datum(std::shared_ptr(value)) {} ~Datum() {} diff --git a/cpp/src/arrow/csv/column-builder.h b/cpp/src/arrow/csv/column-builder.h index b21cff76be5c6..054a642295cb5 100644 --- a/cpp/src/arrow/csv/column-builder.h +++ b/cpp/src/arrow/csv/column-builder.h @@ -18,22 +18,29 @@ #ifndef ARROW_CSV_COLUMN_BUILDER_H #define ARROW_CSV_COLUMN_BUILDER_H +#include #include -#include #include "arrow/array.h" -#include "arrow/csv/converter.h" -#include "arrow/csv/options.h" -#include "arrow/memory_pool.h" #include "arrow/status.h" -#include "arrow/table.h" -#include "arrow/type.h" -#include "arrow/util/task-group.h" #include "arrow/util/visibility.h" namespace arrow { + +class ChunkedArray; +class DataType; + +namespace internal { + +class TaskGroup; + +} // namespace internal + namespace csv { +class BlockParser; +struct ConvertOptions; + class ARROW_EXPORT ColumnBuilder { public: virtual ~ColumnBuilder() = default; diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc index 7d8bff870ba84..8a249a68c07ec 100644 --- a/cpp/src/arrow/csv/converter.cc +++ b/cpp/src/arrow/csv/converter.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include "arrow/builder.h" #include "arrow/csv/parser.h" diff --git a/cpp/src/arrow/csv/parser.h b/cpp/src/arrow/csv/parser.h index 8a515744ee2d9..fdddc37a2c0fb 100644 --- a/cpp/src/arrow/csv/parser.h +++ b/cpp/src/arrow/csv/parser.h @@ -18,6 +18,7 @@ #ifndef ARROW_CSV_PARSER_H #define ARROW_CSV_PARSER_H +#include #include #include #include diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc index 8cf74d6b99901..b2a6b7b430ad0 100644 --- a/cpp/src/arrow/csv/reader.cc +++ b/cpp/src/arrow/csv/reader.cc @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include #include "arrow/buffer.h" diff --git a/cpp/src/arrow/io/buffered.cc b/cpp/src/arrow/io/buffered.cc index 0c04ac21c208e..f3eae39c8e62e 100644 --- a/cpp/src/arrow/io/buffered.cc +++ b/cpp/src/arrow/io/buffered.cc @@ -21,10 +21,10 @@ #include #include #include -#include #include #include "arrow/buffer.h" +#include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/util/logging.h" #include "arrow/util/string_view.h" diff --git a/cpp/src/arrow/io/buffered.h b/cpp/src/arrow/io/buffered.h index e4374ba8079d3..d5079556c7cfc 100644 --- a/cpp/src/arrow/io/buffered.h +++ b/cpp/src/arrow/io/buffered.h @@ -29,6 +29,7 @@ namespace arrow { +class Buffer; class MemoryPool; class Status; diff --git a/cpp/src/arrow/ipc/feather-test.cc b/cpp/src/arrow/ipc/feather-test.cc index b0be28925cf23..8139c47e09fca 100644 --- a/cpp/src/arrow/ipc/feather-test.cc +++ b/cpp/src/arrow/ipc/feather-test.cc @@ -30,6 +30,7 @@ #include "arrow/pretty_print.h" #include "arrow/record_batch.h" #include "arrow/status.h" +#include "arrow/table.h" #include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" diff --git a/cpp/src/arrow/ipc/json-simple-test.cc b/cpp/src/arrow/ipc/json-simple-test.cc index 45525212d2f4b..84a2210157f53 100644 --- a/cpp/src/arrow/ipc/json-simple-test.cc +++ b/cpp/src/arrow/ipc/json-simple-test.cc @@ -34,6 +34,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/decimal.h" #if 
defined(_MSC_VER) // "warning C4307: '+': integral constant overflow" diff --git a/cpp/src/arrow/memory_pool-test.h b/cpp/src/arrow/memory_pool-test.h index 34523a181ba1e..fc86d943ec116 100644 --- a/cpp/src/arrow/memory_pool-test.h +++ b/cpp/src/arrow/memory_pool-test.h @@ -16,6 +16,7 @@ // under the License. #include +#include #include #include diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc index 0a27141b447f7..d62db32b062ac 100644 --- a/cpp/src/arrow/memory_pool.cc +++ b/cpp/src/arrow/memory_pool.cc @@ -17,18 +17,16 @@ #include "arrow/memory_pool.h" -#include -#include -#include -#include -#include -#include +#include // IWYU pragma: keep +#include // IWYU pragma: keep +#include // IWYU pragma: keep +#include // IWYU pragma: keep #include #include #include // IWYU pragma: keep #include "arrow/status.h" -#include "arrow/util/logging.h" +#include "arrow/util/logging.h" // IWYU pragma: keep #ifdef ARROW_JEMALLOC // Needed to support jemalloc 3 and 4 diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc index 8434e59b0ce79..a1acfb81aeff1 100644 --- a/cpp/src/arrow/pretty_print-test.cc +++ b/cpp/src/arrow/pretty_print-test.cc @@ -26,12 +26,10 @@ #include "arrow/array.h" #include "arrow/builder.h" -#include "arrow/memory_pool.h" #include "arrow/pretty_print.h" #include "arrow/table.h" #include "arrow/test-util.h" #include "arrow/type.h" -#include "arrow/util/decimal.h" namespace arrow { @@ -342,7 +340,7 @@ TEST_F(TestPrettyPrint, DictionaryType) { TEST_F(TestPrettyPrint, ChunkedArrayPrimitiveType) { auto array = ArrayFromJSON(int32(), "[0, 1, null, 3, null]"); - ChunkedArray chunked_array({array}); + ChunkedArray chunked_array(array); static const char* expected = R"expected([ [ diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index ec23bfb00fcde..c524039c3e86a 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -19,7 +19,7 @@ #include #include #include -#include +#include // IWYU pragma: keep #include #include #include diff --git a/cpp/src/arrow/pretty_print.h b/cpp/src/arrow/pretty_print.h index fde6c293f9b68..ca50bc0bc993c 100644 --- a/cpp/src/arrow/pretty_print.h +++ b/cpp/src/arrow/pretty_print.h @@ -21,14 +21,17 @@ #include #include -#include "arrow/type_fwd.h" #include "arrow/util/visibility.h" namespace arrow { class Array; +class Column; class ChunkedArray; +class RecordBatch; +class Schema; class Status; +class Table; struct PrettyPrintOptions { PrettyPrintOptions(int indent_arg, int window_arg = 10, int indent_size_arg = 2, diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index f9a5ea1b0d67e..da288d3c6868e 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -539,33 +540,27 @@ Status NumPyConverter::Visit(const BinaryType& type) { auto data = reinterpret_cast(PyArray_DATA(arr_)); - int item_length = 0; + auto AppendNotNull = [&builder, this](const uint8_t* data) { + // This is annoying. NumPy allows strings to have nul-terminators, so + // we must check for them here + const size_t item_size = + strnlen(reinterpret_cast(data), static_cast(itemsize_)); + return builder.Append(data, static_cast(item_size)); + }; + if (mask_ != nullptr) { Ndarray1DIndexer mask_values(mask_); for (int64_t i = 0; i < length_; ++i) { if (mask_values[i]) { RETURN_NOT_OK(builder.AppendNull()); } else { - // This is annoying. 
NumPy allows strings to have nul-terminators, so - // we must check for them here - for (item_length = 0; item_length < itemsize_; ++item_length) { - if (data[item_length] == 0) { - break; - } - } - RETURN_NOT_OK(builder.Append(data, item_length)); + RETURN_NOT_OK(AppendNotNull(data)); } data += stride_; } } else { for (int64_t i = 0; i < length_; ++i) { - for (item_length = 0; item_length < itemsize_; ++item_length) { - // Look for nul-terminator - if (data[item_length] == 0) { - break; - } - } - RETURN_NOT_OK(builder.Append(data, item_length)); + RETURN_NOT_OK(AppendNotNull(data)); data += stride_; } } diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc index 2d15ce45b3b7f..7443c54845630 100644 --- a/cpp/src/arrow/python/python-test.cc +++ b/cpp/src/arrow/python/python-test.cc @@ -25,6 +25,7 @@ #include "arrow/builder.h" #include "arrow/table.h" #include "arrow/test-util.h" +#include "arrow/util/decimal.h" #include "arrow/python/arrow_to_pandas.h" #include "arrow/python/decimal.h" diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index 674b68b40fa6e..ceb6885da621e 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -32,6 +32,7 @@ namespace arrow { class Array; struct ArrayData; class Status; +class Table; /// \class RecordBatch /// \brief Collection of equal-length arrays matching a particular Schema diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index 6b5733252879b..2ac34b4cde57d 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -44,6 +44,11 @@ class ARROW_EXPORT ChunkedArray { /// The vector should be non-empty and all its elements should have the same /// data type. explicit ChunkedArray(const ArrayVector& chunks); + + /// \brief Construct a chunked array from a single Array + explicit ChunkedArray(const std::shared_ptr& chunk) + : ChunkedArray(ArrayVector({chunk})) {} + /// \brief Construct a chunked array from a vector of arrays and a data type /// /// As the data type is passed explicitly, the vector may be empty. 
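
Taken together, the additions shown above (the implicit Datum constructors in cpp/src/arrow/compute/kernel.h and the single-chunk ChunkedArray constructor in cpp/src/arrow/table.h) let call sites drop the explicit wrapping. The following is a minimal, hypothetical usage sketch, not part of the patch; it assumes only the public headers named in the includes.

    #include <memory>

    #include "arrow/api.h"             // Array, ChunkedArray, Status
    #include "arrow/compute/kernel.h"  // arrow::compute::Datum

    // Hypothetical helper: wrap a plain Array so that APIs expecting a
    // ChunkedArray or a Datum can consume it without boilerplate.
    arrow::Status WrapForCompute(const std::shared_ptr<arrow::Array>& values) {
      // Single-chunk constructor added in cpp/src/arrow/table.h above.
      auto chunked = std::make_shared<arrow::ChunkedArray>(values);

      // Implicit conversions added in cpp/src/arrow/compute/kernel.h above:
      // no explicit Datum(...) wrapping is needed at the call site.
      arrow::compute::Datum array_datum = values;
      arrow::compute::Datum chunked_datum = chunked;

      return arrow::Status::OK();
    }
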
diff --git a/cpp/src/arrow/tensor.cc b/cpp/src/arrow/tensor.cc index 589ee995e2181..792945b1740f3 100644 --- a/cpp/src/arrow/tensor.cc +++ b/cpp/src/arrow/tensor.cc @@ -17,6 +17,7 @@ #include "arrow/tensor.h" +#include #include #include #include diff --git a/cpp/src/arrow/test-util.cc b/cpp/src/arrow/test-util.cc index 38e07dd060ae4..8c5f36417f881 100644 --- a/cpp/src/arrow/test-util.cc +++ b/cpp/src/arrow/test-util.cc @@ -18,13 +18,12 @@ #include "arrow/test-util.h" #ifndef _WIN32 -#include -#include -#include +#include // IWYU pragma: keep +#include // IWYU pragma: keep +#include // IWYU pragma: keep #endif #include -#include #include #include #include @@ -33,23 +32,17 @@ #include #include #include -#include #include #include #include "arrow/array.h" #include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/ipc/json-simple.h" -#include "arrow/memory_pool.h" #include "arrow/pretty_print.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/bit-util.h" -#include "arrow/util/decimal.h" #include "arrow/util/logging.h" namespace arrow { diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 7829ac25678a9..7fe7685f5a39f 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -17,23 +17,17 @@ #pragma once -#ifndef _WIN32 -#include -#include -#include -#endif - #include -#include #include #include +#include #include #include #include #include #include #include -#include +#include #include #include @@ -43,13 +37,13 @@ #include "arrow/builder.h" #include "arrow/memory_pool.h" #include "arrow/pretty_print.h" +#include "arrow/record_batch.h" #include "arrow/status.h" -#include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" -#include "arrow/util/decimal.h" #include "arrow/util/logging.h" +#include "arrow/util/macros.h" #include "arrow/util/visibility.h" #define STRINGIFY(x) #x @@ -102,6 +96,10 @@ namespace arrow { +class ChunkedArray; +class Column; +class Table; + using ArrayVector = std::vector>; #define ASSERT_ARRAYS_EQUAL(LEFT, RIGHT) \ diff --git a/cpp/src/arrow/util/compression_lz4.cc b/cpp/src/arrow/util/compression_lz4.cc index 0acd54d057218..97fd46ab6c587 100644 --- a/cpp/src/arrow/util/compression_lz4.cc +++ b/cpp/src/arrow/util/compression_lz4.cc @@ -18,6 +18,7 @@ #include "arrow/util/compression_lz4.h" #include +#include #include #include diff --git a/cpp/src/arrow/util/int-util-test.cc b/cpp/src/arrow/util/int-util-test.cc index 51fd96e4ea25a..018eeda7248a3 100644 --- a/cpp/src/arrow/util/int-util-test.cc +++ b/cpp/src/arrow/util/int-util-test.cc @@ -17,14 +17,12 @@ #include #include -#include #include #include #include #include -#include "arrow/test-util.h" #include "arrow/util/int-util.h" namespace arrow { diff --git a/cpp/src/arrow/util/string_view.h b/cpp/src/arrow/util/string_view.h index 2ee594a9e9ad3..0f35483e3738e 100644 --- a/cpp/src/arrow/util/string_view.h +++ b/cpp/src/arrow/util/string_view.h @@ -18,7 +18,7 @@ #ifndef ARROW_UTIL_STRING_VIEW_H #define ARROW_UTIL_STRING_VIEW_H -#include "arrow/util/string_view/string_view.hpp" +#include "arrow/util/string_view/string_view.hpp" // IWYU pragma: export namespace arrow { namespace util { diff --git a/cpp/src/parquet/arrow/CMakeLists.txt b/cpp/src/parquet/arrow/CMakeLists.txt index 9372c3110a3af..89afc39a23376 100644 --- a/cpp/src/parquet/arrow/CMakeLists.txt +++ b/cpp/src/parquet/arrow/CMakeLists.txt @@ -18,8 +18,11 @@ 
ADD_PARQUET_TEST(arrow-schema-test) ADD_PARQUET_TEST(arrow-reader-writer-test) -ADD_ARROW_BENCHMARK(reader-writer-benchmark +ADD_BENCHMARK(reader-writer-benchmark PREFIX "parquet-arrow" EXTRA_LINK_LIBS ${PARQUET_BENCHMARK_LINK_LIBRARIES}) +if (TARGET parquet-arrow-reader-writer-benchmark) + add_dependencies(parquet parquet-arrow-reader-writer-benchmark) +endif() ARROW_INSTALL_ALL_HEADERS("parquet/arrow") diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index 24ec0dd24eec3..07124ebb3057a 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -464,7 +464,11 @@ class TestParquetIO : public ::testing::Test { ASSERT_OK_NO_THROW(file_reader->GetColumn(0, &column_reader)); ASSERT_NE(nullptr, column_reader.get()); - ASSERT_OK(column_reader->NextBatch(SMALL_SIZE, out)); + std::shared_ptr chunked_out; + ASSERT_OK(column_reader->NextBatch(SMALL_SIZE, &chunked_out)); + + ASSERT_EQ(1, chunked_out->num_chunks()); + *out = chunked_out->chunk(0); ASSERT_NE(nullptr, out->get()); } @@ -1745,10 +1749,11 @@ TEST(TestArrowReadWrite, ListLargeRecords) { std::vector> pieces; for (int i = 0; i < num_rows; ++i) { - std::shared_ptr piece; - ASSERT_OK(col_reader->NextBatch(1, &piece)); - ASSERT_EQ(1, piece->length()); - pieces.push_back(piece); + std::shared_ptr chunked_piece; + ASSERT_OK(col_reader->NextBatch(1, &chunked_piece)); + ASSERT_EQ(1, chunked_piece->length()); + ASSERT_EQ(1, chunked_piece->num_chunks()); + pieces.push_back(chunked_piece->chunk(0)); } auto chunked = std::make_shared<::arrow::ChunkedArray>(pieces); diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 6273fda464025..2a7730d42ad23 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -32,6 +32,9 @@ #include "arrow/util/logging.h" #include "arrow/util/thread-pool.h" +// For arrow::compute::Datum. This should perhaps be promoted. 
See ARROW-4022 +#include "arrow/compute/kernel.h" + #include "parquet/arrow/record_reader.h" #include "parquet/arrow/schema.h" #include "parquet/column_reader.h" @@ -46,6 +49,7 @@ using arrow::Array; using arrow::BooleanArray; +using arrow::ChunkedArray; using arrow::Column; using arrow::Field; using arrow::Int32Array; @@ -57,6 +61,9 @@ using arrow::StructArray; using arrow::Table; using arrow::TimestampArray; +// For Array/ChunkedArray variant +using arrow::compute::Datum; + using parquet::schema::Node; // Help reduce verbosity @@ -85,6 +92,19 @@ static inline int64_t impala_timestamp_to_nanoseconds(const Int96& impala_timest template using ArrayType = typename ::arrow::TypeTraits::ArrayType; +namespace { + +Status GetSingleChunk(const ChunkedArray& chunked, std::shared_ptr* out) { + DCHECK_GT(chunked.num_chunks(), 0); + if (chunked.num_chunks() > 1) { + return Status::Invalid("Function call returned a chunked array"); + } + *out = chunked.chunk(0); + return Status::OK(); +} + +} // namespace + // ---------------------------------------------------------------------- // Iteration utilities @@ -223,15 +243,18 @@ class FileReader::Impl { virtual ~Impl() {} Status GetColumn(int i, std::unique_ptr* out); - Status ReadSchemaField(int i, std::shared_ptr* out); + + Status ReadSchemaField(int i, std::shared_ptr* out); Status ReadSchemaField(int i, const std::vector& indices, - std::shared_ptr* out); + std::shared_ptr* out); + Status ReadColumn(int i, std::shared_ptr* out); + Status ReadColumnChunk(int column_index, int row_group_index, + std::shared_ptr* out); + Status GetReaderForNode(int index, const Node* node, const std::vector& indices, int16_t def_level, std::unique_ptr* out); - Status ReadColumn(int i, std::shared_ptr* out); - Status ReadColumnChunk(int column_index, int row_group_index, - std::shared_ptr* out); + Status GetSchema(std::shared_ptr<::arrow::Schema>* out); Status GetSchema(const std::vector& indices, std::shared_ptr<::arrow::Schema>* out); @@ -267,7 +290,8 @@ class FileReader::Impl { class ColumnReader::ColumnReaderImpl { public: virtual ~ColumnReaderImpl() {} - virtual Status NextBatch(int64_t records_to_read, std::shared_ptr* out) = 0; + virtual Status NextBatch(int64_t records_to_read, + std::shared_ptr* out) = 0; virtual Status GetDefLevels(const int16_t** data, size_t* length) = 0; virtual Status GetRepLevels(const int16_t** data, size_t* length) = 0; virtual const std::shared_ptr field() = 0; @@ -283,10 +307,10 @@ class PARQUET_NO_EXPORT PrimitiveImpl : public ColumnReader::ColumnReaderImpl { NextRowGroup(); } - Status NextBatch(int64_t records_to_read, std::shared_ptr* out) override; + Status NextBatch(int64_t records_to_read, std::shared_ptr* out) override; template - Status WrapIntoListArray(std::shared_ptr* array); + Status WrapIntoListArray(Datum* inout_array); Status GetDefLevels(const int16_t** data, size_t* length) override; Status GetRepLevels(const int16_t** data, size_t* length) override; @@ -314,7 +338,7 @@ class PARQUET_NO_EXPORT StructImpl : public ColumnReader::ColumnReaderImpl { InitField(node, children); } - Status NextBatch(int64_t records_to_read, std::shared_ptr* out) override; + Status NextBatch(int64_t records_to_read, std::shared_ptr* out) override; Status GetDefLevels(const int16_t** data, size_t* length) override; Status GetRepLevels(const int16_t** data, size_t* length) override; const std::shared_ptr field() override { return field_; } @@ -395,7 +419,7 @@ Status FileReader::Impl::GetReaderForNode( return Status::OK(); } -Status 
FileReader::Impl::ReadSchemaField(int i, std::shared_ptr* out) { +Status FileReader::Impl::ReadSchemaField(int i, std::shared_ptr* out) { std::vector indices(reader_->metadata()->num_columns()); for (size_t j = 0; j < indices.size(); ++j) { @@ -406,7 +430,7 @@ Status FileReader::Impl::ReadSchemaField(int i, std::shared_ptr* out) { } Status FileReader::Impl::ReadSchemaField(int i, const std::vector& indices, - std::shared_ptr* out) { + std::shared_ptr* out) { auto parquet_schema = reader_->metadata()->schema(); auto node = parquet_schema->group_node()->field(i).get(); @@ -432,7 +456,7 @@ Status FileReader::Impl::ReadSchemaField(int i, const std::vector& indices, return reader->NextBatch(records_to_read, out); } -Status FileReader::Impl::ReadColumn(int i, std::shared_ptr* out) { +Status FileReader::Impl::ReadColumn(int i, std::shared_ptr* out) { std::unique_ptr flat_column_reader; RETURN_NOT_OK(GetColumn(i, &flat_column_reader)); @@ -452,7 +476,7 @@ Status FileReader::Impl::GetSchema(const std::vector& indices, } Status FileReader::Impl::ReadColumnChunk(int column_index, int row_group_index, - std::shared_ptr* out) { + std::shared_ptr* out) { auto rg_metadata = reader_->metadata()->RowGroup(row_group_index); int64_t records_to_read = rg_metadata->ColumnChunk(column_index)->num_values(); @@ -463,10 +487,7 @@ Status FileReader::Impl::ReadColumnChunk(int column_index, int row_group_index, new PrimitiveImpl(pool_, std::move(input))); ColumnReader flat_column_reader(std::move(impl)); - std::shared_ptr array; - RETURN_NOT_OK(flat_column_reader.NextBatch(records_to_read, &array)); - *out = array; - return Status::OK(); + return flat_column_reader.NextBatch(records_to_read, out); } Status FileReader::Impl::ReadRowGroup(int row_group_index, @@ -485,7 +506,7 @@ Status FileReader::Impl::ReadRowGroup(int row_group_index, auto ReadColumnFunc = [&indices, &row_group_index, &schema, &columns, this](int i) { int column_index = indices[i]; - std::shared_ptr array; + std::shared_ptr array; RETURN_NOT_OK(ReadColumnChunk(column_index, row_group_index, &array)); columns[i] = std::make_shared(schema->field(i), array); return Status::OK(); @@ -532,7 +553,7 @@ Status FileReader::Impl::ReadTable(const std::vector& indices, std::vector> columns(num_fields); auto ReadColumnFunc = [&indices, &field_indices, &schema, &columns, this](int i) { - std::shared_ptr array; + std::shared_ptr array; RETURN_NOT_OK(ReadSchemaField(field_indices[i], indices, &array)); columns[i] = std::make_shared(schema->field(i), array); return Status::OK(); @@ -576,8 +597,6 @@ Status FileReader::Impl::ReadTable(std::shared_ptr
* table) { Status FileReader::Impl::ReadRowGroups(const std::vector& row_groups, const std::vector& indices, std::shared_ptr
* table) { - // TODO(PARQUET-1393): Modify the record readers to already read this into a single, - // continuous array. std::vector> tables(row_groups.size(), nullptr); for (size_t i = 0; i < row_groups.size(); ++i) { @@ -633,7 +652,7 @@ Status FileReader::GetSchema(const std::vector& indices, return impl_->GetSchema(indices, out); } -Status FileReader::ReadColumn(int i, std::shared_ptr* out) { +Status FileReader::ReadColumn(int i, std::shared_ptr* out) { try { return impl_->ReadColumn(i, out); } catch (const ::parquet::ParquetException& e) { @@ -641,7 +660,7 @@ Status FileReader::ReadColumn(int i, std::shared_ptr* out) { } } -Status FileReader::ReadSchemaField(int i, std::shared_ptr* out) { +Status FileReader::ReadSchemaField(int i, std::shared_ptr* out) { try { return impl_->ReadSchemaField(i, out); } catch (const ::parquet::ParquetException& e) { @@ -649,6 +668,18 @@ Status FileReader::ReadSchemaField(int i, std::shared_ptr* out) { } } +Status FileReader::ReadColumn(int i, std::shared_ptr* out) { + std::shared_ptr chunked_out; + RETURN_NOT_OK(ReadColumn(i, &chunked_out)); + return GetSingleChunk(*chunked_out, out); +} + +Status FileReader::ReadSchemaField(int i, std::shared_ptr* out) { + std::shared_ptr chunked_out; + RETURN_NOT_OK(ReadSchemaField(i, &chunked_out)); + return GetSingleChunk(*chunked_out, out); +} + Status FileReader::GetRecordBatchReader(const std::vector& row_group_indices, std::shared_ptr* out) { std::vector indices(impl_->num_columns()); @@ -764,7 +795,28 @@ const ParquetFileReader* FileReader::parquet_reader() const { } template -Status PrimitiveImpl::WrapIntoListArray(std::shared_ptr* array) { +Status PrimitiveImpl::WrapIntoListArray(Datum* inout_array) { + if (descr_->max_repetition_level() == 0) { + // Flat, no action + return Status::OK(); + } + + std::shared_ptr flat_array; + + // ARROW-3762(wesm): If inout_array is a chunked array, we reject as this is + // not yet implemented + if (inout_array->kind() == Datum::CHUNKED_ARRAY) { + if (inout_array->chunked_array()->num_chunks() > 1) { + return Status::NotImplemented( + "Nested data conversions not implemented for " + "chunked array outputs"); + } + flat_array = inout_array->chunked_array()->chunk(0); + } else { + DCHECK_EQ(Datum::ARRAY, inout_array->kind()); + flat_array = inout_array->make_array(); + } + const int16_t* def_levels = record_reader_->def_levels(); const int16_t* rep_levels = record_reader_->rep_levels(); const int64_t total_levels_read = record_reader_->levels_position(); @@ -775,110 +827,106 @@ Status PrimitiveImpl::WrapIntoListArray(std::shared_ptr* array) { &arrow_schema)); std::shared_ptr current_field = arrow_schema->field(0); - if (descr_->max_repetition_level() > 0) { - // Walk downwards to extract nullability - std::vector nullable; - std::vector> offset_builders; - std::vector> valid_bits_builders; - nullable.push_back(current_field->nullable()); - while (current_field->type()->num_children() > 0) { - if (current_field->type()->num_children() > 1) { - return Status::NotImplemented( - "Fields with more than one child are not supported."); - } else { - if (current_field->type()->id() != ::arrow::Type::LIST) { - return Status::NotImplemented( - "Currently only nesting with Lists is supported."); - } - current_field = current_field->type()->child(0); + // Walk downwards to extract nullability + std::vector nullable; + std::vector> offset_builders; + std::vector> valid_bits_builders; + nullable.push_back(current_field->nullable()); + while (current_field->type()->num_children() > 0) { + if 
(current_field->type()->num_children() > 1) { + return Status::NotImplemented("Fields with more than one child are not supported."); + } else { + if (current_field->type()->id() != ::arrow::Type::LIST) { + return Status::NotImplemented("Currently only nesting with Lists is supported."); } - offset_builders.emplace_back( - std::make_shared<::arrow::Int32Builder>(::arrow::int32(), pool_)); - valid_bits_builders.emplace_back( - std::make_shared<::arrow::BooleanBuilder>(::arrow::boolean(), pool_)); - nullable.push_back(current_field->nullable()); + current_field = current_field->type()->child(0); } + offset_builders.emplace_back( + std::make_shared<::arrow::Int32Builder>(::arrow::int32(), pool_)); + valid_bits_builders.emplace_back( + std::make_shared<::arrow::BooleanBuilder>(::arrow::boolean(), pool_)); + nullable.push_back(current_field->nullable()); + } - int64_t list_depth = offset_builders.size(); - // This describes the minimal definition that describes a level that - // reflects a value in the primitive values array. - int16_t values_def_level = descr_->max_definition_level(); - if (nullable[nullable.size() - 1]) { - values_def_level--; - } + int64_t list_depth = offset_builders.size(); + // This describes the minimal definition that describes a level that + // reflects a value in the primitive values array. + int16_t values_def_level = descr_->max_definition_level(); + if (nullable[nullable.size() - 1]) { + values_def_level--; + } - // The definition levels that are needed so that a list is declared - // as empty and not null. - std::vector empty_def_level(list_depth); - int def_level = 0; - for (int i = 0; i < list_depth; i++) { - if (nullable[i]) { - def_level++; - } - empty_def_level[i] = static_cast(def_level); + // The definition levels that are needed so that a list is declared + // as empty and not null. 
+ std::vector empty_def_level(list_depth); + int def_level = 0; + for (int i = 0; i < list_depth; i++) { + if (nullable[i]) { def_level++; } + empty_def_level[i] = static_cast(def_level); + def_level++; + } - int32_t values_offset = 0; - std::vector null_counts(list_depth, 0); - for (int64_t i = 0; i < total_levels_read; i++) { - int16_t rep_level = rep_levels[i]; - if (rep_level < descr_->max_repetition_level()) { - for (int64_t j = rep_level; j < list_depth; j++) { - if (j == (list_depth - 1)) { - RETURN_NOT_OK(offset_builders[j]->Append(values_offset)); - } else { - RETURN_NOT_OK(offset_builders[j]->Append( - static_cast(offset_builders[j + 1]->length()))); - } + int32_t values_offset = 0; + std::vector null_counts(list_depth, 0); + for (int64_t i = 0; i < total_levels_read; i++) { + int16_t rep_level = rep_levels[i]; + if (rep_level < descr_->max_repetition_level()) { + for (int64_t j = rep_level; j < list_depth; j++) { + if (j == (list_depth - 1)) { + RETURN_NOT_OK(offset_builders[j]->Append(values_offset)); + } else { + RETURN_NOT_OK(offset_builders[j]->Append( + static_cast(offset_builders[j + 1]->length()))); + } - if (((empty_def_level[j] - 1) == def_levels[i]) && (nullable[j])) { - RETURN_NOT_OK(valid_bits_builders[j]->Append(false)); - null_counts[j]++; + if (((empty_def_level[j] - 1) == def_levels[i]) && (nullable[j])) { + RETURN_NOT_OK(valid_bits_builders[j]->Append(false)); + null_counts[j]++; + break; + } else { + RETURN_NOT_OK(valid_bits_builders[j]->Append(true)); + if (empty_def_level[j] == def_levels[i]) { break; - } else { - RETURN_NOT_OK(valid_bits_builders[j]->Append(true)); - if (empty_def_level[j] == def_levels[i]) { - break; - } } } } - if (def_levels[i] >= values_def_level) { - values_offset++; - } } - // Add the final offset to all lists - for (int64_t j = 0; j < list_depth; j++) { - if (j == (list_depth - 1)) { - RETURN_NOT_OK(offset_builders[j]->Append(values_offset)); - } else { - RETURN_NOT_OK(offset_builders[j]->Append( - static_cast(offset_builders[j + 1]->length()))); - } + if (def_levels[i] >= values_def_level) { + values_offset++; } - - std::vector> offsets; - std::vector> valid_bits; - std::vector list_lengths; - for (int64_t j = 0; j < list_depth; j++) { - list_lengths.push_back(offset_builders[j]->length() - 1); - std::shared_ptr array; - RETURN_NOT_OK(offset_builders[j]->Finish(&array)); - offsets.emplace_back(std::static_pointer_cast(array)->values()); - RETURN_NOT_OK(valid_bits_builders[j]->Finish(&array)); - valid_bits.emplace_back(std::static_pointer_cast(array)->values()); + } + // Add the final offset to all lists + for (int64_t j = 0; j < list_depth; j++) { + if (j == (list_depth - 1)) { + RETURN_NOT_OK(offset_builders[j]->Append(values_offset)); + } else { + RETURN_NOT_OK(offset_builders[j]->Append( + static_cast(offset_builders[j + 1]->length()))); } + } - std::shared_ptr output(*array); - for (int64_t j = list_depth - 1; j >= 0; j--) { - auto list_type = - ::arrow::list(::arrow::field("item", output->type(), nullable[j + 1])); - output = std::make_shared<::arrow::ListArray>( - list_type, list_lengths[j], offsets[j], output, valid_bits[j], null_counts[j]); - } - *array = output; + std::vector> offsets; + std::vector> valid_bits; + std::vector list_lengths; + for (int64_t j = 0; j < list_depth; j++) { + list_lengths.push_back(offset_builders[j]->length() - 1); + std::shared_ptr array; + RETURN_NOT_OK(offset_builders[j]->Finish(&array)); + offsets.emplace_back(std::static_pointer_cast(array)->values()); + 
RETURN_NOT_OK(valid_bits_builders[j]->Finish(&array)); + valid_bits.emplace_back(std::static_pointer_cast(array)->values()); + } + + std::shared_ptr output = flat_array; + for (int64_t j = list_depth - 1; j >= 0; j--) { + auto list_type = + ::arrow::list(::arrow::field("item", output->type(), nullable[j + 1])); + output = std::make_shared<::arrow::ListArray>(list_type, list_lengths[j], offsets[j], + output, valid_bits[j], null_counts[j]); } + *inout_array = output; return Status::OK(); } @@ -909,8 +957,7 @@ struct TransferFunctor { using ParquetCType = typename ParquetType::c_type; Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { static_assert(!std::is_same::value, "The fast path transfer functor should be used " "for primitive values"); @@ -938,8 +985,7 @@ template struct TransferFunctor> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { int64_t length = reader->values_written(); std::shared_ptr values = reader->ReleaseValues(); @@ -957,8 +1003,7 @@ struct TransferFunctor struct TransferFunctor<::arrow::BooleanType, BooleanType> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { int64_t length = reader->values_written(); std::shared_ptr data; @@ -991,8 +1036,7 @@ struct TransferFunctor<::arrow::BooleanType, BooleanType> { template <> struct TransferFunctor<::arrow::TimestampType, Int96Type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { int64_t length = reader->values_written(); auto values = reinterpret_cast(reader->values()); @@ -1019,8 +1063,7 @@ struct TransferFunctor<::arrow::TimestampType, Int96Type> { template <> struct TransferFunctor<::arrow::Date64Type, Int32Type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { int64_t length = reader->values_written(); auto values = reinterpret_cast(reader->values()); @@ -1046,19 +1089,24 @@ struct TransferFunctor<::arrow::Date64Type, Int32Type> { template struct TransferFunctor< ArrowType, ParquetType, - typename std::enable_if::value || - std::is_same::value>::type> { + typename std::enable_if< + (std::is_base_of<::arrow::BinaryType, ArrowType>::value || + std::is_same<::arrow::FixedSizeBinaryType, ArrowType>::value) && + (std::is_same::value || + std::is_same::value)>::type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { - RETURN_NOT_OK(reader->builder()->Finish(out)); + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { + std::vector> chunks = reader->GetBuilderChunks(); if (type->id() == ::arrow::Type::STRING) { // Convert from BINARY type to STRING - auto new_data = (*out)->data()->Copy(); - new_data->type = type; - *out = ::arrow::MakeArray(new_data); + for (size_t i = 0; i < chunks.size(); ++i) { + auto new_data = chunks[i]->data()->Copy(); + new_data->type = type; + chunks[i] = 
::arrow::MakeArray(new_data); + } } + *out = std::make_shared(chunks); return Status::OK(); } }; @@ -1166,121 +1214,133 @@ static inline void RawBytesToDecimalBytes(const uint8_t* value, int32_t byte_wid BytesToIntegerPair(value, byte_width, high, low); } -/// \brief Convert an array of FixedLenByteArrays to an arrow::Decimal128Array -/// We do this by: -/// 1. Creating a arrow::FixedSizeBinaryArray from the RecordReader's builder -/// 2. Allocating a buffer for the arrow::Decimal128Array -/// 3. Converting the big-endian bytes in the FixedSizeBinaryArray to two integers -/// representing the high and low bits of each decimal value. +// ---------------------------------------------------------------------- +// BYTE_ARRAY / FIXED_LEN_BYTE_ARRAY -> Decimal128 + +template +Status ConvertToDecimal128(const Array& array, const std::shared_ptr<::arrow::DataType>&, + MemoryPool* pool, std::shared_ptr*) { + return Status::NotImplemented("not implemented"); +} + template <> -struct TransferFunctor<::arrow::Decimal128Type, FLBAType> { - Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { - DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); +Status ConvertToDecimal128(const Array& array, + const std::shared_ptr<::arrow::DataType>& type, + MemoryPool* pool, std::shared_ptr* out) { + const auto& fixed_size_binary_array = + static_cast(array); - // Finish the built data into a temporary array - std::shared_ptr array; - RETURN_NOT_OK(reader->builder()->Finish(&array)); - const auto& fixed_size_binary_array = - static_cast(*array); + // The byte width of each decimal value + const int32_t type_length = + static_cast(*type).byte_width(); - // Get the byte width of the values in the FixedSizeBinaryArray. Most of the time - // this will be different from the decimal array width because we write the minimum - // number of bytes necessary to represent a given precision - const int32_t byte_width = - static_cast(*fixed_size_binary_array.type()) - .byte_width(); + // number of elements in the entire array + const int64_t length = fixed_size_binary_array.length(); - // The byte width of each decimal value - const int32_t type_length = - static_cast(*type).byte_width(); + // Get the byte width of the values in the FixedSizeBinaryArray. 
Most of the time + // this will be different from the decimal array width because we write the minimum + // number of bytes necessary to represent a given precision + const int32_t byte_width = + static_cast(*fixed_size_binary_array.type()) + .byte_width(); - // number of elements in the entire array - const int64_t length = fixed_size_binary_array.length(); + // allocate memory for the decimal array + std::shared_ptr data; + RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data)); - // allocate memory for the decimal array - std::shared_ptr data; - RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data)); - - // raw bytes that we can write to - uint8_t* out_ptr = data->mutable_data(); - - // convert each FixedSizeBinary value to valid decimal bytes - const int64_t null_count = fixed_size_binary_array.null_count(); - if (null_count > 0) { - for (int64_t i = 0; i < length; ++i, out_ptr += type_length) { - if (!fixed_size_binary_array.IsNull(i)) { - RawBytesToDecimalBytes(fixed_size_binary_array.GetValue(i), byte_width, - out_ptr); - } - } - } else { - for (int64_t i = 0; i < length; ++i, out_ptr += type_length) { + // raw bytes that we can write to + uint8_t* out_ptr = data->mutable_data(); + + // convert each FixedSizeBinary value to valid decimal bytes + const int64_t null_count = fixed_size_binary_array.null_count(); + if (null_count > 0) { + for (int64_t i = 0; i < length; ++i, out_ptr += type_length) { + if (!fixed_size_binary_array.IsNull(i)) { RawBytesToDecimalBytes(fixed_size_binary_array.GetValue(i), byte_width, out_ptr); } } - - *out = std::make_shared<::arrow::Decimal128Array>( - type, length, data, fixed_size_binary_array.null_bitmap(), null_count); - return Status::OK(); + } else { + for (int64_t i = 0; i < length; ++i, out_ptr += type_length) { + RawBytesToDecimalBytes(fixed_size_binary_array.GetValue(i), byte_width, out_ptr); + } } -}; -/// \brief Convert an arrow::BinaryArray to an arrow::Decimal128Array -/// We do this by: -/// 1. Creating an arrow::BinaryArray from the RecordReader's builder -/// 2. Allocating a buffer for the arrow::Decimal128Array -/// 3. Converting the big-endian bytes in each BinaryArray entry to two integers -/// representing the high and low bits of each decimal value. 
-template <> -struct TransferFunctor<::arrow::Decimal128Type, ByteArrayType> { - Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { - DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); + *out = std::make_shared<::arrow::Decimal128Array>( + type, length, data, fixed_size_binary_array.null_bitmap(), null_count); - // Finish the built data into a temporary array - std::shared_ptr array; - RETURN_NOT_OK(reader->builder()->Finish(&array)); - const auto& binary_array = static_cast(*array); + return Status::OK(); +} - const int64_t length = binary_array.length(); +template <> +Status ConvertToDecimal128(const Array& array, + const std::shared_ptr<::arrow::DataType>& type, + MemoryPool* pool, std::shared_ptr* out) { + const auto& binary_array = static_cast(array); + const int64_t length = binary_array.length(); - const auto& decimal_type = static_cast(*type); - const int64_t type_length = decimal_type.byte_width(); + const auto& decimal_type = static_cast(*type); + const int64_t type_length = decimal_type.byte_width(); - std::shared_ptr data; - RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data)); + std::shared_ptr data; + RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data)); - // raw bytes that we can write to - uint8_t* out_ptr = data->mutable_data(); + // raw bytes that we can write to + uint8_t* out_ptr = data->mutable_data(); - const int64_t null_count = binary_array.null_count(); + const int64_t null_count = binary_array.null_count(); - // convert each BinaryArray value to valid decimal bytes - for (int64_t i = 0; i < length; i++, out_ptr += type_length) { - int32_t record_len = 0; - const uint8_t* record_loc = binary_array.GetValue(i, &record_len); + // convert each BinaryArray value to valid decimal bytes + for (int64_t i = 0; i < length; i++, out_ptr += type_length) { + int32_t record_len = 0; + const uint8_t* record_loc = binary_array.GetValue(i, &record_len); - if ((record_len < 0) || (record_len > type_length)) { - return Status::Invalid("Invalid BYTE_ARRAY size"); - } + if ((record_len < 0) || (record_len > type_length)) { + return Status::Invalid("Invalid BYTE_ARRAY size"); + } - auto out_ptr_view = reinterpret_cast(out_ptr); - out_ptr_view[0] = 0; - out_ptr_view[1] = 0; + auto out_ptr_view = reinterpret_cast(out_ptr); + out_ptr_view[0] = 0; + out_ptr_view[1] = 0; - // only convert rows that are not null if there are nulls, or - // all rows, if there are not - if (((null_count > 0) && !binary_array.IsNull(i)) || (null_count <= 0)) { - RawBytesToDecimalBytes(record_loc, record_len, out_ptr); - } + // only convert rows that are not null if there are nulls, or + // all rows, if there are not + if (((null_count > 0) && !binary_array.IsNull(i)) || (null_count <= 0)) { + RawBytesToDecimalBytes(record_loc, record_len, out_ptr); } + } + + *out = std::make_shared<::arrow::Decimal128Array>( + type, length, data, binary_array.null_bitmap(), null_count); + return Status::OK(); +} + +/// \brief Convert an arrow::BinaryArray to an arrow::Decimal128Array +/// We do this by: +/// 1. Creating an arrow::BinaryArray from the RecordReader's builder +/// 2. Allocating a buffer for the arrow::Decimal128Array +/// 3. Converting the big-endian bytes in each BinaryArray entry to two integers +/// representing the high and low bits of each decimal value. 
+template +struct TransferFunctor< + ArrowType, ParquetType, + typename std::enable_if::value && + (std::is_same::value || + std::is_same::value)>::type> { + Status operator()(RecordReader* reader, MemoryPool* pool, + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { + DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); - *out = std::make_shared<::arrow::Decimal128Array>( - type, length, data, binary_array.null_bitmap(), null_count); + ::arrow::ArrayVector chunks = reader->GetBuilderChunks(); + for (size_t i = 0; i < chunks.size(); ++i) { + std::shared_ptr chunk_as_decimal; + RETURN_NOT_OK( + ConvertToDecimal128(*chunks[i], type, pool, &chunk_as_decimal)); + + // Replace the chunk, which will hopefully also free memory as we go + chunks[i] = chunk_as_decimal; + } + *out = std::make_shared(chunks); return Status::OK(); } }; @@ -1295,7 +1355,7 @@ template ::value>::type> static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + Datum* out) { DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); const int64_t length = reader->values_written(); @@ -1342,8 +1402,7 @@ static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool, template <> struct TransferFunctor<::arrow::Decimal128Type, Int32Type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { return DecimalIntegerTransfer(reader, pool, type, out); } }; @@ -1351,23 +1410,23 @@ struct TransferFunctor<::arrow::Decimal128Type, Int32Type> { template <> struct TransferFunctor<::arrow::Decimal128Type, Int64Type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { return DecimalIntegerTransfer(reader, pool, type, out); } }; -#define TRANSFER_DATA(ArrowType, ParquetType) \ - TransferFunctor func; \ - RETURN_NOT_OK(func(record_reader_.get(), pool_, field_->type(), out)); \ - RETURN_NOT_OK(WrapIntoListArray(out)) +#define TRANSFER_DATA(ArrowType, ParquetType) \ + TransferFunctor func; \ + RETURN_NOT_OK(func(record_reader_.get(), pool_, field_->type(), &result)); \ + RETURN_NOT_OK(WrapIntoListArray(&result)) #define TRANSFER_CASE(ENUM, ArrowType, ParquetType) \ case ::arrow::Type::ENUM: { \ TRANSFER_DATA(ArrowType, ParquetType); \ } break; -Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr* out) { +Status PrimitiveImpl::NextBatch(int64_t records_to_read, + std::shared_ptr* out) { try { // Pre-allocation gives much better performance for flat columns record_reader_->Reserve(records_to_read); @@ -1387,6 +1446,7 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr* return ::arrow::Status::IOError(e.what()); } + Datum result; switch (field_->type()->id()) { TRANSFER_CASE(BOOL, ::arrow::BooleanType, BooleanType) TRANSFER_CASE(UINT8, ::arrow::UInt8Type, Int32Type) @@ -1405,8 +1465,8 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr* TRANSFER_CASE(DATE64, ::arrow::Date64Type, Int32Type) TRANSFER_CASE(FIXED_SIZE_BINARY, ::arrow::FixedSizeBinaryType, FLBAType) case ::arrow::Type::NA: { - *out = std::make_shared<::arrow::NullArray>(record_reader_->values_written()); - RETURN_NOT_OK(WrapIntoListArray(out)); + result = std::make_shared<::arrow::NullArray>(record_reader_->values_written()); + 
RETURN_NOT_OK(WrapIntoListArray(&result)); break; } case ::arrow::Type::DECIMAL: { @@ -1452,6 +1512,15 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr* return Status::NotImplemented(ss.str()); } + DCHECK_NE(result.kind(), Datum::NONE); + + if (result.kind() == Datum::ARRAY) { + *out = std::make_shared(result.make_array()); + } else if (result.kind() == Datum::CHUNKED_ARRAY) { + *out = result.chunked_array(); + } else { + DCHECK(false) << "Should be impossible"; + } return Status::OK(); } @@ -1477,10 +1546,17 @@ ColumnReader::ColumnReader(std::unique_ptr impl) ColumnReader::~ColumnReader() {} -Status ColumnReader::NextBatch(int64_t records_to_read, std::shared_ptr* out) { +Status ColumnReader::NextBatch(int64_t records_to_read, + std::shared_ptr* out) { return impl_->NextBatch(records_to_read, out); } +Status ColumnReader::NextBatch(int64_t records_to_read, std::shared_ptr* out) { + std::shared_ptr chunked_out; + RETURN_NOT_OK(impl_->NextBatch(records_to_read, &chunked_out)); + return GetSingleChunk(*chunked_out, out); +} + // StructImpl methods Status StructImpl::DefLevelsToNullArray(std::shared_ptr* null_bitmap_out, @@ -1565,17 +1641,21 @@ Status StructImpl::GetRepLevels(const int16_t** data, size_t* length) { return Status::NotImplemented("GetRepLevels is not implemented for struct"); } -Status StructImpl::NextBatch(int64_t records_to_read, std::shared_ptr* out) { +Status StructImpl::NextBatch(int64_t records_to_read, + std::shared_ptr* out) { std::vector> children_arrays; std::shared_ptr null_bitmap; int64_t null_count; // Gather children arrays and def levels for (auto& child : children_) { - std::shared_ptr child_array; + std::shared_ptr field; + RETURN_NOT_OK(child->NextBatch(records_to_read, &field)); - RETURN_NOT_OK(child->NextBatch(records_to_read, &child_array)); - children_arrays.push_back(child_array); + if (field->num_chunks() > 1) { + return Status::Invalid("Chunked field reads not yet supported with StructArray"); + } + children_arrays.push_back(field->chunk(0)); } RETURN_NOT_OK(DefLevelsToNullArray(&null_bitmap, &null_count)); @@ -1589,8 +1669,9 @@ Status StructImpl::NextBatch(int64_t records_to_read, std::shared_ptr* ou } } - *out = std::make_shared(field()->type(), struct_length, children_arrays, - null_bitmap, null_count); + auto result = std::make_shared(field()->type(), struct_length, + children_arrays, null_bitmap, null_count); + *out = std::make_shared(result); return Status::OK(); } @@ -1613,10 +1694,16 @@ RowGroupReader::~RowGroupReader() {} RowGroupReader::RowGroupReader(FileReader::Impl* impl, int row_group_index) : impl_(impl), row_group_index_(row_group_index) {} -Status ColumnChunkReader::Read(std::shared_ptr<::arrow::Array>* out) { +Status ColumnChunkReader::Read(std::shared_ptr<::arrow::ChunkedArray>* out) { return impl_->ReadColumnChunk(column_index_, row_group_index_, out); } +Status ColumnChunkReader::Read(std::shared_ptr<::arrow::Array>* out) { + std::shared_ptr chunked_out; + RETURN_NOT_OK(impl_->ReadColumnChunk(column_index_, row_group_index_, &chunked_out)); + return GetSingleChunk(*chunked_out, out); +} + ColumnChunkReader::~ColumnChunkReader() {} ColumnChunkReader::ColumnChunkReader(FileReader::Impl* impl, int row_group_index, diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 2cd94ca28fdcb..5286e742b08c1 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -30,6 +30,7 @@ namespace arrow { class Array; +class ChunkedArray; class MemoryPool; class 
RecordBatchReader; class Schema; @@ -125,6 +126,10 @@ class PARQUET_EXPORT FileReader { std::shared_ptr<::arrow::Schema>* out); // Read column as a whole into an Array. + ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::ChunkedArray>* out); + + /// \note Deprecated since 0.12 + ARROW_DEPRECATED("Use version with ChunkedArray output") ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::Array>* out); // NOTE: Experimental API @@ -139,27 +144,11 @@ class PARQUET_EXPORT FileReader { // 2 foo3 // // i=0 will read the entire foo struct, i=1 the foo2 primitive column etc - ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::Array>* out); + ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::ChunkedArray>* out); - // NOTE: Experimental API - // Reads a specific top level schema field into an Array, while keeping only chosen - // leaf columns. - // The index i refers the index of the top level schema field, which may - // be nested or flat, and indices vector refers to the leaf column indices - e.g. - // - // i indices - // 0 0 foo.bar - // 0 1 foo.bar.baz - // 0 2 foo.qux - // 1 3 foo2 - // 2 4 foo3 - // - // i=0 indices={0,2} will read a partial struct with foo.bar and foo.quox columns - // i=1 indices={3} will read foo2 column - // i=1 indices={2} will result in out=nullptr - // leaf indices which are unrelated to the schema field are ignored - ::arrow::Status ReadSchemaField(int i, const std::vector& indices, - std::shared_ptr<::arrow::Array>* out); + /// \note Deprecated since 0.12 + ARROW_DEPRECATED("Use version with ChunkedArray output") + ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::Array>* out); /// \brief Return a RecordBatchReader of row groups selected from row_group_indices, the /// ordering in row_group_indices matters. @@ -248,6 +237,10 @@ class PARQUET_EXPORT RowGroupReader { class PARQUET_EXPORT ColumnChunkReader { public: + ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out); + + /// \note Deprecated since 0.12 + ARROW_DEPRECATED("Use version with ChunkedArray output") ::arrow::Status Read(std::shared_ptr<::arrow::Array>* out); virtual ~ColumnChunkReader(); @@ -281,6 +274,11 @@ class PARQUET_EXPORT ColumnReader { // // Returns Status::OK on a successful read, including if you have exhausted // the data available in the file. 
+ ::arrow::Status NextBatch(int64_t batch_size, + std::shared_ptr<::arrow::ChunkedArray>* out); + + /// \note Deprecated since 0.12 + ARROW_DEPRECATED("Use version with ChunkedArray output") ::arrow::Status NextBatch(int64_t batch_size, std::shared_ptr<::arrow::Array>* out); private: diff --git a/cpp/src/parquet/arrow/record_reader.cc b/cpp/src/parquet/arrow/record_reader.cc index 4a3cd526b118a..d1bf2c5cdfdc6 100644 --- a/cpp/src/parquet/arrow/record_reader.cc +++ b/cpp/src/parquet/arrow/record_reader.cc @@ -86,14 +86,6 @@ class RecordReader::RecordReaderImpl { valid_bits_ = AllocateBuffer(pool); def_levels_ = AllocateBuffer(pool); rep_levels_ = AllocateBuffer(pool); - - if (descr->physical_type() == Type::BYTE_ARRAY) { - builder_.reset(new ::arrow::BinaryBuilder(pool)); - } else if (descr->physical_type() == Type::FIXED_LEN_BYTE_ARRAY) { - int byte_width = descr->type_length(); - std::shared_ptr<::arrow::DataType> type = ::arrow::fixed_size_binary(byte_width); - builder_.reset(new ::arrow::FixedSizeBinaryBuilder(type, pool)); - } Reset(); } @@ -229,8 +221,6 @@ class RecordReader::RecordReaderImpl { return result; } - ::arrow::ArrayBuilder* builder() { return builder_.get(); } - // Process written repetition/definition levels to reach the end of // records. Process no more levels than necessary to delimit the indicated // number of logical records. Updates internal state of RecordReader @@ -375,7 +365,7 @@ class RecordReader::RecordReaderImpl { records_read_ = 0; - // Calling Finish on the builders also resets them + // Call Finish on the binary builders to reset them } void ResetValues() { @@ -391,6 +381,8 @@ class RecordReader::RecordReaderImpl { virtual void DebugPrintState() = 0; + virtual std::vector> GetBuilderChunks() = 0; + protected: virtual bool ReadNewPage() = 0; @@ -434,9 +426,6 @@ class RecordReader::RecordReaderImpl { int64_t levels_position_; int64_t levels_capacity_; - // TODO(wesm): ByteArray / FixedLenByteArray types - std::unique_ptr<::arrow::ArrayBuilder> builder_; - std::shared_ptr<::arrow::ResizableBuffer> values_; template @@ -449,13 +438,32 @@ class RecordReader::RecordReaderImpl { std::shared_ptr<::arrow::ResizableBuffer> rep_levels_; }; +template +struct RecordReaderTraits { + using BuilderType = ::arrow::ArrayBuilder; +}; + +template <> +struct RecordReaderTraits { + using BuilderType = ::arrow::internal::ChunkedBinaryBuilder; +}; + +template <> +struct RecordReaderTraits { + using BuilderType = ::arrow::FixedSizeBinaryBuilder; +}; + template class TypedRecordReader : public RecordReader::RecordReaderImpl { public: - typedef typename DType::c_type T; + using T = typename DType::c_type; - TypedRecordReader(const ColumnDescriptor* schema, ::arrow::MemoryPool* pool) - : RecordReader::RecordReaderImpl(schema, pool), current_decoder_(nullptr) {} + using BuilderType = typename RecordReaderTraits::BuilderType; + + TypedRecordReader(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool) + : RecordReader::RecordReaderImpl(descr, pool), current_decoder_(nullptr) { + InitializeBuilder(); + } void ResetDecoders() override { decoders_.clear(); } @@ -546,6 +554,10 @@ class TypedRecordReader : public RecordReader::RecordReaderImpl { std::cout << std::endl; } + std::vector> GetBuilderChunks() override { + throw ParquetException("GetChunks only implemented for binary types"); + } + private: typedef Decoder DecoderType; @@ -554,11 +566,15 @@ class TypedRecordReader : public RecordReader::RecordReaderImpl { // plain-encoded data. 
std::unordered_map> decoders_; + std::unique_ptr builder_; + DecoderType* current_decoder_; // Advance to the next data page bool ReadNewPage() override; + void InitializeBuilder() {} + void ConfigureDictionary(const DictionaryPage* page); }; @@ -572,6 +588,36 @@ void TypedRecordReader::DebugPrintState() {} template <> void TypedRecordReader::DebugPrintState() {} +template <> +void TypedRecordReader::InitializeBuilder() { + // Maximum of 16MB chunks + constexpr int32_t kBinaryChunksize = 1 << 24; + DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); + builder_.reset(new ::arrow::internal::ChunkedBinaryBuilder(kBinaryChunksize, pool_)); +} + +template <> +void TypedRecordReader::InitializeBuilder() { + DCHECK_EQ(descr_->physical_type(), Type::FIXED_LEN_BYTE_ARRAY); + int byte_width = descr_->type_length(); + std::shared_ptr<::arrow::DataType> type = ::arrow::fixed_size_binary(byte_width); + builder_.reset(new ::arrow::FixedSizeBinaryBuilder(type, pool_)); +} + +template <> +::arrow::ArrayVector TypedRecordReader::GetBuilderChunks() { + ::arrow::ArrayVector chunks; + PARQUET_THROW_NOT_OK(builder_->Finish(&chunks)); + return chunks; +} + +template <> +::arrow::ArrayVector TypedRecordReader::GetBuilderChunks() { + std::shared_ptr<::arrow::Array> chunk; + PARQUET_THROW_NOT_OK(builder_->Finish(&chunk)); + return ::arrow::ArrayVector({chunk}); +} + template <> inline void TypedRecordReader::ReadValuesDense(int64_t values_to_read) { auto values = ValuesHead(); @@ -579,10 +625,9 @@ inline void TypedRecordReader::ReadValuesDense(int64_t values_to_ current_decoder_->Decode(values, static_cast(values_to_read)); DCHECK_EQ(num_decoded, values_to_read); - auto builder = static_cast<::arrow::BinaryBuilder*>(builder_.get()); for (int64_t i = 0; i < num_decoded; i++) { PARQUET_THROW_NOT_OK( - builder->Append(values[i].ptr, static_cast(values[i].len))); + builder_->Append(values[i].ptr, static_cast(values[i].len))); } ResetValues(); } @@ -594,9 +639,8 @@ inline void TypedRecordReader::ReadValuesDense(int64_t values_to_read) current_decoder_->Decode(values, static_cast(values_to_read)); DCHECK_EQ(num_decoded, values_to_read); - auto builder = static_cast<::arrow::FixedSizeBinaryBuilder*>(builder_.get()); for (int64_t i = 0; i < num_decoded; i++) { - PARQUET_THROW_NOT_OK(builder->Append(values[i].ptr)); + PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr)); } ResetValues(); } @@ -613,14 +657,12 @@ inline void TypedRecordReader::ReadValuesSpaced(int64_t values_to valid_bits_offset); DCHECK_EQ(num_decoded, values_to_read); - auto builder = static_cast<::arrow::BinaryBuilder*>(builder_.get()); - for (int64_t i = 0; i < num_decoded; i++) { if (::arrow::BitUtil::GetBit(valid_bits, valid_bits_offset + i)) { PARQUET_THROW_NOT_OK( - builder->Append(values[i].ptr, static_cast(values[i].len))); + builder_->Append(values[i].ptr, static_cast(values[i].len))); } else { - PARQUET_THROW_NOT_OK(builder->AppendNull()); + PARQUET_THROW_NOT_OK(builder_->AppendNull()); } } ResetValues(); @@ -638,12 +680,11 @@ inline void TypedRecordReader::ReadValuesSpaced(int64_t values_to_read valid_bits_offset); DCHECK_EQ(num_decoded, values_to_read); - auto builder = static_cast<::arrow::FixedSizeBinaryBuilder*>(builder_.get()); for (int64_t i = 0; i < num_decoded; i++) { if (::arrow::BitUtil::GetBit(valid_bits, valid_bits_offset + i)) { - PARQUET_THROW_NOT_OK(builder->Append(values[i].ptr)); + PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr)); } else { - PARQUET_THROW_NOT_OK(builder->AppendNull()); + 
PARQUET_THROW_NOT_OK(builder_->AppendNull()); } } ResetValues(); @@ -845,8 +886,6 @@ std::shared_ptr RecordReader::ReleaseIsValid() { return impl_->ReleaseIsValid(); } -::arrow::ArrayBuilder* RecordReader::builder() { return impl_->builder(); } - int64_t RecordReader::values_written() const { return impl_->values_written(); } int64_t RecordReader::levels_position() const { return impl_->levels_position(); } @@ -863,6 +902,10 @@ void RecordReader::SetPageReader(std::unique_ptr reader) { impl_->SetPageReader(std::move(reader)); } +::arrow::ArrayVector RecordReader::GetBuilderChunks() { + return impl_->GetBuilderChunks(); +} + void RecordReader::DebugPrintState() { impl_->DebugPrintState(); } } // namespace internal diff --git a/cpp/src/parquet/arrow/record_reader.h b/cpp/src/parquet/arrow/record_reader.h index 7efd0d54899fe..0f62b744f323a 100644 --- a/cpp/src/parquet/arrow/record_reader.h +++ b/cpp/src/parquet/arrow/record_reader.h @@ -20,6 +20,7 @@ #include #include +#include #include "arrow/memory_pool.h" @@ -28,7 +29,7 @@ namespace arrow { -class ArrayBuilder; +class Array; } // namespace arrow @@ -77,7 +78,6 @@ class RecordReader { std::shared_ptr ReleaseValues(); std::shared_ptr ReleaseIsValid(); - ::arrow::ArrayBuilder* builder(); /// \brief Number of values written including nulls (if any) int64_t values_written() const; @@ -106,6 +106,9 @@ class RecordReader { void DebugPrintState(); + // For BYTE_ARRAY, FIXED_LEN_BYTE_ARRAY types that may have chunked output + std::vector> GetBuilderChunks(); + private: std::unique_ptr impl_; explicit RecordReader(RecordReaderImpl* impl); diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 9e1a24961af0e..b63e72c57cfa8 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -19,7 +19,7 @@ # cython: language_level = 3 from pyarrow.includes.common cimport * -from pyarrow.includes.libarrow cimport (CArray, CSchema, CStatus, +from pyarrow.includes.libarrow cimport (CChunkedArray, CSchema, CStatus, CTable, CMemoryPool, CKeyValueMetadata, RandomAccessFile, OutputStream, @@ -272,8 +272,8 @@ cdef extern from "parquet/arrow/reader.h" namespace "parquet::arrow" nogil: cdef cppclass FileReader: FileReader(CMemoryPool* pool, unique_ptr[ParquetFileReader] reader) - CStatus ReadColumn(int i, shared_ptr[CArray]* out) - CStatus ReadSchemaField(int i, shared_ptr[CArray]* out) + CStatus ReadColumn(int i, shared_ptr[CChunkedArray]* out) + CStatus ReadSchemaField(int i, shared_ptr[CChunkedArray]* out) int num_row_groups() CStatus ReadRowGroup(int i, shared_ptr[CTable]* out) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 8112504e9e403..36a4d345c6a3d 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -26,6 +26,7 @@ from pyarrow.lib cimport (Array, Schema, check_status, MemoryPool, maybe_unbox_memory_pool, Table, + pyarrow_wrap_chunked_array, pyarrow_wrap_schema, pyarrow_wrap_table, NativeFile, get_reader, get_writer) @@ -770,28 +771,18 @@ cdef class ParquetReader: return self._column_idx_map[tobytes(column_name)] def read_column(self, int column_index): - cdef: - Array array = Array() - shared_ptr[CArray] carray - + cdef shared_ptr[CChunkedArray] out with nogil: check_status(self.reader.get() - .ReadColumn(column_index, &carray)) - - array.init(carray) - return array + .ReadColumn(column_index, &out)) + return pyarrow_wrap_chunked_array(out) def read_schema_field(self, int field_index): - cdef: - Array array = Array() - shared_ptr[CArray] carray - + cdef 
shared_ptr[CChunkedArray] out with nogil: check_status(self.reader.get() - .ReadSchemaField(field_index, &carray)) - - array.init(carray) - return array + .ReadSchemaField(field_index, &out)) + return pyarrow_wrap_chunked_array(out) cdef class ParquetWriter: diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 745a049e32a7c..3e628263ba36f 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -396,6 +396,8 @@ cdef object pyarrow_wrap_metadata( # cdef public object pyarrow_wrap_array(const shared_ptr[CArray]& sp_array) +cdef public object pyarrow_wrap_chunked_array( + const shared_ptr[CChunkedArray]& sp_array) # XXX pyarrow.h calls it `wrap_record_batch` cdef public object pyarrow_wrap_batch(const shared_ptr[CRecordBatch]& cbatch) cdef public object pyarrow_wrap_buffer(const shared_ptr[CBuffer]& buf) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 89d3224580463..5c27a9b86a369 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -1959,6 +1959,33 @@ def test_large_table_int32_overflow(): _write_table(table, f) +@pytest.mark.large_memory +def test_binary_array_overflow_to_chunked(): + # ARROW-3762 + + # 2^31 + 1 bytes + values = [b'x'] + [ + b'x' * (1 << 20) + ] * 2 * (1 << 10) + df = pd.DataFrame({'byte_col': values}) + + tbl = pa.Table.from_pandas(df, preserve_index=False) + + buf = io.BytesIO() + _write_table(tbl, buf) + buf.seek(0) + read_tbl = _read_table(buf) + buf = None + + col0_data = read_tbl[0].data + assert isinstance(col0_data, pa.ChunkedArray) + + # Split up into 16MB chunks. 128 * 16 = 2048, so 129 + assert col0_data.num_chunks == 129 + + assert tbl.equals(read_tbl) + + def test_index_column_name_duplicate(tempdir): data = { 'close': { From ce12fb55107e2ee5439267fe1a17ded8d2210849 Mon Sep 17 00:00:00 2001 From: Pindikura Ravindra Date: Sat, 15 Dec 2018 05:48:01 +0530 Subject: [PATCH 041/328] ARROW-1807: [Java] consolidate bufs to reduce heap (#3121) - for fixed-len vectors, alloc a combined arrow buf for value and validity. - Remove the read-write locks in AllocationMgr, they contribute about 150 bytes to the heap, and aren't very useful since there isn't much contention. 
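
To make the intent concrete, here is a minimal sketch of the combined-allocation idea described above (illustration only, not the patch code: the class and method names are made up, and it assumes the pre-1.0 `io.netty.buffer.ArrowBuf` / `BufferAllocator` API in use here):

```java
import io.netty.buffer.ArrowBuf;
import org.apache.arrow.memory.BufferAllocator;

class CombinedAllocationSketch {
  // Allocate the value and validity regions from a single ArrowBuf and return
  // two retained slices; releasing the parent leaves the slices as the only
  // owners, so the vector needs just one allocation (and one ledger entry).
  static ArrowBuf[] allocate(BufferAllocator allocator,
                             long valueBufferSize, long validityBufferSize) {
    int valueSlice = (int) (((valueBufferSize + 7) / 8) * 8);  // 8-byte align
    int validitySlice = (int) validityBufferSize;

    ArrowBuf combined = allocator.buffer(valueSlice + validitySlice);

    ArrowBuf valueBuffer = combined.slice(0, valueSlice);
    valueBuffer.retain();
    ArrowBuf validityBuffer = combined.slice(valueSlice, validitySlice);
    validityBuffer.retain();

    combined.release();  // the retained slices keep the underlying memory alive
    return new ArrowBuf[] {valueBuffer, validityBuffer};
  }
}
```
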
--- .../arrow/memory/AllocationManager.java | 34 ++----- .../arrow/vector/BaseFixedWidthVector.java | 94 ++++++++++++------- .../vector/TestBufferOwnershipTransfer.java | 5 +- .../apache/arrow/vector/TestListVector.java | 10 +- 4 files changed, 73 insertions(+), 70 deletions(-) diff --git a/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java index aaa1f506fb5c2..687674f951b89 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java @@ -22,11 +22,8 @@ import java.util.IdentityHashMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; -import java.util.concurrent.locks.ReadWriteLock; -import java.util.concurrent.locks.ReentrantReadWriteLock; import org.apache.arrow.memory.BaseAllocator.Verbosity; -import org.apache.arrow.memory.util.AutoCloseableLock; import org.apache.arrow.memory.util.HistoricalLog; import org.apache.arrow.util.Preconditions; @@ -73,9 +70,6 @@ public class AllocationManager { // ARROW-1627 Trying to minimize memory overhead caused by previously used IdentityHashMap // see JIRA for details private final LowCostIdentityHashMap map = new LowCostIdentityHashMap<>(); - private final ReadWriteLock lock = new ReentrantReadWriteLock(); - private final AutoCloseableLock readLock = new AutoCloseableLock(lock.readLock()); - private final AutoCloseableLock writeLock = new AutoCloseableLock(lock.writeLock()); private final long amCreationTime = System.nanoTime(); private volatile BufferLedger owningLedger; @@ -115,9 +109,8 @@ private BufferLedger associate(final BaseAllocator allocator, final boolean reta "A buffer can only be associated between two allocators that share the same root."); } - try (AutoCloseableLock read = readLock.open()) { - - final BufferLedger ledger = map.get(allocator); + synchronized (this) { + BufferLedger ledger = map.get(allocator); if (ledger != null) { if (retain) { ledger.inc(); @@ -125,20 +118,7 @@ private BufferLedger associate(final BaseAllocator allocator, final boolean reta return ledger; } - } - try (AutoCloseableLock write = writeLock.open()) { - // we have to recheck existing ledger since a second reader => writer could be competing - // with us. - - final BufferLedger existingLedger = map.get(allocator); - if (existingLedger != null) { - if (retain) { - existingLedger.inc(); - } - return existingLedger; - } - - final BufferLedger ledger = new BufferLedger(allocator); + ledger = new BufferLedger(allocator); if (retain) { ledger.inc(); } @@ -153,7 +133,7 @@ private BufferLedger associate(final BaseAllocator allocator, final boolean reta * The way that a particular BufferLedger communicates back to the AllocationManager that it * now longer needs to hold * a reference to particular piece of memory. - * Can only be called when you already hold the writeLock. + * Can only be called when you already hold the lock. */ private void release(final BufferLedger ledger) { final BaseAllocator allocator = ledger.getAllocator(); @@ -250,7 +230,7 @@ public boolean transferBalance(final BufferLedger target) { // since two balance transfers out from the allocator manager could cause incorrect // accounting, we need to ensure // that this won't happen by synchronizing on the allocator manager instance. 
- try (AutoCloseableLock write = writeLock.open()) { + synchronized (this) { if (owningLedger != this) { return true; } @@ -330,7 +310,7 @@ public int decrement(int decrement) { allocator.assertOpen(); final int outcome; - try (AutoCloseableLock write = writeLock.open()) { + synchronized (this) { outcome = bufRefCnt.addAndGet(-decrement); if (outcome == 0) { lDestructionTime = System.nanoTime(); @@ -431,7 +411,7 @@ public int getSize() { * @return Amount of accounted(owned) memory associated with this ledger. */ public int getAccountedSize() { - try (AutoCloseableLock read = readLock.open()) { + synchronized (this) { if (owningLedger == this) { return size; } else { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java index bc0b77a0aeb0a..f69a9d1754ac7 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java @@ -270,7 +270,7 @@ public boolean allocateNewSafe() { long curAllocationSizeValue = valueAllocationSizeInBytes; long curAllocationSizeValidity = validityAllocationSizeInBytes; - if (curAllocationSizeValue > MAX_ALLOCATION_SIZE) { + if (align(curAllocationSizeValue) + curAllocationSizeValidity > MAX_ALLOCATION_SIZE) { throw new OversizedAllocationException("Requested amount of memory exceeds limit"); } @@ -302,7 +302,7 @@ public void allocateNew(int valueCount) { valueBufferSize = validityBufferSize; } - if (valueBufferSize > MAX_ALLOCATION_SIZE) { + if (align(valueBufferSize) + validityBufferSize > MAX_ALLOCATION_SIZE) { throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); } @@ -317,6 +317,13 @@ public void allocateNew(int valueCount) { } } + /* + * align to a 8-byte value. + */ + private long align(long size) { + return ((size + 7) / 8) * 8; + } + /** * Actual memory allocation is done by this function. All the calculations * and knowledge about what size to allocate is upto the callers of this @@ -327,14 +334,24 @@ public void allocateNew(int valueCount) { * conditions. */ private void allocateBytes(final long valueBufferSize, final long validityBufferSize) { - /* allocate data buffer */ - int curSize = (int) valueBufferSize; - valueBuffer = allocator.buffer(curSize); + int valueBufferSlice = (int)align(valueBufferSize); + int validityBufferSlice = (int)validityBufferSize; + + /* allocate combined buffer */ + ArrowBuf buffer = allocator.buffer(valueBufferSlice + validityBufferSlice); + + valueAllocationSizeInBytes = valueBufferSlice; + valueBuffer = buffer.slice(0, valueBufferSlice); + valueBuffer.retain(); valueBuffer.readerIndex(0); - valueAllocationSizeInBytes = curSize; - /* allocate validity buffer */ - allocateValidityBuffer((int) validityBufferSize); + + validityAllocationSizeInBytes = validityBufferSlice; + validityBuffer = buffer.slice(valueBufferSlice, validityBufferSlice); + validityBuffer.retain(); + validityBuffer.readerIndex(0); zeroVector(); + + buffer.release(); } /** @@ -422,43 +439,50 @@ public ArrowBuf[] getBuffers(boolean clear) { */ @Override public void reAlloc() { - valueBuffer = reallocBufferHelper(valueBuffer, true); - validityBuffer = reallocBufferHelper(validityBuffer, false); - } - - /** - * Helper method for reallocating a particular internal buffer - * Returns the new buffer. 
- */ - private ArrowBuf reallocBufferHelper(ArrowBuf buffer, final boolean dataBuffer) { - final int currentBufferCapacity = buffer.capacity(); - long baseSize = (dataBuffer ? valueAllocationSizeInBytes - : validityAllocationSizeInBytes); + int valueBaseSize = Integer.max(valueBuffer.capacity(), valueAllocationSizeInBytes); + long newValueBufferSlice = align(valueBaseSize * 2L); + long newValidityBufferSlice; + if (typeWidth > 0) { + long targetValueBufferSize = align(BaseAllocator.nextPowerOfTwo(newValueBufferSlice)); + long targetValueCount = targetValueBufferSize / typeWidth; + targetValueBufferSize -= getValidityBufferSizeFromCount((int) targetValueCount); + if (newValueBufferSlice < targetValueBufferSize) { + newValueBufferSlice = targetValueBufferSize; + } - if (baseSize < (long) currentBufferCapacity) { - baseSize = (long) currentBufferCapacity; + newValidityBufferSlice = getValidityBufferSizeFromCount((int)(newValueBufferSlice / typeWidth)); + } else { + newValidityBufferSlice = newValueBufferSlice; } - long newAllocationSize = baseSize * 2L; - newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); + long newAllocationSize = newValueBufferSlice + newValidityBufferSlice; assert newAllocationSize >= 1; if (newAllocationSize > MAX_ALLOCATION_SIZE) { throw new OversizedAllocationException("Unable to expand the buffer"); } - final ArrowBuf newBuf = allocator.buffer((int) newAllocationSize); - newBuf.setBytes(0, buffer, 0, currentBufferCapacity); - newBuf.setZero(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); - buffer.release(1); - buffer = newBuf; - if (dataBuffer) { - valueAllocationSizeInBytes = (int) newAllocationSize; - } else { - validityAllocationSizeInBytes = (int) newAllocationSize; - } + final ArrowBuf newBuffer = allocator.buffer((int) newAllocationSize); + final ArrowBuf newValueBuffer = newBuffer.slice(0, (int)newValueBufferSlice); + newValueBuffer.setBytes(0, valueBuffer, 0, valueBuffer.capacity()); + newValueBuffer.setZero(valueBuffer.capacity(), (int)newValueBufferSlice - valueBuffer.capacity()); + newValueBuffer.retain(); + newValueBuffer.readerIndex(0); + valueBuffer.release(); + valueBuffer = newValueBuffer; + valueAllocationSizeInBytes = (int)newValueBufferSlice; + + final ArrowBuf newValidityBuffer = newBuffer.slice((int)newValueBufferSlice, + (int)newValidityBufferSlice); + newValidityBuffer.setBytes(0, validityBuffer, 0, validityBuffer.capacity()); + newValidityBuffer.setZero(validityBuffer.capacity(), (int)newValidityBufferSlice - validityBuffer.capacity()); + newValidityBuffer.retain(); + newValidityBuffer.readerIndex(0); + validityBuffer.release(); + validityBuffer = newValidityBuffer; + validityAllocationSizeInBytes = (int)newValidityBufferSlice; - return buffer; + newBuffer.release(); } @Override diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java b/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java index 48bc8936d9fbe..9165343bfdc2b 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java @@ -40,15 +40,14 @@ public void testTransferFixedWidth() { IntVector v1 = new IntVector("v1", childAllocator1); v1.allocateNew(); v1.setValueCount(4095); + long totalAllocatedMemory = childAllocator1.getAllocatedMemory(); IntVector v2 = new IntVector("v2", childAllocator2); v1.makeTransferPair(v2).transfer(); assertEquals(0, 
childAllocator1.getAllocatedMemory()); - int expectedBitVector = 512; - int expectedValueVector = 4096 * 4; - assertEquals(expectedBitVector + expectedValueVector, childAllocator2.getAllocatedMemory()); + assertEquals(totalAllocatedMemory, childAllocator2.getAllocatedMemory()); } @Test diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java index 4e8d8f0f39944..68102b1c32a46 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java @@ -774,13 +774,13 @@ public void testSetInitialCapacity() { vector.setInitialCapacity(512); vector.allocateNew(); assertEquals(512, vector.getValueCapacity()); - assertEquals(4096, vector.getDataVector().getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 512 * 5); /* use density as 4 */ vector.setInitialCapacity(512, 4); vector.allocateNew(); assertEquals(512, vector.getValueCapacity()); - assertEquals(512 * 4, vector.getDataVector().getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 512 * 4); /** * inner value capacity we pass to data vector is 512 * 0.1 => 51 @@ -793,7 +793,7 @@ public void testSetInitialCapacity() { vector.setInitialCapacity(512, 0.1); vector.allocateNew(); assertEquals(512, vector.getValueCapacity()); - assertEquals(64, vector.getDataVector().getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 51); /** * inner value capacity we pass to data vector is 512 * 0.01 => 5 @@ -806,7 +806,7 @@ public void testSetInitialCapacity() { vector.setInitialCapacity(512, 0.01); vector.allocateNew(); assertEquals(512, vector.getValueCapacity()); - assertEquals(8, vector.getDataVector().getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 5); /** * inner value capacity we pass to data vector is 5 * 0.1 => 0 @@ -822,7 +822,7 @@ public void testSetInitialCapacity() { vector.setInitialCapacity(5, 0.1); vector.allocateNew(); assertEquals(7, vector.getValueCapacity()); - assertEquals(1, vector.getDataVector().getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 1); } } From e098651a12f8199936f48f523e4f062a411969f7 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 15 Dec 2018 17:17:09 +0100 Subject: [PATCH 042/328] ARROW-3971: [Python] Remove deprecations in 0.11 and prior Author: Wes McKinney Closes #3180 from wesm/ARROW-3971 and squashes the following commits: 2a367f5d Remove Python deprecations in 0.11 and prior --- python/pyarrow/_parquet.pyx | 2 +- python/pyarrow/feather.py | 6 ----- python/pyarrow/filesystem.py | 5 +---- python/pyarrow/formatting.py | 43 ------------------------------------ python/pyarrow/parquet.py | 15 +++++-------- python/pyarrow/util.py | 16 +++++--------- 6 files changed, 14 insertions(+), 73 deletions(-) delete mode 100644 python/pyarrow/formatting.py diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 36a4d345c6a3d..2e92bac9a74d8 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -32,8 +32,8 @@ from pyarrow.lib cimport (Array, Schema, NativeFile, get_reader, get_writer) from pyarrow.compat import tobytes, frombytes -from pyarrow.formatting import indent from pyarrow.lib import ArrowException, NativeFile, _stringify_path +from pyarrow.util import indent import six import warnings diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index 
930e999a56116..faa2f7d892ee0 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -20,7 +20,6 @@ import six import pandas as pd -import warnings from pyarrow.compat import pdapi from pyarrow.lib import FeatherError # noqa @@ -44,11 +43,6 @@ def __init__(self, source): self.source = source self.open(source) - def read(self, *args, **kwargs): - warnings.warn("read has been deprecated. Use read_pandas instead.", - FutureWarning, stacklevel=2) - return self.read_pandas(*args, **kwargs) - def read_table(self, columns=None): if columns is None: return self._read() diff --git a/python/pyarrow/filesystem.py b/python/pyarrow/filesystem.py index f1d0eec3f8df5..8188a2607e21a 100644 --- a/python/pyarrow/filesystem.py +++ b/python/pyarrow/filesystem.py @@ -148,8 +148,7 @@ def _isfilestore(self): raise NotImplementedError def read_parquet(self, path, columns=None, metadata=None, schema=None, - use_threads=True, nthreads=None, - use_pandas_metadata=False): + use_threads=True, use_pandas_metadata=False): """ Read Parquet data from path in file system. Can read from a single file or a directory of files @@ -176,8 +175,6 @@ def read_parquet(self, path, columns=None, metadata=None, schema=None, table : pyarrow.Table """ from pyarrow.parquet import ParquetDataset - from pyarrow.util import _deprecate_nthreads - use_threads = _deprecate_nthreads(use_threads, nthreads) dataset = ParquetDataset(path, schema=schema, metadata=metadata, filesystem=self) return dataset.read(columns=columns, use_threads=use_threads, diff --git a/python/pyarrow/formatting.py b/python/pyarrow/formatting.py deleted file mode 100644 index 5ef9482ed144c..0000000000000 --- a/python/pyarrow/formatting.py +++ /dev/null @@ -1,43 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -# Pretty-printing and other formatting utilities for Arrow data structures - -import pyarrow.lib as lib -import warnings - -try: - from textwrap import indent -except ImportError: - def indent(text, prefix): - return ''.join(prefix + line for line in text.splitlines(True)) - - -def array_format(arr, window=10): - warnings.warn("array_format is deprecated, use Array.format() instead", - FutureWarning) - return arr.format(window=window) - - -def value_format(x, indent_level=0): - warnings.warn("value_format is deprecated", - FutureWarning) - if isinstance(x, lib.ListValue): - contents = ',\n'.join(value_format(item) for item in x) - return '[{0}]'.format(indent(contents, ' ').strip()) - else: - return repr(x) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 3ebfc8c0517ce..b89145adc4433 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -35,7 +35,7 @@ from pyarrow.compat import guid from pyarrow.filesystem import (LocalFileSystem, _ensure_filesystem, _get_fs_from_path) -from pyarrow.util import _is_path_like, _stringify_path, _deprecate_nthreads +from pyarrow.util import _is_path_like, _stringify_path def _check_contains_null(val): @@ -135,8 +135,8 @@ def schema(self): def num_row_groups(self): return self.reader.num_row_groups - def read_row_group(self, i, columns=None, nthreads=None, - use_threads=True, use_pandas_metadata=False): + def read_row_group(self, i, columns=None, use_threads=True, + use_pandas_metadata=False): """ Read a single row group from a Parquet file @@ -157,7 +157,6 @@ def read_row_group(self, i, columns=None, nthreads=None, pyarrow.table.Table Content of the row group as a table (of columns) """ - use_threads = _deprecate_nthreads(use_threads, nthreads) column_indices = self._get_column_indices( columns, use_pandas_metadata=use_pandas_metadata) return self.reader.read_row_group(i, column_indices=column_indices, @@ -1071,9 +1070,7 @@ def _make_manifest(path_or_paths, fs, pathsep='/', metadata_nthreads=1): def read_table(source, columns=None, use_threads=True, metadata=None, - use_pandas_metadata=False, memory_map=True, - nthreads=None): - use_threads = _deprecate_nthreads(use_threads, nthreads) + use_pandas_metadata=False, memory_map=True): if _is_path_like(source): fs = _get_fs_from_path(source) return fs.read_parquet(source, columns=columns, @@ -1094,8 +1091,8 @@ def read_table(source, columns=None, use_threads=True, metadata=None, Content of the file as a table (of columns)""") -def read_pandas(source, columns=None, use_threads=True, - memory_map=True, nthreads=None, metadata=None): +def read_pandas(source, columns=None, use_threads=True, memory_map=True, + metadata=None): return read_table(source, columns=columns, use_threads=use_threads, metadata=metadata, memory_map=True, diff --git a/python/pyarrow/util.py b/python/pyarrow/util.py index 1c26ee5e22f73..7cf57d88380e9 100644 --- a/python/pyarrow/util.py +++ b/python/pyarrow/util.py @@ -20,6 +20,12 @@ import six import warnings +try: + from textwrap import indent +except ImportError: + def indent(text, prefix): + return ''.join(prefix + line for line in text.splitlines(True)) + try: # pathlib might not be available try: @@ -72,13 +78,3 @@ def _stringify_path(path): return str(path) raise TypeError("not a path-like object") - - -def _deprecate_nthreads(use_threads, nthreads): - if nthreads is not None: - warnings.warn("`nthreads` argument is deprecated, " - "pass `use_threads` instead", FutureWarning, - stacklevel=3) - if nthreads > 1: - use_threads = True - return 
use_threads From 537aa2fabaad04455dbffceb77d5589230db3cea Mon Sep 17 00:00:00 2001 From: Yosuke Shiro Date: Sat, 15 Dec 2018 17:22:16 +0100 Subject: [PATCH 043/328] ARROW-4037: [Packaging] Remove workaround to verify 0.11.0 Author: Yosuke Shiro Closes #3182 from shiro615/packaging-remove-workaround and squashes the following commits: 60ae617b Remove workaround for 0.11.0 --- dev/release/verify-release-candidate.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 57b1850337067..45404b03dfb8a 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -211,8 +211,6 @@ test_glib() { gem install bundler fi - # Workaround for 0.11.0. 0.11.0 doesn't include c_glib/Gemfile. - wget https://raw.githubusercontent.com/apache/arrow/master/c_glib/Gemfile bundle install --path vendor/bundle bundle exec ruby test/run-test.rb From 23dfc1c5b1e303aa4ed699970c68235e319aa3d8 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 15 Dec 2018 17:27:28 +0100 Subject: [PATCH 044/328] ARROW-4006: Add CODE_OF_CONDUCT.md Many people are not aware that The ASF has a code of conduct. Having this document in the root directory will increase awareness of the type of professional behavior we expect from members of our community. Author: Wes McKinney Closes #3179 from wesm/ARROW-4006 and squashes the following commits: 6e88d8ab Add CODE_OF_CONDUCT.md --- CODE_OF_CONDUCT.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 CODE_OF_CONDUCT.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000..2efe740b77c50 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,24 @@ + + +# Code of Conduct + +* [Code of Conduct for The Apache Software Foundation][1] + +[1]: https://www.apache.org/foundation/policies/conduct.html \ No newline at end of file From 0936938e875c77c80f34b92884a30ff7fceeddcb Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Sat, 15 Dec 2018 20:19:31 +0100 Subject: [PATCH 045/328] ARROW-4039: [Python] Update link to 'development.rst' page from Python README.md Author: Benjamin Kietzman Closes #3185 from bkietz/ARROW-4039-update-development-link and squashes the following commits: a18596e7 Update README.md --- python/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/README.md b/python/README.md index 7d66dddd87c77..ce696939929f9 100644 --- a/python/README.md +++ b/python/README.md @@ -83,6 +83,6 @@ pip install -r ../docs/requirements.txt python setup.py build_sphinx -s ../docs/source ``` -[2]: https://github.com/apache/arrow/blob/master/python/doc/source/development.rst +[2]: https://github.com/apache/arrow/blob/master/docs/source/python/development.rst [3]: https://github.com/pandas-dev/pandas [4]: https://docs.pytest.org/en/latest/ From 2e8cfcac93596fb630310ca975b72a62208381d7 Mon Sep 17 00:00:00 2001 From: Tanya Schlusser Date: Sat, 15 Dec 2018 22:20:12 +0100 Subject: [PATCH 046/328] ARROW-3230: [Python] Missing comparisons on ChunkedArray, Table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `__eq__` method to `Table`, `Column`, and `ChunkedArray`, plus relevant tests. 
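
For illustration, a small usage sketch with made-up data (not part of the patch): `==` simply delegates to `equals`, and comparing against an incompatible object returns `NotImplemented`, so Python falls back to its default behaviour instead of raising.

```python
import pyarrow as pa

t1 = pa.Table.from_arrays([pa.array([1, 2, 3])], names=['a'])
t2 = pa.Table.from_arrays([pa.array([1, 2, 3])], names=['a'])

assert t1 == t2                      # Table: same as t1.equals(t2)
assert t1.column(0) == t2.column(0)  # Column comparison
assert t1[0].data == t2[0].data      # ChunkedArray comparison
assert t1 != 'not a table'           # no TypeError; falls back to NotImplemented
```
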
Author: Tanya Schlusser Author: Krisztián Szűcs Closes #3183 from tanyaschlusser/ARROW-3230 and squashes the following commits: 0ea512e0 minor fixes 2ea12f3c Add '__eq__' method to Table, Column, and ChunkedArray and remove '__richcmp__' from Column 47d24973 Add '==' and '!=' tests for Table, Column, and ChunkedArray --- python/pyarrow/table.pxi | 26 ++++++++++++++++++-------- python/pyarrow/tests/test_table.py | 9 +++++++++ 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index cf3411dc03616..4d52f26e749fc 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -117,6 +117,12 @@ cdef class ChunkedArray: else: index -= self.chunked_array.chunk(j).get().length() + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return NotImplemented + def equals(self, ChunkedArray other): """ Return whether the contents of two chunked arrays are equal @@ -411,14 +417,6 @@ cdef class Column: return result.getvalue() - def __richcmp__(Column self, Column other, int op): - if op == cp.Py_EQ: - return self.equals(other) - elif op == cp.Py_NE: - return not self.equals(other) - else: - raise TypeError('Invalid comparison') - def __getitem__(self, key): return self.data[key] @@ -540,6 +538,12 @@ cdef class Column: def __array__(self, dtype=None): return self.data.__array__(dtype=dtype) + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return NotImplemented + def equals(self, Column other): """ Check if contents of two columns are equal @@ -1111,6 +1115,12 @@ cdef class Table: return pyarrow_wrap_table(flattened) + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return NotImplemented + def equals(self, Table other): """ Check if contents of two tables are equal diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index ecbf93bd3e8b0..847b1a4ca550d 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -117,6 +117,8 @@ def eq(xarrs, yarrs): y = pa.chunked_array(yarrs) assert x.equals(y) assert y.equals(x) + assert x == y + assert x != str(y) def ne(xarrs, yarrs): if isinstance(xarrs, pa.ChunkedArray): @@ -129,6 +131,7 @@ def ne(xarrs, yarrs): y = pa.chunked_array(yarrs) assert not x.equals(y) assert not y.equals(x) + assert x != y eq(pa.chunked_array([], type=pa.int32()), pa.chunked_array([], type=pa.int32())) @@ -224,6 +227,9 @@ def test_column_basics(): assert len(column) == 5 assert column.shape == (5,) assert column.to_pylist() == [-10, -5, 0, 5, 10] + assert column == pa.Column.from_array("a", column.data) + assert column != pa.Column.from_array("b", column.data) + assert column != column.data def test_column_factory_function(): @@ -577,6 +583,9 @@ def test_table_basics(): col.data.chunk(col.data.num_chunks) assert table.columns == columns + assert table == pa.Table.from_arrays(columns) + assert table != pa.Table.from_arrays(columns[1:]) + assert table != columns def test_table_from_arrays_preserves_column_metadata(): From d61ae4ae488a74840c464576f59167b3d774f102 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 15 Dec 2018 15:41:37 -0600 Subject: [PATCH 047/328] ARROW-3449: [C++] Fixes to build with CMake 3.2. Document what requires newer CMake This also resolves ARROW-3984 Author: Wes McKinney Closes #3174 from wesm/ARROW-3449 and squashes the following commits: 9f33412c4 Fixes to build with CMake 3.2. 
Document what features require newer CMake in the README. Add Docker task for CMake 3.2 from kszucs --- cpp/CMakeLists.txt | 4 +++- cpp/Dockerfile | 4 +++- cpp/README.md | 10 +++++++++- cpp/cmake_modules/ThirdpartyToolchain.cmake | 17 ++++++++++++++++- cpp/src/arrow/util/compression-test.cc | 9 +++++++-- cpp/src/gandiva/jni/CMakeLists.txt | 4 ++++ cpp/src/plasma/CMakeLists.txt | 4 ---- cpp/thirdparty/download_dependencies.sh | 2 +- cpp/thirdparty/versions.txt | 1 + dev/tasks/tests.yml | 10 ++++++++++ docker-compose.yml | 16 ++++++++++++++++ 11 files changed, 70 insertions(+), 11 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 54daaf96e8eb6..54ec1e5ef6501 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -16,6 +16,7 @@ # under the License. cmake_minimum_required(VERSION 3.2) +message(STATUS "Building using CMake version: ${CMAKE_VERSION}") # Extract Arrow version number file(READ "${CMAKE_CURRENT_SOURCE_DIR}/../java/pom.xml" POM_XML) @@ -436,11 +437,12 @@ endif() ############################################################ if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) + # Currently the compression tests require at least these libraries; bz2 and + # zstd are optional. See ARROW-3984 set(ARROW_WITH_BROTLI ON) set(ARROW_WITH_LZ4 ON) set(ARROW_WITH_SNAPPY ON) set(ARROW_WITH_ZLIB ON) - set(ARROW_WITH_ZSTD ON) endif() if(ARROW_BUILD_TESTS) diff --git a/cpp/Dockerfile b/cpp/Dockerfile index c4791019634c1..84c00b91cc405 100644 --- a/cpp/Dockerfile +++ b/cpp/Dockerfile @@ -30,6 +30,7 @@ RUN apt-get update -y -q && \ wget # install conda and required packages +ARG EXTRA_CONDA_PKGS ENV PATH=/opt/conda/bin:$PATH \ CONDA_PREFIX=/opt/conda ADD ci/docker_install_conda.sh \ @@ -39,7 +40,8 @@ ADD ci/docker_install_conda.sh \ RUN arrow/ci/docker_install_conda.sh && \ conda install -c conda-forge \ --file arrow/ci/conda_env_cpp.yml \ - --file arrow/ci/conda_env_unix.yml && \ + --file arrow/ci/conda_env_unix.yml \ + $EXTRA_CONDA_PKGS && \ conda clean --all ENV CC=gcc \ diff --git a/cpp/README.md b/cpp/README.md index 1f12117e8d01e..71aa98ed9c924 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -30,7 +30,7 @@ in-source and out-of-source builds with the latter one being preferred. Building Arrow requires: * A C++11-enabled compiler. On Linux, gcc 4.8 and higher should be sufficient. -* CMake +* CMake 3.2 or higher * Boost On Ubuntu/Debian you can install the requirements with: @@ -459,6 +459,14 @@ both of these options would be used rarely. Current known uses-cases when they a * Parameterized tests in google test. +## CMake version requirements + +We support CMake 3.2 and higher. 
Some features require a newer version of CMake: + +* Building the benchmarks requires 3.6 or higher +* Building zstd from source requires 3.7 or higher +* Building Gandiva JNI bindings requires 3.11 or higher + [1]: https://brew.sh/ [2]: https://github.com/apache/arrow/blob/master/cpp/apidoc/Windows.md [3]: https://google.github.io/styleguide/cppguide.html diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 8f3fc2cabe3c2..c007b1c225bb9 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -407,6 +407,13 @@ else() # disable autolinking in boost add_definitions(-DBOOST_ALL_NO_LIB) endif() + + if (DEFINED ENV{BOOST_ROOT} OR DEFINED BOOST_ROOT) + # In older versions of CMake (such as 3.2), the system paths for Boost will + # be looked in first even if we set $BOOST_ROOT or pass -DBOOST_ROOT + set(Boost_NO_SYSTEM_PATHS ON) + endif() + if (ARROW_BOOST_USE_SHARED) # Find shared Boost libraries. set(Boost_USE_STATIC_LIBS OFF) @@ -629,8 +636,11 @@ if(ARROW_BUILD_TESTS OR ARROW_GANDIVA_BUILD_TESTS endif() if(ARROW_BUILD_BENCHMARKS) - if("$ENV{GBENCHMARK_HOME}" STREQUAL "") + if(CMAKE_VERSION VERSION_LESS 3.6) + message(FATAL_ERROR "Building gbenchmark from source requires at least CMake 3.6") + endif() + if(NOT MSVC) set(GBENCHMARK_CMAKE_CXX_FLAGS "-fPIC -std=c++11 ${EP_CXX_FLAGS}") endif() @@ -1095,6 +1105,11 @@ if (ARROW_WITH_ZSTD) "-DCMAKE_C_FLAGS=${EP_C_FLAGS}") endif() + if(CMAKE_VERSION VERSION_LESS 3.7) + message(FATAL_ERROR "Building zstd using ExternalProject requires \ +at least CMake 3.7") + endif() + ExternalProject_Add(zstd_ep ${EP_LOG_OPTIONS} CMAKE_ARGS ${ZSTD_CMAKE_ARGS} diff --git a/cpp/src/arrow/util/compression-test.cc b/cpp/src/arrow/util/compression-test.cc index e0e6f4837f201..22bec001bfd45 100644 --- a/cpp/src/arrow/util/compression-test.cc +++ b/cpp/src/arrow/util/compression-test.cc @@ -448,17 +448,22 @@ TEST_P(CodecTest, StreamingRoundtrip) { INSTANTIATE_TEST_CASE_P(TestGZip, CodecTest, ::testing::Values(Compression::GZIP)); -INSTANTIATE_TEST_CASE_P(TestZSTD, CodecTest, ::testing::Values(Compression::ZSTD)); - INSTANTIATE_TEST_CASE_P(TestSnappy, CodecTest, ::testing::Values(Compression::SNAPPY)); INSTANTIATE_TEST_CASE_P(TestLZ4, CodecTest, ::testing::Values(Compression::LZ4)); INSTANTIATE_TEST_CASE_P(TestBrotli, CodecTest, ::testing::Values(Compression::BROTLI)); +// bz2 requires a binary installation, there is no ExternalProject #if ARROW_WITH_BZ2 INSTANTIATE_TEST_CASE_P(TestBZ2, CodecTest, ::testing::Values(Compression::BZ2)); #endif +// The ExternalProject for zstd does not build on CMake < 3.7, so we do not +// require it here +#ifdef ARROW_WITH_ZSTD +INSTANTIATE_TEST_CASE_P(TestZSTD, CodecTest, ::testing::Values(Compression::ZSTD)); +#endif + } // namespace util } // namespace arrow diff --git a/cpp/src/gandiva/jni/CMakeLists.txt b/cpp/src/gandiva/jni/CMakeLists.txt index 9f7bc526dbf5b..ab04f536b4dd2 100644 --- a/cpp/src/gandiva/jni/CMakeLists.txt +++ b/cpp/src/gandiva/jni/CMakeLists.txt @@ -17,6 +17,10 @@ project(gandiva_jni) +if(CMAKE_VERSION VERSION_LESS 3.11) + message(FATAL_ERROR "Building the Gandiva JNI bindings requires CMake version >= 3.11") +endif() + # Find JNI find_package(JNI REQUIRED) diff --git a/cpp/src/plasma/CMakeLists.txt b/cpp/src/plasma/CMakeLists.txt index 317835bb7ac44..15d16af0fb9aa 100644 --- a/cpp/src/plasma/CMakeLists.txt +++ b/cpp/src/plasma/CMakeLists.txt @@ -20,16 +20,12 @@ add_custom_target(plasma) # For the moment, Plasma is 
versioned like Arrow project(plasma VERSION "${ARROW_BASE_VERSION}") -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/../python/cmake_modules") - -find_package(PythonLibsNew REQUIRED) find_package(Threads) # The SO version is also the ABI version set(PLASMA_SO_VERSION "${ARROW_SO_VERSION}") set(PLASMA_FULL_SO_VERSION "${ARROW_FULL_SO_VERSION}") -include_directories(SYSTEM ${PYTHON_INCLUDE_DIRS}) include_directories("${FLATBUFFERS_INCLUDE_DIR}" "${CMAKE_CURRENT_LIST_DIR}/" "${CMAKE_CURRENT_LIST_DIR}/thirdparty/" "${CMAKE_CURRENT_LIST_DIR}/../") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_XOPEN_SOURCE=500 -D_POSIX_C_SOURCE=200809L") diff --git a/cpp/thirdparty/download_dependencies.sh b/cpp/thirdparty/download_dependencies.sh index ea63a8a41fb4e..de7d23ca2ef5e 100755 --- a/cpp/thirdparty/download_dependencies.sh +++ b/cpp/thirdparty/download_dependencies.sh @@ -38,7 +38,7 @@ download_dependency() { # --show-progress will not output to stdout, it is safe to pipe the result of # the script into eval. - wget --quiet --show-progress --continue --output-document="${out}" "${url}" + wget --quiet --continue --output-document="${out}" "${url}" } main() { diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 705f56c0e6130..fc539da73945b 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -61,6 +61,7 @@ DEPENDENCIES=( "ARROW_ORC_URL orc-${ORC_VERSION}.tar.gz https://github.com/apache/orc/archive/rel/release-${ORC_VERSION}.tar.gz" "ARROW_PROTOBUF_URL protobuf-${PROTOBUF_VERSION}.tar.gz https://github.com/google/protobuf/releases/download/${PROTOBUF_VERSION}/protobuf-all-${PROTOBUF_VERSION:1}.tar.gz" "ARROW_RAPIDJSON_URL rapidjson-${RAPIDJSON_VERSION}.tar.gz https://github.com/miloyip/rapidjson/archive/${RAPIDJSON_VERSION}.tar.gz" + "ARROW_RE2_URL re2-${RE2_VERSION}.tar.gz https://github.com/google/re2/archive/${RE2_VERSION}.tar.gz" "ARROW_SNAPPY_URL snappy-${SNAPPY_VERSION}.tar.gz https://github.com/google/snappy/releases/download/${SNAPPY_VERSION}/snappy-${SNAPPY_VERSION}.tar.gz" "ARROW_THRIFT_URL thrift-${THRIFT_VERSION}.tar.gz http://archive.apache.org/dist/thrift/${THRIFT_VERSION}/thrift-${THRIFT_VERSION}.tar.gz" "ARROW_ZLIB_URL zlib-${ZLIB_VERSION}.tar.gz http://zlib.net/fossils/zlib-${ZLIB_VERSION}.tar.gz" diff --git a/dev/tasks/tests.yml b/dev/tasks/tests.yml index d51fa7eac7a35..d9493b606e5a0 100644 --- a/dev/tasks/tests.yml +++ b/dev/tasks/tests.yml @@ -22,6 +22,7 @@ groups: - docker-rust - docker-cpp - docker-cpp-alpine + - docker-cpp-cmake32 - docker-c_glib - docker-go - docker-python-2.7 @@ -45,6 +46,7 @@ groups: cpp-python: - docker-cpp - docker-cpp-alpine + - docker-cpp-cmake32 - docker-python-2.7 - docker-python-2.7-alpine - docker-python-3.6 @@ -87,6 +89,14 @@ tasks: - docker-compose build cpp-alpine - docker-compose run cpp-alpine + docker-cpp-cmake32: + platform: linux + template: docker-tests/travis.linux.yml + params: + commands: + - docker-compose build cpp-cmake32 + - docker-compose run cpp-cmake32 + docker-c_glib: platform: linux template: docker-tests/travis.linux.yml diff --git a/docker-compose.yml b/docker-compose.yml index 51f1a49542212..d3a7990d5cc23 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -57,6 +57,22 @@ services: PARQUET_TEST_DATA: /arrow/cpp/submodules/parquet-testing/data volumes: *ubuntu-volumes + cpp-cmake32: + # Usage: + # docker-compose build cpp-cmake32 + # docker-compose run cpp-cmake32 + image: arrow:cpp-cmake32 + shm_size: 2G + build: + context: . 
+ dockerfile: cpp/Dockerfile + args: + EXTRA_CONDA_PKGS: cmake=3.2 + environment: + ARROW_ORC: "OFF" + PARQUET_TEST_DATA: /arrow/cpp/submodules/parquet-testing/data + volumes: *ubuntu-volumes + cpp-alpine: # Usage: # docker-compose build cpp-alpine From 784d1cd04603f6f1f97904a62dac153d2569d2dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 15 Dec 2018 15:56:51 -0600 Subject: [PATCH 048/328] ARROW-4044: [Packaging/Python] Add hypothesis test dependency to pyarrow conda recipe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Crossbow tests: [kszucs/crossbow/build-374](https://github.com/kszucs/crossbow/branches/all?utf8=%E2%9C%93&query=build-374) Author: Krisztián Szűcs Closes #3187 from kszucs/ARROW-4044 and squashes the following commits: 9bc4d880b add hypothesis to test requires --- dev/tasks/conda-recipes/pyarrow/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/tasks/conda-recipes/pyarrow/meta.yaml b/dev/tasks/conda-recipes/pyarrow/meta.yaml index 167056ba68e9c..7c653876765b5 100644 --- a/dev/tasks/conda-recipes/pyarrow/meta.yaml +++ b/dev/tasks/conda-recipes/pyarrow/meta.yaml @@ -58,6 +58,7 @@ test: requires: - pytest + - hypothesis commands: - pytest --pyargs pyarrow From 055496cd9a040d64d4a00d773261e61e7caac31b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 15 Dec 2018 15:57:40 -0600 Subject: [PATCH 049/328] ARROW-4029: [C++] Exclude headers with 'internal' from installation. Document header file conventions in README In reviewing what usages of the `install` command we have, I added a helper function to add `.pc` files to reduce code duplication Author: Wes McKinney Closes #3176 from wesm/ARROW-4029 and squashes the following commits: f5b3811fc Exclude headers with 'internal' from installation. Document in README. Add function to reduce code duplication in adding pkg-config files --- cpp/README.md | 6 ++++++ cpp/cmake_modules/BuildUtils.cmake | 18 +++++++++++++++++- cpp/src/arrow/CMakeLists.txt | 7 +------ cpp/src/arrow/array/CMakeLists.txt | 10 +--------- cpp/src/arrow/compute/CMakeLists.txt | 7 +------ cpp/src/arrow/compute/kernels/CMakeLists.txt | 6 +----- cpp/src/arrow/flight/CMakeLists.txt | 7 +------ cpp/src/arrow/gpu/CMakeLists.txt | 10 +--------- cpp/src/arrow/io/CMakeLists.txt | 11 +---------- cpp/src/arrow/python/CMakeLists.txt | 7 +------ cpp/src/arrow/util/string_view/CMakeLists.txt | 2 +- cpp/src/arrow/util/variant/CMakeLists.txt | 8 +------- cpp/src/gandiva/CMakeLists.txt | 7 +------ cpp/src/parquet/CMakeLists.txt | 8 +------- cpp/src/plasma/CMakeLists.txt | 7 +------ 15 files changed, 36 insertions(+), 85 deletions(-) diff --git a/cpp/README.md b/cpp/README.md index 71aa98ed9c924..010387dbd4de3 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -313,6 +313,12 @@ which use the default pool without explicitly passing it. You can disable these constructors in your application (so that you are accounting properly for all memory allocations) by defining `ARROW_NO_DEFAULT_MEMORY_POOL`. +### Header files + +We use the `.h` extension for C++ header files. Any header file name not +containing `internal` is considered to be a public header, and will be +automatically installed by the build. 
+ ### Error Handling and Exceptions For error handling, we use `arrow::Status` values instead of throwing C++ diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 1abe97eecc59f..7585ae9da8fa8 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -583,7 +583,23 @@ function(ARROW_INSTALL_ALL_HEADERS PATH) set(ARG_PATTERN "*.h") endif() file(GLOB CURRENT_DIRECTORY_HEADERS ${ARG_PATTERN}) + + set(PUBLIC_HEADERS) + foreach(HEADER ${CURRENT_DIRECTORY_HEADERS}) + if (NOT ((HEADER MATCHES "internal"))) + LIST(APPEND PUBLIC_HEADERS ${HEADER}) + endif() + endforeach() install(FILES - ${CURRENT_DIRECTORY_HEADERS} + ${PUBLIC_HEADERS} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${PATH}") endfunction() + +function(ARROW_ADD_PKG_CONFIG MODULE) + configure_file(${MODULE}.pc.in + "${CMAKE_CURRENT_BINARY_DIR}/${MODULE}.pc" + @ONLY) + install( + FILES "${CMAKE_CURRENT_BINARY_DIR}/${MODULE}.pc" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +endfunction() diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index b13c9b66ac48d..bec290df2aa37 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -252,12 +252,7 @@ endforeach() ARROW_INSTALL_ALL_HEADERS("arrow") # pkg-config support -configure_file(arrow.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/arrow.pc" - @ONLY) -install( - FILES "${CMAKE_CURRENT_BINARY_DIR}/arrow.pc" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +ARROW_ADD_PKG_CONFIG("arrow") ####################################### # Unit tests diff --git a/cpp/src/arrow/array/CMakeLists.txt b/cpp/src/arrow/array/CMakeLists.txt index a789c88dd9d31..4a8ce3490abd1 100644 --- a/cpp/src/arrow/array/CMakeLists.txt +++ b/cpp/src/arrow/array/CMakeLists.txt @@ -16,12 +16,4 @@ # under the License. # Headers: top level -install(FILES - builder_adaptive.h - builder_base.h - builder_binary.h - builder_decimal.h - builder_dict.h - builder_nested.h - builder_primitive.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/array") +ARROW_INSTALL_ALL_HEADERS("arrow/array") diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index 242937005cf9c..75d152b0bafa3 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -18,12 +18,7 @@ ARROW_INSTALL_ALL_HEADERS("arrow/compute") # pkg-config support -configure_file(arrow-compute.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/arrow-compute.pc" - @ONLY) -install( - FILES "${CMAKE_CURRENT_BINARY_DIR}/arrow-compute.pc" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +ARROW_ADD_PKG_CONFIG("arrow-compute") ####################################### # Unit tests diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 923c8c3bd4e81..a5a142b5c28ce 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -15,8 +15,4 @@ # specific language governing permissions and limitations # under the License. 
-install(FILES - boolean.h - cast.h - hash.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/compute/kernels") +ARROW_INSTALL_ALL_HEADERS("arrow/compute/kernels") diff --git a/cpp/src/arrow/flight/CMakeLists.txt b/cpp/src/arrow/flight/CMakeLists.txt index bc22d60b7131a..aa56269a8953e 100644 --- a/cpp/src/arrow/flight/CMakeLists.txt +++ b/cpp/src/arrow/flight/CMakeLists.txt @@ -18,12 +18,7 @@ add_custom_target(arrow_flight) # Header files -install(FILES - api.h - client.h - server.h - types.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/flight") +ARROW_INSTALL_ALL_HEADERS("arrow/flight") SET(ARROW_FLIGHT_STATIC_LINK_LIBS grpc_grpcpp diff --git a/cpp/src/arrow/gpu/CMakeLists.txt b/cpp/src/arrow/gpu/CMakeLists.txt index c37779aefa9aa..8b69c654bb1fe 100644 --- a/cpp/src/arrow/gpu/CMakeLists.txt +++ b/cpp/src/arrow/gpu/CMakeLists.txt @@ -64,15 +64,7 @@ install(FILES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/gpu") ARROW_INSTALL_ALL_HEADERS("arrow/gpu") - -# pkg-config support -configure_file(arrow-cuda.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/arrow-cuda.pc" - @ONLY) - -install( - FILES "${CMAKE_CURRENT_BINARY_DIR}/arrow-cuda.pc" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +ARROW_ADD_PKG_CONFIG("arrow-cuda") set(ARROW_CUDA_TEST_LINK_LIBS arrow_cuda_shared diff --git a/cpp/src/arrow/io/CMakeLists.txt b/cpp/src/arrow/io/CMakeLists.txt index 80d68fb503bb9..13b577f7d41b2 100644 --- a/cpp/src/arrow/io/CMakeLists.txt +++ b/cpp/src/arrow/io/CMakeLists.txt @@ -41,13 +41,4 @@ ADD_ARROW_BENCHMARK(memory-benchmark PREFIX "arrow-io") # Headers: top level -install(FILES - api.h - buffered.h - compressed.h - file.h - hdfs.h - interfaces.h - memory.h - readahead.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/io") +ARROW_INSTALL_ALL_HEADERS("arrow/io") diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index 4913083537340..98c105ae623ce 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -94,12 +94,7 @@ endif() ARROW_INSTALL_ALL_HEADERS("arrow/python") # pkg-config support -configure_file(arrow-python.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/arrow-python.pc" - @ONLY) -install( - FILES "${CMAKE_CURRENT_BINARY_DIR}/arrow-python.pc" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +ARROW_ADD_PKG_CONFIG("arrow-python") # ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/string_view/CMakeLists.txt b/cpp/src/arrow/util/string_view/CMakeLists.txt index bae6bdb807d92..7e553077db1ad 100644 --- a/cpp/src/arrow/util/string_view/CMakeLists.txt +++ b/cpp/src/arrow/util/string_view/CMakeLists.txt @@ -17,4 +17,4 @@ install(FILES string_view.hpp - DESTINATION include/arrow/util/string_view) + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/util/string_view") diff --git a/cpp/src/arrow/util/variant/CMakeLists.txt b/cpp/src/arrow/util/variant/CMakeLists.txt index 0ebb2516246ed..b7a5692b6207c 100644 --- a/cpp/src/arrow/util/variant/CMakeLists.txt +++ b/cpp/src/arrow/util/variant/CMakeLists.txt @@ -19,10 +19,4 @@ # arrow_util_variant ####################################### -install(FILES - optional.h - recursive_wrapper.h - variant_cast.h - variant_io.h - variant_visitor.h - DESTINATION include/arrow/util/variant) +ARROW_INSTALL_ALL_HEADERS("arrow/util/variant") diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 9763f297b0b8b..da0d3bba69147 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -96,12 +96,7 @@ 
include(GNUInstallDirs) ARROW_INSTALL_ALL_HEADERS("gandiva") # pkg-config support -configure_file(gandiva.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/gandiva.pc" - @ONLY) -install( - FILES "${CMAKE_CURRENT_BINARY_DIR}/gandiva.pc" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +ARROW_ADD_PKG_CONFIG("gandiva") set(GANDIVA_STATIC_TEST_LINK_LIBS gandiva_static diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 6b7846b709d0b..995c39adb7d35 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -249,13 +249,7 @@ install(FILES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet") # pkg-config support -configure_file(parquet.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/parquet.pc" - @ONLY) - -install(FILES - "${CMAKE_CURRENT_BINARY_DIR}/parquet.pc" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +ARROW_ADD_PKG_CONFIG("parquet") ADD_PARQUET_TEST(bloom_filter-test) ADD_PARQUET_TEST(column_reader-test) diff --git a/cpp/src/plasma/CMakeLists.txt b/cpp/src/plasma/CMakeLists.txt index 15d16af0fb9aa..83c201d0f45a0 100644 --- a/cpp/src/plasma/CMakeLists.txt +++ b/cpp/src/plasma/CMakeLists.txt @@ -150,12 +150,7 @@ install(TARGETS plasma_store_server DESTINATION ${CMAKE_INSTALL_BINDIR}) # pkg-config support -configure_file(plasma.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/plasma.pc" - @ONLY) -install( - FILES "${CMAKE_CURRENT_BINARY_DIR}/plasma.pc" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +ARROW_ADD_PKG_CONFIG("plasma") if(ARROW_PLASMA_JAVA_CLIENT) # Plasma java client support From ec154d232ed5585721e0ef12d61c1a6e2c06fdae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Sat, 15 Dec 2018 16:00:27 -0600 Subject: [PATCH 050/328] =?UTF-8?q?ARROW-2026:=20[C++]=20Enforce=20use=5Fd?= =?UTF-8?q?eprecated=5Fint96=5Ftimestamps=20to=20all=20time=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …stamps fields. This changes the behavior of `use_deprecated_int96_timestamps` to support all timestamp fields irregardless of the time unit. It would previously only apply this conversion to fields with Nanosecond resolution. People will only use this option when they use a system that only supports INT96 timestamps, systems that also support INT64 timestamps in other resolutions would not need the option. A notable API change is that this option now take precedence over the coerce_timestamps option. Author: François Saint-Jacques Closes #3173 from fsaintjacques/ARROW-2026-parquet-int96-conversion and squashes the following commits: 2897a7278 ARROW-2026: Enforce use_deprecated_int96_timestamps to all timestamps fields. --- .../parquet/arrow/arrow-reader-writer-test.cc | 185 +++++++++--------- cpp/src/parquet/arrow/reader.cc | 16 +- cpp/src/parquet/arrow/schema.cc | 73 ++++--- cpp/src/parquet/arrow/writer.cc | 74 ++++--- cpp/src/parquet/arrow/writer.h | 62 ++++-- cpp/src/parquet/types.h | 21 ++ python/pyarrow/parquet.py | 4 +- python/pyarrow/tests/test_parquet.py | 5 +- 8 files changed, 256 insertions(+), 184 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index 07124ebb3057a..4e62a22c350ff 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -1193,65 +1193,116 @@ void MakeDateTimeTypesTable(std::shared_ptr
* out, bool nanos_as_micros = auto f0 = field("f0", ::arrow::date32()); auto f1 = field("f1", ::arrow::timestamp(TimeUnit::MILLI)); auto f2 = field("f2", ::arrow::timestamp(TimeUnit::MICRO)); - std::shared_ptr<::arrow::Field> f3; - if (nanos_as_micros) { - f3 = field("f3", ::arrow::timestamp(TimeUnit::MICRO)); - } else { - f3 = field("f3", ::arrow::timestamp(TimeUnit::NANO)); - } + auto f3_unit = nanos_as_micros ? TimeUnit::MICRO : TimeUnit::NANO; + auto f3 = field("f3", ::arrow::timestamp(f3_unit)); auto f4 = field("f4", ::arrow::time32(TimeUnit::MILLI)); auto f5 = field("f5", ::arrow::time64(TimeUnit::MICRO)); + std::shared_ptr<::arrow::Schema> schema(new ::arrow::Schema({f0, f1, f2, f3, f4, f5})); std::vector t32_values = {1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000}; - std::vector t64_values = {1489269000000, 1489270000000, 1489271000000, - 1489272000000, 1489272000000, 1489273000000}; + std::vector t64_ns_values = {1489269000000, 1489270000000, 1489271000000, + 1489272000000, 1489272000000, 1489273000000}; std::vector t64_us_values = {1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000}; + std::vector t64_ms_values = {1489269, 1489270, 1489271, + 1489272, 1489272, 1489273}; std::shared_ptr a0, a1, a2, a3, a4, a5; ArrayFromVector<::arrow::Date32Type, int32_t>(f0->type(), is_valid, t32_values, &a0); - ArrayFromVector<::arrow::TimestampType, int64_t>(f1->type(), is_valid, t64_values, &a1); - ArrayFromVector<::arrow::TimestampType, int64_t>(f2->type(), is_valid, t64_values, &a2); - if (nanos_as_micros) { - ArrayFromVector<::arrow::TimestampType, int64_t>(f3->type(), is_valid, t64_us_values, - &a3); - } else { - ArrayFromVector<::arrow::TimestampType, int64_t>(f3->type(), is_valid, t64_values, - &a3); - } + ArrayFromVector<::arrow::TimestampType, int64_t>(f1->type(), is_valid, t64_ms_values, + &a1); + ArrayFromVector<::arrow::TimestampType, int64_t>(f2->type(), is_valid, t64_us_values, + &a2); + auto f3_data = nanos_as_micros ? t64_us_values : t64_ns_values; + ArrayFromVector<::arrow::TimestampType, int64_t>(f3->type(), is_valid, f3_data, &a3); ArrayFromVector<::arrow::Time32Type, int32_t>(f4->type(), is_valid, t32_values, &a4); - ArrayFromVector<::arrow::Time64Type, int64_t>(f5->type(), is_valid, t64_values, &a5); + ArrayFromVector<::arrow::Time64Type, int64_t>(f5->type(), is_valid, t64_us_values, &a5); std::vector> columns = { std::make_shared("f0", a0), std::make_shared("f1", a1), std::make_shared("f2", a2), std::make_shared("f3", a3), std::make_shared("f4", a4), std::make_shared("f5", a5)}; + *out = Table::Make(schema, columns); } TEST(TestArrowReadWrite, DateTimeTypes) { - std::shared_ptr
table; + std::shared_ptr
table, result; MakeDateTimeTypesTable(&table); - // Use deprecated INT96 type - std::shared_ptr
result; - ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( - table, false /* use_threads */, table->num_rows(), {}, &result, - ArrowWriterProperties::Builder().enable_deprecated_int96_timestamps()->build())); - - ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result)); - // Cast nanaoseconds to microseconds and use INT64 physical type ASSERT_NO_FATAL_FAILURE( DoSimpleRoundtrip(table, false /* use_threads */, table->num_rows(), {}, &result)); - std::shared_ptr
expected; MakeDateTimeTypesTable(&table, true); ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result)); } +TEST(TestArrowReadWrite, UseDeprecatedInt96) { + using ::arrow::ArrayFromVector; + using ::arrow::field; + using ::arrow::schema; + + std::vector is_valid = {true, true, true, false, true, true}; + + auto t_s = ::arrow::timestamp(TimeUnit::SECOND); + auto t_ms = ::arrow::timestamp(TimeUnit::MILLI); + auto t_us = ::arrow::timestamp(TimeUnit::MICRO); + auto t_ns = ::arrow::timestamp(TimeUnit::NANO); + + std::vector s_values = {1489269, 1489270, 1489271, 1489272, 1489272, 1489273}; + std::vector ms_values = {1489269000, 1489270000, 1489271000, + 1489272001, 1489272000, 1489273000}; + std::vector us_values = {1489269000000, 1489270000000, 1489271000000, + 1489272000001, 1489272000000, 1489273000000}; + std::vector ns_values = {1489269000000000LL, 1489270000000000LL, + 1489271000000000LL, 1489272000000001LL, + 1489272000000000LL, 1489273000000000LL}; + + std::shared_ptr a_s, a_ms, a_us, a_ns; + ArrayFromVector<::arrow::TimestampType, int64_t>(t_s, is_valid, s_values, &a_s); + ArrayFromVector<::arrow::TimestampType, int64_t>(t_ms, is_valid, ms_values, &a_ms); + ArrayFromVector<::arrow::TimestampType, int64_t>(t_us, is_valid, us_values, &a_us); + ArrayFromVector<::arrow::TimestampType, int64_t>(t_ns, is_valid, ns_values, &a_ns); + + // Each input is typed with a unique TimeUnit + auto input_schema = schema( + {field("f_s", t_s), field("f_ms", t_ms), field("f_us", t_us), field("f_ns", t_ns)}); + auto input = Table::Make( + input_schema, + {std::make_shared("f_s", a_s), std::make_shared("f_ms", a_ms), + std::make_shared("f_us", a_us), std::make_shared("f_ns", a_ns)}); + + // When reading parquet files, all int96 schema fields are converted to + // timestamp nanoseconds + auto ex_schema = schema({field("f_s", t_ns), field("f_ms", t_ns), field("f_us", t_ns), + field("f_ns", t_ns)}); + auto ex_result = Table::Make( + ex_schema, + {std::make_shared("f_s", a_ns), std::make_shared("f_ms", a_ns), + std::make_shared("f_us", a_ns), std::make_shared("f_ns", a_ns)}); + + std::shared_ptr
result; + ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( + input, false /* use_threads */, input->num_rows(), {}, &result, + ArrowWriterProperties::Builder().enable_deprecated_int96_timestamps()->build())); + + ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_result, *result)); + + // Ensure enable_deprecated_int96_timestamps as precedence over + // coerce_timestamps. + ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip(input, false /* use_threads */, + input->num_rows(), {}, &result, + ArrowWriterProperties::Builder() + .enable_deprecated_int96_timestamps() + ->coerce_timestamps(TimeUnit::MILLI) + ->build())); + + ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_result, *result)); +} + TEST(TestArrowReadWrite, CoerceTimestamps) { using ::arrow::ArrayFromVector; using ::arrow::field; @@ -1297,6 +1348,12 @@ TEST(TestArrowReadWrite, CoerceTimestamps) { {std::make_shared("f_s", a_ms), std::make_shared("f_ms", a_ms), std::make_shared("f_us", a_ms), std::make_shared("f_ns", a_ms)}); + std::shared_ptr
milli_result; + ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( + input, false /* use_threads */, input->num_rows(), {}, &milli_result, + ArrowWriterProperties::Builder().coerce_timestamps(TimeUnit::MILLI)->build())); + ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_milli_result, *milli_result)); + // Result when coercing to microseconds auto s3 = std::shared_ptr<::arrow::Schema>( new ::arrow::Schema({field("f_s", t_us), field("f_ms", t_us), field("f_us", t_us), @@ -1306,13 +1363,6 @@ TEST(TestArrowReadWrite, CoerceTimestamps) { {std::make_shared("f_s", a_us), std::make_shared("f_ms", a_us), std::make_shared("f_us", a_us), std::make_shared("f_ns", a_us)}); - std::shared_ptr
milli_result; - ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( - input, false /* use_threads */, input->num_rows(), {}, &milli_result, - ArrowWriterProperties::Builder().coerce_timestamps(TimeUnit::MILLI)->build())); - - ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_milli_result, *milli_result)); - std::shared_ptr
micro_result; ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( input, false /* use_threads */, input->num_rows(), {}, µ_result, @@ -1457,65 +1507,6 @@ TEST(TestArrowReadWrite, ConvertedDateTimeTypes) { ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_table, *result)); } -// Regression for ARROW-2802 -TEST(TestArrowReadWrite, CoerceTimestampsAndSupportDeprecatedInt96) { - using ::arrow::Column; - using ::arrow::default_memory_pool; - using ::arrow::Field; - using ::arrow::Schema; - using ::arrow::Table; - using ::arrow::TimestampBuilder; - using ::arrow::TimestampType; - using ::arrow::TimeUnit; - - auto timestamp_type = std::make_shared(TimeUnit::NANO); - - TimestampBuilder builder(timestamp_type, default_memory_pool()); - for (std::int64_t ii = 0; ii < 10; ++ii) { - ASSERT_OK(builder.Append(1000000000L * ii)); - } - std::shared_ptr values; - ASSERT_OK(builder.Finish(&values)); - - std::vector> fields; - auto field = std::make_shared("nanos", timestamp_type); - fields.emplace_back(field); - - auto schema = std::make_shared(fields); - - std::vector> columns; - auto column = std::make_shared("nanos", values); - columns.emplace_back(column); - - auto table = Table::Make(schema, columns); - - auto arrow_writer_properties = ArrowWriterProperties::Builder() - .coerce_timestamps(TimeUnit::MICRO) - ->enable_deprecated_int96_timestamps() - ->build(); - - std::shared_ptr
result; - DoSimpleRoundtrip(table, false /* use_threads */, table->num_rows(), {}, &result, - arrow_writer_properties); - - ASSERT_EQ(table->num_columns(), result->num_columns()); - ASSERT_EQ(table->num_rows(), result->num_rows()); - - auto actual_column = result->column(0); - auto data = actual_column->data(); - auto expected_values = - static_cast<::arrow::NumericArray*>(values.get())->raw_values(); - for (int ii = 0; ii < data->num_chunks(); ++ii) { - auto chunk = - static_cast<::arrow::NumericArray*>(data->chunk(ii).get()); - auto values = chunk->raw_values(); - for (int64_t jj = 0; jj < chunk->length(); ++jj, ++expected_values) { - // Check that the nanos have been converted to micros - ASSERT_EQ(*expected_values / 1000, values[jj]); - } - } -} - void MakeDoubleTable(int num_columns, int num_rows, int nchunks, std::shared_ptr
* out) { std::shared_ptr<::arrow::Column> column; @@ -2289,11 +2280,13 @@ TEST_P(TestNestedSchemaRead, DeepNestedSchemaRead) { INSTANTIATE_TEST_CASE_P(Repetition_type, TestNestedSchemaRead, ::testing::Values(Repetition::REQUIRED, Repetition::OPTIONAL)); -TEST(TestImpalaConversion, NanosecondToImpala) { +TEST(TestImpalaConversion, ArrowTimestampToImpalaTimestamp) { // June 20, 2017 16:32:56 and 123456789 nanoseconds int64_t nanoseconds = INT64_C(1497976376123456789); - Int96 expected = {{UINT32_C(632093973), UINT32_C(13871), UINT32_C(2457925)}}; + Int96 calculated; + + Int96 expected = {{UINT32_C(632093973), UINT32_C(13871), UINT32_C(2457925)}}; internal::NanosecondsToImpalaTimestamp(nanoseconds, &calculated); ASSERT_EQ(expected, calculated); } diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 2a7730d42ad23..7830b6abc75d1 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -77,18 +77,6 @@ namespace arrow { using ::arrow::BitUtil::BytesForBits; -constexpr int64_t kJulianToUnixEpochDays = 2440588LL; -constexpr int64_t kMillisecondsInADay = 86400000LL; -constexpr int64_t kNanosecondsInADay = kMillisecondsInADay * 1000LL * 1000LL; - -static inline int64_t impala_timestamp_to_nanoseconds(const Int96& impala_timestamp) { - int64_t days_since_epoch = impala_timestamp.value[2] - kJulianToUnixEpochDays; - int64_t nanoseconds = 0; - - memcpy(&nanoseconds, &impala_timestamp.value, sizeof(int64_t)); - return days_since_epoch * kNanosecondsInADay + nanoseconds; -} - template using ArrayType = typename ::arrow::TypeTraits::ArrayType; @@ -1045,7 +1033,7 @@ struct TransferFunctor<::arrow::TimestampType, Int96Type> { auto data_ptr = reinterpret_cast(data->mutable_data()); for (int64_t i = 0; i < length; i++) { - *data_ptr++ = impala_timestamp_to_nanoseconds(values[i]); + *data_ptr++ = Int96GetNanoSeconds(values[i]); } if (reader->nullable_values()) { @@ -1072,7 +1060,7 @@ struct TransferFunctor<::arrow::Date64Type, Int32Type> { auto out_ptr = reinterpret_cast(data->mutable_data()); for (int64_t i = 0; i < length; i++) { - *out_ptr++ = static_cast(values[i]) * kMillisecondsInADay; + *out_ptr++ = static_cast(values[i]) * kMillisecondsPerDay; } if (reader->nullable_values()) { diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index d0014a6f3aa2a..af9fbc91a5042 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -423,45 +423,66 @@ Status StructToNode(const std::shared_ptr<::arrow::StructType>& type, return Status::OK(); } +static LogicalType::type LogicalTypeFromArrowTimeUnit(::arrow::TimeUnit::type time_unit) { + switch (time_unit) { + case ::arrow::TimeUnit::MILLI: + return LogicalType::TIMESTAMP_MILLIS; + case ::arrow::TimeUnit::MICRO: + return LogicalType::TIMESTAMP_MICROS; + case ::arrow::TimeUnit::SECOND: + case ::arrow::TimeUnit::NANO: + // No equivalent parquet logical type. + break; + } + + return LogicalType::NONE; +} + static Status GetTimestampMetadata(const ::arrow::TimestampType& type, const ArrowWriterProperties& properties, ParquetType::type* physical_type, LogicalType::type* logical_type) { - auto unit = type.unit(); - *physical_type = ParquetType::INT64; + const bool coerce = properties.coerce_timestamps_enabled(); + const auto unit = coerce ? 
properties.coerce_timestamps_unit() : type.unit(); - if (properties.coerce_timestamps_enabled()) { - auto coerce_unit = properties.coerce_timestamps_unit(); - if (coerce_unit == ::arrow::TimeUnit::MILLI) { - *logical_type = LogicalType::TIMESTAMP_MILLIS; - } else if (coerce_unit == ::arrow::TimeUnit::MICRO) { - *logical_type = LogicalType::TIMESTAMP_MICROS; - } else { - return Status::NotImplemented( - "Can only coerce Arrow timestamps to milliseconds" - " or microseconds"); + // The user is explicitly asking for Impala int96 encoding, there is no + // logical type. + if (properties.support_deprecated_int96_timestamps()) { + *physical_type = ParquetType::INT96; + return Status::OK(); + } + + *physical_type = ParquetType::INT64; + *logical_type = LogicalTypeFromArrowTimeUnit(unit); + + // The user is requesting that all timestamp columns are casted to a specific + // type. Only 2 TimeUnit are supported by arrow-parquet. + if (coerce) { + switch (unit) { + case ::arrow::TimeUnit::MILLI: + case ::arrow::TimeUnit::MICRO: + break; + case ::arrow::TimeUnit::NANO: + case ::arrow::TimeUnit::SECOND: + return Status::NotImplemented( + "Can only coerce Arrow timestamps to milliseconds" + " or microseconds"); } + return Status::OK(); } - if (unit == ::arrow::TimeUnit::MILLI) { - *logical_type = LogicalType::TIMESTAMP_MILLIS; - } else if (unit == ::arrow::TimeUnit::MICRO) { + // Until ARROW-3729 is resolved, nanoseconds are explicitly converted to + // int64 microseconds when deprecated int96 is not requested. + if (type.unit() == ::arrow::TimeUnit::NANO) *logical_type = LogicalType::TIMESTAMP_MICROS; - } else if (unit == ::arrow::TimeUnit::NANO) { - if (properties.support_deprecated_int96_timestamps()) { - *physical_type = ParquetType::INT96; - // No corresponding logical type - } else { - *logical_type = LogicalType::TIMESTAMP_MICROS; - } - } else { + else if (type.unit() == ::arrow::TimeUnit::SECOND) return Status::NotImplemented( "Only MILLI, MICRO, and NANOS units supported for Arrow timestamps with " "Parquet."); - } + return Status::OK(); -} +} // namespace arrow Status FieldToNode(const std::shared_ptr& field, const WriterProperties& properties, @@ -698,7 +719,7 @@ int32_t DecimalSize(int32_t precision) { } DCHECK(false); return -1; -} +} // namespace arrow } // namespace arrow } // namespace parquet diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 402cbf0f2027c..bce9f37026c97 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -386,7 +386,11 @@ class ArrowColumnWriter { Status WriteBatch(int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, const typename ParquetType::c_type* values) { - auto typed_writer = static_cast*>(writer_); + auto typed_writer = + ::arrow::internal::checked_cast*>(writer_); + // WriteBatch was called with type mismatching the writer_'s type. This + // could be a schema conversion problem. + DCHECK(typed_writer); PARQUET_CATCH_NOT_OK( typed_writer->WriteBatch(num_levels, def_levels, rep_levels, values)); return Status::OK(); @@ -397,7 +401,11 @@ class ArrowColumnWriter { const int16_t* rep_levels, const uint8_t* valid_bits, int64_t valid_bits_offset, const typename ParquetType::c_type* values) { - auto typed_writer = static_cast*>(writer_); + auto typed_writer = + ::arrow::internal::checked_cast*>(writer_); + // WriteBatchSpaced was called with type mismatching the writer_'s type. This + // could be a schema conversion problem. 
+ DCHECK(typed_writer); PARQUET_CATCH_NOT_OK(typed_writer->WriteBatchSpaced( num_levels, def_levels, rep_levels, valid_bits, valid_bits_offset, values)); return Status::OK(); @@ -570,20 +578,42 @@ NULLABLE_BATCH_FAST_PATH(DoubleType, ::arrow::DoubleType, double) NULLABLE_BATCH_FAST_PATH(Int64Type, ::arrow::TimestampType, int64_t) NONNULLABLE_BATCH_FAST_PATH(Int64Type, ::arrow::TimestampType, int64_t) +#define CONV_CASE_LOOP(ConversionFunction) \ + for (int64_t i = 0; i < num_values; i++) \ + ConversionFunction(arrow_values[i], &output[i]); + +static void ConvertArrowTimestampToParquetInt96(const int64_t* arrow_values, + int64_t num_values, + ::arrow::TimeUnit ::type unit_type, + Int96* output) { + switch (unit_type) { + case TimeUnit::NANO: + CONV_CASE_LOOP(internal::NanosecondsToImpalaTimestamp); + break; + case TimeUnit::MICRO: + CONV_CASE_LOOP(internal::MicrosecondsToImpalaTimestamp); + break; + case TimeUnit::MILLI: + CONV_CASE_LOOP(internal::MillisecondsToImpalaTimestamp); + break; + case TimeUnit::SECOND: + CONV_CASE_LOOP(internal::SecondsToImpalaTimestamp); + break; + } +} + +#undef CONV_CASE_LOOP + template <> Status ArrowColumnWriter::WriteNullableBatch( const ::arrow::TimestampType& type, int64_t num_values, int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, const uint8_t* valid_bits, int64_t valid_bits_offset, const int64_t* values) { - Int96* buffer; + Int96* buffer = nullptr; RETURN_NOT_OK(ctx_->GetScratchData(num_values, &buffer)); - if (type.unit() == TimeUnit::NANO) { - for (int i = 0; i < num_values; i++) { - internal::NanosecondsToImpalaTimestamp(values[i], &buffer[i]); - } - } else { - return Status::NotImplemented("Only NANO timestamps are supported for Int96 writing"); - } + + ConvertArrowTimestampToParquetInt96(values, num_values, type.unit(), buffer); + return WriteBatchSpaced(num_levels, def_levels, rep_levels, valid_bits, valid_bits_offset, buffer); } @@ -592,15 +622,11 @@ template <> Status ArrowColumnWriter::WriteNonNullableBatch( const ::arrow::TimestampType& type, int64_t num_values, int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, const int64_t* values) { - Int96* buffer; + Int96* buffer = nullptr; RETURN_NOT_OK(ctx_->GetScratchData(num_values, &buffer)); - if (type.unit() == TimeUnit::NANO) { - for (int i = 0; i < num_values; i++) { - internal::NanosecondsToImpalaTimestamp(values[i], buffer + i); - } - } else { - return Status::NotImplemented("Only NANO timestamps are supported for Int96 writing"); - } + + ConvertArrowTimestampToParquetInt96(values, num_values, type.unit(), buffer); + return WriteBatch(num_levels, def_levels, rep_levels, buffer); } @@ -611,21 +637,15 @@ Status ArrowColumnWriter::WriteTimestamps(const Array& values, int64_t num_level const bool is_nanosecond = type.unit() == TimeUnit::NANO; - // In the case where support_deprecated_int96_timestamps was specified - // and coerce_timestamps_enabled was specified, a nanosecond column - // will have a physical type of int64. In that case, we fall through - // to the else if below. - // - // See https://issues.apache.org/jira/browse/ARROW-2082 - if (is_nanosecond && ctx_->properties->support_deprecated_int96_timestamps() && - !ctx_->properties->coerce_timestamps_enabled()) { + if (ctx_->properties->support_deprecated_int96_timestamps()) { + // The user explicitly required to use Int96 storage. 
return TypedWriteBatch(values, num_levels, def_levels, rep_levels); } else if (is_nanosecond || (ctx_->properties->coerce_timestamps_enabled() && (type.unit() != ctx_->properties->coerce_timestamps_unit()))) { // Casting is required. This covers several cases - // * Nanoseconds -> cast to microseconds + // * Nanoseconds -> cast to microseconds (until ARROW-3729 is resolved) // * coerce_timestamps_enabled_, cast all timestamps to requested unit return WriteTimestampsCoerce(ctx_->properties->truncated_timestamps_allowed(), values, num_levels, def_levels, rep_levels); diff --git a/cpp/src/parquet/arrow/writer.h b/cpp/src/parquet/arrow/writer.h index 2538c028002e4..50cb4cfea7d8d 100644 --- a/cpp/src/parquet/arrow/writer.h +++ b/cpp/src/parquet/arrow/writer.h @@ -45,19 +45,19 @@ class PARQUET_EXPORT ArrowWriterProperties { class Builder { public: Builder() - : write_nanos_as_int96_(false), + : write_timestamps_as_int96_(false), coerce_timestamps_enabled_(false), coerce_timestamps_unit_(::arrow::TimeUnit::SECOND), truncated_timestamps_allowed_(false) {} virtual ~Builder() {} Builder* disable_deprecated_int96_timestamps() { - write_nanos_as_int96_ = false; + write_timestamps_as_int96_ = false; return this; } Builder* enable_deprecated_int96_timestamps() { - write_nanos_as_int96_ = true; + write_timestamps_as_int96_ = true; return this; } @@ -79,19 +79,19 @@ class PARQUET_EXPORT ArrowWriterProperties { std::shared_ptr build() { return std::shared_ptr(new ArrowWriterProperties( - write_nanos_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_, + write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_, truncated_timestamps_allowed_)); } private: - bool write_nanos_as_int96_; + bool write_timestamps_as_int96_; bool coerce_timestamps_enabled_; ::arrow::TimeUnit::type coerce_timestamps_unit_; bool truncated_timestamps_allowed_; }; - bool support_deprecated_int96_timestamps() const { return write_nanos_as_int96_; } + bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; } bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; } ::arrow::TimeUnit::type coerce_timestamps_unit() const { @@ -105,12 +105,12 @@ class PARQUET_EXPORT ArrowWriterProperties { bool coerce_timestamps_enabled, ::arrow::TimeUnit::type coerce_timestamps_unit, bool truncated_timestamps_allowed) - : write_nanos_as_int96_(write_nanos_as_int96), + : write_timestamps_as_int96_(write_nanos_as_int96), coerce_timestamps_enabled_(coerce_timestamps_enabled), coerce_timestamps_unit_(coerce_timestamps_unit), truncated_timestamps_allowed_(truncated_timestamps_allowed) {} - const bool write_nanos_as_int96_; + const bool write_timestamps_as_int96_; const bool coerce_timestamps_enabled_; const ::arrow::TimeUnit::type coerce_timestamps_unit_; const bool truncated_timestamps_allowed_; @@ -208,24 +208,52 @@ namespace internal { * Timestamp conversion constants */ constexpr int64_t kJulianEpochOffsetDays = INT64_C(2440588); -constexpr int64_t kNanosecondsPerDay = INT64_C(86400000000000); -/** - * Converts nanosecond timestamps to Impala (Int96) format - */ -inline void NanosecondsToImpalaTimestamp(const int64_t nanoseconds, - Int96* impala_timestamp) { - int64_t julian_days = (nanoseconds / kNanosecondsPerDay) + kJulianEpochOffsetDays; +template +inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96* impala_timestamp) { + int64_t julian_days = (time / UnitPerDay) + kJulianEpochOffsetDays; (*impala_timestamp).value[2] = (uint32_t)julian_days; - int64_t 
last_day_nanos = nanoseconds % kNanosecondsPerDay; + int64_t last_day_units = time % UnitPerDay; int64_t* impala_last_day_nanos = reinterpret_cast(impala_timestamp); - *impala_last_day_nanos = last_day_nanos; + *impala_last_day_nanos = last_day_units * NanosecondsPerUnit; +} + +constexpr int64_t kSecondsInNanos = INT64_C(1000000000); + +inline void SecondsToImpalaTimestamp(const int64_t seconds, Int96* impala_timestamp) { + ArrowTimestampToImpalaTimestamp(seconds, + impala_timestamp); +} + +constexpr int64_t kMillisecondsInNanos = kSecondsInNanos / INT64_C(1000); + +inline void MillisecondsToImpalaTimestamp(const int64_t milliseconds, + Int96* impala_timestamp) { + ArrowTimestampToImpalaTimestamp( + milliseconds, impala_timestamp); +} + +constexpr int64_t kMicrosecondsInNanos = kMillisecondsInNanos / INT64_C(1000); + +inline void MicrosecondsToImpalaTimestamp(const int64_t microseconds, + Int96* impala_timestamp) { + ArrowTimestampToImpalaTimestamp( + microseconds, impala_timestamp); +} + +constexpr int64_t kNanosecondsInNanos = INT64_C(1); + +inline void NanosecondsToImpalaTimestamp(const int64_t nanoseconds, + Int96* impala_timestamp) { + ArrowTimestampToImpalaTimestamp( + nanoseconds, impala_timestamp); } } // namespace internal } // namespace arrow + } // namespace parquet #endif // PARQUET_ARROW_WRITER_H diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index b27718027b0da..1812f5547abc2 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -175,6 +175,19 @@ struct FixedLenByteArray { using FLBA = FixedLenByteArray; +// Julian day at unix epoch. +// +// The Julian Day Number (JDN) is the integer assigned to a whole solar day in +// the Julian day count starting from noon Universal time, with Julian day +// number 0 assigned to the day starting at noon on Monday, January 1, 4713 BC, +// proleptic Julian calendar (November 24, 4714 BC, in the proleptic Gregorian +// calendar), +constexpr int64_t kJulianToUnixEpochDays = INT64_C(2440588); +constexpr int64_t kSecondsPerDay = INT64_C(60 * 60 * 24); +constexpr int64_t kMillisecondsPerDay = kSecondsPerDay * INT64_C(1000); +constexpr int64_t kMicrosecondsPerDay = kMillisecondsPerDay * INT64_C(1000); +constexpr int64_t kNanosecondsPerDay = kMicrosecondsPerDay * INT64_C(1000); + MANUALLY_ALIGNED_STRUCT(1) Int96 { uint32_t value[3]; }; STRUCT_END(Int96, 12); @@ -192,6 +205,14 @@ static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds)); } +static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) { + int64_t days_since_epoch = i96.value[2] - kJulianToUnixEpochDays; + int64_t nanoseconds = 0; + + memcpy(&nanoseconds, &i96.value, sizeof(int64_t)); + return days_since_epoch * kNanosecondsPerDay + nanoseconds; +} + static inline std::string Int96ToString(const Int96& a) { std::ostringstream result; std::copy(a.value, a.value + 3, std::ostream_iterator(result, " ")); diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index b89145adc4433..feaa890fc6cd9 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -284,8 +284,8 @@ def _sanitize_table(table, new_schema, flavor): Specify if we should use dictionary encoding in general or only for some columns. use_deprecated_int96_timestamps : boolean, default None - Write nanosecond resolution timestamps to INT96 Parquet - format. Defaults to False unless enabled by flavor argument + Write timestamps to INT96 Parquet format. 
Defaults to False unless enabled + by flavor argument. This take priority over the coerce_timestamps option. coerce_timestamps : string, default None Cast timestamps a particular resolution. Valid values: {None, 'ms', 'us'} diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 5c27a9b86a369..82c80e9e09d13 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -844,7 +844,7 @@ def test_date_time_types(): a2 = pa.array(data2, type=t2) t3 = pa.timestamp('us') - start = pd.Timestamp('2000-01-01').value / 1000 + start = pd.Timestamp('2001-01-01').value / 1000 data3 = np.array([start, start + 1, start + 2], dtype='int64') a3 = pa.array(data3, type=t3) @@ -892,8 +892,9 @@ def test_date_time_types(): # date64 as date32 # time32[s] to time32[ms] + # 'timestamp[ms]' is saved as INT96 timestamp # 'timestamp[ns]' is saved as INT96 timestamp - expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7], + expected = pa.Table.from_arrays([a1, a1, a7, a4, a5, ex_a6, a7], ['date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]', 'time32_from64[s]', From 1fd2a25ec0b890a12837cbbfb4c431d2506d1845 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 15 Dec 2018 23:18:42 +0100 Subject: [PATCH 051/328] ARROW-3953: [Python] Compat with pandas 0.24 rename of MultiIndex labels -> codes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: Joris Van den Bossche Author: Krisztián Szűcs Closes #3120 from jorisvandenbossche/pandas-multiindex-codes and squashes the following commits: e5442a5e test no warns 329f3e47 Compat with pandas 0.24 rename of MultiIndex labels -> codes --- python/pyarrow/pandas_compat.py | 14 +++++++++++--- python/pyarrow/tests/test_convert_pandas.py | 10 ++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index ec0e490291384..0eebcf6e1eec3 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -726,6 +726,14 @@ def _pandas_type_to_numpy_type(pandas_type): return np.dtype(pandas_type) +def _get_multiindex_codes(mi): + # compat for pandas < 0.24 (MI labels renamed to codes). + if isinstance(mi, pd.MultiIndex): + return mi.codes if hasattr(mi, 'codes') else mi.labels + else: + return None + + def _reconstruct_columns_from_metadata(columns, column_indexes): """Construct a pandas MultiIndex from `columns` and column index metadata in `column_indexes`. @@ -752,7 +760,7 @@ def _reconstruct_columns_from_metadata(columns, column_indexes): # Get levels and labels, and provide sane defaults if the index has a # single level to avoid if/else spaghetti. 
levels = getattr(columns, 'levels', None) or [columns] - labels = getattr(columns, 'labels', None) or [ + labels = _get_multiindex_codes(columns) or [ pd.RangeIndex(len(level)) for level in levels ] @@ -779,7 +787,7 @@ def _reconstruct_columns_from_metadata(columns, column_indexes): new_levels.append(level) - return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names) + return pd.MultiIndex(new_levels, labels, names=columns.names) def _table_to_blocks(options, block_table, memory_pool, categories): @@ -796,7 +804,7 @@ def _table_to_blocks(options, block_table, memory_pool, categories): def _flatten_single_level_multiindex(index): if isinstance(index, pd.MultiIndex) and index.nlevels == 1: levels, = index.levels - labels, = index.labels + labels, = _get_multiindex_codes(index) # Cheaply check that we do not somehow have duplicate column names if not index.is_unique: diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index ce9d6d117acb2..4d283b3150606 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -176,6 +176,16 @@ def test_multiindex_columns_unicode(self): df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns) _check_pandas_roundtrip(df, preserve_index=True) + def test_multiindex_doesnt_warn(self): + # ARROW-3953: pandas 0.24 rename of MultiIndex labels to codes + columns = pd.MultiIndex.from_arrays([['one', 'two'], ['X', 'Y']]) + df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns) + + with pytest.warns(None) as record: + _check_pandas_roundtrip(df, preserve_index=True) + + assert len(record) == 0 + def test_integer_index_column(self): df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')]) _check_pandas_roundtrip(df, preserve_index=True) From 715cba576db31bf643885cee6d3eb02f90ab001b Mon Sep 17 00:00:00 2001 From: Yosuke Shiro Date: Sun, 16 Dec 2018 10:29:15 +0900 Subject: [PATCH 052/328] ARROW-4035: [Ruby] Support msys2 mingw dependencies Author: Yosuke Shiro Closes #3181 from shiro615/support-msys2-mingw-dependencies and squashes the following commits: e20dce3c Support msys2 mingw dependencies --- ruby/red-arrow/red-arrow.gemspec | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ruby/red-arrow/red-arrow.gemspec b/ruby/red-arrow/red-arrow.gemspec index cca87749ea19c..3f0f68aa332cf 100644 --- a/ruby/red-arrow/red-arrow.gemspec +++ b/ruby/red-arrow/red-arrow.gemspec @@ -52,4 +52,6 @@ Gem::Specification.new do |spec| spec.add_development_dependency("bundler") spec.add_development_dependency("rake") spec.add_development_dependency("test-unit") + + spec.metadata["msys2_mingw_dependencies"] = "apache-arrow" end From ac047b2fd2893b711116bf8c6b7df8b60c6dd8e3 Mon Sep 17 00:00:00 2001 From: Yosuke Shiro Date: Sun, 16 Dec 2018 22:58:47 +0900 Subject: [PATCH 053/328] ARROW-4048: [GLib] Return ChunkedArray instead of Array in gparquet_arrow_file_reader_read_column Because `FileReader::ReadColumn(int i, std::shared_ptr* out)` is deprecated since 0.12. 
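A minimal sketch of the non-deprecated call this binding switches to, assuming `reader` is an already-opened `parquet::arrow::FileReader` (the variable names and error handling are illustrative only):

```
// ReadColumn() now fills a ChunkedArray; the overload that returned a
// single Array is the one deprecated in 0.12.
std::shared_ptr<::arrow::ChunkedArray> chunked_array;
::arrow::Status status = reader->ReadColumn(0, &chunked_array);
if (!status.ok()) {
  // handle the error
}
```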
Author: Yosuke Shiro

Closes #3192 from shiro615/glib-return-chunked-array-instead-of-array and squashes the following commits:

b814c9a0  Add arrow_ prefix to Arrow C++ objects
bd9c466e  Return ChunkedArray instead of Array in gparquet_arrow_file_reader_read_column
---
 c_glib/parquet-glib/arrow-file-reader.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/c_glib/parquet-glib/arrow-file-reader.cpp b/c_glib/parquet-glib/arrow-file-reader.cpp
index 398e85b02c08a..5c16e827fc14b 100644
--- a/c_glib/parquet-glib/arrow-file-reader.cpp
+++ b/c_glib/parquet-glib/arrow-file-reader.cpp
@@ -310,8 +310,8 @@ gparquet_arrow_file_reader_read_column(GParquetArrowFileReader *reader,
     return NULL;
   }
 
-  std::shared_ptr<arrow::Array> arrow_array;
-  status = parquet_arrow_file_reader->ReadColumn(column_index, &arrow_array);
+  std::shared_ptr<arrow::ChunkedArray> arrow_chunked_array;
+  status = parquet_arrow_file_reader->ReadColumn(column_index, &arrow_chunked_array);
   if (!garrow_error_check(error,
                           status,
                           "[parquet][arrow][file-reader][read-column]")) {
@@ -319,7 +319,7 @@ gparquet_arrow_file_reader_read_column(GParquetArrowFileReader *reader,
   }
 
   auto arrow_field = arrow_schema->field(0);
-  auto arrow_column = std::make_shared<arrow::Column>(arrow_field, arrow_array);
+  auto arrow_column = std::make_shared<arrow::Column>(arrow_field, arrow_chunked_array);
   return garrow_column_new_raw(&arrow_column);
 }

From 77d3a46e14c5292024619c1fb08bba444c42b52c Mon Sep 17 00:00:00 2001
From: Kousuke Saruta
Date: Sun, 16 Dec 2018 14:01:53 -0600
Subject: [PATCH 054/328] ARROW-4049: [C++] Arrow never uses glog even though
 glog is linked.

The following is a part of arrow/util/logging.cc.

```
#ifdef ARROW_USE_GLOG
typedef google::LogMessage LoggingProvider;
#else
typedef CerrLog LoggingProvider;
#endif
```

As you see, when ARROW_USE_GLOG is defined, glog is intended to be used, but
that macro is never actually defined, so glog is never used. I've fixed this
by adding an `add_definitions` call when the CMake variable `ARROW_USE_GLOG`
is ON.
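For reference, a rough usage sketch of what the definition enables, assuming the `ARROW_LOG` macro from arrow/util/logging.h:

```
#include "arrow/util/logging.h"

void Example() {
  // With -DARROW_USE_GLOG on the compile line this routes through
  // google::LogMessage; without it, the CerrLog fallback is used.
  ARROW_LOG(INFO) << "glog-backed logging when ARROW_USE_GLOG is defined";
}
```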
Author: Kousuke Saruta Closes #3196 from sarutak/arrow-use-glog and squashes the following commits: 87be74161 Fix to use glog --- cpp/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 54ec1e5ef6501..e3cc3f560a95f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -713,6 +713,7 @@ endif() if (ARROW_USE_GLOG) SET(ARROW_STATIC_LINK_LIBS glog_static ${ARROW_STATIC_LINK_LIBS}) + add_definitions("-DARROW_USE_GLOG") endif() if (ARROW_STATIC_LINK_LIBS) From 5d1934fc3f5c65f70a3966b71c68941b2fd8d362 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Mon, 17 Dec 2018 09:57:20 +0900 Subject: [PATCH 055/328] ARROW-4034: [Ruby] Add support :append option to FileOutputStream Author: Kouhei Sutou Closes #3193 from kou/ruby-file-output-stream-append and squashes the following commits: 6240f4b7 Add support :append option to FileOutputStream --- ruby/red-arrow/lib/arrow/field.rb | 1 + .../red-arrow/lib/arrow/file-output-stream.rb | 34 ++++++++++++ ruby/red-arrow/lib/arrow/loader.rb | 1 + ruby/red-arrow/lib/arrow/table.rb | 1 + .../red-arrow/test/test-file-output-stream.rb | 54 +++++++++++++++++++ 5 files changed, 91 insertions(+) create mode 100644 ruby/red-arrow/lib/arrow/file-output-stream.rb create mode 100644 ruby/red-arrow/test/test-file-output-stream.rb diff --git a/ruby/red-arrow/lib/arrow/field.rb b/ruby/red-arrow/lib/arrow/field.rb index b1ed1149deca9..be5865fd5564c 100644 --- a/ruby/red-arrow/lib/arrow/field.rb +++ b/ruby/red-arrow/lib/arrow/field.rb @@ -18,6 +18,7 @@ module Arrow class Field alias_method :initialize_raw, :initialize + private :initialize_raw def initialize(name, data_type) case data_type when String, Symbol diff --git a/ruby/red-arrow/lib/arrow/file-output-stream.rb b/ruby/red-arrow/lib/arrow/file-output-stream.rb new file mode 100644 index 0000000000000..f39ad14cacf5b --- /dev/null +++ b/ruby/red-arrow/lib/arrow/file-output-stream.rb @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Arrow + class FileOutputStream + alias_method :initialize_raw, :initialize + private :initialize_raw + def initialize(path, options={}) + append = nil + case options + when true, false + append = options + when Hash + append = options[:append] + end + append = false if append.nil? 
+ initialize_raw(path, append) + end + end +end diff --git a/ruby/red-arrow/lib/arrow/loader.rb b/ruby/red-arrow/lib/arrow/loader.rb index 736f25bd60438..2092e461c1786 100644 --- a/ruby/red-arrow/lib/arrow/loader.rb +++ b/ruby/red-arrow/lib/arrow/loader.rb @@ -44,6 +44,7 @@ def require_libraries require "arrow/date64-array" require "arrow/date64-array-builder" require "arrow/field" + require "arrow/file-output-stream" require "arrow/path-extension" require "arrow/record" require "arrow/record-batch" diff --git a/ruby/red-arrow/lib/arrow/table.rb b/ruby/red-arrow/lib/arrow/table.rb index 524517f03b9e6..69a1de31722a3 100644 --- a/ruby/red-arrow/lib/arrow/table.rb +++ b/ruby/red-arrow/lib/arrow/table.rb @@ -29,6 +29,7 @@ def load(path, options={}) end alias_method :initialize_raw, :initialize + private :initialize_raw def initialize(schema_or_raw_table_or_columns, columns=nil) if columns.nil? if schema_or_raw_table_or_columns[0].is_a?(Column) diff --git a/ruby/red-arrow/test/test-file-output-stream.rb b/ruby/red-arrow/test/test-file-output-stream.rb new file mode 100644 index 0000000000000..559406a4e1efe --- /dev/null +++ b/ruby/red-arrow/test/test-file-output-stream.rb @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestFileOutputStream < Test::Unit::TestCase + sub_test_case(".open") do + def setup + @file = Tempfile.open("arrow-file-output-stream") + @file.write("Hello") + @file.close + end + + def test_default + Arrow::FileOutputStream.open(@file.path) do |file| + file.write(" World") + end + assert_equal(" World", File.read(@file.path)) + end + + def test_options_append + Arrow::FileOutputStream.open(@file.path, append: true) do |file| + file.write(" World") + end + assert_equal("Hello World", File.read(@file.path)) + end + + def test_append_true + Arrow::FileOutputStream.open(@file.path, true) do |file| + file.write(" World") + end + assert_equal("Hello World", File.read(@file.path)) + end + + def test_append_false + Arrow::FileOutputStream.open(@file.path, false) do |file| + file.write(" World") + end + assert_equal(" World", File.read(@file.path)) + end + end +end From 63fd350045edca00f4ddd0c2de23f87fecd3f323 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 17 Dec 2018 12:34:27 +0100 Subject: [PATCH 056/328] ARROW-4043: [Packaging/Docker] Python tests on alpine miss pytest dependency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Crossbow tests: [kszucs/crossbow/build-376](https://github.com/kszucs/crossbow/branches/all?utf8=%E2%9C%93&query=build-376) Author: Krisztián Szűcs Closes #3186 from kszucs/ARROW-4043 and squashes the following commits: d4bb8149 missing requirements.txt ab88181d remove redundant pandas dependency b2a89dff install tests dependencies from requirements-test.txt --- python/Dockerfile.alpine | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/Dockerfile.alpine b/python/Dockerfile.alpine index 7eedeac2860b0..ba0f2eb23f549 100644 --- a/python/Dockerfile.alpine +++ b/python/Dockerfile.alpine @@ -27,8 +27,10 @@ RUN export PYTHON_MAJOR=${PYTHON_VERSION:0:1} && \ pip install --upgrade pip setuptools # install python requirements -ADD python/requirements.txt /arrow/python/ -RUN pip install -r /arrow/python/requirements.txt cython pandas +ADD python/requirements.txt \ + python/requirements-test.txt \ + /arrow/python/ +RUN pip install -r /arrow/python/requirements-test.txt cython ENV ARROW_PYTHON=ON \ PYARROW_WITH_PARQUET=0 From 51f5e94612c92e81017898ab753f04dd55a868d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 17 Dec 2018 15:56:45 +0100 Subject: [PATCH 057/328] ARROW-4041: [CI] Python 2.7 run uses Python 3.6 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: Krisztián Szűcs Closes #3190 from kszucs/ARROW-4041 and squashes the following commits: 75d3cc91 remove python from env file 3abec8a7 single conda create command ba6a820e don't update python on travis --- ci/conda_env_python.yml | 1 - ci/travis_install_toolchain.sh | 1 - ci/travis_script_python.sh | 15 ++++++++------- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/ci/conda_env_python.yml b/ci/conda_env_python.yml index c187155275eaa..d3756cbcfa8c9 100644 --- a/ci/conda_env_python.yml +++ b/ci/conda_env_python.yml @@ -22,7 +22,6 @@ nomkl numpy pandas pytest -python rsync setuptools setuptools_scm diff --git a/ci/travis_install_toolchain.sh b/ci/travis_install_toolchain.sh index 86ac56d043b96..82031e8fd362f 100755 --- a/ci/travis_install_toolchain.sh +++ b/ci/travis_install_toolchain.sh @@ -31,7 +31,6 @@ if [ ! 
-e $CPP_TOOLCHAIN ]; then --file=$TRAVIS_BUILD_DIR/ci/conda_env_cpp.yml \ ${CONDA_LLVM} \ ccache \ - curl \ ninja \ nomkl \ python=3.6 diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 6d96ebe2dfb0b..b8385c3834266 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -32,9 +32,6 @@ PYARROW_PYTEST_FLAGS=" -r sxX --durations=15 --parquet" PYTHON_VERSION=$1 CONDA_ENV_DIR=$TRAVIS_BUILD_DIR/pyarrow-test-$PYTHON_VERSION -conda create -y -q -p $CONDA_ENV_DIR python=$PYTHON_VERSION cmake curl -conda activate $CONDA_ENV_DIR - # We should use zlib in the target Python directory to avoid loading # wrong libpython on macOS at run-time. If we use zlib in # $ARROW_BUILD_TOOLCHAIN and libpython3.6m.dylib exists in both @@ -44,19 +41,23 @@ conda activate $CONDA_ENV_DIR # python-test fails. export ZLIB_HOME=$CONDA_ENV_DIR -python --version -which python - if [ $ARROW_TRAVIS_PYTHON_JVM == "1" ]; then CONDA_JVM_DEPS="jpype1" fi -conda install -y -q \ +conda create -y -q -p $CONDA_ENV_DIR \ --file $TRAVIS_BUILD_DIR/ci/conda_env_python.yml \ + cmake \ pip \ numpy=1.13.1 \ + python=${PYTHON_VERSION} \ ${CONDA_JVM_DEPS} +conda activate $CONDA_ENV_DIR + +python --version +which python + if [ "$ARROW_TRAVIS_PYTHON_DOCS" == "1" ] && [ "$PYTHON_VERSION" == "3.6" ]; then # Install documentation dependencies conda install -y -c conda-forge --file ci/conda_env_sphinx.yml From 4cfd6d3877e28624e271e022f7c98a8b1e3c5a5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 17 Dec 2018 16:12:36 +0100 Subject: [PATCH 058/328] ARROW-4045: [Packaging/Python] Add hypothesis test dependency to wheel crossbow tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test builds: [kszucs/crossbow/build-383](https://github.com/kszucs/crossbow/branches/all?utf8=%E2%9C%93&query=build-383) Author: Krisztián Szűcs Closes #3188 from kszucs/ARROW-4045 and squashes the following commits: a2bcdaf4 correct path 11093d97 missing cython on osx f06fb949 pin numpy version in appveyor.yml cabedfba remove last pandas version from tasks.yml 5309a5b2 requirements-wheel 97bc6ead fix requirements.txt path on osx c17d6748 win aa05e743 linux 6be30182 osx --- dev/release/rat_exclude_files.txt | 1 + dev/release/verify-release-candidate.sh | 2 +- dev/tasks/python-wheels/appveyor.yml | 2 +- dev/tasks/python-wheels/linux-test.sh | 2 +- dev/tasks/python-wheels/osx-build.sh | 12 ++++-------- dev/tasks/python-wheels/travis.linux.yml | 1 - dev/tasks/python-wheels/travis.osx.yml | 2 -- dev/tasks/python-wheels/win-build.bat | 2 +- dev/tasks/tasks.yml | 20 -------------------- python/manylinux1/build_arrow.sh | 11 +++-------- python/requirements-test.txt | 1 - python/requirements-wheel.txt | 4 ++++ 12 files changed, 16 insertions(+), 44 deletions(-) create mode 100644 python/requirements-wheel.txt diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index e274d97548068..f2e3f164fa284 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -130,6 +130,7 @@ python/pyarrow/includes/__init__.pxd python/pyarrow/tests/__init__.py python/requirements.txt python/requirements-test.txt +python/requirements-wheel.txt pax_global_header MANIFEST.in __init__.pxd diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 45404b03dfb8a..71324ec12f7c5 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -189,7 +189,7 
@@ test_and_install_cpp() { test_python() { pushd python - pip install -r requirements-test.txt + pip install -r requirements.txt -r requirements-test.txt python setup.py build_ext --inplace --with-parquet --with-plasma py.test pyarrow -v --pdb diff --git a/dev/tasks/python-wheels/appveyor.yml b/dev/tasks/python-wheels/appveyor.yml index 016041a6c6701..c220f922bc45c 100644 --- a/dev/tasks/python-wheels/appveyor.yml +++ b/dev/tasks/python-wheels/appveyor.yml @@ -20,7 +20,7 @@ os: Visual Studio 2015 environment: ARCH: "64" GENERATOR: Visual Studio 14 2015 Win64 - NUMPY: "{{ numpy_version }}" + NUMPY: "1.14.5" PYTHON: "{{ python_version }}" MSVC_DEFAULT_OPTIONS: ON ARROW_SRC: C:\apache-arrow diff --git a/dev/tasks/python-wheels/linux-test.sh b/dev/tasks/python-wheels/linux-test.sh index 163730a9f38da..234ce8d561cec 100755 --- a/dev/tasks/python-wheels/linux-test.sh +++ b/dev/tasks/python-wheels/linux-test.sh @@ -30,5 +30,5 @@ python -c "import pyarrow.parquet" python -c "import pyarrow.plasma" # Run pyarrow tests -pip install pytest pandas +pip install -r /arrow/python/requirements-test.txt pytest --pyargs pyarrow diff --git a/dev/tasks/python-wheels/osx-build.sh b/dev/tasks/python-wheels/osx-build.sh index 5c69904ff4348..22c44c157337f 100755 --- a/dev/tasks/python-wheels/osx-build.sh +++ b/dev/tasks/python-wheels/osx-build.sh @@ -99,9 +99,8 @@ function build_wheel { # build will also work with newer NumPy versions. export ARROW_HOME=`pwd`/arrow-dist export PARQUET_HOME=`pwd`/arrow-dist - if [ -n "$BUILD_DEPENDS" ]; then - pip install $(pip_opts) $BUILD_DEPENDS - fi + + pip install $(pip_opts) -r python/requirements-wheel.txt cython pushd cpp mkdir build @@ -161,10 +160,6 @@ function install_run { wheelhouse="$PWD/python/dist" - # Install test dependencies and built wheel - if [ -n "$TEST_DEPENDS" ]; then - pip install $(pip_opts) $TEST_DEPENDS - fi # Install compatible wheel pip install $(pip_opts) \ $(python $multibuild_dir/supported_wheels.py $wheelhouse/*.whl) @@ -179,7 +174,8 @@ function install_run { python -c "import pyarrow.plasma" # Run pyarrow tests - pip install pytest pytest-faulthandler + pip install $(pip_opts) -r python/requirements-test.txt + py.test --pyargs pyarrow popd diff --git a/dev/tasks/python-wheels/travis.linux.yml b/dev/tasks/python-wheels/travis.linux.yml index 9a8f804d1cc51..17888ccc9f1bb 100644 --- a/dev/tasks/python-wheels/travis.linux.yml +++ b/dev/tasks/python-wheels/travis.linux.yml @@ -42,7 +42,6 @@ script: - docker run --shm-size=2g -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.version }} -e PYTHON_VERSIONS="{{ python_version }},{{ unicode_width }}" - -e WHEEL_VERSION={{ wheel_version }} -v $PWD:/io -v $PWD/../../:/arrow quay.io/xhochy/arrow_manylinux1_x86_64_base:latest /io/build_arrow.sh diff --git a/dev/tasks/python-wheels/travis.osx.yml b/dev/tasks/python-wheels/travis.osx.yml index 2f0d168a3fb46..c6bd010da4ebc 100644 --- a/dev/tasks/python-wheels/travis.osx.yml +++ b/dev/tasks/python-wheels/travis.osx.yml @@ -29,8 +29,6 @@ env: - PYARROW_VERSION={{ arrow.version }} - PYARROW_BUILD_VERBOSE=1 - MB_PYTHON_VERSION={{ python_version }} - - BUILD_DEPENDS="wheel=={{ wheel_version }} numpy=={{ numpy_version }} cython==0.27.3 six" - - TEST_DEPENDS="numpy=={{ numpy_version }} pandas=={{ pandas_version }} six" before_install: - git clone https://github.com/matthew-brett/multibuild # TODO pin it diff --git a/dev/tasks/python-wheels/win-build.bat b/dev/tasks/python-wheels/win-build.bat index 22e306ab1f1eb..f85c8e8b7490e 100644 --- 
a/dev/tasks/python-wheels/win-build.bat +++ b/dev/tasks/python-wheels/win-build.bat @@ -82,7 +82,7 @@ popd @rem test the wheel call deactivate conda create -n wheel-test -q -y python=%PYTHON% ^ - numpy=%NUMPY% pandas pytest + numpy=%NUMPY% pandas pytest hypothesis call activate wheel-test pip install --no-index --find-links=%ARROW_SRC%\python\dist\ pyarrow diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index bd49616f6bd3e..ea104d507eec1 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -146,7 +146,6 @@ tasks: platform: linux template: python-wheels/travis.linux.yml params: - wheel_version: 0.31.1 python_version: 2.7 unicode_width: 16 test_docker_images: [] @@ -157,7 +156,6 @@ tasks: platform: linux template: python-wheels/travis.linux.yml params: - wheel_version: 0.31.1 python_version: 2.7 unicode_width: 32 test_docker_images: @@ -169,7 +167,6 @@ tasks: platform: linux template: python-wheels/travis.linux.yml params: - wheel_version: 0.31.1 python_version: 3.5 unicode_width: 16 test_docker_images: @@ -181,7 +178,6 @@ tasks: platform: linux template: python-wheels/travis.linux.yml params: - wheel_version: 0.31.1 python_version: 3.6 unicode_width: 16 test_docker_images: @@ -193,7 +189,6 @@ tasks: platform: linux template: python-wheels/travis.linux.yml params: - wheel_version: 0.31.1 python_version: 3.7 unicode_width: 16 test_docker_images: @@ -207,10 +202,7 @@ tasks: platform: osx template: python-wheels/travis.osx.yml params: - numpy_version: 1.14.5 - pandas_version: 0.23.0 python_version: 2.7 - wheel_version: 0.31.1 artifacts: - pyarrow-{version}-cp27-cp27m-macosx_10_6_intel.whl @@ -218,10 +210,7 @@ tasks: platform: osx template: python-wheels/travis.osx.yml params: - numpy_version: 1.14.5 - pandas_version: 0.23.0 python_version: 3.5 - wheel_version: 0.31.1 artifacts: - pyarrow-{version}-cp35-cp35m-macosx_10_6_intel.whl @@ -229,10 +218,7 @@ tasks: platform: osx template: python-wheels/travis.osx.yml params: - numpy_version: 1.14.5 - pandas_version: 0.23.0 python_version: 3.6 - wheel_version: 0.31.1 artifacts: - pyarrow-{version}-cp36-cp36m-macosx_10_6_intel.whl @@ -240,10 +226,7 @@ tasks: platform: osx template: python-wheels/travis.osx.yml params: - numpy_version: 1.14.5 - pandas_version: 0.23.0 python_version: 3.7 - wheel_version: 0.31.1 artifacts: - pyarrow-{version}-cp37-cp37m-macosx_10_6_intel.whl @@ -253,7 +236,6 @@ tasks: platform: win template: python-wheels/appveyor.yml params: - numpy_version: 1.14.5 python_version: 3.5 artifacts: - pyarrow-{version}-cp35-cp35m-win_amd64.whl @@ -262,7 +244,6 @@ tasks: platform: win template: python-wheels/appveyor.yml params: - numpy_version: 1.14.5 python_version: 3.6 artifacts: - pyarrow-{version}-cp36-cp36m-win_amd64.whl @@ -271,7 +252,6 @@ tasks: platform: win template: python-wheels/appveyor.yml params: - numpy_version: 1.14.5 python_version: 3.7 artifacts: - pyarrow-{version}-cp37-cp37m-win_amd64.whl diff --git a/python/manylinux1/build_arrow.sh b/python/manylinux1/build_arrow.sh index 904297375ef25..b1d8f8588dfc5 100755 --- a/python/manylinux1/build_arrow.sh +++ b/python/manylinux1/build_arrow.sh @@ -64,11 +64,6 @@ for PYTHON_TUPLE in ${PYTHON_VERSIONS}; do fi fi - # pin wheel, because auditwheel is not compatible with wheel=0.32 - # pin after installing tensorflow, because it updates to wheel=0.32 - # TODO(kszucs): remove after auditwheel properly supports wheel>0.31 - $PIP install "wheel==${WHEEL_VERSION:-0.31.1}" - echo "=== (${PYTHON}) Building Arrow C++ libraries ===" 
ARROW_BUILD_DIR=/tmp/build-PY${PYTHON}-${U_WIDTH} mkdir -p "${ARROW_BUILD_DIR}" @@ -96,6 +91,9 @@ for PYTHON_TUPLE in ${PYTHON_VERSIONS}; do # Check that we don't expose any unwanted symbols /io/scripts/check_arrow_visibility.sh + echo "=== (${PYTHON}) Install the wheel build dependencies ===" + $PIP install -r requirements-wheel.txt + # Clear output directory rm -rf dist/ echo "=== (${PYTHON}) Building wheel ===" @@ -107,9 +105,6 @@ for PYTHON_TUPLE in ${PYTHON_VERSIONS}; do PATH="$PATH:${CPYTHON_PATH}/bin" $PYTHON_INTERPRETER setup.py bdist_wheel PATH="$PATH:${CPYTHON_PATH}/bin" $PYTHON_INTERPRETER setup.py sdist - echo "=== (${PYTHON}) Ensure the existence of mandatory modules ===" - $PIP install -r requirements.txt - echo "=== (${PYTHON}) Tag the wheel with manylinux1 ===" mkdir -p repaired_wheels/ auditwheel -v repair -L . dist/pyarrow-*.whl -w repaired_wheels/ diff --git a/python/requirements-test.txt b/python/requirements-test.txt index 482e88860669a..89af5ecac437c 100644 --- a/python/requirements-test.txt +++ b/python/requirements-test.txt @@ -1,4 +1,3 @@ --r requirements.txt pandas pytest hypothesis diff --git a/python/requirements-wheel.txt b/python/requirements-wheel.txt new file mode 100644 index 0000000000000..c44903efd36cb --- /dev/null +++ b/python/requirements-wheel.txt @@ -0,0 +1,4 @@ +wheel==0.31.1 +six>=1.0.0 +numpy==1.14.5 +futures; python_version < "3.2" From 0190e60e4abd4f07428f8d6c04e76f42f70d4ce3 Mon Sep 17 00:00:00 2001 From: "Korn, Uwe" Date: Mon, 17 Dec 2018 10:01:16 -0600 Subject: [PATCH 059/328] ARROW-4054: [Python] Update gtest, flatbuffers and OpenSSL in manylinux1 base image Author: Korn, Uwe Closes #3202 from xhochy/ARROW-4054 and squashes the following commits: d777fe98a ARROW-4054: Update gtest, flatbuffers and OpenSSL in manylinux1 base image --- python/manylinux1/Dockerfile-x86_64_base | 2 +- .../manylinux1/scripts/build_flatbuffers.sh | 2 +- python/manylinux1/scripts/build_gtest.sh | 25 ++++++++++++++----- python/manylinux1/scripts/build_openssl.sh | 10 +++++--- 4 files changed, 27 insertions(+), 12 deletions(-) diff --git a/python/manylinux1/Dockerfile-x86_64_base b/python/manylinux1/Dockerfile-x86_64_base index d4b84629c1735..8ba205ee3754e 100644 --- a/python/manylinux1/Dockerfile-x86_64_base +++ b/python/manylinux1/Dockerfile-x86_64_base @@ -34,7 +34,7 @@ RUN /install_cmake.sh ADD scripts/build_gtest.sh / RUN /build_gtest.sh -ENV GTEST_HOME /googletest-release-1.7.0 +ENV GTEST_HOME /usr ADD scripts/build_flatbuffers.sh / RUN /build_flatbuffers.sh diff --git a/python/manylinux1/scripts/build_flatbuffers.sh b/python/manylinux1/scripts/build_flatbuffers.sh index 70b184c9a59c9..cae32f5aac959 100755 --- a/python/manylinux1/scripts/build_flatbuffers.sh +++ b/python/manylinux1/scripts/build_flatbuffers.sh @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -export FLATBUFFERS_VERSION=1.9.0 +export FLATBUFFERS_VERSION=1.10.0 curl -sL https://github.com/google/flatbuffers/archive/v${FLATBUFFERS_VERSION}.tar.gz \ -o flatbuffers-${FLATBUFFERS_VERSION}.tar.gz tar xf flatbuffers-${FLATBUFFERS_VERSION}.tar.gz diff --git a/python/manylinux1/scripts/build_gtest.sh b/python/manylinux1/scripts/build_gtest.sh index f921efd489d67..5b29f5ee535c8 100755 --- a/python/manylinux1/scripts/build_gtest.sh +++ b/python/manylinux1/scripts/build_gtest.sh @@ -16,11 +16,24 @@ # specific language governing permissions and limitations # under the License. 
-curl -sL https://github.com/google/googletest/archive/release-1.7.0.tar.gz -o googletest-release-1.7.0.tar.gz -tar xf googletest-release-1.7.0.tar.gz +GTEST_VERSION=1.8.1 + +curl -sL https://github.com/google/googletest/archive/release-${GTEST_VERSION}.tar.gz -o googletest-release-${GTEST_VERSION}.tar.gz +tar xf googletest-release-${GTEST_VERSION}.tar.gz ls -l -pushd googletest-release-1.7.0 -cmake -DCMAKE_CXX_FLAGS='-fPIC' -Dgtest_force_shared_crt=ON . -make -j5 +pushd googletest-release-${GTEST_VERSION} + +mkdir build_so +pushd build_so +cmake -DCMAKE_CXX_FLAGS='-fPIC' -Dgtest_force_shared_crt=ON -DBUILD_SHARED_LIBS=ON -DBUILD_GMOCK=OFF -GNinja -DCMAKE_INSTALL_PREFIX=/usr .. +ninja install +popd + +mkdir build_a +pushd build_a +cmake -DCMAKE_CXX_FLAGS='-fPIC' -Dgtest_force_shared_crt=ON -DBUILD_SHARED_LIBS=OFF -DBUILD_GMOCK=OFF -GNinja -DCMAKE_INSTALL_PREFIX=/usr .. +ninja install +popd + popd -rm -rf googletest-release-1.7.0.tar.gz +rm -rf googletest-release-${GTEST_VERSION}.tar.gz diff --git a/python/manylinux1/scripts/build_openssl.sh b/python/manylinux1/scripts/build_openssl.sh index 1a54d72f04696..622004d37f2c0 100755 --- a/python/manylinux1/scripts/build_openssl.sh +++ b/python/manylinux1/scripts/build_openssl.sh @@ -16,11 +16,13 @@ # specific language governing permissions and limitations # under the License. -wget --no-check-certificate https://www.openssl.org/source/openssl-1.0.2k.tar.gz -O openssl-1.0.2k.tar.gz -tar xf openssl-1.0.2k.tar.gz -pushd openssl-1.0.2k +OPENSSL_VERSION="1.0.2q" + +wget --no-check-certificate https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz -O openssl-${OPENSSL_VERSION}.tar.gz +tar xf openssl-${OPENSSL_VERSION}.tar.gz +pushd openssl-${OPENSSL_VERSION} ./config -fpic shared --prefix=/usr make -j5 make install popd -rm -rf openssl-1.0.2k.tar.gz openssl-1.0.2k +rm -rf openssl-${OPENSSL_VERSION}.tar.gz openssl-${OPENSSL_VERSION} From 39861574f064af741921f80436343268b19a6a2d Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 17 Dec 2018 10:07:18 -0600 Subject: [PATCH 060/328] ARROW-3879: [C++] Fix uninitialized member in CudaBufferWriter Author: Antoine Pitrou Closes #3200 from pitrou/ARROW-3879-cuda-writer-uninitialized-member and squashes the following commits: e857fed22 ARROW-3879: Fix uninitialized member in CudaBufferWriter --- cpp/src/arrow/gpu/cuda_memory.cc | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/gpu/cuda_memory.cc b/cpp/src/arrow/gpu/cuda_memory.cc index cf0c51c23af02..a0da580acf927 100644 --- a/cpp/src/arrow/gpu/cuda_memory.cc +++ b/cpp/src/arrow/gpu/cuda_memory.cc @@ -221,9 +221,16 @@ class CudaBufferWriter::CudaBufferWriterImpl { mutable_data_ = buffer->mutable_data(); size_ = buffer->size(); position_ = 0; + closed_ = false; + } + +#define CHECK_CLOSED() \ + if (closed_) { \ + return Status::Invalid("Operation on closed CudaBufferWriter"); \ } Status Seek(int64_t position) { + CHECK_CLOSED(); if (position < 0 || position >= size_) { return Status::IOError("position out of bounds"); } @@ -234,12 +241,17 @@ class CudaBufferWriter::CudaBufferWriterImpl { Status Close() { if (!closed_) { closed_ = true; - RETURN_NOT_OK(Flush()); + RETURN_NOT_OK(FlushInternal()); } return Status::OK(); } Status Flush() { + CHECK_CLOSED(); + return FlushInternal(); + } + + Status FlushInternal() { if (buffer_size_ > 0 && buffer_position_ > 0) { // Only need to flush when the write has been buffered RETURN_NOT_OK( @@ -253,11 +265,13 @@ class CudaBufferWriter::CudaBufferWriterImpl { 
bool closed() const { return closed_; } Status Tell(int64_t* position) const { + CHECK_CLOSED(); *position = position_; return Status::OK(); } Status Write(const void* data, int64_t nbytes) { + CHECK_CLOSED(); if (nbytes == 0) { return Status::OK(); } @@ -283,11 +297,13 @@ class CudaBufferWriter::CudaBufferWriterImpl { Status WriteAt(int64_t position, const void* data, int64_t nbytes) { std::lock_guard guard(lock_); + CHECK_CLOSED(); RETURN_NOT_OK(Seek(position)); return Write(data, nbytes); } Status SetBufferSize(const int64_t buffer_size) { + CHECK_CLOSED(); if (buffer_position_ > 0) { // Flush any buffered data RETURN_NOT_OK(Flush()); @@ -303,6 +319,8 @@ class CudaBufferWriter::CudaBufferWriterImpl { int64_t buffer_position() const { return buffer_position_; } +#undef CHECK_CLOSED + private: std::shared_ptr context_; std::shared_ptr buffer_; From 836ad52aa54e665704f5bad5234d6cdad83bd20d Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 17 Dec 2018 10:08:54 -0600 Subject: [PATCH 061/328] ARROW-4017: [C++] Move vendored libraries in dedicated directory Also update mapbox::variant to v1.1.5 (I'm not sure which version was previously vendored). Author: Antoine Pitrou Closes #3184 from pitrou/ARROW-4017-vendored-libraries and squashes the following commits: fe69566d7 ARROW-4017: Move vendored libraries in dedicated directory --- LICENSE.txt | 6 +- cpp/CMakeLists.txt | 6 +- cpp/build-support/clang_format_exclusions.txt | 7 +- cpp/build-support/lint_cpp_cli.py | 5 +- cpp/cmake_modules/BuildUtils.cmake | 3 +- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/util/CMakeLists.txt | 3 - cpp/src/arrow/util/parsing.h | 2 +- cpp/src/arrow/util/string_view.h | 2 +- cpp/src/arrow/util/variant.h | 1115 +---------------- cpp/src/arrow/util/variant/optional.h | 100 -- cpp/src/arrow/util/variant/variant_cast.h | 114 -- cpp/src/arrow/util/variant/variant_io.h | 72 -- cpp/src/arrow/util/variant/variant_visitor.h | 69 - .../string_view => vendored}/CMakeLists.txt | 6 +- cpp/src/arrow/{util => vendored}/date.h | 0 .../string_view => vendored}/string_view.hpp | 0 .../{util => vendored}/variant/CMakeLists.txt | 6 +- .../variant/recursive_wrapper.hpp} | 14 +- cpp/src/arrow/vendored/variant/variant.hpp | 1029 +++++++++++++++ cpp/src/arrow/vendored/variant/variant_io.hpp | 47 + .../vendored/variant/variant_visitor.hpp | 40 + .../arrow/{util => vendored}/xxhash/xxhash.c | 0 .../arrow/{util => vendored}/xxhash/xxhash.h | 0 .../gandiva/precompiled/epoch_time_point.h | 2 +- cpp/src/gandiva/to_date_holder.cc | 2 +- cpp/src/plasma/client.cc | 2 +- dev/release/rat_exclude_files.txt | 10 +- 28 files changed, 1165 insertions(+), 1498 deletions(-) delete mode 100644 cpp/src/arrow/util/variant/optional.h delete mode 100644 cpp/src/arrow/util/variant/variant_cast.h delete mode 100644 cpp/src/arrow/util/variant/variant_io.h delete mode 100644 cpp/src/arrow/util/variant/variant_visitor.h rename cpp/src/arrow/{util/string_view => vendored}/CMakeLists.txt (88%) rename cpp/src/arrow/{util => vendored}/date.h (100%) rename cpp/src/arrow/{util/string_view => vendored}/string_view.hpp (100%) rename cpp/src/arrow/{util => vendored}/variant/CMakeLists.txt (83%) rename cpp/src/arrow/{util/variant/recursive_wrapper.h => vendored/variant/recursive_wrapper.hpp} (89%) create mode 100644 cpp/src/arrow/vendored/variant/variant.hpp create mode 100644 cpp/src/arrow/vendored/variant/variant_io.hpp create mode 100644 cpp/src/arrow/vendored/variant/variant_visitor.hpp rename cpp/src/arrow/{util => vendored}/xxhash/xxhash.c (100%) rename 
cpp/src/arrow/{util => vendored}/xxhash/xxhash.h (100%) diff --git a/LICENSE.txt b/LICENSE.txt index 5c9aaddc14ff8..572d3ef548917 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -681,7 +681,7 @@ See the License for the specific language governing permissions and limitations under the License. -------------------------------------------------------------------------------- -The file cpp/src/arrow/util/date.h has the following license (MIT) +The file cpp/src/arrow/vendored/date.h has the following license (MIT) The MIT License (MIT) Copyright (c) 2015, 2016, 2017 Howard Hinnant @@ -736,7 +736,7 @@ SOFTWARE. -------------------------------------------------------------------------------- -The file cpp/src/util/string_view/string_view.hpp has the following license +The file cpp/src/arrow/vendored/string_view.hpp has the following license Boost Software License - Version 1.0 - August 17th, 2003 @@ -764,7 +764,7 @@ DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- -The files in cpp/src/arrow/util/xxhash/ have the following license +The files in cpp/src/arrow/vendored/xxhash/ have the following license (BSD 2-Clause License) xxHash Library diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e3cc3f560a95f..f563199c62470 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -373,12 +373,8 @@ if (UNIX) IF(NOT ((item MATCHES "_generated.h") OR (item MATCHES "pyarrow_api.h") OR (item MATCHES "pyarrow_lib.h") OR - (item MATCHES "xxhash.h") OR - (item MATCHES "xxhash.cc") OR (item MATCHES "config.h") OR - (item MATCHES "util/date.h") OR - (item MATCHES "util/string_view/") OR - (item MATCHES "util/variant") OR + (item MATCHES "vendored/") OR (item MATCHES "zmalloc.h") OR (item MATCHES "ae.h"))) LIST(APPEND FILTERED_LINT_FILES ${item}) diff --git a/cpp/build-support/clang_format_exclusions.txt b/cpp/build-support/clang_format_exclusions.txt index c04523af1db81..2964898f4f24d 100644 --- a/cpp/build-support/clang_format_exclusions.txt +++ b/cpp/build-support/clang_format_exclusions.txt @@ -4,11 +4,6 @@ *pyarrow_lib.h *python/config.h *python/platform.h -*util/date.h -*util/string_view/* -*util/variant.h -*util/variant/* *thirdparty/ae/* -*xxhash.cc -*xxhash.h +*vendored/* *RcppExports.cpp* diff --git a/cpp/build-support/lint_cpp_cli.py b/cpp/build-support/lint_cpp_cli.py index 4c26927740dbb..c8b25dfc5e48f 100644 --- a/cpp/build-support/lint_cpp_cli.py +++ b/cpp/build-support/lint_cpp_cli.py @@ -70,13 +70,10 @@ def lint_file(path): EXCLUSIONS = [ 'arrow/python/iterators.h', - 'arrow/util/date.h', 'arrow/util/hashing.h', 'arrow/util/macros.h', 'arrow/util/parallel.h', - 'arrow/util/string_view/string_view.hpp', - 'arrow/util/xxhash/xxhash.c', - 'arrow/util/xxhash/xxhash.h', + 'arrow/vendored', 'arrow/visitor_inline.h', 'gandiva/cache.h', 'gandiva/jni', diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 7585ae9da8fa8..812d0c39e7fa5 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -580,7 +580,8 @@ function(ARROW_INSTALL_ALL_HEADERS PATH) set(multi_value_args PATTERN) cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) if (NOT ARG_PATTERN) - set(ARG_PATTERN "*.h") + # The .hpp extension is used by some vendored libraries + set(ARG_PATTERN "*.h" "*.hpp") endif() file(GLOB CURRENT_DIRECTORY_HEADERS ${ARG_PATTERN}) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index bec290df2aa37..9291addca0e1c 100644 
--- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -280,6 +280,7 @@ add_subdirectory(array) add_subdirectory(csv) add_subdirectory(io) add_subdirectory(util) +add_subdirectory(vendored) if(ARROW_FLIGHT) add_subdirectory(flight) diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index a09797183212f..b13b2f367b022 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -72,6 +72,3 @@ ADD_ARROW_BENCHMARK(int-util-benchmark) ADD_ARROW_BENCHMARK(lazy-benchmark) ADD_ARROW_BENCHMARK(number-parsing-benchmark) ADD_ARROW_BENCHMARK(utf8-util-benchmark) - -add_subdirectory(string_view) -add_subdirectory(variant) diff --git a/cpp/src/arrow/util/parsing.h b/cpp/src/arrow/util/parsing.h index 23e0361235d3e..46d0f7c322b46 100644 --- a/cpp/src/arrow/util/parsing.h +++ b/cpp/src/arrow/util/parsing.h @@ -34,7 +34,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/date.h" +#include "arrow/vendored/date.h" namespace arrow { namespace internal { diff --git a/cpp/src/arrow/util/string_view.h b/cpp/src/arrow/util/string_view.h index 0f35483e3738e..a1a813726e4f0 100644 --- a/cpp/src/arrow/util/string_view.h +++ b/cpp/src/arrow/util/string_view.h @@ -18,7 +18,7 @@ #ifndef ARROW_UTIL_STRING_VIEW_H #define ARROW_UTIL_STRING_VIEW_H -#include "arrow/util/string_view/string_view.hpp" // IWYU pragma: export +#include "arrow/vendored/string_view.hpp" // IWYU pragma: export namespace arrow { namespace util { diff --git a/cpp/src/arrow/util/variant.h b/cpp/src/arrow/util/variant.h index 1aa9aa3732fdf..cb6500aef8044 100644 --- a/cpp/src/arrow/util/variant.h +++ b/cpp/src/arrow/util/variant.h @@ -1,1105 +1,34 @@ -// Copyright (c) MapBox -// All rights reserved. +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at // -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: +// http://www.apache.org/licenses/LICENSE-2.0 // -// - Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, this -// list of conditions and the following disclaimer in the documentation and/or -// other materials provided with the distribution. -// - Neither the name "MapBox" nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. #ifndef ARROW_UTIL_VARIANT_H #define ARROW_UTIL_VARIANT_H -#include -#include // size_t -#include // operator new -#include // runtime_error -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - - -#ifdef _MSC_VER -// https://msdn.microsoft.com/en-us/library/bw1hbe6y.aspx -# ifdef NDEBUG -# define VARIANT_INLINE __forceinline -# else -# define VARIANT_INLINE //__declspec(noinline) -# endif -#else -# ifdef NDEBUG -# define VARIANT_INLINE //inline __attribute__((always_inline)) -# else -# define VARIANT_INLINE __attribute__((noinline)) -# endif -#endif -// clang-format on - -// Exceptions -#if defined( __EXCEPTIONS) || defined( _MSC_VER) -#define HAS_EXCEPTIONS -#endif - -#define VARIANT_MAJOR_VERSION 1 -#define VARIANT_MINOR_VERSION 1 -#define VARIANT_PATCH_VERSION 0 - -#define VARIANT_VERSION (VARIANT_MAJOR_VERSION * 100000) + (VARIANT_MINOR_VERSION * 100) + (VARIANT_PATCH_VERSION) +#include "arrow/vendored/variant/variant.hpp" // IWYU pragma: export namespace arrow { namespace util { -// XXX This should derive from std::logic_error instead of std::runtime_error. -// See https://github.com/mapbox/variant/issues/48 for details. -class bad_variant_access : public std::runtime_error -{ - -public: - explicit bad_variant_access(const std::string& what_arg) - : runtime_error(what_arg) {} - - explicit bad_variant_access(const char* what_arg) - : runtime_error(what_arg) {} - -}; // class bad_variant_access - -#if !defined(ARROW_VARIANT_MINIMIZE_SIZE) -using type_index_t = std::size_t; -#else -#if defined(ARROW_VARIANT_OPTIMIZE_FOR_SPEED) -using type_index_t = std::uint_fast8_t; -#else -using type_index_t = std::uint_least8_t; -#endif -#endif - -namespace detail { - -static constexpr type_index_t invalid_value = type_index_t(-1); - -template -struct direct_type; - -template -struct direct_type -{ - static constexpr type_index_t index = std::is_same::value - ? 
sizeof...(Types) - : direct_type::index; -}; - -template -struct direct_type -{ - static constexpr type_index_t index = invalid_value; -}; - -#if __cpp_lib_logical_traits >= 201510L - -using std::conjunction; -using std::disjunction; - -#else - -template -struct conjunction : std::true_type {}; - -template -struct conjunction : B1 {}; - -template -struct conjunction : std::conditional::type {}; - -template -struct conjunction : std::conditional, B1>::type {}; - -template -struct disjunction : std::false_type {}; - -template -struct disjunction : B1 {}; - -template -struct disjunction : std::conditional::type {}; - -template -struct disjunction : std::conditional>::type {}; - -#endif - -template -struct convertible_type; - -template -struct convertible_type -{ - static constexpr type_index_t index = std::is_convertible::value - ? disjunction...>::value ? invalid_value : sizeof...(Types) - : convertible_type::index; -}; - -template -struct convertible_type -{ - static constexpr type_index_t index = invalid_value; -}; - -template -struct value_traits -{ - using value_type = typename std::remove_const::type>::type; - using value_type_wrapper = recursive_wrapper; - static constexpr type_index_t direct_index = direct_type::index; - static constexpr bool is_direct = direct_index != invalid_value; - static constexpr type_index_t index_direct_or_wrapper = is_direct ? direct_index : direct_type::index; - static constexpr bool is_direct_or_wrapper = index_direct_or_wrapper != invalid_value; - static constexpr type_index_t index = is_direct_or_wrapper ? index_direct_or_wrapper : convertible_type::index; - static constexpr bool is_valid = index != invalid_value; - static constexpr type_index_t tindex = is_valid ? sizeof...(Types)-index : 0; - using target_type = typename std::tuple_element>::type; -}; - -template -struct enable_if_type -{ - using type = R; -}; - -template -struct result_of_unary_visit -{ - using type = typename std::result_of::type; -}; - -template -struct result_of_unary_visit::type> -{ - using type = typename F::result_type; -}; - -template -struct result_of_binary_visit -{ - using type = typename std::result_of::type; -}; - -template -struct result_of_binary_visit::type> -{ - using type = typename F::result_type; -}; - -template -struct static_max; - -template -struct static_max -{ - static const type_index_t value = arg; -}; - -template -struct static_max -{ - static const type_index_t value = arg1 >= arg2 ? 
static_max::value : static_max::value; -}; - -template -struct variant_helper; - -template -struct variant_helper -{ - VARIANT_INLINE static void destroy(const type_index_t type_index, void* data) - { - if (type_index == sizeof...(Types)) - { - reinterpret_cast(data)->~T(); - } - else - { - variant_helper::destroy(type_index, data); - } - } - - VARIANT_INLINE static void move(const type_index_t old_type_index, void* old_value, void* new_value) - { - if (old_type_index == sizeof...(Types)) - { - new (new_value) T(std::move(*reinterpret_cast(old_value))); - } - else - { - variant_helper::move(old_type_index, old_value, new_value); - } - } - - VARIANT_INLINE static void copy(const type_index_t old_type_index, const void* old_value, void* new_value) - { - if (old_type_index == sizeof...(Types)) - { - new (new_value) T(*reinterpret_cast(old_value)); - } - else - { - variant_helper::copy(old_type_index, old_value, new_value); - } - } -}; - -template <> -struct variant_helper<> -{ - VARIANT_INLINE static void destroy(const type_index_t, void*) {} - VARIANT_INLINE static void move(const type_index_t, void*, void*) {} - VARIANT_INLINE static void copy(const type_index_t, const void*, void*) {} -}; - -template -struct unwrapper -{ - static T const& apply_const(T const& obj) { return obj; } - static T& apply(T& obj) { return obj; } -}; - -template -struct unwrapper> -{ - static auto apply_const(recursive_wrapper const& obj) - -> typename recursive_wrapper::type const& - { - return obj.get(); - } - static auto apply(recursive_wrapper& obj) - -> typename recursive_wrapper::type& - { - return obj.get(); - } -}; - -template -struct unwrapper> -{ - static auto apply_const(std::reference_wrapper const& obj) - -> typename std::reference_wrapper::type const& - { - return obj.get(); - } - static auto apply(std::reference_wrapper& obj) - -> typename std::reference_wrapper::type& - { - return obj.get(); - } -}; - -template -struct dispatcher; - -template -struct dispatcher -{ - VARIANT_INLINE static R apply_const(V const& v, F&& f) - { - if (v.template is()) - { - return f(unwrapper::apply_const(v.template get_unchecked())); - } - else - { - return dispatcher::apply_const(v, std::forward(f)); - } - } - - VARIANT_INLINE static R apply(V& v, F&& f) - { - if (v.template is()) - { - return f(unwrapper::apply(v.template get_unchecked())); - } - else - { - return dispatcher::apply(v, std::forward(f)); - } - } -}; - -template -struct dispatcher -{ - VARIANT_INLINE static R apply_const(V const& v, F&& f) - { - return f(unwrapper::apply_const(v.template get_unchecked())); - } - - VARIANT_INLINE static R apply(V& v, F&& f) - { - return f(unwrapper::apply(v.template get_unchecked())); - } -}; - -template -struct binary_dispatcher_rhs; - -template -struct binary_dispatcher_rhs -{ - VARIANT_INLINE static R apply_const(V const& lhs, V const& rhs, F&& f) - { - if (rhs.template is()) // call binary functor - { - return f(unwrapper::apply_const(lhs.template get_unchecked()), - unwrapper::apply_const(rhs.template get_unchecked())); - } - else - { - return binary_dispatcher_rhs::apply_const(lhs, rhs, std::forward(f)); - } - } - - VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) - { - if (rhs.template is()) // call binary functor - { - return f(unwrapper::apply(lhs.template get_unchecked()), - unwrapper::apply(rhs.template get_unchecked())); - } - else - { - return binary_dispatcher_rhs::apply(lhs, rhs, std::forward(f)); - } - } -}; - -template -struct binary_dispatcher_rhs -{ - VARIANT_INLINE static R apply_const(V const& 
lhs, V const& rhs, F&& f) - { - return f(unwrapper::apply_const(lhs.template get_unchecked()), - unwrapper::apply_const(rhs.template get_unchecked())); - } - - VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) - { - return f(unwrapper::apply(lhs.template get_unchecked()), - unwrapper::apply(rhs.template get_unchecked())); - } -}; - -template -struct binary_dispatcher_lhs; - -template -struct binary_dispatcher_lhs -{ - VARIANT_INLINE static R apply_const(V const& lhs, V const& rhs, F&& f) - { - if (lhs.template is()) // call binary functor - { - return f(unwrapper::apply_const(lhs.template get_unchecked()), - unwrapper::apply_const(rhs.template get_unchecked())); - } - else - { - return binary_dispatcher_lhs::apply_const(lhs, rhs, std::forward(f)); - } - } - - VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) - { - if (lhs.template is()) // call binary functor - { - return f(unwrapper::apply(lhs.template get_unchecked()), - unwrapper::apply(rhs.template get_unchecked())); - } - else - { - return binary_dispatcher_lhs::apply(lhs, rhs, std::forward(f)); - } - } -}; - -template -struct binary_dispatcher_lhs -{ - VARIANT_INLINE static R apply_const(V const& lhs, V const& rhs, F&& f) - { - return f(unwrapper::apply_const(lhs.template get_unchecked()), - unwrapper::apply_const(rhs.template get_unchecked())); - } - - VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) - { - return f(unwrapper::apply(lhs.template get_unchecked()), - unwrapper::apply(rhs.template get_unchecked())); - } -}; - -template -struct binary_dispatcher; - -template -struct binary_dispatcher -{ - VARIANT_INLINE static R apply_const(V const& v0, V const& v1, F&& f) - { - if (v0.template is()) - { - if (v1.template is()) - { - return f(unwrapper::apply_const(v0.template get_unchecked()), - unwrapper::apply_const(v1.template get_unchecked())); // call binary functor - } - else - { - return binary_dispatcher_rhs::apply_const(v0, v1, std::forward(f)); - } - } - else if (v1.template is()) - { - return binary_dispatcher_lhs::apply_const(v0, v1, std::forward(f)); - } - return binary_dispatcher::apply_const(v0, v1, std::forward(f)); - } - - VARIANT_INLINE static R apply(V& v0, V& v1, F&& f) - { - if (v0.template is()) - { - if (v1.template is()) - { - return f(unwrapper::apply(v0.template get_unchecked()), - unwrapper::apply(v1.template get_unchecked())); // call binary functor - } - else - { - return binary_dispatcher_rhs::apply(v0, v1, std::forward(f)); - } - } - else if (v1.template is()) - { - return binary_dispatcher_lhs::apply(v0, v1, std::forward(f)); - } - return binary_dispatcher::apply(v0, v1, std::forward(f)); - } -}; - -template -struct binary_dispatcher -{ - VARIANT_INLINE static R apply_const(V const& v0, V const& v1, F&& f) - { - return f(unwrapper::apply_const(v0.template get_unchecked()), - unwrapper::apply_const(v1.template get_unchecked())); // call binary functor - } - - VARIANT_INLINE static R apply(V& v0, V& v1, F&& f) - { - return f(unwrapper::apply(v0.template get_unchecked()), - unwrapper::apply(v1.template get_unchecked())); // call binary functor - } -}; - -// comparator functors -struct equal_comp -{ - template - bool operator()(T const& lhs, T const& rhs) const - { - return lhs == rhs; - } -}; - -struct less_comp -{ - template - bool operator()(T const& lhs, T const& rhs) const - { - return lhs < rhs; - } -}; - -template -class comparer -{ -public: - explicit comparer(Variant const& lhs) noexcept - : lhs_(lhs) {} - comparer& operator=(comparer const&) = delete; - // visitor - template - bool 
operator()(T const& rhs_content) const - { - T const& lhs_content = lhs_.template get_unchecked(); - return Comp()(lhs_content, rhs_content); - } - -private: - Variant const& lhs_; -}; - -// hashing visitor -struct hasher -{ - template - std::size_t operator()(const T& hashable) const - { - return std::hash{}(hashable); - } -}; - -} // namespace detail - -struct no_init {}; - -template -class variant -{ - static_assert(sizeof...(Types) > 0, "Template parameter type list of variant can not be empty."); - static_assert(!detail::disjunction...>::value, "Variant can not hold reference types. Maybe use std::reference_wrapper?"); - static_assert(!detail::disjunction...>::value, "Variant can not hold array types."); - static_assert(sizeof...(Types) < std::numeric_limits::max(), "Internal index type must be able to accommodate all alternatives."); -private: - static const std::size_t data_size = detail::static_max::value; - static const std::size_t data_align = detail::static_max::value; -public: - struct adapted_variant_tag; - using types = std::tuple; -private: - using first_type = typename std::tuple_element<0, types>::type; - using data_type = typename std::aligned_storage::type; - using helper_type = detail::variant_helper; - - type_index_t type_index; - data_type data; - -public: - VARIANT_INLINE variant() noexcept(std::is_nothrow_default_constructible::value) - : type_index(sizeof...(Types)-1) - { - static_assert(std::is_default_constructible::value, "First type in variant must be default constructible to allow default construction of variant."); - new (&data) first_type(); - } - - VARIANT_INLINE variant(no_init) noexcept - : type_index(detail::invalid_value) {} - - // http://isocpp.org/blog/2012/11/universal-references-in-c11-scott-meyers - template , - typename Enable = typename std::enable_if, typename Traits::value_type>::value>::type > - VARIANT_INLINE variant(T&& val) noexcept(std::is_nothrow_constructible::value) - : type_index(Traits::index) - { - new (&data) typename Traits::target_type(std::forward(val)); - } - - VARIANT_INLINE variant(variant const& old) - : type_index(old.type_index) - { - helper_type::copy(old.type_index, &old.data, &data); - } - - VARIANT_INLINE variant(variant&& old) - noexcept(detail::conjunction...>::value) - : type_index(old.type_index) - { - helper_type::move(old.type_index, &old.data, &data); - } - -private: - VARIANT_INLINE void copy_assign(variant const& rhs) - { - helper_type::destroy(type_index, &data); - type_index = detail::invalid_value; - helper_type::copy(rhs.type_index, &rhs.data, &data); - type_index = rhs.type_index; - } - - VARIANT_INLINE void move_assign(variant&& rhs) - { - helper_type::destroy(type_index, &data); - type_index = detail::invalid_value; - helper_type::move(rhs.type_index, &rhs.data, &data); - type_index = rhs.type_index; - } - -public: - VARIANT_INLINE variant& operator=(variant&& other) - { - move_assign(std::move(other)); - return *this; - } - - VARIANT_INLINE variant& operator=(variant const& other) - { - copy_assign(other); - return *this; - } - - // conversions - // move-assign - template - VARIANT_INLINE variant& operator=(T&& rhs) noexcept - { - variant temp(std::forward(rhs)); - move_assign(std::move(temp)); - return *this; - } - - // copy-assign - template - VARIANT_INLINE variant& operator=(T const& rhs) - { - variant temp(rhs); - copy_assign(temp); - return *this; - } - - template ::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE bool is() const - { - return type_index == 
detail::direct_type::index; - } - - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE bool is() const - { - return type_index == detail::direct_type, Types...>::index; - } - - VARIANT_INLINE bool valid() const - { - return type_index != detail::invalid_value; - } - - template - VARIANT_INLINE void set(Args&&... args) - { - helper_type::destroy(type_index, &data); - type_index = detail::invalid_value; - new (&data) T(std::forward(args)...); - type_index = detail::direct_type::index; - } - - // get_unchecked() - template ::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T& get_unchecked() - { - return *reinterpret_cast(&data); - } - -#ifdef HAS_EXCEPTIONS - // get() - template ::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T& get() - { - if (type_index == detail::direct_type::index) - { - return *reinterpret_cast(&data); - } - else - { - throw bad_variant_access("in get()"); - } - } -#endif - - template ::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T const& get_unchecked() const - { - return *reinterpret_cast(&data); - } - -#ifdef HAS_EXCEPTIONS - template ::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T const& get() const - { - if (type_index == detail::direct_type::index) - { - return *reinterpret_cast(&data); - } - else - { - throw bad_variant_access("in get()"); - } - } -#endif - - // get_unchecked() - T stored as recursive_wrapper - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T& get_unchecked() - { - return (*reinterpret_cast*>(&data)).get(); - } - -#ifdef HAS_EXCEPTIONS - // get() - T stored as recursive_wrapper - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T& get() - { - if (type_index == detail::direct_type, Types...>::index) - { - return (*reinterpret_cast*>(&data)).get(); - } - else - { - throw bad_variant_access("in get()"); - } - } -#endif - - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T const& get_unchecked() const - { - return (*reinterpret_cast const*>(&data)).get(); - } - -#ifdef HAS_EXCEPTIONS - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T const& get() const - { - if (type_index == detail::direct_type, Types...>::index) - { - return (*reinterpret_cast const*>(&data)).get(); - } - else - { - throw bad_variant_access("in get()"); - } - } -#endif - - // get_unchecked() - T stored as std::reference_wrapper - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T& get_unchecked() - { - return (*reinterpret_cast*>(&data)).get(); - } - -#ifdef HAS_EXCEPTIONS - // get() - T stored as std::reference_wrapper - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T& get() - { - if (type_index == detail::direct_type, Types...>::index) - { - return (*reinterpret_cast*>(&data)).get(); - } - else - { - throw bad_variant_access("in get()"); - } - } -#endif - - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T const& get_unchecked() const - { - return (*reinterpret_cast const*>(&data)).get(); - } - -#ifdef HAS_EXCEPTIONS - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T const& get() const - { - if (type_index == detail::direct_type, Types...>::index) - { - return (*reinterpret_cast const*>(&data)).get(); - } - else - { - 
throw bad_variant_access("in get()"); - } - } -#endif - - // This function is deprecated because it returns an internal index field. - // Use which() instead. - ARROW_DEPRECATED("Use which() instead") - VARIANT_INLINE type_index_t get_type_index() const - { - return type_index; - } - - VARIANT_INLINE int which() const noexcept - { - return static_cast(sizeof...(Types) - type_index - 1); - } - - template ::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE static constexpr int which() noexcept - { - return static_cast(sizeof...(Types)-detail::direct_type::index - 1); - } - - // visitor - // unary - template ::type> - auto VARIANT_INLINE static visit(V const& v, F&& f) - -> decltype(detail::dispatcher::apply_const(v, std::forward(f))) - { - return detail::dispatcher::apply_const(v, std::forward(f)); - } - // non-const - template ::type> - auto VARIANT_INLINE static visit(V& v, F&& f) - -> decltype(detail::dispatcher::apply(v, std::forward(f))) - { - return detail::dispatcher::apply(v, std::forward(f)); - } - - // binary - // const - template ::type> - auto VARIANT_INLINE static binary_visit(V const& v0, V const& v1, F&& f) - -> decltype(detail::binary_dispatcher::apply_const(v0, v1, std::forward(f))) - { - return detail::binary_dispatcher::apply_const(v0, v1, std::forward(f)); - } - // non-const - template ::type> - auto VARIANT_INLINE static binary_visit(V& v0, V& v1, F&& f) - -> decltype(detail::binary_dispatcher::apply(v0, v1, std::forward(f))) - { - return detail::binary_dispatcher::apply(v0, v1, std::forward(f)); - } - - // match - // unary - template - auto VARIANT_INLINE match(Fs&&... fs) const - -> decltype(variant::visit(*this, ::arrow::util::make_visitor(std::forward(fs)...))) - { - return variant::visit(*this, ::arrow::util::make_visitor(std::forward(fs)...)); - } - // non-const - template - auto VARIANT_INLINE match(Fs&&... 
fs) - -> decltype(variant::visit(*this, ::arrow::util::make_visitor(std::forward(fs)...))) - { - return variant::visit(*this, ::arrow::util::make_visitor(std::forward(fs)...)); - } - - ~variant() noexcept // no-throw destructor - { - helper_type::destroy(type_index, &data); - } - - // comparison operators - // equality - VARIANT_INLINE bool operator==(variant const& rhs) const - { - assert(valid() && rhs.valid()); - if (this->which() != rhs.which()) - { - return false; - } - detail::comparer visitor(*this); - return visit(rhs, visitor); - } - - VARIANT_INLINE bool operator!=(variant const& rhs) const - { - return !(*this == rhs); - } - - // less than - VARIANT_INLINE bool operator<(variant const& rhs) const - { - assert(valid() && rhs.valid()); - if (this->which() != rhs.which()) - { - return this->which() < rhs.which(); - } - detail::comparer visitor(*this); - return visit(rhs, visitor); - } - VARIANT_INLINE bool operator>(variant const& rhs) const - { - return rhs < *this; - } - VARIANT_INLINE bool operator<=(variant const& rhs) const - { - return !(*this > rhs); - } - VARIANT_INLINE bool operator>=(variant const& rhs) const - { - return !(*this < rhs); - } -}; - -// unary visitor interface -// const -template -auto VARIANT_INLINE apply_visitor(F&& f, V const& v) -> decltype(V::visit(v, std::forward(f))) -{ - return V::visit(v, std::forward(f)); -} - -// non-const -template -auto VARIANT_INLINE apply_visitor(F&& f, V& v) -> decltype(V::visit(v, std::forward(f))) -{ - return V::visit(v, std::forward(f)); -} - -// binary visitor interface -// const -template -auto VARIANT_INLINE apply_visitor(F&& f, V const& v0, V const& v1) -> decltype(V::binary_visit(v0, v1, std::forward(f))) -{ - return V::binary_visit(v0, v1, std::forward(f)); -} - -// non-const -template -auto VARIANT_INLINE apply_visitor(F&& f, V& v0, V& v1) -> decltype(V::binary_visit(v0, v1, std::forward(f))) -{ - return V::binary_visit(v0, v1, std::forward(f)); -} - -// getter interface - -#ifdef HAS_EXCEPTIONS -template -auto get(T& var)->decltype(var.template get()) -{ - return var.template get(); -} -#endif - -template -ResultType& get_unchecked(T& var) -{ - return var.template get_unchecked(); -} - -#ifdef HAS_EXCEPTIONS -template -auto get(T const& var)->decltype(var.template get()) -{ - return var.template get(); -} -#endif - -template -ResultType const& get_unchecked(T const& var) -{ - return var.template get_unchecked(); -} -// variant_size -template -struct variant_size; - -//variable templates is c++14 -//template -//constexpr std::size_t variant_size_v = variant_size::value; - -template -struct variant_size - : variant_size {}; - -template -struct variant_size - : variant_size {}; - -template -struct variant_size - : variant_size {}; - -template -struct variant_size> - : std::integral_constant {}; - -// variant_alternative -template -struct variant_alternative; - -#if defined(__clang__) -#if __has_builtin(__type_pack_element) -#define has_type_pack_element -#endif -#endif - -#if defined(has_type_pack_element) -template -struct variant_alternative> -{ - static_assert(sizeof...(Types) > Index , "Index out of range"); - using type = __type_pack_element; -}; -#else -template -struct variant_alternative> - : variant_alternative> -{ - static_assert(sizeof...(Types) > Index -1 , "Index out of range"); -}; - -template -struct variant_alternative<0, variant> -{ - using type = First; -}; - -#endif - -template -using variant_alternative_t = typename variant_alternative::type; - -template -struct variant_alternative - : 
std::add_const> {}; - -template -struct variant_alternative - : std::add_volatile> {}; - -template -struct variant_alternative - : std::add_cv> {}; +using mapbox::util::apply_visitor; // seems akin to std::visit +using mapbox::util::bad_variant_access; +using mapbox::util::get; +using mapbox::util::variant; -} // namespace util -} // namespace arrow +} // namespace util +} // namespace arrow -#endif // ARROW_UTIL_VARIANT_H +#endif // ARROW_UTIL_VARIANT_H diff --git a/cpp/src/arrow/util/variant/optional.h b/cpp/src/arrow/util/variant/optional.h deleted file mode 100644 index 4c6671061fe80..0000000000000 --- a/cpp/src/arrow/util/variant/optional.h +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) MapBox -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// - Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, this -// list of conditions and the following disclaimer in the documentation and/or -// other materials provided with the distribution. -// - Neither the name "MapBox" nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#ifndef ARROW_UTIL_VARIANT_OPTIONAL_H -#define ARROW_UTIL_VARIANT_OPTIONAL_H - -#pragma message("This implementation of optional is deprecated. See https://github.com/mapbox/variant/issues/64.") - -#include -#include - -#include - -namespace arrow { -namespace util { - -template -class optional -{ - static_assert(!std::is_reference::value, "optional doesn't support references"); - - struct none_type - { - }; - - variant variant_; - -public: - optional() = default; - - optional(optional const& rhs) - { - if (this != &rhs) - { // protect against invalid self-assignment - variant_ = rhs.variant_; - } - } - - optional(T const& v) { variant_ = v; } - - explicit operator bool() const noexcept { return variant_.template is(); } - - T const& get() const { return variant_.template get(); } - T& get() { return variant_.template get(); } - - T const& operator*() const { return this->get(); } - T operator*() { return this->get(); } - - optional& operator=(T const& v) - { - variant_ = v; - return *this; - } - - optional& operator=(optional const& rhs) - { - if (this != &rhs) - { - variant_ = rhs.variant_; - } - return *this; - } - - template - void emplace(Args&&... 
args) - { - variant_ = T{std::forward(args)...}; - } - - void reset() { variant_ = none_type{}; } - -}; // class optional - -} // namespace util -} // namespace arrow - -#endif // ARROW_UTIL_VARIANT_OPTIONAL_H diff --git a/cpp/src/arrow/util/variant/variant_cast.h b/cpp/src/arrow/util/variant/variant_cast.h deleted file mode 100644 index 71ae80b5dfab6..0000000000000 --- a/cpp/src/arrow/util/variant/variant_cast.h +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright (c) MapBox -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// - Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, this -// list of conditions and the following disclaimer in the documentation and/or -// other materials provided with the distribution. -// - Neither the name "MapBox" nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -#ifndef ARROW_UTIL_VARIANT_CAST_H -#define ARROW_UTIL_VARIANT_CAST_H - -#include - -#include "arrow/util/macros.h" - -namespace arrow { -namespace util { - -namespace detail { - -template -class static_caster -{ -public: - template - T& operator()(V& v) const - { - return static_cast(v); - } -}; - -template -class dynamic_caster -{ -public: - using result_type = T&; - template - T& operator()(V& v, typename std::enable_if::value>::type* = NULLPTR) const - { - throw std::bad_cast(); - } - template - T& operator()(V& v, typename std::enable_if::value>::type* = NULLPTR) const - { - return dynamic_cast(v); - } -}; - -template -class dynamic_caster -{ -public: - using result_type = T*; - template - T* operator()(V& v, typename std::enable_if::value>::type* = NULLPTR) const - { - return NULLPTR; - } - template - T* operator()(V& v, typename std::enable_if::value>::type* = NULLPTR) const - { - return dynamic_cast(&v); - } -}; -} - -template -typename detail::dynamic_caster::result_type -dynamic_variant_cast(V& v) -{ - return arrow::util::apply_visitor(detail::dynamic_caster(), v); -} - -template -typename detail::dynamic_caster::result_type -dynamic_variant_cast(const V& v) -{ - return arrow::util::apply_visitor(detail::dynamic_caster(), v); -} - -template -T& static_variant_cast(V& v) -{ - return arrow::util::apply_visitor(detail::static_caster(), v); -} - -template -const T& static_variant_cast(const V& v) -{ - return arrow::util::apply_visitor(detail::static_caster(), v); -} - -} // namespace util -} // namespace arrow - -#endif // ARROW_UTIL_VARIANT_CAST_H diff --git a/cpp/src/arrow/util/variant/variant_io.h b/cpp/src/arrow/util/variant/variant_io.h deleted file mode 100644 index 5541a81f7035f..0000000000000 --- a/cpp/src/arrow/util/variant/variant_io.h +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) MapBox -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// - Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, this -// list of conditions and the following disclaimer in the documentation and/or -// other materials provided with the distribution. -// - Neither the name "MapBox" nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
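With the mapbox sources relocated under cpp/src/arrow/vendored/, arrow/util/variant.h becomes a thin shim that re-exports the vendored types (variant, get, apply_visitor, bad_variant_access) into the arrow::util namespace, so existing callers compile unchanged. A small usage sketch under that assumption follows; the Describe functor is purely illustrative.

#include <iostream>
#include <string>

#include "arrow/util/variant.h"  // now a shim over arrow/vendored/variant/variant.hpp

// Visitor functor; the explicit result_type matches how mapbox's dispatcher
// deduces the visit's return type when the functor provides one.
struct Describe {
  using result_type = std::string;
  std::string operator()(int v) const { return "int: " + std::to_string(v); }
  std::string operator()(const std::string& v) const { return "string: " + v; }
};

int main() {
  arrow::util::variant<int, std::string> v = 42;
  std::cout << arrow::util::apply_visitor(Describe{}, v) << std::endl;  // int: 42

  v = std::string("hello");
  std::cout << arrow::util::get<std::string>(v) << std::endl;           // hello
  return 0;
}

Because the functor handles every alternative of the variant, the dispatch resolves at compile time and throws bad_variant_access only from checked get<T>() calls on a mismatched alternative.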
- -#ifndef ARROW_UTIL_VARIANT_IO_H -#define ARROW_UTIL_VARIANT_IO_H - -#include - -#include - -namespace arrow { -namespace util { - -namespace detail { -// operator<< helper -template -class printer -{ -public: - explicit printer(Out& out) - : out_(out) {} - printer& operator=(printer const&) = delete; - - // visitor - template - void operator()(T const& operand) const - { - out_ << operand; - } - -private: - Out& out_; -}; -} - -// operator<< -template -VARIANT_INLINE std::basic_ostream& -operator<<(std::basic_ostream& out, variant const& rhs) -{ - detail::printer> visitor(out); - apply_visitor(visitor, rhs); - return out; -} - -} // namespace util -} // namespace arrow - -#endif // ARROW_UTIL_VARIANT_IO_H diff --git a/cpp/src/arrow/util/variant/variant_visitor.h b/cpp/src/arrow/util/variant/variant_visitor.h deleted file mode 100644 index 66b1dfea3d7c9..0000000000000 --- a/cpp/src/arrow/util/variant/variant_visitor.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) MapBox -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// - Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, this -// list of conditions and the following disclaimer in the documentation and/or -// other materials provided with the distribution. -// - Neither the name "MapBox" nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#ifndef ARROW_UTIL_VARIANT_VISITOR_HPP -#define ARROW_UTIL_VARIANT_VISITOR_HPP - -#include - -namespace arrow { -namespace util { - -template -struct visitor; - -template -struct visitor : Fn -{ - using Fn::operator(); - - template - visitor(T&& fn) : Fn(std::forward(fn)) {} -}; - -template -struct visitor : Fn, visitor -{ - using Fn::operator(); - using visitor::operator(); - - template - visitor(T&& fn, Ts&&... fns) - : Fn(std::forward(fn)) - , visitor(std::forward(fns)...) {} -}; - -template -visitor::type...> make_visitor(Fns&&... 
fns) -{ - return visitor::type...> - (std::forward(fns)...); -} - -} // namespace util -} // namespace arrow - -#endif // ARROW_UTIL_VARIANT_VISITOR_HPP diff --git a/cpp/src/arrow/util/string_view/CMakeLists.txt b/cpp/src/arrow/vendored/CMakeLists.txt similarity index 88% rename from cpp/src/arrow/util/string_view/CMakeLists.txt rename to cpp/src/arrow/vendored/CMakeLists.txt index 7e553077db1ad..04ea67aa45d04 100644 --- a/cpp/src/arrow/util/string_view/CMakeLists.txt +++ b/cpp/src/arrow/vendored/CMakeLists.txt @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. -install(FILES - string_view.hpp - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/util/string_view") +ARROW_INSTALL_ALL_HEADERS("arrow/vendored") + +add_subdirectory(variant) diff --git a/cpp/src/arrow/util/date.h b/cpp/src/arrow/vendored/date.h similarity index 100% rename from cpp/src/arrow/util/date.h rename to cpp/src/arrow/vendored/date.h diff --git a/cpp/src/arrow/util/string_view/string_view.hpp b/cpp/src/arrow/vendored/string_view.hpp similarity index 100% rename from cpp/src/arrow/util/string_view/string_view.hpp rename to cpp/src/arrow/vendored/string_view.hpp diff --git a/cpp/src/arrow/util/variant/CMakeLists.txt b/cpp/src/arrow/vendored/variant/CMakeLists.txt similarity index 83% rename from cpp/src/arrow/util/variant/CMakeLists.txt rename to cpp/src/arrow/vendored/variant/CMakeLists.txt index b7a5692b6207c..de26f938d72f3 100644 --- a/cpp/src/arrow/util/variant/CMakeLists.txt +++ b/cpp/src/arrow/vendored/variant/CMakeLists.txt @@ -15,8 +15,4 @@ # specific language governing permissions and limitations # under the License. -####################################### -# arrow_util_variant -####################################### - -ARROW_INSTALL_ALL_HEADERS("arrow/util/variant") +ARROW_INSTALL_ALL_HEADERS("arrow/vendored/variant") diff --git a/cpp/src/arrow/util/variant/recursive_wrapper.h b/cpp/src/arrow/vendored/variant/recursive_wrapper.hpp similarity index 89% rename from cpp/src/arrow/util/variant/recursive_wrapper.h rename to cpp/src/arrow/vendored/variant/recursive_wrapper.hpp index c9d9385394b38..96b6a3f217f5b 100644 --- a/cpp/src/arrow/util/variant/recursive_wrapper.h +++ b/cpp/src/arrow/vendored/variant/recursive_wrapper.hpp @@ -1,7 +1,9 @@ -#ifndef ARROW_UTIL_VARIANT_RECURSIVE_WRAPPER_H -#define ARROW_UTIL_VARIANT_RECURSIVE_WRAPPER_H +// Vendored from https://github.com/mapbox/variant at tag v1.1.5 -// Based on variant/recursive_wrapper.h from boost. +#ifndef MAPBOX_UTIL_RECURSIVE_WRAPPER_HPP +#define MAPBOX_UTIL_RECURSIVE_WRAPPER_HPP + +// Based on variant/recursive_wrapper.hpp from boost. 
// // Original license: // @@ -15,7 +17,7 @@ #include #include -namespace arrow { +namespace mapbox { namespace util { template @@ -117,6 +119,6 @@ inline void swap(recursive_wrapper& lhs, recursive_wrapper& rhs) noexcept lhs.swap(rhs); } } // namespace util -} // namespace arrow +} // namespace mapbox -#endif // ARROW_UTIL_VARIANT_RECURSIVE_WRAPPER_H +#endif // MAPBOX_UTIL_RECURSIVE_WRAPPER_HPP diff --git a/cpp/src/arrow/vendored/variant/variant.hpp b/cpp/src/arrow/vendored/variant/variant.hpp new file mode 100644 index 0000000000000..bb399dece1d57 --- /dev/null +++ b/cpp/src/arrow/vendored/variant/variant.hpp @@ -0,0 +1,1029 @@ +// Vendored from https://github.com/mapbox/variant at tag v1.1.5 + +#ifndef MAPBOX_UTIL_VARIANT_HPP +#define MAPBOX_UTIL_VARIANT_HPP + +#include +#include // size_t +#include // operator new +#include // runtime_error +#include +#include +#include +#include +#include +#include + +#include "recursive_wrapper.hpp" +#include "variant_visitor.hpp" + +// clang-format off +// [[deprecated]] is only available in C++14, use this for the time being +#if __cplusplus <= 201103L +# ifdef __GNUC__ +# define MAPBOX_VARIANT_DEPRECATED __attribute__((deprecated)) +# elif defined(_MSC_VER) +# define MAPBOX_VARIANT_DEPRECATED __declspec(deprecated) +# else +# define MAPBOX_VARIANT_DEPRECATED +# endif +#else +# define MAPBOX_VARIANT_DEPRECATED [[deprecated]] +#endif + + +#ifdef _MSC_VER +// https://msdn.microsoft.com/en-us/library/bw1hbe6y.aspx +# ifdef NDEBUG +# define VARIANT_INLINE __forceinline +# else +# define VARIANT_INLINE //__declspec(noinline) +# endif +#else +# ifdef NDEBUG +# define VARIANT_INLINE //inline __attribute__((always_inline)) +# else +# define VARIANT_INLINE __attribute__((noinline)) +# endif +#endif +// clang-format on + +// Exceptions +#if defined( __EXCEPTIONS) || defined( _MSC_VER) +#define HAS_EXCEPTIONS +#endif + +#define VARIANT_MAJOR_VERSION 1 +#define VARIANT_MINOR_VERSION 1 +#define VARIANT_PATCH_VERSION 0 + +#define VARIANT_VERSION (VARIANT_MAJOR_VERSION * 100000) + (VARIANT_MINOR_VERSION * 100) + (VARIANT_PATCH_VERSION) + +namespace mapbox { +namespace util { + +// XXX This should derive from std::logic_error instead of std::runtime_error. +// See https://github.com/mapbox/variant/issues/48 for details. +class bad_variant_access : public std::runtime_error +{ + +public: + explicit bad_variant_access(const std::string& what_arg) + : runtime_error(what_arg) {} + + explicit bad_variant_access(const char* what_arg) + : runtime_error(what_arg) {} + +}; // class bad_variant_access + +template +struct MAPBOX_VARIANT_DEPRECATED static_visitor +{ + using result_type = R; + +protected: + static_visitor() {} + ~static_visitor() {} +}; + +namespace detail { + +static constexpr std::size_t invalid_value = std::size_t(-1); + +template +struct direct_type; + +template +struct direct_type +{ + static constexpr std::size_t index = std::is_same::value + ? 
sizeof...(Types) + : direct_type::index; +}; + +template +struct direct_type +{ + static constexpr std::size_t index = invalid_value; +}; + +#if __cpp_lib_logical_traits >= 201510L + +using std::conjunction; +using std::disjunction; + +#else + +template +struct conjunction : std::true_type {}; + +template +struct conjunction : B1 {}; + +template +struct conjunction : std::conditional::type {}; + +template +struct conjunction : std::conditional, B1>::type {}; + +template +struct disjunction : std::false_type {}; + +template +struct disjunction : B1 {}; + +template +struct disjunction : std::conditional::type {}; + +template +struct disjunction : std::conditional>::type {}; + +#endif + +template +struct convertible_type; + +template +struct convertible_type +{ + static constexpr std::size_t index = std::is_convertible::value + ? disjunction...>::value ? invalid_value : sizeof...(Types) + : convertible_type::index; +}; + +template +struct convertible_type +{ + static constexpr std::size_t index = invalid_value; +}; + +template +struct value_traits +{ + using value_type = typename std::remove_const::type>::type; + static constexpr std::size_t direct_index = direct_type::index; + static constexpr bool is_direct = direct_index != invalid_value; + static constexpr std::size_t index = is_direct ? direct_index : convertible_type::index; + static constexpr bool is_valid = index != invalid_value; + static constexpr std::size_t tindex = is_valid ? sizeof...(Types)-index : 0; + using target_type = typename std::tuple_element>::type; +}; + +template +struct enable_if_type +{ + using type = R; +}; + +template +struct result_of_unary_visit +{ + using type = typename std::result_of::type; +}; + +template +struct result_of_unary_visit::type> +{ + using type = typename F::result_type; +}; + +template +struct result_of_binary_visit +{ + using type = typename std::result_of::type; +}; + +template +struct result_of_binary_visit::type> +{ + using type = typename F::result_type; +}; + +template +struct static_max; + +template +struct static_max +{ + static const std::size_t value = arg; +}; + +template +struct static_max +{ + static const std::size_t value = arg1 >= arg2 ? 
static_max::value : static_max::value; +}; + +template +struct variant_helper; + +template +struct variant_helper +{ + VARIANT_INLINE static void destroy(const std::size_t type_index, void* data) + { + if (type_index == sizeof...(Types)) + { + reinterpret_cast(data)->~T(); + } + else + { + variant_helper::destroy(type_index, data); + } + } + + VARIANT_INLINE static void move(const std::size_t old_type_index, void* old_value, void* new_value) + { + if (old_type_index == sizeof...(Types)) + { + new (new_value) T(std::move(*reinterpret_cast(old_value))); + } + else + { + variant_helper::move(old_type_index, old_value, new_value); + } + } + + VARIANT_INLINE static void copy(const std::size_t old_type_index, const void* old_value, void* new_value) + { + if (old_type_index == sizeof...(Types)) + { + new (new_value) T(*reinterpret_cast(old_value)); + } + else + { + variant_helper::copy(old_type_index, old_value, new_value); + } + } +}; + +template <> +struct variant_helper<> +{ + VARIANT_INLINE static void destroy(const std::size_t, void*) {} + VARIANT_INLINE static void move(const std::size_t, void*, void*) {} + VARIANT_INLINE static void copy(const std::size_t, const void*, void*) {} +}; + +template +struct unwrapper +{ + static T const& apply_const(T const& obj) { return obj; } + static T& apply(T& obj) { return obj; } +}; + +template +struct unwrapper> +{ + static auto apply_const(recursive_wrapper const& obj) + -> typename recursive_wrapper::type const& + { + return obj.get(); + } + static auto apply(recursive_wrapper& obj) + -> typename recursive_wrapper::type& + { + return obj.get(); + } +}; + +template +struct unwrapper> +{ + static auto apply_const(std::reference_wrapper const& obj) + -> typename std::reference_wrapper::type const& + { + return obj.get(); + } + static auto apply(std::reference_wrapper& obj) + -> typename std::reference_wrapper::type& + { + return obj.get(); + } +}; + +template +struct dispatcher; + +template +struct dispatcher +{ + VARIANT_INLINE static R apply_const(V const& v, F&& f) + { + if (v.template is()) + { + return f(unwrapper::apply_const(v.template get_unchecked())); + } + else + { + return dispatcher::apply_const(v, std::forward(f)); + } + } + + VARIANT_INLINE static R apply(V& v, F&& f) + { + if (v.template is()) + { + return f(unwrapper::apply(v.template get_unchecked())); + } + else + { + return dispatcher::apply(v, std::forward(f)); + } + } +}; + +template +struct dispatcher +{ + VARIANT_INLINE static R apply_const(V const& v, F&& f) + { + return f(unwrapper::apply_const(v.template get_unchecked())); + } + + VARIANT_INLINE static R apply(V& v, F&& f) + { + return f(unwrapper::apply(v.template get_unchecked())); + } +}; + +template +struct binary_dispatcher_rhs; + +template +struct binary_dispatcher_rhs +{ + VARIANT_INLINE static R apply_const(V const& lhs, V const& rhs, F&& f) + { + if (rhs.template is()) // call binary functor + { + return f(unwrapper::apply_const(lhs.template get_unchecked()), + unwrapper::apply_const(rhs.template get_unchecked())); + } + else + { + return binary_dispatcher_rhs::apply_const(lhs, rhs, std::forward(f)); + } + } + + VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) + { + if (rhs.template is()) // call binary functor + { + return f(unwrapper::apply(lhs.template get_unchecked()), + unwrapper::apply(rhs.template get_unchecked())); + } + else + { + return binary_dispatcher_rhs::apply(lhs, rhs, std::forward(f)); + } + } +}; + +template +struct binary_dispatcher_rhs +{ + VARIANT_INLINE static R apply_const(V const& lhs, V 
const& rhs, F&& f) + { + return f(unwrapper::apply_const(lhs.template get_unchecked()), + unwrapper::apply_const(rhs.template get_unchecked())); + } + + VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) + { + return f(unwrapper::apply(lhs.template get_unchecked()), + unwrapper::apply(rhs.template get_unchecked())); + } +}; + +template +struct binary_dispatcher_lhs; + +template +struct binary_dispatcher_lhs +{ + VARIANT_INLINE static R apply_const(V const& lhs, V const& rhs, F&& f) + { + if (lhs.template is()) // call binary functor + { + return f(unwrapper::apply_const(lhs.template get_unchecked()), + unwrapper::apply_const(rhs.template get_unchecked())); + } + else + { + return binary_dispatcher_lhs::apply_const(lhs, rhs, std::forward(f)); + } + } + + VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) + { + if (lhs.template is()) // call binary functor + { + return f(unwrapper::apply(lhs.template get_unchecked()), + unwrapper::apply(rhs.template get_unchecked())); + } + else + { + return binary_dispatcher_lhs::apply(lhs, rhs, std::forward(f)); + } + } +}; + +template +struct binary_dispatcher_lhs +{ + VARIANT_INLINE static R apply_const(V const& lhs, V const& rhs, F&& f) + { + return f(unwrapper::apply_const(lhs.template get_unchecked()), + unwrapper::apply_const(rhs.template get_unchecked())); + } + + VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) + { + return f(unwrapper::apply(lhs.template get_unchecked()), + unwrapper::apply(rhs.template get_unchecked())); + } +}; + +template +struct binary_dispatcher; + +template +struct binary_dispatcher +{ + VARIANT_INLINE static R apply_const(V const& v0, V const& v1, F&& f) + { + if (v0.template is()) + { + if (v1.template is()) + { + return f(unwrapper::apply_const(v0.template get_unchecked()), + unwrapper::apply_const(v1.template get_unchecked())); // call binary functor + } + else + { + return binary_dispatcher_rhs::apply_const(v0, v1, std::forward(f)); + } + } + else if (v1.template is()) + { + return binary_dispatcher_lhs::apply_const(v0, v1, std::forward(f)); + } + return binary_dispatcher::apply_const(v0, v1, std::forward(f)); + } + + VARIANT_INLINE static R apply(V& v0, V& v1, F&& f) + { + if (v0.template is()) + { + if (v1.template is()) + { + return f(unwrapper::apply(v0.template get_unchecked()), + unwrapper::apply(v1.template get_unchecked())); // call binary functor + } + else + { + return binary_dispatcher_rhs::apply(v0, v1, std::forward(f)); + } + } + else if (v1.template is()) + { + return binary_dispatcher_lhs::apply(v0, v1, std::forward(f)); + } + return binary_dispatcher::apply(v0, v1, std::forward(f)); + } +}; + +template +struct binary_dispatcher +{ + VARIANT_INLINE static R apply_const(V const& v0, V const& v1, F&& f) + { + return f(unwrapper::apply_const(v0.template get_unchecked()), + unwrapper::apply_const(v1.template get_unchecked())); // call binary functor + } + + VARIANT_INLINE static R apply(V& v0, V& v1, F&& f) + { + return f(unwrapper::apply(v0.template get_unchecked()), + unwrapper::apply(v1.template get_unchecked())); // call binary functor + } +}; + +// comparator functors +struct equal_comp +{ + template + bool operator()(T const& lhs, T const& rhs) const + { + return lhs == rhs; + } +}; + +struct less_comp +{ + template + bool operator()(T const& lhs, T const& rhs) const + { + return lhs < rhs; + } +}; + +template +class comparer +{ +public: + explicit comparer(Variant const& lhs) noexcept + : lhs_(lhs) {} + comparer& operator=(comparer const&) = delete; + // visitor + template + bool operator()(T 
const& rhs_content) const + { + T const& lhs_content = lhs_.template get_unchecked(); + return Comp()(lhs_content, rhs_content); + } + +private: + Variant const& lhs_; +}; + +// hashing visitor +struct hasher +{ + template + std::size_t operator()(const T& hashable) const + { + return std::hash{}(hashable); + } +}; + +} // namespace detail + +struct no_init +{ +}; + +template +class variant +{ + static_assert(sizeof...(Types) > 0, "Template parameter type list of variant can not be empty"); + static_assert(!detail::disjunction...>::value, "Variant can not hold reference types. Maybe use std::reference_wrapper?"); + +private: + static const std::size_t data_size = detail::static_max::value; + static const std::size_t data_align = detail::static_max::value; +public: + struct adapted_variant_tag; + using types = std::tuple; +private: + using first_type = typename std::tuple_element<0, types>::type; + using data_type = typename std::aligned_storage::type; + using helper_type = detail::variant_helper; + + std::size_t type_index; + data_type data; + +public: + VARIANT_INLINE variant() noexcept(std::is_nothrow_default_constructible::value) + : type_index(sizeof...(Types)-1) + { + static_assert(std::is_default_constructible::value, "First type in variant must be default constructible to allow default construction of variant"); + new (&data) first_type(); + } + + VARIANT_INLINE variant(no_init) noexcept + : type_index(detail::invalid_value) {} + + // http://isocpp.org/blog/2012/11/universal-references-in-c11-scott-meyers + template , + typename Enable = typename std::enable_if, typename Traits::value_type>::value>::type > + VARIANT_INLINE variant(T&& val) noexcept(std::is_nothrow_constructible::value) + : type_index(Traits::index) + { + new (&data) typename Traits::target_type(std::forward(val)); + } + + VARIANT_INLINE variant(variant const& old) + : type_index(old.type_index) + { + helper_type::copy(old.type_index, &old.data, &data); + } + + VARIANT_INLINE variant(variant&& old) + noexcept(detail::conjunction...>::value) + : type_index(old.type_index) + { + helper_type::move(old.type_index, &old.data, &data); + } + +private: + VARIANT_INLINE void copy_assign(variant const& rhs) + { + helper_type::destroy(type_index, &data); + type_index = detail::invalid_value; + helper_type::copy(rhs.type_index, &rhs.data, &data); + type_index = rhs.type_index; + } + + VARIANT_INLINE void move_assign(variant&& rhs) + { + helper_type::destroy(type_index, &data); + type_index = detail::invalid_value; + helper_type::move(rhs.type_index, &rhs.data, &data); + type_index = rhs.type_index; + } + +public: + VARIANT_INLINE variant& operator=(variant&& other) + { + move_assign(std::move(other)); + return *this; + } + + VARIANT_INLINE variant& operator=(variant const& other) + { + copy_assign(other); + return *this; + } + + // conversions + // move-assign + template + VARIANT_INLINE variant& operator=(T&& rhs) noexcept + { + variant temp(std::forward(rhs)); + move_assign(std::move(temp)); + return *this; + } + + // copy-assign + template + VARIANT_INLINE variant& operator=(T const& rhs) + { + variant temp(rhs); + copy_assign(temp); + return *this; + } + + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE bool is() const + { + return type_index == detail::direct_type::index; + } + + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE bool is() const + { + return type_index == detail::direct_type, Types...>::index; + } + + VARIANT_INLINE bool valid() const + { 
+ return type_index != detail::invalid_value; + } + + template + VARIANT_INLINE void set(Args&&... args) + { + helper_type::destroy(type_index, &data); + type_index = detail::invalid_value; + new (&data) T(std::forward(args)...); + type_index = detail::direct_type::index; + } + + // get_unchecked() + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get_unchecked() + { + return *reinterpret_cast(&data); + } + +#ifdef HAS_EXCEPTIONS + // get() + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get() + { + if (type_index == detail::direct_type::index) + { + return *reinterpret_cast(&data); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get_unchecked() const + { + return *reinterpret_cast(&data); + } + +#ifdef HAS_EXCEPTIONS + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get() const + { + if (type_index == detail::direct_type::index) + { + return *reinterpret_cast(&data); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + // get_unchecked() - T stored as recursive_wrapper + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get_unchecked() + { + return (*reinterpret_cast*>(&data)).get(); + } + +#ifdef HAS_EXCEPTIONS + // get() - T stored as recursive_wrapper + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get() + { + if (type_index == detail::direct_type, Types...>::index) + { + return (*reinterpret_cast*>(&data)).get(); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get_unchecked() const + { + return (*reinterpret_cast const*>(&data)).get(); + } + +#ifdef HAS_EXCEPTIONS + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get() const + { + if (type_index == detail::direct_type, Types...>::index) + { + return (*reinterpret_cast const*>(&data)).get(); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + // get_unchecked() - T stored as std::reference_wrapper + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get_unchecked() + { + return (*reinterpret_cast*>(&data)).get(); + } + +#ifdef HAS_EXCEPTIONS + // get() - T stored as std::reference_wrapper + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get() + { + if (type_index == detail::direct_type, Types...>::index) + { + return (*reinterpret_cast*>(&data)).get(); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get_unchecked() const + { + return (*reinterpret_cast const*>(&data)).get(); + } + +#ifdef HAS_EXCEPTIONS + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get() const + { + if (type_index == detail::direct_type, Types...>::index) + { + return (*reinterpret_cast const*>(&data)).get(); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + // This function is deprecated because it returns an internal index field. + // Use which() instead. 
+ MAPBOX_VARIANT_DEPRECATED VARIANT_INLINE std::size_t get_type_index() const + { + return type_index; + } + + VARIANT_INLINE int which() const noexcept + { + return static_cast(sizeof...(Types)-type_index - 1); + } + + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE static constexpr int which() noexcept + { + return static_cast(sizeof...(Types)-detail::direct_type::index - 1); + } + + // visitor + // unary + template ::type> + auto VARIANT_INLINE static visit(V const& v, F&& f) + -> decltype(detail::dispatcher::apply_const(v, std::forward(f))) + { + return detail::dispatcher::apply_const(v, std::forward(f)); + } + // non-const + template ::type> + auto VARIANT_INLINE static visit(V& v, F&& f) + -> decltype(detail::dispatcher::apply(v, std::forward(f))) + { + return detail::dispatcher::apply(v, std::forward(f)); + } + + // binary + // const + template ::type> + auto VARIANT_INLINE static binary_visit(V const& v0, V const& v1, F&& f) + -> decltype(detail::binary_dispatcher::apply_const(v0, v1, std::forward(f))) + { + return detail::binary_dispatcher::apply_const(v0, v1, std::forward(f)); + } + // non-const + template ::type> + auto VARIANT_INLINE static binary_visit(V& v0, V& v1, F&& f) + -> decltype(detail::binary_dispatcher::apply(v0, v1, std::forward(f))) + { + return detail::binary_dispatcher::apply(v0, v1, std::forward(f)); + } + + // match + // unary + template + auto VARIANT_INLINE match(Fs&&... fs) const + -> decltype(variant::visit(*this, ::mapbox::util::make_visitor(std::forward(fs)...))) + { + return variant::visit(*this, ::mapbox::util::make_visitor(std::forward(fs)...)); + } + // non-const + template + auto VARIANT_INLINE match(Fs&&... fs) + -> decltype(variant::visit(*this, ::mapbox::util::make_visitor(std::forward(fs)...))) + { + return variant::visit(*this, ::mapbox::util::make_visitor(std::forward(fs)...)); + } + + ~variant() noexcept // no-throw destructor + { + helper_type::destroy(type_index, &data); + } + + // comparison operators + // equality + VARIANT_INLINE bool operator==(variant const& rhs) const + { + assert(valid() && rhs.valid()); + if (this->which() != rhs.which()) + { + return false; + } + detail::comparer visitor(*this); + return visit(rhs, visitor); + } + + VARIANT_INLINE bool operator!=(variant const& rhs) const + { + return !(*this == rhs); + } + + // less than + VARIANT_INLINE bool operator<(variant const& rhs) const + { + assert(valid() && rhs.valid()); + if (this->which() != rhs.which()) + { + return this->which() < rhs.which(); + } + detail::comparer visitor(*this); + return visit(rhs, visitor); + } + VARIANT_INLINE bool operator>(variant const& rhs) const + { + return rhs < *this; + } + VARIANT_INLINE bool operator<=(variant const& rhs) const + { + return !(*this > rhs); + } + VARIANT_INLINE bool operator>=(variant const& rhs) const + { + return !(*this < rhs); + } +}; + +// unary visitor interface +// const +template +auto VARIANT_INLINE apply_visitor(F&& f, V const& v) -> decltype(V::visit(v, std::forward(f))) +{ + return V::visit(v, std::forward(f)); +} + +// non-const +template +auto VARIANT_INLINE apply_visitor(F&& f, V& v) -> decltype(V::visit(v, std::forward(f))) +{ + return V::visit(v, std::forward(f)); +} + +// binary visitor interface +// const +template +auto VARIANT_INLINE apply_visitor(F&& f, V const& v0, V const& v1) -> decltype(V::binary_visit(v0, v1, std::forward(f))) +{ + return V::binary_visit(v0, v1, std::forward(f)); +} + +// non-const +template +auto VARIANT_INLINE apply_visitor(F&& f, V& v0, V& v1) 
-> decltype(V::binary_visit(v0, v1, std::forward(f))) +{ + return V::binary_visit(v0, v1, std::forward(f)); +} + +// getter interface + +#ifdef HAS_EXCEPTIONS +template +auto get(T& var)->decltype(var.template get()) +{ + return var.template get(); +} +#endif + +template +ResultType& get_unchecked(T& var) +{ + return var.template get_unchecked(); +} + +#ifdef HAS_EXCEPTIONS +template +auto get(T const& var)->decltype(var.template get()) +{ + return var.template get(); +} +#endif + +template +ResultType const& get_unchecked(T const& var) +{ + return var.template get_unchecked(); +} +} // namespace util +} // namespace mapbox + +// hashable iff underlying types are hashable +namespace std { +template +struct hash< ::mapbox::util::variant> { + std::size_t operator()(const ::mapbox::util::variant& v) const noexcept + { + return ::mapbox::util::apply_visitor(::mapbox::util::detail::hasher{}, v); + } +}; +} + +#endif // MAPBOX_UTIL_VARIANT_HPP diff --git a/cpp/src/arrow/vendored/variant/variant_io.hpp b/cpp/src/arrow/vendored/variant/variant_io.hpp new file mode 100644 index 0000000000000..494d2a964e319 --- /dev/null +++ b/cpp/src/arrow/vendored/variant/variant_io.hpp @@ -0,0 +1,47 @@ +// Vendored from https://github.com/mapbox/variant at tag v1.1.5 + +#ifndef MAPBOX_UTIL_VARIANT_IO_HPP +#define MAPBOX_UTIL_VARIANT_IO_HPP + +#include + +#include "variant.hpp" + +namespace mapbox { +namespace util { + +namespace detail { +// operator<< helper +template +class printer +{ +public: + explicit printer(Out& out) + : out_(out) {} + printer& operator=(printer const&) = delete; + + // visitor + template + void operator()(T const& operand) const + { + out_ << operand; + } + +private: + Out& out_; +}; +} + +// operator<< +template +VARIANT_INLINE std::basic_ostream& +operator<<(std::basic_ostream& out, variant const& rhs) +{ + detail::printer> visitor(out); + apply_visitor(visitor, rhs); + return out; +} +} // namespace util +} // namespace mapbox + +#endif // MAPBOX_UTIL_VARIANT_IO_HPP diff --git a/cpp/src/arrow/vendored/variant/variant_visitor.hpp b/cpp/src/arrow/vendored/variant/variant_visitor.hpp new file mode 100644 index 0000000000000..60020f4dd05dc --- /dev/null +++ b/cpp/src/arrow/vendored/variant/variant_visitor.hpp @@ -0,0 +1,40 @@ +// Vendored from https://github.com/mapbox/variant at tag v1.1.5 + +#ifndef MAPBOX_UTIL_VARIANT_VISITOR_HPP +#define MAPBOX_UTIL_VARIANT_VISITOR_HPP + +namespace mapbox { +namespace util { + +template +struct visitor; + +template +struct visitor : Fn +{ + using type = Fn; + using Fn::operator(); + + visitor(Fn fn) : Fn(fn) {} +}; + +template +struct visitor : Fn, visitor +{ + using type = visitor; + using Fn::operator(); + using visitor::operator(); + + visitor(Fn fn, Fns... fns) : Fn(fn), visitor(fns...) {} +}; + +template +visitor make_visitor(Fns... 
fns) +{ + return visitor(fns...); +} + +} // namespace util +} // namespace mapbox + +#endif // MAPBOX_UTIL_VARIANT_VISITOR_HPP diff --git a/cpp/src/arrow/util/xxhash/xxhash.c b/cpp/src/arrow/vendored/xxhash/xxhash.c similarity index 100% rename from cpp/src/arrow/util/xxhash/xxhash.c rename to cpp/src/arrow/vendored/xxhash/xxhash.c diff --git a/cpp/src/arrow/util/xxhash/xxhash.h b/cpp/src/arrow/vendored/xxhash/xxhash.h similarity index 100% rename from cpp/src/arrow/util/xxhash/xxhash.h rename to cpp/src/arrow/vendored/xxhash/xxhash.h diff --git a/cpp/src/gandiva/precompiled/epoch_time_point.h b/cpp/src/gandiva/precompiled/epoch_time_point.h index dc6340d134e0a..115f019525118 100644 --- a/cpp/src/gandiva/precompiled/epoch_time_point.h +++ b/cpp/src/gandiva/precompiled/epoch_time_point.h @@ -19,7 +19,7 @@ #define GANDIVA_EPOCH_TIME_POINT_H // TODO(wesm): IR compilation does not have any include directories set -#include "../../arrow/util/date.h" +#include "../../arrow/vendored/date.h" // A point of time measured in millis since epoch. class EpochTimePoint { diff --git a/cpp/src/gandiva/to_date_holder.cc b/cpp/src/gandiva/to_date_holder.cc index 9c8562280041d..b512934e233a1 100644 --- a/cpp/src/gandiva/to_date_holder.cc +++ b/cpp/src/gandiva/to_date_holder.cc @@ -18,7 +18,7 @@ #include #include -#include "arrow/util/date.h" +#include "arrow/vendored/date.h" #include "gandiva/date_utils.h" #include "gandiva/execution_context.h" diff --git a/cpp/src/plasma/client.cc b/cpp/src/plasma/client.cc index 4215399c0b009..8d153585c3d4e 100644 --- a/cpp/src/plasma/client.cc +++ b/cpp/src/plasma/client.cc @@ -64,7 +64,7 @@ using arrow::cuda::CudaDeviceManager; #define XXH_INLINE_ALL 1 #define XXH_NAMESPACE plasma_client_ -#include "arrow/util/xxhash/xxhash.h" +#include "arrow/vendored/xxhash/xxhash.h" #define XXH64_DEFAULT_SEED 0 diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index f2e3f164fa284..66d62c6257570 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -13,15 +13,7 @@ cpp/src/arrow/io/mman.h cpp/src/arrow/util/random.h cpp/src/arrow/status.cc cpp/src/arrow/status.h -cpp/src/arrow/util/string_view/string_view.hpp -cpp/src/arrow/util/variant.h -cpp/src/arrow/util/variant/optional.h -cpp/src/arrow/util/variant/recursive_wrapper.h -cpp/src/arrow/util/variant/variant_cast.h -cpp/src/arrow/util/variant/variant_io.h -cpp/src/arrow/util/variant/variant_visitor.h -cpp/src/arrow/util/xxhash/xxhash.c -cpp/src/arrow/util/xxhash/xxhash.h +cpp/src/arrow/vendored/* cpp/build-support/asan_symbolize.py cpp/build-support/cpplint.py cpp/build-support/clang_format_exclusions.txt From a236464551df9427af3f5750a1630100d086d178 Mon Sep 17 00:00:00 2001 From: Dustin Long Date: Mon, 17 Dec 2018 10:09:30 -0600 Subject: [PATCH 062/328] ARROW-3674: [Go] Implement Date32 and Date64 array types Implement both Date32 and Date64 types for arrays. Also resolves ARROW-3675. Unit tests follow the same pattern as the existing float64 and Time{32,64} tests. 
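
For reference, a minimal usage sketch of the new Date32 builder and array (Date64 is identical apart from the type names). This is illustrative only and mirrors the unit tests added below; the import paths are assumed to match this repository's Go tree layout.

    package main

    import (
        "fmt"

        "github.com/apache/arrow/go/arrow"
        "github.com/apache/arrow/go/arrow/array"
        "github.com/apache/arrow/go/arrow/memory"
    )

    func main() {
        pool := memory.NewGoAllocator()

        // Build a Date32 array with one null, as the tests below do.
        b := array.NewDate32Builder(pool)
        defer b.Release()

        b.Append(1)
        b.AppendNull()
        // A nil validity slice marks every appended value as valid.
        b.AppendValues([]arrow.Date32{3, 4, 5}, nil)

        arr := b.NewDate32Array() // also resets the builder for reuse
        defer arr.Release()

        fmt.Println(arr)         // [1 (null) 3 4 5]
        fmt.Println(arr.NullN()) // 1

        // Zero-copy view over the last three elements.
        s := array.NewSlice(arr, 2, 5).(*array.Date32)
        defer s.Release()
        fmt.Println(s.Date32Values()) // [3 4 5]
    }

As with the other fixed-width builders, Reserve/Resize manage capacity, and the array returned by NewDate32Array holds its own references on the underlying buffers, so the builder and each array it produces are released independently (as in the tests).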
Author: Dustin Long Closes #3170 from dustmop/date-types and squashes the following commits: 29ae27474 ARROW-3674: Date{32,64} as primitive fixed-width types, not parametric 07a261047 ARROW-3674: Implement Date32 and Date64 array types --- go/arrow/array/array.go | 4 +- go/arrow/array/numeric.gen.go | 90 ++++++++ go/arrow/array/numeric_test.go | 220 ++++++++++++++++++ go/arrow/array/numericbuilder.gen.go | 270 ++++++++++++++++++++++ go/arrow/array/numericbuilder_test.go | 220 ++++++++++++++++++ go/arrow/datatype_fixedwidth.go | 2 + go/arrow/datatype_numeric.gen.go | 16 ++ go/arrow/datatype_numeric.gen.go.tmpldata | 10 + go/arrow/numeric.tmpldata | 20 +- go/arrow/type_traits_numeric.gen.go | 98 ++++++++ 10 files changed, 947 insertions(+), 3 deletions(-) diff --git a/go/arrow/array/array.go b/go/arrow/array/array.go index b188dcd68c729..ef37aef42f602 100644 --- a/go/arrow/array/array.go +++ b/go/arrow/array/array.go @@ -180,8 +180,8 @@ func init() { arrow.STRING: func(data *Data) Interface { return NewStringData(data) }, arrow.BINARY: func(data *Data) Interface { return NewBinaryData(data) }, arrow.FIXED_SIZE_BINARY: func(data *Data) Interface { return NewFixedSizeBinaryData(data) }, - arrow.DATE32: unsupportedArrayType, - arrow.DATE64: unsupportedArrayType, + arrow.DATE32: func(data *Data) Interface { return NewDate32Data(data) }, + arrow.DATE64: func(data *Data) Interface { return NewDate64Data(data) }, arrow.TIMESTAMP: func(data *Data) Interface { return NewTimestampData(data) }, arrow.TIME32: func(data *Data) Interface { return NewTime32Data(data) }, arrow.TIME64: func(data *Data) Interface { return NewTime64Data(data) }, diff --git a/go/arrow/array/numeric.gen.go b/go/arrow/array/numeric.gen.go index 1f734c05127b4..1fb8257d940c4 100644 --- a/go/arrow/array/numeric.gen.go +++ b/go/arrow/array/numeric.gen.go @@ -609,3 +609,93 @@ func (a *Time64) setData(data *Data) { a.values = a.values[beg:end] } } + +// A type which represents an immutable sequence of arrow.Date32 values. +type Date32 struct { + array + values []arrow.Date32 +} + +func NewDate32Data(data *Data) *Date32 { + a := &Date32{} + a.refCount = 1 + a.setData(data) + return a +} + +func (a *Date32) Value(i int) arrow.Date32 { return a.values[i] } +func (a *Date32) Date32Values() []arrow.Date32 { return a.values } + +func (a *Date32) String() string { + o := new(strings.Builder) + o.WriteString("[") + for i, v := range a.values { + if i > 0 { + fmt.Fprintf(o, " ") + } + switch { + case a.IsNull(i): + o.WriteString("(null)") + default: + fmt.Fprintf(o, "%v", v) + } + } + o.WriteString("]") + return o.String() +} + +func (a *Date32) setData(data *Data) { + a.array.setData(data) + vals := data.buffers[1] + if vals != nil { + a.values = arrow.Date32Traits.CastFromBytes(vals.Bytes()) + beg := a.array.data.offset + end := beg + a.array.data.length + a.values = a.values[beg:end] + } +} + +// A type which represents an immutable sequence of arrow.Date64 values. 
+type Date64 struct { + array + values []arrow.Date64 +} + +func NewDate64Data(data *Data) *Date64 { + a := &Date64{} + a.refCount = 1 + a.setData(data) + return a +} + +func (a *Date64) Value(i int) arrow.Date64 { return a.values[i] } +func (a *Date64) Date64Values() []arrow.Date64 { return a.values } + +func (a *Date64) String() string { + o := new(strings.Builder) + o.WriteString("[") + for i, v := range a.values { + if i > 0 { + fmt.Fprintf(o, " ") + } + switch { + case a.IsNull(i): + o.WriteString("(null)") + default: + fmt.Fprintf(o, "%v", v) + } + } + o.WriteString("]") + return o.String() +} + +func (a *Date64) setData(data *Data) { + a.array.setData(data) + vals := data.buffers[1] + if vals != nil { + a.values = arrow.Date64Traits.CastFromBytes(vals.Bytes()) + beg := a.array.data.offset + end := beg + a.array.data.length + a.values = a.values[beg:end] + } +} diff --git a/go/arrow/array/numeric_test.go b/go/arrow/array/numeric_test.go index 9e8267a70de6c..fc7f04addbe0d 100644 --- a/go/arrow/array/numeric_test.go +++ b/go/arrow/array/numeric_test.go @@ -394,3 +394,223 @@ func TestTime64SliceDataWithNull(t *testing.T) { t.Fatalf("got=%v, want=%v", got, want) } } + +func TestNewDate32Data(t *testing.T) { + exp := []arrow.Date32{1, 2, 4, 8, 16} + + dtype := &arrow.Date32Type{} + ad := array.NewData( + dtype, len(exp), + []*memory.Buffer{nil, memory.NewBufferBytes(arrow.Date32Traits.CastToBytes(exp))}, + nil, 0, 0, + ) + fa := array.NewDate32Data(ad) + + assert.Equal(t, len(exp), fa.Len(), "unexpected Len()") + assert.Equal(t, exp, fa.Date32Values(), "unexpected Date32Values()") +} + +func TestDate32SliceData(t *testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + const ( + beg = 2 + end = 4 + ) + + var ( + vs = []arrow.Date32{1, 2, 3, 4, 5} + sub = vs[beg:end] + ) + + b := array.NewDate32Builder(pool) + defer b.Release() + + for _, v := range vs { + b.Append(v) + } + + arr := b.NewArray().(*array.Date32) + defer arr.Release() + + if got, want := arr.Len(), len(vs); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.Date32Values(), vs; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + + slice := array.NewSlice(arr, beg, end).(*array.Date32) + defer slice.Release() + + if got, want := slice.Len(), len(sub); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := slice.Date32Values(), sub; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } +} + +func TestDate32SliceDataWithNull(t *testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + const ( + beg = 2 + end = 5 + ) + + var ( + valids = []bool{true, true, true, false, true, true} + vs = []arrow.Date32{1, 2, 3, 0, 4, 5} + sub = vs[beg:end] + ) + + b := array.NewDate32Builder(pool) + defer b.Release() + + b.AppendValues(vs, valids) + + arr := b.NewArray().(*array.Date32) + defer arr.Release() + + if got, want := arr.Len(), len(valids); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.NullN(), 1; got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.Date32Values(), vs; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + + slice := array.NewSlice(arr, beg, end).(*array.Date32) + defer slice.Release() + + if got, want := slice.NullN(), 1; got != want { + t.Errorf("got=%d, want=%d", got, want) + } + + if got, want := slice.Len(), len(sub); 
got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := slice.Date32Values(), sub; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } +} + +func TestNewDate64Data(t *testing.T) { + exp := []arrow.Date64{1, 2, 4, 8, 16} + + dtype := &arrow.Date64Type{} + ad := array.NewData( + dtype, len(exp), + []*memory.Buffer{nil, memory.NewBufferBytes(arrow.Date64Traits.CastToBytes(exp))}, + nil, 0, 0, + ) + fa := array.NewDate64Data(ad) + + assert.Equal(t, len(exp), fa.Len(), "unexpected Len()") + assert.Equal(t, exp, fa.Date64Values(), "unexpected Date64Values()") +} + +func TestDate64SliceData(t *testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + const ( + beg = 2 + end = 4 + ) + + var ( + vs = []arrow.Date64{1, 2, 3, 4, 5} + sub = vs[beg:end] + ) + + b := array.NewDate64Builder(pool) + defer b.Release() + + for _, v := range vs { + b.Append(v) + } + + arr := b.NewArray().(*array.Date64) + defer arr.Release() + + if got, want := arr.Len(), len(vs); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.Date64Values(), vs; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + + slice := array.NewSlice(arr, beg, end).(*array.Date64) + defer slice.Release() + + if got, want := slice.Len(), len(sub); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := slice.Date64Values(), sub; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } +} + +func TestDate64SliceDataWithNull(t *testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + const ( + beg = 2 + end = 5 + ) + + var ( + valids = []bool{true, true, true, false, true, true} + vs = []arrow.Date64{1, 2, 3, 0, 4, 5} + sub = vs[beg:end] + ) + + b := array.NewDate64Builder(pool) + defer b.Release() + + b.AppendValues(vs, valids) + + arr := b.NewArray().(*array.Date64) + defer arr.Release() + + if got, want := arr.Len(), len(valids); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.NullN(), 1; got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.Date64Values(), vs; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + + slice := array.NewSlice(arr, beg, end).(*array.Date64) + defer slice.Release() + + if got, want := slice.NullN(), 1; got != want { + t.Errorf("got=%d, want=%d", got, want) + } + + if got, want := slice.Len(), len(sub); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := slice.Date64Values(), sub; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } +} diff --git a/go/arrow/array/numericbuilder.gen.go b/go/arrow/array/numericbuilder.gen.go index 3a7dc167f15aa..946c5ba74aaeb 100644 --- a/go/arrow/array/numericbuilder.gen.go +++ b/go/arrow/array/numericbuilder.gen.go @@ -1772,6 +1772,274 @@ func (b *Time64Builder) newData() (data *Data) { return } +type Date32Builder struct { + builder + + data *memory.Buffer + rawData []arrow.Date32 +} + +func NewDate32Builder(mem memory.Allocator) *Date32Builder { + return &Date32Builder{builder: builder{refCount: 1, mem: mem}} +} + +// Release decreases the reference count by 1. +// When the reference count goes to zero, the memory is freed. 
+func (b *Date32Builder) Release() { + debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases") + + if atomic.AddInt64(&b.refCount, -1) == 0 { + if b.nullBitmap != nil { + b.nullBitmap.Release() + b.nullBitmap = nil + } + if b.data != nil { + b.data.Release() + b.data = nil + b.rawData = nil + } + } +} + +func (b *Date32Builder) Append(v arrow.Date32) { + b.Reserve(1) + b.UnsafeAppend(v) +} + +func (b *Date32Builder) AppendNull() { + b.Reserve(1) + b.UnsafeAppendBoolToBitmap(false) +} + +func (b *Date32Builder) UnsafeAppend(v arrow.Date32) { + bitutil.SetBit(b.nullBitmap.Bytes(), b.length) + b.rawData[b.length] = v + b.length++ +} + +func (b *Date32Builder) UnsafeAppendBoolToBitmap(isValid bool) { + if isValid { + bitutil.SetBit(b.nullBitmap.Bytes(), b.length) + } else { + b.nulls++ + } + b.length++ +} + +// AppendValues will append the values in the v slice. The valid slice determines which values +// in v are valid (not null). The valid slice must either be empty or be equal in length to v. If empty, +// all values in v are appended and considered valid. +func (b *Date32Builder) AppendValues(v []arrow.Date32, valid []bool) { + if len(v) != len(valid) && len(valid) != 0 { + panic("len(v) != len(valid) && len(valid) != 0") + } + + b.Reserve(len(v)) + if len(v) > 0 { + arrow.Date32Traits.Copy(b.rawData[b.length:], v) + } + b.builder.unsafeAppendBoolsToBitmap(valid, len(v)) +} + +func (b *Date32Builder) init(capacity int) { + b.builder.init(capacity) + + b.data = memory.NewResizableBuffer(b.mem) + bytesN := arrow.Date32Traits.BytesRequired(capacity) + b.data.Resize(bytesN) + b.rawData = arrow.Date32Traits.CastFromBytes(b.data.Bytes()) +} + +// Reserve ensures there is enough space for appending n elements +// by checking the capacity and calling Resize if necessary. +func (b *Date32Builder) Reserve(n int) { + b.builder.reserve(n, b.Resize) +} + +// Resize adjusts the space allocated by b to n elements. If n is greater than b.Cap(), +// additional memory will be allocated. If n is smaller, the allocated memory may reduced. +func (b *Date32Builder) Resize(n int) { + nBuilder := n + if n < minBuilderCapacity { + n = minBuilderCapacity + } + + if b.capacity == 0 { + b.init(n) + } else { + b.builder.resize(nBuilder, b.init) + b.data.Resize(arrow.Date32Traits.BytesRequired(n)) + b.rawData = arrow.Date32Traits.CastFromBytes(b.data.Bytes()) + } +} + +// NewArray creates a Date32 array from the memory buffers used by the builder and resets the Date32Builder +// so it can be used to build a new array. +func (b *Date32Builder) NewArray() Interface { + return b.NewDate32Array() +} + +// NewDate32Array creates a Date32 array from the memory buffers used by the builder and resets the Date32Builder +// so it can be used to build a new array. 
+func (b *Date32Builder) NewDate32Array() (a *Date32) { + data := b.newData() + a = NewDate32Data(data) + data.Release() + return +} + +func (b *Date32Builder) newData() (data *Data) { + bytesRequired := arrow.Date32Traits.BytesRequired(b.length) + if bytesRequired > 0 && bytesRequired < b.data.Len() { + // trim buffers + b.data.Resize(bytesRequired) + } + data = NewData(arrow.PrimitiveTypes.Date32, b.length, []*memory.Buffer{b.nullBitmap, b.data}, nil, b.nulls, 0) + b.reset() + + if b.data != nil { + b.data.Release() + b.data = nil + b.rawData = nil + } + + return +} + +type Date64Builder struct { + builder + + data *memory.Buffer + rawData []arrow.Date64 +} + +func NewDate64Builder(mem memory.Allocator) *Date64Builder { + return &Date64Builder{builder: builder{refCount: 1, mem: mem}} +} + +// Release decreases the reference count by 1. +// When the reference count goes to zero, the memory is freed. +func (b *Date64Builder) Release() { + debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases") + + if atomic.AddInt64(&b.refCount, -1) == 0 { + if b.nullBitmap != nil { + b.nullBitmap.Release() + b.nullBitmap = nil + } + if b.data != nil { + b.data.Release() + b.data = nil + b.rawData = nil + } + } +} + +func (b *Date64Builder) Append(v arrow.Date64) { + b.Reserve(1) + b.UnsafeAppend(v) +} + +func (b *Date64Builder) AppendNull() { + b.Reserve(1) + b.UnsafeAppendBoolToBitmap(false) +} + +func (b *Date64Builder) UnsafeAppend(v arrow.Date64) { + bitutil.SetBit(b.nullBitmap.Bytes(), b.length) + b.rawData[b.length] = v + b.length++ +} + +func (b *Date64Builder) UnsafeAppendBoolToBitmap(isValid bool) { + if isValid { + bitutil.SetBit(b.nullBitmap.Bytes(), b.length) + } else { + b.nulls++ + } + b.length++ +} + +// AppendValues will append the values in the v slice. The valid slice determines which values +// in v are valid (not null). The valid slice must either be empty or be equal in length to v. If empty, +// all values in v are appended and considered valid. +func (b *Date64Builder) AppendValues(v []arrow.Date64, valid []bool) { + if len(v) != len(valid) && len(valid) != 0 { + panic("len(v) != len(valid) && len(valid) != 0") + } + + b.Reserve(len(v)) + if len(v) > 0 { + arrow.Date64Traits.Copy(b.rawData[b.length:], v) + } + b.builder.unsafeAppendBoolsToBitmap(valid, len(v)) +} + +func (b *Date64Builder) init(capacity int) { + b.builder.init(capacity) + + b.data = memory.NewResizableBuffer(b.mem) + bytesN := arrow.Date64Traits.BytesRequired(capacity) + b.data.Resize(bytesN) + b.rawData = arrow.Date64Traits.CastFromBytes(b.data.Bytes()) +} + +// Reserve ensures there is enough space for appending n elements +// by checking the capacity and calling Resize if necessary. +func (b *Date64Builder) Reserve(n int) { + b.builder.reserve(n, b.Resize) +} + +// Resize adjusts the space allocated by b to n elements. If n is greater than b.Cap(), +// additional memory will be allocated. If n is smaller, the allocated memory may reduced. +func (b *Date64Builder) Resize(n int) { + nBuilder := n + if n < minBuilderCapacity { + n = minBuilderCapacity + } + + if b.capacity == 0 { + b.init(n) + } else { + b.builder.resize(nBuilder, b.init) + b.data.Resize(arrow.Date64Traits.BytesRequired(n)) + b.rawData = arrow.Date64Traits.CastFromBytes(b.data.Bytes()) + } +} + +// NewArray creates a Date64 array from the memory buffers used by the builder and resets the Date64Builder +// so it can be used to build a new array. 
+func (b *Date64Builder) NewArray() Interface { + return b.NewDate64Array() +} + +// NewDate64Array creates a Date64 array from the memory buffers used by the builder and resets the Date64Builder +// so it can be used to build a new array. +func (b *Date64Builder) NewDate64Array() (a *Date64) { + data := b.newData() + a = NewDate64Data(data) + data.Release() + return +} + +func (b *Date64Builder) newData() (data *Data) { + bytesRequired := arrow.Date64Traits.BytesRequired(b.length) + if bytesRequired > 0 && bytesRequired < b.data.Len() { + // trim buffers + b.data.Resize(bytesRequired) + } + data = NewData(arrow.PrimitiveTypes.Date64, b.length, []*memory.Buffer{b.nullBitmap, b.data}, nil, b.nulls, 0) + b.reset() + + if b.data != nil { + b.data.Release() + b.data = nil + b.rawData = nil + } + + return +} + var ( _ Builder = (*Int64Builder)(nil) _ Builder = (*Uint64Builder)(nil) @@ -1786,4 +2054,6 @@ var ( _ Builder = (*TimestampBuilder)(nil) _ Builder = (*Time32Builder)(nil) _ Builder = (*Time64Builder)(nil) + _ Builder = (*Date32Builder)(nil) + _ Builder = (*Date64Builder)(nil) ) diff --git a/go/arrow/array/numericbuilder_test.go b/go/arrow/array/numericbuilder_test.go index 65f3c86c2ea35..3bb49a3af7310 100644 --- a/go/arrow/array/numericbuilder_test.go +++ b/go/arrow/array/numericbuilder_test.go @@ -362,3 +362,223 @@ func TestTime64Builder_Resize(t *testing.T) { ab.Release() } + +func TestNewDate32Builder(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate32Builder(mem) + + ab.Append(1) + ab.Append(2) + ab.Append(3) + ab.AppendNull() + ab.Append(5) + ab.Append(6) + ab.AppendNull() + ab.Append(8) + ab.Append(9) + ab.Append(10) + + // check state of builder before NewDate32Array + assert.Equal(t, 10, ab.Len(), "unexpected Len()") + assert.Equal(t, 2, ab.NullN(), "unexpected NullN()") + + a := ab.NewDate32Array() + + // check state of builder after NewDate32Array + assert.Zero(t, ab.Len(), "unexpected ArrayBuilder.Len(), NewDate32Array did not reset state") + assert.Zero(t, ab.Cap(), "unexpected ArrayBuilder.Cap(), NewDate32Array did not reset state") + assert.Zero(t, ab.NullN(), "unexpected ArrayBuilder.NullN(), NewDate32Array did not reset state") + + // check state of array + assert.Equal(t, 2, a.NullN(), "unexpected null count") + assert.Equal(t, []arrow.Date32{1, 2, 3, 0, 5, 6, 0, 8, 9, 10}, a.Date32Values(), "unexpected Date32Values") + assert.Equal(t, []byte{0xb7}, a.NullBitmapBytes()[:1]) // 4 bytes due to minBuilderCapacity + assert.Len(t, a.Date32Values(), 10, "unexpected length of Date32Values") + + a.Release() + + ab.Append(7) + ab.Append(8) + + a = ab.NewDate32Array() + + assert.Equal(t, 0, a.NullN()) + assert.Equal(t, []arrow.Date32{7, 8}, a.Date32Values()) + assert.Len(t, a.Date32Values(), 2) + + a.Release() +} + +func TestDate32Builder_AppendValues(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate32Builder(mem) + + exp := []arrow.Date32{1, 2, 3, 4} + ab.AppendValues(exp, nil) + a := ab.NewDate32Array() + assert.Equal(t, exp, a.Date32Values()) + + a.Release() + ab.Release() +} + +func TestDate32Builder_Empty(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate32Builder(mem) + + exp := []arrow.Date32{1, 2, 3, 4} + ab.AppendValues(exp, nil) + a := ab.NewDate32Array() + assert.Equal(t, exp, a.Date32Values()) + a.Release() + + a = ab.NewDate32Array() 
+ assert.Zero(t, a.Len()) + a.Release() + + ab.Release() +} + +func TestDate32Builder_Resize(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate32Builder(mem) + + assert.Equal(t, 0, ab.Cap()) + assert.Equal(t, 0, ab.Len()) + + ab.Reserve(63) + assert.Equal(t, 64, ab.Cap()) + assert.Equal(t, 0, ab.Len()) + + for i := 0; i < 63; i++ { + ab.Append(0) + } + assert.Equal(t, 64, ab.Cap()) + assert.Equal(t, 63, ab.Len()) + + ab.Resize(5) + assert.Equal(t, 5, ab.Len()) + + ab.Resize(32) + assert.Equal(t, 5, ab.Len()) + + ab.Release() +} + +func TestNewDate64Builder(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate64Builder(mem) + + ab.Append(1) + ab.Append(2) + ab.Append(3) + ab.AppendNull() + ab.Append(5) + ab.Append(6) + ab.AppendNull() + ab.Append(8) + ab.Append(9) + ab.Append(10) + + // check state of builder before NewDate64Array + assert.Equal(t, 10, ab.Len(), "unexpected Len()") + assert.Equal(t, 2, ab.NullN(), "unexpected NullN()") + + a := ab.NewDate64Array() + + // check state of builder after NewDate64Array + assert.Zero(t, ab.Len(), "unexpected ArrayBuilder.Len(), NewDate64Array did not reset state") + assert.Zero(t, ab.Cap(), "unexpected ArrayBuilder.Cap(), NewDate64Array did not reset state") + assert.Zero(t, ab.NullN(), "unexpected ArrayBuilder.NullN(), NewDate64Array did not reset state") + + // check state of array + assert.Equal(t, 2, a.NullN(), "unexpected null count") + assert.Equal(t, []arrow.Date64{1, 2, 3, 0, 5, 6, 0, 8, 9, 10}, a.Date64Values(), "unexpected Date64Values") + assert.Equal(t, []byte{0xb7}, a.NullBitmapBytes()[:1]) // 4 bytes due to minBuilderCapacity + assert.Len(t, a.Date64Values(), 10, "unexpected length of Date64Values") + + a.Release() + + ab.Append(7) + ab.Append(8) + + a = ab.NewDate64Array() + + assert.Equal(t, 0, a.NullN()) + assert.Equal(t, []arrow.Date64{7, 8}, a.Date64Values()) + assert.Len(t, a.Date64Values(), 2) + + a.Release() +} + +func TestDate64Builder_AppendValues(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate64Builder(mem) + + exp := []arrow.Date64{1, 2, 3, 4} + ab.AppendValues(exp, nil) + a := ab.NewDate64Array() + assert.Equal(t, exp, a.Date64Values()) + + a.Release() + ab.Release() +} + +func TestDate64Builder_Empty(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate64Builder(mem) + + exp := []arrow.Date64{1, 2, 3, 4} + ab.AppendValues(exp, nil) + a := ab.NewDate64Array() + assert.Equal(t, exp, a.Date64Values()) + a.Release() + + a = ab.NewDate64Array() + assert.Zero(t, a.Len()) + a.Release() + + ab.Release() +} + +func TestDate64Builder_Resize(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate64Builder(mem) + + assert.Equal(t, 0, ab.Cap()) + assert.Equal(t, 0, ab.Len()) + + ab.Reserve(63) + assert.Equal(t, 64, ab.Cap()) + assert.Equal(t, 0, ab.Len()) + + for i := 0; i < 63; i++ { + ab.Append(0) + } + assert.Equal(t, 64, ab.Cap()) + assert.Equal(t, 63, ab.Len()) + + ab.Resize(5) + assert.Equal(t, 5, ab.Len()) + + ab.Resize(32) + assert.Equal(t, 5, ab.Len()) + + ab.Release() +} diff --git a/go/arrow/datatype_fixedwidth.go b/go/arrow/datatype_fixedwidth.go index 60cc98a4b97d9..444495058a591 100644 --- a/go/arrow/datatype_fixedwidth.go +++ 
b/go/arrow/datatype_fixedwidth.go @@ -37,6 +37,8 @@ type ( Time32 int32 Time64 int64 TimeUnit int + Date32 int32 + Date64 int64 ) const ( diff --git a/go/arrow/datatype_numeric.gen.go b/go/arrow/datatype_numeric.gen.go index 2ec4c4098a4a6..9b5dc835b1ea2 100644 --- a/go/arrow/datatype_numeric.gen.go +++ b/go/arrow/datatype_numeric.gen.go @@ -78,6 +78,18 @@ func (t *Float64Type) ID() Type { return FLOAT64 } func (t *Float64Type) Name() string { return "float64" } func (t *Float64Type) BitWidth() int { return 64 } +type Date32Type struct{} + +func (t *Date32Type) ID() Type { return DATE32 } +func (t *Date32Type) Name() string { return "date32" } +func (t *Date32Type) BitWidth() int { return 32 } + +type Date64Type struct{} + +func (t *Date64Type) ID() Type { return DATE64 } +func (t *Date64Type) Name() string { return "date64" } +func (t *Date64Type) BitWidth() int { return 64 } + var ( PrimitiveTypes = struct { Int8 DataType @@ -90,6 +102,8 @@ var ( Uint64 DataType Float32 DataType Float64 DataType + Date32 DataType + Date64 DataType }{ Int8: &Int8Type{}, @@ -102,5 +116,7 @@ var ( Uint64: &Uint64Type{}, Float32: &Float32Type{}, Float64: &Float64Type{}, + Date32: &Date32Type{}, + Date64: &Date64Type{}, } ) diff --git a/go/arrow/datatype_numeric.gen.go.tmpldata b/go/arrow/datatype_numeric.gen.go.tmpldata index 415b51b2e16bd..9badc6ee2b211 100644 --- a/go/arrow/datatype_numeric.gen.go.tmpldata +++ b/go/arrow/datatype_numeric.gen.go.tmpldata @@ -48,5 +48,15 @@ "Name": "Float64", "Type": "float64", "Size": 64 + }, + { + "Name": "Date32", + "Type": "date32", + "Size": 32 + }, + { + "Name": "Date64", + "Type": "date64", + "Size": 64 } ] diff --git a/go/arrow/numeric.tmpldata b/go/arrow/numeric.tmpldata index b9e976eea0534..45452ab4468c6 100644 --- a/go/arrow/numeric.tmpldata +++ b/go/arrow/numeric.tmpldata @@ -107,5 +107,23 @@ "Opt": { "Parametric": true } + }, + { + "Name": "Date32", + "name": "date32", + "Type": "Date32", + "QualifiedType": "arrow.Date32", + "InternalType": "int32", + "Default": "0", + "Size": "4" + }, + { + "Name": "Date64", + "name": "date64", + "Type": "Date64", + "QualifiedType": "arrow.Date64", + "InternalType": "int64", + "Default": "0", + "Size": "8" } -] \ No newline at end of file +] diff --git a/go/arrow/type_traits_numeric.gen.go b/go/arrow/type_traits_numeric.gen.go index 59ed13f541a53..14fafbc57659b 100644 --- a/go/arrow/type_traits_numeric.gen.go +++ b/go/arrow/type_traits_numeric.gen.go @@ -38,6 +38,8 @@ var ( TimestampTraits timestampTraits Time32Traits time32Traits Time64Traits time64Traits + Date32Traits date32Traits + Date64Traits date64Traits ) // Int64 traits @@ -663,3 +665,99 @@ func (time64Traits) CastToBytes(b []Time64) []byte { // Copy copies src to dst. func (time64Traits) Copy(dst, src []Time64) { copy(dst, src) } + +// Date32 traits + +const ( + // Date32SizeBytes specifies the number of bytes required to store a single Date32 in memory + Date32SizeBytes = int(unsafe.Sizeof(Date32(0))) +) + +type date32Traits struct{} + +// BytesRequired returns the number of bytes required to store n elements in memory. +func (date32Traits) BytesRequired(n int) int { return Date32SizeBytes * n } + +// PutValue +func (date32Traits) PutValue(b []byte, v Date32) { + binary.LittleEndian.PutUint32(b, uint32(v)) +} + +// CastFromBytes reinterprets the slice b to a slice of type Date32. +// +// NOTE: len(b) must be a multiple of Date32SizeBytes. 
+func (date32Traits) CastFromBytes(b []byte) []Date32 { + h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + + var res []Date32 + s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) + s.Data = h.Data + s.Len = h.Len / Date32SizeBytes + s.Cap = h.Cap / Date32SizeBytes + + return res +} + +// CastToBytes reinterprets the slice b to a slice of bytes. +func (date32Traits) CastToBytes(b []Date32) []byte { + h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + + var res []byte + s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) + s.Data = h.Data + s.Len = h.Len * Date32SizeBytes + s.Cap = h.Cap * Date32SizeBytes + + return res +} + +// Copy copies src to dst. +func (date32Traits) Copy(dst, src []Date32) { copy(dst, src) } + +// Date64 traits + +const ( + // Date64SizeBytes specifies the number of bytes required to store a single Date64 in memory + Date64SizeBytes = int(unsafe.Sizeof(Date64(0))) +) + +type date64Traits struct{} + +// BytesRequired returns the number of bytes required to store n elements in memory. +func (date64Traits) BytesRequired(n int) int { return Date64SizeBytes * n } + +// PutValue +func (date64Traits) PutValue(b []byte, v Date64) { + binary.LittleEndian.PutUint64(b, uint64(v)) +} + +// CastFromBytes reinterprets the slice b to a slice of type Date64. +// +// NOTE: len(b) must be a multiple of Date64SizeBytes. +func (date64Traits) CastFromBytes(b []byte) []Date64 { + h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + + var res []Date64 + s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) + s.Data = h.Data + s.Len = h.Len / Date64SizeBytes + s.Cap = h.Cap / Date64SizeBytes + + return res +} + +// CastToBytes reinterprets the slice b to a slice of bytes. +func (date64Traits) CastToBytes(b []Date64) []byte { + h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + + var res []byte + s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) + s.Data = h.Data + s.Len = h.Len * Date64SizeBytes + s.Cap = h.Cap * Date64SizeBytes + + return res +} + +// Copy copies src to dst. +func (date64Traits) Copy(dst, src []Date64) { copy(dst, src) } From c7cb1cee388bbfa890ce724f6a7a95991bd8eb1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 17 Dec 2018 10:48:12 -0600 Subject: [PATCH 063/328] ARROW-3368: [Integration/CI/Python] Add dask integration test to docker-compose setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port dask integration testing, and refactored the hdfs one. Multiple python hdfs tests cases are failing, nut sure why. 
Author: Krisztián Szűcs Closes #3086 from kszucs/ARROW-3368 and squashes the following commits: d6e98ecb8 resolve load warning of native-hadoop library 237440ab0 cleanup 0b3e1fc4c port fastparquet hdfs test c497be0d6 better error msg cf06721ef minimal hdfs config d1e9b3717 arrow- test executable prefix 5c11d72ed add hdfs config files f7681d045 download hadoop from apache mirrors c84294a56 update comment 1eef5bfa8 remove dask_integration.sh 00ef67691 two datanodes; support env vars in conftest c2bc444c2 remove outdated files 840f313b3 test optional modules 0e8b3932b add dask-integration to nightlies c3d4a9bef remove comments from docker-compose ffe0ac7ea set hadoop version to 2.6.0 1c7bf304d unset LD_LIBRARY_PATH f1af0248d run dask tests 694e7e6ef docker-compose setup for dask integration --- ci/docker_build_python.sh | 7 +- dev/dask_integration.sh | 21 -- dev/dask_integration/Dockerfile | 22 -- dev/dask_integration/dask_integration.sh | 98 ------ dev/tasks/tests.yml | 22 +- docker-compose.yml | 44 ++- .../dask/Dockerfile | 20 +- integration/dask/runtest.sh | 34 ++ integration/hdfs/Dockerfile | 78 ++-- integration/hdfs/hdfs-site.xml | 44 +++ integration/hdfs/libhdfs3.xml | 332 ------------------ integration/hdfs/runtest.sh | 12 +- python/Dockerfile | 3 +- python/pyarrow/tests/conftest.py | 41 ++- python/pyarrow/tests/test_hdfs.py | 35 +- python/testing/README.md | 42 --- .../dask_tests/test_dask_integration.py | 58 --- python/testing/functions.sh | 75 ---- python/testing/parquet_interop.py | 51 --- python/testing/set_env_common.sh | 70 ---- python/testing/setup_toolchain.sh | 64 ---- 21 files changed, 246 insertions(+), 927 deletions(-) delete mode 100755 dev/dask_integration.sh delete mode 100644 dev/dask_integration/Dockerfile delete mode 100755 dev/dask_integration/dask_integration.sh rename python/testing/test_hdfs.sh => integration/dask/Dockerfile (68%) mode change 100755 => 100644 create mode 100755 integration/dask/runtest.sh create mode 100644 integration/hdfs/hdfs-site.xml delete mode 100644 integration/hdfs/libhdfs3.xml delete mode 100644 python/testing/README.md delete mode 100644 python/testing/dask_tests/test_dask_integration.py delete mode 100644 python/testing/functions.sh delete mode 100644 python/testing/parquet_interop.py delete mode 100644 python/testing/set_env_common.sh delete mode 100644 python/testing/setup_toolchain.sh diff --git a/ci/docker_build_python.sh b/ci/docker_build_python.sh index 8ba8a1d66f1be..23d852bcb8713 100755 --- a/ci/docker_build_python.sh +++ b/ci/docker_build_python.sh @@ -26,12 +26,17 @@ export CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" export PYARROW_CXXFLAGS=$CXXFLAGS export PYARROW_CMAKE_GENERATOR=Ninja export PYARROW_BUILD_TYPE=${PYARROW_BUILD_TYPE:-debug} + +# Feature flags +export PYARROW_WITH_ORC=${PYARROW_WITH_ORC:-1} export PYARROW_WITH_PARQUET=${PYARROW_WITH_PARQUET:-1} export PYARROW_WITH_PLASMA=${PYARROW_WITH_PLASMA:-1} # Build pyarrow pushd ${source_dir} -python setup.py build_ext --build-temp=${build_dir} install +python setup.py build --build-temp=${build_dir} \ + install --single-version-externally-managed \ + --record=/build/python/record.txt popd diff --git a/dev/dask_integration.sh b/dev/dask_integration.sh deleted file mode 100755 index d344328b6af1e..0000000000000 --- a/dev/dask_integration.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Pass the service name to run_docker_compose.sh -# Which validates environment and runs the service -exec "$(dirname ${BASH_SOURCE})"/run_docker_compose.sh dask_integration diff --git a/dev/dask_integration/Dockerfile b/dev/dask_integration/Dockerfile deleted file mode 100644 index f0c1f03f6f93c..0000000000000 --- a/dev/dask_integration/Dockerfile +++ /dev/null @@ -1,22 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -FROM arrow_integration_xenial_base - -ADD . /apache-arrow -WORKDIR /apache-arrow - -CMD arrow/dev/dask_integration/dask_integration.sh diff --git a/dev/dask_integration/dask_integration.sh b/dev/dask_integration/dask_integration.sh deleted file mode 100755 index f4999c0ae447f..0000000000000 --- a/dev/dask_integration/dask_integration.sh +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Set up environment and working directory -cd /apache-arrow - -conda activate pyarrow-dev - -# install pytables from defaults for now -conda install -y pytables - -pip install -q git+https://github.com/dask/partd --upgrade --no-deps -pip install -q git+https://github.com/dask/zict --upgrade --no-deps -pip install -q git+https://github.com/dask/distributed --upgrade --no-deps -pip install -q git+https://github.com/mrocklin/sparse --upgrade --no-deps -pip install -q git+https://github.com/dask/s3fs --upgrade --no-deps - -conda install -y -q -c conda-forge numba cython \ - bcolz \ - blosc \ - bokeh \ - boto3 \ - chest \ - cloudpickle \ - coverage \ - cytoolz \ - distributed \ - graphviz \ - h5py \ - partd \ - psutil \ - "pytest<=3.1.1" \ - scikit-image \ - scikit-learn \ - sqlalchemy \ - toolz - -pip install -q git+https://github.com/dask/fastparquet - -pip install -q \ - cachey \ - graphviz \ - moto \ - pyarrow \ - --upgrade --no-deps - -pip install -q \ - cityhash \ - flake8 \ - mmh3 \ - pandas_datareader \ - pytest-xdist \ - xxhash \ - pycodestyle - -export ARROW_BUILD_TYPE=release -export ARROW_HOME=$(pwd)/dist -export PARQUET_HOME=$(pwd)/dist -CONDA_BASE=/home/ubuntu/miniconda -export LD_LIBRARY_PATH=$(pwd)/dist/lib:${CONDA_BASE}/lib:${LD_LIBRARY_PATH} - -# Allow for --user Python installation inside Docker -export HOME=$(pwd) - -# Clean up and get the dask master branch from github -rm -rf dask .local -export GIT_COMMITTER_NAME="Nobody" -export GIT_COMMITTER_EMAIL="nobody@nowhere.com" -git clone https://github.com/dask/dask.git -pushd dask -pip install --user -e .[complete] -# Verify integrity of the installed dask dataframe code -py.test dask/dataframe/tests/test_dataframe.py -popd - -# Run the integration test -pushd arrow/python/testing -py.test dask_tests -popd - -pushd dask/dask/dataframe/io -py.test tests/test_parquet.py -popd diff --git a/dev/tasks/tests.yml b/dev/tasks/tests.yml index d9493b606e5a0..a0c7676ba7312 100644 --- a/dev/tasks/tests.yml +++ b/dev/tasks/tests.yml @@ -36,12 +36,14 @@ groups: - docker-lint - docker-iwyu - docker-clang-format - - docker-hdfs-integration - docker-pandas-master + - docker-hdfs-integration + - docker-dask-integration integration: - - docker-hdfs-integration - docker-pandas-master + - docker-dask-integration + - docker-hdfs-integration cpp-python: - docker-cpp @@ -239,11 +241,27 @@ tasks: ############################## Integration tests ############################ + docker-dask-integration: + platform: linux + template: docker-tests/travis.linux.yml + params: + environment: + PYTHON_VERSION: 3.6 + commands: + - docker-compose build cpp + - docker-compose build python + - docker-compose build dask-integration + - docker-compose run dask-integration + docker-hdfs-integration: platform: linux template: docker-tests/travis.linux.yml params: + environment: + PYTHON_VERSION: 3.6 commands: + - docker-compose build cpp + - docker-compose build python - docker-compose build hdfs-integration - docker-compose run hdfs-integration diff --git a/docker-compose.yml b/docker-compose.yml index d3a7990d5cc23..0a01a7cbe97bf 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -222,6 +222,20 @@ services: # - "21050" # hostname: impala + pandas-master: + # Usage: + # export PYTHON_VERSION=3.6 + # docker-compose build cpp + # docker-compose build python + # docker-compose build --no-cache pandas-master + # docker-compose run pandas-master + image: arrow:pandas-master + build: + context: . 
+ dockerfile: integration/pandas/Dockerfile + shm_size: 2G + volumes: *ubuntu-volumes + hdfs-namenode: image: gelog/hadoop shm_size: 2G @@ -231,7 +245,7 @@ services: command: hdfs namenode hostname: hdfs-namenode - hdfs-datanode: + hdfs-datanode-1: image: gelog/hadoop command: hdfs datanode ports: @@ -241,6 +255,17 @@ services: links: - hdfs-namenode:hdfs-namenode + hdfs-datanode-2: + image: gelog/hadoop + command: hdfs datanode + ports: + # The host port is randomly assigned by Docker, to allow scaling + # to multiple DataNodes on the same host + - "50075" + links: + - hdfs-namenode:hdfs-namenode + + # TODO(kszucs): pass hdfs client version explicitly as a build argument hdfs-integration: # Usage: # export PYTHON_VERSION=3.6 @@ -250,7 +275,8 @@ services: # docker-compose run hdfs-integration links: - hdfs-namenode:hdfs-namenode - - hdfs-datanode:hdfs-datanode + - hdfs-datanode-1:hdfs-datanode-1 + - hdfs-datanode-2:hdfs-datanode-2 environment: - ARROW_HDFS_TEST_HOST=hdfs-namenode - ARROW_HDFS_TEST_PORT=9000 @@ -258,22 +284,20 @@ services: build: context: . dockerfile: integration/hdfs/Dockerfile + volumes: *ubuntu-volumes - pandas-master: + # TODO(kszucs): pass dask version explicitly as a build argument + dask-integration: # Usage: # export PYTHON_VERSION=3.6 # docker-compose build cpp # docker-compose build python - # docker-compose build --no-cache pandas-master - # docker-compose run pandas-master - image: arrow:pandas-master + # docker-compose build dask-integration + # docker-compose run dask-integration build: context: . - dockerfile: integration/pandas/Dockerfile - shm_size: 2G + dockerfile: integration/dask/Dockerfile volumes: *ubuntu-volumes - - # TODO(kszucs): dask-integration # TODO(kszucs): hive-integration # TODO(kszucs): spark-integration diff --git a/python/testing/test_hdfs.sh b/integration/dask/Dockerfile old mode 100755 new mode 100644 similarity index 68% rename from python/testing/test_hdfs.sh rename to integration/dask/Dockerfile index 016e54a66a671..5e054c51c561e --- a/python/testing/test_hdfs.sh +++ b/integration/dask/Dockerfile @@ -1,5 +1,3 @@ -#!/usr/bin/env bash -# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,9 +15,17 @@ # specific language governing permissions and limitations # under the License. -set -ex +FROM arrow:python-3.6 + +# setup /etc/localtime +RUN DEBIAN_FRONTEND=noninteractive \ + apt-get install -y -q tzdata + +# install dask release from conda +RUN conda install -c conda-forge dask pytest=3 && \ + conda clean --all -docker build -t arrow-hdfs-test -f hdfs/Dockerfile . -bash hdfs/restart_docker_container.sh -docker exec -it arrow-hdfs /io/hdfs/run_tests.sh -docker stop arrow-hdfs +# build and test +CMD arrow/ci/docker_build_cpp.sh && \ + arrow/ci/docker_build_python.sh && \ + arrow/integration/dask/runtest.sh diff --git a/integration/dask/runtest.sh b/integration/dask/runtest.sh new file mode 100755 index 0000000000000..9a37e0a67ba9b --- /dev/null +++ b/integration/dask/runtest.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +# check that optional pyarrow modules are available +# because pytest would just skip the dask tests +python -c "import pyarrow.orc" +python -c "import pyarrow.parquet" + +# TODO(kszucs): the following tests are also uses pyarrow +# pytest -sv --pyargs dask.bytes.tests.test_s3 +# pytest -sv --pyargs dask.bytes.tests.test_hdfs +# pytest -sv --pyargs dask.bytes.tests.test_local + +pytest -v --pyargs dask.dataframe.io.tests.test_orc +pytest -v --pyargs dask.dataframe.io.tests.test_parquet +pytest -v --pyargs dask.dataframe.tests.test_dataframe diff --git a/integration/hdfs/Dockerfile b/integration/hdfs/Dockerfile index a1d3e4eb0a598..4fc266f267e76 100644 --- a/integration/hdfs/Dockerfile +++ b/integration/hdfs/Dockerfile @@ -15,63 +15,35 @@ # specific language governing permissions and limitations # under the License. -FROM gelog/hadoop +FROM arrow:python-3.6 -RUN apt-get update && \ - apt-get install -y \ - autoconf \ - automake \ - make \ - gcc \ - g++ \ - git \ - wget \ - pkg-config \ - ninja-build - -ENV CC=gcc \ - CXX=g++ \ - PATH=/opt/conda/bin:$PATH \ - CONDA_PREFIX=/opt/conda - -# install dependencies -ARG PYTHON_VERSION=3.6 -ADD ci/docker_install_conda.sh \ - ci/conda_env_cpp.yml \ - ci/conda_env_python.yml \ - /arrow/ci/ -RUN arrow/ci/docker_install_conda.sh && \ - conda install -c conda-forge \ - --file arrow/ci/conda_env_cpp.yml \ - --file arrow/ci/conda_env_python.yml \ - python=$PYTHON_VERSION && \ - conda clean --all - -# installing in the previous step boost=1.60 and boost-cpp=1.67 gets installed, -# cmake finds 1.60 and parquet fails to compile -# installing it in a separate step, boost=1.60 and boost-cpp=1.64 gets -# installed, cmake finds 1.64 -# libhdfs3 needs to be pinned, see ARROW-1465 and ARROW-1445 +# installing libhdfs3, it needs to be pinned, see ARROW-1465 and ARROW-1445 RUN conda install -y -c conda-forge hdfs3 libhdfs3=2.2.31 && \ conda clean --all +# installing libhdfs (JNI) +ARG HADOOP_VERSION=2.6.5 +ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \ + HADOOP_HOME=/usr/local/hadoop \ + HADOOP_OPTS=-Djava.library.path=/usr/local/hadoop/lib/native \ + PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin +RUN apt-get update -y && \ + apt-get install -y openjdk-8-jdk && \ + wget -q -O hadoop-$HADOOP_VERSION.tar.gz "https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz" && \ + tar -zxf /hadoop-$HADOOP_VERSION.tar.gz && \ + rm /hadoop-$HADOOP_VERSION.tar.gz && \ + mv hadoop-$HADOOP_VERSION /usr/local/hadoop +ADD integration/hdfs/hdfs-site.xml $HADOOP_HOME/etc/hadoop/ + # build cpp with tests -ENV ARROW_HDFS=ON \ +ENV CC=gcc \ + CXX=g++ \ + ARROW_ORC=ON \ + ARROW_HDFS=ON \ ARROW_PYTHON=ON \ - ARROW_BUILD_TESTS=ON \ - LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${HADOOP_HOME}/lib/native" -ADD ci/docker_build_cpp.sh /arrow/ci/ -ADD cpp /arrow/cpp -ADD format /arrow/format -ADD java/pom.xml /arrow/java/pom.xml -RUN arrow/ci/docker_build_cpp.sh - -# build python -ADD ci/docker_build_python.sh /arrow/ci/ -ADD python /arrow/python -RUN arrow/ci/docker_build_python.sh + 
ARROW_BUILD_TESTS=ON -# execute integration tests -ENV LIBHDFS3_CONF=/arrow/integration/hdfs/libhdfs3.xml -ADD integration /arrow/integration -CMD arrow/integration/hdfs/runtest.sh +# build and test +CMD arrow/ci/docker_build_cpp.sh && \ + arrow/ci/docker_build_python.sh && \ + arrow/integration/hdfs/runtest.sh diff --git a/integration/hdfs/hdfs-site.xml b/integration/hdfs/hdfs-site.xml new file mode 100644 index 0000000000000..a80b945a664b7 --- /dev/null +++ b/integration/hdfs/hdfs-site.xml @@ -0,0 +1,44 @@ + + + + + + + + + dfs.replication + 2 + + + dfs.datanode.data.dir + file:///data/dfs/data + + + dfs.namenode.name.dir + file:///data/dfs/name + + + dfs.namenode.checkpoint.dir + file:///data/dfs/namesecondary + + + dfs.namenode.datanode.registration.ip-hostname-check + false + + + dfs.default.replica + 1 + + diff --git a/integration/hdfs/libhdfs3.xml b/integration/hdfs/libhdfs3.xml deleted file mode 100644 index f929929b386da..0000000000000 --- a/integration/hdfs/libhdfs3.xml +++ /dev/null @@ -1,332 +0,0 @@ - - - - - - - - - - - - - - - rpc.client.timeout - 3600000 - - timeout interval of a RPC invocation in millisecond. default is 3600000. - - - - rpc.client.connect.tcpnodelay - true - - whether set socket TCP_NODELAY to true when connect to RPC server. default is true. - - - - - rpc.client.max.idle - 10000 - - the max idle time of a RPC connection in millisecond. default is 10000. - - - - - rpc.client.ping.interval - 10000 - - the interval which the RPC client send a heart beat to server. 0 means disable, default is 10000. - - - - - rpc.client.connect.timeout - 600000 - - the timeout interval in millisecond when the RPC client is trying to setup the connection. default is 600000. - - - - - rpc.client.connect.retry - 10 - - the max retry times if the RPC client fail to setup the connection to server. default is 10. - - - - - rpc.client.read.timeout - 3600000 - - the timeout interval in millisecond when the RPC client is trying to read from server. default is 3600000. - - - - - rpc.client.write.timeout - 3600000 - - the timeout interval in millisecond when the RPC client is trying to write to server. default is 3600000. - - - - - rpc.client.socket.linger.timeout - -1 - - set value to socket SO_LINGER when connect to RPC server. -1 means default OS value. default is -1. - - - - - - dfs.client.read.shortcircuit - false - - whether reading block file bypass datanode if the block and the client are on the same node. default is true. - - - - - dfs.default.replica - 1 - - the default number of replica. default is 3. - - - - - dfs.prefetchsize - 10 - - the default number of blocks which information will be prefetched. default is 10. - - - - - dfs.client.failover.max.attempts - 15 - - if multiply namenodes are configured, it is the max retry times when the dfs client try to issue a RPC call. default is 15. - - - - - dfs.default.blocksize - 134217728 - - default block size. default is 134217728. - - - - - dfs.client.log.severity - INFO - - the minimal log severity level, valid values include FATAL, ERROR, INFO, DEBUG1, DEBUG2, DEBUG3. default is INFO. - - - - - - input.connect.timeout - 600000 - - the timeout interval in millisecond when the input stream is trying to setup the connection to datanode. default is 600000. - - - - - input.read.timeout - 3600000 - - the timeout interval in millisecond when the input stream is trying to read from datanode. default is 3600000. 
- - - - - input.write.timeout - 3600000 - - the timeout interval in millisecond when the input stream is trying to write to datanode. default is 3600000. - - - - - input.localread.default.buffersize - 2097152 - - number of bytes of the buffer which is used to hold the data from block file and verify checksum. - it is only used when "dfs.client.read.shortcircuit" is set to true. default is 1048576. - - - - - input.localread.blockinfo.cachesize - 1000 - - the size of block file path information cache. default is 1000. - - - - - input.read.getblockinfo.retry - 3 - - the max retry times when the client fail to get block information from namenode. default is 3. - - - - - - output.replace-datanode-on-failure - false - - whether the client add new datanode into pipeline if the number of nodes in pipeline is less the specified number of replicas. default is false. - - - - - output.default.chunksize - 512 - - the number of bytes of a chunk in pipeline. default is 512. - - - - - output.default.packetsize - 65536 - - the number of bytes of a packet in pipeline. default is 65536. - - - - - output.default.write.retry - 10 - - the max retry times when the client fail to setup the pipeline. default is 10. - - - - - output.connect.timeout - 600000 - - the timeout interval in millisecond when the output stream is trying to setup the connection to datanode. default is 600000. - - - - - output.read.timeout - 3600000 - - the timeout interval in millisecond when the output stream is trying to read from datanode. default is 3600000. - - - - - output.write.timeout - 3600000 - - the timeout interval in millisecond when the output stream is trying to write to datanode. default is 3600000. - - - - - output.packetpool.size - 1024 - - the max number of packets in a file's packet pool. default is 1024. - - - - - output.close.timeout - 900000 - - the timeout interval in millisecond when close an output stream. default is 900000. - - - - - dfs.domain.socket.path - /var/lib/hadoop-hdfs/dn_socket - - Optional. This is a path to a UNIX domain socket that will be used for - communication between the DataNode and local HDFS clients. - If the string "_PORT" is present in this path, it will be replaced by the - TCP port of the DataNode. - - - - - dfs.client.use.legacy.blockreader.local - false - - Legacy short-circuit reader implementation based on HDFS-2246 is used - if this configuration parameter is true. - This is for the platforms other than Linux - where the new implementation based on HDFS-347 is not available. 
- - - - diff --git a/integration/hdfs/runtest.sh b/integration/hdfs/runtest.sh index a90eb93645369..2f090c8a81fba 100755 --- a/integration/hdfs/runtest.sh +++ b/integration/hdfs/runtest.sh @@ -20,9 +20,17 @@ set -e export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob` +export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop +export LIBHDFS3_CONF=$HADOOP_CONF_DIR/hdfs-site.xml +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/lib/native/ +# execute cpp tests pushd /build/cpp - debug/io-hdfs-test + debug/arrow-io-hdfs-test popd -pytest -v --pyargs pyarrow +# cannot use --pyargs with custom arguments like --hdfs or --only-hdfs, because +# pytest ignores them, see https://github.com/pytest-dev/pytest/issues/3517 +export PYARROW_TEST_ONLY_HDFS=ON + +pytest -v --pyargs pyarrow.tests.test_hdfs diff --git a/python/Dockerfile b/python/Dockerfile index 5c2ef1e30d142..a99a4206290f8 100644 --- a/python/Dockerfile +++ b/python/Dockerfile @@ -25,7 +25,8 @@ RUN conda install -c conda-forge \ python=$PYTHON_VERSION && \ conda clean --all -ENV ARROW_PYTHON=ON +ENV ARROW_PYTHON=ON \ + ARROW_BUILD_TESTS=OFF # build and test CMD arrow/ci/docker_build_cpp.sh && \ diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 69e8e82e2532a..3c092cfb60247 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -99,19 +99,36 @@ def pytest_configure(config): def pytest_addoption(parser): + def bool_env(name, default=None): + value = os.environ.get(name.upper()) + if value is None: + return default + value = value.lower() + if value in {'1', 'true', 'on', 'yes', 'y'}: + return True + elif value in {'0', 'false', 'off', 'no', 'n'}: + return False + else: + raise ValueError('{}={} is not parsable as boolean' + .format(name.upper(), value)) + for group in groups: - for flag in ['--{0}', '--enable-{0}']: - parser.addoption(flag.format(group), action='store_true', - default=defaults[group], - help=('Enable the {0} test group'.format(group))) - - parser.addoption('--disable-{0}'.format(group), action='store_true', - default=False, - help=('Disable the {0} test group'.format(group))) - - parser.addoption('--only-{0}'.format(group), action='store_true', - default=False, - help=('Run only the {0} test group'.format(group))) + for flag, envvar in [('--{}', 'PYARROW_TEST_{}'), + ('--enable-{}', 'PYARROW_TEST_ENABLE_{}')]: + default = bool_env(envvar.format(group), defaults[group]) + parser.addoption(flag.format(group), + action='store_true', default=default, + help=('Enable the {} test group'.format(group))) + + default = bool_env('PYARROW_TEST_DISABLE_{}'.format(group), False) + parser.addoption('--disable-{}'.format(group), + action='store_true', default=default, + help=('Disable the {} test group'.format(group))) + + default = bool_env('PYARROW_TEST_ONLY_{}'.format(group), False) + parser.addoption('--only-{}'.format(group), + action='store_true', default=default, + help=('Run only the {} test group'.format(group))) parser.addoption('--runslow', action='store_true', default=False, help='run slow tests') diff --git a/python/pyarrow/tests/test_hdfs.py b/python/pyarrow/tests/test_hdfs.py index 81b03b6fb7e4e..f218a1604a9d9 100644 --- a/python/pyarrow/tests/test_hdfs.py +++ b/python/pyarrow/tests/test_hdfs.py @@ -15,21 +15,22 @@ # specific language governing permissions and limitations # under the License. 
-from io import BytesIO -from os.path import join as pjoin import os import pickle +import pytest import random import unittest +import pandas.util.testing as pdt + +from io import BytesIO +from os.path import join as pjoin import numpy as np -import pandas.util.testing as pdt -import pytest +import pyarrow as pa +import pyarrow.tests.test_parquet as test_parquet from pyarrow.compat import guid -import pyarrow as pa -import pyarrow.tests.test_parquet as test_parquet # ---------------------------------------------------------------------- # HDFS tests @@ -406,3 +407,25 @@ def _get_hdfs_uri(path): uri = "hdfs://{}:{}{}".format(host, port, path) return uri + + +@pytest.mark.parquet +@pytest.mark.fastparquet +@pytest.mark.parametrize('client', ['libhdfs', 'libhdfs3']) +def test_fastparquet_read_with_hdfs(client): + import pyarrow.parquet as pq + fastparquet = pytest.importorskip('fastparquet') + + fs = hdfs_test_client(client) + + df = pdt.makeDataFrame() + table = pa.Table.from_pandas(df) + + path = '/tmp/testing.parquet' + with fs.open(path, 'wb') as f: + pq.write_table(table, f) + + parquet_file = fastparquet.ParquetFile(path, open_with=fs.open) + + result = parquet_file.to_pandas() + pdt.assert_frame_equal(result, df) diff --git a/python/testing/README.md b/python/testing/README.md deleted file mode 100644 index d7d0ff0bb7f47..0000000000000 --- a/python/testing/README.md +++ /dev/null @@ -1,42 +0,0 @@ - - -# Testing tools for odds and ends - -## Testing Dask integration - -Initial integration testing with Dask has been Dockerized. -To invoke the test run the following command in the `arrow` -root-directory: - -```shell -bash dev/dask_integration.sh -``` - -This script will create a `dask` directory on the same level as -`arrow`. It will clone the Dask project from Github into `dask` -and do a Python `--user` install. The Docker code will use the parent -directory of `arrow` as `$HOME` and that's where Python will -install `dask` into a `.local` directory. - -The output of the Docker session will contain the results of tests -of the Dask dataframe followed by the single integration test that -now exists for Arrow. That test creates a set of `csv`-files and then -does parallel reading of `csv`-files into a Dask dataframe. The code -for this test resides here in the `dask_test` directory. diff --git a/python/testing/dask_tests/test_dask_integration.py b/python/testing/dask_tests/test_dask_integration.py deleted file mode 100644 index 842c45f57d1f7..0000000000000 --- a/python/testing/dask_tests/test_dask_integration.py +++ /dev/null @@ -1,58 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import pytest - -from datetime import date, timedelta -import csv -from random import randint - -import pyarrow as pa - -dd = pytest.importorskip('dask.dataframe') - - -def make_datafiles(tmpdir, prefix='data', num_files=20): - rowcount = 5000 - fieldnames = ['date', 'temperature', 'dewpoint'] - start_date = date(1900, 1, 1) - for i in range(num_files): - filename = '{0}/{1}-{2}.csv'.format(tmpdir, prefix, i) - with open(filename, 'w') as outcsv: - writer = csv.DictWriter(outcsv, fieldnames) - writer.writeheader() - the_date = start_date - for _ in range(rowcount): - temperature = randint(-10, 35) - dewpoint = temperature - randint(0, 10) - writer.writerow({'date': the_date, 'temperature': temperature, - 'dewpoint': dewpoint}) - the_date += timedelta(days=1) - - -def test_dask_file_read(tmpdir): - prefix = 'data' - make_datafiles(tmpdir, prefix) - # Read all datafiles in parallel - datafiles = '{0}/{1}-*.csv'.format(tmpdir, prefix) - dask_df = dd.read_csv(datafiles) - # Convert Dask dataframe to Arrow table - table = pa.Table.from_pandas(dask_df.compute()) - # Second column (1) is temperature - dask_temp = int(1000 * dask_df['temperature'].mean().compute()) - arrow_temp = int(1000 * table[1].to_pandas().mean()) - assert dask_temp == arrow_temp diff --git a/python/testing/functions.sh b/python/testing/functions.sh deleted file mode 100644 index 983f490331ff8..0000000000000 --- a/python/testing/functions.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -use_gcc() { - export CC=gcc-4.9 - export CXX=g++-4.9 -} - -use_clang() { - export CC=clang-4.0 - export CXX=clang++-4.0 -} - -bootstrap_python_env() { - PYTHON_VERSION=$1 - CONDA_ENV_DIR=$BUILD_DIR/pyarrow-test-$PYTHON_VERSION - - conda create -y -q -p $CONDA_ENV_DIR python=$PYTHON_VERSION cmake curl - conda activate $CONDA_ENV_DIR - - python --version - which python - - # faster builds, please - conda install -y -q nomkl pip numpy pandas cython -} - -build_pyarrow() { - # Other stuff pip install - pushd $ARROW_PYTHON_DIR - pip install -r requirements.txt - python setup.py build_ext --with-parquet --with-plasma \ - install --single-version-externally-managed --record=record.text - popd - - python -c "import pyarrow.parquet" - python -c "import pyarrow.plasma" - - export PYARROW_PATH=$CONDA_PREFIX/lib/python$PYTHON_VERSION/site-packages/pyarrow -} - -build_arrow() { - mkdir -p $ARROW_CPP_BUILD_DIR - pushd $ARROW_CPP_BUILD_DIR - - cmake -GNinja \ - -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ - -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ - -DARROW_NO_DEPRECATED_API=ON \ - -DARROW_PARQUET=ON \ - -DARROW_PYTHON=ON \ - -DARROW_PLASMA=ON \ - -DARROW_BOOST_USE_SHARED=off \ - $ARROW_CPP_DIR - - ninja - ninja install - popd -} diff --git a/python/testing/parquet_interop.py b/python/testing/parquet_interop.py deleted file mode 100644 index 6d41ba4b6a5f1..0000000000000 --- a/python/testing/parquet_interop.py +++ /dev/null @@ -1,51 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os - -import fastparquet -import pyarrow as pa -import pyarrow.parquet as pq -import pandas.util.testing as tm - - -def hdfs_test_client(driver='libhdfs'): - host = os.environ.get('ARROW_HDFS_TEST_HOST', 'localhost') - user = os.environ['ARROW_HDFS_TEST_USER'] - try: - port = int(os.environ.get('ARROW_HDFS_TEST_PORT', 20500)) - except ValueError: - raise ValueError('Env variable ARROW_HDFS_TEST_PORT was not ' - 'an integer') - - return pa.HdfsClient(host, port, user, driver=driver) - - -def test_fastparquet_read_with_hdfs(): - fs = hdfs_test_client() - - df = tm.makeDataFrame() - table = pa.Table.from_pandas(df) - - path = '/tmp/testing.parquet' - with fs.open(path, 'wb') as f: - pq.write_table(table, f) - - parquet_file = fastparquet.ParquetFile(path, open_with=fs.open) - - result = parquet_file.to_pandas() - tm.assert_frame_equal(result, df) diff --git a/python/testing/set_env_common.sh b/python/testing/set_env_common.sh deleted file mode 100644 index 00251f92be4b4..0000000000000 --- a/python/testing/set_env_common.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -export MINICONDA=$HOME/miniconda -export CPP_TOOLCHAIN=$HOME/cpp-toolchain - -export PATH="$MINICONDA/bin:$PATH" -export CONDA_PKGS_DIRS=$HOME/.conda_packages - -export ARROW_CHECKOUT=$HOME/arrow -export BUILD_DIR=$ARROW_CHECKOUT - -export BUILD_OS_NAME=linux -export BUILD_TYPE=debug - -export ARROW_CPP_DIR=$BUILD_DIR/cpp -export ARROW_PYTHON_DIR=$BUILD_DIR/python -export ARROW_C_GLIB_DIR=$BUILD_DIR/c_glib -export ARROW_JAVA_DIR=${BUILD_DIR}/java -export ARROW_JS_DIR=${BUILD_DIR}/js -export ARROW_INTEGRATION_DIR=$BUILD_DIR/integration - -export CPP_BUILD_DIR=$BUILD_DIR/cpp-build - -export ARROW_CPP_INSTALL=$BUILD_DIR/cpp-install -export ARROW_CPP_BUILD_DIR=$BUILD_DIR/cpp-build -export ARROW_C_GLIB_INSTALL=$BUILD_DIR/c-glib-install - -export ARROW_BUILD_TOOLCHAIN=$CPP_TOOLCHAIN -export PARQUET_BUILD_TOOLCHAIN=$CPP_TOOLCHAIN - -export BOOST_ROOT=$CPP_TOOLCHAIN -export PATH=$CPP_TOOLCHAIN/bin:$PATH -export LD_LIBRARY_PATH=$CPP_TOOLCHAIN/lib:$LD_LIBRARY_PATH - -export VALGRIND="valgrind --tool=memcheck" - -export ARROW_HOME=$CPP_TOOLCHAIN -export PARQUET_HOME=$CPP_TOOLCHAIN - -# Arrow test variables - -export JAVA_HOME=/usr/lib/jvm/java-7-oracle -export HADOOP_HOME=/usr/lib/hadoop -export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob` -export HADOOP_OPTS="$HADOOP_OPTS -Djava.library.path=$HADOOP_HOME/lib/native" -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/lib/native/ - -export ARROW_HDFS_TEST_HOST=arrow-hdfs -export ARROW_HDFS_TEST_PORT=9000 -export ARROW_HDFS_TEST_USER=ubuntu -export ARROW_LIBHDFS_DIR=/usr/lib - -export LIBHDFS3_CONF=/io/hdfs/libhdfs3-hdfs-client.xml diff --git a/python/testing/setup_toolchain.sh b/python/testing/setup_toolchain.sh deleted file mode 100644 index 498206ef33a79..0000000000000 --- a/python/testing/setup_toolchain.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -set -e - -export PATH="$MINICONDA/bin:$PATH" -conda update -y -q conda -conda config --set auto_update_conda false -conda info -a - -conda config --set show_channel_urls True - -# Help with SSL timeouts to S3 -conda config --set remote_connect_timeout_secs 12 - -conda config --add channels https://repo.continuum.io/pkgs/free -conda config --add channels conda-forge -conda info -a - -# faster builds, please -conda install -y nomkl - -conda install --y conda-build jinja2 anaconda-client cmake curl - -# Set up C++ toolchain -conda create -y -q -p $CPP_TOOLCHAIN python=3.6 \ - jemalloc=4.4.0 \ - nomkl \ - boost-cpp \ - rapidjson \ - flatbuffers \ - gflags \ - lz4-c \ - snappy \ - zstd \ - brotli \ - zlib \ - git \ - cmake \ - curl \ - thrift-cpp \ - libhdfs3 \ - glog \ - ninja - -if [ $BUILD_OS_NAME == "osx" ]; then - brew update && brew bundle --file=python/Brewfile -fi From 5c48bdb5de7d46b0cf3a479f393224688474b940 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 17 Dec 2018 10:55:20 -0600 Subject: [PATCH 064/328] ARROW-2637: [C++/Python] Build support and instructions for development on Alpine Linux MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Build support is tested by `docker-compose run cpp-alpine` Author: Krisztián Szűcs Closes #3191 from kszucs/ARROW-2637 and squashes the following commits: ea43e08ee add bash to run the tests 348e982a0 add README instructions to build arrow on alpine linux --- cpp/README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cpp/README.md b/cpp/README.md index 010387dbd4de3..a94c4be4f2cd4 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -46,6 +46,18 @@ sudo apt-get install \ libboost-system-dev ``` +On Alpine Linux: + +```shell +apk add autoconf \ + bash \ + boost-dev \ + cmake \ + g++ \ + gcc \ + make +``` + On macOS, you can use [Homebrew][1]: ```shell From 0b78f4bdf70617279bbd3997ed79a7194cf66438 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 17 Dec 2018 15:11:25 -0600 Subject: [PATCH 065/328] ARROW-4033: [C++] Use readlink -f instead of realpath in dependency download script This documentation might be better moved to the Sphinx docs. Author: Wes McKinney Closes #3205 from wesm/ARROW-4033 and squashes the following commits: 21349f02f Use readlink -f instead of realpath --- cpp/thirdparty/README.md | 45 +++++++++++++++---------- cpp/thirdparty/download_dependencies.sh | 2 +- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/cpp/thirdparty/README.md b/cpp/thirdparty/README.md index bd1cb28d81818..0353395dfb1ff 100644 --- a/cpp/thirdparty/README.md +++ b/cpp/thirdparty/README.md @@ -29,17 +29,24 @@ offline builds. 
To set up your own specific build toolchain, here are the relevant environment variables +* brotli: `BROTLI_HOME`, can be disabled with `-DARROW_WITH_BROTLI=off` * Boost: `BOOST_ROOT` +* double-conversion: `DOUBLE_CONVERSION_HOME` * Googletest: `GTEST_HOME` (only required to build the unit tests) * gflags: `GFLAGS_HOME` (only required to build the unit tests) +* glog: `GLOG_HOME` (only required if `ARROW_USE_GLOG=ON`) * Google Benchmark: `GBENCHMARK_HOME` (only required if building benchmarks) * Flatbuffers: `FLATBUFFERS_HOME` (only required for -DARROW_IPC=on, which is the default) * Hadoop: `HADOOP_HOME` (only required for the HDFS I/O extensions) * jemalloc: `JEMALLOC_HOME` -* brotli: `BROTLI_HOME`, can be disabled with `-DARROW_WITH_BROTLI=off` * lz4: `LZ4_HOME`, can be disabled with `-DARROW_WITH_LZ4=off` +* Apache ORC: `ORC_HOME` +* protobuf: `PROTOBUF_HOME` +* rapidjson: `RAPIDJSON_HOME` +* re2: `RE2_HOME` (only required to build Gandiva currently) * snappy: `SNAPPY_HOME`, can be disabled with `-DARROW_WITH_SNAPPY=off` +* thrift: `THRIFT_HOME` * zlib: `ZLIB_HOME`, can be disabled with `-DARROW_WITH_ZLIB=off` * zstd: `ZSTD_HOME`, can be disabled with `-DARROW_WITH_ZSTD=off` @@ -69,24 +76,26 @@ script: ```shell # Download tarballs into `$HOME/arrow-thirdparty-deps` -$ ./thirdparty/download_dependencies $HOME/arrow-thirdparty-deps -# some output omitted - +$ ./thirdparty/download_dependencies $HOME/arrow-thirdparty # Environment variables for offline Arrow build -export ARROW_BOOST_URL=$HOME/arrow-thirdparty-deps/boost.tar.gz -export ARROW_GTEST_URL=$HOME/arrow-thirdparty-deps/gtest.tar.gz -export ARROW_GFLAGS_URL=$HOME/arrow-thirdparty-deps/gflags.tar.gz -export ARROW_GBENCHMARK_URL=$HOME/arrow-thirdparty-deps/gbenchmark.tar.gz -export ARROW_FLATBUFFERS_URL=$HOME/arrow-thirdparty-deps/flatbuffers.tar.gz -export ARROW_RAPIDJSON_URL=$HOME/arrow-thirdparty-deps/rapidjson.tar.gz -export ARROW_SNAPPY_URL=$HOME/arrow-thirdparty-deps/snappy.tar.gz -export ARROW_BROTLI_URL=$HOME/arrow-thirdparty-deps/brotli.tar.gz -export ARROW_LZ4_URL=$HOME/arrow-thirdparty-deps/lz4.tar.gz -export ARROW_ZLIB_URL=$HOME/arrow-thirdparty-deps/zlib.tar.gz -export ARROW_ZSTD_URL=$HOME/arrow-thirdparty-deps/zstd.tar.gz -export ARROW_PROTOBUF_URL=$HOME/arrow-thirdparty-deps/protobuf.tar.gz -export ARROW_GRPC_URL=$HOME/arrow-thirdparty-deps/grpc.tar.gz -export ARROW_ORC_URL=$HOME/arrow-thirdparty-deps/orc.tar.gz +export ARROW_BOOST_URL=$HOME/arrow-thirdparty/boost-1.67.0.tar.gz +export ARROW_BROTLI_URL=$HOME/arrow-thirdparty/brotli-v0.6.0.tar.gz +export ARROW_DOUBLE_CONVERSION_URL=$HOME/arrow-thirdparty/double-conversion-v3.1.1.tar.gz +export ARROW_FLATBUFFERS_URL=$HOME/arrow-thirdparty/flatbuffers-02a7807dd8d26f5668ffbbec0360dc107bbfabd5.tar.gz +export ARROW_GBENCHMARK_URL=$HOME/arrow-thirdparty/gbenchmark-v1.4.1.tar.gz +export ARROW_GFLAGS_URL=$HOME/arrow-thirdparty/gflags-v2.2.0.tar.gz +export ARROW_GLOG_URL=$HOME/arrow-thirdparty/glog-v0.3.5.tar.gz +export ARROW_GRPC_URL=$HOME/arrow-thirdparty/grpc-v1.14.1.tar.gz +export ARROW_GTEST_URL=$HOME/arrow-thirdparty/gtest-1.8.0.tar.gz +export ARROW_LZ4_URL=$HOME/arrow-thirdparty/lz4-v1.7.5.tar.gz +export ARROW_ORC_URL=$HOME/arrow-thirdparty/orc-1.5.1.tar.gz +export ARROW_PROTOBUF_URL=$HOME/arrow-thirdparty/protobuf-v3.6.1.tar.gz +export ARROW_RAPIDJSON_URL=$HOME/arrow-thirdparty/rapidjson-v1.1.0.tar.gz +export ARROW_RE2_URL=$HOME/arrow-thirdparty/re2-2018-10-01.tar.gz +export ARROW_SNAPPY_URL=$HOME/arrow-thirdparty/snappy-1.1.3.tar.gz +export 
ARROW_THRIFT_URL=$HOME/arrow-thirdparty/thrift-0.11.0.tar.gz +export ARROW_ZLIB_URL=$HOME/arrow-thirdparty/zlib-1.2.8.tar.gz +export ARROW_ZSTD_URL=$HOME/arrow-thirdparty/zstd-v1.3.7.tar.gz ``` This can be automated by using inline source/eval: diff --git a/cpp/thirdparty/download_dependencies.sh b/cpp/thirdparty/download_dependencies.sh index de7d23ca2ef5e..f782963dd1450 100755 --- a/cpp/thirdparty/download_dependencies.sh +++ b/cpp/thirdparty/download_dependencies.sh @@ -30,7 +30,7 @@ else DESTDIR=$1 fi -DESTDIR=$(realpath "${DESTDIR}") +DESTDIR=$(readlink -f "${DESTDIR}") download_dependency() { local url=$1 From 9fcce64e6108dd911c9cfcd4121ea33e2b447c91 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 17 Dec 2018 15:23:42 -0600 Subject: [PATCH 066/328] ARROW-4026: [C++] Add *-all, *-tests, *-benchmarks modular CMake targets. Use in Travis CI This provides much more granular control over what targets are built. Before this patch `ninja arrow` would build all libraries _and_ tests if `ARROW_BUILD_TESTS=ON`. If you wanted to build the tests for a dependent target of the Arrow libraries, like Parquet or Plasma, you were forced to build the Arrow core unit tests. Now you can do ``` ninja parquet-tests ``` And it will only build the Arrow and Parquet libraries, and the tests labeled with "parquet-tests". Similarly this allows you to rebuild the libraries without necessarily having to relink all the unit tests (e.g. with `ninja arrow` or `ninja parquet`) Author: Wes McKinney Closes #3204 from wesm/ARROW-4026 and squashes the following commits: 1e41eee2d Misc fixes, add missing toolchain dependency 420282433 Add *-all, *-tests, *-benchmarks modular build targets. Use in Travis CI --- .travis.yml | 8 +-- ci/travis_before_script_cpp.sh | 4 -- ci/travis_script_gandiva_cpp.sh | 2 +- ci/travis_script_python.sh | 5 +- cpp/CMakeLists.txt | 17 ++++--- cpp/README.md | 8 ++- cpp/cmake_modules/BuildUtils.cmake | 52 +++++++++----------- cpp/cmake_modules/ThirdpartyToolchain.cmake | 30 ++++------- cpp/src/arrow/CMakeLists.txt | 19 +++++-- cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt | 5 +- cpp/src/arrow/flight/CMakeLists.txt | 1 - cpp/src/arrow/gpu/CMakeLists.txt | 13 ++++- cpp/src/arrow/ipc/CMakeLists.txt | 2 +- cpp/src/arrow/python/CMakeLists.txt | 5 +- cpp/src/gandiva/CMakeLists.txt | 15 +++--- cpp/src/gandiva/jni/CMakeLists.txt | 2 - cpp/src/gandiva/precompiled/CMakeLists.txt | 2 +- cpp/src/parquet/CMakeLists.txt | 25 ++++++---- cpp/src/parquet/arrow/CMakeLists.txt | 4 +- cpp/src/plasma/CMakeLists.txt | 6 ++- 20 files changed, 122 insertions(+), 103 deletions(-) diff --git a/.travis.yml b/.travis.yml index d22a4e7df0fea..bf0261b3fa1ea 100644 --- a/.travis.yml +++ b/.travis.yml @@ -112,9 +112,7 @@ matrix: - ARROW_TRAVIS_GANDIVA_JAVA=1 - ARROW_TRAVIS_GANDIVA_TESTS=1 - ARROW_TRAVIS_OPTIONAL_INSTALL=1 - - ARROW_CPP_BUILD_TARGETS="gandiva" - # TODO(wesm): Remove this after ARROW-4026 - - ARROW_TRAVIS_CPP_TEST_INCLUDE_LABELS="gandiva" + - ARROW_CPP_BUILD_TARGETS="gandiva-all" - ARROW_TRAVIS_USE_TOOLCHAIN=1 # ARROW-3979 temporarily disabled. 
- ARROW_TRAVIS_VALGRIND=0 @@ -164,9 +162,7 @@ matrix: - ARROW_TRAVIS_GANDIVA_JAVA=1 - ARROW_TRAVIS_GANDIVA_TESTS=1 - ARROW_TRAVIS_OPTIONAL_INSTALL=1 - - ARROW_CPP_BUILD_TARGETS="gandiva" - # TODO(wesm): Remove this after ARROW-4026 - - ARROW_TRAVIS_CPP_TEST_INCLUDE_LABELS="gandiva" + - ARROW_CPP_BUILD_TARGETS="gandiva-all" - ARROW_TRAVIS_USE_TOOLCHAIN=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN before_script: diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 6cb7d6074f230..aa5b2a6ab084c 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -42,7 +42,6 @@ fi CMAKE_COMMON_FLAGS="\ -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL \ --DARROW_TEST_INCLUDE_LABELS=$ARROW_TRAVIS_CPP_TEST_INCLUDE_LABELS \ -DARROW_NO_DEPRECATED_API=ON \ -DARROW_EXTRA_ERROR_CONTEXT=ON" CMAKE_LINUX_FLAGS="" @@ -102,9 +101,6 @@ if [ $ARROW_TRAVIS_GANDIVA == "1" ]; then if [ $ARROW_TRAVIS_GANDIVA_JAVA == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA_JAVA=ON" fi - if [ $ARROW_TRAVIS_GANDIVA_TESTS == "1" ]; then - CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_BUILD_TESTS=ON" - fi fi if [ $ARROW_TRAVIS_VALGRIND == "1" ]; then diff --git a/ci/travis_script_gandiva_cpp.sh b/ci/travis_script_gandiva_cpp.sh index f3c379393fe14..bc4a7a9a8f03b 100755 --- a/ci/travis_script_gandiva_cpp.sh +++ b/ci/travis_script_gandiva_cpp.sh @@ -23,7 +23,7 @@ source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh pushd $CPP_BUILD_DIR -PATH=$ARROW_BUILD_TYPE:$PATH ctest -j2 --output-on-failure -L gandiva +PATH=$ARROW_BUILD_TYPE:$PATH ctest -j2 --output-on-failure -L gandiva-tests popd diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index b8385c3834266..20ec57efc39e4 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -87,21 +87,20 @@ rm -rf * # XXX Can we simply reuse CMAKE_COMMON_FLAGS from travis_before_script_cpp.sh? CMAKE_COMMON_FLAGS="-DARROW_EXTRA_ERROR_CONTEXT=ON" -PYTHON_CPP_BUILD_TARGETS="arrow_python plasma" +PYTHON_CPP_BUILD_TARGETS="arrow_python-all plasma" if [ $ARROW_TRAVIS_COVERAGE == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GENERATE_COVERAGE=ON" fi if [ $ARROW_TRAVIS_PYTHON_GANDIVA == "1" ]; then - CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA=ON -DARROW_GANDIVA_BUILD_TESTS=OFF" + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA=ON" PYTHON_CPP_BUILD_TARGETS="$PYTHON_CPP_BUILD_TARGETS gandiva" fi cmake -GNinja \ $CMAKE_COMMON_FLAGS \ -DARROW_BUILD_TESTS=ON \ - -DARROW_TEST_INCLUDE_LABELS=python \ -DARROW_BUILD_UTILITIES=OFF \ -DARROW_OPTIONAL_INSTALL=ON \ -DARROW_PLASMA=on \ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f563199c62470..60cbe85d10b6d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -333,10 +333,6 @@ Always OFF if building binaries" #---------------------------------------------------------------------- # Advanced developer options - set(ARROW_TEST_INCLUDE_LABELS "" CACHE STRING - "Only build unit tests having the indicated label or labels. 
\ -Pass multiple labels by dividing with semicolons") - option(ARROW_EXTRA_ERROR_CONTEXT "Compile with extra error context (line numbers, code)" OFF) @@ -466,10 +462,18 @@ endif() if(NOT ARROW_BUILD_TESTS) set(NO_TESTS 1) +else() + add_custom_target(all-tests) + add_custom_target(unittest ctest -L unittest) + add_dependencies(unittest all-tests) endif() if(NOT ARROW_BUILD_BENCHMARKS) set(NO_BENCHMARKS 1) +else() + add_custom_target(all-benchmarks) + add_custom_target(benchmark ctest -L benchmark) + add_dependencies(benchmark all-benchmarks) endif() if(NOT ARROW_BUILD_EXAMPLES) @@ -516,8 +520,6 @@ include(SetupCxxFlags) # Dependencies ############################################################ -add_custom_target(arrow_dependencies) - include(BuildUtils) enable_testing() @@ -712,6 +714,9 @@ if (ARROW_USE_GLOG) add_definitions("-DARROW_USE_GLOG") endif() +add_custom_target(arrow_dependencies) +add_dependencies(arrow_dependencies toolchain) + if (ARROW_STATIC_LINK_LIBS) add_dependencies(arrow_dependencies ${ARROW_STATIC_LINK_LIBS}) endif() diff --git a/cpp/README.md b/cpp/README.md index a94c4be4f2cd4..5940db1f44301 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -105,14 +105,18 @@ export LC_ALL="en_US.UTF-8" ## Modular Build Targets Since there are several major parts of the C++ project, we have provided -modular CMake targets for building each component along with its dependencies, -unit tests, and benchmarks (if enabled): +modular CMake targets for building each library component, group of unit tests +and benchmarks, and their dependencies: * `make arrow` for Arrow core libraries * `make parquet` for Parquet libraries * `make gandiva` for Gandiva (LLVM expression compiler) libraries * `make plasma` for Plasma libraries, server +To build the unit tests or benchmarks, add `-tests` or `-benchmarks` to the +target name. So `make arrow-tests` will build the Arrow core unit tests. Using +the `-all` target, e.g. `parquet-all`, will build everything. + If you wish to only build and install one or more project subcomponents, we have provided the CMake option `ARROW_OPTIONAL_INSTALL` to only install targets that have been built. For example, if you only wish to build the Parquet diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 812d0c39e7fa5..7c1db679bf23e 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -308,6 +308,9 @@ endfunction() # \arg PREFIX a string to append to the name of the benchmark executable. For # example, if you have src/arrow/foo/bar-benchmark.cc, then PREFIX "foo" will # create test executable foo-bar-benchmark +# \arg LABELS the benchmark label or labels to assign the unit tests to. By +# default, benchmarks will go in the "benchmark" group. 
Custom targets for the +# group names must exist function(ADD_BENCHMARK REL_BENCHMARK_NAME) set(options) set(one_value_args) @@ -343,20 +346,22 @@ function(ADD_BENCHMARK REL_BENCHMARK_NAME) set(NO_COLOR "") endif() + # Add test as dependency of relevant label targets + add_dependencies(all-benchmarks ${BENCHMARK_NAME}) + foreach (TARGET ${ARG_LABELS}) + add_dependencies(${TARGET} ${BENCHMARK_NAME}) + endforeach() + if (ARG_DEPENDENCIES) add_dependencies(${BENCHMARK_NAME} ${ARG_DEPENDENCIES}) endif() if (ARG_LABELS) - set(ARG_LABELS "${ARG_LABELS}") + set(ARG_LABELS "benchmark;${ARG_LABELS}") else() set(ARG_LABELS benchmark) endif() - foreach (TEST_LABEL ${ARG_LABELS}) - add_dependencies(${TEST_LABEL} ${BENCHMARK_NAME}) - endforeach() - add_test(${BENCHMARK_NAME} ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} benchmark ${BENCHMARK_PATH} ${NO_COLOR}) set_property(TEST ${BENCHMARK_NAME} @@ -389,7 +394,7 @@ endfunction() # \arg LABELS the unit test label or labels to assign the unit tests # to. By default, unit tests will go in the "unittest" group, but if we have # multiple unit tests in some subgroup, you can assign a test to multiple -# groups using the syntax unittest;GROUP2;GROUP3. Custom targets for the group +# groups use the syntax unittest;GROUP2;GROUP3. Custom targets for the group # names must exist function(ADD_TEST_CASE REL_TEST_NAME) set(options NO_VALGRIND ENABLED) @@ -401,18 +406,6 @@ function(ADD_TEST_CASE REL_TEST_NAME) message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") endif() - if (NOT "${ARROW_TEST_INCLUDE_LABELS}" STREQUAL "") - set(_SKIP_TEST TRUE) - foreach (_INCLUDED_LABEL ${ARROW_TEST_INCLUDE_LABELS}) - if ("${ARG_LABELS}" MATCHES "${_INCLUDED_LABEL}") - set(_SKIP_TEST FALSE) - endif() - endforeach() - if (_SKIP_TEST) - return() - endif() - endif() - if (NO_TESTS AND NOT ARG_ENABLED) return() endif() @@ -422,12 +415,6 @@ function(ADD_TEST_CASE REL_TEST_NAME) set(TEST_NAME "${ARG_PREFIX}-${TEST_NAME}") endif() - if (ARG_LABELS) - set(ARG_LABELS "${ARG_LABELS}") - else() - set(ARG_LABELS unittest) - endif() - if (ARG_SOURCES) set(SOURCES ${ARG_SOURCES}) else() @@ -458,10 +445,6 @@ function(ADD_TEST_CASE REL_TEST_NAME) add_dependencies(${TEST_NAME} ${ARG_EXTRA_DEPENDENCIES}) endif() - foreach (TEST_LABEL ${ARG_LABELS}) - add_dependencies(${TEST_LABEL} ${TEST_NAME}) - endforeach() - if (ARROW_TEST_MEMCHECK AND NOT ARG_NO_VALGRIND) SET_PROPERTY(TARGET ${TEST_NAME} APPEND_STRING PROPERTY @@ -477,6 +460,18 @@ function(ADD_TEST_CASE REL_TEST_NAME) ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} test ${TEST_PATH}) endif() + # Add test as dependency of relevant targets + add_dependencies(all-tests ${TEST_NAME}) + foreach (TARGET ${ARG_LABELS}) + add_dependencies(${TARGET} ${TEST_NAME}) + endforeach() + + if (ARG_LABELS) + set(ARG_LABELS "unittest;${ARG_LABELS}") + else() + set(ARG_LABELS unittest) + endif() + set_property(TEST ${TEST_NAME} APPEND PROPERTY LABELS ${ARG_LABELS}) @@ -537,7 +532,6 @@ function(ADD_ARROW_EXAMPLE REL_EXAMPLE_NAME) add_dependencies(${EXAMPLE_NAME} ${ARG_DEPENDENCIES}) endif() - add_test(${EXAMPLE_NAME} ${EXAMPLE_PATH}) set_tests_properties(${EXAMPLE_NAME} PROPERTIES LABELS "example") endfunction() diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index c007b1c225bb9..d493de75a55f5 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # 
under the License. +add_custom_target(toolchain) + # ---------------------------------------------------------------------- # Toolchain linkage options @@ -401,7 +403,7 @@ if (ARROW_BOOST_VENDORED) ${EP_LOG_OPTIONS}) set(Boost_INCLUDE_DIR "${BOOST_PREFIX}") set(Boost_INCLUDE_DIRS "${BOOST_INCLUDE_DIR}") - add_dependencies(arrow_dependencies boost_ep) + add_dependencies(toolchain boost_ep) else() if (MSVC) # disable autolinking in boost @@ -506,15 +508,14 @@ if("${DOUBLE_CONVERSION_HOME}" STREQUAL "") CMAKE_ARGS ${DOUBLE_CONVERSION_CMAKE_ARGS} BUILD_BYPRODUCTS "${DOUBLE_CONVERSION_STATIC_LIB}") set(DOUBLE_CONVERSION_VENDORED 1) + add_dependencies(toolchain double-conversion_ep) else() find_package(double-conversion REQUIRED PATHS "${DOUBLE_CONVERSION_HOME}") set(DOUBLE_CONVERSION_VENDORED 0) endif() -if (DOUBLE_CONVERSION_VENDORED) - add_dependencies(arrow_dependencies double-conversion_ep) -else() +if (NOT DOUBLE_CONVERSION_VENDORED) get_property(DOUBLE_CONVERSION_STATIC_LIB TARGET double-conversion::double-conversion PROPERTY LOCATION) get_property(DOUBLE_CONVERSION_INCLUDE_DIR TARGET double-conversion::double-conversion @@ -532,9 +533,6 @@ message(STATUS "double-conversion static library: ${DOUBLE_CONVERSION_STATIC_LIB # ---------------------------------------------------------------------- # Google gtest & gflags -add_custom_target(unittest ctest -L unittest) -add_custom_target(benchmark ctest -L benchmark) - if(ARROW_BUILD_TESTS OR ARROW_GANDIVA_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) if("${GTEST_HOME}" STREQUAL "") @@ -699,6 +697,7 @@ if (ARROW_IPC) ExternalProject_Get_Property(rapidjson_ep SOURCE_DIR) set(RAPIDJSON_INCLUDE_DIR "${SOURCE_DIR}/include") set(RAPIDJSON_VENDORED 1) + add_dependencies(toolchain rapidjson_ep) else() set(RAPIDJSON_INCLUDE_DIR "${RAPIDJSON_HOME}/include") set(RAPIDJSON_VENDORED 0) @@ -706,10 +705,6 @@ if (ARROW_IPC) message(STATUS "RapidJSON include dir: ${RAPIDJSON_INCLUDE_DIR}") include_directories(SYSTEM ${RAPIDJSON_INCLUDE_DIR}) - if(RAPIDJSON_VENDORED) - add_dependencies(arrow_dependencies rapidjson_ep) - endif() - ## Flatbuffers if("${FLATBUFFERS_HOME}" STREQUAL "") set(FLATBUFFERS_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/flatbuffers_ep-prefix/src/flatbuffers_ep-install") @@ -733,15 +728,12 @@ if (ARROW_IPC) set(FLATBUFFERS_INCLUDE_DIR "${FLATBUFFERS_PREFIX}/include") set(FLATBUFFERS_COMPILER "${FLATBUFFERS_PREFIX}/bin/flatc") set(FLATBUFFERS_VENDORED 1) + add_dependencies(toolchain flatbuffers_ep) else() find_package(Flatbuffers REQUIRED) set(FLATBUFFERS_VENDORED 0) endif() - if(FLATBUFFERS_VENDORED) - add_dependencies(arrow_dependencies flatbuffers_ep) - endif() - message(STATUS "Flatbuffers include dir: ${FLATBUFFERS_INCLUDE_DIR}") message(STATUS "Flatbuffers compiler: ${FLATBUFFERS_COMPILER}") include_directories(SYSTEM ${FLATBUFFERS_INCLUDE_DIR}) @@ -1155,6 +1147,7 @@ if (ARROW_GANDIVA) CMAKE_ARGS ${RE2_CMAKE_ARGS} BUILD_BYPRODUCTS "${RE2_STATIC_LIB}") set (RE2_VENDORED 1) + add_dependencies(toolchain re2_ep) else () find_package (RE2 REQUIRED) set (RE2_VENDORED 0) @@ -1171,10 +1164,6 @@ if (ARROW_GANDIVA) STATIC_LIB ${RE2_STATIC_LIB}) set(RE2_LIBRARY re2_static) endif() - - if (RE2_VENDORED) - add_dependencies (arrow_dependencies re2_ep) - endif () endif () @@ -1317,6 +1306,8 @@ if (ARROW_ORC) CMAKE_ARGS ${ORC_CMAKE_ARGS} ${EP_LOG_OPTIONS}) + add_dependencies(toolchain orc_ep) + set(ORC_VENDORED 1) add_dependencies(orc_ep ${ZLIB_LIBRARY}) if (LZ4_VENDORED) @@ -1342,7 +1333,6 @@ if (ARROW_ORC) if (ORC_VENDORED) add_dependencies(orc_static orc_ep) endif() 
- endif() # ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 9291addca0e1c..8dd2ac082db0a 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -15,13 +15,17 @@ # specific language governing permissions and limitations # under the License. +add_custom_target(arrow-all) add_custom_target(arrow) +add_custom_target(arrow-benchmarks) +add_custom_target(arrow-tests) +add_dependencies(arrow-all arrow arrow-tests arrow-benchmarks) # Adding unit tests part of the "arrow" portion of the test suite function(ADD_ARROW_TEST REL_TEST_NAME) set(options) set(one_value_args PREFIX) - set(multi_value_args) + set(multi_value_args LABELS) cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) if (ARG_PREFIX) @@ -29,9 +33,16 @@ function(ADD_ARROW_TEST REL_TEST_NAME) else() set(PREFIX "arrow") endif() + + if (ARG_LABELS) + set(LABELS ${ARG_LABELS}) + else() + set(LABELS "arrow-tests") + endif() + ADD_TEST_CASE(${REL_TEST_NAME} PREFIX ${PREFIX} - LABELS "unittest;arrow" + LABELS ${LABELS} ${ARG_UNPARSED_ARGUMENTS}) endfunction() @@ -47,7 +58,7 @@ function(ADD_ARROW_BENCHMARK REL_TEST_NAME) endif() ADD_BENCHMARK(${REL_TEST_NAME} PREFIX ${PREFIX} - LABELS "benchmark;arrow" + LABELS "arrow-benchmarks" ${ARG_UNPARSED_ARGUMENTS}) endfunction() @@ -215,6 +226,8 @@ ADD_ARROW_LIB(arrow SHARED_PRIVATE_LINK_LIBS ${ARROW_SHARED_PRIVATE_LINK_LIBS} STATIC_LINK_LIBS ${ARROW_STATIC_LINK_LIBS}) +add_dependencies(arrow ${ARROW_LIBRARIES}) + if (ARROW_BUILD_STATIC AND WIN32) target_compile_definitions(arrow_static PUBLIC ARROW_STATIC) endif() diff --git a/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt b/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt index 9fd7f924d3a69..d2640a66b2f8f 100644 --- a/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt +++ b/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt @@ -16,6 +16,7 @@ # under the License. 
add_custom_target(arrow_hiveserver2) +add_custom_target(arrow_hiveserver2-tests) # Headers: top level ARROW_INSTALL_ALL_HEADERS("arrow/dbi/hiveserver2") @@ -103,9 +104,9 @@ set(ARROW_HIVESERVER2_TEST_LINK_LIBS thriftstatic) if (ARROW_BUILD_TESTS) - ADD_ARROW_TEST(hiveserver2-test + ADD_TEST_CASE(hiveserver2-test STATIC_LINK_LIBS "${ARROW_HIVESERVER2_TEST_LINK_LIBS}" - LABELS "arrow_hiveserver2" + LABELS "arrow_hiveserver2-tests" ) if (TARGET arrow-hiveserver2-test) set_property(TARGET arrow-hiveserver2-test diff --git a/cpp/src/arrow/flight/CMakeLists.txt b/cpp/src/arrow/flight/CMakeLists.txt index aa56269a8953e..2feaee1160b07 100644 --- a/cpp/src/arrow/flight/CMakeLists.txt +++ b/cpp/src/arrow/flight/CMakeLists.txt @@ -75,7 +75,6 @@ set(ARROW_FLIGHT_SRCS ADD_ARROW_LIB(arrow_flight SOURCES ${ARROW_FLIGHT_SRCS} - DEPENDENCIES arrow_dependencies SHARED_LINK_LIBS arrow_shared ${ARROW_FLIGHT_STATIC_LINK_LIBS} STATIC_LINK_LIBS arrow_static ${ARROW_FLIGHT_STATIC_LINK_LIBS}) diff --git a/cpp/src/arrow/gpu/CMakeLists.txt b/cpp/src/arrow/gpu/CMakeLists.txt index 8b69c654bb1fe..2fcdf23e42ad7 100644 --- a/cpp/src/arrow/gpu/CMakeLists.txt +++ b/cpp/src/arrow/gpu/CMakeLists.txt @@ -19,6 +19,12 @@ # arrow_cuda ####################################### +add_custom_target(arrow_cuda-all) +add_custom_target(arrow_cuda) +add_custom_target(arrow_cuda-benchmarks) +add_custom_target(arrow_cuda-tests) +add_dependencies(arrow_cuda-all arrow_cuda arrow_cuda-tests arrow_cuda-benchmarks) + if (DEFINED ENV{CUDA_HOME}) set(CUDA_TOOLKIT_ROOT_DIR "$ENV{CUDA_HOME}") endif() @@ -49,6 +55,8 @@ ADD_ARROW_LIB(arrow_cuda STATIC_LINK_LIBS ${ARROW_CUDA_SHARED_LINK_LIBS} ) +add_dependencies(arrow_cuda ${ARROW_CUDA_LIBRARIES}) + foreach(LIB_TARGET ${ARROW_CUDA_LIBRARIES}) target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_EXPORTING) @@ -77,9 +85,10 @@ if (ARROW_BUILD_TESTS) endif() if (ARROW_BUILD_BENCHMARKS) - cuda_add_executable(cuda-benchmark cuda-benchmark.cc) - target_link_libraries(cuda-benchmark + cuda_add_executable(arrow-cuda-benchmark cuda-benchmark.cc) + target_link_libraries(arrow-cuda-benchmark arrow_cuda_shared gtest_static ${ARROW_BENCHMARK_LINK_LIBS}) + add_dependencies(arrow_cuda-benchmarks arrow-cuda-benchmark) endif() diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index c44f7b9fe1bfe..422e72e2edae2 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -17,7 +17,7 @@ # Targets required for protocol integration testing add_custom_target(integration) -add_dependencies(arrow integration) +add_dependencies(arrow-tests integration) ####################################### # Messaging and interprocess communication diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index 98c105ae623ce..cccbf09d4fb4d 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -22,7 +22,10 @@ find_package(PythonLibsNew REQUIRED) find_package(NumPy REQUIRED) +add_custom_target(arrow_python-all) add_custom_target(arrow_python) +add_custom_target(arrow_python-tests) +add_dependencies(arrow_python-all arrow_python arrow_python-tests) set(ARROW_PYTHON_SRCS arrow_to_pandas.cc @@ -130,6 +133,6 @@ if (ARROW_BUILD_TESTS) STATIC_LINK_LIBS "${ARROW_PYTHON_TEST_LINK_LIBS}" EXTRA_LINK_LIBS ${PYTHON_LIBRARIES} EXTRA_INCLUDES "${ARROW_PYTHON_INCLUDES}" - LABELS "arrow_python" + LABELS "arrow_python-tests" NO_VALGRIND) endif() diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 
da0d3bba69147..8052db5e8545d 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -15,12 +15,15 @@ # specific language governing permissions and limitations # under the License. -project(gandiva) - -find_package(LLVM) - # For "make gandiva" to build everything Gandiva-related +add_custom_target(gandiva-all) add_custom_target(gandiva) +add_custom_target(gandiva-tests) +add_custom_target(gandiva-benchmarks) + +add_dependencies(gandiva-all gandiva gandiva-tests gandiva-benchmarks) + +find_package(LLVM) # Set the path where the byte-code files will be installed. set(GANDIVA_BC_INSTALL_DIR @@ -80,7 +83,7 @@ endif() ADD_ARROW_LIB(gandiva SOURCES ${SRC_FILES} OUTPUTS GANDIVA_LIBRARIES - DEPENDENCIES arrow_dependencies precompiled + DEPENDENCIES precompiled EXTRA_INCLUDES $ SHARED_LINK_LIBS arrow_shared @@ -120,7 +123,7 @@ function(ADD_GANDIVA_TEST REL_TEST_NAME) set(TEST_ARGUMENTS ENABLED PREFIX "gandiva" - LABELS "unittest;gandiva" + LABELS "gandiva-tests" ${ARG_UNPARSED_ARGUMENTS}) # and uses less disk space, but in some cases we need to force static diff --git a/cpp/src/gandiva/jni/CMakeLists.txt b/cpp/src/gandiva/jni/CMakeLists.txt index ab04f536b4dd2..a07d3903a75ac 100644 --- a/cpp/src/gandiva/jni/CMakeLists.txt +++ b/cpp/src/gandiva/jni/CMakeLists.txt @@ -15,8 +15,6 @@ # specific language governing permissions and limitations # under the License. -project(gandiva_jni) - if(CMAKE_VERSION VERSION_LESS 3.11) message(FATAL_ERROR "Building the Gandiva JNI bindings requires CMake version >= 3.11") endif() diff --git a/cpp/src/gandiva/precompiled/CMakeLists.txt b/cpp/src/gandiva/precompiled/CMakeLists.txt index 0792fd6421d65..2af49084bf310 100644 --- a/cpp/src/gandiva/precompiled/CMakeLists.txt +++ b/cpp/src/gandiva/precompiled/CMakeLists.txt @@ -58,7 +58,7 @@ function(add_precompiled_unit_test REL_TEST_NAME) set(TEST_NAME "gandiva-precompiled-${TEST_NAME}") add_executable(${TEST_NAME} ${REL_TEST_NAME} ${ARGN}) - add_dependencies(gandiva ${TEST_NAME}) + add_dependencies(gandiva-tests ${TEST_NAME}) target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/src) target_link_libraries(${TEST_NAME} PRIVATE ${ARROW_TEST_LINK_LIBS} ${RE2_LIBRARY} diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 995c39adb7d35..4eb8f68a2ba98 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -15,6 +15,12 @@ # specific language governing permissions and limitations # under the License. 
+add_custom_target(parquet-all) +add_custom_target(parquet) +add_custom_target(parquet-benchmarks) +add_custom_target(parquet-tests) +add_dependencies(parquet-all parquet parquet-tests parquet-benchmarks) + file(READ "${CMAKE_CURRENT_SOURCE_DIR}/.parquetcppversion" PARQUET_VERSION) string(REPLACE "\n" "" PARQUET_VERSION "${PARQUET_VERSION}") string(REGEX MATCH "^([0-9]+\.[0-9]+\.[0-9]+(\.[0-9]+)?)" VERSION ${PARQUET_VERSION}) @@ -22,9 +28,6 @@ if(NOT VERSION) message(FATAL_ERROR "invalid .parquetcppversion") endif() -# For "make parquet" to build everything Parquet-related -add_custom_target(parquet) - function(ADD_PARQUET_TEST REL_TEST_NAME) set(options USE_STATIC_LINKING) set(one_value_args) @@ -34,19 +37,21 @@ function(ADD_PARQUET_TEST REL_TEST_NAME) message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") endif() + set(TEST_ARGUMENTS + PREFIX "parquet" + LABELS "parquet-tests") + # By default we prefer shared linking with libparquet, as it's faster # and uses less disk space, but in some cases we need to force static # linking (see rationale below). if (ARG_USE_STATIC_LINKING) ADD_TEST_CASE(${REL_TEST_NAME} STATIC_LINK_LIBS ${PARQUET_STATIC_TEST_LINK_LIBS} - PREFIX "parquet" - LABELS "unittest;parquet") + ${TEST_ARGUMENTS}) else() ADD_TEST_CASE(${REL_TEST_NAME} STATIC_LINK_LIBS ${PARQUET_SHARED_TEST_LINK_LIBS} - PREFIX "parquet" - LABELS "unittest;parquet") + ${TEST_ARGUMENTS}) endif() endfunction() @@ -217,6 +222,8 @@ ADD_ARROW_LIB(parquet STATIC_LINK_LIBS ${PARQUET_STATIC_LINK_LIBS} ) +add_dependencies(parquet ${PARQUET_LIBRARIES}) + # Thrift requires these definitions for some types that we use foreach(LIB_TARGET ${PARQUET_LIBRARIES}) target_compile_definitions(${LIB_TARGET} @@ -232,8 +239,6 @@ foreach(LIB_TARGET ${PARQUET_LIBRARIES}) endif() endforeach() -add_dependencies(parquet ${PARQUET_LIBRARIES}) - add_subdirectory(api) add_subdirectory(arrow) add_subdirectory(util) @@ -271,7 +276,9 @@ ADD_PARQUET_TEST(schema-test USE_STATIC_LINKING) ADD_ARROW_BENCHMARK(column-io-benchmark PREFIX "parquet" + LABELS "parquet-benchmarks" EXTRA_LINK_LIBS ${PARQUET_BENCHMARK_LINK_LIBRARIES}) ADD_ARROW_BENCHMARK(encoding-benchmark PREFIX "parquet" + LABELS "parquet-benchmarks" EXTRA_LINK_LIBS ${PARQUET_BENCHMARK_LINK_LIBRARIES}) diff --git a/cpp/src/parquet/arrow/CMakeLists.txt b/cpp/src/parquet/arrow/CMakeLists.txt index 89afc39a23376..f4e4f7e0b975a 100644 --- a/cpp/src/parquet/arrow/CMakeLists.txt +++ b/cpp/src/parquet/arrow/CMakeLists.txt @@ -20,9 +20,7 @@ ADD_PARQUET_TEST(arrow-reader-writer-test) ADD_BENCHMARK(reader-writer-benchmark PREFIX "parquet-arrow" + LABELS "parquet-benchmarks" EXTRA_LINK_LIBS ${PARQUET_BENCHMARK_LINK_LIBRARIES}) -if (TARGET parquet-arrow-reader-writer-benchmark) - add_dependencies(parquet parquet-arrow-reader-writer-benchmark) -endif() ARROW_INSTALL_ALL_HEADERS("parquet/arrow") diff --git a/cpp/src/plasma/CMakeLists.txt b/cpp/src/plasma/CMakeLists.txt index 83c201d0f45a0..d9c7dcaedeac3 100644 --- a/cpp/src/plasma/CMakeLists.txt +++ b/cpp/src/plasma/CMakeLists.txt @@ -15,7 +15,11 @@ # specific language governing permissions and limitations # under the License. 
+add_custom_target(plasma-all) add_custom_target(plasma) +add_custom_target(plasma-benchmarks) +add_custom_target(plasma-tests) +add_dependencies(plasma-all plasma plasma-tests plasma-benchmarks) # For the moment, Plasma is versioned like Arrow project(plasma VERSION "${ARROW_BASE_VERSION}") @@ -199,7 +203,7 @@ function(ADD_PLASMA_TEST REL_TEST_NAME) cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) ADD_TEST_CASE(${REL_TEST_NAME} PREFIX "plasma" - LABELS "unittest;plasma" + LABELS "plasma-tests" ${ARG_UNPARSED_ARGUMENTS}) endfunction() From e9ed591db9cb87e0086bf9fef4201cc726bd5d03 Mon Sep 17 00:00:00 2001 From: Chao Sun Date: Mon, 17 Dec 2018 15:35:38 -0600 Subject: [PATCH 067/328] ARROW-4028: [Rust] Merge parquet-rs codebase This imports parquet-rs source code into Apache Arrow Rust implementation. I include most of the source code except a few things such as `fuzz` and benchmarks. Thinking about adding them later. The module hierarchy now looks like: - arrow: all the arrow code - parquet: all the parquet code (in future, parquet-arrow integration will live here) - util: common util libraries shared between arrow and parquet (I'll try to move the utils from parquet to here in future). Author: Chao Sun Author: Chao Sun Closes #3050 from sunchao/import-parquet and squashes the following commits: 2ce98bd2a Update git submodule 2d296f8f7 ARROW-4028: Merge parquet-rs codebase --- ci/rust-build-main.bat | 3 + ci/travis_script_rust.sh | 2 + cpp/submodules/parquet-testing | 2 +- docker-compose.yml | 2 + rust/Cargo.toml | 12 + rust/benches/array_from_vec.rs | 1 - rust/benches/builder.rs | 6 +- rust/build.rs | 43 + rust/examples/read_csv.rs | 5 +- rust/rustfmt.toml | 18 + rust/src/array.rs | 8 +- rust/src/array_data.rs | 3 +- rust/src/builder.rs | 6 +- rust/src/csv/reader.rs | 10 +- rust/src/lib.rs | 6 + rust/src/mod.rs | 28 + rust/src/parquet/basic.rs | 1497 ++++++++++ rust/src/parquet/column/mod.rs | 124 + rust/src/parquet/column/page.rs | 296 ++ rust/src/parquet/column/reader.rs | 1576 ++++++++++ rust/src/parquet/column/writer.rs | 1617 +++++++++++ rust/src/parquet/compression.rs | 321 +++ rust/src/parquet/data_type.rs | 463 +++ rust/src/parquet/encodings/decoding.rs | 1403 +++++++++ rust/src/parquet/encodings/encoding.rs | 1360 +++++++++ rust/src/parquet/encodings/levels.rs | 529 ++++ rust/src/parquet/encodings/mod.rs | 21 + rust/src/parquet/encodings/rle.rs | 839 ++++++ rust/src/parquet/errors.rs | 87 + rust/src/parquet/file/metadata.rs | 736 +++++ rust/src/parquet/file/mod.rs | 88 + rust/src/parquet/file/properties.rs | 648 +++++ rust/src/parquet/file/reader.rs | 899 ++++++ rust/src/parquet/file/statistics.rs | 692 +++++ rust/src/parquet/file/writer.rs | 936 ++++++ rust/src/parquet/mod.rs | 34 + rust/src/parquet/record/api.rs | 1439 ++++++++++ rust/src/parquet/record/mod.rs | 24 + rust/src/parquet/record/reader.rs | 1464 ++++++++++ rust/src/parquet/record/triplet.rs | 561 ++++ rust/src/parquet/schema/mod.rs | 66 + rust/src/parquet/schema/parser.rs | 764 +++++ rust/src/parquet/schema/printer.rs | 467 +++ rust/src/parquet/schema/types.rs | 1830 ++++++++++++ rust/src/parquet/util/bit_packing.rs | 3658 ++++++++++++++++++++++++ rust/src/parquet/util/bit_util.rs | 1058 +++++++ rust/src/parquet/util/hash_util.rs | 160 ++ rust/src/parquet/util/io.rs | 220 ++ rust/src/parquet/util/memory.rs | 524 ++++ rust/src/parquet/util/mod.rs | 26 + rust/src/parquet/util/test_common.rs | 190 ++ rust/src/record_batch.rs | 4 +- rust/src/tensor.rs | 1 + 53 files changed, 26757 
insertions(+), 20 deletions(-) create mode 100644 rust/build.rs create mode 100644 rust/rustfmt.toml create mode 100644 rust/src/mod.rs create mode 100644 rust/src/parquet/basic.rs create mode 100644 rust/src/parquet/column/mod.rs create mode 100644 rust/src/parquet/column/page.rs create mode 100644 rust/src/parquet/column/reader.rs create mode 100644 rust/src/parquet/column/writer.rs create mode 100644 rust/src/parquet/compression.rs create mode 100644 rust/src/parquet/data_type.rs create mode 100644 rust/src/parquet/encodings/decoding.rs create mode 100644 rust/src/parquet/encodings/encoding.rs create mode 100644 rust/src/parquet/encodings/levels.rs create mode 100644 rust/src/parquet/encodings/mod.rs create mode 100644 rust/src/parquet/encodings/rle.rs create mode 100644 rust/src/parquet/errors.rs create mode 100644 rust/src/parquet/file/metadata.rs create mode 100644 rust/src/parquet/file/mod.rs create mode 100644 rust/src/parquet/file/properties.rs create mode 100644 rust/src/parquet/file/reader.rs create mode 100644 rust/src/parquet/file/statistics.rs create mode 100644 rust/src/parquet/file/writer.rs create mode 100644 rust/src/parquet/mod.rs create mode 100644 rust/src/parquet/record/api.rs create mode 100644 rust/src/parquet/record/mod.rs create mode 100644 rust/src/parquet/record/reader.rs create mode 100644 rust/src/parquet/record/triplet.rs create mode 100644 rust/src/parquet/schema/mod.rs create mode 100644 rust/src/parquet/schema/parser.rs create mode 100644 rust/src/parquet/schema/printer.rs create mode 100644 rust/src/parquet/schema/types.rs create mode 100644 rust/src/parquet/util/bit_packing.rs create mode 100644 rust/src/parquet/util/bit_util.rs create mode 100644 rust/src/parquet/util/hash_util.rs create mode 100644 rust/src/parquet/util/io.rs create mode 100644 rust/src/parquet/util/memory.rs create mode 100644 rust/src/parquet/util/mod.rs create mode 100644 rust/src/parquet/util/test_common.rs diff --git a/ci/rust-build-main.bat b/ci/rust-build-main.bat index c8a51fef6ec46..e338f7e172e6e 100644 --- a/ci/rust-build-main.bat +++ b/ci/rust-build-main.bat @@ -17,6 +17,9 @@ @rem The "main" Rust build script for Windows CI +@rem Retrieve git submodules, configure env var for Parquet unit tests +git submodule update --init || exit /B +set PARQUET_TEST_DATA=%CD%\cpp\submodules\parquet-testing\data pushd rust @echo =================================== diff --git a/ci/travis_script_rust.sh b/ci/travis_script_rust.sh index 55cce8f354e44..4b09bc22e4c20 100755 --- a/ci/travis_script_rust.sh +++ b/ci/travis_script_rust.sh @@ -19,6 +19,8 @@ set -e +source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh + RUST_DIR=${TRAVIS_BUILD_DIR}/rust pushd $RUST_DIR diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index 46ae2605c2de3..92a8e6c2efdce 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit 46ae2605c2de306f5740587107dcf333a527f2d1 +Subproject commit 92a8e6c2efdce1925c605d6313994db2c94478fb diff --git a/docker-compose.yml b/docker-compose.yml index 0a01a7cbe97bf..b61511ee56dea 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -152,6 +152,8 @@ services: build: context: . 
dockerfile: rust/Dockerfile + environment: + PARQUET_TEST_DATA: /arrow/cpp/submodules/parquet-testing/data volumes: *ubuntu-volumes r: diff --git a/rust/Cargo.toml b/rust/Cargo.toml index aa23815f74085..49e8a9d9c8470 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -42,10 +42,22 @@ serde_derive = "1.0.80" serde_json = "1.0.13" rand = "0.5" csv = "1.0.0" +parquet-format = "2.5.0" +quick-error = "1.2.2" +byteorder = "1" +thrift = "0.0.4" +snap = "0.2" +brotli = "2.5" +flate2 = "1.0.2" +lz4 = "1.23" +zstd = "0.4" +chrono = "0.4" +num-bigint = "0.2" num = "0.2" [dev-dependencies] criterion = "0.2" +lazy_static = "1" [[bench]] name = "array_from_vec" diff --git a/rust/benches/array_from_vec.rs b/rust/benches/array_from_vec.rs index 669b88eaa40d9..f9357140922a6 100644 --- a/rust/benches/array_from_vec.rs +++ b/rust/benches/array_from_vec.rs @@ -17,7 +17,6 @@ #[macro_use] extern crate criterion; - use criterion::Criterion; extern crate arrow; diff --git a/rust/benches/builder.rs b/rust/benches/builder.rs index 04f8a33b5bd55..90fd75a0da390 100644 --- a/rust/benches/builder.rs +++ b/rust/benches/builder.rs @@ -19,11 +19,13 @@ extern crate arrow; extern crate criterion; extern crate rand; -use arrow::builder::*; +use std::mem::size_of; + use criterion::*; use rand::distributions::Standard; use rand::{thread_rng, Rng}; -use std::mem::size_of; + +use arrow::builder::*; // Build arrays with 512k elements. const BATCH_SIZE: usize = 8 << 10; diff --git a/rust/build.rs b/rust/build.rs new file mode 100644 index 0000000000000..b42b2a4babfec --- /dev/null +++ b/rust/build.rs @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::process::Command; + +fn main() { + // Set Parquet version, build hash and "created by" string. + let version = env!("CARGO_PKG_VERSION"); + let mut created_by = format!("parquet-rs version {}", version); + if let Ok(git_hash) = run(Command::new("git").arg("rev-parse").arg("HEAD")) { + created_by.push_str(format!(" (build {})", git_hash).as_str()); + println!("cargo:rustc-env=PARQUET_BUILD={}", git_hash); + } + println!("cargo:rustc-env=PARQUET_VERSION={}", version); + println!("cargo:rustc-env=PARQUET_CREATED_BY={}", created_by); +} + +/// Runs command and returns either content of stdout for successful execution, +/// or an error message otherwise. 
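A minimal usage sketch for this helper (illustrative only; the fallback shown here is not part of the patch): the build script can call it to embed the current git revision, and because main() emits `cargo:rustc-env` lines, the crate itself can later read `PARQUET_VERSION` and `PARQUET_CREATED_BY` at compile time via the `env!` macro.

    // Sketch only: capture the git revision, degrading to a cargo warning on failure.
    let hash = run(Command::new("git").arg("rev-parse").arg("HEAD")).unwrap_or_else(|msg| {
        println!("cargo:warning={}", msg);
        String::new()
    });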
+fn run(command: &mut Command) -> Result<String, String> { + println!("Running: `{:?}`", command); + match command.output() { + Ok(ref output) if output.status.success() => { + Ok(String::from_utf8_lossy(&output.stdout).trim().to_string()) + } + Ok(ref output) => Err(format!("Failed: `{:?}` ({})", command, output.status)), + Err(error) => Err(format!("Failed: `{:?}` ({})", command, error)), + } +} diff --git a/rust/examples/read_csv.rs b/rust/examples/read_csv.rs index df66a8112e5f2..147d2f9c23845 100644 --- a/rust/examples/read_csv.rs +++ b/rust/examples/read_csv.rs @@ -17,11 +17,12 @@ extern crate arrow; +use std::fs::File; +use std::sync::Arc; + use arrow::array::{BinaryArray, Float64Array}; use arrow::csv; use arrow::datatypes::{DataType, Field, Schema}; -use std::fs::File; -use std::sync::Arc; fn main() { let schema = Schema::new(vec![ diff --git a/rust/rustfmt.toml b/rust/rustfmt.toml new file mode 100644 index 0000000000000..72eeee0af1c53 --- /dev/null +++ b/rust/rustfmt.toml @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+ +format_doc_comments = true \ No newline at end of file diff --git a/rust/src/array.rs b/rust/src/array.rs index 11e732a1267ea..251dd35eea150 100644 --- a/rust/src/array.rs +++ b/rust/src/array.rs @@ -657,12 +657,14 @@ impl From> for StructArray { #[cfg(test)] mod tests { use super::*; + + use std::sync::Arc; + use std::thread; + use crate::array_data::ArrayData; use crate::buffer::Buffer; - use crate::datatypes::{DataType, Field, ToByteSlice}; + use crate::datatypes::{DataType, Field}; use crate::memory; - use std::sync::Arc; - use std::thread; #[test] fn test_primitive_array_from_vec() { diff --git a/rust/src/array_data.rs b/rust/src/array_data.rs index 36a817ee579a0..9ea01a402a9cb 100644 --- a/rust/src/array_data.rs +++ b/rust/src/array_data.rs @@ -225,9 +225,10 @@ impl ArrayDataBuilder { #[cfg(test)] mod tests { + use super::*; + use std::sync::Arc; - use super::{ArrayData, DataType}; use crate::buffer::Buffer; use crate::util::bit_util; diff --git a/rust/src/builder.rs b/rust/src/builder.rs index fc781ffa50641..d5d222d006fe8 100644 --- a/rust/src/builder.rs +++ b/rust/src/builder.rs @@ -456,10 +456,10 @@ impl BinaryArrayBuilder { #[cfg(test)] mod tests { - use crate::array::Array; - use super::*; + use crate::array::Array; + #[test] fn test_builder_i32_empty() { let b = Int32BufferBuilder::new(5); @@ -825,7 +825,6 @@ mod tests { #[test] fn test_binary_array_builder() { - use crate::array::BinaryArray; let mut builder = BinaryArrayBuilder::new(20); builder.push(b'h').unwrap(); @@ -860,7 +859,6 @@ mod tests { #[test] fn test_binary_array_builder_push_string() { - use crate::array::BinaryArray; let mut builder = BinaryArrayBuilder::new(20); let var = "hello".to_owned(); diff --git a/rust/src/csv/reader.rs b/rust/src/csv/reader.rs index 956408e4a40c3..632aa7ae7936d 100644 --- a/rust/src/csv/reader.rs +++ b/rust/src/csv/reader.rs @@ -29,16 +29,16 @@ //! use std::sync::Arc; //! //! let schema = Schema::new(vec![ -//! Field::new("city", DataType::Utf8, false), -//! Field::new("lat", DataType::Float64, false), -//! Field::new("lng", DataType::Float64, false), +//! Field::new("city", DataType::Utf8, false), +//! Field::new("lat", DataType::Float64, false), +//! Field::new("lng", DataType::Float64, false), //! ]); //! //! let file = File::open("test/data/uk_cities.csv").unwrap(); //! //! let mut csv = csv::Reader::new(file, Arc::new(schema), false, 1024, None); //! let batch = csv.next().unwrap().unwrap(); -//!``` +//! ``` use std::fs::File; use std::io::BufReader; @@ -195,8 +195,8 @@ impl Reader { #[cfg(test)] mod tests { - use super::*; + use crate::array::*; use crate::datatypes::Field; diff --git a/rust/src/lib.rs b/rust/src/lib.rs index f41d08f1427a6..d5708b10504c4 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -15,7 +15,12 @@ // specific language governing permissions and limitations // under the License. +#![feature(type_ascription)] +#![feature(rustc_private)] #![feature(specialization)] +#![feature(try_from)] +#![allow(dead_code)] +#![allow(non_camel_case_types)] pub mod array; pub mod array_data; @@ -27,6 +32,7 @@ pub mod csv; pub mod datatypes; pub mod error; pub mod memory; +pub mod parquet; pub mod record_batch; pub mod tensor; pub mod util; diff --git a/rust/src/mod.rs b/rust/src/mod.rs new file mode 100644 index 0000000000000..b9fa43ab8184b --- /dev/null +++ b/rust/src/mod.rs @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +pub mod array; +pub mod array_data; +pub mod bitmap; +pub mod buffer; +pub mod builder; +pub mod csv; +pub mod datatypes; +pub mod error; +pub mod memory; +pub mod record_batch; +pub mod tensor; diff --git a/rust/src/parquet/basic.rs b/rust/src/parquet/basic.rs new file mode 100644 index 0000000000000..22e16347dc00f --- /dev/null +++ b/rust/src/parquet/basic.rs @@ -0,0 +1,1497 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains Rust mappings for Thrift definition. +//! Refer to `parquet.thrift` file to see raw definitions. + +use std::{convert, fmt, result, str}; + +use parquet_format as parquet; + +use crate::parquet::errors::ParquetError; + +// ---------------------------------------------------------------------- +// Types from the Thrift definition + +// ---------------------------------------------------------------------- +// Mirrors `parquet::Type` + +/// Types supported by Parquet. +/// These physical types are intended to be used in combination with the encodings to +/// control the on disk storage format. +/// For example INT16 is not included as a type since a good encoding of INT32 +/// would handle this. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum Type { + BOOLEAN, + INT32, + INT64, + INT96, + FLOAT, + DOUBLE, + BYTE_ARRAY, + FIXED_LEN_BYTE_ARRAY, +} + +// ---------------------------------------------------------------------- +// Mirrors `parquet::ConvertedType` + +/// Common types (logical types) used by frameworks when using Parquet. +/// This helps map between types in those frameworks to the base types in Parquet. +/// This is only metadata and not needed to read or write the data. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum LogicalType { + NONE, + /// A BYTE_ARRAY actually contains UTF8 encoded chars. + UTF8, + + /// A map is converted as an optional field containing a repeated key/value pair. + MAP, + + /// A key/value pair is converted into a group of two fields. + MAP_KEY_VALUE, + + /// A list is converted into an optional field containing a repeated field for its + /// values. 
+ LIST, + + /// An enum is converted into a binary field + ENUM, + + /// A decimal value. + /// This may be used to annotate binary or fixed primitive types. The + /// underlying byte array stores the unscaled value encoded as two's + /// complement using big-endian byte order (the most significant byte is the + /// zeroth element). + /// + /// This must be accompanied by a (maximum) precision and a scale in the + /// SchemaElement. The precision specifies the number of digits in the decimal + /// and the scale stores the location of the decimal point. For example 1.23 + /// would have precision 3 (3 total digits) and scale 2 (the decimal point is + /// 2 digits over). + DECIMAL, + + /// A date stored as days since Unix epoch, encoded as the INT32 physical type. + DATE, + + /// The total number of milliseconds since midnight. The value is stored as an INT32 + /// physical type. + TIME_MILLIS, + + /// The total number of microseconds since midnight. The value is stored as an INT64 + /// physical type. + TIME_MICROS, + + /// Date and time recorded as milliseconds since the Unix epoch. + /// Recorded as a physical type of INT64. + TIMESTAMP_MILLIS, + + /// Date and time recorded as microseconds since the Unix epoch. + /// The value is stored as an INT64 physical type. + TIMESTAMP_MICROS, + + /// An unsigned 8 bit integer value stored as INT32 physical type. + UINT_8, + + /// An unsigned 16 bit integer value stored as INT32 physical type. + UINT_16, + + /// An unsigned 32 bit integer value stored as INT32 physical type. + UINT_32, + + /// An unsigned 64 bit integer value stored as INT64 physical type. + UINT_64, + + /// A signed 8 bit integer value stored as INT32 physical type. + INT_8, + + /// A signed 16 bit integer value stored as INT32 physical type. + INT_16, + + /// A signed 32 bit integer value stored as INT32 physical type. + INT_32, + + /// A signed 64 bit integer value stored as INT64 physical type. + INT_64, + + /// A JSON document embedded within a single UTF8 column. + JSON, + + /// A BSON document embedded within a single BINARY column. + BSON, + + /// An interval of time. + /// + /// This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12. + /// This data is composed of three separate little endian unsigned integers. + /// Each stores a component of a duration of time. The first integer identifies + /// the number of months associated with the duration, the second identifies + /// the number of days associated with the duration and the third identifies + /// the number of milliseconds associated with the provided duration. + /// This duration of time is independent of any particular timezone or date. + INTERVAL, +} + +// ---------------------------------------------------------------------- +// Mirrors `parquet::FieldRepetitionType` + +/// Representation of field types in schema. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum Repetition { + /// Field is required (can not be null) and each record has exactly 1 value. + REQUIRED, + /// Field is optional (can be null) and each record has 0 or 1 values. + OPTIONAL, + /// Field is repeated and can contain 0 or more values. + REPEATED, +} + +// ---------------------------------------------------------------------- +// Mirrors `parquet::Encoding` + +/// Encodings supported by Parquet. +/// Not all encodings are valid for all types. These enums are also used to specify the +/// encoding of definition and repetition levels. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum Encoding { + /// Default byte encoding. + /// - BOOLEAN - 1 bit per value, 0 is false; 1 is true. + /// - INT32 - 4 bytes per value, stored as little-endian. + /// - INT64 - 8 bytes per value, stored as little-endian. + /// - FLOAT - 4 bytes per value, stored as little-endian. + /// - DOUBLE - 8 bytes per value, stored as little-endian. + /// - BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes. + /// - FIXED_LEN_BYTE_ARRAY - just the bytes are stored. + PLAIN, + + /// **Deprecated** dictionary encoding. + /// + /// The values in the dictionary are encoded using PLAIN encoding. + /// Since it is deprecated, RLE_DICTIONARY encoding is used for a data page, and PLAIN + /// encoding is used for dictionary page. + PLAIN_DICTIONARY, + + /// Group packed run length encoding. + /// + /// Usable for definition/repetition levels encoding and boolean values. + RLE, + + /// Bit packed encoding. + /// + /// This can only be used if the data has a known max width. + /// Usable for definition/repetition levels encoding. + BIT_PACKED, + + /// Delta encoding for integers, either INT32 or INT64. + /// + /// Works best on sorted data. + DELTA_BINARY_PACKED, + + /// Encoding for byte arrays to separate the length values and the data. + /// + /// The lengths are encoded using DELTA_BINARY_PACKED encoding. + DELTA_LENGTH_BYTE_ARRAY, + + /// Incremental encoding for byte arrays. + /// + /// Prefix lengths are encoded using DELTA_BINARY_PACKED encoding. + /// Suffixes are stored using DELTA_LENGTH_BYTE_ARRAY encoding. + DELTA_BYTE_ARRAY, + + /// Dictionary encoding. + /// + /// The ids are encoded using the RLE encoding. + RLE_DICTIONARY, +} + +// ---------------------------------------------------------------------- +// Mirrors `parquet::CompressionCodec` + +/// Supported compression algorithms. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum Compression { + UNCOMPRESSED, + SNAPPY, + GZIP, + LZO, + BROTLI, + LZ4, + ZSTD, +} + +// ---------------------------------------------------------------------- +// Mirrors `parquet::PageType` + +/// Available data pages for Parquet file format. +/// Note that some of the page types may not be supported. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum PageType { + DATA_PAGE, + INDEX_PAGE, + DICTIONARY_PAGE, + DATA_PAGE_V2, +} + +// ---------------------------------------------------------------------- +// Mirrors `parquet::ColumnOrder` + +/// Sort order for page and column statistics. +/// +/// Types are associated with sort orders and column stats are aggregated using a sort +/// order, and a sort order should be considered when comparing values with statistics +/// min/max. +/// +/// See reference in +/// https://github.com/apache/parquet-cpp/blob/master/src/parquet/types.h +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum SortOrder { + /// Signed (either value or legacy byte-wise) comparison. + SIGNED, + /// Unsigned (depending on physical type either value or byte-wise) comparison. + UNSIGNED, + /// Comparison is undefined. + UNDEFINED, +} + +/// Column order that specifies what method was used to aggregate min/max values for +/// statistics. +/// +/// If column order is undefined, then it is the legacy behaviour and all values should +/// be compared as signed values/bytes. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum ColumnOrder { + /// Column uses the order defined by its logical or physical type + /// (if there is no logical type), parquet-format 2.4.0+. 
+ TYPE_DEFINED_ORDER(SortOrder), + /// Undefined column order, means legacy behaviour before parquet-format 2.4.0. + /// Sort order is always SIGNED. + UNDEFINED, +} + +impl ColumnOrder { + /// Returns sort order for a physical/logical type. + pub fn get_sort_order(logical_type: LogicalType, physical_type: Type) -> SortOrder { + match logical_type { + // Unsigned byte-wise comparison. + LogicalType::UTF8 | LogicalType::JSON | LogicalType::BSON | LogicalType::ENUM => { + SortOrder::UNSIGNED + } + + LogicalType::INT_8 + | LogicalType::INT_16 + | LogicalType::INT_32 + | LogicalType::INT_64 => SortOrder::SIGNED, + + LogicalType::UINT_8 + | LogicalType::UINT_16 + | LogicalType::UINT_32 + | LogicalType::UINT_64 => SortOrder::UNSIGNED, + + // Signed comparison of the represented value. + LogicalType::DECIMAL => SortOrder::SIGNED, + + LogicalType::DATE => SortOrder::SIGNED, + + LogicalType::TIME_MILLIS + | LogicalType::TIME_MICROS + | LogicalType::TIMESTAMP_MILLIS + | LogicalType::TIMESTAMP_MICROS => SortOrder::SIGNED, + + LogicalType::INTERVAL => SortOrder::UNSIGNED, + + LogicalType::LIST | LogicalType::MAP | LogicalType::MAP_KEY_VALUE => { + SortOrder::UNDEFINED + } + + // Fall back to physical type. + LogicalType::NONE => Self::get_default_sort_order(physical_type), + } + } + + /// Returns default sort order based on physical type. + fn get_default_sort_order(physical_type: Type) -> SortOrder { + match physical_type { + // Order: false, true + Type::BOOLEAN => SortOrder::UNSIGNED, + Type::INT32 | Type::INT64 => SortOrder::SIGNED, + Type::INT96 => SortOrder::UNDEFINED, + // Notes to remember when comparing float/double values: + // If the min is a NaN, it should be ignored. + // If the max is a NaN, it should be ignored. + // If the min is +0, the row group may contain -0 values as well. + // If the max is -0, the row group may contain +0 values as well. + // When looking for NaN values, min and max should be ignored. + Type::FLOAT | Type::DOUBLE => SortOrder::SIGNED, + // unsigned byte-wise comparison + Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => SortOrder::UNSIGNED, + } + } + + /// Returns sort order associated with this column order. 
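A quick sketch of what these accessors yield, with values read straight off the match arms in this impl (the asserts are illustrative, not taken from the patch):

    // UTF8 data compares byte-wise as unsigned, regardless of physical type.
    assert_eq!(
        ColumnOrder::get_sort_order(LogicalType::UTF8, Type::BYTE_ARRAY),
        SortOrder::UNSIGNED
    );
    // Legacy (undefined) column order always falls back to signed comparison.
    assert_eq!(ColumnOrder::UNDEFINED.sort_order(), SortOrder::SIGNED);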
+ pub fn sort_order(&self) -> SortOrder { + match *self { + ColumnOrder::TYPE_DEFINED_ORDER(order) => order, + ColumnOrder::UNDEFINED => SortOrder::SIGNED, + } + } +} + +impl fmt::Display for Type { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} + +impl fmt::Display for LogicalType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} + +impl fmt::Display for Repetition { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} + +impl fmt::Display for Encoding { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} + +impl fmt::Display for Compression { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} + +impl fmt::Display for PageType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} + +impl fmt::Display for SortOrder { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} + +impl fmt::Display for ColumnOrder { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} + +// ---------------------------------------------------------------------- +// parquet::Type <=> Type conversion + +impl convert::From for Type { + fn from(value: parquet::Type) -> Self { + match value { + parquet::Type::BOOLEAN => Type::BOOLEAN, + parquet::Type::INT32 => Type::INT32, + parquet::Type::INT64 => Type::INT64, + parquet::Type::INT96 => Type::INT96, + parquet::Type::FLOAT => Type::FLOAT, + parquet::Type::DOUBLE => Type::DOUBLE, + parquet::Type::BYTE_ARRAY => Type::BYTE_ARRAY, + parquet::Type::FIXED_LEN_BYTE_ARRAY => Type::FIXED_LEN_BYTE_ARRAY, + } + } +} + +impl convert::From for parquet::Type { + fn from(value: Type) -> Self { + match value { + Type::BOOLEAN => parquet::Type::BOOLEAN, + Type::INT32 => parquet::Type::INT32, + Type::INT64 => parquet::Type::INT64, + Type::INT96 => parquet::Type::INT96, + Type::FLOAT => parquet::Type::FLOAT, + Type::DOUBLE => parquet::Type::DOUBLE, + Type::BYTE_ARRAY => parquet::Type::BYTE_ARRAY, + Type::FIXED_LEN_BYTE_ARRAY => parquet::Type::FIXED_LEN_BYTE_ARRAY, + } + } +} + +// ---------------------------------------------------------------------- +// parquet::ConvertedType <=> LogicalType conversion + +impl convert::From> for LogicalType { + fn from(option: Option) -> Self { + match option { + None => LogicalType::NONE, + Some(value) => match value { + parquet::ConvertedType::UTF8 => LogicalType::UTF8, + parquet::ConvertedType::MAP => LogicalType::MAP, + parquet::ConvertedType::MAP_KEY_VALUE => LogicalType::MAP_KEY_VALUE, + parquet::ConvertedType::LIST => LogicalType::LIST, + parquet::ConvertedType::ENUM => LogicalType::ENUM, + parquet::ConvertedType::DECIMAL => LogicalType::DECIMAL, + parquet::ConvertedType::DATE => LogicalType::DATE, + parquet::ConvertedType::TIME_MILLIS => LogicalType::TIME_MILLIS, + parquet::ConvertedType::TIME_MICROS => LogicalType::TIME_MICROS, + parquet::ConvertedType::TIMESTAMP_MILLIS => LogicalType::TIMESTAMP_MILLIS, + parquet::ConvertedType::TIMESTAMP_MICROS => LogicalType::TIMESTAMP_MICROS, + parquet::ConvertedType::UINT_8 => LogicalType::UINT_8, + parquet::ConvertedType::UINT_16 => LogicalType::UINT_16, + parquet::ConvertedType::UINT_32 => LogicalType::UINT_32, + parquet::ConvertedType::UINT_64 => LogicalType::UINT_64, + parquet::ConvertedType::INT_8 => LogicalType::INT_8, + parquet::ConvertedType::INT_16 => LogicalType::INT_16, + 
parquet::ConvertedType::INT_32 => LogicalType::INT_32, + parquet::ConvertedType::INT_64 => LogicalType::INT_64, + parquet::ConvertedType::JSON => LogicalType::JSON, + parquet::ConvertedType::BSON => LogicalType::BSON, + parquet::ConvertedType::INTERVAL => LogicalType::INTERVAL, + }, + } + } +} + +impl convert::From for Option { + fn from(value: LogicalType) -> Self { + match value { + LogicalType::NONE => None, + LogicalType::UTF8 => Some(parquet::ConvertedType::UTF8), + LogicalType::MAP => Some(parquet::ConvertedType::MAP), + LogicalType::MAP_KEY_VALUE => Some(parquet::ConvertedType::MAP_KEY_VALUE), + LogicalType::LIST => Some(parquet::ConvertedType::LIST), + LogicalType::ENUM => Some(parquet::ConvertedType::ENUM), + LogicalType::DECIMAL => Some(parquet::ConvertedType::DECIMAL), + LogicalType::DATE => Some(parquet::ConvertedType::DATE), + LogicalType::TIME_MILLIS => Some(parquet::ConvertedType::TIME_MILLIS), + LogicalType::TIME_MICROS => Some(parquet::ConvertedType::TIME_MICROS), + LogicalType::TIMESTAMP_MILLIS => Some(parquet::ConvertedType::TIMESTAMP_MILLIS), + LogicalType::TIMESTAMP_MICROS => Some(parquet::ConvertedType::TIMESTAMP_MICROS), + LogicalType::UINT_8 => Some(parquet::ConvertedType::UINT_8), + LogicalType::UINT_16 => Some(parquet::ConvertedType::UINT_16), + LogicalType::UINT_32 => Some(parquet::ConvertedType::UINT_32), + LogicalType::UINT_64 => Some(parquet::ConvertedType::UINT_64), + LogicalType::INT_8 => Some(parquet::ConvertedType::INT_8), + LogicalType::INT_16 => Some(parquet::ConvertedType::INT_16), + LogicalType::INT_32 => Some(parquet::ConvertedType::INT_32), + LogicalType::INT_64 => Some(parquet::ConvertedType::INT_64), + LogicalType::JSON => Some(parquet::ConvertedType::JSON), + LogicalType::BSON => Some(parquet::ConvertedType::BSON), + LogicalType::INTERVAL => Some(parquet::ConvertedType::INTERVAL), + } + } +} + +// ---------------------------------------------------------------------- +// parquet::FieldRepetitionType <=> Repetition conversion + +impl convert::From for Repetition { + fn from(value: parquet::FieldRepetitionType) -> Self { + match value { + parquet::FieldRepetitionType::REQUIRED => Repetition::REQUIRED, + parquet::FieldRepetitionType::OPTIONAL => Repetition::OPTIONAL, + parquet::FieldRepetitionType::REPEATED => Repetition::REPEATED, + } + } +} + +impl convert::From for parquet::FieldRepetitionType { + fn from(value: Repetition) -> Self { + match value { + Repetition::REQUIRED => parquet::FieldRepetitionType::REQUIRED, + Repetition::OPTIONAL => parquet::FieldRepetitionType::OPTIONAL, + Repetition::REPEATED => parquet::FieldRepetitionType::REPEATED, + } + } +} + +// ---------------------------------------------------------------------- +// parquet::Encoding <=> Encoding conversion + +impl convert::From for Encoding { + fn from(value: parquet::Encoding) -> Self { + match value { + parquet::Encoding::PLAIN => Encoding::PLAIN, + parquet::Encoding::PLAIN_DICTIONARY => Encoding::PLAIN_DICTIONARY, + parquet::Encoding::RLE => Encoding::RLE, + parquet::Encoding::BIT_PACKED => Encoding::BIT_PACKED, + parquet::Encoding::DELTA_BINARY_PACKED => Encoding::DELTA_BINARY_PACKED, + parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY => Encoding::DELTA_LENGTH_BYTE_ARRAY, + parquet::Encoding::DELTA_BYTE_ARRAY => Encoding::DELTA_BYTE_ARRAY, + parquet::Encoding::RLE_DICTIONARY => Encoding::RLE_DICTIONARY, + } + } +} + +impl convert::From for parquet::Encoding { + fn from(value: Encoding) -> Self { + match value { + Encoding::PLAIN => parquet::Encoding::PLAIN, + 
Encoding::PLAIN_DICTIONARY => parquet::Encoding::PLAIN_DICTIONARY, + Encoding::RLE => parquet::Encoding::RLE, + Encoding::BIT_PACKED => parquet::Encoding::BIT_PACKED, + Encoding::DELTA_BINARY_PACKED => parquet::Encoding::DELTA_BINARY_PACKED, + Encoding::DELTA_LENGTH_BYTE_ARRAY => parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY, + Encoding::DELTA_BYTE_ARRAY => parquet::Encoding::DELTA_BYTE_ARRAY, + Encoding::RLE_DICTIONARY => parquet::Encoding::RLE_DICTIONARY, + } + } +} + +// ---------------------------------------------------------------------- +// parquet::CompressionCodec <=> Compression conversion + +impl convert::From for Compression { + fn from(value: parquet::CompressionCodec) -> Self { + match value { + parquet::CompressionCodec::UNCOMPRESSED => Compression::UNCOMPRESSED, + parquet::CompressionCodec::SNAPPY => Compression::SNAPPY, + parquet::CompressionCodec::GZIP => Compression::GZIP, + parquet::CompressionCodec::LZO => Compression::LZO, + parquet::CompressionCodec::BROTLI => Compression::BROTLI, + parquet::CompressionCodec::LZ4 => Compression::LZ4, + parquet::CompressionCodec::ZSTD => Compression::ZSTD, + } + } +} + +impl convert::From for parquet::CompressionCodec { + fn from(value: Compression) -> Self { + match value { + Compression::UNCOMPRESSED => parquet::CompressionCodec::UNCOMPRESSED, + Compression::SNAPPY => parquet::CompressionCodec::SNAPPY, + Compression::GZIP => parquet::CompressionCodec::GZIP, + Compression::LZO => parquet::CompressionCodec::LZO, + Compression::BROTLI => parquet::CompressionCodec::BROTLI, + Compression::LZ4 => parquet::CompressionCodec::LZ4, + Compression::ZSTD => parquet::CompressionCodec::ZSTD, + } + } +} + +// ---------------------------------------------------------------------- +// parquet::PageType <=> PageType conversion + +impl convert::From for PageType { + fn from(value: parquet::PageType) -> Self { + match value { + parquet::PageType::DATA_PAGE => PageType::DATA_PAGE, + parquet::PageType::INDEX_PAGE => PageType::INDEX_PAGE, + parquet::PageType::DICTIONARY_PAGE => PageType::DICTIONARY_PAGE, + parquet::PageType::DATA_PAGE_V2 => PageType::DATA_PAGE_V2, + } + } +} + +impl convert::From for parquet::PageType { + fn from(value: PageType) -> Self { + match value { + PageType::DATA_PAGE => parquet::PageType::DATA_PAGE, + PageType::INDEX_PAGE => parquet::PageType::INDEX_PAGE, + PageType::DICTIONARY_PAGE => parquet::PageType::DICTIONARY_PAGE, + PageType::DATA_PAGE_V2 => parquet::PageType::DATA_PAGE_V2, + } + } +} + +// ---------------------------------------------------------------------- +// String conversions for schema parsing. 
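The FromStr implementations below are what the schema parser leans on; a minimal round-trip sketch (mirroring the unit tests further down, not part of the patch itself):

    assert_eq!("REQUIRED".parse::<Repetition>().unwrap(), Repetition::REQUIRED);
    // "BINARY" is accepted as an alias for BYTE_ARRAY.
    assert_eq!("BINARY".parse::<Type>().unwrap(), Type::BYTE_ARRAY);
    assert_eq!("UTF8".parse::<LogicalType>().unwrap(), LogicalType::UTF8);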
+ +impl str::FromStr for Repetition { + type Err = ParquetError; + + fn from_str(s: &str) -> result::Result { + match s { + "REQUIRED" => Ok(Repetition::REQUIRED), + "OPTIONAL" => Ok(Repetition::OPTIONAL), + "REPEATED" => Ok(Repetition::REPEATED), + other => Err(general_err!("Invalid repetition {}", other)), + } + } +} + +impl str::FromStr for Type { + type Err = ParquetError; + + fn from_str(s: &str) -> result::Result { + match s { + "BOOLEAN" => Ok(Type::BOOLEAN), + "INT32" => Ok(Type::INT32), + "INT64" => Ok(Type::INT64), + "INT96" => Ok(Type::INT96), + "FLOAT" => Ok(Type::FLOAT), + "DOUBLE" => Ok(Type::DOUBLE), + "BYTE_ARRAY" | "BINARY" => Ok(Type::BYTE_ARRAY), + "FIXED_LEN_BYTE_ARRAY" => Ok(Type::FIXED_LEN_BYTE_ARRAY), + other => Err(general_err!("Invalid type {}", other)), + } + } +} + +impl str::FromStr for LogicalType { + type Err = ParquetError; + + fn from_str(s: &str) -> result::Result { + match s { + "NONE" => Ok(LogicalType::NONE), + "UTF8" => Ok(LogicalType::UTF8), + "MAP" => Ok(LogicalType::MAP), + "MAP_KEY_VALUE" => Ok(LogicalType::MAP_KEY_VALUE), + "LIST" => Ok(LogicalType::LIST), + "ENUM" => Ok(LogicalType::ENUM), + "DECIMAL" => Ok(LogicalType::DECIMAL), + "DATE" => Ok(LogicalType::DATE), + "TIME_MILLIS" => Ok(LogicalType::TIME_MILLIS), + "TIME_MICROS" => Ok(LogicalType::TIME_MICROS), + "TIMESTAMP_MILLIS" => Ok(LogicalType::TIMESTAMP_MILLIS), + "TIMESTAMP_MICROS" => Ok(LogicalType::TIMESTAMP_MICROS), + "UINT_8" => Ok(LogicalType::UINT_8), + "UINT_16" => Ok(LogicalType::UINT_16), + "UINT_32" => Ok(LogicalType::UINT_32), + "UINT_64" => Ok(LogicalType::UINT_64), + "INT_8" => Ok(LogicalType::INT_8), + "INT_16" => Ok(LogicalType::INT_16), + "INT_32" => Ok(LogicalType::INT_32), + "INT_64" => Ok(LogicalType::INT_64), + "JSON" => Ok(LogicalType::JSON), + "BSON" => Ok(LogicalType::BSON), + "INTERVAL" => Ok(LogicalType::INTERVAL), + other => Err(general_err!("Invalid logical type {}", other)), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_display_type() { + assert_eq!(Type::BOOLEAN.to_string(), "BOOLEAN"); + assert_eq!(Type::INT32.to_string(), "INT32"); + assert_eq!(Type::INT64.to_string(), "INT64"); + assert_eq!(Type::INT96.to_string(), "INT96"); + assert_eq!(Type::FLOAT.to_string(), "FLOAT"); + assert_eq!(Type::DOUBLE.to_string(), "DOUBLE"); + assert_eq!(Type::BYTE_ARRAY.to_string(), "BYTE_ARRAY"); + assert_eq!( + Type::FIXED_LEN_BYTE_ARRAY.to_string(), + "FIXED_LEN_BYTE_ARRAY" + ); + } + + #[test] + fn test_from_type() { + assert_eq!(Type::from(parquet::Type::BOOLEAN), Type::BOOLEAN); + assert_eq!(Type::from(parquet::Type::INT32), Type::INT32); + assert_eq!(Type::from(parquet::Type::INT64), Type::INT64); + assert_eq!(Type::from(parquet::Type::INT96), Type::INT96); + assert_eq!(Type::from(parquet::Type::FLOAT), Type::FLOAT); + assert_eq!(Type::from(parquet::Type::DOUBLE), Type::DOUBLE); + assert_eq!(Type::from(parquet::Type::BYTE_ARRAY), Type::BYTE_ARRAY); + assert_eq!( + Type::from(parquet::Type::FIXED_LEN_BYTE_ARRAY), + Type::FIXED_LEN_BYTE_ARRAY + ); + } + + #[test] + fn test_into_type() { + assert_eq!(parquet::Type::BOOLEAN, Type::BOOLEAN.into()); + assert_eq!(parquet::Type::INT32, Type::INT32.into()); + assert_eq!(parquet::Type::INT64, Type::INT64.into()); + assert_eq!(parquet::Type::INT96, Type::INT96.into()); + assert_eq!(parquet::Type::FLOAT, Type::FLOAT.into()); + assert_eq!(parquet::Type::DOUBLE, Type::DOUBLE.into()); + assert_eq!(parquet::Type::BYTE_ARRAY, Type::BYTE_ARRAY.into()); + assert_eq!( + parquet::Type::FIXED_LEN_BYTE_ARRAY, + 
Type::FIXED_LEN_BYTE_ARRAY.into() + ); + } + + #[test] + fn test_from_string_into_type() { + assert_eq!( + Type::BOOLEAN.to_string().parse::().unwrap(), + Type::BOOLEAN + ); + assert_eq!( + Type::INT32.to_string().parse::().unwrap(), + Type::INT32 + ); + assert_eq!( + Type::INT64.to_string().parse::().unwrap(), + Type::INT64 + ); + assert_eq!( + Type::INT96.to_string().parse::().unwrap(), + Type::INT96 + ); + assert_eq!( + Type::FLOAT.to_string().parse::().unwrap(), + Type::FLOAT + ); + assert_eq!( + Type::DOUBLE.to_string().parse::().unwrap(), + Type::DOUBLE + ); + assert_eq!( + Type::BYTE_ARRAY.to_string().parse::().unwrap(), + Type::BYTE_ARRAY + ); + assert_eq!("BINARY".parse::().unwrap(), Type::BYTE_ARRAY); + assert_eq!( + Type::FIXED_LEN_BYTE_ARRAY + .to_string() + .parse::() + .unwrap(), + Type::FIXED_LEN_BYTE_ARRAY + ); + } + + #[test] + fn test_display_logical_type() { + assert_eq!(LogicalType::NONE.to_string(), "NONE"); + assert_eq!(LogicalType::UTF8.to_string(), "UTF8"); + assert_eq!(LogicalType::MAP.to_string(), "MAP"); + assert_eq!(LogicalType::MAP_KEY_VALUE.to_string(), "MAP_KEY_VALUE"); + assert_eq!(LogicalType::LIST.to_string(), "LIST"); + assert_eq!(LogicalType::ENUM.to_string(), "ENUM"); + assert_eq!(LogicalType::DECIMAL.to_string(), "DECIMAL"); + assert_eq!(LogicalType::DATE.to_string(), "DATE"); + assert_eq!(LogicalType::TIME_MILLIS.to_string(), "TIME_MILLIS"); + assert_eq!(LogicalType::DATE.to_string(), "DATE"); + assert_eq!(LogicalType::TIME_MICROS.to_string(), "TIME_MICROS"); + assert_eq!( + LogicalType::TIMESTAMP_MILLIS.to_string(), + "TIMESTAMP_MILLIS" + ); + assert_eq!( + LogicalType::TIMESTAMP_MICROS.to_string(), + "TIMESTAMP_MICROS" + ); + assert_eq!(LogicalType::UINT_8.to_string(), "UINT_8"); + assert_eq!(LogicalType::UINT_16.to_string(), "UINT_16"); + assert_eq!(LogicalType::UINT_32.to_string(), "UINT_32"); + assert_eq!(LogicalType::UINT_64.to_string(), "UINT_64"); + assert_eq!(LogicalType::INT_8.to_string(), "INT_8"); + assert_eq!(LogicalType::INT_16.to_string(), "INT_16"); + assert_eq!(LogicalType::INT_32.to_string(), "INT_32"); + assert_eq!(LogicalType::INT_64.to_string(), "INT_64"); + assert_eq!(LogicalType::JSON.to_string(), "JSON"); + assert_eq!(LogicalType::BSON.to_string(), "BSON"); + assert_eq!(LogicalType::INTERVAL.to_string(), "INTERVAL"); + } + + #[test] + fn test_from_logical_type() { + assert_eq!(LogicalType::from(None), LogicalType::NONE); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::UTF8)), + LogicalType::UTF8 + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::MAP)), + LogicalType::MAP + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::MAP_KEY_VALUE)), + LogicalType::MAP_KEY_VALUE + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::LIST)), + LogicalType::LIST + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::ENUM)), + LogicalType::ENUM + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::DECIMAL)), + LogicalType::DECIMAL + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::DATE)), + LogicalType::DATE + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::TIME_MILLIS)), + LogicalType::TIME_MILLIS + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::TIME_MICROS)), + LogicalType::TIME_MICROS + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::TIMESTAMP_MILLIS)), + LogicalType::TIMESTAMP_MILLIS + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::TIMESTAMP_MICROS)), + 
LogicalType::TIMESTAMP_MICROS + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::UINT_8)), + LogicalType::UINT_8 + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::UINT_16)), + LogicalType::UINT_16 + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::UINT_32)), + LogicalType::UINT_32 + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::UINT_64)), + LogicalType::UINT_64 + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::INT_8)), + LogicalType::INT_8 + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::INT_16)), + LogicalType::INT_16 + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::INT_32)), + LogicalType::INT_32 + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::INT_64)), + LogicalType::INT_64 + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::JSON)), + LogicalType::JSON + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::BSON)), + LogicalType::BSON + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::INTERVAL)), + LogicalType::INTERVAL + ); + } + + #[test] + fn test_into_logical_type() { + let converted_type: Option = None; + assert_eq!(converted_type, LogicalType::NONE.into()); + assert_eq!(Some(parquet::ConvertedType::UTF8), LogicalType::UTF8.into()); + assert_eq!(Some(parquet::ConvertedType::MAP), LogicalType::MAP.into()); + assert_eq!( + Some(parquet::ConvertedType::MAP_KEY_VALUE), + LogicalType::MAP_KEY_VALUE.into() + ); + assert_eq!(Some(parquet::ConvertedType::LIST), LogicalType::LIST.into()); + assert_eq!(Some(parquet::ConvertedType::ENUM), LogicalType::ENUM.into()); + assert_eq!( + Some(parquet::ConvertedType::DECIMAL), + LogicalType::DECIMAL.into() + ); + assert_eq!(Some(parquet::ConvertedType::DATE), LogicalType::DATE.into()); + assert_eq!( + Some(parquet::ConvertedType::TIME_MILLIS), + LogicalType::TIME_MILLIS.into() + ); + assert_eq!( + Some(parquet::ConvertedType::TIME_MICROS), + LogicalType::TIME_MICROS.into() + ); + assert_eq!( + Some(parquet::ConvertedType::TIMESTAMP_MILLIS), + LogicalType::TIMESTAMP_MILLIS.into() + ); + assert_eq!( + Some(parquet::ConvertedType::TIMESTAMP_MICROS), + LogicalType::TIMESTAMP_MICROS.into() + ); + assert_eq!( + Some(parquet::ConvertedType::UINT_8), + LogicalType::UINT_8.into() + ); + assert_eq!( + Some(parquet::ConvertedType::UINT_16), + LogicalType::UINT_16.into() + ); + assert_eq!( + Some(parquet::ConvertedType::UINT_32), + LogicalType::UINT_32.into() + ); + assert_eq!( + Some(parquet::ConvertedType::UINT_64), + LogicalType::UINT_64.into() + ); + assert_eq!( + Some(parquet::ConvertedType::INT_8), + LogicalType::INT_8.into() + ); + assert_eq!( + Some(parquet::ConvertedType::INT_16), + LogicalType::INT_16.into() + ); + assert_eq!( + Some(parquet::ConvertedType::INT_32), + LogicalType::INT_32.into() + ); + assert_eq!( + Some(parquet::ConvertedType::INT_64), + LogicalType::INT_64.into() + ); + assert_eq!(Some(parquet::ConvertedType::JSON), LogicalType::JSON.into()); + assert_eq!(Some(parquet::ConvertedType::BSON), LogicalType::BSON.into()); + assert_eq!( + Some(parquet::ConvertedType::INTERVAL), + LogicalType::INTERVAL.into() + ); + } + + #[test] + fn test_from_string_into_logical_type() { + assert_eq!( + LogicalType::NONE + .to_string() + .parse::() + .unwrap(), + LogicalType::NONE + ); + assert_eq!( + LogicalType::UTF8 + .to_string() + .parse::() + .unwrap(), + LogicalType::UTF8 + ); + assert_eq!( + LogicalType::MAP.to_string().parse::().unwrap(), + LogicalType::MAP 
+ ); + assert_eq!( + LogicalType::MAP_KEY_VALUE + .to_string() + .parse::() + .unwrap(), + LogicalType::MAP_KEY_VALUE + ); + assert_eq!( + LogicalType::LIST + .to_string() + .parse::() + .unwrap(), + LogicalType::LIST + ); + assert_eq!( + LogicalType::ENUM + .to_string() + .parse::() + .unwrap(), + LogicalType::ENUM + ); + assert_eq!( + LogicalType::DECIMAL + .to_string() + .parse::() + .unwrap(), + LogicalType::DECIMAL + ); + assert_eq!( + LogicalType::DATE + .to_string() + .parse::() + .unwrap(), + LogicalType::DATE + ); + assert_eq!( + LogicalType::TIME_MILLIS + .to_string() + .parse::() + .unwrap(), + LogicalType::TIME_MILLIS + ); + assert_eq!( + LogicalType::TIME_MICROS + .to_string() + .parse::() + .unwrap(), + LogicalType::TIME_MICROS + ); + assert_eq!( + LogicalType::TIMESTAMP_MILLIS + .to_string() + .parse::() + .unwrap(), + LogicalType::TIMESTAMP_MILLIS + ); + assert_eq!( + LogicalType::TIMESTAMP_MICROS + .to_string() + .parse::() + .unwrap(), + LogicalType::TIMESTAMP_MICROS + ); + assert_eq!( + LogicalType::UINT_8 + .to_string() + .parse::() + .unwrap(), + LogicalType::UINT_8 + ); + assert_eq!( + LogicalType::UINT_16 + .to_string() + .parse::() + .unwrap(), + LogicalType::UINT_16 + ); + assert_eq!( + LogicalType::UINT_32 + .to_string() + .parse::() + .unwrap(), + LogicalType::UINT_32 + ); + assert_eq!( + LogicalType::UINT_64 + .to_string() + .parse::() + .unwrap(), + LogicalType::UINT_64 + ); + assert_eq!( + LogicalType::INT_8 + .to_string() + .parse::() + .unwrap(), + LogicalType::INT_8 + ); + assert_eq!( + LogicalType::INT_16 + .to_string() + .parse::() + .unwrap(), + LogicalType::INT_16 + ); + assert_eq!( + LogicalType::INT_32 + .to_string() + .parse::() + .unwrap(), + LogicalType::INT_32 + ); + assert_eq!( + LogicalType::INT_64 + .to_string() + .parse::() + .unwrap(), + LogicalType::INT_64 + ); + assert_eq!( + LogicalType::JSON + .to_string() + .parse::() + .unwrap(), + LogicalType::JSON + ); + assert_eq!( + LogicalType::BSON + .to_string() + .parse::() + .unwrap(), + LogicalType::BSON + ); + assert_eq!( + LogicalType::INTERVAL + .to_string() + .parse::() + .unwrap(), + LogicalType::INTERVAL + ); + } + + #[test] + fn test_display_repetition() { + assert_eq!(Repetition::REQUIRED.to_string(), "REQUIRED"); + assert_eq!(Repetition::OPTIONAL.to_string(), "OPTIONAL"); + assert_eq!(Repetition::REPEATED.to_string(), "REPEATED"); + } + + #[test] + fn test_from_repetition() { + assert_eq!( + Repetition::from(parquet::FieldRepetitionType::REQUIRED), + Repetition::REQUIRED + ); + assert_eq!( + Repetition::from(parquet::FieldRepetitionType::OPTIONAL), + Repetition::OPTIONAL + ); + assert_eq!( + Repetition::from(parquet::FieldRepetitionType::REPEATED), + Repetition::REPEATED + ); + } + + #[test] + fn test_into_repetition() { + assert_eq!( + parquet::FieldRepetitionType::REQUIRED, + Repetition::REQUIRED.into() + ); + assert_eq!( + parquet::FieldRepetitionType::OPTIONAL, + Repetition::OPTIONAL.into() + ); + assert_eq!( + parquet::FieldRepetitionType::REPEATED, + Repetition::REPEATED.into() + ); + } + + #[test] + fn test_from_string_into_repetition() { + assert_eq!( + Repetition::REQUIRED + .to_string() + .parse::() + .unwrap(), + Repetition::REQUIRED + ); + assert_eq!( + Repetition::OPTIONAL + .to_string() + .parse::() + .unwrap(), + Repetition::OPTIONAL + ); + assert_eq!( + Repetition::REPEATED + .to_string() + .parse::() + .unwrap(), + Repetition::REPEATED + ); + } + + #[test] + fn test_display_encoding() { + assert_eq!(Encoding::PLAIN.to_string(), "PLAIN"); + 
assert_eq!(Encoding::PLAIN_DICTIONARY.to_string(), "PLAIN_DICTIONARY"); + assert_eq!(Encoding::RLE.to_string(), "RLE"); + assert_eq!(Encoding::BIT_PACKED.to_string(), "BIT_PACKED"); + assert_eq!( + Encoding::DELTA_BINARY_PACKED.to_string(), + "DELTA_BINARY_PACKED" + ); + assert_eq!( + Encoding::DELTA_LENGTH_BYTE_ARRAY.to_string(), + "DELTA_LENGTH_BYTE_ARRAY" + ); + assert_eq!(Encoding::DELTA_BYTE_ARRAY.to_string(), "DELTA_BYTE_ARRAY"); + assert_eq!(Encoding::RLE_DICTIONARY.to_string(), "RLE_DICTIONARY"); + } + + #[test] + fn test_from_encoding() { + assert_eq!(Encoding::from(parquet::Encoding::PLAIN), Encoding::PLAIN); + assert_eq!( + Encoding::from(parquet::Encoding::PLAIN_DICTIONARY), + Encoding::PLAIN_DICTIONARY + ); + assert_eq!(Encoding::from(parquet::Encoding::RLE), Encoding::RLE); + assert_eq!( + Encoding::from(parquet::Encoding::BIT_PACKED), + Encoding::BIT_PACKED + ); + assert_eq!( + Encoding::from(parquet::Encoding::DELTA_BINARY_PACKED), + Encoding::DELTA_BINARY_PACKED + ); + assert_eq!( + Encoding::from(parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY), + Encoding::DELTA_LENGTH_BYTE_ARRAY + ); + assert_eq!( + Encoding::from(parquet::Encoding::DELTA_BYTE_ARRAY), + Encoding::DELTA_BYTE_ARRAY + ); + } + + #[test] + fn test_into_encoding() { + assert_eq!(parquet::Encoding::PLAIN, Encoding::PLAIN.into()); + assert_eq!( + parquet::Encoding::PLAIN_DICTIONARY, + Encoding::PLAIN_DICTIONARY.into() + ); + assert_eq!(parquet::Encoding::RLE, Encoding::RLE.into()); + assert_eq!(parquet::Encoding::BIT_PACKED, Encoding::BIT_PACKED.into()); + assert_eq!( + parquet::Encoding::DELTA_BINARY_PACKED, + Encoding::DELTA_BINARY_PACKED.into() + ); + assert_eq!( + parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY, + Encoding::DELTA_LENGTH_BYTE_ARRAY.into() + ); + assert_eq!( + parquet::Encoding::DELTA_BYTE_ARRAY, + Encoding::DELTA_BYTE_ARRAY.into() + ); + } + + #[test] + fn test_display_compression() { + assert_eq!(Compression::UNCOMPRESSED.to_string(), "UNCOMPRESSED"); + assert_eq!(Compression::SNAPPY.to_string(), "SNAPPY"); + assert_eq!(Compression::GZIP.to_string(), "GZIP"); + assert_eq!(Compression::LZO.to_string(), "LZO"); + assert_eq!(Compression::BROTLI.to_string(), "BROTLI"); + assert_eq!(Compression::LZ4.to_string(), "LZ4"); + assert_eq!(Compression::ZSTD.to_string(), "ZSTD"); + } + + #[test] + fn test_from_compression() { + assert_eq!( + Compression::from(parquet::CompressionCodec::UNCOMPRESSED), + Compression::UNCOMPRESSED + ); + assert_eq!( + Compression::from(parquet::CompressionCodec::SNAPPY), + Compression::SNAPPY + ); + assert_eq!( + Compression::from(parquet::CompressionCodec::GZIP), + Compression::GZIP + ); + assert_eq!( + Compression::from(parquet::CompressionCodec::LZO), + Compression::LZO + ); + assert_eq!( + Compression::from(parquet::CompressionCodec::BROTLI), + Compression::BROTLI + ); + assert_eq!( + Compression::from(parquet::CompressionCodec::LZ4), + Compression::LZ4 + ); + assert_eq!( + Compression::from(parquet::CompressionCodec::ZSTD), + Compression::ZSTD + ); + } + + #[test] + fn test_into_compression() { + assert_eq!( + parquet::CompressionCodec::UNCOMPRESSED, + Compression::UNCOMPRESSED.into() + ); + assert_eq!( + parquet::CompressionCodec::SNAPPY, + Compression::SNAPPY.into() + ); + assert_eq!(parquet::CompressionCodec::GZIP, Compression::GZIP.into()); + assert_eq!(parquet::CompressionCodec::LZO, Compression::LZO.into()); + assert_eq!( + parquet::CompressionCodec::BROTLI, + Compression::BROTLI.into() + ); + assert_eq!(parquet::CompressionCodec::LZ4, Compression::LZ4.into()); + 
assert_eq!(parquet::CompressionCodec::ZSTD, Compression::ZSTD.into()); + } + + #[test] + fn test_display_page_type() { + assert_eq!(PageType::DATA_PAGE.to_string(), "DATA_PAGE"); + assert_eq!(PageType::INDEX_PAGE.to_string(), "INDEX_PAGE"); + assert_eq!(PageType::DICTIONARY_PAGE.to_string(), "DICTIONARY_PAGE"); + assert_eq!(PageType::DATA_PAGE_V2.to_string(), "DATA_PAGE_V2"); + } + + #[test] + fn test_from_page_type() { + assert_eq!( + PageType::from(parquet::PageType::DATA_PAGE), + PageType::DATA_PAGE + ); + assert_eq!( + PageType::from(parquet::PageType::INDEX_PAGE), + PageType::INDEX_PAGE + ); + assert_eq!( + PageType::from(parquet::PageType::DICTIONARY_PAGE), + PageType::DICTIONARY_PAGE + ); + assert_eq!( + PageType::from(parquet::PageType::DATA_PAGE_V2), + PageType::DATA_PAGE_V2 + ); + } + + #[test] + fn test_into_page_type() { + assert_eq!(parquet::PageType::DATA_PAGE, PageType::DATA_PAGE.into()); + assert_eq!(parquet::PageType::INDEX_PAGE, PageType::INDEX_PAGE.into()); + assert_eq!( + parquet::PageType::DICTIONARY_PAGE, + PageType::DICTIONARY_PAGE.into() + ); + assert_eq!( + parquet::PageType::DATA_PAGE_V2, + PageType::DATA_PAGE_V2.into() + ); + } + + #[test] + fn test_display_sort_order() { + assert_eq!(SortOrder::SIGNED.to_string(), "SIGNED"); + assert_eq!(SortOrder::UNSIGNED.to_string(), "UNSIGNED"); + assert_eq!(SortOrder::UNDEFINED.to_string(), "UNDEFINED"); + } + + #[test] + fn test_display_column_order() { + assert_eq!( + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED).to_string(), + "TYPE_DEFINED_ORDER(SIGNED)" + ); + assert_eq!( + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED).to_string(), + "TYPE_DEFINED_ORDER(UNSIGNED)" + ); + assert_eq!( + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNDEFINED).to_string(), + "TYPE_DEFINED_ORDER(UNDEFINED)" + ); + assert_eq!(ColumnOrder::UNDEFINED.to_string(), "UNDEFINED"); + } + + #[test] + fn test_column_order_get_sort_order() { + // Helper to check the order in a list of values. + // Only logical type is checked. + fn check_sort_order(types: Vec, expected_order: SortOrder) { + for tpe in types { + assert_eq!( + ColumnOrder::get_sort_order(tpe, Type::BYTE_ARRAY), + expected_order + ); + } + } + + // Unsigned comparison (physical type does not matter) + let unsigned = vec![ + LogicalType::UTF8, + LogicalType::JSON, + LogicalType::BSON, + LogicalType::ENUM, + LogicalType::UINT_8, + LogicalType::UINT_16, + LogicalType::UINT_32, + LogicalType::UINT_64, + LogicalType::INTERVAL, + ]; + check_sort_order(unsigned, SortOrder::UNSIGNED); + + // Signed comparison (physical type does not matter) + let signed = vec![ + LogicalType::INT_8, + LogicalType::INT_16, + LogicalType::INT_32, + LogicalType::INT_64, + LogicalType::DECIMAL, + LogicalType::DATE, + LogicalType::TIME_MILLIS, + LogicalType::TIME_MICROS, + LogicalType::TIMESTAMP_MILLIS, + LogicalType::TIMESTAMP_MICROS, + ]; + check_sort_order(signed, SortOrder::SIGNED); + + // Undefined comparison + let undefined = vec![ + LogicalType::LIST, + LogicalType::MAP, + LogicalType::MAP_KEY_VALUE, + ]; + check_sort_order(undefined, SortOrder::UNDEFINED); + + // Check None logical type + // This should return a sort order for byte array type. 
+ check_sort_order(vec![LogicalType::NONE], SortOrder::UNSIGNED); + } + + #[test] + fn test_column_order_get_default_sort_order() { + // Comparison based on physical type + assert_eq!( + ColumnOrder::get_default_sort_order(Type::BOOLEAN), + SortOrder::UNSIGNED + ); + assert_eq!( + ColumnOrder::get_default_sort_order(Type::INT32), + SortOrder::SIGNED + ); + assert_eq!( + ColumnOrder::get_default_sort_order(Type::INT64), + SortOrder::SIGNED + ); + assert_eq!( + ColumnOrder::get_default_sort_order(Type::INT96), + SortOrder::UNDEFINED + ); + assert_eq!( + ColumnOrder::get_default_sort_order(Type::FLOAT), + SortOrder::SIGNED + ); + assert_eq!( + ColumnOrder::get_default_sort_order(Type::DOUBLE), + SortOrder::SIGNED + ); + assert_eq!( + ColumnOrder::get_default_sort_order(Type::BYTE_ARRAY), + SortOrder::UNSIGNED + ); + assert_eq!( + ColumnOrder::get_default_sort_order(Type::FIXED_LEN_BYTE_ARRAY), + SortOrder::UNSIGNED + ); + } + + #[test] + fn test_column_order_sort_order() { + assert_eq!( + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED).sort_order(), + SortOrder::SIGNED + ); + assert_eq!( + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED).sort_order(), + SortOrder::UNSIGNED + ); + assert_eq!( + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNDEFINED).sort_order(), + SortOrder::UNDEFINED + ); + assert_eq!(ColumnOrder::UNDEFINED.sort_order(), SortOrder::SIGNED); + } +} diff --git a/rust/src/parquet/column/mod.rs b/rust/src/parquet/column/mod.rs new file mode 100644 index 0000000000000..09c4bde51f771 --- /dev/null +++ b/rust/src/parquet/column/mod.rs @@ -0,0 +1,124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Low level column reader and writer APIs. +//! +//! This API is designed for reading and writing column values, definition and repetition +//! levels directly. +//! +//! # Example of writing and reading data +//! +//! Data has the following format: +//! ```text +//! +---------------+ +//! | values| +//! +---------------+ +//! |[1, 2] | +//! |[3, null, null]| +//! +---------------+ +//! ``` +//! +//! The example uses column writer and reader APIs to write raw values, definition and +//! repetition levels and read them to verify write/read correctness. +//! +//! ```rust +//! use std::{fs, path::Path, rc::Rc}; +//! +//! use arrow::parquet::{ +//! column::{reader::ColumnReader, writer::ColumnWriter}, +//! file::{ +//! properties::WriterProperties, +//! reader::{FileReader, SerializedFileReader}, +//! writer::{FileWriter, SerializedFileWriter}, +//! }, +//! schema::parser::parse_message_type, +//! }; +//! +//! let path = Path::new("target/debug/examples/column_sample.parquet"); +//! +//! // Writing data using column writer API. +//! +//! let message_type = " +//! message schema { +//! optional group values (LIST) { +//! 
repeated group list { +//! optional INT32 element; +//! } +//! } +//! } +//! "; +//! let schema = Rc::new(parse_message_type(message_type).unwrap()); +//! let props = Rc::new(WriterProperties::builder().build()); +//! let file = fs::File::create(path).unwrap(); +//! let mut writer = SerializedFileWriter::new(file, schema, props).unwrap(); +//! let mut row_group_writer = writer.next_row_group().unwrap(); +//! while let Some(mut col_writer) = row_group_writer.next_column().unwrap() { +//! match col_writer { +//! // You can also use `get_typed_column_writer` method to extract typed writer. +//! ColumnWriter::Int32ColumnWriter(ref mut typed_writer) => { +//! typed_writer +//! .write_batch(&[1, 2, 3], Some(&[3, 3, 3, 2, 2]), Some(&[0, 1, 0, 1, 1])) +//! .unwrap(); +//! } +//! _ => {} +//! } +//! row_group_writer.close_column(col_writer).unwrap(); +//! } +//! writer.close_row_group(row_group_writer).unwrap(); +//! writer.close().unwrap(); +//! +//! // Reading data using column reader API. +//! +//! let file = fs::File::open(path).unwrap(); +//! let reader = SerializedFileReader::new(file).unwrap(); +//! let metadata = reader.metadata(); +//! +//! let mut res = Ok((0, 0)); +//! let mut values = vec![0; 8]; +//! let mut def_levels = vec![0; 8]; +//! let mut rep_levels = vec![0; 8]; +//! +//! for i in 0..metadata.num_row_groups() { +//! let row_group_reader = reader.get_row_group(i).unwrap(); +//! let row_group_metadata = metadata.row_group(i); +//! +//! for j in 0..row_group_metadata.num_columns() { +//! let mut column_reader = row_group_reader.get_column_reader(j).unwrap(); +//! match column_reader { +//! // You can also use `get_typed_column_reader` method to extract typed reader. +//! ColumnReader::Int32ColumnReader(ref mut typed_reader) => { +//! res = typed_reader.read_batch( +//! 8, // batch size +//! Some(&mut def_levels), +//! Some(&mut rep_levels), +//! &mut values, +//! ); +//! } +//! _ => {} +//! } +//! } +//! } +//! +//! assert_eq!(res, Ok((3, 5))); +//! assert_eq!(values, vec![1, 2, 3, 0, 0, 0, 0, 0]); +//! assert_eq!(def_levels, vec![3, 3, 3, 2, 2, 0, 0, 0]); +//! assert_eq!(rep_levels, vec![0, 1, 0, 1, 1, 0, 0, 0]); +//! ``` + +pub mod page; +pub mod reader; +pub mod writer; diff --git a/rust/src/parquet/column/page.rs b/rust/src/parquet/column/page.rs new file mode 100644 index 0000000000000..115037cba0bd5 --- /dev/null +++ b/rust/src/parquet/column/page.rs @@ -0,0 +1,296 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains Parquet Page definitions and page reader interface. 
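+//!
+//! As a minimal sketch (mirroring the unit tests further down in this module), a data page
+//! can be built directly from its fields and then inspected through the accessor methods on
+//! `Page`:
+//!
+//! ```rust,ignore
+//! let page = Page::DataPage {
+//!     buf: ByteBufferPtr::new(vec![0, 1, 2]),
+//!     num_values: 3,
+//!     encoding: Encoding::PLAIN,
+//!     def_level_encoding: Encoding::RLE,
+//!     rep_level_encoding: Encoding::RLE,
+//!     statistics: None,
+//! };
+//! assert_eq!(page.page_type(), PageType::DATA_PAGE);
+//! assert_eq!(page.num_values(), 3);
+//! assert_eq!(page.encoding(), Encoding::PLAIN);
+//! ```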
+ +use crate::parquet::basic::{Encoding, PageType}; +use crate::parquet::errors::Result; +use crate::parquet::file::{metadata::ColumnChunkMetaData, statistics::Statistics}; +use crate::parquet::util::memory::ByteBufferPtr; + +/// Parquet Page definition. +/// +/// List of supported pages. +/// These are 1-to-1 mapped from the equivalent Thrift definitions, except `buf` which +/// used to store uncompressed bytes of the page. +pub enum Page { + DataPage { + buf: ByteBufferPtr, + num_values: u32, + encoding: Encoding, + def_level_encoding: Encoding, + rep_level_encoding: Encoding, + statistics: Option, + }, + DataPageV2 { + buf: ByteBufferPtr, + num_values: u32, + encoding: Encoding, + num_nulls: u32, + num_rows: u32, + def_levels_byte_len: u32, + rep_levels_byte_len: u32, + is_compressed: bool, + statistics: Option, + }, + DictionaryPage { + buf: ByteBufferPtr, + num_values: u32, + encoding: Encoding, + is_sorted: bool, + }, +} + +impl Page { + /// Returns [`PageType`](`::basic::PageType`) for this page. + pub fn page_type(&self) -> PageType { + match self { + &Page::DataPage { .. } => PageType::DATA_PAGE, + &Page::DataPageV2 { .. } => PageType::DATA_PAGE_V2, + &Page::DictionaryPage { .. } => PageType::DICTIONARY_PAGE, + } + } + + /// Returns internal byte buffer reference for this page. + pub fn buffer(&self) -> &ByteBufferPtr { + match self { + &Page::DataPage { ref buf, .. } => &buf, + &Page::DataPageV2 { ref buf, .. } => &buf, + &Page::DictionaryPage { ref buf, .. } => &buf, + } + } + + /// Returns number of values in this page. + pub fn num_values(&self) -> u32 { + match self { + &Page::DataPage { num_values, .. } => num_values, + &Page::DataPageV2 { num_values, .. } => num_values, + &Page::DictionaryPage { num_values, .. } => num_values, + } + } + + /// Returns this page [`Encoding`](`::basic::Encoding`). + pub fn encoding(&self) -> Encoding { + match self { + &Page::DataPage { encoding, .. } => encoding, + &Page::DataPageV2 { encoding, .. } => encoding, + &Page::DictionaryPage { encoding, .. } => encoding, + } + } + + /// Returns optional [`Statistics`](`::file::metadata::Statistics`). + pub fn statistics(&self) -> Option<&Statistics> { + match self { + &Page::DataPage { ref statistics, .. } => statistics.as_ref(), + &Page::DataPageV2 { ref statistics, .. } => statistics.as_ref(), + &Page::DictionaryPage { .. } => None, + } + } +} + +/// Helper struct to represent pages with potentially compressed buffer (data page v1) or +/// compressed and concatenated buffer (def levels + rep levels + compressed values for +/// data page v2). +/// +/// The difference with `Page` is that `Page` buffer is always uncompressed. +pub struct CompressedPage { + compressed_page: Page, + uncompressed_size: usize, +} + +impl CompressedPage { + /// Creates `CompressedPage` from a page with potentially compressed buffer and + /// uncompressed size. + pub fn new(compressed_page: Page, uncompressed_size: usize) -> Self { + Self { + compressed_page, + uncompressed_size, + } + } + + /// Returns page type. + pub fn page_type(&self) -> PageType { + self.compressed_page.page_type() + } + + /// Returns underlying page with potentially compressed buffer. + pub fn compressed_page(&self) -> &Page { + &self.compressed_page + } + + /// Returns uncompressed size in bytes. + pub fn uncompressed_size(&self) -> usize { + self.uncompressed_size + } + + /// Returns compressed size in bytes. + /// + /// Note that it is assumed that buffer is compressed, but it may not be. 
In this + /// case compressed size will be equal to uncompressed size. + pub fn compressed_size(&self) -> usize { + self.compressed_page.buffer().len() + } + + /// Number of values in page. + pub fn num_values(&self) -> u32 { + self.compressed_page.num_values() + } + + /// Returns encoding for values in page. + pub fn encoding(&self) -> Encoding { + self.compressed_page.encoding() + } + + /// Returns slice of compressed buffer in the page. + pub fn data(&self) -> &[u8] { + self.compressed_page.buffer().data() + } +} + +/// Contains page write metrics. +pub struct PageWriteSpec { + pub page_type: PageType, + pub uncompressed_size: usize, + pub compressed_size: usize, + pub num_values: u32, + pub offset: u64, + pub bytes_written: u64, +} + +impl PageWriteSpec { + /// Creates new spec with default page write metrics. + pub fn new() -> Self { + Self { + page_type: PageType::DATA_PAGE, + uncompressed_size: 0, + compressed_size: 0, + num_values: 0, + offset: 0, + bytes_written: 0, + } + } +} + +/// API for reading pages from a column chunk. +/// This offers a iterator like API to get the next page. +pub trait PageReader { + /// Gets the next page in the column chunk associated with this reader. + /// Returns `None` if there are no pages left. + fn get_next_page(&mut self) -> Result>; +} + +/// API for writing pages in a column chunk. +/// +/// It is reasonable to assume that all pages will be written in the correct order, e.g. +/// dictionary page followed by data pages, or a set of data pages, etc. +pub trait PageWriter { + /// Writes a page into the output stream/sink. + /// Returns `PageWriteSpec` that contains information about written page metrics, + /// including number of bytes, size, number of values, offset, etc. + /// + /// This method is called for every compressed page we write into underlying buffer, + /// either data page or dictionary page. + fn write_page(&mut self, page: CompressedPage) -> Result; + + /// Writes column chunk metadata into the output stream/sink. + /// + /// This method is called once before page writer is closed, normally when writes are + /// finalised in column writer. + fn write_metadata(&mut self, metadata: &ColumnChunkMetaData) -> Result<()>; + + /// Closes resources and flushes underlying sink. + /// Page writer should not be used after this method is called. 
+ fn close(&mut self) -> Result<()>; +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_page() { + let data_page = Page::DataPage { + buf: ByteBufferPtr::new(vec![0, 1, 2]), + num_values: 10, + encoding: Encoding::PLAIN, + def_level_encoding: Encoding::RLE, + rep_level_encoding: Encoding::RLE, + statistics: Some(Statistics::int32(Some(1), Some(2), None, 1, true)), + }; + assert_eq!(data_page.page_type(), PageType::DATA_PAGE); + assert_eq!(data_page.buffer().data(), vec![0, 1, 2].as_slice()); + assert_eq!(data_page.num_values(), 10); + assert_eq!(data_page.encoding(), Encoding::PLAIN); + assert_eq!( + data_page.statistics(), + Some(&Statistics::int32(Some(1), Some(2), None, 1, true)) + ); + + let data_page_v2 = Page::DataPageV2 { + buf: ByteBufferPtr::new(vec![0, 1, 2]), + num_values: 10, + encoding: Encoding::PLAIN, + num_nulls: 5, + num_rows: 20, + def_levels_byte_len: 30, + rep_levels_byte_len: 40, + is_compressed: false, + statistics: Some(Statistics::int32(Some(1), Some(2), None, 1, true)), + }; + assert_eq!(data_page_v2.page_type(), PageType::DATA_PAGE_V2); + assert_eq!(data_page_v2.buffer().data(), vec![0, 1, 2].as_slice()); + assert_eq!(data_page_v2.num_values(), 10); + assert_eq!(data_page_v2.encoding(), Encoding::PLAIN); + assert_eq!( + data_page_v2.statistics(), + Some(&Statistics::int32(Some(1), Some(2), None, 1, true)) + ); + + let dict_page = Page::DictionaryPage { + buf: ByteBufferPtr::new(vec![0, 1, 2]), + num_values: 10, + encoding: Encoding::PLAIN, + is_sorted: false, + }; + assert_eq!(dict_page.page_type(), PageType::DICTIONARY_PAGE); + assert_eq!(dict_page.buffer().data(), vec![0, 1, 2].as_slice()); + assert_eq!(dict_page.num_values(), 10); + assert_eq!(dict_page.encoding(), Encoding::PLAIN); + assert_eq!(dict_page.statistics(), None); + } + + #[test] + fn test_compressed_page() { + let data_page = Page::DataPage { + buf: ByteBufferPtr::new(vec![0, 1, 2]), + num_values: 10, + encoding: Encoding::PLAIN, + def_level_encoding: Encoding::RLE, + rep_level_encoding: Encoding::RLE, + statistics: Some(Statistics::int32(Some(1), Some(2), None, 1, true)), + }; + + let cpage = CompressedPage::new(data_page, 5); + + assert_eq!(cpage.page_type(), PageType::DATA_PAGE); + assert_eq!(cpage.uncompressed_size(), 5); + assert_eq!(cpage.compressed_size(), 3); + assert_eq!(cpage.num_values(), 10); + assert_eq!(cpage.encoding(), Encoding::PLAIN); + assert_eq!(cpage.data(), &[0, 1, 2]); + } +} diff --git a/rust/src/parquet/column/reader.rs b/rust/src/parquet/column/reader.rs new file mode 100644 index 0000000000000..f3dde31ab9a14 --- /dev/null +++ b/rust/src/parquet/column/reader.rs @@ -0,0 +1,1576 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains column reader API. 
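+//!
+//! A typical read loop is sketched below; `col_descr` and `page_reader` are assumed to come
+//! from a row group reader, and the column is assumed to be a non-repeated INT32 column:
+//!
+//! ```rust,ignore
+//! let reader = get_column_reader(col_descr, page_reader);
+//! let mut typed = get_typed_column_reader::<Int32Type>(reader);
+//!
+//! let mut values = vec![0i32; 1024];
+//! let mut def_levels = vec![0i16; 1024];
+//! let (values_read, levels_read) =
+//!     typed.read_batch(1024, Some(&mut def_levels), None, &mut values)?;
+//! ```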
+ +use std::{ + cmp::{max, min}, + collections::HashMap, + mem, +}; + +use super::page::{Page, PageReader}; +use crate::parquet::basic::*; +use crate::parquet::data_type::*; +use crate::parquet::encodings::{ + decoding::{get_decoder, Decoder, DictDecoder, PlainDecoder}, + levels::LevelDecoder, +}; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::schema::types::ColumnDescPtr; +use crate::parquet::util::memory::ByteBufferPtr; + +/// Column reader for a Parquet type. +pub enum ColumnReader { + BoolColumnReader(ColumnReaderImpl), + Int32ColumnReader(ColumnReaderImpl), + Int64ColumnReader(ColumnReaderImpl), + Int96ColumnReader(ColumnReaderImpl), + FloatColumnReader(ColumnReaderImpl), + DoubleColumnReader(ColumnReaderImpl), + ByteArrayColumnReader(ColumnReaderImpl), + FixedLenByteArrayColumnReader(ColumnReaderImpl), +} + +/// Gets a specific column reader corresponding to column descriptor `col_descr`. The +/// column reader will read from pages in `col_page_reader`. +pub fn get_column_reader( + col_descr: ColumnDescPtr, + col_page_reader: Box, +) -> ColumnReader { + match col_descr.physical_type() { + Type::BOOLEAN => { + ColumnReader::BoolColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::INT32 => { + ColumnReader::Int32ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::INT64 => { + ColumnReader::Int64ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::INT96 => { + ColumnReader::Int96ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::FLOAT => { + ColumnReader::FloatColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::DOUBLE => { + ColumnReader::DoubleColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::BYTE_ARRAY => { + ColumnReader::ByteArrayColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::FIXED_LEN_BYTE_ARRAY => ColumnReader::FixedLenByteArrayColumnReader( + ColumnReaderImpl::new(col_descr, col_page_reader), + ), + } +} + +/// Gets a typed column reader for the specific type `T`, by "up-casting" `col_reader` of +/// non-generic type to a generic column reader type `ColumnReaderImpl`. +/// +/// NOTE: the caller MUST guarantee that the actual enum value for `col_reader` matches +/// the type `T`. Otherwise, disastrous consequence could happen. +pub fn get_typed_column_reader(col_reader: ColumnReader) -> ColumnReaderImpl { + match col_reader { + ColumnReader::BoolColumnReader(r) => unsafe { mem::transmute(r) }, + ColumnReader::Int32ColumnReader(r) => unsafe { mem::transmute(r) }, + ColumnReader::Int64ColumnReader(r) => unsafe { mem::transmute(r) }, + ColumnReader::Int96ColumnReader(r) => unsafe { mem::transmute(r) }, + ColumnReader::FloatColumnReader(r) => unsafe { mem::transmute(r) }, + ColumnReader::DoubleColumnReader(r) => unsafe { mem::transmute(r) }, + ColumnReader::ByteArrayColumnReader(r) => unsafe { mem::transmute(r) }, + ColumnReader::FixedLenByteArrayColumnReader(r) => unsafe { mem::transmute(r) }, + } +} + +/// Typed value reader for a particular primitive column. +pub struct ColumnReaderImpl { + descr: ColumnDescPtr, + def_level_decoder: Option, + rep_level_decoder: Option, + page_reader: Box, + current_encoding: Option, + + // The total number of values stored in the data page. + num_buffered_values: u32, + + // The number of values from the current data page that has been decoded into memory + // so far. 
+ num_decoded_values: u32, + + // Cache of decoders for existing encodings + decoders: HashMap>>, +} + +impl ColumnReaderImpl { + /// Creates new column reader based on column descriptor and page reader. + pub fn new(descr: ColumnDescPtr, page_reader: Box) -> Self { + Self { + descr, + def_level_decoder: None, + rep_level_decoder: None, + page_reader, + current_encoding: None, + num_buffered_values: 0, + num_decoded_values: 0, + decoders: HashMap::new(), + } + } + + /// Reads a batch of values of at most `batch_size`. + /// + /// This will try to read from the row group, and fills up at most `batch_size` values + /// for `def_levels`, `rep_levels` and `values`. It will stop either when the row group + /// is depleted or `batch_size` values has been read, or there is no space in the input + /// slices (values/definition levels/repetition levels). + /// + /// Note that in case the field being read is not required, `values` could contain less + /// values than `def_levels`. Also note that this will skip reading def / rep levels if + /// the field is required / not repeated, respectively. + /// + /// If `def_levels` or `rep_levels` is `None`, this will also skip reading the + /// respective levels. This is useful when the caller of this function knows in advance + /// that the field is required and non-repeated, therefore can avoid allocating memory + /// for the levels data. Note that if field has definition levels, but caller provides + /// None, there might be inconsistency between levels/values (see comments below). + /// + /// Returns a tuple where the first element is the actual number of values read, + /// and the second element is the actual number of levels read. + #[inline] + pub fn read_batch( + &mut self, + batch_size: usize, + mut def_levels: Option<&mut [i16]>, + mut rep_levels: Option<&mut [i16]>, + values: &mut [T::T], + ) -> Result<(usize, usize)> { + let mut values_read = 0; + let mut levels_read = 0; + + // Compute the smallest batch size we can read based on provided slices + let mut batch_size = min(batch_size, values.len()); + if let Some(ref levels) = def_levels { + batch_size = min(batch_size, levels.len()); + } + if let Some(ref levels) = rep_levels { + batch_size = min(batch_size, levels.len()); + } + + // Read exhaustively all pages until we read all batch_size values/levels + // or there are no more values/levels to read. + while max(values_read, levels_read) < batch_size { + if !self.has_next()? 
{ + break; + } + + // Batch size for the current iteration + let iter_batch_size = { + // Compute approximate value based on values decoded so far + let mut adjusted_size = min( + batch_size, + (self.num_buffered_values - self.num_decoded_values) as usize, + ); + + // Adjust batch size by taking into account how much space is left in values + // slice or levels slices (if available) + adjusted_size = min(adjusted_size, values.len() - values_read); + if let Some(ref levels) = def_levels { + adjusted_size = min(adjusted_size, levels.len() - levels_read); + } + if let Some(ref levels) = rep_levels { + adjusted_size = min(adjusted_size, levels.len() - levels_read); + } + + adjusted_size + }; + + let mut values_to_read = 0; + let mut num_def_levels = 0; + let mut num_rep_levels = 0; + + // If the field is required and non-repeated, there are no definition levels + if self.descr.max_def_level() > 0 && def_levels.as_ref().is_some() { + if let Some(ref mut levels) = def_levels { + num_def_levels = self + .read_def_levels(&mut levels[levels_read..levels_read + iter_batch_size])?; + for i in levels_read..levels_read + num_def_levels { + if levels[i] == self.descr.max_def_level() { + values_to_read += 1; + } + } + } + } else { + // If max definition level == 0, then it is REQUIRED field, read all values. + // If definition levels are not provided, we still read all values. + values_to_read = iter_batch_size; + } + + if self.descr.max_rep_level() > 0 && rep_levels.is_some() { + if let Some(ref mut levels) = rep_levels { + num_rep_levels = self + .read_rep_levels(&mut levels[levels_read..levels_read + iter_batch_size])?; + + // If definition levels are defined, check that rep levels == def levels + if def_levels.is_some() { + assert_eq!( + num_def_levels, num_rep_levels, + "Number of decoded rep / def levels did not match" + ); + } + } + } + + // At this point we have read values, definition and repetition levels. + // If both definition and repetition levels are defined, their counts + // should be equal. Values count is always less or equal to definition levels. + // + // Note that if field is not required, but no definition levels are provided, + // we would read values of batch size and (if provided, of course) repetition + // levels of batch size - [!] they will not be synced, because only definition + // levels enforce number of non-null values to read. + + let curr_values_read = + self.read_values(&mut values[values_read..values_read + values_to_read])?; + + // Update all "return" counters and internal state. + + // This is to account for when def or rep levels are not provided + let curr_levels_read = max(num_def_levels, num_rep_levels); + self.num_decoded_values += max(curr_levels_read, curr_values_read) as u32; + levels_read += curr_levels_read; + values_read += curr_values_read; + } + + Ok((values_read, levels_read)) + } + + /// Reads a new page and set up the decoders for levels, values or dictionary. + /// Returns false if there's no page left. + fn read_new_page(&mut self) -> Result { + #[allow(while_true)] + while true { + match self.page_reader.get_next_page()? { + // No more page to read + None => return Ok(false), + Some(current_page) => { + match current_page { + // 1. Dictionary page: configure dictionary for this page. + p @ Page::DictionaryPage { .. } => { + self.configure_dictionary(p)?; + continue; + } + // 2. 
Data page v1 + Page::DataPage { + buf, + num_values, + encoding, + def_level_encoding, + rep_level_encoding, + statistics: _, + } => { + self.num_buffered_values = num_values; + self.num_decoded_values = 0; + + let mut buffer_ptr = buf; + + if self.descr.max_rep_level() > 0 { + let mut rep_decoder = LevelDecoder::v1( + rep_level_encoding, + self.descr.max_rep_level(), + ); + let total_bytes = rep_decoder + .set_data(self.num_buffered_values as usize, buffer_ptr.all()); + buffer_ptr = buffer_ptr.start_from(total_bytes); + self.rep_level_decoder = Some(rep_decoder); + } + + if self.descr.max_def_level() > 0 { + let mut def_decoder = LevelDecoder::v1( + def_level_encoding, + self.descr.max_def_level(), + ); + let total_bytes = def_decoder + .set_data(self.num_buffered_values as usize, buffer_ptr.all()); + buffer_ptr = buffer_ptr.start_from(total_bytes); + self.def_level_decoder = Some(def_decoder); + } + + // Data page v1 does not have offset, all content of buffer should be passed + self.set_current_page_encoding( + encoding, + &buffer_ptr, + 0, + num_values as usize, + )?; + return Ok(true); + } + // 3. Data page v2 + Page::DataPageV2 { + buf, + num_values, + encoding, + num_nulls: _, + num_rows: _, + def_levels_byte_len, + rep_levels_byte_len, + is_compressed: _, + statistics: _, + } => { + self.num_buffered_values = num_values; + self.num_decoded_values = 0; + + let mut offset = 0; + + // DataPage v2 only supports RLE encoding for repetition levels + if self.descr.max_rep_level() > 0 { + let mut rep_decoder = LevelDecoder::v2(self.descr.max_rep_level()); + let bytes_read = rep_decoder.set_data_range( + self.num_buffered_values as usize, + &buf, + offset, + rep_levels_byte_len as usize, + ); + offset += bytes_read; + self.rep_level_decoder = Some(rep_decoder); + } + + // DataPage v2 only supports RLE encoding for definition levels + if self.descr.max_def_level() > 0 { + let mut def_decoder = LevelDecoder::v2(self.descr.max_def_level()); + let bytes_read = def_decoder.set_data_range( + self.num_buffered_values as usize, + &buf, + offset, + def_levels_byte_len as usize, + ); + offset += bytes_read; + self.def_level_decoder = Some(def_decoder); + } + + self.set_current_page_encoding( + encoding, + &buf, + offset, + num_values as usize, + )?; + return Ok(true); + } + }; + } + } + } + + Ok(true) + } + + /// Resolves and updates encoding and set decoder for the current page + fn set_current_page_encoding( + &mut self, + mut encoding: Encoding, + buffer_ptr: &ByteBufferPtr, + offset: usize, + len: usize, + ) -> Result<()> { + if encoding == Encoding::PLAIN_DICTIONARY { + encoding = Encoding::RLE_DICTIONARY; + } + + let decoder = if encoding == Encoding::RLE_DICTIONARY { + self.decoders + .get_mut(&encoding) + .expect("Decoder for dict should have been set") + } else { + // Search cache for data page decoder + if !self.decoders.contains_key(&encoding) { + // Initialize decoder for this page + let data_decoder = get_decoder::(self.descr.clone(), encoding)?; + self.decoders.insert(encoding, data_decoder); + } + self.decoders.get_mut(&encoding).unwrap() + }; + + decoder.set_data(buffer_ptr.start_from(offset), len as usize)?; + self.current_encoding = Some(encoding); + Ok(()) + } + + #[inline] + fn has_next(&mut self) -> Result { + if self.num_buffered_values == 0 || self.num_buffered_values == self.num_decoded_values { + // TODO: should we return false if read_new_page() = true and + // num_buffered_values = 0? + if !self.read_new_page()? 
{ + Ok(false) + } else { + Ok(self.num_buffered_values != 0) + } + } else { + Ok(true) + } + } + + #[inline] + fn read_rep_levels(&mut self, buffer: &mut [i16]) -> Result { + let level_decoder = self + .rep_level_decoder + .as_mut() + .expect("rep_level_decoder be set"); + level_decoder.get(buffer) + } + + #[inline] + fn read_def_levels(&mut self, buffer: &mut [i16]) -> Result { + let level_decoder = self + .def_level_decoder + .as_mut() + .expect("def_level_decoder be set"); + level_decoder.get(buffer) + } + + #[inline] + fn read_values(&mut self, buffer: &mut [T::T]) -> Result { + let encoding = self + .current_encoding + .expect("current_encoding should be set"); + let current_decoder = self + .decoders + .get_mut(&encoding) + .expect(format!("decoder for encoding {} should be set", encoding).as_str()); + current_decoder.get(buffer) + } + + #[inline] + fn configure_dictionary(&mut self, page: Page) -> Result { + let mut encoding = page.encoding(); + if encoding == Encoding::PLAIN || encoding == Encoding::PLAIN_DICTIONARY { + encoding = Encoding::RLE_DICTIONARY + } + + if self.decoders.contains_key(&encoding) { + return Err(general_err!("Column cannot have more than one dictionary")); + } + + if encoding == Encoding::RLE_DICTIONARY { + let mut dictionary = PlainDecoder::::new(self.descr.type_length()); + let num_values = page.num_values(); + dictionary.set_data(page.buffer().clone(), num_values as usize)?; + + let mut decoder = DictDecoder::new(); + decoder.set_dict(Box::new(dictionary))?; + self.decoders.insert(encoding, Box::new(decoder)); + Ok(true) + } else { + Err(nyi_err!( + "Invalid/Unsupported encoding type for dictionary: {}", + encoding + )) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use rand::distributions::range::SampleRange; + use std::{collections::VecDeque, rc::Rc, vec::IntoIter}; + + use crate::parquet::basic::Type as PhysicalType; + use crate::parquet::column::page::Page; + use crate::parquet::encodings::{ + encoding::{get_encoder, DictEncoder, Encoder}, + levels::{max_buffer_size, LevelEncoder}, + }; + use crate::parquet::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; + use crate::parquet::util::{ + memory::{ByteBufferPtr, MemTracker, MemTrackerPtr}, + test_common::random_numbers_range, + }; + + const NUM_LEVELS: usize = 128; + const NUM_PAGES: usize = 2; + const MAX_DEF_LEVEL: i16 = 5; + const MAX_REP_LEVEL: i16 = 5; + + // Macro to generate test cases + macro_rules! test { + // branch for generating i32 cases + ($test_func:ident, i32, $func:ident, $def_level:expr, $rep_level:expr, + $num_pages:expr, $num_levels:expr, $batch_size:expr, $min:expr, $max:expr) => { + test_internal!( + $test_func, + Int32Type, + get_test_int32_type, + $func, + $def_level, + $rep_level, + $num_pages, + $num_levels, + $batch_size, + $min, + $max + ); + }; + // branch for generating i64 cases + ($test_func:ident, i64, $func:ident, $def_level:expr, $rep_level:expr, + $num_pages:expr, $num_levels:expr, $batch_size:expr, $min:expr, $max:expr) => { + test_internal!( + $test_func, + Int64Type, + get_test_int64_type, + $func, + $def_level, + $rep_level, + $num_pages, + $num_levels, + $batch_size, + $min, + $max + ); + }; + } + + macro_rules! 
test_internal { + ($test_func:ident, $ty:ident, $pty:ident, $func:ident, $def_level:expr, + $rep_level:expr, $num_pages:expr, $num_levels:expr, $batch_size:expr, + $min:expr, $max:expr) => { + #[test] + fn $test_func() { + let desc = Rc::new(ColumnDescriptor::new( + Rc::new($pty()), + None, + $def_level, + $rep_level, + ColumnPath::new(Vec::new()), + )); + let mut tester = ColumnReaderTester::<$ty>::new(); + tester.$func(desc, $num_pages, $num_levels, $batch_size, $min, $max); + } + }; + } + + test!( + test_read_plain_v1_int32, + i32, + plain_v1, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 16, + ::std::i32::MIN, + ::std::i32::MAX + ); + test!( + test_read_plain_v2_int32, + i32, + plain_v2, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 16, + ::std::i32::MIN, + ::std::i32::MAX + ); + + test!( + test_read_plain_v1_int32_uneven, + i32, + plain_v1, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 17, + ::std::i32::MIN, + ::std::i32::MAX + ); + test!( + test_read_plain_v2_int32_uneven, + i32, + plain_v2, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 17, + ::std::i32::MIN, + ::std::i32::MAX + ); + + test!( + test_read_plain_v1_int32_multi_page, + i32, + plain_v1, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 512, + ::std::i32::MIN, + ::std::i32::MAX + ); + test!( + test_read_plain_v2_int32_multi_page, + i32, + plain_v2, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 512, + ::std::i32::MIN, + ::std::i32::MAX + ); + + // test cases when column descriptor has MAX_DEF_LEVEL = 0 and MAX_REP_LEVEL = 0 + test!( + test_read_plain_v1_int32_required_non_repeated, + i32, + plain_v1, + 0, + 0, + NUM_PAGES, + NUM_LEVELS, + 16, + ::std::i32::MIN, + ::std::i32::MAX + ); + test!( + test_read_plain_v2_int32_required_non_repeated, + i32, + plain_v2, + 0, + 0, + NUM_PAGES, + NUM_LEVELS, + 16, + ::std::i32::MIN, + ::std::i32::MAX + ); + + test!( + test_read_plain_v1_int64, + i64, + plain_v1, + 1, + 1, + NUM_PAGES, + NUM_LEVELS, + 16, + ::std::i64::MIN, + ::std::i64::MAX + ); + test!( + test_read_plain_v2_int64, + i64, + plain_v2, + 1, + 1, + NUM_PAGES, + NUM_LEVELS, + 16, + ::std::i64::MIN, + ::std::i64::MAX + ); + + test!( + test_read_plain_v1_int64_uneven, + i64, + plain_v1, + 1, + 1, + NUM_PAGES, + NUM_LEVELS, + 17, + ::std::i64::MIN, + ::std::i64::MAX + ); + test!( + test_read_plain_v2_int64_uneven, + i64, + plain_v2, + 1, + 1, + NUM_PAGES, + NUM_LEVELS, + 17, + ::std::i64::MIN, + ::std::i64::MAX + ); + + test!( + test_read_plain_v1_int64_multi_page, + i64, + plain_v1, + 1, + 1, + NUM_PAGES, + NUM_LEVELS, + 512, + ::std::i64::MIN, + ::std::i64::MAX + ); + test!( + test_read_plain_v2_int64_multi_page, + i64, + plain_v2, + 1, + 1, + NUM_PAGES, + NUM_LEVELS, + 512, + ::std::i64::MIN, + ::std::i64::MAX + ); + + // test cases when column descriptor has MAX_DEF_LEVEL = 0 and MAX_REP_LEVEL = 0 + test!( + test_read_plain_v1_int64_required_non_repeated, + i64, + plain_v1, + 0, + 0, + NUM_PAGES, + NUM_LEVELS, + 16, + ::std::i64::MIN, + ::std::i64::MAX + ); + test!( + test_read_plain_v2_int64_required_non_repeated, + i64, + plain_v2, + 0, + 0, + NUM_PAGES, + NUM_LEVELS, + 16, + ::std::i64::MIN, + ::std::i64::MAX + ); + + test!( + test_read_dict_v1_int32_small, + i32, + dict_v1, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + 2, + 2, + 16, + 0, + 3 + ); + test!( + test_read_dict_v2_int32_small, + i32, + dict_v2, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + 2, + 2, + 16, + 0, + 3 + ); + + test!( + test_read_dict_v1_int32, + i32, + dict_v1, + 
MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 16, + 0, + 3 + ); + test!( + test_read_dict_v2_int32, + i32, + dict_v2, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 16, + 0, + 3 + ); + + test!( + test_read_dict_v1_int32_uneven, + i32, + dict_v1, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 17, + 0, + 3 + ); + test!( + test_read_dict_v2_int32_uneven, + i32, + dict_v2, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 17, + 0, + 3 + ); + + test!( + test_read_dict_v1_int32_multi_page, + i32, + dict_v1, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 512, + 0, + 3 + ); + test!( + test_read_dict_v2_int32_multi_page, + i32, + dict_v2, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 512, + 0, + 3 + ); + + test!( + test_read_dict_v1_int64, + i64, + dict_v1, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 16, + 0, + 3 + ); + test!( + test_read_dict_v2_int64, + i64, + dict_v2, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 16, + 0, + 3 + ); + + #[test] + fn test_read_batch_values_only() { + test_read_batch_int32(16, &mut vec![0; 10], None, None); // < batch_size + test_read_batch_int32(16, &mut vec![0; 16], None, None); // == batch_size + test_read_batch_int32(16, &mut vec![0; 51], None, None); // > batch_size + } + + #[test] + fn test_read_batch_values_def_levels() { + test_read_batch_int32(16, &mut vec![0; 10], Some(&mut vec![0; 10]), None); + test_read_batch_int32(16, &mut vec![0; 16], Some(&mut vec![0; 16]), None); + test_read_batch_int32(16, &mut vec![0; 51], Some(&mut vec![0; 51]), None); + } + + #[test] + fn test_read_batch_values_rep_levels() { + test_read_batch_int32(16, &mut vec![0; 10], None, Some(&mut vec![0; 10])); + test_read_batch_int32(16, &mut vec![0; 16], None, Some(&mut vec![0; 16])); + test_read_batch_int32(16, &mut vec![0; 51], None, Some(&mut vec![0; 51])); + } + + #[test] + fn test_read_batch_different_buf_sizes() { + test_read_batch_int32( + 16, + &mut vec![0; 8], + Some(&mut vec![0; 9]), + Some(&mut vec![0; 7]), + ); + test_read_batch_int32( + 16, + &mut vec![0; 1], + Some(&mut vec![0; 9]), + Some(&mut vec![0; 3]), + ); + } + + #[test] + fn test_read_batch_values_def_rep_levels() { + test_read_batch_int32( + 128, + &mut vec![0; 128], + Some(&mut vec![0; 128]), + Some(&mut vec![0; 128]), + ); + } + + #[test] + fn test_read_batch_adjust_after_buffering_page() { + // This test covers scenario when buffering new page results in setting number + // of decoded values to 0, resulting on reading `batch_size` of values, but it is + // larger than we can insert into slice (affects values and levels). + // + // Note: values are chosen to reproduce the issue. 
+ // + let primitive_type = get_test_int32_type(); + let desc = Rc::new(ColumnDescriptor::new( + Rc::new(primitive_type), + None, + 1, + 1, + ColumnPath::new(Vec::new()), + )); + + let num_pages = 2; + let num_levels = 4; + let batch_size = 5; + let values = &mut vec![0; 7]; + let def_levels = &mut vec![0; 7]; + let rep_levels = &mut vec![0; 7]; + + let mut tester = ColumnReaderTester::::new(); + tester.test_read_batch( + desc, + Encoding::RLE_DICTIONARY, + num_pages, + num_levels, + batch_size, + ::std::i32::MIN, + ::std::i32::MAX, + values, + Some(def_levels), + Some(rep_levels), + false, + ); + } + + // ---------------------------------------------------------------------- + // Helper methods to make pages and test + // + // # Overview + // + // Most of the test functionality is implemented in `ColumnReaderTester`, which + // provides some general data page test methods: + // - `test_read_batch_general` + // - `test_read_batch` + // + // There are also some high level wrappers that are part of `ColumnReaderTester`: + // - `plain_v1` -> call `test_read_batch_general` with data page v1 and plain encoding + // - `plain_v2` -> call `test_read_batch_general` with data page v2 and plain encoding + // - `dict_v1` -> call `test_read_batch_general` with data page v1 + dictionary page + // - `dict_v2` -> call `test_read_batch_general` with data page v2 + dictionary page + // + // And even higher level wrappers that simplify testing of almost the same test cases: + // - `get_test_int32_type`, provides dummy schema type + // - `get_test_int64_type`, provides dummy schema type + // - `test_read_batch_int32`, wrapper for `read_batch` tests, since they are basically + // the same, just different def/rep levels and batch size. + // + // # Page assembly + // + // Page construction and generation of values, definition and repetition levels happens + // in `make_pages` function. + // All values are randomly generated based on provided min/max, levels are calculated + // based on provided max level for column descriptor (which is basically either int32 + // or int64 type in tests) and `levels_per_page` variable. + // + // We use `DataPageBuilder` and its implementation `DataPageBuilderImpl` to actually + // turn values, definition and repetition levels into data pages (either v1 or v2). + // + // Those data pages are then stored as part of `TestPageReader` (we just pass vector + // of generated pages directly), which implements `PageReader` interface. + // + // # Comparison + // + // This allows us to pass test page reader into column reader, so we can test + // functionality of column reader - see `test_read_batch`, where we create column + // reader -> typed column reader, buffer values in `read_batch` method and compare + // output with generated data. + + // Returns dummy Parquet `Type` for primitive field, because most of our tests use + // INT32 physical type. + fn get_test_int32_type() -> SchemaType { + SchemaType::primitive_type_builder("a", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INT_32) + .with_length(-1) + .build() + .expect("build() should be OK") + } + + // Returns dummy Parquet `Type` for INT64 physical type. + fn get_test_int64_type() -> SchemaType { + SchemaType::primitive_type_builder("a", PhysicalType::INT64) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INT_64) + .with_length(-1) + .build() + .expect("build() should be OK") + } + + // Tests `read_batch()` functionality for INT32. 
+ // + // This is a high level wrapper on `ColumnReaderTester` that allows us to specify some + // boilerplate code for setting up definition/repetition levels and column descriptor. + fn test_read_batch_int32( + batch_size: usize, + values: &mut [i32], + def_levels: Option<&mut [i16]>, + rep_levels: Option<&mut [i16]>, + ) { + let primitive_type = get_test_int32_type(); + // make field is required based on provided slices of levels + let max_def_level = if def_levels.is_some() { + MAX_DEF_LEVEL + } else { + 0 + }; + let max_rep_level = if def_levels.is_some() { + MAX_REP_LEVEL + } else { + 0 + }; + + let desc = Rc::new(ColumnDescriptor::new( + Rc::new(primitive_type), + None, + max_def_level, + max_rep_level, + ColumnPath::new(Vec::new()), + )); + let mut tester = ColumnReaderTester::::new(); + tester.test_read_batch( + desc, + Encoding::RLE_DICTIONARY, + NUM_PAGES, + NUM_LEVELS, + batch_size, + ::std::i32::MIN, + ::std::i32::MAX, + values, + def_levels, + rep_levels, + false, + ); + } + + struct ColumnReaderTester + where + T::T: PartialOrd + SampleRange + Copy, + { + rep_levels: Vec, + def_levels: Vec, + values: Vec, + } + + impl ColumnReaderTester + where + T::T: PartialOrd + SampleRange + Copy, + { + pub fn new() -> Self { + Self { + rep_levels: Vec::new(), + def_levels: Vec::new(), + values: Vec::new(), + } + } + + // Method to generate and test data pages v1 + fn plain_v1( + &mut self, + desc: ColumnDescPtr, + num_pages: usize, + num_levels: usize, + batch_size: usize, + min: T::T, + max: T::T, + ) { + self.test_read_batch_general( + desc, + Encoding::PLAIN, + num_pages, + num_levels, + batch_size, + min, + max, + false, + ); + } + + // Method to generate and test data pages v2 + fn plain_v2( + &mut self, + desc: ColumnDescPtr, + num_pages: usize, + num_levels: usize, + batch_size: usize, + min: T::T, + max: T::T, + ) { + self.test_read_batch_general( + desc, + Encoding::PLAIN, + num_pages, + num_levels, + batch_size, + min, + max, + true, + ); + } + + // Method to generate and test dictionary page + data pages v1 + fn dict_v1( + &mut self, + desc: ColumnDescPtr, + num_pages: usize, + num_levels: usize, + batch_size: usize, + min: T::T, + max: T::T, + ) { + self.test_read_batch_general( + desc, + Encoding::RLE_DICTIONARY, + num_pages, + num_levels, + batch_size, + min, + max, + false, + ); + } + + // Method to generate and test dictionary page + data pages v2 + fn dict_v2( + &mut self, + desc: ColumnDescPtr, + num_pages: usize, + num_levels: usize, + batch_size: usize, + min: T::T, + max: T::T, + ) { + self.test_read_batch_general( + desc, + Encoding::RLE_DICTIONARY, + num_pages, + num_levels, + batch_size, + min, + max, + true, + ); + } + + // Helper function for the general case of `read_batch()` where `values`, + // `def_levels` and `rep_levels` are always provided with enough space. + fn test_read_batch_general( + &mut self, + desc: ColumnDescPtr, + encoding: Encoding, + num_pages: usize, + num_levels: usize, + batch_size: usize, + min: T::T, + max: T::T, + use_v2: bool, + ) { + let mut def_levels = vec![0; num_levels * num_pages]; + let mut rep_levels = vec![0; num_levels * num_pages]; + let mut values = vec![T::T::default(); num_levels * num_pages]; + self.test_read_batch( + desc, + encoding, + num_pages, + num_levels, + batch_size, + min, + max, + &mut values, + Some(&mut def_levels), + Some(&mut rep_levels), + use_v2, + ); + } + + // Helper function to test `read_batch()` method with custom buffers for values, + // definition and repetition levels. 
+ fn test_read_batch( + &mut self, + desc: ColumnDescPtr, + encoding: Encoding, + num_pages: usize, + num_levels: usize, + batch_size: usize, + min: T::T, + max: T::T, + values: &mut [T::T], + mut def_levels: Option<&mut [i16]>, + mut rep_levels: Option<&mut [i16]>, + use_v2: bool, + ) { + let mut pages = VecDeque::new(); + make_pages::( + desc.clone(), + encoding, + num_pages, + num_levels, + min, + max, + &mut self.def_levels, + &mut self.rep_levels, + &mut self.values, + &mut pages, + use_v2, + ); + let max_def_level = desc.max_def_level(); + let page_reader = TestPageReader::new(Vec::from(pages)); + let column_reader: ColumnReader = get_column_reader(desc, Box::new(page_reader)); + let mut typed_column_reader = get_typed_column_reader::(column_reader); + + let mut curr_values_read = 0; + let mut curr_levels_read = 0; + let mut done = false; + while !done { + let actual_def_levels = match &mut def_levels { + Some(ref mut vec) => Some(&mut vec[curr_levels_read..]), + None => None, + }; + let actual_rep_levels = match rep_levels { + Some(ref mut vec) => Some(&mut vec[curr_levels_read..]), + None => None, + }; + + let (values_read, levels_read) = typed_column_reader + .read_batch( + batch_size, + actual_def_levels, + actual_rep_levels, + &mut values[curr_values_read..], + ) + .expect("read_batch() should be OK"); + + if values_read == 0 && levels_read == 0 { + done = true; + } + + curr_values_read += values_read; + curr_levels_read += levels_read; + } + + assert!( + values.len() >= curr_values_read, + "values.len() >= values_read" + ); + assert_eq!( + &values[0..curr_values_read], + &self.values[0..curr_values_read], + "values content doesn't match" + ); + + if let Some(ref levels) = def_levels { + assert!( + levels.len() >= curr_levels_read, + "def_levels.len() >= levels_read" + ); + assert_eq!( + &levels[0..curr_levels_read], + &self.def_levels[0..curr_levels_read], + "definition levels content doesn't match" + ); + } + + if let Some(ref levels) = rep_levels { + assert!( + levels.len() >= curr_levels_read, + "rep_levels.len() >= levels_read" + ); + assert_eq!( + &levels[0..curr_levels_read], + &self.rep_levels[0..curr_levels_read], + "repetition levels content doesn't match" + ); + } + + if def_levels.is_none() && rep_levels.is_none() { + assert!( + curr_levels_read == 0, + "expected to read 0 levels, found {}", + curr_levels_read + ); + } else if def_levels.is_some() && max_def_level > 0 { + assert!( + curr_levels_read >= curr_values_read, + "expected levels read to be greater than values read" + ); + } + } + } + + struct TestPageReader { + pages: IntoIter, + } + + impl TestPageReader { + pub fn new(pages: Vec) -> Self { + Self { + pages: pages.into_iter(), + } + } + } + + impl PageReader for TestPageReader { + fn get_next_page(&mut self) -> Result> { + Ok(self.pages.next()) + } + } + + // ---------------------------------------------------------------------- + // Utility functions for generating testing pages + + trait DataPageBuilder { + fn add_rep_levels(&mut self, max_level: i16, rep_levels: &[i16]); + fn add_def_levels(&mut self, max_level: i16, def_levels: &[i16]); + fn add_values(&mut self, encoding: Encoding, values: &[T::T]); + fn add_indices(&mut self, indices: ByteBufferPtr); + fn consume(self) -> Page; + } + + /// A utility struct for building data pages (v1 or v2). 
Callers must call: + /// - add_rep_levels() + /// - add_def_levels() + /// - add_values() for normal data page / add_indices() for dictionary data page + /// - consume() + /// in order to populate and obtain a data page. + struct DataPageBuilderImpl { + desc: ColumnDescPtr, + encoding: Option, + mem_tracker: MemTrackerPtr, + num_values: u32, + buffer: Vec, + rep_levels_byte_len: u32, + def_levels_byte_len: u32, + datapage_v2: bool, + } + + impl DataPageBuilderImpl { + // `num_values` is the number of non-null values to put in the data page. + // `datapage_v2` flag is used to indicate if the generated data page should use V2 + // format or not. + fn new(desc: ColumnDescPtr, num_values: u32, datapage_v2: bool) -> Self { + DataPageBuilderImpl { + desc, + encoding: None, + mem_tracker: Rc::new(MemTracker::new()), + num_values, + buffer: vec![], + rep_levels_byte_len: 0, + def_levels_byte_len: 0, + datapage_v2, + } + } + + // Adds levels to the buffer and return number of encoded bytes + fn add_levels(&mut self, max_level: i16, levels: &[i16]) -> u32 { + let size = max_buffer_size(Encoding::RLE, max_level, levels.len()); + let mut level_encoder = LevelEncoder::v1(Encoding::RLE, max_level, vec![0; size]); + level_encoder.put(levels).expect("put() should be OK"); + let encoded_levels = level_encoder.consume().expect("consume() should be OK"); + // Actual encoded bytes (without length offset) + let encoded_bytes = &encoded_levels[mem::size_of::()..]; + if self.datapage_v2 { + // Level encoder always initializes with offset of i32, where it stores length of + // encoded data; for data page v2 we explicitly store length, therefore we should + // skip i32 bytes. + self.buffer.extend_from_slice(encoded_bytes); + } else { + self.buffer.extend_from_slice(encoded_levels.as_slice()); + } + encoded_bytes.len() as u32 + } + } + + impl DataPageBuilder for DataPageBuilderImpl { + fn add_rep_levels(&mut self, max_levels: i16, rep_levels: &[i16]) { + self.num_values = rep_levels.len() as u32; + self.rep_levels_byte_len = self.add_levels(max_levels, rep_levels); + } + + fn add_def_levels(&mut self, max_levels: i16, def_levels: &[i16]) { + assert!( + self.num_values == def_levels.len() as u32, + "Must call `add_rep_levels() first!`" + ); + + self.def_levels_byte_len = self.add_levels(max_levels, def_levels); + } + + fn add_values(&mut self, encoding: Encoding, values: &[T::T]) { + assert!( + self.num_values >= values.len() as u32, + "num_values: {}, values.len(): {}", + self.num_values, + values.len() + ); + self.encoding = Some(encoding); + let mut encoder: Box> = + get_encoder::(self.desc.clone(), encoding, self.mem_tracker.clone()) + .expect("get_encoder() should be OK"); + encoder.put(values).expect("put() should be OK"); + let encoded_values = encoder + .flush_buffer() + .expect("consume_buffer() should be OK"); + self.buffer.extend_from_slice(encoded_values.data()); + } + + fn add_indices(&mut self, indices: ByteBufferPtr) { + self.encoding = Some(Encoding::RLE_DICTIONARY); + self.buffer.extend_from_slice(indices.data()); + } + + fn consume(self) -> Page { + if self.datapage_v2 { + Page::DataPageV2 { + buf: ByteBufferPtr::new(self.buffer), + num_values: self.num_values, + encoding: self.encoding.unwrap(), + num_nulls: 0, // set to dummy value - don't need this when reading data page + num_rows: self.num_values, // also don't need this when reading data page + def_levels_byte_len: self.def_levels_byte_len, + rep_levels_byte_len: self.rep_levels_byte_len, + is_compressed: false, + statistics: None, // set to 
None, we do not need statistics for tests + } + } else { + Page::DataPage { + buf: ByteBufferPtr::new(self.buffer), + num_values: self.num_values, + encoding: self.encoding.unwrap(), + def_level_encoding: Encoding::RLE, + rep_level_encoding: Encoding::RLE, + statistics: None, // set to None, we do not need statistics for tests + } + } + } + } + + fn make_pages( + desc: ColumnDescPtr, + encoding: Encoding, + num_pages: usize, + levels_per_page: usize, + min: T::T, + max: T::T, + def_levels: &mut Vec, + rep_levels: &mut Vec, + values: &mut Vec, + pages: &mut VecDeque, + use_v2: bool, + ) where + T::T: PartialOrd + SampleRange + Copy, + { + let mut num_values = 0; + let max_def_level = desc.max_def_level(); + let max_rep_level = desc.max_rep_level(); + + let mem_tracker = Rc::new(MemTracker::new()); + let mut dict_encoder = DictEncoder::::new(desc.clone(), mem_tracker); + + for i in 0..num_pages { + let mut num_values_cur_page = 0; + let level_range = i * levels_per_page..(i + 1) * levels_per_page; + + if max_def_level > 0 { + random_numbers_range(levels_per_page, 0, max_def_level + 1, def_levels); + for dl in &def_levels[level_range.clone()] { + if *dl == max_def_level { + num_values_cur_page += 1; + } + } + } else { + num_values_cur_page = levels_per_page; + } + if max_rep_level > 0 { + random_numbers_range(levels_per_page, 0, max_rep_level + 1, rep_levels); + } + random_numbers_range(num_values_cur_page, min, max, values); + + // Generate the current page + + let mut pb = DataPageBuilderImpl::new(desc.clone(), num_values_cur_page as u32, use_v2); + if max_rep_level > 0 { + pb.add_rep_levels(max_rep_level, &rep_levels[level_range.clone()]); + } + if max_def_level > 0 { + pb.add_def_levels(max_def_level, &def_levels[level_range]); + } + + let value_range = num_values..num_values + num_values_cur_page; + match encoding { + Encoding::PLAIN_DICTIONARY | Encoding::RLE_DICTIONARY => { + let _ = dict_encoder.put(&values[value_range.clone()]); + let indices = dict_encoder + .write_indices() + .expect("write_indices() should be OK"); + pb.add_indices(indices); + } + Encoding::PLAIN => { + pb.add_values::(encoding, &values[value_range]); + } + enc @ _ => panic!("Unexpected encoding {}", enc), + } + + let data_page = pb.consume(); + pages.push_back(data_page); + num_values += num_values_cur_page; + } + + if encoding == Encoding::PLAIN_DICTIONARY || encoding == Encoding::RLE_DICTIONARY { + let dict = dict_encoder + .write_dict() + .expect("write_dict() should be OK"); + let dict_page = Page::DictionaryPage { + buf: dict, + num_values: dict_encoder.num_entries() as u32, + encoding: Encoding::RLE_DICTIONARY, + is_sorted: false, + }; + pages.push_front(dict_page); + } + } +} diff --git a/rust/src/parquet/column/writer.rs b/rust/src/parquet/column/writer.rs new file mode 100644 index 0000000000000..4798d9ad17927 --- /dev/null +++ b/rust/src/parquet/column/writer.rs @@ -0,0 +1,1617 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains column writer API. + +use std::{cmp, collections::VecDeque, mem, rc::Rc}; + +use crate::parquet::basic::{Compression, Encoding, PageType, Type}; +use crate::parquet::column::page::{CompressedPage, Page, PageWriteSpec, PageWriter}; +use crate::parquet::compression::{create_codec, Codec}; +use crate::parquet::data_type::*; +use crate::parquet::encodings::{ + encoding::{get_encoder, DictEncoder, Encoder}, + levels::{max_buffer_size, LevelEncoder}, +}; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::file::{ + metadata::ColumnChunkMetaData, + properties::{WriterProperties, WriterPropertiesPtr, WriterVersion}, +}; +use crate::parquet::schema::types::ColumnDescPtr; +use crate::parquet::util::memory::{ByteBufferPtr, MemTracker}; + +/// Column writer for a Parquet type. +pub enum ColumnWriter { + BoolColumnWriter(ColumnWriterImpl), + Int32ColumnWriter(ColumnWriterImpl), + Int64ColumnWriter(ColumnWriterImpl), + Int96ColumnWriter(ColumnWriterImpl), + FloatColumnWriter(ColumnWriterImpl), + DoubleColumnWriter(ColumnWriterImpl), + ByteArrayColumnWriter(ColumnWriterImpl), + FixedLenByteArrayColumnWriter(ColumnWriterImpl), +} + +/// Gets a specific column writer corresponding to column descriptor `descr`. +pub fn get_column_writer( + descr: ColumnDescPtr, + props: WriterPropertiesPtr, + page_writer: Box, +) -> ColumnWriter { + match descr.physical_type() { + Type::BOOLEAN => { + ColumnWriter::BoolColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) + } + Type::INT32 => { + ColumnWriter::Int32ColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) + } + Type::INT64 => { + ColumnWriter::Int64ColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) + } + Type::INT96 => { + ColumnWriter::Int96ColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) + } + Type::FLOAT => { + ColumnWriter::FloatColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) + } + Type::DOUBLE => { + ColumnWriter::DoubleColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) + } + Type::BYTE_ARRAY => { + ColumnWriter::ByteArrayColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) + } + Type::FIXED_LEN_BYTE_ARRAY => ColumnWriter::FixedLenByteArrayColumnWriter( + ColumnWriterImpl::new(descr, props, page_writer), + ), + } +} + +/// Gets a typed column writer for the specific type `T`, by "up-casting" `col_writer` of +/// non-generic type to a generic column writer type `ColumnWriterImpl`. +/// +/// NOTE: the caller MUST guarantee that the actual enum value for `col_writer` matches +/// the type `T`. Otherwise, disastrous consequence could happen. 
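As a hedged sketch of that up-cast: the helper below is illustrative only (its name and parameters are not part of the API), and it assumes the descriptor really describes a required, non-nullable INT32 column.

```rust
use crate::parquet::column::page::PageWriter;
use crate::parquet::column::writer::{get_column_writer, get_typed_column_writer};
use crate::parquet::data_type::Int32Type;
use crate::parquet::errors::Result;
use crate::parquet::file::properties::WriterPropertiesPtr;
use crate::parquet::schema::types::ColumnDescPtr;

// Illustrative helper: `descr` is assumed to describe a required (non-nullable)
// INT32 column, otherwise the typed up-cast below is not sound.
fn write_one_int32_column(
    descr: ColumnDescPtr,
    props: WriterPropertiesPtr,
    page_writer: Box<PageWriter>,
) -> Result<()> {
    let untyped = get_column_writer(descr, props, page_writer);
    // Sound only because the physical type is known to be INT32.
    let mut writer = get_typed_column_writer::<Int32Type>(untyped);
    writer.write_batch(&[1, 2, 3, 4], None, None)?;
    let (_bytes_written, _rows_written, _metadata) = writer.close()?;
    Ok(())
}
```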
+pub fn get_typed_column_writer(col_writer: ColumnWriter) -> ColumnWriterImpl { + match col_writer { + ColumnWriter::BoolColumnWriter(r) => unsafe { mem::transmute(r) }, + ColumnWriter::Int32ColumnWriter(r) => unsafe { mem::transmute(r) }, + ColumnWriter::Int64ColumnWriter(r) => unsafe { mem::transmute(r) }, + ColumnWriter::Int96ColumnWriter(r) => unsafe { mem::transmute(r) }, + ColumnWriter::FloatColumnWriter(r) => unsafe { mem::transmute(r) }, + ColumnWriter::DoubleColumnWriter(r) => unsafe { mem::transmute(r) }, + ColumnWriter::ByteArrayColumnWriter(r) => unsafe { mem::transmute(r) }, + ColumnWriter::FixedLenByteArrayColumnWriter(r) => unsafe { mem::transmute(r) }, + } +} + +/// Typed column writer for a primitive column. +pub struct ColumnWriterImpl { + // Column writer properties + descr: ColumnDescPtr, + props: WriterPropertiesPtr, + page_writer: Box, + has_dictionary: bool, + dict_encoder: Option>, + encoder: Box>, + codec: Compression, + compressor: Option>, + // Metrics per page + num_buffered_values: u32, + num_buffered_encoded_values: u32, + num_buffered_rows: u32, + // Metrics per column writer + total_bytes_written: u64, + total_rows_written: u64, + total_uncompressed_size: u64, + total_compressed_size: u64, + total_num_values: u64, + dictionary_page_offset: Option, + data_page_offset: Option, + // Reused buffers + def_levels_sink: Vec, + rep_levels_sink: Vec, + data_pages: VecDeque, +} + +impl ColumnWriterImpl { + pub fn new( + descr: ColumnDescPtr, + props: WriterPropertiesPtr, + page_writer: Box, + ) -> Self { + let codec = props.compression(descr.path()); + let compressor = create_codec(codec).unwrap(); + + // Optionally set dictionary encoder. + let dict_encoder = + if props.dictionary_enabled(descr.path()) && Self::has_dictionary_support(&props) { + Some(DictEncoder::new(descr.clone(), Rc::new(MemTracker::new()))) + } else { + None + }; + + // Whether or not this column writer has a dictionary encoding. + let has_dictionary = dict_encoder.is_some(); + + // Set either main encoder or fallback encoder. + let fallback_encoder = get_encoder( + descr.clone(), + props + .encoding(descr.path()) + .unwrap_or(Self::fallback_encoding(&props)), + Rc::new(MemTracker::new()), + ) + .unwrap(); + + Self { + descr, + props, + page_writer, + has_dictionary, + dict_encoder, + encoder: fallback_encoder, + codec, + compressor, + num_buffered_values: 0, + num_buffered_encoded_values: 0, + num_buffered_rows: 0, + total_bytes_written: 0, + total_rows_written: 0, + total_uncompressed_size: 0, + total_compressed_size: 0, + total_num_values: 0, + dictionary_page_offset: None, + data_page_offset: None, + def_levels_sink: vec![], + rep_levels_sink: vec![], + data_pages: VecDeque::new(), + } + } + + /// Writes batch of values, definition levels and repetition levels. + /// Returns number of values processed (written). + /// + /// If definition and repetition levels are provided, we write fully those levels and + /// select how many values to write (this number will be returned), since number of + /// actual written values may be smaller than provided values. + /// + /// If only values are provided, then all values are written and the length of + /// of the values buffer is returned. + /// + /// Definition and/or repetition levels can be omitted, if values are + /// non-nullable and/or non-repeated. + pub fn write_batch( + &mut self, + values: &[T::T], + def_levels: Option<&[i16]>, + rep_levels: Option<&[i16]>, + ) -> Result { + // We check for DataPage limits only after we have inserted the values. 
If a user + // writes a large number of values, the DataPage size can be well above the limit. + // + // The purpose of this chunking is to bound this. Even if a user writes large number + // of values, the chunking will ensure that we add data page at a reasonable pagesize + // limit. + + // TODO: find out why we don't account for size of levels when we estimate page size. + + // Find out the minimal length to prevent index out of bound errors. + let mut min_len = values.len(); + if let Some(levels) = def_levels { + min_len = cmp::min(min_len, levels.len()); + } + if let Some(levels) = rep_levels { + min_len = cmp::min(min_len, levels.len()); + } + + // Find out number of batches to process. + let write_batch_size = self.props.write_batch_size(); + let num_batches = min_len / write_batch_size; + + let mut values_offset = 0; + let mut levels_offset = 0; + + for _ in 0..num_batches { + values_offset += self.write_mini_batch( + &values[values_offset..values_offset + write_batch_size], + def_levels.map(|lv| &lv[levels_offset..levels_offset + write_batch_size]), + rep_levels.map(|lv| &lv[levels_offset..levels_offset + write_batch_size]), + )?; + levels_offset += write_batch_size; + } + + values_offset += self.write_mini_batch( + &values[values_offset..], + def_levels.map(|lv| &lv[levels_offset..]), + rep_levels.map(|lv| &lv[levels_offset..]), + )?; + + // Return total number of values processed. + Ok(values_offset) + } + + /// Returns total number of bytes written by this column writer so far. + /// This value is also returned when column writer is closed. + pub fn get_total_bytes_written(&self) -> u64 { + self.total_bytes_written + } + + /// Returns total number of rows written by this column writer so far. + /// This value is also returned when column writer is closed. + pub fn get_total_rows_written(&self) -> u64 { + self.total_rows_written + } + + /// Finalises writes and closes the column writer. + /// Returns total bytes written, total rows written and column chunk metadata. + pub fn close(mut self) -> Result<(u64, u64, ColumnChunkMetaData)> { + if self.dict_encoder.is_some() { + self.write_dictionary_page()?; + } + self.flush_data_pages()?; + let metadata = self.write_column_metadata()?; + self.dict_encoder = None; + self.page_writer.close()?; + + Ok((self.total_bytes_written, self.total_rows_written, metadata)) + } + + /// Writes mini batch of values, definition and repetition levels. + /// This allows fine-grained processing of values and maintaining a reasonable + /// page size. + fn write_mini_batch( + &mut self, + values: &[T::T], + def_levels: Option<&[i16]>, + rep_levels: Option<&[i16]>, + ) -> Result { + let num_values; + let mut values_to_write = 0; + + // Check if number of definition levels is the same as number of repetition levels. + if def_levels.is_some() && rep_levels.is_some() { + let def = def_levels.unwrap(); + let rep = rep_levels.unwrap(); + if def.len() != rep.len() { + return Err(general_err!( + "Inconsistent length of definition and repetition levels: {} != {}", + def.len(), + rep.len() + )); + } + } + + // Process definition levels and determine how many values to write. 
+ if self.descr.max_def_level() > 0 { + if def_levels.is_none() { + return Err(general_err!( + "Definition levels are required, because max definition level = {}", + self.descr.max_def_level() + )); + } + + let levels = def_levels.unwrap(); + num_values = levels.len(); + for &level in levels { + values_to_write += (level == self.descr.max_def_level()) as usize; + } + + self.write_definition_levels(levels); + } else { + values_to_write = values.len(); + num_values = values_to_write; + } + + // Process repetition levels and determine how many rows we are about to process. + if self.descr.max_rep_level() > 0 { + // A row could contain more than one value. + if rep_levels.is_none() { + return Err(general_err!( + "Repetition levels are required, because max repetition level = {}", + self.descr.max_rep_level() + )); + } + + // Count the occasions where we start a new row + let levels = rep_levels.unwrap(); + for &level in levels { + self.num_buffered_rows += (level == 0) as u32 + } + + self.write_repetition_levels(levels); + } else { + // Each value is exactly one row. + // Equals to the number of values, we count nulls as well. + self.num_buffered_rows += num_values as u32; + } + + // Check that we have enough values to write. + if values.len() < values_to_write { + return Err(general_err!( + "Expected to write {} values, but have only {}", + values_to_write, + values.len() + )); + } + + // TODO: update page statistics + + self.write_values(&values[0..values_to_write])?; + + self.num_buffered_values += num_values as u32; + self.num_buffered_encoded_values += values_to_write as u32; + + if self.should_add_data_page() { + self.add_data_page()?; + } + + if self.should_dict_fallback() { + self.dict_fallback()?; + } + + Ok(values_to_write) + } + + #[inline] + fn write_definition_levels(&mut self, def_levels: &[i16]) { + self.def_levels_sink.extend_from_slice(def_levels); + } + + #[inline] + fn write_repetition_levels(&mut self, rep_levels: &[i16]) { + self.rep_levels_sink.extend_from_slice(rep_levels); + } + + #[inline] + fn write_values(&mut self, values: &[T::T]) -> Result<()> { + match self.dict_encoder { + Some(ref mut encoder) => encoder.put(values), + None => self.encoder.put(values), + } + } + + /// Returns true if we need to fall back to non-dictionary encoding. + /// + /// We can only fall back if dictionary encoder is set and we have exceeded dictionary + /// size. + #[inline] + fn should_dict_fallback(&self) -> bool { + match self.dict_encoder { + Some(ref encoder) => { + encoder.dict_encoded_size() >= self.props.dictionary_pagesize_limit() + } + None => false, + } + } + + /// Returns true if there is enough data for a data page, false otherwise. + #[inline] + fn should_add_data_page(&self) -> bool { + self.encoder.estimated_data_encoded_size() >= self.props.data_pagesize_limit() + } + + /// Performs dictionary fallback. + /// Prepares and writes dictionary and all data pages into page writer. + fn dict_fallback(&mut self) -> Result<()> { + // At this point we know that we need to fall back. + self.write_dictionary_page()?; + self.flush_data_pages()?; + self.dict_encoder = None; + Ok(()) + } + + /// Adds data page. + /// Data page is either buffered in case of dictionary encoding or written directly. 
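Before the page assembly below, a hedged worked example of the level bookkeeping above for a nullable, non-repeated column (the helper is illustrative; `writer` is assumed to wrap a column with `max_def_level == 1` and `max_rep_level == 0`):

```rust
use crate::parquet::column::writer::ColumnWriterImpl;
use crate::parquet::data_type::Int32Type;

// Three slots, one of them null: def_level == 1 marks a present value, 0 a null slot.
fn write_three_slots_one_null(writer: &mut ColumnWriterImpl<Int32Type>) {
    let def_levels = [1i16, 0, 1];
    let values = [10, 20]; // only the two present slots carry values
    let written = writer
        .write_batch(&values[..], Some(&def_levels[..]), None)
        .unwrap();
    // write_batch reports the number of values consumed, not the number of slots.
    assert_eq!(written, 2);
}
```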
+ fn add_data_page(&mut self) -> Result<()> { + // Extract encoded values + let value_bytes = match self.dict_encoder { + Some(ref mut encoder) => encoder.write_indices()?, + None => self.encoder.flush_buffer()?, + }; + + // Select encoding based on current encoder and writer version (v1 or v2). + let encoding = if self.dict_encoder.is_some() { + self.props.dictionary_data_page_encoding() + } else { + self.encoder.encoding() + }; + + let max_def_level = self.descr.max_def_level(); + let max_rep_level = self.descr.max_rep_level(); + + let compressed_page = match self.props.writer_version() { + WriterVersion::PARQUET_1_0 => { + let mut buffer = vec![]; + + if max_rep_level > 0 { + buffer.extend_from_slice( + &self.encode_levels_v1( + Encoding::RLE, + &self.rep_levels_sink[..], + max_rep_level, + )?[..], + ); + } + + if max_def_level > 0 { + buffer.extend_from_slice( + &self.encode_levels_v1( + Encoding::RLE, + &self.def_levels_sink[..], + max_def_level, + )?[..], + ); + } + + buffer.extend_from_slice(value_bytes.data()); + let uncompressed_size = buffer.len(); + + if let Some(ref mut cmpr) = self.compressor { + let mut compressed_buf = Vec::with_capacity(value_bytes.data().len()); + cmpr.compress(&buffer[..], &mut compressed_buf)?; + buffer = compressed_buf; + } + + let data_page = Page::DataPage { + buf: ByteBufferPtr::new(buffer), + num_values: self.num_buffered_values, + encoding, + def_level_encoding: Encoding::RLE, + rep_level_encoding: Encoding::RLE, + // TODO: process statistics + statistics: None, + }; + + CompressedPage::new(data_page, uncompressed_size) + } + WriterVersion::PARQUET_2_0 => { + let mut rep_levels_byte_len = 0; + let mut def_levels_byte_len = 0; + let mut buffer = vec![]; + + if max_rep_level > 0 { + let levels = self.encode_levels_v2(&self.rep_levels_sink[..], max_rep_level)?; + rep_levels_byte_len = levels.len(); + buffer.extend_from_slice(&levels[..]); + } + + if max_def_level > 0 { + let levels = self.encode_levels_v2(&self.def_levels_sink[..], max_def_level)?; + def_levels_byte_len = levels.len(); + buffer.extend_from_slice(&levels[..]); + } + + let uncompressed_size = + rep_levels_byte_len + def_levels_byte_len + value_bytes.len(); + + // Data Page v2 compresses values only. + match self.compressor { + Some(ref mut cmpr) => { + let mut compressed_buf = Vec::with_capacity(value_bytes.data().len()); + cmpr.compress(value_bytes.data(), &mut compressed_buf)?; + buffer.extend_from_slice(&compressed_buf[..]); + } + None => { + buffer.extend_from_slice(value_bytes.data()); + } + } + + let data_page = Page::DataPageV2 { + buf: ByteBufferPtr::new(buffer), + num_values: self.num_buffered_values, + encoding, + num_nulls: self.num_buffered_values - self.num_buffered_encoded_values, + num_rows: self.num_buffered_rows, + def_levels_byte_len: def_levels_byte_len as u32, + rep_levels_byte_len: rep_levels_byte_len as u32, + is_compressed: self.compressor.is_some(), + // TODO: process statistics + statistics: None, + }; + + CompressedPage::new(data_page, uncompressed_size) + } + }; + + // Check if we need to buffer data page or flush it to the sink directly. + if self.dict_encoder.is_some() { + self.data_pages.push_back(compressed_page); + } else { + self.write_data_page(compressed_page)?; + } + + // Update total number of rows. + self.total_rows_written += self.num_buffered_rows as u64; + + // Reset state. 
+ self.rep_levels_sink.clear(); + self.def_levels_sink.clear(); + self.num_buffered_values = 0; + self.num_buffered_encoded_values = 0; + self.num_buffered_rows = 0; + + Ok(()) + } + + /// Finalises any outstanding data pages and flushes buffered data pages from + /// dictionary encoding into underlying sink. + #[inline] + fn flush_data_pages(&mut self) -> Result<()> { + // Write all outstanding data to a new page. + if self.num_buffered_values > 0 { + self.add_data_page()?; + } + + while let Some(page) = self.data_pages.pop_front() { + self.write_data_page(page)?; + } + + Ok(()) + } + + /// Assembles and writes column chunk metadata. + fn write_column_metadata(&mut self) -> Result { + let total_compressed_size = self.total_compressed_size as i64; + let total_uncompressed_size = self.total_uncompressed_size as i64; + let num_values = self.total_num_values as i64; + let dict_page_offset = self.dictionary_page_offset.map(|v| v as i64); + // If data page offset is not set, then no pages have been written + let data_page_offset = self.data_page_offset.unwrap_or(0) as i64; + + let file_offset; + let mut encodings = Vec::new(); + + if self.has_dictionary { + assert!(dict_page_offset.is_some(), "Dictionary offset is not set"); + file_offset = dict_page_offset.unwrap() + total_compressed_size; + // NOTE: This should be in sync with writing dictionary pages. + encodings.push(self.props.dictionary_page_encoding()); + encodings.push(self.props.dictionary_data_page_encoding()); + // Fallback to alternative encoding, add it to the list. + if self.dict_encoder.is_none() { + encodings.push(self.encoder.encoding()); + } + } else { + file_offset = data_page_offset + total_compressed_size; + encodings.push(self.encoder.encoding()); + } + // We use only RLE level encoding for data page v1 and data page v2. + encodings.push(Encoding::RLE); + + let metadata = ColumnChunkMetaData::builder(self.descr.clone()) + .set_compression(self.codec) + .set_encodings(encodings) + .set_file_offset(file_offset) + .set_total_compressed_size(total_compressed_size) + .set_total_uncompressed_size(total_uncompressed_size) + .set_num_values(num_values) + .set_data_page_offset(data_page_offset) + .set_dictionary_page_offset(dict_page_offset) + .build()?; + + self.page_writer.write_metadata(&metadata)?; + + Ok(metadata) + } + + /// Encodes definition or repetition levels for Data Page v1. + #[inline] + fn encode_levels_v1( + &self, + encoding: Encoding, + levels: &[i16], + max_level: i16, + ) -> Result> { + let size = max_buffer_size(encoding, max_level, levels.len()); + let mut encoder = LevelEncoder::v1(encoding, max_level, vec![0; size]); + encoder.put(&levels)?; + encoder.consume() + } + + /// Encodes definition or repetition levels for Data Page v2. + /// Encoding is always RLE. + #[inline] + fn encode_levels_v2(&self, levels: &[i16], max_level: i16) -> Result> { + let size = max_buffer_size(Encoding::RLE, max_level, levels.len()); + let mut encoder = LevelEncoder::v2(max_level, vec![0; size]); + encoder.put(&levels)?; + encoder.consume() + } + + /// Writes compressed data page into underlying sink and updates global metrics. + #[inline] + fn write_data_page(&mut self, page: CompressedPage) -> Result<()> { + let page_spec = self.page_writer.write_page(page)?; + self.update_metrics_for_page(page_spec); + Ok(()) + } + + /// Writes dictionary page into underlying sink. 
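For reference, a hedged summary of the two buffer layouts assembled above:

```rust
// Data page v1: [rep levels (RLE, 4-byte length prefix)][def levels (same)][values]
//               -- the whole buffer is compressed as one unit.
// Data page v2: [rep levels (RLE, no prefix)][def levels (no prefix)][values]
//               -- only the values section is compressed; the level byte lengths
//                  travel in the page header (def_levels_byte_len / rep_levels_byte_len).
```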
+ #[inline] + fn write_dictionary_page(&mut self) -> Result<()> { + if self.dict_encoder.is_none() { + return Err(general_err!("Dictionary encoder is not set")); + } + + let compressed_page = { + let encoder = self.dict_encoder.as_ref().unwrap(); + let is_sorted = encoder.is_sorted(); + let num_values = encoder.num_entries(); + let mut values_buf = encoder.write_dict()?; + let uncompressed_size = values_buf.len(); + + if let Some(ref mut cmpr) = self.compressor { + let mut output_buf = Vec::with_capacity(uncompressed_size); + cmpr.compress(values_buf.data(), &mut output_buf)?; + values_buf = ByteBufferPtr::new(output_buf); + } + + let dict_page = Page::DictionaryPage { + buf: values_buf, + num_values: num_values as u32, + encoding: self.props.dictionary_page_encoding(), + is_sorted, + }; + CompressedPage::new(dict_page, uncompressed_size) + }; + + let page_spec = self.page_writer.write_page(compressed_page)?; + self.update_metrics_for_page(page_spec); + Ok(()) + } + + /// Updates column writer metrics with each page metadata. + #[inline] + fn update_metrics_for_page(&mut self, page_spec: PageWriteSpec) { + self.total_uncompressed_size += page_spec.uncompressed_size as u64; + self.total_compressed_size += page_spec.compressed_size as u64; + self.total_num_values += page_spec.num_values as u64; + self.total_bytes_written += page_spec.bytes_written; + + match page_spec.page_type { + PageType::DATA_PAGE | PageType::DATA_PAGE_V2 => { + if self.data_page_offset.is_none() { + self.data_page_offset = Some(page_spec.offset); + } + } + PageType::DICTIONARY_PAGE => { + assert!( + self.dictionary_page_offset.is_none(), + "Dictionary offset is already set" + ); + self.dictionary_page_offset = Some(page_spec.offset); + } + _ => {} + } + } + + /// Returns reference to the underlying page writer. + /// This method is intended to use in tests only. + fn get_page_writer_ref(&self) -> &Box { + &self.page_writer + } +} + +// ---------------------------------------------------------------------- +// Encoding support for column writer. +// This mirrors parquet-mr default encodings for writes. See: +// https://github.com/apache/parquet-mr/blob/master/parquet-column/src/main/java/org/apache/parquet/column/values/factory/DefaultV1ValuesWriterFactory.java +// https://github.com/apache/parquet-mr/blob/master/parquet-column/src/main/java/org/apache/parquet/column/values/factory/DefaultV2ValuesWriterFactory.java + +/// Trait to define default encoding for types, including whether or not the type +/// supports dictionary encoding. +trait EncodingWriteSupport { + /// Returns encoding for a column when no other encoding is provided in writer + /// properties. + fn fallback_encoding(props: &WriterProperties) -> Encoding; + + /// Returns true if dictionary is supported for column writer, false otherwise. + fn has_dictionary_support(props: &WriterProperties) -> bool; +} + +// Basic implementation, always falls back to PLAIN and supports dictionary. 
+impl EncodingWriteSupport for ColumnWriterImpl { + default fn fallback_encoding(_props: &WriterProperties) -> Encoding { + Encoding::PLAIN + } + + default fn has_dictionary_support(_props: &WriterProperties) -> bool { + true + } +} + +impl EncodingWriteSupport for ColumnWriterImpl { + fn fallback_encoding(props: &WriterProperties) -> Encoding { + match props.writer_version() { + WriterVersion::PARQUET_1_0 => Encoding::PLAIN, + WriterVersion::PARQUET_2_0 => Encoding::RLE, + } + } + + // Boolean column does not support dictionary encoding and should fall back to + // whatever fallback encoding is defined. + fn has_dictionary_support(_props: &WriterProperties) -> bool { + false + } +} + +impl EncodingWriteSupport for ColumnWriterImpl { + fn fallback_encoding(props: &WriterProperties) -> Encoding { + match props.writer_version() { + WriterVersion::PARQUET_1_0 => Encoding::PLAIN, + WriterVersion::PARQUET_2_0 => Encoding::DELTA_BINARY_PACKED, + } + } +} + +impl EncodingWriteSupport for ColumnWriterImpl { + fn fallback_encoding(props: &WriterProperties) -> Encoding { + match props.writer_version() { + WriterVersion::PARQUET_1_0 => Encoding::PLAIN, + WriterVersion::PARQUET_2_0 => Encoding::DELTA_BINARY_PACKED, + } + } +} + +impl EncodingWriteSupport for ColumnWriterImpl { + fn fallback_encoding(props: &WriterProperties) -> Encoding { + match props.writer_version() { + WriterVersion::PARQUET_1_0 => Encoding::PLAIN, + WriterVersion::PARQUET_2_0 => Encoding::DELTA_BYTE_ARRAY, + } + } +} + +impl EncodingWriteSupport for ColumnWriterImpl { + fn fallback_encoding(props: &WriterProperties) -> Encoding { + match props.writer_version() { + WriterVersion::PARQUET_1_0 => Encoding::PLAIN, + WriterVersion::PARQUET_2_0 => Encoding::DELTA_BYTE_ARRAY, + } + } + + fn has_dictionary_support(props: &WriterProperties) -> bool { + match props.writer_version() { + // Dictionary encoding was not enabled in PARQUET 1.0 + WriterVersion::PARQUET_1_0 => false, + WriterVersion::PARQUET_2_0 => true, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::error::Error; + + use rand::distributions::range::SampleRange; + + use crate::parquet::column::{ + page::PageReader, + reader::{get_column_reader, get_typed_column_reader, ColumnReaderImpl}, + }; + use crate::parquet::file::{ + properties::WriterProperties, reader::SerializedPageReader, writer::SerializedPageWriter, + }; + use crate::parquet::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; + use crate::parquet::util::{ + io::{FileSink, FileSource}, + test_common::{get_temp_file, random_numbers_range}, + }; + + #[test] + fn test_column_writer_inconsistent_def_rep_length() { + let page_writer = get_test_page_writer(); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = get_test_column_writer::(page_writer, 1, 1, props); + let res = writer.write_batch(&[1, 2, 3, 4], Some(&[1, 1, 1]), Some(&[0, 0])); + assert!(res.is_err()); + if let Err(err) = res { + assert_eq!( + err.description(), + "Inconsistent length of definition and repetition levels: 3 != 2" + ); + } + } + + #[test] + fn test_column_writer_invalid_def_levels() { + let page_writer = get_test_page_writer(); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = get_test_column_writer::(page_writer, 1, 0, props); + let res = writer.write_batch(&[1, 2, 3, 4], None, None); + assert!(res.is_err()); + if let Err(err) = res { + assert_eq!( + err.description(), + "Definition levels are required, because max definition level = 1" + ); + } + } + + 
#[test] + fn test_column_writer_invalid_rep_levels() { + let page_writer = get_test_page_writer(); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = get_test_column_writer::(page_writer, 0, 1, props); + let res = writer.write_batch(&[1, 2, 3, 4], None, None); + assert!(res.is_err()); + if let Err(err) = res { + assert_eq!( + err.description(), + "Repetition levels are required, because max repetition level = 1" + ); + } + } + + #[test] + fn test_column_writer_not_enough_values_to_write() { + let page_writer = get_test_page_writer(); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = get_test_column_writer::(page_writer, 1, 0, props); + let res = writer.write_batch(&[1, 2], Some(&[1, 1, 1, 1]), None); + assert!(res.is_err()); + if let Err(err) = res { + assert_eq!( + err.description(), + "Expected to write 4 values, but have only 2" + ); + } + } + + #[test] + #[should_panic(expected = "Dictionary offset is already set")] + fn test_column_writer_write_only_one_dictionary_page() { + let page_writer = get_test_page_writer(); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = get_test_column_writer::(page_writer, 0, 0, props); + writer.write_batch(&[1, 2, 3, 4], None, None).unwrap(); + // First page should be correctly written. + let res = writer.write_dictionary_page(); + assert!(res.is_ok()); + writer.write_dictionary_page().unwrap(); + } + + #[test] + fn test_column_writer_error_when_writing_disabled_dictionary() { + let page_writer = get_test_page_writer(); + let props = Rc::new( + WriterProperties::builder() + .set_dictionary_enabled(false) + .build(), + ); + let mut writer = get_test_column_writer::(page_writer, 0, 0, props); + writer.write_batch(&[1, 2, 3, 4], None, None).unwrap(); + let res = writer.write_dictionary_page(); + assert!(res.is_err()); + if let Err(err) = res { + assert_eq!(err.description(), "Dictionary encoder is not set"); + } + } + + #[test] + fn test_column_writer_boolean_type_does_not_support_dictionary() { + let page_writer = get_test_page_writer(); + let props = Rc::new( + WriterProperties::builder() + .set_dictionary_enabled(true) + .build(), + ); + let mut writer = get_test_column_writer::(page_writer, 0, 0, props); + writer + .write_batch(&[true, false, true, false], None, None) + .unwrap(); + + let (bytes_written, rows_written, metadata) = writer.close().unwrap(); + // PlainEncoder uses bit writer to write boolean values, which all fit into 1 byte. 
+ assert_eq!(bytes_written, 1); + assert_eq!(rows_written, 4); + assert_eq!(metadata.encodings(), &vec![Encoding::PLAIN, Encoding::RLE]); + assert_eq!(metadata.num_values(), 4); // just values + assert_eq!(metadata.dictionary_page_offset(), None); + } + + #[test] + fn test_column_writer_default_encoding_support_bool() { + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + true, + &[true, false], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + false, + &[true, false], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + true, + &[true, false], + None, + &[Encoding::RLE, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + false, + &[true, false], + None, + &[Encoding::RLE, Encoding::RLE], + ); + } + + #[test] + fn test_column_writer_default_encoding_support_int32() { + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + true, + &[1, 2], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + false, + &[1, 2], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + true, + &[1, 2], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + false, + &[1, 2], + None, + &[Encoding::DELTA_BINARY_PACKED, Encoding::RLE], + ); + } + + #[test] + fn test_column_writer_default_encoding_support_int64() { + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + true, + &[1, 2], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + false, + &[1, 2], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + true, + &[1, 2], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + false, + &[1, 2], + None, + &[Encoding::DELTA_BINARY_PACKED, Encoding::RLE], + ); + } + + #[test] + fn test_column_writer_default_encoding_support_int96() { + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + true, + &[Int96::from(vec![1, 2, 3])], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + false, + &[Int96::from(vec![1, 2, 3])], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + true, + &[Int96::from(vec![1, 2, 3])], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + false, + &[Int96::from(vec![1, 2, 3])], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + } + + #[test] + fn test_column_writer_default_encoding_support_float() { + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + true, + &[1.0, 2.0], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + false, + &[1.0, 2.0], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + true, + &[1.0, 2.0], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + 
check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + false, + &[1.0, 2.0], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + } + + #[test] + fn test_column_writer_default_encoding_support_double() { + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + true, + &[1.0, 2.0], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + false, + &[1.0, 2.0], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + true, + &[1.0, 2.0], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + false, + &[1.0, 2.0], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + } + + #[test] + fn test_column_writer_default_encoding_support_byte_array() { + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + true, + &[ByteArray::from(vec![1u8])], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + false, + &[ByteArray::from(vec![1u8])], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + true, + &[ByteArray::from(vec![1u8])], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + false, + &[ByteArray::from(vec![1u8])], + None, + &[Encoding::DELTA_BYTE_ARRAY, Encoding::RLE], + ); + } + + #[test] + fn test_column_writer_default_encoding_support_fixed_len_byte_array() { + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + true, + &[ByteArray::from(vec![1u8])], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + false, + &[ByteArray::from(vec![1u8])], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + true, + &[ByteArray::from(vec![1u8])], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + false, + &[ByteArray::from(vec![1u8])], + None, + &[Encoding::DELTA_BYTE_ARRAY, Encoding::RLE], + ); + } + + #[test] + fn test_column_writer_check_metadata() { + let page_writer = get_test_page_writer(); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = get_test_column_writer::(page_writer, 0, 0, props); + writer.write_batch(&[1, 2, 3, 4], None, None).unwrap(); + + let (bytes_written, rows_written, metadata) = writer.close().unwrap(); + assert_eq!(bytes_written, 20); + assert_eq!(rows_written, 4); + assert_eq!( + metadata.encodings(), + &vec![Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE] + ); + assert_eq!(metadata.num_values(), 8); // dictionary + value indexes + assert_eq!(metadata.compressed_size(), 20); + assert_eq!(metadata.uncompressed_size(), 20); + assert_eq!(metadata.data_page_offset(), 0); + assert_eq!(metadata.dictionary_page_offset(), Some(0)); + } + + #[test] + fn test_column_writer_empty_column_roundtrip() { + let props = WriterProperties::builder().build(); + column_roundtrip::("test_col_writer_rnd_1", props, &[], None, None); + } + + #[test] + fn test_column_writer_non_nullable_values_roundtrip() { + let props = WriterProperties::builder().build(); + column_roundtrip_random::( + "test_col_writer_rnd_2", + props, + 1024, + 
::std::i32::MIN, + ::std::i32::MAX, + 0, + 0, + ); + } + + #[test] + fn test_column_writer_nullable_non_repeated_values_roundtrip() { + let props = WriterProperties::builder().build(); + column_roundtrip_random::( + "test_column_writer_nullable_non_repeated_values_roundtrip", + props, + 1024, + ::std::i32::MIN, + ::std::i32::MAX, + 10, + 0, + ); + } + + #[test] + fn test_column_writer_nullable_repeated_values_roundtrip() { + let props = WriterProperties::builder().build(); + column_roundtrip_random::( + "test_col_writer_rnd_3", + props, + 1024, + ::std::i32::MIN, + ::std::i32::MAX, + 10, + 10, + ); + } + + #[test] + fn test_column_writer_dictionary_fallback_small_data_page() { + let props = WriterProperties::builder() + .set_dictionary_pagesize_limit(32) + .set_data_pagesize_limit(32) + .build(); + column_roundtrip_random::( + "test_col_writer_rnd_4", + props, + 1024, + ::std::i32::MIN, + ::std::i32::MAX, + 10, + 10, + ); + } + + #[test] + fn test_column_writer_small_write_batch_size() { + for i in vec![1, 2, 5, 10, 11, 1023] { + let props = WriterProperties::builder().set_write_batch_size(i).build(); + + column_roundtrip_random::( + "test_col_writer_rnd_5", + props, + 1024, + ::std::i32::MIN, + ::std::i32::MAX, + 10, + 10, + ); + } + } + + #[test] + fn test_column_writer_dictionary_disabled_v1() { + let props = WriterProperties::builder() + .set_writer_version(WriterVersion::PARQUET_1_0) + .set_dictionary_enabled(false) + .build(); + column_roundtrip_random::( + "test_col_writer_rnd_6", + props, + 1024, + ::std::i32::MIN, + ::std::i32::MAX, + 10, + 10, + ); + } + + #[test] + fn test_column_writer_dictionary_disabled_v2() { + let props = WriterProperties::builder() + .set_writer_version(WriterVersion::PARQUET_2_0) + .set_dictionary_enabled(false) + .build(); + column_roundtrip_random::( + "test_col_writer_rnd_7", + props, + 1024, + ::std::i32::MIN, + ::std::i32::MAX, + 10, + 10, + ); + } + + #[test] + fn test_column_writer_compression_v1() { + let props = WriterProperties::builder() + .set_writer_version(WriterVersion::PARQUET_1_0) + .set_compression(Compression::SNAPPY) + .build(); + column_roundtrip_random::( + "test_col_writer_rnd_8", + props, + 2048, + ::std::i32::MIN, + ::std::i32::MAX, + 10, + 10, + ); + } + + #[test] + fn test_column_writer_compression_v2() { + let props = WriterProperties::builder() + .set_writer_version(WriterVersion::PARQUET_2_0) + .set_compression(Compression::SNAPPY) + .build(); + column_roundtrip_random::( + "test_col_writer_rnd_9", + props, + 2048, + ::std::i32::MIN, + ::std::i32::MAX, + 10, + 10, + ); + } + + /// Performs write-read roundtrip with randomly generated values and levels. + /// `max_size` is maximum number of values or levels (if `max_def_level` > 0) to write + /// for a column. + fn column_roundtrip_random<'a, T: DataType>( + file_name: &'a str, + props: WriterProperties, + max_size: usize, + min_value: T::T, + max_value: T::T, + max_def_level: i16, + max_rep_level: i16, + ) where + T::T: PartialOrd + SampleRange + Copy, + { + let mut num_values: usize = 0; + + let mut buf: Vec = Vec::new(); + let def_levels = if max_def_level > 0 { + random_numbers_range(max_size, 0, max_def_level + 1, &mut buf); + for &dl in &buf[..] 
{ + if dl == max_def_level { + num_values += 1; + } + } + Some(&buf[..]) + } else { + num_values = max_size; + None + }; + + let mut buf: Vec = Vec::new(); + let rep_levels = if max_rep_level > 0 { + random_numbers_range(max_size, 0, max_rep_level + 1, &mut buf); + Some(&buf[..]) + } else { + None + }; + + let mut values: Vec = Vec::new(); + random_numbers_range(num_values, min_value, max_value, &mut values); + + column_roundtrip::(file_name, props, &values[..], def_levels, rep_levels); + } + + /// Performs write-read roundtrip and asserts written values and levels. + fn column_roundtrip<'a, T: DataType>( + file_name: &'a str, + props: WriterProperties, + values: &[T::T], + def_levels: Option<&[i16]>, + rep_levels: Option<&[i16]>, + ) { + let file = get_temp_file(file_name, &[]); + let sink = FileSink::new(&file); + let page_writer = Box::new(SerializedPageWriter::new(sink)); + + let max_def_level = match def_levels { + Some(buf) => *buf.iter().max().unwrap_or(&0i16), + None => 0i16, + }; + + let max_rep_level = match rep_levels { + Some(buf) => *buf.iter().max().unwrap_or(&0i16), + None => 0i16, + }; + + let mut max_batch_size = values.len(); + if let Some(levels) = def_levels { + max_batch_size = cmp::max(max_batch_size, levels.len()); + } + if let Some(levels) = rep_levels { + max_batch_size = cmp::max(max_batch_size, levels.len()); + } + + let mut writer = + get_test_column_writer::(page_writer, max_def_level, max_rep_level, Rc::new(props)); + + let values_written = writer.write_batch(values, def_levels, rep_levels).unwrap(); + assert_eq!(values_written, values.len()); + let (bytes_written, rows_written, column_metadata) = writer.close().unwrap(); + + let source = FileSource::new(&file, 0, bytes_written as usize); + let page_reader = Box::new( + SerializedPageReader::new( + source, + column_metadata.num_values(), + column_metadata.compression(), + T::get_physical_type(), + ) + .unwrap(), + ); + let reader = get_test_column_reader::(page_reader, max_def_level, max_rep_level); + + let mut actual_values = vec![T::T::default(); max_batch_size]; + let mut actual_def_levels = match def_levels { + Some(_) => Some(vec![0i16; max_batch_size]), + None => None, + }; + let mut actual_rep_levels = match rep_levels { + Some(_) => Some(vec![0i16; max_batch_size]), + None => None, + }; + + let (values_read, levels_read) = read_fully( + reader, + max_batch_size, + actual_def_levels.as_mut(), + actual_rep_levels.as_mut(), + actual_values.as_mut_slice(), + ); + + // Assert values, definition and repetition levels. + + assert_eq!(&actual_values[..values_read], values); + match actual_def_levels { + Some(ref vec) => assert_eq!(Some(&vec[..levels_read]), def_levels), + None => assert_eq!(None, def_levels), + } + match actual_rep_levels { + Some(ref vec) => assert_eq!(Some(&vec[..levels_read]), rep_levels), + None => assert_eq!(None, rep_levels), + } + + // Assert written rows. + + if let Some(levels) = actual_rep_levels { + let mut actual_rows_written = 0; + for l in levels { + if l == 0 { + actual_rows_written += 1; + } + } + assert_eq!(actual_rows_written, rows_written); + } else if actual_def_levels.is_some() { + assert_eq!(levels_read as u64, rows_written); + } else { + assert_eq!(values_read as u64, rows_written); + } + } + + /// Performs write of provided values and returns column metadata of those values. + /// Used to test encoding support for column writer. 
+ fn column_write_and_get_metadata( + props: WriterProperties, + values: &[T::T], + ) -> ColumnChunkMetaData { + let page_writer = get_test_page_writer(); + let props = Rc::new(props); + let mut writer = get_test_column_writer::(page_writer, 0, 0, props); + writer.write_batch(values, None, None).unwrap(); + let (_, _, metadata) = writer.close().unwrap(); + metadata + } + + // Function to use in tests for EncodingWriteSupport. This checks that dictionary + // offset and encodings to make sure that column writer uses provided by trait + // encodings. + fn check_encoding_write_support( + version: WriterVersion, + dict_enabled: bool, + data: &[T::T], + dictionary_page_offset: Option, + encodings: &[Encoding], + ) { + let props = WriterProperties::builder() + .set_writer_version(version) + .set_dictionary_enabled(dict_enabled) + .build(); + let meta = column_write_and_get_metadata::(props, data); + assert_eq!(meta.dictionary_page_offset(), dictionary_page_offset); + assert_eq!(meta.encodings(), &encodings); + } + + /// Reads one batch of data, considering that batch is large enough to capture all of + /// the values and levels. + fn read_fully( + mut reader: ColumnReaderImpl, + batch_size: usize, + mut def_levels: Option<&mut Vec>, + mut rep_levels: Option<&mut Vec>, + values: &mut [T::T], + ) -> (usize, usize) { + let actual_def_levels = match &mut def_levels { + Some(ref mut vec) => Some(&mut vec[..]), + None => None, + }; + let actual_rep_levels = match rep_levels { + Some(ref mut vec) => Some(&mut vec[..]), + None => None, + }; + reader + .read_batch(batch_size, actual_def_levels, actual_rep_levels, values) + .unwrap() + } + + /// Returns column writer. + fn get_test_column_writer( + page_writer: Box, + max_def_level: i16, + max_rep_level: i16, + props: WriterPropertiesPtr, + ) -> ColumnWriterImpl { + let descr = Rc::new(get_test_column_descr::(max_def_level, max_rep_level)); + let column_writer = get_column_writer(descr, props, page_writer); + get_typed_column_writer::(column_writer) + } + + /// Returns column reader. + fn get_test_column_reader( + page_reader: Box, + max_def_level: i16, + max_rep_level: i16, + ) -> ColumnReaderImpl { + let descr = Rc::new(get_test_column_descr::(max_def_level, max_rep_level)); + let column_reader = get_column_reader(descr, page_reader); + get_typed_column_reader::(column_reader) + } + + /// Returns descriptor for primitive column. + fn get_test_column_descr( + max_def_level: i16, + max_rep_level: i16, + ) -> ColumnDescriptor { + let path = ColumnPath::from("col"); + let tpe = SchemaType::primitive_type_builder("col", T::get_physical_type()) + // length is set for "encoding support" tests for FIXED_LEN_BYTE_ARRAY type, + // it should be no-op for other types + .with_length(1) + .build() + .unwrap(); + ColumnDescriptor::new(Rc::new(tpe), None, max_def_level, max_rep_level, path) + } + + /// Returns page writer that collects pages without serializing them. 
+ fn get_test_page_writer() -> Box { + Box::new(TestPageWriter {}) + } + + struct TestPageWriter {} + + impl PageWriter for TestPageWriter { + fn write_page(&mut self, page: CompressedPage) -> Result { + let mut res = PageWriteSpec::new(); + res.page_type = page.page_type(); + res.uncompressed_size = page.uncompressed_size(); + res.compressed_size = page.compressed_size(); + res.num_values = page.num_values(); + res.offset = 0; + res.bytes_written = page.data().len() as u64; + Ok(res) + } + + fn write_metadata(&mut self, _metadata: &ColumnChunkMetaData) -> Result<()> { + Ok(()) + } + + fn close(&mut self) -> Result<()> { + Ok(()) + } + } +} diff --git a/rust/src/parquet/compression.rs b/rust/src/parquet/compression.rs new file mode 100644 index 0000000000000..3690cca032361 --- /dev/null +++ b/rust/src/parquet/compression.rs @@ -0,0 +1,321 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains codec interface and supported codec implementations. +//! +//! See [`Compression`](`::basic::Compression`) enum for all available compression +//! algorithms. +//! +//! # Example +//! +//! ```rust +//! use arrow::parquet::{basic::Compression, compression::create_codec}; +//! +//! let mut codec = match create_codec(Compression::SNAPPY) { +//! Ok(Some(codec)) => codec, +//! _ => panic!(), +//! }; +//! +//! let data = vec![b'p', b'a', b'r', b'q', b'u', b'e', b't']; +//! let mut compressed = vec![]; +//! codec.compress(&data[..], &mut compressed).unwrap(); +//! +//! let mut output = vec![]; +//! codec.decompress(&compressed[..], &mut output).unwrap(); +//! +//! assert_eq!(output, data); +//! ``` + +use std::io::{self, Read, Write}; + +use brotli; +use flate2::{read, write, Compression}; +use lz4; +use snap::{decompress_len, max_compress_len, Decoder, Encoder}; +use zstd; + +use crate::parquet::basic::Compression as CodecType; +use crate::parquet::errors::{ParquetError, Result}; + +/// Parquet compression codec interface. +pub trait Codec { + /// Compresses data stored in slice `input_buf` and writes the compressed result + /// to `output_buf`. + /// Note that you'll need to call `clear()` before reusing the same `output_buf` across + /// different `compress` calls. + fn compress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result<()>; + + /// Decompresses data stored in slice `input_buf` and writes output to `output_buf`. + /// Returns the total number of bytes written. + fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result; +} + +/// Given the compression type `codec`, returns a codec used to compress and decompress +/// bytes for the compression type. +/// This returns `None` if the codec type is `UNCOMPRESSED`. 
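A hedged usage sketch for the `clear()` requirement noted on the `Codec` trait above (the helper name is illustrative):

```rust
use crate::parquet::basic::Compression as CodecType;
use crate::parquet::compression::create_codec;
use crate::parquet::errors::Result;

// Reuse one output buffer across two compress() calls.
fn compress_twice() -> Result<()> {
    let mut codec = create_codec(CodecType::SNAPPY)?.expect("SNAPPY yields a codec");
    let mut out = Vec::new();
    codec.compress(b"first chunk", &mut out)?;
    out.clear(); // required before reusing the same buffer
    codec.compress(b"second chunk", &mut out)?;
    Ok(())
}
```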
+pub fn create_codec(codec: CodecType) -> Result>> { + match codec { + CodecType::BROTLI => Ok(Some(Box::new(BrotliCodec::new()))), + CodecType::GZIP => Ok(Some(Box::new(GZipCodec::new()))), + CodecType::SNAPPY => Ok(Some(Box::new(SnappyCodec::new()))), + CodecType::LZ4 => Ok(Some(Box::new(LZ4Codec::new()))), + CodecType::ZSTD => Ok(Some(Box::new(ZSTDCodec::new()))), + CodecType::UNCOMPRESSED => Ok(None), + _ => Err(nyi_err!("The codec type {} is not supported yet", codec)), + } +} + +/// Codec for Snappy compression format. +pub struct SnappyCodec { + decoder: Decoder, + encoder: Encoder, +} + +impl SnappyCodec { + /// Creates new Snappy compression codec. + fn new() -> Self { + Self { + decoder: Decoder::new(), + encoder: Encoder::new(), + } + } +} + +impl Codec for SnappyCodec { + fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result { + let len = decompress_len(input_buf)?; + output_buf.resize(len, 0); + self.decoder + .decompress(input_buf, output_buf) + .map_err(|e| e.into()) + } + + fn compress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result<()> { + let required_len = max_compress_len(input_buf.len()); + if output_buf.len() < required_len { + output_buf.resize(required_len, 0); + } + let n = self.encoder.compress(input_buf, &mut output_buf[..])?; + output_buf.truncate(n); + Ok(()) + } +} + +/// Codec for GZIP compression algorithm. +pub struct GZipCodec {} + +impl GZipCodec { + /// Creates new GZIP compression codec. + fn new() -> Self { + Self {} + } +} + +impl Codec for GZipCodec { + fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result { + let mut decoder = read::GzDecoder::new(input_buf); + decoder.read_to_end(output_buf).map_err(|e| e.into()) + } + + fn compress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result<()> { + let mut encoder = write::GzEncoder::new(output_buf, Compression::default()); + encoder.write_all(input_buf)?; + encoder.try_finish().map_err(|e| e.into()) + } +} + +const BROTLI_DEFAULT_BUFFER_SIZE: usize = 4096; +const BROTLI_DEFAULT_COMPRESSION_QUALITY: u32 = 1; // supported levels 0-9 +const BROTLI_DEFAULT_LG_WINDOW_SIZE: u32 = 22; // recommended between 20-22 + +/// Codec for Brotli compression algorithm. +pub struct BrotliCodec {} + +impl BrotliCodec { + /// Creates new Brotli compression codec. + fn new() -> Self { + Self {} + } +} + +impl Codec for BrotliCodec { + fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result { + brotli::Decompressor::new(input_buf, BROTLI_DEFAULT_BUFFER_SIZE) + .read_to_end(output_buf) + .map_err(|e| e.into()) + } + + fn compress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result<()> { + let mut encoder = brotli::CompressorWriter::new( + output_buf, + BROTLI_DEFAULT_BUFFER_SIZE, + BROTLI_DEFAULT_COMPRESSION_QUALITY, + BROTLI_DEFAULT_LG_WINDOW_SIZE, + ); + encoder.write_all(&input_buf[..])?; + encoder.flush().map_err(|e| e.into()) + } +} + +const LZ4_BUFFER_SIZE: usize = 4096; + +/// Codec for LZ4 compression algorithm. +pub struct LZ4Codec {} + +impl LZ4Codec { + /// Creates new LZ4 compression codec. 
+ fn new() -> Self { + Self {} + } +} + +impl Codec for LZ4Codec { + fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result { + let mut decoder = lz4::Decoder::new(input_buf)?; + let mut buffer: [u8; LZ4_BUFFER_SIZE] = [0; LZ4_BUFFER_SIZE]; + let mut total_len = 0; + loop { + let len = decoder.read(&mut buffer)?; + if len == 0 { + break; + } + total_len += len; + output_buf.write_all(&buffer[0..len])?; + } + Ok(total_len) + } + + fn compress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result<()> { + let mut encoder = lz4::EncoderBuilder::new().build(output_buf)?; + let mut from = 0; + loop { + let to = ::std::cmp::min(from + LZ4_BUFFER_SIZE, input_buf.len()); + encoder.write_all(&input_buf[from..to])?; + from += LZ4_BUFFER_SIZE; + if from >= input_buf.len() { + break; + } + } + encoder.finish().1.map_err(|e| e.into()) + } +} + +/// Codec for Zstandard compression algorithm. +pub struct ZSTDCodec {} + +impl ZSTDCodec { + /// Creates new Zstandard compression codec. + fn new() -> Self { + Self {} + } +} + +/// Compression level (1-21) for ZSTD. Choose 1 here for better compression speed. +const ZSTD_COMPRESSION_LEVEL: i32 = 1; + +impl Codec for ZSTDCodec { + fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result { + let mut decoder = zstd::Decoder::new(input_buf)?; + match io::copy(&mut decoder, output_buf) { + Ok(n) => Ok(n as usize), + Err(e) => Err(e.into()), + } + } + + fn compress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result<()> { + let mut encoder = zstd::Encoder::new(output_buf, ZSTD_COMPRESSION_LEVEL)?; + encoder.write_all(&input_buf[..])?; + match encoder.finish() { + Ok(_) => Ok(()), + Err(e) => Err(e.into()), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::parquet::util::test_common::*; + + fn test_roundtrip(c: CodecType, data: &Vec) { + let mut c1 = create_codec(c).unwrap().unwrap(); + let mut c2 = create_codec(c).unwrap().unwrap(); + + // Compress with c1 + let mut compressed = Vec::new(); + let mut decompressed = Vec::new(); + c1.compress(data.as_slice(), &mut compressed) + .expect("Error when compressing"); + + // Decompress with c2 + let mut decompressed_size = c2 + .decompress(compressed.as_slice(), &mut decompressed) + .expect("Error when decompressing"); + assert_eq!(data.len(), decompressed_size); + decompressed.truncate(decompressed_size); + assert_eq!(*data, decompressed); + + compressed.clear(); + + // Compress with c2 + c2.compress(data.as_slice(), &mut compressed) + .expect("Error when compressing"); + + // Decompress with c1 + decompressed_size = c1 + .decompress(compressed.as_slice(), &mut decompressed) + .expect("Error when decompressing"); + assert_eq!(data.len(), decompressed_size); + decompressed.truncate(decompressed_size); + assert_eq!(*data, decompressed); + } + + fn test_codec(c: CodecType) { + let sizes = vec![100, 10000, 100000]; + for size in sizes { + let mut data = random_bytes(size); + test_roundtrip(c, &mut data); + } + } + + #[test] + fn test_codec_snappy() { + test_codec(CodecType::SNAPPY); + } + + #[test] + fn test_codec_gzip() { + test_codec(CodecType::GZIP); + } + + #[test] + fn test_codec_brotli() { + test_codec(CodecType::BROTLI); + } + + #[test] + fn test_codec_lz4() { + test_codec(CodecType::LZ4); + } + + #[test] + fn test_codec_zstd() { + test_codec(CodecType::ZSTD); + } + +} diff --git a/rust/src/parquet/data_type.rs b/rust/src/parquet/data_type.rs new file mode 100644 index 0000000000000..26bdebd71bc8b --- /dev/null +++ b/rust/src/parquet/data_type.rs 
@@ -0,0 +1,463 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Data types that connect Parquet physical types with their Rust-specific +//! representations. + +use std::mem; + +use byteorder::{BigEndian, ByteOrder}; + +use crate::parquet::basic::Type; +use crate::parquet::util::memory::{ByteBuffer, ByteBufferPtr}; + +/// Rust representation for logical type INT96, value is backed by an array of `u32`. +/// The type only takes 12 bytes, without extra padding. +#[derive(Clone, Debug)] +pub struct Int96 { + value: Option<[u32; 3]>, +} + +impl Int96 { + /// Creates new INT96 type struct with no data set. + pub fn new() -> Self { + Self { value: None } + } + + /// Returns underlying data as slice of [`u32`]. + pub fn data(&self) -> &[u32] { + assert!(self.value.is_some()); + self.value.as_ref().unwrap() + } + + /// Sets data for this INT96 type. + pub fn set_data(&mut self, elem0: u32, elem1: u32, elem2: u32) { + self.value = Some([elem0, elem1, elem2]); + } +} + +impl Default for Int96 { + fn default() -> Self { + Self { value: None } + } +} + +impl PartialEq for Int96 { + fn eq(&self, other: &Int96) -> bool { + self.data() == other.data() + } +} + +impl From> for Int96 { + fn from(buf: Vec) -> Self { + assert_eq!(buf.len(), 3); + let mut result = Self::new(); + result.set_data(buf[0], buf[1], buf[2]); + result + } +} + +/// Rust representation for BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY Parquet physical types. +/// Value is backed by a byte buffer. +#[derive(Clone, Debug)] +pub struct ByteArray { + data: Option, +} + +impl ByteArray { + /// Creates new byte array with no data set. + pub fn new() -> Self { + ByteArray { data: None } + } + + /// Gets length of the underlying byte buffer. + pub fn len(&self) -> usize { + assert!(self.data.is_some()); + self.data.as_ref().unwrap().len() + } + + /// Returns slice of data. + pub fn data(&self) -> &[u8] { + assert!(self.data.is_some()); + self.data.as_ref().unwrap().as_ref() + } + + /// Set data from another byte buffer. + pub fn set_data(&mut self, data: ByteBufferPtr) { + self.data = Some(data); + } + + /// Returns `ByteArray` instance with slice of values for a data. 
+ pub fn slice(&self, start: usize, len: usize) -> Self { + assert!(self.data.is_some()); + Self::from(self.data.as_ref().unwrap().range(start, len)) + } +} + +impl From> for ByteArray { + fn from(buf: Vec) -> ByteArray { + Self { + data: Some(ByteBufferPtr::new(buf)), + } + } +} + +impl<'a> From<&'a str> for ByteArray { + fn from(s: &'a str) -> ByteArray { + let mut v = Vec::new(); + v.extend_from_slice(s.as_bytes()); + Self { + data: Some(ByteBufferPtr::new(v)), + } + } +} + +impl From for ByteArray { + fn from(ptr: ByteBufferPtr) -> ByteArray { + Self { data: Some(ptr) } + } +} + +impl From for ByteArray { + fn from(mut buf: ByteBuffer) -> ByteArray { + Self { + data: Some(buf.consume()), + } + } +} + +impl Default for ByteArray { + fn default() -> Self { + ByteArray { data: None } + } +} + +impl PartialEq for ByteArray { + fn eq(&self, other: &ByteArray) -> bool { + self.data() == other.data() + } +} + +/// Rust representation for Decimal values. +/// +/// This is not a representation of Parquet physical type, but rather a wrapper for +/// DECIMAL logical type, and serves as container for raw parts of decimal values: +/// unscaled value in bytes, precision and scale. +#[derive(Clone, Debug)] +pub enum Decimal { + /// Decimal backed by `i32`. + Int32 { + value: [u8; 4], + precision: i32, + scale: i32, + }, + /// Decimal backed by `i64`. + Int64 { + value: [u8; 8], + precision: i32, + scale: i32, + }, + /// Decimal backed by byte array. + Bytes { + value: ByteArray, + precision: i32, + scale: i32, + }, +} + +impl Decimal { + /// Creates new decimal value from `i32`. + pub fn from_i32(value: i32, precision: i32, scale: i32) -> Self { + let mut bytes = [0; 4]; + BigEndian::write_i32(&mut bytes, value); + Decimal::Int32 { + value: bytes, + precision, + scale, + } + } + + /// Creates new decimal value from `i64`. + pub fn from_i64(value: i64, precision: i32, scale: i32) -> Self { + let mut bytes = [0; 8]; + BigEndian::write_i64(&mut bytes, value); + Decimal::Int64 { + value: bytes, + precision, + scale, + } + } + + /// Creates new decimal value from `ByteArray`. + pub fn from_bytes(value: ByteArray, precision: i32, scale: i32) -> Self { + Decimal::Bytes { + value, + precision, + scale, + } + } + + /// Returns bytes of unscaled value. + pub fn data(&self) -> &[u8] { + match *self { + Decimal::Int32 { ref value, .. } => value, + Decimal::Int64 { ref value, .. } => value, + Decimal::Bytes { ref value, .. } => value.data(), + } + } + + /// Returns decimal precision. + pub fn precision(&self) -> i32 { + match *self { + Decimal::Int32 { precision, .. } => precision, + Decimal::Int64 { precision, .. } => precision, + Decimal::Bytes { precision, .. } => precision, + } + } + + /// Returns decimal scale. + pub fn scale(&self) -> i32 { + match *self { + Decimal::Int32 { scale, .. } => scale, + Decimal::Int64 { scale, .. } => scale, + Decimal::Bytes { scale, .. } => scale, + } + } +} + +impl Default for Decimal { + fn default() -> Self { + Self::from_i32(0, 0, 0) + } +} + +impl PartialEq for Decimal { + fn eq(&self, other: &Decimal) -> bool { + self.precision() == other.precision() + && self.scale() == other.scale() + && self.data() == other.data() + } +} + +/// Converts an instance of data type to a slice of bytes as `u8`. +pub trait AsBytes { + /// Returns slice of bytes for this data type. + fn as_bytes(&self) -> &[u8]; +} + +macro_rules! 
gen_as_bytes { + ($source_ty:ident) => { + impl AsBytes for $source_ty { + fn as_bytes(&self) -> &[u8] { + unsafe { + ::std::slice::from_raw_parts( + self as *const $source_ty as *const u8, + ::std::mem::size_of::<$source_ty>(), + ) + } + } + } + }; +} + +gen_as_bytes!(bool); +gen_as_bytes!(u8); +gen_as_bytes!(i32); +gen_as_bytes!(u32); +gen_as_bytes!(i64); +gen_as_bytes!(f32); +gen_as_bytes!(f64); + +impl AsBytes for Int96 { + fn as_bytes(&self) -> &[u8] { + unsafe { ::std::slice::from_raw_parts(self.data() as *const [u32] as *const u8, 12) } + } +} + +impl AsBytes for ByteArray { + fn as_bytes(&self) -> &[u8] { + self.data() + } +} + +impl AsBytes for Decimal { + fn as_bytes(&self) -> &[u8] { + self.data() + } +} + +impl AsBytes for Vec { + fn as_bytes(&self) -> &[u8] { + self.as_slice() + } +} + +impl<'a> AsBytes for &'a str { + fn as_bytes(&self) -> &[u8] { + (self as &str).as_bytes() + } +} + +impl AsBytes for str { + fn as_bytes(&self) -> &[u8] { + (self as &str).as_bytes() + } +} + +/// Contains the Parquet physical type information as well as the Rust primitive type +/// presentation. +pub trait DataType: 'static { + type T: ::std::cmp::PartialEq + + ::std::fmt::Debug + + ::std::default::Default + + ::std::clone::Clone + + AsBytes; + + /// Returns Parquet physical type. + fn get_physical_type() -> Type; + + /// Returns size in bytes for Rust representation of the physical type. + fn get_type_size() -> usize; +} + +macro_rules! make_type { + ($name:ident, $physical_ty:path, $native_ty:ty, $size:expr) => { + pub struct $name {} + + impl DataType for $name { + type T = $native_ty; + + fn get_physical_type() -> Type { + $physical_ty + } + + fn get_type_size() -> usize { + $size + } + } + }; +} + +/// Generate struct definitions for all physical types + +make_type!(BoolType, Type::BOOLEAN, bool, 1); +make_type!(Int32Type, Type::INT32, i32, 4); +make_type!(Int64Type, Type::INT64, i64, 8); +make_type!(Int96Type, Type::INT96, Int96, mem::size_of::()); +make_type!(FloatType, Type::FLOAT, f32, 4); +make_type!(DoubleType, Type::DOUBLE, f64, 8); +make_type!( + ByteArrayType, + Type::BYTE_ARRAY, + ByteArray, + mem::size_of::() +); +make_type!( + FixedLenByteArrayType, + Type::FIXED_LEN_BYTE_ARRAY, + ByteArray, + mem::size_of::() +); + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_as_bytes() { + assert_eq!(false.as_bytes(), &[0]); + assert_eq!(true.as_bytes(), &[1]); + assert_eq!((7 as i32).as_bytes(), &[7, 0, 0, 0]); + assert_eq!((555 as i32).as_bytes(), &[43, 2, 0, 0]); + assert_eq!((555 as u32).as_bytes(), &[43, 2, 0, 0]); + assert_eq!(i32::max_value().as_bytes(), &[255, 255, 255, 127]); + assert_eq!(i32::min_value().as_bytes(), &[0, 0, 0, 128]); + assert_eq!((7 as i64).as_bytes(), &[7, 0, 0, 0, 0, 0, 0, 0]); + assert_eq!((555 as i64).as_bytes(), &[43, 2, 0, 0, 0, 0, 0, 0]); + assert_eq!( + (i64::max_value()).as_bytes(), + &[255, 255, 255, 255, 255, 255, 255, 127] + ); + assert_eq!((i64::min_value()).as_bytes(), &[0, 0, 0, 0, 0, 0, 0, 128]); + assert_eq!((3.14 as f32).as_bytes(), &[195, 245, 72, 64]); + assert_eq!( + (3.14 as f64).as_bytes(), + &[31, 133, 235, 81, 184, 30, 9, 64] + ); + assert_eq!("hello".as_bytes(), &[b'h', b'e', b'l', b'l', b'o']); + assert_eq!( + Vec::from("hello".as_bytes()).as_bytes(), + &[b'h', b'e', b'l', b'l', b'o'] + ); + + // Test Int96 + let i96 = Int96::from(vec![1, 2, 3]); + assert_eq!(i96.as_bytes(), &[1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0]); + + // Test ByteArray + let ba = ByteArray::from(vec![1, 2, 3]); + assert_eq!(ba.as_bytes(), &[1, 2, 3]); 
+ + // Test Decimal + let decimal = Decimal::from_i32(123, 5, 2); + assert_eq!(decimal.as_bytes(), &[0, 0, 0, 123]); + let decimal = Decimal::from_i64(123, 5, 2); + assert_eq!(decimal.as_bytes(), &[0, 0, 0, 0, 0, 0, 0, 123]); + let decimal = Decimal::from_bytes(ByteArray::from(vec![1, 2, 3]), 5, 2); + assert_eq!(decimal.as_bytes(), &[1, 2, 3]); + } + + #[test] + fn test_int96_from() { + assert_eq!( + Int96::from(vec![1, 12345, 1234567890]).data(), + &[1, 12345, 1234567890] + ); + } + + #[test] + fn test_byte_array_from() { + assert_eq!( + ByteArray::from(vec![b'A', b'B', b'C']).data(), + &[b'A', b'B', b'C'] + ); + assert_eq!(ByteArray::from("ABC").data(), &[b'A', b'B', b'C']); + assert_eq!( + ByteArray::from(ByteBufferPtr::new(vec![1u8, 2u8, 3u8, 4u8, 5u8])).data(), + &[1u8, 2u8, 3u8, 4u8, 5u8] + ); + let mut buf = ByteBuffer::new(); + buf.set_data(vec![6u8, 7u8, 8u8, 9u8, 10u8]); + assert_eq!(ByteArray::from(buf).data(), &[6u8, 7u8, 8u8, 9u8, 10u8]); + } + + #[test] + fn test_decimal_partial_eq() { + assert_eq!(Decimal::default(), Decimal::from_i32(0, 0, 0)); + assert_eq!(Decimal::from_i32(222, 5, 2), Decimal::from_i32(222, 5, 2)); + assert_eq!( + Decimal::from_bytes(ByteArray::from(vec![0, 0, 0, 3]), 5, 2), + Decimal::from_i32(3, 5, 2) + ); + + assert!(Decimal::from_i32(222, 5, 2) != Decimal::from_i32(111, 5, 2)); + assert!(Decimal::from_i32(222, 5, 2) != Decimal::from_i32(222, 6, 2)); + assert!(Decimal::from_i32(222, 5, 2) != Decimal::from_i32(222, 5, 3)); + + assert!(Decimal::from_i64(222, 5, 2) != Decimal::from_i32(222, 5, 2)); + } +} diff --git a/rust/src/parquet/encodings/decoding.rs b/rust/src/parquet/encodings/decoding.rs new file mode 100644 index 0000000000000..c6a6fd49ee336 --- /dev/null +++ b/rust/src/parquet/encodings/decoding.rs @@ -0,0 +1,1403 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains all supported decoders for Parquet. + +use std::{cmp, marker::PhantomData, mem, slice::from_raw_parts_mut}; + +use super::rle::RleDecoder; + +use byteorder::{ByteOrder, LittleEndian}; + +use crate::parquet::basic::*; +use crate::parquet::data_type::*; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::schema::types::ColumnDescPtr; +use crate::parquet::util::{ + bit_util::BitReader, + memory::{ByteBuffer, ByteBufferPtr}, +}; + +// ---------------------------------------------------------------------- +// Decoders + +/// A Parquet decoder for the data type `T`. +pub trait Decoder { + /// Sets the data to decode to be `data`, which should contain `num_values` of values + /// to decode. + fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()>; + + /// Consumes values from this decoder and write the results to `buffer`. This will try + /// to fill up `buffer`. 
+ /// + /// Returns the actual number of values decoded, which should be equal to `buffer.len()` + /// unless the remaining number of values is less than `buffer.len()`. + fn get(&mut self, buffer: &mut [T::T]) -> Result; + + /// Returns the number of values left in this decoder stream. + fn values_left(&self) -> usize; + + /// Returns the encoding for this decoder. + fn encoding(&self) -> Encoding; +} + +/// Gets a decoder for the column descriptor `descr` and encoding type `encoding`. +/// +/// NOTE: the primitive type in `descr` MUST match the data type `T`, otherwise +/// disastrous consequence could occur. +pub fn get_decoder( + descr: ColumnDescPtr, + encoding: Encoding, +) -> Result>> { + let decoder: Box> = match encoding { + Encoding::PLAIN => Box::new(PlainDecoder::new(descr.type_length())), + Encoding::RLE_DICTIONARY | Encoding::PLAIN_DICTIONARY => { + return Err(general_err!( + "Cannot initialize this encoding through this function" + )); + } + Encoding::RLE => Box::new(RleValueDecoder::new()), + Encoding::DELTA_BINARY_PACKED => Box::new(DeltaBitPackDecoder::new()), + Encoding::DELTA_LENGTH_BYTE_ARRAY => Box::new(DeltaLengthByteArrayDecoder::new()), + Encoding::DELTA_BYTE_ARRAY => Box::new(DeltaByteArrayDecoder::new()), + e => return Err(nyi_err!("Encoding {} is not supported", e)), + }; + Ok(decoder) +} + +// ---------------------------------------------------------------------- +// PLAIN Decoding + +/// Plain decoding that supports all types. +/// Values are encoded back to back. For native types, data is encoded as little endian. +/// Floating point types are encoded in IEEE. +/// See [`PlainDecoder`](`::encoding::PlainEncoder`) for more information. +pub struct PlainDecoder { + // The remaining number of values in the byte array + num_values: usize, + + // The current starting index in the byte array. + start: usize, + + // The length for the type `T`. Only used when `T` is `FixedLenByteArrayType` + type_length: i32, + + // The byte array to decode from. Not set if `T` is bool. + data: Option, + + // Read `data` bit by bit. Only set if `T` is bool. + bit_reader: Option, + + // To allow `T` in the generic parameter for this struct. This doesn't take any space. + _phantom: PhantomData, +} + +impl PlainDecoder { + /// Creates new plain decoder. 
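    // Editorial sketch (not part of the patch): a typical decode loop against the
    // `Decoder` trait above. `descr` is a hypothetical ColumnDescPtr for an INT32
    // column and `page` a buffer holding three little-endian i32 values.
    fn _plain_decode_sketch(descr: ColumnDescPtr, page: ByteBufferPtr) -> Result<Vec<i32>> {
        let mut decoder = get_decoder::<Int32Type>(descr, Encoding::PLAIN)?;
        decoder.set_data(page, 3)?;
        let mut values = vec![0i32; 3];
        // `get` fills as much of the buffer as it can and reports how many values it wrote.
        let read = decoder.get(&mut values)?;
        values.truncate(read);
        Ok(values)
    }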
+ pub fn new(type_length: i32) -> Self { + PlainDecoder { + data: None, + bit_reader: None, + type_length, + num_values: 0, + start: 0, + _phantom: PhantomData, + } + } +} + +impl Decoder for PlainDecoder { + #[inline] + default fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + self.num_values = num_values; + self.start = 0; + self.data = Some(data); + Ok(()) + } + + #[inline] + fn values_left(&self) -> usize { + self.num_values + } + + #[inline] + fn encoding(&self) -> Encoding { + Encoding::PLAIN + } + + #[inline] + default fn get(&mut self, buffer: &mut [T::T]) -> Result { + assert!(self.data.is_some()); + + let data = self.data.as_mut().unwrap(); + let num_values = cmp::min(buffer.len(), self.num_values); + let bytes_left = data.len() - self.start; + let bytes_to_decode = mem::size_of::() * num_values; + if bytes_left < bytes_to_decode { + return Err(eof_err!("Not enough bytes to decode")); + } + let raw_buffer: &mut [u8] = + unsafe { from_raw_parts_mut(buffer.as_ptr() as *mut u8, bytes_to_decode) }; + raw_buffer.copy_from_slice(data.range(self.start, bytes_to_decode).as_ref()); + self.start += bytes_to_decode; + self.num_values -= num_values; + + Ok(num_values) + } +} + +impl Decoder for PlainDecoder { + fn get(&mut self, buffer: &mut [Int96]) -> Result { + assert!(self.data.is_some()); + + let data = self.data.as_ref().unwrap(); + let num_values = cmp::min(buffer.len(), self.num_values); + let bytes_left = data.len() - self.start; + let bytes_to_decode = 12 * num_values; + if bytes_left < bytes_to_decode { + return Err(eof_err!("Not enough bytes to decode")); + } + + let data_range = data.range(self.start, bytes_to_decode); + let bytes: &[u8] = data_range.data(); + self.start += bytes_to_decode; + + let mut pos = 0; // position in byte array + for i in 0..num_values { + let elem0 = LittleEndian::read_u32(&bytes[pos..pos + 4]); + let elem1 = LittleEndian::read_u32(&bytes[pos + 4..pos + 8]); + let elem2 = LittleEndian::read_u32(&bytes[pos + 8..pos + 12]); + buffer[i].set_data(elem0, elem1, elem2); + pos += 12; + } + self.num_values -= num_values; + + Ok(num_values) + } +} + +impl Decoder for PlainDecoder { + fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + self.num_values = num_values; + self.bit_reader = Some(BitReader::new(data)); + Ok(()) + } + + fn get(&mut self, buffer: &mut [bool]) -> Result { + assert!(self.bit_reader.is_some()); + + let bit_reader = self.bit_reader.as_mut().unwrap(); + let values_read = bit_reader.get_batch::(buffer, 1); + self.num_values -= values_read; + + Ok(values_read) + } +} + +impl Decoder for PlainDecoder { + fn get(&mut self, buffer: &mut [ByteArray]) -> Result { + assert!(self.data.is_some()); + + let data = self.data.as_mut().unwrap(); + let num_values = cmp::min(buffer.len(), self.num_values); + for i in 0..num_values { + let len: usize = read_num_bytes!(u32, 4, data.start_from(self.start).as_ref()) as usize; + self.start += mem::size_of::(); + if data.len() < self.start + len { + return Err(eof_err!("Not enough bytes to decode")); + } + buffer[i].set_data(data.range(self.start, len)); + self.start += len; + } + self.num_values -= num_values; + + Ok(num_values) + } +} + +impl Decoder for PlainDecoder { + fn get(&mut self, buffer: &mut [ByteArray]) -> Result { + assert!(self.data.is_some()); + assert!(self.type_length > 0); + + let data = self.data.as_mut().unwrap(); + let type_length = self.type_length as usize; + let num_values = cmp::min(buffer.len(), self.num_values); + for i in 
0..num_values { + if data.len() < self.start + type_length { + return Err(eof_err!("Not enough bytes to decode")); + } + buffer[i].set_data(data.range(self.start, type_length)); + self.start += type_length; + } + self.num_values -= num_values; + + Ok(num_values) + } +} + +// ---------------------------------------------------------------------- +// RLE_DICTIONARY/PLAIN_DICTIONARY Decoding + +/// Dictionary decoder. +/// The dictionary encoding builds a dictionary of values encountered in a given column. +/// The dictionary is be stored in a dictionary page per column chunk. +/// See [`DictEncoder`](`::encoding::DictEncoder`) for more information. +pub struct DictDecoder { + // The dictionary, which maps ids to the values + dictionary: Vec, + + // Whether `dictionary` has been initialized + has_dictionary: bool, + + // The decoder for the value ids + rle_decoder: Option, + + // Number of values left in the data stream + num_values: usize, +} + +impl DictDecoder { + /// Creates new dictionary decoder. + pub fn new() -> Self { + Self { + dictionary: vec![], + has_dictionary: false, + rle_decoder: None, + num_values: 0, + } + } + + /// Decodes and sets values for dictionary using `decoder` decoder. + pub fn set_dict(&mut self, mut decoder: Box>) -> Result<()> { + let num_values = decoder.values_left(); + self.dictionary.resize(num_values, T::T::default()); + let _ = decoder.get(&mut self.dictionary)?; + self.has_dictionary = true; + Ok(()) + } +} + +impl Decoder for DictDecoder { + fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + // First byte in `data` is bit width + let bit_width = data.as_ref()[0]; + let mut rle_decoder = RleDecoder::new(bit_width); + rle_decoder.set_data(data.start_from(1)); + self.num_values = num_values; + self.rle_decoder = Some(rle_decoder); + Ok(()) + } + + fn get(&mut self, buffer: &mut [T::T]) -> Result { + assert!(self.rle_decoder.is_some()); + assert!(self.has_dictionary, "Must call set_dict() first!"); + + let rle = self.rle_decoder.as_mut().unwrap(); + let num_values = cmp::min(buffer.len(), self.num_values); + rle.get_batch_with_dict(&self.dictionary[..], buffer, num_values) + } + + /// Number of values left in this decoder stream + fn values_left(&self) -> usize { + self.num_values + } + + fn encoding(&self) -> Encoding { + Encoding::RLE_DICTIONARY + } +} + +// ---------------------------------------------------------------------- +// RLE Decoding + +/// RLE/Bit-Packing hybrid decoding for values. +/// Currently is used only for data pages v2 and supports boolean types. +/// See [`RleValueEncoder`](`::encoding::RleValueEncoder`) for more information. +pub struct RleValueDecoder { + values_left: usize, + decoder: Option, + _phantom: PhantomData, +} + +impl RleValueDecoder { + pub fn new() -> Self { + Self { + values_left: 0, + decoder: None, + _phantom: PhantomData, + } + } + + #[inline] + fn set_data_internal(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + // We still need to remove prefix of i32 from the stream. 
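        // (Editorial note, not part of the patch: this prefix is the 4-byte
        // little-endian length that `RleValueEncoder::flush_buffer` writes before
        // the RLE/bit-packed payload.)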
+ let i32_size = mem::size_of::(); + let data_size = read_num_bytes!(i32, i32_size, data.as_ref()) as usize; + let rle_decoder = self + .decoder + .as_mut() + .expect("RLE decoder is not initialized"); + rle_decoder.set_data(data.range(i32_size, data_size)); + self.values_left = num_values; + Ok(()) + } +} + +impl Decoder for RleValueDecoder { + #[inline] + default fn set_data(&mut self, _data: ByteBufferPtr, _num_values: usize) -> Result<()> { + panic!("RleValueDecoder only supports BoolType"); + } + + #[inline] + fn values_left(&self) -> usize { + self.values_left + } + + #[inline] + fn encoding(&self) -> Encoding { + Encoding::RLE + } + + #[inline] + fn get(&mut self, buffer: &mut [T::T]) -> Result { + let rle_decoder = self + .decoder + .as_mut() + .expect("RLE decoder is not initialized"); + let values_read = rle_decoder.get_batch(buffer)?; + self.values_left -= values_read; + Ok(values_read) + } +} + +impl Decoder for RleValueDecoder { + #[inline] + fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + // Only support RLE value reader for boolean values with bit width of 1. + self.decoder = Some(RleDecoder::new(1)); + self.set_data_internal(data, num_values) + } +} + +// ---------------------------------------------------------------------- +// DELTA_BINARY_PACKED Decoding + +/// Delta binary packed decoder. +/// Supports INT32 and INT64 types. +/// See [`DeltaBitPackEncoder`](`::encoding::DeltaBitPackEncoder`) for more information. +pub struct DeltaBitPackDecoder { + bit_reader: BitReader, + initialized: bool, + + // Header info + num_values: usize, + num_mini_blocks: i64, + values_per_mini_block: usize, + values_current_mini_block: usize, + first_value: i64, + first_value_read: bool, + + // Per block info + min_delta: i64, + mini_block_idx: usize, + delta_bit_width: u8, + delta_bit_widths: ByteBuffer, + deltas_in_mini_block: Vec, // eagerly loaded deltas for a mini block + use_batch: bool, + + current_value: i64, + + _phantom: PhantomData, +} + +impl DeltaBitPackDecoder { + /// Creates new delta bit packed decoder. + pub fn new() -> Self { + Self { + bit_reader: BitReader::from(vec![]), + initialized: false, + num_values: 0, + num_mini_blocks: 0, + values_per_mini_block: 0, + values_current_mini_block: 0, + first_value: 0, + first_value_read: false, + min_delta: 0, + mini_block_idx: 0, + delta_bit_width: 0, + delta_bit_widths: ByteBuffer::new(), + deltas_in_mini_block: vec![], + use_batch: mem::size_of::() == 4, + current_value: 0, + _phantom: PhantomData, + } + } + + /// Returns underlying bit reader offset. + pub fn get_offset(&self) -> usize { + assert!(self.initialized, "Bit reader is not initialized"); + self.bit_reader.get_byte_offset() + } + + /// Initializes new mini block. + #[inline] + fn init_block(&mut self) -> Result<()> { + self.min_delta = self + .bit_reader + .get_zigzag_vlq_int() + .ok_or(eof_err!("Not enough data to decode 'min_delta'"))?; + + let mut widths = vec![]; + for _ in 0..self.num_mini_blocks { + let w = self + .bit_reader + .get_aligned::(1) + .ok_or(eof_err!("Not enough data to decode 'width'"))?; + widths.push(w); + } + + self.delta_bit_widths.set_data(widths); + self.mini_block_idx = 0; + self.delta_bit_width = self.delta_bit_widths.data()[0]; + self.values_current_mini_block = self.values_per_mini_block; + Ok(()) + } + + /// Loads delta into mini block. 
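    // Editorial note (not part of the patch): each block in the stream is laid out as
    // [min delta: zigzag VLQ] [one bit-width byte per mini block] [bit-packed deltas],
    // which is what `init_block` above and the next function consume.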
+ #[inline] + fn load_deltas_in_mini_block(&mut self) -> Result<()> { + self.deltas_in_mini_block.clear(); + if self.use_batch { + self.deltas_in_mini_block + .resize(self.values_current_mini_block, T::T::default()); + let loaded = self.bit_reader.get_batch::( + &mut self.deltas_in_mini_block[..], + self.delta_bit_width as usize, + ); + assert!(loaded == self.values_current_mini_block); + } else { + for _ in 0..self.values_current_mini_block { + // TODO: load one batch at a time similar to int32 + let delta = self + .bit_reader + .get_value::(self.delta_bit_width as usize) + .ok_or(eof_err!("Not enough data to decode 'delta'"))?; + self.deltas_in_mini_block.push(delta); + } + } + + Ok(()) + } +} + +impl Decoder for DeltaBitPackDecoder { + // # of total values is derived from encoding + #[inline] + default fn set_data(&mut self, data: ByteBufferPtr, _: usize) -> Result<()> { + self.bit_reader = BitReader::new(data); + self.initialized = true; + + let block_size = self + .bit_reader + .get_vlq_int() + .ok_or(eof_err!("Not enough data to decode 'block_size'"))?; + self.num_mini_blocks = self + .bit_reader + .get_vlq_int() + .ok_or(eof_err!("Not enough data to decode 'num_mini_blocks'"))?; + self.num_values = + self.bit_reader + .get_vlq_int() + .ok_or(eof_err!("Not enough data to decode 'num_values'"))? as usize; + self.first_value = self + .bit_reader + .get_zigzag_vlq_int() + .ok_or(eof_err!("Not enough data to decode 'first_value'"))?; + + // Reset decoding state + self.first_value_read = false; + self.mini_block_idx = 0; + self.delta_bit_widths.clear(); + self.values_current_mini_block = 0; + + self.values_per_mini_block = (block_size / self.num_mini_blocks) as usize; + assert!(self.values_per_mini_block % 8 == 0); + + Ok(()) + } + + default fn get(&mut self, buffer: &mut [T::T]) -> Result { + assert!(self.initialized, "Bit reader is not initialized"); + + let num_values = cmp::min(buffer.len(), self.num_values); + for i in 0..num_values { + if !self.first_value_read { + self.set_decoded_value(buffer, i, self.first_value); + self.current_value = self.first_value; + self.first_value_read = true; + continue; + } + + if self.values_current_mini_block == 0 { + self.mini_block_idx += 1; + if self.mini_block_idx < self.delta_bit_widths.size() { + self.delta_bit_width = self.delta_bit_widths.data()[self.mini_block_idx]; + self.values_current_mini_block = self.values_per_mini_block; + } else { + self.init_block()?; + } + self.load_deltas_in_mini_block()?; + } + + // we decrement values in current mini block, so we need to invert index for delta + let delta = + self.get_delta(self.deltas_in_mini_block.len() - self.values_current_mini_block); + // It is OK for deltas to contain "overflowed" values after encoding, + // e.g. i64::MAX - i64::MIN, so we use `wrapping_add` to "overflow" again and + // restore original value. + self.current_value = self.current_value.wrapping_add(self.min_delta); + self.current_value = self.current_value.wrapping_add(delta as i64); + self.set_decoded_value(buffer, i, self.current_value); + self.values_current_mini_block -= 1; + } + + self.num_values -= num_values; + Ok(num_values) + } + + fn values_left(&self) -> usize { + self.num_values + } + + fn encoding(&self) -> Encoding { + Encoding::DELTA_BINARY_PACKED + } +} + +/// Helper trait to define specific conversions when decoding values +trait DeltaBitPackDecoderConversion { + /// Sets decoded value based on type `T`. 
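    // Editorial sketch (not part of the patch): how `get` above rebuilds values from a
    // mini block. With first_value = 29, min_delta = 14 and packed deltas [0, 32]
    // (the sample page used in the tests below), the decoded stream is [29, 43, 89].
    fn _delta_reconstruction_sketch() {
        let (first_value, min_delta) = (29i64, 14i64);
        let packed_deltas = [0i64, 32];
        let mut values = vec![first_value];
        let mut current = first_value;
        for delta in &packed_deltas {
            // Wrapping arithmetic mirrors `get`, which tolerates encoder-side overflow.
            current = current.wrapping_add(min_delta).wrapping_add(*delta);
            values.push(current);
        }
        assert_eq!(values, vec![29, 43, 89]);
    }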
+ #[inline] + fn get_delta(&self, index: usize) -> i64; + + #[inline] + fn set_decoded_value(&self, buffer: &mut [T::T], index: usize, value: i64); +} + +impl DeltaBitPackDecoderConversion for DeltaBitPackDecoder { + #[inline] + default fn get_delta(&self, _: usize) -> i64 { + panic!("DeltaBitPackDecoder only supports Int32Type and Int64Type") + } + + #[inline] + default fn set_decoded_value(&self, _: &mut [T::T], _: usize, _: i64) { + panic!("DeltaBitPackDecoder only supports Int32Type and Int64Type") + } +} + +impl DeltaBitPackDecoderConversion for DeltaBitPackDecoder { + #[inline] + fn get_delta(&self, index: usize) -> i64 { + self.deltas_in_mini_block[index] as i64 + } + + #[inline] + fn set_decoded_value(&self, buffer: &mut [i32], index: usize, value: i64) { + buffer[index] = value as i32; + } +} + +impl DeltaBitPackDecoderConversion for DeltaBitPackDecoder { + #[inline] + fn get_delta(&self, index: usize) -> i64 { + self.deltas_in_mini_block[index] + } + + #[inline] + fn set_decoded_value(&self, buffer: &mut [i64], index: usize, value: i64) { + buffer[index] = value; + } +} + +// ---------------------------------------------------------------------- +// DELTA_LENGTH_BYTE_ARRAY Decoding + +/// Delta length byte array decoder. +/// Only applied to byte arrays to separate the length values and the data, the lengths +/// are encoded using DELTA_BINARY_PACKED encoding. +/// See [`DeltaLengthByteArrayEncoder`](`::encoding::DeltaLengthByteArrayEncoder`) +/// for more information. +pub struct DeltaLengthByteArrayDecoder { + // Lengths for each byte array in `data` + // TODO: add memory tracker to this + lengths: Vec, + + // Current index into `lengths` + current_idx: usize, + + // Concatenated byte array data + data: Option, + + // Offset into `data`, always point to the beginning of next byte array. + offset: usize, + + // Number of values left in this decoder stream + num_values: usize, + + // Placeholder to allow `T` as generic parameter + _phantom: PhantomData, +} + +impl DeltaLengthByteArrayDecoder { + /// Creates new delta length byte array decoder. 
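    // Editorial sketch (not part of the patch): the DELTA_LENGTH_BYTE_ARRAY layout is
    // all lengths first (DELTA_BINARY_PACKED), then the byte array data back to back;
    // e.g. ["Hello", "World"] becomes lengths [5, 5] followed by b"HelloWorld".
    fn _delta_length_layout_sketch() {
        let lengths = [5usize, 5];
        let data = b"HelloWorld";
        let mut offset = 0;
        let mut values = Vec::new();
        for &len in &lengths {
            values.push(&data[offset..offset + len]);
            offset += len;
        }
        assert_eq!(values, vec![&b"Hello"[..], &b"World"[..]]);
    }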
+ pub fn new() -> Self { + Self { + lengths: vec![], + current_idx: 0, + data: None, + offset: 0, + num_values: 0, + _phantom: PhantomData, + } + } +} + +impl Decoder for DeltaLengthByteArrayDecoder { + default fn set_data(&mut self, _: ByteBufferPtr, _: usize) -> Result<()> { + Err(general_err!( + "DeltaLengthByteArrayDecoder only support ByteArrayType" + )) + } + + default fn get(&mut self, _: &mut [T::T]) -> Result { + Err(general_err!( + "DeltaLengthByteArrayDecoder only support ByteArrayType" + )) + } + + fn values_left(&self) -> usize { + self.num_values + } + + fn encoding(&self) -> Encoding { + Encoding::DELTA_LENGTH_BYTE_ARRAY + } +} + +impl Decoder for DeltaLengthByteArrayDecoder { + fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + let mut len_decoder = DeltaBitPackDecoder::::new(); + len_decoder.set_data(data.all(), num_values)?; + let num_lengths = len_decoder.values_left(); + self.lengths.resize(num_lengths, 0); + len_decoder.get(&mut self.lengths[..])?; + + self.data = Some(data.start_from(len_decoder.get_offset())); + self.offset = 0; + self.current_idx = 0; + self.num_values = num_lengths; + Ok(()) + } + + fn get(&mut self, buffer: &mut [ByteArray]) -> Result { + assert!(self.data.is_some()); + + let data = self.data.as_ref().unwrap(); + let num_values = cmp::min(buffer.len(), self.num_values); + for i in 0..num_values { + let len = self.lengths[self.current_idx] as usize; + buffer[i].set_data(data.range(self.offset, len)); + self.offset += len; + self.current_idx += 1; + } + + self.num_values -= num_values; + Ok(num_values) + } +} + +// ---------------------------------------------------------------------- +// DELTA_BYTE_ARRAY Decoding + +/// Delta byte array decoder. +/// Prefix lengths are encoded using `DELTA_BINARY_PACKED` encoding, Suffixes are stored +/// using `DELTA_LENGTH_BYTE_ARRAY` encoding. +/// See [`DeltaByteArrayEncoder`](`::encoding::DeltaByteArrayEncoder`) for more +/// information. +pub struct DeltaByteArrayDecoder { + // Prefix lengths for each byte array + // TODO: add memory tracker to this + prefix_lengths: Vec, + + // The current index into `prefix_lengths`, + current_idx: usize, + + // Decoder for all suffixes, the # of which should be the same as + // `prefix_lengths.len()` + suffix_decoder: Option>, + + // The last byte array, used to derive the current prefix + previous_value: Vec, + + // Number of values left + num_values: usize, + + // Placeholder to allow `T` as generic parameter + _phantom: PhantomData, +} + +impl DeltaByteArrayDecoder { + /// Creates new delta byte array decoder. 
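    // Editorial sketch (not part of the patch): DELTA_BYTE_ARRAY front coding. Every
    // value reuses a prefix of the previous value and appends a suffix, so
    // ["axis", "axle"] is stored as prefix lengths [0, 2] plus suffixes ["axis", "le"].
    fn _delta_byte_array_sketch() {
        let prefix_lengths = [0usize, 2];
        let suffixes: [&[u8]; 2] = [b"axis", b"le"];
        let mut previous: Vec<u8> = Vec::new();
        let mut values = Vec::new();
        for (&prefix_len, suffix) in prefix_lengths.iter().zip(suffixes.iter()) {
            let mut value = previous[..prefix_len].to_vec();
            value.extend_from_slice(*suffix);
            previous = value.clone();
            values.push(value);
        }
        assert_eq!(values, vec![b"axis".to_vec(), b"axle".to_vec()]);
    }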
+ pub fn new() -> Self { + Self { + prefix_lengths: vec![], + current_idx: 0, + suffix_decoder: None, + previous_value: vec![], + num_values: 0, + _phantom: PhantomData, + } + } +} + +impl<'m, T: DataType> Decoder for DeltaByteArrayDecoder { + default fn set_data(&mut self, _: ByteBufferPtr, _: usize) -> Result<()> { + Err(general_err!( + "DeltaByteArrayDecoder only supports ByteArrayType and FixedLenByteArrayType" + )) + } + + default fn get(&mut self, _: &mut [T::T]) -> Result { + Err(general_err!( + "DeltaByteArrayDecoder only supports ByteArrayType and FixedLenByteArrayType" + )) + } + + fn values_left(&self) -> usize { + self.num_values + } + + fn encoding(&self) -> Encoding { + Encoding::DELTA_BYTE_ARRAY + } +} + +impl Decoder for DeltaByteArrayDecoder { + fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + let mut prefix_len_decoder = DeltaBitPackDecoder::::new(); + prefix_len_decoder.set_data(data.all(), num_values)?; + let num_prefixes = prefix_len_decoder.values_left(); + self.prefix_lengths.resize(num_prefixes, 0); + prefix_len_decoder.get(&mut self.prefix_lengths[..])?; + + let mut suffix_decoder = DeltaLengthByteArrayDecoder::new(); + suffix_decoder.set_data(data.start_from(prefix_len_decoder.get_offset()), num_values)?; + self.suffix_decoder = Some(suffix_decoder); + self.num_values = num_prefixes; + self.current_idx = 0; + self.previous_value.clear(); + Ok(()) + } + + fn get(&mut self, buffer: &mut [ByteArray]) -> Result { + assert!(self.suffix_decoder.is_some()); + + let num_values = cmp::min(buffer.len(), self.num_values); + let mut v: [ByteArray; 1] = [ByteArray::new(); 1]; + for i in 0..num_values { + // Process suffix + // TODO: this is awkward - maybe we should add a non-vectorized API? + let suffix_decoder = self.suffix_decoder.as_mut().unwrap(); + suffix_decoder.get(&mut v[..])?; + let suffix = v[0].data(); + + // Extract current prefix length, can be 0 + let prefix_len = self.prefix_lengths[self.current_idx] as usize; + + // Concatenate prefix with suffix + let mut result = Vec::new(); + result.extend_from_slice(&self.previous_value[0..prefix_len]); + result.extend_from_slice(suffix); + + let data = ByteBufferPtr::new(result.clone()); + buffer[i].set_data(data); + self.previous_value = result; + self.current_idx += 1; + } + + self.num_values -= num_values; + Ok(num_values) + } +} + +impl Decoder for DeltaByteArrayDecoder { + fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + let s: &mut DeltaByteArrayDecoder = unsafe { mem::transmute(self) }; + s.set_data(data, num_values) + } + + fn get(&mut self, buffer: &mut [ByteArray]) -> Result { + let s: &mut DeltaByteArrayDecoder = unsafe { mem::transmute(self) }; + s.get(buffer) + } +} + +#[cfg(test)] +mod tests { + use super::{super::encoding::*, *}; + + use std::{mem, rc::Rc}; + + use crate::parquet::schema::types::{ + ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType, + }; + use crate::parquet::util::{bit_util::set_array_bit, memory::MemTracker, test_common::RandGen}; + + #[test] + fn test_get_decoders() { + // supported encodings + create_and_check_decoder::(Encoding::PLAIN, None); + create_and_check_decoder::(Encoding::DELTA_BINARY_PACKED, None); + create_and_check_decoder::(Encoding::DELTA_LENGTH_BYTE_ARRAY, None); + create_and_check_decoder::(Encoding::DELTA_BYTE_ARRAY, None); + create_and_check_decoder::(Encoding::RLE, None); + + // error when initializing + create_and_check_decoder::( + Encoding::RLE_DICTIONARY, + Some(general_err!( + "Cannot 
initialize this encoding through this function" + )), + ); + create_and_check_decoder::( + Encoding::PLAIN_DICTIONARY, + Some(general_err!( + "Cannot initialize this encoding through this function" + )), + ); + + // unsupported + create_and_check_decoder::( + Encoding::BIT_PACKED, + Some(nyi_err!("Encoding BIT_PACKED is not supported")), + ); + } + + #[test] + fn test_plain_decode_int32() { + let data = vec![42, 18, 52]; + let data_bytes = Int32Type::to_byte_array(&data[..]); + let mut buffer = vec![0; 3]; + test_plain_decode::( + ByteBufferPtr::new(data_bytes), + 3, + -1, + &mut buffer[..], + &data[..], + ); + } + + #[test] + fn test_plain_decode_int64() { + let data = vec![42, 18, 52]; + let data_bytes = Int64Type::to_byte_array(&data[..]); + let mut buffer = vec![0; 3]; + test_plain_decode::( + ByteBufferPtr::new(data_bytes), + 3, + -1, + &mut buffer[..], + &data[..], + ); + } + + #[test] + fn test_plain_decode_float() { + let data = vec![3.14, 2.414, 12.51]; + let data_bytes = FloatType::to_byte_array(&data[..]); + let mut buffer = vec![0.0; 3]; + test_plain_decode::( + ByteBufferPtr::new(data_bytes), + 3, + -1, + &mut buffer[..], + &data[..], + ); + } + + #[test] + fn test_plain_decode_double() { + let data = vec![3.14f64, 2.414f64, 12.51f64]; + let data_bytes = DoubleType::to_byte_array(&data[..]); + let mut buffer = vec![0.0f64; 3]; + test_plain_decode::( + ByteBufferPtr::new(data_bytes), + 3, + -1, + &mut buffer[..], + &data[..], + ); + } + + #[test] + fn test_plain_decode_int96() { + let mut data = vec![Int96::new(); 4]; + data[0].set_data(11, 22, 33); + data[1].set_data(44, 55, 66); + data[2].set_data(10, 20, 30); + data[3].set_data(40, 50, 60); + let data_bytes = Int96Type::to_byte_array(&data[..]); + let mut buffer = vec![Int96::new(); 4]; + test_plain_decode::( + ByteBufferPtr::new(data_bytes), + 4, + -1, + &mut buffer[..], + &data[..], + ); + } + + #[test] + fn test_plain_decode_bool() { + let data = vec![ + false, true, false, false, true, false, true, true, false, true, + ]; + let data_bytes = BoolType::to_byte_array(&data[..]); + let mut buffer = vec![false; 10]; + test_plain_decode::( + ByteBufferPtr::new(data_bytes), + 10, + -1, + &mut buffer[..], + &data[..], + ); + } + + #[test] + fn test_plain_decode_byte_array() { + let mut data = vec![ByteArray::new(); 2]; + data[0].set_data(ByteBufferPtr::new(String::from("hello").into_bytes())); + data[1].set_data(ByteBufferPtr::new(String::from("parquet").into_bytes())); + let data_bytes = ByteArrayType::to_byte_array(&data[..]); + let mut buffer = vec![ByteArray::new(); 2]; + test_plain_decode::( + ByteBufferPtr::new(data_bytes), + 2, + -1, + &mut buffer[..], + &data[..], + ); + } + + #[test] + fn test_plain_decode_fixed_len_byte_array() { + let mut data = vec![ByteArray::default(); 3]; + data[0].set_data(ByteBufferPtr::new(String::from("bird").into_bytes())); + data[1].set_data(ByteBufferPtr::new(String::from("come").into_bytes())); + data[2].set_data(ByteBufferPtr::new(String::from("flow").into_bytes())); + let data_bytes = FixedLenByteArrayType::to_byte_array(&data[..]); + let mut buffer = vec![ByteArray::default(); 3]; + test_plain_decode::( + ByteBufferPtr::new(data_bytes), + 3, + 4, + &mut buffer[..], + &data[..], + ); + } + + #[test] + #[should_panic(expected = "RleValueEncoder only supports BoolType")] + fn test_rle_value_encode_int32_not_supported() { + let mut encoder = RleValueEncoder::::new(); + encoder.put(&vec![1, 2, 3, 4]).unwrap(); + } + + #[test] + #[should_panic(expected = "RleValueDecoder only supports 
BoolType")] + fn test_rle_value_decode_int32_not_supported() { + let mut decoder = RleValueDecoder::::new(); + decoder + .set_data(ByteBufferPtr::new(vec![5, 0, 0, 0]), 1) + .unwrap(); + } + + #[test] + fn test_rle_value_decode_bool_decode() { + // Test multiple 'put' calls on the same encoder + let data = vec![ + BoolType::gen_vec(-1, 256), + BoolType::gen_vec(-1, 257), + BoolType::gen_vec(-1, 126), + ]; + test_rle_value_decode::(data); + } + + #[test] + #[should_panic(expected = "Bit reader is not initialized")] + fn test_delta_bit_packed_not_initialized_offset() { + // Fail if set_data() is not called before get_offset() + let decoder = DeltaBitPackDecoder::::new(); + decoder.get_offset(); + } + + #[test] + #[should_panic(expected = "Bit reader is not initialized")] + fn test_delta_bit_packed_not_initialized_get() { + // Fail if set_data() is not called before get() + let mut decoder = DeltaBitPackDecoder::::new(); + let mut buffer = vec![]; + decoder.get(&mut buffer).unwrap(); + } + + #[test] + fn test_delta_bit_packed_int32_empty() { + let data = vec![vec![0; 0]]; + test_delta_bit_packed_decode::(data); + } + + #[test] + fn test_delta_bit_packed_int32_repeat() { + let block_data = vec![ + 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, + 6, 7, 8, + ]; + test_delta_bit_packed_decode::(vec![block_data]); + } + + #[test] + fn test_delta_bit_packed_int32_uneven() { + let block_data = vec![1, -2, 3, -4, 5, 6, 7, 8, 9, 10, 11]; + test_delta_bit_packed_decode::(vec![block_data]); + } + + #[test] + fn test_delta_bit_packed_int32_same_values() { + let block_data = vec![ + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + ]; + test_delta_bit_packed_decode::(vec![block_data]); + + let block_data = vec![ + -127, -127, -127, -127, -127, -127, -127, -127, -127, -127, -127, -127, -127, -127, + -127, -127, + ]; + test_delta_bit_packed_decode::(vec![block_data]); + } + + #[test] + fn test_delta_bit_packed_int32_min_max() { + let block_data = vec![ + i32::min_value(), + i32::max_value(), + i32::min_value(), + i32::max_value(), + i32::min_value(), + i32::max_value(), + i32::min_value(), + i32::max_value(), + ]; + test_delta_bit_packed_decode::(vec![block_data]); + } + + #[test] + fn test_delta_bit_packed_int32_multiple_blocks() { + // Test multiple 'put' calls on the same encoder + let data = vec![ + Int32Type::gen_vec(-1, 64), + Int32Type::gen_vec(-1, 128), + Int32Type::gen_vec(-1, 64), + ]; + test_delta_bit_packed_decode::(data); + } + + #[test] + fn test_delta_bit_packed_int32_data_across_blocks() { + // Test multiple 'put' calls on the same encoder + let data = vec![Int32Type::gen_vec(-1, 256), Int32Type::gen_vec(-1, 257)]; + test_delta_bit_packed_decode::(data); + } + + #[test] + fn test_delta_bit_packed_int32_with_empty_blocks() { + let data = vec![ + Int32Type::gen_vec(-1, 128), + vec![0; 0], + Int32Type::gen_vec(-1, 64), + ]; + test_delta_bit_packed_decode::(data); + } + + #[test] + fn test_delta_bit_packed_int64_empty() { + let data = vec![vec![0; 0]]; + test_delta_bit_packed_decode::(data); + } + + #[test] + fn test_delta_bit_packed_int64_min_max() { + let block_data = vec![ + i64::min_value(), + i64::max_value(), + i64::min_value(), + i64::max_value(), + i64::min_value(), + i64::max_value(), + i64::min_value(), + i64::max_value(), + ]; + test_delta_bit_packed_decode::(vec![block_data]); + } + + #[test] + fn test_delta_bit_packed_int64_multiple_blocks() { + // Test multiple 'put' calls on the same encoder + let data = vec![ 
+ Int64Type::gen_vec(-1, 64), + Int64Type::gen_vec(-1, 128), + Int64Type::gen_vec(-1, 64), + ]; + test_delta_bit_packed_decode::(data); + } + + #[test] + fn test_delta_bit_packed_decoder_sample() { + let data_bytes = vec![ + 128, 1, 4, 3, 58, 28, 6, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + ]; + let buffer = ByteBufferPtr::new(data_bytes); + let mut decoder: DeltaBitPackDecoder = DeltaBitPackDecoder::new(); + decoder.set_data(buffer, 3).unwrap(); + // check exact offsets, because when reading partial values we end up with + // some data not being read from bit reader + assert_eq!(decoder.get_offset(), 5); + let mut result = vec![0, 0, 0]; + decoder.get(&mut result).unwrap(); + assert_eq!(decoder.get_offset(), 34); + assert_eq!(result, vec![29, 43, 89]); + } + + #[test] + fn test_delta_byte_array_same_arrays() { + let data = vec![ + vec![ByteArray::from(vec![1, 2, 3, 4, 5, 6])], + vec![ + ByteArray::from(vec![1, 2, 3, 4, 5, 6]), + ByteArray::from(vec![1, 2, 3, 4, 5, 6]), + ], + vec![ + ByteArray::from(vec![1, 2, 3, 4, 5, 6]), + ByteArray::from(vec![1, 2, 3, 4, 5, 6]), + ], + ]; + test_delta_byte_array_decode(data); + } + + #[test] + fn test_delta_byte_array_unique_arrays() { + let data = vec![ + vec![ByteArray::from(vec![1])], + vec![ByteArray::from(vec![2, 3]), ByteArray::from(vec![4, 5, 6])], + vec![ + ByteArray::from(vec![7, 8]), + ByteArray::from(vec![9, 0, 1, 2]), + ], + ]; + test_delta_byte_array_decode(data); + } + + #[test] + fn test_delta_byte_array_single_array() { + let data = vec![vec![ByteArray::from(vec![1, 2, 3, 4, 5, 6])]]; + test_delta_byte_array_decode(data); + } + + fn test_plain_decode( + data: ByteBufferPtr, + num_values: usize, + type_length: i32, + buffer: &mut [T::T], + expected: &[T::T], + ) { + let mut decoder: PlainDecoder = PlainDecoder::new(type_length); + let result = decoder.set_data(data, num_values); + assert!(result.is_ok()); + let result = decoder.get(&mut buffer[..]); + assert!(result.is_ok()); + assert_eq!(decoder.values_left(), 0); + assert_eq!(buffer, expected); + } + + fn test_rle_value_decode(data: Vec>) { + test_encode_decode::(data, Encoding::RLE); + } + + fn test_delta_bit_packed_decode(data: Vec>) { + test_encode_decode::(data, Encoding::DELTA_BINARY_PACKED); + } + + fn test_delta_byte_array_decode(data: Vec>) { + test_encode_decode::(data, Encoding::DELTA_BYTE_ARRAY); + } + + // Input data represents vector of data slices to write (test multiple `put()` calls) + // For example, + // vec![vec![1, 2, 3]] invokes `put()` once and writes {1, 2, 3} + // vec![vec![1, 2], vec![3]] invokes `put()` twice and writes {1, 2, 3} + fn test_encode_decode(data: Vec>, encoding: Encoding) { + // Type length should not really matter for encode/decode test, + // otherwise change it based on type + let col_descr = create_test_col_desc_ptr(-1, T::get_physical_type()); + + // Encode data + let mut encoder = get_encoder::(col_descr.clone(), encoding, Rc::new(MemTracker::new())) + .expect("get encoder"); + + for v in &data[..] 
{ + encoder.put(&v[..]).expect("ok to encode"); + } + let bytes = encoder.flush_buffer().expect("ok to flush buffer"); + + // Flatten expected data as contiguous array of values + let expected: Vec = data.iter().flat_map(|s| s.clone()).collect(); + + // Decode data and compare with original + let mut decoder = get_decoder::(col_descr.clone(), encoding).expect("get decoder"); + + let mut result = vec![T::T::default(); expected.len()]; + decoder + .set_data(bytes, expected.len()) + .expect("ok to set data"); + let mut result_num_values = 0; + while decoder.values_left() > 0 { + result_num_values += decoder + .get(&mut result[result_num_values..]) + .expect("ok to decode"); + } + assert_eq!(result_num_values, expected.len()); + assert_eq!(result, expected); + } + + fn create_and_check_decoder(encoding: Encoding, err: Option) { + let descr = create_test_col_desc_ptr(-1, T::get_physical_type()); + let decoder = get_decoder::(descr, encoding); + match err { + Some(parquet_error) => { + assert!(decoder.is_err()); + assert_eq!(decoder.err().unwrap(), parquet_error); + } + None => { + assert!(decoder.is_ok()); + assert_eq!(decoder.unwrap().encoding(), encoding); + } + } + } + + // Creates test column descriptor. + fn create_test_col_desc_ptr(type_len: i32, t: Type) -> ColumnDescPtr { + let ty = SchemaType::primitive_type_builder("t", t) + .with_length(type_len) + .build() + .unwrap(); + Rc::new(ColumnDescriptor::new( + Rc::new(ty), + None, + 0, + 0, + ColumnPath::new(vec![]), + )) + } + + fn usize_to_bytes(v: usize) -> [u8; 4] { + unsafe { mem::transmute::(v as u32) } + } + + /// A util trait to convert slices of different types to byte arrays + trait ToByteArray { + fn to_byte_array(data: &[T::T]) -> Vec; + } + + impl ToByteArray for T + where + T: DataType, + { + default fn to_byte_array(data: &[T::T]) -> Vec { + let mut v = vec![]; + let type_len = ::std::mem::size_of::(); + v.extend_from_slice(unsafe { + ::std::slice::from_raw_parts(data.as_ptr() as *const u8, data.len() * type_len) + }); + v + } + } + + impl ToByteArray for BoolType { + fn to_byte_array(data: &[bool]) -> Vec { + let mut v = vec![]; + for i in 0..data.len() { + if i % 8 == 0 { + v.push(0); + } + if data[i] { + set_array_bit(&mut v[..], i); + } + } + v + } + } + + impl ToByteArray for Int96Type { + fn to_byte_array(data: &[Int96]) -> Vec { + let mut v = vec![]; + for d in data { + unsafe { + let copy = ::std::slice::from_raw_parts(d.data().as_ptr() as *const u8, 12); + v.extend_from_slice(copy); + }; + } + v + } + } + + impl ToByteArray for ByteArrayType { + fn to_byte_array(data: &[ByteArray]) -> Vec { + let mut v = vec![]; + for d in data { + let buf = d.data(); + let len = &usize_to_bytes(buf.len()); + v.extend_from_slice(len); + v.extend(buf); + } + v + } + } + + impl ToByteArray for FixedLenByteArrayType { + fn to_byte_array(data: &[ByteArray]) -> Vec { + let mut v = vec![]; + for d in data { + let buf = d.data(); + v.extend(buf); + } + v + } + } +} diff --git a/rust/src/parquet/encodings/encoding.rs b/rust/src/parquet/encodings/encoding.rs new file mode 100644 index 0000000000000..cecb03cb540a9 --- /dev/null +++ b/rust/src/parquet/encodings/encoding.rs @@ -0,0 +1,1360 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains all supported encoders for Parquet. + +use std::{cmp, io::Write, marker::PhantomData, mem, slice}; + +use crate::parquet::basic::*; +use crate::parquet::data_type::*; +use crate::parquet::encodings::rle::RleEncoder; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::schema::types::ColumnDescPtr; +use crate::parquet::util::{ + bit_util::{log2, num_required_bits, BitWriter}, + hash_util, + memory::{Buffer, ByteBuffer, ByteBufferPtr, MemTrackerPtr}, +}; + +// ---------------------------------------------------------------------- +// Encoders + +/// An Parquet encoder for the data type `T`. +/// +/// Currently this allocates internal buffers for the encoded values. After done putting +/// values, caller should call `flush_buffer()` to get an immutable buffer pointer. +pub trait Encoder { + /// Encodes data from `values`. + fn put(&mut self, values: &[T::T]) -> Result<()>; + + /// Returns the encoding type of this encoder. + fn encoding(&self) -> Encoding; + + /// Returns an estimate of the encoded data, in bytes. + /// Method call must be O(1). + fn estimated_data_encoded_size(&self) -> usize; + + /// Flushes the underlying byte buffer that's being processed by this encoder, and + /// return the immutable copy of it. This will also reset the internal state. + fn flush_buffer(&mut self) -> Result; +} + +/// Gets a encoder for the particular data type `T` and encoding `encoding`. Memory usage +/// for the encoder instance is tracked by `mem_tracker`. +pub fn get_encoder( + desc: ColumnDescPtr, + encoding: Encoding, + mem_tracker: MemTrackerPtr, +) -> Result>> { + let encoder: Box> = match encoding { + Encoding::PLAIN => Box::new(PlainEncoder::new(desc, mem_tracker, vec![])), + Encoding::RLE_DICTIONARY | Encoding::PLAIN_DICTIONARY => { + return Err(general_err!( + "Cannot initialize this encoding through this function" + )); + } + Encoding::RLE => Box::new(RleValueEncoder::new()), + Encoding::DELTA_BINARY_PACKED => Box::new(DeltaBitPackEncoder::new()), + Encoding::DELTA_LENGTH_BYTE_ARRAY => Box::new(DeltaLengthByteArrayEncoder::new()), + Encoding::DELTA_BYTE_ARRAY => Box::new(DeltaByteArrayEncoder::new()), + e => return Err(nyi_err!("Encoding {} is not supported", e)), + }; + Ok(encoder) +} + +// ---------------------------------------------------------------------- +// Plain encoding + +/// Plain encoding that supports all types. +/// Values are encoded back to back. +/// The plain encoding is used whenever a more efficient encoding can not be used. +/// It stores the data in the following format: +/// - BOOLEAN - 1 bit per value, 0 is false; 1 is true. +/// - INT32 - 4 bytes per value, stored as little-endian. +/// - INT64 - 8 bytes per value, stored as little-endian. +/// - FLOAT - 4 bytes per value, stored as IEEE little-endian. +/// - DOUBLE - 8 bytes per value, stored as IEEE little-endian. +/// - BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes. 
+/// - FIXED_LEN_BYTE_ARRAY - just the bytes are stored. +pub struct PlainEncoder { + buffer: ByteBuffer, + bit_writer: BitWriter, + desc: ColumnDescPtr, + _phantom: PhantomData, +} + +impl PlainEncoder { + /// Creates new plain encoder. + pub fn new(desc: ColumnDescPtr, mem_tracker: MemTrackerPtr, vec: Vec) -> Self { + let mut byte_buffer = ByteBuffer::new().with_mem_tracker(mem_tracker); + byte_buffer.set_data(vec); + Self { + buffer: byte_buffer, + bit_writer: BitWriter::new(256), + desc, + _phantom: PhantomData, + } + } +} + +impl Encoder for PlainEncoder { + default fn put(&mut self, values: &[T::T]) -> Result<()> { + let bytes = unsafe { + slice::from_raw_parts( + values as *const [T::T] as *const u8, + mem::size_of::() * values.len(), + ) + }; + self.buffer.write(bytes)?; + Ok(()) + } + + fn encoding(&self) -> Encoding { + Encoding::PLAIN + } + + fn estimated_data_encoded_size(&self) -> usize { + self.buffer.size() + self.bit_writer.bytes_written() + } + + #[inline] + default fn flush_buffer(&mut self) -> Result { + self.buffer.write(self.bit_writer.flush_buffer())?; + self.buffer.flush()?; + self.bit_writer.clear(); + + Ok(self.buffer.consume()) + } +} + +impl Encoder for PlainEncoder { + fn put(&mut self, values: &[bool]) -> Result<()> { + for v in values { + self.bit_writer.put_value(*v as u64, 1); + } + Ok(()) + } +} + +impl Encoder for PlainEncoder { + fn put(&mut self, values: &[Int96]) -> Result<()> { + for v in values { + self.buffer.write(v.as_bytes())?; + } + self.buffer.flush()?; + Ok(()) + } +} + +impl Encoder for PlainEncoder { + fn put(&mut self, values: &[ByteArray]) -> Result<()> { + for v in values { + self.buffer.write(&(v.len().to_le() as u32).as_bytes())?; + self.buffer.write(v.data())?; + } + self.buffer.flush()?; + Ok(()) + } +} + +impl Encoder for PlainEncoder { + fn put(&mut self, values: &[ByteArray]) -> Result<()> { + for v in values { + self.buffer.write(v.data())?; + } + self.buffer.flush()?; + Ok(()) + } +} + +// ---------------------------------------------------------------------- +// Dictionary encoding + +const INITIAL_HASH_TABLE_SIZE: usize = 1024; +const MAX_HASH_LOAD: f32 = 0.7; +const HASH_SLOT_EMPTY: i32 = -1; + +/// Dictionary encoder. +/// The dictionary encoding builds a dictionary of values encountered in a given column. +/// The dictionary page is written first, before the data pages of the column chunk. +/// +/// Dictionary page format: the entries in the dictionary - in dictionary order - +/// using the plain encoding. +/// +/// Data page format: the bit width used to encode the entry ids stored as 1 byte +/// (max bit width = 32), followed by the values encoded using RLE/Bit packed described +/// above (with the given bit width). +pub struct DictEncoder { + // Descriptor for the column to be encoded. + desc: ColumnDescPtr, + + // Size of the table. **Must be** a power of 2. + hash_table_size: usize, + + // Store `hash_table_size` - 1, so that `j & mod_bitmask` is equivalent to + // `j % hash_table_size`, but uses far fewer CPU cycles. + mod_bitmask: u32, + + // Stores indices which map (many-to-one) to the values in the `uniques` array. + // Here we are using fix-sized array with linear probing. + // A slot with `HASH_SLOT_EMPTY` indicates the slot is not currently occupied. + hash_slots: Buffer, + + // Indices that have not yet be written out by `write_indices()`. + buffered_indices: Buffer, + + // The unique observed values. + uniques: Buffer, + + // Size in bytes needed to encode this dictionary. 
+ uniques_size_in_bytes: usize, + + // Tracking memory usage for the various data structures in this struct. + mem_tracker: MemTrackerPtr, +} + +impl DictEncoder { + /// Creates new dictionary encoder. + pub fn new(desc: ColumnDescPtr, mem_tracker: MemTrackerPtr) -> Self { + let mut slots = Buffer::new().with_mem_tracker(mem_tracker.clone()); + slots.resize(INITIAL_HASH_TABLE_SIZE, -1); + Self { + desc, + hash_table_size: INITIAL_HASH_TABLE_SIZE, + mod_bitmask: (INITIAL_HASH_TABLE_SIZE - 1) as u32, + hash_slots: slots, + buffered_indices: Buffer::new().with_mem_tracker(mem_tracker.clone()), + uniques: Buffer::new().with_mem_tracker(mem_tracker.clone()), + uniques_size_in_bytes: 0, + mem_tracker, + } + } + + /// Returns true if dictionary entries are sorted, false otherwise. + #[inline] + pub fn is_sorted(&self) -> bool { + // Sorting is not supported currently. + false + } + + /// Returns number of unique values (keys) in the dictionary. + pub fn num_entries(&self) -> usize { + self.uniques.size() + } + + /// Returns size of unique values (keys) in the dictionary, in bytes. + pub fn dict_encoded_size(&self) -> usize { + self.uniques_size_in_bytes + } + + /// Writes out the dictionary values with PLAIN encoding in a byte buffer, and return + /// the result. + #[inline] + pub fn write_dict(&self) -> Result { + let mut plain_encoder = + PlainEncoder::::new(self.desc.clone(), self.mem_tracker.clone(), vec![]); + plain_encoder.put(self.uniques.data())?; + plain_encoder.flush_buffer() + } + + /// Writes out the dictionary values with RLE encoding in a byte buffer, and return the + /// result. + #[inline] + pub fn write_indices(&mut self) -> Result { + // TODO: the caller should allocate the buffer + let buffer_len = self.estimated_data_encoded_size(); + let mut buffer: Vec = vec![0; buffer_len as usize]; + buffer[0] = self.bit_width() as u8; + self.mem_tracker.alloc(buffer.capacity() as i64); + + // Write bit width in the first byte + buffer.write((self.bit_width() as u8).as_bytes())?; + let mut encoder = RleEncoder::new_from_buf(self.bit_width(), buffer, 1); + for index in self.buffered_indices.data() { + if !encoder.put(*index as u64)? 
{ + return Err(general_err!("Encoder doesn't have enough space")); + } + } + self.buffered_indices.clear(); + Ok(ByteBufferPtr::new(encoder.consume()?)) + } + + #[inline] + fn put_one(&mut self, value: &T::T) -> Result<()> { + let mut j = (hash_util::hash(value, 0) & self.mod_bitmask) as usize; + let mut index = self.hash_slots[j]; + + while index != HASH_SLOT_EMPTY && self.uniques[index as usize] != *value { + j += 1; + if j == self.hash_table_size { + j = 0; + } + index = self.hash_slots[j]; + } + + if index == HASH_SLOT_EMPTY { + index = self.uniques.size() as i32; + self.hash_slots[j] = index; + self.add_dict_key(value.clone()); + + if self.uniques.size() > (self.hash_table_size as f32 * MAX_HASH_LOAD) as usize { + self.double_table_size(); + } + } + + self.buffered_indices.push(index); + Ok(()) + } + + #[inline] + fn add_dict_key(&mut self, value: T::T) { + self.uniques_size_in_bytes += self.get_encoded_size(&value); + self.uniques.push(value); + } + + #[inline] + fn bit_width(&self) -> u8 { + let num_entries = self.uniques.size(); + if num_entries == 0 { + 0 + } else if num_entries == 1 { + 1 + } else { + log2(num_entries as u64) as u8 + } + } + + #[inline] + fn double_table_size(&mut self) { + let new_size = self.hash_table_size * 2; + let mut new_hash_slots = Buffer::new().with_mem_tracker(self.mem_tracker.clone()); + new_hash_slots.resize(new_size, HASH_SLOT_EMPTY); + for i in 0..self.hash_table_size { + let index = self.hash_slots[i]; + if index == HASH_SLOT_EMPTY { + continue; + } + let value = &self.uniques[index as usize]; + let mut j = (hash_util::hash(value, 0) & ((new_size - 1) as u32)) as usize; + let mut slot = new_hash_slots[j]; + while slot != HASH_SLOT_EMPTY && self.uniques[slot as usize] != *value { + j += 1; + if j == new_size { + j = 0; + } + slot = new_hash_slots[j]; + } + + new_hash_slots[j] = index; + } + + self.hash_table_size = new_size; + self.mod_bitmask = (new_size - 1) as u32; + mem::replace(&mut self.hash_slots, new_hash_slots); + } +} + +impl Encoder for DictEncoder { + #[inline] + fn put(&mut self, values: &[T::T]) -> Result<()> { + for i in values { + self.put_one(&i)? + } + Ok(()) + } + + #[inline] + fn encoding(&self) -> Encoding { + Encoding::PLAIN_DICTIONARY + } + + #[inline] + fn estimated_data_encoded_size(&self) -> usize { + let bit_width = self.bit_width(); + 1 + RleEncoder::min_buffer_size(bit_width) + + RleEncoder::max_buffer_size(bit_width, self.buffered_indices.size()) + } + + #[inline] + fn flush_buffer(&mut self) -> Result { + self.write_indices() + } +} + +/// Provides encoded size for a data type. +/// This is a workaround to calculate dictionary size in bytes. +trait DictEncodedSize { + #[inline] + fn get_encoded_size(&self, value: &T::T) -> usize; +} + +impl DictEncodedSize for DictEncoder { + #[inline] + default fn get_encoded_size(&self, _: &T::T) -> usize { + mem::size_of::() + } +} + +impl DictEncodedSize for DictEncoder { + #[inline] + fn get_encoded_size(&self, value: &ByteArray) -> usize { + mem::size_of::() + value.len() + } +} + +impl DictEncodedSize for DictEncoder { + #[inline] + fn get_encoded_size(&self, _value: &ByteArray) -> usize { + self.desc.type_length() as usize + } +} + +// ---------------------------------------------------------------------- +// RLE encoding + +const DEFAULT_RLE_BUFFER_LEN: usize = 1024; + +/// RLE/Bit-Packing hybrid encoding for values. +/// Currently is used only for data pages v2 and supports boolean types. 
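+///
+/// The buffer returned by `flush_buffer` starts with the length of the
+/// RLE-encoded data, stored as a 4-byte little-endian integer, followed by the
+/// encoded data itself (a sketch of the layout produced by the code below):
+///
+/// ```shell
+/// [length of encoded data: 4 bytes LE] [RLE/bit-packed values]
+/// ```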
+pub struct RleValueEncoder { + // Buffer with raw values that we collect, + // when flushing buffer they are encoded using RLE encoder + encoder: Option, + _phantom: PhantomData, +} + +impl RleValueEncoder { + /// Creates new rle value encoder. + pub fn new() -> Self { + Self { + encoder: None, + _phantom: PhantomData, + } + } +} + +impl Encoder for RleValueEncoder { + #[inline] + default fn put(&mut self, _values: &[T::T]) -> Result<()> { + panic!("RleValueEncoder only supports BoolType"); + } + + fn encoding(&self) -> Encoding { + Encoding::RLE + } + + #[inline] + default fn estimated_data_encoded_size(&self) -> usize { + match self.encoder { + Some(ref enc) => enc.len(), + None => 0, + } + } + + #[inline] + default fn flush_buffer(&mut self) -> Result { + panic!("RleValueEncoder only supports BoolType"); + } +} + +impl Encoder for RleValueEncoder { + #[inline] + default fn put(&mut self, values: &[bool]) -> Result<()> { + if self.encoder.is_none() { + self.encoder = Some(RleEncoder::new(1, DEFAULT_RLE_BUFFER_LEN)); + } + let rle_encoder = self.encoder.as_mut().unwrap(); + for value in values { + if !rle_encoder.put(*value as u64)? { + return Err(general_err!("RLE buffer is full")); + } + } + Ok(()) + } + + #[inline] + fn flush_buffer(&mut self) -> Result { + assert!( + self.encoder.is_some(), + "RLE value encoder is not initialized" + ); + let rle_encoder = self.encoder.as_mut().unwrap(); + + // Flush all encoder buffers and raw values + let encoded_data = { + let buf = rle_encoder.flush_buffer()?; + + // Note that buf does not have any offset, all data is encoded bytes + let len = (buf.len() as i32).to_le(); + let len_bytes = len.as_bytes(); + let mut encoded_data = Vec::new(); + encoded_data.extend_from_slice(len_bytes); + encoded_data.extend_from_slice(buf); + encoded_data + }; + // Reset rle encoder for the next batch + rle_encoder.clear(); + + Ok(ByteBufferPtr::new(encoded_data)) + } +} + +// ---------------------------------------------------------------------- +// DELTA_BINARY_PACKED encoding + +const MAX_PAGE_HEADER_WRITER_SIZE: usize = 32; +const MAX_BIT_WRITER_SIZE: usize = 10 * 1024 * 1024; +const DEFAULT_BLOCK_SIZE: usize = 128; +const DEFAULT_NUM_MINI_BLOCKS: usize = 4; + +/// Delta bit packed encoder. +/// Consists of a header followed by blocks of delta encoded values binary packed. +/// +/// Delta-binary-packing: +/// ```shell +/// [page-header] [block 1], [block 2], ... [block N] +/// ``` +/// +/// Each page header consists of: +/// ```shell +/// [block size] [number of miniblocks in a block] [total value count] [first value] +/// ``` +/// +/// Each block consists of: +/// ```shell +/// [min delta] [list of bitwidths of miniblocks] [miniblocks] +/// ``` +/// +/// Current implementation writes values in `put` method, multiple calls to `put` to +/// existing block or start new block if block size is exceeded. Calling `flush_buffer` +/// writes out all data and resets internal state, including page header. +/// +/// Supports only INT32 and INT64. +pub struct DeltaBitPackEncoder { + page_header_writer: BitWriter, + bit_writer: BitWriter, + total_values: usize, + first_value: i64, + current_value: i64, + block_size: usize, + mini_block_size: usize, + num_mini_blocks: usize, + values_in_block: usize, + deltas: Vec, + _phantom: PhantomData, +} + +impl DeltaBitPackEncoder { + /// Creates new delta bit packed encoder. 
+ pub fn new() -> Self { + let block_size = DEFAULT_BLOCK_SIZE; + let num_mini_blocks = DEFAULT_NUM_MINI_BLOCKS; + let mini_block_size = block_size / num_mini_blocks; + assert!(mini_block_size % 8 == 0); + Self::assert_supported_type(); + + DeltaBitPackEncoder { + page_header_writer: BitWriter::new(MAX_PAGE_HEADER_WRITER_SIZE), + bit_writer: BitWriter::new(MAX_BIT_WRITER_SIZE), + total_values: 0, + first_value: 0, + current_value: 0, // current value to keep adding deltas + block_size, // can write fewer values than block size for last block + mini_block_size, + num_mini_blocks, + values_in_block: 0, // will be at most block_size + deltas: vec![0; block_size], + _phantom: PhantomData, + } + } + + /// Writes page header for blocks, this method is invoked when we are done encoding + /// values. It is also okay to encode when no values have been provided + fn write_page_header(&mut self) { + // We ignore the result of each 'put' operation, because MAX_PAGE_HEADER_WRITER_SIZE + // is chosen to fit all header values and guarantees that writes will not fail. + + // Write the size of each block + self.page_header_writer.put_vlq_int(self.block_size as u64); + // Write the number of mini blocks + self.page_header_writer + .put_vlq_int(self.num_mini_blocks as u64); + // Write the number of all values (including non-encoded first value) + self.page_header_writer + .put_vlq_int(self.total_values as u64); + // Write first value + self.page_header_writer.put_zigzag_vlq_int(self.first_value); + } + + // Write current delta buffer (<= 'block size' values) into bit writer + fn flush_block_values(&mut self) -> Result<()> { + if self.values_in_block == 0 { + return Ok(()); + } + + let mut min_delta = i64::max_value(); + for i in 0..self.values_in_block { + min_delta = cmp::min(min_delta, self.deltas[i]); + } + + // Write min delta + self.bit_writer.put_zigzag_vlq_int(min_delta); + + // Slice to store bit width for each mini block + // apply unsafe allocation to avoid double mutable borrow + let mini_block_widths: &mut [u8] = unsafe { + let tmp_slice = self.bit_writer.get_next_byte_ptr(self.num_mini_blocks)?; + slice::from_raw_parts_mut(tmp_slice.as_ptr() as *mut u8, self.num_mini_blocks) + }; + + for i in 0..self.num_mini_blocks { + // Find how many values we need to encode - either block size or whatever values + // left + let n = cmp::min(self.mini_block_size, self.values_in_block); + if n == 0 { + break; + } + + // Compute the max delta in current mini block + let mut max_delta = i64::min_value(); + for j in 0..n { + max_delta = cmp::max(max_delta, self.deltas[i * self.mini_block_size + j]); + } + + // Compute bit width to store (max_delta - min_delta) + let bit_width = num_required_bits(self.subtract_u64(max_delta, min_delta)); + mini_block_widths[i] = bit_width as u8; + + // Encode values in current mini block using min_delta and bit_width + for j in 0..n { + let packed_value = + self.subtract_u64(self.deltas[i * self.mini_block_size + j], min_delta); + self.bit_writer.put_value(packed_value, bit_width); + } + + // Pad the last block (n < mini_block_size) + for _ in n..self.mini_block_size { + self.bit_writer.put_value(0, bit_width); + } + + self.values_in_block -= n; + } + + assert!( + self.values_in_block == 0, + "Expected 0 values in block, found {}", + self.values_in_block + ); + Ok(()) + } +} + +// Implementation is shared between Int32Type and Int64Type, +// see `DeltaBitPackEncoderConversion` below for specifics. 
+impl Encoder for DeltaBitPackEncoder { + fn put(&mut self, values: &[T::T]) -> Result<()> { + if values.is_empty() { + return Ok(()); + } + + let mut idx; + // Define values to encode, initialize state + if self.total_values == 0 { + self.first_value = self.as_i64(values, 0); + self.current_value = self.first_value; + idx = 1; + } else { + idx = 0; + } + // Add all values (including first value) + self.total_values += values.len(); + + // Write block + while idx < values.len() { + let value = self.as_i64(values, idx); + self.deltas[self.values_in_block] = self.subtract(value, self.current_value); + self.current_value = value; + idx += 1; + self.values_in_block += 1; + if self.values_in_block == self.block_size { + self.flush_block_values()?; + } + } + Ok(()) + } + + fn encoding(&self) -> Encoding { + Encoding::DELTA_BINARY_PACKED + } + + fn estimated_data_encoded_size(&self) -> usize { + self.bit_writer.bytes_written() + } + + fn flush_buffer(&mut self) -> Result { + // Write remaining values + self.flush_block_values()?; + // Write page header with total values + self.write_page_header(); + + let mut buffer = ByteBuffer::new(); + buffer.write(self.page_header_writer.flush_buffer())?; + buffer.write(self.bit_writer.flush_buffer())?; + buffer.flush()?; + + // Reset state + self.page_header_writer.clear(); + self.bit_writer.clear(); + self.total_values = 0; + self.first_value = 0; + self.current_value = 0; + self.values_in_block = 0; + + Ok(buffer.consume()) + } +} + +/// Helper trait to define specific conversions and subtractions when computing deltas +trait DeltaBitPackEncoderConversion { + // Method should panic if type is not supported, otherwise no-op + #[inline] + fn assert_supported_type(); + + #[inline] + fn as_i64(&self, values: &[T::T], index: usize) -> i64; + + #[inline] + fn subtract(&self, left: i64, right: i64) -> i64; + + #[inline] + fn subtract_u64(&self, left: i64, right: i64) -> u64; +} + +impl DeltaBitPackEncoderConversion for DeltaBitPackEncoder { + #[inline] + default fn assert_supported_type() { + panic!("DeltaBitPackDecoder only supports Int32Type and Int64Type"); + } + + #[inline] + default fn as_i64(&self, _values: &[T::T], _index: usize) -> i64 { + 0 + } + + #[inline] + default fn subtract(&self, _left: i64, _right: i64) -> i64 { + 0 + } + + #[inline] + default fn subtract_u64(&self, _left: i64, _right: i64) -> u64 { + 0 + } +} + +impl DeltaBitPackEncoderConversion for DeltaBitPackEncoder { + #[inline] + fn assert_supported_type() { + // no-op: supported type + } + + #[inline] + fn as_i64(&self, values: &[i32], index: usize) -> i64 { + values[index] as i64 + } + + #[inline] + fn subtract(&self, left: i64, right: i64) -> i64 { + // It is okay for values to overflow, wrapping_sub wrapping around at the boundary + (left as i32).wrapping_sub(right as i32) as i64 + } + + #[inline] + fn subtract_u64(&self, left: i64, right: i64) -> u64 { + // Conversion of i32 -> u32 -> u64 is to avoid non-zero left most bytes in int + // representation + (left as i32).wrapping_sub(right as i32) as u32 as u64 + } +} + +impl DeltaBitPackEncoderConversion for DeltaBitPackEncoder { + #[inline] + fn assert_supported_type() { + // no-op: supported type + } + + #[inline] + fn as_i64(&self, values: &[i64], index: usize) -> i64 { + values[index] + } + + #[inline] + fn subtract(&self, left: i64, right: i64) -> i64 { + // It is okay for values to overflow, wrapping_sub wrapping around at the boundary + left.wrapping_sub(right) + } + + #[inline] + fn subtract_u64(&self, left: i64, right: i64) -> 
u64 { + left.wrapping_sub(right) as u64 + } +} + +// ---------------------------------------------------------------------- +// DELTA_LENGTH_BYTE_ARRAY encoding + +/// Encoding for byte arrays to separate the length values and the data. +/// The lengths are encoded using DELTA_BINARY_PACKED encoding, data is +/// stored as raw bytes. +pub struct DeltaLengthByteArrayEncoder { + // length encoder + len_encoder: DeltaBitPackEncoder, + // byte array data + data: Vec, + // data size in bytes of encoded values + encoded_size: usize, + _phantom: PhantomData, +} + +impl DeltaLengthByteArrayEncoder { + /// Creates new delta length byte array encoder. + pub fn new() -> Self { + Self { + len_encoder: DeltaBitPackEncoder::new(), + data: vec![], + encoded_size: 0, + _phantom: PhantomData, + } + } +} + +impl Encoder for DeltaLengthByteArrayEncoder { + default fn put(&mut self, _values: &[T::T]) -> Result<()> { + panic!("DeltaLengthByteArrayEncoder only supports ByteArrayType"); + } + + fn encoding(&self) -> Encoding { + Encoding::DELTA_LENGTH_BYTE_ARRAY + } + + fn estimated_data_encoded_size(&self) -> usize { + self.len_encoder.estimated_data_encoded_size() + self.encoded_size + } + + default fn flush_buffer(&mut self) -> Result { + panic!("DeltaLengthByteArrayEncoder only supports ByteArrayType"); + } +} + +impl Encoder for DeltaLengthByteArrayEncoder { + fn put(&mut self, values: &[ByteArray]) -> Result<()> { + let lengths: Vec = values + .iter() + .map(|byte_array| byte_array.len() as i32) + .collect(); + self.len_encoder.put(&lengths)?; + for byte_array in values { + self.encoded_size += byte_array.len(); + self.data.push(byte_array.clone()); + } + Ok(()) + } + + fn flush_buffer(&mut self) -> Result { + let mut total_bytes = vec![]; + let lengths = self.len_encoder.flush_buffer()?; + total_bytes.extend_from_slice(lengths.data()); + self.data.iter().for_each(|byte_array| { + total_bytes.extend_from_slice(byte_array.data()); + }); + self.data.clear(); + self.encoded_size = 0; + Ok(ByteBufferPtr::new(total_bytes)) + } +} + +// ---------------------------------------------------------------------- +// DELTA_BYTE_ARRAY encoding + +/// Encoding for byte arrays, prefix lengths are encoded using DELTA_BINARY_PACKED +/// encoding, followed by suffixes with DELTA_LENGTH_BYTE_ARRAY encoding. +pub struct DeltaByteArrayEncoder { + prefix_len_encoder: DeltaBitPackEncoder, + suffix_writer: DeltaLengthByteArrayEncoder, + previous: Vec, + _phantom: PhantomData, +} + +impl DeltaByteArrayEncoder { + /// Creates new delta byte array encoder. 
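+    ///
+    /// As an illustration of the split performed by `put`: encoding
+    /// ["axis", "axle"] yields prefix lengths [0, 2] and suffixes ["axis", "le"],
+    /// because "axle" shares the two-byte prefix "ax" with the previous value.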
+ pub fn new() -> Self { + Self { + prefix_len_encoder: DeltaBitPackEncoder::::new(), + suffix_writer: DeltaLengthByteArrayEncoder::::new(), + previous: vec![], + _phantom: PhantomData, + } + } +} + +impl Encoder for DeltaByteArrayEncoder { + default fn put(&mut self, _values: &[T::T]) -> Result<()> { + panic!("DeltaByteArrayEncoder only supports ByteArrayType and FixedLenByteArrayType"); + } + + fn encoding(&self) -> Encoding { + Encoding::DELTA_BYTE_ARRAY + } + + fn estimated_data_encoded_size(&self) -> usize { + self.prefix_len_encoder.estimated_data_encoded_size() + + self.suffix_writer.estimated_data_encoded_size() + } + + default fn flush_buffer(&mut self) -> Result { + panic!("DeltaByteArrayEncoder only supports ByteArrayType and FixedLenByteArrayType"); + } +} + +impl Encoder for DeltaByteArrayEncoder { + fn put(&mut self, values: &[ByteArray]) -> Result<()> { + let mut prefix_lengths: Vec = vec![]; + let mut suffixes: Vec = vec![]; + + for byte_array in values { + let current = byte_array.data(); + // Maximum prefix length that is shared between previous value and current value + let prefix_len = cmp::min(self.previous.len(), current.len()); + let mut match_len = 0; + while match_len < prefix_len && self.previous[match_len] == current[match_len] { + match_len += 1; + } + prefix_lengths.push(match_len as i32); + suffixes.push(byte_array.slice(match_len, byte_array.len() - match_len)); + // Update previous for the next prefix + self.previous.clear(); + self.previous.extend_from_slice(current); + } + self.prefix_len_encoder.put(&prefix_lengths)?; + self.suffix_writer.put(&suffixes)?; + Ok(()) + } + + fn flush_buffer(&mut self) -> Result { + // TODO: investigate if we can merge lengths and suffixes + // without copying data into new vector. + let mut total_bytes = vec![]; + // Insert lengths ... + let lengths = self.prefix_len_encoder.flush_buffer()?; + total_bytes.extend_from_slice(lengths.data()); + // ... 
followed by suffixes + let suffixes = self.suffix_writer.flush_buffer()?; + total_bytes.extend_from_slice(suffixes.data()); + + self.previous.clear(); + Ok(ByteBufferPtr::new(total_bytes)) + } +} + +impl Encoder for DeltaByteArrayEncoder { + fn put(&mut self, values: &[ByteArray]) -> Result<()> { + let s: &mut DeltaByteArrayEncoder = unsafe { mem::transmute(self) }; + s.put(values) + } + + fn flush_buffer(&mut self) -> Result { + let s: &mut DeltaByteArrayEncoder = unsafe { mem::transmute(self) }; + s.flush_buffer() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::rc::Rc; + + use crate::parquet::decoding::{get_decoder, Decoder, DictDecoder, PlainDecoder}; + use crate::parquet::schema::types::{ + ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType, + }; + use crate::parquet::util::{memory::MemTracker, test_common::RandGen}; + + const TEST_SET_SIZE: usize = 1024; + + #[test] + fn test_get_encoders() { + // supported encodings + create_and_check_encoder::(Encoding::PLAIN, None); + create_and_check_encoder::(Encoding::DELTA_BINARY_PACKED, None); + create_and_check_encoder::(Encoding::DELTA_LENGTH_BYTE_ARRAY, None); + create_and_check_encoder::(Encoding::DELTA_BYTE_ARRAY, None); + create_and_check_encoder::(Encoding::RLE, None); + + // error when initializing + create_and_check_encoder::( + Encoding::RLE_DICTIONARY, + Some(general_err!( + "Cannot initialize this encoding through this function" + )), + ); + create_and_check_encoder::( + Encoding::PLAIN_DICTIONARY, + Some(general_err!( + "Cannot initialize this encoding through this function" + )), + ); + + // unsupported + create_and_check_encoder::( + Encoding::BIT_PACKED, + Some(nyi_err!("Encoding BIT_PACKED is not supported")), + ); + } + + #[test] + fn test_bool() { + BoolType::test(Encoding::PLAIN, TEST_SET_SIZE, -1); + BoolType::test(Encoding::PLAIN_DICTIONARY, TEST_SET_SIZE, -1); + BoolType::test(Encoding::RLE, TEST_SET_SIZE, -1); + } + + #[test] + fn test_i32() { + Int32Type::test(Encoding::PLAIN, TEST_SET_SIZE, -1); + Int32Type::test(Encoding::PLAIN_DICTIONARY, TEST_SET_SIZE, -1); + Int32Type::test(Encoding::DELTA_BINARY_PACKED, TEST_SET_SIZE, -1); + } + + #[test] + fn test_i64() { + Int64Type::test(Encoding::PLAIN, TEST_SET_SIZE, -1); + Int64Type::test(Encoding::PLAIN_DICTIONARY, TEST_SET_SIZE, -1); + Int64Type::test(Encoding::DELTA_BINARY_PACKED, TEST_SET_SIZE, -1); + } + + #[test] + fn test_i96() { + Int96Type::test(Encoding::PLAIN, TEST_SET_SIZE, -1); + Int96Type::test(Encoding::PLAIN_DICTIONARY, TEST_SET_SIZE, -1); + } + + #[test] + fn test_float() { + FloatType::test(Encoding::PLAIN, TEST_SET_SIZE, -1); + FloatType::test(Encoding::PLAIN_DICTIONARY, TEST_SET_SIZE, -1); + } + + #[test] + fn test_double() { + DoubleType::test(Encoding::PLAIN, TEST_SET_SIZE, -1); + DoubleType::test(Encoding::PLAIN_DICTIONARY, TEST_SET_SIZE, -1); + } + + #[test] + fn test_byte_array() { + ByteArrayType::test(Encoding::PLAIN, TEST_SET_SIZE, -1); + ByteArrayType::test(Encoding::PLAIN_DICTIONARY, TEST_SET_SIZE, -1); + ByteArrayType::test(Encoding::DELTA_LENGTH_BYTE_ARRAY, TEST_SET_SIZE, -1); + ByteArrayType::test(Encoding::DELTA_BYTE_ARRAY, TEST_SET_SIZE, -1); + } + + #[test] + fn test_fixed_lenbyte_array() { + FixedLenByteArrayType::test(Encoding::PLAIN, TEST_SET_SIZE, 100); + FixedLenByteArrayType::test(Encoding::PLAIN_DICTIONARY, TEST_SET_SIZE, 100); + FixedLenByteArrayType::test(Encoding::DELTA_BYTE_ARRAY, TEST_SET_SIZE, 100); + } + + #[test] + fn test_dict_encoded_size() { + fn run_test(type_length: i32, values: &[T::T], 
expected_size: usize) { + let mut encoder = create_test_dict_encoder::(type_length); + assert_eq!(encoder.dict_encoded_size(), 0); + encoder.put(values).unwrap(); + assert_eq!(encoder.dict_encoded_size(), expected_size); + // We do not reset encoded size of the dictionary keys after flush_buffer + encoder.flush_buffer().unwrap(); + assert_eq!(encoder.dict_encoded_size(), expected_size); + } + + // Only 2 variations of values 1 byte each + run_test::(-1, &[true, false, true, false, true], 2); + run_test::(-1, &[1i32, 2i32, 3i32, 4i32, 5i32], 20); + run_test::(-1, &[1i64, 2i64, 3i64, 4i64, 5i64], 40); + run_test::(-1, &[1f32, 2f32, 3f32, 4f32, 5f32], 20); + run_test::(-1, &[1f64, 2f64, 3f64, 4f64, 5f64], 40); + // Int96: len + reference + run_test::( + -1, + &[Int96::from(vec![1, 2, 3]), Int96::from(vec![2, 3, 4])], + 32, + ); + run_test::(-1, &[ByteArray::from("abcd"), ByteArray::from("efj")], 15); + run_test::(2, &[ByteArray::from("ab"), ByteArray::from("bc")], 4); + } + + #[test] + fn test_estimated_data_encoded_size() { + fn run_test( + encoding: Encoding, + type_length: i32, + values: &[T::T], + initial_size: usize, + max_size: usize, + flush_size: usize, + ) { + let mut encoder = match encoding { + Encoding::PLAIN_DICTIONARY | Encoding::RLE_DICTIONARY => { + Box::new(create_test_dict_encoder::(type_length)) + } + _ => create_test_encoder::(type_length, encoding), + }; + assert_eq!(encoder.estimated_data_encoded_size(), initial_size); + + encoder.put(values).unwrap(); + assert_eq!(encoder.estimated_data_encoded_size(), max_size); + + encoder.flush_buffer().unwrap(); + assert_eq!(encoder.estimated_data_encoded_size(), flush_size); + } + + // PLAIN + run_test::(Encoding::PLAIN, -1, &vec![123; 1024], 0, 4096, 0); + + // DICTIONARY + // NOTE: The final size is almost the same because the dictionary entries are + // preserved after encoded values have been written. 
+ run_test::(Encoding::RLE_DICTIONARY, -1, &vec![123, 1024], 11, 68, 66); + + // DELTA_BINARY_PACKED + run_test::( + Encoding::DELTA_BINARY_PACKED, + -1, + &vec![123; 1024], + 0, + 35, + 0, + ); + + // RLE + let mut values = vec![]; + values.extend_from_slice(&vec![true; 16]); + values.extend_from_slice(&vec![false; 16]); + run_test::(Encoding::RLE, -1, &values, 0, 2, 0); + + // DELTA_LENGTH_BYTE_ARRAY + run_test::( + Encoding::DELTA_LENGTH_BYTE_ARRAY, + -1, + &[ByteArray::from("ab"), ByteArray::from("abc")], + 0, + 5, // only value bytes, length encoder is not flushed yet + 0, + ); + + // DELTA_BYTE_ARRAY + run_test::( + Encoding::DELTA_BYTE_ARRAY, + -1, + &[ByteArray::from("ab"), ByteArray::from("abc")], + 0, + 3, // only suffix bytes, length encoder is not flushed yet + 0, + ); + } + + // See: https://github.com/sunchao/parquet-rs/issues/47 + #[test] + fn test_issue_47() { + let mut encoder = create_test_encoder::(0, Encoding::DELTA_BYTE_ARRAY); + let mut decoder = create_test_decoder::(0, Encoding::DELTA_BYTE_ARRAY); + + let mut input = vec![]; + input.push(ByteArray::from("aa")); + input.push(ByteArray::from("aaa")); + input.push(ByteArray::from("aa")); + input.push(ByteArray::from("aaa")); + let mut output = vec![ByteArray::default(); input.len()]; + + let mut result = put_and_get(&mut encoder, &mut decoder, &input[..2], &mut output[..2]); + assert!( + result.is_ok(), + "first put_and_get() failed with: {}", + result.unwrap_err() + ); + result = put_and_get(&mut encoder, &mut decoder, &input[2..], &mut output[2..]); + assert!( + result.is_ok(), + "second put_and_get() failed with: {}", + result.unwrap_err() + ); + assert_eq!(output, input); + } + + trait EncodingTester { + fn test(enc: Encoding, total: usize, type_length: i32) { + let result = match enc { + Encoding::PLAIN_DICTIONARY | Encoding::RLE_DICTIONARY => { + Self::test_dict_internal(total, type_length) + } + enc @ _ => Self::test_internal(enc, total, type_length), + }; + + assert!( + result.is_ok(), + "Expected result to be OK but got err:\n {}", + result.unwrap_err() + ); + } + + fn test_internal(enc: Encoding, total: usize, type_length: i32) -> Result<()>; + + fn test_dict_internal(total: usize, type_length: i32) -> Result<()>; + } + + impl EncodingTester for T { + fn test_internal(enc: Encoding, total: usize, type_length: i32) -> Result<()> { + let mut encoder = create_test_encoder::(type_length, enc); + let mut decoder = create_test_decoder::(type_length, enc); + let mut values = >::gen_vec(type_length, total); + let mut result_data = vec![T::T::default(); total]; + + let mut actual_total = put_and_get( + &mut encoder, + &mut decoder, + &values[..], + &mut result_data[..], + )?; + assert_eq!(actual_total, total); + assert_eq!(result_data, values); + + // Encode more data after flush and test with decoder + + values = >::gen_vec(type_length, total); + actual_total = put_and_get( + &mut encoder, + &mut decoder, + &values[..], + &mut result_data[..], + )?; + assert_eq!(actual_total, total); + assert_eq!(result_data, values); + + Ok(()) + } + + fn test_dict_internal(total: usize, type_length: i32) -> Result<()> { + let mut encoder = create_test_dict_encoder::(type_length); + let mut values = >::gen_vec(type_length, total); + encoder.put(&values[..])?; + + let mut data = encoder.flush_buffer()?; + let mut decoder = create_test_dict_decoder::(); + let mut dict_decoder = PlainDecoder::::new(type_length); + dict_decoder.set_data(encoder.write_dict()?, encoder.num_entries())?; + decoder.set_dict(Box::new(dict_decoder))?; + let 
mut result_data = vec![T::T::default(); total]; + decoder.set_data(data, total)?; + let mut actual_total = decoder.get(&mut result_data)?; + + assert_eq!(actual_total, total); + assert_eq!(result_data, values); + + // Encode more data after flush and test with decoder + + values = >::gen_vec(type_length, total); + encoder.put(&values[..])?; + data = encoder.flush_buffer()?; + + let mut dict_decoder = PlainDecoder::::new(type_length); + dict_decoder.set_data(encoder.write_dict()?, encoder.num_entries())?; + decoder.set_dict(Box::new(dict_decoder))?; + decoder.set_data(data, total)?; + actual_total = decoder.get(&mut result_data)?; + + assert_eq!(actual_total, total); + assert_eq!(result_data, values); + + Ok(()) + } + } + + fn put_and_get( + encoder: &mut Box>, + decoder: &mut Box>, + input: &[T::T], + output: &mut [T::T], + ) -> Result { + encoder.put(input)?; + let data = encoder.flush_buffer()?; + decoder.set_data(data, input.len())?; + decoder.get(output) + } + + fn create_and_check_encoder(encoding: Encoding, err: Option) { + let descr = create_test_col_desc_ptr(-1, T::get_physical_type()); + let mem_tracker = Rc::new(MemTracker::new()); + let encoder = get_encoder::(descr, encoding, mem_tracker); + match err { + Some(parquet_error) => { + assert!(encoder.is_err()); + assert_eq!(encoder.err().unwrap(), parquet_error); + } + None => { + assert!(encoder.is_ok()); + assert_eq!(encoder.unwrap().encoding(), encoding); + } + } + } + + // Creates test column descriptor. + fn create_test_col_desc_ptr(type_len: i32, t: Type) -> ColumnDescPtr { + let ty = SchemaType::primitive_type_builder("t", t) + .with_length(type_len) + .build() + .unwrap(); + Rc::new(ColumnDescriptor::new( + Rc::new(ty), + None, + 0, + 0, + ColumnPath::new(vec![]), + )) + } + + fn create_test_encoder(type_len: i32, enc: Encoding) -> Box> { + let desc = create_test_col_desc_ptr(type_len, T::get_physical_type()); + let mem_tracker = Rc::new(MemTracker::new()); + get_encoder(desc, enc, mem_tracker).unwrap() + } + + fn create_test_decoder(type_len: i32, enc: Encoding) -> Box> { + let desc = create_test_col_desc_ptr(type_len, T::get_physical_type()); + get_decoder(desc, enc).unwrap() + } + + fn create_test_dict_encoder(type_len: i32) -> DictEncoder { + let desc = create_test_col_desc_ptr(type_len, T::get_physical_type()); + let mem_tracker = Rc::new(MemTracker::new()); + DictEncoder::::new(desc, mem_tracker) + } + + fn create_test_dict_decoder() -> DictDecoder { + DictDecoder::::new() + } +} diff --git a/rust/src/parquet/encodings/levels.rs b/rust/src/parquet/encodings/levels.rs new file mode 100644 index 0000000000000..ec65198ce55f0 --- /dev/null +++ b/rust/src/parquet/encodings/levels.rs @@ -0,0 +1,529 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::{cmp, mem}; + +use super::rle::{RleDecoder, RleEncoder}; + +use crate::parquet::basic::Encoding; +use crate::parquet::data_type::AsBytes; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::util::{ + bit_util::{ceil, log2, BitReader, BitWriter}, + memory::ByteBufferPtr, +}; + +/// Computes max buffer size for level encoder/decoder based on encoding, max +/// repetition/definition level and number of total buffered values (includes null +/// values). +#[inline] +pub fn max_buffer_size(encoding: Encoding, max_level: i16, num_buffered_values: usize) -> usize { + let bit_width = log2(max_level as u64 + 1) as u8; + match encoding { + Encoding::RLE => { + RleEncoder::max_buffer_size(bit_width, num_buffered_values) + + RleEncoder::min_buffer_size(bit_width) + } + Encoding::BIT_PACKED => ceil((num_buffered_values * bit_width as usize) as i64, 8) as usize, + _ => panic!("Unsupported encoding type {}", encoding), + } +} + +/// Encoder for definition/repetition levels. +/// Currently only supports RLE and BIT_PACKED (dev/null) encoding, including v2. +pub enum LevelEncoder { + RLE(RleEncoder), + RLE_V2(RleEncoder), + BIT_PACKED(u8, BitWriter), +} + +impl LevelEncoder { + /// Creates new level encoder based on encoding, max level and underlying byte buffer. + /// For bit packed encoding it is assumed that buffer is already allocated with + /// `levels::max_buffer_size` method. + /// + /// Used to encode levels for Data Page v1. + /// + /// Panics, if encoding is not supported. + pub fn v1(encoding: Encoding, max_level: i16, byte_buffer: Vec) -> Self { + let bit_width = log2(max_level as u64 + 1) as u8; + match encoding { + Encoding::RLE => LevelEncoder::RLE(RleEncoder::new_from_buf( + bit_width, + byte_buffer, + mem::size_of::(), + )), + Encoding::BIT_PACKED => { + // Here we set full byte buffer without adjusting for num_buffered_values, + // because byte buffer will already be allocated with size from + // `max_buffer_size()` method. + LevelEncoder::BIT_PACKED(bit_width, BitWriter::new_from_buf(byte_buffer, 0)) + } + _ => panic!("Unsupported encoding type {}", encoding), + } + } + + /// Creates new level encoder based on RLE encoding. Used to encode Data Page v2 + /// repetition and definition levels. + pub fn v2(max_level: i16, byte_buffer: Vec) -> Self { + let bit_width = log2(max_level as u64 + 1) as u8; + LevelEncoder::RLE_V2(RleEncoder::new_from_buf(bit_width, byte_buffer, 0)) + } + + /// Put/encode levels vector into this level encoder. + /// Returns number of encoded values that are less than or equal to length of the input + /// buffer. + /// + /// RLE and BIT_PACKED level encoders return Err() when internal buffer overflows or + /// flush fails. + #[inline] + pub fn put(&mut self, buffer: &[i16]) -> Result { + let mut num_encoded = 0; + match *self { + LevelEncoder::RLE(ref mut encoder) | LevelEncoder::RLE_V2(ref mut encoder) => { + for value in buffer { + if !encoder.put(*value as u64)? { + return Err(general_err!("RLE buffer is full")); + } + num_encoded += 1; + } + encoder.flush()?; + } + LevelEncoder::BIT_PACKED(bit_width, ref mut encoder) => { + for value in buffer { + if !encoder.put_value(*value as u64, bit_width as usize) { + return Err(general_err!("Not enough bytes left")); + } + num_encoded += 1; + } + encoder.flush(); + } + } + Ok(num_encoded) + } + + /// Finalizes level encoder, flush all intermediate buffers and return resulting + /// encoded buffer. Returned buffer is already truncated to encoded bytes only. 
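+    ///
+    /// For the v1 RLE variant, the first four bytes of the returned buffer are
+    /// overwritten with the length of the encoded levels as a little-endian `i32`,
+    /// matching the length prefix expected for Data Page v1 levels.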
+ #[inline] + pub fn consume(self) -> Result> { + match self { + LevelEncoder::RLE(encoder) => { + let mut encoded_data = encoder.consume()?; + // Account for the buffer offset + let encoded_len = encoded_data.len() - mem::size_of::(); + let len = (encoded_len as i32).to_le(); + let len_bytes = len.as_bytes(); + encoded_data[0..len_bytes.len()].copy_from_slice(len_bytes); + Ok(encoded_data) + } + LevelEncoder::RLE_V2(encoder) => encoder.consume(), + LevelEncoder::BIT_PACKED(_, encoder) => Ok(encoder.consume()), + } + } +} + +/// Decoder for definition/repetition levels. +/// Currently only supports RLE and BIT_PACKED encoding for Data Page v1 and +/// RLE for Data Page v2. +pub enum LevelDecoder { + RLE(Option, RleDecoder), + RLE_V2(Option, RleDecoder), + BIT_PACKED(Option, u8, BitReader), +} + +impl LevelDecoder { + /// Creates new level decoder based on encoding and max definition/repetition level. + /// This method only initializes level decoder, `set_data` method must be called + /// before reading any value. + /// + /// Used to encode levels for Data Page v1. + /// + /// Panics if encoding is not supported + pub fn v1(encoding: Encoding, max_level: i16) -> Self { + let bit_width = log2(max_level as u64 + 1) as u8; + match encoding { + Encoding::RLE => LevelDecoder::RLE(None, RleDecoder::new(bit_width)), + Encoding::BIT_PACKED => { + LevelDecoder::BIT_PACKED(None, bit_width, BitReader::from(Vec::new())) + } + _ => panic!("Unsupported encoding type {}", encoding), + } + } + + /// Creates new level decoder based on RLE encoding. + /// Used to decode Data Page v2 repetition and definition levels. + /// + /// To set data for this decoder, use `set_data_range` method. + pub fn v2(max_level: i16) -> Self { + let bit_width = log2(max_level as u64 + 1) as u8; + LevelDecoder::RLE_V2(None, RleDecoder::new(bit_width)) + } + + /// Sets data for this level decoder, and returns total number of bytes set. + /// This is used for Data Page v1 levels. + /// + /// `data` is encoded data as byte buffer, `num_buffered_values` represents total + /// number of values that is expected. + /// + /// Both RLE and BIT_PACKED level decoders set `num_buffered_values` as total number of + /// values that they can return and track num values. + #[inline] + pub fn set_data(&mut self, num_buffered_values: usize, data: ByteBufferPtr) -> usize { + match *self { + LevelDecoder::RLE(ref mut num_values, ref mut decoder) => { + *num_values = Some(num_buffered_values); + let i32_size = mem::size_of::(); + let data_size = read_num_bytes!(i32, i32_size, data.as_ref()) as usize; + decoder.set_data(data.range(i32_size, data_size)); + i32_size + data_size + } + LevelDecoder::BIT_PACKED(ref mut num_values, bit_width, ref mut decoder) => { + *num_values = Some(num_buffered_values); + // Set appropriate number of bytes: if max size is larger than buffer - set full + // buffer + let num_bytes = ceil((num_buffered_values * bit_width as usize) as i64, 8); + let data_size = cmp::min(num_bytes as usize, data.len()); + decoder.reset(data.range(data.start(), data_size)); + data_size + } + _ => panic!(), + } + } + + /// Sets byte array explicitly when start position `start` and length `len` are known + /// in advance. Only supported by RLE level decoder and used for Data Page v2 levels. + /// Returns number of total bytes set for this decoder (len). 
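+    ///
+    /// A minimal usage sketch (variable names are illustrative):
+    ///
+    /// ```ignore
+    /// let mut decoder = LevelDecoder::v2(max_rep_level);
+    /// decoder.set_data_range(num_buffered_values, &page_buffer, levels_offset, levels_len);
+    /// ```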
+ #[inline] + pub fn set_data_range( + &mut self, + num_buffered_values: usize, + data: &ByteBufferPtr, + start: usize, + len: usize, + ) -> usize { + match *self { + LevelDecoder::RLE_V2(ref mut num_values, ref mut decoder) => { + decoder.set_data(data.range(start, len)); + *num_values = Some(num_buffered_values); + len + } + _ => panic!("set_data_range() method is only supported by RLE v2 encoding type"), + } + } + + /// Returns true if data is set for decoder, false otherwise. + #[inline] + pub fn is_data_set(&self) -> bool { + match self { + LevelDecoder::RLE(ref num_values, _) => num_values.is_some(), + LevelDecoder::RLE_V2(ref num_values, _) => num_values.is_some(), + LevelDecoder::BIT_PACKED(ref num_values, ..) => num_values.is_some(), + } + } + + /// Decodes values and puts them into `buffer`. + /// Returns number of values that were successfully decoded (less than or equal to + /// buffer length). + #[inline] + pub fn get(&mut self, buffer: &mut [i16]) -> Result { + assert!(self.is_data_set(), "No data set for decoding"); + match *self { + LevelDecoder::RLE(ref mut num_values, ref mut decoder) + | LevelDecoder::RLE_V2(ref mut num_values, ref mut decoder) => { + // Max length we can read + let len = cmp::min(num_values.unwrap(), buffer.len()); + let values_read = decoder.get_batch::(&mut buffer[0..len])?; + *num_values = num_values.map(|len| len - values_read); + Ok(values_read) + } + LevelDecoder::BIT_PACKED(ref mut num_values, bit_width, ref mut decoder) => { + // When extracting values from bit reader, it might return more values than left + // because of padding to a full byte, we use num_values to track precise number + // of values. + let len = cmp::min(num_values.unwrap(), buffer.len()); + let values_read = decoder.get_batch::(&mut buffer[..len], bit_width as usize); + *num_values = num_values.map(|len| len - values_read); + Ok(values_read) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::parquet::util::test_common::random_numbers_range; + + fn test_internal_roundtrip(enc: Encoding, levels: &[i16], max_level: i16, v2: bool) { + let size = max_buffer_size(enc, max_level, levels.len()); + let mut encoder = if v2 { + LevelEncoder::v2(max_level, vec![0; size]) + } else { + LevelEncoder::v1(enc, max_level, vec![0; size]) + }; + encoder.put(&levels).expect("put() should be OK"); + let encoded_levels = encoder.consume().expect("consume() should be OK"); + + let byte_buf = ByteBufferPtr::new(encoded_levels); + let mut decoder; + if v2 { + decoder = LevelDecoder::v2(max_level); + decoder.set_data_range(levels.len(), &byte_buf, 0, byte_buf.len()); + } else { + decoder = LevelDecoder::v1(enc, max_level); + decoder.set_data(levels.len(), byte_buf); + }; + + let mut buffer = vec![0; levels.len()]; + let num_decoded = decoder.get(&mut buffer).expect("get() should be OK"); + assert_eq!(num_decoded, levels.len()); + assert_eq!(buffer, levels); + } + + // Performs incremental read until all bytes are read + fn test_internal_roundtrip_incremental( + enc: Encoding, + levels: &[i16], + max_level: i16, + v2: bool, + ) { + let size = max_buffer_size(enc, max_level, levels.len()); + let mut encoder = if v2 { + LevelEncoder::v2(max_level, vec![0; size]) + } else { + LevelEncoder::v1(enc, max_level, vec![0; size]) + }; + encoder.put(&levels).expect("put() should be OK"); + let encoded_levels = encoder.consume().expect("consume() should be OK"); + + let byte_buf = ByteBufferPtr::new(encoded_levels); + let mut decoder; + if v2 { + decoder = LevelDecoder::v2(max_level); + 
decoder.set_data_range(levels.len(), &byte_buf, 0, byte_buf.len()); + } else { + decoder = LevelDecoder::v1(enc, max_level); + decoder.set_data(levels.len(), byte_buf); + } + + let mut buffer = vec![0; levels.len() * 2]; + let mut total_decoded = 0; + let mut safe_stop = levels.len() * 2; // still terminate in case of issues in the code + while safe_stop > 0 { + safe_stop -= 1; + let num_decoded = decoder + .get(&mut buffer[total_decoded..total_decoded + 1]) + .expect("get() should be OK"); + if num_decoded == 0 { + break; + } + total_decoded += num_decoded; + } + assert!( + safe_stop > 0, + "Failed to read values incrementally, reached safe stop" + ); + assert_eq!(total_decoded, levels.len()); + assert_eq!(&buffer[0..levels.len()], levels); + } + + // Tests encoding/decoding of values when output buffer is larger than number of + // encoded values + fn test_internal_roundtrip_underflow(enc: Encoding, levels: &[i16], max_level: i16, v2: bool) { + let size = max_buffer_size(enc, max_level, levels.len()); + let mut encoder = if v2 { + LevelEncoder::v2(max_level, vec![0; size]) + } else { + LevelEncoder::v1(enc, max_level, vec![0; size]) + }; + // Encode only one value + let num_encoded = encoder.put(&levels[0..1]).expect("put() should be OK"); + let encoded_levels = encoder.consume().expect("consume() should be OK"); + assert_eq!(num_encoded, 1); + + let byte_buf = ByteBufferPtr::new(encoded_levels); + let mut decoder; + // Set one encoded value as `num_buffered_values` + if v2 { + decoder = LevelDecoder::v2(max_level); + decoder.set_data_range(1, &byte_buf, 0, byte_buf.len()); + } else { + decoder = LevelDecoder::v1(enc, max_level); + decoder.set_data(1, byte_buf); + } + + let mut buffer = vec![0; levels.len()]; + let num_decoded = decoder.get(&mut buffer).expect("get() should be OK"); + assert_eq!(num_decoded, num_encoded); + assert_eq!(buffer[0..num_decoded], levels[0..num_decoded]); + } + + // Tests when encoded values are larger than encoder's buffer + fn test_internal_roundtrip_overflow(enc: Encoding, levels: &[i16], max_level: i16, v2: bool) { + let size = max_buffer_size(enc, max_level, levels.len()); + let mut encoder = if v2 { + LevelEncoder::v2(max_level, vec![0; size]) + } else { + LevelEncoder::v1(enc, max_level, vec![0; size]) + }; + let mut found_err = false; + // Insert a large number of values, so we run out of space + for _ in 0..100 { + match encoder.put(&levels) { + Err(err) => { + assert!(format!("{}", err).contains("Not enough bytes left")); + found_err = true; + break; + } + Ok(_) => {} + } + } + if !found_err { + panic!("Failed test: no buffer overflow"); + } + } + + #[test] + fn test_roundtrip_one() { + let levels = vec![0, 1, 1, 1, 1, 0, 0, 0, 0, 1]; + let max_level = 1; + test_internal_roundtrip(Encoding::RLE, &levels, max_level, false); + test_internal_roundtrip(Encoding::BIT_PACKED, &levels, max_level, false); + test_internal_roundtrip(Encoding::RLE, &levels, max_level, true); + } + + #[test] + fn test_roundtrip() { + let levels = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + let max_level = 10; + test_internal_roundtrip(Encoding::RLE, &levels, max_level, false); + test_internal_roundtrip(Encoding::BIT_PACKED, &levels, max_level, false); + test_internal_roundtrip(Encoding::RLE, &levels, max_level, true); + } + + #[test] + fn test_roundtrip_incremental() { + let levels = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + let max_level = 10; + test_internal_roundtrip_incremental(Encoding::RLE, &levels, max_level, false); + test_internal_roundtrip_incremental(Encoding::BIT_PACKED, 
&levels, max_level, false); + test_internal_roundtrip_incremental(Encoding::RLE, &levels, max_level, true); + } + + #[test] + fn test_roundtrip_all_zeros() { + let levels = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + let max_level = 1; + test_internal_roundtrip(Encoding::RLE, &levels, max_level, false); + test_internal_roundtrip(Encoding::BIT_PACKED, &levels, max_level, false); + test_internal_roundtrip(Encoding::RLE, &levels, max_level, true); + } + + #[test] + fn test_roundtrip_random() { + // This test is mainly for bit packed level encoder/decoder + let mut levels = Vec::new(); + let max_level = 5; + random_numbers_range::(120, 0, max_level, &mut levels); + test_internal_roundtrip(Encoding::RLE, &levels, max_level, false); + test_internal_roundtrip(Encoding::BIT_PACKED, &levels, max_level, false); + test_internal_roundtrip(Encoding::RLE, &levels, max_level, true); + } + + #[test] + fn test_roundtrip_underflow() { + let levels = vec![1, 1, 2, 3, 2, 1, 1, 2, 3, 1]; + let max_level = 3; + test_internal_roundtrip_underflow(Encoding::RLE, &levels, max_level, false); + test_internal_roundtrip_underflow(Encoding::BIT_PACKED, &levels, max_level, false); + test_internal_roundtrip_underflow(Encoding::RLE, &levels, max_level, true); + } + + #[test] + fn test_roundtrip_overflow() { + let levels = vec![1, 1, 2, 3, 2, 1, 1, 2, 3, 1]; + let max_level = 3; + test_internal_roundtrip_overflow(Encoding::RLE, &levels, max_level, false); + test_internal_roundtrip_overflow(Encoding::BIT_PACKED, &levels, max_level, false); + test_internal_roundtrip_overflow(Encoding::RLE, &levels, max_level, true); + } + + #[test] + fn test_rle_decoder_set_data_range() { + // Buffer containing both repetition and definition levels + let buffer = ByteBufferPtr::new(vec![5, 198, 2, 5, 42, 168, 10, 0, 2, 3, 36, 73]); + + let max_rep_level = 1; + let mut decoder = LevelDecoder::v2(max_rep_level); + assert_eq!(decoder.set_data_range(10, &buffer, 0, 3), 3); + let mut result = vec![0; 10]; + let num_decoded = decoder.get(&mut result).expect("get() should be OK"); + assert_eq!(num_decoded, 10); + assert_eq!(result, vec![0, 1, 1, 0, 0, 0, 1, 1, 0, 1]); + + let max_def_level = 2; + let mut decoder = LevelDecoder::v2(max_def_level); + assert_eq!(decoder.set_data_range(10, &buffer, 3, 5), 5); + let mut result = vec![0; 10]; + let num_decoded = decoder.get(&mut result).expect("get() should be OK"); + assert_eq!(num_decoded, 10); + assert_eq!(result, vec![2, 2, 2, 0, 0, 2, 2, 2, 2, 2]); + } + + #[test] + #[should_panic(expected = "set_data_range() method is only supported by RLE v2 encoding type")] + fn test_bit_packed_decoder_set_data_range() { + // Buffer containing both repetition and definition levels + let buffer = ByteBufferPtr::new(vec![1, 2, 3, 4, 5]); + let max_level = 1; + let mut decoder = LevelDecoder::v1(Encoding::BIT_PACKED, max_level); + decoder.set_data_range(10, &buffer, 0, 3); + } + + #[test] + fn test_bit_packed_decoder_set_data() { + // Test the maximum size that is assigned based on number of values and buffer length + let buffer = ByteBufferPtr::new(vec![1, 2, 3, 4, 5]); + let max_level = 1; + let mut decoder = LevelDecoder::v1(Encoding::BIT_PACKED, max_level); + // This should reset to entire buffer + assert_eq!(decoder.set_data(1024, buffer.all()), buffer.len()); + // This should set smallest num bytes + assert_eq!(decoder.set_data(3, buffer.all()), 1); + } + + #[test] + #[should_panic(expected = "No data set for decoding")] + fn test_rle_level_decoder_get_no_set_data() { + // `get()` normally panics because bit_reader 
is not set for RLE decoding + // we have explicit check now in set_data + let max_rep_level = 2; + let mut decoder = LevelDecoder::v1(Encoding::RLE, max_rep_level); + let mut buffer = vec![0; 16]; + decoder.get(&mut buffer).unwrap(); + } + + #[test] + #[should_panic(expected = "No data set for decoding")] + fn test_bit_packed_level_decoder_get_no_set_data() { + let max_rep_level = 2; + let mut decoder = LevelDecoder::v1(Encoding::BIT_PACKED, max_rep_level); + let mut buffer = vec![0; 16]; + decoder.get(&mut buffer).unwrap(); + } +} diff --git a/rust/src/parquet/encodings/mod.rs b/rust/src/parquet/encodings/mod.rs new file mode 100644 index 0000000000000..33b1e233d8931 --- /dev/null +++ b/rust/src/parquet/encodings/mod.rs @@ -0,0 +1,21 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +pub mod decoding; +pub mod encoding; +pub mod levels; +mod rle; diff --git a/rust/src/parquet/encodings/rle.rs b/rust/src/parquet/encodings/rle.rs new file mode 100644 index 0000000000000..5b56c2a250495 --- /dev/null +++ b/rust/src/parquet/encodings/rle.rs @@ -0,0 +1,839 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::{ + cmp, + mem::{size_of, transmute_copy}, +}; + +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::util::{ + bit_util::{self, BitReader, BitWriter}, + memory::ByteBufferPtr, +}; + +/// Rle/Bit-Packing Hybrid Encoding +/// The grammar for this encoding looks like the following (copied verbatim +/// from https://github.com/Parquet/parquet-format/blob/master/Encodings.md): +/// +/// rle-bit-packed-hybrid: +/// length := length of the in bytes stored as 4 bytes little endian +/// encoded-data := * +/// run := | +/// bit-packed-run := +/// bit-packed-header := varint-encode( << 1 | 1) +/// we always bit-pack a multiple of 8 values at a time, so we only store the number of +/// values / 8 +/// bit-pack-count := (number of values in this run) / 8 +/// bit-packed-values := *see 1 below* +/// rle-run := +/// rle-header := varint-encode( (number of times repeated) << 1) +/// repeated-value := value that is repeated, using a fixed-width of +/// round-up-to-next-byte(bit-width) + +/// Maximum groups per bit-packed run. Current value is 64. +const MAX_GROUPS_PER_BIT_PACKED_RUN: usize = 1 << 6; +const MAX_VALUES_PER_BIT_PACKED_RUN: usize = MAX_GROUPS_PER_BIT_PACKED_RUN * 8; +const MAX_WRITER_BUF_SIZE: usize = 1 << 10; + +/// A RLE/Bit-Packing hybrid encoder. +// TODO: tracking memory usage +pub struct RleEncoder { + // Number of bits needed to encode the value. Must be in the range of [0, 64]. + bit_width: u8, + + // Underlying writer which holds an internal buffer. + bit_writer: BitWriter, + + // If this is true, the buffer is full and subsequent `put()` calls will fail. + buffer_full: bool, + + // The maximum byte size a single run can take. + max_run_byte_size: usize, + + // Buffered values for bit-packed runs. + buffered_values: [u64; 8], + + // Number of current buffered values. Must be less than 8. + num_buffered_values: usize, + + // The current (also last) value that was written and the count of how many + // times in a row that value has been seen. + current_value: u64, + + // The number of repetitions for `current_value`. If this gets too high we'd + // switch to use RLE encoding. + repeat_count: usize, + + // Number of bit-packed values in the current run. This doesn't include values + // in `buffered_values`. + bit_packed_count: usize, + + // The position of the indicator byte in the `bit_writer`. + indicator_byte_pos: i64, +} + +impl RleEncoder { + pub fn new(bit_width: u8, buffer_len: usize) -> Self { + let buffer = vec![0; buffer_len]; + RleEncoder::new_from_buf(bit_width, buffer, 0) + } + + /// Initialize the encoder from existing `buffer` and the starting offset `start`. + pub fn new_from_buf(bit_width: u8, buffer: Vec, start: usize) -> Self { + assert!(bit_width <= 64, "bit_width ({}) out of range.", bit_width); + let max_run_byte_size = RleEncoder::min_buffer_size(bit_width); + assert!( + buffer.len() >= max_run_byte_size, + "buffer length {} must be greater than {}", + buffer.len(), + max_run_byte_size + ); + let bit_writer = BitWriter::new_from_buf(buffer, start); + RleEncoder { + bit_width, + bit_writer, + buffer_full: false, + max_run_byte_size, + buffered_values: [0; 8], + num_buffered_values: 0, + current_value: 0, + repeat_count: 0, + bit_packed_count: 0, + indicator_byte_pos: -1, + } + } + + /// Returns the minimum buffer size needed to use the encoder for `bit_width`. + /// This is the maximum length of a single run for `bit_width`. 
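+    ///
+    /// For example, with `bit_width == 1` a maximal bit-packed run holds 512 values
+    /// and takes 1 indicator byte plus 64 data bytes.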
+ pub fn min_buffer_size(bit_width: u8) -> usize { + let max_bit_packed_run_size = 1 + bit_util::ceil( + (MAX_VALUES_PER_BIT_PACKED_RUN * bit_width as usize) as i64, + 8, + ); + let max_rle_run_size = + bit_util::MAX_VLQ_BYTE_LEN + bit_util::ceil(bit_width as i64, 8) as usize; + ::std::cmp::max(max_bit_packed_run_size as usize, max_rle_run_size) + } + + /// Returns the maximum buffer size takes to encode `num_values` values with + /// `bit_width`. + pub fn max_buffer_size(bit_width: u8, num_values: usize) -> usize { + // First the maximum size for bit-packed run + let bytes_per_run = bit_width; + let num_runs = bit_util::ceil(num_values as i64, 8) as usize; + let bit_packed_max_size = num_runs + num_runs * bytes_per_run as usize; + + // Second the maximum size for RLE run + let min_rle_run_size = 1 + bit_util::ceil(bit_width as i64, 8) as usize; + let rle_max_size = bit_util::ceil(num_values as i64, 8) as usize * min_rle_run_size; + ::std::cmp::max(bit_packed_max_size, rle_max_size) as usize + } + + /// Encodes `value`, which must be representable with `bit_width` bits. + /// Returns true if the value fits in buffer, false if it doesn't, or + /// error if something is wrong. + #[inline] + pub fn put(&mut self, value: u64) -> Result { + // This function buffers 8 values at a time. After seeing 8 values, it + // decides whether the current run should be encoded in bit-packed or RLE. + if self.buffer_full { + // The value cannot fit in the current buffer. + return Ok(false); + } + if self.current_value == value { + self.repeat_count += 1; + if self.repeat_count > 8 { + // A continuation of last value. No need to buffer. + return Ok(true); + } + } else { + if self.repeat_count >= 8 { + // The current RLE run has ended and we've gathered enough. Flush first. + assert_eq!(self.bit_packed_count, 0); + self.flush_rle_run()?; + } + self.repeat_count = 1; + self.current_value = value; + } + + self.buffered_values[self.num_buffered_values] = value; + self.num_buffered_values += 1; + if self.num_buffered_values == 8 { + // Buffered values are full. Flush them. + assert_eq!(self.bit_packed_count % 8, 0); + self.flush_buffered_values()?; + } + + Ok(true) + } + + #[inline] + pub fn buffer(&self) -> &[u8] { + self.bit_writer.buffer() + } + + #[inline] + pub fn len(&self) -> usize { + self.bit_writer.bytes_written() + } + + #[inline] + pub fn consume(mut self) -> Result> { + self.flush()?; + Ok(self.bit_writer.consume()) + } + + /// Borrow equivalent of the `consume` method. + /// Call `clear()` after invoking this method. + #[inline] + pub fn flush_buffer(&mut self) -> Result<&[u8]> { + self.flush()?; + Ok(self.bit_writer.flush_buffer()) + } + + /// Clears the internal state so this encoder can be reused (e.g., after becoming full). + #[inline] + pub fn clear(&mut self) { + self.bit_writer.clear(); + self.buffer_full = false; + self.num_buffered_values = 0; + self.current_value = 0; + self.repeat_count = 0; + self.bit_packed_count = 0; + self.indicator_byte_pos = -1; + } + + /// Flushes all remaining values and return the final byte buffer maintained by the + /// internal writer. + #[inline] + pub fn flush(&mut self) -> Result<()> { + if self.bit_packed_count > 0 || self.repeat_count > 0 || self.num_buffered_values > 0 { + let all_repeat = self.bit_packed_count == 0 + && (self.repeat_count == self.num_buffered_values || self.num_buffered_values == 0); + if self.repeat_count > 0 && all_repeat { + self.flush_rle_run()?; + } else { + // Buffer the last group of bit-packed values to 8 by padding with 0s. 
+ if self.num_buffered_values > 0 { + while self.num_buffered_values < 8 { + self.buffered_values[self.num_buffered_values] = 0; + self.num_buffered_values += 1; + } + } + self.bit_packed_count += self.num_buffered_values; + self.flush_bit_packed_run(true)?; + self.repeat_count = 0; + } + } + Ok(()) + } + + #[inline] + fn flush_rle_run(&mut self) -> Result<()> { + assert!(self.repeat_count > 0); + let indicator_value = self.repeat_count << 1 | 0; + let mut result = self.bit_writer.put_vlq_int(indicator_value as u64); + result &= self.bit_writer.put_aligned( + self.current_value, + bit_util::ceil(self.bit_width as i64, 8) as usize, + ); + if !result { + return Err(general_err!("Failed to write RLE run")); + } + self.num_buffered_values = 0; + self.repeat_count = 0; + Ok(()) + } + + #[inline] + fn flush_bit_packed_run(&mut self, update_indicator_byte: bool) -> Result<()> { + if self.indicator_byte_pos < 0 { + self.indicator_byte_pos = self.bit_writer.skip(1)? as i64; + } + + // Write all buffered values as bit-packed literals + for i in 0..self.num_buffered_values { + let _ = self + .bit_writer + .put_value(self.buffered_values[i], self.bit_width as usize); + } + self.num_buffered_values = 0; + if update_indicator_byte { + // Write the indicator byte to the reserved position in `bit_writer` + let num_groups = self.bit_packed_count / 8; + let indicator_byte = ((num_groups << 1) | 1) as u8; + if !self.bit_writer.put_aligned_offset( + indicator_byte, + 1, + self.indicator_byte_pos as usize, + ) { + return Err(general_err!("Not enough space to write indicator byte")); + } + self.indicator_byte_pos = -1; + self.bit_packed_count = 0; + } + Ok(()) + } + + #[inline] + fn flush_buffered_values(&mut self) -> Result<()> { + if self.repeat_count >= 8 { + self.num_buffered_values = 0; + if self.bit_packed_count > 0 { + // In this case we choose RLE encoding. Flush the current buffered values + // as bit-packed encoding. + assert_eq!(self.bit_packed_count % 8, 0); + self.flush_bit_packed_run(true)? + } + return Ok(()); + } + + self.bit_packed_count += self.num_buffered_values; + let num_groups = self.bit_packed_count / 8; + if num_groups + 1 >= MAX_GROUPS_PER_BIT_PACKED_RUN { + // We've reached the maximum value that can be hold in a single bit-packed run. + assert!(self.indicator_byte_pos >= 0); + self.flush_bit_packed_run(true)?; + } else { + self.flush_bit_packed_run(false)?; + } + self.repeat_count = 0; + Ok(()) + } +} + +/// A RLE/Bit-Packing hybrid decoder. +pub struct RleDecoder { + // Number of bits used to encode the value. Must be between [0, 64]. + bit_width: u8, + + // Bit reader loaded with input buffer. + bit_reader: Option, + + // Buffer used when `bit_reader` is not `None`, for batch reading. 
+ index_buf: Option<[i32; 1024]>, + + // The remaining number of values in RLE for this run + rle_left: u32, + + // The remaining number of values in Bit-Packing for this run + bit_packed_left: u32, + + // The current value for the case of RLE mode + current_value: Option, +} + +impl RleDecoder { + pub fn new(bit_width: u8) -> Self { + RleDecoder { + bit_width, + rle_left: 0, + bit_packed_left: 0, + bit_reader: None, + index_buf: None, + current_value: None, + } + } + + pub fn set_data(&mut self, data: ByteBufferPtr) { + if let Some(ref mut bit_reader) = self.bit_reader { + bit_reader.reset(data); + } else { + self.bit_reader = Some(BitReader::new(data)); + self.index_buf = Some([0; 1024]); + } + + let _ = self.reload(); + } + + #[inline] + pub fn get(&mut self) -> Result> { + assert!(size_of::() <= 8); + + while self.rle_left <= 0 && self.bit_packed_left <= 0 { + if !self.reload() { + return Ok(None); + } + } + + let value = if self.rle_left > 0 { + let rle_value = unsafe { + transmute_copy::( + self.current_value + .as_mut() + .expect("current_value should be Some"), + ) + }; + self.rle_left -= 1; + rle_value + } else { + // self.bit_packed_left > 0 + let bit_reader = self.bit_reader.as_mut().expect("bit_reader should be Some"); + let bit_packed_value = bit_reader + .get_value(self.bit_width as usize) + .ok_or(eof_err!("Not enough data for 'bit_packed_value'"))?; + self.bit_packed_left -= 1; + bit_packed_value + }; + + Ok(Some(value)) + } + + #[inline] + pub fn get_batch(&mut self, buffer: &mut [T]) -> Result { + assert!(self.bit_reader.is_some()); + assert!(size_of::() <= 8); + + let mut values_read = 0; + while values_read < buffer.len() { + if self.rle_left > 0 { + assert!(self.current_value.is_some()); + let num_values = cmp::min(buffer.len() - values_read, self.rle_left as usize); + for i in 0..num_values { + let repeated_value = + unsafe { transmute_copy::(self.current_value.as_mut().unwrap()) }; + buffer[values_read + i] = repeated_value; + } + self.rle_left -= num_values as u32; + values_read += num_values; + } else if self.bit_packed_left > 0 { + assert!(self.bit_reader.is_some()); + let mut num_values = + cmp::min(buffer.len() - values_read, self.bit_packed_left as usize); + if let Some(ref mut bit_reader) = self.bit_reader { + num_values = bit_reader.get_batch::( + &mut buffer[values_read..values_read + num_values], + self.bit_width as usize, + ); + self.bit_packed_left -= num_values as u32; + values_read += num_values; + } + } else { + if !self.reload() { + break; + } + } + } + + Ok(values_read) + } + + #[inline] + pub fn get_batch_with_dict( + &mut self, + dict: &[T], + buffer: &mut [T], + max_values: usize, + ) -> Result + where + T: Default + Clone, + { + assert!(buffer.len() >= max_values); + + let mut values_read = 0; + while values_read < max_values { + if self.rle_left > 0 { + assert!(self.current_value.is_some()); + let num_values = cmp::min(max_values - values_read, self.rle_left as usize); + let dict_idx = self.current_value.unwrap() as usize; + for i in 0..num_values { + buffer[values_read + i] = dict[dict_idx].clone(); + } + self.rle_left -= num_values as u32; + values_read += num_values; + } else if self.bit_packed_left > 0 { + assert!(self.bit_reader.is_some()); + let mut num_values = + cmp::min(max_values - values_read, self.bit_packed_left as usize); + if let Some(ref mut bit_reader) = self.bit_reader { + let mut index_buf = self.index_buf.unwrap(); + num_values = cmp::min(num_values, index_buf.len()); + loop { + num_values = bit_reader.get_batch::( + &mut 
index_buf[..num_values], + self.bit_width as usize, + ); + for i in 0..num_values { + buffer[values_read + i] = dict[index_buf[i] as usize].clone(); + } + self.bit_packed_left -= num_values as u32; + values_read += num_values; + if num_values < index_buf.len() { + break; + } + } + } + } else { + if !self.reload() { + break; + } + } + } + + Ok(values_read) + } + + #[inline] + fn reload(&mut self) -> bool { + assert!(self.bit_reader.is_some()); + if let Some(ref mut bit_reader) = self.bit_reader { + if let Some(indicator_value) = bit_reader.get_vlq_int() { + if indicator_value & 1 == 1 { + self.bit_packed_left = ((indicator_value >> 1) * 8) as u32; + } else { + self.rle_left = (indicator_value >> 1) as u32; + let value_width = bit_util::ceil(self.bit_width as i64, 8); + self.current_value = bit_reader.get_aligned::(value_width as usize); + assert!(self.current_value.is_some()); + } + return true; + } else { + return false; + } + } + return false; + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use rand::{ + self, + distributions::{Distribution, Standard}, + thread_rng, Rng, SeedableRng, + }; + + use crate::parquet::util::memory::ByteBufferPtr; + + const MAX_WIDTH: usize = 32; + + #[test] + fn test_rle_decode_int32() { + // Test data: 0-7 with bit width 3 + // 00000011 10001000 11000110 11111010 + let data = ByteBufferPtr::new(vec![0x03, 0x88, 0xC6, 0xFA]); + let mut decoder: RleDecoder = RleDecoder::new(3); + decoder.set_data(data); + let mut buffer = vec![0; 8]; + let expected = vec![0, 1, 2, 3, 4, 5, 6, 7]; + let result = decoder.get_batch::(&mut buffer); + assert!(result.is_ok()); + assert_eq!(buffer, expected); + } + + #[test] + fn test_rle_consume_flush_buffer() { + let data = vec![1, 1, 1, 2, 2, 3, 3, 3]; + let mut encoder1 = RleEncoder::new(3, 256); + let mut encoder2 = RleEncoder::new(3, 256); + for value in data { + encoder1.put(value as u64).unwrap(); + encoder2.put(value as u64).unwrap(); + } + let res1 = encoder1.flush_buffer().unwrap(); + let res2 = encoder2.consume().unwrap(); + assert_eq!(res1, &res2[..]); + } + + #[test] + fn test_rle_decode_bool() { + // RLE test data: 50 1s followed by 50 0s + // 01100100 00000001 01100100 00000000 + let data1 = ByteBufferPtr::new(vec![0x64, 0x01, 0x64, 0x00]); + + // Bit-packing test data: alternating 1s and 0s, 100 total + // 100 / 8 = 13 groups + // 00011011 10101010 ... 
00001010 + let data2 = ByteBufferPtr::new(vec![ + 0x1B, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x0A, + ]); + + let mut decoder: RleDecoder = RleDecoder::new(1); + decoder.set_data(data1); + let mut buffer = vec![false; 100]; + let mut expected = vec![]; + for i in 0..100 { + if i < 50 { + expected.push(true); + } else { + expected.push(false); + } + } + let result = decoder.get_batch::(&mut buffer); + assert!(result.is_ok()); + assert_eq!(buffer, expected); + + decoder.set_data(data2); + let mut buffer = vec![false; 100]; + let mut expected = vec![]; + for i in 0..100 { + if i % 2 == 0 { + expected.push(false); + } else { + expected.push(true); + } + } + let result = decoder.get_batch::(&mut buffer); + assert!(result.is_ok()); + assert_eq!(buffer, expected); + } + + #[test] + fn test_rle_decode_with_dict_int32() { + // Test RLE encoding: 3 0s followed by 4 1s followed by 5 2s + // 00000110 00000000 00001000 00000001 00001010 00000010 + let dict = vec![10, 20, 30]; + let data = ByteBufferPtr::new(vec![0x06, 0x00, 0x08, 0x01, 0x0A, 0x02]); + let mut decoder: RleDecoder = RleDecoder::new(3); + decoder.set_data(data); + let mut buffer = vec![0; 12]; + let expected = vec![10, 10, 10, 20, 20, 20, 20, 30, 30, 30, 30, 30]; + let result = decoder.get_batch_with_dict::(&dict, &mut buffer, 12); + assert!(result.is_ok()); + assert_eq!(buffer, expected); + + // Test bit-pack encoding: 345345345455 (2 groups: 8 and 4) + // 011 100 101 011 100 101 011 100 101 100 101 101 + // 00000011 01100011 11000111 10001110 00000011 01100101 00001011 + let dict = vec!["aaa", "bbb", "ccc", "ddd", "eee", "fff"]; + let data = ByteBufferPtr::new(vec![0x03, 0x63, 0xC7, 0x8E, 0x03, 0x65, 0x0B]); + let mut decoder: RleDecoder = RleDecoder::new(3); + decoder.set_data(data); + let mut buffer = vec![""; 12]; + let expected = vec![ + "ddd", "eee", "fff", "ddd", "eee", "fff", "ddd", "eee", "fff", "eee", "fff", "fff", + ]; + let result = + decoder.get_batch_with_dict::<&str>(dict.as_slice(), buffer.as_mut_slice(), 12); + assert!(result.is_ok()); + assert_eq!(buffer, expected); + } + + fn validate_rle( + values: &[i64], + bit_width: u8, + expected_encoding: Option<&[u8]>, + expected_len: i32, + ) { + let buffer_len = 64 * 1024; + let mut encoder = RleEncoder::new(bit_width, buffer_len); + for v in values { + let result = encoder.put(*v as u64); + assert!(result.is_ok()); + } + let buffer = ByteBufferPtr::new(encoder.consume().expect("Expect consume() OK")); + if expected_len != -1 { + assert_eq!(buffer.len(), expected_len as usize); + } + match expected_encoding { + Some(b) => assert_eq!(buffer.as_ref(), b), + _ => (), + } + + // Verify read + let mut decoder = RleDecoder::new(bit_width); + decoder.set_data(buffer.all()); + for v in values { + let val: i64 = decoder + .get() + .expect("get() should be OK") + .expect("get() should return more value"); + assert_eq!(val, *v); + } + + // Verify batch read + decoder.set_data(buffer); + let mut values_read: Vec = vec![0; values.len()]; + decoder + .get_batch(&mut values_read[..]) + .expect("get_batch() should be OK"); + assert_eq!(&values_read[..], values); + } + + #[test] + fn test_rle_specific_sequences() { + let mut expected_buffer = Vec::new(); + let mut values = Vec::new(); + for _ in 0..50 { + values.push(0); + } + for _ in 0..50 { + values.push(1); + } + expected_buffer.push(50 << 1); + expected_buffer.push(0); + expected_buffer.push(50 << 1); + expected_buffer.push(1); + + for width in 1..9 { + validate_rle(&values[..], width, 
Some(&expected_buffer[..]), 4); + } + for width in 9..MAX_WIDTH + 1 { + validate_rle( + &values[..], + width as u8, + None, + 2 * (1 + bit_util::ceil(width as i64, 8) as i32), + ); + } + + // Test 100 0's and 1's alternating + values.clear(); + expected_buffer.clear(); + for i in 0..101 { + values.push(i % 2); + } + let num_groups = bit_util::ceil(100, 8) as u8; + expected_buffer.push(((num_groups << 1) as u8) | 1); + for _ in 1..(100 / 8) + 1 { + expected_buffer.push(0b10101010); + } + // For the last 4 0 and 1's, padded with 0. + expected_buffer.push(0b00001010); + validate_rle( + &values, + 1, + Some(&expected_buffer[..]), + 1 + num_groups as i32, + ); + for width in 2..MAX_WIDTH + 1 { + let num_values = bit_util::ceil(100, 8) * 8; + validate_rle( + &values, + width as u8, + None, + 1 + bit_util::ceil(width as i64 * num_values, 8) as i32, + ); + } + } + + // `validate_rle` on `num_vals` with width `bit_width`. If `value` is -1, that value + // is used, otherwise alternating values are used. + fn test_rle_values(bit_width: usize, num_vals: usize, value: i32) { + let mod_val = if bit_width == 64 { + 1 + } else { + 1u64 << bit_width + }; + let mut values: Vec = vec![]; + for v in 0..num_vals { + let val = if value == -1 { + v as i64 % mod_val as i64 + } else { + value as i64 + }; + values.push(val); + } + validate_rle(&values, bit_width as u8, None, -1); + } + + #[test] + fn test_values() { + for width in 1..MAX_WIDTH + 1 { + test_rle_values(width, 1, -1); + test_rle_values(width, 1024, -1); + test_rle_values(width, 1024, 0); + test_rle_values(width, 1024, 1); + } + } + + #[test] + fn test_rle_specific_roundtrip() { + let bit_width = 1; + let buffer_len = RleEncoder::min_buffer_size(bit_width); + let values: Vec = vec![0, 1, 1, 1, 1, 0, 0, 0, 0, 1]; + let mut encoder = RleEncoder::new(bit_width, buffer_len); + for v in &values { + assert!(encoder.put(*v as u64).expect("put() should be OK")); + } + let buffer = encoder.consume().expect("consume() should be OK"); + let mut decoder = RleDecoder::new(bit_width); + decoder.set_data(ByteBufferPtr::new(buffer)); + let mut actual_values: Vec = vec![0; values.len()]; + decoder + .get_batch(&mut actual_values) + .expect("get_batch() should be OK"); + assert_eq!(actual_values, values); + } + + fn test_round_trip(values: &[i32], bit_width: u8) { + let buffer_len = 64 * 1024; + let mut encoder = RleEncoder::new(bit_width, buffer_len); + for v in values { + let result = encoder.put(*v as u64).expect("put() should be OK"); + assert!(result, "put() should not return false"); + } + + let buffer = ByteBufferPtr::new(encoder.consume().expect("consume() should be OK")); + + // Verify read + let mut decoder = RleDecoder::new(bit_width); + decoder.set_data(buffer.all()); + for v in values { + let val = decoder + .get::() + .expect("get() should be OK") + .expect("get() should return value"); + assert_eq!(val, *v); + } + + // Verify batch read + let mut decoder = RleDecoder::new(bit_width); + decoder.set_data(buffer); + let mut values_read: Vec = vec![0; values.len()]; + decoder + .get_batch(&mut values_read[..]) + .expect("get_batch() should be OK"); + assert_eq!(&values_read[..], values); + } + + #[test] + fn test_random() { + let seed_len = 32; + let niters = 50; + let ngroups = 1000; + let max_group_size = 15; + let mut values = vec![]; + + for _ in 0..niters { + values.clear(); + let mut rng = thread_rng(); + let seed_vec: Vec = Standard.sample_iter(&mut rng).take(seed_len).collect(); + let mut seed = [0u8; 32]; + 
seed.copy_from_slice(&seed_vec[0..seed_len]); + let mut gen = rand::StdRng::from_seed(seed); + + let mut parity = false; + for _ in 0..ngroups { + let mut group_size = gen.gen_range::(1, 20); + if group_size > max_group_size { + group_size = 1; + } + for _ in 0..group_size { + values.push(parity as i32); + } + parity = !parity; + } + let bit_width = bit_util::num_required_bits(values.len() as u64); + assert!(bit_width < 64); + test_round_trip(&values[..], bit_width as u8); + } + } +} diff --git a/rust/src/parquet/errors.rs b/rust/src/parquet/errors.rs new file mode 100644 index 0000000000000..a5532c1eb66dc --- /dev/null +++ b/rust/src/parquet/errors.rs @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Common Parquet errors and macros. + +use std::{cell, convert, io, result}; + +use quick_error::quick_error; +use snap; +use thrift; + +quick_error! { + /// Set of errors that can be produced during different operations in Parquet. + #[derive(Debug, PartialEq)] + pub enum ParquetError { + /// General Parquet error. + /// Returned when code violates normal workflow of working with Parquet files. + General(message: String) { + display("Parquet error: {}", message) + description(message) + from(e: io::Error) -> (format!("underlying IO error: {}", e)) + from(e: snap::Error) -> (format!("underlying snap error: {}", e)) + from(e: thrift::Error) -> (format!("underlying Thrift error: {}", e)) + from(e: cell::BorrowMutError) -> (format!("underlying borrow error: {}", e)) + } + /// "Not yet implemented" Parquet error. + /// Returned when functionality is not yet available. + NYI(message: String) { + display("NYI: {}", message) + description(message) + } + /// "End of file" Parquet error. + /// Returned when IO related failures occur, e.g. when there are not enough bytes to + /// decode. + EOF(message: String) { + display("EOF: {}", message) + description(message) + } + } +} + +/// A specialized `Result` for Parquet errors. +pub type Result = result::Result; + +// ---------------------------------------------------------------------- +// Conversion from `ParquetError` to other types of `Error`s + +impl convert::From for io::Error { + fn from(e: ParquetError) -> Self { + io::Error::new(io::ErrorKind::Other, e) + } +} + +// ---------------------------------------------------------------------- +// Convenient macros for different errors + +macro_rules! general_err { + ($fmt:expr) => (ParquetError::General($fmt.to_owned())); + ($fmt:expr, $($args:expr),*) => (ParquetError::General(format!($fmt, $($args),*))); + ($e:expr, $fmt:expr) => (ParquetError::General($fmt.to_owned(), $e)); + ($e:ident, $fmt:expr, $($args:tt),*) => ( + ParquetError::General(&format!($fmt, $($args),*), $e)); +} + +macro_rules! 
nyi_err { + ($fmt:expr) => (ParquetError::NYI($fmt.to_owned())); + ($fmt:expr, $($args:expr),*) => (ParquetError::NYI(format!($fmt, $($args),*))); +} + +macro_rules! eof_err { + ($fmt:expr) => (ParquetError::EOF($fmt.to_owned())); + ($fmt:expr, $($args:expr),*) => (ParquetError::EOF(format!($fmt, $($args),*))); +} diff --git a/rust/src/parquet/file/metadata.rs b/rust/src/parquet/file/metadata.rs new file mode 100644 index 0000000000000..7f2442506f67f --- /dev/null +++ b/rust/src/parquet/file/metadata.rs @@ -0,0 +1,736 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains information about available Parquet metadata. +//! +//! The hierarchy of metadata is as follows: +//! +//! [`ParquetMetaData`](struct.ParquetMetaData.html) contains +//! [`FileMetaData`](struct.FileMetaData.html) and zero or more +//! [`RowGroupMetaData`](struct.RowGroupMetaData.html) for each row group. +//! +//! [`FileMetaData`](struct.FileMetaData.html) includes file version, application specific +//! metadata. +//! +//! Each [`RowGroupMetaData`](struct.RowGroupMetaData.html) contains information about row +//! group and one or more [`ColumnChunkMetaData`](struct.ColumnChunkMetaData.html) for +//! each column chunk. +//! +//! [`ColumnChunkMetaData`](struct.ColumnChunkMetaData.html) has information about column +//! chunk (primitive leaf column), including encoding/compression, number of values, etc. + +use std::rc::Rc; + +use parquet_format::{ColumnChunk, ColumnMetaData, RowGroup}; + +use crate::parquet::basic::{ColumnOrder, Compression, Encoding, Type}; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::file::statistics::{self, Statistics}; +use crate::parquet::schema::types::{ + ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor, + Type as SchemaType, TypePtr, +}; + +/// Reference counted pointer for [`ParquetMetaData`]. +pub type ParquetMetaDataPtr = Rc; + +/// Global Parquet metadata. +pub struct ParquetMetaData { + file_metadata: FileMetaDataPtr, + row_groups: Vec, +} + +impl ParquetMetaData { + /// Creates Parquet metadata from file metadata and a list of row group metadata `Rc`s + /// for each available row group. + pub fn new(file_metadata: FileMetaData, row_group_ptrs: Vec) -> Self { + ParquetMetaData { + file_metadata: Rc::new(file_metadata), + row_groups: row_group_ptrs, + } + } + + /// Returns file metadata as reference counted clone. + pub fn file_metadata(&self) -> FileMetaDataPtr { + self.file_metadata.clone() + } + + /// Returns number of row groups in this file. + pub fn num_row_groups(&self) -> usize { + self.row_groups.len() + } + + /// Returns row group metadata for `i`th position. + /// Position should be less than number of row groups `num_row_groups`. 
+ pub fn row_group(&self, i: usize) -> RowGroupMetaDataPtr { + self.row_groups[i].clone() + } + + /// Returns slice of row group reference counted pointers in this file. + pub fn row_groups(&self) -> &[RowGroupMetaDataPtr] { + &self.row_groups.as_slice() + } +} + +/// Reference counted pointer for [`FileMetaData`]. +pub type FileMetaDataPtr = Rc; + +/// Metadata for a Parquet file. +pub struct FileMetaData { + version: i32, + num_rows: i64, + created_by: Option, + schema: TypePtr, + schema_descr: SchemaDescPtr, + column_orders: Option>, +} + +impl FileMetaData { + /// Creates new file metadata. + pub fn new( + version: i32, + num_rows: i64, + created_by: Option, + schema: TypePtr, + schema_descr: SchemaDescPtr, + column_orders: Option>, + ) -> Self { + FileMetaData { + version, + num_rows, + created_by, + schema, + schema_descr, + column_orders, + } + } + + /// Returns version of this file. + pub fn version(&self) -> i32 { + self.version + } + + /// Returns number of rows in the file. + pub fn num_rows(&self) -> i64 { + self.num_rows + } + + /// String message for application that wrote this file. + /// + /// This should have the following format: + /// ` version (build )`. + /// + /// ```shell + /// parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b) + /// ``` + pub fn created_by(&self) -> &Option { + &self.created_by + } + + /// Returns Parquet ['Type`] that describes schema in this file. + pub fn schema(&self) -> &SchemaType { + self.schema.as_ref() + } + + /// Returns a reference to schema descriptor. + pub fn schema_descr(&self) -> &SchemaDescriptor { + &self.schema_descr + } + + /// Returns reference counted clone for schema descriptor. + pub fn schema_descr_ptr(&self) -> SchemaDescPtr { + self.schema_descr.clone() + } + + /// Column (sort) order used for `min` and `max` values of each column in this file. + /// + /// Each column order corresponds to one column, determined by its position in the list, + /// matching the position of the column in the schema. + /// + /// When `None` is returned, there are no column orders available, and each column + /// should be assumed to have undefined (legacy) column order. + pub fn column_orders(&self) -> Option<&Vec> { + self.column_orders.as_ref() + } + + /// Returns column order for `i`th column in this file. + /// If column orders are not available, returns undefined (legacy) column order. + pub fn column_order(&self, i: usize) -> ColumnOrder { + self.column_orders + .as_ref() + .map(|data| data[i]) + .unwrap_or(ColumnOrder::UNDEFINED) + } +} + +/// Reference counted pointer for [`RowGroupMetaData`]. +pub type RowGroupMetaDataPtr = Rc; + +/// Metadata for a row group. +pub struct RowGroupMetaData { + columns: Vec, + num_rows: i64, + total_byte_size: i64, + schema_descr: SchemaDescPtr, +} + +impl RowGroupMetaData { + /// Returns builer for row group metadata. + pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder { + RowGroupMetaDataBuilder::new(schema_descr) + } + + /// Number of columns in this row group. + pub fn num_columns(&self) -> usize { + self.columns.len() + } + + /// Returns column chunk metadata for `i`th column. + pub fn column(&self, i: usize) -> &ColumnChunkMetaData { + &self.columns[i] + } + + /// Returns slice of column chunk metadata [`Rc`] pointers. + pub fn columns(&self) -> &[ColumnChunkMetaDataPtr] { + &self.columns + } + + /// Number of rows in this row group. 
+ pub fn num_rows(&self) -> i64 { + self.num_rows + } + + /// Total byte size of all uncompressed column data in this row group. + pub fn total_byte_size(&self) -> i64 { + self.total_byte_size + } + + /// Returns reference to a schema descriptor. + pub fn schema_descr(&self) -> &SchemaDescriptor { + self.schema_descr.as_ref() + } + + /// Returns reference counted clone of schema descriptor. + pub fn schema_descr_ptr(&self) -> SchemaDescPtr { + self.schema_descr.clone() + } + + /// Method to convert from Thrift. + pub fn from_thrift(schema_descr: SchemaDescPtr, mut rg: RowGroup) -> Result { + assert_eq!(schema_descr.num_columns(), rg.columns.len()); + let total_byte_size = rg.total_byte_size; + let num_rows = rg.num_rows; + let mut columns = vec![]; + for (c, d) in rg.columns.drain(0..).zip(schema_descr.columns()) { + let cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?; + columns.push(Rc::new(cc)); + } + Ok(RowGroupMetaData { + columns, + num_rows, + total_byte_size, + schema_descr, + }) + } + + /// Method to convert to Thrift. + pub fn to_thrift(&self) -> RowGroup { + RowGroup { + columns: self.columns().into_iter().map(|v| v.to_thrift()).collect(), + total_byte_size: self.total_byte_size, + num_rows: self.num_rows, + sorting_columns: None, + } + } +} + +/// Builder for row group metadata. +pub struct RowGroupMetaDataBuilder { + columns: Vec, + schema_descr: SchemaDescPtr, + num_rows: i64, + total_byte_size: i64, +} + +impl RowGroupMetaDataBuilder { + /// Creates new builder from schema descriptor. + fn new(schema_descr: SchemaDescPtr) -> Self { + Self { + columns: Vec::with_capacity(schema_descr.num_columns()), + schema_descr, + num_rows: 0, + total_byte_size: 0, + } + } + + /// Sets number of rows in this row group. + pub fn set_num_rows(mut self, value: i64) -> Self { + self.num_rows = value; + self + } + + /// Sets total size in bytes for this row group. + pub fn set_total_byte_size(mut self, value: i64) -> Self { + self.total_byte_size = value; + self + } + + /// Sets column metadata for this row group. + pub fn set_column_metadata(mut self, value: Vec) -> Self { + self.columns = value; + self + } + + /// Builds row group metadata. + pub fn build(self) -> Result { + if self.schema_descr.num_columns() != self.columns.len() { + return Err(general_err!( + "Column length mismatch: {} != {}", + self.schema_descr.num_columns(), + self.columns.len() + )); + } + + Ok(RowGroupMetaData { + columns: self.columns, + num_rows: self.num_rows, + total_byte_size: self.total_byte_size, + schema_descr: self.schema_descr, + }) + } +} + +/// Reference counted pointer for [`ColumnChunkMetaData`]. +pub type ColumnChunkMetaDataPtr = Rc; + +/// Metadata for a column chunk. +pub struct ColumnChunkMetaData { + column_type: Type, + column_path: ColumnPath, + column_descr: ColumnDescPtr, + encodings: Vec, + file_path: Option, + file_offset: i64, + num_values: i64, + compression: Compression, + total_compressed_size: i64, + total_uncompressed_size: i64, + data_page_offset: i64, + index_page_offset: Option, + dictionary_page_offset: Option, + statistics: Option, +} + +/// Represents common operations for a column chunk. +impl ColumnChunkMetaData { + /// Returns builder for column chunk metadata. + pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder { + ColumnChunkMetaDataBuilder::new(column_descr) + } + + /// File where the column chunk is stored. + /// + /// If not set, assumed to belong to the same file as the metadata. + /// This path is relative to the current file. 
+ pub fn file_path(&self) -> Option<&String> { + self.file_path.as_ref() + } + + /// Byte offset in `file_path()`. + pub fn file_offset(&self) -> i64 { + self.file_offset + } + + /// Type of this column. Must be primitive. + pub fn column_type(&self) -> Type { + self.column_type + } + + /// Path (or identifier) of this column. + pub fn column_path(&self) -> &ColumnPath { + &self.column_path + } + + /// Descriptor for this column. + pub fn column_descr(&self) -> &ColumnDescriptor { + self.column_descr.as_ref() + } + + /// Reference counted clone of descriptor for this column. + pub fn column_descr_ptr(&self) -> ColumnDescPtr { + self.column_descr.clone() + } + + /// All encodings used for this column. + pub fn encodings(&self) -> &Vec { + &self.encodings + } + + /// Total number of values in this column chunk. + pub fn num_values(&self) -> i64 { + self.num_values + } + + /// Compression for this column. + pub fn compression(&self) -> Compression { + self.compression + } + + /// Returns the total compressed data size of this column chunk. + pub fn compressed_size(&self) -> i64 { + self.total_compressed_size + } + + /// Returns the total uncompressed data size of this column chunk. + pub fn uncompressed_size(&self) -> i64 { + self.total_uncompressed_size + } + + /// Returns the offset for the column data. + pub fn data_page_offset(&self) -> i64 { + self.data_page_offset + } + + /// Returns `true` if this column chunk contains a index page, `false` otherwise. + pub fn has_index_page(&self) -> bool { + self.index_page_offset.is_some() + } + + /// Returns the offset for the index page. + pub fn index_page_offset(&self) -> Option { + self.index_page_offset + } + + /// Returns `true` if this column chunk contains a dictionary page, `false` otherwise. + pub fn has_dictionary_page(&self) -> bool { + self.dictionary_page_offset.is_some() + } + + /// Returns the offset for the dictionary page, if any. + pub fn dictionary_page_offset(&self) -> Option { + self.dictionary_page_offset + } + + /// Returns statistics that are set for this column chunk, + /// or `None` if no statistics are available. + pub fn statistics(&self) -> Option<&Statistics> { + self.statistics.as_ref() + } + + /// Method to convert from Thrift. + pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result { + if cc.meta_data.is_none() { + return Err(general_err!("Expected to have column metadata")); + } + let mut col_metadata: ColumnMetaData = cc.meta_data.unwrap(); + let column_type = Type::from(col_metadata.type_); + let column_path = ColumnPath::new(col_metadata.path_in_schema); + let encodings = col_metadata + .encodings + .drain(0..) 
+ .map(Encoding::from) + .collect(); + let compression = Compression::from(col_metadata.codec); + let file_path = cc.file_path; + let file_offset = cc.file_offset; + let num_values = col_metadata.num_values; + let total_compressed_size = col_metadata.total_compressed_size; + let total_uncompressed_size = col_metadata.total_uncompressed_size; + let data_page_offset = col_metadata.data_page_offset; + let index_page_offset = col_metadata.index_page_offset; + let dictionary_page_offset = col_metadata.dictionary_page_offset; + let statistics = statistics::from_thrift(column_type, col_metadata.statistics); + let result = ColumnChunkMetaData { + column_type, + column_path, + column_descr, + encodings, + file_path, + file_offset, + num_values, + compression, + total_compressed_size, + total_uncompressed_size, + data_page_offset, + index_page_offset, + dictionary_page_offset, + statistics, + }; + Ok(result) + } + + /// Method to convert to Thrift. + pub fn to_thrift(&self) -> ColumnChunk { + let column_metadata = ColumnMetaData { + type_: self.column_type.into(), + encodings: self.encodings().into_iter().map(|&v| v.into()).collect(), + path_in_schema: Vec::from(self.column_path.as_ref()), + codec: self.compression.into(), + num_values: self.num_values, + total_uncompressed_size: self.total_uncompressed_size, + total_compressed_size: self.total_compressed_size, + key_value_metadata: None, + data_page_offset: self.data_page_offset, + index_page_offset: self.index_page_offset, + dictionary_page_offset: self.dictionary_page_offset, + statistics: statistics::to_thrift(self.statistics.as_ref()), + encoding_stats: None, + }; + + ColumnChunk { + file_path: self.file_path().map(|v| v.clone()), + file_offset: self.file_offset, + meta_data: Some(column_metadata), + offset_index_offset: None, + offset_index_length: None, + column_index_offset: None, + column_index_length: None, + } + } +} + +/// Builder for column chunk metadata. +pub struct ColumnChunkMetaDataBuilder { + column_descr: ColumnDescPtr, + encodings: Vec, + file_path: Option, + file_offset: i64, + num_values: i64, + compression: Compression, + total_compressed_size: i64, + total_uncompressed_size: i64, + data_page_offset: i64, + index_page_offset: Option, + dictionary_page_offset: Option, + statistics: Option, +} + +impl ColumnChunkMetaDataBuilder { + /// Creates new column chunk metadata builder. + fn new(column_descr: ColumnDescPtr) -> Self { + Self { + column_descr, + encodings: Vec::new(), + file_path: None, + file_offset: 0, + num_values: 0, + compression: Compression::UNCOMPRESSED, + total_compressed_size: 0, + total_uncompressed_size: 0, + data_page_offset: 0, + index_page_offset: None, + dictionary_page_offset: None, + statistics: None, + } + } + + /// Sets list of encodings for this column chunk. + pub fn set_encodings(mut self, encodings: Vec) -> Self { + self.encodings = encodings; + self + } + + /// Sets optional file path for this column chunk. + pub fn set_file_path(mut self, value: String) -> Self { + self.file_path = Some(value); + self + } + + /// Sets file offset in bytes. + pub fn set_file_offset(mut self, value: i64) -> Self { + self.file_offset = value; + self + } + + /// Sets number of values. + pub fn set_num_values(mut self, value: i64) -> Self { + self.num_values = value; + self + } + + /// Sets compression. + pub fn set_compression(mut self, value: Compression) -> Self { + self.compression = value; + self + } + + /// Sets total compressed size in bytes. 
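
A sketch of how the hierarchy described in this module's doc comment can be walked with the accessors above; `metadata` is assumed to come from a file reader (for example `SerializedFileReader::metadata()`), and only numeric and boolean accessors are printed so the example stays self-contained:

    use crate::parquet::file::metadata::ParquetMetaData;

    fn describe(metadata: &ParquetMetaData) {
        let file_meta = metadata.file_metadata();
        println!(
            "version {}, {} rows, created by {:?}",
            file_meta.version(),
            file_meta.num_rows(),
            file_meta.created_by()
        );
        for i in 0..metadata.num_row_groups() {
            let rg = metadata.row_group(i);
            println!(
                "row group {}: {} rows, {} bytes",
                i,
                rg.num_rows(),
                rg.total_byte_size()
            );
            for j in 0..rg.num_columns() {
                let col = rg.column(j);
                println!(
                    "  column {}: {} values, {} -> {} compressed bytes, dictionary page: {}",
                    j,
                    col.num_values(),
                    col.uncompressed_size(),
                    col.compressed_size(),
                    col.has_dictionary_page()
                );
            }
        }
    }
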
+ pub fn set_total_compressed_size(mut self, value: i64) -> Self { + self.total_compressed_size = value; + self + } + + /// Sets total uncompressed size in bytes. + pub fn set_total_uncompressed_size(mut self, value: i64) -> Self { + self.total_uncompressed_size = value; + self + } + + /// Sets data page offset in bytes. + pub fn set_data_page_offset(mut self, value: i64) -> Self { + self.data_page_offset = value; + self + } + + /// Sets optional dictionary page ofset in bytes. + pub fn set_dictionary_page_offset(mut self, value: Option) -> Self { + self.dictionary_page_offset = value; + self + } + + /// Sets optional index page offset in bytes. + pub fn set_index_page_offset(mut self, value: Option) -> Self { + self.index_page_offset = value; + self + } + + /// Sets statistics for this column chunk. + pub fn set_statistics(mut self, value: Statistics) -> Self { + self.statistics = Some(value); + self + } + + /// Builds column chunk metadata. + pub fn build(self) -> Result { + Ok(ColumnChunkMetaData { + column_type: self.column_descr.physical_type(), + column_path: self.column_descr.path().clone(), + column_descr: self.column_descr, + encodings: self.encodings, + file_path: self.file_path, + file_offset: self.file_offset, + num_values: self.num_values, + compression: self.compression, + total_compressed_size: self.total_compressed_size, + total_uncompressed_size: self.total_uncompressed_size, + data_page_offset: self.data_page_offset, + index_page_offset: self.index_page_offset, + dictionary_page_offset: self.dictionary_page_offset, + statistics: self.statistics, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_row_group_metadata_thrift_conversion() { + let schema_descr = get_test_schema_descr(); + + let mut columns = vec![]; + for ptr in schema_descr.columns() { + let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap(); + columns.push(Rc::new(column)); + } + let row_group_meta = RowGroupMetaData::builder(schema_descr.clone()) + .set_num_rows(1000) + .set_total_byte_size(2000) + .set_column_metadata(columns) + .build() + .unwrap(); + + let row_group_exp = row_group_meta.to_thrift(); + let row_group_res = + RowGroupMetaData::from_thrift(schema_descr.clone(), row_group_exp.clone()) + .unwrap() + .to_thrift(); + + assert_eq!(row_group_res, row_group_exp); + } + + #[test] + fn test_row_group_metadata_thrift_conversion_empty() { + let schema_descr = get_test_schema_descr(); + + let row_group_meta = RowGroupMetaData::builder(schema_descr.clone()).build(); + + assert!(row_group_meta.is_err()); + if let Err(e) = row_group_meta { + assert_eq!( + e.to_string(), + "Parquet error: Column length mismatch: 2 != 0" + ); + } + } + + #[test] + fn test_column_chunk_metadata_thrift_conversion() { + let column_descr = get_test_schema_descr().column(0); + + let col_metadata = ColumnChunkMetaData::builder(column_descr.clone()) + .set_encodings(vec![Encoding::PLAIN, Encoding::RLE]) + .set_file_path("file_path".to_owned()) + .set_file_offset(100) + .set_num_values(1000) + .set_compression(Compression::SNAPPY) + .set_total_compressed_size(2000) + .set_total_uncompressed_size(3000) + .set_data_page_offset(4000) + .set_dictionary_page_offset(Some(5000)) + .build() + .unwrap(); + + let col_chunk_exp = col_metadata.to_thrift(); + + let col_chunk_res = + ColumnChunkMetaData::from_thrift(column_descr.clone(), col_chunk_exp.clone()) + .unwrap() + .to_thrift(); + + assert_eq!(col_chunk_res, col_chunk_exp); + } + + #[test] + fn 
test_column_chunk_metadata_thrift_conversion_empty() { + let column_descr = get_test_schema_descr().column(0); + + let col_metadata = ColumnChunkMetaData::builder(column_descr.clone()) + .build() + .unwrap(); + + let col_chunk_exp = col_metadata.to_thrift(); + let col_chunk_res = + ColumnChunkMetaData::from_thrift(column_descr.clone(), col_chunk_exp.clone()) + .unwrap() + .to_thrift(); + + assert_eq!(col_chunk_res, col_chunk_exp); + } + + /// Returns sample schema descriptor so we can create column metadata. + fn get_test_schema_descr() -> SchemaDescPtr { + let schema = SchemaType::group_type_builder("schema") + .with_fields(&mut vec![ + Rc::new( + SchemaType::primitive_type_builder("a", Type::INT32) + .build() + .unwrap(), + ), + Rc::new( + SchemaType::primitive_type_builder("b", Type::INT32) + .build() + .unwrap(), + ), + ]) + .build() + .unwrap(); + + Rc::new(SchemaDescriptor::new(Rc::new(schema))) + } +} diff --git a/rust/src/parquet/file/mod.rs b/rust/src/parquet/file/mod.rs new file mode 100644 index 0000000000000..ebaebbad0bb6f --- /dev/null +++ b/rust/src/parquet/file/mod.rs @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Main entrypoint for working with Parquet API. +//! +//! Provides access to file and row group readers and writers, record API, metadata, etc. +//! +//! See [`reader::SerializedFileReader`](reader/struct.SerializedFileReader.html) or +//! [`writer::SerializedFileWriter`](writer/struct.SerializedFileWriter.html) for a +//! starting reference, [`metadata::ParquetMetaData`](metadata/index.html) for file +//! metadata, and [`statistics`](statistics/index.html) for working with statistics. +//! +//! # Example of writing a new file +//! +//! ```rust +//! use std::{fs, path::Path, rc::Rc}; +//! +//! use arrow::parquet::{ +//! file::{ +//! properties::WriterProperties, +//! writer::{FileWriter, SerializedFileWriter}, +//! }, +//! schema::parser::parse_message_type, +//! }; +//! +//! let path = Path::new("target/debug/examples/sample.parquet"); +//! +//! let message_type = " +//! message schema { +//! REQUIRED INT32 b; +//! } +//! "; +//! let schema = Rc::new(parse_message_type(message_type).unwrap()); +//! let props = Rc::new(WriterProperties::builder().build()); +//! let file = fs::File::create(&path).unwrap(); +//! let mut writer = SerializedFileWriter::new(file, schema, props).unwrap(); +//! let mut row_group_writer = writer.next_row_group().unwrap(); +//! while let Some(mut col_writer) = row_group_writer.next_column().unwrap() { +//! // ... write values to a column writer +//! row_group_writer.close_column(col_writer).unwrap(); +//! } +//! writer.close_row_group(row_group_writer).unwrap(); +//! writer.close().unwrap(); +//! +//! let bytes = fs::read(&path).unwrap(); +//! 
assert_eq!(&bytes[0..4], &[b'P', b'A', b'R', b'1']); +//! ``` +//! # Example of reading an existing file +//! +//! ```rust +//! use arrow::parquet::file::reader::{FileReader, SerializedFileReader}; +//! use std::{fs::File, path::Path}; +//! +//! let path = Path::new("target/debug/examples/sample.parquet"); +//! if let Ok(file) = File::open(&path) { +//! let file = File::open(&path).unwrap(); +//! let reader = SerializedFileReader::new(file).unwrap(); +//! +//! let parquet_metadata = reader.metadata(); +//! assert_eq!(parquet_metadata.num_row_groups(), 1); +//! +//! let row_group_reader = reader.get_row_group(0).unwrap(); +//! assert_eq!(row_group_reader.num_columns(), 1); +//! } +//! ``` + +pub mod metadata; +pub mod properties; +pub mod reader; +pub mod statistics; +pub mod writer; + +const FOOTER_SIZE: usize = 8; +const PARQUET_MAGIC: [u8; 4] = [b'P', b'A', b'R', b'1']; diff --git a/rust/src/parquet/file/properties.rs b/rust/src/parquet/file/properties.rs new file mode 100644 index 0000000000000..911ec55733490 --- /dev/null +++ b/rust/src/parquet/file/properties.rs @@ -0,0 +1,648 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Writer properties. +//! +//! # Usage +//! +//! ```rust +//! use arrow::parquet::{ +//! basic::{Compression, Encoding}, +//! file::properties::*, +//! schema::types::ColumnPath, +//! }; +//! +//! // Create properties with default configuration. +//! let props = WriterProperties::builder().build(); +//! +//! // Use properties builder to set certain options and assemble the configuration. +//! let props = WriterProperties::builder() +//! .set_writer_version(WriterVersion::PARQUET_1_0) +//! .set_encoding(Encoding::PLAIN) +//! .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED) +//! .set_compression(Compression::SNAPPY) +//! .build(); +//! +//! assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0); +//! assert_eq!( +//! props.encoding(&ColumnPath::from("col1")), +//! Some(Encoding::DELTA_BINARY_PACKED) +//! ); +//! assert_eq!( +//! props.encoding(&ColumnPath::from("col2")), +//! Some(Encoding::PLAIN) +//! ); +//! 
``` + +use std::{collections::HashMap, rc::Rc}; + +use crate::parquet::basic::{Compression, Encoding}; +use crate::parquet::schema::types::ColumnPath; + +const DEFAULT_PAGE_SIZE: usize = 1024 * 1024; +const DEFAULT_WRITE_BATCH_SIZE: usize = 1024; +const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0; +const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED; +const DEFAULT_DICTIONARY_ENABLED: bool = true; +const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE; +const DEFAULT_STATISTICS_ENABLED: bool = true; +const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096; +const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 128 * 1024 * 1024; +const DEFAULT_CREATED_BY: &str = env!("PARQUET_CREATED_BY"); + +/// Parquet writer version. +/// +/// Basic constant, which is not part of the Thrift definition. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum WriterVersion { + PARQUET_1_0, + PARQUET_2_0, +} + +impl WriterVersion { + /// Returns writer version as `i32`. + pub fn as_num(&self) -> i32 { + match self { + WriterVersion::PARQUET_1_0 => 1, + WriterVersion::PARQUET_2_0 => 2, + } + } +} + +/// Reference counted writer properties. +pub type WriterPropertiesPtr = Rc; + +/// Writer properties. +/// +/// It is created as an immutable data structure, use [`WriterPropertiesBuilder`] to +/// assemble the properties. +#[derive(Debug, Clone)] +pub struct WriterProperties { + data_pagesize_limit: usize, + dictionary_pagesize_limit: usize, + write_batch_size: usize, + max_row_group_size: usize, + writer_version: WriterVersion, + created_by: String, + default_column_properties: ColumnProperties, + column_properties: HashMap, +} + +impl WriterProperties { + /// Returns builder for writer properties with default values. + pub fn builder() -> WriterPropertiesBuilder { + WriterPropertiesBuilder::with_defaults() + } + + /// Returns data page size limit. + pub fn data_pagesize_limit(&self) -> usize { + self.data_pagesize_limit + } + + /// Returns dictionary page size limit. + pub fn dictionary_pagesize_limit(&self) -> usize { + self.dictionary_pagesize_limit + } + + /// Returns configured batch size for writes. + /// + /// When writing a batch of data, this setting allows to split it internally into + /// smaller batches so we can better estimate the size of a page currently being + /// written. + pub fn write_batch_size(&self) -> usize { + self.write_batch_size + } + + /// Returns max size for a row group. + pub fn max_row_group_size(&self) -> usize { + self.max_row_group_size + } + + /// Returns configured writer version. + pub fn writer_version(&self) -> WriterVersion { + self.writer_version + } + + /// Returns `created_by` string. + pub fn created_by(&self) -> &str { + &self.created_by + } + + /// Returns encoding for a data page, when dictionary encoding is enabled. + /// This is not configurable. + #[inline] + pub fn dictionary_data_page_encoding(&self) -> Encoding { + // PLAIN_DICTIONARY encoding is deprecated in writer version 1. + // Dictionary values are encoded using RLE_DICTIONARY encoding. + Encoding::RLE_DICTIONARY + } + + /// Returns encoding for dictionary page, when dictionary encoding is enabled. + /// This is not configurable. + #[inline] + pub fn dictionary_page_encoding(&self) -> Encoding { + // PLAIN_DICTIONARY is deprecated in writer version 1. + // Dictionary is encoded using plain encoding. + Encoding::PLAIN + } + + /// Returns encoding for a column, if set. + /// In case when dictionary is enabled, returns fallback encoding. 
+ /// + /// If encoding is not set, then column writer will choose the best encoding + /// based on the column type. + pub fn encoding(&self, col: &ColumnPath) -> Option { + self.column_properties + .get(col) + .and_then(|c| c.encoding()) + .or_else(|| self.default_column_properties.encoding()) + } + + /// Returns compression codec for a column. + pub fn compression(&self, col: &ColumnPath) -> Compression { + self.column_properties + .get(col) + .and_then(|c| c.compression()) + .or_else(|| self.default_column_properties.compression()) + .unwrap_or(DEFAULT_COMPRESSION) + } + + /// Returns `true` if dictionary encoding is enabled for a column. + pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool { + self.column_properties + .get(col) + .and_then(|c| c.dictionary_enabled()) + .or_else(|| self.default_column_properties.dictionary_enabled()) + .unwrap_or(DEFAULT_DICTIONARY_ENABLED) + } + + /// Returns `true` if statistics are enabled for a column. + pub fn statistics_enabled(&self, col: &ColumnPath) -> bool { + self.column_properties + .get(col) + .and_then(|c| c.statistics_enabled()) + .or_else(|| self.default_column_properties.statistics_enabled()) + .unwrap_or(DEFAULT_STATISTICS_ENABLED) + } + + /// Returns max size for statistics. + /// Only applicable if statistics are enabled. + pub fn max_statistics_size(&self, col: &ColumnPath) -> usize { + self.column_properties + .get(col) + .and_then(|c| c.max_statistics_size()) + .or_else(|| self.default_column_properties.max_statistics_size()) + .unwrap_or(DEFAULT_MAX_STATISTICS_SIZE) + } +} + +/// Writer properties builder. +pub struct WriterPropertiesBuilder { + data_pagesize_limit: usize, + dictionary_pagesize_limit: usize, + write_batch_size: usize, + max_row_group_size: usize, + writer_version: WriterVersion, + created_by: String, + default_column_properties: ColumnProperties, + column_properties: HashMap, +} + +impl WriterPropertiesBuilder { + /// Returns default state of the builder. + fn with_defaults() -> Self { + Self { + data_pagesize_limit: DEFAULT_PAGE_SIZE, + dictionary_pagesize_limit: DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT, + write_batch_size: DEFAULT_WRITE_BATCH_SIZE, + max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE, + writer_version: DEFAULT_WRITER_VERSION, + created_by: DEFAULT_CREATED_BY.to_string(), + default_column_properties: ColumnProperties::new(), + column_properties: HashMap::new(), + } + } + + /// Finalizes the configuration and returns immutable writer properties struct. + pub fn build(self) -> WriterProperties { + WriterProperties { + data_pagesize_limit: self.data_pagesize_limit, + dictionary_pagesize_limit: self.dictionary_pagesize_limit, + write_batch_size: self.write_batch_size, + max_row_group_size: self.max_row_group_size, + writer_version: self.writer_version, + created_by: self.created_by, + default_column_properties: self.default_column_properties, + column_properties: self.column_properties, + } + } + + // ---------------------------------------------------------------------- + // Writer properies related to a file + + /// Sets writer version. + pub fn set_writer_version(mut self, value: WriterVersion) -> Self { + self.writer_version = value; + self + } + + /// Sets data page size limit. + pub fn set_data_pagesize_limit(mut self, value: usize) -> Self { + self.data_pagesize_limit = value; + self + } + + /// Sets dictionary page size limit. + pub fn set_dictionary_pagesize_limit(mut self, value: usize) -> Self { + self.dictionary_pagesize_limit = value; + self + } + + /// Sets write batch size. 
+ pub fn set_write_batch_size(mut self, value: usize) -> Self { + self.write_batch_size = value; + self + } + + /// Sets max size for a row group. + pub fn set_max_row_group_size(mut self, value: usize) -> Self { + self.max_row_group_size = value; + self + } + + /// Sets "created by" property. + pub fn set_created_by(mut self, value: String) -> Self { + self.created_by = value; + self + } + + // ---------------------------------------------------------------------- + // Setters for any column (global) + + /// Sets encoding for any column. + /// + /// If dictionary is not enabled, this is treated as a primary encoding for all columns. + /// In case when dictionary is enabled for any column, this value is considered to + /// be a fallback encoding for that column. + /// + /// Panics if user tries to set dictionary encoding here, regardless of dictinoary + /// encoding flag being set. + pub fn set_encoding(mut self, value: Encoding) -> Self { + self.default_column_properties.set_encoding(value); + self + } + + /// Sets compression codec for any column. + pub fn set_compression(mut self, value: Compression) -> Self { + self.default_column_properties.set_compression(value); + self + } + + /// Sets flag to enable/disable dictionary encoding for any column. + /// + /// Use this method to set dictionary encoding, instead of explicitly specifying + /// encoding in `set_encoding` method. + pub fn set_dictionary_enabled(mut self, value: bool) -> Self { + self.default_column_properties.set_dictionary_enabled(value); + self + } + + /// Sets flag to enable/disable statistics for any column. + pub fn set_statistics_enabled(mut self, value: bool) -> Self { + self.default_column_properties.set_statistics_enabled(value); + self + } + + /// Sets max statistics size for any column. + /// Applicable only if statistics are enabled. + pub fn set_max_statistics_size(mut self, value: usize) -> Self { + self.default_column_properties + .set_max_statistics_size(value); + self + } + + // ---------------------------------------------------------------------- + // Setters for a specific column + + /// Helper method to get existing or new mutable reference of column properties. + #[inline] + fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties { + self.column_properties + .entry(col) + .or_insert(ColumnProperties::new()) + } + + /// Sets encoding for a column. + /// Takes precedence over globally defined settings. + /// + /// If dictionary is not enabled, this is treated as a primary encoding for this column. + /// In case when dictionary is enabled for this column, either through global defaults + /// or explicitly, this value is considered to be a fallback encoding for this column. + /// + /// Panics if user tries to set dictionary encoding here, regardless of dictinoary + /// encoding flag being set. + pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self { + self.get_mut_props(col).set_encoding(value); + self + } + + /// Sets compression codec for a column. + /// Takes precedence over globally defined settings. + pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self { + self.get_mut_props(col).set_compression(value); + self + } + + /// Sets flag to enable/disable dictionary encoding for a column. + /// Takes precedence over globally defined settings. 
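
Since per-column settings take precedence over the global column settings, which in turn fall back to the built-in defaults, and since dictionary encoding is controlled by the enable flag rather than by `set_encoding`, here is a short sketch of how that resolution plays out (the column names are arbitrary):

    use crate::parquet::basic::{Compression, Encoding};
    use crate::parquet::file::properties::WriterProperties;
    use crate::parquet::schema::types::ColumnPath;

    let props = WriterProperties::builder()
        .set_dictionary_enabled(true) // global flag; the encoding below is the fallback
        .set_encoding(Encoding::PLAIN)
        .set_compression(Compression::GZIP)
        .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
        .set_column_dictionary_enabled(ColumnPath::from("col"), false)
        .build();

    // Per-column values win, then the global defaults, then the DEFAULT_* constants.
    assert_eq!(props.compression(&ColumnPath::from("col")), Compression::SNAPPY);
    assert_eq!(props.compression(&ColumnPath::from("other")), Compression::GZIP);
    assert!(!props.dictionary_enabled(&ColumnPath::from("col")));
    assert!(props.dictionary_enabled(&ColumnPath::from("other")));
    assert_eq!(props.encoding(&ColumnPath::from("col")), Some(Encoding::PLAIN));
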
+ pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self { + self.get_mut_props(col).set_dictionary_enabled(value); + self + } + + /// Sets flag to enable/disable statistics for a column. + /// Takes precedence over globally defined settings. + pub fn set_column_statistics_enabled(mut self, col: ColumnPath, value: bool) -> Self { + self.get_mut_props(col).set_statistics_enabled(value); + self + } + + /// Sets max size for statistics for a column. + /// Takes precedence over globally defined settings. + pub fn set_column_max_statistics_size(mut self, col: ColumnPath, value: usize) -> Self { + self.get_mut_props(col).set_max_statistics_size(value); + self + } +} + +/// Container for column properties that can be changed as part of writer. +/// +/// If a field is `None`, it means that no specific value has been set for this column, +/// so some subsequent or default value must be used. +#[derive(Debug, Clone, PartialEq)] +struct ColumnProperties { + encoding: Option, + codec: Option, + dictionary_enabled: Option, + statistics_enabled: Option, + max_statistics_size: Option, +} + +impl ColumnProperties { + /// Initialise column properties with default values. + fn new() -> Self { + Self { + encoding: None, + codec: None, + dictionary_enabled: None, + statistics_enabled: None, + max_statistics_size: None, + } + } + + /// Sets encoding for this column. + /// + /// If dictionary is not enabled, this is treated as a primary encoding for a column. + /// In case when dictionary is enabled for a column, this value is considered to + /// be a fallback encoding. + /// + /// Panics if user tries to set dictionary encoding here, regardless of dictinoary + /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary + /// for a column. + fn set_encoding(&mut self, value: Encoding) { + if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY { + panic!("Dictionary encoding can not be used as fallback encoding"); + } + self.encoding = Some(value); + } + + /// Sets compression codec for this column. + fn set_compression(&mut self, value: Compression) { + self.codec = Some(value); + } + + /// Sets whether or not dictionary encoding is enabled for this column. + fn set_dictionary_enabled(&mut self, enabled: bool) { + self.dictionary_enabled = Some(enabled); + } + + /// Sets whether or not statistics are enabled for this column. + fn set_statistics_enabled(&mut self, enabled: bool) { + self.statistics_enabled = Some(enabled); + } + + /// Sets max size for statistics for this column. + fn set_max_statistics_size(&mut self, value: usize) { + self.max_statistics_size = Some(value); + } + + /// Returns optional encoding for this column. + fn encoding(&self) -> Option { + self.encoding + } + + /// Returns optional compression codec for this column. + fn compression(&self) -> Option { + self.codec + } + + /// Returns `Some(true)` if dictionary encoding is enabled for this column, if disabled + /// then returns `Some(false)`. If result is `None`, then no setting has been provided. + fn dictionary_enabled(&self) -> Option { + self.dictionary_enabled + } + + /// Returns `Some(true)` if statistics are enabled for this column, if disabled then + /// returns `Some(false)`. If result is `None`, then no setting has been provided. + fn statistics_enabled(&self) -> Option { + self.statistics_enabled + } + + /// Returns optional max size in bytes for statistics. 
+ fn max_statistics_size(&self) -> Option { + self.max_statistics_size + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_writer_version() { + assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1); + assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2); + } + + #[test] + fn test_writer_properties_default_settings() { + let props = WriterProperties::builder().build(); + assert_eq!(props.data_pagesize_limit(), DEFAULT_PAGE_SIZE); + assert_eq!( + props.dictionary_pagesize_limit(), + DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT + ); + assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE); + assert_eq!(props.max_row_group_size(), DEFAULT_MAX_ROW_GROUP_SIZE); + assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION); + assert_eq!(props.created_by(), DEFAULT_CREATED_BY); + assert_eq!(props.encoding(&ColumnPath::from("col")), None); + assert_eq!( + props.compression(&ColumnPath::from("col")), + DEFAULT_COMPRESSION + ); + assert_eq!( + props.dictionary_enabled(&ColumnPath::from("col")), + DEFAULT_DICTIONARY_ENABLED + ); + assert_eq!( + props.statistics_enabled(&ColumnPath::from("col")), + DEFAULT_STATISTICS_ENABLED + ); + assert_eq!( + props.max_statistics_size(&ColumnPath::from("col")), + DEFAULT_MAX_STATISTICS_SIZE + ); + } + + #[test] + fn test_writer_properties_dictionary_encoding() { + // dictionary encoding is not configurable, and it should be the same for both + // writer version 1 and 2. + for version in vec![WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] { + let props = WriterProperties::builder() + .set_writer_version(version) + .build(); + assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN); + assert_eq!( + props.dictionary_data_page_encoding(), + Encoding::RLE_DICTIONARY + ); + } + } + + #[test] + #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")] + fn test_writer_properties_panic_when_plain_dictionary_is_fallback() { + // Should panic when user specifies dictionary encoding as fallback encoding. + WriterProperties::builder() + .set_encoding(Encoding::PLAIN_DICTIONARY) + .build(); + } + + #[test] + #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")] + fn test_writer_properties_panic_when_rle_dictionary_is_fallback() { + // Should panic when user specifies dictionary encoding as fallback encoding. 
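+        // (To enable dictionary encoding, call `set_dictionary_enabled` instead;
+        // `set_encoding` only configures the primary/fallback encoding.)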
+ WriterProperties::builder() + .set_encoding(Encoding::RLE_DICTIONARY) + .build(); + } + + #[test] + #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")] + fn test_writer_properties_panic_when_dictionary_is_enabled() { + WriterProperties::builder() + .set_dictionary_enabled(true) + .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY) + .build(); + } + + #[test] + #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")] + fn test_writer_properties_panic_when_dictionary_is_disabled() { + WriterProperties::builder() + .set_dictionary_enabled(false) + .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY) + .build(); + } + + #[test] + fn test_writer_properties_builder() { + let props = WriterProperties::builder() + // file settings + .set_writer_version(WriterVersion::PARQUET_2_0) + .set_data_pagesize_limit(10) + .set_dictionary_pagesize_limit(20) + .set_write_batch_size(30) + .set_max_row_group_size(40) + .set_created_by("default".to_owned()) + // global column settings + .set_encoding(Encoding::DELTA_BINARY_PACKED) + .set_compression(Compression::GZIP) + .set_dictionary_enabled(false) + .set_statistics_enabled(false) + .set_max_statistics_size(50) + // specific column settings + .set_column_encoding(ColumnPath::from("col"), Encoding::RLE) + .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY) + .set_column_dictionary_enabled(ColumnPath::from("col"), true) + .set_column_statistics_enabled(ColumnPath::from("col"), true) + .set_column_max_statistics_size(ColumnPath::from("col"), 123) + .build(); + + assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0); + assert_eq!(props.data_pagesize_limit(), 10); + assert_eq!(props.dictionary_pagesize_limit(), 20); + assert_eq!(props.write_batch_size(), 30); + assert_eq!(props.max_row_group_size(), 40); + assert_eq!(props.created_by(), "default"); + + assert_eq!( + props.encoding(&ColumnPath::from("a")), + Some(Encoding::DELTA_BINARY_PACKED) + ); + assert_eq!(props.compression(&ColumnPath::from("a")), Compression::GZIP); + assert_eq!(props.dictionary_enabled(&ColumnPath::from("a")), false); + assert_eq!(props.statistics_enabled(&ColumnPath::from("a")), false); + assert_eq!(props.max_statistics_size(&ColumnPath::from("a")), 50); + + assert_eq!( + props.encoding(&ColumnPath::from("col")), + Some(Encoding::RLE) + ); + assert_eq!( + props.compression(&ColumnPath::from("col")), + Compression::SNAPPY + ); + assert_eq!(props.dictionary_enabled(&ColumnPath::from("col")), true); + assert_eq!(props.statistics_enabled(&ColumnPath::from("col")), true); + assert_eq!(props.max_statistics_size(&ColumnPath::from("col")), 123); + } + + #[test] + fn test_writer_properties_builder_partial_defaults() { + let props = WriterProperties::builder() + .set_encoding(Encoding::DELTA_BINARY_PACKED) + .set_compression(Compression::GZIP) + .set_column_encoding(ColumnPath::from("col"), Encoding::RLE) + .build(); + + assert_eq!( + props.encoding(&ColumnPath::from("col")), + Some(Encoding::RLE) + ); + assert_eq!( + props.compression(&ColumnPath::from("col")), + Compression::GZIP + ); + assert_eq!( + props.dictionary_enabled(&ColumnPath::from("col")), + DEFAULT_DICTIONARY_ENABLED + ); + } +} diff --git a/rust/src/parquet/file/reader.rs b/rust/src/parquet/file/reader.rs new file mode 100644 index 0000000000000..c2e5dd176dac5 --- /dev/null +++ b/rust/src/parquet/file/reader.rs @@ -0,0 +1,899 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more 
contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains file reader API and provides methods to access file metadata, row group +//! readers to read individual column chunks, or access record iterator. + +use std::{ + convert::TryFrom, + fs::File, + io::{BufReader, Cursor, Read, Seek, SeekFrom}, + path::Path, + rc::Rc, +}; + +use byteorder::{ByteOrder, LittleEndian}; +use parquet_format::{ + ColumnOrder as TColumnOrder, FileMetaData as TFileMetaData, PageHeader, PageType, +}; +use thrift::protocol::TCompactInputProtocol; + +use crate::parquet::basic::{ColumnOrder, Compression, Encoding, Type}; +use crate::parquet::column::{ + page::{Page, PageReader}, + reader::{ColumnReader, ColumnReaderImpl}, +}; +use crate::parquet::compression::{create_codec, Codec}; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::file::{metadata::*, statistics, FOOTER_SIZE, PARQUET_MAGIC}; +use crate::parquet::record::reader::RowIter; +use crate::parquet::schema::types::{self, SchemaDescriptor, Type as SchemaType}; +use crate::parquet::util::{io::FileSource, memory::ByteBufferPtr}; + +// ---------------------------------------------------------------------- +// APIs for file & row group readers + +/// Parquet file reader API. With this, user can get metadata information about the +/// Parquet file, can get reader for each row group, and access record iterator. +pub trait FileReader { + /// Get metadata information about this file. + fn metadata(&self) -> ParquetMetaDataPtr; + + /// Get the total number of row groups for this file. + fn num_row_groups(&self) -> usize; + + /// Get the `i`th row group reader. Note this doesn't do bound check. + fn get_row_group(&self, i: usize) -> Result>; + + /// Get full iterator of `Row`s from a file (over all row groups). + /// + /// Iterator will automatically load the next row group to advance. + /// + /// Projected schema can be a subset of or equal to the file schema, when it is None, + /// full file schema is assumed. + fn get_row_iter(&self, projection: Option) -> Result; +} + +/// Parquet row group reader API. With this, user can get metadata information about the +/// row group, as well as readers for each individual column chunk. +pub trait RowGroupReader { + /// Get metadata information about this row group. + fn metadata(&self) -> RowGroupMetaDataPtr; + + /// Get the total number of column chunks in this row group. + fn num_columns(&self) -> usize; + + /// Get page reader for the `i`th column chunk. + fn get_column_page_reader(&self, i: usize) -> Result>; + + /// Get value reader for the `i`th column chunk. + fn get_column_reader(&self, i: usize) -> Result; + + /// Get iterator of `Row`s from this row group. + /// + /// Projected schema can be a subset of or equal to the file schema, when it is None, + /// full file schema is assumed. 
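+    ///
+    /// A minimal sketch (the file name is only an example; error handling elided):
+    ///
+    /// ```ignore
+    /// let file = std::fs::File::open("example.parquet")?;
+    /// let reader = SerializedFileReader::new(file)?;
+    /// let row_group = reader.get_row_group(0)?;
+    /// let mut rows = row_group.get_row_iter(None)?;
+    /// while let Some(row) = rows.next() {
+    ///     // process row
+    /// }
+    /// ```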
+ fn get_row_iter(&self, projection: Option) -> Result; +} + +// ---------------------------------------------------------------------- +// Serialized impl for file & row group readers + +/// Length should return the amount of bytes that implementor contains. +/// It's mainly used to read the metadata, which is at the end of the source. +pub trait Length { + /// Returns the amount of bytes of the inner source. + fn len(&self) -> u64; +} + +/// TryClone tries to clone the type and should maintain the `Seek` position of the given +/// instance. +pub trait TryClone: Sized { + /// Clones the type returning a new instance or an error if it's not possible + /// to clone it. + fn try_clone(&self) -> Result; +} + +impl Length for File { + fn len(&self) -> u64 { + self.metadata().map(|m| m.len()).unwrap_or(0u64) + } +} + +impl TryClone for File { + fn try_clone(&self) -> Result { + self.try_clone().map_err(|e| e.into()) + } +} + +impl<'a> Length for Cursor<&'a [u8]> { + fn len(&self) -> u64 { + self.get_ref().len() as u64 + } +} + +impl<'a> TryClone for Cursor<&'a [u8]> { + fn try_clone(&self) -> Result { + Ok(self.clone()) + } +} + +/// ParquetReader is the interface which needs to be fulfilled to be able to parse a +/// parquet source. +pub trait ParquetReader: Read + Seek + Length + TryClone {} +impl ParquetReader for T {} + +/// A serialized implementation for Parquet [`FileReader`]. +pub struct SerializedFileReader { + buf: BufReader, + metadata: ParquetMetaDataPtr, +} + +impl SerializedFileReader { + /// Creates file reader from a Parquet file. + /// Returns error if Parquet file does not exist or is corrupt. + pub fn new(reader: R) -> Result { + let mut buf = BufReader::new(reader); + let metadata = Self::parse_metadata(&mut buf)?; + Ok(Self { + buf, + metadata: Rc::new(metadata), + }) + } + + // Layout of Parquet file + // +---------------------------+---+-----+ + // | Rest of file | B | A | + // +---------------------------+---+-----+ + // where A: parquet footer, B: parquet metadata. + // + fn parse_metadata(buf: &mut BufReader) -> Result { + let file_size = buf.get_ref().len(); + if file_size < (FOOTER_SIZE as u64) { + return Err(general_err!( + "Invalid Parquet file. Size is smaller than footer" + )); + } + let mut footer_buffer: [u8; FOOTER_SIZE] = [0; FOOTER_SIZE]; + buf.seek(SeekFrom::End(-(FOOTER_SIZE as i64)))?; + buf.read_exact(&mut footer_buffer)?; + if footer_buffer[4..] != PARQUET_MAGIC { + return Err(general_err!("Invalid Parquet file. Corrupt footer")); + } + let metadata_len = LittleEndian::read_i32(&footer_buffer[0..4]) as i64; + if metadata_len < 0 { + return Err(general_err!( + "Invalid Parquet file. Metadata length is less than zero ({})", + metadata_len + )); + } + let metadata_start: i64 = file_size as i64 - FOOTER_SIZE as i64 - metadata_len; + if metadata_start < 0 { + return Err(general_err!( + "Invalid Parquet file. 
Metadata start is less than zero ({})", + metadata_start + )); + } + buf.seek(SeekFrom::Start(metadata_start as u64))?; + let metadata_buf = buf.take(metadata_len as u64).into_inner(); + + // TODO: row group filtering + let mut prot = TCompactInputProtocol::new(metadata_buf); + let mut t_file_metadata: TFileMetaData = TFileMetaData::read_from_in_protocol(&mut prot) + .map_err(|e| ParquetError::General(format!("Could not parse metadata: {}", e)))?; + let schema = types::from_thrift(&mut t_file_metadata.schema)?; + let schema_descr = Rc::new(SchemaDescriptor::new(schema.clone())); + let mut row_groups = Vec::new(); + for rg in t_file_metadata.row_groups { + row_groups.push(Rc::new(RowGroupMetaData::from_thrift( + schema_descr.clone(), + rg, + )?)); + } + let column_orders = Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr); + + let file_metadata = FileMetaData::new( + t_file_metadata.version, + t_file_metadata.num_rows, + t_file_metadata.created_by, + schema, + schema_descr, + column_orders, + ); + Ok(ParquetMetaData::new(file_metadata, row_groups)) + } + + /// Parses column orders from Thrift definition. + /// If no column orders are defined, returns `None`. + fn parse_column_orders( + t_column_orders: Option>, + schema_descr: &SchemaDescriptor, + ) -> Option> { + match t_column_orders { + Some(orders) => { + // Should always be the case + assert_eq!( + orders.len(), + schema_descr.num_columns(), + "Column order length mismatch" + ); + let mut res = Vec::new(); + for (i, column) in schema_descr.columns().iter().enumerate() { + match orders[i] { + TColumnOrder::TYPEORDER(_) => { + let sort_order = ColumnOrder::get_sort_order( + column.logical_type(), + column.physical_type(), + ); + res.push(ColumnOrder::TYPE_DEFINED_ORDER(sort_order)); + } + } + } + Some(res) + } + None => None, + } + } +} + +impl FileReader for SerializedFileReader { + fn metadata(&self) -> ParquetMetaDataPtr { + self.metadata.clone() + } + + fn num_row_groups(&self) -> usize { + self.metadata.num_row_groups() + } + + fn get_row_group(&self, i: usize) -> Result> { + let row_group_metadata = self.metadata.row_group(i); + // Row groups should be processed sequentially. + let f = self.buf.get_ref().try_clone()?; + Ok(Box::new(SerializedRowGroupReader::new( + f, + row_group_metadata, + ))) + } + + fn get_row_iter(&self, projection: Option) -> Result { + RowIter::from_file(projection, self) + } +} + +impl TryFrom for SerializedFileReader { + type Error = ParquetError; + + fn try_from(file: File) -> Result { + Self::new(file) + } +} + +impl<'a> TryFrom<&'a Path> for SerializedFileReader { + type Error = ParquetError; + + fn try_from(path: &Path) -> Result { + let file = File::open(path)?; + Self::try_from(file) + } +} + +impl TryFrom for SerializedFileReader { + type Error = ParquetError; + + fn try_from(path: String) -> Result { + Self::try_from(Path::new(&path)) + } +} + +impl<'a> TryFrom<&'a str> for SerializedFileReader { + type Error = ParquetError; + + fn try_from(path: &str) -> Result { + Self::try_from(Path::new(&path)) + } +} + +/// A serialized implementation for Parquet [`RowGroupReader`]. +pub struct SerializedRowGroupReader { + buf: BufReader, + metadata: RowGroupMetaDataPtr, +} + +impl SerializedRowGroupReader { + /// Creates new row group reader from a file and row group metadata. 
+ fn new(file: R, metadata: RowGroupMetaDataPtr) -> Self { + let buf = BufReader::new(file); + Self { buf, metadata } + } +} + +impl RowGroupReader for SerializedRowGroupReader { + fn metadata(&self) -> RowGroupMetaDataPtr { + self.metadata.clone() + } + + fn num_columns(&self) -> usize { + self.metadata.num_columns() + } + + // TODO: fix PARQUET-816 + fn get_column_page_reader(&self, i: usize) -> Result> { + let col = self.metadata.column(i); + let mut col_start = col.data_page_offset(); + if col.has_dictionary_page() { + col_start = col.dictionary_page_offset().unwrap(); + } + let col_length = col.compressed_size(); + let file_chunk = FileSource::new(self.buf.get_ref(), col_start as u64, col_length as usize); + let page_reader = SerializedPageReader::new( + file_chunk, + col.num_values(), + col.compression(), + col.column_descr().physical_type(), + )?; + Ok(Box::new(page_reader)) + } + + fn get_column_reader(&self, i: usize) -> Result { + let schema_descr = self.metadata.schema_descr(); + let col_descr = schema_descr.column(i); + let col_page_reader = self.get_column_page_reader(i)?; + let col_reader = match col_descr.physical_type() { + Type::BOOLEAN => { + ColumnReader::BoolColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::INT32 => { + ColumnReader::Int32ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::INT64 => { + ColumnReader::Int64ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::INT96 => { + ColumnReader::Int96ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::FLOAT => { + ColumnReader::FloatColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::DOUBLE => { + ColumnReader::DoubleColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::BYTE_ARRAY => ColumnReader::ByteArrayColumnReader(ColumnReaderImpl::new( + col_descr, + col_page_reader, + )), + Type::FIXED_LEN_BYTE_ARRAY => ColumnReader::FixedLenByteArrayColumnReader( + ColumnReaderImpl::new(col_descr, col_page_reader), + ), + }; + Ok(col_reader) + } + + fn get_row_iter(&self, projection: Option) -> Result { + RowIter::from_row_group(projection, self) + } +} + +/// A serialized implementation for Parquet [`PageReader`]. +pub struct SerializedPageReader { + // The file source buffer which references exactly the bytes for the column trunk + // to be read by this page reader. + buf: T, + + // The compression codec for this column chunk. Only set for non-PLAIN codec. + decompressor: Option>, + + // The number of values we have seen so far. + seen_num_values: i64, + + // The number of total values in this column chunk. + total_num_values: i64, + + // Column chunk type. + physical_type: Type, +} + +impl SerializedPageReader { + /// Creates a new serialized page reader from file source. + pub fn new( + buf: T, + total_num_values: i64, + compression: Compression, + physical_type: Type, + ) -> Result { + let decompressor = create_codec(compression)?; + let result = Self { + buf, + total_num_values, + seen_num_values: 0, + decompressor, + physical_type, + }; + Ok(result) + } + + /// Reads Page header from Thrift. 
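+    ///
+    /// The header is decoded with the Thrift compact protocol directly from the
+    /// column chunk stream, advancing the stream position past the header bytes.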
+ fn read_page_header(&mut self) -> Result { + let mut prot = TCompactInputProtocol::new(&mut self.buf); + let page_header = PageHeader::read_from_in_protocol(&mut prot)?; + Ok(page_header) + } +} + +impl PageReader for SerializedPageReader { + fn get_next_page(&mut self) -> Result> { + while self.seen_num_values < self.total_num_values { + let page_header = self.read_page_header()?; + + // When processing data page v2, depending on enabled compression for the page, we + // should account for uncompressed data ('offset') of repetition and definition + // levels. + // + // We always use 0 offset for other pages other than v2, `true` flag means that + // compression will be applied if decompressor is defined + let mut offset: usize = 0; + let mut can_decompress = true; + + if let Some(ref header_v2) = page_header.data_page_header_v2 { + offset = (header_v2.definition_levels_byte_length + + header_v2.repetition_levels_byte_length) as usize; + // When is_compressed flag is missing the page is considered compressed + can_decompress = header_v2.is_compressed.unwrap_or(true); + } + + let compressed_len = page_header.compressed_page_size as usize - offset; + let uncompressed_len = page_header.uncompressed_page_size as usize - offset; + // We still need to read all bytes from buffered stream + let mut buffer = vec![0; offset + compressed_len]; + self.buf.read_exact(&mut buffer)?; + + // TODO: page header could be huge because of statistics. We should set a maximum + // page header size and abort if that is exceeded. + if let Some(decompressor) = self.decompressor.as_mut() { + if can_decompress { + let mut decompressed_buffer = Vec::with_capacity(uncompressed_len); + let decompressed_size = + decompressor.decompress(&buffer[offset..], &mut decompressed_buffer)?; + if decompressed_size != uncompressed_len { + return Err(general_err!( + "Actual decompressed size doesn't match the expected one ({} vs {})", + decompressed_size, + uncompressed_len + )); + } + if offset == 0 { + buffer = decompressed_buffer; + } else { + // Prepend saved offsets to the buffer + buffer.truncate(offset); + buffer.append(&mut decompressed_buffer); + } + } + } + + let result = match page_header.type_ { + PageType::DICTIONARY_PAGE => { + assert!(page_header.dictionary_page_header.is_some()); + let dict_header = page_header.dictionary_page_header.as_ref().unwrap(); + let is_sorted = dict_header.is_sorted.unwrap_or(false); + Page::DictionaryPage { + buf: ByteBufferPtr::new(buffer), + num_values: dict_header.num_values as u32, + encoding: Encoding::from(dict_header.encoding), + is_sorted, + } + } + PageType::DATA_PAGE => { + assert!(page_header.data_page_header.is_some()); + let header = page_header.data_page_header.unwrap(); + self.seen_num_values += header.num_values as i64; + Page::DataPage { + buf: ByteBufferPtr::new(buffer), + num_values: header.num_values as u32, + encoding: Encoding::from(header.encoding), + def_level_encoding: Encoding::from(header.definition_level_encoding), + rep_level_encoding: Encoding::from(header.repetition_level_encoding), + statistics: statistics::from_thrift(self.physical_type, header.statistics), + } + } + PageType::DATA_PAGE_V2 => { + assert!(page_header.data_page_header_v2.is_some()); + let header = page_header.data_page_header_v2.unwrap(); + let is_compressed = header.is_compressed.unwrap_or(true); + self.seen_num_values += header.num_values as i64; + Page::DataPageV2 { + buf: ByteBufferPtr::new(buffer), + num_values: header.num_values as u32, + encoding: Encoding::from(header.encoding), + 
num_nulls: header.num_nulls as u32, + num_rows: header.num_rows as u32, + def_levels_byte_len: header.definition_levels_byte_length as u32, + rep_levels_byte_len: header.repetition_levels_byte_length as u32, + is_compressed, + statistics: statistics::from_thrift(self.physical_type, header.statistics), + } + } + _ => { + // For unknown page type (e.g., INDEX_PAGE), skip and read next. + continue; + } + }; + return Ok(Some(result)); + } + + // We are at the end of this column chunk and no more page left. Return None. + Ok(None) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use parquet_format::TypeDefinedOrder; + + use crate::parquet::basic::SortOrder; + use crate::parquet::util::test_common::{get_temp_file, get_test_file, get_test_path}; + + #[test] + fn test_file_reader_metadata_size_smaller_than_footer() { + let test_file = get_temp_file("corrupt-1.parquet", &[]); + let reader_result = SerializedFileReader::new(test_file); + assert!(reader_result.is_err()); + assert_eq!( + reader_result.err().unwrap(), + general_err!("Invalid Parquet file. Size is smaller than footer") + ); + } + + // #[test] + // fn test_cursor_and_file_has_the_same_behaviour() { + // let path = get_test_path("alltypes_plain.parquet"); + // let buffer = include_bytes!(path); + // let cursor = Cursor::new(buffer.as_ref()); + + // let read_from_file = + // SerializedFileReader::new(File::open("testdata/alltypes_plain.parquet").unwrap()) + // .unwrap(); + // let read_from_cursor = SerializedFileReader::new(cursor).unwrap(); + + // let file_iter = read_from_file.get_row_iter(None).unwrap(); + // let cursor_iter = read_from_cursor.get_row_iter(None).unwrap(); + + // assert!(file_iter.eq(cursor_iter)); + // } + + #[test] + fn test_file_reader_metadata_corrupt_footer() { + let test_file = get_temp_file("corrupt-2.parquet", &[1, 2, 3, 4, 5, 6, 7, 8]); + let reader_result = SerializedFileReader::new(test_file); + assert!(reader_result.is_err()); + assert_eq!( + reader_result.err().unwrap(), + general_err!("Invalid Parquet file. Corrupt footer") + ); + } + + #[test] + fn test_file_reader_metadata_invalid_length() { + let test_file = get_temp_file("corrupt-3.parquet", &[0, 0, 0, 255, b'P', b'A', b'R', b'1']); + let reader_result = SerializedFileReader::new(test_file); + assert!(reader_result.is_err()); + assert_eq!( + reader_result.err().unwrap(), + general_err!("Invalid Parquet file. Metadata length is less than zero (-16777216)") + ); + } + + #[test] + fn test_file_reader_metadata_invalid_start() { + let test_file = get_temp_file("corrupt-4.parquet", &[255, 0, 0, 0, b'P', b'A', b'R', b'1']); + let reader_result = SerializedFileReader::new(test_file); + assert!(reader_result.is_err()); + assert_eq!( + reader_result.err().unwrap(), + general_err!("Invalid Parquet file. Metadata start is less than zero (-255)") + ); + } + + #[test] + fn test_file_reader_column_orders_parse() { + // Define simple schema, we do not need to provide logical types. 
+ let mut fields = vec![ + Rc::new( + SchemaType::primitive_type_builder("col1", Type::INT32) + .build() + .unwrap(), + ), + Rc::new( + SchemaType::primitive_type_builder("col2", Type::FLOAT) + .build() + .unwrap(), + ), + ]; + let schema = SchemaType::group_type_builder("schema") + .with_fields(&mut fields) + .build() + .unwrap(); + let schema_descr = SchemaDescriptor::new(Rc::new(schema)); + + let t_column_orders = Some(vec![ + TColumnOrder::TYPEORDER(TypeDefinedOrder::new()), + TColumnOrder::TYPEORDER(TypeDefinedOrder::new()), + ]); + + assert_eq!( + SerializedFileReader::::parse_column_orders(t_column_orders, &schema_descr), + Some(vec![ + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED), + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED) + ]) + ); + + // Test when no column orders are defined. + assert_eq!( + SerializedFileReader::::parse_column_orders(None, &schema_descr), + None + ); + } + + #[test] + #[should_panic(expected = "Column order length mismatch")] + fn test_file_reader_column_orders_len_mismatch() { + let schema = SchemaType::group_type_builder("schema").build().unwrap(); + let schema_descr = SchemaDescriptor::new(Rc::new(schema)); + + let t_column_orders = Some(vec![TColumnOrder::TYPEORDER(TypeDefinedOrder::new())]); + + SerializedFileReader::::parse_column_orders(t_column_orders, &schema_descr); + } + + #[test] + fn test_file_reader_try_from() { + // Valid file path + let test_file = get_test_file("alltypes_plain.parquet"); + let test_path_buf = get_test_path("alltypes_plain.parquet"); + let test_path = test_path_buf.as_path(); + let test_path_str = test_path.to_str().unwrap(); + + let reader = SerializedFileReader::try_from(test_file); + assert!(reader.is_ok()); + + let reader = SerializedFileReader::try_from(test_path); + assert!(reader.is_ok()); + + let reader = SerializedFileReader::try_from(test_path_str); + assert!(reader.is_ok()); + + let reader = SerializedFileReader::try_from(test_path_str.to_string()); + assert!(reader.is_ok()); + + // Invalid file path + let test_path = Path::new("invalid.parquet"); + let test_path_str = test_path.to_str().unwrap(); + + let reader = SerializedFileReader::try_from(test_path); + assert!(reader.is_err()); + + let reader = SerializedFileReader::try_from(test_path_str); + assert!(reader.is_err()); + + let reader = SerializedFileReader::try_from(test_path_str.to_string()); + assert!(reader.is_err()); + } + + #[test] + fn test_reuse_file_chunk() { + // This test covers the case of maintaining the correct start position in a file + // stream for each column reader after initializing and moving to the next one + // (without necessarily reading the entire column). 
+ let test_file = get_test_file("alltypes_plain.parquet"); + let reader = SerializedFileReader::new(test_file).unwrap(); + let row_group = reader.get_row_group(0).unwrap(); + + let mut page_readers = Vec::new(); + for i in 0..row_group.num_columns() { + page_readers.push(row_group.get_column_page_reader(i).unwrap()); + } + + // Now buffer each col reader, we do not expect any failures like: + // General("underlying Thrift error: end of file") + for mut page_reader in page_readers { + assert!(page_reader.get_next_page().is_ok()); + } + } + + #[test] + fn test_file_reader() { + let test_file = get_test_file("alltypes_plain.parquet"); + let reader_result = SerializedFileReader::new(test_file); + assert!(reader_result.is_ok()); + let reader = reader_result.unwrap(); + + // Test contents in Parquet metadata + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 1); + + // Test contents in file metadata + let file_metadata = metadata.file_metadata(); + assert!(file_metadata.created_by().is_some()); + assert_eq!( + file_metadata.created_by().as_ref().unwrap(), + "impala version 1.3.0-INTERNAL (build 8a48ddb1eff84592b3fc06bc6f51ec120e1fffc9)" + ); + assert_eq!(file_metadata.num_rows(), 8); + assert_eq!(file_metadata.version(), 1); + assert_eq!(file_metadata.column_orders(), None); + + // Test contents in row group metadata + let row_group_metadata = metadata.row_group(0); + assert_eq!(row_group_metadata.num_columns(), 11); + assert_eq!(row_group_metadata.num_rows(), 8); + assert_eq!(row_group_metadata.total_byte_size(), 671); + // Check each column order + for i in 0..row_group_metadata.num_columns() { + assert_eq!(file_metadata.column_order(i), ColumnOrder::UNDEFINED); + } + + // Test row group reader + let row_group_reader_result = reader.get_row_group(0); + assert!(row_group_reader_result.is_ok()); + let row_group_reader: Box = row_group_reader_result.unwrap(); + assert_eq!( + row_group_reader.num_columns(), + row_group_metadata.num_columns() + ); + assert_eq!( + row_group_reader.metadata().total_byte_size(), + row_group_metadata.total_byte_size() + ); + + // Test page readers + // TODO: test for every column + let page_reader_0_result = row_group_reader.get_column_page_reader(0); + assert!(page_reader_0_result.is_ok()); + let mut page_reader_0: Box = page_reader_0_result.unwrap(); + let mut page_count = 0; + while let Ok(Some(page)) = page_reader_0.get_next_page() { + let is_expected_page = match page { + Page::DictionaryPage { + buf, + num_values, + encoding, + is_sorted, + } => { + assert_eq!(buf.len(), 32); + assert_eq!(num_values, 8); + assert_eq!(encoding, Encoding::PLAIN_DICTIONARY); + assert_eq!(is_sorted, false); + true + } + Page::DataPage { + buf, + num_values, + encoding, + def_level_encoding, + rep_level_encoding, + statistics, + } => { + assert_eq!(buf.len(), 11); + assert_eq!(num_values, 8); + assert_eq!(encoding, Encoding::PLAIN_DICTIONARY); + assert_eq!(def_level_encoding, Encoding::RLE); + assert_eq!(rep_level_encoding, Encoding::BIT_PACKED); + assert!(statistics.is_none()); + true + } + _ => false, + }; + assert!(is_expected_page); + page_count += 1; + } + assert_eq!(page_count, 2); + } + + #[test] + fn test_file_reader_datapage_v2() { + let test_file = get_test_file("datapage_v2.snappy.parquet"); + let reader_result = SerializedFileReader::new(test_file); + assert!(reader_result.is_ok()); + let reader = reader_result.unwrap(); + + // Test contents in Parquet metadata + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 1); + + // 
Test contents in file metadata + let file_metadata = metadata.file_metadata(); + assert!(file_metadata.created_by().is_some()); + assert_eq!( + file_metadata.created_by().as_ref().unwrap(), + "parquet-mr version 1.8.1 (build 4aba4dae7bb0d4edbcf7923ae1339f28fd3f7fcf)" + ); + assert_eq!(file_metadata.num_rows(), 5); + assert_eq!(file_metadata.version(), 1); + assert_eq!(file_metadata.column_orders(), None); + + let row_group_metadata = metadata.row_group(0); + + // Check each column order + for i in 0..row_group_metadata.num_columns() { + assert_eq!(file_metadata.column_order(i), ColumnOrder::UNDEFINED); + } + + // Test row group reader + let row_group_reader_result = reader.get_row_group(0); + assert!(row_group_reader_result.is_ok()); + let row_group_reader: Box = row_group_reader_result.unwrap(); + assert_eq!( + row_group_reader.num_columns(), + row_group_metadata.num_columns() + ); + assert_eq!( + row_group_reader.metadata().total_byte_size(), + row_group_metadata.total_byte_size() + ); + + // Test page readers + // TODO: test for every column + let page_reader_0_result = row_group_reader.get_column_page_reader(0); + assert!(page_reader_0_result.is_ok()); + let mut page_reader_0: Box = page_reader_0_result.unwrap(); + let mut page_count = 0; + while let Ok(Some(page)) = page_reader_0.get_next_page() { + let is_expected_page = match page { + Page::DictionaryPage { + buf, + num_values, + encoding, + is_sorted, + } => { + assert_eq!(buf.len(), 7); + assert_eq!(num_values, 1); + assert_eq!(encoding, Encoding::PLAIN); + assert_eq!(is_sorted, false); + true + } + Page::DataPageV2 { + buf, + num_values, + encoding, + num_nulls, + num_rows, + def_levels_byte_len, + rep_levels_byte_len, + is_compressed, + statistics, + } => { + assert_eq!(buf.len(), 4); + assert_eq!(num_values, 5); + assert_eq!(encoding, Encoding::RLE_DICTIONARY); + assert_eq!(num_nulls, 1); + assert_eq!(num_rows, 5); + assert_eq!(def_levels_byte_len, 2); + assert_eq!(rep_levels_byte_len, 0); + assert_eq!(is_compressed, true); + assert!(statistics.is_some()); + true + } + _ => false, + }; + assert!(is_expected_page); + page_count += 1; + } + assert_eq!(page_count, 2); + } +} diff --git a/rust/src/parquet/file/statistics.rs b/rust/src/parquet/file/statistics.rs new file mode 100644 index 0000000000000..ff4d731857f16 --- /dev/null +++ b/rust/src/parquet/file/statistics.rs @@ -0,0 +1,692 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains definitions for working with Parquet statistics. +//! +//! Though some common methods are available on enum, use pattern match to extract +//! actual min and max values from statistics, see below: +//! +//! ```rust +//! use arrow::parquet::file::statistics::Statistics; +//! +//! 
let stats = Statistics::int32(Some(1), Some(10), None, 3, true); +//! assert_eq!(stats.null_count(), 3); +//! assert!(stats.has_min_max_set()); +//! assert!(stats.is_min_max_deprecated()); +//! +//! match stats { +//! Statistics::Int32(ref typed) => { +//! assert_eq!(*typed.min(), 1); +//! assert_eq!(*typed.max(), 10); +//! } +//! _ => {} +//! } +//! ``` + +use std::{cmp, fmt}; + +use byteorder::{ByteOrder, LittleEndian}; +use parquet_format::Statistics as TStatistics; + +use crate::parquet::basic::Type; +use crate::parquet::data_type::*; + +// Macro to generate methods create Statistics. +macro_rules! statistics_new_func { + ($func:ident, $vtype:ty, $stat:ident) => { + pub fn $func( + min: $vtype, + max: $vtype, + distinct: Option, + nulls: u64, + is_deprecated: bool, + ) -> Self { + Statistics::$stat(TypedStatistics::new( + min, + max, + distinct, + nulls, + is_deprecated, + )) + } + }; +} + +// Macro to generate getter functions for Statistics. +macro_rules! statistics_enum_func { + ($self:ident, $func:ident) => {{ + match *$self { + Statistics::Boolean(ref typed) => typed.$func(), + Statistics::Int32(ref typed) => typed.$func(), + Statistics::Int64(ref typed) => typed.$func(), + Statistics::Int96(ref typed) => typed.$func(), + Statistics::Float(ref typed) => typed.$func(), + Statistics::Double(ref typed) => typed.$func(), + Statistics::ByteArray(ref typed) => typed.$func(), + Statistics::FixedLenByteArray(ref typed) => typed.$func(), + } + }}; +} + +/// Converts Thrift definition into `Statistics`. +pub fn from_thrift(physical_type: Type, thrift_stats: Option) -> Option { + match thrift_stats { + Some(stats) => { + // Number of nulls recorded, when it is not available, we just mark it as 0. + let null_count = stats.null_count.unwrap_or(0); + assert!( + null_count >= 0, + "Statistics null count is negative ({})", + null_count + ); + + // Generic null count. + let null_count = null_count as u64; + // Generic distinct count (count of distinct values occurring) + let distinct_count = stats.distinct_count.map(|value| value as u64); + // Whether or not statistics use deprecated min/max fields. + let old_format = stats.min_value.is_none() && stats.max_value.is_none(); + // Generic min value as bytes. + let min = if old_format { + stats.min + } else { + stats.min_value + }; + // Generic max value as bytes. + let max = if old_format { + stats.max + } else { + stats.max_value + }; + + // Values are encoded using PLAIN encoding definition, except that + // variable-length byte arrays do not include a length prefix. + // + // Instead of using actual decoder, we manually convert values. + let res = match physical_type { + Type::BOOLEAN => Statistics::boolean( + min.map(|data| data[0] != 0), + max.map(|data| data[0] != 0), + distinct_count, + null_count, + old_format, + ), + Type::INT32 => Statistics::int32( + min.map(|data| LittleEndian::read_i32(&data)), + max.map(|data| LittleEndian::read_i32(&data)), + distinct_count, + null_count, + old_format, + ), + Type::INT64 => Statistics::int64( + min.map(|data| LittleEndian::read_i64(&data)), + max.map(|data| LittleEndian::read_i64(&data)), + distinct_count, + null_count, + old_format, + ), + Type::INT96 => { + // INT96 statistics may not be correct, because comparison is signed + // byte-wise, not actual timestamps. It is recommended to ignore min/max + // statistics for INT96 columns. 
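+                    //
+                    // The 12-byte plain-encoded value is reinterpreted below as three
+                    // 32-bit words to build an `Int96`.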
+ let min = min.map(|data| { + assert_eq!(data.len(), 12); + unsafe { + let raw = ::std::slice::from_raw_parts(data.as_ptr() as *mut u32, 3); + Int96::from(Vec::from(raw)) + } + }); + let max = max.map(|data| { + assert_eq!(data.len(), 12); + unsafe { + let raw = ::std::slice::from_raw_parts(data.as_ptr() as *mut u32, 3); + Int96::from(Vec::from(raw)) + } + }); + Statistics::int96(min, max, distinct_count, null_count, old_format) + } + Type::FLOAT => Statistics::float( + min.map(|data| LittleEndian::read_f32(&data)), + max.map(|data| LittleEndian::read_f32(&data)), + distinct_count, + null_count, + old_format, + ), + Type::DOUBLE => Statistics::double( + min.map(|data| LittleEndian::read_f64(&data)), + max.map(|data| LittleEndian::read_f64(&data)), + distinct_count, + null_count, + old_format, + ), + Type::BYTE_ARRAY => Statistics::byte_array( + min.map(|data| ByteArray::from(data)), + max.map(|data| ByteArray::from(data)), + distinct_count, + null_count, + old_format, + ), + Type::FIXED_LEN_BYTE_ARRAY => Statistics::fixed_len_byte_array( + min.map(|data| ByteArray::from(data)), + max.map(|data| ByteArray::from(data)), + distinct_count, + null_count, + old_format, + ), + }; + + Some(res) + } + None => None, + } +} + +// Convert Statistics into Thrift definition. +pub fn to_thrift(stats: Option<&Statistics>) -> Option { + if stats.is_none() { + return None; + } + + let stats = stats.unwrap(); + + let mut thrift_stats = TStatistics { + max: None, + min: None, + null_count: if stats.has_nulls() { + Some(stats.null_count() as i64) + } else { + None + }, + distinct_count: stats.distinct_count().map(|value| value as i64), + max_value: None, + min_value: None, + }; + + // Get min/max if set. + let (min, max) = if stats.has_min_max_set() { + ( + Some(stats.min_bytes().to_vec()), + Some(stats.max_bytes().to_vec()), + ) + } else { + (None, None) + }; + + if stats.is_min_max_deprecated() { + thrift_stats.min = min; + thrift_stats.max = max; + } else { + thrift_stats.min_value = min; + thrift_stats.max_value = max; + } + + Some(thrift_stats) +} + +/// Statistics for a column chunk and data page. +#[derive(Debug, PartialEq)] +pub enum Statistics { + Boolean(TypedStatistics), + Int32(TypedStatistics), + Int64(TypedStatistics), + Int96(TypedStatistics), + Float(TypedStatistics), + Double(TypedStatistics), + ByteArray(TypedStatistics), + FixedLenByteArray(TypedStatistics), +} + +impl Statistics { + statistics_new_func![boolean, Option, Boolean]; + + statistics_new_func![int32, Option, Int32]; + + statistics_new_func![int64, Option, Int64]; + + statistics_new_func![int96, Option, Int96]; + + statistics_new_func![float, Option, Float]; + + statistics_new_func![double, Option, Double]; + + statistics_new_func![byte_array, Option, ByteArray]; + + statistics_new_func![fixed_len_byte_array, Option, FixedLenByteArray]; + + /// Returns `true` if statistics have old `min` and `max` fields set. + /// This means that the column order is likely to be undefined, which, for old files + /// could mean a signed sort order of values. + /// + /// Refer to [`ColumnOrder`](`::basic::ColumnOrder`) and + /// [`SortOrder`](`::basic::SortOrder`) for more information. + pub fn is_min_max_deprecated(&self) -> bool { + statistics_enum_func![self, is_min_max_deprecated] + } + + /// Returns optional value of number of distinct values occurring. + /// When it is `None`, the value should be ignored. 
+ pub fn distinct_count(&self) -> Option { + statistics_enum_func![self, distinct_count] + } + + /// Returns number of null values for the column. + /// Note that this includes all nulls when column is part of the complex type. + pub fn null_count(&self) -> u64 { + statistics_enum_func![self, null_count] + } + + /// Returns `true` if statistics collected any null values, `false` otherwise. + pub fn has_nulls(&self) -> bool { + self.null_count() > 0 + } + + /// Returns `true` if min value and max value are set. + /// Normally both min/max values will be set to `Some(value)` or `None`. + pub fn has_min_max_set(&self) -> bool { + statistics_enum_func![self, has_min_max_set] + } + + /// Returns slice of bytes that represent min value. + /// Panics if min value is not set. + pub fn min_bytes(&self) -> &[u8] { + statistics_enum_func![self, min_bytes] + } + + /// Returns slice of bytes that represent max value. + /// Panics if max value is not set. + pub fn max_bytes(&self) -> &[u8] { + statistics_enum_func![self, max_bytes] + } + + /// Returns physical type associated with statistics. + pub fn physical_type(&self) -> Type { + match self { + Statistics::Boolean(_) => Type::BOOLEAN, + Statistics::Int32(_) => Type::INT32, + Statistics::Int64(_) => Type::INT64, + Statistics::Int96(_) => Type::INT96, + Statistics::Float(_) => Type::FLOAT, + Statistics::Double(_) => Type::DOUBLE, + Statistics::ByteArray(_) => Type::BYTE_ARRAY, + Statistics::FixedLenByteArray(_) => Type::FIXED_LEN_BYTE_ARRAY, + } + } +} + +impl fmt::Display for Statistics { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Statistics::Boolean(typed) => write!(f, "{}", typed), + Statistics::Int32(typed) => write!(f, "{}", typed), + Statistics::Int64(typed) => write!(f, "{}", typed), + Statistics::Int96(typed) => write!(f, "{}", typed), + Statistics::Float(typed) => write!(f, "{}", typed), + Statistics::Double(typed) => write!(f, "{}", typed), + Statistics::ByteArray(typed) => write!(f, "{}", typed), + Statistics::FixedLenByteArray(typed) => write!(f, "{}", typed), + } + } +} + +/// Typed implementation for [`Statistics`]. +pub struct TypedStatistics { + min: Option, + max: Option, + // Distinct count could be omitted in some cases + distinct_count: Option, + null_count: u64, + is_min_max_deprecated: bool, +} + +impl TypedStatistics { + /// Creates new typed statistics. + pub fn new( + min: Option, + max: Option, + distinct_count: Option, + null_count: u64, + is_min_max_deprecated: bool, + ) -> Self { + Self { + min, + max, + distinct_count, + null_count, + is_min_max_deprecated, + } + } + + /// Returns min value of the statistics. + /// + /// Panics if min value is not set, e.g. all values are `null`. + /// Use `has_min_max_set` method to check that. + pub fn min(&self) -> &T::T { + self.min.as_ref().unwrap() + } + + /// Returns max value of the statistics. + /// + /// Panics if max value is not set, e.g. all values are `null`. + /// Use `has_min_max_set` method to check that. + pub fn max(&self) -> &T::T { + self.max.as_ref().unwrap() + } + + /// Returns min value as bytes of the statistics. + /// + /// Panics if min value is not set, use `has_min_max_set` method to check + /// if values are set. + pub fn min_bytes(&self) -> &[u8] { + self.min().as_bytes() + } + + /// Returns max value as bytes of the statistics. + /// + /// Panics if max value is not set, use `has_min_max_set` method to check + /// if values are set. 
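+    ///
+    /// A guarded-access sketch (using the `Statistics` wrapper):
+    ///
+    /// ```ignore
+    /// if stats.has_min_max_set() {
+    ///     let max = stats.max_bytes();
+    /// }
+    /// ```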
+ pub fn max_bytes(&self) -> &[u8] { + self.max().as_bytes() + } + + /// Whether or not min and max values are set. + /// Normally both min/max values will be set to `Some(value)` or `None`. + fn has_min_max_set(&self) -> bool { + self.min.is_some() && self.max.is_some() + } + + /// Returns optional value of number of distinct values occurring. + fn distinct_count(&self) -> Option { + self.distinct_count + } + + /// Returns null count. + fn null_count(&self) -> u64 { + self.null_count + } + + /// Returns `true` if statistics were created using old min/max fields. + fn is_min_max_deprecated(&self) -> bool { + self.is_min_max_deprecated + } +} + +impl fmt::Display for TypedStatistics { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{{")?; + write!(f, "min: ")?; + match self.min { + Some(ref value) => self.value_fmt(f, value)?, + None => write!(f, "N/A")?, + } + write!(f, ", max: ")?; + match self.max { + Some(ref value) => self.value_fmt(f, value)?, + None => write!(f, "N/A")?, + } + write!(f, ", distinct_count: ")?; + match self.distinct_count { + Some(value) => write!(f, "{}", value)?, + None => write!(f, "N/A")?, + } + write!(f, ", null_count: {}", self.null_count)?; + write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?; + write!(f, "}}") + } +} + +impl fmt::Debug for TypedStatistics { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {}, \ + min_max_deprecated: {}}}", + self.min, self.max, self.distinct_count, self.null_count, self.is_min_max_deprecated + ) + } +} + +impl cmp::PartialEq for TypedStatistics { + fn eq(&self, other: &TypedStatistics) -> bool { + self.min == other.min + && self.max == other.max + && self.distinct_count == other.distinct_count + && self.null_count == other.null_count + && self.is_min_max_deprecated == other.is_min_max_deprecated + } +} + +/// Trait to provide a specific write format for values. +/// For example, we should display vector slices for byte array types, and original +/// values for other types. 
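+///
+/// The blanket implementation below uses a `default fn` (specialization) so that the
+/// `Int96` and byte-array cases can override it.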
+trait ValueDisplay { + fn value_fmt(&self, f: &mut fmt::Formatter, value: &T::T) -> fmt::Result; +} + +impl ValueDisplay for TypedStatistics { + default fn value_fmt(&self, f: &mut fmt::Formatter, value: &T::T) -> fmt::Result { + write!(f, "{:?}", value) + } +} + +impl ValueDisplay for TypedStatistics { + fn value_fmt(&self, f: &mut fmt::Formatter, value: &Int96) -> fmt::Result { + write!(f, "{:?}", value.data()) + } +} + +impl ValueDisplay for TypedStatistics { + fn value_fmt(&self, f: &mut fmt::Formatter, value: &ByteArray) -> fmt::Result { + write!(f, "{:?}", value.data()) + } +} + +impl ValueDisplay for TypedStatistics { + fn value_fmt(&self, f: &mut fmt::Formatter, value: &ByteArray) -> fmt::Result { + write!(f, "{:?}", value.data()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_statistics_min_max_bytes() { + let stats = Statistics::int32(Some(-123), Some(234), None, 1, false); + assert!(stats.has_min_max_set()); + assert_eq!(stats.min_bytes(), (-123).as_bytes()); + assert_eq!(stats.max_bytes(), 234.as_bytes()); + + let stats = Statistics::byte_array( + Some(ByteArray::from(vec![1, 2, 3])), + Some(ByteArray::from(vec![3, 4, 5])), + None, + 1, + true, + ); + assert!(stats.has_min_max_set()); + assert_eq!(stats.min_bytes(), &[1, 2, 3]); + assert_eq!(stats.max_bytes(), &[3, 4, 5]); + } + + #[test] + #[should_panic(expected = "Statistics null count is negative (-10)")] + fn test_statistics_negative_null_count() { + let thrift_stats = TStatistics { + max: None, + min: None, + null_count: Some(-10), + distinct_count: None, + max_value: None, + min_value: None, + }; + + from_thrift(Type::INT32, Some(thrift_stats)); + } + + #[test] + fn test_statistics_thrift_none() { + assert_eq!(from_thrift(Type::INT32, None), None); + assert_eq!(from_thrift(Type::BYTE_ARRAY, None), None); + } + + #[test] + fn test_statistics_debug() { + let stats = Statistics::int32(Some(1), Some(12), None, 12, true); + assert_eq!( + format!("{:?}", stats), + "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: 12, \ + min_max_deprecated: true})" + ); + + let stats = Statistics::int32(None, None, None, 7, false); + assert_eq!( + format!("{:?}", stats), + "Int32({min: None, max: None, distinct_count: None, null_count: 7, \ + min_max_deprecated: false})" + ) + } + + #[test] + fn test_statistics_display() { + let stats = Statistics::int32(Some(1), Some(12), None, 12, true); + assert_eq!( + format!("{}", stats), + "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true}" + ); + + let stats = Statistics::int64(None, None, None, 7, false); + assert_eq!( + format!("{}", stats), + "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, min_max_deprecated: \ + false}" + ); + + let stats = Statistics::int96( + Some(Int96::from(vec![1, 0, 0])), + Some(Int96::from(vec![2, 3, 4])), + None, + 3, + true, + ); + assert_eq!( + format!("{}", stats), + "{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, \ + min_max_deprecated: true}" + ); + + let stats = Statistics::byte_array( + Some(ByteArray::from(vec![1u8])), + Some(ByteArray::from(vec![2u8])), + Some(5), + 7, + false, + ); + assert_eq!( + format!("{}", stats), + "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false}" + ); + } + + #[test] + fn test_statistics_partial_eq() { + let expected = Statistics::int32(Some(12), Some(45), None, 11, true); + + assert!(Statistics::int32(Some(12), Some(45), None, 11, true) == expected); + assert!(Statistics::int32(Some(11), Some(45), 
None, 11, true) != expected); + assert!(Statistics::int32(Some(12), Some(44), None, 11, true) != expected); + assert!(Statistics::int32(Some(12), Some(45), None, 23, true) != expected); + assert!(Statistics::int32(Some(12), Some(45), None, 11, false) != expected); + + assert!( + Statistics::int32(Some(12), Some(45), None, 11, false) + != Statistics::int64(Some(12), Some(45), None, 11, false) + ); + + assert!( + Statistics::boolean(Some(false), Some(true), None, 0, true) + != Statistics::double(Some(1.2), Some(4.5), None, 0, true) + ); + + assert!( + Statistics::byte_array( + Some(ByteArray::from(vec![1, 2, 3])), + Some(ByteArray::from(vec![1, 2, 3])), + None, + 0, + true + ) != Statistics::fixed_len_byte_array( + Some(ByteArray::from(vec![1, 2, 3])), + Some(ByteArray::from(vec![1, 2, 3])), + None, + 0, + true + ) + ); + } + + #[test] + fn test_statistics_from_thrift() { + // Helper method to check statistics conversion. + fn check_stats(stats: Statistics) { + let tpe = stats.physical_type(); + let thrift_stats = to_thrift(Some(&stats)); + assert_eq!(from_thrift(tpe, thrift_stats), Some(stats)); + } + + check_stats(Statistics::boolean(Some(false), Some(true), None, 7, true)); + check_stats(Statistics::boolean(Some(false), Some(true), None, 7, true)); + check_stats(Statistics::boolean(Some(false), Some(true), None, 0, false)); + check_stats(Statistics::boolean(Some(true), Some(true), None, 7, true)); + check_stats(Statistics::boolean(Some(false), Some(false), None, 7, true)); + check_stats(Statistics::boolean(None, None, None, 7, true)); + + check_stats(Statistics::int32(Some(-100), Some(500), None, 7, true)); + check_stats(Statistics::int32(Some(-100), Some(500), None, 0, false)); + check_stats(Statistics::int32(None, None, None, 7, true)); + + check_stats(Statistics::int64(Some(-100), Some(200), None, 7, true)); + check_stats(Statistics::int64(Some(-100), Some(200), None, 0, false)); + check_stats(Statistics::int64(None, None, None, 7, true)); + + check_stats(Statistics::float(Some(1.2), Some(3.4), None, 7, true)); + check_stats(Statistics::float(Some(1.2), Some(3.4), None, 0, false)); + check_stats(Statistics::float(None, None, None, 7, true)); + + check_stats(Statistics::double(Some(1.2), Some(3.4), None, 7, true)); + check_stats(Statistics::double(Some(1.2), Some(3.4), None, 0, false)); + check_stats(Statistics::double(None, None, None, 7, true)); + + check_stats(Statistics::byte_array( + Some(ByteArray::from(vec![1, 2, 3])), + Some(ByteArray::from(vec![3, 4, 5])), + None, + 7, + true, + )); + check_stats(Statistics::byte_array(None, None, None, 7, true)); + + check_stats(Statistics::fixed_len_byte_array( + Some(ByteArray::from(vec![1, 2, 3])), + Some(ByteArray::from(vec![3, 4, 5])), + None, + 7, + true, + )); + check_stats(Statistics::fixed_len_byte_array(None, None, None, 7, true)); + } +} diff --git a/rust/src/parquet/file/writer.rs b/rust/src/parquet/file/writer.rs new file mode 100644 index 0000000000000..1e0c11641f9a4 --- /dev/null +++ b/rust/src/parquet/file/writer.rs @@ -0,0 +1,936 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains file writer API, and provides methods to write row groups and columns by +//! using row group writers and column writers respectively. + +use std::{ + fs::File, + io::{Seek, SeekFrom, Write}, + rc::Rc, +}; + +use byteorder::{ByteOrder, LittleEndian}; +use parquet_format as parquet; +use thrift::protocol::{TCompactOutputProtocol, TOutputProtocol}; + +use crate::parquet::basic::PageType; +use crate::parquet::column::{ + page::{CompressedPage, Page, PageWriteSpec, PageWriter}, + writer::{get_column_writer, ColumnWriter}, +}; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::file::{ + metadata::*, properties::WriterPropertiesPtr, statistics::to_thrift as statistics_to_thrift, + FOOTER_SIZE, PARQUET_MAGIC, +}; +use crate::parquet::schema::types::{self, SchemaDescPtr, SchemaDescriptor, TypePtr}; +use crate::parquet::util::io::{FileSink, Position}; + +// ---------------------------------------------------------------------- +// APIs for file & row group writers + +/// Parquet file writer API. +/// Provides methods to write row groups sequentially. +/// +/// The main workflow should be as following: +/// - Create file writer, this will open a new file and potentially write some metadata. +/// - Request a new row group writer by calling `next_row_group`. +/// - Once finished writing row group, close row group writer by passing it into +/// `close_row_group` method - this will finalise row group metadata and update metrics. +/// - Write subsequent row groups, if necessary. +/// - After all row groups have been written, close the file writer using `close` method. +pub trait FileWriter { + /// Creates new row group from this file writer. + /// In case of IO error or Thrift error, returns `Err`. + /// + /// There is no limit on a number of row groups in a file; however, row groups have + /// to be written sequentially. Every time the next row group is requested, the + /// previous row group must be finalised and closed using `close_row_group` method. + fn next_row_group(&mut self) -> Result>; + + /// Finalises and closes row group that was created using `next_row_group` method. + /// After calling this method, the next row group is available for writes. + fn close_row_group(&mut self, row_group_writer: Box) -> Result<()>; + + /// Closes and finalises file writer. + /// + /// All row groups must be appended before this method is called. + /// No writes are allowed after this point. + /// + /// Can be called multiple times. It is up to implementation to either result in no-op, + /// or return an `Err` for subsequent calls. + fn close(&mut self) -> Result<()>; +} + +/// Parquet row group writer API. +/// Provides methods to access column writers in an iterator-like fashion, order is +/// guaranteed to match the order of schema leaves (column descriptors). +/// +/// All columns should be written sequentially; the main workflow is: +/// - Request the next column using `next_column` method - this will return `None` if no +/// more columns are available to write. 
+/// - Once done writing a column, close column writer with `close_column` method - this +/// will finalise column chunk metadata and update row group metrics. +/// - Once all columns have been written, close row group writer with `close` method - +/// it will return row group metadata and is no-op on already closed row group. +pub trait RowGroupWriter { + /// Returns the next column writer, if available; otherwise returns `None`. + /// In case of any IO error or Thrift error, or if row group writer has already been + /// closed returns `Err`. + /// + /// To request the next column writer, the previous one must be finalised and closed + /// using `close_column`. + fn next_column(&mut self) -> Result>; + + /// Closes column writer that was created using `next_column` method. + /// This should be called before requesting the next column writer. + fn close_column(&mut self, column_writer: ColumnWriter) -> Result<()>; + + /// Closes this row group writer and returns row group metadata. + /// After calling this method row group writer must not be used. + /// + /// It is recommended to call this method before requesting another row group, but it + /// will be closed automatically before returning a new row group. + /// + /// Can be called multiple times. In subsequent calls will result in no-op and return + /// already created row group metadata. + fn close(&mut self) -> Result; +} + +// ---------------------------------------------------------------------- +// Serialized impl for file & row group writers + +/// A serialized implementation for Parquet [`FileWriter`]. +/// See documentation on file writer for more information. +pub struct SerializedFileWriter { + file: File, + schema: TypePtr, + descr: SchemaDescPtr, + props: WriterPropertiesPtr, + total_num_rows: u64, + row_groups: Vec, + previous_writer_closed: bool, + is_closed: bool, +} + +impl SerializedFileWriter { + /// Creates new file writer. + pub fn new(mut file: File, schema: TypePtr, properties: WriterPropertiesPtr) -> Result { + Self::start_file(&mut file)?; + Ok(Self { + file, + schema: schema.clone(), + descr: Rc::new(SchemaDescriptor::new(schema)), + props: properties, + total_num_rows: 0, + row_groups: Vec::new(), + previous_writer_closed: true, + is_closed: false, + }) + } + + /// Writes magic bytes at the beginning of the file. + fn start_file(file: &mut File) -> Result<()> { + file.write(&PARQUET_MAGIC)?; + Ok(()) + } + + /// Finalises active row group writer, otherwise no-op. + fn finalise_row_group_writer( + &mut self, + mut row_group_writer: Box, + ) -> Result<()> { + let row_group_metadata = row_group_writer.close()?; + self.row_groups.push(row_group_metadata); + Ok(()) + } + + /// Assembles and writes metadata at the end of the file. 
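+    /// The layout follows the Parquet footer format: the Thrift-encoded `FileMetaData`
+    /// struct is written first, followed by the 4-byte little-endian metadata length
+    /// and the 4-byte `PAR1` magic.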
+ fn write_metadata(&mut self) -> Result<()> { + let file_metadata = parquet::FileMetaData { + version: self.props.writer_version().as_num(), + schema: types::to_thrift(self.schema.as_ref())?, + num_rows: self.total_num_rows as i64, + row_groups: self + .row_groups + .as_slice() + .into_iter() + .map(|v| v.to_thrift()) + .collect(), + key_value_metadata: None, + created_by: Some(self.props.created_by().to_owned()), + column_orders: None, + }; + + // Write file metadata + let start_pos = self.file.seek(SeekFrom::Current(0))?; + { + let mut protocol = TCompactOutputProtocol::new(&mut self.file); + file_metadata.write_to_out_protocol(&mut protocol)?; + protocol.flush()?; + } + let end_pos = self.file.seek(SeekFrom::Current(0))?; + + // Write footer + let mut footer_buffer: [u8; FOOTER_SIZE] = [0; FOOTER_SIZE]; + let metadata_len = (end_pos - start_pos) as i32; + LittleEndian::write_i32(&mut footer_buffer, metadata_len); + (&mut footer_buffer[4..]).write(&PARQUET_MAGIC)?; + self.file.write(&footer_buffer)?; + Ok(()) + } + + #[inline] + fn assert_closed(&self) -> Result<()> { + if self.is_closed { + Err(general_err!("File writer is closed")) + } else { + Ok(()) + } + } + + #[inline] + fn assert_previous_writer_closed(&self) -> Result<()> { + if !self.previous_writer_closed { + Err(general_err!("Previous row group writer was not closed")) + } else { + Ok(()) + } + } +} + +impl FileWriter for SerializedFileWriter { + #[inline] + fn next_row_group(&mut self) -> Result> { + self.assert_closed()?; + self.assert_previous_writer_closed()?; + let row_group_writer = + SerializedRowGroupWriter::new(self.descr.clone(), self.props.clone(), &self.file); + self.previous_writer_closed = false; + Ok(Box::new(row_group_writer)) + } + + #[inline] + fn close_row_group(&mut self, row_group_writer: Box) -> Result<()> { + self.assert_closed()?; + let res = self.finalise_row_group_writer(row_group_writer); + self.previous_writer_closed = res.is_ok(); + res + } + + #[inline] + fn close(&mut self) -> Result<()> { + self.assert_closed()?; + self.assert_previous_writer_closed()?; + self.write_metadata()?; + self.is_closed = true; + Ok(()) + } +} + +/// A serialized implementation for Parquet [`RowGroupWriter`]. +/// Coordinates writing of a row group with column writers. +/// See documentation on row group writer for more information. +pub struct SerializedRowGroupWriter { + descr: SchemaDescPtr, + props: WriterPropertiesPtr, + file: File, + total_rows_written: Option, + total_bytes_written: u64, + column_index: usize, + previous_writer_closed: bool, + row_group_metadata: Option, + column_chunks: Vec, +} + +impl SerializedRowGroupWriter { + pub fn new(schema_descr: SchemaDescPtr, properties: WriterPropertiesPtr, file: &File) -> Self { + let num_columns = schema_descr.num_columns(); + Self { + descr: schema_descr, + props: properties, + file: file.try_clone().unwrap(), + total_rows_written: None, + total_bytes_written: 0, + column_index: 0, + previous_writer_closed: true, + row_group_metadata: None, + column_chunks: Vec::with_capacity(num_columns), + } + } + + /// Checks and finalises current column writer. 
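+    /// Closes the typed writer for the column, accumulates the bytes written and the
+    /// resulting column chunk metadata, and verifies that every column in the row
+    /// group reports the same number of rows.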
+ fn finalise_column_writer(&mut self, writer: ColumnWriter) -> Result<()> { + let (bytes_written, rows_written, metadata) = match writer { + ColumnWriter::BoolColumnWriter(typed) => typed.close()?, + ColumnWriter::Int32ColumnWriter(typed) => typed.close()?, + ColumnWriter::Int64ColumnWriter(typed) => typed.close()?, + ColumnWriter::Int96ColumnWriter(typed) => typed.close()?, + ColumnWriter::FloatColumnWriter(typed) => typed.close()?, + ColumnWriter::DoubleColumnWriter(typed) => typed.close()?, + ColumnWriter::ByteArrayColumnWriter(typed) => typed.close()?, + ColumnWriter::FixedLenByteArrayColumnWriter(typed) => typed.close()?, + }; + + // Update row group writer metrics + self.total_bytes_written += bytes_written; + self.column_chunks.push(Rc::new(metadata)); + if let Some(rows) = self.total_rows_written { + if rows != rows_written { + return Err(general_err!( + "Incorrect number of rows, expected {} != {} rows", + rows, + rows_written + )); + } + } else { + self.total_rows_written = Some(rows_written); + } + + Ok(()) + } + + #[inline] + fn assert_closed(&self) -> Result<()> { + if self.row_group_metadata.is_some() { + Err(general_err!("Row group writer is closed")) + } else { + Ok(()) + } + } + + #[inline] + fn assert_previous_writer_closed(&self) -> Result<()> { + if !self.previous_writer_closed { + Err(general_err!("Previous column writer was not closed")) + } else { + Ok(()) + } + } +} + +impl RowGroupWriter for SerializedRowGroupWriter { + #[inline] + fn next_column(&mut self) -> Result> { + self.assert_closed()?; + self.assert_previous_writer_closed()?; + + if self.column_index >= self.descr.num_columns() { + return Ok(None); + } + let sink = FileSink::new(&self.file); + let page_writer = Box::new(SerializedPageWriter::new(sink)); + let column_writer = get_column_writer( + self.descr.column(self.column_index), + self.props.clone(), + page_writer, + ); + self.column_index += 1; + self.previous_writer_closed = false; + + Ok(Some(column_writer)) + } + + #[inline] + fn close_column(&mut self, column_writer: ColumnWriter) -> Result<()> { + let res = self.finalise_column_writer(column_writer); + self.previous_writer_closed = res.is_ok(); + res + } + + #[inline] + fn close(&mut self) -> Result { + if self.row_group_metadata.is_none() { + self.assert_previous_writer_closed()?; + + let row_group_metadata = RowGroupMetaData::builder(self.descr.clone()) + .set_column_metadata(self.column_chunks.clone()) + .set_total_byte_size(self.total_bytes_written as i64) + .set_num_rows(self.total_rows_written.unwrap_or(0) as i64) + .build()?; + + self.row_group_metadata = Some(Rc::new(row_group_metadata)); + } + + let metadata = self.row_group_metadata.as_ref().unwrap().clone(); + Ok(metadata) + } +} + +/// A serialized implementation for Parquet [`PageWriter`]. +/// Writes and serializes pages and metadata into output stream. +/// +/// `SerializedPageWriter` should not be used after calling `close()`. +pub struct SerializedPageWriter { + sink: T, +} + +impl SerializedPageWriter { + /// Creates new page writer. + pub fn new(sink: T) -> Self { + Self { sink } + } + + /// Serializes page header into Thrift. + /// Returns number of bytes that have been written into the sink. 
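+    /// The byte count is derived from the sink position before and after the header
+    /// is written through the Thrift compact protocol.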
+ #[inline] + fn serialize_page_header(&mut self, header: parquet::PageHeader) -> Result { + let start_pos = self.sink.pos(); + { + let mut protocol = TCompactOutputProtocol::new(&mut self.sink); + header.write_to_out_protocol(&mut protocol)?; + protocol.flush()?; + } + Ok((self.sink.pos() - start_pos) as usize) + } + + /// Serializes column chunk into Thrift. + /// Returns Ok() if there are not errors serializing and writing data into the sink. + #[inline] + fn serialize_column_chunk(&mut self, chunk: parquet::ColumnChunk) -> Result<()> { + let mut protocol = TCompactOutputProtocol::new(&mut self.sink); + chunk.write_to_out_protocol(&mut protocol)?; + protocol.flush()?; + Ok(()) + } +} + +impl PageWriter for SerializedPageWriter { + fn write_page(&mut self, page: CompressedPage) -> Result { + let uncompressed_size = page.uncompressed_size(); + let compressed_size = page.compressed_size(); + let num_values = page.num_values(); + let encoding = page.encoding(); + let page_type = page.page_type(); + + let mut page_header = parquet::PageHeader { + type_: page_type.into(), + uncompressed_page_size: uncompressed_size as i32, + compressed_page_size: compressed_size as i32, + // TODO: Add support for crc checksum + crc: None, + data_page_header: None, + index_page_header: None, + dictionary_page_header: None, + data_page_header_v2: None, + }; + + match page.compressed_page() { + &Page::DataPage { + def_level_encoding, + rep_level_encoding, + ref statistics, + .. + } => { + let data_page_header = parquet::DataPageHeader { + num_values: num_values as i32, + encoding: encoding.into(), + definition_level_encoding: def_level_encoding.into(), + repetition_level_encoding: rep_level_encoding.into(), + statistics: statistics_to_thrift(statistics.as_ref()), + }; + page_header.data_page_header = Some(data_page_header); + } + &Page::DataPageV2 { + num_nulls, + num_rows, + def_levels_byte_len, + rep_levels_byte_len, + is_compressed, + ref statistics, + .. + } => { + let data_page_header_v2 = parquet::DataPageHeaderV2 { + num_values: num_values as i32, + num_nulls: num_nulls as i32, + num_rows: num_rows as i32, + encoding: encoding.into(), + definition_levels_byte_length: def_levels_byte_len as i32, + repetition_levels_byte_length: rep_levels_byte_len as i32, + is_compressed: Some(is_compressed), + statistics: statistics_to_thrift(statistics.as_ref()), + }; + page_header.data_page_header_v2 = Some(data_page_header_v2); + } + &Page::DictionaryPage { is_sorted, .. 
} => { + let dictionary_page_header = parquet::DictionaryPageHeader { + num_values: num_values as i32, + encoding: encoding.into(), + is_sorted: Some(is_sorted), + }; + page_header.dictionary_page_header = Some(dictionary_page_header); + } + } + + let start_pos = self.sink.pos(); + + let header_size = self.serialize_page_header(page_header)?; + self.sink.write_all(page.data())?; + + let mut spec = PageWriteSpec::new(); + spec.page_type = page_type; + spec.uncompressed_size = uncompressed_size + header_size; + spec.compressed_size = compressed_size + header_size; + spec.offset = start_pos; + spec.bytes_written = self.sink.pos() - start_pos; + // Number of values is incremented for data pages only + if page_type == PageType::DATA_PAGE || page_type == PageType::DATA_PAGE_V2 { + spec.num_values = num_values; + } + + Ok(spec) + } + + fn write_metadata(&mut self, metadata: &ColumnChunkMetaData) -> Result<()> { + self.serialize_column_chunk(metadata.to_thrift()) + } + + fn close(&mut self) -> Result<()> { + self.sink.flush()?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::{error::Error, io::Cursor}; + + use crate::parquet::basic::{Compression, Encoding, Repetition, Type}; + use crate::parquet::column::page::PageReader; + use crate::parquet::compression::{create_codec, Codec}; + use crate::parquet::file::{ + properties::WriterProperties, + reader::{FileReader, SerializedFileReader, SerializedPageReader}, + statistics::{from_thrift, to_thrift, Statistics}, + }; + use crate::parquet::record::RowAccessor; + use crate::parquet::util::{memory::ByteBufferPtr, test_common::get_temp_file}; + + #[test] + fn test_file_writer_error_after_close() { + let file = get_temp_file("test_file_writer_error_after_close", &[]); + let schema = Rc::new(types::Type::group_type_builder("schema").build().unwrap()); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = SerializedFileWriter::new(file, schema, props).unwrap(); + writer.close().unwrap(); + { + let res = writer.next_row_group(); + assert!(res.is_err()); + if let Err(err) = res { + assert_eq!(err.description(), "File writer is closed"); + } + } + { + let res = writer.close(); + assert!(res.is_err()); + if let Err(err) = res { + assert_eq!(err.description(), "File writer is closed"); + } + } + } + + #[test] + fn test_row_group_writer_error_after_close() { + let file = get_temp_file("test_file_writer_row_group_error_after_close", &[]); + let schema = Rc::new(types::Type::group_type_builder("schema").build().unwrap()); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = SerializedFileWriter::new(file, schema, props).unwrap(); + let mut row_group_writer = writer.next_row_group().unwrap(); + row_group_writer.close().unwrap(); + + let res = row_group_writer.next_column(); + assert!(res.is_err()); + if let Err(err) = res { + assert_eq!(err.description(), "Row group writer is closed"); + } + } + + #[test] + fn test_row_group_writer_error_not_all_columns_written() { + let file = get_temp_file("test_row_group_writer_error_not_all_columns_written", &[]); + let schema = Rc::new( + types::Type::group_type_builder("schema") + .with_fields(&mut vec![Rc::new( + types::Type::primitive_type_builder("col1", Type::INT32) + .build() + .unwrap(), + )]) + .build() + .unwrap(), + ); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = SerializedFileWriter::new(file, schema, props).unwrap(); + let mut row_group_writer = writer.next_row_group().unwrap(); + let res = 
row_group_writer.close(); + assert!(res.is_err()); + if let Err(err) = res { + assert_eq!(err.description(), "Column length mismatch: 1 != 0"); + } + } + + #[test] + fn test_row_group_writer_num_records_mismatch() { + let file = get_temp_file("test_row_group_writer_num_records_mismatch", &[]); + let schema = Rc::new( + types::Type::group_type_builder("schema") + .with_fields(&mut vec![ + Rc::new( + types::Type::primitive_type_builder("col1", Type::INT32) + .with_repetition(Repetition::REQUIRED) + .build() + .unwrap(), + ), + Rc::new( + types::Type::primitive_type_builder("col2", Type::INT32) + .with_repetition(Repetition::REQUIRED) + .build() + .unwrap(), + ), + ]) + .build() + .unwrap(), + ); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = SerializedFileWriter::new(file, schema, props).unwrap(); + let mut row_group_writer = writer.next_row_group().unwrap(); + + let mut col_writer = row_group_writer.next_column().unwrap().unwrap(); + if let ColumnWriter::Int32ColumnWriter(ref mut typed) = col_writer { + typed.write_batch(&[1, 2, 3], None, None).unwrap(); + } + row_group_writer.close_column(col_writer).unwrap(); + + let mut col_writer = row_group_writer.next_column().unwrap().unwrap(); + if let ColumnWriter::Int32ColumnWriter(ref mut typed) = col_writer { + typed.write_batch(&[1, 2], None, None).unwrap(); + } + + let res = row_group_writer.close_column(col_writer); + assert!(res.is_err()); + if let Err(err) = res { + assert_eq!( + err.description(), + "Incorrect number of rows, expected 3 != 2 rows" + ); + } + } + + #[test] + fn test_file_writer_empty_file() { + let file = get_temp_file("test_file_writer_write_empty_file", &[]); + + let schema = Rc::new( + types::Type::group_type_builder("schema") + .with_fields(&mut vec![Rc::new( + types::Type::primitive_type_builder("col1", Type::INT32) + .build() + .unwrap(), + )]) + .build() + .unwrap(), + ); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = + SerializedFileWriter::new(file.try_clone().unwrap(), schema, props).unwrap(); + writer.close().unwrap(); + + let reader = SerializedFileReader::new(file).unwrap(); + assert_eq!(reader.get_row_iter(None).unwrap().count(), 0); + } + + #[test] + fn test_file_writer_empty_row_groups() { + let file = get_temp_file("test_file_writer_write_empty_row_groups", &[]); + test_file_roundtrip(file, vec![]); + } + + #[test] + fn test_file_writer_single_row_group() { + let file = get_temp_file("test_file_writer_write_single_row_group", &[]); + test_file_roundtrip(file, vec![vec![1, 2, 3, 4, 5]]); + } + + #[test] + fn test_file_writer_multiple_row_groups() { + let file = get_temp_file("test_file_writer_write_multiple_row_groups", &[]); + test_file_roundtrip( + file, + vec![ + vec![1, 2, 3, 4, 5], + vec![1, 2, 3], + vec![1], + vec![1, 2, 3, 4, 5, 6], + ], + ); + } + + #[test] + fn test_file_writer_multiple_large_row_groups() { + let file = get_temp_file("test_file_writer_multiple_large_row_groups", &[]); + test_file_roundtrip( + file, + vec![vec![123; 1024], vec![124; 1000], vec![125; 15], vec![]], + ); + } + + #[test] + fn test_page_writer_data_pages() { + let pages = vec![ + Page::DataPage { + buf: ByteBufferPtr::new(vec![1, 2, 3, 4, 5, 6, 7, 8]), + num_values: 10, + encoding: Encoding::DELTA_BINARY_PACKED, + def_level_encoding: Encoding::RLE, + rep_level_encoding: Encoding::RLE, + statistics: Some(Statistics::int32(Some(1), Some(3), None, 7, true)), + }, + Page::DataPageV2 { + buf: ByteBufferPtr::new(vec![4; 128]), + num_values: 10, + encoding: 
Encoding::DELTA_BINARY_PACKED, + num_nulls: 2, + num_rows: 12, + def_levels_byte_len: 24, + rep_levels_byte_len: 32, + is_compressed: false, + statistics: Some(Statistics::int32(Some(1), Some(3), None, 7, true)), + }, + ]; + + test_page_roundtrip(&pages[..], Compression::SNAPPY, Type::INT32); + test_page_roundtrip(&pages[..], Compression::UNCOMPRESSED, Type::INT32); + } + + #[test] + fn test_page_writer_dict_pages() { + let pages = vec![ + Page::DictionaryPage { + buf: ByteBufferPtr::new(vec![1, 2, 3, 4, 5]), + num_values: 5, + encoding: Encoding::RLE_DICTIONARY, + is_sorted: false, + }, + Page::DataPage { + buf: ByteBufferPtr::new(vec![1, 2, 3, 4, 5, 6, 7, 8]), + num_values: 10, + encoding: Encoding::DELTA_BINARY_PACKED, + def_level_encoding: Encoding::RLE, + rep_level_encoding: Encoding::RLE, + statistics: Some(Statistics::int32(Some(1), Some(3), None, 7, true)), + }, + Page::DataPageV2 { + buf: ByteBufferPtr::new(vec![4; 128]), + num_values: 10, + encoding: Encoding::DELTA_BINARY_PACKED, + num_nulls: 2, + num_rows: 12, + def_levels_byte_len: 24, + rep_levels_byte_len: 32, + is_compressed: false, + statistics: None, + }, + ]; + + test_page_roundtrip(&pages[..], Compression::SNAPPY, Type::INT32); + test_page_roundtrip(&pages[..], Compression::UNCOMPRESSED, Type::INT32); + } + + /// Tests writing and reading pages. + /// Physical type is for statistics only, should match any defined statistics type in + /// pages. + fn test_page_roundtrip(pages: &[Page], codec: Compression, physical_type: Type) { + let mut compressed_pages = vec![]; + let mut total_num_values = 0i64; + let mut compressor = create_codec(codec).unwrap(); + + for page in pages { + let uncompressed_len = page.buffer().len(); + + let compressed_page = match page { + &Page::DataPage { + ref buf, + num_values, + encoding, + def_level_encoding, + rep_level_encoding, + ref statistics, + } => { + total_num_values += num_values as i64; + let output_buf = compress_helper(compressor.as_mut(), buf.data()); + + Page::DataPage { + buf: ByteBufferPtr::new(output_buf), + num_values, + encoding, + def_level_encoding, + rep_level_encoding, + statistics: from_thrift(physical_type, to_thrift(statistics.as_ref())), + } + } + &Page::DataPageV2 { + ref buf, + num_values, + encoding, + num_nulls, + num_rows, + def_levels_byte_len, + rep_levels_byte_len, + ref statistics, + .. 
+ } => { + total_num_values += num_values as i64; + let offset = (def_levels_byte_len + rep_levels_byte_len) as usize; + let cmp_buf = compress_helper(compressor.as_mut(), &buf.data()[offset..]); + let mut output_buf = Vec::from(&buf.data()[..offset]); + output_buf.extend_from_slice(&cmp_buf[..]); + + Page::DataPageV2 { + buf: ByteBufferPtr::new(output_buf), + num_values, + encoding, + num_nulls, + num_rows, + def_levels_byte_len, + rep_levels_byte_len, + is_compressed: compressor.is_some(), + statistics: from_thrift(physical_type, to_thrift(statistics.as_ref())), + } + } + &Page::DictionaryPage { + ref buf, + num_values, + encoding, + is_sorted, + } => { + let output_buf = compress_helper(compressor.as_mut(), buf.data()); + + Page::DictionaryPage { + buf: ByteBufferPtr::new(output_buf), + num_values, + encoding, + is_sorted, + } + } + }; + + let compressed_page = CompressedPage::new(compressed_page, uncompressed_len); + compressed_pages.push(compressed_page); + } + + let mut buffer: Vec = vec![]; + let mut result_pages: Vec = vec![]; + { + let cursor = Cursor::new(&mut buffer); + let mut page_writer = SerializedPageWriter::new(cursor); + + for page in compressed_pages { + page_writer.write_page(page).unwrap(); + } + page_writer.close().unwrap(); + } + { + let mut page_reader = SerializedPageReader::new( + Cursor::new(&buffer), + total_num_values, + codec, + physical_type, + ) + .unwrap(); + + while let Some(page) = page_reader.get_next_page().unwrap() { + result_pages.push(page); + } + } + + assert_eq!(result_pages.len(), pages.len()); + for i in 0..result_pages.len() { + assert_page(&result_pages[i], &pages[i]); + } + } + + /// Helper function to compress a slice + fn compress_helper(compressor: Option<&mut Box>, data: &[u8]) -> Vec { + let mut output_buf = vec![]; + if let Some(cmpr) = compressor { + cmpr.compress(data, &mut output_buf).unwrap(); + } else { + output_buf.extend_from_slice(data); + } + output_buf + } + + /// Check if pages match. + fn assert_page(left: &Page, right: &Page) { + assert_eq!(left.page_type(), right.page_type()); + assert_eq!(left.buffer().data(), right.buffer().data()); + assert_eq!(left.num_values(), right.num_values()); + assert_eq!(left.encoding(), right.encoding()); + assert_eq!(to_thrift(left.statistics()), to_thrift(right.statistics())); + } + + /// File write-read roundtrip. + /// `data` consists of arrays of values for each row group. 
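+    /// Each inner vector is written as a separate row group with a single required
+    /// INT32 column; the file is then read back and the recovered values are compared
+    /// against the input.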
+ fn test_file_roundtrip(file: File, data: Vec>) { + let schema = Rc::new( + types::Type::group_type_builder("schema") + .with_fields(&mut vec![Rc::new( + types::Type::primitive_type_builder("col1", Type::INT32) + .with_repetition(Repetition::REQUIRED) + .build() + .unwrap(), + )]) + .build() + .unwrap(), + ); + let props = Rc::new(WriterProperties::builder().build()); + let mut file_writer = + SerializedFileWriter::new(file.try_clone().unwrap(), schema, props).unwrap(); + + for subset in &data { + let mut row_group_writer = file_writer.next_row_group().unwrap(); + let col_writer = row_group_writer.next_column().unwrap(); + if let Some(mut writer) = col_writer { + match writer { + ColumnWriter::Int32ColumnWriter(ref mut typed) => { + typed.write_batch(&subset[..], None, None).unwrap(); + } + _ => { + unimplemented!(); + } + } + row_group_writer.close_column(writer).unwrap(); + } + file_writer.close_row_group(row_group_writer).unwrap(); + } + + file_writer.close().unwrap(); + + let reader = SerializedFileReader::new(file).unwrap(); + assert_eq!(reader.num_row_groups(), data.len()); + for i in 0..reader.num_row_groups() { + let row_group_reader = reader.get_row_group(i).unwrap(); + let iter = row_group_reader.get_row_iter(None).unwrap(); + let res = iter + .map(|elem| elem.get_int(0).unwrap()) + .collect::>(); + assert_eq!(res, data[i]); + } + } +} diff --git a/rust/src/parquet/mod.rs b/rust/src/parquet/mod.rs new file mode 100644 index 0000000000000..58cc7b13df6d6 --- /dev/null +++ b/rust/src/parquet/mod.rs @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#[macro_use] +pub mod errors; +pub mod basic; +pub mod data_type; + +// Exported for external use, such as benchmarks +pub use self::encodings::{decoding, encoding}; +pub use self::util::memory; + +#[macro_use] +mod util; +pub mod column; +pub mod compression; +mod encodings; +pub mod file; +pub mod record; +pub mod schema; diff --git a/rust/src/parquet/record/api.rs b/rust/src/parquet/record/api.rs new file mode 100644 index 0000000000000..d6e3ec19b76f6 --- /dev/null +++ b/rust/src/parquet/record/api.rs @@ -0,0 +1,1439 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains Row enum that is used to represent record in Rust. + +use std::fmt; + +use chrono::{Local, TimeZone}; +use num_bigint::{BigInt, Sign}; + +use crate::parquet::basic::{LogicalType, Type as PhysicalType}; +use crate::parquet::data_type::{ByteArray, Decimal, Int96}; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::schema::types::ColumnDescPtr; + +/// Macro as a shortcut to generate 'not yet implemented' panic error. +macro_rules! nyi { + ($column_descr:ident, $value:ident) => {{ + unimplemented!( + "Conversion for physical type {}, logical type {}, value {:?}", + $column_descr.physical_type(), + $column_descr.logical_type(), + $value + ); + }}; +} + +/// `Row` represents a nested Parquet record. +#[derive(Clone, Debug, PartialEq)] +pub struct Row { + fields: Vec<(String, Field)>, +} + +impl Row { + /// Get the number of fields in this row. + pub fn len(&self) -> usize { + self.fields.len() + } +} + +/// Trait for type-safe convenient access to fields within a Row. +pub trait RowAccessor { + fn get_bool(&self, i: usize) -> Result; + fn get_byte(&self, i: usize) -> Result; + fn get_short(&self, i: usize) -> Result; + fn get_int(&self, i: usize) -> Result; + fn get_long(&self, i: usize) -> Result; + fn get_ubyte(&self, i: usize) -> Result; + fn get_ushort(&self, i: usize) -> Result; + fn get_uint(&self, i: usize) -> Result; + fn get_ulong(&self, i: usize) -> Result; + fn get_float(&self, i: usize) -> Result; + fn get_double(&self, i: usize) -> Result; + fn get_timestamp(&self, i: usize) -> Result; + fn get_decimal(&self, i: usize) -> Result<&Decimal>; + fn get_string(&self, i: usize) -> Result<&String>; + fn get_bytes(&self, i: usize) -> Result<&ByteArray>; + fn get_group(&self, i: usize) -> Result<&Row>; + fn get_list(&self, i: usize) -> Result<&List>; + fn get_map(&self, i: usize) -> Result<&Map>; +} + +/// Macro to generate type-safe get_xxx methods for primitive types, +/// e.g. `get_bool`, `get_short`. +macro_rules! row_primitive_accessor { + ($METHOD:ident, $VARIANT:ident, $TY:ty) => { + fn $METHOD(&self, i: usize) -> Result<$TY> { + match self.fields[i].1 { + Field::$VARIANT(v) => Ok(v), + _ => Err(general_err!("Cannot access {} as {}", + self.fields[i].1.get_type_name(), stringify!($VARIANT))) + } + } + } +} + +/// Macro to generate type-safe get_xxx methods for reference types, +/// e.g. `get_list`, `get_map`. +macro_rules! 
row_complex_accessor { + ($METHOD:ident, $VARIANT:ident, $TY:ty) => { + fn $METHOD(&self, i: usize) -> Result<&$TY> { + match self.fields[i].1 { + Field::$VARIANT(ref v) => Ok(v), + _ => Err(general_err!("Cannot access {} as {}", + self.fields[i].1.get_type_name(), stringify!($VARIANT))) + } + } + } +} + +impl RowAccessor for Row { + row_primitive_accessor!(get_bool, Bool, bool); + + row_primitive_accessor!(get_byte, Byte, i8); + + row_primitive_accessor!(get_short, Short, i16); + + row_primitive_accessor!(get_int, Int, i32); + + row_primitive_accessor!(get_long, Long, i64); + + row_primitive_accessor!(get_ubyte, UByte, u8); + + row_primitive_accessor!(get_ushort, UShort, u16); + + row_primitive_accessor!(get_uint, UInt, u32); + + row_primitive_accessor!(get_ulong, ULong, u64); + + row_primitive_accessor!(get_float, Float, f32); + + row_primitive_accessor!(get_double, Double, f64); + + row_primitive_accessor!(get_timestamp, Timestamp, u64); + + row_complex_accessor!(get_decimal, Decimal, Decimal); + + row_complex_accessor!(get_string, Str, String); + + row_complex_accessor!(get_bytes, Bytes, ByteArray); + + row_complex_accessor!(get_group, Group, Row); + + row_complex_accessor!(get_list, ListInternal, List); + + row_complex_accessor!(get_map, MapInternal, Map); +} + +/// Constructs a `Row` from the list of `fields` and returns it. +#[inline] +pub fn make_row(fields: Vec<(String, Field)>) -> Row { + Row { fields } +} + +impl fmt::Display for Row { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{{")?; + for (i, &(ref key, ref value)) in self.fields.iter().enumerate() { + key.fmt(f)?; + write!(f, ": ")?; + value.fmt(f)?; + if i < self.fields.len() - 1 { + write!(f, ", ")?; + } + } + write!(f, "}}") + } +} + +/// `List` represents a list which contains an array of elements. +#[derive(Clone, Debug, PartialEq)] +pub struct List { + elements: Vec, +} + +impl List { + /// Get the number of fields in this row + pub fn len(&self) -> usize { + self.elements.len() + } +} + +/// Constructs a `List` from the list of `fields` and returns it. +#[inline] +pub fn make_list(elements: Vec) -> List { + List { elements } +} + +/// Trait for type-safe access of an index for a `List`. +/// Note that the get_XXX methods do not do bound checking. +pub trait ListAccessor { + fn get_bool(&self, i: usize) -> Result; + fn get_byte(&self, i: usize) -> Result; + fn get_short(&self, i: usize) -> Result; + fn get_int(&self, i: usize) -> Result; + fn get_long(&self, i: usize) -> Result; + fn get_ubyte(&self, i: usize) -> Result; + fn get_ushort(&self, i: usize) -> Result; + fn get_uint(&self, i: usize) -> Result; + fn get_ulong(&self, i: usize) -> Result; + fn get_float(&self, i: usize) -> Result; + fn get_double(&self, i: usize) -> Result; + fn get_timestamp(&self, i: usize) -> Result; + fn get_decimal(&self, i: usize) -> Result<&Decimal>; + fn get_string(&self, i: usize) -> Result<&String>; + fn get_bytes(&self, i: usize) -> Result<&ByteArray>; + fn get_group(&self, i: usize) -> Result<&Row>; + fn get_list(&self, i: usize) -> Result<&List>; + fn get_map(&self, i: usize) -> Result<&Map>; +} + +/// Macro to generate type-safe get_xxx methods for primitive types, +/// e.g. get_bool, get_short +macro_rules! 
list_primitive_accessor { + ($METHOD:ident, $VARIANT:ident, $TY:ty) => { + fn $METHOD(&self, i: usize) -> Result<$TY> { + match self.elements[i] { + Field::$VARIANT(v) => Ok(v), + _ => Err(general_err!( + "Cannot access {} as {}", + self.elements[i].get_type_name(), stringify!($VARIANT)) + ) + } + } + } +} + +/// Macro to generate type-safe get_xxx methods for reference types +/// e.g. get_list, get_map +macro_rules! list_complex_accessor { + ($METHOD:ident, $VARIANT:ident, $TY:ty) => { + fn $METHOD(&self, i: usize) -> Result<&$TY> { + match self.elements[i] { + Field::$VARIANT(ref v) => Ok(v), + _ => Err(general_err!( + "Cannot access {} as {}", + self.elements[i].get_type_name(), stringify!($VARIANT)) + ) + } + } + } +} + +impl ListAccessor for List { + list_primitive_accessor!(get_bool, Bool, bool); + + list_primitive_accessor!(get_byte, Byte, i8); + + list_primitive_accessor!(get_short, Short, i16); + + list_primitive_accessor!(get_int, Int, i32); + + list_primitive_accessor!(get_long, Long, i64); + + list_primitive_accessor!(get_ubyte, UByte, u8); + + list_primitive_accessor!(get_ushort, UShort, u16); + + list_primitive_accessor!(get_uint, UInt, u32); + + list_primitive_accessor!(get_ulong, ULong, u64); + + list_primitive_accessor!(get_float, Float, f32); + + list_primitive_accessor!(get_double, Double, f64); + + list_primitive_accessor!(get_timestamp, Timestamp, u64); + + list_complex_accessor!(get_decimal, Decimal, Decimal); + + list_complex_accessor!(get_string, Str, String); + + list_complex_accessor!(get_bytes, Bytes, ByteArray); + + list_complex_accessor!(get_group, Group, Row); + + list_complex_accessor!(get_list, ListInternal, List); + + list_complex_accessor!(get_map, MapInternal, Map); +} + +/// `Map` represents a map which contains an list of key->value pairs. +#[derive(Clone, Debug, PartialEq)] +pub struct Map { + entries: Vec<(Field, Field)>, +} + +impl Map { + /// Get the number of fields in this row + pub fn len(&self) -> usize { + self.entries.len() + } +} + +/// Constructs a `Map` from the list of `entries` and returns it. +#[inline] +pub fn make_map(entries: Vec<(Field, Field)>) -> Map { + Map { entries } +} + +/// Trait for type-safe access of an index for a `Map` +pub trait MapAccessor { + fn get_keys<'a>(&'a self) -> Box; + fn get_values<'a>(&'a self) -> Box; +} + +struct MapList<'a> { + elements: Vec<&'a Field>, +} + +/// Macro to generate type-safe get_xxx methods for primitive types, +/// e.g. get_bool, get_short +macro_rules! 
map_list_primitive_accessor { + ($METHOD:ident, $VARIANT:ident, $TY:ty) => { + fn $METHOD(&self, i: usize) -> Result<$TY> { + match self.elements[i] { + Field::$VARIANT(v) => Ok(*v), + _ => Err(general_err!( + "Cannot access {} as {}", + self.elements[i].get_type_name(), stringify!($VARIANT)) + ) + } + } + } +} + +impl<'a> ListAccessor for MapList<'a> { + map_list_primitive_accessor!(get_bool, Bool, bool); + + map_list_primitive_accessor!(get_byte, Byte, i8); + + map_list_primitive_accessor!(get_short, Short, i16); + + map_list_primitive_accessor!(get_int, Int, i32); + + map_list_primitive_accessor!(get_long, Long, i64); + + map_list_primitive_accessor!(get_ubyte, UByte, u8); + + map_list_primitive_accessor!(get_ushort, UShort, u16); + + map_list_primitive_accessor!(get_uint, UInt, u32); + + map_list_primitive_accessor!(get_ulong, ULong, u64); + + map_list_primitive_accessor!(get_float, Float, f32); + + map_list_primitive_accessor!(get_double, Double, f64); + + map_list_primitive_accessor!(get_timestamp, Timestamp, u64); + + list_complex_accessor!(get_decimal, Decimal, Decimal); + + list_complex_accessor!(get_string, Str, String); + + list_complex_accessor!(get_bytes, Bytes, ByteArray); + + list_complex_accessor!(get_group, Group, Row); + + list_complex_accessor!(get_list, ListInternal, List); + + list_complex_accessor!(get_map, MapInternal, Map); +} + +impl MapAccessor for Map { + fn get_keys<'a>(&'a self) -> Box { + let map_list = MapList { + elements: self.entries.iter().map(|v| &v.0).collect(), + }; + Box::new(map_list) + } + + fn get_values<'a>(&'a self) -> Box { + let map_list = MapList { + elements: self.entries.iter().map(|v| &v.1).collect(), + }; + Box::new(map_list) + } +} + +/// API to represent a single field in a `Row`. +#[derive(Clone, Debug, PartialEq)] +pub enum Field { + // Primitive types + /// Null value. + Null, + /// Boolean value (`true`, `false`). + Bool(bool), + /// Signed integer INT_8. + Byte(i8), + /// Signed integer INT_16. + Short(i16), + /// Signed integer INT_32. + Int(i32), + /// Signed integer INT_64. + Long(i64), + // Unsigned integer UINT_8. + UByte(u8), + // Unsigned integer UINT_16. + UShort(u16), + // Unsigned integer UINT_32. + UInt(u32), + // Unsigned integer UINT_64. + ULong(u64), + /// IEEE 32-bit floating point value. + Float(f32), + /// IEEE 64-bit floating point value. + Double(f64), + /// Decimal value. + Decimal(Decimal), + /// UTF-8 encoded character string. + Str(String), + /// General binary value. + Bytes(ByteArray), + /// Date without a time of day, stores the number of days from the + /// Unix epoch, 1 January 1970. + Date(u32), + /// Milliseconds from the Unix epoch, 1 January 1970. + Timestamp(u64), + + // ---------------------------------------------------------------------- + // Complex types + /// Struct, child elements are tuples of field-value pairs. + Group(Row), + /// List of elements. + ListInternal(List), + /// List of key-value pairs. + MapInternal(Map), +} + +impl Field { + /// Get the type name. 
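+    /// Used by the accessor macros when building their `Cannot access ... as ...`
+    /// error messages.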
+ fn get_type_name(&self) -> &'static str { + match *self { + Field::Null => "Null", + Field::Bool(_) => "Bool", + Field::Byte(_) => "Byte", + Field::Short(_) => "Short", + Field::Int(_) => "Int", + Field::Long(_) => "Long", + Field::UByte(_) => "UByte", + Field::UShort(_) => "UShort", + Field::UInt(_) => "UInt", + Field::ULong(_) => "ULong", + Field::Float(_) => "Float", + Field::Double(_) => "Double", + Field::Decimal(_) => "Decimal", + Field::Date(_) => "Date", + Field::Str(_) => "Str", + Field::Bytes(_) => "Bytes", + Field::Timestamp(_) => "Timestamp", + Field::Group(_) => "Group", + Field::ListInternal(_) => "ListInternal", + Field::MapInternal(_) => "MapInternal", + } + } + + /// Determines if this Row represents a primitive value. + pub fn is_primitive(&self) -> bool { + match *self { + Field::Group(_) => false, + Field::ListInternal(_) => false, + Field::MapInternal(_) => false, + _ => true, + } + } + + /// Converts Parquet BOOLEAN type with logical type into `bool` value. + #[inline] + pub fn convert_bool(_descr: &ColumnDescPtr, value: bool) -> Self { + Field::Bool(value) + } + + /// Converts Parquet INT32 type with logical type into `i32` value. + #[inline] + pub fn convert_int32(descr: &ColumnDescPtr, value: i32) -> Self { + match descr.logical_type() { + LogicalType::INT_8 => Field::Byte(value as i8), + LogicalType::INT_16 => Field::Short(value as i16), + LogicalType::INT_32 | LogicalType::NONE => Field::Int(value), + LogicalType::UINT_8 => Field::UByte(value as u8), + LogicalType::UINT_16 => Field::UShort(value as u16), + LogicalType::UINT_32 => Field::UInt(value as u32), + LogicalType::DATE => Field::Date(value as u32), + LogicalType::DECIMAL => Field::Decimal(Decimal::from_i32( + value, + descr.type_precision(), + descr.type_scale(), + )), + _ => nyi!(descr, value), + } + } + + /// Converts Parquet INT64 type with logical type into `i64` value. + #[inline] + pub fn convert_int64(descr: &ColumnDescPtr, value: i64) -> Self { + match descr.logical_type() { + LogicalType::INT_64 | LogicalType::NONE => Field::Long(value), + LogicalType::UINT_64 => Field::ULong(value as u64), + LogicalType::TIMESTAMP_MILLIS => Field::Timestamp(value as u64), + LogicalType::DECIMAL => Field::Decimal(Decimal::from_i64( + value, + descr.type_precision(), + descr.type_scale(), + )), + _ => nyi!(descr, value), + } + } + + /// Converts Parquet INT96 (nanosecond timestamps) type and logical type into + /// `Timestamp` value. + #[inline] + pub fn convert_int96(_descr: &ColumnDescPtr, value: Int96) -> Self { + const JULIAN_DAY_OF_EPOCH: i64 = 2_440_588; + const SECONDS_PER_DAY: i64 = 86_400; + const MILLIS_PER_SECOND: i64 = 1_000; + + let day = value.data()[2] as i64; + let nanoseconds = ((value.data()[1] as i64) << 32) + value.data()[0] as i64; + let seconds = (day - JULIAN_DAY_OF_EPOCH) * SECONDS_PER_DAY; + let millis = seconds * MILLIS_PER_SECOND + nanoseconds / 1_000_000; + + // TODO: Add support for negative milliseconds. + // Chrono library does not handle negative timestamps, but we could probably write + // something similar to java.util.Date and java.util.Calendar. + if millis < 0 { + panic!( + "Expected non-negative milliseconds when converting Int96, found {}", + millis + ); + } + + Field::Timestamp(millis as u64) + } + + /// Converts Parquet FLOAT type with logical type into `f32` value. + #[inline] + pub fn convert_float(_descr: &ColumnDescPtr, value: f32) -> Self { + Field::Float(value) + } + + /// Converts Parquet DOUBLE type with logical type into `f64` value. 
+ #[inline] + pub fn convert_double(_descr: &ColumnDescPtr, value: f64) -> Self { + Field::Double(value) + } + + /// Converts Parquet BYTE_ARRAY type with logical type into either UTF8 string or + /// array of bytes. + #[inline] + pub fn convert_byte_array(descr: &ColumnDescPtr, value: ByteArray) -> Self { + match descr.physical_type() { + PhysicalType::BYTE_ARRAY => match descr.logical_type() { + LogicalType::UTF8 | LogicalType::ENUM | LogicalType::JSON => { + let value = unsafe { String::from_utf8_unchecked(value.data().to_vec()) }; + Field::Str(value) + } + LogicalType::BSON | LogicalType::NONE => Field::Bytes(value), + LogicalType::DECIMAL => Field::Decimal(Decimal::from_bytes( + value, + descr.type_precision(), + descr.type_scale(), + )), + _ => nyi!(descr, value), + }, + PhysicalType::FIXED_LEN_BYTE_ARRAY => match descr.logical_type() { + LogicalType::DECIMAL => Field::Decimal(Decimal::from_bytes( + value, + descr.type_precision(), + descr.type_scale(), + )), + LogicalType::NONE => Field::Bytes(value), + _ => nyi!(descr, value), + }, + _ => nyi!(descr, value), + } + } +} + +impl fmt::Display for Field { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + Field::Null => write!(f, "null"), + Field::Bool(value) => write!(f, "{}", value), + Field::Byte(value) => write!(f, "{}", value), + Field::Short(value) => write!(f, "{}", value), + Field::Int(value) => write!(f, "{}", value), + Field::Long(value) => write!(f, "{}", value), + Field::UByte(value) => write!(f, "{}", value), + Field::UShort(value) => write!(f, "{}", value), + Field::UInt(value) => write!(f, "{}", value), + Field::ULong(value) => write!(f, "{}", value), + Field::Float(value) => { + if value > 1e19 || value < 1e-15 { + write!(f, "{:E}", value) + } else { + write!(f, "{:?}", value) + } + } + Field::Double(value) => { + if value > 1e19 || value < 1e-15 { + write!(f, "{:E}", value) + } else { + write!(f, "{:?}", value) + } + } + Field::Decimal(ref value) => write!(f, "{}", convert_decimal_to_string(value)), + Field::Str(ref value) => write!(f, "\"{}\"", value), + Field::Bytes(ref value) => write!(f, "{:?}", value.data()), + Field::Date(value) => write!(f, "{}", convert_date_to_string(value)), + Field::Timestamp(value) => write!(f, "{}", convert_timestamp_to_string(value)), + Field::Group(ref fields) => write!(f, "{}", fields), + Field::ListInternal(ref list) => { + let elems = &list.elements; + write!(f, "[")?; + for (i, field) in elems.iter().enumerate() { + field.fmt(f)?; + if i < elems.len() - 1 { + write!(f, ", ")?; + } + } + write!(f, "]") + } + Field::MapInternal(ref map) => { + let entries = &map.entries; + write!(f, "{{")?; + for (i, &(ref key, ref value)) in entries.iter().enumerate() { + key.fmt(f)?; + write!(f, " -> ")?; + value.fmt(f)?; + if i < entries.len() - 1 { + write!(f, ", ")?; + } + } + write!(f, "}}") + } + } + } +} + +/// Helper method to convert Parquet date into a string. +/// Input `value` is a number of days since the epoch in UTC. +/// Date is displayed in local timezone. +#[inline] +fn convert_date_to_string(value: u32) -> String { + static NUM_SECONDS_IN_DAY: i64 = 60 * 60 * 24; + let dt = Local.timestamp(value as i64 * NUM_SECONDS_IN_DAY, 0).date(); + format!("{}", dt.format("%Y-%m-%d %:z")) +} + +/// Helper method to convert Parquet timestamp into a string. +/// Input `value` is a number of milliseconds since the epoch in UTC. +/// Datetime is displayed in local timezone. 
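+/// The value is truncated to whole seconds before formatting.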
+#[inline] +fn convert_timestamp_to_string(value: u64) -> String { + let dt = Local.timestamp((value / 1000) as i64, 0); + format!("{}", dt.format("%Y-%m-%d %H:%M:%S %:z")) +} + +/// Helper method to convert Parquet decimal into a string. +/// We assert that `scale >= 0` and `precision > scale`, but this will be enforced +/// when constructing Parquet schema. +#[inline] +fn convert_decimal_to_string(decimal: &Decimal) -> String { + assert!(decimal.scale() >= 0 && decimal.precision() > decimal.scale()); + + // Specify as signed bytes to resolve sign as part of conversion. + let num = BigInt::from_signed_bytes_be(decimal.data()); + + // Offset of the first digit in a string. + let negative = if num.sign() == Sign::Minus { 1 } else { 0 }; + let mut num_str = num.to_string(); + let mut point = num_str.len() as i32 - decimal.scale() - negative; + + // Convert to string form without scientific notation. + if point <= 0 { + // Zeros need to be prepended to the unscaled value. + while point < 0 { + num_str.insert(negative as usize, '0'); + point += 1; + } + num_str.insert_str(negative as usize, "0."); + } else { + // No zeroes need to be prepended to the unscaled value, simply insert decimal point. + num_str.insert((point + negative) as usize, '.'); + } + + num_str +} + +#[cfg(test)] +mod tests { + use super::*; + + use chrono; + use std::rc::Rc; + + use crate::parquet::schema::types::{ColumnDescriptor, ColumnPath, PrimitiveTypeBuilder}; + + /// Creates test column descriptor based on provided type parameters. + macro_rules! make_column_descr { + ($physical_type:expr, $logical_type:expr) => {{ + let tpe = PrimitiveTypeBuilder::new("col", $physical_type) + .with_logical_type($logical_type) + .build() + .unwrap(); + Rc::new(ColumnDescriptor::new( + Rc::new(tpe), + None, + 0, + 0, + ColumnPath::from("col"), + )) + }}; + ($physical_type:expr, $logical_type:expr, $len:expr, $prec:expr, $scale:expr) => {{ + let tpe = PrimitiveTypeBuilder::new("col", $physical_type) + .with_logical_type($logical_type) + .with_length($len) + .with_precision($prec) + .with_scale($scale) + .build() + .unwrap(); + Rc::new(ColumnDescriptor::new( + Rc::new(tpe), + None, + 0, + 0, + ColumnPath::from("col"), + )) + }}; + } + + #[test] + fn test_row_convert_bool() { + // BOOLEAN value does not depend on logical type + let descr = make_column_descr![PhysicalType::BOOLEAN, LogicalType::NONE]; + + let row = Field::convert_bool(&descr, true); + assert_eq!(row, Field::Bool(true)); + + let row = Field::convert_bool(&descr, false); + assert_eq!(row, Field::Bool(false)); + } + + #[test] + fn test_row_convert_int32() { + let descr = make_column_descr![PhysicalType::INT32, LogicalType::INT_8]; + let row = Field::convert_int32(&descr, 111); + assert_eq!(row, Field::Byte(111)); + + let descr = make_column_descr![PhysicalType::INT32, LogicalType::INT_16]; + let row = Field::convert_int32(&descr, 222); + assert_eq!(row, Field::Short(222)); + + let descr = make_column_descr![PhysicalType::INT32, LogicalType::INT_32]; + let row = Field::convert_int32(&descr, 333); + assert_eq!(row, Field::Int(333)); + + let descr = make_column_descr![PhysicalType::INT32, LogicalType::UINT_8]; + let row = Field::convert_int32(&descr, -1); + assert_eq!(row, Field::UByte(255)); + + let descr = make_column_descr![PhysicalType::INT32, LogicalType::UINT_16]; + let row = Field::convert_int32(&descr, 256); + assert_eq!(row, Field::UShort(256)); + + let descr = make_column_descr![PhysicalType::INT32, LogicalType::UINT_32]; + let row = Field::convert_int32(&descr, 1234); 
+ assert_eq!(row, Field::UInt(1234)); + + let descr = make_column_descr![PhysicalType::INT32, LogicalType::NONE]; + let row = Field::convert_int32(&descr, 444); + assert_eq!(row, Field::Int(444)); + + let descr = make_column_descr![PhysicalType::INT32, LogicalType::DATE]; + let row = Field::convert_int32(&descr, 14611); + assert_eq!(row, Field::Date(14611)); + + let descr = make_column_descr![PhysicalType::INT32, LogicalType::DECIMAL, 0, 8, 2]; + let row = Field::convert_int32(&descr, 444); + assert_eq!(row, Field::Decimal(Decimal::from_i32(444, 8, 2))); + } + + #[test] + fn test_row_convert_int64() { + let descr = make_column_descr![PhysicalType::INT64, LogicalType::INT_64]; + let row = Field::convert_int64(&descr, 1111); + assert_eq!(row, Field::Long(1111)); + + let descr = make_column_descr![PhysicalType::INT64, LogicalType::UINT_64]; + let row = Field::convert_int64(&descr, 78239823); + assert_eq!(row, Field::ULong(78239823)); + + let descr = make_column_descr![PhysicalType::INT64, LogicalType::TIMESTAMP_MILLIS]; + let row = Field::convert_int64(&descr, 1541186529153); + assert_eq!(row, Field::Timestamp(1541186529153)); + + let descr = make_column_descr![PhysicalType::INT64, LogicalType::NONE]; + let row = Field::convert_int64(&descr, 2222); + assert_eq!(row, Field::Long(2222)); + + let descr = make_column_descr![PhysicalType::INT64, LogicalType::DECIMAL, 0, 8, 2]; + let row = Field::convert_int64(&descr, 3333); + assert_eq!(row, Field::Decimal(Decimal::from_i64(3333, 8, 2))); + } + + #[test] + fn test_row_convert_int96() { + // INT96 value does not depend on logical type + let descr = make_column_descr![PhysicalType::INT96, LogicalType::NONE]; + + let value = Int96::from(vec![0, 0, 2454923]); + let row = Field::convert_int96(&descr, value); + assert_eq!(row, Field::Timestamp(1238544000000)); + + let value = Int96::from(vec![4165425152, 13, 2454923]); + let row = Field::convert_int96(&descr, value); + assert_eq!(row, Field::Timestamp(1238544060000)); + } + + #[test] + #[should_panic(expected = "Expected non-negative milliseconds when converting Int96")] + fn test_row_convert_int96_invalid() { + // INT96 value does not depend on logical type + let descr = make_column_descr![PhysicalType::INT96, LogicalType::NONE]; + + let value = Int96::from(vec![0, 0, 0]); + Field::convert_int96(&descr, value); + } + + #[test] + fn test_row_convert_float() { + // FLOAT value does not depend on logical type + let descr = make_column_descr![PhysicalType::FLOAT, LogicalType::NONE]; + let row = Field::convert_float(&descr, 2.31); + assert_eq!(row, Field::Float(2.31)); + } + + #[test] + fn test_row_convert_double() { + // DOUBLE value does not depend on logical type + let descr = make_column_descr![PhysicalType::DOUBLE, LogicalType::NONE]; + let row = Field::convert_double(&descr, 1.56); + assert_eq!(row, Field::Double(1.56)); + } + + #[test] + fn test_row_convert_byte_array() { + // UTF8 + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::UTF8]; + let value = ByteArray::from(vec![b'A', b'B', b'C', b'D']); + let row = Field::convert_byte_array(&descr, value); + assert_eq!(row, Field::Str("ABCD".to_string())); + + // ENUM + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::ENUM]; + let value = ByteArray::from(vec![b'1', b'2', b'3']); + let row = Field::convert_byte_array(&descr, value); + assert_eq!(row, Field::Str("123".to_string())); + + // JSON + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::JSON]; + let value = ByteArray::from(vec![b'{', 
b'"', b'a', b'"', b':', b'1', b'}']); + let row = Field::convert_byte_array(&descr, value); + assert_eq!(row, Field::Str("{\"a\":1}".to_string())); + + // NONE + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::NONE]; + let value = ByteArray::from(vec![1, 2, 3, 4, 5]); + let row = Field::convert_byte_array(&descr, value.clone()); + assert_eq!(row, Field::Bytes(value)); + + // BSON + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::BSON]; + let value = ByteArray::from(vec![1, 2, 3, 4, 5]); + let row = Field::convert_byte_array(&descr, value.clone()); + assert_eq!(row, Field::Bytes(value)); + + // DECIMAL + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::DECIMAL, 0, 8, 2]; + let value = ByteArray::from(vec![207, 200]); + let row = Field::convert_byte_array(&descr, value.clone()); + assert_eq!(row, Field::Decimal(Decimal::from_bytes(value, 8, 2))); + + // DECIMAL (FIXED_LEN_BYTE_ARRAY) + let descr = make_column_descr![ + PhysicalType::FIXED_LEN_BYTE_ARRAY, + LogicalType::DECIMAL, + 8, + 17, + 5 + ]; + let value = ByteArray::from(vec![0, 0, 0, 0, 0, 4, 147, 224]); + let row = Field::convert_byte_array(&descr, value.clone()); + assert_eq!(row, Field::Decimal(Decimal::from_bytes(value, 17, 5))); + + // NONE (FIXED_LEN_BYTE_ARRAY) + let descr = make_column_descr![ + PhysicalType::FIXED_LEN_BYTE_ARRAY, + LogicalType::NONE, + 6, + 0, + 0 + ]; + let value = ByteArray::from(vec![1, 2, 3, 4, 5, 6]); + let row = Field::convert_byte_array(&descr, value.clone()); + assert_eq!(row, Field::Bytes(value)); + } + + #[test] + fn test_convert_date_to_string() { + fn check_date_conversion(y: u32, m: u32, d: u32) { + let datetime = chrono::NaiveDate::from_ymd(y as i32, m, d).and_hms(0, 0, 0); + let dt = Local.from_utc_datetime(&datetime); + let res = convert_date_to_string((dt.timestamp() / 60 / 60 / 24) as u32); + let exp = format!("{}", dt.format("%Y-%m-%d %:z")); + assert_eq!(res, exp); + } + + check_date_conversion(2010, 01, 02); + check_date_conversion(2014, 05, 01); + check_date_conversion(2016, 02, 29); + check_date_conversion(2017, 09, 12); + check_date_conversion(2018, 03, 31); + } + + #[test] + fn test_convert_timestamp_to_string() { + fn check_datetime_conversion(y: u32, m: u32, d: u32, h: u32, mi: u32, s: u32) { + let datetime = chrono::NaiveDate::from_ymd(y as i32, m, d).and_hms(h, mi, s); + let dt = Local.from_utc_datetime(&datetime); + let res = convert_timestamp_to_string(dt.timestamp_millis() as u64); + let exp = format!("{}", dt.format("%Y-%m-%d %H:%M:%S %:z")); + assert_eq!(res, exp); + } + + check_datetime_conversion(2010, 01, 02, 13, 12, 54); + check_datetime_conversion(2011, 01, 03, 08, 23, 01); + check_datetime_conversion(2012, 04, 05, 11, 06, 32); + check_datetime_conversion(2013, 05, 12, 16, 38, 00); + check_datetime_conversion(2014, 11, 28, 21, 15, 12); + } + + #[test] + fn test_convert_float_to_string() { + assert_eq!(format!("{}", Field::Float(1.0)), "1.0"); + assert_eq!(format!("{}", Field::Float(9.63)), "9.63"); + assert_eq!(format!("{}", Field::Float(1e-15)), "0.000000000000001"); + assert_eq!(format!("{}", Field::Float(1e-16)), "1E-16"); + assert_eq!(format!("{}", Field::Float(1e19)), "10000000000000000000.0"); + assert_eq!(format!("{}", Field::Float(1e20)), "1E20"); + assert_eq!(format!("{}", Field::Float(1.7976931E30)), "1.7976931E30"); + assert_eq!(format!("{}", Field::Float(-1.7976931E30)), "-1.7976931E30"); + } + + #[test] + fn test_convert_double_to_string() { + assert_eq!(format!("{}", Field::Double(1.0)), 
"1.0"); + assert_eq!(format!("{}", Field::Double(9.63)), "9.63"); + assert_eq!(format!("{}", Field::Double(1e-15)), "0.000000000000001"); + assert_eq!(format!("{}", Field::Double(1e-16)), "1E-16"); + assert_eq!(format!("{}", Field::Double(1e19)), "10000000000000000000.0"); + assert_eq!(format!("{}", Field::Double(1e20)), "1E20"); + assert_eq!( + format!("{}", Field::Double(1.79769313486E308)), + "1.79769313486E308" + ); + assert_eq!( + format!("{}", Field::Double(-1.79769313486E308)), + "-1.79769313486E308" + ); + } + + #[test] + fn test_convert_decimal_to_string() { + // Helper method to compare decimal + fn check_decimal(bytes: Vec, precision: i32, scale: i32, res: &str) { + let decimal = Decimal::from_bytes(ByteArray::from(bytes), precision, scale); + assert_eq!(convert_decimal_to_string(&decimal), res); + } + + // This example previously used to fail in some engines + check_decimal( + vec![0, 0, 0, 0, 0, 0, 0, 0, 13, 224, 182, 179, 167, 100, 0, 0], + 38, + 18, + "1.000000000000000000", + ); + check_decimal( + vec![ + 249, 233, 247, 16, 185, 192, 202, 223, 215, 165, 192, 166, 67, 72, + ], + 36, + 28, + "-12344.0242342304923409234234293432", + ); + check_decimal(vec![0, 0, 0, 0, 0, 4, 147, 224], 17, 5, "3.00000"); + check_decimal(vec![0, 0, 0, 0, 1, 201, 195, 140], 18, 2, "300000.12"); + check_decimal(vec![207, 200], 10, 2, "-123.44"); + check_decimal(vec![207, 200], 10, 8, "-0.00012344"); + } + + #[test] + fn test_row_display() { + // Primitive types + assert_eq!(format!("{}", Field::Null), "null"); + assert_eq!(format!("{}", Field::Bool(true)), "true"); + assert_eq!(format!("{}", Field::Bool(false)), "false"); + assert_eq!(format!("{}", Field::Byte(1)), "1"); + assert_eq!(format!("{}", Field::Short(2)), "2"); + assert_eq!(format!("{}", Field::Int(3)), "3"); + assert_eq!(format!("{}", Field::Long(4)), "4"); + assert_eq!(format!("{}", Field::UByte(1)), "1"); + assert_eq!(format!("{}", Field::UShort(2)), "2"); + assert_eq!(format!("{}", Field::UInt(3)), "3"); + assert_eq!(format!("{}", Field::ULong(4)), "4"); + assert_eq!(format!("{}", Field::Float(5.0)), "5.0"); + assert_eq!(format!("{}", Field::Float(5.1234)), "5.1234"); + assert_eq!(format!("{}", Field::Double(6.0)), "6.0"); + assert_eq!(format!("{}", Field::Double(6.1234)), "6.1234"); + assert_eq!(format!("{}", Field::Str("abc".to_string())), "\"abc\""); + assert_eq!( + format!("{}", Field::Bytes(ByteArray::from(vec![1, 2, 3]))), + "[1, 2, 3]" + ); + assert_eq!( + format!("{}", Field::Date(14611)), + convert_date_to_string(14611) + ); + assert_eq!( + format!("{}", Field::Timestamp(1262391174000)), + convert_timestamp_to_string(1262391174000) + ); + assert_eq!( + format!("{}", Field::Decimal(Decimal::from_i32(4, 8, 2))), + convert_decimal_to_string(&Decimal::from_i32(4, 8, 2)) + ); + + // Complex types + let fields = vec![ + ("x".to_string(), Field::Null), + ("Y".to_string(), Field::Int(2)), + ("z".to_string(), Field::Float(3.1)), + ("a".to_string(), Field::Str("abc".to_string())), + ]; + let row = Field::Group(make_row(fields)); + assert_eq!(format!("{}", row), "{x: null, Y: 2, z: 3.1, a: \"abc\"}"); + + let row = Field::ListInternal(make_list(vec![ + Field::Int(2), + Field::Int(1), + Field::Null, + Field::Int(12), + ])); + assert_eq!(format!("{}", row), "[2, 1, null, 12]"); + + let row = Field::MapInternal(make_map(vec![ + (Field::Int(1), Field::Float(1.2)), + (Field::Int(2), Field::Float(4.5)), + (Field::Int(3), Field::Float(2.3)), + ])); + assert_eq!(format!("{}", row), "{1 -> 1.2, 2 -> 4.5, 3 -> 2.3}"); + } + + #[test] + fn 
test_is_primitive() { + // primitives + assert!(Field::Null.is_primitive()); + assert!(Field::Bool(true).is_primitive()); + assert!(Field::Bool(false).is_primitive()); + assert!(Field::Byte(1).is_primitive()); + assert!(Field::Short(2).is_primitive()); + assert!(Field::Int(3).is_primitive()); + assert!(Field::Long(4).is_primitive()); + assert!(Field::UByte(1).is_primitive()); + assert!(Field::UShort(2).is_primitive()); + assert!(Field::UInt(3).is_primitive()); + assert!(Field::ULong(4).is_primitive()); + assert!(Field::Float(5.0).is_primitive()); + assert!(Field::Float(5.1234).is_primitive()); + assert!(Field::Double(6.0).is_primitive()); + assert!(Field::Double(6.1234).is_primitive()); + assert!(Field::Str("abc".to_string()).is_primitive()); + assert!(Field::Bytes(ByteArray::from(vec![1, 2, 3])).is_primitive()); + assert!(Field::Timestamp(12345678).is_primitive()); + assert!(Field::Decimal(Decimal::from_i32(4, 8, 2)).is_primitive()); + + // complex types + assert_eq!( + false, + Field::Group(make_row(vec![ + ("x".to_string(), Field::Null), + ("Y".to_string(), Field::Int(2)), + ("z".to_string(), Field::Float(3.1)), + ("a".to_string(), Field::Str("abc".to_string())) + ])) + .is_primitive() + ); + + assert_eq!( + false, + Field::ListInternal(make_list(vec![ + Field::Int(2), + Field::Int(1), + Field::Null, + Field::Int(12) + ])) + .is_primitive() + ); + + assert_eq!( + false, + Field::MapInternal(make_map(vec![ + (Field::Int(1), Field::Float(1.2)), + (Field::Int(2), Field::Float(4.5)), + (Field::Int(3), Field::Float(2.3)) + ])) + .is_primitive() + ); + } + + #[test] + fn test_row_primitive_accessors() { + // primitives + let row = make_row(vec![ + ("a".to_string(), Field::Null), + ("b".to_string(), Field::Bool(false)), + ("c".to_string(), Field::Byte(3)), + ("d".to_string(), Field::Short(4)), + ("e".to_string(), Field::Int(5)), + ("f".to_string(), Field::Long(6)), + ("g".to_string(), Field::UByte(3)), + ("h".to_string(), Field::UShort(4)), + ("i".to_string(), Field::UInt(5)), + ("j".to_string(), Field::ULong(6)), + ("k".to_string(), Field::Float(7.1)), + ("l".to_string(), Field::Double(8.1)), + ("m".to_string(), Field::Str("abc".to_string())), + ( + "n".to_string(), + Field::Bytes(ByteArray::from(vec![1, 2, 3, 4, 5])), + ), + ("o".to_string(), Field::Decimal(Decimal::from_i32(4, 7, 2))), + ]); + + assert_eq!(false, row.get_bool(1).unwrap()); + assert_eq!(3, row.get_byte(2).unwrap()); + assert_eq!(4, row.get_short(3).unwrap()); + assert_eq!(5, row.get_int(4).unwrap()); + assert_eq!(6, row.get_long(5).unwrap()); + assert_eq!(3, row.get_ubyte(6).unwrap()); + assert_eq!(4, row.get_ushort(7).unwrap()); + assert_eq!(5, row.get_uint(8).unwrap()); + assert_eq!(6, row.get_ulong(9).unwrap()); + assert_eq!(7.1, row.get_float(10).unwrap()); + assert_eq!(8.1, row.get_double(11).unwrap()); + assert_eq!("abc", row.get_string(12).unwrap()); + assert_eq!(5, row.get_bytes(13).unwrap().len()); + assert_eq!(7, row.get_decimal(14).unwrap().precision()); + } + + #[test] + fn test_row_primitive_invalid_accessors() { + // primitives + let row = make_row(vec![ + ("a".to_string(), Field::Null), + ("b".to_string(), Field::Bool(false)), + ("c".to_string(), Field::Byte(3)), + ("d".to_string(), Field::Short(4)), + ("e".to_string(), Field::Int(5)), + ("f".to_string(), Field::Long(6)), + ("g".to_string(), Field::UByte(3)), + ("h".to_string(), Field::UShort(4)), + ("i".to_string(), Field::UInt(5)), + ("j".to_string(), Field::ULong(6)), + ("k".to_string(), Field::Float(7.1)), + ("l".to_string(), Field::Double(8.1)), + 
("m".to_string(), Field::Str("abc".to_string())), + ( + "n".to_string(), + Field::Bytes(ByteArray::from(vec![1, 2, 3, 4, 5])), + ), + ("o".to_string(), Field::Decimal(Decimal::from_i32(4, 7, 2))), + ]); + + for i in 0..row.len() { + assert!(row.get_group(i).is_err()); + } + } + + #[test] + fn test_row_complex_accessors() { + let row = make_row(vec![ + ( + "a".to_string(), + Field::Group(make_row(vec![ + ("x".to_string(), Field::Null), + ("Y".to_string(), Field::Int(2)), + ])), + ), + ( + "b".to_string(), + Field::ListInternal(make_list(vec![ + Field::Int(2), + Field::Int(1), + Field::Null, + Field::Int(12), + ])), + ), + ( + "c".to_string(), + Field::MapInternal(make_map(vec![ + (Field::Int(1), Field::Float(1.2)), + (Field::Int(2), Field::Float(4.5)), + (Field::Int(3), Field::Float(2.3)), + ])), + ), + ]); + + assert_eq!(2, row.get_group(0).unwrap().len()); + assert_eq!(4, row.get_list(1).unwrap().len()); + assert_eq!(3, row.get_map(2).unwrap().len()); + } + + #[test] + fn test_row_complex_invalid_accessors() { + let row = make_row(vec![ + ( + "a".to_string(), + Field::Group(make_row(vec![ + ("x".to_string(), Field::Null), + ("Y".to_string(), Field::Int(2)), + ])), + ), + ( + "b".to_string(), + Field::ListInternal(make_list(vec![ + Field::Int(2), + Field::Int(1), + Field::Null, + Field::Int(12), + ])), + ), + ( + "c".to_string(), + Field::MapInternal(make_map(vec![ + (Field::Int(1), Field::Float(1.2)), + (Field::Int(2), Field::Float(4.5)), + (Field::Int(3), Field::Float(2.3)), + ])), + ), + ]); + + assert_eq!( + ParquetError::General("Cannot access Group as Float".to_string()), + row.get_float(0).unwrap_err() + ); + assert_eq!( + ParquetError::General("Cannot access ListInternal as Float".to_string()), + row.get_float(1).unwrap_err() + ); + assert_eq!( + ParquetError::General("Cannot access MapInternal as Float".to_string()), + row.get_float(2).unwrap_err() + ); + } + + #[test] + fn test_list_primitive_accessors() { + // primitives + let list = make_list(vec![Field::Bool(false)]); + assert_eq!(false, list.get_bool(0).unwrap()); + + let list = make_list(vec![Field::Byte(3), Field::Byte(4)]); + assert_eq!(4, list.get_byte(1).unwrap()); + + let list = make_list(vec![Field::Short(4), Field::Short(5), Field::Short(6)]); + assert_eq!(6, list.get_short(2).unwrap()); + + let list = make_list(vec![Field::Int(5)]); + assert_eq!(5, list.get_int(0).unwrap()); + + let list = make_list(vec![Field::Long(6), Field::Long(7)]); + assert_eq!(7, list.get_long(1).unwrap()); + + let list = make_list(vec![Field::UByte(3), Field::UByte(4)]); + assert_eq!(4, list.get_ubyte(1).unwrap()); + + let list = make_list(vec![Field::UShort(4), Field::UShort(5), Field::UShort(6)]); + assert_eq!(6, list.get_ushort(2).unwrap()); + + let list = make_list(vec![Field::UInt(5)]); + assert_eq!(5, list.get_uint(0).unwrap()); + + let list = make_list(vec![Field::ULong(6), Field::ULong(7)]); + assert_eq!(7, list.get_ulong(1).unwrap()); + + let list = make_list(vec![ + Field::Float(8.1), + Field::Float(9.2), + Field::Float(10.3), + ]); + assert_eq!(10.3, list.get_float(2).unwrap()); + + let list = make_list(vec![Field::Double(3.1415)]); + assert_eq!(3.1415, list.get_double(0).unwrap()); + + let list = make_list(vec![Field::Str("abc".to_string())]); + assert_eq!(&"abc".to_string(), list.get_string(0).unwrap()); + + let list = make_list(vec![Field::Bytes(ByteArray::from(vec![1, 2, 3, 4, 5]))]); + assert_eq!(&[1, 2, 3, 4, 5], list.get_bytes(0).unwrap().data()); + + let list = make_list(vec![Field::Decimal(Decimal::from_i32(4, 5, 2))]); + 
assert_eq!(&[0, 0, 0, 4], list.get_decimal(0).unwrap().data()); + } + + #[test] + fn test_list_primitive_invalid_accessors() { + // primitives + let list = make_list(vec![Field::Bool(false)]); + assert!(list.get_byte(0).is_err()); + + let list = make_list(vec![Field::Byte(3), Field::Byte(4)]); + assert!(list.get_short(1).is_err()); + + let list = make_list(vec![Field::Short(4), Field::Short(5), Field::Short(6)]); + assert!(list.get_int(2).is_err()); + + let list = make_list(vec![Field::Int(5)]); + assert!(list.get_long(0).is_err()); + + let list = make_list(vec![Field::Long(6), Field::Long(7)]); + assert!(list.get_float(1).is_err()); + + let list = make_list(vec![Field::UByte(3), Field::UByte(4)]); + assert!(list.get_short(1).is_err()); + + let list = make_list(vec![Field::UShort(4), Field::UShort(5), Field::UShort(6)]); + assert!(list.get_int(2).is_err()); + + let list = make_list(vec![Field::UInt(5)]); + assert!(list.get_long(0).is_err()); + + let list = make_list(vec![Field::ULong(6), Field::ULong(7)]); + assert!(list.get_float(1).is_err()); + + let list = make_list(vec![ + Field::Float(8.1), + Field::Float(9.2), + Field::Float(10.3), + ]); + assert!(list.get_double(2).is_err()); + + let list = make_list(vec![Field::Double(3.1415)]); + assert!(list.get_string(0).is_err()); + + let list = make_list(vec![Field::Str("abc".to_string())]); + assert!(list.get_bytes(0).is_err()); + + let list = make_list(vec![Field::Bytes(ByteArray::from(vec![1, 2, 3, 4, 5]))]); + assert!(list.get_bool(0).is_err()); + + let list = make_list(vec![Field::Decimal(Decimal::from_i32(4, 5, 2))]); + assert!(list.get_bool(0).is_err()); + } + + #[test] + fn test_list_complex_accessors() { + let list = make_list(vec![Field::Group(make_row(vec![ + ("x".to_string(), Field::Null), + ("Y".to_string(), Field::Int(2)), + ]))]); + assert_eq!(2, list.get_group(0).unwrap().len()); + + let list = make_list(vec![Field::ListInternal(make_list(vec![ + Field::Int(2), + Field::Int(1), + Field::Null, + Field::Int(12), + ]))]); + assert_eq!(4, list.get_list(0).unwrap().len()); + + let list = make_list(vec![Field::MapInternal(make_map(vec![ + (Field::Int(1), Field::Float(1.2)), + (Field::Int(2), Field::Float(4.5)), + (Field::Int(3), Field::Float(2.3)), + ]))]); + assert_eq!(3, list.get_map(0).unwrap().len()); + } + + #[test] + fn test_list_complex_invalid_accessors() { + let list = make_list(vec![Field::Group(make_row(vec![ + ("x".to_string(), Field::Null), + ("Y".to_string(), Field::Int(2)), + ]))]); + assert_eq!( + general_err!("Cannot access Group as Float".to_string()), + list.get_float(0).unwrap_err() + ); + + let list = make_list(vec![Field::ListInternal(make_list(vec![ + Field::Int(2), + Field::Int(1), + Field::Null, + Field::Int(12), + ]))]); + assert_eq!( + general_err!("Cannot access ListInternal as Float".to_string()), + list.get_float(0).unwrap_err() + ); + + let list = make_list(vec![Field::MapInternal(make_map(vec![ + (Field::Int(1), Field::Float(1.2)), + (Field::Int(2), Field::Float(4.5)), + (Field::Int(3), Field::Float(2.3)), + ]))]); + assert_eq!( + general_err!("Cannot access MapInternal as Float".to_string()), + list.get_float(0).unwrap_err() + ); + } + + #[test] + fn test_map_accessors() { + // a map from int to string + let map = make_map(vec![ + (Field::Int(1), Field::Str("a".to_string())), + (Field::Int(2), Field::Str("b".to_string())), + (Field::Int(3), Field::Str("c".to_string())), + (Field::Int(4), Field::Str("d".to_string())), + (Field::Int(5), Field::Str("e".to_string())), + ]); + + assert_eq!(5, map.len()); + 
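+        // Keys 1..=5 were paired with values "a".."e" above, so positional access
+        // through get_keys()/get_values() can be verified in a single loop.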
for i in 0..5 { + assert_eq!((i + 1) as i32, map.get_keys().get_int(i).unwrap()); + assert_eq!( + &((i as u8 + 'a' as u8) as char).to_string(), + map.get_values().get_string(i).unwrap() + ); + } + } +} diff --git a/rust/src/parquet/record/mod.rs b/rust/src/parquet/record/mod.rs new file mode 100644 index 0000000000000..0dba8a78bd165 --- /dev/null +++ b/rust/src/parquet/record/mod.rs @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains record-based API for reading Parquet files. + +mod api; +pub mod reader; +mod triplet; + +pub use self::api::{List, ListAccessor, Map, MapAccessor, Row, RowAccessor}; diff --git a/rust/src/parquet/record/reader.rs b/rust/src/parquet/record/reader.rs new file mode 100644 index 0000000000000..d9f3d6fea1978 --- /dev/null +++ b/rust/src/parquet/record/reader.rs @@ -0,0 +1,1464 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains implementation of record assembly and converting Parquet types into +//! [`Row`](`::record::api::Row`)s. + +use std::{collections::HashMap, fmt, rc::Rc}; + +use crate::parquet::basic::{LogicalType, Repetition}; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::file::reader::{FileReader, RowGroupReader}; +use crate::parquet::record::{ + api::{make_list, make_map, make_row, Field, Row}, + triplet::TripletIter, +}; +use crate::parquet::schema::types::{ColumnPath, SchemaDescPtr, SchemaDescriptor, Type, TypePtr}; + +/// Default batch size for a reader +const DEFAULT_BATCH_SIZE: usize = 1024; + +/// Tree builder for `Reader` enum. +/// Serves as a container of options for building a reader tree and a builder, and +/// accessing a records iterator [`RowIter`]. +pub struct TreeBuilder { + // Batch size (>= 1) for triplet iterators + batch_size: usize, +} + +impl TreeBuilder { + /// Creates new tree builder with default parameters. + pub fn new() -> Self { + Self { + batch_size: DEFAULT_BATCH_SIZE, + } + } + + /// Sets batch size for this tree builder. 
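+    ///
+    /// Usage sketch (editor's illustration, not part of this patch; the batch size
+    /// value is arbitrary):
+    ///
+    /// ```ignore
+    /// let builder = TreeBuilder::new().with_batch_size(256);
+    /// ```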
+ pub fn with_batch_size(mut self, batch_size: usize) -> Self { + self.batch_size = batch_size; + self + } + + /// Creates new root reader for provided schema and row group. + pub fn build(&self, descr: SchemaDescPtr, row_group_reader: &RowGroupReader) -> Reader { + // Prepare lookup table of column path -> original column index + // This allows to prune columns and map schema leaf nodes to the column readers + let mut paths: HashMap = HashMap::new(); + let row_group_metadata = row_group_reader.metadata(); + + for col_index in 0..row_group_reader.num_columns() { + let col_meta = row_group_metadata.column(col_index); + let col_path = col_meta.column_path().clone(); + paths.insert(col_path, col_index); + } + + // Build child readers for the message type + let mut readers = Vec::new(); + let mut path = Vec::new(); + + for field in descr.root_schema().get_fields() { + let reader = self.reader_tree(field.clone(), &mut path, 0, 0, &paths, row_group_reader); + readers.push(reader); + } + + // Return group reader for message type, + // it is always required with definition level 0 + Reader::GroupReader(None, 0, readers) + } + + /// Creates iterator of `Row`s directly from schema descriptor and row group. + pub fn as_iter(&self, descr: SchemaDescPtr, row_group_reader: &RowGroupReader) -> ReaderIter { + let num_records = row_group_reader.metadata().num_rows() as usize; + ReaderIter::new(self.build(descr, row_group_reader), num_records) + } + + /// Builds tree of readers for the current schema recursively. + fn reader_tree( + &self, + field: TypePtr, + mut path: &mut Vec, + mut curr_def_level: i16, + mut curr_rep_level: i16, + paths: &HashMap, + row_group_reader: &RowGroupReader, + ) -> Reader { + assert!(field.get_basic_info().has_repetition()); + // Update current definition and repetition levels for this type + let repetition = field.get_basic_info().repetition(); + match repetition { + Repetition::OPTIONAL => { + curr_def_level += 1; + } + Repetition::REPEATED => { + curr_def_level += 1; + curr_rep_level += 1; + } + _ => {} + } + + path.push(String::from(field.name())); + let reader = if field.is_primitive() { + let col_path = ColumnPath::new(path.to_vec()); + let orig_index = *paths.get(&col_path).unwrap(); + let col_descr = row_group_reader + .metadata() + .column(orig_index) + .column_descr_ptr(); + let col_reader = row_group_reader.get_column_reader(orig_index).unwrap(); + let column = TripletIter::new(col_descr, col_reader, self.batch_size); + Reader::PrimitiveReader(field, column) + } else { + match field.get_basic_info().logical_type() { + // List types + LogicalType::LIST => { + assert_eq!(field.get_fields().len(), 1, "Invalid list type {:?}", field); + + let repeated_field = field.get_fields()[0].clone(); + assert_eq!( + repeated_field.get_basic_info().repetition(), + Repetition::REPEATED, + "Invalid list type {:?}", + field + ); + + if Reader::is_element_type(&repeated_field) { + // Support for backward compatible lists + let reader = self.reader_tree( + repeated_field.clone(), + &mut path, + curr_def_level, + curr_rep_level, + paths, + row_group_reader, + ); + + Reader::RepeatedReader( + field, + curr_def_level, + curr_rep_level, + Box::new(reader), + ) + } else { + let child_field = repeated_field.get_fields()[0].clone(); + + path.push(String::from(repeated_field.name())); + + let reader = self.reader_tree( + child_field, + &mut path, + curr_def_level + 1, + curr_rep_level + 1, + paths, + row_group_reader, + ); + + path.pop(); + + Reader::RepeatedReader( + field, + curr_def_level, + 
curr_rep_level, + Box::new(reader), + ) + } + } + // Map types (key-value pairs) + LogicalType::MAP | LogicalType::MAP_KEY_VALUE => { + assert_eq!(field.get_fields().len(), 1, "Invalid map type: {:?}", field); + assert!( + !field.get_fields()[0].is_primitive(), + "Invalid map type: {:?}", + field + ); + + let key_value_type = field.get_fields()[0].clone(); + assert_eq!( + key_value_type.get_basic_info().repetition(), + Repetition::REPEATED, + "Invalid map type: {:?}", + field + ); + assert_eq!( + key_value_type.get_fields().len(), + 2, + "Invalid map type: {:?}", + field + ); + + path.push(String::from(key_value_type.name())); + + let key_type = &key_value_type.get_fields()[0]; + assert!( + key_type.is_primitive(), + "Map key type is expected to be a primitive type, but found {:?}", + key_type + ); + let key_reader = self.reader_tree( + key_type.clone(), + &mut path, + curr_def_level + 1, + curr_rep_level + 1, + paths, + row_group_reader, + ); + + let value_type = &key_value_type.get_fields()[1]; + let value_reader = self.reader_tree( + value_type.clone(), + &mut path, + curr_def_level + 1, + curr_rep_level + 1, + paths, + row_group_reader, + ); + + path.pop(); + + Reader::KeyValueReader( + field, + curr_def_level, + curr_rep_level, + Box::new(key_reader), + Box::new(value_reader), + ) + } + // A repeated field that is neither contained by a `LIST`- or `MAP`-annotated + // group nor annotated by `LIST` or `MAP` should be interpreted as a required + // list of required elements where the element type is the type of the field. + _ if repetition == Repetition::REPEATED => { + let required_field = Type::group_type_builder(field.name()) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(field.get_basic_info().logical_type()) + .with_fields(&mut Vec::from(field.get_fields())) + .build() + .unwrap(); + + path.pop(); + + let reader = self.reader_tree( + Rc::new(required_field), + &mut path, + curr_def_level, + curr_rep_level, + paths, + row_group_reader, + ); + + Reader::RepeatedReader( + field, + curr_def_level - 1, + curr_rep_level - 1, + Box::new(reader), + ) + } + // Group types (structs) + _ => { + let mut readers = Vec::new(); + for child in field.get_fields() { + let reader = self.reader_tree( + child.clone(), + &mut path, + curr_def_level, + curr_rep_level, + paths, + row_group_reader, + ); + readers.push(reader); + } + Reader::GroupReader(Some(field), curr_def_level, readers) + } + } + }; + path.pop(); + + Reader::option(repetition, curr_def_level, reader) + } +} + +/// Reader tree for record assembly +pub enum Reader { + // Primitive reader with type information and triplet iterator + PrimitiveReader(TypePtr, TripletIter), + // Optional reader with definition level of a parent and a reader + OptionReader(i16, Box), + // Group (struct) reader with type information, definition level and list of child + // readers. When it represents message type, type information is None + GroupReader(Option, i16, Vec), + // Reader for repeated values, e.g. lists, contains type information, definition + // level, repetition level and a child reader + RepeatedReader(TypePtr, i16, i16, Box), + // Reader of key-value pairs, e.g. maps, contains type information, definition level, + // repetition level, child reader for keys and child reader for values + KeyValueReader(TypePtr, i16, i16, Box, Box), +} + +impl Reader { + /// Wraps reader in option reader based on repetition. 
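+    /// The wrapped reader stores `def_level - 1` because, per the enum comment above,
+    /// `OptionReader` keeps the definition level of the parent rather than that of the
+    /// optional field itself.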
+ fn option(repetition: Repetition, def_level: i16, reader: Reader) -> Self { + if repetition == Repetition::OPTIONAL { + Reader::OptionReader(def_level - 1, Box::new(reader)) + } else { + reader + } + } + + /// Returns true if repeated type is an element type for the list. + /// Used to determine legacy list types. + /// This method is copied from Spark Parquet reader and is based on the reference: + /// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md + /// #backward-compatibility-rules + fn is_element_type(repeated_type: &Type) -> bool { + // For legacy 2-level list types with primitive element type, e.g.: + // + // // ARRAY (nullable list, non-null elements) + // optional group my_list (LIST) { + // repeated int32 element; + // } + // + repeated_type.is_primitive() || + // For legacy 2-level list types whose element type is a group type with 2 or more + // fields, e.g.: + // + // // ARRAY> (nullable list, non-null elements) + // optional group my_list (LIST) { + // repeated group element { + // required binary str (UTF8); + // required int32 num; + // }; + // } + // + repeated_type.is_group() && repeated_type.get_fields().len() > 1 || + // For legacy 2-level list types generated by parquet-avro (Parquet version < 1.6.0), + // e.g.: + // + // // ARRAY> (nullable list, non-null elements) + // optional group my_list (LIST) { + // repeated group array { + // required binary str (UTF8); + // }; + // } + // + repeated_type.name() == "array" || + // For Parquet data generated by parquet-thrift, e.g.: + // + // // ARRAY> (nullable list, non-null elements) + // optional group my_list (LIST) { + // repeated group my_list_tuple { + // required binary str (UTF8); + // }; + // } + // + repeated_type.name().ends_with("_tuple") + } + + /// Reads current record as `Row` from the reader tree. + /// Automatically advances all necessary readers. + /// This must be called on the root level reader (i.e., for Message type). + /// Otherwise, it will panic. + fn read(&mut self) -> Row { + match *self { + Reader::GroupReader(_, _, ref mut readers) => { + let mut fields = Vec::new(); + for reader in readers { + fields.push((String::from(reader.field_name()), reader.read_field())); + } + make_row(fields) + } + _ => panic!("Cannot call read() on {}", self), + } + } + + /// Reads current record as `Field` from the reader tree. + /// Automatically advances all necessary readers. 
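+    /// Primitive readers yield the converted value directly; the other variants
+    /// recurse into their child readers to assemble groups, lists and maps.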
+ fn read_field(&mut self) -> Field { + match *self { + Reader::PrimitiveReader(_, ref mut column) => { + let value = column.current_value(); + column.read_next().unwrap(); + value + } + Reader::OptionReader(def_level, ref mut reader) => { + if reader.current_def_level() > def_level { + reader.read_field() + } else { + reader.advance_columns(); + Field::Null + } + } + Reader::GroupReader(_, def_level, ref mut readers) => { + let mut fields = Vec::new(); + for reader in readers { + if reader.repetition() != Repetition::OPTIONAL + || reader.current_def_level() > def_level + { + fields.push((String::from(reader.field_name()), reader.read_field())); + } else { + reader.advance_columns(); + fields.push((String::from(reader.field_name()), Field::Null)); + } + } + let row = make_row(fields); + Field::Group(row) + } + Reader::RepeatedReader(_, def_level, rep_level, ref mut reader) => { + let mut elements = Vec::new(); + loop { + if reader.current_def_level() > def_level { + elements.push(reader.read_field()); + } else { + reader.advance_columns(); + // If the current definition level is equal to the definition level of this + // repeated type, then the result is an empty list and the repetition level + // will always be <= rl. + break; + } + + // This covers case when we are out of repetition levels and should close the + // group, or there are no values left to buffer. + if !reader.has_next() || reader.current_rep_level() <= rep_level { + break; + } + } + Field::ListInternal(make_list(elements)) + } + Reader::KeyValueReader(_, def_level, rep_level, ref mut keys, ref mut values) => { + let mut pairs = Vec::new(); + loop { + if keys.current_def_level() > def_level { + pairs.push((keys.read_field(), values.read_field())); + } else { + keys.advance_columns(); + values.advance_columns(); + // If the current definition level is equal to the definition level of this + // repeated type, then the result is an empty list and the repetition level + // will always be <= rl. + break; + } + + // This covers case when we are out of repetition levels and should close the + // group, or there are no values left to buffer. + if !keys.has_next() || keys.current_rep_level() <= rep_level { + break; + } + } + + Field::MapInternal(make_map(pairs)) + } + } + } + + /// Returns field name for the current reader. + fn field_name(&self) -> &str { + match *self { + Reader::PrimitiveReader(ref field, _) => field.name(), + Reader::OptionReader(_, ref reader) => reader.field_name(), + Reader::GroupReader(ref opt, ..) => match opt { + &Some(ref field) => field.name(), + &None => panic!("Field is None for group reader"), + }, + Reader::RepeatedReader(ref field, ..) => field.name(), + Reader::KeyValueReader(ref field, ..) => field.name(), + } + } + + /// Returns repetition for the current reader. + fn repetition(&self) -> Repetition { + match *self { + Reader::PrimitiveReader(ref field, _) => field.get_basic_info().repetition(), + Reader::OptionReader(_, ref reader) => reader.repetition(), + Reader::GroupReader(ref opt, ..) => match opt { + &Some(ref field) => field.get_basic_info().repetition(), + &None => panic!("Field is None for group reader"), + }, + Reader::RepeatedReader(ref field, ..) => field.get_basic_info().repetition(), + Reader::KeyValueReader(ref field, ..) => field.get_basic_info().repetition(), + } + } + + /// Returns true, if current reader has more values, false otherwise. + /// Method does not advance internal iterator. 
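+    /// For group readers this delegates to the first child reader, which is assumed
+    /// to stay in sync with its siblings.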
+ fn has_next(&self) -> bool { + match *self { + Reader::PrimitiveReader(_, ref column) => column.has_next(), + Reader::OptionReader(_, ref reader) => reader.has_next(), + Reader::GroupReader(_, _, ref readers) => readers.first().unwrap().has_next(), + Reader::RepeatedReader(_, _, _, ref reader) => reader.has_next(), + Reader::KeyValueReader(_, _, _, ref keys, _) => keys.has_next(), + } + } + + /// Returns current definition level, + /// Method does not advance internal iterator. + fn current_def_level(&self) -> i16 { + match *self { + Reader::PrimitiveReader(_, ref column) => column.current_def_level(), + Reader::OptionReader(_, ref reader) => reader.current_def_level(), + Reader::GroupReader(_, _, ref readers) => match readers.first() { + Some(reader) => reader.current_def_level(), + None => panic!("Current definition level: empty group reader"), + }, + Reader::RepeatedReader(_, _, _, ref reader) => reader.current_def_level(), + Reader::KeyValueReader(_, _, _, ref keys, _) => keys.current_def_level(), + } + } + + /// Returns current repetition level. + /// Method does not advance internal iterator. + fn current_rep_level(&self) -> i16 { + match *self { + Reader::PrimitiveReader(_, ref column) => column.current_rep_level(), + Reader::OptionReader(_, ref reader) => reader.current_rep_level(), + Reader::GroupReader(_, _, ref readers) => match readers.first() { + Some(reader) => reader.current_rep_level(), + None => panic!("Current repetition level: empty group reader"), + }, + Reader::RepeatedReader(_, _, _, ref reader) => reader.current_rep_level(), + Reader::KeyValueReader(_, _, _, ref keys, _) => keys.current_rep_level(), + } + } + + /// Advances leaf columns for the current reader. + fn advance_columns(&mut self) { + match *self { + Reader::PrimitiveReader(_, ref mut column) => { + column.read_next().unwrap(); + } + Reader::OptionReader(_, ref mut reader) => { + reader.advance_columns(); + } + Reader::GroupReader(_, _, ref mut readers) => { + for reader in readers { + reader.advance_columns(); + } + } + Reader::RepeatedReader(_, _, _, ref mut reader) => { + reader.advance_columns(); + } + Reader::KeyValueReader(_, _, _, ref mut keys, ref mut values) => { + keys.advance_columns(); + values.advance_columns(); + } + } + } +} + +impl fmt::Display for Reader { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let s = match self { + Reader::PrimitiveReader(..) => "PrimitiveReader", + Reader::OptionReader(..) => "OptionReader", + Reader::GroupReader(..) => "GroupReader", + Reader::RepeatedReader(..) => "RepeatedReader", + Reader::KeyValueReader(..) => "KeyValueReader", + }; + write!(f, "{}", s) + } +} + +// ---------------------------------------------------------------------- +// Row iterators + +/// Iterator of [`Row`](`::record::api::Row`)s. +/// It is used either for a single row group to iterate over data in that row group, or +/// an entire file with auto buffering of all row groups. +pub struct RowIter<'a> { + descr: SchemaDescPtr, + tree_builder: TreeBuilder, + file_reader: Option<&'a FileReader>, + current_row_group: usize, + num_row_groups: usize, + row_iter: Option, +} + +impl<'a> RowIter<'a> { + /// Creates iterator of [`Row`](`::record::api::Row`)s for all row groups in a file. 
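+    ///
+    /// Usage sketch (editor's illustration, not part of this patch; the file name and
+    /// helpers are borrowed from the tests further below):
+    ///
+    /// ```ignore
+    /// let reader = SerializedFileReader::new(get_test_file("nulls.snappy.parquet"))?;
+    /// for row in RowIter::from_file(None, &reader)? {
+    ///     println!("{:?}", row);
+    /// }
+    /// ```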
+ pub fn from_file(proj: Option, reader: &'a FileReader) -> Result { + let descr = + Self::get_proj_descr(proj, reader.metadata().file_metadata().schema_descr_ptr())?; + let num_row_groups = reader.num_row_groups(); + + Ok(Self { + descr, + tree_builder: Self::tree_builder(), + file_reader: Some(reader), + current_row_group: 0, + num_row_groups, + row_iter: None, + }) + } + + /// Creates iterator of [`Row`](`::record::api::Row`)s for a specific row group. + pub fn from_row_group(proj: Option, reader: &'a RowGroupReader) -> Result { + let descr = Self::get_proj_descr(proj, reader.metadata().schema_descr_ptr())?; + let tree_builder = Self::tree_builder(); + let row_iter = tree_builder.as_iter(descr.clone(), reader); + + // For row group we need to set `current_row_group` >= `num_row_groups`, because we + // only have one row group and can't buffer more. + Ok(Self { + descr, + tree_builder, + file_reader: None, + current_row_group: 0, + num_row_groups: 0, + row_iter: Some(row_iter), + }) + } + + /// Returns common tree builder, so the same settings are applied to both iterators + /// from file reader and row group. + #[inline] + fn tree_builder() -> TreeBuilder { + TreeBuilder::new() + } + + /// Helper method to get schema descriptor for projected schema. + /// If projection is None, then full schema is returned. + #[inline] + fn get_proj_descr(proj: Option, root_descr: SchemaDescPtr) -> Result { + match proj { + Some(projection) => { + // check if projection is part of file schema + let root_schema = root_descr.root_schema(); + if !root_schema.check_contains(&projection) { + return Err(general_err!("Root schema does not contain projection")); + } + Ok(Rc::new(SchemaDescriptor::new(Rc::new(projection)))) + } + None => Ok(root_descr), + } + } +} + +impl<'a> Iterator for RowIter<'a> { + type Item = Row; + + fn next(&mut self) -> Option { + let mut row = None; + if let Some(ref mut iter) = self.row_iter { + row = iter.next(); + } + + while row.is_none() && self.current_row_group < self.num_row_groups { + // We do not expect any failures when accessing a row group, and file reader + // must be set for selecting next row group. + let row_group_reader = &*self + .file_reader + .as_ref() + .expect("File reader is required to advance row group") + .get_row_group(self.current_row_group) + .unwrap(); + self.current_row_group += 1; + let mut iter = self + .tree_builder + .as_iter(self.descr.clone(), row_group_reader); + row = iter.next(); + self.row_iter = Some(iter); + } + + row + } +} + +/// Internal iterator of [`Row`](`::record::api::Row`)s for a reader. +pub struct ReaderIter { + root_reader: Reader, + records_left: usize, +} + +impl ReaderIter { + fn new(mut root_reader: Reader, num_records: usize) -> Self { + // Prepare root reader by advancing all column vectors + root_reader.advance_columns(); + Self { + root_reader, + records_left: num_records, + } + } +} + +impl Iterator for ReaderIter { + type Item = Row; + + fn next(&mut self) -> Option { + if self.records_left > 0 { + self.records_left -= 1; + Some(self.root_reader.read()) + } else { + None + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::parquet::errors::{ParquetError, Result}; + use crate::parquet::file::reader::{FileReader, SerializedFileReader}; + use crate::parquet::record::api::{Field, Row}; + use crate::parquet::schema::parser::parse_message_type; + use crate::parquet::util::test_common::get_test_file; + + // Convenient macros to assemble row, list, map, and group. + + macro_rules! 
row { + () => { + { + let result = Vec::new(); + make_row(result) + } + }; + ( $( $e:expr ), + ) => { + { + let mut result = Vec::new(); + $( + result.push($e); + )* + make_row(result) + } + } + } + + macro_rules! list { + () => { + { + let result = Vec::new(); + Field::ListInternal(make_list(result)) + } + }; + ( $( $e:expr ), + ) => { + { + let mut result = Vec::new(); + $( + result.push($e); + )* + Field::ListInternal(make_list(result)) + } + } + } + + macro_rules! map { + () => { + { + let result = Vec::new(); + Field::MapInternal(make_map(result)) + } + }; + ( $( $e:expr ), + ) => { + { + let mut result = Vec::new(); + $( + result.push($e); + )* + Field::MapInternal(make_map(result)) + } + } + } + + macro_rules! group { + ( $( $e:expr ), * ) => { + { + Field::Group(row!($( $e ), *)) + } + } + } + + #[test] + fn test_file_reader_rows_nulls() { + let rows = test_file_reader_rows("nulls.snappy.parquet", None).unwrap(); + let expected_rows = vec![ + row![( + "b_struct".to_string(), + group![("b_c_int".to_string(), Field::Null)] + )], + row![( + "b_struct".to_string(), + group![("b_c_int".to_string(), Field::Null)] + )], + row![( + "b_struct".to_string(), + group![("b_c_int".to_string(), Field::Null)] + )], + row![( + "b_struct".to_string(), + group![("b_c_int".to_string(), Field::Null)] + )], + row![( + "b_struct".to_string(), + group![("b_c_int".to_string(), Field::Null)] + )], + row![( + "b_struct".to_string(), + group![("b_c_int".to_string(), Field::Null)] + )], + row![( + "b_struct".to_string(), + group![("b_c_int".to_string(), Field::Null)] + )], + row![( + "b_struct".to_string(), + group![("b_c_int".to_string(), Field::Null)] + )], + ]; + assert_eq!(rows, expected_rows); + } + + #[test] + fn test_file_reader_rows_nonnullable() { + let rows = test_file_reader_rows("nonnullable.impala.parquet", None).unwrap(); + let expected_rows = vec![row![ + ("ID".to_string(), Field::Long(8)), + ("Int_Array".to_string(), list![Field::Int(-1)]), + ( + "int_array_array".to_string(), + list![list![Field::Int(-1), Field::Int(-2)], list![]] + ), + ( + "Int_Map".to_string(), + map![(Field::Str("k1".to_string()), Field::Int(-1))] + ), + ( + "int_map_array".to_string(), + list![ + map![], + map![(Field::Str("k1".to_string()), Field::Int(1))], + map![], + map![] + ] + ), + ( + "nested_Struct".to_string(), + group![ + ("a".to_string(), Field::Int(-1)), + ("B".to_string(), list![Field::Int(-1)]), + ( + "c".to_string(), + group![( + "D".to_string(), + list![list![group![ + ("e".to_string(), Field::Int(-1)), + ("f".to_string(), Field::Str("nonnullable".to_string())) + ]]] + )] + ), + ("G".to_string(), map![]) + ] + ) + ]]; + assert_eq!(rows, expected_rows); + } + + #[test] + fn test_file_reader_rows_nullable() { + let rows = test_file_reader_rows("nullable.impala.parquet", None).unwrap(); + let expected_rows = vec![ + row![ + ("id".to_string(), Field::Long(1)), + ( + "int_array".to_string(), + list![Field::Int(1), Field::Int(2), Field::Int(3)] + ), + ( + "int_array_Array".to_string(), + list![ + list![Field::Int(1), Field::Int(2)], + list![Field::Int(3), Field::Int(4)] + ] + ), + ( + "int_map".to_string(), + map![ + (Field::Str("k1".to_string()), Field::Int(1)), + (Field::Str("k2".to_string()), Field::Int(100)) + ] + ), + ( + "int_Map_Array".to_string(), + list![map![(Field::Str("k1".to_string()), Field::Int(1))]] + ), + ( + "nested_struct".to_string(), + group![ + ("A".to_string(), Field::Int(1)), + ("b".to_string(), list![Field::Int(1)]), + ( + "C".to_string(), + group![( + "d".to_string(), + list![ + list![ 
+ group![ + ("E".to_string(), Field::Int(10)), + ("F".to_string(), Field::Str("aaa".to_string())) + ], + group![ + ("E".to_string(), Field::Int(-10)), + ("F".to_string(), Field::Str("bbb".to_string())) + ] + ], + list![group![ + ("E".to_string(), Field::Int(11)), + ("F".to_string(), Field::Str("c".to_string())) + ]] + ] + )] + ), + ( + "g".to_string(), + map![( + Field::Str("foo".to_string()), + group![( + "H".to_string(), + group![("i".to_string(), list![Field::Double(1.1)])] + )] + )] + ) + ] + ) + ], + row![ + ("id".to_string(), Field::Long(2)), + ( + "int_array".to_string(), + list![ + Field::Null, + Field::Int(1), + Field::Int(2), + Field::Null, + Field::Int(3), + Field::Null + ] + ), + ( + "int_array_Array".to_string(), + list![ + list![Field::Null, Field::Int(1), Field::Int(2), Field::Null], + list![Field::Int(3), Field::Null, Field::Int(4)], + list![], + Field::Null + ] + ), + ( + "int_map".to_string(), + map![ + (Field::Str("k1".to_string()), Field::Int(2)), + (Field::Str("k2".to_string()), Field::Null) + ] + ), + ( + "int_Map_Array".to_string(), + list![ + map![ + (Field::Str("k3".to_string()), Field::Null), + (Field::Str("k1".to_string()), Field::Int(1)) + ], + Field::Null, + map![] + ] + ), + ( + "nested_struct".to_string(), + group![ + ("A".to_string(), Field::Null), + ("b".to_string(), list![Field::Null]), + ( + "C".to_string(), + group![( + "d".to_string(), + list![ + list![ + group![ + ("E".to_string(), Field::Null), + ("F".to_string(), Field::Null) + ], + group![ + ("E".to_string(), Field::Int(10)), + ("F".to_string(), Field::Str("aaa".to_string())) + ], + group![ + ("E".to_string(), Field::Null), + ("F".to_string(), Field::Null) + ], + group![ + ("E".to_string(), Field::Int(-10)), + ("F".to_string(), Field::Str("bbb".to_string())) + ], + group![ + ("E".to_string(), Field::Null), + ("F".to_string(), Field::Null) + ] + ], + list![ + group![ + ("E".to_string(), Field::Int(11)), + ("F".to_string(), Field::Str("c".to_string())) + ], + Field::Null + ], + list![], + Field::Null + ] + )] + ), + ( + "g".to_string(), + map![ + ( + Field::Str("g1".to_string()), + group![( + "H".to_string(), + group![( + "i".to_string(), + list![Field::Double(2.2), Field::Null] + )] + )] + ), + ( + Field::Str("g2".to_string()), + group![("H".to_string(), group![("i".to_string(), list![])])] + ), + (Field::Str("g3".to_string()), Field::Null), + ( + Field::Str("g4".to_string()), + group![( + "H".to_string(), + group![("i".to_string(), Field::Null)] + )] + ), + ( + Field::Str("g5".to_string()), + group![("H".to_string(), Field::Null)] + ) + ] + ) + ] + ) + ], + row![ + ("id".to_string(), Field::Long(3)), + ("int_array".to_string(), list![]), + ("int_array_Array".to_string(), list![Field::Null]), + ("int_map".to_string(), map![]), + ("int_Map_Array".to_string(), list![Field::Null, Field::Null]), + ( + "nested_struct".to_string(), + group![ + ("A".to_string(), Field::Null), + ("b".to_string(), Field::Null), + ("C".to_string(), group![("d".to_string(), list![])]), + ("g".to_string(), map![]) + ] + ) + ], + row![ + ("id".to_string(), Field::Long(4)), + ("int_array".to_string(), Field::Null), + ("int_array_Array".to_string(), list![]), + ("int_map".to_string(), map![]), + ("int_Map_Array".to_string(), list![]), + ( + "nested_struct".to_string(), + group![ + ("A".to_string(), Field::Null), + ("b".to_string(), Field::Null), + ("C".to_string(), group![("d".to_string(), Field::Null)]), + ("g".to_string(), Field::Null) + ] + ) + ], + row![ + ("id".to_string(), Field::Long(5)), + ("int_array".to_string(), 
Field::Null), + ("int_array_Array".to_string(), Field::Null), + ("int_map".to_string(), map![]), + ("int_Map_Array".to_string(), Field::Null), + ( + "nested_struct".to_string(), + group![ + ("A".to_string(), Field::Null), + ("b".to_string(), Field::Null), + ("C".to_string(), Field::Null), + ( + "g".to_string(), + map![( + Field::Str("foo".to_string()), + group![( + "H".to_string(), + group![( + "i".to_string(), + list![Field::Double(2.2), Field::Double(3.3)] + )] + )] + )] + ) + ] + ) + ], + row![ + ("id".to_string(), Field::Long(6)), + ("int_array".to_string(), Field::Null), + ("int_array_Array".to_string(), Field::Null), + ("int_map".to_string(), Field::Null), + ("int_Map_Array".to_string(), Field::Null), + ("nested_struct".to_string(), Field::Null) + ], + row![ + ("id".to_string(), Field::Long(7)), + ("int_array".to_string(), Field::Null), + ( + "int_array_Array".to_string(), + list![Field::Null, list![Field::Int(5), Field::Int(6)]] + ), + ( + "int_map".to_string(), + map![ + (Field::Str("k1".to_string()), Field::Null), + (Field::Str("k3".to_string()), Field::Null) + ] + ), + ("int_Map_Array".to_string(), Field::Null), + ( + "nested_struct".to_string(), + group![ + ("A".to_string(), Field::Int(7)), + ( + "b".to_string(), + list![Field::Int(2), Field::Int(3), Field::Null] + ), + ( + "C".to_string(), + group![( + "d".to_string(), + list![list![], list![Field::Null], Field::Null] + )] + ), + ("g".to_string(), Field::Null) + ] + ) + ], + ]; + assert_eq!(rows, expected_rows); + } + + #[test] + fn test_file_reader_rows_projection() { + let schema = " + message spark_schema { + REQUIRED DOUBLE c; + REQUIRED INT32 b; + } + "; + let schema = parse_message_type(&schema).unwrap(); + let rows = test_file_reader_rows("nested_maps.snappy.parquet", Some(schema)).unwrap(); + let expected_rows = vec![ + row![ + ("c".to_string(), Field::Double(1.0)), + ("b".to_string(), Field::Int(1)) + ], + row![ + ("c".to_string(), Field::Double(1.0)), + ("b".to_string(), Field::Int(1)) + ], + row![ + ("c".to_string(), Field::Double(1.0)), + ("b".to_string(), Field::Int(1)) + ], + row![ + ("c".to_string(), Field::Double(1.0)), + ("b".to_string(), Field::Int(1)) + ], + row![ + ("c".to_string(), Field::Double(1.0)), + ("b".to_string(), Field::Int(1)) + ], + row![ + ("c".to_string(), Field::Double(1.0)), + ("b".to_string(), Field::Int(1)) + ], + ]; + assert_eq!(rows, expected_rows); + } + + #[test] + fn test_file_reader_rows_projection_map() { + let schema = " + message spark_schema { + OPTIONAL group a (MAP) { + REPEATED group key_value { + REQUIRED BYTE_ARRAY key (UTF8); + OPTIONAL group value (MAP) { + REPEATED group key_value { + REQUIRED INT32 key; + REQUIRED BOOLEAN value; + } + } + } + } + } + "; + let schema = parse_message_type(&schema).unwrap(); + let rows = test_file_reader_rows("nested_maps.snappy.parquet", Some(schema)).unwrap(); + let expected_rows = vec![ + row![( + "a".to_string(), + map![( + Field::Str("a".to_string()), + map![ + (Field::Int(1), Field::Bool(true)), + (Field::Int(2), Field::Bool(false)) + ] + )] + )], + row![( + "a".to_string(), + map![( + Field::Str("b".to_string()), + map![(Field::Int(1), Field::Bool(true))] + )] + )], + row![( + "a".to_string(), + map![(Field::Str("c".to_string()), Field::Null)] + )], + row![("a".to_string(), map![(Field::Str("d".to_string()), map![])])], + row![( + "a".to_string(), + map![( + Field::Str("e".to_string()), + map![(Field::Int(1), Field::Bool(true))] + )] + )], + row![( + "a".to_string(), + map![( + Field::Str("f".to_string()), + map![ + (Field::Int(3), 
Field::Bool(true)), + (Field::Int(4), Field::Bool(false)), + (Field::Int(5), Field::Bool(true)) + ] + )] + )], + ]; + assert_eq!(rows, expected_rows); + } + + #[test] + fn test_file_reader_rows_projection_list() { + let schema = " + message spark_schema { + OPTIONAL group a (LIST) { + REPEATED group list { + OPTIONAL group element (LIST) { + REPEATED group list { + OPTIONAL group element (LIST) { + REPEATED group list { + OPTIONAL BYTE_ARRAY element (UTF8); + } + } + } + } + } + } + } + "; + let schema = parse_message_type(&schema).unwrap(); + let rows = test_file_reader_rows("nested_lists.snappy.parquet", Some(schema)).unwrap(); + let expected_rows = vec![ + row![( + "a".to_string(), + list![ + list![ + list![Field::Str("a".to_string()), Field::Str("b".to_string())], + list![Field::Str("c".to_string())] + ], + list![Field::Null, list![Field::Str("d".to_string())]] + ] + )], + row![( + "a".to_string(), + list![ + list![ + list![Field::Str("a".to_string()), Field::Str("b".to_string())], + list![Field::Str("c".to_string()), Field::Str("d".to_string())] + ], + list![Field::Null, list![Field::Str("e".to_string())]] + ] + )], + row![( + "a".to_string(), + list![ + list![ + list![Field::Str("a".to_string()), Field::Str("b".to_string())], + list![Field::Str("c".to_string()), Field::Str("d".to_string())], + list![Field::Str("e".to_string())] + ], + list![Field::Null, list![Field::Str("f".to_string())]] + ] + )], + ]; + assert_eq!(rows, expected_rows); + } + + #[test] + fn test_file_reader_rows_invalid_projection() { + let schema = " + message spark_schema { + REQUIRED INT32 key; + REQUIRED BOOLEAN value; + } + "; + let schema = parse_message_type(&schema).unwrap(); + let res = test_file_reader_rows("nested_maps.snappy.parquet", Some(schema)); + assert!(res.is_err()); + assert_eq!( + res.unwrap_err(), + general_err!("Root schema does not contain projection") + ); + } + + #[test] + fn test_row_group_rows_invalid_projection() { + let schema = " + message spark_schema { + REQUIRED INT32 key; + REQUIRED BOOLEAN value; + } + "; + let schema = parse_message_type(&schema).unwrap(); + let res = test_row_group_rows("nested_maps.snappy.parquet", Some(schema)); + assert!(res.is_err()); + assert_eq!( + res.unwrap_err(), + general_err!("Root schema does not contain projection") + ); + } + + #[test] + #[should_panic(expected = "Invalid map type")] + fn test_file_reader_rows_invalid_map_type() { + let schema = " + message spark_schema { + OPTIONAL group a (MAP) { + REPEATED group key_value { + REQUIRED BYTE_ARRAY key (UTF8); + OPTIONAL group value (MAP) { + REPEATED group key_value { + REQUIRED INT32 key; + } + } + } + } + } + "; + let schema = parse_message_type(&schema).unwrap(); + test_file_reader_rows("nested_maps.snappy.parquet", Some(schema)).unwrap(); + } + + #[test] + fn test_tree_reader_handle_repeated_fields_with_no_annotation() { + // Array field `phoneNumbers` does not contain LIST annotation. + // We parse it as struct with `phone` repeated field as array. 
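+        // Editor's sketch of the shape implied by the expected rows below (not the
+        // actual file schema): `phoneNumbers` is an optional group holding a repeated
+        // group `phone` with `number` and `kind` fields, i.e. there is no LIST wrapper.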
+ let rows = test_file_reader_rows("repeated_no_annotation.parquet", None).unwrap(); + let expected_rows = vec![ + row![ + ("id".to_string(), Field::Int(1)), + ("phoneNumbers".to_string(), Field::Null) + ], + row![ + ("id".to_string(), Field::Int(2)), + ("phoneNumbers".to_string(), Field::Null) + ], + row![ + ("id".to_string(), Field::Int(3)), + ( + "phoneNumbers".to_string(), + group![("phone".to_string(), list![])] + ) + ], + row![ + ("id".to_string(), Field::Int(4)), + ( + "phoneNumbers".to_string(), + group![( + "phone".to_string(), + list![group![ + ("number".to_string(), Field::Long(5555555555)), + ("kind".to_string(), Field::Null) + ]] + )] + ) + ], + row![ + ("id".to_string(), Field::Int(5)), + ( + "phoneNumbers".to_string(), + group![( + "phone".to_string(), + list![group![ + ("number".to_string(), Field::Long(1111111111)), + ("kind".to_string(), Field::Str("home".to_string())) + ]] + )] + ) + ], + row![ + ("id".to_string(), Field::Int(6)), + ( + "phoneNumbers".to_string(), + group![( + "phone".to_string(), + list![ + group![ + ("number".to_string(), Field::Long(1111111111)), + ("kind".to_string(), Field::Str("home".to_string())) + ], + group![ + ("number".to_string(), Field::Long(2222222222)), + ("kind".to_string(), Field::Null) + ], + group![ + ("number".to_string(), Field::Long(3333333333)), + ("kind".to_string(), Field::Str("mobile".to_string())) + ] + ] + )] + ) + ], + ]; + + assert_eq!(rows, expected_rows); + } + + fn test_file_reader_rows(file_name: &str, schema: Option) -> Result> { + let file = get_test_file(file_name); + let file_reader: Box = Box::new(SerializedFileReader::new(file)?); + let iter = file_reader.get_row_iter(schema)?; + Ok(iter.collect()) + } + + fn test_row_group_rows(file_name: &str, schema: Option) -> Result> { + let file = get_test_file(file_name); + let file_reader: Box = Box::new(SerializedFileReader::new(file)?); + // Check the first row group only, because files will contain only single row group + let row_group_reader = file_reader.get_row_group(0).unwrap(); + let iter = row_group_reader.get_row_iter(schema)?; + Ok(iter.collect()) + } +} diff --git a/rust/src/parquet/record/triplet.rs b/rust/src/parquet/record/triplet.rs new file mode 100644 index 0000000000000..fadcbbce9ba5b --- /dev/null +++ b/rust/src/parquet/record/triplet.rs @@ -0,0 +1,561 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::parquet::basic::Type as PhysicalType; +use crate::parquet::column::reader::{get_typed_column_reader, ColumnReader, ColumnReaderImpl}; +use crate::parquet::data_type::*; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::record::api::Field; +use crate::parquet::schema::types::ColumnDescPtr; + +/// Macro to generate simple functions that cover all types of triplet iterator. 
+/// $func is a function of a typed triplet iterator and $token is a either {`ref`} or +/// {`ref`, `mut`} +macro_rules! triplet_enum_func { + ($self:ident, $func:ident, $( $token:tt ),*) => ({ + match *$self { + TripletIter::BoolTripletIter($($token)* typed) => typed.$func(), + TripletIter::Int32TripletIter($($token)* typed) => typed.$func(), + TripletIter::Int64TripletIter($($token)* typed) => typed.$func(), + TripletIter::Int96TripletIter($($token)* typed) => typed.$func(), + TripletIter::FloatTripletIter($($token)* typed) => typed.$func(), + TripletIter::DoubleTripletIter($($token)* typed) => typed.$func(), + TripletIter::ByteArrayTripletIter($($token)* typed) => typed.$func(), + TripletIter::FixedLenByteArrayTripletIter($($token)* typed) => typed.$func() + } + }); +} + +/// High level API wrapper on column reader. +/// Provides per-element access for each primitive column. +pub enum TripletIter { + BoolTripletIter(TypedTripletIter), + Int32TripletIter(TypedTripletIter), + Int64TripletIter(TypedTripletIter), + Int96TripletIter(TypedTripletIter), + FloatTripletIter(TypedTripletIter), + DoubleTripletIter(TypedTripletIter), + ByteArrayTripletIter(TypedTripletIter), + FixedLenByteArrayTripletIter(TypedTripletIter), +} + +impl TripletIter { + /// Creates new triplet for column reader + pub fn new(descr: ColumnDescPtr, reader: ColumnReader, batch_size: usize) -> Self { + match descr.physical_type() { + PhysicalType::BOOLEAN => { + TripletIter::BoolTripletIter(TypedTripletIter::new(descr, batch_size, reader)) + } + PhysicalType::INT32 => { + TripletIter::Int32TripletIter(TypedTripletIter::new(descr, batch_size, reader)) + } + PhysicalType::INT64 => { + TripletIter::Int64TripletIter(TypedTripletIter::new(descr, batch_size, reader)) + } + PhysicalType::INT96 => { + TripletIter::Int96TripletIter(TypedTripletIter::new(descr, batch_size, reader)) + } + PhysicalType::FLOAT => { + TripletIter::FloatTripletIter(TypedTripletIter::new(descr, batch_size, reader)) + } + PhysicalType::DOUBLE => { + TripletIter::DoubleTripletIter(TypedTripletIter::new(descr, batch_size, reader)) + } + PhysicalType::BYTE_ARRAY => { + TripletIter::ByteArrayTripletIter(TypedTripletIter::new(descr, batch_size, reader)) + } + PhysicalType::FIXED_LEN_BYTE_ARRAY => TripletIter::FixedLenByteArrayTripletIter( + TypedTripletIter::new(descr, batch_size, reader), + ), + } + } + + /// Invokes underlying typed triplet iterator to buffer current value. + /// Should be called once - either before `is_null` or `current_value`. + #[inline] + pub fn read_next(&mut self) -> Result { + triplet_enum_func!(self, read_next, ref, mut) + } + + /// Provides check on values/levels left without invoking the underlying typed triplet + /// iterator. + /// Returns true if more values/levels exist, false otherwise. + /// It is always in sync with `read_next` method. 
+ #[inline] + pub fn has_next(&self) -> bool { + triplet_enum_func!(self, has_next, ref) + } + + /// Returns current definition level for a leaf triplet iterator + #[inline] + pub fn current_def_level(&self) -> i16 { + triplet_enum_func!(self, current_def_level, ref) + } + + /// Returns max definition level for a leaf triplet iterator + #[inline] + pub fn max_def_level(&self) -> i16 { + triplet_enum_func!(self, max_def_level, ref) + } + + /// Returns current repetition level for a leaf triplet iterator + #[inline] + pub fn current_rep_level(&self) -> i16 { + triplet_enum_func!(self, current_rep_level, ref) + } + + /// Returns max repetition level for a leaf triplet iterator + #[inline] + pub fn max_rep_level(&self) -> i16 { + triplet_enum_func!(self, max_rep_level, ref) + } + + /// Returns true, if current value is null. + /// Based on the fact that for non-null value current definition level + /// equals to max definition level. + #[inline] + pub fn is_null(&self) -> bool { + self.current_def_level() < self.max_def_level() + } + + /// Updates non-null value for current row. + pub fn current_value(&self) -> Field { + assert!(!self.is_null(), "Value is null"); + match *self { + TripletIter::BoolTripletIter(ref typed) => { + Field::convert_bool(typed.column_descr(), *typed.current_value()) + } + TripletIter::Int32TripletIter(ref typed) => { + Field::convert_int32(typed.column_descr(), *typed.current_value()) + } + TripletIter::Int64TripletIter(ref typed) => { + Field::convert_int64(typed.column_descr(), *typed.current_value()) + } + TripletIter::Int96TripletIter(ref typed) => { + Field::convert_int96(typed.column_descr(), typed.current_value().clone()) + } + TripletIter::FloatTripletIter(ref typed) => { + Field::convert_float(typed.column_descr(), *typed.current_value()) + } + TripletIter::DoubleTripletIter(ref typed) => { + Field::convert_double(typed.column_descr(), *typed.current_value()) + } + TripletIter::ByteArrayTripletIter(ref typed) => { + Field::convert_byte_array(typed.column_descr(), typed.current_value().clone()) + } + TripletIter::FixedLenByteArrayTripletIter(ref typed) => { + Field::convert_byte_array(typed.column_descr(), typed.current_value().clone()) + } + } + } +} + +/// Internal typed triplet iterator as a wrapper for column reader +/// (primitive leaf column), provides per-element access. +pub struct TypedTripletIter { + reader: ColumnReaderImpl, + column_descr: ColumnDescPtr, + batch_size: usize, + // type properties + max_def_level: i16, + max_rep_level: i16, + // values and levels + values: Vec, + def_levels: Option>, + rep_levels: Option>, + // current index for the triplet (value, def, rep) + curr_triplet_index: usize, + // how many triplets are left before we need to buffer + triplets_left: usize, + // helper flag to quickly check if we have more values/levels to read + has_next: bool, +} + +impl TypedTripletIter { + /// Creates new typed triplet iterator based on provided column reader. + /// Use batch size to specify the amount of values to buffer from column reader. 
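+    /// Definition and repetition level buffers are only allocated when the column's
+    /// corresponding max level is non-zero; flat required columns skip them.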
+ fn new(descr: ColumnDescPtr, batch_size: usize, column_reader: ColumnReader) -> Self { + assert!( + batch_size > 0, + "Expected positive batch size, found: {}", + batch_size + ); + + let max_def_level = descr.max_def_level(); + let max_rep_level = descr.max_rep_level(); + + let def_levels = if max_def_level == 0 { + None + } else { + Some(vec![0; batch_size]) + }; + let rep_levels = if max_rep_level == 0 { + None + } else { + Some(vec![0; batch_size]) + }; + + Self { + reader: get_typed_column_reader(column_reader), + column_descr: descr, + batch_size, + max_def_level, + max_rep_level, + values: vec![T::T::default(); batch_size], + def_levels, + rep_levels, + curr_triplet_index: 0, + triplets_left: 0, + has_next: false, + } + } + + /// Returns column descriptor reference for the current typed triplet iterator. + #[inline] + pub fn column_descr(&self) -> &ColumnDescPtr { + &self.column_descr + } + + /// Returns maximum definition level for the triplet iterator (leaf column). + #[inline] + fn max_def_level(&self) -> i16 { + self.max_def_level + } + + /// Returns maximum repetition level for the triplet iterator (leaf column). + #[inline] + fn max_rep_level(&self) -> i16 { + self.max_rep_level + } + + /// Returns current value. + /// Method does not advance the iterator, therefore can be called multiple times. + #[inline] + fn current_value(&self) -> &T::T { + assert!( + self.current_def_level() == self.max_def_level(), + "Cannot extract value, max definition level: {}, current level: {}", + self.max_def_level(), + self.current_def_level() + ); + &self.values[self.curr_triplet_index] + } + + /// Returns current definition level. + /// If field is required, then maximum definition level is returned. + #[inline] + fn current_def_level(&self) -> i16 { + match self.def_levels { + Some(ref vec) => vec[self.curr_triplet_index], + None => self.max_def_level, + } + } + + /// Returns current repetition level. + /// If field is required, then maximum repetition level is returned. + #[inline] + fn current_rep_level(&self) -> i16 { + match self.rep_levels { + Some(ref vec) => vec[self.curr_triplet_index], + None => self.max_rep_level, + } + } + + /// Quick check if iterator has more values/levels to read. + /// It is updated as a result of `read_next` method, so they are synchronized. + #[inline] + fn has_next(&self) -> bool { + self.has_next + } + + /// Advances to the next triplet. + /// Returns true, if there are more records to read, false there are no records left. + fn read_next(&mut self) -> Result { + self.curr_triplet_index += 1; + + if self.curr_triplet_index >= self.triplets_left { + let (values_read, levels_read) = { + // Get slice of definition levels, if available + let def_levels = match self.def_levels { + Some(ref mut vec) => Some(&mut vec[..]), + None => None, + }; + + // Get slice of repetition levels, if available + let rep_levels = match self.rep_levels { + Some(ref mut vec) => Some(&mut vec[..]), + None => None, + }; + + // Buffer triplets + self.reader + .read_batch(self.batch_size, def_levels, rep_levels, &mut self.values)? 
+ }; + + // No more values or levels to read + if values_read == 0 && levels_read == 0 { + self.has_next = false; + return Ok(false); + } + + // We never read values more than levels + if levels_read == 0 || values_read == levels_read { + // There are no definition levels to read, column is required + // or definition levels match values, so it does not require spacing + self.curr_triplet_index = 0; + self.triplets_left = values_read; + } else if values_read < levels_read { + // Add spacing for triplets. + // The idea is setting values for positions in def_levels when current definition + // level equals to maximum definition level. Values and levels are guaranteed to + // line up, because of the column reader method. + + // Note: if values_read == 0, then spacing will not be triggered + let mut idx = values_read; + let def_levels = self.def_levels.as_ref().unwrap(); + for i in 0..levels_read { + if def_levels[levels_read - i - 1] == self.max_def_level { + idx -= 1; // This is done to avoid usize becoming a negative value + self.values.swap(levels_read - i - 1, idx); + } + } + self.curr_triplet_index = 0; + self.triplets_left = levels_read; + } else { + return Err(general_err!( + "Spacing of values/levels is wrong, values_read: {}, levels_read: {}", + values_read, + levels_read + )); + } + } + + self.has_next = true; + Ok(true) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::parquet::file::reader::{FileReader, SerializedFileReader}; + use crate::parquet::schema::types::ColumnPath; + use crate::parquet::util::test_common::get_test_file; + + #[test] + #[should_panic(expected = "Expected positive batch size, found: 0")] + fn test_triplet_zero_batch_size() { + let column_path = ColumnPath::from(vec!["b_struct".to_string(), "b_c_int".to_string()]); + test_column_in_file( + "nulls.snappy.parquet", + 0, + &column_path, + &vec![], + &vec![], + &vec![], + ); + } + + #[test] + fn test_triplet_null_column() { + let path = vec!["b_struct", "b_c_int"]; + let values = vec![]; + let def_levels = vec![1, 1, 1, 1, 1, 1, 1, 1]; + let rep_levels = vec![0, 0, 0, 0, 0, 0, 0, 0]; + test_triplet_iter( + "nulls.snappy.parquet", + path, + &values, + &def_levels, + &rep_levels, + ); + } + + #[test] + fn test_triplet_required_column() { + let path = vec!["ID"]; + let values = vec![Field::Long(8)]; + let def_levels = vec![0]; + let rep_levels = vec![0]; + test_triplet_iter( + "nonnullable.impala.parquet", + path, + &values, + &def_levels, + &rep_levels, + ); + } + + #[test] + fn test_triplet_optional_column() { + let path = vec!["nested_struct", "A"]; + let values = vec![Field::Int(1), Field::Int(7)]; + let def_levels = vec![2, 1, 1, 1, 1, 0, 2]; + let rep_levels = vec![0, 0, 0, 0, 0, 0, 0]; + test_triplet_iter( + "nullable.impala.parquet", + path, + &values, + &def_levels, + &rep_levels, + ); + } + + #[test] + fn test_triplet_optional_list_column() { + let path = vec!["a", "list", "element", "list", "element", "list", "element"]; + let values = vec![ + Field::Str("a".to_string()), + Field::Str("b".to_string()), + Field::Str("c".to_string()), + Field::Str("d".to_string()), + Field::Str("a".to_string()), + Field::Str("b".to_string()), + Field::Str("c".to_string()), + Field::Str("d".to_string()), + Field::Str("e".to_string()), + Field::Str("a".to_string()), + Field::Str("b".to_string()), + Field::Str("c".to_string()), + Field::Str("d".to_string()), + Field::Str("e".to_string()), + Field::Str("f".to_string()), + ]; + let def_levels = vec![7, 7, 7, 4, 7, 7, 7, 7, 7, 4, 7, 7, 7, 7, 7, 7, 4, 7]; + let 
rep_levels = vec![0, 3, 2, 1, 2, 0, 3, 2, 3, 1, 2, 0, 3, 2, 3, 2, 1, 2]; + test_triplet_iter( + "nested_lists.snappy.parquet", + path, + &values, + &def_levels, + &rep_levels, + ); + } + + #[test] + fn test_triplet_optional_map_column() { + let path = vec!["a", "key_value", "value", "key_value", "key"]; + let values = vec![ + Field::Int(1), + Field::Int(2), + Field::Int(1), + Field::Int(1), + Field::Int(3), + Field::Int(4), + Field::Int(5), + ]; + let def_levels = vec![4, 4, 4, 2, 3, 4, 4, 4, 4]; + let rep_levels = vec![0, 2, 0, 0, 0, 0, 0, 2, 2]; + test_triplet_iter( + "nested_maps.snappy.parquet", + path, + &values, + &def_levels, + &rep_levels, + ); + } + + // Check triplet iterator across different batch sizes + fn test_triplet_iter( + file_name: &str, + column_path: Vec<&str>, + expected_values: &[Field], + expected_def_levels: &[i16], + expected_rep_levels: &[i16], + ) { + // Convert path into column path + let path: Vec = column_path.iter().map(|x| x.to_string()).collect(); + let column_path = ColumnPath::from(path); + + let batch_sizes = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 128, 256]; + for batch_size in batch_sizes { + test_column_in_file( + file_name, + batch_size, + &column_path, + expected_values, + expected_def_levels, + expected_rep_levels, + ); + } + } + + // Check values of a selectd column in a file + fn test_column_in_file( + file_name: &str, + batch_size: usize, + column_path: &ColumnPath, + expected_values: &[Field], + expected_def_levels: &[i16], + expected_rep_levels: &[i16], + ) { + let file = get_test_file(file_name); + let file_reader = SerializedFileReader::new(file).unwrap(); + // Get schema descriptor + let file_metadata = file_reader.metadata().file_metadata(); + let schema = file_metadata.schema_descr(); + // Get first row group + let row_group_reader = file_reader.get_row_group(0).unwrap(); + + for i in 0..schema.num_columns() { + let descr = schema.column(i); + if descr.path() == column_path { + let reader = row_group_reader.get_column_reader(i).unwrap(); + test_triplet_column( + descr, + reader, + batch_size, + expected_values, + expected_def_levels, + expected_rep_levels, + ); + } + } + } + + // Check values for individual triplet iterator + fn test_triplet_column( + descr: ColumnDescPtr, + reader: ColumnReader, + batch_size: usize, + expected_values: &[Field], + expected_def_levels: &[i16], + expected_rep_levels: &[i16], + ) { + let mut iter = TripletIter::new(descr.clone(), reader, batch_size); + let mut values: Vec = Vec::new(); + let mut def_levels: Vec = Vec::new(); + let mut rep_levels: Vec = Vec::new(); + + assert_eq!(iter.max_def_level(), descr.max_def_level()); + assert_eq!(iter.max_rep_level(), descr.max_rep_level()); + + while let Ok(true) = iter.read_next() { + assert!(iter.has_next()); + if !iter.is_null() { + values.push(iter.current_value()); + } + def_levels.push(iter.current_def_level()); + rep_levels.push(iter.current_rep_level()); + } + + assert_eq!(values, expected_values); + assert_eq!(def_levels, expected_def_levels); + assert_eq!(rep_levels, expected_rep_levels); + } +} diff --git a/rust/src/parquet/schema/mod.rs b/rust/src/parquet/schema/mod.rs new file mode 100644 index 0000000000000..5319504964627 --- /dev/null +++ b/rust/src/parquet/schema/mod.rs @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Parquet schema definitions and methods to print and parse schema. +//! +//! # Example +//! +//! ```rust +//! use arrow::parquet::{ +//! basic::{LogicalType, Repetition, Type as PhysicalType}, +//! schema::{parser, printer, types::Type}, +//! }; +//! use std::rc::Rc; +//! +//! // Create the following schema: +//! // +//! // message schema { +//! // OPTIONAL BYTE_ARRAY a (UTF8); +//! // REQUIRED INT32 b; +//! // } +//! +//! let field_a = Type::primitive_type_builder("a", PhysicalType::BYTE_ARRAY) +//! .with_logical_type(LogicalType::UTF8) +//! .with_repetition(Repetition::OPTIONAL) +//! .build() +//! .unwrap(); +//! +//! let field_b = Type::primitive_type_builder("b", PhysicalType::INT32) +//! .with_repetition(Repetition::REQUIRED) +//! .build() +//! .unwrap(); +//! +//! let schema = Type::group_type_builder("schema") +//! .with_fields(&mut vec![Rc::new(field_a), Rc::new(field_b)]) +//! .build() +//! .unwrap(); +//! +//! let mut buf = Vec::new(); +//! +//! // Print schema into buffer +//! printer::print_schema(&mut buf, &schema); +//! +//! // Parse schema from the string +//! let string_schema = String::from_utf8(buf).unwrap(); +//! let parsed_schema = parser::parse_message_type(&string_schema).unwrap(); +//! +//! assert_eq!(schema, parsed_schema); +//! ``` + +pub mod parser; +pub mod printer; +pub mod types; diff --git a/rust/src/parquet/schema/parser.rs b/rust/src/parquet/schema/parser.rs new file mode 100644 index 0000000000000..2890c84a755ba --- /dev/null +++ b/rust/src/parquet/schema/parser.rs @@ -0,0 +1,764 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Parquet schema parser. +//! Provides methods to parse and validate string message type into Parquet +//! [`Type`](`::schema::types::Type`). +//! +//! # Example +//! +//! ```rust +//! use arrow::parquet::schema::parser::parse_message_type; +//! +//! let message_type = " +//! message spark_schema { +//! OPTIONAL BYTE_ARRAY a (UTF8); +//! REQUIRED INT32 b; +//! REQUIRED DOUBLE c; +//! REQUIRED BOOLEAN d; +//! OPTIONAL group e (LIST) { +//! REPEATED group list { +//! REQUIRED INT32 element; +//! } +//! } +//! } +//! "; +//! +//! 
let schema = parse_message_type(message_type).expect("Expected valid schema"); +//! println!("{:?}", schema); +//! ``` + +use std::rc::Rc; + +use crate::parquet::basic::{LogicalType, Repetition, Type as PhysicalType}; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::schema::types::{Type, TypePtr}; + +/// Parses message type as string into a Parquet [`Type`](`::schema::types::Type`) which, +/// for example, could be used to extract individual columns. Returns Parquet general +/// error when parsing or validation fails. +pub fn parse_message_type<'a>(message_type: &'a str) -> Result { + let mut parser = Parser { + tokenizer: &mut Tokenizer::from_str(message_type), + }; + parser.parse_message_type() +} + +/// Tokenizer to split message type string into tokens that are separated using characters +/// defined in `is_schema_delim` method. Tokenizer also preserves delimiters as tokens. +/// Tokenizer provides Iterator interface to process tokens; it also allows to step back +/// to reprocess previous tokens. +struct Tokenizer<'a> { + // List of all tokens for a string + tokens: Vec<&'a str>, + // Current index of vector + index: usize, +} + +impl<'a> Tokenizer<'a> { + // Create tokenizer from message type string + pub fn from_str(string: &'a str) -> Self { + let vec = string + .split_whitespace() + .flat_map(|t| Self::split_token(t)) + .collect(); + Tokenizer { + tokens: vec, + index: 0, + } + } + + // List of all special characters in schema + fn is_schema_delim(c: char) -> bool { + c == ';' || c == '{' || c == '}' || c == '(' || c == ')' || c == '=' || c == ',' + } + + /// Splits string into tokens; input string can already be token or can contain + /// delimiters, e.g. required" -> Vec("required") and + /// "(UTF8);" -> Vec("(", "UTF8", ")", ";") + fn split_token(string: &str) -> Vec<&str> { + let mut buffer: Vec<&str> = Vec::new(); + let mut tail = string; + while let Some(index) = tail.find(Self::is_schema_delim) { + let (h, t) = tail.split_at(index); + if !h.is_empty() { + buffer.push(h); + } + buffer.push(&t[0..1]); + tail = &t[1..]; + } + if !tail.is_empty() { + buffer.push(tail); + } + buffer + } + + // Move pointer to a previous element + fn backtrack(&mut self) { + self.index -= 1; + } +} + +impl<'a> Iterator for Tokenizer<'a> { + type Item = &'a str; + + fn next(&mut self) -> Option<&'a str> { + if self.index < self.tokens.len() { + self.index += 1; + Some(self.tokens[self.index - 1]) + } else { + None + } + } +} + +/// Internal Schema parser. +/// Traverses message type using tokenizer and parses each group/primitive type +/// recursively. +struct Parser<'a> { + tokenizer: &'a mut Tokenizer<'a>, +} + +// Utility function to assert token on validity. +fn assert_token(token: Option<&str>, expected: &str) -> Result<()> { + match token { + Some(value) if value == expected => Ok(()), + Some(other) => Err(general_err!( + "Expected '{}', found token '{}'", + expected, + other + )), + None => Err(general_err!( + "Expected '{}', but no token found (None)", + expected + )), + } +} + +// Utility function to parse i32 or return general error. +fn parse_i32(value: Option<&str>, not_found_msg: &str, parse_fail_msg: &str) -> Result { + value + .ok_or(general_err!(not_found_msg)) + .and_then(|v| v.parse::().map_err(|_| general_err!(parse_fail_msg))) +} + +impl<'a> Parser<'a> { + // Entry function to parse message type, uses internal tokenizer. + fn parse_message_type(&mut self) -> Result { + // Check that message type starts with "message". 
+    match self.tokenizer.next() {
+      Some("message") => {
+        let name = self
+          .tokenizer
+          .next()
+          .ok_or(general_err!("Expected name, found None"))?;
+        let mut fields = self.parse_child_types()?;
+        Type::group_type_builder(name)
+          .with_fields(&mut fields)
+          .build()
+      }
+      _ => Err(general_err!("Message type does not start with 'message'")),
+    }
+  }
+
+  // Parses child types for the current group type.
+  // This is only invoked on root and group types.
+  fn parse_child_types(&mut self) -> Result<Vec<TypePtr>> {
+    assert_token(self.tokenizer.next(), "{")?;
+    let mut vec = Vec::new();
+    while let Some(value) = self.tokenizer.next() {
+      if value == "}" {
+        break;
+      } else {
+        self.tokenizer.backtrack();
+        vec.push(Rc::new(self.add_type()?));
+      }
+    }
+    Ok(vec)
+  }
+
+  fn add_type(&mut self) -> Result<Type> {
+    // Parse repetition
+    let repetition = self
+      .tokenizer
+      .next()
+      .ok_or(general_err!("Expected repetition, found None"))
+      .and_then(|v| v.to_uppercase().parse::<Repetition>())?;
+
+    match self.tokenizer.next() {
+      Some(group) if group.to_uppercase() == "GROUP" => self.add_group_type(Some(repetition)),
+      Some(type_string) => {
+        let physical_type = type_string.to_uppercase().parse::<PhysicalType>()?;
+        self.add_primitive_type(repetition, physical_type)
+      }
+      None => Err(general_err!("Invalid type, could not extract next token")),
+    }
+  }
+
+  fn add_group_type(&mut self, repetition: Option<Repetition>) -> Result<Type> {
+    // Parse name of the group type
+    let name = self
+      .tokenizer
+      .next()
+      .ok_or(general_err!("Expected name, found None"))?;
+
+    // Parse logical type if exists
+    let logical_type = if let Some("(") = self.tokenizer.next() {
+      let tpe = self
+        .tokenizer
+        .next()
+        .ok_or(general_err!("Expected logical type, found None"))
+        .and_then(|v| v.to_uppercase().parse::<LogicalType>())?;
+      assert_token(self.tokenizer.next(), ")")?;
+      tpe
+    } else {
+      self.tokenizer.backtrack();
+      LogicalType::NONE
+    };
+
+    // Parse optional id
+    let id = if let Some("=") = self.tokenizer.next() {
+      self.tokenizer.next().and_then(|v| v.parse::<i32>().ok())
+    } else {
+      self.tokenizer.backtrack();
+      None
+    };
+
+    let mut fields = self.parse_child_types()?;
+    let mut builder = Type::group_type_builder(name)
+      .with_logical_type(logical_type)
+      .with_fields(&mut fields);
+    if let Some(rep) = repetition {
+      builder = builder.with_repetition(rep);
+    }
+    if let Some(id) = id {
+      builder = builder.with_id(id);
+    }
+    builder.build()
+  }
+
+  fn add_primitive_type(
+    &mut self,
+    repetition: Repetition,
+    physical_type: PhysicalType,
+  ) -> Result<Type> {
+    // Read type length if the type is FIXED_LEN_BYTE_ARRAY.
+ let mut length: i32 = -1; + if physical_type == PhysicalType::FIXED_LEN_BYTE_ARRAY { + assert_token(self.tokenizer.next(), "(")?; + length = parse_i32( + self.tokenizer.next(), + "Expected length for FIXED_LEN_BYTE_ARRAY, found None", + "Failed to parse length for FIXED_LEN_BYTE_ARRAY", + )?; + assert_token(self.tokenizer.next(), ")")?; + } + + // Parse name of the primitive type + let name = self + .tokenizer + .next() + .ok_or(general_err!("Expected name, found None"))?; + + // Parse logical type + let (logical_type, precision, scale) = if let Some("(") = self.tokenizer.next() { + let tpe = self + .tokenizer + .next() + .ok_or(general_err!("Expected logical type, found None")) + .and_then(|v| v.to_uppercase().parse::())?; + + // Parse precision and scale for decimals + let mut precision: i32 = -1; + let mut scale: i32 = -1; + + if tpe == LogicalType::DECIMAL { + if let Some("(") = self.tokenizer.next() { + // Parse precision + precision = parse_i32( + self.tokenizer.next(), + "Expected precision, found None", + "Failed to parse precision for DECIMAL type", + )?; + + // Parse scale + scale = if let Some(",") = self.tokenizer.next() { + parse_i32( + self.tokenizer.next(), + "Expected scale, found None", + "Failed to parse scale for DECIMAL type", + )? + } else { + // Scale is not provided, set it to 0. + self.tokenizer.backtrack(); + 0 + }; + + assert_token(self.tokenizer.next(), ")")?; + } else { + self.tokenizer.backtrack(); + } + } + + assert_token(self.tokenizer.next(), ")")?; + (tpe, precision, scale) + } else { + self.tokenizer.backtrack(); + (LogicalType::NONE, -1, -1) + }; + + // Parse optional id + let id = if let Some("=") = self.tokenizer.next() { + self.tokenizer.next().and_then(|v| v.parse::().ok()) + } else { + self.tokenizer.backtrack(); + None + }; + assert_token(self.tokenizer.next(), ";")?; + + let mut builder = Type::primitive_type_builder(name, physical_type) + .with_repetition(repetition) + .with_logical_type(logical_type) + .with_length(length) + .with_precision(precision) + .with_scale(scale); + if let Some(id) = id { + builder = builder.with_id(id); + } + Ok(builder.build()?) 
+ } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_tokenize_empty_string() { + assert_eq!(Tokenizer::from_str("").next(), None); + } + + #[test] + fn test_tokenize_delimiters() { + let mut iter = Tokenizer::from_str(",;{}()="); + assert_eq!(iter.next(), Some(",")); + assert_eq!(iter.next(), Some(";")); + assert_eq!(iter.next(), Some("{")); + assert_eq!(iter.next(), Some("}")); + assert_eq!(iter.next(), Some("(")); + assert_eq!(iter.next(), Some(")")); + assert_eq!(iter.next(), Some("=")); + assert_eq!(iter.next(), None); + } + + #[test] + fn test_tokenize_delimiters_with_whitespaces() { + let mut iter = Tokenizer::from_str(" , ; { } ( ) = "); + assert_eq!(iter.next(), Some(",")); + assert_eq!(iter.next(), Some(";")); + assert_eq!(iter.next(), Some("{")); + assert_eq!(iter.next(), Some("}")); + assert_eq!(iter.next(), Some("(")); + assert_eq!(iter.next(), Some(")")); + assert_eq!(iter.next(), Some("=")); + assert_eq!(iter.next(), None); + } + + #[test] + fn test_tokenize_words() { + let mut iter = Tokenizer::from_str("abc def ghi jkl mno"); + assert_eq!(iter.next(), Some("abc")); + assert_eq!(iter.next(), Some("def")); + assert_eq!(iter.next(), Some("ghi")); + assert_eq!(iter.next(), Some("jkl")); + assert_eq!(iter.next(), Some("mno")); + assert_eq!(iter.next(), None); + } + + #[test] + fn test_tokenize_backtrack() { + let mut iter = Tokenizer::from_str("abc;"); + assert_eq!(iter.next(), Some("abc")); + assert_eq!(iter.next(), Some(";")); + iter.backtrack(); + assert_eq!(iter.next(), Some(";")); + assert_eq!(iter.next(), None); + } + + #[test] + fn test_tokenize_message_type() { + let schema = " + message schema { + required int32 a; + optional binary c (UTF8); + required group d { + required int32 a; + optional binary c (UTF8); + } + required group e (LIST) { + repeated group list { + required int32 element; + } + } + } + "; + let mut iter = Tokenizer::from_str(schema); + let mut res = Vec::new(); + while let Some(token) = iter.next() { + res.push(token); + } + assert_eq!( + res, + vec![ + "message", "schema", "{", "required", "int32", "a", ";", "optional", "binary", "c", + "(", "UTF8", ")", ";", "required", "group", "d", "{", "required", "int32", "a", + ";", "optional", "binary", "c", "(", "UTF8", ")", ";", "}", "required", "group", + "e", "(", "LIST", ")", "{", "repeated", "group", "list", "{", "required", "int32", + "element", ";", "}", "}", "}" + ] + ); + } + + #[test] + fn test_assert_token() { + assert!(assert_token(Some("a"), "a").is_ok()); + assert!(assert_token(Some("a"), "b").is_err()); + assert!(assert_token(None, "b").is_err()); + } + + #[test] + fn test_parse_message_type_invalid() { + let mut iter = Tokenizer::from_str("test"); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_err()); + assert_eq!( + result.unwrap_err().to_string(), + "Parquet error: Message type does not start with 'message'" + ); + } + + #[test] + fn test_parse_message_type_no_name() { + let mut iter = Tokenizer::from_str("message"); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_err()); + assert_eq!( + result.unwrap_err().to_string(), + "Parquet error: Expected name, found None" + ); + } + + #[test] + fn test_parse_message_type_fixed_byte_array() { + let schema = " + message schema { + REQUIRED FIXED_LEN_BYTE_ARRAY col; + } + "; + let mut iter = Tokenizer::from_str(schema); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_err()); + + let 
schema = " + message schema { + REQUIRED FIXED_LEN_BYTE_ARRAY(16) col; + } + "; + let mut iter = Tokenizer::from_str(schema); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_ok()); + } + + #[test] + fn test_parse_message_type_decimal() { + // It is okay for decimal to omit precision and scale with right syntax. + // Here we test wrong syntax of decimal type + + // Invalid decimal syntax + let schema = " + message root { + optional int32 f1 (DECIMAL(); + } + "; + let mut iter = Tokenizer::from_str(schema); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_err()); + + // Invalid decimal, need precision and scale + let schema = " + message root { + optional int32 f1 (DECIMAL()); + } + "; + let mut iter = Tokenizer::from_str(schema); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_err()); + + // Invalid decimal because of `,` - has precision, needs scale + let schema = " + message root { + optional int32 f1 (DECIMAL(8,)); + } + "; + let mut iter = Tokenizer::from_str(schema); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_err()); + + // Invalid decimal because, we always require either precision or scale to be + // specified as part of logical type + let schema = " + message root { + optional int32 f3 (DECIMAL); + } + "; + let mut iter = Tokenizer::from_str(schema); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_err()); + + // Valid decimal (precision, scale) + let schema = " + message root { + optional int32 f1 (DECIMAL(8, 3)); + optional int32 f2 (DECIMAL(8)); + } + "; + let mut iter = Tokenizer::from_str(schema); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_ok()); + } + + #[test] + fn test_parse_message_type_compare_1() { + let schema = " + message root { + optional fixed_len_byte_array(5) f1 (DECIMAL(9, 3)); + optional fixed_len_byte_array (16) f2 (DECIMAL (38, 18)); + } + "; + let mut iter = Tokenizer::from_str(schema); + let message = Parser { + tokenizer: &mut iter, + } + .parse_message_type() + .unwrap(); + + let expected = Type::group_type_builder("root") + .with_fields(&mut vec![ + Rc::new( + Type::primitive_type_builder("f1", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_logical_type(LogicalType::DECIMAL) + .with_length(5) + .with_precision(9) + .with_scale(3) + .build() + .unwrap(), + ), + Rc::new( + Type::primitive_type_builder("f2", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_logical_type(LogicalType::DECIMAL) + .with_length(16) + .with_precision(38) + .with_scale(18) + .build() + .unwrap(), + ), + ]) + .build() + .unwrap(); + + assert_eq!(message, expected); + } + + #[test] + fn test_parse_message_type_compare_2() { + let schema = " + message root { + required group a0 { + optional group a1 (LIST) { + repeated binary a2 (UTF8); + } + + optional group b1 (LIST) { + repeated group b2 { + optional int32 b3; + optional double b4; + } + } + } + } + "; + let mut iter = Tokenizer::from_str(schema); + let message = Parser { + tokenizer: &mut iter, + } + .parse_message_type() + .unwrap(); + + let expected = Type::group_type_builder("root") + .with_fields(&mut vec![Rc::new( + Type::group_type_builder("a0") + .with_repetition(Repetition::REQUIRED) + .with_fields(&mut vec![ + Rc::new( + Type::group_type_builder("a1") + .with_repetition(Repetition::OPTIONAL) + .with_logical_type(LogicalType::LIST) + 
.with_fields(&mut vec![Rc::new( + Type::primitive_type_builder("a2", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REPEATED) + .with_logical_type(LogicalType::UTF8) + .build() + .unwrap(), + )]) + .build() + .unwrap(), + ), + Rc::new( + Type::group_type_builder("b1") + .with_repetition(Repetition::OPTIONAL) + .with_logical_type(LogicalType::LIST) + .with_fields(&mut vec![Rc::new( + Type::group_type_builder("b2") + .with_repetition(Repetition::REPEATED) + .with_fields(&mut vec![ + Rc::new( + Type::primitive_type_builder( + "b3", + PhysicalType::INT32, + ) + .build() + .unwrap(), + ), + Rc::new( + Type::primitive_type_builder( + "b4", + PhysicalType::DOUBLE, + ) + .build() + .unwrap(), + ), + ]) + .build() + .unwrap(), + )]) + .build() + .unwrap(), + ), + ]) + .build() + .unwrap(), + )]) + .build() + .unwrap(); + + assert_eq!(message, expected); + } + + #[test] + fn test_parse_message_type_compare_3() { + let schema = " + message root { + required int32 _1 (INT_8); + required int32 _2 (INT_16); + required float _3; + required double _4; + optional int32 _5 (DATE); + optional binary _6 (UTF8); + } + "; + let mut iter = Tokenizer::from_str(schema); + let message = Parser { + tokenizer: &mut iter, + } + .parse_message_type() + .unwrap(); + + let mut fields = vec![ + Rc::new( + Type::primitive_type_builder("_1", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INT_8) + .build() + .unwrap(), + ), + Rc::new( + Type::primitive_type_builder("_2", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INT_16) + .build() + .unwrap(), + ), + Rc::new( + Type::primitive_type_builder("_3", PhysicalType::FLOAT) + .with_repetition(Repetition::REQUIRED) + .build() + .unwrap(), + ), + Rc::new( + Type::primitive_type_builder("_4", PhysicalType::DOUBLE) + .with_repetition(Repetition::REQUIRED) + .build() + .unwrap(), + ), + Rc::new( + Type::primitive_type_builder("_5", PhysicalType::INT32) + .with_logical_type(LogicalType::DATE) + .build() + .unwrap(), + ), + Rc::new( + Type::primitive_type_builder("_6", PhysicalType::BYTE_ARRAY) + .with_logical_type(LogicalType::UTF8) + .build() + .unwrap(), + ), + ]; + + let expected = Type::group_type_builder("root") + .with_fields(&mut fields) + .build() + .unwrap(); + assert_eq!(message, expected); + } +} diff --git a/rust/src/parquet/schema/printer.rs b/rust/src/parquet/schema/printer.rs new file mode 100644 index 0000000000000..d61f116eb9e70 --- /dev/null +++ b/rust/src/parquet/schema/printer.rs @@ -0,0 +1,467 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Parquet schema printer. +//! Provides methods to print Parquet file schema and list file metadata. +//! +//! # Example +//! +//! ```rust +//! 
use arrow::parquet::{ +//! file::reader::{FileReader, SerializedFileReader}, +//! schema::printer::{print_file_metadata, print_parquet_metadata, print_schema}, +//! }; +//! use std::{fs::File, path::Path}; +//! +//! // Open a file +//! let path = Path::new("test.parquet"); +//! if let Ok(file) = File::open(&path) { +//! let reader = SerializedFileReader::new(file).unwrap(); +//! let parquet_metadata = reader.metadata(); +//! +//! print_parquet_metadata(&mut std::io::stdout(), &parquet_metadata); +//! print_file_metadata(&mut std::io::stdout(), &parquet_metadata.file_metadata()); +//! +//! print_schema( +//! &mut std::io::stdout(), +//! &parquet_metadata.file_metadata().schema(), +//! ); +//! } +//! ``` + +use std::{fmt, io}; + +use crate::parquet::basic::{LogicalType, Type as PhysicalType}; +use crate::parquet::file::metadata::{ + ColumnChunkMetaData, FileMetaData, ParquetMetaData, RowGroupMetaData, +}; +use crate::parquet::schema::types::Type; + +/// Prints Parquet metadata [`ParquetMetaData`](`::file::metadata::ParquetMetaData`) +/// information. +#[allow(unused_must_use)] +pub fn print_parquet_metadata(out: &mut io::Write, metadata: &ParquetMetaData) { + print_file_metadata(out, &metadata.file_metadata()); + writeln!(out, ""); + writeln!(out, ""); + writeln!(out, "num of row groups: {}", metadata.num_row_groups()); + writeln!(out, "row groups:"); + writeln!(out, ""); + for (i, rg) in metadata.row_groups().iter().enumerate() { + writeln!(out, "row group {}:", i); + print_dashes(out, 80); + print_row_group_metadata(out, rg); + } +} + +/// Prints file metadata [`FileMetaData`](`::file::metadata::FileMetaData`) information. +#[allow(unused_must_use)] +pub fn print_file_metadata(out: &mut io::Write, file_metadata: &FileMetaData) { + writeln!(out, "version: {}", file_metadata.version()); + writeln!(out, "num of rows: {}", file_metadata.num_rows()); + if let Some(created_by) = file_metadata.created_by().as_ref() { + writeln!(out, "created by: {}", created_by); + } + let schema = file_metadata.schema(); + print_schema(out, schema); +} + +/// Prints Parquet [`Type`](`::schema::types::Type`) information. +#[allow(unused_must_use)] +pub fn print_schema(out: &mut io::Write, tp: &Type) { + // TODO: better if we can pass fmt::Write to Printer. + // But how can we make it to accept both io::Write & fmt::Write? 
+ let mut s = String::new(); + { + let mut printer = Printer::new(&mut s); + printer.print(tp); + } + writeln!(out, "{}", s); +} + +#[allow(unused_must_use)] +fn print_row_group_metadata(out: &mut io::Write, rg_metadata: &RowGroupMetaData) { + writeln!(out, "total byte size: {}", rg_metadata.total_byte_size()); + writeln!(out, "num of rows: {}", rg_metadata.num_rows()); + writeln!(out, ""); + writeln!(out, "num of columns: {}", rg_metadata.num_columns()); + writeln!(out, "columns: "); + for (i, cc) in rg_metadata.columns().iter().enumerate() { + writeln!(out, ""); + writeln!(out, "column {}:", i); + print_dashes(out, 80); + print_column_chunk_metadata(out, cc); + } +} + +#[allow(unused_must_use)] +fn print_column_chunk_metadata(out: &mut io::Write, cc_metadata: &ColumnChunkMetaData) { + writeln!(out, "column type: {}", cc_metadata.column_type()); + writeln!(out, "column path: {}", cc_metadata.column_path()); + let encoding_strs: Vec<_> = cc_metadata + .encodings() + .iter() + .map(|e| format!("{}", e)) + .collect(); + writeln!(out, "encodings: {}", encoding_strs.join(" ")); + let file_path_str = match cc_metadata.file_path() { + None => "N/A", + Some(ref fp) => *fp, + }; + writeln!(out, "file path: {}", file_path_str); + writeln!(out, "file offset: {}", cc_metadata.file_offset()); + writeln!(out, "num of values: {}", cc_metadata.num_values()); + writeln!( + out, + "total compressed size (in bytes): {}", + cc_metadata.compressed_size() + ); + writeln!( + out, + "total uncompressed size (in bytes): {}", + cc_metadata.uncompressed_size() + ); + writeln!(out, "data page offset: {}", cc_metadata.data_page_offset()); + let index_page_offset_str = match cc_metadata.index_page_offset() { + None => "N/A".to_owned(), + Some(ipo) => ipo.to_string(), + }; + writeln!(out, "index page offset: {}", index_page_offset_str); + let dict_page_offset_str = match cc_metadata.dictionary_page_offset() { + None => "N/A".to_owned(), + Some(dpo) => dpo.to_string(), + }; + writeln!(out, "dictionary page offset: {}", dict_page_offset_str); + let statistics_str = match cc_metadata.statistics() { + None => "N/A".to_owned(), + Some(stats) => stats.to_string(), + }; + writeln!(out, "statistics: {}", statistics_str); + writeln!(out, ""); +} + +#[allow(unused_must_use)] +fn print_dashes(out: &mut io::Write, num: i32) { + for _ in 0..num { + write!(out, "-"); + } + writeln!(out, ""); +} + +const INDENT_WIDTH: i32 = 2; + +/// Struct for printing Parquet message type. +struct Printer<'a> { + output: &'a mut fmt::Write, + indent: i32, +} + +#[allow(unused_must_use)] +impl<'a> Printer<'a> { + fn new(output: &'a mut fmt::Write) -> Self { + Printer { output, indent: 0 } + } + + fn print_indent(&mut self) { + for _ in 0..self.indent { + write!(self.output, " "); + } + } +} + +#[allow(unused_must_use)] +impl<'a> Printer<'a> { + pub fn print(&mut self, tp: &Type) { + self.print_indent(); + match tp { + &Type::PrimitiveType { + ref basic_info, + physical_type, + type_length, + scale, + precision, + } => { + let phys_type_str = match physical_type { + PhysicalType::FIXED_LEN_BYTE_ARRAY => { + // We need to include length for fixed byte array + format!("{} ({})", physical_type, type_length) + } + _ => format!("{}", physical_type), + }; + // Also print logical type if it is available + let logical_type_str = match basic_info.logical_type() { + LogicalType::NONE => format!(""), + decimal @ LogicalType::DECIMAL => { + // For decimal type we should print precision and scale if they are > 0, e.g. 
+ // DECIMAL(9, 2) - DECIMAL(9) - DECIMAL + let precision_scale = match (precision, scale) { + (p, s) if p > 0 && s > 0 => format!(" ({}, {})", p, s), + (p, 0) if p > 0 => format!(" ({})", p), + _ => format!(""), + }; + format!(" ({}{})", decimal, precision_scale) + } + other_logical_type => format!(" ({})", other_logical_type), + }; + write!( + self.output, + "{} {} {}{};", + basic_info.repetition(), + phys_type_str, + basic_info.name(), + logical_type_str + ); + } + &Type::GroupType { + ref basic_info, + ref fields, + } => { + if basic_info.has_repetition() { + let r = basic_info.repetition(); + write!(self.output, "{} group {} ", r, basic_info.name()); + if basic_info.logical_type() != LogicalType::NONE { + write!(self.output, "({}) ", basic_info.logical_type()); + } + writeln!(self.output, "{{"); + } else { + writeln!(self.output, "message {} {{", basic_info.name()); + } + + self.indent += INDENT_WIDTH; + for c in fields { + self.print(&c); + writeln!(self.output, ""); + } + self.indent -= INDENT_WIDTH; + self.print_indent(); + write!(self.output, "}}"); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::rc::Rc; + + use crate::parquet::basic::{Repetition, Type as PhysicalType}; + use crate::parquet::schema::{parser::parse_message_type, types::Type}; + + fn assert_print_parse_message(message: Type) { + let mut s = String::new(); + { + let mut p = Printer::new(&mut s); + p.print(&message); + } + let parsed = parse_message_type(&s).unwrap(); + assert_eq!(message, parsed); + } + + #[test] + fn test_print_primitive_type() { + let mut s = String::new(); + { + let mut p = Printer::new(&mut s); + let foo = Type::primitive_type_builder("foo", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INT_32) + .build() + .unwrap(); + p.print(&foo); + } + assert_eq!(&mut s, "REQUIRED INT32 foo (INT_32);"); + } + + #[test] + fn test_print_primitive_type_without_logical() { + let mut s = String::new(); + { + let mut p = Printer::new(&mut s); + let foo = Type::primitive_type_builder("foo", PhysicalType::DOUBLE) + .with_repetition(Repetition::REQUIRED) + .build() + .unwrap(); + p.print(&foo); + } + assert_eq!(&mut s, "REQUIRED DOUBLE foo;"); + } + + #[test] + fn test_print_group_type() { + let mut s = String::new(); + { + let mut p = Printer::new(&mut s); + let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INT_32) + .with_id(0) + .build(); + let f2 = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY) + .with_logical_type(LogicalType::UTF8) + .with_id(1) + .build(); + let f3 = Type::primitive_type_builder("f3", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_repetition(Repetition::REPEATED) + .with_logical_type(LogicalType::INTERVAL) + .with_length(12) + .with_id(2) + .build(); + let mut struct_fields = Vec::new(); + struct_fields.push(Rc::new(f1.unwrap())); + struct_fields.push(Rc::new(f2.unwrap())); + let foo = Type::group_type_builder("foo") + .with_repetition(Repetition::OPTIONAL) + .with_fields(&mut struct_fields) + .with_id(1) + .build() + .unwrap(); + let mut fields = Vec::new(); + fields.push(Rc::new(foo)); + fields.push(Rc::new(f3.unwrap())); + let message = Type::group_type_builder("schema") + .with_fields(&mut fields) + .with_id(2) + .build() + .unwrap(); + p.print(&message); + } + let expected = "message schema { + OPTIONAL group foo { + REQUIRED INT32 f1 (INT_32); + OPTIONAL BYTE_ARRAY f2 (UTF8); + } + REPEATED FIXED_LEN_BYTE_ARRAY 
(12) f3 (INTERVAL); +}"; + assert_eq!(&mut s, expected); + } + + #[test] + fn test_print_and_parse_primitive() { + let a2 = Type::primitive_type_builder("a2", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REPEATED) + .with_logical_type(LogicalType::UTF8) + .build() + .unwrap(); + + let a1 = Type::group_type_builder("a1") + .with_repetition(Repetition::OPTIONAL) + .with_logical_type(LogicalType::LIST) + .with_fields(&mut vec![Rc::new(a2)]) + .build() + .unwrap(); + + let b3 = Type::primitive_type_builder("b3", PhysicalType::INT32) + .with_repetition(Repetition::OPTIONAL) + .build() + .unwrap(); + + let b4 = Type::primitive_type_builder("b4", PhysicalType::DOUBLE) + .with_repetition(Repetition::OPTIONAL) + .build() + .unwrap(); + + let b2 = Type::group_type_builder("b2") + .with_repetition(Repetition::REPEATED) + .with_logical_type(LogicalType::NONE) + .with_fields(&mut vec![Rc::new(b3), Rc::new(b4)]) + .build() + .unwrap(); + + let b1 = Type::group_type_builder("b1") + .with_repetition(Repetition::OPTIONAL) + .with_logical_type(LogicalType::LIST) + .with_fields(&mut vec![Rc::new(b2)]) + .build() + .unwrap(); + + let a0 = Type::group_type_builder("a0") + .with_repetition(Repetition::REQUIRED) + .with_fields(&mut vec![Rc::new(a1), Rc::new(b1)]) + .build() + .unwrap(); + + let message = Type::group_type_builder("root") + .with_fields(&mut vec![Rc::new(a0)]) + .build() + .unwrap(); + + assert_print_parse_message(message); + } + + #[test] + fn test_print_and_parse_nested() { + let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INT_32) + .build() + .unwrap(); + + let f2 = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::OPTIONAL) + .with_logical_type(LogicalType::UTF8) + .build() + .unwrap(); + + let foo = Type::group_type_builder("foo") + .with_repetition(Repetition::OPTIONAL) + .with_fields(&mut vec![Rc::new(f1), Rc::new(f2)]) + .build() + .unwrap(); + + let f3 = Type::primitive_type_builder("f3", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_repetition(Repetition::REPEATED) + .with_logical_type(LogicalType::INTERVAL) + .with_length(12) + .build() + .unwrap(); + + let message = Type::group_type_builder("schema") + .with_fields(&mut vec![Rc::new(foo), Rc::new(f3)]) + .build() + .unwrap(); + + assert_print_parse_message(message); + } + + #[test] + fn test_print_and_parse_decimal() { + let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32) + .with_repetition(Repetition::OPTIONAL) + .with_logical_type(LogicalType::DECIMAL) + .with_precision(9) + .with_scale(2) + .build() + .unwrap(); + + let f2 = Type::primitive_type_builder("f2", PhysicalType::INT32) + .with_repetition(Repetition::OPTIONAL) + .with_logical_type(LogicalType::DECIMAL) + .with_precision(9) + .with_scale(0) + .build() + .unwrap(); + + let message = Type::group_type_builder("schema") + .with_fields(&mut vec![Rc::new(f1), Rc::new(f2)]) + .build() + .unwrap(); + + assert_print_parse_message(message); + } +} diff --git a/rust/src/parquet/schema/types.rs b/rust/src/parquet/schema/types.rs new file mode 100644 index 0000000000000..90c767c093055 --- /dev/null +++ b/rust/src/parquet/schema/types.rs @@ -0,0 +1,1830 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains structs and methods to build Parquet schema and schema descriptors. + +use std::{collections::HashMap, convert::From, fmt, rc::Rc}; + +use parquet_format::SchemaElement; + +use crate::parquet::basic::{LogicalType, Repetition, Type as PhysicalType}; +use crate::parquet::errors::{ParquetError, Result}; + +// ---------------------------------------------------------------------- +// Parquet Type definitions + +/// Type alias for `Rc`. +pub type TypePtr = Rc; +/// Type alias for `Rc`. +pub type SchemaDescPtr = Rc; +/// Type alias for `Rc`. +pub type ColumnDescPtr = Rc; + +/// Representation of a Parquet type. +/// Used to describe primitive leaf fields and structs, including top-level schema. +/// Note that the top-level schema type is represented using `GroupType` whose +/// repetition is `None`. +#[derive(Debug, PartialEq)] +pub enum Type { + PrimitiveType { + basic_info: BasicTypeInfo, + physical_type: PhysicalType, + type_length: i32, + scale: i32, + precision: i32, + }, + GroupType { + basic_info: BasicTypeInfo, + fields: Vec, + }, +} + +impl Type { + /// Creates primitive type builder with provided field name and physical type. + pub fn primitive_type_builder(name: &str, physical_type: PhysicalType) -> PrimitiveTypeBuilder { + PrimitiveTypeBuilder::new(name, physical_type) + } + + /// Creates group type builder with provided column name. + pub fn group_type_builder(name: &str) -> GroupTypeBuilder { + GroupTypeBuilder::new(name) + } + + /// Returns [`BasicTypeInfo`] information about the type. + pub fn get_basic_info(&self) -> &BasicTypeInfo { + match *self { + Type::PrimitiveType { ref basic_info, .. } => &basic_info, + Type::GroupType { ref basic_info, .. } => &basic_info, + } + } + + /// Returns this type's field name. + pub fn name(&self) -> &str { + self.get_basic_info().name() + } + + /// Gets the fields from this group type. + /// Note that this will panic if called on a non-group type. + // TODO: should we return `&[&Type]` here? + pub fn get_fields(&self) -> &[TypePtr] { + match *self { + Type::GroupType { ref fields, .. } => &fields[..], + _ => panic!("Cannot call get_fields() on a non-group type"), + } + } + + /// Gets physical type of this primitive type. + /// Note that this will panic if called on a non-primitive type. + pub fn get_physical_type(&self) -> PhysicalType { + match *self { + Type::PrimitiveType { + basic_info: _, + physical_type, + .. + } => physical_type, + _ => panic!("Cannot call get_physical_type() on a non-primitive type"), + } + } + + /// Checks if `sub_type` schema is part of current schema. + /// This method can be used to check if projected columns are part of the root schema. 
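A brief sketch of the projection use case described above, assuming the parser module added earlier in this patch; the schema strings are only illustrative:

```rust
use crate::parquet::schema::parser::parse_message_type;

fn projection_example() {
  // The full file schema and a projection of it: names, repetitions and
  // physical types must line up for check_contains() to return true.
  let full = parse_message_type(
    "message schema {
       REQUIRED INT64 id;
       OPTIONAL group name {
         OPTIONAL BYTE_ARRAY first (UTF8);
         OPTIONAL BYTE_ARRAY last (UTF8);
       }
     }",
  )
  .unwrap();

  let projection = parse_message_type(
    "message schema {
       REQUIRED INT64 id;
       OPTIONAL group name {
         OPTIONAL BYTE_ARRAY last (UTF8);
       }
     }",
  )
  .unwrap();

  assert!(full.check_contains(&projection)); // projected columns exist in the root schema
  assert!(!projection.check_contains(&full)); // the reverse does not hold
}
```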
+ pub fn check_contains(&self, sub_type: &Type) -> bool { + // Names match, and repetitions match or not set for both + let basic_match = self.get_basic_info().name() == sub_type.get_basic_info().name() + && (self.is_schema() && sub_type.is_schema() + || !self.is_schema() + && !sub_type.is_schema() + && self.get_basic_info().repetition() + == sub_type.get_basic_info().repetition()); + + match *self { + Type::PrimitiveType { .. } if basic_match && sub_type.is_primitive() => { + self.get_physical_type() == sub_type.get_physical_type() + } + Type::GroupType { .. } if basic_match && sub_type.is_group() => { + // build hashmap of name -> TypePtr + let mut field_map = HashMap::new(); + for field in self.get_fields() { + field_map.insert(field.name(), field); + } + + for field in sub_type.get_fields() { + if !field_map + .get(field.name()) + .map(|tpe| tpe.check_contains(field)) + .unwrap_or(false) + { + return false; + } + } + true + } + _ => false, + } + } + + /// Returns `true` if this type is a primitive type, `false` otherwise. + pub fn is_primitive(&self) -> bool { + match *self { + Type::PrimitiveType { .. } => true, + _ => false, + } + } + + /// Returns `true` if this type is a group type, `false` otherwise. + pub fn is_group(&self) -> bool { + match *self { + Type::GroupType { .. } => true, + _ => false, + } + } + + /// Returns `true` if this type is the top-level schema type (message type). + pub fn is_schema(&self) -> bool { + match *self { + Type::GroupType { ref basic_info, .. } => !basic_info.has_repetition(), + _ => false, + } + } +} + +/// A builder for primitive types. All attributes are optional +/// except the name and physical type. +/// Note that if not specified explicitly, `Repetition::OPTIONAL` is used. +pub struct PrimitiveTypeBuilder<'a> { + name: &'a str, + repetition: Repetition, + physical_type: PhysicalType, + logical_type: LogicalType, + length: i32, + precision: i32, + scale: i32, + id: Option, +} + +impl<'a> PrimitiveTypeBuilder<'a> { + /// Creates new primitive type builder with provided field name and physical type. + pub fn new(name: &'a str, physical_type: PhysicalType) -> Self { + Self { + name, + repetition: Repetition::OPTIONAL, + physical_type, + logical_type: LogicalType::NONE, + length: -1, + precision: -1, + scale: -1, + id: None, + } + } + + /// Sets [`Repetition`](`::basic::Repetition`) for this field and returns itself. + pub fn with_repetition(mut self, repetition: Repetition) -> Self { + self.repetition = repetition; + self + } + + /// Sets [`LogicalType`](`::basic::LogicalType`) for this field and returns itself. + pub fn with_logical_type(mut self, logical_type: LogicalType) -> Self { + self.logical_type = logical_type; + self + } + + /// Sets type length and returns itself. + /// This is only applied to FIXED_LEN_BYTE_ARRAY and INT96 (INTERVAL) types, because + /// they maintain fixed size underlying byte array. + /// By default, value is `0`. + pub fn with_length(mut self, length: i32) -> Self { + self.length = length; + self + } + + /// Sets precision for Parquet DECIMAL physical type and returns itself. + /// By default, it equals to `0` and used only for decimal context. + pub fn with_precision(mut self, precision: i32) -> Self { + self.precision = precision; + self + } + + /// Sets scale for Parquet DECIMAL physical type and returns itself. + /// By default, it equals to `0` and used only for decimal context. 
+ pub fn with_scale(mut self, scale: i32) -> Self { + self.scale = scale; + self + } + + /// Sets optional field id and returns itself. + pub fn with_id(mut self, id: i32) -> Self { + self.id = Some(id); + self + } + + /// Creates a new `PrimitiveType` instance from the collected attributes. + /// Returns `Err` in case of any building conditions are not met. + pub fn build(self) -> Result { + let basic_info = BasicTypeInfo { + name: String::from(self.name), + repetition: Some(self.repetition), + logical_type: self.logical_type, + id: self.id, + }; + + // Check length before logical type, since it is used for logical type validation. + if self.physical_type == PhysicalType::FIXED_LEN_BYTE_ARRAY && self.length < 0 { + return Err(general_err!( + "Invalid FIXED_LEN_BYTE_ARRAY length: {}", + self.length + )); + } + + match self.logical_type { + LogicalType::NONE => {} + LogicalType::UTF8 | LogicalType::BSON | LogicalType::JSON => { + if self.physical_type != PhysicalType::BYTE_ARRAY { + return Err(general_err!( + "{} can only annotate BYTE_ARRAY fields", + self.logical_type + )); + } + } + LogicalType::DECIMAL => { + match self.physical_type { + PhysicalType::INT32 + | PhysicalType::INT64 + | PhysicalType::BYTE_ARRAY + | PhysicalType::FIXED_LEN_BYTE_ARRAY => (), + _ => { + return Err(general_err!( + "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY and FIXED" + )); + } + } + + // Precision is required and must be a non-zero positive integer. + if self.precision < 1 { + return Err(general_err!( + "Invalid DECIMAL precision: {}", + self.precision + )); + } + + // Scale must be zero or a positive integer less than the precision. + if self.scale < 0 { + return Err(general_err!("Invalid DECIMAL scale: {}", self.scale)); + } + + if self.scale >= self.precision { + return Err(general_err!( + "Invalid DECIMAL: scale ({}) cannot be greater than or equal to precision \ + ({})", + self.scale, + self.precision + )); + } + + // Check precision and scale based on physical type limitations. 
+ match self.physical_type { + PhysicalType::INT32 => { + if self.precision > 9 { + return Err(general_err!( + "Cannot represent INT32 as DECIMAL with precision {}", + self.precision + )); + } + } + PhysicalType::INT64 => { + if self.precision > 18 { + return Err(general_err!( + "Cannot represent INT64 as DECIMAL with precision {}", + self.precision + )); + } + } + PhysicalType::FIXED_LEN_BYTE_ARRAY => { + let max_precision = + (2f64.powi(8 * self.length - 1) - 1f64).log10().floor() as i32; + + if self.precision > max_precision { + return Err(general_err!( + "Cannot represent FIXED_LEN_BYTE_ARRAY as DECIMAL with length {} and \ + precision {}", + self.length, + self.precision + )); + } + } + _ => (), // For BYTE_ARRAY precision is not limited + } + } + LogicalType::DATE + | LogicalType::TIME_MILLIS + | LogicalType::UINT_8 + | LogicalType::UINT_16 + | LogicalType::UINT_32 + | LogicalType::INT_8 + | LogicalType::INT_16 + | LogicalType::INT_32 => { + if self.physical_type != PhysicalType::INT32 { + return Err(general_err!( + "{} can only annotate INT32", + self.logical_type + )); + } + } + LogicalType::TIME_MICROS + | LogicalType::TIMESTAMP_MILLIS + | LogicalType::TIMESTAMP_MICROS + | LogicalType::UINT_64 + | LogicalType::INT_64 => { + if self.physical_type != PhysicalType::INT64 { + return Err(general_err!( + "{} can only annotate INT64", + self.logical_type + )); + } + } + LogicalType::INTERVAL => { + if self.physical_type != PhysicalType::FIXED_LEN_BYTE_ARRAY || self.length != 12 { + return Err(general_err!( + "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)" + )); + } + } + LogicalType::ENUM => { + if self.physical_type != PhysicalType::BYTE_ARRAY { + return Err(general_err!("ENUM can only annotate BYTE_ARRAY fields")); + } + } + _ => { + return Err(general_err!( + "{} cannot be applied to a primitive type", + self.logical_type + )); + } + } + + Ok(Type::PrimitiveType { + basic_info, + physical_type: self.physical_type, + type_length: self.length, + scale: self.scale, + precision: self.precision, + }) + } +} + +/// A builder for group types. All attributes are optional except the name. +/// Note that if not specified explicitly, `None` is used as the repetition of the group, +/// which means it is a root (message) type. +pub struct GroupTypeBuilder<'a> { + name: &'a str, + repetition: Option, + logical_type: LogicalType, + fields: Vec, + id: Option, +} + +impl<'a> GroupTypeBuilder<'a> { + /// Creates new group type builder with provided field name. + pub fn new(name: &'a str) -> Self { + Self { + name, + repetition: None, + logical_type: LogicalType::NONE, + fields: Vec::new(), + id: None, + } + } + + /// Sets [`Repetition`](`::basic::Repetition`) for this field and returns itself. + pub fn with_repetition(mut self, repetition: Repetition) -> Self { + self.repetition = Some(repetition); + self + } + + /// Sets [`LogicalType`](`::basic::LogicalType`) for this field and returns itself. + pub fn with_logical_type(mut self, logical_type: LogicalType) -> Self { + self.logical_type = logical_type; + self + } + + /// Sets a list of fields that should be child nodes of this field. + /// Returns updated self. + pub fn with_fields(mut self, fields: &mut Vec) -> Self { + self.fields.append(fields); + self + } + + /// Sets optional field id and returns itself. + pub fn with_id(mut self, id: i32) -> Self { + self.id = Some(id); + self + } + + /// Creates a new `GroupType` instance from the gathered attributes. 
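The DECIMAL rules above can be exercised directly through the builder; a small sketch follows (field names are arbitrary). For FIXED_LEN_BYTE_ARRAY the cap works out to floor(log10(2^(8*len - 1) - 1)), e.g. 11 digits for a 5-byte field:

```rust
use crate::parquet::basic::{LogicalType, Type as PhysicalType};
use crate::parquet::schema::types::Type;

fn decimal_validation_examples() {
  // INT32 carries at most 9 decimal digits, so precision 9 is accepted...
  assert!(Type::primitive_type_builder("d1", PhysicalType::INT32)
    .with_logical_type(LogicalType::DECIMAL)
    .with_precision(9)
    .with_scale(2)
    .build()
    .is_ok());

  // ...and precision 10 is rejected.
  assert!(Type::primitive_type_builder("d2", PhysicalType::INT32)
    .with_logical_type(LogicalType::DECIMAL)
    .with_precision(10)
    .with_scale(2)
    .build()
    .is_err());

  // FIXED_LEN_BYTE_ARRAY(5) stores at most 2^39 - 1, i.e.
  // floor(log10(2^39 - 1)) = 11 decimal digits, so precision 12 fails.
  assert!(Type::primitive_type_builder("d3", PhysicalType::FIXED_LEN_BYTE_ARRAY)
    .with_logical_type(LogicalType::DECIMAL)
    .with_length(5)
    .with_precision(12)
    .with_scale(2)
    .build()
    .is_err());
}
```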
+ pub fn build(self) -> Result { + let basic_info = BasicTypeInfo { + name: String::from(self.name), + repetition: self.repetition, + logical_type: self.logical_type, + id: self.id, + }; + Ok(Type::GroupType { + basic_info, + fields: self.fields, + }) + } +} + +/// Basic type info. This contains information such as the name of the type, +/// the repetition level, the logical type and the kind of the type (group, primitive). +#[derive(Debug, PartialEq)] +pub struct BasicTypeInfo { + name: String, + repetition: Option, + logical_type: LogicalType, + id: Option, +} + +impl BasicTypeInfo { + /// Returns field name. + pub fn name(&self) -> &str { + &self.name + } + + /// Returns `true` if type has repetition field set, `false` otherwise. + /// This is mostly applied to group type, because primitive type always has + /// repetition set. + pub fn has_repetition(&self) -> bool { + self.repetition.is_some() + } + + /// Returns [`Repetition`](`::basic::Repetition`) value for the type. + pub fn repetition(&self) -> Repetition { + assert!(self.repetition.is_some()); + self.repetition.unwrap() + } + + /// Returns [`LogicalType`](`::basic::LogicalType`) value for the type. + pub fn logical_type(&self) -> LogicalType { + self.logical_type + } + + /// Returns `true` if id is set, `false` otherwise. + pub fn has_id(&self) -> bool { + self.id.is_some() + } + + /// Returns id value for the type. + pub fn id(&self) -> i32 { + assert!(self.id.is_some()); + self.id.unwrap() + } +} + +// ---------------------------------------------------------------------- +// Parquet descriptor definitions + +/// Represents a path in a nested schema +#[derive(Clone, PartialEq, Debug, Eq, Hash)] +pub struct ColumnPath { + parts: Vec, +} + +impl ColumnPath { + /// Creates new column path from vector of field names. + pub fn new(parts: Vec) -> Self { + ColumnPath { parts } + } + + /// Returns string representation of this column path. + /// ```rust + /// use arrow::parquet::schema::types::ColumnPath; + /// + /// let path = ColumnPath::new(vec!["a".to_string(), "b".to_string(), "c".to_string()]); + /// assert_eq!(&path.string(), "a.b.c"); + /// ``` + pub fn string(&self) -> String { + self.parts.join(".") + } +} + +impl fmt::Display for ColumnPath { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self.string()) + } +} + +impl From> for ColumnPath { + fn from(parts: Vec) -> Self { + ColumnPath { parts } + } +} + +impl<'a> From<&'a str> for ColumnPath { + fn from(single_path: &str) -> Self { + let s = String::from(single_path); + ColumnPath::from(s) + } +} + +impl From for ColumnPath { + fn from(single_path: String) -> Self { + let mut v = vec![]; + v.push(single_path); + ColumnPath { parts: v } + } +} + +impl AsRef<[String]> for ColumnPath { + fn as_ref(&self) -> &[String] { + &self.parts + } +} + +/// A descriptor for leaf-level primitive columns. +/// This encapsulates information such as definition and repetition levels and is used to +/// re-assemble nested data. +pub struct ColumnDescriptor { + // The "leaf" primitive type of this column + primitive_type: TypePtr, + + // The root type of this column. For instance, if the column is "a.b.c.d", then the + // primitive type is 'd' while the root_type is 'a'. + // + // NOTE: this is sometimes `None` for the convenience of testing. It should NEVER be + // `None` when running in production. 
+ root_type: Option, + + // The maximum definition level for this column + max_def_level: i16, + + // The maximum repetition level for this column + max_rep_level: i16, + + // The path of this column. For instance, "a.b.c.d". + path: ColumnPath, +} + +impl ColumnDescriptor { + /// Creates new descriptor for leaf-level column. + pub fn new( + primitive_type: TypePtr, + root_type: Option, + max_def_level: i16, + max_rep_level: i16, + path: ColumnPath, + ) -> Self { + Self { + primitive_type, + root_type, + max_def_level, + max_rep_level, + path, + } + } + + /// Returns maximum definition level for this column. + pub fn max_def_level(&self) -> i16 { + self.max_def_level + } + + /// Returns maximum repetition level for this column. + pub fn max_rep_level(&self) -> i16 { + self.max_rep_level + } + + /// Returns [`ColumnPath`] for this column. + pub fn path(&self) -> &ColumnPath { + &self.path + } + + /// Returns self type [`Type`](`::schema::types::Type`) for this leaf column. + pub fn self_type(&self) -> &Type { + self.primitive_type.as_ref() + } + + /// Returns root [`Type`](`::schema::types::Type`) (most top-level parent field) for + /// this leaf column. + pub fn root_type(&self) -> &Type { + assert!(self.root_type.is_some()); + self.root_type.as_ref().unwrap() + } + + /// Returns column name. + pub fn name(&self) -> &str { + self.primitive_type.name() + } + + /// Returns [`LogicalType`](`::basic::LogicalType`) for this column. + pub fn logical_type(&self) -> LogicalType { + self.primitive_type.get_basic_info().logical_type() + } + + /// Returns physical type for this column. + /// Note that it will panic if called on a non-primitive type. + pub fn physical_type(&self) -> PhysicalType { + match self.primitive_type.as_ref() { + &Type::PrimitiveType { physical_type, .. } => physical_type, + _ => panic!("Expected primitive type!"), + } + } + + /// Returns type length for this column. + /// Note that it will panic if called on a non-primitive type. + pub fn type_length(&self) -> i32 { + match self.primitive_type.as_ref() { + &Type::PrimitiveType { type_length, .. } => type_length, + _ => panic!("Expected primitive type!"), + } + } + + /// Returns type precision for this column. + /// Note that it will panic if called on a non-primitive type. + pub fn type_precision(&self) -> i32 { + match self.primitive_type.as_ref() { + &Type::PrimitiveType { precision, .. } => precision, + _ => panic!("Expected primitive type!"), + } + } + + /// Returns type scale for this column. + /// Note that it will panic if called on a non-primitive type. + pub fn type_scale(&self) -> i32 { + match self.primitive_type.as_ref() { + &Type::PrimitiveType { scale, .. } => scale, + _ => panic!("Expected primitive type!"), + } + } +} + +/// A schema descriptor. This encapsulates the top-level schemas for all the columns, +/// as well as all descriptors for all the primitive columns. +pub struct SchemaDescriptor { + // The top-level schema (the "message" type). + // This must be a `GroupType` where each field is a root column type in the schema. + schema: TypePtr, + + // All the descriptors for primitive columns in this schema, constructed from + // `schema` in DFS order. + leaves: Vec, + + // Mapping from a leaf column's index to the root column type that it + // comes from. For instance: the leaf `a.b.c.d` would have a link back to `a`: + // -- a <-----+ + // -- -- b | + // -- -- -- c | + // -- -- -- -- d + leaf_to_base: HashMap, +} + +impl SchemaDescriptor { + /// Creates new schema descriptor from Parquet schema. 
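+    /// Walks the fields of the root group depth-first, collecting a [`ColumnDescriptor`]
+    /// for every primitive leaf column together with its maximum definition and
+    /// repetition levels.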
+ pub fn new(tp: TypePtr) -> Self { + assert!(tp.is_group(), "SchemaDescriptor should take a GroupType"); + let mut leaves = vec![]; + let mut leaf_to_base = HashMap::new(); + for f in tp.get_fields() { + let mut path = vec![]; + build_tree( + f.clone(), + tp.clone(), + f.clone(), + 0, + 0, + &mut leaves, + &mut leaf_to_base, + &mut path, + ); + } + + Self { + schema: tp, + leaves, + leaf_to_base, + } + } + + /// Returns [`ColumnDescriptor`] for a field position. + pub fn column(&self, i: usize) -> ColumnDescPtr { + assert!( + i < self.leaves.len(), + "Index out of bound: {} not in [0, {})", + i, + self.leaves.len() + ); + self.leaves[i].clone() + } + + /// Returns slice of [`ColumnDescriptor`]. + pub fn columns(&self) -> &[ColumnDescPtr] { + &self.leaves + } + + /// Returns number of leaf-level columns. + pub fn num_columns(&self) -> usize { + self.leaves.len() + } + + /// Returns column root [`Type`](`::schema::types::Type`) for a field position. + pub fn get_column_root(&self, i: usize) -> &Type { + assert!( + i < self.leaves.len(), + "Index out of bound: {} not in [0, {})", + i, + self.leaves.len() + ); + let result = self.leaf_to_base.get(&i); + assert!( + result.is_some(), + "Expected a value for index {} but found None", + i + ); + result.unwrap().as_ref() + } + + /// Returns schema as [`Type`](`::schema::types::Type`). + pub fn root_schema(&self) -> &Type { + self.schema.as_ref() + } + + /// Returns schema name. + pub fn name(&self) -> &str { + self.schema.name() + } +} + +fn build_tree( + tp: TypePtr, + root_tp: TypePtr, + base_tp: TypePtr, + mut max_rep_level: i16, + mut max_def_level: i16, + leaves: &mut Vec, + leaf_to_base: &mut HashMap, + path_so_far: &mut Vec, +) { + assert!(tp.get_basic_info().has_repetition()); + + path_so_far.push(String::from(tp.name())); + match tp.get_basic_info().repetition() { + Repetition::OPTIONAL => { + max_def_level += 1; + } + Repetition::REPEATED => { + max_def_level += 1; + max_rep_level += 1; + } + _ => {} + } + + match tp.as_ref() { + &Type::PrimitiveType { .. } => { + let mut path: Vec = vec![]; + path.extend_from_slice(&path_so_far[..]); + leaves.push(Rc::new(ColumnDescriptor::new( + tp.clone(), + Some(root_tp), + max_def_level, + max_rep_level, + ColumnPath::new(path), + ))); + leaf_to_base.insert(leaves.len() - 1, base_tp); + } + &Type::GroupType { ref fields, .. } => { + for f in fields { + build_tree( + f.clone(), + root_tp.clone(), + base_tp.clone(), + max_rep_level, + max_def_level, + leaves, + leaf_to_base, + path_so_far, + ); + let idx = path_so_far.len() - 1; + path_so_far.remove(idx); + } + } + } +} + +/// Method to convert from Thrift. +pub fn from_thrift(elements: &[SchemaElement]) -> Result { + let mut index = 0; + let mut schema_nodes = Vec::new(); + while index < elements.len() { + let t = from_thrift_helper(elements, index)?; + index = t.0; + schema_nodes.push(t.1); + } + if schema_nodes.len() != 1 { + return Err(general_err!( + "Expected exactly one root node, but found {}", + schema_nodes.len() + )); + } + + Ok(schema_nodes.remove(0)) +} + +/// Constructs a new Type from the `elements`, starting at index `index`. +/// The first result is the starting index for the next Type after this one. If it is +/// equal to `elements.len()`, then this Type is the last one. +/// The second result is the result Type. +fn from_thrift_helper(elements: &[SchemaElement], index: usize) -> Result<(usize, TypePtr)> { + // Whether or not the current node is root (message type). + // There is only one message type node in the schema tree. 
+ let is_root_node = index == 0; + + if index > elements.len() { + return Err(general_err!( + "Index out of bound, index = {}, len = {}", + index, + elements.len() + )); + } + let logical_type = LogicalType::from(elements[index].converted_type); + let field_id = elements[index].field_id; + match elements[index].num_children { + // From parquet-format: + // The children count is used to construct the nested relationship. + // This field is not set when the element is a primitive type + // Sometimes parquet-cpp sets num_children field to 0 for primitive types, so we + // have to handle this case too. + None | Some(0) => { + // primitive type + if elements[index].repetition_type.is_none() { + return Err(general_err!( + "Repetition level must be defined for a primitive type" + )); + } + let repetition = Repetition::from(elements[index].repetition_type.unwrap()); + let physical_type = PhysicalType::from(elements[index].type_.unwrap()); + let length = elements[index].type_length.unwrap_or(-1); + let scale = elements[index].scale.unwrap_or(-1); + let precision = elements[index].precision.unwrap_or(-1); + let name = &elements[index].name; + let mut builder = Type::primitive_type_builder(name, physical_type) + .with_repetition(repetition) + .with_logical_type(logical_type) + .with_length(length) + .with_precision(precision) + .with_scale(scale); + if let Some(id) = field_id { + builder = builder.with_id(id); + } + Ok((index + 1, Rc::new(builder.build()?))) + } + Some(n) => { + let repetition = elements[index].repetition_type.map(|r| Repetition::from(r)); + let mut fields = vec![]; + let mut next_index = index + 1; + for _ in 0..n { + let child_result = from_thrift_helper(elements, next_index as usize)?; + next_index = child_result.0; + fields.push(child_result.1); + } + + let mut builder = Type::group_type_builder(&elements[index].name) + .with_logical_type(logical_type) + .with_fields(&mut fields); + if let Some(rep) = repetition { + // Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or REPEATED + // for root node. + // + // We only set repetition for group types that are not top-level message type. + // According to parquet-format: + // Root of the schema does not have a repetition_type. + // All other types must have one. + if !is_root_node { + builder = builder.with_repetition(rep); + } + } + if let Some(id) = field_id { + builder = builder.with_id(id); + } + Ok((next_index, Rc::new(builder.build().unwrap()))) + } + } +} + +/// Method to convert to Thrift. +pub fn to_thrift(schema: &Type) -> Result> { + if !schema.is_group() { + return Err(general_err!("Root schema must be Group type")); + } + let mut elements: Vec = Vec::new(); + to_thrift_helper(schema, &mut elements); + Ok(elements) +} + +/// Constructs list of `SchemaElement` from the schema using depth-first traversal. +/// Here we assume that schema is always valid and starts with group type. 
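+/// A group's element is emitted before the elements of its children (depth-first
+/// preorder), which is the order `from_thrift_helper` expects when reading the schema
+/// back.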
+fn to_thrift_helper(schema: &Type, elements: &mut Vec) { + match *schema { + Type::PrimitiveType { + ref basic_info, + physical_type, + type_length, + scale, + precision, + } => { + let element = SchemaElement { + type_: Some(physical_type.into()), + type_length: if type_length >= 0 { + Some(type_length) + } else { + None + }, + repetition_type: Some(basic_info.repetition().into()), + name: basic_info.name().to_owned(), + num_children: None, + converted_type: basic_info.logical_type().into(), + scale: if scale >= 0 { Some(scale) } else { None }, + precision: if precision >= 0 { + Some(precision) + } else { + None + }, + field_id: if basic_info.has_id() { + Some(basic_info.id()) + } else { + None + }, + logical_type: None, + }; + + elements.push(element); + } + Type::GroupType { + ref basic_info, + ref fields, + } => { + let repetition = if basic_info.has_repetition() { + Some(basic_info.repetition().into()) + } else { + None + }; + + let element = SchemaElement { + type_: None, + type_length: None, + repetition_type: repetition, + name: basic_info.name().to_owned(), + num_children: Some(fields.len() as i32), + converted_type: basic_info.logical_type().into(), + scale: None, + precision: None, + field_id: if basic_info.has_id() { + Some(basic_info.id()) + } else { + None + }, + logical_type: None, + }; + + elements.push(element); + + // Add child elements for a group + for field in fields { + to_thrift_helper(field, elements); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::error::Error; + + use crate::parquet::schema::parser::parse_message_type; + + #[test] + fn test_primitive_type() { + let mut result = Type::primitive_type_builder("foo", PhysicalType::INT32) + .with_logical_type(LogicalType::INT_32) + .with_id(0) + .build(); + assert!(result.is_ok()); + + if let Ok(tp) = result { + assert!(tp.is_primitive()); + assert!(!tp.is_group()); + let basic_info = tp.get_basic_info(); + assert_eq!(basic_info.repetition(), Repetition::OPTIONAL); + assert_eq!(basic_info.logical_type(), LogicalType::INT_32); + assert_eq!(basic_info.id(), 0); + match tp { + Type::PrimitiveType { physical_type, .. 
} => { + assert_eq!(physical_type, PhysicalType::INT32); + } + _ => assert!(false), + } + } + + // Test illegal inputs + result = Type::primitive_type_builder("foo", PhysicalType::INT64) + .with_repetition(Repetition::REPEATED) + .with_logical_type(LogicalType::BSON) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!(e.description(), "BSON can only annotate BYTE_ARRAY fields"); + } + + result = Type::primitive_type_builder("foo", PhysicalType::INT96) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::DECIMAL) + .with_precision(-1) + .with_scale(-1) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!( + e.description(), + "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY and FIXED" + ); + } + + result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::DECIMAL) + .with_precision(-1) + .with_scale(-1) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!(e.description(), "Invalid DECIMAL precision: -1"); + } + + result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::DECIMAL) + .with_precision(0) + .with_scale(-1) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!(e.description(), "Invalid DECIMAL precision: 0"); + } + + result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::DECIMAL) + .with_precision(1) + .with_scale(-1) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!(e.description(), "Invalid DECIMAL scale: -1"); + } + + result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::DECIMAL) + .with_precision(1) + .with_scale(2) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!( + e.description(), + "Invalid DECIMAL: scale (2) cannot be greater than or equal to precision (1)" + ); + } + + result = Type::primitive_type_builder("foo", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::DECIMAL) + .with_precision(18) + .with_scale(2) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!( + e.description(), + "Cannot represent INT32 as DECIMAL with precision 18" + ); + } + + result = Type::primitive_type_builder("foo", PhysicalType::INT64) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::DECIMAL) + .with_precision(32) + .with_scale(2) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!( + e.description(), + "Cannot represent INT64 as DECIMAL with precision 32" + ); + } + + result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::DECIMAL) + .with_length(5) + .with_precision(12) + .with_scale(2) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!( + e.description(), + "Cannot represent FIXED_LEN_BYTE_ARRAY as DECIMAL with length 5 and precision 12" + ); + } + + result = Type::primitive_type_builder("foo", PhysicalType::INT64) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::UINT_8) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!(e.description(), 
"UINT_8 can only annotate INT32"); + } + + result = Type::primitive_type_builder("foo", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::TIME_MICROS) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!(e.description(), "TIME_MICROS can only annotate INT64"); + } + + result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INTERVAL) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!( + e.description(), + "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)" + ); + } + + result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INTERVAL) + .with_length(1) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!( + e.description(), + "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)" + ); + } + + result = Type::primitive_type_builder("foo", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::ENUM) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!(e.description(), "ENUM can only annotate BYTE_ARRAY fields"); + } + + result = Type::primitive_type_builder("foo", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::MAP) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!(e.description(), "MAP cannot be applied to a primitive type"); + } + + result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::DECIMAL) + .with_length(-1) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!(e.description(), "Invalid FIXED_LEN_BYTE_ARRAY length: -1"); + } + } + + #[test] + fn test_group_type() { + let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32) + .with_logical_type(LogicalType::INT_32) + .with_id(0) + .build(); + assert!(f1.is_ok()); + let f2 = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY) + .with_logical_type(LogicalType::UTF8) + .with_id(1) + .build(); + assert!(f2.is_ok()); + + let mut fields = vec![]; + fields.push(Rc::new(f1.unwrap())); + fields.push(Rc::new(f2.unwrap())); + + let result = Type::group_type_builder("foo") + .with_repetition(Repetition::REPEATED) + .with_fields(&mut fields) + .with_id(1) + .build(); + assert!(result.is_ok()); + + let tp = result.unwrap(); + let basic_info = tp.get_basic_info(); + assert!(tp.is_group()); + assert!(!tp.is_primitive()); + assert_eq!(basic_info.repetition(), Repetition::REPEATED); + assert_eq!(basic_info.logical_type(), LogicalType::NONE); + assert_eq!(basic_info.id(), 1); + assert_eq!(tp.get_fields().len(), 2); + assert_eq!(tp.get_fields()[0].name(), "f1"); + assert_eq!(tp.get_fields()[1].name(), "f2"); + } + + #[test] + fn test_column_descriptor() { + let result = test_column_descriptor_helper(); + assert!( + result.is_ok(), + "Expected result to be OK but got err:\n {}", + result.unwrap_err() + ); + } + + fn test_column_descriptor_helper() -> Result<()> { + let tp = Type::primitive_type_builder("name", PhysicalType::BYTE_ARRAY) + .with_logical_type(LogicalType::UTF8) + .build()?; + + let root_tp = Type::group_type_builder("root") + .with_logical_type(LogicalType::LIST) + .build() + .unwrap(); + let root_tp_rc = Rc::new(root_tp); + + let descr = 
ColumnDescriptor::new( + Rc::new(tp), + Some(root_tp_rc.clone()), + 4, + 1, + ColumnPath::from("name"), + ); + + assert_eq!(descr.path(), &ColumnPath::from("name")); + assert_eq!(descr.logical_type(), LogicalType::UTF8); + assert_eq!(descr.physical_type(), PhysicalType::BYTE_ARRAY); + assert_eq!(descr.max_def_level(), 4); + assert_eq!(descr.max_rep_level(), 1); + assert_eq!(descr.name(), "name"); + assert_eq!(descr.type_length(), -1); + assert_eq!(descr.type_precision(), -1); + assert_eq!(descr.type_scale(), -1); + assert_eq!(descr.root_type(), root_tp_rc.as_ref()); + + Ok(()) + } + + #[test] + fn test_schema_descriptor() { + let result = test_schema_descriptor_helper(); + assert!( + result.is_ok(), + "Expected result to be OK but got err:\n {}", + result.unwrap_err() + ); + } + + // A helper fn to avoid handling the results from type creation + fn test_schema_descriptor_helper() -> Result<()> { + let mut fields = vec![]; + + let inta = Type::primitive_type_builder("a", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INT_32) + .build()?; + fields.push(Rc::new(inta)); + let intb = Type::primitive_type_builder("b", PhysicalType::INT64) + .with_logical_type(LogicalType::INT_64) + .build()?; + fields.push(Rc::new(intb)); + let intc = Type::primitive_type_builder("c", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REPEATED) + .with_logical_type(LogicalType::UTF8) + .build()?; + fields.push(Rc::new(intc)); + + // 3-level list encoding + let item1 = Type::primitive_type_builder("item1", PhysicalType::INT64) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INT_64) + .build()?; + let item2 = Type::primitive_type_builder("item2", PhysicalType::BOOLEAN).build()?; + let item3 = Type::primitive_type_builder("item3", PhysicalType::INT32) + .with_repetition(Repetition::REPEATED) + .with_logical_type(LogicalType::INT_32) + .build()?; + let list = Type::group_type_builder("records") + .with_repetition(Repetition::REPEATED) + .with_logical_type(LogicalType::LIST) + .with_fields(&mut vec![Rc::new(item1), Rc::new(item2), Rc::new(item3)]) + .build()?; + let bag = Type::group_type_builder("bag") + .with_repetition(Repetition::OPTIONAL) + .with_fields(&mut vec![Rc::new(list)]) + .build()?; + fields.push(Rc::new(bag)); + + let schema = Type::group_type_builder("schema") + .with_repetition(Repetition::REPEATED) + .with_fields(&mut fields) + .build()?; + let descr = SchemaDescriptor::new(Rc::new(schema)); + + let nleaves = 6; + assert_eq!(descr.num_columns(), nleaves); + + // mdef mrep + // required int32 a 0 0 + // optional int64 b 1 0 + // repeated byte_array c 1 1 + // optional group bag 1 0 + // repeated group records 2 1 + // required int64 item1 2 1 + // optional boolean item2 3 1 + // repeated int32 item3 3 2 + let ex_max_def_levels = vec![0, 1, 1, 2, 3, 3]; + let ex_max_rep_levels = vec![0, 0, 1, 1, 1, 2]; + + for i in 0..nleaves { + let col = descr.column(i); + assert_eq!(col.max_def_level(), ex_max_def_levels[i], "{}", i); + assert_eq!(col.max_rep_level(), ex_max_rep_levels[i], "{}", i); + } + + assert_eq!(descr.column(0).path().string(), "a"); + assert_eq!(descr.column(1).path().string(), "b"); + assert_eq!(descr.column(2).path().string(), "c"); + assert_eq!(descr.column(3).path().string(), "bag.records.item1"); + assert_eq!(descr.column(4).path().string(), "bag.records.item2"); + assert_eq!(descr.column(5).path().string(), "bag.records.item3"); + + assert_eq!(descr.get_column_root(0).name(), "a"); + 
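+        // Leaves 3-5 (item1, item2, item3) are nested under the top-level optional
+        // group `bag`, so `bag` is reported as their common root column type.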
assert_eq!(descr.get_column_root(3).name(), "bag"); + assert_eq!(descr.get_column_root(4).name(), "bag"); + assert_eq!(descr.get_column_root(5).name(), "bag"); + + Ok(()) + } + + #[test] + fn test_schema_build_tree_def_rep_levels() { + let message_type = " + message spark_schema { + REQUIRED INT32 a; + OPTIONAL group b { + OPTIONAL INT32 _1; + OPTIONAL INT32 _2; + } + OPTIONAL group c (LIST) { + REPEATED group list { + OPTIONAL INT32 element; + } + } + } + "; + let schema = parse_message_type(message_type).expect("should parse schema"); + let descr = SchemaDescriptor::new(Rc::new(schema)); + // required int32 a + assert_eq!(descr.column(0).max_def_level(), 0); + assert_eq!(descr.column(0).max_rep_level(), 0); + // optional int32 b._1 + assert_eq!(descr.column(1).max_def_level(), 2); + assert_eq!(descr.column(1).max_rep_level(), 0); + // optional int32 b._2 + assert_eq!(descr.column(2).max_def_level(), 2); + assert_eq!(descr.column(2).max_rep_level(), 0); + // repeated optional int32 c.list.element + assert_eq!(descr.column(3).max_def_level(), 3); + assert_eq!(descr.column(3).max_rep_level(), 1); + } + + #[test] + #[should_panic(expected = "Cannot call get_physical_type() on a non-primitive type")] + fn test_get_physical_type_panic() { + let list = Type::group_type_builder("records") + .with_repetition(Repetition::REPEATED) + .build() + .unwrap(); + list.get_physical_type(); + } + + #[test] + fn test_get_physical_type_primitive() { + let f = Type::primitive_type_builder("f", PhysicalType::INT64) + .build() + .unwrap(); + assert_eq!(f.get_physical_type(), PhysicalType::INT64); + + let f = Type::primitive_type_builder("f", PhysicalType::BYTE_ARRAY) + .build() + .unwrap(); + assert_eq!(f.get_physical_type(), PhysicalType::BYTE_ARRAY); + } + + #[test] + fn test_check_contains_primitive_primitive() { + // OK + let f1 = Type::primitive_type_builder("f", PhysicalType::INT32) + .build() + .unwrap(); + let f2 = Type::primitive_type_builder("f", PhysicalType::INT32) + .build() + .unwrap(); + assert!(f1.check_contains(&f2)); + + // OK: different logical type does not affect check_contains + let f1 = Type::primitive_type_builder("f", PhysicalType::INT32) + .with_logical_type(LogicalType::UINT_8) + .build() + .unwrap(); + let f2 = Type::primitive_type_builder("f", PhysicalType::INT32) + .with_logical_type(LogicalType::UINT_16) + .build() + .unwrap(); + assert!(f1.check_contains(&f2)); + + // KO: different name + let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32) + .build() + .unwrap(); + let f2 = Type::primitive_type_builder("f2", PhysicalType::INT32) + .build() + .unwrap(); + assert!(!f1.check_contains(&f2)); + + // KO: different type + let f1 = Type::primitive_type_builder("f", PhysicalType::INT32) + .build() + .unwrap(); + let f2 = Type::primitive_type_builder("f", PhysicalType::INT64) + .build() + .unwrap(); + assert!(!f1.check_contains(&f2)); + + // KO: different repetition + let f1 = Type::primitive_type_builder("f", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .build() + .unwrap(); + let f2 = Type::primitive_type_builder("f", PhysicalType::INT32) + .with_repetition(Repetition::OPTIONAL) + .build() + .unwrap(); + assert!(!f1.check_contains(&f2)); + } + + // function to create a new group type for testing + fn test_new_group_type(name: &str, repetition: Repetition, types: Vec) -> Type { + let mut fields = Vec::new(); + for tpe in types { + fields.push(Rc::new(tpe)) + } + Type::group_type_builder(name) + .with_repetition(repetition) + .with_fields(&mut fields) + 
.build() + .unwrap() + } + + #[test] + fn test_check_contains_group_group() { + // OK: should match okay with empty fields + let f1 = Type::group_type_builder("f").build().unwrap(); + let f2 = Type::group_type_builder("f").build().unwrap(); + assert!(f1.check_contains(&f2)); + + // OK: fields match + let f1 = test_new_group_type( + "f", + Repetition::REPEATED, + vec![ + Type::primitive_type_builder("f1", PhysicalType::INT32) + .build() + .unwrap(), + Type::primitive_type_builder("f2", PhysicalType::INT64) + .build() + .unwrap(), + ], + ); + let f2 = test_new_group_type( + "f", + Repetition::REPEATED, + vec![ + Type::primitive_type_builder("f1", PhysicalType::INT32) + .build() + .unwrap(), + Type::primitive_type_builder("f2", PhysicalType::INT64) + .build() + .unwrap(), + ], + ); + assert!(f1.check_contains(&f2)); + + // OK: subset of fields + let f1 = test_new_group_type( + "f", + Repetition::REPEATED, + vec![ + Type::primitive_type_builder("f1", PhysicalType::INT32) + .build() + .unwrap(), + Type::primitive_type_builder("f2", PhysicalType::INT64) + .build() + .unwrap(), + ], + ); + let f2 = test_new_group_type( + "f", + Repetition::REPEATED, + vec![Type::primitive_type_builder("f2", PhysicalType::INT64) + .build() + .unwrap()], + ); + assert!(f1.check_contains(&f2)); + + // KO: different name + let f1 = Type::group_type_builder("f1").build().unwrap(); + let f2 = Type::group_type_builder("f2").build().unwrap(); + assert!(!f1.check_contains(&f2)); + + // KO: different repetition + let f1 = Type::group_type_builder("f") + .with_repetition(Repetition::OPTIONAL) + .build() + .unwrap(); + let f2 = Type::group_type_builder("f") + .with_repetition(Repetition::REPEATED) + .build() + .unwrap(); + assert!(!f1.check_contains(&f2)); + + // KO: different fields + let f1 = test_new_group_type( + "f", + Repetition::REPEATED, + vec![ + Type::primitive_type_builder("f1", PhysicalType::INT32) + .build() + .unwrap(), + Type::primitive_type_builder("f2", PhysicalType::INT64) + .build() + .unwrap(), + ], + ); + let f2 = test_new_group_type( + "f", + Repetition::REPEATED, + vec![ + Type::primitive_type_builder("f1", PhysicalType::INT32) + .build() + .unwrap(), + Type::primitive_type_builder("f2", PhysicalType::BOOLEAN) + .build() + .unwrap(), + ], + ); + assert!(!f1.check_contains(&f2)); + + // KO: different fields + let f1 = test_new_group_type( + "f", + Repetition::REPEATED, + vec![ + Type::primitive_type_builder("f1", PhysicalType::INT32) + .build() + .unwrap(), + Type::primitive_type_builder("f2", PhysicalType::INT64) + .build() + .unwrap(), + ], + ); + let f2 = test_new_group_type( + "f", + Repetition::REPEATED, + vec![Type::primitive_type_builder("f3", PhysicalType::INT32) + .build() + .unwrap()], + ); + assert!(!f1.check_contains(&f2)); + } + + #[test] + fn test_check_contains_group_primitive() { + // KO: should not match + let f1 = Type::group_type_builder("f").build().unwrap(); + let f2 = Type::primitive_type_builder("f", PhysicalType::INT64) + .build() + .unwrap(); + assert!(!f1.check_contains(&f2)); + assert!(!f2.check_contains(&f1)); + + // KO: should not match when primitive field is part of group type + let f1 = test_new_group_type( + "f", + Repetition::REPEATED, + vec![Type::primitive_type_builder("f1", PhysicalType::INT32) + .build() + .unwrap()], + ); + let f2 = Type::primitive_type_builder("f1", PhysicalType::INT32) + .build() + .unwrap(); + assert!(!f1.check_contains(&f2)); + assert!(!f2.check_contains(&f1)); + + // OK: match nested types + let f1 = test_new_group_type( + "a", + 
Repetition::REPEATED, + vec![ + test_new_group_type( + "b", + Repetition::REPEATED, + vec![Type::primitive_type_builder("c", PhysicalType::INT32) + .build() + .unwrap()], + ), + Type::primitive_type_builder("d", PhysicalType::INT64) + .build() + .unwrap(), + Type::primitive_type_builder("e", PhysicalType::BOOLEAN) + .build() + .unwrap(), + ], + ); + let f2 = test_new_group_type( + "a", + Repetition::REPEATED, + vec![test_new_group_type( + "b", + Repetition::REPEATED, + vec![Type::primitive_type_builder("c", PhysicalType::INT32) + .build() + .unwrap()], + )], + ); + assert!(f1.check_contains(&f2)); // should match + assert!(!f2.check_contains(&f1)); // should fail + } + + #[test] + fn test_schema_type_thrift_conversion_err() { + let schema = Type::primitive_type_builder("col", PhysicalType::INT32) + .build() + .unwrap(); + let thrift_schema = to_thrift(&schema); + assert!(thrift_schema.is_err()); + if let Err(e) = thrift_schema { + assert_eq!(e.description(), "Root schema must be Group type"); + } + } + + #[test] + fn test_schema_type_thrift_conversion() { + let message_type = " + message conversions { + REQUIRED INT64 id; + OPTIONAL group int_array_Array (LIST) { + REPEATED group list { + OPTIONAL group element (LIST) { + REPEATED group list { + OPTIONAL INT32 element; + } + } + } + } + OPTIONAL group int_map (MAP) { + REPEATED group map (MAP_KEY_VALUE) { + REQUIRED BYTE_ARRAY key (UTF8); + OPTIONAL INT32 value; + } + } + OPTIONAL group int_Map_Array (LIST) { + REPEATED group list { + OPTIONAL group g (MAP) { + REPEATED group map (MAP_KEY_VALUE) { + REQUIRED BYTE_ARRAY key (UTF8); + OPTIONAL group value { + OPTIONAL group H { + OPTIONAL group i (LIST) { + REPEATED group list { + OPTIONAL DOUBLE element; + } + } + } + } + } + } + } + } + OPTIONAL group nested_struct { + OPTIONAL INT32 A; + OPTIONAL group b (LIST) { + REPEATED group list { + REQUIRED FIXED_LEN_BYTE_ARRAY (16) element; + } + } + } + } + "; + let expected_schema = parse_message_type(message_type).unwrap(); + let thrift_schema = to_thrift(&expected_schema).unwrap(); + let result_schema = from_thrift(&thrift_schema).unwrap(); + assert_eq!(result_schema, Rc::new(expected_schema)); + } + + #[test] + fn test_schema_type_thrift_conversion_decimal() { + let message_type = " + message decimals { + OPTIONAL INT32 field0; + OPTIONAL INT64 field1 (DECIMAL (18, 2)); + OPTIONAL FIXED_LEN_BYTE_ARRAY (16) field2 (DECIMAL (38, 18)); + OPTIONAL BYTE_ARRAY field3 (DECIMAL (9)); + } + "; + let expected_schema = parse_message_type(message_type).unwrap(); + let thrift_schema = to_thrift(&expected_schema).unwrap(); + let result_schema = from_thrift(&thrift_schema).unwrap(); + assert_eq!(result_schema, Rc::new(expected_schema)); + } + + // Tests schema conversion from thrift, when num_children is set to Some(0) for a + // primitive type. + #[test] + fn test_schema_from_thrift_with_num_children_set() { + // schema definition written by parquet-cpp version 1.3.2-SNAPSHOT + let message_type = " + message schema { + OPTIONAL BYTE_ARRAY id (UTF8); + OPTIONAL BYTE_ARRAY name (UTF8); + OPTIONAL BYTE_ARRAY message (UTF8); + OPTIONAL INT32 type (UINT_8); + OPTIONAL INT64 author_time (TIMESTAMP_MILLIS); + OPTIONAL INT64 __index_level_0__; + } + "; + + let expected_schema = parse_message_type(message_type).unwrap(); + let mut thrift_schema = to_thrift(&expected_schema).unwrap(); + // Change all of None to Some(0) + for mut elem in &mut thrift_schema[..] 
{ + if elem.num_children == None { + elem.num_children = Some(0); + } + } + + let result_schema = from_thrift(&thrift_schema).unwrap(); + assert_eq!(result_schema, Rc::new(expected_schema)); + } + + // Sometimes parquet-cpp sets repetition level for the root node, which is against + // the format definition, but we need to handle it by setting it back to None. + #[test] + fn test_schema_from_thrift_root_has_repetition() { + // schema definition written by parquet-cpp version 1.3.2-SNAPSHOT + let message_type = " + message schema { + OPTIONAL BYTE_ARRAY a (UTF8); + OPTIONAL INT32 b (UINT_8); + } + "; + + let expected_schema = parse_message_type(message_type).unwrap(); + let mut thrift_schema = to_thrift(&expected_schema).unwrap(); + thrift_schema[0].repetition_type = Some(Repetition::REQUIRED.into()); + + let result_schema = from_thrift(&thrift_schema).unwrap(); + assert_eq!(result_schema, Rc::new(expected_schema)); + } +} diff --git a/rust/src/parquet/util/bit_packing.rs b/rust/src/parquet/util/bit_packing.rs new file mode 100644 index 0000000000000..851fb36ea5c98 --- /dev/null +++ b/rust/src/parquet/util/bit_packing.rs @@ -0,0 +1,3658 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// Unpack 32 values with bit width `num_bits` from `in_ptr`, and write to `out_ptr`. +/// Return the `in_ptr` where the starting offset points to the first byte after all the +/// bytes that were consumed. +// TODO: may be better to make these more compact using if-else conditions. +// However, this may require const generics: +// https://github.com/rust-lang/rust/issues/44580 +// to eliminate the branching cost. +// TODO: we should use SIMD instructions to further optimize this. I have explored +// https://github.com/tantivy-search/bitpacking +// but the layout it uses for SIMD is different from Parquet. +// TODO: support packing as well, which is used for encoding. 
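+// For example, with `num_bits == 1` the 32 outputs are the individual bits of a single
+// input word (out[i] = (in[0] >> i) & 1), while with `num_bits == 8` they are spread
+// over eight consecutive input words, four values per word.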
+pub unsafe fn unpack32(mut in_ptr: *const u32, out_ptr: *mut u32, num_bits: usize) -> *const u32 { + in_ptr = match num_bits { + 0 => nullunpacker32(in_ptr, out_ptr), + 1 => unpack1_32(in_ptr, out_ptr), + 2 => unpack2_32(in_ptr, out_ptr), + 3 => unpack3_32(in_ptr, out_ptr), + 4 => unpack4_32(in_ptr, out_ptr), + 5 => unpack5_32(in_ptr, out_ptr), + 6 => unpack6_32(in_ptr, out_ptr), + 7 => unpack7_32(in_ptr, out_ptr), + 8 => unpack8_32(in_ptr, out_ptr), + 9 => unpack9_32(in_ptr, out_ptr), + 10 => unpack10_32(in_ptr, out_ptr), + 11 => unpack11_32(in_ptr, out_ptr), + 12 => unpack12_32(in_ptr, out_ptr), + 13 => unpack13_32(in_ptr, out_ptr), + 14 => unpack14_32(in_ptr, out_ptr), + 15 => unpack15_32(in_ptr, out_ptr), + 16 => unpack16_32(in_ptr, out_ptr), + 17 => unpack17_32(in_ptr, out_ptr), + 18 => unpack18_32(in_ptr, out_ptr), + 19 => unpack19_32(in_ptr, out_ptr), + 20 => unpack20_32(in_ptr, out_ptr), + 21 => unpack21_32(in_ptr, out_ptr), + 22 => unpack22_32(in_ptr, out_ptr), + 23 => unpack23_32(in_ptr, out_ptr), + 24 => unpack24_32(in_ptr, out_ptr), + 25 => unpack25_32(in_ptr, out_ptr), + 26 => unpack26_32(in_ptr, out_ptr), + 27 => unpack27_32(in_ptr, out_ptr), + 28 => unpack28_32(in_ptr, out_ptr), + 29 => unpack29_32(in_ptr, out_ptr), + 30 => unpack30_32(in_ptr, out_ptr), + 31 => unpack31_32(in_ptr, out_ptr), + 32 => unpack32_32(in_ptr, out_ptr), + _ => unimplemented!(), + }; + in_ptr +} + +unsafe fn nullunpacker32(in_buf: *const u32, mut out: *mut u32) -> *const u32 { + for _ in 0..32 { + *out = 0; + out = out.offset(1); + } + in_buf +} + +unsafe fn unpack1_32(in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 1) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 2) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 3) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 4) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 5) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 6) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 7) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 8) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 9) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 10) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 11) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 12) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 13) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 14) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 15) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 16) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 17) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 18) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 19) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 20) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 21) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 22) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 23) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 24) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 25) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 26) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 27) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 28) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 29) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 30) & 1; + out = out.offset(1); + *out = (*in_buf) >> 31; + + in_buf.offset(1) +} + +unsafe fn unpack2_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 
2) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 4) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 6) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 10) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 22) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 26) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 28) % (1u32 << 2); + out = out.offset(1); + *out = (*in_buf) >> 30; + out = out.offset(1); + in_buf = in_buf.offset(1); + *out = ((*in_buf) >> 0) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 2) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 4) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 6) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 10) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 22) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 26) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 28) % (1u32 << 2); + out = out.offset(1); + *out = (*in_buf) >> 30; + + in_buf.offset(1) +} + +unsafe fn unpack3_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 3) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 6) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 9) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 15) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 21) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 27) % (1u32 << 3); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (3 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 4) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 7) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 10) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 13) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 19) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 22) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 25) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 28) % (1u32 << 3); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= 
((*in_buf) % (1u32 << 2)) << (3 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 5) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 11) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 17) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 23) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 26) % (1u32 << 3); + out = out.offset(1); + *out = (*in_buf) >> 29; + + in_buf.offset(1) +} + +unsafe fn unpack4_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 4) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 28) % (1u32 << 4); + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 4) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 28) % (1u32 << 4); + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 4) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 28) % (1u32 << 4); + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 4) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 28) % (1u32 << 4); + + in_buf.offset(1) +} + +unsafe fn unpack5_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 5) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 10) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 15) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 25) % (1u32 << 5); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (5 - 3); + out = out.offset(1); + + *out = 
((*in_buf) >> 3) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 13) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 23) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 28) % (1u32 << 5); + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (5 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 6) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 11) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 21) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 26) % (1u32 << 5); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (5 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 9) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 19) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 5); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (5 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 7) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 17) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 22) % (1u32 << 5); + out = out.offset(1); + *out = (*in_buf) >> 27; + + in_buf.offset(1) +} + +unsafe fn unpack6_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 6) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 6); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (6 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 10) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 22) % (1u32 << 6); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (6 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 6); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 6) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 6); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (6 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 6); + out 
= out.offset(1); + *out = ((*in_buf) >> 10) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 22) % (1u32 << 6); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (6 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 6); + out = out.offset(1); + *out = (*in_buf) >> 26; + + in_buf.offset(1) +} + +unsafe fn unpack7_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 7) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 21) % (1u32 << 7); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (7 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 10) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 17) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 7); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (7 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 13) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 7); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (7 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 9) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 23) % (1u32 << 7); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (7 - 5); + out = out.offset(1); + + *out = ((*in_buf) >> 5) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 19) % (1u32 << 7); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (7 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 15) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 22) % (1u32 << 7); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (7 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 11) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 7); + out = out.offset(1); + *out = (*in_buf) >> 25; + + in_buf.offset(1) +} + +unsafe fn unpack8_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 8); + out = out.offset(1); + *out = (*in_buf) >> 24; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 
<< 8); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 8); + out = out.offset(1); + *out = (*in_buf) >> 24; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 8); + out = out.offset(1); + *out = (*in_buf) >> 24; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 8); + out = out.offset(1); + *out = (*in_buf) >> 24; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 8); + out = out.offset(1); + *out = (*in_buf) >> 24; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 8); + out = out.offset(1); + *out = (*in_buf) >> 24; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 8); + out = out.offset(1); + *out = (*in_buf) >> 24; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 8); + out = out.offset(1); + *out = (*in_buf) >> 24; + + in_buf.offset(1) +} + +unsafe fn unpack9_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 9) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 9); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (9 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 13) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 22) % (1u32 << 9); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (9 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 17) % (1u32 << 9); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (9 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 21) % (1u32 << 9); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (9 - 7); + out = out.offset(1); + + *out = ((*in_buf) >> 7) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 9); + out = out.offset(1); + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (9 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 11) % (1u32 << 9); + out = out.offset(1); + *out = 
((*in_buf) >> 20) % (1u32 << 9); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (9 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 15) % (1u32 << 9); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (9 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 10) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 19) % (1u32 << 9); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (9 - 5); + out = out.offset(1); + + *out = ((*in_buf) >> 5) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 9); + out = out.offset(1); + *out = (*in_buf) >> 23; + + in_buf.offset(1) +} + +unsafe fn unpack10_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 10) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 10); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (10 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 10); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (10 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 10); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (10 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 10); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (10 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 10); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 10) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 10); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (10 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 10); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (10 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 10); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (10 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 10); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (10 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 
10); + out = out.offset(1); + *out = (*in_buf) >> 22; + + in_buf.offset(1) +} + +unsafe fn unpack11_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 11); + out = out.offset(1); + *out = ((*in_buf) >> 11) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (11 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 11); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (11 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 11); + out = out.offset(1); + *out = ((*in_buf) >> 13) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (11 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 11); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (11 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 11); + out = out.offset(1); + *out = ((*in_buf) >> 15) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (11 - 5); + out = out.offset(1); + + *out = ((*in_buf) >> 5) % (1u32 << 11); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (11 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 11); + out = out.offset(1); + *out = ((*in_buf) >> 17) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (11 - 7); + out = out.offset(1); + + *out = ((*in_buf) >> 7) % (1u32 << 11); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (11 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 11); + out = out.offset(1); + *out = ((*in_buf) >> 19) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (11 - 9); + out = out.offset(1); + + *out = ((*in_buf) >> 9) % (1u32 << 11); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (11 - 10); + out = out.offset(1); + + *out = ((*in_buf) >> 10) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 21; + + in_buf.offset(1) +} + +unsafe fn unpack12_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 12); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (12 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 12); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (12 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 12); + out = out.offset(1); + 
*out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 12); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (12 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 12); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (12 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 12); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (12 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 12); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (12 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 12); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (12 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 12); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (12 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 20; + + in_buf.offset(1) +} + +unsafe fn unpack13_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 13); + out = out.offset(1); + *out = ((*in_buf) >> 13) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (13 - 7); + out = out.offset(1); + + *out = ((*in_buf) >> 7) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (13 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 13); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (13 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 21; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (13 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 13); + out = out.offset(1); + *out = ((*in_buf) >> 15) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (13 - 9); + out = out.offset(1); + + *out = ((*in_buf) >> 9) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (13 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 13); + out = out.offset(1); 
+ *out = ((*in_buf) >> 16) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (13 - 10); + out = out.offset(1); + + *out = ((*in_buf) >> 10) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (13 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 13); + out = out.offset(1); + *out = ((*in_buf) >> 17) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 11)) << (13 - 11); + out = out.offset(1); + + *out = ((*in_buf) >> 11) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (13 - 5); + out = out.offset(1); + + *out = ((*in_buf) >> 5) % (1u32 << 13); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (13 - 12); + out = out.offset(1); + + *out = ((*in_buf) >> 12) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (13 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 19; + + in_buf.offset(1) +} + +unsafe fn unpack14_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 14); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (14 - 10); + out = out.offset(1); + + *out = ((*in_buf) >> 10) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (14 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (14 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 14); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (14 - 12); + out = out.offset(1); + + *out = ((*in_buf) >> 12) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (14 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (14 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 14); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (14 - 10); + out = out.offset(1); + + *out = ((*in_buf) >> 10) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (14 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % 
(1u32 << 2)) << (14 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 14); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (14 - 12); + out = out.offset(1); + + *out = ((*in_buf) >> 12) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (14 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (14 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 18; + + in_buf.offset(1) +} + +unsafe fn unpack15_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 15); + out = out.offset(1); + *out = ((*in_buf) >> 15) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 13)) << (15 - 13); + out = out.offset(1); + + *out = ((*in_buf) >> 13) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 11)) << (15 - 11); + out = out.offset(1); + + *out = ((*in_buf) >> 11) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (15 - 9); + out = out.offset(1); + + *out = ((*in_buf) >> 9) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (15 - 7); + out = out.offset(1); + + *out = ((*in_buf) >> 7) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (15 - 5); + out = out.offset(1); + + *out = ((*in_buf) >> 5) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (15 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (15 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 15); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (15 - 14); + out = out.offset(1); + + *out = ((*in_buf) >> 14) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (15 - 12); + out = out.offset(1); + + *out = ((*in_buf) >> 12) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (15 - 10); + out = out.offset(1); + + *out = ((*in_buf) >> 10) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (15 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (15 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 21; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (15 - 4); + out = 
out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 19; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (15 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 17; + + in_buf.offset(1) +} + +unsafe fn unpack16_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + + in_buf.offset(1) +} + +unsafe fn unpack17_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 17; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (17 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 19; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (17 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 21; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (17 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (17 - 8); + out = 
out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (17 - 10); + out = out.offset(1); + + *out = ((*in_buf) >> 10) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (17 - 12); + out = out.offset(1); + + *out = ((*in_buf) >> 12) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (17 - 14); + out = out.offset(1); + + *out = ((*in_buf) >> 14) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (17 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (17 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (17 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (17 - 5); + out = out.offset(1); + + *out = ((*in_buf) >> 5) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (17 - 7); + out = out.offset(1); + + *out = ((*in_buf) >> 7) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (17 - 9); + out = out.offset(1); + + *out = ((*in_buf) >> 9) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 11)) << (17 - 11); + out = out.offset(1); + + *out = ((*in_buf) >> 11) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 13)) << (17 - 13); + out = out.offset(1); + + *out = ((*in_buf) >> 13) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 15)) << (17 - 15); + out = out.offset(1); + + *out = (*in_buf) >> 15; + + in_buf.offset(1) +} + +unsafe fn unpack18_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (18 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (18 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (18 - 12); + out = out.offset(1); + + *out = ((*in_buf) >> 12) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (18 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (18 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (18 - 6); + out = out.offset(1); + + *out = 
((*in_buf) >> 6) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (18 - 10); + out = out.offset(1); + + *out = ((*in_buf) >> 10) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (18 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (18 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (18 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (18 - 12); + out = out.offset(1); + + *out = ((*in_buf) >> 12) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (18 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (18 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (18 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (18 - 10); + out = out.offset(1); + + *out = ((*in_buf) >> 10) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (18 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + + in_buf.offset(1) +} + +unsafe fn unpack19_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 19; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (19 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (19 - 12); + out = out.offset(1); + + *out = ((*in_buf) >> 12) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (19 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (19 - 5); + out = out.offset(1); + + *out = ((*in_buf) >> 5) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 11)) << (19 - 11); + out = out.offset(1); + + *out = ((*in_buf) >> 11) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 17)) << (19 - 17); + out = out.offset(1); + + *out = (*in_buf) >> 17; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (19 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (19 - 10); + out = out.offset(1); + + *out = ((*in_buf) >> 
10) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (19 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (19 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (19 - 9); + out = out.offset(1); + + *out = ((*in_buf) >> 9) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 15)) << (19 - 15); + out = out.offset(1); + + *out = (*in_buf) >> 15; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (19 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 21; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (19 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (19 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (19 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (19 - 7); + out = out.offset(1); + + *out = ((*in_buf) >> 7) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 13)) << (19 - 13); + out = out.offset(1); + + *out = (*in_buf) >> 13; + + in_buf.offset(1) +} + +unsafe fn unpack20_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (20 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (20 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (20 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (20 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (20 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (20 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (20 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (20 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf 
= in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (20 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (20 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (20 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (20 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (20 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (20 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (20 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (20 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + + in_buf.offset(1) +} + +unsafe fn unpack21_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 21; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (21 - 10); + out = out.offset(1); + + *out = ((*in_buf) >> 10) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (21 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (21 - 9); + out = out.offset(1); + + *out = ((*in_buf) >> 9) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 19)) << (21 - 19); + out = out.offset(1); + + *out = (*in_buf) >> 19; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (21 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (21 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (21 - 7); + out = out.offset(1); + + *out = ((*in_buf) >> 7) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 17)) << (21 - 17); + out = out.offset(1); + + *out = (*in_buf) >> 17; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (21 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (21 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (21 - 5); + out = out.offset(1); + + *out = ((*in_buf) >> 5) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 15)) << (21 - 15); + out = 
out.offset(1); + + *out = (*in_buf) >> 15; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (21 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (21 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (21 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 13)) << (21 - 13); + out = out.offset(1); + + *out = (*in_buf) >> 13; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (21 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (21 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (21 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 11)) << (21 - 11); + out = out.offset(1); + + *out = (*in_buf) >> 11; + + in_buf.offset(1) +} + +unsafe fn unpack22_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 22); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (22 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (22 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 22); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (22 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (22 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 22); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (22 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (22 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 22); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (22 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (22 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 22); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (22 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (22 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 22); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (22 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (22 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 22); + out = out.offset(1); + *out 
= (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (22 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (22 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 22); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (22 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (22 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 22); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (22 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (22 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 22); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (22 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (22 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + + in_buf.offset(1) +} + +unsafe fn unpack23_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 23); + out = out.offset(1); + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (23 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (23 - 5); + out = out.offset(1); + + *out = ((*in_buf) >> 5) % (1u32 << 23); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 19)) << (23 - 19); + out = out.offset(1); + + *out = (*in_buf) >> 19; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (23 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (23 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 23); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 15)) << (23 - 15); + out = out.offset(1); + + *out = (*in_buf) >> 15; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (23 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 23); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (23 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 11)) << (23 - 11); + out = out.offset(1); + + *out = (*in_buf) >> 11; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (23 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 23); + out = out.offset(1); + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (23 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (23 - 7); + out = out.offset(1); + + *out = ((*in_buf) >> 7) % (1u32 << 23); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 21)) << (23 - 21); + out = out.offset(1); + + *out = (*in_buf) >> 21; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (23 - 12); + 
out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (23 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 23); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 17)) << (23 - 17); + out = out.offset(1); + + *out = (*in_buf) >> 17; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (23 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 23); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 22)) << (23 - 22); + out = out.offset(1); + + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 13)) << (23 - 13); + out = out.offset(1); + + *out = (*in_buf) >> 13; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (23 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 23); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (23 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (23 - 9); + out = out.offset(1); + + *out = (*in_buf) >> 9; + + in_buf.offset(1) +} + +unsafe fn unpack24_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 24); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (24 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (24 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 24); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (24 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (24 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 24); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (24 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (24 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 24); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (24 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (24 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 24); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (24 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (24 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 24); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % 
(1u32 << 16)) << (24 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (24 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 24); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (24 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (24 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 24); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (24 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (24 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + + in_buf.offset(1) +} + +unsafe fn unpack25_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 25); + out = out.offset(1); + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (25 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 11)) << (25 - 11); + out = out.offset(1); + + *out = (*in_buf) >> 11; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (25 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 25); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 22)) << (25 - 22); + out = out.offset(1); + + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 15)) << (25 - 15); + out = out.offset(1); + + *out = (*in_buf) >> 15; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (25 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (25 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 25); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 19)) << (25 - 19); + out = out.offset(1); + + *out = (*in_buf) >> 19; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (25 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (25 - 5); + out = out.offset(1); + + *out = ((*in_buf) >> 5) % (1u32 << 25); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 23)) << (25 - 23); + out = out.offset(1); + + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (25 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (25 - 9); + out = out.offset(1); + + *out = (*in_buf) >> 9; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (25 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 25); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (25 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 13)) << (25 - 13); + out = out.offset(1); + + *out = (*in_buf) >> 
13; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (25 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 25); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (25 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 17)) << (25 - 17); + out = out.offset(1); + + *out = (*in_buf) >> 17; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (25 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (25 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 25); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 21)) << (25 - 21); + out = out.offset(1); + + *out = (*in_buf) >> 21; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (25 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (25 - 7); + out = out.offset(1); + + *out = (*in_buf) >> 7; + + in_buf.offset(1) +} + +unsafe fn unpack26_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 26); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (26 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (26 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (26 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (26 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 26); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 22)) << (26 - 22); + out = out.offset(1); + + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (26 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (26 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (26 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 26); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (26 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (26 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (26 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (26 - 6); + out = out.offset(1); + + *out = (*in_buf) >> 6; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 26); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (26 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (26 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (26 - 8); + 
out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (26 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 26); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 22)) << (26 - 22); + out = out.offset(1); + + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (26 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (26 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (26 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 26); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (26 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (26 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (26 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (26 - 6); + out = out.offset(1); + + *out = (*in_buf) >> 6; + + in_buf.offset(1) +} + +unsafe fn unpack27_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 27); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 22)) << (27 - 22); + out = out.offset(1); + + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 17)) << (27 - 17); + out = out.offset(1); + + *out = (*in_buf) >> 17; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (27 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (27 - 7); + out = out.offset(1); + + *out = (*in_buf) >> 7; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (27 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 27); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (27 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 19)) << (27 - 19); + out = out.offset(1); + + *out = (*in_buf) >> 19; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (27 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (27 - 9); + out = out.offset(1); + + *out = (*in_buf) >> 9; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (27 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 27); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 26)) << (27 - 26); + out = out.offset(1); + + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 21)) << (27 - 21); + out = out.offset(1); + + *out = (*in_buf) >> 21; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (27 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 11)) << (27 - 11); + out = out.offset(1); + + *out = (*in_buf) >> 11; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % 
(1u32 << 6)) << (27 - 6); + out = out.offset(1); + + *out = (*in_buf) >> 6; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (27 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 27); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 23)) << (27 - 23); + out = out.offset(1); + + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (27 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 13)) << (27 - 13); + out = out.offset(1); + + *out = (*in_buf) >> 13; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (27 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (27 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 27); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 25)) << (27 - 25); + out = out.offset(1); + + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (27 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 15)) << (27 - 15); + out = out.offset(1); + + *out = (*in_buf) >> 15; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (27 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (27 - 5); + out = out.offset(1); + + *out = (*in_buf) >> 5; + + in_buf.offset(1) +} + +unsafe fn unpack28_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 28); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (28 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (28 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (28 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (28 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (28 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (28 - 4); + out = out.offset(1); + + *out = (*in_buf) >> 4; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 28); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (28 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (28 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (28 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (28 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (28 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (28 - 4); + out = out.offset(1); + + *out = (*in_buf) >> 4; + in_buf = 
in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 28); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (28 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (28 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (28 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (28 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (28 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (28 - 4); + out = out.offset(1); + + *out = (*in_buf) >> 4; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 28); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (28 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (28 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (28 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (28 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (28 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (28 - 4); + out = out.offset(1); + + *out = (*in_buf) >> 4; + + in_buf.offset(1) +} + +unsafe fn unpack29_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 29); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 26)) << (29 - 26); + out = out.offset(1); + + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 23)) << (29 - 23); + out = out.offset(1); + + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (29 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 17)) << (29 - 17); + out = out.offset(1); + + *out = (*in_buf) >> 17; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (29 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 11)) << (29 - 11); + out = out.offset(1); + + *out = (*in_buf) >> 11; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (29 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (29 - 5); + out = out.offset(1); + + *out = (*in_buf) >> 5; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (29 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 29); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 28)) << (29 - 28); + out = out.offset(1); + + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 25)) << (29 - 25); + out = out.offset(1); + + *out = (*in_buf) >> 25; + in_buf = 
in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 22)) << (29 - 22); + out = out.offset(1); + + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 19)) << (29 - 19); + out = out.offset(1); + + *out = (*in_buf) >> 19; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (29 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 13)) << (29 - 13); + out = out.offset(1); + + *out = (*in_buf) >> 13; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (29 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (29 - 7); + out = out.offset(1); + + *out = (*in_buf) >> 7; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (29 - 4); + out = out.offset(1); + + *out = (*in_buf) >> 4; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (29 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 29); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 27)) << (29 - 27); + out = out.offset(1); + + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (29 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 21)) << (29 - 21); + out = out.offset(1); + + *out = (*in_buf) >> 21; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (29 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 15)) << (29 - 15); + out = out.offset(1); + + *out = (*in_buf) >> 15; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (29 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (29 - 9); + out = out.offset(1); + + *out = (*in_buf) >> 9; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (29 - 6); + out = out.offset(1); + + *out = (*in_buf) >> 6; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (29 - 3); + out = out.offset(1); + + *out = (*in_buf) >> 3; + + in_buf.offset(1) +} + +unsafe fn unpack30_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 30); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 28)) << (30 - 28); + out = out.offset(1); + + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 26)) << (30 - 26); + out = out.offset(1); + + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (30 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 22)) << (30 - 22); + out = out.offset(1); + + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (30 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (30 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (30 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (30 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= 
((*in_buf) % (1u32 << 12)) << (30 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (30 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (30 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (30 - 6); + out = out.offset(1); + + *out = (*in_buf) >> 6; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (30 - 4); + out = out.offset(1); + + *out = (*in_buf) >> 4; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (30 - 2); + out = out.offset(1); + + *out = (*in_buf) >> 2; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 30); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 28)) << (30 - 28); + out = out.offset(1); + + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 26)) << (30 - 26); + out = out.offset(1); + + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (30 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 22)) << (30 - 22); + out = out.offset(1); + + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (30 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (30 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (30 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (30 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (30 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (30 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (30 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (30 - 6); + out = out.offset(1); + + *out = (*in_buf) >> 6; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (30 - 4); + out = out.offset(1); + + *out = (*in_buf) >> 4; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (30 - 2); + out = out.offset(1); + + *out = (*in_buf) >> 2; + + in_buf.offset(1) +} + +unsafe fn unpack31_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 31); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 30)) << (31 - 30); + out = out.offset(1); + + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 29)) << (31 - 29); + out = out.offset(1); + + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 28)) << (31 - 28); + out = out.offset(1); + + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 27)) << (31 - 27); + out = out.offset(1); + + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 26)) << (31 - 26); + out = out.offset(1); + + *out = (*in_buf) >> 
26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 25)) << (31 - 25); + out = out.offset(1); + + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (31 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 23)) << (31 - 23); + out = out.offset(1); + + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 22)) << (31 - 22); + out = out.offset(1); + + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 21)) << (31 - 21); + out = out.offset(1); + + *out = (*in_buf) >> 21; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (31 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 19)) << (31 - 19); + out = out.offset(1); + + *out = (*in_buf) >> 19; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (31 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 17)) << (31 - 17); + out = out.offset(1); + + *out = (*in_buf) >> 17; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (31 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 15)) << (31 - 15); + out = out.offset(1); + + *out = (*in_buf) >> 15; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (31 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 13)) << (31 - 13); + out = out.offset(1); + + *out = (*in_buf) >> 13; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (31 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 11)) << (31 - 11); + out = out.offset(1); + + *out = (*in_buf) >> 11; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (31 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (31 - 9); + out = out.offset(1); + + *out = (*in_buf) >> 9; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (31 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (31 - 7); + out = out.offset(1); + + *out = (*in_buf) >> 7; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (31 - 6); + out = out.offset(1); + + *out = (*in_buf) >> 6; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (31 - 5); + out = out.offset(1); + + *out = (*in_buf) >> 5; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (31 - 4); + out = out.offset(1); + + *out = (*in_buf) >> 4; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (31 - 3); + out = out.offset(1); + + *out = (*in_buf) >> 3; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (31 - 2); + out = out.offset(1); + + *out = (*in_buf) >> 2; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (31 - 1); + out = out.offset(1); + + *out = (*in_buf) >> 1; + + in_buf.offset(1) +} + +unsafe fn unpack32_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = 
in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + + in_buf.offset(1) +} diff --git a/rust/src/parquet/util/bit_util.rs b/rust/src/parquet/util/bit_util.rs new file mode 100644 index 0000000000000..9dbb9a32333d2 --- /dev/null +++ b/rust/src/parquet/util/bit_util.rs @@ -0,0 +1,1058 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
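
The hand-unrolled `unpack{N}_32` routines in `bit_packing.rs` above each decode 32 values of a fixed bit width from a little-endian packed `u32` stream. For readers tracing the shift-and-mask arithmetic, here is a minimal, non-unrolled sketch of the same computation; the function name and signature are illustrative only and are not part of this patch.

```rust
// Illustrative scalar equivalent of the generated unpack{N}_32 routines:
// extract 32 values of `num_bits` width from a packed little-endian u32 stream.
fn unpack_32_scalar(input: &[u32], num_bits: usize, out: &mut [u32; 32]) {
    assert!(num_bits > 0 && num_bits <= 32);
    // 32 values of `num_bits` bits occupy exactly `num_bits` u32 words.
    assert!(input.len() >= num_bits);
    let mask = if num_bits == 32 { u32::MAX } else { (1u32 << num_bits) - 1 };
    let mut word = 0usize;   // index of the current input word
    let mut offset = 0usize; // bit offset within the current word
    for v in out.iter_mut() {
        let lo = input[word] >> offset;
        *v = if offset + num_bits <= 32 {
            lo & mask
        } else {
            // value straddles two words: take the remaining bits from the next word
            let hi = input[word + 1] << (32 - offset);
            (lo | hi) & mask
        };
        offset += num_bits;
        if offset >= 32 {
            offset -= 32;
            word += 1;
        }
    }
}
```

The generated versions avoid the per-value branching and offset bookkeeping shown here, which is why one specialized routine is emitted per bit width.
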
+ +use std::{ + cmp, + mem::{size_of, transmute_copy}, +}; + +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::util::{bit_packing::unpack32, memory::ByteBufferPtr}; + +/// Reads `$size` of bytes from `$src`, and reinterprets them as type `$ty`, in +/// little-endian order. `$ty` must implement the `Default` trait. Otherwise this won't +/// compile. +/// This is copied and modified from byteorder crate. +macro_rules! read_num_bytes { + ($ty:ty, $size:expr, $src:expr) => {{ + assert!($size <= $src.len()); + let mut data: $ty = Default::default(); + unsafe { + ::std::ptr::copy_nonoverlapping($src.as_ptr(), &mut data as *mut $ty as *mut u8, $size); + } + data + }}; +} + +/// Converts value `val` of type `T` to a byte vector, by reading `num_bytes` from `val`. +/// NOTE: if `val` is less than the size of `T` then it can be truncated. +#[inline] +pub fn convert_to_bytes(val: &T, num_bytes: usize) -> Vec { + let mut bytes: Vec = vec![0; num_bytes]; + memcpy_value(val, num_bytes, &mut bytes); + bytes +} + +#[inline] +pub fn memcpy(source: &[u8], target: &mut [u8]) { + assert!(target.len() >= source.len()); + unsafe { ::std::ptr::copy_nonoverlapping(source.as_ptr(), target.as_mut_ptr(), source.len()) } +} + +#[inline] +pub fn memcpy_value(source: &T, num_bytes: usize, target: &mut [u8]) { + assert!( + target.len() >= num_bytes, + "Not enough space. Only had {} bytes but need to put {} bytes", + target.len(), + num_bytes + ); + unsafe { + ::std::ptr::copy_nonoverlapping( + source as *const T as *const u8, + target.as_mut_ptr(), + num_bytes, + ) + } +} + +/// Returns the ceil of value/divisor +#[inline] +pub fn ceil(value: i64, divisor: i64) -> i64 { + let mut result = value / divisor; + if value % divisor != 0 { + result += 1 + }; + result +} + +/// Returns ceil(log2(x)) +#[inline] +pub fn log2(mut x: u64) -> i32 { + if x == 1 { + return 0; + } + x -= 1; + let mut result = 0; + while x > 0 { + x >>= 1; + result += 1; + } + result +} + +/// Returns the `num_bits` least-significant bits of `v` +#[inline] +pub fn trailing_bits(v: u64, num_bits: usize) -> u64 { + if num_bits == 0 { + return 0; + } + if num_bits >= 64 { + return v; + } + let n = 64 - num_bits; + (v << n) >> n +} + +#[inline] +pub fn set_array_bit(bits: &mut [u8], i: usize) { + bits[i / 8] |= 1 << (i % 8); +} + +#[inline] +pub fn unset_array_bit(bits: &mut [u8], i: usize) { + bits[i / 8] &= !(1 << (i % 8)); +} + +/// Returns the minimum number of bits needed to represent the value 'x' +#[inline] +pub fn num_required_bits(x: u64) -> usize { + for i in (0..64).rev() { + if x & (1u64 << i) != 0 { + return i + 1; + } + } + 0 +} + +/// Utility class for writing bit/byte streams. This class can write data in either +/// bit packed or byte aligned fashion. +pub struct BitWriter { + buffer: Vec, + max_bytes: usize, + buffered_values: u64, + byte_offset: usize, + bit_offset: usize, + start: usize, +} + +impl BitWriter { + pub fn new(max_bytes: usize) -> Self { + Self { + buffer: vec![0; max_bytes], + max_bytes, + buffered_values: 0, + byte_offset: 0, + bit_offset: 0, + start: 0, + } + } + + /// Initializes the writer from the existing buffer `buffer` and starting + /// offset `start`. + pub fn new_from_buf(buffer: Vec, start: usize) -> Self { + assert!(start < buffer.len()); + let len = buffer.len(); + Self { + buffer, + max_bytes: len, + buffered_values: 0, + byte_offset: start, + bit_offset: 0, + start, + } + } + + /// Consumes and returns the current buffer. 
+ #[inline] + pub fn consume(mut self) -> Vec { + self.flush(); + self.buffer.truncate(self.byte_offset); + self.buffer + } + + /// Flushes the internal buffered bits and returns the buffer's content. + /// This is a borrow equivalent of `consume` method. + #[inline] + pub fn flush_buffer(&mut self) -> &[u8] { + self.flush(); + &self.buffer()[0..self.byte_offset] + } + + /// Clears the internal state so the buffer can be reused. + #[inline] + pub fn clear(&mut self) { + self.buffered_values = 0; + self.byte_offset = self.start; + self.bit_offset = 0; + } + + /// Flushes the internal buffered bits and the align the buffer to the next byte. + #[inline] + pub fn flush(&mut self) { + let num_bytes = ceil(self.bit_offset as i64, 8) as usize; + assert!(self.byte_offset + num_bytes <= self.max_bytes); + memcpy_value( + &self.buffered_values, + num_bytes, + &mut self.buffer[self.byte_offset..], + ); + self.buffered_values = 0; + self.bit_offset = 0; + self.byte_offset += num_bytes; + } + + /// Advances the current offset by skipping `num_bytes`, flushing the internal bit + /// buffer first. + /// This is useful when you want to jump over `num_bytes` bytes and come back later + /// to fill these bytes. + /// + /// Returns error if `num_bytes` is beyond the boundary of the internal buffer. + /// Otherwise, returns the old offset. + #[inline] + pub fn skip(&mut self, num_bytes: usize) -> Result { + self.flush(); + assert!(self.byte_offset <= self.max_bytes); + if self.byte_offset + num_bytes > self.max_bytes { + return Err(general_err!( + "Not enough bytes left in BitWriter. Need {} but only have {}", + self.byte_offset + num_bytes, + self.max_bytes + )); + } + let result = self.byte_offset; + self.byte_offset += num_bytes; + Ok(result) + } + + /// Returns a slice containing the next `num_bytes` bytes starting from the current + /// offset, and advances the underlying buffer by `num_bytes`. + /// This is useful when you want to jump over `num_bytes` bytes and come back later + /// to fill these bytes. + #[inline] + pub fn get_next_byte_ptr(&mut self, num_bytes: usize) -> Result<&mut [u8]> { + let offset = self.skip(num_bytes)?; + Ok(&mut self.buffer[offset..offset + num_bytes]) + } + + #[inline] + pub fn bytes_written(&self) -> usize { + self.byte_offset - self.start + ceil(self.bit_offset as i64, 8) as usize + } + + #[inline] + pub fn buffer(&self) -> &[u8] { + &self.buffer[self.start..] + } + + #[inline] + pub fn byte_offset(&self) -> usize { + self.byte_offset + } + + /// Returns the internal buffer length. This is the maximum number of bytes that this + /// writer can write. User needs to call `consume` to consume the current buffer before + /// more data can be written. + #[inline] + pub fn buffer_len(&self) -> usize { + self.max_bytes + } + + /// Writes the `num_bits` LSB of value `v` to the internal buffer of this writer. + /// The `num_bits` must not be greater than 64. This is bit packed. + /// + /// Returns false if there's not enough room left. True otherwise. 
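
As a worked illustration of the bit packing that `put_value` (declared immediately below) performs, here is a tiny standalone sketch for 3-bit values. It ignores the 64-bit spill handling and the output buffer; the helper name is hypothetical and not part of this patch.

```rust
// Illustrative only: OR each value into an accumulator at the running bit
// offset, then emit the used bytes little-endian. put_value additionally
// spills the accumulator to its internal buffer once 64 bits are collected.
fn pack_3bit(values: &[u64]) -> Vec<u8> {
    assert!(values.len() * 3 <= 64); // this sketch keeps everything in one accumulator
    let mut acc: u64 = 0;
    let mut bits = 0usize;
    for &v in values {
        acc |= (v & 0b111) << bits; // keep the 3 LSBs (put_value instead asserts the high bits are zero)
        bits += 3;
    }
    acc.to_le_bytes()[..(bits + 7) / 8].to_vec()
}

// pack_3bit(&[1, 2, 3]) packs 9 bits into two bytes: [0b1101_0001, 0b0000_0000]
```
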
+ #[inline] + pub fn put_value(&mut self, v: u64, num_bits: usize) -> bool { + assert!(num_bits <= 64); + assert_eq!(v.checked_shr(num_bits as u32).unwrap_or(0), 0); // covers case v >> 64 + + if self.byte_offset * 8 + self.bit_offset + num_bits > self.max_bytes as usize * 8 { + return false; + } + + self.buffered_values |= v << self.bit_offset; + self.bit_offset += num_bits; + if self.bit_offset >= 64 { + memcpy_value( + &self.buffered_values, + 8, + &mut self.buffer[self.byte_offset..], + ); + self.byte_offset += 8; + self.bit_offset -= 64; + self.buffered_values = 0; + // Perform checked right shift: v >> offset, where offset < 64, otherwise we shift + // all bits + self.buffered_values = v + .checked_shr((num_bits - self.bit_offset) as u32) + .unwrap_or(0); + } + assert!(self.bit_offset < 64); + true + } + + /// Writes `val` of `num_bytes` bytes to the next aligned byte. If size of `T` is + /// larger than `num_bytes`, extra higher ordered bytes will be ignored. + /// + /// Returns false if there's not enough room left. True otherwise. + #[inline] + pub fn put_aligned(&mut self, val: T, num_bytes: usize) -> bool { + let result = self.get_next_byte_ptr(num_bytes); + if result.is_err() { + // TODO: should we return `Result` for this func? + return false; + } + let mut ptr = result.unwrap(); + memcpy_value(&val, num_bytes, &mut ptr); + true + } + + /// Writes `val` of `num_bytes` bytes at the designated `offset`. The `offset` is the + /// offset starting from the beginning of the internal buffer that this writer + /// maintains. Note that this will overwrite any existing data between `offset` and + /// `offset + num_bytes`. Also that if size of `T` is larger than `num_bytes`, extra + /// higher ordered bytes will be ignored. + /// + /// Returns false if there's not enough room left, or the `pos` is not valid. + /// True otherwise. + #[inline] + pub fn put_aligned_offset(&mut self, val: T, num_bytes: usize, offset: usize) -> bool { + if num_bytes + offset > self.max_bytes { + return false; + } + memcpy_value( + &val, + num_bytes, + &mut self.buffer[offset..offset + num_bytes], + ); + true + } + + /// Writes a VLQ encoded integer `v` to this buffer. The value is byte aligned. + /// + /// Returns false if there's not enough room left. True otherwise. + #[inline] + pub fn put_vlq_int(&mut self, mut v: u64) -> bool { + let mut result = true; + while v & 0xFFFFFFFFFFFFFF80 != 0 { + result &= self.put_aligned::(((v & 0x7F) | 0x80) as u8, 1); + v >>= 7; + } + result &= self.put_aligned::((v & 0x7F) as u8, 1); + result + } + + /// Writes a zigzag-VLQ encoded (in little endian order) int `v` to this buffer. + /// Zigzag-VLQ is a variant of VLQ encoding where negative and positive + /// numbers are encoded in a zigzag fashion. + /// See: https://developers.google.com/protocol-buffers/docs/encoding + /// + /// Returns false if there's not enough room left. True otherwise. + #[inline] + pub fn put_zigzag_vlq_int(&mut self, v: i64) -> bool { + let u: u64 = ((v << 1) ^ (v >> 63)) as u64; + self.put_vlq_int(u) + } +} + +/// Maximum byte length for a VLQ encoded integer +/// MAX_VLQ_BYTE_LEN = 5 for i32, and MAX_VLQ_BYTE_LEN = 10 for i64 +pub const MAX_VLQ_BYTE_LEN: usize = 10; + +pub struct BitReader { + // The byte buffer to read from, passed in by client + buffer: ByteBufferPtr, + + // Bytes are memcpy'd from `buffer` and values are read from this variable. 
+ // This is faster than reading values byte by byte directly from `buffer` + buffered_values: u64, + + // + // End Start + // |............|B|B|B|B|B|B|B|B|..............| + // ^ ^ + // bit_offset byte_offset + // + // Current byte offset in `buffer` + byte_offset: usize, + + // Current bit offset in `buffered_values` + bit_offset: usize, + + // Total number of bytes in `buffer` + total_bytes: usize, +} + +/// Utility class to read bit/byte stream. This class can read bits or bytes that are +/// either byte aligned or not. +impl BitReader { + pub fn new(buffer: ByteBufferPtr) -> Self { + let total_bytes = buffer.len(); + let num_bytes = cmp::min(8, total_bytes); + let buffered_values = read_num_bytes!(u64, num_bytes, buffer.as_ref()); + BitReader { + buffer, + buffered_values, + byte_offset: 0, + bit_offset: 0, + total_bytes, + } + } + + #[inline] + pub fn reset(&mut self, buffer: ByteBufferPtr) { + self.buffer = buffer; + self.total_bytes = self.buffer.len(); + let num_bytes = cmp::min(8, self.total_bytes); + self.buffered_values = read_num_bytes!(u64, num_bytes, self.buffer.as_ref()); + self.byte_offset = 0; + self.bit_offset = 0; + } + + /// Gets the current byte offset + #[inline] + pub fn get_byte_offset(&self) -> usize { + self.byte_offset + ceil(self.bit_offset as i64, 8) as usize + } + + /// Reads a value of type `T` and of size `num_bits`. + /// + /// Returns `None` if there's not enough data available. `Some` otherwise. + #[inline] + pub fn get_value(&mut self, num_bits: usize) -> Option { + assert!(num_bits <= 64); + assert!(num_bits <= size_of::() * 8); + + if self.byte_offset * 8 + self.bit_offset + num_bits > self.total_bytes * 8 { + return None; + } + + let mut v = + trailing_bits(self.buffered_values, self.bit_offset + num_bits) >> self.bit_offset; + self.bit_offset += num_bits; + + if self.bit_offset >= 64 { + self.byte_offset += 8; + self.bit_offset -= 64; + + self.reload_buffer_values(); + v |= trailing_bits(self.buffered_values, self.bit_offset) + .wrapping_shl((num_bits - self.bit_offset) as u32); + } + + // TODO: better to avoid copying here + let result: T = unsafe { transmute_copy::(&v) }; + Some(result) + } + + #[inline] + pub fn get_batch(&mut self, batch: &mut [T], num_bits: usize) -> usize { + assert!(num_bits <= 32); + assert!(num_bits <= size_of::() * 8); + + let mut values_to_read = batch.len(); + let needed_bits = num_bits * values_to_read; + let remaining_bits = (self.total_bytes - self.byte_offset) * 8 - self.bit_offset; + if remaining_bits < needed_bits { + values_to_read = remaining_bits / num_bits; + } + + let mut i = 0; + + // First align bit offset to byte offset + if self.bit_offset != 0 { + while i < values_to_read && self.bit_offset != 0 { + batch[i] = self + .get_value(num_bits) + .expect("expected to have more data"); + i += 1; + } + } + + unsafe { + let in_buf = &self.buffer.data()[self.byte_offset..]; + let mut in_ptr = in_buf as *const [u8] as *const u8 as *const u32; + if size_of::() == 4 { + while values_to_read - i >= 32 { + let out_ptr = &mut batch[i..] as *mut [T] as *mut T as *mut u32; + in_ptr = unpack32(in_ptr, out_ptr, num_bits); + self.byte_offset += 4 * num_bits; + i += 32; + } + } else { + let mut out_buf = [0u32; 32]; + let out_ptr = &mut out_buf as &mut [u32] as *mut [u32] as *mut u32; + while values_to_read - i >= 32 { + in_ptr = unpack32(in_ptr, out_ptr, num_bits); + self.byte_offset += 4 * num_bits; + for n in 0..32 { + // We need to copy from smaller size to bigger size to avoid overwritting + // other memory regions. 
+ if size_of::() > size_of::() { + ::std::ptr::copy_nonoverlapping( + out_buf[n..].as_ptr() as *const u32, + &mut batch[i] as *mut T as *mut u32, + 1, + ); + } else { + ::std::ptr::copy_nonoverlapping( + out_buf[n..].as_ptr() as *const T, + &mut batch[i] as *mut T, + 1, + ); + } + i += 1; + } + } + } + } + + assert!(values_to_read - i < 32); + + self.reload_buffer_values(); + while i < values_to_read { + batch[i] = self + .get_value(num_bits) + .expect("expected to have more data"); + i += 1; + } + + values_to_read + } + + /// Reads a `num_bytes`-sized value from this buffer and return it. + /// `T` needs to be a little-endian native type. The value is assumed to be byte + /// aligned so the bit reader will be advanced to the start of the next byte before + /// reading the value. + + /// Returns `Some` if there's enough bytes left to form a value of `T`. + /// Otherwise `None`. + #[inline] + pub fn get_aligned(&mut self, num_bytes: usize) -> Option { + let bytes_read = ceil(self.bit_offset as i64, 8) as usize; + if self.byte_offset + bytes_read + num_bytes > self.total_bytes { + return None; + } + + // Advance byte_offset to next unread byte and read num_bytes + self.byte_offset += bytes_read; + let v = read_num_bytes!( + T, + num_bytes, + self.buffer.start_from(self.byte_offset).as_ref() + ); + self.byte_offset += num_bytes; + + // Reset buffered_values + self.bit_offset = 0; + self.reload_buffer_values(); + Some(v) + } + + /// Reads a VLQ encoded (in little endian order) int from the stream. + /// The encoded int must start at the beginning of a byte. + /// + /// Returns `None` if there's not enough bytes in the stream. `Some` otherwise. + #[inline] + pub fn get_vlq_int(&mut self) -> Option { + let mut shift = 0; + let mut v: i64 = 0; + while let Some(byte) = self.get_aligned::(1) { + v |= ((byte & 0x7F) as i64) << shift; + shift += 7; + assert!( + shift <= MAX_VLQ_BYTE_LEN * 7, + "Num of bytes exceed MAX_VLQ_BYTE_LEN ({})", + MAX_VLQ_BYTE_LEN + ); + if byte & 0x80 == 0 { + return Some(v); + } + } + None + } + + /// Reads a zigzag-VLQ encoded (in little endian order) int from the stream + /// Zigzag-VLQ is a variant of VLQ encoding where negative and positive numbers are + /// encoded in a zigzag fashion. + /// See: https://developers.google.com/protocol-buffers/docs/encoding + /// + /// Note: the encoded int must start at the beginning of a byte. + /// + /// Returns `None` if the number of bytes there's not enough bytes in the stream. + /// `Some` otherwise. 
+ #[inline] + pub fn get_zigzag_vlq_int(&mut self) -> Option { + self.get_vlq_int().map(|v| { + let u = v as u64; + ((u >> 1) as i64 ^ -((u & 1) as i64)) + }) + } + + #[inline] + fn reload_buffer_values(&mut self) { + let bytes_to_read = cmp::min(self.total_bytes - self.byte_offset, 8); + self.buffered_values = read_num_bytes!( + u64, + bytes_to_read, + self.buffer.start_from(self.byte_offset).as_ref() + ); + } +} + +impl From> for BitReader { + #[inline] + fn from(buffer: Vec) -> Self { + BitReader::new(ByteBufferPtr::new(buffer)) + } +} + +#[cfg(test)] +mod tests { + use super::super::test_common::*; + use super::*; + + use rand::distributions::{Distribution, Standard}; + use std::fmt::Debug; + + #[test] + fn test_ceil() { + assert_eq!(ceil(0, 1), 0); + assert_eq!(ceil(1, 1), 1); + assert_eq!(ceil(1, 2), 1); + assert_eq!(ceil(1, 8), 1); + assert_eq!(ceil(7, 8), 1); + assert_eq!(ceil(8, 8), 1); + assert_eq!(ceil(9, 8), 2); + assert_eq!(ceil(9, 9), 1); + assert_eq!(ceil(10000000000, 10), 1000000000); + assert_eq!(ceil(10, 10000000000), 1); + assert_eq!(ceil(10000000000, 1000000000), 10); + } + + #[test] + fn test_bit_reader_get_byte_offset() { + let buffer = vec![255; 10]; + let mut bit_reader = BitReader::from(buffer); + assert_eq!(bit_reader.get_byte_offset(), 0); // offset (0 bytes, 0 bits) + bit_reader.get_value::(6); + assert_eq!(bit_reader.get_byte_offset(), 1); // offset (0 bytes, 6 bits) + bit_reader.get_value::(10); + assert_eq!(bit_reader.get_byte_offset(), 2); // offset (0 bytes, 16 bits) + bit_reader.get_value::(20); + assert_eq!(bit_reader.get_byte_offset(), 5); // offset (0 bytes, 36 bits) + bit_reader.get_value::(30); + assert_eq!(bit_reader.get_byte_offset(), 9); // offset (8 bytes, 2 bits) + } + + #[test] + fn test_bit_reader_get_value() { + let buffer = vec![255, 0]; + let mut bit_reader = BitReader::from(buffer); + assert_eq!(bit_reader.get_value::(1), Some(1)); + assert_eq!(bit_reader.get_value::(2), Some(3)); + assert_eq!(bit_reader.get_value::(3), Some(7)); + assert_eq!(bit_reader.get_value::(4), Some(3)); + } + + #[test] + fn test_bit_reader_get_value_boundary() { + let buffer = vec![10, 0, 0, 0, 20, 0, 30, 0, 0, 0, 40, 0]; + let mut bit_reader = BitReader::from(buffer); + assert_eq!(bit_reader.get_value::(32), Some(10)); + assert_eq!(bit_reader.get_value::(16), Some(20)); + assert_eq!(bit_reader.get_value::(32), Some(30)); + assert_eq!(bit_reader.get_value::(16), Some(40)); + } + + #[test] + fn test_bit_reader_get_aligned() { + // 01110101 11001011 + let buffer = ByteBufferPtr::new(vec![0x75, 0xCB]); + let mut bit_reader = BitReader::new(buffer.all()); + assert_eq!(bit_reader.get_value::(3), Some(5)); + assert_eq!(bit_reader.get_aligned::(1), Some(203)); + assert_eq!(bit_reader.get_value::(1), None); + bit_reader.reset(buffer.all()); + assert_eq!(bit_reader.get_aligned::(3), None); + } + + #[test] + fn test_bit_reader_get_vlq_int() { + // 10001001 00000001 11110010 10110101 00000110 + let buffer: Vec = vec![0x89, 0x01, 0xF2, 0xB5, 0x06]; + let mut bit_reader = BitReader::from(buffer); + assert_eq!(bit_reader.get_vlq_int(), Some(137)); + assert_eq!(bit_reader.get_vlq_int(), Some(105202)); + } + + #[test] + fn test_bit_reader_get_zigzag_vlq_int() { + let buffer: Vec = vec![0, 1, 2, 3]; + let mut bit_reader = BitReader::from(buffer); + assert_eq!(bit_reader.get_zigzag_vlq_int(), Some(0)); + assert_eq!(bit_reader.get_zigzag_vlq_int(), Some(-1)); + assert_eq!(bit_reader.get_zigzag_vlq_int(), Some(1)); + assert_eq!(bit_reader.get_zigzag_vlq_int(), Some(-2)); + } + + 
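
The VLQ and zigzag-VLQ readers exercised by the tests above follow the Protocol Buffers varint scheme: seven payload bits per byte with the high bit as a continuation flag, and a zigzag mapping from signed to unsigned integers so that small magnitudes encode in few bytes. Below is a standalone sketch of the round trip, mirroring the expressions used by `put_zigzag_vlq_int` and `get_zigzag_vlq_int`; the helper names are not taken from this patch.

```rust
// Illustrative only: encode/decode one zigzag-VLQ value on a plain Vec<u8>.
fn zigzag_vlq_encode(v: i64) -> Vec<u8> {
    let mut u = ((v << 1) ^ (v >> 63)) as u64; // zigzag: 0,-1,1,-2,... -> 0,1,2,3,...
    let mut out = Vec::new();
    loop {
        let byte = (u & 0x7F) as u8;
        u >>= 7;
        if u == 0 {
            out.push(byte);
            return out;
        }
        out.push(byte | 0x80); // continuation bit: more bytes follow
    }
}

fn zigzag_vlq_decode(bytes: &[u8]) -> i64 {
    let mut u: u64 = 0;
    for (i, b) in bytes.iter().enumerate() {
        u |= u64::from(b & 0x7F) << (7 * i);
        if b & 0x80 == 0 {
            break;
        }
    }
    ((u >> 1) as i64) ^ -((u & 1) as i64) // undo zigzag
}

fn main() {
    assert_eq!(zigzag_vlq_encode(-1), vec![0x01]); // matches the Some(-1) case in the test above
    assert_eq!(zigzag_vlq_decode(&zigzag_vlq_encode(-300)), -300);
}
```
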
#[test] + fn test_set_array_bit() { + let mut buffer = vec![0, 0, 0]; + set_array_bit(&mut buffer[..], 1); + assert_eq!(buffer, vec![2, 0, 0]); + set_array_bit(&mut buffer[..], 4); + assert_eq!(buffer, vec![18, 0, 0]); + unset_array_bit(&mut buffer[..], 1); + assert_eq!(buffer, vec![16, 0, 0]); + set_array_bit(&mut buffer[..], 10); + assert_eq!(buffer, vec![16, 4, 0]); + set_array_bit(&mut buffer[..], 10); + assert_eq!(buffer, vec![16, 4, 0]); + set_array_bit(&mut buffer[..], 11); + assert_eq!(buffer, vec![16, 12, 0]); + unset_array_bit(&mut buffer[..], 10); + assert_eq!(buffer, vec![16, 8, 0]); + } + + #[test] + fn test_num_required_bits() { + assert_eq!(num_required_bits(0), 0); + assert_eq!(num_required_bits(1), 1); + assert_eq!(num_required_bits(2), 2); + assert_eq!(num_required_bits(4), 3); + assert_eq!(num_required_bits(8), 4); + assert_eq!(num_required_bits(10), 4); + assert_eq!(num_required_bits(12), 4); + assert_eq!(num_required_bits(16), 5); + } + + #[test] + fn test_log2() { + assert_eq!(log2(1), 0); + assert_eq!(log2(2), 1); + assert_eq!(log2(3), 2); + assert_eq!(log2(4), 2); + assert_eq!(log2(5), 3); + assert_eq!(log2(5), 3); + assert_eq!(log2(6), 3); + assert_eq!(log2(7), 3); + assert_eq!(log2(8), 3); + assert_eq!(log2(9), 4); + } + + #[test] + fn test_skip() { + let mut writer = BitWriter::new(5); + let old_offset = writer.skip(1).expect("skip() should return OK"); + writer.put_aligned(42, 4); + writer.put_aligned_offset(0x10, 1, old_offset); + let result = writer.consume(); + assert_eq!(result.as_ref(), [0x10, 42, 0, 0, 0]); + + writer = BitWriter::new(4); + let result = writer.skip(5); + assert!(result.is_err()); + } + + #[test] + fn test_get_next_byte_ptr() { + let mut writer = BitWriter::new(5); + { + let first_byte = writer + .get_next_byte_ptr(1) + .expect("get_next_byte_ptr() should return OK"); + first_byte[0] = 0x10; + } + writer.put_aligned(42, 4); + let result = writer.consume(); + assert_eq!(result.as_ref(), [0x10, 42, 0, 0, 0]); + } + + #[test] + fn test_consume_flush_buffer() { + let mut writer1 = BitWriter::new(3); + let mut writer2 = BitWriter::new(3); + for i in 1..10 { + writer1.put_value(i, 4); + writer2.put_value(i, 4); + } + let res1 = writer1.flush_buffer(); + let res2 = writer2.consume(); + assert_eq!(res1, &res2[..]); + } + + #[test] + fn test_put_get_bool() { + let len = 8; + let mut writer = BitWriter::new(len); + + for i in 0..8 { + let result = writer.put_value(i % 2, 1); + assert!(result); + } + + writer.flush(); + { + let buffer = writer.buffer(); + assert_eq!(buffer[0], 0b10101010); + } + + // Write 00110011 + for i in 0..8 { + let result = match i { + 0 | 1 | 4 | 5 => writer.put_value(false as u64, 1), + _ => writer.put_value(true as u64, 1), + }; + assert!(result); + } + writer.flush(); + { + let buffer = writer.buffer(); + assert_eq!(buffer[0], 0b10101010); + assert_eq!(buffer[1], 0b11001100); + } + + let mut reader = BitReader::from(writer.consume()); + + for i in 0..8 { + let val = reader + .get_value::(1) + .expect("get_value() should return OK"); + assert_eq!(val, i % 2); + } + + for i in 0..8 { + let val = reader + .get_value::(1) + .expect("get_value() should return OK"); + match i { + 0 | 1 | 4 | 5 => assert_eq!(val, false), + _ => assert_eq!(val, true), + } + } + } + + #[test] + fn test_put_value_roundtrip() { + test_put_value_rand_numbers(32, 2); + test_put_value_rand_numbers(32, 3); + test_put_value_rand_numbers(32, 4); + test_put_value_rand_numbers(32, 5); + test_put_value_rand_numbers(32, 6); + test_put_value_rand_numbers(32, 7); 
+ test_put_value_rand_numbers(32, 8); + test_put_value_rand_numbers(64, 16); + test_put_value_rand_numbers(64, 24); + test_put_value_rand_numbers(64, 32); + } + + fn test_put_value_rand_numbers(total: usize, num_bits: usize) { + assert!(num_bits < 64); + let num_bytes = ceil(num_bits as i64, 8); + let mut writer = BitWriter::new(num_bytes as usize * total); + let values: Vec = random_numbers::(total) + .iter() + .map(|v| v & ((1 << num_bits) - 1)) + .collect(); + for i in 0..total { + assert!( + writer.put_value(values[i] as u64, num_bits), + "[{}]: put_value() failed", + i + ); + } + + let mut reader = BitReader::from(writer.consume()); + for i in 0..total { + let v = reader + .get_value::(num_bits) + .expect("get_value() should return OK"); + assert_eq!( + v, values[i], + "[{}]: expected {} but got {}", + i, values[i], v + ); + } + } + + #[test] + fn test_get_batch() { + const SIZE: &[usize] = &[1, 31, 32, 33, 128, 129]; + for s in SIZE { + for i in 0..33 { + match i { + 0...8 => test_get_batch_helper::(*s, i), + 9...16 => test_get_batch_helper::(*s, i), + _ => test_get_batch_helper::(*s, i), + } + } + } + } + + fn test_get_batch_helper(total: usize, num_bits: usize) + where + T: Default + Clone + Debug + Eq, + { + assert!(num_bits <= 32); + let num_bytes = ceil(num_bits as i64, 8); + let mut writer = BitWriter::new(num_bytes as usize * total); + + let values: Vec = random_numbers::(total) + .iter() + .map(|v| v & ((1u64 << num_bits) - 1) as u32) + .collect(); + + // Generic values used to check against actual values read from `get_batch`. + let expected_values: Vec = values + .iter() + .map(|v| unsafe { transmute_copy::(&v) }) + .collect(); + + for i in 0..total { + assert!(writer.put_value(values[i] as u64, num_bits)); + } + + let buf = writer.consume(); + let mut reader = BitReader::from(buf); + let mut batch = vec![T::default(); values.len()]; + let values_read = reader.get_batch::(&mut batch, num_bits); + assert_eq!(values_read, values.len()); + for i in 0..batch.len() { + assert_eq!( + batch[i], expected_values[i], + "num_bits = {}, index = {}", + num_bits, i + ); + } + } + + #[test] + fn test_put_aligned_roundtrip() { + test_put_aligned_rand_numbers::(4, 3); + test_put_aligned_rand_numbers::(16, 5); + test_put_aligned_rand_numbers::(32, 7); + test_put_aligned_rand_numbers::(32, 9); + test_put_aligned_rand_numbers::(32, 11); + test_put_aligned_rand_numbers::(32, 13); + test_put_aligned_rand_numbers::(32, 17); + test_put_aligned_rand_numbers::(32, 23); + } + + fn test_put_aligned_rand_numbers(total: usize, num_bits: usize) + where + T: Copy + Default + Debug + PartialEq, + Standard: Distribution, + { + assert!(num_bits <= 32); + assert!(total % 2 == 0); + + let aligned_value_byte_width = ::std::mem::size_of::(); + let value_byte_width = ceil(num_bits as i64, 8) as usize; + let mut writer = + BitWriter::new((total / 2) * (aligned_value_byte_width + value_byte_width)); + let values: Vec = random_numbers::(total / 2) + .iter() + .map(|v| v & ((1 << num_bits) - 1)) + .collect(); + let aligned_values = random_numbers::(total / 2); + + for i in 0..total { + let j = i / 2; + if i % 2 == 0 { + assert!( + writer.put_value(values[j] as u64, num_bits), + "[{}]: put_value() failed", + i + ); + } else { + assert!( + writer.put_aligned::(aligned_values[j], aligned_value_byte_width), + "[{}]: put_aligned() failed", + i + ); + } + } + + let mut reader = BitReader::from(writer.consume()); + for i in 0..total { + let j = i / 2; + if i % 2 == 0 { + let v = reader + .get_value::(num_bits) + 
.expect("get_value() should return OK"); + assert_eq!( + v, values[j] as u64, + "[{}]: expected {} but got {}", + i, values[j], v + ); + } else { + let v = reader + .get_aligned::(aligned_value_byte_width) + .expect("get_aligned() should return OK"); + assert_eq!( + v, aligned_values[j], + "[{}]: expected {:?} but got {:?}", + i, aligned_values[j], v + ); + } + } + } + + #[test] + fn test_put_vlq_int() { + let total = 64; + let mut writer = BitWriter::new(total * 32); + let values = random_numbers::(total); + for i in 0..total { + assert!( + writer.put_vlq_int(values[i] as u64), + "[{}]; put_vlq_int() failed", + i + ); + } + + let mut reader = BitReader::from(writer.consume()); + for i in 0..total { + let v = reader + .get_vlq_int() + .expect("get_vlq_int() should return OK"); + assert_eq!( + v as u32, values[i], + "[{}]: expected {} but got {}", + i, values[i], v + ); + } + } + + #[test] + fn test_put_zigzag_vlq_int() { + let total = 64; + let mut writer = BitWriter::new(total * 32); + let values = random_numbers::(total); + for i in 0..total { + assert!( + writer.put_zigzag_vlq_int(values[i] as i64), + "[{}]; put_zigzag_vlq_int() failed", + i + ); + } + + let mut reader = BitReader::from(writer.consume()); + for i in 0..total { + let v = reader + .get_zigzag_vlq_int() + .expect("get_zigzag_vlq_int() should return OK"); + assert_eq!( + v as i32, values[i], + "[{}]: expected {} but got {}", + i, values[i], v + ); + } + } +} diff --git a/rust/src/parquet/util/hash_util.rs b/rust/src/parquet/util/hash_util.rs new file mode 100644 index 0000000000000..c7bffef8bbf34 --- /dev/null +++ b/rust/src/parquet/util/hash_util.rs @@ -0,0 +1,160 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::parquet::data_type::AsBytes; + +/// Computes hash value for `data`, with a seed value `seed`. +/// The data type `T` must implement the `AsBytes` trait. 
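
The `hash` function defined immediately below dispatches at run time: on x86/x86_64 with SSE4.2 available it uses the hardware CRC32 instructions, otherwise it falls back to 64-bit MurmurHash2 truncated to 32 bits. A hypothetical call site is sketched here, assuming (as the unit tests further down do) that `&str` implements `AsBytes`; the import path and demo function are illustrative, not part of the patch.

```rust
// Assumed import for illustration:
// use crate::parquet::util::hash_util::hash;

fn hash_demo() {
    let a = hash(&"bloom-filter-key", 0);
    let b = hash(&"bloom-filter-key", 0);
    assert_eq!(a, b); // deterministic for a given input and seed, whichever backend is chosen
    let _c = hash(&"bloom-filter-key", 1); // a different seed generally yields a different value
}
```
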
+pub fn hash(data: &T, seed: u32) -> u32 { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if is_x86_feature_detected!("sse4.2") { + unsafe { crc32_hash(data, seed) } + } else { + murmur_hash2_64a(data, seed as u64) as u32 + } + } +} + +const MURMUR_PRIME: u64 = 0xc6a4a7935bd1e995; +const MURMUR_R: i32 = 47; + +/// Rust implementation of MurmurHash2, 64-bit version for 64-bit platforms +fn murmur_hash2_64a(data: &T, seed: u64) -> u64 { + let data_bytes = data.as_bytes(); + let len = data_bytes.len(); + let len_64 = (len / 8) * 8; + let data_bytes_64 = unsafe { + ::std::slice::from_raw_parts(&data_bytes[0..len_64] as *const [u8] as *const u64, len / 8) + }; + + let mut h = seed ^ (MURMUR_PRIME.wrapping_mul(data_bytes.len() as u64)); + for v in data_bytes_64 { + let mut k = *v; + k = k.wrapping_mul(MURMUR_PRIME); + k ^= k >> MURMUR_R; + k = k.wrapping_mul(MURMUR_PRIME); + h ^= k; + h = h.wrapping_mul(MURMUR_PRIME); + } + + let data2 = &data_bytes[len_64..]; + + let v = len & 7; + if v == 7 { + h ^= (data2[6] as u64) << 48; + } + if v >= 6 { + h ^= (data2[5] as u64) << 40; + } + if v >= 5 { + h ^= (data2[4] as u64) << 32; + } + if v >= 4 { + h ^= (data2[3] as u64) << 24; + } + if v >= 3 { + h ^= (data2[2] as u64) << 16; + } + if v >= 2 { + h ^= (data2[1] as u64) << 8; + } + if v >= 1 { + h ^= data2[0] as u64; + } + if v > 0 { + h = h.wrapping_mul(MURMUR_PRIME); + } + + h ^= h >> MURMUR_R; + h = h.wrapping_mul(MURMUR_PRIME); + h ^= h >> MURMUR_R; + h +} + +/// CRC32 hash implementation using SSE4 instructions. Borrowed from Impala. +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[target_feature(enable = "sse4.2")] +unsafe fn crc32_hash(data: &T, seed: u32) -> u32 { + #[cfg(target_arch = "x86")] + use std::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use std::arch::x86_64::*; + + let bytes: &[u8] = data.as_bytes(); + let u32_num_bytes = ::std::mem::size_of::(); + let mut num_bytes = bytes.len(); + let num_words = num_bytes / u32_num_bytes; + num_bytes %= u32_num_bytes; + + let bytes_u32: &[u32] = ::std::slice::from_raw_parts( + &bytes[0..num_words * u32_num_bytes] as *const [u8] as *const u32, + num_words, + ); + + let mut offset = 0; + let mut hash = seed; + while offset < num_words { + hash = _mm_crc32_u32(hash, bytes_u32[offset]); + offset += 1; + } + + offset = num_words * u32_num_bytes; + while offset < num_bytes { + hash = _mm_crc32_u8(hash, bytes[offset]); + offset += 1; + } + + // The lower half of the CRC hash has poor uniformity, so swap the halves + // for anyone who only uses the first several bits of the hash. 
+ hash = (hash << 16) | (hash >> 16); + hash +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_murmur2_64a() { + let result = murmur_hash2_64a(&"hello", 123); + assert_eq!(result, 2597646618390559622); + + let result = murmur_hash2_64a(&"helloworld", 123); + assert_eq!(result, 4934371746140206573); + + let result = murmur_hash2_64a(&"helloworldparquet", 123); + assert_eq!(result, 2392198230801491746); + } + + #[test] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn test_crc32() { + if is_x86_feature_detected!("sse4.2") { + unsafe { + let result = crc32_hash(&"hello", 123); + assert_eq!(result, 2927487359); + + let result = crc32_hash(&"helloworld", 123); + assert_eq!(result, 314229527); + + let result = crc32_hash(&"helloworldparquet", 123); + assert_eq!(result, 667078870); + } + } + } +} diff --git a/rust/src/parquet/util/io.rs b/rust/src/parquet/util/io.rs new file mode 100644 index 0000000000000..8724e67c2dbe7 --- /dev/null +++ b/rust/src/parquet/util/io.rs @@ -0,0 +1,220 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{cmp, fs::File, io::*, sync::Mutex}; + +use crate::parquet::file::reader::ParquetReader; + +// ---------------------------------------------------------------------- +// Read/Write wrappers for `File`. + +/// Position trait returns the current position in the stream. +/// Should be viewed as a lighter version of `Seek` that does not allow seek operations, +/// and does not require mutable reference for the current position. +pub trait Position { + /// Returns position in the stream. + fn pos(&self) -> u64; +} + +/// Struct that represents a slice of a file data with independent start position and +/// length. Internally clones provided file handle, wraps with BufReader and resets +/// position before any read. +/// +/// This is workaround and alternative for `file.try_clone()` method. It clones `File` +/// while preserving independent position, which is not available with `try_clone()`. +/// +/// Designed after `arrow::io::RandomAccessFile`. 
+pub struct FileSource { + reader: Mutex>, + start: u64, // start position in a file + end: u64, // end position in a file +} + +impl FileSource { + /// Creates new file reader with start and length from a file handle + pub fn new(fd: &R, start: u64, length: usize) -> Self { + Self { + reader: Mutex::new(BufReader::new(fd.try_clone().unwrap())), + start, + end: start + length as u64, + } + } +} + +impl Read for FileSource { + fn read(&mut self, buf: &mut [u8]) -> Result { + let mut reader = self + .reader + .lock() + .map_err(|err| Error::new(ErrorKind::Other, err.to_string()))?; + + let bytes_to_read = cmp::min(buf.len(), (self.end - self.start) as usize); + let buf = &mut buf[0..bytes_to_read]; + + reader.seek(SeekFrom::Start(self.start as u64))?; + let res = reader.read(buf); + if let Ok(bytes_read) = res { + self.start += bytes_read as u64; + } + + res + } +} + +impl Position for FileSource { + fn pos(&self) -> u64 { + self.start + } +} + +/// Struct that represents `File` output stream with position tracking. +/// Used as a sink in file writer. +pub struct FileSink { + buf: BufWriter, + // This is not necessarily position in the underlying file, + // but rather current position in the sink. + pos: u64, +} + +impl FileSink { + /// Creates new file sink. + /// Position is set to whatever position file has. + pub fn new(file: &File) -> Self { + let mut owned_file = file.try_clone().unwrap(); + let pos = owned_file.seek(SeekFrom::Current(0)).unwrap(); + Self { + buf: BufWriter::new(owned_file), + pos, + } + } +} + +impl Write for FileSink { + fn write(&mut self, buf: &[u8]) -> Result { + let num_bytes = self.buf.write(buf)?; + self.pos += num_bytes as u64; + Ok(num_bytes) + } + + fn flush(&mut self) -> Result<()> { + self.buf.flush() + } +} + +impl Position for FileSink { + fn pos(&self) -> u64 { + self.pos + } +} + +// Position implementation for Cursor to use in various tests. +impl<'a> Position for Cursor<&'a mut Vec> { + fn pos(&self) -> u64 { + self.position() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::parquet::util::test_common::{get_temp_file, get_test_file}; + + #[test] + fn test_io_read_fully() { + let mut buf = vec![0; 8]; + let mut src = FileSource::new(&get_test_file("alltypes_plain.parquet"), 0, 4); + + let bytes_read = src.read(&mut buf[..]).unwrap(); + assert_eq!(bytes_read, 4); + assert_eq!(buf, vec![b'P', b'A', b'R', b'1', 0, 0, 0, 0]); + } + + #[test] + fn test_io_read_in_chunks() { + let mut buf = vec![0; 4]; + let mut src = FileSource::new(&get_test_file("alltypes_plain.parquet"), 0, 4); + + let bytes_read = src.read(&mut buf[0..2]).unwrap(); + assert_eq!(bytes_read, 2); + let bytes_read = src.read(&mut buf[2..]).unwrap(); + assert_eq!(bytes_read, 2); + assert_eq!(buf, vec![b'P', b'A', b'R', b'1']); + } + + #[test] + fn test_io_read_pos() { + let mut src = FileSource::new(&get_test_file("alltypes_plain.parquet"), 0, 4); + + src.read(&mut vec![0; 1]).unwrap(); + assert_eq!(src.pos(), 1); + + src.read(&mut vec![0; 4]).unwrap(); + assert_eq!(src.pos(), 4); + } + + #[test] + fn test_io_read_over_limit() { + let mut src = FileSource::new(&get_test_file("alltypes_plain.parquet"), 0, 4); + + // Read all bytes from source + src.read(&mut vec![0; 128]).unwrap(); + assert_eq!(src.pos(), 4); + + // Try reading again, should return 0 bytes. 
+ let bytes_read = src.read(&mut vec![0; 128]).unwrap(); + assert_eq!(bytes_read, 0); + assert_eq!(src.pos(), 4); + } + + #[test] + fn test_io_seek_switch() { + let mut buf = vec![0; 4]; + let mut file = get_test_file("alltypes_plain.parquet"); + let mut src = FileSource::new(&file, 0, 4); + + file.seek(SeekFrom::Start(5 as u64)) + .expect("File seek to a position"); + + let bytes_read = src.read(&mut buf[..]).unwrap(); + assert_eq!(bytes_read, 4); + assert_eq!(buf, vec![b'P', b'A', b'R', b'1']); + } + + #[test] + fn test_io_write_with_pos() { + let mut file = get_temp_file("file_sink_test", &[b'a', b'b', b'c']); + file.seek(SeekFrom::Current(3)).unwrap(); + + // Write into sink + let mut sink = FileSink::new(&file); + assert_eq!(sink.pos(), 3); + + sink.write(&[b'd', b'e', b'f', b'g']).unwrap(); + assert_eq!(sink.pos(), 7); + + sink.flush().unwrap(); + assert_eq!(sink.pos(), file.seek(SeekFrom::Current(0)).unwrap()); + + // Read data using file chunk + let mut res = vec![0u8; 7]; + let mut chunk = FileSource::new(&file, 0, file.metadata().unwrap().len() as usize); + chunk.read(&mut res[..]).unwrap(); + + assert_eq!(res, vec![b'a', b'b', b'c', b'd', b'e', b'f', b'g']); + } +} diff --git a/rust/src/parquet/util/memory.rs b/rust/src/parquet/util/memory.rs new file mode 100644 index 0000000000000..69a389e50fe92 --- /dev/null +++ b/rust/src/parquet/util/memory.rs @@ -0,0 +1,524 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Utility methods and structs for working with memory. + +use std::{ + cell::Cell, + fmt::{Debug, Display, Formatter, Result as FmtResult}, + io::{Result as IoResult, Write}, + mem, + ops::{Index, IndexMut}, + rc::{Rc, Weak}, +}; + +// ---------------------------------------------------------------------- +// Memory Tracker classes + +/// Reference counted pointer for [`MemTracker`]. +pub type MemTrackerPtr = Rc; +/// Non-owning reference for [`MemTracker`]. +pub type WeakMemTrackerPtr = Weak; + +/// Struct to track memory usage information. +#[derive(Debug)] +pub struct MemTracker { + // In the tuple, the first element is the current memory allocated (in bytes), + // and the second element is the maximum memory allocated so far (in bytes). + memory_usage: Cell<(i64, i64)>, +} + +impl MemTracker { + /// Creates new memory tracker. + #[inline] + pub fn new() -> MemTracker { + MemTracker { + memory_usage: Cell::new((0, 0)), + } + } + + /// Returns the current memory consumption, in bytes. + pub fn memory_usage(&self) -> i64 { + self.memory_usage.get().0 + } + + /// Returns the maximum memory consumption so far, in bytes. + pub fn max_memory_usage(&self) -> i64 { + self.memory_usage.get().1 + } + + /// Adds `num_bytes` to the memory consumption tracked by this memory tracker. 
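+    /// `num_bytes` may be negative to record a release of memory; the tracked
+    /// maximum only ever grows, so it is preserved across releases.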
+ #[inline] + pub fn alloc(&self, num_bytes: i64) { + let (current, mut maximum) = self.memory_usage.get(); + let new_current = current + num_bytes; + if new_current > maximum { + maximum = new_current + } + self.memory_usage.set((new_current, maximum)); + } +} + +// ---------------------------------------------------------------------- +// Buffer classes + +/// Type alias for [`Buffer`]. +pub type ByteBuffer = Buffer; +/// Type alias for [`BufferPtr`]. +pub type ByteBufferPtr = BufferPtr; + +/// A resize-able buffer class with generic member, with optional memory tracker. +/// +/// Note that a buffer has two attributes: +/// `capacity` and `size`: the former is the total number of space reserved for +/// the buffer, while the latter is the actual number of elements. +/// Invariant: `capacity` >= `size`. +/// The total allocated bytes for a buffer equals to `capacity * sizeof()`. +pub struct Buffer { + data: Vec, + mem_tracker: Option, + type_length: usize, +} + +impl Buffer { + /// Creates new empty buffer. + pub fn new() -> Self { + Buffer { + data: vec![], + mem_tracker: None, + type_length: ::std::mem::size_of::(), + } + } + + /// Adds [`MemTracker`] for this buffer. + #[inline] + pub fn with_mem_tracker(mut self, mc: MemTrackerPtr) -> Self { + mc.alloc((self.data.capacity() * self.type_length) as i64); + self.mem_tracker = Some(mc); + self + } + + /// Returns slice of data in this buffer. + #[inline] + pub fn data(&self) -> &[T] { + self.data.as_slice() + } + + /// Sets data for this buffer. + #[inline] + pub fn set_data(&mut self, new_data: Vec) { + if let Some(ref mc) = self.mem_tracker { + let capacity_diff = new_data.capacity() as i64 - self.data.capacity() as i64; + mc.alloc(capacity_diff * self.type_length as i64); + } + self.data = new_data; + } + + /// Resizes underlying data in place to a new length `new_size`. + /// + /// If `new_size` is less than current length, data is truncated, otherwise, it is + /// extended to `new_size` with provided default value `init_value`. + /// + /// Memory tracker is also updated, if available. + #[inline] + pub fn resize(&mut self, new_size: usize, init_value: T) { + let old_capacity = self.data.capacity(); + self.data.resize(new_size, init_value); + if let Some(ref mc) = self.mem_tracker { + let capacity_diff = self.data.capacity() as i64 - old_capacity as i64; + mc.alloc(capacity_diff * self.type_length as i64); + } + } + + /// Clears underlying data. + #[inline] + pub fn clear(&mut self) { + self.data.clear() + } + + /// Reserves capacity `additional_capacity` for underlying data vector. + /// + /// Memory tracker is also updated, if available. + #[inline] + pub fn reserve(&mut self, additional_capacity: usize) { + let old_capacity = self.data.capacity(); + self.data.reserve(additional_capacity); + if self.data.capacity() > old_capacity { + if let Some(ref mc) = self.mem_tracker { + let capacity_diff = self.data.capacity() as i64 - old_capacity as i64; + mc.alloc(capacity_diff * self.type_length as i64); + } + } + } + + /// Returns [`BufferPtr`] with buffer data. + /// Buffer data is reset. + #[inline] + pub fn consume(&mut self) -> BufferPtr { + let old_data = mem::replace(&mut self.data, vec![]); + let mut result = BufferPtr::new(old_data); + if let Some(ref mc) = self.mem_tracker { + result = result.with_mem_tracker(mc.clone()); + } + result + } + + /// Adds `value` to the buffer. + #[inline] + pub fn push(&mut self, value: T) { + self.data.push(value) + } + + /// Returns current capacity for the buffer. 
+ #[inline] + pub fn capacity(&self) -> usize { + self.data.capacity() + } + + /// Returns current size for the buffer. + #[inline] + pub fn size(&self) -> usize { + self.data.len() + } + + /// Returns `true` if memory tracker is added to buffer, `false` otherwise. + #[inline] + pub fn is_mem_tracked(&self) -> bool { + self.mem_tracker.is_some() + } + + /// Returns memory tracker associated with this buffer. + /// This may panic, if memory tracker is not set, use method above to check if + /// memory tracker is available. + #[inline] + pub fn mem_tracker(&self) -> &MemTrackerPtr { + self.mem_tracker.as_ref().unwrap() + } +} + +impl Index for Buffer { + type Output = T; + + fn index(&self, index: usize) -> &T { + &self.data[index] + } +} + +impl IndexMut for Buffer { + fn index_mut(&mut self, index: usize) -> &mut T { + &mut self.data[index] + } +} + +// TODO: implement this for other types +impl Write for Buffer { + #[inline] + fn write(&mut self, buf: &[u8]) -> IoResult { + let old_capacity = self.data.capacity(); + let bytes_written = self.data.write(buf)?; + if let Some(ref mc) = self.mem_tracker { + if self.data.capacity() - old_capacity > 0 { + mc.alloc((self.data.capacity() - old_capacity) as i64) + } + } + Ok(bytes_written) + } + + fn flush(&mut self) -> IoResult<()> { + // No-op + self.data.flush() + } +} + +impl AsRef<[u8]> for Buffer { + fn as_ref(&self) -> &[u8] { + self.data.as_slice() + } +} + +impl Drop for Buffer { + #[inline] + fn drop(&mut self) { + if let Some(ref mc) = self.mem_tracker { + mc.alloc(-((self.data.capacity() * self.type_length) as i64)); + } + } +} + +// ---------------------------------------------------------------------- +// Immutable Buffer (BufferPtr) classes + +/// An representation of a slice on a reference-counting and read-only byte array. +/// Sub-slices can be further created from this. The byte array will be released +/// when all slices are dropped. +#[derive(Clone, Debug)] +pub struct BufferPtr { + data: Rc>, + start: usize, + len: usize, + // TODO: will this create too many references? rethink about this. + mem_tracker: Option, +} + +impl BufferPtr { + /// Creates new buffer from a vector. + pub fn new(v: Vec) -> Self { + let len = v.len(); + Self { + data: Rc::new(v), + start: 0, + len, + mem_tracker: None, + } + } + + /// Returns slice of data in this buffer. + pub fn data(&self) -> &[T] { + &self.data[self.start..self.start + self.len] + } + + /// Updates this buffer with new `start` position and length `len`. + /// + /// Range should be within current start position and length. + pub fn with_range(mut self, start: usize, len: usize) -> Self { + assert!(start <= self.len); + assert!(start + len <= self.len); + self.start = start; + self.len = len; + self + } + + /// Adds memory tracker to this buffer. + pub fn with_mem_tracker(mut self, mc: MemTrackerPtr) -> Self { + self.mem_tracker = Some(mc); + self + } + + /// Returns start position of this buffer. + pub fn start(&self) -> usize { + self.start + } + + /// Returns length of this buffer + pub fn len(&self) -> usize { + self.len + } + + /// Returns `true` if this buffer has memory tracker, `false` otherwise. + pub fn is_mem_tracked(&self) -> bool { + self.mem_tracker.is_some() + } + + /// Returns a shallow copy of the buffer. + /// Reference counted pointer to the data is copied. 
+ pub fn all(&self) -> BufferPtr { + BufferPtr { + data: self.data.clone(), + start: self.start, + len: self.len, + mem_tracker: self.mem_tracker.as_ref().map(|p| p.clone()), + } + } + + /// Returns a shallow copy of the buffer that starts with `start` position. + pub fn start_from(&self, start: usize) -> BufferPtr { + assert!(start <= self.len); + BufferPtr { + data: self.data.clone(), + start: self.start + start, + len: self.len - start, + mem_tracker: self.mem_tracker.as_ref().map(|p| p.clone()), + } + } + + /// Returns a shallow copy that is a range slice within this buffer. + pub fn range(&self, start: usize, len: usize) -> BufferPtr { + assert!(start + len <= self.len); + BufferPtr { + data: self.data.clone(), + start: self.start + start, + len, + mem_tracker: self.mem_tracker.as_ref().map(|p| p.clone()), + } + } +} + +impl Index for BufferPtr { + type Output = T; + + fn index(&self, index: usize) -> &T { + assert!(index < self.len); + &self.data[self.start + index] + } +} + +impl Display for BufferPtr { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "{:?}", self.data) + } +} + +impl Drop for BufferPtr { + fn drop(&mut self) { + if self.is_mem_tracked() + && Rc::strong_count(&self.data) == 1 + && Rc::weak_count(&self.data) == 0 + { + let mc = self.mem_tracker.as_ref().unwrap(); + mc.alloc(-(self.data.capacity() as i64)); + } + } +} + +impl AsRef<[u8]> for BufferPtr { + fn as_ref(&self) -> &[u8] { + &self.data[self.start..self.start + self.len] + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_byte_buffer_mem_tracker() { + let mem_tracker = Rc::new(MemTracker::new()); + + let mut buffer = ByteBuffer::new().with_mem_tracker(mem_tracker.clone()); + buffer.set_data(vec![0; 10]); + assert_eq!(mem_tracker.memory_usage(), buffer.capacity() as i64); + buffer.set_data(vec![0; 20]); + let capacity = buffer.capacity() as i64; + assert_eq!(mem_tracker.memory_usage(), capacity); + + let max_capacity = { + let mut buffer2 = ByteBuffer::new().with_mem_tracker(mem_tracker.clone()); + buffer2.reserve(30); + assert_eq!( + mem_tracker.memory_usage(), + buffer2.capacity() as i64 + capacity + ); + buffer2.set_data(vec![0; 100]); + assert_eq!( + mem_tracker.memory_usage(), + buffer2.capacity() as i64 + capacity + ); + buffer2.capacity() as i64 + capacity + }; + + assert_eq!(mem_tracker.memory_usage(), capacity); + assert_eq!(mem_tracker.max_memory_usage(), max_capacity); + + buffer.reserve(40); + assert_eq!(mem_tracker.memory_usage(), buffer.capacity() as i64); + + buffer.consume(); + assert_eq!(mem_tracker.memory_usage(), buffer.capacity() as i64); + } + + #[test] + fn test_byte_ptr_mem_tracker() { + let mem_tracker = Rc::new(MemTracker::new()); + + let mut buffer = ByteBuffer::new().with_mem_tracker(mem_tracker.clone()); + buffer.set_data(vec![0; 60]); + + { + let buffer_capacity = buffer.capacity() as i64; + let buf_ptr = buffer.consume(); + assert_eq!(mem_tracker.memory_usage(), buffer_capacity); + { + let buf_ptr1 = buf_ptr.all(); + { + let _ = buf_ptr.start_from(20); + assert_eq!(mem_tracker.memory_usage(), buffer_capacity); + } + assert_eq!(mem_tracker.memory_usage(), buffer_capacity); + let _ = buf_ptr1.range(30, 20); + assert_eq!(mem_tracker.memory_usage(), buffer_capacity); + } + assert_eq!(mem_tracker.memory_usage(), buffer_capacity); + } + assert_eq!(mem_tracker.memory_usage(), buffer.capacity() as i64); + } + + #[test] + fn test_byte_buffer() { + let mut buffer = ByteBuffer::new(); + assert_eq!(buffer.size(), 0); + assert_eq!(buffer.capacity(), 0); + + 
let mut buffer2 = ByteBuffer::new(); + buffer2.reserve(40); + assert_eq!(buffer2.size(), 0); + assert_eq!(buffer2.capacity(), 40); + + buffer.set_data((0..5).collect()); + assert_eq!(buffer.size(), 5); + assert_eq!(buffer[4], 4); + + buffer.set_data((0..20).collect()); + assert_eq!(buffer.size(), 20); + assert_eq!(buffer[10], 10); + + let expected: Vec = (0..20).collect(); + { + let data = buffer.data(); + assert_eq!(data, expected.as_slice()); + } + + buffer.reserve(40); + assert!(buffer.capacity() >= 40); + + let byte_ptr = buffer.consume(); + assert_eq!(buffer.size(), 0); + assert_eq!(byte_ptr.as_ref(), expected.as_slice()); + + let values: Vec = (0..30).collect(); + let _ = buffer.write(values.as_slice()); + let _ = buffer.flush(); + + assert_eq!(buffer.data(), values.as_slice()); + } + + #[test] + fn test_byte_ptr() { + let values = (0..50).collect(); + let ptr = ByteBufferPtr::new(values); + assert_eq!(ptr.len(), 50); + assert_eq!(ptr.start(), 0); + assert_eq!(ptr[40], 40); + + let ptr2 = ptr.all(); + assert_eq!(ptr2.len(), 50); + assert_eq!(ptr2.start(), 0); + assert_eq!(ptr2[40], 40); + + let ptr3 = ptr.start_from(20); + assert_eq!(ptr3.len(), 30); + assert_eq!(ptr3.start(), 20); + assert_eq!(ptr3[0], 20); + + let ptr4 = ptr3.range(10, 10); + assert_eq!(ptr4.len(), 10); + assert_eq!(ptr4.start(), 30); + assert_eq!(ptr4[0], 30); + + let expected: Vec = (30..40).collect(); + assert_eq!(ptr4.as_ref(), expected.as_slice()); + } +} diff --git a/rust/src/parquet/util/mod.rs b/rust/src/parquet/util/mod.rs new file mode 100644 index 0000000000000..669cc3c0a495c --- /dev/null +++ b/rust/src/parquet/util/mod.rs @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +pub mod io; +pub mod memory; +#[macro_use] +pub mod bit_util; +mod bit_packing; +pub mod hash_util; + +#[cfg(test)] +pub mod test_common; diff --git a/rust/src/parquet/util/test_common.rs b/rust/src/parquet/util/test_common.rs new file mode 100644 index 0000000000000..f9b1af4a5cef4 --- /dev/null +++ b/rust/src/parquet/util/test_common.rs @@ -0,0 +1,190 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use rand::{ + distributions::{range::SampleRange, Distribution, Standard}, + thread_rng, Rng, +}; +use std::{env, fs, io::Write, path::PathBuf, str::FromStr}; + +use crate::parquet::data_type::*; +use crate::parquet::util::memory::ByteBufferPtr; + +/// Random generator of data type `T` values and sequences. +pub trait RandGen { + fn gen(len: i32) -> T::T; + + fn gen_vec(len: i32, total: usize) -> Vec { + let mut result = vec![]; + for _ in 0..total { + result.push(Self::gen(len)) + } + result + } +} + +impl RandGen for T { + default fn gen(_: i32) -> T::T { + panic!("Unsupported data type"); + } +} + +impl RandGen for BoolType { + fn gen(_: i32) -> bool { + thread_rng().gen::() + } +} + +impl RandGen for Int32Type { + fn gen(_: i32) -> i32 { + thread_rng().gen::() + } +} + +impl RandGen for Int64Type { + fn gen(_: i32) -> i64 { + thread_rng().gen::() + } +} + +impl RandGen for Int96Type { + fn gen(_: i32) -> Int96 { + let mut rng = thread_rng(); + let mut result = Int96::new(); + result.set_data(rng.gen::(), rng.gen::(), rng.gen::()); + result + } +} + +impl RandGen for FloatType { + fn gen(_: i32) -> f32 { + thread_rng().gen::() + } +} + +impl RandGen for DoubleType { + fn gen(_: i32) -> f64 { + thread_rng().gen::() + } +} + +impl RandGen for ByteArrayType { + fn gen(_: i32) -> ByteArray { + let mut rng = thread_rng(); + let mut result = ByteArray::new(); + let mut value = vec![]; + let len = rng.gen_range::(0, 128); + for _ in 0..len { + value.push(rng.gen_range(0, 255) & 0xFF); + } + result.set_data(ByteBufferPtr::new(value)); + result + } +} + +impl RandGen for FixedLenByteArrayType { + fn gen(len: i32) -> ByteArray { + let mut rng = thread_rng(); + let value_len = if len < 0 { + rng.gen_range::(0, 128) + } else { + len as usize + }; + let value = random_bytes(value_len); + ByteArray::from(value) + } +} + +pub fn random_bytes(n: usize) -> Vec { + let mut result = vec![]; + let mut rng = thread_rng(); + for _ in 0..n { + result.push(rng.gen_range(0, 255) & 0xFF); + } + result +} + +pub fn random_bools(n: usize) -> Vec { + let mut result = vec![]; + let mut rng = thread_rng(); + for _ in 0..n { + result.push(rng.gen::()); + } + result +} + +pub fn random_numbers(n: usize) -> Vec +where + Standard: Distribution, +{ + let mut rng = thread_rng(); + Standard.sample_iter(&mut rng).take(n).collect() +} + +pub fn random_numbers_range(n: usize, low: T, high: T, result: &mut Vec) +where + T: PartialOrd + SampleRange + Copy, +{ + let mut rng = thread_rng(); + for _ in 0..n { + result.push(rng.gen_range(low, high)); + } +} + +/// Returns path to the test parquet file in 'data' directory +pub fn get_test_path(file_name: &str) -> PathBuf { + let result = env::var("PARQUET_TEST_DATA"); + if result.is_err() { + panic!("Please point PARQUET_TEST_DATA environment variable to the test data directory"); + } + let mut pathbuf = PathBuf::from_str(result.unwrap().as_str()).unwrap(); + pathbuf.push(file_name); + pathbuf +} + +/// Returns file handle for a test parquet file from 'data' directory +pub fn get_test_file(file_name: &str) -> fs::File { + let file = fs::File::open(get_test_path(file_name).as_path()); + if file.is_err() { + panic!("Test file {} not found", file_name) + } + file.unwrap() +} + +/// Returns file handle for a temp file in 'target' directory with a provided content +pub fn get_temp_file(file_name: &str, content: &[u8]) -> fs::File { + // build tmp path to a file in 
"target/debug/testdata" + let mut path_buf = env::current_dir().unwrap(); + path_buf.push("target"); + path_buf.push("debug"); + path_buf.push("testdata"); + fs::create_dir_all(&path_buf).unwrap(); + path_buf.push(file_name); + + // write file content + let mut tmp_file = fs::File::create(path_buf.as_path()).unwrap(); + tmp_file.write_all(content).unwrap(); + tmp_file.sync_all().unwrap(); + + // return file handle for both read and write + let file = fs::OpenOptions::new() + .read(true) + .write(true) + .open(path_buf.as_path()); + assert!(file.is_ok()); + file.unwrap() +} diff --git a/rust/src/record_batch.rs b/rust/src/record_batch.rs index 2666770460e84..e6a8e79500f08 100644 --- a/rust/src/record_batch.rs +++ b/rust/src/record_batch.rs @@ -15,9 +15,10 @@ // specific language governing permissions and limitations // under the License. +use std::sync::Arc; + use crate::array::*; use crate::datatypes::*; -use std::sync::Arc; /// A batch of column-oriented data pub struct RecordBatch { @@ -67,6 +68,7 @@ unsafe impl Sync for RecordBatch {} #[cfg(test)] mod tests { use super::*; + use crate::array_data::*; use crate::buffer::*; diff --git a/rust/src/tensor.rs b/rust/src/tensor.rs index 175b68d81f188..7272a2cf14631 100644 --- a/rust/src/tensor.rs +++ b/rust/src/tensor.rs @@ -216,6 +216,7 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { #[cfg(test)] mod tests { use super::*; + use crate::buffer::Buffer; use crate::builder::*; From 5a5d807bc9ccebc4fd9ec733788aede00a8bdd71 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 18 Dec 2018 15:29:27 +0100 Subject: [PATCH 068/328] [C++] Make Doxygen less verbose (#3213) --- cpp/apidoc/Doxyfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/apidoc/Doxyfile b/cpp/apidoc/Doxyfile index e5285873c9e02..e7eefba130140 100644 --- a/cpp/apidoc/Doxyfile +++ b/cpp/apidoc/Doxyfile @@ -741,7 +741,7 @@ CITE_BIB_FILES = # messages are off. # The default value is: NO. -QUIET = NO +QUIET = YES # The WARNINGS tag can be used to turn on/off the warning messages that are # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES From d432cb4a27ce40ed4cf414c8267081c8dff89d82 Mon Sep 17 00:00:00 2001 From: Paddy Horan Date: Tue, 18 Dec 2018 15:37:51 +0100 Subject: [PATCH 069/328] ARROW-2560: [Rust] The Rust README should include Rust-specific information on contributing Author: Paddy Horan Closes #3210 from paddyhoran/ARROW-2560 and squashes the following commits: 8f81cb15 Updated README with parquet/rustfmt info --- rust/README.md | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/rust/README.md b/rust/README.md index f8908f8e6e64d..cbfd4dd684a0f 100644 --- a/rust/README.md +++ b/rust/README.md @@ -24,7 +24,8 @@ ## Status -This is a native Rust implementation of Apache Arrow. The current status is: +This is a native Rust implementation of Apache Arrow. Currently the project +is developed and tested against nightly Rust. The current status is: - [x] Primitive Arrays - [x] List Arrays @@ -36,6 +37,13 @@ This is a native Rust implementation of Apache Arrow. The current status is: - [ ] Arrow IPC - [ ] Interop tests with other implementations +## Dependencies + +Parquet support for Apache Arrow requires LLVM. Our windows CI image +includes LLVM but to build the libraries locally windows users will have +to install LLVM. Follow [this](https://github.com/appveyor/ci/issues/2651) +link for info. 
+ ## Examples The examples folder shows how to construct some different types of Arrow @@ -51,8 +59,24 @@ cargo run --example read_csv ## Run Tests +Parquet support in Arrow requires data to test against, this data is in a +git submodule. To pull down this data run the following: + +```bash +git submodule update --init +``` + +The data can then be found in `cpp/submodules/parquet_testing/data`. +Create a new environment variable called `PARQUET_TEST_DATA` to point +to this location and then `cargo test` as usual. + +Our CI uses `rustfmt` to check code formatting. Although the project is +built and tested against nightly rust we use the stable version of +`rustfmt`. So before submitting a PR be sure to run the following +and check for lint issues: + ```bash -cargo test +cargo +stable fmt --all -- --check ``` # Publishing to crates.io From 36ded49568b8c3d664f0f14d06ec199ef5286857 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 18 Dec 2018 15:47:09 +0100 Subject: [PATCH 070/328] ARROW-3058: [Python] Raise more helpful better error message when writing a pandas.DataFrame to Feather format that requires a chunked layout Author: Wes McKinney Closes #3178 from wesm/ARROW-3058 and squashes the following commits: 4a10687f Raise more helpful better error message when a large binary/string column yields ChunkedArray on conversion to pyarrow.Table --- python/pyarrow/feather.py | 26 +++++++++++++++++++++----- python/pyarrow/tests/test_feather.py | 18 ++++++++++++++++++ 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index faa2f7d892ee0..3713c1f135036 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -23,7 +23,7 @@ from pyarrow.compat import pdapi from pyarrow.lib import FeatherError # noqa -from pyarrow.lib import RecordBatch, concat_tables +from pyarrow.lib import Table, concat_tables import pyarrow.lib as ext @@ -62,6 +62,21 @@ def read_pandas(self, columns=None, use_threads=True): use_threads=use_threads) +def check_chunked_overflow(col): + if col.data.num_chunks == 1: + return + + if col.type in (ext.binary(), ext.string()): + raise ValueError("Column '{0}' exceeds 2GB maximum capacity of " + "a Feather binary column. This restriction may be " + "lifted in the future".format(col.name)) + else: + # TODO(wesm): Not sure when else this might be reached + raise ValueError("Column '{0}' of type {1} was chunked on conversion " + "to Arrow and cannot be currently written to " + "Feather format".format(col.name, str(col.type))) + + class FeatherWriter(object): def __init__(self, dest): @@ -78,10 +93,11 @@ def write(self, df): # TODO(wesm): Remove this length check, see ARROW-1732 if len(df.columns) > 0: - batch = RecordBatch.from_pandas(df, preserve_index=False) - for i, name in enumerate(batch.schema.names): - col = batch[i] - self.writer.write_array(name, col) + table = Table.from_pandas(df, preserve_index=False) + for i, name in enumerate(table.schema.names): + col = table[i] + check_chunked_overflow(col) + self.writer.write_array(name, col.data.chunk(0)) self.writer.close() diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 01b567216bfcf..d144f989d0f0a 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
+import io import os import sys import tempfile @@ -535,3 +536,20 @@ def test_unsupported(self): def test_large_dataframe(self): df = pd.DataFrame({'A': np.arange(400000000)}) self._check_pandas_roundtrip(df) + + +@pytest.mark.large_memory +def test_chunked_binary_error_message(): + # ARROW-3058: As Feather does not yet support chunked columns, we at least + # make sure it's clear to the user what is going on + + # 2^31 + 1 bytes + values = [b'x'] + [ + b'x' * (1 << 20) + ] * 2 * (1 << 10) + df = pd.DataFrame({'byte_col': values}) + + with pytest.raises(ValueError, match="'byte_col' exceeds 2GB maximum " + "capacity of a Feather binary column. This restriction " + "may be lifted in the future"): + write_feather(df, io.BytesIO()) From e832df36c2d44d02273de851db3cfcd8c231f479 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Tue, 18 Dec 2018 16:43:39 +0100 Subject: [PATCH 071/328] ARROW-3387: [C++] Implement Binary to String cast MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: François Saint-Jacques Closes #3211 from fsaintjacques/ARROW-3387-cast-binary-to-string and squashes the following commits: 16cdb8ef ARROE-3387: clang-format 1949d377 ARROW-3387: Address review comments 31092b9f ARROW-3387: Implement Binary to String cast f045d64f ARROW-3387: Partition compute-test.cc in separate files 5358148e ARROW-3387: Rename CopyData to ZeroCopyData --- cpp/src/arrow/compute/compute-test.cc | 1551 +---------------- cpp/src/arrow/compute/kernels/CMakeLists.txt | 4 + cpp/src/arrow/compute/kernels/boolean-test.cc | 157 ++ cpp/src/arrow/compute/kernels/cast-test.cc | 1197 +++++++++++++ cpp/src/arrow/compute/kernels/cast.cc | 106 +- cpp/src/arrow/compute/kernels/cast.h | 9 +- cpp/src/arrow/compute/kernels/hash-test.cc | 344 ++++ cpp/src/arrow/compute/kernels/util-internal.h | 4 +- cpp/src/arrow/compute/test-util.h | 57 + cpp/src/arrow/util/utf8.h | 8 + 10 files changed, 1873 insertions(+), 1564 deletions(-) create mode 100644 cpp/src/arrow/compute/kernels/boolean-test.cc create mode 100644 cpp/src/arrow/compute/kernels/cast-test.cc create mode 100644 cpp/src/arrow/compute/kernels/hash-test.cc create mode 100644 cpp/src/arrow/compute/test-util.h diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc index e34a086d8e2d9..8129441b41fa1 100644 --- a/cpp/src/arrow/compute/compute-test.cc +++ b/cpp/src/arrow/compute/compute-test.cc @@ -39,10 +39,8 @@ #include "arrow/compute/context.h" #include "arrow/compute/kernel.h" -#include "arrow/compute/kernels/boolean.h" -#include "arrow/compute/kernels/cast.h" -#include "arrow/compute/kernels/hash.h" #include "arrow/compute/kernels/util-internal.h" +#include "arrow/compute/test-util.h" using std::shared_ptr; using std::vector; @@ -50,26 +48,6 @@ using std::vector; namespace arrow { namespace compute { -class ComputeFixture { - public: - ComputeFixture() : ctx_(default_memory_pool()) {} - - protected: - FunctionContext ctx_; -}; - -template -shared_ptr _MakeArray(const shared_ptr& type, const vector& values, - const vector& is_valid) { - shared_ptr result; - if (is_valid.size() > 0) { - ArrayFromVector(type, is_valid, values, &result); - } else { - ArrayFromVector(type, values, &result); - } - return result; -} - // ---------------------------------------------------------------------- // Datum @@ -91,1533 +69,6 @@ TEST(TestDatum, ImplicitConstructors) { CheckImplicitConstructor
(Datum::TABLE); } -// ---------------------------------------------------------------------- -// Cast - -static void AssertBufferSame(const Array& left, const Array& right, int buffer_index) { - ASSERT_EQ(left.data()->buffers[buffer_index].get(), - right.data()->buffers[buffer_index].get()); -} - -class TestCast : public ComputeFixture, public TestBase { - public: - void CheckPass(const Array& input, const Array& expected, - const shared_ptr& out_type, const CastOptions& options) { - shared_ptr result; - ASSERT_OK(Cast(&ctx_, input, out_type, options, &result)); - ASSERT_ARRAYS_EQUAL(expected, *result); - } - - template - void CheckFails(const shared_ptr& in_type, const vector& in_values, - const vector& is_valid, const shared_ptr& out_type, - const CastOptions& options) { - shared_ptr input, result; - if (is_valid.size() > 0) { - ArrayFromVector(in_type, is_valid, in_values, &input); - } else { - ArrayFromVector(in_type, in_values, &input); - } - ASSERT_RAISES(Invalid, Cast(&ctx_, *input, out_type, options, &result)); - } - - void CheckZeroCopy(const Array& input, const shared_ptr& out_type) { - shared_ptr result; - ASSERT_OK(Cast(&ctx_, input, out_type, {}, &result)); - AssertBufferSame(input, *result, 0); - AssertBufferSame(input, *result, 1); - } - - template - void CheckCase(const shared_ptr& in_type, const vector& in_values, - const vector& is_valid, const shared_ptr& out_type, - const vector& out_values, const CastOptions& options) { - DCHECK_EQ(in_values.size(), out_values.size()); - shared_ptr input, expected; - if (is_valid.size() > 0) { - DCHECK_EQ(is_valid.size(), out_values.size()); - ArrayFromVector(in_type, is_valid, in_values, &input); - ArrayFromVector(out_type, is_valid, out_values, &expected); - } else { - ArrayFromVector(in_type, in_values, &input); - ArrayFromVector(out_type, out_values, &expected); - } - CheckPass(*input, *expected, out_type, options); - - // Check a sliced variant - if (input->length() > 1) { - CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options); - } - } -}; - -TEST_F(TestCast, SameTypeZeroCopy) { - vector is_valid = {true, false, true, true, true}; - vector v1 = {0, 1, 2, 3, 4}; - - shared_ptr arr; - ArrayFromVector(int32(), is_valid, v1, &arr); - - shared_ptr result; - ASSERT_OK(Cast(&this->ctx_, *arr, int32(), {}, &result)); - - AssertBufferSame(*arr, *result, 0); - AssertBufferSame(*arr, *result, 1); -} - -TEST_F(TestCast, ToBoolean) { - CastOptions options; - - vector is_valid = {true, false, true, true, true}; - - // int8, should suffice for other integers - vector v1 = {0, 1, 127, -1, 0}; - vector e1 = {false, true, true, true, false}; - CheckCase(int8(), v1, is_valid, boolean(), e1, - options); - - // floating point - vector v2 = {1.0, 0, 0, -1.0, 5.0}; - vector e2 = {true, false, false, true, true}; - CheckCase(float64(), v2, is_valid, boolean(), e2, - options); -} - -TEST_F(TestCast, ToIntUpcast) { - CastOptions options; - options.allow_int_overflow = false; - - vector is_valid = {true, false, true, true, true}; - - // int8 to int32 - vector v1 = {0, 1, 127, -1, 0}; - vector e1 = {0, 1, 127, -1, 0}; - CheckCase(int8(), v1, is_valid, int32(), e1, - options); - - // bool to int8 - vector v2 = {false, true, false, true, true}; - vector e2 = {0, 1, 0, 1, 1}; - CheckCase(boolean(), v2, is_valid, int8(), e2, - options); - - // uint8 to int16, no overflow/underrun - vector v3 = {0, 100, 200, 255, 0}; - vector e3 = {0, 100, 200, 255, 0}; - CheckCase(uint8(), v3, is_valid, int16(), e3, - options); -} - -TEST_F(TestCast, 
OverflowInNullSlot) { - CastOptions options; - options.allow_int_overflow = false; - - vector is_valid = {true, false, true, true, true}; - - vector v11 = {0, 70000, 2000, 1000, 0}; - vector e11 = {0, 0, 2000, 1000, 0}; - - shared_ptr expected; - ArrayFromVector(int16(), is_valid, e11, &expected); - - auto buf = Buffer::Wrap(v11.data(), v11.size()); - Int32Array tmp11(5, buf, expected->null_bitmap(), -1); - - CheckPass(tmp11, *expected, int16(), options); -} - -TEST_F(TestCast, ToIntDowncastSafe) { - CastOptions options; - options.allow_int_overflow = false; - - vector is_valid = {true, false, true, true, true}; - - // int16 to uint8, no overflow/underrun - vector v1 = {0, 100, 200, 1, 2}; - vector e1 = {0, 100, 200, 1, 2}; - CheckCase(int16(), v1, is_valid, uint8(), e1, - options); - - // int16 to uint8, with overflow - vector v2 = {0, 100, 256, 0, 0}; - CheckFails(int16(), v2, is_valid, uint8(), options); - - // underflow - vector v3 = {0, 100, -1, 0, 0}; - CheckFails(int16(), v3, is_valid, uint8(), options); - - // int32 to int16, no overflow - vector v4 = {0, 1000, 2000, 1, 2}; - vector e4 = {0, 1000, 2000, 1, 2}; - CheckCase(int32(), v4, is_valid, int16(), e4, - options); - - // int32 to int16, overflow - vector v5 = {0, 1000, 2000, 70000, 0}; - CheckFails(int32(), v5, is_valid, int16(), options); - - // underflow - vector v6 = {0, 1000, 2000, -70000, 0}; - CheckFails(int32(), v6, is_valid, int16(), options); - - vector v7 = {0, 1000, 2000, -70000, 0}; - CheckFails(int32(), v7, is_valid, uint8(), options); -} - -template -std::vector UnsafeVectorCast(const std::vector& v) { - size_t n_elems = v.size(); - std::vector result(n_elems); - - for (size_t i = 0; i < v.size(); i++) result[i] = static_cast(v[i]); - - return std::move(result); -} - -TEST_F(TestCast, IntegerSignedToUnsigned) { - CastOptions options; - options.allow_int_overflow = false; - - vector is_valid = {true, false, true, true, true}; - - vector v1 = {INT32_MIN, 100, -1, UINT16_MAX, INT32_MAX}; - - // Same width - CheckFails(int32(), v1, is_valid, uint32(), options); - // Wider - CheckFails(int32(), v1, is_valid, uint64(), options); - // Narrower - CheckFails(int32(), v1, is_valid, uint16(), options); - // Fail because of overflow (instead of underflow). 
- vector over = {0, -11, 0, UINT16_MAX + 1, INT32_MAX}; - CheckFails(int32(), over, is_valid, uint16(), options); - - options.allow_int_overflow = true; - - CheckCase( - int32(), v1, is_valid, uint32(), UnsafeVectorCast(v1), options); - CheckCase( - int32(), v1, is_valid, uint64(), UnsafeVectorCast(v1), options); - CheckCase( - int32(), v1, is_valid, uint16(), UnsafeVectorCast(v1), options); - CheckCase( - int32(), over, is_valid, uint16(), UnsafeVectorCast(over), - options); -} - -TEST_F(TestCast, IntegerUnsignedToSigned) { - CastOptions options; - options.allow_int_overflow = false; - - vector is_valid = {true, true, true}; - - vector v1 = {0, INT16_MAX + 1, UINT32_MAX}; - vector v2 = {0, INT16_MAX + 1, 2}; - // Same width - CheckFails(uint32(), v1, is_valid, int32(), options); - // Narrower - CheckFails(uint32(), v1, is_valid, int16(), options); - CheckFails(uint32(), v2, is_valid, int16(), options); - - options.allow_int_overflow = true; - - CheckCase( - uint32(), v1, is_valid, int32(), UnsafeVectorCast(v1), options); - CheckCase( - uint32(), v1, is_valid, int64(), UnsafeVectorCast(v1), options); - CheckCase( - uint32(), v1, is_valid, int16(), UnsafeVectorCast(v1), options); - CheckCase( - uint32(), v2, is_valid, int16(), UnsafeVectorCast(v2), options); -} - -TEST_F(TestCast, ToIntDowncastUnsafe) { - CastOptions options; - options.allow_int_overflow = true; - - vector is_valid = {true, false, true, true, true}; - - // int16 to uint8, no overflow/underrun - vector v1 = {0, 100, 200, 1, 2}; - vector e1 = {0, 100, 200, 1, 2}; - CheckCase(int16(), v1, is_valid, uint8(), e1, - options); - - // int16 to uint8, with overflow - vector v2 = {0, 100, 256, 0, 0}; - vector e2 = {0, 100, 0, 0, 0}; - CheckCase(int16(), v2, is_valid, uint8(), e2, - options); - - // underflow - vector v3 = {0, 100, -1, 0, 0}; - vector e3 = {0, 100, 255, 0, 0}; - CheckCase(int16(), v3, is_valid, uint8(), e3, - options); - - // int32 to int16, no overflow - vector v4 = {0, 1000, 2000, 1, 2}; - vector e4 = {0, 1000, 2000, 1, 2}; - CheckCase(int32(), v4, is_valid, int16(), e4, - options); - - // int32 to int16, overflow - // TODO(wesm): do we want to allow this? we could set to null - vector v5 = {0, 1000, 2000, 70000, 0}; - vector e5 = {0, 1000, 2000, 4464, 0}; - CheckCase(int32(), v5, is_valid, int16(), e5, - options); - - // underflow - // TODO(wesm): do we want to allow this? 
we could set overflow to null - vector v6 = {0, 1000, 2000, -70000, 0}; - vector e6 = {0, 1000, 2000, -4464, 0}; - CheckCase(int32(), v6, is_valid, int16(), e6, - options); -} - -TEST_F(TestCast, FloatingPointToInt) { - // which means allow_float_truncate == false - auto options = CastOptions::Safe(); - - vector is_valid = {true, false, true, true, true}; - vector all_valid = {true, true, true, true, true}; - - // float32 to int32 no truncation - vector v1 = {1.0, 0, 0.0, -1.0, 5.0}; - vector e1 = {1, 0, 0, -1, 5}; - CheckCase(float32(), v1, is_valid, int32(), e1, - options); - CheckCase(float32(), v1, all_valid, int32(), e1, - options); - - // float64 to int32 no truncation - vector v2 = {1.0, 0, 0.0, -1.0, 5.0}; - vector e2 = {1, 0, 0, -1, 5}; - CheckCase(float64(), v2, is_valid, int32(), e2, - options); - CheckCase(float64(), v2, all_valid, int32(), e2, - options); - - // float64 to int64 no truncation - vector v3 = {1.0, 0, 0.0, -1.0, 5.0}; - vector e3 = {1, 0, 0, -1, 5}; - CheckCase(float64(), v3, is_valid, int64(), e3, - options); - CheckCase(float64(), v3, all_valid, int64(), e3, - options); - - // float64 to int32 truncate - vector v4 = {1.5, 0, 0.5, -1.5, 5.5}; - vector e4 = {1, 0, 0, -1, 5}; - - options.allow_float_truncate = false; - CheckFails(float64(), v4, is_valid, int32(), options); - CheckFails(float64(), v4, all_valid, int32(), options); - - options.allow_float_truncate = true; - CheckCase(float64(), v4, is_valid, int32(), e4, - options); - CheckCase(float64(), v4, all_valid, int32(), e4, - options); - - // float64 to int64 truncate - vector v5 = {1.5, 0, 0.5, -1.5, 5.5}; - vector e5 = {1, 0, 0, -1, 5}; - - options.allow_float_truncate = false; - CheckFails(float64(), v5, is_valid, int64(), options); - CheckFails(float64(), v5, all_valid, int64(), options); - - options.allow_float_truncate = true; - CheckCase(float64(), v5, is_valid, int64(), e5, - options); - CheckCase(float64(), v5, all_valid, int64(), e5, - options); -} - -TEST_F(TestCast, IntToFloatingPoint) { - auto options = CastOptions::Safe(); - - vector all_valid = {true, true, true, true, true}; - vector all_invalid = {false, false, false, false, false}; - - vector v1 = {INT64_MIN, INT64_MIN + 1, 0, INT64_MAX - 1, INT64_MAX}; - CheckFails(int64(), v1, all_valid, float32(), options); - - // While it's not safe to convert, all values are null. 
- CheckCase(int64(), v1, all_invalid, float64(), - UnsafeVectorCast(v1), - options); -} - -TEST_F(TestCast, TimestampToTimestamp) { - CastOptions options; - - auto CheckTimestampCast = - [this](const CastOptions& options, TimeUnit::type from_unit, TimeUnit::type to_unit, - const vector& from_values, const vector& to_values, - const vector& is_valid) { - CheckCase( - timestamp(from_unit), from_values, is_valid, timestamp(to_unit), to_values, - options); - }; - - vector is_valid = {true, false, true, true, true}; - - // Multiply promotions - vector v1 = {0, 100, 200, 1, 2}; - vector e1 = {0, 100000, 200000, 1000, 2000}; - CheckTimestampCast(options, TimeUnit::SECOND, TimeUnit::MILLI, v1, e1, is_valid); - - vector v2 = {0, 100, 200, 1, 2}; - vector e2 = {0, 100000000L, 200000000L, 1000000, 2000000}; - CheckTimestampCast(options, TimeUnit::SECOND, TimeUnit::MICRO, v2, e2, is_valid); - - vector v3 = {0, 100, 200, 1, 2}; - vector e3 = {0, 100000000000L, 200000000000L, 1000000000L, 2000000000L}; - CheckTimestampCast(options, TimeUnit::SECOND, TimeUnit::NANO, v3, e3, is_valid); - - vector v4 = {0, 100, 200, 1, 2}; - vector e4 = {0, 100000, 200000, 1000, 2000}; - CheckTimestampCast(options, TimeUnit::MILLI, TimeUnit::MICRO, v4, e4, is_valid); - - vector v5 = {0, 100, 200, 1, 2}; - vector e5 = {0, 100000000L, 200000000L, 1000000, 2000000}; - CheckTimestampCast(options, TimeUnit::MILLI, TimeUnit::NANO, v5, e5, is_valid); - - vector v6 = {0, 100, 200, 1, 2}; - vector e6 = {0, 100000, 200000, 1000, 2000}; - CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::NANO, v6, e6, is_valid); - - // Zero copy - vector v7 = {0, 70000, 2000, 1000, 0}; - shared_ptr arr; - ArrayFromVector(timestamp(TimeUnit::SECOND), is_valid, v7, - &arr); - CheckZeroCopy(*arr, timestamp(TimeUnit::SECOND)); - - // ARROW-1773, cast to integer - CheckZeroCopy(*arr, int64()); - - // Divide, truncate - vector v8 = {0, 100123, 200456, 1123, 2456}; - vector e8 = {0, 100, 200, 1, 2}; - - options.allow_time_truncate = true; - CheckTimestampCast(options, TimeUnit::MILLI, TimeUnit::SECOND, v8, e8, is_valid); - CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::MILLI, v8, e8, is_valid); - CheckTimestampCast(options, TimeUnit::NANO, TimeUnit::MICRO, v8, e8, is_valid); - - vector v9 = {0, 100123000, 200456000, 1123000, 2456000}; - vector e9 = {0, 100, 200, 1, 2}; - CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::SECOND, v9, e9, is_valid); - CheckTimestampCast(options, TimeUnit::NANO, TimeUnit::MILLI, v9, e9, is_valid); - - vector v10 = {0, 100123000000L, 200456000000L, 1123000000L, 2456000000}; - vector e10 = {0, 100, 200, 1, 2}; - CheckTimestampCast(options, TimeUnit::NANO, TimeUnit::SECOND, v10, e10, is_valid); - - // Disallow truncate, failures - options.allow_time_truncate = false; - CheckFails(timestamp(TimeUnit::MILLI), v8, is_valid, - timestamp(TimeUnit::SECOND), options); - CheckFails(timestamp(TimeUnit::MICRO), v8, is_valid, - timestamp(TimeUnit::MILLI), options); - CheckFails(timestamp(TimeUnit::NANO), v8, is_valid, - timestamp(TimeUnit::MICRO), options); - CheckFails(timestamp(TimeUnit::MICRO), v9, is_valid, - timestamp(TimeUnit::SECOND), options); - CheckFails(timestamp(TimeUnit::NANO), v9, is_valid, - timestamp(TimeUnit::MILLI), options); - CheckFails(timestamp(TimeUnit::NANO), v10, is_valid, - timestamp(TimeUnit::SECOND), options); -} - -TEST_F(TestCast, TimestampToDate32_Date64) { - CastOptions options; - - vector is_valid = {true, true, false}; - - // 2000-01-01, 2000-01-02, null - vector v_nano = 
{946684800000000000, 946771200000000000, 0}; - vector v_micro = {946684800000000, 946771200000000, 0}; - vector v_milli = {946684800000, 946771200000, 0}; - vector v_second = {946684800, 946771200, 0}; - vector v_day = {10957, 10958, 0}; - - // Simple conversions - CheckCase( - timestamp(TimeUnit::NANO), v_nano, is_valid, date64(), v_milli, options); - CheckCase( - timestamp(TimeUnit::MICRO), v_micro, is_valid, date64(), v_milli, options); - CheckCase( - timestamp(TimeUnit::MILLI), v_milli, is_valid, date64(), v_milli, options); - CheckCase( - timestamp(TimeUnit::SECOND), v_second, is_valid, date64(), v_milli, options); - - CheckCase( - timestamp(TimeUnit::NANO), v_nano, is_valid, date32(), v_day, options); - CheckCase( - timestamp(TimeUnit::MICRO), v_micro, is_valid, date32(), v_day, options); - CheckCase( - timestamp(TimeUnit::MILLI), v_milli, is_valid, date32(), v_day, options); - CheckCase( - timestamp(TimeUnit::SECOND), v_second, is_valid, date32(), v_day, options); - - // Disallow truncate, failures - vector v_nano_fail = {946684800000000001, 946771200000000001, 0}; - vector v_micro_fail = {946684800000001, 946771200000001, 0}; - vector v_milli_fail = {946684800001, 946771200001, 0}; - vector v_second_fail = {946684801, 946771201, 0}; - - options.allow_time_truncate = false; - CheckFails(timestamp(TimeUnit::NANO), v_nano_fail, is_valid, date64(), - options); - CheckFails(timestamp(TimeUnit::MICRO), v_micro_fail, is_valid, date64(), - options); - CheckFails(timestamp(TimeUnit::MILLI), v_milli_fail, is_valid, date64(), - options); - CheckFails(timestamp(TimeUnit::SECOND), v_second_fail, is_valid, - date64(), options); - - CheckFails(timestamp(TimeUnit::NANO), v_nano_fail, is_valid, date32(), - options); - CheckFails(timestamp(TimeUnit::MICRO), v_micro_fail, is_valid, date32(), - options); - CheckFails(timestamp(TimeUnit::MILLI), v_milli_fail, is_valid, date32(), - options); - CheckFails(timestamp(TimeUnit::SECOND), v_second_fail, is_valid, - date32(), options); - - // Make sure that nulls are excluded from the truncation checks - vector v_second_nofail = {946684800, 946771200, 1}; - CheckCase( - timestamp(TimeUnit::SECOND), v_second_nofail, is_valid, date64(), v_milli, options); - CheckCase( - timestamp(TimeUnit::SECOND), v_second_nofail, is_valid, date32(), v_day, options); -} - -TEST_F(TestCast, TimeToCompatible) { - CastOptions options; - - vector is_valid = {true, false, true, true, true}; - - // Multiply promotions - vector v1 = {0, 100, 200, 1, 2}; - vector e1 = {0, 100000, 200000, 1000, 2000}; - CheckCase( - time32(TimeUnit::SECOND), v1, is_valid, time32(TimeUnit::MILLI), e1, options); - - vector v2 = {0, 100, 200, 1, 2}; - vector e2 = {0, 100000000L, 200000000L, 1000000, 2000000}; - CheckCase( - time32(TimeUnit::SECOND), v2, is_valid, time64(TimeUnit::MICRO), e2, options); - - vector v3 = {0, 100, 200, 1, 2}; - vector e3 = {0, 100000000000L, 200000000000L, 1000000000L, 2000000000L}; - CheckCase( - time32(TimeUnit::SECOND), v3, is_valid, time64(TimeUnit::NANO), e3, options); - - vector v4 = {0, 100, 200, 1, 2}; - vector e4 = {0, 100000, 200000, 1000, 2000}; - CheckCase( - time32(TimeUnit::MILLI), v4, is_valid, time64(TimeUnit::MICRO), e4, options); - - vector v5 = {0, 100, 200, 1, 2}; - vector e5 = {0, 100000000L, 200000000L, 1000000, 2000000}; - CheckCase( - time32(TimeUnit::MILLI), v5, is_valid, time64(TimeUnit::NANO), e5, options); - - vector v6 = {0, 100, 200, 1, 2}; - vector e6 = {0, 100000, 200000, 1000, 2000}; - CheckCase( - time64(TimeUnit::MICRO), v6, is_valid, 
time64(TimeUnit::NANO), e6, options); - - // Zero copy - vector v7 = {0, 70000, 2000, 1000, 0}; - shared_ptr arr; - ArrayFromVector(time64(TimeUnit::MICRO), is_valid, v7, &arr); - CheckZeroCopy(*arr, time64(TimeUnit::MICRO)); - - // ARROW-1773: cast to int64 - CheckZeroCopy(*arr, int64()); - - vector v7_2 = {0, 70000, 2000, 1000, 0}; - ArrayFromVector(time32(TimeUnit::SECOND), is_valid, v7_2, &arr); - CheckZeroCopy(*arr, time32(TimeUnit::SECOND)); - - // ARROW-1773: cast to int64 - CheckZeroCopy(*arr, int32()); - - // Divide, truncate - vector v8 = {0, 100123, 200456, 1123, 2456}; - vector e8 = {0, 100, 200, 1, 2}; - - options.allow_time_truncate = true; - CheckCase( - time32(TimeUnit::MILLI), v8, is_valid, time32(TimeUnit::SECOND), e8, options); - CheckCase( - time64(TimeUnit::MICRO), v8, is_valid, time32(TimeUnit::MILLI), e8, options); - CheckCase( - time64(TimeUnit::NANO), v8, is_valid, time64(TimeUnit::MICRO), e8, options); - - vector v9 = {0, 100123000, 200456000, 1123000, 2456000}; - vector e9 = {0, 100, 200, 1, 2}; - CheckCase( - time64(TimeUnit::MICRO), v9, is_valid, time32(TimeUnit::SECOND), e9, options); - CheckCase( - time64(TimeUnit::NANO), v9, is_valid, time32(TimeUnit::MILLI), e9, options); - - vector v10 = {0, 100123000000L, 200456000000L, 1123000000L, 2456000000}; - vector e10 = {0, 100, 200, 1, 2}; - CheckCase( - time64(TimeUnit::NANO), v10, is_valid, time32(TimeUnit::SECOND), e10, options); - - // Disallow truncate, failures - - options.allow_time_truncate = false; - CheckFails(time32(TimeUnit::MILLI), v8, is_valid, time32(TimeUnit::SECOND), - options); - CheckFails(time64(TimeUnit::MICRO), v8, is_valid, time32(TimeUnit::MILLI), - options); - CheckFails(time64(TimeUnit::NANO), v8, is_valid, time64(TimeUnit::MICRO), - options); - CheckFails(time64(TimeUnit::MICRO), v9, is_valid, time32(TimeUnit::SECOND), - options); - CheckFails(time64(TimeUnit::NANO), v9, is_valid, time32(TimeUnit::MILLI), - options); - CheckFails(time64(TimeUnit::NANO), v10, is_valid, time32(TimeUnit::SECOND), - options); -} - -TEST_F(TestCast, PrimitiveZeroCopy) { - shared_ptr arr; - - ArrayFromVector(uint8(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, uint8()); - ArrayFromVector(int8(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, int8()); - - ArrayFromVector(uint16(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, uint16()); - ArrayFromVector(int16(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, int16()); - - ArrayFromVector(uint32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, uint32()); - ArrayFromVector(int32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, int32()); - - ArrayFromVector(uint64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, uint64()); - ArrayFromVector(int64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, int64()); - - ArrayFromVector(float32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, float32()); - - ArrayFromVector(float64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, float64()); -} - -TEST_F(TestCast, DateToCompatible) { - CastOptions options; - - vector is_valid = {true, false, true, true, true}; - - constexpr int64_t F = 86400000; - - // Multiply promotion - vector v1 = {0, 100, 200, 1, 2}; - vector e1 = {0, 100 * F, 200 * F, F, 2 * F}; - CheckCase(date32(), v1, is_valid, date64(), - e1, options); - - // Zero copy - vector v2 = {0, 70000, 2000, 1000, 0}; - vector v3 = {0, 70000, 2000, 1000, 0}; - shared_ptr arr; - ArrayFromVector(date32(), is_valid, v2, 
&arr); - CheckZeroCopy(*arr, date32()); - - // ARROW-1773: zero copy cast to integer - CheckZeroCopy(*arr, int32()); - - ArrayFromVector(date64(), is_valid, v3, &arr); - CheckZeroCopy(*arr, date64()); - - // ARROW-1773: zero copy cast to integer - CheckZeroCopy(*arr, int64()); - - // Divide, truncate - vector v8 = {0, 100 * F + 123, 200 * F + 456, F + 123, 2 * F + 456}; - vector e8 = {0, 100, 200, 1, 2}; - - options.allow_time_truncate = true; - CheckCase(date64(), v8, is_valid, date32(), - e8, options); - - // Disallow truncate, failures - options.allow_time_truncate = false; - CheckFails(date64(), v8, is_valid, date32(), options); -} - -TEST_F(TestCast, ToDouble) { - CastOptions options; - vector is_valid = {true, false, true, true, true}; - - // int16 to double - vector v1 = {0, 100, 200, 1, 2}; - vector e1 = {0, 100, 200, 1, 2}; - CheckCase(int16(), v1, is_valid, float64(), e1, - options); - - // float to double - vector v2 = {0, 100, 200, 1, 2}; - vector e2 = {0, 100, 200, 1, 2}; - CheckCase(float32(), v2, is_valid, float64(), e2, - options); - - // bool to double - vector v3 = {true, true, false, false, true}; - vector e3 = {1, 1, 0, 0, 1}; - CheckCase(boolean(), v3, is_valid, float64(), e3, - options); -} - -TEST_F(TestCast, ChunkedArray) { - vector values1 = {0, 1, 2}; - vector values2 = {3, 4, 5}; - - auto type = int16(); - auto out_type = int64(); - - auto a1 = _MakeArray(type, values1, {}); - auto a2 = _MakeArray(type, values2, {}); - - ArrayVector arrays = {a1, a2}; - auto carr = std::make_shared(arrays); - - CastOptions options; - - Datum out; - ASSERT_OK(Cast(&this->ctx_, carr, out_type, options, &out)); - ASSERT_EQ(Datum::CHUNKED_ARRAY, out.kind()); - - auto out_carr = out.chunked_array(); - - vector ex_values1 = {0, 1, 2}; - vector ex_values2 = {3, 4, 5}; - auto a3 = _MakeArray(out_type, ex_values1, {}); - auto a4 = _MakeArray(out_type, ex_values2, {}); - - ArrayVector ex_arrays = {a3, a4}; - auto ex_carr = std::make_shared(ex_arrays); - - ASSERT_TRUE(out.chunked_array()->Equals(*ex_carr)); -} - -TEST_F(TestCast, UnsupportedTarget) { - vector is_valid = {true, false, true, true, true}; - vector v1 = {0, 1, 2, 3, 4}; - - shared_ptr arr; - ArrayFromVector(int32(), is_valid, v1, &arr); - - shared_ptr result; - ASSERT_RAISES(NotImplemented, Cast(&this->ctx_, *arr, utf8(), {}, &result)); -} - -TEST_F(TestCast, DateTimeZeroCopy) { - vector is_valid = {true, false, true, true, true}; - - vector v1 = {0, 70000, 2000, 1000, 0}; - shared_ptr arr; - ArrayFromVector(int32(), is_valid, v1, &arr); - - CheckZeroCopy(*arr, time32(TimeUnit::SECOND)); - CheckZeroCopy(*arr, date32()); - - vector v2 = {0, 70000, 2000, 1000, 0}; - ArrayFromVector(int64(), is_valid, v2, &arr); - - CheckZeroCopy(*arr, time64(TimeUnit::MICRO)); - CheckZeroCopy(*arr, date64()); - CheckZeroCopy(*arr, timestamp(TimeUnit::NANO)); -} - -TEST_F(TestCast, FromNull) { - // Null casts to everything - const int length = 10; - - NullArray arr(length); - - shared_ptr result; - ASSERT_OK(Cast(&ctx_, arr, int32(), {}, &result)); - - ASSERT_EQ(length, result->length()); - ASSERT_EQ(length, result->null_count()); - - // OK to look at bitmaps - ASSERT_ARRAYS_EQUAL(*result, *result); -} - -TEST_F(TestCast, PreallocatedMemory) { - CastOptions options; - options.allow_int_overflow = false; - - vector is_valid = {true, false, true, true, true}; - - const int64_t length = 5; - - shared_ptr arr; - vector v1 = {0, 70000, 2000, 1000, 0}; - vector e1 = {0, 70000, 2000, 1000, 0}; - ArrayFromVector(int32(), is_valid, v1, &arr); - - auto 
out_type = int64(); - - std::unique_ptr kernel; - ASSERT_OK(GetCastFunction(*int32(), out_type, options, &kernel)); - - auto out_data = ArrayData::Make(out_type, length); - - shared_ptr out_values; - ASSERT_OK(this->ctx_.Allocate(length * sizeof(int64_t), &out_values)); - - out_data->buffers.push_back(nullptr); - out_data->buffers.push_back(out_values); - - Datum out(out_data); - ASSERT_OK(kernel->Call(&this->ctx_, arr, &out)); - - // Buffer address unchanged - ASSERT_EQ(out_values.get(), out_data->buffers[1].get()); - - shared_ptr result = MakeArray(out_data); - shared_ptr expected; - ArrayFromVector(int64(), is_valid, e1, &expected); - - ASSERT_ARRAYS_EQUAL(*expected, *result); -} - -template -void CheckOffsetOutputCase(FunctionContext* ctx, const std::shared_ptr& in_type, - const vector& in_values, - const std::shared_ptr& out_type, - const vector& out_values) { - using OutTraits = TypeTraits; - - CastOptions options; - - const int64_t length = static_cast(in_values.size()); - - shared_ptr arr, expected; - ArrayFromVector(in_type, in_values, &arr); - ArrayFromVector(out_type, out_values, &expected); - - shared_ptr out_buffer; - ASSERT_OK(ctx->Allocate(OutTraits::bytes_required(length), &out_buffer)); - - std::unique_ptr kernel; - ASSERT_OK(GetCastFunction(*in_type, out_type, options, &kernel)); - - const int64_t first_half = length / 2; - - auto out_data = ArrayData::Make(out_type, length, {nullptr, out_buffer}); - auto out_second_data = out_data->Copy(); - out_second_data->offset = first_half; - - Datum out_first(out_data); - Datum out_second(out_second_data); - - // Cast each bit - ASSERT_OK(kernel->Call(ctx, arr->Slice(0, first_half), &out_first)); - ASSERT_OK(kernel->Call(ctx, arr->Slice(first_half), &out_second)); - - shared_ptr result = MakeArray(out_data); - - ASSERT_ARRAYS_EQUAL(*expected, *result); -} - -TEST_F(TestCast, OffsetOutputBuffer) { - // ARROW-1735 - vector v1 = {0, 10000, 2000, 1000, 0}; - vector e1 = {0, 10000, 2000, 1000, 0}; - - auto in_type = int32(); - auto out_type = int64(); - CheckOffsetOutputCase(&this->ctx_, in_type, v1, - out_type, e1); - - vector e2 = {false, true, true, true, false}; - - out_type = boolean(); - CheckOffsetOutputCase(&this->ctx_, in_type, v1, - boolean(), e2); - - vector e3 = {0, 10000, 2000, 1000, 0}; - CheckOffsetOutputCase(&this->ctx_, in_type, v1, - int16(), e3); -} - -TEST_F(TestCast, StringToBoolean) { - CastOptions options; - - vector is_valid = {true, false, true, true, true}; - - vector v1 = {"False", "true", "true", "True", "false"}; - vector v2 = {"0", "1", "1", "1", "0"}; - vector e = {false, true, true, true, false}; - CheckCase(utf8(), v1, is_valid, boolean(), - e, options); - CheckCase(utf8(), v2, is_valid, boolean(), - e, options); -} - -TEST_F(TestCast, StringToBooleanErrors) { - CastOptions options; - - vector is_valid = {true}; - - CheckFails(utf8(), {"false "}, is_valid, boolean(), options); - CheckFails(utf8(), {"T"}, is_valid, boolean(), options); -} - -TEST_F(TestCast, StringToNumber) { - CastOptions options; - - vector is_valid = {true, false, true, true, true}; - - // string to int - vector v_int = {"0", "1", "127", "-1", "0"}; - vector e_int8 = {0, 1, 127, -1, 0}; - vector e_int16 = {0, 1, 127, -1, 0}; - vector e_int32 = {0, 1, 127, -1, 0}; - vector e_int64 = {0, 1, 127, -1, 0}; - CheckCase(utf8(), v_int, is_valid, int8(), - e_int8, options); - CheckCase(utf8(), v_int, is_valid, int16(), - e_int16, options); - CheckCase(utf8(), v_int, is_valid, int32(), - e_int32, options); - CheckCase(utf8(), v_int, is_valid, 
int64(), - e_int64, options); - - v_int = {"2147483647", "0", "-2147483648", "0", "0"}; - e_int32 = {2147483647, 0, -2147483648LL, 0, 0}; - CheckCase(utf8(), v_int, is_valid, int32(), - e_int32, options); - v_int = {"9223372036854775807", "0", "-9223372036854775808", "0", "0"}; - e_int64 = {9223372036854775807LL, 0, (-9223372036854775807LL - 1), 0, 0}; - CheckCase(utf8(), v_int, is_valid, int64(), - e_int64, options); - - // string to uint - vector v_uint = {"0", "1", "127", "255", "0"}; - vector e_uint8 = {0, 1, 127, 255, 0}; - vector e_uint16 = {0, 1, 127, 255, 0}; - vector e_uint32 = {0, 1, 127, 255, 0}; - vector e_uint64 = {0, 1, 127, 255, 0}; - CheckCase(utf8(), v_uint, is_valid, - uint8(), e_uint8, options); - CheckCase(utf8(), v_uint, is_valid, - uint16(), e_uint16, options); - CheckCase(utf8(), v_uint, is_valid, - uint32(), e_uint32, options); - CheckCase(utf8(), v_uint, is_valid, - uint64(), e_uint64, options); - - v_uint = {"4294967295", "0", "0", "0", "0"}; - e_uint32 = {4294967295, 0, 0, 0, 0}; - CheckCase(utf8(), v_uint, is_valid, - uint32(), e_uint32, options); - v_uint = {"18446744073709551615", "0", "0", "0", "0"}; - e_uint64 = {18446744073709551615ULL, 0, 0, 0, 0}; - CheckCase(utf8(), v_uint, is_valid, - uint64(), e_uint64, options); - - // string to float - vector v_float = {"0.1", "1.2", "127.3", "200.4", "0.5"}; - vector e_float = {0.1f, 1.2f, 127.3f, 200.4f, 0.5f}; - vector e_double = {0.1, 1.2, 127.3, 200.4, 0.5}; - CheckCase(utf8(), v_float, is_valid, - float32(), e_float, options); - CheckCase(utf8(), v_float, is_valid, - float64(), e_double, options); - - // Test that casting is locale-independent - auto global_locale = std::locale(); - try { - // French locale uses the comma as decimal point - std::locale::global(std::locale("fr_FR.UTF-8")); - } catch (std::runtime_error&) { - // Locale unavailable, ignore - } - CheckCase(utf8(), v_float, is_valid, - float32(), e_float, options); - CheckCase(utf8(), v_float, is_valid, - float64(), e_double, options); - std::locale::global(global_locale); -} - -TEST_F(TestCast, StringToNumberErrors) { - CastOptions options; - - vector is_valid = {true}; - - CheckFails(utf8(), {"z"}, is_valid, int8(), options); - CheckFails(utf8(), {"12 z"}, is_valid, int8(), options); - CheckFails(utf8(), {"128"}, is_valid, int8(), options); - CheckFails(utf8(), {"-129"}, is_valid, int8(), options); - CheckFails(utf8(), {"0.5"}, is_valid, int8(), options); - - CheckFails(utf8(), {"256"}, is_valid, uint8(), options); - CheckFails(utf8(), {"-1"}, is_valid, uint8(), options); - - CheckFails(utf8(), {"z"}, is_valid, float32(), options); -} - -TEST_F(TestCast, StringToTimestamp) { - CastOptions options; - - vector is_valid = {true, false, true}; - vector strings = {"1970-01-01", "xxx", "2000-02-29"}; - - auto type = timestamp(TimeUnit::SECOND); - vector e = {0, 0, 951782400}; - CheckCase(utf8(), strings, is_valid, - type, e, options); - - type = timestamp(TimeUnit::MICRO); - e = {0, 0, 951782400000000LL}; - CheckCase(utf8(), strings, is_valid, - type, e, options); - - // NOTE: timestamp parsing is tested comprehensively in parsing-util-test.cc -} - -TEST_F(TestCast, StringToTimestampErrors) { - CastOptions options; - - vector is_valid = {true}; - - for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI, TimeUnit::MICRO, TimeUnit::NANO}) { - auto type = timestamp(unit); - CheckFails(utf8(), {""}, is_valid, type, options); - CheckFails(utf8(), {"xxx"}, is_valid, type, options); - } -} - -template -class TestDictionaryCast : public TestCast {}; - -typedef 
::testing::Types - TestTypes; - -TYPED_TEST_CASE(TestDictionaryCast, TestTypes); - -TYPED_TEST(TestDictionaryCast, Basic) { - CastOptions options; - shared_ptr plain_array = - TestBase::MakeRandomArray::ArrayType>(10, 2); - - Datum out; - ASSERT_OK(DictionaryEncode(&this->ctx_, plain_array->data(), &out)); - - this->CheckPass(*MakeArray(out.array()), *plain_array, plain_array->type(), options); -} - -TEST_F(TestCast, DictToNonDictNoNulls) { - vector dict_values = {"foo", "bar", "baz"}; - auto ex_dict = _MakeArray(utf8(), dict_values, {}); - auto dict_type = dictionary(int32(), ex_dict); - - // Explicitly construct with nullptr for the null_bitmap_data - std::vector i1 = {1, 0, 1}; - std::vector i2 = {2, 1, 0, 1}; - auto c1 = std::make_shared>(3, Buffer::Wrap(i1)); - auto c2 = std::make_shared>(4, Buffer::Wrap(i2)); - - ArrayVector dict_arrays = {std::make_shared(dict_type, c1), - std::make_shared(dict_type, c2)}; - auto dict_carr = std::make_shared(dict_arrays); - - Datum cast_input(dict_carr); - Datum cast_output; - // Ensure that casting works even when the null_bitmap_data array is a nullptr - ASSERT_OK(Cast(&this->ctx_, cast_input, - static_cast(*dict_type).dictionary()->type(), - CastOptions(), &cast_output)); - ASSERT_EQ(Datum::CHUNKED_ARRAY, cast_output.kind()); - - auto e1 = _MakeArray(utf8(), {"bar", "foo", "bar"}, {}); - auto e2 = _MakeArray(utf8(), {"baz", "bar", "foo", "bar"}, {}); - - auto chunks = cast_output.chunked_array()->chunks(); - ASSERT_EQ(chunks.size(), 2); - ASSERT_ARRAYS_EQUAL(*e1, *chunks[0]); - ASSERT_ARRAYS_EQUAL(*e2, *chunks[1]); -} - -/*TYPED_TEST(TestDictionaryCast, Reverse) { - CastOptions options; - shared_ptr plain_array = - TestBase::MakeRandomArray::ArrayType>(10, 2); - - shared_ptr dict_array; - ASSERT_OK(EncodeArrayToDictionary(*plain_array, this->pool_, &dict_array)); - - this->CheckPass(*plain_array, *dict_array, dict_array->type(), options); -}*/ - -TEST_F(TestCast, ListToList) { - CastOptions options; - std::shared_ptr offsets; - - vector offsets_values = {0, 1, 2, 5, 7, 7, 8, 10}; - std::vector offsets_is_valid = {true, true, true, true, false, true, true, true}; - ArrayFromVector(offsets_is_valid, offsets_values, &offsets); - - shared_ptr int32_plain_array = - TestBase::MakeRandomArray::ArrayType>(10, 2); - std::shared_ptr int32_list_array; - ASSERT_OK( - ListArray::FromArrays(*offsets, *int32_plain_array, pool_, &int32_list_array)); - - std::shared_ptr int64_plain_array; - ASSERT_OK(Cast(&this->ctx_, *int32_plain_array, int64(), options, &int64_plain_array)); - std::shared_ptr int64_list_array; - ASSERT_OK( - ListArray::FromArrays(*offsets, *int64_plain_array, pool_, &int64_list_array)); - - std::shared_ptr float64_plain_array; - ASSERT_OK( - Cast(&this->ctx_, *int32_plain_array, float64(), options, &float64_plain_array)); - std::shared_ptr float64_list_array; - ASSERT_OK( - ListArray::FromArrays(*offsets, *float64_plain_array, pool_, &float64_list_array)); - - CheckPass(*int32_list_array, *int64_list_array, int64_list_array->type(), options); - CheckPass(*int32_list_array, *float64_list_array, float64_list_array->type(), options); - CheckPass(*int64_list_array, *int32_list_array, int32_list_array->type(), options); - CheckPass(*int64_list_array, *float64_list_array, float64_list_array->type(), options); - - options.allow_float_truncate = true; - CheckPass(*float64_list_array, *int32_list_array, int32_list_array->type(), options); - CheckPass(*float64_list_array, *int64_list_array, int64_list_array->type(), options); -} - -// 
---------------------------------------------------------------------- -// Dictionary tests - -template -void CheckUnique(FunctionContext* ctx, const shared_ptr& type, - const vector& in_values, const vector& in_is_valid, - const vector& out_values, const vector& out_is_valid) { - shared_ptr input = _MakeArray(type, in_values, in_is_valid); - shared_ptr expected = _MakeArray(type, out_values, out_is_valid); - - shared_ptr result; - ASSERT_OK(Unique(ctx, input, &result)); - ASSERT_ARRAYS_EQUAL(*expected, *result); -} - -template -void CheckDictEncode(FunctionContext* ctx, const shared_ptr& type, - const vector& in_values, const vector& in_is_valid, - const vector& out_values, const vector& out_is_valid, - const vector& out_indices) { - shared_ptr input = _MakeArray(type, in_values, in_is_valid); - shared_ptr ex_dict = _MakeArray(type, out_values, out_is_valid); - shared_ptr ex_indices = - _MakeArray(int32(), out_indices, in_is_valid); - - DictionaryArray expected(dictionary(int32(), ex_dict), ex_indices); - - Datum datum_out; - ASSERT_OK(DictionaryEncode(ctx, input, &datum_out)); - shared_ptr result = MakeArray(datum_out.array()); - - ASSERT_ARRAYS_EQUAL(expected, *result); -} - -class TestHashKernel : public ComputeFixture, public TestBase {}; - -template -class TestHashKernelPrimitive : public ComputeFixture, public TestBase {}; - -typedef ::testing::Types - PrimitiveDictionaries; - -TYPED_TEST_CASE(TestHashKernelPrimitive, PrimitiveDictionaries); - -TYPED_TEST(TestHashKernelPrimitive, Unique) { - using T = typename TypeParam::c_type; - auto type = TypeTraits::type_singleton(); - CheckUnique(&this->ctx_, type, {2, 1, 2, 1}, {true, false, true, true}, - {2, 1}, {}); - CheckUnique(&this->ctx_, type, {2, 1, 3, 1}, {false, false, true, true}, - {3, 1}, {}); -} - -TYPED_TEST(TestHashKernelPrimitive, DictEncode) { - using T = typename TypeParam::c_type; - auto type = TypeTraits::type_singleton(); - CheckDictEncode(&this->ctx_, type, {2, 1, 2, 1, 2, 3}, - {true, false, true, true, true, true}, {2, 1, 3}, {}, - {0, 0, 0, 1, 0, 2}); -} - -TYPED_TEST(TestHashKernelPrimitive, PrimitiveResizeTable) { - using T = typename TypeParam::c_type; - // Skip this test for (u)int8 - if (sizeof(Scalar) == 1) { - return; - } - - const int64_t kTotalValues = 1000000; - const int64_t kRepeats = 5; - - vector values; - vector uniques; - vector indices; - for (int64_t i = 0; i < kTotalValues * kRepeats; i++) { - const auto val = static_cast(i % kTotalValues); - values.push_back(val); - - if (i < kTotalValues) { - uniques.push_back(val); - } - indices.push_back(static_cast(i % kTotalValues)); - } - - auto type = TypeTraits::type_singleton(); - CheckUnique(&this->ctx_, type, values, {}, uniques, {}); - - CheckDictEncode(&this->ctx_, type, values, {}, uniques, {}, indices); -} - -TEST_F(TestHashKernel, UniqueTimeTimestamp) { - CheckUnique(&this->ctx_, time32(TimeUnit::SECOND), {2, 1, 2, 1}, - {true, false, true, true}, {2, 1}, {}); - - CheckUnique(&this->ctx_, time64(TimeUnit::NANO), {2, 1, 2, 1}, - {true, false, true, true}, {2, 1}, {}); - - CheckUnique(&this->ctx_, timestamp(TimeUnit::NANO), - {2, 1, 2, 1}, {true, false, true, true}, {2, 1}, - {}); -} - -TEST_F(TestHashKernel, UniqueBoolean) { - CheckUnique(&this->ctx_, boolean(), {true, true, false, true}, - {true, false, true, true}, {true, false}, {}); - - CheckUnique(&this->ctx_, boolean(), {false, true, false, true}, - {true, false, true, true}, {false, true}, {}); - - // No nulls - CheckUnique(&this->ctx_, boolean(), {true, true, false, true}, {}, - {true, 
false}, {}); - - CheckUnique(&this->ctx_, boolean(), {false, true, false, true}, {}, - {false, true}, {}); -} - -TEST_F(TestHashKernel, DictEncodeBoolean) { - CheckDictEncode( - &this->ctx_, boolean(), {true, true, false, true, false}, - {true, false, true, true, true}, {true, false}, {}, {0, 0, 1, 0, 1}); - - CheckDictEncode( - &this->ctx_, boolean(), {false, true, false, true, false}, - {true, false, true, true, true}, {false, true}, {}, {0, 0, 0, 1, 0}); - - // No nulls - CheckDictEncode(&this->ctx_, boolean(), - {true, true, false, true, false}, {}, {true, false}, - {}, {0, 0, 1, 0, 1}); - - CheckDictEncode(&this->ctx_, boolean(), - {false, true, false, true, false}, {}, {false, true}, - {}, {0, 1, 0, 1, 0}); -} - -TEST_F(TestHashKernel, UniqueBinary) { - CheckUnique(&this->ctx_, binary(), - {"test", "", "test2", "test"}, - {true, false, true, true}, {"test", "test2"}, {}); - - CheckUnique(&this->ctx_, utf8(), {"test", "", "test2", "test"}, - {true, false, true, true}, {"test", "test2"}, {}); -} - -TEST_F(TestHashKernel, DictEncodeBinary) { - CheckDictEncode( - &this->ctx_, binary(), {"test", "", "test2", "test", "baz"}, - {true, false, true, true, true}, {"test", "test2", "baz"}, {}, {0, 0, 1, 0, 2}); - - CheckDictEncode( - &this->ctx_, utf8(), {"test", "", "test2", "test", "baz"}, - {true, false, true, true, true}, {"test", "test2", "baz"}, {}, {0, 0, 1, 0, 2}); -} - -TEST_F(TestHashKernel, BinaryResizeTable) { - const int32_t kTotalValues = 10000; -#if !defined(ARROW_VALGRIND) - const int32_t kRepeats = 10; -#else - // Mitigate Valgrind's slowness - const int32_t kRepeats = 3; -#endif - - vector values; - vector uniques; - vector indices; - char buf[20] = "test"; - - for (int32_t i = 0; i < kTotalValues * kRepeats; i++) { - int32_t index = i % kTotalValues; - - ASSERT_GE(snprintf(buf + 4, sizeof(buf) - 4, "%d", index), 0); - values.emplace_back(buf); - - if (i < kTotalValues) { - uniques.push_back(values.back()); - } - indices.push_back(index); - } - - CheckUnique(&this->ctx_, binary(), values, {}, uniques, {}); - CheckDictEncode(&this->ctx_, binary(), values, {}, uniques, {}, - indices); - - CheckUnique(&this->ctx_, utf8(), values, {}, uniques, {}); - CheckDictEncode(&this->ctx_, utf8(), values, {}, uniques, {}, - indices); -} - -TEST_F(TestHashKernel, UniqueFixedSizeBinary) { - CheckUnique( - &this->ctx_, fixed_size_binary(5), {"aaaaa", "", "bbbbb", "aaaaa"}, - {true, false, true, true}, {"aaaaa", "bbbbb"}, {}); -} - -TEST_F(TestHashKernel, DictEncodeFixedSizeBinary) { - CheckDictEncode( - &this->ctx_, fixed_size_binary(5), {"bbbbb", "", "bbbbb", "aaaaa", "ccccc"}, - {true, false, true, true, true}, {"bbbbb", "aaaaa", "ccccc"}, {}, {0, 0, 0, 1, 2}); -} - -TEST_F(TestHashKernel, FixedSizeBinaryResizeTable) { - const int32_t kTotalValues = 10000; -#if !defined(ARROW_VALGRIND) - const int32_t kRepeats = 10; -#else - // Mitigate Valgrind's slowness - const int32_t kRepeats = 3; -#endif - - vector values; - vector uniques; - vector indices; - char buf[7] = "test.."; - - for (int32_t i = 0; i < kTotalValues * kRepeats; i++) { - int32_t index = i % kTotalValues; - - buf[4] = static_cast(index / 128); - buf[5] = static_cast(index % 128); - values.emplace_back(buf, 6); - - if (i < kTotalValues) { - uniques.push_back(values.back()); - } - indices.push_back(index); - } - - auto type = fixed_size_binary(6); - CheckUnique(&this->ctx_, type, values, {}, uniques, - {}); - CheckDictEncode(&this->ctx_, type, values, {}, - uniques, {}, indices); -} - -TEST_F(TestHashKernel, UniqueDecimal) { - 
vector values{12, 12, 11, 12}; - vector expected{12, 11}; - - CheckUnique(&this->ctx_, decimal(2, 0), values, - {true, false, true, true}, expected, {}); -} - -TEST_F(TestHashKernel, DictEncodeDecimal) { - vector values{12, 12, 11, 12, 13}; - vector expected{12, 11, 13}; - - CheckDictEncode(&this->ctx_, decimal(2, 0), values, - {true, false, true, true, true}, expected, - {}, {0, 0, 1, 0, 2}); -} - -TEST_F(TestHashKernel, ChunkedArrayInvoke) { - vector values1 = {"foo", "bar", "foo"}; - vector values2 = {"bar", "baz", "quuux", "foo"}; - - auto type = utf8(); - auto a1 = _MakeArray(type, values1, {}); - auto a2 = _MakeArray(type, values2, {}); - - vector dict_values = {"foo", "bar", "baz", "quuux"}; - auto ex_dict = _MakeArray(type, dict_values, {}); - - ArrayVector arrays = {a1, a2}; - auto carr = std::make_shared(arrays); - - // Unique - shared_ptr result; - ASSERT_OK(Unique(&this->ctx_, carr, &result)); - ASSERT_ARRAYS_EQUAL(*ex_dict, *result); - - // Dictionary encode - auto dict_type = dictionary(int32(), ex_dict); - - auto i1 = _MakeArray(int32(), {0, 1, 0}, {}); - auto i2 = _MakeArray(int32(), {1, 2, 3, 0}, {}); - - ArrayVector dict_arrays = {std::make_shared(dict_type, i1), - std::make_shared(dict_type, i2)}; - auto dict_carr = std::make_shared(dict_arrays); - - Datum encoded_out; - ASSERT_OK(DictionaryEncode(&this->ctx_, carr, &encoded_out)); - ASSERT_EQ(Datum::CHUNKED_ARRAY, encoded_out.kind()); - - AssertChunkedEqual(*dict_carr, *encoded_out.chunked_array()); -} - -using BinaryKernelFunc = - std::function; - -class TestBooleanKernel : public ComputeFixture, public TestBase { - public: - void TestArrayBinary(const BinaryKernelFunc& kernel, const std::shared_ptr& left, - const std::shared_ptr& right, - const std::shared_ptr& expected) { - Datum result; - ASSERT_OK(kernel(&this->ctx_, left, right, &result)); - ASSERT_EQ(Datum::ARRAY, result.kind()); - std::shared_ptr result_array = result.make_array(); - ASSERT_TRUE(result_array->Equals(expected)); - } - - void TestChunkedArrayBinary(const BinaryKernelFunc& kernel, - const std::shared_ptr& left, - const std::shared_ptr& right, - const std::shared_ptr& expected) { - Datum result; - std::shared_ptr result_array; - ASSERT_OK(kernel(&this->ctx_, left, right, &result)); - ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); - std::shared_ptr result_ca = result.chunked_array(); - ASSERT_TRUE(result_ca->Equals(expected)); - } - - void TestBinaryKernel(const BinaryKernelFunc& kernel, const std::vector& values1, - const std::vector& values2, - const std::vector& values3, - const std::vector& values3_nulls) { - auto type = boolean(); - auto a1 = _MakeArray(type, values1, {}); - auto a2 = _MakeArray(type, values2, {}); - auto a3 = _MakeArray(type, values3, {}); - auto a1_nulls = _MakeArray(type, values1, values1); - auto a2_nulls = _MakeArray(type, values2, values2); - auto a3_nulls = _MakeArray(type, values3, values3_nulls); - - TestArrayBinary(kernel, a1, a2, a3); - TestArrayBinary(kernel, a1_nulls, a2_nulls, a3_nulls); - TestArrayBinary(kernel, a1->Slice(1), a2->Slice(1), a3->Slice(1)); - TestArrayBinary(kernel, a1_nulls->Slice(1), a2_nulls->Slice(1), a3_nulls->Slice(1)); - - // ChunkedArray - std::vector> ca1_arrs = {a1, a1->Slice(1)}; - auto ca1 = std::make_shared(ca1_arrs); - std::vector> ca2_arrs = {a2, a2->Slice(1)}; - auto ca2 = std::make_shared(ca2_arrs); - std::vector> ca3_arrs = {a3, a3->Slice(1)}; - auto ca3 = std::make_shared(ca3_arrs); - TestChunkedArrayBinary(kernel, ca1, ca2, ca3); - - // ChunkedArray with different chunks - 
std::vector> ca4_arrs = {a1->Slice(0, 1), a1->Slice(1), - a1->Slice(1, 1), a1->Slice(2)}; - auto ca4 = std::make_shared(ca4_arrs); - TestChunkedArrayBinary(kernel, ca4, ca2, ca3); - } -}; - -TEST_F(TestBooleanKernel, Invert) { - vector values1 = {true, false, true}; - vector values2 = {false, true, false}; - - auto type = boolean(); - auto a1 = _MakeArray(type, values1, {}); - auto a2 = _MakeArray(type, values2, {}); - - // Plain array - Datum result; - ASSERT_OK(Invert(&this->ctx_, a1, &result)); - ASSERT_EQ(Datum::ARRAY, result.kind()); - std::shared_ptr result_array = result.make_array(); - ASSERT_TRUE(result_array->Equals(a2)); - - // Array with offset - ASSERT_OK(Invert(&this->ctx_, a1->Slice(1), &result)); - ASSERT_EQ(Datum::ARRAY, result.kind()); - result_array = result.make_array(); - ASSERT_TRUE(result_array->Equals(a2->Slice(1))); - - // ChunkedArray - std::vector> ca1_arrs = {a1, a1->Slice(1)}; - auto ca1 = std::make_shared(ca1_arrs); - std::vector> ca2_arrs = {a2, a2->Slice(1)}; - auto ca2 = std::make_shared(ca2_arrs); - ASSERT_OK(Invert(&this->ctx_, ca1, &result)); - ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); - std::shared_ptr result_ca = result.chunked_array(); - ASSERT_TRUE(result_ca->Equals(ca2)); -} - -TEST_F(TestBooleanKernel, And) { - vector values1 = {true, false, true, false, true, true}; - vector values2 = {true, true, false, false, true, false}; - vector values3 = {true, false, false, false, true, false}; - TestBinaryKernel(And, values1, values2, values3, values3); -} - -TEST_F(TestBooleanKernel, Or) { - vector values1 = {true, false, true, false, true, true}; - vector values2 = {true, true, false, false, true, false}; - vector values3 = {true, true, true, false, true, true}; - vector values3_nulls = {true, false, false, false, true, false}; - TestBinaryKernel(Or, values1, values2, values3, values3_nulls); -} - -TEST_F(TestBooleanKernel, Xor) { - vector values1 = {true, false, true, false, true, true}; - vector values2 = {true, true, false, false, true, false}; - vector values3 = {false, true, true, false, false, true}; - vector values3_nulls = {true, false, false, false, true, false}; - TestBinaryKernel(Xor, values1, values2, values3, values3_nulls); -} - class TestInvokeBinaryKernel : public ComputeFixture, public TestBase {}; class DummyBinaryKernel : public BinaryKernel { diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index a5a142b5c28ce..4d508aacb9990 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -16,3 +16,7 @@ # under the License. ARROW_INSTALL_ALL_HEADERS("arrow/compute/kernels") + +ADD_ARROW_TEST(boolean-test PREFIX "arrow-compute") +ADD_ARROW_TEST(cast-test PREFIX "arrow-compute") +ADD_ARROW_TEST(hash-test PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/kernels/boolean-test.cc b/cpp/src/arrow/compute/kernels/boolean-test.cc new file mode 100644 index 0000000000000..24b3c68aa1cfb --- /dev/null +++ b/cpp/src/arrow/compute/kernels/boolean-test.cc @@ -0,0 +1,157 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include + +#include "arrow/test-common.h" +#include "arrow/test-util.h" + +#include "arrow/compute/context.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/boolean.h" +#include "arrow/compute/kernels/util-internal.h" +#include "arrow/compute/test-util.h" + +using std::shared_ptr; +using std::vector; + +namespace arrow { +namespace compute { + +using BinaryKernelFunc = + std::function; + +class TestBooleanKernel : public ComputeFixture, public TestBase { + public: + void TestArrayBinary(const BinaryKernelFunc& kernel, const std::shared_ptr& left, + const std::shared_ptr& right, + const std::shared_ptr& expected) { + Datum result; + ASSERT_OK(kernel(&this->ctx_, left, right, &result)); + ASSERT_EQ(Datum::ARRAY, result.kind()); + std::shared_ptr result_array = result.make_array(); + ASSERT_TRUE(result_array->Equals(expected)); + } + + void TestChunkedArrayBinary(const BinaryKernelFunc& kernel, + const std::shared_ptr& left, + const std::shared_ptr& right, + const std::shared_ptr& expected) { + Datum result; + std::shared_ptr result_array; + ASSERT_OK(kernel(&this->ctx_, left, right, &result)); + ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); + std::shared_ptr result_ca = result.chunked_array(); + ASSERT_TRUE(result_ca->Equals(expected)); + } + + void TestBinaryKernel(const BinaryKernelFunc& kernel, const std::vector& values1, + const std::vector& values2, + const std::vector& values3, + const std::vector& values3_nulls) { + auto type = boolean(); + auto a1 = _MakeArray(type, values1, {}); + auto a2 = _MakeArray(type, values2, {}); + auto a3 = _MakeArray(type, values3, {}); + auto a1_nulls = _MakeArray(type, values1, values1); + auto a2_nulls = _MakeArray(type, values2, values2); + auto a3_nulls = _MakeArray(type, values3, values3_nulls); + + TestArrayBinary(kernel, a1, a2, a3); + TestArrayBinary(kernel, a1_nulls, a2_nulls, a3_nulls); + TestArrayBinary(kernel, a1->Slice(1), a2->Slice(1), a3->Slice(1)); + TestArrayBinary(kernel, a1_nulls->Slice(1), a2_nulls->Slice(1), a3_nulls->Slice(1)); + + // ChunkedArray + std::vector> ca1_arrs = {a1, a1->Slice(1)}; + auto ca1 = std::make_shared(ca1_arrs); + std::vector> ca2_arrs = {a2, a2->Slice(1)}; + auto ca2 = std::make_shared(ca2_arrs); + std::vector> ca3_arrs = {a3, a3->Slice(1)}; + auto ca3 = std::make_shared(ca3_arrs); + TestChunkedArrayBinary(kernel, ca1, ca2, ca3); + + // ChunkedArray with different chunks + std::vector> ca4_arrs = {a1->Slice(0, 1), a1->Slice(1), + a1->Slice(1, 1), a1->Slice(2)}; + auto ca4 = std::make_shared(ca4_arrs); + TestChunkedArrayBinary(kernel, ca4, ca2, ca3); + } +}; + +TEST_F(TestBooleanKernel, Invert) { + vector values1 = {true, false, true}; + vector values2 = {false, true, false}; + + auto type = boolean(); + auto a1 = _MakeArray(type, values1, {}); + auto a2 = _MakeArray(type, values2, {}); + + // Plain array + Datum result; + ASSERT_OK(Invert(&this->ctx_, a1, &result)); + ASSERT_EQ(Datum::ARRAY, result.kind()); + std::shared_ptr result_array = result.make_array(); + ASSERT_TRUE(result_array->Equals(a2)); + + // Array with offset + 
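  // (Slicing off the first element yields an input with a non-zero offset,
  // so this also exercises the kernel's offset-handling path.)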
ASSERT_OK(Invert(&this->ctx_, a1->Slice(1), &result)); + ASSERT_EQ(Datum::ARRAY, result.kind()); + result_array = result.make_array(); + ASSERT_TRUE(result_array->Equals(a2->Slice(1))); + + // ChunkedArray + std::vector> ca1_arrs = {a1, a1->Slice(1)}; + auto ca1 = std::make_shared(ca1_arrs); + std::vector> ca2_arrs = {a2, a2->Slice(1)}; + auto ca2 = std::make_shared(ca2_arrs); + ASSERT_OK(Invert(&this->ctx_, ca1, &result)); + ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); + std::shared_ptr result_ca = result.chunked_array(); + ASSERT_TRUE(result_ca->Equals(ca2)); +} + +TEST_F(TestBooleanKernel, And) { + vector values1 = {true, false, true, false, true, true}; + vector values2 = {true, true, false, false, true, false}; + vector values3 = {true, false, false, false, true, false}; + TestBinaryKernel(And, values1, values2, values3, values3); +} + +TEST_F(TestBooleanKernel, Or) { + vector values1 = {true, false, true, false, true, true}; + vector values2 = {true, true, false, false, true, false}; + vector values3 = {true, true, true, false, true, true}; + vector values3_nulls = {true, false, false, false, true, false}; + TestBinaryKernel(Or, values1, values2, values3, values3_nulls); +} + +TEST_F(TestBooleanKernel, Xor) { + vector values1 = {true, false, true, false, true, true}; + vector values2 = {true, true, false, false, true, false}; + vector values3 = {false, true, true, false, false, true}; + vector values3_nulls = {true, false, false, false, true, false}; + TestBinaryKernel(Xor, values1, values2, values3, values3_nulls); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/cast-test.cc b/cpp/src/arrow/compute/kernels/cast-test.cc new file mode 100644 index 0000000000000..4c3992868ef6d --- /dev/null +++ b/cpp/src/arrow/compute/kernels/cast-test.cc @@ -0,0 +1,1197 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/table.h" +#include "arrow/test-common.h" +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" +#include "arrow/type_traits.h" +#include "arrow/util/decimal.h" + +#include "arrow/compute/context.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/cast.h" +#include "arrow/compute/kernels/hash.h" +#include "arrow/compute/kernels/util-internal.h" +#include "arrow/compute/test-util.h" + +using std::shared_ptr; +using std::vector; + +namespace arrow { +namespace compute { + +static void AssertBufferSame(const Array& left, const Array& right, int buffer_index) { + ASSERT_EQ(left.data()->buffers[buffer_index].get(), + right.data()->buffers[buffer_index].get()); +} + +class TestCast : public ComputeFixture, public TestBase { + public: + void CheckPass(const Array& input, const Array& expected, + const shared_ptr& out_type, const CastOptions& options) { + shared_ptr result; + ASSERT_OK(Cast(&ctx_, input, out_type, options, &result)); + ASSERT_ARRAYS_EQUAL(expected, *result); + } + + template + void CheckFails(const shared_ptr& in_type, const vector& in_values, + const vector& is_valid, const shared_ptr& out_type, + const CastOptions& options) { + shared_ptr input, result; + if (is_valid.size() > 0) { + ArrayFromVector(in_type, is_valid, in_values, &input); + } else { + ArrayFromVector(in_type, in_values, &input); + } + ASSERT_RAISES(Invalid, Cast(&ctx_, *input, out_type, options, &result)); + } + + void CheckZeroCopy(const Array& input, const shared_ptr& out_type) { + shared_ptr result; + ASSERT_OK(Cast(&ctx_, input, out_type, {}, &result)); + AssertBufferSame(input, *result, 0); + AssertBufferSame(input, *result, 1); + } + + template + void CheckCase(const shared_ptr& in_type, const vector& in_values, + const vector& is_valid, const shared_ptr& out_type, + const vector& out_values, const CastOptions& options) { + DCHECK_EQ(in_values.size(), out_values.size()); + shared_ptr input, expected; + if (is_valid.size() > 0) { + DCHECK_EQ(is_valid.size(), out_values.size()); + ArrayFromVector(in_type, is_valid, in_values, &input); + ArrayFromVector(out_type, is_valid, out_values, &expected); + } else { + ArrayFromVector(in_type, in_values, &input); + ArrayFromVector(out_type, out_values, &expected); + } + CheckPass(*input, *expected, out_type, options); + + // Check a sliced variant + if (input->length() > 1) { + CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options); + } + } +}; + +TEST_F(TestCast, SameTypeZeroCopy) { + vector is_valid = {true, false, true, true, true}; + vector v1 = {0, 1, 2, 3, 4}; + + shared_ptr arr; + ArrayFromVector(int32(), is_valid, v1, &arr); + + shared_ptr result; + ASSERT_OK(Cast(&this->ctx_, *arr, int32(), {}, &result)); + + AssertBufferSame(*arr, *result, 0); + AssertBufferSame(*arr, *result, 1); +} + +TEST_F(TestCast, ToBoolean) { + CastOptions options; + + vector is_valid = {true, false, true, true, true}; + + // int8, should suffice for other integers + vector v1 = {0, 1, 127, -1, 0}; + vector e1 = {false, true, true, true, false}; + CheckCase(int8(), v1, is_valid, boolean(), e1, + options); + + // floating point + vector v2 = {1.0, 0, 0, -1.0, 5.0}; + vector e2 = {true, false, false, true, true}; + CheckCase(float64(), v2, is_valid, boolean(), e2, + options); +} + +TEST_F(TestCast, 
ToIntUpcast) { + CastOptions options; + options.allow_int_overflow = false; + + vector is_valid = {true, false, true, true, true}; + + // int8 to int32 + vector v1 = {0, 1, 127, -1, 0}; + vector e1 = {0, 1, 127, -1, 0}; + CheckCase(int8(), v1, is_valid, int32(), e1, + options); + + // bool to int8 + vector v2 = {false, true, false, true, true}; + vector e2 = {0, 1, 0, 1, 1}; + CheckCase(boolean(), v2, is_valid, int8(), e2, + options); + + // uint8 to int16, no overflow/underrun + vector v3 = {0, 100, 200, 255, 0}; + vector e3 = {0, 100, 200, 255, 0}; + CheckCase(uint8(), v3, is_valid, int16(), e3, + options); +} + +TEST_F(TestCast, OverflowInNullSlot) { + CastOptions options; + options.allow_int_overflow = false; + + vector is_valid = {true, false, true, true, true}; + + vector v11 = {0, 70000, 2000, 1000, 0}; + vector e11 = {0, 0, 2000, 1000, 0}; + + shared_ptr expected; + ArrayFromVector(int16(), is_valid, e11, &expected); + + auto buf = Buffer::Wrap(v11.data(), v11.size()); + Int32Array tmp11(5, buf, expected->null_bitmap(), -1); + + CheckPass(tmp11, *expected, int16(), options); +} + +TEST_F(TestCast, ToIntDowncastSafe) { + CastOptions options; + options.allow_int_overflow = false; + + vector is_valid = {true, false, true, true, true}; + + // int16 to uint8, no overflow/underrun + vector v1 = {0, 100, 200, 1, 2}; + vector e1 = {0, 100, 200, 1, 2}; + CheckCase(int16(), v1, is_valid, uint8(), e1, + options); + + // int16 to uint8, with overflow + vector v2 = {0, 100, 256, 0, 0}; + CheckFails(int16(), v2, is_valid, uint8(), options); + + // underflow + vector v3 = {0, 100, -1, 0, 0}; + CheckFails(int16(), v3, is_valid, uint8(), options); + + // int32 to int16, no overflow + vector v4 = {0, 1000, 2000, 1, 2}; + vector e4 = {0, 1000, 2000, 1, 2}; + CheckCase(int32(), v4, is_valid, int16(), e4, + options); + + // int32 to int16, overflow + vector v5 = {0, 1000, 2000, 70000, 0}; + CheckFails(int32(), v5, is_valid, int16(), options); + + // underflow + vector v6 = {0, 1000, 2000, -70000, 0}; + CheckFails(int32(), v6, is_valid, int16(), options); + + vector v7 = {0, 1000, 2000, -70000, 0}; + CheckFails(int32(), v7, is_valid, uint8(), options); +} + +template +std::vector UnsafeVectorCast(const std::vector& v) { + size_t n_elems = v.size(); + std::vector result(n_elems); + + for (size_t i = 0; i < v.size(); i++) result[i] = static_cast(v[i]); + + return std::move(result); +} + +TEST_F(TestCast, IntegerSignedToUnsigned) { + CastOptions options; + options.allow_int_overflow = false; + + vector is_valid = {true, false, true, true, true}; + + vector v1 = {INT32_MIN, 100, -1, UINT16_MAX, INT32_MAX}; + + // Same width + CheckFails(int32(), v1, is_valid, uint32(), options); + // Wider + CheckFails(int32(), v1, is_valid, uint64(), options); + // Narrower + CheckFails(int32(), v1, is_valid, uint16(), options); + // Fail because of overflow (instead of underflow). 
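  // (UINT16_MAX + 1 and INT32_MAX are non-negative but still exceed the uint16 range.)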
+ vector over = {0, -11, 0, UINT16_MAX + 1, INT32_MAX}; + CheckFails(int32(), over, is_valid, uint16(), options); + + options.allow_int_overflow = true; + + CheckCase( + int32(), v1, is_valid, uint32(), UnsafeVectorCast(v1), options); + CheckCase( + int32(), v1, is_valid, uint64(), UnsafeVectorCast(v1), options); + CheckCase( + int32(), v1, is_valid, uint16(), UnsafeVectorCast(v1), options); + CheckCase( + int32(), over, is_valid, uint16(), UnsafeVectorCast(over), + options); +} + +TEST_F(TestCast, IntegerUnsignedToSigned) { + CastOptions options; + options.allow_int_overflow = false; + + vector is_valid = {true, true, true}; + + vector v1 = {0, INT16_MAX + 1, UINT32_MAX}; + vector v2 = {0, INT16_MAX + 1, 2}; + // Same width + CheckFails(uint32(), v1, is_valid, int32(), options); + // Narrower + CheckFails(uint32(), v1, is_valid, int16(), options); + CheckFails(uint32(), v2, is_valid, int16(), options); + + options.allow_int_overflow = true; + + CheckCase( + uint32(), v1, is_valid, int32(), UnsafeVectorCast(v1), options); + CheckCase( + uint32(), v1, is_valid, int64(), UnsafeVectorCast(v1), options); + CheckCase( + uint32(), v1, is_valid, int16(), UnsafeVectorCast(v1), options); + CheckCase( + uint32(), v2, is_valid, int16(), UnsafeVectorCast(v2), options); +} + +TEST_F(TestCast, ToIntDowncastUnsafe) { + CastOptions options; + options.allow_int_overflow = true; + + vector is_valid = {true, false, true, true, true}; + + // int16 to uint8, no overflow/underrun + vector v1 = {0, 100, 200, 1, 2}; + vector e1 = {0, 100, 200, 1, 2}; + CheckCase(int16(), v1, is_valid, uint8(), e1, + options); + + // int16 to uint8, with overflow + vector v2 = {0, 100, 256, 0, 0}; + vector e2 = {0, 100, 0, 0, 0}; + CheckCase(int16(), v2, is_valid, uint8(), e2, + options); + + // underflow + vector v3 = {0, 100, -1, 0, 0}; + vector e3 = {0, 100, 255, 0, 0}; + CheckCase(int16(), v3, is_valid, uint8(), e3, + options); + + // int32 to int16, no overflow + vector v4 = {0, 1000, 2000, 1, 2}; + vector e4 = {0, 1000, 2000, 1, 2}; + CheckCase(int32(), v4, is_valid, int16(), e4, + options); + + // int32 to int16, overflow + // TODO(wesm): do we want to allow this? we could set to null + vector v5 = {0, 1000, 2000, 70000, 0}; + vector e5 = {0, 1000, 2000, 4464, 0}; + CheckCase(int32(), v5, is_valid, int16(), e5, + options); + + // underflow + // TODO(wesm): do we want to allow this? 
we could set overflow to null + vector v6 = {0, 1000, 2000, -70000, 0}; + vector e6 = {0, 1000, 2000, -4464, 0}; + CheckCase(int32(), v6, is_valid, int16(), e6, + options); +} + +TEST_F(TestCast, FloatingPointToInt) { + // which means allow_float_truncate == false + auto options = CastOptions::Safe(); + + vector is_valid = {true, false, true, true, true}; + vector all_valid = {true, true, true, true, true}; + + // float32 to int32 no truncation + vector v1 = {1.0, 0, 0.0, -1.0, 5.0}; + vector e1 = {1, 0, 0, -1, 5}; + CheckCase(float32(), v1, is_valid, int32(), e1, + options); + CheckCase(float32(), v1, all_valid, int32(), e1, + options); + + // float64 to int32 no truncation + vector v2 = {1.0, 0, 0.0, -1.0, 5.0}; + vector e2 = {1, 0, 0, -1, 5}; + CheckCase(float64(), v2, is_valid, int32(), e2, + options); + CheckCase(float64(), v2, all_valid, int32(), e2, + options); + + // float64 to int64 no truncation + vector v3 = {1.0, 0, 0.0, -1.0, 5.0}; + vector e3 = {1, 0, 0, -1, 5}; + CheckCase(float64(), v3, is_valid, int64(), e3, + options); + CheckCase(float64(), v3, all_valid, int64(), e3, + options); + + // float64 to int32 truncate + vector v4 = {1.5, 0, 0.5, -1.5, 5.5}; + vector e4 = {1, 0, 0, -1, 5}; + + options.allow_float_truncate = false; + CheckFails(float64(), v4, is_valid, int32(), options); + CheckFails(float64(), v4, all_valid, int32(), options); + + options.allow_float_truncate = true; + CheckCase(float64(), v4, is_valid, int32(), e4, + options); + CheckCase(float64(), v4, all_valid, int32(), e4, + options); + + // float64 to int64 truncate + vector v5 = {1.5, 0, 0.5, -1.5, 5.5}; + vector e5 = {1, 0, 0, -1, 5}; + + options.allow_float_truncate = false; + CheckFails(float64(), v5, is_valid, int64(), options); + CheckFails(float64(), v5, all_valid, int64(), options); + + options.allow_float_truncate = true; + CheckCase(float64(), v5, is_valid, int64(), e5, + options); + CheckCase(float64(), v5, all_valid, int64(), e5, + options); +} + +TEST_F(TestCast, IntToFloatingPoint) { + auto options = CastOptions::Safe(); + + vector all_valid = {true, true, true, true, true}; + vector all_invalid = {false, false, false, false, false}; + + vector v1 = {INT64_MIN, INT64_MIN + 1, 0, INT64_MAX - 1, INT64_MAX}; + CheckFails(int64(), v1, all_valid, float32(), options); + + // While it's not safe to convert, all values are null. 
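  // (Every slot is marked null, so the safety check never inspects the values and
  // the cast succeeds even though float64 cannot represent these extremes exactly.)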
+ CheckCase(int64(), v1, all_invalid, float64(), + UnsafeVectorCast(v1), + options); +} + +TEST_F(TestCast, TimestampToTimestamp) { + CastOptions options; + + auto CheckTimestampCast = + [this](const CastOptions& options, TimeUnit::type from_unit, TimeUnit::type to_unit, + const vector& from_values, const vector& to_values, + const vector& is_valid) { + CheckCase( + timestamp(from_unit), from_values, is_valid, timestamp(to_unit), to_values, + options); + }; + + vector is_valid = {true, false, true, true, true}; + + // Multiply promotions + vector v1 = {0, 100, 200, 1, 2}; + vector e1 = {0, 100000, 200000, 1000, 2000}; + CheckTimestampCast(options, TimeUnit::SECOND, TimeUnit::MILLI, v1, e1, is_valid); + + vector v2 = {0, 100, 200, 1, 2}; + vector e2 = {0, 100000000L, 200000000L, 1000000, 2000000}; + CheckTimestampCast(options, TimeUnit::SECOND, TimeUnit::MICRO, v2, e2, is_valid); + + vector v3 = {0, 100, 200, 1, 2}; + vector e3 = {0, 100000000000L, 200000000000L, 1000000000L, 2000000000L}; + CheckTimestampCast(options, TimeUnit::SECOND, TimeUnit::NANO, v3, e3, is_valid); + + vector v4 = {0, 100, 200, 1, 2}; + vector e4 = {0, 100000, 200000, 1000, 2000}; + CheckTimestampCast(options, TimeUnit::MILLI, TimeUnit::MICRO, v4, e4, is_valid); + + vector v5 = {0, 100, 200, 1, 2}; + vector e5 = {0, 100000000L, 200000000L, 1000000, 2000000}; + CheckTimestampCast(options, TimeUnit::MILLI, TimeUnit::NANO, v5, e5, is_valid); + + vector v6 = {0, 100, 200, 1, 2}; + vector e6 = {0, 100000, 200000, 1000, 2000}; + CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::NANO, v6, e6, is_valid); + + // Zero copy + vector v7 = {0, 70000, 2000, 1000, 0}; + shared_ptr arr; + ArrayFromVector(timestamp(TimeUnit::SECOND), is_valid, v7, + &arr); + CheckZeroCopy(*arr, timestamp(TimeUnit::SECOND)); + + // ARROW-1773, cast to integer + CheckZeroCopy(*arr, int64()); + + // Divide, truncate + vector v8 = {0, 100123, 200456, 1123, 2456}; + vector e8 = {0, 100, 200, 1, 2}; + + options.allow_time_truncate = true; + CheckTimestampCast(options, TimeUnit::MILLI, TimeUnit::SECOND, v8, e8, is_valid); + CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::MILLI, v8, e8, is_valid); + CheckTimestampCast(options, TimeUnit::NANO, TimeUnit::MICRO, v8, e8, is_valid); + + vector v9 = {0, 100123000, 200456000, 1123000, 2456000}; + vector e9 = {0, 100, 200, 1, 2}; + CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::SECOND, v9, e9, is_valid); + CheckTimestampCast(options, TimeUnit::NANO, TimeUnit::MILLI, v9, e9, is_valid); + + vector v10 = {0, 100123000000L, 200456000000L, 1123000000L, 2456000000}; + vector e10 = {0, 100, 200, 1, 2}; + CheckTimestampCast(options, TimeUnit::NANO, TimeUnit::SECOND, v10, e10, is_valid); + + // Disallow truncate, failures + options.allow_time_truncate = false; + CheckFails(timestamp(TimeUnit::MILLI), v8, is_valid, + timestamp(TimeUnit::SECOND), options); + CheckFails(timestamp(TimeUnit::MICRO), v8, is_valid, + timestamp(TimeUnit::MILLI), options); + CheckFails(timestamp(TimeUnit::NANO), v8, is_valid, + timestamp(TimeUnit::MICRO), options); + CheckFails(timestamp(TimeUnit::MICRO), v9, is_valid, + timestamp(TimeUnit::SECOND), options); + CheckFails(timestamp(TimeUnit::NANO), v9, is_valid, + timestamp(TimeUnit::MILLI), options); + CheckFails(timestamp(TimeUnit::NANO), v10, is_valid, + timestamp(TimeUnit::SECOND), options); +} + +TEST_F(TestCast, TimestampToDate32_Date64) { + CastOptions options; + + vector is_valid = {true, true, false}; + + // 2000-01-01, 2000-01-02, null + vector v_nano = 
{946684800000000000, 946771200000000000, 0}; + vector v_micro = {946684800000000, 946771200000000, 0}; + vector v_milli = {946684800000, 946771200000, 0}; + vector v_second = {946684800, 946771200, 0}; + vector v_day = {10957, 10958, 0}; + + // Simple conversions + CheckCase( + timestamp(TimeUnit::NANO), v_nano, is_valid, date64(), v_milli, options); + CheckCase( + timestamp(TimeUnit::MICRO), v_micro, is_valid, date64(), v_milli, options); + CheckCase( + timestamp(TimeUnit::MILLI), v_milli, is_valid, date64(), v_milli, options); + CheckCase( + timestamp(TimeUnit::SECOND), v_second, is_valid, date64(), v_milli, options); + + CheckCase( + timestamp(TimeUnit::NANO), v_nano, is_valid, date32(), v_day, options); + CheckCase( + timestamp(TimeUnit::MICRO), v_micro, is_valid, date32(), v_day, options); + CheckCase( + timestamp(TimeUnit::MILLI), v_milli, is_valid, date32(), v_day, options); + CheckCase( + timestamp(TimeUnit::SECOND), v_second, is_valid, date32(), v_day, options); + + // Disallow truncate, failures + vector v_nano_fail = {946684800000000001, 946771200000000001, 0}; + vector v_micro_fail = {946684800000001, 946771200000001, 0}; + vector v_milli_fail = {946684800001, 946771200001, 0}; + vector v_second_fail = {946684801, 946771201, 0}; + + options.allow_time_truncate = false; + CheckFails(timestamp(TimeUnit::NANO), v_nano_fail, is_valid, date64(), + options); + CheckFails(timestamp(TimeUnit::MICRO), v_micro_fail, is_valid, date64(), + options); + CheckFails(timestamp(TimeUnit::MILLI), v_milli_fail, is_valid, date64(), + options); + CheckFails(timestamp(TimeUnit::SECOND), v_second_fail, is_valid, + date64(), options); + + CheckFails(timestamp(TimeUnit::NANO), v_nano_fail, is_valid, date32(), + options); + CheckFails(timestamp(TimeUnit::MICRO), v_micro_fail, is_valid, date32(), + options); + CheckFails(timestamp(TimeUnit::MILLI), v_milli_fail, is_valid, date32(), + options); + CheckFails(timestamp(TimeUnit::SECOND), v_second_fail, is_valid, + date32(), options); + + // Make sure that nulls are excluded from the truncation checks + vector v_second_nofail = {946684800, 946771200, 1}; + CheckCase( + timestamp(TimeUnit::SECOND), v_second_nofail, is_valid, date64(), v_milli, options); + CheckCase( + timestamp(TimeUnit::SECOND), v_second_nofail, is_valid, date32(), v_day, options); +} + +TEST_F(TestCast, TimeToCompatible) { + CastOptions options; + + vector is_valid = {true, false, true, true, true}; + + // Multiply promotions + vector v1 = {0, 100, 200, 1, 2}; + vector e1 = {0, 100000, 200000, 1000, 2000}; + CheckCase( + time32(TimeUnit::SECOND), v1, is_valid, time32(TimeUnit::MILLI), e1, options); + + vector v2 = {0, 100, 200, 1, 2}; + vector e2 = {0, 100000000L, 200000000L, 1000000, 2000000}; + CheckCase( + time32(TimeUnit::SECOND), v2, is_valid, time64(TimeUnit::MICRO), e2, options); + + vector v3 = {0, 100, 200, 1, 2}; + vector e3 = {0, 100000000000L, 200000000000L, 1000000000L, 2000000000L}; + CheckCase( + time32(TimeUnit::SECOND), v3, is_valid, time64(TimeUnit::NANO), e3, options); + + vector v4 = {0, 100, 200, 1, 2}; + vector e4 = {0, 100000, 200000, 1000, 2000}; + CheckCase( + time32(TimeUnit::MILLI), v4, is_valid, time64(TimeUnit::MICRO), e4, options); + + vector v5 = {0, 100, 200, 1, 2}; + vector e5 = {0, 100000000L, 200000000L, 1000000, 2000000}; + CheckCase( + time32(TimeUnit::MILLI), v5, is_valid, time64(TimeUnit::NANO), e5, options); + + vector v6 = {0, 100, 200, 1, 2}; + vector e6 = {0, 100000, 200000, 1000, 2000}; + CheckCase( + time64(TimeUnit::MICRO), v6, is_valid, 
time64(TimeUnit::NANO), e6, options); + + // Zero copy + vector v7 = {0, 70000, 2000, 1000, 0}; + shared_ptr arr; + ArrayFromVector(time64(TimeUnit::MICRO), is_valid, v7, &arr); + CheckZeroCopy(*arr, time64(TimeUnit::MICRO)); + + // ARROW-1773: cast to int64 + CheckZeroCopy(*arr, int64()); + + vector v7_2 = {0, 70000, 2000, 1000, 0}; + ArrayFromVector(time32(TimeUnit::SECOND), is_valid, v7_2, &arr); + CheckZeroCopy(*arr, time32(TimeUnit::SECOND)); + + // ARROW-1773: cast to int64 + CheckZeroCopy(*arr, int32()); + + // Divide, truncate + vector v8 = {0, 100123, 200456, 1123, 2456}; + vector e8 = {0, 100, 200, 1, 2}; + + options.allow_time_truncate = true; + CheckCase( + time32(TimeUnit::MILLI), v8, is_valid, time32(TimeUnit::SECOND), e8, options); + CheckCase( + time64(TimeUnit::MICRO), v8, is_valid, time32(TimeUnit::MILLI), e8, options); + CheckCase( + time64(TimeUnit::NANO), v8, is_valid, time64(TimeUnit::MICRO), e8, options); + + vector v9 = {0, 100123000, 200456000, 1123000, 2456000}; + vector e9 = {0, 100, 200, 1, 2}; + CheckCase( + time64(TimeUnit::MICRO), v9, is_valid, time32(TimeUnit::SECOND), e9, options); + CheckCase( + time64(TimeUnit::NANO), v9, is_valid, time32(TimeUnit::MILLI), e9, options); + + vector v10 = {0, 100123000000L, 200456000000L, 1123000000L, 2456000000}; + vector e10 = {0, 100, 200, 1, 2}; + CheckCase( + time64(TimeUnit::NANO), v10, is_valid, time32(TimeUnit::SECOND), e10, options); + + // Disallow truncate, failures + + options.allow_time_truncate = false; + CheckFails(time32(TimeUnit::MILLI), v8, is_valid, time32(TimeUnit::SECOND), + options); + CheckFails(time64(TimeUnit::MICRO), v8, is_valid, time32(TimeUnit::MILLI), + options); + CheckFails(time64(TimeUnit::NANO), v8, is_valid, time64(TimeUnit::MICRO), + options); + CheckFails(time64(TimeUnit::MICRO), v9, is_valid, time32(TimeUnit::SECOND), + options); + CheckFails(time64(TimeUnit::NANO), v9, is_valid, time32(TimeUnit::MILLI), + options); + CheckFails(time64(TimeUnit::NANO), v10, is_valid, time32(TimeUnit::SECOND), + options); +} + +TEST_F(TestCast, PrimitiveZeroCopy) { + shared_ptr arr; + + ArrayFromVector(uint8(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); + CheckZeroCopy(*arr, uint8()); + ArrayFromVector(int8(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); + CheckZeroCopy(*arr, int8()); + + ArrayFromVector(uint16(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); + CheckZeroCopy(*arr, uint16()); + ArrayFromVector(int16(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); + CheckZeroCopy(*arr, int16()); + + ArrayFromVector(uint32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); + CheckZeroCopy(*arr, uint32()); + ArrayFromVector(int32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); + CheckZeroCopy(*arr, int32()); + + ArrayFromVector(uint64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); + CheckZeroCopy(*arr, uint64()); + ArrayFromVector(int64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); + CheckZeroCopy(*arr, int64()); + + ArrayFromVector(float32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); + CheckZeroCopy(*arr, float32()); + + ArrayFromVector(float64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); + CheckZeroCopy(*arr, float64()); +} + +TEST_F(TestCast, DateToCompatible) { + CastOptions options; + + vector is_valid = {true, false, true, true, true}; + + constexpr int64_t F = 86400000; + + // Multiply promotion + vector v1 = {0, 100, 200, 1, 2}; + vector e1 = {0, 100 * F, 200 * F, F, 2 * F}; + CheckCase(date32(), v1, is_valid, date64(), + e1, options); + + // Zero copy + vector v2 = {0, 70000, 2000, 1000, 0}; + vector v3 = {0, 70000, 2000, 1000, 0}; + shared_ptr arr; + ArrayFromVector(date32(), is_valid, v2, 
&arr); + CheckZeroCopy(*arr, date32()); + + // ARROW-1773: zero copy cast to integer + CheckZeroCopy(*arr, int32()); + + ArrayFromVector(date64(), is_valid, v3, &arr); + CheckZeroCopy(*arr, date64()); + + // ARROW-1773: zero copy cast to integer + CheckZeroCopy(*arr, int64()); + + // Divide, truncate + vector v8 = {0, 100 * F + 123, 200 * F + 456, F + 123, 2 * F + 456}; + vector e8 = {0, 100, 200, 1, 2}; + + options.allow_time_truncate = true; + CheckCase(date64(), v8, is_valid, date32(), + e8, options); + + // Disallow truncate, failures + options.allow_time_truncate = false; + CheckFails(date64(), v8, is_valid, date32(), options); +} + +TEST_F(TestCast, ToDouble) { + CastOptions options; + vector is_valid = {true, false, true, true, true}; + + // int16 to double + vector v1 = {0, 100, 200, 1, 2}; + vector e1 = {0, 100, 200, 1, 2}; + CheckCase(int16(), v1, is_valid, float64(), e1, + options); + + // float to double + vector v2 = {0, 100, 200, 1, 2}; + vector e2 = {0, 100, 200, 1, 2}; + CheckCase(float32(), v2, is_valid, float64(), e2, + options); + + // bool to double + vector v3 = {true, true, false, false, true}; + vector e3 = {1, 1, 0, 0, 1}; + CheckCase(boolean(), v3, is_valid, float64(), e3, + options); +} + +TEST_F(TestCast, ChunkedArray) { + vector values1 = {0, 1, 2}; + vector values2 = {3, 4, 5}; + + auto type = int16(); + auto out_type = int64(); + + auto a1 = _MakeArray(type, values1, {}); + auto a2 = _MakeArray(type, values2, {}); + + ArrayVector arrays = {a1, a2}; + auto carr = std::make_shared(arrays); + + CastOptions options; + + Datum out; + ASSERT_OK(Cast(&this->ctx_, carr, out_type, options, &out)); + ASSERT_EQ(Datum::CHUNKED_ARRAY, out.kind()); + + auto out_carr = out.chunked_array(); + + vector ex_values1 = {0, 1, 2}; + vector ex_values2 = {3, 4, 5}; + auto a3 = _MakeArray(out_type, ex_values1, {}); + auto a4 = _MakeArray(out_type, ex_values2, {}); + + ArrayVector ex_arrays = {a3, a4}; + auto ex_carr = std::make_shared(ex_arrays); + + ASSERT_TRUE(out.chunked_array()->Equals(*ex_carr)); +} + +TEST_F(TestCast, UnsupportedTarget) { + vector is_valid = {true, false, true, true, true}; + vector v1 = {0, 1, 2, 3, 4}; + + shared_ptr arr; + ArrayFromVector(int32(), is_valid, v1, &arr); + + shared_ptr result; + ASSERT_RAISES(NotImplemented, Cast(&this->ctx_, *arr, utf8(), {}, &result)); +} + +TEST_F(TestCast, DateTimeZeroCopy) { + vector is_valid = {true, false, true, true, true}; + + vector v1 = {0, 70000, 2000, 1000, 0}; + shared_ptr arr; + ArrayFromVector(int32(), is_valid, v1, &arr); + + CheckZeroCopy(*arr, time32(TimeUnit::SECOND)); + CheckZeroCopy(*arr, date32()); + + vector v2 = {0, 70000, 2000, 1000, 0}; + ArrayFromVector(int64(), is_valid, v2, &arr); + + CheckZeroCopy(*arr, time64(TimeUnit::MICRO)); + CheckZeroCopy(*arr, date64()); + CheckZeroCopy(*arr, timestamp(TimeUnit::NANO)); +} + +TEST_F(TestCast, FromNull) { + // Null casts to everything + const int length = 10; + + NullArray arr(length); + + shared_ptr result; + ASSERT_OK(Cast(&ctx_, arr, int32(), {}, &result)); + + ASSERT_EQ(length, result->length()); + ASSERT_EQ(length, result->null_count()); + + // OK to look at bitmaps + ASSERT_ARRAYS_EQUAL(*result, *result); +} + +TEST_F(TestCast, PreallocatedMemory) { + CastOptions options; + options.allow_int_overflow = false; + + vector is_valid = {true, false, true, true, true}; + + const int64_t length = 5; + + shared_ptr arr; + vector v1 = {0, 70000, 2000, 1000, 0}; + vector e1 = {0, 70000, 2000, 1000, 0}; + ArrayFromVector(int32(), is_valid, v1, &arr); + + auto 
out_type = int64(); + + std::unique_ptr kernel; + ASSERT_OK(GetCastFunction(*int32(), out_type, options, &kernel)); + + auto out_data = ArrayData::Make(out_type, length); + + shared_ptr out_values; + ASSERT_OK(this->ctx_.Allocate(length * sizeof(int64_t), &out_values)); + + out_data->buffers.push_back(nullptr); + out_data->buffers.push_back(out_values); + + Datum out(out_data); + ASSERT_OK(kernel->Call(&this->ctx_, arr, &out)); + + // Buffer address unchanged + ASSERT_EQ(out_values.get(), out_data->buffers[1].get()); + + shared_ptr result = MakeArray(out_data); + shared_ptr expected; + ArrayFromVector(int64(), is_valid, e1, &expected); + + ASSERT_ARRAYS_EQUAL(*expected, *result); +} + +template +void CheckOffsetOutputCase(FunctionContext* ctx, const std::shared_ptr& in_type, + const vector& in_values, + const std::shared_ptr& out_type, + const vector& out_values) { + using OutTraits = TypeTraits; + + CastOptions options; + + const int64_t length = static_cast(in_values.size()); + + shared_ptr arr, expected; + ArrayFromVector(in_type, in_values, &arr); + ArrayFromVector(out_type, out_values, &expected); + + shared_ptr out_buffer; + ASSERT_OK(ctx->Allocate(OutTraits::bytes_required(length), &out_buffer)); + + std::unique_ptr kernel; + ASSERT_OK(GetCastFunction(*in_type, out_type, options, &kernel)); + + const int64_t first_half = length / 2; + + auto out_data = ArrayData::Make(out_type, length, {nullptr, out_buffer}); + auto out_second_data = out_data->Copy(); + out_second_data->offset = first_half; + + Datum out_first(out_data); + Datum out_second(out_second_data); + + // Cast each bit + ASSERT_OK(kernel->Call(ctx, arr->Slice(0, first_half), &out_first)); + ASSERT_OK(kernel->Call(ctx, arr->Slice(first_half), &out_second)); + + shared_ptr result = MakeArray(out_data); + + ASSERT_ARRAYS_EQUAL(*expected, *result); +} + +TEST_F(TestCast, OffsetOutputBuffer) { + // ARROW-1735 + vector v1 = {0, 10000, 2000, 1000, 0}; + vector e1 = {0, 10000, 2000, 1000, 0}; + + auto in_type = int32(); + auto out_type = int64(); + CheckOffsetOutputCase(&this->ctx_, in_type, v1, + out_type, e1); + + vector e2 = {false, true, true, true, false}; + + out_type = boolean(); + CheckOffsetOutputCase(&this->ctx_, in_type, v1, + boolean(), e2); + + vector e3 = {0, 10000, 2000, 1000, 0}; + CheckOffsetOutputCase(&this->ctx_, in_type, v1, + int16(), e3); +} + +TEST_F(TestCast, StringToBoolean) { + CastOptions options; + + vector is_valid = {true, false, true, true, true}; + + vector v1 = {"False", "true", "true", "True", "false"}; + vector v2 = {"0", "1", "1", "1", "0"}; + vector e = {false, true, true, true, false}; + CheckCase(utf8(), v1, is_valid, boolean(), + e, options); + CheckCase(utf8(), v2, is_valid, boolean(), + e, options); +} + +TEST_F(TestCast, StringToBooleanErrors) { + CastOptions options; + + vector is_valid = {true}; + + CheckFails(utf8(), {"false "}, is_valid, boolean(), options); + CheckFails(utf8(), {"T"}, is_valid, boolean(), options); +} + +TEST_F(TestCast, StringToNumber) { + CastOptions options; + + vector is_valid = {true, false, true, true, true}; + + // string to int + vector v_int = {"0", "1", "127", "-1", "0"}; + vector e_int8 = {0, 1, 127, -1, 0}; + vector e_int16 = {0, 1, 127, -1, 0}; + vector e_int32 = {0, 1, 127, -1, 0}; + vector e_int64 = {0, 1, 127, -1, 0}; + CheckCase(utf8(), v_int, is_valid, int8(), + e_int8, options); + CheckCase(utf8(), v_int, is_valid, int16(), + e_int16, options); + CheckCase(utf8(), v_int, is_valid, int32(), + e_int32, options); + CheckCase(utf8(), v_int, is_valid, 
int64(), + e_int64, options); + + v_int = {"2147483647", "0", "-2147483648", "0", "0"}; + e_int32 = {2147483647, 0, -2147483648LL, 0, 0}; + CheckCase(utf8(), v_int, is_valid, int32(), + e_int32, options); + v_int = {"9223372036854775807", "0", "-9223372036854775808", "0", "0"}; + e_int64 = {9223372036854775807LL, 0, (-9223372036854775807LL - 1), 0, 0}; + CheckCase(utf8(), v_int, is_valid, int64(), + e_int64, options); + + // string to uint + vector v_uint = {"0", "1", "127", "255", "0"}; + vector e_uint8 = {0, 1, 127, 255, 0}; + vector e_uint16 = {0, 1, 127, 255, 0}; + vector e_uint32 = {0, 1, 127, 255, 0}; + vector e_uint64 = {0, 1, 127, 255, 0}; + CheckCase(utf8(), v_uint, is_valid, + uint8(), e_uint8, options); + CheckCase(utf8(), v_uint, is_valid, + uint16(), e_uint16, options); + CheckCase(utf8(), v_uint, is_valid, + uint32(), e_uint32, options); + CheckCase(utf8(), v_uint, is_valid, + uint64(), e_uint64, options); + + v_uint = {"4294967295", "0", "0", "0", "0"}; + e_uint32 = {4294967295, 0, 0, 0, 0}; + CheckCase(utf8(), v_uint, is_valid, + uint32(), e_uint32, options); + v_uint = {"18446744073709551615", "0", "0", "0", "0"}; + e_uint64 = {18446744073709551615ULL, 0, 0, 0, 0}; + CheckCase(utf8(), v_uint, is_valid, + uint64(), e_uint64, options); + + // string to float + vector v_float = {"0.1", "1.2", "127.3", "200.4", "0.5"}; + vector e_float = {0.1f, 1.2f, 127.3f, 200.4f, 0.5f}; + vector e_double = {0.1, 1.2, 127.3, 200.4, 0.5}; + CheckCase(utf8(), v_float, is_valid, + float32(), e_float, options); + CheckCase(utf8(), v_float, is_valid, + float64(), e_double, options); + + // Test that casting is locale-independent + auto global_locale = std::locale(); + try { + // French locale uses the comma as decimal point + std::locale::global(std::locale("fr_FR.UTF-8")); + } catch (std::runtime_error&) { + // Locale unavailable, ignore + } + CheckCase(utf8(), v_float, is_valid, + float32(), e_float, options); + CheckCase(utf8(), v_float, is_valid, + float64(), e_double, options); + std::locale::global(global_locale); +} + +TEST_F(TestCast, StringToNumberErrors) { + CastOptions options; + + vector is_valid = {true}; + + CheckFails(utf8(), {"z"}, is_valid, int8(), options); + CheckFails(utf8(), {"12 z"}, is_valid, int8(), options); + CheckFails(utf8(), {"128"}, is_valid, int8(), options); + CheckFails(utf8(), {"-129"}, is_valid, int8(), options); + CheckFails(utf8(), {"0.5"}, is_valid, int8(), options); + + CheckFails(utf8(), {"256"}, is_valid, uint8(), options); + CheckFails(utf8(), {"-1"}, is_valid, uint8(), options); + + CheckFails(utf8(), {"z"}, is_valid, float32(), options); +} + +TEST_F(TestCast, StringToTimestamp) { + CastOptions options; + + vector is_valid = {true, false, true}; + vector strings = {"1970-01-01", "xxx", "2000-02-29"}; + + auto type = timestamp(TimeUnit::SECOND); + vector e = {0, 0, 951782400}; + CheckCase(utf8(), strings, is_valid, + type, e, options); + + type = timestamp(TimeUnit::MICRO); + e = {0, 0, 951782400000000LL}; + CheckCase(utf8(), strings, is_valid, + type, e, options); + + // NOTE: timestamp parsing is tested comprehensively in parsing-util-test.cc +} + +TEST_F(TestCast, StringToTimestampErrors) { + CastOptions options; + + vector is_valid = {true}; + + for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI, TimeUnit::MICRO, TimeUnit::NANO}) { + auto type = timestamp(unit); + CheckFails(utf8(), {""}, is_valid, type, options); + CheckFails(utf8(), {"xxx"}, is_valid, type, options); + } +} + +constexpr const char* kInvalidUtf8 = "\xa0\xa1"; + +TEST_F(TestCast, 
BinaryToString) { + CastOptions options; + + // All valid except the last one + vector all = {1, 1, 1, 1, 1}; + vector valid = {1, 1, 1, 1, 0}; + vector strings = {"Hi", "olá mundo", "你好世界", "", kInvalidUtf8}; + + std::shared_ptr array; + + // Should accept when invalid but null. + ArrayFromVector(binary(), valid, strings, &array); + CheckZeroCopy(*array, utf8()); + + // Should refuse due to invalid utf8 payload + CheckFails(binary(), strings, all, utf8(), options); + + // Should accept due to option override + options.allow_invalid_utf8 = true; + CheckCase(binary(), strings, all, + utf8(), strings, options); +} + +template +class TestDictionaryCast : public TestCast {}; + +typedef ::testing::Types + TestTypes; + +TYPED_TEST_CASE(TestDictionaryCast, TestTypes); + +TYPED_TEST(TestDictionaryCast, Basic) { + CastOptions options; + shared_ptr plain_array = + TestBase::MakeRandomArray::ArrayType>(10, 2); + + Datum out; + ASSERT_OK(DictionaryEncode(&this->ctx_, plain_array->data(), &out)); + + this->CheckPass(*MakeArray(out.array()), *plain_array, plain_array->type(), options); +} + +TEST_F(TestCast, DictToNonDictNoNulls) { + vector dict_values = {"foo", "bar", "baz"}; + auto ex_dict = _MakeArray(utf8(), dict_values, {}); + auto dict_type = dictionary(int32(), ex_dict); + + // Explicitly construct with nullptr for the null_bitmap_data + std::vector i1 = {1, 0, 1}; + std::vector i2 = {2, 1, 0, 1}; + auto c1 = std::make_shared>(3, Buffer::Wrap(i1)); + auto c2 = std::make_shared>(4, Buffer::Wrap(i2)); + + ArrayVector dict_arrays = {std::make_shared(dict_type, c1), + std::make_shared(dict_type, c2)}; + auto dict_carr = std::make_shared(dict_arrays); + + Datum cast_input(dict_carr); + Datum cast_output; + // Ensure that casting works even when the null_bitmap_data array is a nullptr + ASSERT_OK(Cast(&this->ctx_, cast_input, + static_cast(*dict_type).dictionary()->type(), + CastOptions(), &cast_output)); + ASSERT_EQ(Datum::CHUNKED_ARRAY, cast_output.kind()); + + auto e1 = _MakeArray(utf8(), {"bar", "foo", "bar"}, {}); + auto e2 = _MakeArray(utf8(), {"baz", "bar", "foo", "bar"}, {}); + + auto chunks = cast_output.chunked_array()->chunks(); + ASSERT_EQ(chunks.size(), 2); + ASSERT_ARRAYS_EQUAL(*e1, *chunks[0]); + ASSERT_ARRAYS_EQUAL(*e2, *chunks[1]); +} + +/*TYPED_TEST(TestDictionaryCast, Reverse) { + CastOptions options; + shared_ptr plain_array = + TestBase::MakeRandomArray::ArrayType>(10, 2); + + shared_ptr dict_array; + ASSERT_OK(EncodeArrayToDictionary(*plain_array, this->pool_, &dict_array)); + + this->CheckPass(*plain_array, *dict_array, dict_array->type(), options); +}*/ + +TEST_F(TestCast, ListToList) { + CastOptions options; + std::shared_ptr offsets; + + vector offsets_values = {0, 1, 2, 5, 7, 7, 8, 10}; + std::vector offsets_is_valid = {true, true, true, true, false, true, true, true}; + ArrayFromVector(offsets_is_valid, offsets_values, &offsets); + + shared_ptr int32_plain_array = + TestBase::MakeRandomArray::ArrayType>(10, 2); + std::shared_ptr int32_list_array; + ASSERT_OK( + ListArray::FromArrays(*offsets, *int32_plain_array, pool_, &int32_list_array)); + + std::shared_ptr int64_plain_array; + ASSERT_OK(Cast(&this->ctx_, *int32_plain_array, int64(), options, &int64_plain_array)); + std::shared_ptr int64_list_array; + ASSERT_OK( + ListArray::FromArrays(*offsets, *int64_plain_array, pool_, &int64_list_array)); + + std::shared_ptr float64_plain_array; + ASSERT_OK( + Cast(&this->ctx_, *int32_plain_array, float64(), options, &float64_plain_array)); + std::shared_ptr float64_list_array; + 
ASSERT_OK( + ListArray::FromArrays(*offsets, *float64_plain_array, pool_, &float64_list_array)); + + CheckPass(*int32_list_array, *int64_list_array, int64_list_array->type(), options); + CheckPass(*int32_list_array, *float64_list_array, float64_list_array->type(), options); + CheckPass(*int64_list_array, *int32_list_array, int32_list_array->type(), options); + CheckPass(*int64_list_array, *float64_list_array, float64_list_array->type(), options); + + options.allow_float_truncate = true; + CheckPass(*float64_list_array, *int32_list_array, int32_list_array->type(), options); + CheckPass(*float64_list_array, *int64_list_array, int64_list_array->type(), options); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 4f7d7f822b3ab..b148486bd212f 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -37,6 +37,7 @@ #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/parsing.h" // IWYU pragma: keep +#include "arrow/util/utf8.h" #include "arrow/compute/context.h" #include "arrow/compute/kernel.h" @@ -77,6 +78,19 @@ namespace compute { constexpr int64_t kMillisecondsInDay = 86400000; +template +struct is_binary_to_string { + static constexpr bool value = false; +}; + +template +struct is_binary_to_string< + O, I, + typename std::enable_if::value && + std::is_base_of::value>::type> { + static constexpr bool value = true; +}; + // ---------------------------------------------------------------------- // Zero copy casts @@ -112,15 +126,30 @@ struct is_zero_copy_cast< static constexpr bool value = sizeof(O_T) == sizeof(I_T); }; +// Binary to String doesn't require copying, the payload only needs to be +// validated. +template +struct is_zero_copy_cast< + O, I, + typename std::enable_if::value && + is_binary_to_string::value>::type> { + static constexpr bool value = true; +}; + template struct CastFunctor {}; // Indicated no computation required +// +// The case BinaryType -> StringType is special cased due to validation +// requirements. 
template -struct CastFunctor::value>::type> { +struct CastFunctor::value && + !is_binary_to_string::value>::type> { void operator()(FunctionContext* ctx, const CastOptions& options, const ArrayData& input, ArrayData* output) { - CopyData(input, output); + ZeroCopyData(input, output); } }; @@ -532,7 +561,7 @@ struct CastFunctor { const auto& out_type = checked_cast(*output->type); if (in_type.unit() == out_type.unit()) { - CopyData(input, output); + ZeroCopyData(input, output); return; } @@ -625,7 +654,7 @@ struct CastFunctor(*output->type); if (in_type.unit() == out_type.unit()) { - CopyData(input, output); + ZeroCopyData(input, output); return; } @@ -998,7 +1027,7 @@ struct CastFunctor { continue; } - auto str = input_array.GetView(i); + const auto str = input_array.GetView(i); if (!converter(str.data(), str.length(), out_data)) { std::stringstream ss; ss << "Failed to cast String '" << str << "' into " << output->type->ToString(); @@ -1009,6 +1038,52 @@ struct CastFunctor { } }; +// ---------------------------------------------------------------------- +// Binary to String +// + +template +struct CastFunctor< + StringType, I, + typename std::enable_if::value>::type> { + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, ArrayData* output) { + BinaryArray binary(input.Copy()); + + if (options.allow_invalid_utf8) { + ZeroCopyData(input, output); + return; + } + + util::InitializeUTF8(); + + if (binary.null_count() != 0) { + for (int64_t i = 0; i < input.length; i++) { + if (binary.IsNull(i)) { + continue; + } + + const auto str = binary.GetView(i); + if (ARROW_PREDICT_FALSE(!arrow::util::ValidateUTF8(str))) { + ctx->SetStatus(Status::Invalid("Invalid UTF8 payload")); + return; + } + } + + } else { + for (int64_t i = 0; i < input.length; i++) { + const auto str = binary.GetView(i); + if (ARROW_PREDICT_FALSE(!arrow::util::ValidateUTF8(str))) { + ctx->SetStatus(Status::Invalid("Invalid UTF8 payload")); + return; + } + } + } + + ZeroCopyData(input, output); + } +}; + // ---------------------------------------------------------------------- typedef std::functionkind() == Datum::NONE) { - out->value = ArrayData::Make(out_type_, in_data.length); + switch (out->kind()) { + case Datum::NONE: + out->value = ArrayData::Make(out_type_, in_data.length); + break; + case Datum::ARRAY: + break; + default: + return Status::NotImplemented("CastKernel only supports Datum::ARRAY output"); } - result = out->array().get(); - + ArrayData* result = out->array().get(); if (!is_zero_copy_) { RETURN_NOT_OK( AllocateIfNotPreallocated(ctx, in_data, can_pre_allocate_values_, result)); @@ -1187,6 +1267,8 @@ class CastKernel : public UnaryKernel { FN(TimestampType, Date64Type); \ FN(TimestampType, Int64Type); +#define BINARY_CASES(FN, IN_TYPE) FN(BinaryType, StringType); + #define STRING_CASES(FN, IN_TYPE) \ FN(StringType, StringType); \ FN(StringType, BooleanType); \ @@ -1259,6 +1341,7 @@ GET_CAST_FUNCTION(DATE64_CASES, Date64Type); GET_CAST_FUNCTION(TIME32_CASES, Time32Type); GET_CAST_FUNCTION(TIME64_CASES, Time64Type); GET_CAST_FUNCTION(TIMESTAMP_CASES, TimestampType); +GET_CAST_FUNCTION(BINARY_CASES, BinaryType); GET_CAST_FUNCTION(STRING_CASES, StringType); GET_CAST_FUNCTION(DICTIONARY_CASES, DictionaryType); @@ -1307,6 +1390,7 @@ Status GetCastFunction(const DataType& in_type, const std::shared_ptr& CAST_FUNCTION_CASE(Time32Type); CAST_FUNCTION_CASE(Time64Type); CAST_FUNCTION_CASE(TimestampType); + CAST_FUNCTION_CASE(BinaryType); CAST_FUNCTION_CASE(StringType); 
CAST_FUNCTION_CASE(DictionaryType); case Type::LIST: diff --git a/cpp/src/arrow/compute/kernels/cast.h b/cpp/src/arrow/compute/kernels/cast.h index 65c70bf14aa88..8c42f07bda7f1 100644 --- a/cpp/src/arrow/compute/kernels/cast.h +++ b/cpp/src/arrow/compute/kernels/cast.h @@ -38,12 +38,14 @@ struct ARROW_EXPORT CastOptions { CastOptions() : allow_int_overflow(false), allow_time_truncate(false), - allow_float_truncate(false) {} + allow_float_truncate(false), + allow_invalid_utf8(false) {} explicit CastOptions(bool safe) : allow_int_overflow(!safe), allow_time_truncate(!safe), - allow_float_truncate(!safe) {} + allow_float_truncate(!safe), + allow_invalid_utf8(!safe) {} static CastOptions Safe() { return CastOptions(true); } @@ -52,6 +54,9 @@ struct ARROW_EXPORT CastOptions { bool allow_int_overflow; bool allow_time_truncate; bool allow_float_truncate; + // Indicate if conversions from Binary/FixedSizeBinary to string must + // validate the utf8 payload. + bool allow_invalid_utf8; }; /// \since 0.7.0 diff --git a/cpp/src/arrow/compute/kernels/hash-test.cc b/cpp/src/arrow/compute/kernels/hash-test.cc new file mode 100644 index 0000000000000..f20575f621b4c --- /dev/null +++ b/cpp/src/arrow/compute/kernels/hash-test.cc @@ -0,0 +1,344 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/table.h" +#include "arrow/test-common.h" +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/decimal.h" + +#include "arrow/compute/context.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/hash.h" +#include "arrow/compute/kernels/util-internal.h" +#include "arrow/compute/test-util.h" + +using std::shared_ptr; +using std::vector; + +namespace arrow { +namespace compute { + +// ---------------------------------------------------------------------- +// Dictionary tests + +template +void CheckUnique(FunctionContext* ctx, const shared_ptr& type, + const vector& in_values, const vector& in_is_valid, + const vector& out_values, const vector& out_is_valid) { + shared_ptr input = _MakeArray(type, in_values, in_is_valid); + shared_ptr expected = _MakeArray(type, out_values, out_is_valid); + + shared_ptr result; + ASSERT_OK(Unique(ctx, input, &result)); + ASSERT_ARRAYS_EQUAL(*expected, *result); +} + +template +void CheckDictEncode(FunctionContext* ctx, const shared_ptr& type, + const vector& in_values, const vector& in_is_valid, + const vector& out_values, const vector& out_is_valid, + const vector& out_indices) { + shared_ptr input = _MakeArray(type, in_values, in_is_valid); + shared_ptr ex_dict = _MakeArray(type, out_values, out_is_valid); + shared_ptr ex_indices = + _MakeArray(int32(), out_indices, in_is_valid); + + DictionaryArray expected(dictionary(int32(), ex_dict), ex_indices); + + Datum datum_out; + ASSERT_OK(DictionaryEncode(ctx, input, &datum_out)); + shared_ptr result = MakeArray(datum_out.array()); + + ASSERT_ARRAYS_EQUAL(expected, *result); +} + +class TestHashKernel : public ComputeFixture, public TestBase {}; + +template +class TestHashKernelPrimitive : public ComputeFixture, public TestBase {}; + +typedef ::testing::Types + PrimitiveDictionaries; + +TYPED_TEST_CASE(TestHashKernelPrimitive, PrimitiveDictionaries); + +TYPED_TEST(TestHashKernelPrimitive, Unique) { + using T = typename TypeParam::c_type; + auto type = TypeTraits::type_singleton(); + CheckUnique(&this->ctx_, type, {2, 1, 2, 1}, {true, false, true, true}, + {2, 1}, {}); + CheckUnique(&this->ctx_, type, {2, 1, 3, 1}, {false, false, true, true}, + {3, 1}, {}); +} + +TYPED_TEST(TestHashKernelPrimitive, DictEncode) { + using T = typename TypeParam::c_type; + auto type = TypeTraits::type_singleton(); + CheckDictEncode(&this->ctx_, type, {2, 1, 2, 1, 2, 3}, + {true, false, true, true, true, true}, {2, 1, 3}, {}, + {0, 0, 0, 1, 0, 2}); +} + +TYPED_TEST(TestHashKernelPrimitive, PrimitiveResizeTable) { + using T = typename TypeParam::c_type; + // Skip this test for (u)int8 + if (sizeof(Scalar) == 1) { + return; + } + + const int64_t kTotalValues = 1000000; + const int64_t kRepeats = 5; + + vector values; + vector uniques; + vector indices; + for (int64_t i = 0; i < kTotalValues * kRepeats; i++) { + const auto val = static_cast(i % kTotalValues); + values.push_back(val); + + if (i < kTotalValues) { + uniques.push_back(val); + } + indices.push_back(static_cast(i % kTotalValues)); + } + + auto type = TypeTraits::type_singleton(); + CheckUnique(&this->ctx_, type, values, {}, uniques, {}); + + CheckDictEncode(&this->ctx_, type, values, {}, uniques, {}, indices); +} + +TEST_F(TestHashKernel, UniqueTimeTimestamp) { + 
CheckUnique(&this->ctx_, time32(TimeUnit::SECOND), {2, 1, 2, 1}, + {true, false, true, true}, {2, 1}, {}); + + CheckUnique(&this->ctx_, time64(TimeUnit::NANO), {2, 1, 2, 1}, + {true, false, true, true}, {2, 1}, {}); + + CheckUnique(&this->ctx_, timestamp(TimeUnit::NANO), + {2, 1, 2, 1}, {true, false, true, true}, {2, 1}, + {}); +} + +TEST_F(TestHashKernel, UniqueBoolean) { + CheckUnique(&this->ctx_, boolean(), {true, true, false, true}, + {true, false, true, true}, {true, false}, {}); + + CheckUnique(&this->ctx_, boolean(), {false, true, false, true}, + {true, false, true, true}, {false, true}, {}); + + // No nulls + CheckUnique(&this->ctx_, boolean(), {true, true, false, true}, {}, + {true, false}, {}); + + CheckUnique(&this->ctx_, boolean(), {false, true, false, true}, {}, + {false, true}, {}); +} + +TEST_F(TestHashKernel, DictEncodeBoolean) { + CheckDictEncode( + &this->ctx_, boolean(), {true, true, false, true, false}, + {true, false, true, true, true}, {true, false}, {}, {0, 0, 1, 0, 1}); + + CheckDictEncode( + &this->ctx_, boolean(), {false, true, false, true, false}, + {true, false, true, true, true}, {false, true}, {}, {0, 0, 0, 1, 0}); + + // No nulls + CheckDictEncode(&this->ctx_, boolean(), + {true, true, false, true, false}, {}, {true, false}, + {}, {0, 0, 1, 0, 1}); + + CheckDictEncode(&this->ctx_, boolean(), + {false, true, false, true, false}, {}, {false, true}, + {}, {0, 1, 0, 1, 0}); +} + +TEST_F(TestHashKernel, UniqueBinary) { + CheckUnique(&this->ctx_, binary(), + {"test", "", "test2", "test"}, + {true, false, true, true}, {"test", "test2"}, {}); + + CheckUnique(&this->ctx_, utf8(), {"test", "", "test2", "test"}, + {true, false, true, true}, {"test", "test2"}, {}); +} + +TEST_F(TestHashKernel, DictEncodeBinary) { + CheckDictEncode( + &this->ctx_, binary(), {"test", "", "test2", "test", "baz"}, + {true, false, true, true, true}, {"test", "test2", "baz"}, {}, {0, 0, 1, 0, 2}); + + CheckDictEncode( + &this->ctx_, utf8(), {"test", "", "test2", "test", "baz"}, + {true, false, true, true, true}, {"test", "test2", "baz"}, {}, {0, 0, 1, 0, 2}); +} + +TEST_F(TestHashKernel, BinaryResizeTable) { + const int32_t kTotalValues = 10000; +#if !defined(ARROW_VALGRIND) + const int32_t kRepeats = 10; +#else + // Mitigate Valgrind's slowness + const int32_t kRepeats = 3; +#endif + + vector values; + vector uniques; + vector indices; + char buf[20] = "test"; + + for (int32_t i = 0; i < kTotalValues * kRepeats; i++) { + int32_t index = i % kTotalValues; + + ASSERT_GE(snprintf(buf + 4, sizeof(buf) - 4, "%d", index), 0); + values.emplace_back(buf); + + if (i < kTotalValues) { + uniques.push_back(values.back()); + } + indices.push_back(index); + } + + CheckUnique(&this->ctx_, binary(), values, {}, uniques, {}); + CheckDictEncode(&this->ctx_, binary(), values, {}, uniques, {}, + indices); + + CheckUnique(&this->ctx_, utf8(), values, {}, uniques, {}); + CheckDictEncode(&this->ctx_, utf8(), values, {}, uniques, {}, + indices); +} + +TEST_F(TestHashKernel, UniqueFixedSizeBinary) { + CheckUnique( + &this->ctx_, fixed_size_binary(5), {"aaaaa", "", "bbbbb", "aaaaa"}, + {true, false, true, true}, {"aaaaa", "bbbbb"}, {}); +} + +TEST_F(TestHashKernel, DictEncodeFixedSizeBinary) { + CheckDictEncode( + &this->ctx_, fixed_size_binary(5), {"bbbbb", "", "bbbbb", "aaaaa", "ccccc"}, + {true, false, true, true, true}, {"bbbbb", "aaaaa", "ccccc"}, {}, {0, 0, 0, 1, 2}); +} + +TEST_F(TestHashKernel, FixedSizeBinaryResizeTable) { + const int32_t kTotalValues = 10000; +#if !defined(ARROW_VALGRIND) + const int32_t 
kRepeats = 10; +#else + // Mitigate Valgrind's slowness + const int32_t kRepeats = 3; +#endif + + vector values; + vector uniques; + vector indices; + char buf[7] = "test.."; + + for (int32_t i = 0; i < kTotalValues * kRepeats; i++) { + int32_t index = i % kTotalValues; + + buf[4] = static_cast(index / 128); + buf[5] = static_cast(index % 128); + values.emplace_back(buf, 6); + + if (i < kTotalValues) { + uniques.push_back(values.back()); + } + indices.push_back(index); + } + + auto type = fixed_size_binary(6); + CheckUnique(&this->ctx_, type, values, {}, uniques, + {}); + CheckDictEncode(&this->ctx_, type, values, {}, + uniques, {}, indices); +} + +TEST_F(TestHashKernel, UniqueDecimal) { + vector values{12, 12, 11, 12}; + vector expected{12, 11}; + + CheckUnique(&this->ctx_, decimal(2, 0), values, + {true, false, true, true}, expected, {}); +} + +TEST_F(TestHashKernel, DictEncodeDecimal) { + vector values{12, 12, 11, 12, 13}; + vector expected{12, 11, 13}; + + CheckDictEncode(&this->ctx_, decimal(2, 0), values, + {true, false, true, true, true}, expected, + {}, {0, 0, 1, 0, 2}); +} + +TEST_F(TestHashKernel, ChunkedArrayInvoke) { + vector values1 = {"foo", "bar", "foo"}; + vector values2 = {"bar", "baz", "quuux", "foo"}; + + auto type = utf8(); + auto a1 = _MakeArray(type, values1, {}); + auto a2 = _MakeArray(type, values2, {}); + + vector dict_values = {"foo", "bar", "baz", "quuux"}; + auto ex_dict = _MakeArray(type, dict_values, {}); + + ArrayVector arrays = {a1, a2}; + auto carr = std::make_shared(arrays); + + // Unique + shared_ptr result; + ASSERT_OK(Unique(&this->ctx_, carr, &result)); + ASSERT_ARRAYS_EQUAL(*ex_dict, *result); + + // Dictionary encode + auto dict_type = dictionary(int32(), ex_dict); + + auto i1 = _MakeArray(int32(), {0, 1, 0}, {}); + auto i2 = _MakeArray(int32(), {1, 2, 3, 0}, {}); + + ArrayVector dict_arrays = {std::make_shared(dict_type, i1), + std::make_shared(dict_type, i2)}; + auto dict_carr = std::make_shared(dict_arrays); + + Datum encoded_out; + ASSERT_OK(DictionaryEncode(&this->ctx_, carr, &encoded_out)); + ASSERT_EQ(Datum::CHUNKED_ARRAY, encoded_out.kind()); + + AssertChunkedEqual(*dict_carr, *encoded_out.chunked_array()); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/util-internal.h b/cpp/src/arrow/compute/kernels/util-internal.h index 23ed4fd7ee7d7..d71e36d9c42b4 100644 --- a/cpp/src/arrow/compute/kernels/util-internal.h +++ b/cpp/src/arrow/compute/kernels/util-internal.h @@ -32,7 +32,9 @@ namespace compute { class FunctionContext; -static inline void CopyData(const ArrayData& input, ArrayData* output) { +// \brief Make a copy of the buffers into a destination array without carrying +// the type. +static inline void ZeroCopyData(const ArrayData& input, ArrayData* output) { output->length = input.length; output->null_count = input.null_count; output->buffers = input.buffers; diff --git a/cpp/src/arrow/compute/test-util.h b/cpp/src/arrow/compute/test-util.h new file mode 100644 index 0000000000000..e2bda698a9bff --- /dev/null +++ b/cpp/src/arrow/compute/test-util.h @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_COMPUTE_TEST_UTIL_H +#define ARROW_COMPUTE_TEST_UTIL_H + +#include +#include + +#include "arrow/array.h" +#include "arrow/memory_pool.h" +#include "arrow/type.h" + +#include "arrow/compute/context.h" + +namespace arrow { +namespace compute { + +class ComputeFixture { + public: + ComputeFixture() : ctx_(default_memory_pool()) {} + + protected: + FunctionContext ctx_; +}; + +template +std::shared_ptr _MakeArray(const std::shared_ptr& type, + const std::vector& values, + const std::vector& is_valid) { + std::shared_ptr result; + if (is_valid.size() > 0) { + ArrayFromVector(type, is_valid, values, &result); + } else { + ArrayFromVector(type, values, &result); + } + return result; +} + +} // namespace compute +} // namespace arrow + +#endif diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h index f5a18be05a92f..072c2188f7081 100644 --- a/cpp/src/arrow/util/utf8.h +++ b/cpp/src/arrow/util/utf8.h @@ -24,6 +24,7 @@ #include #include "arrow/util/macros.h" +#include "arrow/util/string_view.h" #include "arrow/util/visibility.h" namespace arrow { @@ -157,6 +158,13 @@ inline bool ValidateUTF8(const uint8_t* data, int64_t size) { return ARROW_PREDICT_TRUE(state == internal::kUTF8ValidateAccept); } +inline bool ValidateUTF8(const util::string_view& str) { + const uint8_t* data = reinterpret_cast(str.data()); + const size_t length = str.size(); + + return ValidateUTF8(data, length); +} + } // namespace util } // namespace arrow From 781e251a150ec52f3072188f2291ec4a70995ebf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 18 Dec 2018 10:53:16 -0600 Subject: [PATCH 072/328] ARROW-4055: [Python] Fails to convert pytz.utc with versions 2018.3 and earlier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: Krisztián Szűcs Closes #3207 from kszucs/ARROW-4055 and squashes the following commits: 2edb3b219 fix import order 49b381fa4 hypothesis test c3d68b379 explicitly check against pytz.utc --- python/pyarrow/tests/test_convert_pandas.py | 21 +++++++++++++++------ python/pyarrow/types.pxi | 4 +++- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 4d283b3150606..41bcae83db516 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -20,9 +20,13 @@ import decimal import json import multiprocessing as mp + from collections import OrderedDict from datetime import date, datetime, time, timedelta +import hypothesis as h +import hypothesis.extra.pytz as tzst +import hypothesis.strategies as st import numpy as np import numpy.testing as npt import pandas as pd @@ -31,9 +35,6 @@ import pytz import pyarrow as pa -import pyarrow.types as patypes -from pyarrow.compat import PY2 - from .pandas_examples import dataframe_with_arrays, dataframe_with_lists @@ -94,7 +95,7 @@ def _check_series_roundtrip(s, type_=None, expected_pa_type=None): assert arr.type == expected_pa_type result = pd.Series(arr.to_pandas(), name=s.name) - if 
patypes.is_timestamp(arr.type) and arr.type.tz is not None: + if pa.types.is_timestamp(arr.type) and arr.type.tz is not None: result = (result.dt.tz_localize('utc') .dt.tz_convert(arr.type.tz)) @@ -255,12 +256,14 @@ def test_string_column_index(self): column_indexes, = js['column_indexes'] assert column_indexes['name'] == 'stringz' assert column_indexes['name'] == column_indexes['field_name'] - assert column_indexes['pandas_type'] == ('bytes' if PY2 else 'unicode') assert column_indexes['numpy_type'] == 'object' + assert column_indexes['pandas_type'] == ( + 'bytes' if six.PY2 else 'unicode' + ) md = column_indexes['metadata'] - if not PY2: + if not six.PY2: assert len(md) == 1 assert md['encoding'] == 'UTF-8' else: @@ -840,6 +843,12 @@ def test_python_datetime_with_pytz_tzinfo(self): df = pd.DataFrame({'datetime': values}) _check_pandas_roundtrip(df) + @h.given(st.none() | tzst.timezones()) + def test_python_datetime_with_pytz_timezone(self, tz): + values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz)] + df = pd.DataFrame({'datetime': values}) + _check_pandas_roundtrip(df) + @pytest.mark.skipif(six.PY2, reason='datetime.timezone is available since ' 'python version 3.2') def test_python_datetime_with_timezone_tzinfo(self): diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index f69190c1c2eaa..9ec36bff3a6fe 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1002,7 +1002,9 @@ def tzinfo_to_string(tz): raise ValueError('Offset must represent whole number of minutes') return '{}{:02d}:{:02d}'.format(sign, hours, minutes) - if isinstance(tz, pytz.tzinfo.BaseTzInfo): + if tz is pytz.utc: + return tz.zone # ARROW-4055 + elif isinstance(tz, pytz.tzinfo.BaseTzInfo): return tz.zone elif isinstance(tz, pytz._FixedOffset): return fixed_offset_to_string(tz) From 758bd557584107cb336cbc3422744dacd93978af Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 18 Dec 2018 12:23:55 -0600 Subject: [PATCH 073/328] ARROW-2919: [C++/Python] Improve HdfsFile error messages, fix Python unit test suite This also resolves ARROW-3957 and ARROW-4053. Summary: * Properly initialize NativeFile when opening from HDFS. 
This was broken when the "closed" property was added and some other refactoring, and wasn't caught because these tests aren't being run regularly * Slightly improves the handling of filesystem URIs -- there were some tests that failed without these changes because the docker-compose HDFS containers don't allow writes from $USER * Improve error message when calling "info" on a file that does not exist * Improve error message when calling `ls` on a directory that does not exist * Suggest checking whether you are connecting to the right HDFS port when getting errno 255 Author: Wes McKinney Closes #3209 from wesm/ARROW-2919 and squashes the following commits: b11e5b665 Restore arrow_dependencies to Gandiva dependencies 20e8784f6 Code review comments 4ba93bbb1 More helpful error messages when GetPathInfo or ListDirectory fails due to non-existent file or bad port 3c67ea6f0 Basic fixes to get Python unit tests passing again --- cpp/src/arrow/io/hdfs-test.cc | 19 ++++++++++ cpp/src/arrow/io/hdfs.cc | 51 +++++++++++++++++++-------- cpp/src/gandiva/CMakeLists.txt | 2 +- python/pyarrow/filesystem.py | 4 +-- python/pyarrow/io-hdfs.pxi | 3 ++ python/pyarrow/parquet.py | 58 ++++++++++++++++++++----------- python/pyarrow/tests/test_hdfs.py | 13 ++++--- 7 files changed, 108 insertions(+), 42 deletions(-) diff --git a/cpp/src/arrow/io/hdfs-test.cc b/cpp/src/arrow/io/hdfs-test.cc index c853b2012666e..08a7e13a1f8a2 100644 --- a/cpp/src/arrow/io/hdfs-test.cc +++ b/cpp/src/arrow/io/hdfs-test.cc @@ -257,6 +257,23 @@ TYPED_TEST(TestHadoopFileSystem, GetPathInfo) { ASSERT_EQ(size, info.size); } +TYPED_TEST(TestHadoopFileSystem, GetPathInfoNotExist) { + // ARROW-2919: Test that the error message is reasonable + SKIP_IF_NO_DRIVER(); + + ASSERT_OK(this->MakeScratchDir()); + auto path = this->ScratchPath("path-does-not-exist"); + + HdfsPathInfo info; + Status s = this->client_->GetPathInfo(path, &info); + ASSERT_TRUE(s.IsIOError()); + + const std::string error_message = s.ToString(); + + // Check that the file path is found in the error message + ASSERT_LT(error_message.find(path), std::string::npos); +} + TYPED_TEST(TestHadoopFileSystem, AppendToFile) { SKIP_IF_NO_DRIVER(); @@ -377,6 +394,8 @@ TYPED_TEST(TestHadoopFileSystem, LargeFile) { std::shared_ptr file; ASSERT_OK(this->client_->OpenReadable(path, &file)); + ASSERT_FALSE(file->closed()); + std::shared_ptr buffer; ASSERT_OK(AllocateBuffer(nullptr, size, &buffer)); diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index 6f01f75eec3c1..030b84853da60 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -43,14 +43,27 @@ using std::size_t; namespace arrow { namespace io { -#define CHECK_FAILURE(RETURN_VALUE, WHAT) \ - do { \ - if (RETURN_VALUE == -1) { \ - std::stringstream ss; \ - ss << "HDFS " << WHAT << " failed, errno: " << errno << " (" << strerror(errno) \ - << ")"; \ - return Status::IOError(ss.str()); \ - } \ +namespace { + +std::string TranslateErrno(int error_code) { + std::stringstream ss; + ss << error_code << " (" << strerror(error_code) << ")"; + if (error_code == 255) { + // Unknown error can occur if the host is correct but the port is not + ss << " Please check that you are connecting to the correct HDFS RPC port"; + } + return ss.str(); +} + +} // namespace + +#define CHECK_FAILURE(RETURN_VALUE, WHAT) \ + do { \ + if (RETURN_VALUE == -1) { \ + std::stringstream ss; \ + ss << "HDFS " << WHAT << " failed, errno: " << TranslateErrno(errno); \ + return Status::IOError(ss.str()); \ + } \ } while (0) static constexpr 
int kDefaultHdfsBufferSize = 1 << 16; @@ -99,6 +112,16 @@ class HdfsAnyFileImpl { bool is_open_; }; +namespace { + +Status GetPathInfoFailed(const std::string& path) { + std::stringstream ss; + ss << "Calling GetPathInfo for " << path << " failed. errno: " << TranslateErrno(errno); + return Status::IOError(ss.str()); +} + +} // namespace + // Private implementation for read-only files class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { public: @@ -180,7 +203,7 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { Status GetSize(int64_t* size) { hdfsFileInfo* entry = driver_->GetPathInfo(fs_, path_.c_str()); if (entry == nullptr) { - return Status::IOError("HDFS: GetPathInfo failed"); + return GetPathInfoFailed(path_); } *size = entry->mSize; @@ -204,7 +227,7 @@ HdfsReadableFile::HdfsReadableFile(MemoryPool* pool) { impl_.reset(new HdfsReadableFileImpl(pool)); } -HdfsReadableFile::~HdfsReadableFile() { DCHECK(impl_->Close().ok()); } +HdfsReadableFile::~HdfsReadableFile() { DCHECK_OK(impl_->Close()); } Status HdfsReadableFile::Close() { return impl_->Close(); } @@ -272,7 +295,7 @@ class HdfsOutputStream::HdfsOutputStreamImpl : public HdfsAnyFileImpl { HdfsOutputStream::HdfsOutputStream() { impl_.reset(new HdfsOutputStreamImpl()); } -HdfsOutputStream::~HdfsOutputStream() { DCHECK(impl_->Close().ok()); } +HdfsOutputStream::~HdfsOutputStream() { DCHECK_OK(impl_->Close()); } Status HdfsOutputStream::Close() { return impl_->Close(); } @@ -399,7 +422,7 @@ class HadoopFileSystem::HadoopFileSystemImpl { hdfsFileInfo* entry = driver_->GetPathInfo(fs_, path.c_str()); if (entry == nullptr) { - return Status::IOError("HDFS: GetPathInfo failed"); + return GetPathInfoFailed(path); } SetPathInfo(entry, info); @@ -444,8 +467,8 @@ class HadoopFileSystem::HadoopFileSystemImpl { num_entries = 0; } else { std::stringstream ss; - ss << "HDFS list directory failed, errno: " << errno << " (" << strerror(errno) - << ")"; + ss << "HDFS list directory of " << path + << " failed, errno: " << TranslateErrno(errno); return Status::IOError(ss.str()); } } diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 8052db5e8545d..23ad93e201e71 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -83,7 +83,7 @@ endif() ADD_ARROW_LIB(gandiva SOURCES ${SRC_FILES} OUTPUTS GANDIVA_LIBRARIES - DEPENDENCIES precompiled + DEPENDENCIES arrow_dependencies precompiled EXTRA_INCLUDES $ SHARED_LINK_LIBS arrow_shared diff --git a/python/pyarrow/filesystem.py b/python/pyarrow/filesystem.py index 8188a2607e21a..98efb1e3ec374 100644 --- a/python/pyarrow/filesystem.py +++ b/python/pyarrow/filesystem.py @@ -390,7 +390,7 @@ def _ensure_filesystem(fs): return fs -def _get_fs_from_path(path): +def get_filesystem_from_uri(path): """ return filesystem from path which could be an HDFS URI """ @@ -411,4 +411,4 @@ def _get_fs_from_path(path): else: fs = LocalFileSystem.get_instance() - return fs + return fs, parsed_uri.path diff --git a/python/pyarrow/io-hdfs.pxi b/python/pyarrow/io-hdfs.pxi index e7a322ea469bb..d93bd790eaa1e 100644 --- a/python/pyarrow/io-hdfs.pxi +++ b/python/pyarrow/io-hdfs.pxi @@ -433,6 +433,9 @@ cdef class HadoopFileSystem: out.set_random_access_file( rd_handle) + out.is_readable = True + + assert not out.closed if c_buffer_size == 0: c_buffer_size = 2 ** 16 diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index feaa890fc6cd9..a520acece972e 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ 
-18,6 +18,7 @@ from collections import defaultdict from concurrent import futures +from six.moves.urllib.parse import urlparse import json import numpy as np import os @@ -34,10 +35,24 @@ ParquetSchema, ColumnSchema) from pyarrow.compat import guid from pyarrow.filesystem import (LocalFileSystem, _ensure_filesystem, - _get_fs_from_path) + get_filesystem_from_uri) from pyarrow.util import _is_path_like, _stringify_path +def _parse_uri(path): + path = _stringify_path(path) + return urlparse(path).path + + +def _get_filesystem_and_path(passed_filesystem, path): + if passed_filesystem is None: + return get_filesystem_from_uri(path) + else: + passed_filesystem = _ensure_filesystem(passed_filesystem) + parsed_path = _parse_uri(path) + return passed_filesystem, parsed_path + + def _check_contains_null(val): if isinstance(val, six.binary_type): for byte in val: @@ -316,7 +331,8 @@ def __init__(self, where, schema, flavor=None, version='1.0', use_dictionary=True, compression='snappy', - use_deprecated_int96_timestamps=None, **options): + use_deprecated_int96_timestamps=None, + filesystem=None, **options): if use_deprecated_int96_timestamps is None: # Use int96 timestamps for Spark if flavor is not None and 'spark' in flavor: @@ -338,8 +354,8 @@ def __init__(self, where, schema, flavor=None, self.file_handle = None if _is_path_like(where): - fs = _get_fs_from_path(where) - sink = self.file_handle = fs.open(where, 'wb') + fs, path = _get_filesystem_and_path(filesystem, where) + sink = self.file_handle = fs.open(path, 'wb') else: sink = where @@ -681,7 +697,8 @@ class ParquetManifest(object): """ def __init__(self, dirpath, filesystem=None, pathsep='/', partition_scheme='hive', metadata_nthreads=1): - self.filesystem = filesystem or _get_fs_from_path(dirpath) + filesystem, dirpath = _get_filesystem_and_path(filesystem, dirpath) + self.filesystem = filesystem self.pathsep = pathsep self.dirpath = _stringify_path(dirpath) self.partition_scheme = partition_scheme @@ -845,15 +862,15 @@ class ParquetDataset(object): def __init__(self, path_or_paths, filesystem=None, schema=None, metadata=None, split_row_groups=False, validate_schema=True, filters=None, metadata_nthreads=1): - if filesystem is None: - a_path = path_or_paths - if isinstance(a_path, list): - a_path = a_path[0] - self.fs = _get_fs_from_path(a_path) - else: - self.fs = _ensure_filesystem(filesystem) + a_path = path_or_paths + if isinstance(a_path, list): + a_path = a_path[0] - self.paths = path_or_paths + self.fs, _ = _get_filesystem_and_path(filesystem, a_path) + if isinstance(path_or_paths, list): + self.paths = [_parse_uri(path) for path in path_or_paths] + else: + self.paths = _parse_uri(path_or_paths) (self.pieces, self.partitions, @@ -1070,10 +1087,11 @@ def _make_manifest(path_or_paths, fs, pathsep='/', metadata_nthreads=1): def read_table(source, columns=None, use_threads=True, metadata=None, - use_pandas_metadata=False, memory_map=True): + use_pandas_metadata=False, memory_map=True, + filesystem=None): if _is_path_like(source): - fs = _get_fs_from_path(source) - return fs.read_parquet(source, columns=columns, + fs, path = _get_filesystem_and_path(filesystem, source) + return fs.read_parquet(path, columns=columns, use_threads=use_threads, metadata=metadata, use_pandas_metadata=use_pandas_metadata) @@ -1113,12 +1131,13 @@ def write_table(table, where, row_group_size=None, version='1.0', use_deprecated_int96_timestamps=None, coerce_timestamps=None, allow_truncated_timestamps=False, - flavor=None, **kwargs): + flavor=None, filesystem=None, 
**kwargs): row_group_size = kwargs.pop('chunk_size', row_group_size) use_int96 = use_deprecated_int96_timestamps try: with ParquetWriter( where, table.schema, + filesystem=filesystem, version=version, flavor=flavor, use_dictionary=use_dictionary, @@ -1192,10 +1211,7 @@ def write_to_dataset(table, root_path, partition_cols=None, Parameter for instantiating Table; preserve pandas index or not. **kwargs : dict, kwargs for write_table function. """ - if filesystem is None: - fs = _get_fs_from_path(root_path) - else: - fs = _ensure_filesystem(filesystem) + fs, root_path = _get_filesystem_and_path(filesystem, root_path) _mkdir_if_not_exists(fs, root_path) diff --git a/python/pyarrow/tests/test_hdfs.py b/python/pyarrow/tests/test_hdfs.py index f218a1604a9d9..1af841f2ecbb1 100644 --- a/python/pyarrow/tests/test_hdfs.py +++ b/python/pyarrow/tests/test_hdfs.py @@ -216,7 +216,7 @@ def test_ls(self): self.hdfs.mkdir(dir_path) f = self.hdfs.open(f1_path, 'wb') - f.write('a' * 10) + f.write(b'a' * 10) contents = sorted(self.hdfs.ls(base_path, False)) assert contents == [dir_path, f1_path] @@ -341,9 +341,9 @@ def test_read_write_parquet_files_with_uri(self): df['uint32'] = df['uint32'].astype(np.int64) table = pa.Table.from_pandas(df, preserve_index=False) - pq.write_table(table, path) + pq.write_table(table, path, filesystem=self.hdfs) - result = pq.read_table(path).to_pandas() + result = pq.read_table(path, filesystem=self.hdfs).to_pandas() pdt.assert_frame_equal(result, df) @@ -380,7 +380,7 @@ def check_driver(cls): def test_orphaned_file(self): hdfs = hdfs_test_client() file_path = self._make_test_file(hdfs, 'orphaned_file_test', 'fname', - 'foobarbaz') + b'foobarbaz') f = hdfs.open(file_path) hdfs = None @@ -413,6 +413,11 @@ def _get_hdfs_uri(path): @pytest.mark.fastparquet @pytest.mark.parametrize('client', ['libhdfs', 'libhdfs3']) def test_fastparquet_read_with_hdfs(client): + try: + import snappy # noqa + except ImportError: + pytest.skip('fastparquet test requires snappy') + import pyarrow.parquet as pq fastparquet = pytest.importorskip('fastparquet') From 1a5991c99ef9092f439bed7e0bcf707a7247b419 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 18 Dec 2018 18:44:40 -0600 Subject: [PATCH 074/328] ARROW-4069: [Python] Add tests for casting binary -> string/utf8. Add pyarrow.utf8() type factory alias for readability This is the Python side of ARROW-3387 to make sure all is in order there Author: Wes McKinney Closes #3215 from wesm/ARROW-4069 and squashes the following commits: eaf0cf403 Add tests for casting binary -> string/utf8. 
Add pyarrow.utf8() alias for pyarrow.string() for readability --- docs/source/python/api.rst | 1 + python/pyarrow/__init__.py | 2 +- python/pyarrow/tests/test_array.py | 20 ++++++++++++++++++++ python/pyarrow/types.pxi | 7 +++++++ 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/docs/source/python/api.rst b/docs/source/python/api.rst index 06863964978b3..064a3e9740543 100644 --- a/docs/source/python/api.rst +++ b/docs/source/python/api.rst @@ -50,6 +50,7 @@ Type and Schema Factory Functions date64 binary string + utf8 decimal128 list_ struct diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 63ed53e0ebab5..3121db68b9322 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -57,7 +57,7 @@ def parse_git(root, **kwargs): uint8, uint16, uint32, uint64, time32, time64, timestamp, date32, date64, float16, float32, float64, - binary, string, decimal128, + binary, string, utf8, decimal128, list_, struct, union, dictionary, field, type_for_alias, DataType, diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index f9bd06ee04ef7..95a60435e3460 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -725,6 +725,26 @@ def test_cast_date32_to_int(): assert result2.equals(arr) +def test_cast_binary_to_utf8(): + binary_arr = pa.array([b'foo', b'bar', b'baz'], type=pa.binary()) + utf8_arr = binary_arr.cast(pa.utf8()) + expected = pa.array(['foo', 'bar', 'baz'], type=pa.utf8()) + + assert utf8_arr.equals(expected) + + non_utf8_values = [(u'mañana').encode('utf-16-le')] + non_utf8_binary = pa.array(non_utf8_values) + assert non_utf8_binary.type == pa.binary() + with pytest.raises(ValueError): + non_utf8_binary.cast(pa.string()) + + non_utf8_all_null = pa.array(non_utf8_values, mask=np.array([True]), + type=pa.binary()) + # No error + casted = non_utf8_all_null.cast(pa.string()) + assert casted.null_count == 1 + + def test_cast_date64_to_int(): arr = pa.array(np.array([0, 1, 2], dtype='int64'), type=pa.date64()) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 9ec36bff3a6fe..d367a8a85673f 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1237,6 +1237,13 @@ def string(): return primitive_type(_Type_STRING) +def utf8(): + """ + Alias for string() + """ + return string() + + def binary(int length=-1): """ Create variable-length binary type From bfa7f11cffa58dcf44f7e1278846e373e63d1dfe Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 18 Dec 2018 18:56:24 -0600 Subject: [PATCH 075/328] ARROW-4070: [C++] Enable use of ARROW_BOOST_VENDORED with ninja-build It seems that ninja-build is a lot stricter about the dependency graph -- it seeks the root dependency of the `boost_*_static` libraries and finds targets (the absolute paths to the static libraries) that it doesn't know how to build. Setting these as the BUILD_BYPRODUCTS of the ExternalProject fixes the issue. 
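For reference, a minimal sketch of the pattern this fix relies on, assuming CMake's ExternalProject module; the project name, URL and library path are illustrative placeholders, not the actual Boost toolchain values:

    # Hypothetical external project; only the BUILD_BYPRODUCTS line matters here.
    include(ExternalProject)

    set(FOO_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/foo_ep-prefix")
    set(FOO_STATIC_LIB "${FOO_PREFIX}/lib/libfoo.a")

    ExternalProject_Add(foo_ep
      URL https://example.com/foo-1.0.tar.gz
      PREFIX ${FOO_PREFIX}
      CONFIGURE_COMMAND ./configure --prefix=${FOO_PREFIX}
      BUILD_COMMAND make
      INSTALL_COMMAND make install
      BUILD_IN_SOURCE 1
      # Without this, Ninja sees libfoo.a consumed by an imported target but
      # finds no rule producing it, and refuses to build; Make-based
      # generators tolerate the missing edge silently.
      BUILD_BYPRODUCTS "${FOO_STATIC_LIB}")

    add_library(foo_static STATIC IMPORTED)
    set_target_properties(foo_static PROPERTIES IMPORTED_LOCATION "${FOO_STATIC_LIB}")
    add_dependencies(foo_static foo_ep)

Declaring the emitted static libraries as byproducts gives Ninja a producing rule for those absolute library paths, which is what the BOOST_BUILD_PRODUCTS change below does for the vendored Boost libraries.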
I need this fix in ARROW-3803 so I'm going to cherry pick it there, and I can rebase later Author: Wes McKinney Closes #3217 from wesm/ARROW-4070 and squashes the following commits: aac135daa Use static library paths as BOOST_BUILD_PRODUCTS so that ninja-build can understand the dependency graph --- cpp/CMakeLists.txt | 3 ++- cpp/cmake_modules/ThirdpartyToolchain.cmake | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 60cbe85d10b6d..1672245924fb5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -239,7 +239,8 @@ static|shared (default shared)") ON) option(ARROW_BOOST_VENDORED - "Use vendored Boost instead of existing Boost" + "Use vendored Boost instead of existing Boost. \ +Note that this requires linking Boost statically" OFF) option(ARROW_PROTOBUF_USE_SHARED diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index d493de75a55f5..db0b69be460ce 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -369,15 +369,16 @@ if (ARROW_BOOST_VENDORED) set(BOOST_SYSTEM_LIBRARY boost_system_static) set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static) set(BOOST_REGEX_LIBRARY boost_regex_static) + if (ARROW_BOOST_HEADER_ONLY) set(BOOST_BUILD_PRODUCTS) set(BOOST_CONFIGURE_COMMAND "") set(BOOST_BUILD_COMMAND "") else() set(BOOST_BUILD_PRODUCTS - ${BOOST_SYSTEM_LIBRARY} - ${BOOST_FILESYSTEM_LIBRARY} - ${BOOST_REGEX_LIBRARY}) + ${BOOST_STATIC_SYSTEM_LIBRARY} + ${BOOST_STATIC_FILESYSTEM_LIBRARY} + ${BOOST_STATIC_REGEX_LIBRARY}) set(BOOST_CONFIGURE_COMMAND "./bootstrap.sh" "--prefix=${BOOST_PREFIX}" From 25b6a6c2c85c6afde2453459fd13ae00aa692028 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 18 Dec 2018 19:45:27 -0600 Subject: [PATCH 076/328] ARROW-4073: [Python] Fix URI parsing on Windows. Also fix test for get_library_dirs when using ARROW_HOME to develop Resolves ARROW-4074 Author: Wes McKinney Closes #3218 from wesm/ARROW-4073 and squashes the following commits: 683b68fda lint 5f2c3404b Fix URI parsing on Windows. 
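As a usage illustration of the new node type, a minimal GLib C sketch calling ggandiva_null_literal_node_new() as defined later in this patch; note that the final revision exposes the return type through the "return-type" property on GGandivaNode rather than a ggandiva_node_get_return_type() getter. The include path, the garrow_int32_data_type_new() helper and the error handling here are assumptions for the example, not part of this change:

    #include <gandiva-glib/gandiva-glib.h>

    /* Build a null literal node typed as int32 and read back its return type. */
    static GGandivaNode *
    make_int32_null_literal(GError **error)
    {
      GArrowInt32DataType *data_type = garrow_int32_data_type_new();
      GGandivaNullLiteralNode *node =
        ggandiva_null_literal_node_new(GARROW_DATA_TYPE(data_type), error);
      g_object_unref(data_type);
      if (!node) {
        return NULL;  /* *error explains why the type is not supported */
      }

      /* The return type is exposed as the "return-type" GObject property. */
      GArrowDataType *return_type = NULL;
      g_object_get(node, "return-type", &return_type, NULL);
      g_object_unref(return_type);

      return GGANDIVA_NODE(node);
    }

Because the node takes its own reference to the return type at construction time (and releases it in dispose), the caller can drop its reference to the data type immediately after creating the node, as the sketch does.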
Also fix ARROW-4074 where windows .lib files are installed in ARROW_HOME and not the usual conda/pip locations --- python/pyarrow/__init__.py | 4 ++++ python/pyarrow/parquet.py | 10 +++++++++- python/pyarrow/tests/test_misc.py | 6 ++---- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 3121db68b9322..7f0a371b4bfd2 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -231,4 +231,8 @@ def get_library_dirs(): if _os.path.exists(_os.path.join(library_lib, 'arrow.lib')): library_dirs.append(library_lib) + # ARROW-4074: Allow for ARROW_HOME to be set to some other directory + if 'ARROW_HOME' in _os.environ: + library_dirs.append(_os.path.join(_os.environ['ARROW_HOME'], 'lib')) + return library_dirs diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index a520acece972e..b8dae65a5de78 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -38,10 +38,18 @@ get_filesystem_from_uri) from pyarrow.util import _is_path_like, _stringify_path +_URI_STRIP_SCHEMES = ('hdfs',) + def _parse_uri(path): path = _stringify_path(path) - return urlparse(path).path + parsed_uri = urlparse(path) + if parsed_uri.scheme in _URI_STRIP_SCHEMES: + return parsed_uri.path + else: + # ARROW-4073: On Windows returning the path with the scheme + # stripped removes the drive letter, if any + return path def _get_filesystem_and_path(passed_filesystem, path): diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index 1c384f35d72b0..f7c316a8bafcd 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -28,10 +28,8 @@ def test_get_include(): @pytest.mark.skipif('sys.platform != "win32"') def test_get_library_dirs_win32(): - library_dirs = pa.get_library_dirs() - - library_lib = library_dirs[-1] - assert os.path.exists(os.path.join(library_lib, 'arrow.lib')) + assert any(os.path.exists(os.path.join(directory, 'arrow.lib')) + for directory in pa.get_library_dirs()) def test_cpu_count(): From 944b9e319a5f208c0fc45953d1f10972b1433020 Mon Sep 17 00:00:00 2001 From: Yosuke Shiro Date: Wed, 19 Dec 2018 11:39:09 +0900 Subject: [PATCH 077/328] ARROW-4051: [Gandiva] [GLib] Add support for null literal - Add `#GGandivaNullLiteralNode`. - Remove `return_type` property in `#GGandivaFunctionNode` to use `ggandiva_node_get_return_type()`. 
Author: Yosuke Shiro Author: Kouhei Sutou Closes #3197 from shiro615/glib-support-null-literal-node and squashes the following commits: 4f0a39f7 Fix a typo e93cd085 Simplify be30df17 Add tests for return-type of literal nodes 8b3244e7 Use data type from GArrowField as return type fcc0d8f8 Raise error for invalid input 41aada5e Fix variable names 415df817 Add return-type property in ggandiva_field_node_new_raw() c337f122 Call g_object_unref(data_type) to prevent a memory leak 64cef1d8 Use the given return_type to create GGandivaNullLiteralNode b17f5e25 Reuse return-type property instead of defining ggandiva_node_get_return_type() 3e25b0d5 Refactor null check 0ac03b4d Add missing null check 285f64b8 Fix orders of class 49d1044a Remove return_type property for using ggandiva_node_get_return_type() f78881cf Add ggandiva_node_get_return_type() 5896f0bb Add GGandivaNullLiteralNode --- c_glib/gandiva-glib/node.cpp | 292 ++++++++++++------ c_glib/gandiva-glib/node.h | 17 + c_glib/gandiva-glib/node.hpp | 3 +- .../test/gandiva/test-binary-literal-node.rb | 27 +- .../test/gandiva/test-boolean-literal-node.rb | 10 +- .../test/gandiva/test-double-literal-node.rb | 10 +- c_glib/test/gandiva/test-field-node.rb | 10 +- .../test/gandiva/test-float-literal-node.rb | 14 +- .../test/gandiva/test-int16-literal-node.rb | 10 +- .../test/gandiva/test-int32-literal-node.rb | 10 +- .../test/gandiva/test-int64-literal-node.rb | 10 +- c_glib/test/gandiva/test-int8-literal-node.rb | 10 +- c_glib/test/gandiva/test-null-literal-node.rb | 38 +++ .../test/gandiva/test-string-literal-node.rb | 10 +- .../test/gandiva/test-uint16-literal-node.rb | 10 +- .../test/gandiva/test-uint32-literal-node.rb | 10 +- .../test/gandiva/test-uint64-literal-node.rb | 10 +- .../test/gandiva/test-uint8-literal-node.rb | 10 +- 18 files changed, 372 insertions(+), 139 deletions(-) create mode 100644 c_glib/test/gandiva/test-null-literal-node.rb diff --git a/c_glib/gandiva-glib/node.cpp b/c_glib/gandiva-glib/node.cpp index cdb9724d7ebbf..709836524d848 100644 --- a/c_glib/gandiva-glib/node.cpp +++ b/c_glib/gandiva-glib/node.cpp @@ -22,6 +22,7 @@ #endif #include +#include #include #include @@ -52,6 +53,9 @@ G_BEGIN_DECLS * #GGandivaLiteralNode is a base class for a node in the expression tree, * representing a literal. * + * #GGandivaNullLiteralNode is a class for a node in the expression tree, + * representing a null literal. + * * #GGandivaBooleanLiteralNode is a class for a node in the expression tree, * representing a boolean literal. 
* @@ -96,10 +100,12 @@ G_BEGIN_DECLS typedef struct GGandivaNodePrivate_ { std::shared_ptr node; + GArrowDataType *return_type; } GGandivaNodePrivate; enum { - PROP_NODE = 1 + PROP_NODE = 1, + PROP_RETURN_TYPE }; G_DEFINE_TYPE_WITH_PRIVATE(GGandivaNode, @@ -111,6 +117,19 @@ G_DEFINE_TYPE_WITH_PRIVATE(GGandivaNode, ggandiva_node_get_instance_private( \ GGANDIVA_NODE(object))) +static void +ggandiva_node_dispose(GObject *object) +{ + auto priv = GGANDIVA_NODE_GET_PRIVATE(object); + + if (priv->return_type) { + g_object_unref(priv->return_type); + priv->return_type = nullptr; + } + + G_OBJECT_CLASS(ggandiva_node_parent_class)->dispose(object); +} + static void ggandiva_node_finalize(GObject *object) { @@ -134,6 +153,27 @@ ggandiva_node_set_property(GObject *object, priv->node = *static_cast *>(g_value_get_pointer(value)); break; + case PROP_RETURN_TYPE: + priv->return_type = GARROW_DATA_TYPE(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +ggandiva_node_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GGANDIVA_NODE_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_RETURN_TYPE: + g_value_set_object(value, priv->return_type); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -148,19 +188,28 @@ ggandiva_node_init(GGandivaNode *object) static void ggandiva_node_class_init(GGandivaNodeClass *klass) { - GParamSpec *spec; - auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = ggandiva_node_dispose; gobject_class->finalize = ggandiva_node_finalize; gobject_class->set_property = ggandiva_node_set_property; + gobject_class->get_property = ggandiva_node_get_property; + GParamSpec *spec; spec = g_param_spec_pointer("node", "Node", "The raw std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); g_object_class_install_property(gobject_class, PROP_NODE, spec); + + spec = g_param_spec_object("return-type", + "Return type", + "The return type", + GARROW_TYPE_DATA_TYPE, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_RETURN_TYPE, spec); } @@ -274,12 +323,10 @@ ggandiva_field_node_new(GArrowField *field) typedef struct GGandivaFunctionNodePrivate_ { gchar *name; GList *parameters; - GArrowDataType *return_type; } GGandivaFunctionNodePrivate; enum { - PROP_NAME = 1, - PROP_RETURN_TYPE + PROP_NAME = 1 }; G_DEFINE_TYPE_WITH_PRIVATE(GGandivaFunctionNode, @@ -305,11 +352,6 @@ ggandiva_function_node_dispose(GObject *object) priv->parameters = nullptr; } - if (priv->return_type) { - g_object_unref(priv->return_type); - priv->return_type = nullptr; - } - G_OBJECT_CLASS(ggandiva_function_node_parent_class)->dispose(object); } @@ -335,9 +377,6 @@ ggandiva_function_node_set_property(GObject *object, case PROP_NAME: priv->name = g_value_dup_string(value); break; - case PROP_RETURN_TYPE: - priv->return_type = GARROW_DATA_TYPE(g_value_dup_object(value)); - break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -356,9 +395,6 @@ ggandiva_function_node_get_property(GObject *object, case PROP_NAME: g_value_set_string(value, priv->name); break; - case PROP_RETURN_TYPE: - g_value_set_object(value, priv->return_type); - break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -390,14 +426,6 @@ ggandiva_function_node_class_init(GGandivaFunctionNodeClass *klass) 
static_cast(G_PARAM_READWRITE | G_PARAM_CONSTRUCT_ONLY)); g_object_class_install_property(gobject_class, PROP_NAME, spec); - - spec = g_param_spec_object("return-type", - "Return type", - "The return type of the function", - GARROW_TYPE_DATA_TYPE, - static_cast(G_PARAM_READWRITE | - G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_RETURN_TYPE, spec); } /** @@ -462,6 +490,50 @@ ggandiva_literal_node_class_init(GGandivaLiteralNodeClass *klass) } +G_DEFINE_TYPE(GGandivaNullLiteralNode, + ggandiva_null_literal_node, + GGANDIVA_TYPE_LITERAL_NODE) + +static void +ggandiva_null_literal_node_init(GGandivaNullLiteralNode *null_literal_node) +{ +} + +static void +ggandiva_null_literal_node_class_init(GGandivaNullLiteralNodeClass *klass) +{ +} + +/** + * ggandiva_null_literal_node_new: + * @return_type: A #GArrowDataType. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GGandivaNullLiteralNode for + * the type or %NULL on error. + * + * Since: 0.12.0 + */ +GGandivaNullLiteralNode * +ggandiva_null_literal_node_new(GArrowDataType *return_type, + GError **error) +{ + auto arrow_return_type = garrow_data_type_get_raw(return_type); + auto gandiva_node = gandiva::TreeExprBuilder::MakeNull(arrow_return_type); + if (!gandiva_node) { + g_set_error(error, + GARROW_ERROR, + GARROW_ERROR_INVALID, + "[gandiva][null-literal-node][new] " + "failed to create: <%s>", + arrow_return_type->ToString().c_str()); + return NULL; + } + return GGANDIVA_NULL_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + return_type)); +} + + G_DEFINE_TYPE(GGandivaBooleanLiteralNode, ggandiva_boolean_literal_node, GGANDIVA_TYPE_LITERAL_NODE) @@ -488,7 +560,8 @@ GGandivaBooleanLiteralNode * ggandiva_boolean_literal_node_new(gboolean value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(static_cast(value)); - return GGANDIVA_BOOLEAN_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_BOOLEAN_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -533,7 +606,8 @@ GGandivaInt8LiteralNode * ggandiva_int8_literal_node_new(gint8 value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); - return GGANDIVA_INT8_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_INT8_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -577,7 +651,8 @@ GGandivaUInt8LiteralNode * ggandiva_uint8_literal_node_new(guint8 value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); - return GGANDIVA_UINT8_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_UINT8_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -621,7 +696,8 @@ GGandivaInt16LiteralNode * ggandiva_int16_literal_node_new(gint16 value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); - return GGANDIVA_INT16_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_INT16_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -665,7 +741,8 @@ GGandivaUInt16LiteralNode * ggandiva_uint16_literal_node_new(guint16 value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); - return GGANDIVA_UINT16_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_UINT16_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -709,7 +786,8 @@ GGandivaInt32LiteralNode * ggandiva_int32_literal_node_new(gint32 value) { 
auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); - return GGANDIVA_INT32_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_INT32_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -753,7 +831,8 @@ GGandivaUInt32LiteralNode * ggandiva_uint32_literal_node_new(guint32 value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); - return GGANDIVA_UINT32_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_UINT32_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -797,7 +876,8 @@ GGandivaInt64LiteralNode * ggandiva_int64_literal_node_new(gint64 value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); - return GGANDIVA_INT64_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_INT64_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -841,7 +921,8 @@ GGandivaUInt64LiteralNode * ggandiva_uint64_literal_node_new(guint64 value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); - return GGANDIVA_UINT64_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_UINT64_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -885,7 +966,8 @@ GGandivaFloatLiteralNode * ggandiva_float_literal_node_new(gfloat value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); - return GGANDIVA_FLOAT_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_FLOAT_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -929,7 +1011,8 @@ GGandivaDoubleLiteralNode * ggandiva_double_literal_node_new(gdouble value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); - return GGANDIVA_DOUBLE_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_DOUBLE_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -1002,7 +1085,8 @@ ggandiva_binary_literal_node_new(const guint8 *value, auto gandiva_node = gandiva::TreeExprBuilder::MakeBinaryLiteral(std::string(reinterpret_cast(value), size)); - return GGANDIVA_BINARY_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_BINARY_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -1022,7 +1106,8 @@ ggandiva_binary_literal_node_new_bytes(GBytes *value) gandiva::TreeExprBuilder::MakeBinaryLiteral( std::string(reinterpret_cast(raw_value), value_size)); - auto literal_node = ggandiva_literal_node_new_raw(&gandiva_node); + auto literal_node = ggandiva_literal_node_new_raw(&gandiva_node, + NULL); auto priv = GGANDIVA_BINARY_LITERAL_NODE_GET_PRIVATE(literal_node); priv->value = value; g_bytes_ref(priv->value); @@ -1076,7 +1161,8 @@ GGandivaStringLiteralNode * ggandiva_string_literal_node_new(const gchar *value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeStringLiteral(value); - return GGANDIVA_STRING_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_STRING_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -1107,10 +1193,14 @@ GGandivaFieldNode * ggandiva_field_node_new_raw(std::shared_ptr *gandiva_node, GArrowField *field) { + auto arrow_return_type = (*gandiva_node)->return_type(); + auto return_type = garrow_field_get_data_type(field); auto field_node = g_object_new(GGANDIVA_TYPE_FIELD_NODE, "node", gandiva_node, "field", field, + "return-type", return_type, NULL); + g_object_unref(return_type); return 
GGANDIVA_FIELD_NODE(field_node); } @@ -1135,56 +1225,84 @@ ggandiva_function_node_new_raw(std::shared_ptr *gandiva_node, } GGandivaLiteralNode * -ggandiva_literal_node_new_raw(std::shared_ptr *gandiva_node) +ggandiva_literal_node_new_raw(std::shared_ptr *gandiva_node, + GArrowDataType *return_type) { - GType type; + auto gandiva_literal_node = + std::static_pointer_cast(*gandiva_node); + + GGandivaLiteralNode *literal_node; + if (gandiva_literal_node->is_null()) { + literal_node = + GGANDIVA_LITERAL_NODE(g_object_new(GGANDIVA_TYPE_NULL_LITERAL_NODE, + "node", gandiva_node, + "return-type", return_type, + NULL)); + } else { + GType type; + + auto arrow_return_type = gandiva_literal_node->return_type(); + switch (arrow_return_type->id()) { + case arrow::Type::BOOL: + type = GGANDIVA_TYPE_BOOLEAN_LITERAL_NODE; + break; + case arrow::Type::type::UINT8: + type = GGANDIVA_TYPE_UINT8_LITERAL_NODE; + break; + case arrow::Type::type::UINT16: + type = GGANDIVA_TYPE_UINT16_LITERAL_NODE; + break; + case arrow::Type::type::UINT32: + type = GGANDIVA_TYPE_UINT32_LITERAL_NODE; + break; + case arrow::Type::type::UINT64: + type = GGANDIVA_TYPE_UINT64_LITERAL_NODE; + break; + case arrow::Type::type::INT8: + type = GGANDIVA_TYPE_INT8_LITERAL_NODE; + break; + case arrow::Type::type::INT16: + type = GGANDIVA_TYPE_INT16_LITERAL_NODE; + break; + case arrow::Type::type::INT32: + type = GGANDIVA_TYPE_INT32_LITERAL_NODE; + break; + case arrow::Type::type::INT64: + type = GGANDIVA_TYPE_INT64_LITERAL_NODE; + break; + case arrow::Type::type::FLOAT: + type = GGANDIVA_TYPE_FLOAT_LITERAL_NODE; + break; + case arrow::Type::type::DOUBLE: + type = GGANDIVA_TYPE_DOUBLE_LITERAL_NODE; + break; + case arrow::Type::type::STRING: + type = GGANDIVA_TYPE_STRING_LITERAL_NODE; + break; + case arrow::Type::type::BINARY: + type = GGANDIVA_TYPE_BINARY_LITERAL_NODE; + break; + default: + type = GGANDIVA_TYPE_LITERAL_NODE; + break; + } - switch ((*gandiva_node)->return_type()->id()) { - case arrow::Type::BOOL: - type = GGANDIVA_TYPE_BOOLEAN_LITERAL_NODE; - break; - case arrow::Type::type::UINT8: - type = GGANDIVA_TYPE_UINT8_LITERAL_NODE; - break; - case arrow::Type::type::UINT16: - type = GGANDIVA_TYPE_UINT16_LITERAL_NODE; - break; - case arrow::Type::type::UINT32: - type = GGANDIVA_TYPE_UINT32_LITERAL_NODE; - break; - case arrow::Type::type::UINT64: - type = GGANDIVA_TYPE_UINT64_LITERAL_NODE; - break; - case arrow::Type::type::INT8: - type = GGANDIVA_TYPE_INT8_LITERAL_NODE; - break; - case arrow::Type::type::INT16: - type = GGANDIVA_TYPE_INT16_LITERAL_NODE; - break; - case arrow::Type::type::INT32: - type = GGANDIVA_TYPE_INT32_LITERAL_NODE; - break; - case arrow::Type::type::INT64: - type = GGANDIVA_TYPE_INT64_LITERAL_NODE; - break; - case arrow::Type::type::FLOAT: - type = GGANDIVA_TYPE_FLOAT_LITERAL_NODE; - break; - case arrow::Type::type::DOUBLE: - type = GGANDIVA_TYPE_DOUBLE_LITERAL_NODE; - break; - case arrow::Type::type::STRING: - type = GGANDIVA_TYPE_STRING_LITERAL_NODE; - break; - case arrow::Type::type::BINARY: - type = GGANDIVA_TYPE_BINARY_LITERAL_NODE; - break; - default: - type = GGANDIVA_TYPE_LITERAL_NODE; - break; + if (return_type) { + literal_node = + GGANDIVA_LITERAL_NODE(g_object_new(type, + "node", gandiva_node, + "return-type", return_type, + NULL)); + } else { + return_type = garrow_data_type_new_raw(&arrow_return_type); + literal_node = + GGANDIVA_LITERAL_NODE(g_object_new(type, + "node", gandiva_node, + "return-type", return_type, + NULL)); + g_object_unref(return_type); + } } - auto literal_node = 
GGANDIVA_LITERAL_NODE(g_object_new(type, - "node", gandiva_node, - NULL)); + return literal_node; } diff --git a/c_glib/gandiva-glib/node.h b/c_glib/gandiva-glib/node.h index 183003fd9f68a..d9e67e27b7eea 100644 --- a/c_glib/gandiva-glib/node.h +++ b/c_glib/gandiva-glib/node.h @@ -35,6 +35,7 @@ struct _GGandivaNodeClass GObjectClass parent_class; }; + #define GGANDIVA_TYPE_FIELD_NODE (ggandiva_field_node_get_type()) G_DECLARE_DERIVABLE_TYPE(GGandivaFieldNode, ggandiva_field_node, @@ -80,6 +81,22 @@ struct _GGandivaLiteralNodeClass }; +#define GGANDIVA_TYPE_NULL_LITERAL_NODE (ggandiva_null_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaNullLiteralNode, + ggandiva_null_literal_node, + GGANDIVA, + NULL_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaNullLiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaNullLiteralNode * +ggandiva_null_literal_node_new(GArrowDataType *return_type, + GError **error); + + #define GGANDIVA_TYPE_BOOLEAN_LITERAL_NODE (ggandiva_boolean_literal_node_get_type()) G_DECLARE_DERIVABLE_TYPE(GGandivaBooleanLiteralNode, ggandiva_boolean_literal_node, diff --git a/c_glib/gandiva-glib/node.hpp b/c_glib/gandiva-glib/node.hpp index 7ff136003f174..40f9d1b465591 100644 --- a/c_glib/gandiva-glib/node.hpp +++ b/c_glib/gandiva-glib/node.hpp @@ -36,4 +36,5 @@ ggandiva_function_node_new_raw(std::shared_ptr *gandiva_node, GList *parameters, GArrowDataType *return_type); GGandivaLiteralNode * -ggandiva_literal_node_new_raw(std::shared_ptr *gandiva_node); +ggandiva_literal_node_new_raw(std::shared_ptr *gandiva_node, + GArrowDataType *return_type); diff --git a/c_glib/test/gandiva/test-binary-literal-node.rb b/c_glib/test/gandiva/test-binary-literal-node.rb index 93a54a361cc82..fddf74830d4ab 100644 --- a/c_glib/test/gandiva/test-binary-literal-node.rb +++ b/c_glib/test/gandiva/test-binary-literal-node.rb @@ -21,14 +21,27 @@ def setup @value = "\x00\x01\x02\x03\x04" end - def test_new - literal_node = Gandiva::BinaryLiteralNode.new(@value) - assert_equal(@value, literal_node.value.to_s) + sub_test_case(".new") do + def test_string + node = Gandiva::BinaryLiteralNode.new(@value) + assert_equal(@value, node.value.to_s) + end + + def test_bytes + bytes_value = GLib::Bytes.new(@value) + node = Gandiva::BinaryLiteralNode.new(bytes_value) + assert_equal(@value, node.value.to_s) + end end - def test_new_bytes - bytes_value = GLib::Bytes.new(@value) - literal_node = Gandiva::BinaryLiteralNode.new(bytes_value) - assert_equal(@value, literal_node.value.to_s) + sub_test_case("instance methods") do + def setup + super + @node = Gandiva::BinaryLiteralNode.new(@value) + end + + def test_return_type + assert_equal(Arrow::BinaryDataType.new, @node.return_type) + end end end diff --git a/c_glib/test/gandiva/test-boolean-literal-node.rb b/c_glib/test/gandiva/test-boolean-literal-node.rb index 3d1f10c5e81c1..6e18a76218595 100644 --- a/c_glib/test/gandiva/test-boolean-literal-node.rb +++ b/c_glib/test/gandiva/test-boolean-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaBooleanLiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = true + @node = Gandiva::BooleanLiteralNode.new(@value) end def test_value - value = true - literal_node = Gandiva::BooleanLiteralNode.new(value) - assert_equal(value, literal_node.value?) + assert_equal(@value, @node.value?) 
+ end + + def test_return_type + assert_equal(Arrow::BooleanDataType.new, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-double-literal-node.rb b/c_glib/test/gandiva/test-double-literal-node.rb index fd4bd08e4c254..27cc3aea23b32 100644 --- a/c_glib/test/gandiva/test-double-literal-node.rb +++ b/c_glib/test/gandiva/test-double-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaDoubleLiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = 1.5 + @node = Gandiva::DoubleLiteralNode.new(@value) end def test_value - value = 1.5 - literal_node = Gandiva::DoubleLiteralNode.new(value) - assert_equal(value, literal_node.value) + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::DoubleDataType.new, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-field-node.rb b/c_glib/test/gandiva/test-field-node.rb index c5bfe6cfc9743..51db285bcc0bf 100644 --- a/c_glib/test/gandiva/test-field-node.rb +++ b/c_glib/test/gandiva/test-field-node.rb @@ -18,11 +18,15 @@ class TestGandivaFieldNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @field = Arrow::Field.new("valid", Arrow::BooleanDataType.new) + @node = Gandiva::FieldNode.new(@field) end def test_field - field = Arrow::Field.new("valid", Arrow::BooleanDataType.new) - field_node = Gandiva::FieldNode.new(field) - assert_equal(field, field_node.field) + assert_equal(@field, @node.field) + end + + def test_return_type + assert_equal(@field.data_type, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-float-literal-node.rb b/c_glib/test/gandiva/test-float-literal-node.rb index 202ec38fc5907..4a49eb37441d1 100644 --- a/c_glib/test/gandiva/test-float-literal-node.rb +++ b/c_glib/test/gandiva/test-float-literal-node.rb @@ -18,17 +18,15 @@ class TestGandivaFloatLiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = 1.5 + @node = Gandiva::FloatLiteralNode.new(@value) end - def test_new - assert_nothing_raised do - Gandiva::FloatLiteralNode.new(1.5) - end + def test_value + assert_equal(@value, @node.value) end - def test_value - value = 1.5 - literal_node = Gandiva::FloatLiteralNode.new(value) - assert_equal(value, literal_node.value) + def test_return_type + assert_equal(Arrow::FloatDataType.new, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-int16-literal-node.rb b/c_glib/test/gandiva/test-int16-literal-node.rb index 9b5bb6822ebba..f8e6b26849496 100644 --- a/c_glib/test/gandiva/test-int16-literal-node.rb +++ b/c_glib/test/gandiva/test-int16-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaInt16LiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = -(2 ** 15) + @node = Gandiva::Int16LiteralNode.new(@value) end def test_value - value = -3 - literal_node = Gandiva::Int16LiteralNode.new(value) - assert_equal(value, literal_node.value) + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::Int16DataType.new, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-int32-literal-node.rb b/c_glib/test/gandiva/test-int32-literal-node.rb index 9c94cdef4b125..3d1bf588cf7dc 100644 --- a/c_glib/test/gandiva/test-int32-literal-node.rb +++ b/c_glib/test/gandiva/test-int32-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaInt32LiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + 
@value = -(2 ** 31) + @node = Gandiva::Int32LiteralNode.new(@value) end def test_value - value = -3 - literal_node = Gandiva::Int32LiteralNode.new(value) - assert_equal(value, literal_node.value) + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::Int32DataType.new, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-int64-literal-node.rb b/c_glib/test/gandiva/test-int64-literal-node.rb index e1b4b91d8c32c..b2ca3bf630b43 100644 --- a/c_glib/test/gandiva/test-int64-literal-node.rb +++ b/c_glib/test/gandiva/test-int64-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaInt64LiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = -(2 ** 63) + @node = Gandiva::Int64LiteralNode.new(@value) end def test_value - value = -3 - literal_node = Gandiva::Int64LiteralNode.new(value) - assert_equal(value, literal_node.value) + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::Int64DataType.new, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-int8-literal-node.rb b/c_glib/test/gandiva/test-int8-literal-node.rb index 30f11fc81a60d..8d917bd1b4dfe 100644 --- a/c_glib/test/gandiva/test-int8-literal-node.rb +++ b/c_glib/test/gandiva/test-int8-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaInt8LiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = -(2 ** 7) + @node = Gandiva::Int8LiteralNode.new(@value) end def test_value - value = -3 - literal_node = Gandiva::Int8LiteralNode.new(value) - assert_equal(value, literal_node.value) + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::Int8DataType.new, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-null-literal-node.rb b/c_glib/test/gandiva/test-null-literal-node.rb new file mode 100644 index 0000000000000..ae14f3c15e411 --- /dev/null +++ b/c_glib/test/gandiva/test-null-literal-node.rb @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
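[Editorial aside, not part of the patch] The Ruby test below exercises the new constructor through the bindings; for readers working against the C API directly, the call added by this patch looks roughly like the following sketch (error handling abbreviated; garrow_boolean_data_type_new() is the existing arrow-glib constructor for a boolean data type):

    /* Sketch only: build a typed null literal with the API added in this patch. */
    GError *error = NULL;
    GArrowDataType *return_type = GARROW_DATA_TYPE(garrow_boolean_data_type_new());
    GGandivaNullLiteralNode *node =
        ggandiva_null_literal_node_new(return_type, &error);
    if (node == NULL) {
      /* MakeNull() rejects types it cannot build a null literal for
         (e.g. the null type) and reports GARROW_ERROR_INVALID,
         as the test_invalid_type case below checks. */
      g_printerr("ggandiva_null_literal_node_new failed: %s\n", error->message);
      g_error_free(error);
    } else {
      g_object_unref(node);
    }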
+ +class TestGandivaNullLiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_invalid_type + return_type = Arrow::NullDataType.new + message = + "[gandiva][null-literal-node][new] " + + "failed to create: <#{return_type}>" + assert_raise(Arrow::Error::Invalid.new(message)) do + Gandiva::NullLiteralNode.new(return_type) + end + end + + def test_return_type + return_type = Arrow::BooleanDataType.new + literal_node = Gandiva::NullLiteralNode.new(return_type) + assert_equal(return_type, literal_node.return_type) + end +end diff --git a/c_glib/test/gandiva/test-string-literal-node.rb b/c_glib/test/gandiva/test-string-literal-node.rb index a231f6111f40f..8a397ab4d1a9b 100644 --- a/c_glib/test/gandiva/test-string-literal-node.rb +++ b/c_glib/test/gandiva/test-string-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaStringLiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = "Hello" + @node = Gandiva::StringLiteralNode.new(@value) end def test_value - value = "Hello" - literal_node = Gandiva::StringLiteralNode.new(value) - assert_equal(value, literal_node.value) + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::StringDataType.new, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-uint16-literal-node.rb b/c_glib/test/gandiva/test-uint16-literal-node.rb index e8bdd308969bb..971da38881df6 100644 --- a/c_glib/test/gandiva/test-uint16-literal-node.rb +++ b/c_glib/test/gandiva/test-uint16-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaUInt16LiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = 2 ** 16 - 1 + @node = Gandiva::UInt16LiteralNode.new(@value) end def test_value - value = 3 - literal_node = Gandiva::UInt16LiteralNode.new(value) - assert_equal(value, literal_node.value) + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::UInt16DataType.new, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-uint32-literal-node.rb b/c_glib/test/gandiva/test-uint32-literal-node.rb index 9d5995774dd97..8fcab7fefad87 100644 --- a/c_glib/test/gandiva/test-uint32-literal-node.rb +++ b/c_glib/test/gandiva/test-uint32-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaUInt32LiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = 2 ** 32 - 1 + @node = Gandiva::UInt32LiteralNode.new(@value) end def test_value - value = 3 - literal_node = Gandiva::UInt32LiteralNode.new(value) - assert_equal(value, literal_node.value) + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::UInt32DataType.new, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-uint64-literal-node.rb b/c_glib/test/gandiva/test-uint64-literal-node.rb index 56c46db81bd24..d5afddcd75f44 100644 --- a/c_glib/test/gandiva/test-uint64-literal-node.rb +++ b/c_glib/test/gandiva/test-uint64-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaUInt64LiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = 3 + @node = Gandiva::UInt64LiteralNode.new(@value) end def test_value - value = 3 - literal_node = Gandiva::UInt64LiteralNode.new(value) - assert_equal(value, literal_node.value) + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::UInt64DataType.new, @node.return_type) end end diff 
--git a/c_glib/test/gandiva/test-uint8-literal-node.rb b/c_glib/test/gandiva/test-uint8-literal-node.rb index 04f76cd76326f..8ce91d599f435 100644 --- a/c_glib/test/gandiva/test-uint8-literal-node.rb +++ b/c_glib/test/gandiva/test-uint8-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaUInt8LiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = 2 ** 8 - 1 + @node = Gandiva::UInt8LiteralNode.new(@value) end def test_value - value = 3 - literal_node = Gandiva::UInt8LiteralNode.new(value) - assert_equal(value, literal_node.value) + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::UInt8DataType.new, @node.return_type) end end From b8d4477ffbe5a569521828964277e7d6ea115671 Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Wed, 19 Dec 2018 13:57:13 +0100 Subject: [PATCH 078/328] ARROW-3989: [Rust] [CSV] Cast bool string to lower case in reader The csv reader currently only handles boolean types if the string is explicitly `true|false`. Excel saves bools as `TRUE|FALSE`, and Python/Pandas as `True|False`. This PR adds a condition that lowercases booleans when casting them to Arrow types. @andygrove @sunchao I believe it's ready for review. Author: Neville Dipale Closes #3214 from nevi-me/rust/boolean-case and squashes the following commits: 38d99426 move primitive array builder into Reader 9fae4428 move is_boolean_type check out of loop, remove duplicate impl Reader 2a86b527 : Cast timestamp string to lower case to handle True, TRUE ... --- rust/src/csv/reader.rs | 79 ++++++++++++++++++++---------------- rust/test/data/null_test.csv | 8 ++-- 2 files changed, 47 insertions(+), 40 deletions(-) diff --git a/rust/src/csv/reader.rs b/rust/src/csv/reader.rs index 632aa7ae7936d..b9c46fc3217cc 100644 --- a/rust/src/csv/reader.rs +++ b/rust/src/csv/reader.rs @@ -87,32 +87,7 @@ impl Reader { batch_size, } } -} -fn build_primitive_array( - rows: &[StringRecord], - col_idx: &usize, -) -> Result { - let mut builder = PrimitiveArrayBuilder::::new(rows.len()); - for row_index in 0..rows.len() { - match rows[row_index].get(*col_idx) { - Some(s) if s.len() > 0 => match s.parse::() { - Ok(v) => builder.push(v)?, - Err(_) => { - // TODO: we should surface the underlying error here. 
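// [Editorial aside, not part of this diff] The essence of the change below is to
// normalize case before parsing boolean columns, roughly:
//
//     let t = if is_boolean_type {
//         s.to_lowercase().parse::<T::Native>()
//     } else {
//         s.parse::<T::Native>()
//     };
//
// so that values written as "True" (pandas) or "TRUE" (Excel) parse the same way
// as "true", while non-boolean columns keep the original parse path. (Sketch only;
// the exact generic bounds follow the PrimitiveArrayBuilder used in this file.)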
- return Err(ArrowError::ParseError(format!( - "Error while parsing value {}", - s - ))); - } - }, - _ => builder.push_null().unwrap(), - } - } - Ok(Arc::new(builder.finish()) as ArrayRef) -} - -impl Reader { /// Read the next batch of rows pub fn next(&mut self) -> Result> { // read a batch of rows into memory @@ -151,17 +126,17 @@ impl Reader { .map(|i| { let field = self.schema.field(*i); match field.data_type() { - &DataType::Boolean => build_primitive_array::(rows, i), - &DataType::Int8 => build_primitive_array::(rows, i), - &DataType::Int16 => build_primitive_array::(rows, i), - &DataType::Int32 => build_primitive_array::(rows, i), - &DataType::Int64 => build_primitive_array::(rows, i), - &DataType::UInt8 => build_primitive_array::(rows, i), - &DataType::UInt16 => build_primitive_array::(rows, i), - &DataType::UInt32 => build_primitive_array::(rows, i), - &DataType::UInt64 => build_primitive_array::(rows, i), - &DataType::Float32 => build_primitive_array::(rows, i), - &DataType::Float64 => build_primitive_array::(rows, i), + &DataType::Boolean => self.build_primitive_array::(rows, i), + &DataType::Int8 => self.build_primitive_array::(rows, i), + &DataType::Int16 => self.build_primitive_array::(rows, i), + &DataType::Int32 => self.build_primitive_array::(rows, i), + &DataType::Int64 => self.build_primitive_array::(rows, i), + &DataType::UInt8 => self.build_primitive_array::(rows, i), + &DataType::UInt16 => self.build_primitive_array::(rows, i), + &DataType::UInt32 => self.build_primitive_array::(rows, i), + &DataType::UInt64 => self.build_primitive_array::(rows, i), + &DataType::Float32 => self.build_primitive_array::(rows, i), + &DataType::Float64 => self.build_primitive_array::(rows, i), &DataType::Utf8 => { let values_builder: UInt8Builder = UInt8Builder::new(rows.len()); let mut list_builder = ListArrayBuilder::new(values_builder); @@ -191,6 +166,38 @@ impl Reader { Err(e) => Err(e), } } + + fn build_primitive_array( + &self, + rows: &[StringRecord], + col_idx: &usize, + ) -> Result { + let mut builder = PrimitiveArrayBuilder::::new(rows.len()); + let is_boolean_type = *self.schema.field(*col_idx).data_type() == DataType::Boolean; + for row_index in 0..rows.len() { + match rows[row_index].get(*col_idx) { + Some(s) if s.len() > 0 => { + let t = if is_boolean_type { + s.to_lowercase().parse::() + } else { + s.parse::() + }; + match t { + Ok(v) => builder.push(v)?, + Err(_) => { + // TODO: we should surface the underlying error here. + return Err(ArrowError::ParseError(format!( + "Error while parsing value {}", + s + ))); + } + } + } + _ => builder.push_null()?, + } + } + Ok(Arc::new(builder.finish()) as ArrayRef) + } } #[cfg(test)] diff --git a/rust/test/data/null_test.csv b/rust/test/data/null_test.csv index 80830606563b3..7e0dde5371429 100644 --- a/rust/test/data/null_test.csv +++ b/rust/test/data/null_test.csv @@ -1,6 +1,6 @@ c_int,c_float,c_string,c_bool -1,1.1,"1.11",true -2,2.2,"2.22",true +1,1.1,"1.11",True +2,2.2,"2.22",TRUE 3,,"3.33",true -4,4.4,,false -5,6.6,"",false \ No newline at end of file +4,4.4,,False +5,6.6,"",FALSE \ No newline at end of file From cec8d23dd48e764064adcfdfb33b13989fd3b667 Mon Sep 17 00:00:00 2001 From: cav71 Date: Wed, 19 Dec 2018 15:52:23 +0100 Subject: [PATCH 079/328] ARROW-4066: [Doc] Instructions to create Sphinx documentation Document how to build the documentation in the Python docs. 
Author: cav71 Author: Antoine Pitrou Closes #3198 from cav71/documentation and squashes the following commits: 9af13754 Missing word 1389e19d Remove spurious newlines 61b32356 Some improvements e21fdd7a update documentation 5ce1cf45 update documentation e5e6c4de Merge remote-tracking branch 'upstream/master' into documentation c132dffe update doc following comments from: https://github.com/apache/arrow/pull/3198 f3620520 doc doc --- docs/source/python/development.rst | 59 +++++++++++++++++++++++++++++- docs/source/python/index.rst | 17 +++++---- docs/source/python/install.rst | 4 +- 3 files changed, 68 insertions(+), 12 deletions(-) diff --git a/docs/source/python/development.rst b/docs/source/python/development.rst index 4258feef79f44..1dcfda862817f 100644 --- a/docs/source/python/development.rst +++ b/docs/source/python/development.rst @@ -86,6 +86,8 @@ On Linux and OSX: --file arrow/ci/conda_env_python.yml \ python=3.6 + source activate pyarrow-dev + On Windows: .. code-block:: shell @@ -95,16 +97,18 @@ On Windows: --file arrow\ci\conda_env_python.yml ^ python=3.6 + activate pyarrow-dev + We need to set some environment variables to let Arrow's build system know about our build toolchain: .. code-block:: shell export ARROW_BUILD_TYPE=release - export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX export ARROW_HOME=$CONDA_PREFIX export PARQUET_HOME=$CONDA_PREFIX + export BOOST_HOME=$CONDA_PREFIX Using pip ~~~~~~~~~ @@ -207,9 +211,10 @@ Now, build pyarrow: .. code-block:: shell - cd arrow/python + pushd arrow/python python setup.py build_ext --build-type=$ARROW_BUILD_TYPE \ --with-parquet --with-plasma --inplace + popd If you did not build with plasma, you can omit ``--with-plasma``. @@ -352,3 +357,53 @@ Getting ``python-test.exe`` to run is a bit tricky because your set PYTHONHOME=%CONDA_PREFIX% Now ``python-test.exe`` or simply ``ctest`` (to run all tests) should work. + +Building the Documentation +========================== + +Prerequisites +------------- + +The documentation build process uses `Doxygen `_ and +`Sphinx `_ along with a few extensions. + +If you're using Conda, the required software can be installed in a single line: + +.. code-block:: shell + + conda install -c conda-forge --file ci/conda_env_sphinx.yml + +Otherwise, you'll first need to install `Doxygen `_ +yourself (for example from your distribution's official repositories, if +using Linux). Then you can install the Python-based requirements with the +following command: + +.. code-block:: shell + + pip install -r docs/requirements.txt + +Building +-------- + +These two steps are mandatory and must be executed in order. + +#. Process the C++ API using Doxygen + + .. code-block:: shell + + pushd cpp/apidoc + doxygen + popd + +#. Build the complete documentation using Sphinx + + .. code-block:: shell + + pushd docs + make html + popd + +After these steps are completed, the documentation is rendered in HTML +format in ``docs/_build/html``. In particular, you can point your browser +at ``docs/_build/html/index.html`` to read the docs and review any changes +you made. diff --git a/docs/source/python/index.rst b/docs/source/python/index.rst index 56282192b170b..cf691e37eaa25 100644 --- a/docs/source/python/index.rst +++ b/docs/source/python/index.rst @@ -18,21 +18,22 @@ Python bindings =============== -The Arrow Python bindings have first-class integration with NumPy, pandas, and -built-in Python objects. They are based on the C++ implementation of Arrow. - This is the documentation of the Python API of Apache Arrow. 
For more details -on the format and other language bindings see the parent documentation. -Here will we only detail the usage of the Python API for Arrow and the leaf +on the Arrow format and other language bindings see the +:doc:`parent documentation <../index>`. + +The Arrow Python bindings (also named "PyArrow") have first-class integration +with NumPy, pandas, and built-in Python objects. They are based on the C++ +implementation of Arrow. + +Here will we detail the usage of the Python API for Arrow and the leaf libraries that add additional functionality such as reading Apache Parquet files into Arrow structures. .. toctree:: :maxdepth: 2 - :caption: Getting Started install - development memory data ipc @@ -44,5 +45,5 @@ files into Arrow structures. parquet extending api + development getting_involved - diff --git a/docs/source/python/install.rst b/docs/source/python/install.rst index d07d9004d2632..8092b6ce6a0ef 100644 --- a/docs/source/python/install.rst +++ b/docs/source/python/install.rst @@ -15,8 +15,8 @@ .. specific language governing permissions and limitations .. under the License. -Install PyArrow -=============== +Installing PyArrow +================== Conda ----- From 6bfac93ab1c133190d782683df4054f98b2007e5 Mon Sep 17 00:00:00 2001 From: shyam Date: Wed, 19 Dec 2018 09:10:16 -0600 Subject: [PATCH 080/328] ARROW-3979 : [Gandiva] fix all valgrind reported errors Fix all the issues reported by valgrind and also enable option ARROW_TRAVIS_VALGRIND. Author: shyam Closes #3201 from shyambits2004/master and squashes the following commits: 81d5b7669 ARROW-3979 : fix all valgrind reported errors --- .travis.yml | 5 +-- cpp/src/gandiva/bitmap_accumulator_test.cc | 7 ++- cpp/src/gandiva/eval_batch.h | 2 +- cpp/src/gandiva/exported_funcs_registry.h | 8 ++-- cpp/src/gandiva/local_bitmaps_holder.h | 6 +-- cpp/src/gandiva/precompiled/CMakeLists.txt | 2 +- cpp/src/gandiva/projector.cc | 6 +++ cpp/src/gandiva/selection_vector_test.cc | 51 ++++++++++------------ cpp/src/gandiva/tests/projector_test.cc | 9 ++-- cpp/valgrind.supp | 13 +++++- 10 files changed, 62 insertions(+), 47 deletions(-) diff --git a/.travis.yml b/.travis.yml index bf0261b3fa1ea..64408128fe146 100644 --- a/.travis.yml +++ b/.travis.yml @@ -114,8 +114,7 @@ matrix: - ARROW_TRAVIS_OPTIONAL_INSTALL=1 - ARROW_CPP_BUILD_TARGETS="gandiva-all" - ARROW_TRAVIS_USE_TOOLCHAIN=1 - # ARROW-3979 temporarily disabled. 
- - ARROW_TRAVIS_VALGRIND=0 + - ARROW_TRAVIS_VALGRIND=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9" before_script: @@ -123,7 +122,7 @@ matrix: - if [ $ARROW_CI_CPP_AFFECTED != "1" ] && [ $ARROW_CI_JAVA_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh - - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh --only-library + - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh script: - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_cpp.sh - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh diff --git a/cpp/src/gandiva/bitmap_accumulator_test.cc b/cpp/src/gandiva/bitmap_accumulator_test.cc index fc89421344e83..53e8aaca21ff1 100644 --- a/cpp/src/gandiva/bitmap_accumulator_test.cc +++ b/cpp/src/gandiva/bitmap_accumulator_test.cc @@ -32,9 +32,8 @@ class TestBitMapAccumulator : public ::testing::Test { int nrecords); }; -void TestBitMapAccumulator::FillBitMap(uint8_t* bmap, int nrecords) { - int nbytes = nrecords / 8; - unsigned int cur; +void TestBitMapAccumulator::FillBitMap(uint8_t* bmap, int nbytes) { + unsigned int cur = 0; for (int i = 0; i < nbytes; ++i) { rand_r(&cur); @@ -62,7 +61,7 @@ TEST_F(TestBitMapAccumulator, TestIntersectBitMaps) { uint8_t expected_bitmap[length]; for (int i = 0; i < 4; i++) { - FillBitMap(src_bitmaps[i], nrecords); + FillBitMap(src_bitmaps[i], length); } for (int i = 0; i < 4; i++) { diff --git a/cpp/src/gandiva/eval_batch.h b/cpp/src/gandiva/eval_batch.h index 608f4200ce415..093968f232afb 100644 --- a/cpp/src/gandiva/eval_batch.h +++ b/cpp/src/gandiva/eval_batch.h @@ -85,7 +85,7 @@ class EvalBatch { /// An array of 'num_buffers_', each containing a buffer. The buffer /// sizes depends on the data type, but all of them have the same /// number of slots (equal to num_records_). - std::unique_ptr buffers_array_; + std::unique_ptr buffers_array_; std::unique_ptr local_bitmaps_holder_; diff --git a/cpp/src/gandiva/exported_funcs_registry.h b/cpp/src/gandiva/exported_funcs_registry.h index 511ec9c212468..35ad5c0fae516 100644 --- a/cpp/src/gandiva/exported_funcs_registry.h +++ b/cpp/src/gandiva/exported_funcs_registry.h @@ -18,6 +18,7 @@ #ifndef GANDIVA_EXPORTED_FUNCS_REGISTRY_H #define GANDIVA_EXPORTED_FUNCS_REGISTRY_H +#include #include #include @@ -30,12 +31,12 @@ class ExportedFuncsBase; /// LLVM/IR code. class ExportedFuncsRegistry { public: - using list_type = std::vector; + using list_type = std::vector>; // Add functions from all the registered classes to the engine. static void AddMappings(Engine* engine); - static bool Register(ExportedFuncsBase* entry) { + static bool Register(std::shared_ptr entry) { registered().push_back(entry); return true; } @@ -48,7 +49,8 @@ class ExportedFuncsRegistry { }; #define REGISTER_EXPORTED_FUNCS(classname) \ - static bool _registered_##classname = ExportedFuncsRegistry::Register(new classname) + static bool _registered_##classname = \ + ExportedFuncsRegistry::Register(std::make_shared()) } // namespace gandiva diff --git a/cpp/src/gandiva/local_bitmaps_holder.h b/cpp/src/gandiva/local_bitmaps_holder.h index 1dc82562e3110..ae0ba53e99003 100644 --- a/cpp/src/gandiva/local_bitmaps_holder.h +++ b/cpp/src/gandiva/local_bitmaps_holder.h @@ -50,10 +50,10 @@ class LocalBitMapsHolder { int64_t num_records_; /// A container of 'local_bitmaps_', each sized to accomodate 'num_records'. - std::vector> local_bitmaps_vec_; + std::vector> local_bitmaps_vec_; /// An array of the local bitmaps. 
- std::unique_ptr local_bitmaps_array_; + std::unique_ptr local_bitmaps_array_; int64_t local_bitmap_size_; }; @@ -72,7 +72,7 @@ inline LocalBitMapsHolder::LocalBitMapsHolder(int64_t num_records, int num_local // Alloc 'num_local_bitmaps_' number of bitmaps, each of capacity 'num_records_'. for (int i = 0; i < num_local_bitmaps; ++i) { // TODO : round-up to a slab friendly multiple. - std::unique_ptr bitmap(new uint8_t[local_bitmap_size_]); + std::unique_ptr bitmap(new uint8_t[local_bitmap_size_]); // keep pointer to the bitmap in the array. (local_bitmaps_array_.get())[i] = bitmap.get(); diff --git a/cpp/src/gandiva/precompiled/CMakeLists.txt b/cpp/src/gandiva/precompiled/CMakeLists.txt index 2af49084bf310..21a74bd4916ee 100644 --- a/cpp/src/gandiva/precompiled/CMakeLists.txt +++ b/cpp/src/gandiva/precompiled/CMakeLists.txt @@ -65,7 +65,7 @@ function(add_precompiled_unit_test REL_TEST_NAME) ) target_compile_definitions(${TEST_NAME} PRIVATE GANDIVA_UNIT_TEST=1) add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME}) - set_property(TEST ${TEST_NAME} PROPERTY LABELS gandiva;unittest ${TEST_NAME}) + set_property(TEST ${TEST_NAME} PROPERTY LABELS gandiva-tests {TEST_NAME}) endfunction(add_precompiled_unit_test REL_TEST_NAME) # testing diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index 8020a45b3d302..40fdc201133a4 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -175,6 +175,12 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, astatus = arrow::AllocateBuffer(pool, data_len, &data); ARROW_RETURN_NOT_OK(astatus); + // Valgrind detects unitialized memory at byte level. Boolean types use bits + // and can leave buffer memory uninitialized in the last byte. + if (type->id() == arrow::Type::BOOL) { + data->mutable_data()[data_len - 1] = 0; + } + *array_data = arrow::ArrayData::Make(type, num_records, {null_bitmap, data}); return Status::OK(); } diff --git a/cpp/src/gandiva/selection_vector_test.cc b/cpp/src/gandiva/selection_vector_test.cc index acb0f338cd6ae..67389273c82f2 100644 --- a/cpp/src/gandiva/selection_vector_test.cc +++ b/cpp/src/gandiva/selection_vector_test.cc @@ -18,6 +18,7 @@ #include "gandiva/selection_vector.h" #include +#include #include @@ -102,15 +103,14 @@ TEST_F(TestSelectionVector, TestInt16PopulateFromBitMap) { EXPECT_EQ(status.ok(), true) << status.message(); int bitmap_size = RoundUpNumi64(max_slots) * 8; - std::unique_ptr bitmap(new uint8_t[bitmap_size]); - memset(bitmap.get(), 0, bitmap_size); + std::vector bitmap(bitmap_size); - arrow::BitUtil::SetBit(bitmap.get(), 0); - arrow::BitUtil::SetBit(bitmap.get(), 5); - arrow::BitUtil::SetBit(bitmap.get(), 121); - arrow::BitUtil::SetBit(bitmap.get(), 220); + arrow::BitUtil::SetBit(&bitmap[0], 0); + arrow::BitUtil::SetBit(&bitmap[0], 5); + arrow::BitUtil::SetBit(&bitmap[0], 121); + arrow::BitUtil::SetBit(&bitmap[0], 220); - status = selection->PopulateFromBitMap(bitmap.get(), bitmap_size, max_slots - 1); + status = selection->PopulateFromBitMap(&bitmap[0], bitmap_size, max_slots - 1); EXPECT_EQ(status.ok(), true) << status.message(); EXPECT_EQ(selection->GetNumSlots(), 3); @@ -127,15 +127,14 @@ TEST_F(TestSelectionVector, TestInt16PopulateFromBitMapNegative) { EXPECT_EQ(status.ok(), true) << status.message(); int bitmap_size = 16; - std::unique_ptr bitmap(new uint8_t[bitmap_size]); - memset(bitmap.get(), 0, bitmap_size); + std::vector bitmap(bitmap_size); - arrow::BitUtil::SetBit(bitmap.get(), 0); - arrow::BitUtil::SetBit(bitmap.get(), 1); - 
arrow::BitUtil::SetBit(bitmap.get(), 2); + arrow::BitUtil::SetBit(&bitmap[0], 0); + arrow::BitUtil::SetBit(&bitmap[0], 1); + arrow::BitUtil::SetBit(&bitmap[0], 2); // The bitmap has three set bits, whereas the selection vector has capacity for only 2. - status = selection->PopulateFromBitMap(bitmap.get(), bitmap_size, 2); + status = selection->PopulateFromBitMap(&bitmap[0], bitmap_size, 2); EXPECT_EQ(status.IsInvalid(), true); } @@ -175,15 +174,14 @@ TEST_F(TestSelectionVector, TestInt32PopulateFromBitMap) { EXPECT_EQ(status.ok(), true) << status.message(); int bitmap_size = RoundUpNumi64(max_slots) * 8; - std::unique_ptr bitmap(new uint8_t[bitmap_size]); - memset(bitmap.get(), 0, bitmap_size); + std::vector bitmap(bitmap_size); - arrow::BitUtil::SetBit(bitmap.get(), 0); - arrow::BitUtil::SetBit(bitmap.get(), 5); - arrow::BitUtil::SetBit(bitmap.get(), 121); - arrow::BitUtil::SetBit(bitmap.get(), 220); + arrow::BitUtil::SetBit(&bitmap[0], 0); + arrow::BitUtil::SetBit(&bitmap[0], 5); + arrow::BitUtil::SetBit(&bitmap[0], 121); + arrow::BitUtil::SetBit(&bitmap[0], 220); - status = selection->PopulateFromBitMap(bitmap.get(), bitmap_size, max_slots - 1); + status = selection->PopulateFromBitMap(&bitmap[0], bitmap_size, max_slots - 1); EXPECT_EQ(status.ok(), true) << status.message(); EXPECT_EQ(selection->GetNumSlots(), 3); @@ -243,15 +241,14 @@ TEST_F(TestSelectionVector, TestInt64PopulateFromBitMap) { EXPECT_EQ(status.ok(), true) << status.message(); int bitmap_size = RoundUpNumi64(max_slots) * 8; - std::unique_ptr bitmap(new uint8_t[bitmap_size]); - memset(bitmap.get(), 0, bitmap_size); + std::vector bitmap(bitmap_size); - arrow::BitUtil::SetBit(bitmap.get(), 0); - arrow::BitUtil::SetBit(bitmap.get(), 5); - arrow::BitUtil::SetBit(bitmap.get(), 121); - arrow::BitUtil::SetBit(bitmap.get(), 220); + arrow::BitUtil::SetBit(&bitmap[0], 0); + arrow::BitUtil::SetBit(&bitmap[0], 5); + arrow::BitUtil::SetBit(&bitmap[0], 121); + arrow::BitUtil::SetBit(&bitmap[0], 220); - status = selection->PopulateFromBitMap(bitmap.get(), bitmap_size, max_slots - 1); + status = selection->PopulateFromBitMap(&bitmap[0], bitmap_size, max_slots - 1); EXPECT_EQ(status.ok(), true) << status.message(); EXPECT_EQ(selection->GetNumSlots(), 3); diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc index becaf8f1ba3d7..61d9dc3ad1629 100644 --- a/cpp/src/gandiva/tests/projector_test.cc +++ b/cpp/src/gandiva/tests/projector_test.cc @@ -493,14 +493,15 @@ TEST_F(TestProjector, TestZeroCopy) { // allocate output buffers int64_t bitmap_sz = arrow::BitUtil::BytesForBits(num_records); - std::unique_ptr bitmap(new uint8_t[bitmap_sz]); + int64_t bitmap_capacity = arrow::BitUtil::RoundUpToMultipleOf64(bitmap_sz); + std::vector bitmap(bitmap_capacity); std::shared_ptr bitmap_buf = - std::make_shared(bitmap.get(), bitmap_sz); + std::make_shared(&bitmap[0], bitmap_capacity); int64_t data_sz = sizeof(float) * num_records; - std::unique_ptr data(new uint8_t[data_sz]); + std::vector data(bitmap_capacity); std::shared_ptr data_buf = - std::make_shared(data.get(), data_sz); + std::make_shared(&data[0], data_sz); auto array_data = arrow::ArrayData::Make(float32(), num_records, {bitmap_buf, data_buf}); diff --git a/cpp/valgrind.supp b/cpp/valgrind.supp index 8e707e39e7cd8..d8bc8fb28f2d5 100644 --- a/cpp/valgrind.supp +++ b/cpp/valgrind.supp @@ -21,4 +21,15 @@ Memcheck:Cond fun:*CastFunctor*BooleanType* } - +{ + :Conditional jump or move depends on uninitialised value(s) + Memcheck:Cond + ... 
+ fun:_ZN3re23RE2C1E* +} +{ + :Use of uninitialised value of size 8 + Memcheck:Value8 + ... + fun:_ZN3re23RE2C1E* +} From d08964334082e87010b37933623f021c98e8733d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 19 Dec 2018 10:07:13 -0600 Subject: [PATCH 081/328] ARROW-3803: [C++/Python] Merge C++ builds and tests, run Python tests in separate CI entries I found unfortunately that the conda-forge boost-cpp package is not fully compatible with Xcode 8.3, see https://issues.apache.org/jira/browse/ARROW-4056 We might have to build a vendored Boost in this CI entry to work around the problem (this is what the Ray project did when they also hit this issue) Author: Wes McKinney Closes #3208 from wesm/ARROW-3803 and squashes the following commits: 7c47776a9 Remove now unneeded travis_script_gandiva_cpp.sh 9c8d6aa27 * Combine C++ CI jobs, split Python CI jobs into separate build entries * Use gcc 4.8 * Pin boost-cpp 1.68.0 due to crashes caused by 1.69.0 --- .travis.yml | 103 +++++++++--------- ci/conda_env_cpp.yml | 4 +- ci/travis_before_script_cpp.sh | 19 +++- ci/travis_env_common.sh | 2 + ci/travis_script_gandiva_cpp.sh | 30 ----- ci/travis_script_python.sh | 6 +- cpp/cmake_modules/ThirdpartyToolchain.cmake | 2 + cpp/src/plasma/CMakeLists.txt | 2 + .../gandiva/evaluator/MicroBenchmarkTest.java | 2 + python/CMakeLists.txt | 1 + 10 files changed, 77 insertions(+), 94 deletions(-) delete mode 100755 ci/travis_script_gandiva_cpp.sh diff --git a/.travis.yml b/.travis.yml index 64408128fe146..f7094fc56d641 100644 --- a/.travis.yml +++ b/.travis.yml @@ -62,74 +62,67 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh script: - $TRAVIS_BUILD_DIR/ci/travis_lint.sh - - name: "C++ & Python w/ gcc 4.9" + - name: "C++ unit tests, code coverage with gcc 4.8" compiler: gcc language: cpp os: linux jdk: openjdk8 env: - - ARROW_TRAVIS_USE_TOOLCHAIN=1 - ARROW_TRAVIS_VALGRIND=1 + - ARROW_TRAVIS_USE_TOOLCHAIN=1 - ARROW_TRAVIS_PLASMA=1 - ARROW_TRAVIS_ORC=1 - ARROW_TRAVIS_COVERAGE=1 - ARROW_TRAVIS_PARQUET=1 - - ARROW_TRAVIS_PYTHON_DOCS=1 + - ARROW_TRAVIS_GANDIVA=1 + - ARROW_TRAVIS_GANDIVA_JAVA=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN - - ARROW_TRAVIS_PYTHON_JVM=1 - - ARROW_TRAVIS_JAVA_BUILD_ONLY=1 - - ARROW_TRAVIS_PYTHON_GANDIVA=1 - # ARROW-2999 Benchmarks are disabled in Travis CI for the time being - # - ARROW_TRAVIS_PYTHON_BENCHMARKS=1 - - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9" before_script: - # (ARROW_CI_CPP_AFFECTED implies ARROW_CI_PYTHON_AFFECTED) - - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ]; then exit; fi + - if [ $ARROW_CI_CPP_AFFECTED != "1" ] && [ $ARROW_CI_JAVA_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh # If either C++ or Python changed, we must install the C++ libraries - git submodule update --init - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh script: - # All test steps are required for accurate C++ coverage info - - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh - # Build Arrow Java to test the pyarrow<->JVM in-process bridge - - $TRAVIS_BUILD_DIR/ci/travis_script_java.sh - # Only run Plasma tests with valgrind in one of the Python builds because - # they are slow - - export PLASMA_VALGRIND=0 - - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 - - export PLASMA_VALGRIND=1 - - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 - - $TRAVIS_BUILD_DIR/ci/travis_upload_cpp_coverage.sh - - name: "Gandiva C++ w/ gcc 4.9 and Java" + - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh || travis_terminate 1 + - 
$TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh || travis_terminate 1 + - $TRAVIS_BUILD_DIR/ci/travis_upload_cpp_coverage.sh || travis_terminate 1 + - name: "Python 2.7 and 3.6 unit tests, coverage with gcc 4.8" compiler: gcc language: cpp os: linux jdk: openjdk8 env: - - ARROW_TRAVIS_GANDIVA=1 - - ARROW_TRAVIS_GANDIVA_JAVA=1 - - ARROW_TRAVIS_GANDIVA_TESTS=1 - - ARROW_TRAVIS_OPTIONAL_INSTALL=1 - - ARROW_CPP_BUILD_TARGETS="gandiva-all" - - ARROW_TRAVIS_USE_TOOLCHAIN=1 + # Valgrind is needed for the Plasma store tests - ARROW_TRAVIS_VALGRIND=1 + - ARROW_TRAVIS_USE_TOOLCHAIN=1 + - ARROW_TRAVIS_COVERAGE=1 + - ARROW_TRAVIS_PYTHON_DOCS=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN - - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9" + - ARROW_TRAVIS_PYTHON_JVM=1 + - ARROW_TRAVIS_PYTHON_GANDIVA=1 + - ARROW_TRAVIS_OPTIONAL_INSTALL=1 + # TODO(wesm): Run the benchmarks outside of Travis + # - ARROW_TRAVIS_PYTHON_BENCHMARKS=1 before_script: - # Run if something changed in CPP or Java. - - if [ $ARROW_CI_CPP_AFFECTED != "1" ] && [ $ARROW_CI_JAVA_AFFECTED != "1" ]; then exit; fi + - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh - - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh + - $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh script: - - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_cpp.sh - - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh - - name: "[OS X] C++ & Python w/ XCode 6.4" + - $TRAVIS_BUILD_DIR/ci/travis_script_java.sh + # Only run Plasma tests with valgrind in one of the Python builds because + # they are slow + - export PLASMA_VALGRIND=0 + - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 + - export PLASMA_VALGRIND=1 + - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 + - $TRAVIS_BUILD_DIR/ci/travis_upload_cpp_coverage.sh + - name: "[OS X] C++ w/ XCode 8.3" compiler: clang language: cpp - osx_image: xcode6.4 + osx_image: xcode8.3 os: osx cache: addons: @@ -138,39 +131,41 @@ matrix: - ARROW_TRAVIS_PLASMA=1 - ARROW_TRAVIS_ORC=1 - ARROW_TRAVIS_PARQUET=1 + - ARROW_TRAVIS_GANDIVA=1 + - ARROW_TRAVIS_GANDIVA_JAVA=1 + - ARROW_TRAVIS_OPTIONAL_INSTALL=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN + # ARROW-3803: The Xcode 8.3 image has Boost libraries in /usr/local/lib + # which can get loaded before the toolchain Boost libraries. These seem to + # get loaded even though we are modifying LD_LIBRARY_PATH. We build our own + # Boost and statically link to get around the issue until this can be + # investigated further + - ARROW_TRAVIS_VENDORED_BOOST=1 before_script: - - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ]; then exit; fi + - if [ $ARROW_CI_CPP_AFFECTED != "1" ] && [ $ARROW_CI_JAVA_AFFECTED != "1" ]; then exit; fi # If either C++ or Python changed, we must install the C++ libraries - git submodule update --init - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh script: - - if [ $ARROW_CI_CPP_AFFECTED == "1" ]; then $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh; fi - - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 - - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 - - name: "[OS X] Gandiva C++ w/ XCode 8.3 & Java" + - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh + - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh + - name: "[OS X] Python w/ XCode 6.4" compiler: clang language: cpp - # xcode 7.3 has a bug in strptime. 
- osx_image: xcode8.3 + osx_image: xcode6.4 os: osx cache: addons: env: - - ARROW_TRAVIS_GANDIVA=1 - - ARROW_TRAVIS_GANDIVA_JAVA=1 - - ARROW_TRAVIS_GANDIVA_TESTS=1 - - ARROW_TRAVIS_OPTIONAL_INSTALL=1 - - ARROW_CPP_BUILD_TARGETS="gandiva-all" - ARROW_TRAVIS_USE_TOOLCHAIN=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN + - ARROW_TRAVIS_OPTIONAL_INSTALL=1 before_script: - # Run if something changed in CPP or Java. - - if [ $ARROW_CI_CPP_AFFECTED != "1" ] && [ $ARROW_CI_JAVA_AFFECTED != "1" ]; then exit; fi - - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh --only-library script: - - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_cpp.sh - - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh + - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ]; then exit; fi + - $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh + - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 + - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 - name: "[manylinux1] Python" language: cpp before_script: diff --git a/ci/conda_env_cpp.yml b/ci/conda_env_cpp.yml index 1e22e9017fc62..87523b3fdd611 100644 --- a/ci/conda_env_cpp.yml +++ b/ci/conda_env_cpp.yml @@ -15,7 +15,9 @@ # specific language governing permissions and limitations # under the License. -boost-cpp +# ARROW-4056: The conda-forge boost 1.69.0 seems to break the Parquet unit +# tests with Xcode 8.3. Root cause not yet determined +boost-cpp=1.68.0 brotli bzip2 cmake diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index aa5b2a6ab084c..8ddc98691015f 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -40,6 +40,14 @@ if [ "$only_library_mode" == "no" ]; then source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh fi +if [ "$ARROW_TRAVIS_USE_TOOLCHAIN" == "1" ]; then + # Set up C++ toolchain from conda-forge packages for faster builds + source $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh +fi + +mkdir -p $ARROW_CPP_BUILD_DIR +pushd $ARROW_CPP_BUILD_DIR + CMAKE_COMMON_FLAGS="\ -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL \ -DARROW_NO_DEPRECATED_API=ON \ @@ -48,15 +56,10 @@ CMAKE_LINUX_FLAGS="" CMAKE_OSX_FLAGS="" if [ "$ARROW_TRAVIS_USE_TOOLCHAIN" == "1" ]; then - # Set up C++ toolchain from conda-forge packages for faster builds - source $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh CMAKE_COMMON_FLAGS="${CMAKE_COMMON_FLAGS} -DARROW_JEMALLOC=ON" CMAKE_COMMON_FLAGS="${CMAKE_COMMON_FLAGS} -DARROW_WITH_BZ2=ON" fi -mkdir -p $ARROW_CPP_BUILD_DIR -pushd $ARROW_CPP_BUILD_DIR - if [ $only_library_mode == "yes" ]; then CMAKE_COMMON_FLAGS="\ $CMAKE_COMMON_FLAGS \ @@ -115,10 +118,14 @@ if [ $ARROW_TRAVIS_VERBOSE == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_VERBOSE_THIRDPARTY_BUILD=ON" fi -if [ $ARROW_TRAVIS_USE_VENDORED_BOOST == "1" ]; then +if [ $ARROW_TRAVIS_VENDORED_BOOST == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_BOOST_VENDORED=ON" fi +if [ $ARROW_TRAVIS_STATIC_BOOST == "1" ]; then + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_BOOST_USE_SHARED=OFF" +fi + if [ $ARROW_TRAVIS_OPTIONAL_INSTALL == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_OPTIONAL_INSTALL=ON" fi diff --git a/ci/travis_env_common.sh b/ci/travis_env_common.sh index f5748b2a0452a..636a25fcd7486 100755 --- a/ci/travis_env_common.sh +++ b/ci/travis_env_common.sh @@ -33,6 +33,8 @@ export ARROW_RUBY_DIR=$TRAVIS_BUILD_DIR/ruby export ARROW_RUST_DIR=${TRAVIS_BUILD_DIR}/rust export ARROW_R_DIR=${TRAVIS_BUILD_DIR}/r +export ARROW_TRAVIS_COVERAGE=${ARROW_TRAVIS_COVERAGE:=0} + if [ "$ARROW_TRAVIS_COVERAGE" == "1" ]; then export 
ARROW_CPP_COVERAGE_FILE=${TRAVIS_BUILD_DIR}/coverage.info export ARROW_PYTHON_COVERAGE_FILE=${TRAVIS_BUILD_DIR}/.coverage diff --git a/ci/travis_script_gandiva_cpp.sh b/ci/travis_script_gandiva_cpp.sh deleted file mode 100755 index bc4a7a9a8f03b..0000000000000 --- a/ci/travis_script_gandiva_cpp.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -e - -source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh - -pushd $CPP_BUILD_DIR - -PATH=$ARROW_BUILD_TYPE:$PATH ctest -j2 --output-on-failure -L gandiva-tests - -popd - -# TODO : Capture C++ coverage info diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 20ec57efc39e4..69e115a9dcce7 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -87,7 +87,7 @@ rm -rf * # XXX Can we simply reuse CMAKE_COMMON_FLAGS from travis_before_script_cpp.sh? CMAKE_COMMON_FLAGS="-DARROW_EXTRA_ERROR_CONTEXT=ON" -PYTHON_CPP_BUILD_TARGETS="arrow_python-all plasma" +PYTHON_CPP_BUILD_TARGETS="arrow_python-all plasma parquet" if [ $ARROW_TRAVIS_COVERAGE == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GENERATE_COVERAGE=ON" @@ -103,6 +103,7 @@ cmake -GNinja \ -DARROW_BUILD_TESTS=ON \ -DARROW_BUILD_UTILITIES=OFF \ -DARROW_OPTIONAL_INSTALL=ON \ + -DARROW_PARQUET=on \ -DARROW_PLASMA=on \ -DARROW_TENSORFLOW=on \ -DARROW_PYTHON=on \ @@ -176,12 +177,11 @@ if [ "$ARROW_TRAVIS_COVERAGE" == "1" ]; then coverage report -i --include="*/_parquet.pyx" # Generate XML file for CodeCov coverage xml -i -o $TRAVIS_BUILD_DIR/coverage.xml - # Capture C++ coverage info and combine with previous coverage file + # Capture C++ coverage info pushd $TRAVIS_BUILD_DIR lcov --quiet --directory . 
--capture --no-external --output-file coverage-python-tests.info \ 2>&1 | grep -v "WARNING: no data found for /usr/include" lcov --add-tracefile coverage-python-tests.info \ - --add-tracefile $ARROW_CPP_COVERAGE_FILE \ --output-file $ARROW_CPP_COVERAGE_FILE rm coverage-python-tests.info popd # $TRAVIS_BUILD_DIR diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index db0b69be460ce..3381b5cda16b4 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -346,6 +346,8 @@ if (MSVC AND ARROW_USE_STATIC_CRT) set(Boost_USE_STATIC_RUNTIME ON) endif() set(Boost_ADDITIONAL_VERSIONS + "1.70.0" "1.70" + "1.69.0" "1.69" "1.68.0" "1.68" "1.67.0" "1.67" "1.66.0" "1.66" diff --git a/cpp/src/plasma/CMakeLists.txt b/cpp/src/plasma/CMakeLists.txt index d9c7dcaedeac3..a71acf8ae43d8 100644 --- a/cpp/src/plasma/CMakeLists.txt +++ b/cpp/src/plasma/CMakeLists.txt @@ -96,6 +96,8 @@ ADD_ARROW_LIB(plasma SHARED_LINK_LIBS ${FLATBUFFERS_STATIC_LIB} ${CMAKE_THREAD_LIBS_INIT} ${PLASMA_LINK_LIBS} STATIC_LINK_LIBS ${FLATBUFFERS_STATIC_LIB} ${CMAKE_THREAD_LIBS_INIT} ${PLASMA_STATIC_LINK_LIBS}) +add_dependencies(plasma ${PLASMA_LIBRARIES}) + foreach(LIB_TARGET ${PLASMA_LIBRARIES}) target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_EXPORTING) diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java index c4d6bd9070613..6934c3f9e7d1a 100644 --- a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java +++ b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java @@ -26,10 +26,12 @@ import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; import org.junit.Assert; +import org.junit.Ignore; import org.junit.Test; import com.google.common.collect.Lists; +@Ignore public class MicroBenchmarkTest extends BaseEvaluatorTest { private double toleranceRatio = 4.0; diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 1a874542c8f9d..a6e4123082532 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -138,6 +138,7 @@ if ("${COMPILER_FAMILY}" STREQUAL "clang") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-constant-logical-operand") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-declarations") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-sometimes-uninitialized") # We have public Cython APIs which return C++ types, which are in an extern # "C" blog (no symbol mangling) and clang doesn't like this From 320621dae6704dab000dddbf400a87a6f4a79914 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 19 Dec 2018 10:54:35 -0600 Subject: [PATCH 082/328] ARROW-4030: [CI] Use travis_terminate in more script commands to fail faster I had done this partially in ARROW-3803, but I reviewed again and tried to apply this more consistently. 
Note it is not necessary to use this in the last command in the script: block Author: Wes McKinney Closes #3226 from wesm/ARROW-4030 and squashes the following commits: a04c11e11 Use travis_terminate in more builds --- .travis.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index f7094fc56d641..10300c9b6e287 100644 --- a/.travis.yml +++ b/.travis.yml @@ -111,13 +111,13 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh - $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh script: - - $TRAVIS_BUILD_DIR/ci/travis_script_java.sh + - $TRAVIS_BUILD_DIR/ci/travis_script_java.sh || travis_terminate 1 # Only run Plasma tests with valgrind in one of the Python builds because # they are slow - export PLASMA_VALGRIND=0 - - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 + - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 || travis_terminate 1 - export PLASMA_VALGRIND=1 - - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 + - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 || travis_terminate 1 - $TRAVIS_BUILD_DIR/ci/travis_upload_cpp_coverage.sh - name: "[OS X] C++ w/ XCode 8.3" compiler: clang @@ -147,7 +147,7 @@ matrix: - git submodule update --init - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh script: - - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh + - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh || travis_terminate 1 - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh - name: "[OS X] Python w/ XCode 6.4" compiler: clang @@ -163,8 +163,8 @@ matrix: before_script: script: - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ]; then exit; fi - - $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh - - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 + - $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh || travis_terminate 1 + - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 || travis_terminate 1 - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 - name: "[manylinux1] Python" language: cpp From f66fa805e89aef948581876ac802b1ffc6430f5c Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 19 Dec 2018 12:53:09 -0600 Subject: [PATCH 083/328] ARROW-554: [C++] Add functions to unify dictionary types and arrays Author: Antoine Pitrou Closes #3165 from pitrou/ARROW-554-conform-dicts and squashes the following commits: 7d2579b30 ARROW-554: Add functions to conform dictionaries --- cpp/src/arrow/array-dict-test.cc | 62 ++++++++++ cpp/src/arrow/array.cc | 61 ++++++++++ cpp/src/arrow/array.h | 24 +++- cpp/src/arrow/array/builder_dict.cc | 172 +++++++++++++++++++++------- cpp/src/arrow/type-test.cc | 128 +++++++++++++++++++++ cpp/src/arrow/type.cc | 7 +- cpp/src/arrow/type.h | 18 +++ cpp/src/arrow/util/hashing.h | 38 ------ cpp/src/arrow/util/int-util-test.cc | 9 ++ cpp/src/arrow/util/int-util.cc | 40 +++++++ cpp/src/arrow/util/int-util.h | 4 + cpp/src/arrow/visitor_inline.h | 2 +- python/pyarrow/tests/test_types.py | 4 +- 13 files changed, 484 insertions(+), 85 deletions(-) diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc index 87cb2290a7bf9..730b891cf57f4 100644 --- a/cpp/src/arrow/array-dict-test.cc +++ b/cpp/src/arrow/array-dict-test.cc @@ -31,6 +31,7 @@ #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" namespace arrow { @@ -38,6 +39,8 @@ namespace arrow { using std::string; using std::vector; +using internal::checked_cast; + // ---------------------------------------------------------------------- // Dictionary 
tests @@ -740,4 +743,63 @@ TEST(TestDictionary, FromArray) { ASSERT_RAISES(Invalid, DictionaryArray::FromArrays(dict_type, indices4, &arr4)); } +TEST(TestDictionary, TransposeBasic) { + std::shared_ptr arr, out, expected; + + auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]"); + auto dict_type = dictionary(int16(), dict); + auto indices = ArrayFromJSON(int16(), "[1, 2, 0, 0]"); + // ["B", "C", "A", "A"] + ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, &arr)); + + // Transpose to same index type + { + auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]"); + auto out_dict_type = dictionary(int16(), out_dict); + + const std::vector transpose_map{1, 3, 2}; + ASSERT_OK(internal::checked_cast(*arr).Transpose( + default_memory_pool(), out_dict_type, transpose_map, &out)); + + auto expected_indices = ArrayFromJSON(int16(), "[3, 2, 1, 1]"); + ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected)); + AssertArraysEqual(*out, *expected); + } + + // Transpose to other type + { + auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]"); + auto out_dict_type = dictionary(int8(), out_dict); + + const std::vector transpose_map{1, 3, 2}; + ASSERT_OK(internal::checked_cast(*arr).Transpose( + default_memory_pool(), out_dict_type, transpose_map, &out)); + + auto expected_indices = ArrayFromJSON(int8(), "[3, 2, 1, 1]"); + ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected)); + AssertArraysEqual(*expected, *out); + } +} + +TEST(TestDictionary, TransposeNulls) { + std::shared_ptr arr, out, expected; + + auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]"); + auto dict_type = dictionary(int16(), dict); + auto indices = ArrayFromJSON(int16(), "[1, 2, null, 0]"); + // ["B", "C", null, "A"] + ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, &arr)); + + auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]"); + auto out_dict_type = dictionary(int16(), out_dict); + + const std::vector transpose_map{1, 3, 2}; + ASSERT_OK(internal::checked_cast(*arr).Transpose( + default_memory_pool(), out_dict_type, transpose_map, &out)); + + auto expected_indices = ArrayFromJSON(int16(), "[3, 2, null, 1]"); + ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected)); + AssertArraysEqual(*expected, *out); +} + } // namespace arrow diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index ff94aa2a1e6fe..7e45e90d9c8f7 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -33,6 +33,7 @@ #include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" +#include "arrow/util/int-util.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/visitor.h" @@ -663,6 +664,66 @@ std::shared_ptr DictionaryArray::dictionary() const { return dict_type_->dictionary(); } +template +static Status TransposeDictIndices(MemoryPool* pool, const ArrayData& in_data, + const std::shared_ptr& type, + const std::vector& transpose_map, + std::shared_ptr* out) { + using in_c_type = typename InType::c_type; + using out_c_type = typename OutType::c_type; + + std::shared_ptr out_buffer; + RETURN_NOT_OK(AllocateBuffer(pool, in_data.length * sizeof(out_c_type), &out_buffer)); + // Null bitmap is unchanged + auto out_data = ArrayData::Make(type, in_data.length, {in_data.buffers[0], out_buffer}, + in_data.null_count); + internal::TransposeInts(in_data.GetValues(1), + out_data->GetMutableValues(1), in_data.length, + 
transpose_map.data()); + *out = MakeArray(out_data); + return Status::OK(); +} + +Status DictionaryArray::Transpose(MemoryPool* pool, const std::shared_ptr& type, + const std::vector& transpose_map, + std::shared_ptr* out) const { + DCHECK_EQ(type->id(), Type::DICTIONARY); + const auto& out_dict_type = checked_cast(*type); + + // XXX We'll probably want to make this operation a kernel when we + // implement dictionary-to-dictionary casting. + auto in_type_id = dict_type_->index_type()->id(); + auto out_type_id = out_dict_type.index_type()->id(); + +#define TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, OUT_INDEX_TYPE) \ + case OUT_INDEX_TYPE::type_id: \ + return TransposeDictIndices(pool, *data(), type, \ + transpose_map, out); + +#define TRANSPOSE_IN_CASE(IN_INDEX_TYPE) \ + case IN_INDEX_TYPE::type_id: \ + switch (out_type_id) { \ + TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int8Type) \ + TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int16Type) \ + TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int32Type) \ + TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int64Type) \ + default: \ + return Status::NotImplemented("unexpected index type"); \ + } + + switch (in_type_id) { + TRANSPOSE_IN_CASE(Int8Type) + TRANSPOSE_IN_CASE(Int16Type) + TRANSPOSE_IN_CASE(Int32Type) + TRANSPOSE_IN_CASE(Int64Type) + default: + return Status::NotImplemented("unexpected index type"); + } + +#undef TRANSPOSE_IN_OUT_CASE +#undef TRANSPOSE_IN_CASE +} + // ---------------------------------------------------------------------- // Implement Array::Accept as inline visitor diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 52c5207d8dddc..aead17f133d74 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -422,6 +422,9 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray { value_type Value(int64_t i) const { return raw_values()[i]; } + // For API compatibility with BinaryArray etc. + value_type GetView(int64_t i) const { return Value(i); } + protected: using PrimitiveArray::PrimitiveArray; }; @@ -442,6 +445,8 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { i + data_->offset); } + bool GetView(int64_t i) const { return Value(i); } + protected: using PrimitiveArray::PrimitiveArray; }; @@ -802,7 +807,7 @@ class ARROW_EXPORT DictionaryArray : public Array { /// This function does the validation of the indices and input type. It checks if /// all indices are non-negative and smaller than the size of the dictionary /// - /// \param[in] type a data type containing a dictionary + /// \param[in] type a dictionary type /// \param[in] indices an array of non-negative signed /// integers smaller than the size of the dictionary /// \param[out] out the resulting DictionaryArray instance @@ -810,6 +815,23 @@ class ARROW_EXPORT DictionaryArray : public Array { const std::shared_ptr& indices, std::shared_ptr* out); + /// \brief Transpose this DictionaryArray + /// + /// This method constructs a new dictionary array with the given dictionary type, + /// transposing indices using the transpose map. + /// The type and the transpose map are typically computed using + /// DictionaryType::Unify. 
+ /// + /// \param[in] pool a pool to allocate the array data from + /// \param[in] type a dictionary type + /// \param[in] transpose_map a vector transposing this array's indices + /// into the target array's indices + /// \param[out] out the resulting DictionaryArray instance + Status Transpose(MemoryPool* pool, const std::shared_ptr& type, + const std::vector& transpose_map, + std::shared_ptr* out) const; + // XXX Do we also want an unsafe in-place Transpose? + std::shared_ptr indices() const; std::shared_ptr dictionary() const; diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index 0891e4c0829f4..e534c3cadb14b 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -19,6 +19,9 @@ #include #include +#include +#include +#include #include #include @@ -30,11 +33,117 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/hashing.h" #include "arrow/util/logging.h" +#include "arrow/visitor_inline.h" namespace arrow { using internal::checked_cast; +// ---------------------------------------------------------------------- +// DictionaryType unification + +struct UnifyDictionaryValues { + MemoryPool* pool_; + std::shared_ptr value_type_; + const std::vector& types_; + std::shared_ptr* out_values_; + std::vector>* out_transpose_maps_; + + Status Visit(const DataType&, void* = nullptr) { + // Default implementation for non-dictionary-supported datatypes + std::stringstream ss; + ss << "Unification of " << value_type_->ToString() + << " dictionaries is not implemented"; + return Status::NotImplemented(ss.str()); + } + + template + Status Visit(const T&, + typename internal::DictionaryTraits::MemoTableType* = nullptr) { + using ArrayType = typename TypeTraits::ArrayType; + using DictTraits = typename internal::DictionaryTraits; + using MemoTableType = typename DictTraits::MemoTableType; + + MemoTableType memo_table; + if (out_transpose_maps_ != nullptr) { + out_transpose_maps_->clear(); + out_transpose_maps_->reserve(types_.size()); + } + // Build up the unified dictionary values and the transpose maps + for (const auto& type : types_) { + const ArrayType& values = checked_cast(*type->dictionary()); + if (out_transpose_maps_ != nullptr) { + std::vector transpose_map; + transpose_map.reserve(values.length()); + for (int64_t i = 0; i < values.length(); ++i) { + int32_t dict_index = memo_table.GetOrInsert(values.GetView(i)); + transpose_map.push_back(dict_index); + } + out_transpose_maps_->push_back(std::move(transpose_map)); + } else { + for (int64_t i = 0; i < values.length(); ++i) { + memo_table.GetOrInsert(values.GetView(i)); + } + } + } + // Build unified dictionary array + std::shared_ptr data; + RETURN_NOT_OK(DictTraits::GetDictionaryArrayData(pool_, value_type_, memo_table, + 0 /* start_offset */, &data)); + *out_values_ = MakeArray(data); + return Status::OK(); + } +}; + +Status DictionaryType::Unify(MemoryPool* pool, const std::vector& types, + std::shared_ptr* out_type, + std::vector>* out_transpose_maps) { + if (types.size() == 0) { + return Status::Invalid("need at least one input type"); + } + std::vector dict_types; + dict_types.reserve(types.size()); + for (const auto& type : types) { + if (type->id() != Type::DICTIONARY) { + return Status::TypeError("input types must be dictionary types"); + } + dict_types.push_back(checked_cast(type)); + } + + // XXX Should we check the ordered flag? 
+ auto value_type = dict_types[0]->dictionary()->type(); + for (const auto& type : dict_types) { + auto values = type->dictionary(); + if (!values->type()->Equals(value_type)) { + return Status::TypeError("input types have different value types"); + } + if (values->null_count() != 0) { + return Status::TypeError("input types have null values"); + } + } + + std::shared_ptr values; + { + UnifyDictionaryValues visitor{pool, value_type, dict_types, &values, + out_transpose_maps}; + RETURN_NOT_OK(VisitTypeInline(*value_type, &visitor)); + } + + // Build unified dictionary type with the right index type + std::shared_ptr index_type; + if (values->length() <= std::numeric_limits::max()) { + index_type = int8(); + } else if (values->length() <= std::numeric_limits::max()) { + index_type = int16(); + } else if (values->length() <= std::numeric_limits::max()) { + index_type = int32(); + } else { + index_type = int64(); + } + *out_type = arrow::dictionary(index_type, values); + return Status::OK(); +} + // ---------------------------------------------------------------------- // DictionaryBuilder @@ -118,12 +227,31 @@ Status DictionaryBuilder::AppendNull() { return values_builder_.Append template Status DictionaryBuilder::AppendArray(const Array& array) { - const auto& numeric_array = checked_cast&>(array); + using ArrayType = typename TypeTraits::ArrayType; + + const auto& concrete_array = checked_cast(array); for (int64_t i = 0; i < array.length(); i++) { if (array.IsNull(i)) { RETURN_NOT_OK(AppendNull()); } else { - RETURN_NOT_OK(Append(numeric_array.Value(i))); + RETURN_NOT_OK(Append(concrete_array.GetView(i))); + } + } + return Status::OK(); +} + +template <> +Status DictionaryBuilder::AppendArray(const Array& array) { + if (!type_->Equals(*array.type())) { + return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type"); + } + + const auto& typed_array = checked_cast(array); + for (int64_t i = 0; i < array.length(); i++) { + if (array.IsNull(i)) { + RETURN_NOT_OK(AppendNull()); + } else { + RETURN_NOT_OK(Append(typed_array.GetValue(i))); } } return Status::OK(); @@ -168,46 +296,6 @@ Status DictionaryBuilder::FinishInternal(std::shared_ptr* o return Status::OK(); } -// -// StringType and BinaryType specializations -// - -#define BINARY_DICTIONARY_SPECIALIZATIONS(Type) \ - \ - template <> \ - Status DictionaryBuilder::AppendArray(const Array& array) { \ - using ArrayType = typename TypeTraits::ArrayType; \ - const ArrayType& binary_array = checked_cast(array); \ - for (int64_t i = 0; i < array.length(); i++) { \ - if (array.IsNull(i)) { \ - RETURN_NOT_OK(AppendNull()); \ - } else { \ - RETURN_NOT_OK(Append(binary_array.GetView(i))); \ - } \ - } \ - return Status::OK(); \ - } - -BINARY_DICTIONARY_SPECIALIZATIONS(StringType); -BINARY_DICTIONARY_SPECIALIZATIONS(BinaryType); - -template <> -Status DictionaryBuilder::AppendArray(const Array& array) { - if (!type_->Equals(*array.type())) { - return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type"); - } - - const auto& typed_array = checked_cast(array); - for (int64_t i = 0; i < array.length(); i++) { - if (array.IsNull(i)) { - RETURN_NOT_OK(AppendNull()); - } else { - RETURN_NOT_OK(Append(typed_array.GetValue(i))); - } - } - return Status::OK(); -} - template class DictionaryBuilder; template class DictionaryBuilder; template class DictionaryBuilder; diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc index e0a10690c2c77..20b7aff884b7f 100644 --- a/cpp/src/arrow/type-test.cc +++ 
b/cpp/src/arrow/type-test.cc @@ -24,6 +24,8 @@ #include +#include "arrow/memory_pool.h" +#include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" @@ -480,6 +482,132 @@ TEST(TestStructType, GetChildIndex) { ASSERT_EQ(-1, struct_type.GetChildIndex("not-found")); } +TEST(TestDictionaryType, Equals) { + auto t1 = dictionary(int8(), ArrayFromJSON(int32(), "[3, 4, 5, 6]")); + auto t2 = dictionary(int8(), ArrayFromJSON(int32(), "[3, 4, 5, 6]")); + auto t3 = dictionary(int16(), ArrayFromJSON(int32(), "[3, 4, 5, 6]")); + auto t4 = dictionary(int8(), ArrayFromJSON(int16(), "[3, 4, 5, 6]")); + auto t5 = dictionary(int8(), ArrayFromJSON(int32(), "[3, 4, 7, 6]")); + + ASSERT_TRUE(t1->Equals(t2)); + // Different index type + ASSERT_FALSE(t1->Equals(t3)); + // Different value type + ASSERT_FALSE(t1->Equals(t4)); + // Different values + ASSERT_FALSE(t1->Equals(t5)); +} + +TEST(TestDictionaryType, UnifyNumeric) { + auto t1 = dictionary(int8(), ArrayFromJSON(int64(), "[3, 4, 7]")); + auto t2 = dictionary(int8(), ArrayFromJSON(int64(), "[1, 7, 4, 8]")); + auto t3 = dictionary(int8(), ArrayFromJSON(int64(), "[1, -200]")); + + auto expected = dictionary(int8(), ArrayFromJSON(int64(), "[3, 4, 7, 1, 8, -200]")); + + std::shared_ptr dict_type; + ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get(), t3.get()}, + &dict_type)); + ASSERT_TRUE(dict_type->Equals(expected)); + + std::vector> transpose_maps; + ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get(), t3.get()}, + &dict_type, &transpose_maps)); + ASSERT_TRUE(dict_type->Equals(expected)); + ASSERT_EQ(transpose_maps.size(), 3); + ASSERT_EQ(transpose_maps[0], std::vector({0, 1, 2})); + ASSERT_EQ(transpose_maps[1], std::vector({3, 2, 1, 4})); + ASSERT_EQ(transpose_maps[2], std::vector({3, 5})); +} + +TEST(TestDictionaryType, UnifyString) { + auto t1 = dictionary(int16(), ArrayFromJSON(utf8(), "[\"foo\", \"bar\"]")); + auto t2 = dictionary(int32(), ArrayFromJSON(utf8(), "[\"quux\", \"foo\"]")); + + auto expected = + dictionary(int8(), ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"quux\"]")); + + std::shared_ptr dict_type; + ASSERT_OK( + DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type)); + ASSERT_TRUE(dict_type->Equals(expected)); + + std::vector> transpose_maps; + ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type, + &transpose_maps)); + ASSERT_TRUE(dict_type->Equals(expected)); + + ASSERT_EQ(transpose_maps.size(), 2); + ASSERT_EQ(transpose_maps[0], std::vector({0, 1})); + ASSERT_EQ(transpose_maps[1], std::vector({2, 0})); +} + +TEST(TestDictionaryType, UnifyFixedSizeBinary) { + auto type = fixed_size_binary(3); + + std::string data = "foobarbazqux"; + auto buf = std::make_shared(data); + // ["foo", "bar"] + auto dict1 = std::make_shared(type, 2, SliceBuffer(buf, 0, 6)); + auto t1 = dictionary(int16(), dict1); + // ["bar", "baz", "qux"] + auto dict2 = std::make_shared(type, 3, SliceBuffer(buf, 3, 9)); + auto t2 = dictionary(int16(), dict2); + + // ["foo", "bar", "baz", "qux"] + auto expected_dict = std::make_shared(type, 4, buf); + auto expected = dictionary(int8(), expected_dict); + + std::shared_ptr dict_type; + ASSERT_OK( + DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type)); + ASSERT_TRUE(dict_type->Equals(expected)); + + std::vector> transpose_maps; + ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type, + &transpose_maps)); + 
ASSERT_TRUE(dict_type->Equals(expected)); + ASSERT_EQ(transpose_maps.size(), 2); + ASSERT_EQ(transpose_maps[0], std::vector({0, 1})); + ASSERT_EQ(transpose_maps[1], std::vector({1, 2, 3})); +} + +TEST(TestDictionaryType, UnifyLarge) { + // Unifying "large" dictionary types should choose the right index type + std::shared_ptr dict1, dict2, expected_dict; + + Int32Builder builder; + ASSERT_OK(builder.Reserve(120)); + for (int32_t i = 0; i < 120; ++i) { + builder.UnsafeAppend(i); + } + ASSERT_OK(builder.Finish(&dict1)); + ASSERT_EQ(dict1->length(), 120); + auto t1 = dictionary(int8(), dict1); + + ASSERT_OK(builder.Reserve(30)); + for (int32_t i = 110; i < 140; ++i) { + builder.UnsafeAppend(i); + } + ASSERT_OK(builder.Finish(&dict2)); + ASSERT_EQ(dict2->length(), 30); + auto t2 = dictionary(int8(), dict2); + + ASSERT_OK(builder.Reserve(140)); + for (int32_t i = 0; i < 140; ++i) { + builder.UnsafeAppend(i); + } + ASSERT_OK(builder.Finish(&expected_dict)); + ASSERT_EQ(expected_dict->length(), 140); + // int8 would be too narrow to hold all possible index values + auto expected = dictionary(int16(), expected_dict); + + std::shared_ptr dict_type; + ASSERT_OK( + DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type)); + ASSERT_TRUE(dict_type->Equals(expected)); +} + TEST(TypesTest, TestDecimal128Small) { Decimal128Type t1(8, 4); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 5f1ca8d7b0f09..753cb65ff26da 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -260,7 +260,12 @@ DictionaryType::DictionaryType(const std::shared_ptr& index_type, : FixedWidthType(Type::DICTIONARY), index_type_(index_type), dictionary_(dictionary), - ordered_(ordered) {} + ordered_(ordered) { +#ifndef NDEBUG + const auto& int_type = checked_cast(*index_type); + DCHECK_EQ(int_type.is_signed(), true) << "dictionary index type should be signed"; +#endif +} int DictionaryType::bit_width() const { return checked_cast(*index_type_).bit_width(); diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 9694202b9705c..8f6cfd6ced4ff 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -39,6 +39,7 @@ namespace arrow { class Array; class Field; +class MemoryPool; struct Type { /// \brief Main data type enumeration @@ -768,6 +769,23 @@ class ARROW_EXPORT DictionaryType : public FixedWidthType { bool ordered() const { return ordered_; } + /// \brief Unify several dictionary types + /// + /// Compute a resulting dictionary that will allow the union of values + /// of all input dictionary types. The input types must all have the + /// same value type. + /// \param[in] pool Memory pool to allocate dictionary values from + /// \param[in] types A sequence of input dictionary types + /// \param[out] out_type The unified dictionary type + /// \param[out] out_transpose_maps (optionally) A sequence of integer vectors, + /// one per input type. Each integer vector represents the transposition + /// of input type indices into unified type indices. + // XXX Should we return something special (an empty transpose map?) when + // the transposition is the identity function? 
+ static Status Unify(MemoryPool* pool, const std::vector& types, + std::shared_ptr* out_type, + std::vector>* out_transpose_maps = NULLPTR); + private: // Must be an integer type (not currently checked) std::shared_ptr index_type_; diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h index ee368fb4e314c..76724b2a30035 100644 --- a/cpp/src/arrow/util/hashing.h +++ b/cpp/src/arrow/util/hashing.h @@ -651,25 +651,6 @@ template struct HashTraits> { using c_type = typename T::c_type; using MemoTableType = SmallScalarMemoTable; - - static Status GetDictionaryArrayData(MemoryPool* pool, - const std::shared_ptr& type, - const MemoTableType& memo_table, - int64_t start_offset, - std::shared_ptr* out) { - std::shared_ptr dict_buffer; - auto dict_length = static_cast(memo_table.size()) - start_offset; - // This makes a copy, but we assume a dictionary array is usually small - // compared to the size of the dictionary-using array. - // (also, copying the dictionary values is cheap compared to the cost - // of building the memo table) - RETURN_NOT_OK( - AllocateBuffer(pool, TypeTraits::bytes_required(dict_length), &dict_buffer)); - memo_table.CopyValues(static_cast(start_offset), - reinterpret_cast(dict_buffer->mutable_data())); - *out = ArrayData::Make(type, dict_length, {nullptr, dict_buffer}, 0 /* null_count */); - return Status::OK(); - } }; template @@ -677,25 +658,6 @@ struct HashTraits< T, typename std::enable_if::value && !is_8bit_int::value>::type> { using c_type = typename T::c_type; using MemoTableType = ScalarMemoTable; - - static Status GetDictionaryArrayData(MemoryPool* pool, - const std::shared_ptr& type, - const MemoTableType& memo_table, - int64_t start_offset, - std::shared_ptr* out) { - std::shared_ptr dict_buffer; - auto dict_length = static_cast(memo_table.size()) - start_offset; - // This makes a copy, but we assume a dictionary array is usually small - // compared to the size of the dictionary-using array. 
- // (also, copying the dictionary values is cheap compared to the cost - // of building the memo table) - RETURN_NOT_OK( - AllocateBuffer(pool, TypeTraits::bytes_required(dict_length), &dict_buffer)); - memo_table.CopyValues(static_cast(start_offset), - reinterpret_cast(dict_buffer->mutable_data())); - *out = ArrayData::Make(type, dict_length, {nullptr, dict_buffer}, 0 /* null_count */); - return Status::OK(); - } }; template diff --git a/cpp/src/arrow/util/int-util-test.cc b/cpp/src/arrow/util/int-util-test.cc index 018eeda7248a3..5eba531d874e0 100644 --- a/cpp/src/arrow/util/int-util-test.cc +++ b/cpp/src/arrow/util/int-util-test.cc @@ -373,5 +373,14 @@ TEST(IntWidth, NullsMany) { } } +TEST(TransposeInts, Int8ToInt64) { + std::vector src = {1, 3, 5, 0, 3, 2}; + std::vector transpose_map = {1111, 2222, 3333, 4444, 5555, 6666, 7777}; + std::vector dest(src.size()); + + TransposeInts(src.data(), dest.data(), 6, transpose_map.data()); + ASSERT_EQ(dest, std::vector({2222, 4444, 6666, 1111, 4444, 3333})); +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/int-util.cc b/cpp/src/arrow/util/int-util.cc index ced1cd1c20da2..d81044b3cafdc 100644 --- a/cpp/src/arrow/util/int-util.cc +++ b/cpp/src/arrow/util/int-util.cc @@ -402,5 +402,45 @@ void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length) { memcpy(dest, source, length * sizeof(int64_t)); } +template +void TransposeInts(const InputInt* src, OutputInt* dest, int64_t length, + const int32_t* transpose_map) { + while (length >= 4) { + dest[0] = static_cast(transpose_map[src[0]]); + dest[1] = static_cast(transpose_map[src[1]]); + dest[2] = static_cast(transpose_map[src[2]]); + dest[3] = static_cast(transpose_map[src[3]]); + length -= 4; + src += 4; + dest += 4; + } + while (length > 0) { + *dest++ = static_cast(transpose_map[*src++]); + --length; + } +} + +#define INSTANTIATE(SRC, DEST) \ + template ARROW_EXPORT void TransposeInts( \ + const SRC* source, DEST* dest, int64_t length, const int32_t* transpose_map); + +#define INSTANTIATE_ALL_DEST(DEST) \ + INSTANTIATE(int8_t, DEST) \ + INSTANTIATE(int16_t, DEST) \ + INSTANTIATE(int32_t, DEST) \ + INSTANTIATE(int64_t, DEST) + +#define INSTANTIATE_ALL() \ + INSTANTIATE_ALL_DEST(int8_t) \ + INSTANTIATE_ALL_DEST(int16_t) \ + INSTANTIATE_ALL_DEST(int32_t) \ + INSTANTIATE_ALL_DEST(int64_t) + +INSTANTIATE_ALL() + +#undef INSTANTIATE +#undef INSTANTIATE_ALL +#undef INSTANTIATE_ALL_DEST + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/int-util.h b/cpp/src/arrow/util/int-util.h index 68355d34549ac..66d389e5f40cf 100644 --- a/cpp/src/arrow/util/int-util.h +++ b/cpp/src/arrow/util/int-util.h @@ -63,6 +63,10 @@ void DowncastUInts(const uint64_t* source, uint32_t* dest, int64_t length); ARROW_EXPORT void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length); +template +ARROW_EXPORT void TransposeInts(const InputInt* source, OutputInt* dest, int64_t length, + const int32_t* transpose_map); + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h index b6fc1f1ff2bfb..a5deaa7a1d22c 100644 --- a/cpp/src/arrow/visitor_inline.h +++ b/cpp/src/arrow/visitor_inline.h @@ -121,7 +121,7 @@ inline Status VisitArrayInline(const Array& array, VISITOR* visitor) { // The scalar value's type depends on the array data type: // - the type's `c_type`, if any // - for boolean arrays, a `bool` -// - for binary, string and fixed-size binary arrars, a `util::string_view` +// - 
for binary, string and fixed-size binary arrays, a `util::string_view` template struct ArrayDataVisitor {}; diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 310656d86fd47..af2d1139c43fe 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -303,8 +303,8 @@ def test_dictionary_type(): assert ty0.dictionary.to_pylist() == ['a', 'b', 'c'] assert ty0.ordered is False - ty1 = pa.dictionary(pa.float32(), pa.array([1.0, 2.0]), ordered=True) - assert ty1.index_type == pa.float32() + ty1 = pa.dictionary(pa.int8(), pa.array([1.0, 2.0]), ordered=True) + assert ty1.index_type == pa.int8() assert isinstance(ty0.dictionary, pa.Array) assert ty1.dictionary.to_pylist() == [1.0, 2.0] assert ty1.ordered is True From e39e36441b94f211a57685887d14a8ff1d1b5f98 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 19 Dec 2018 15:51:08 -0600 Subject: [PATCH 084/328] ARROW-3545: [C++/Python] Use "field" terminology with StructType, specify behavior with duplicate field names Author: Wes McKinney Closes #3220 from wesm/ARROW-3545 and squashes the following commits: dc212e61c Fix more deprecated API uses 16e198473 Remove field_by_name/field APIs from Python bindings, cdef only 3c4abed05 Fix use of deprecated APIs 2eecdbf57 Rename GetChildIndex, GetChildByName for better semantic consistency. Define behavior of these functions when there are duplicate field names. Reflect changes in Python --- cpp/src/arrow/array.cc | 2 +- cpp/src/arrow/type-test.cc | 30 ++++++++++++++++--------- cpp/src/arrow/type.cc | 33 +++++++++++++++++++++++++--- cpp/src/arrow/type.h | 9 +++++++- python/pyarrow/includes/libarrow.pxd | 4 ++-- python/pyarrow/lib.pxd | 3 ++- python/pyarrow/scalar.pxi | 2 +- python/pyarrow/tests/test_types.py | 17 ++++++++++---- python/pyarrow/types.pxi | 12 +++++++--- 9 files changed, 86 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 7e45e90d9c8f7..d07c27fe15906 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -395,7 +395,7 @@ std::shared_ptr StructArray::field(int i) const { } std::shared_ptr StructArray::GetFieldByName(const std::string& name) const { - int i = struct_type()->GetChildIndex(name); + int i = struct_type()->GetFieldIndex(name); return i == -1 ? nullptr : field(i); } diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc index 20b7aff884b7f..5b758d7a129fd 100644 --- a/cpp/src/arrow/type-test.cc +++ b/cpp/src/arrow/type-test.cc @@ -448,7 +448,7 @@ TEST(TestStructType, Basics) { // TODO(wesm): out of bounds for field(...) 
} -TEST(TestStructType, GetChildByName) { +TEST(TestStructType, GetFieldByName) { auto f0 = field("f0", int32()); auto f1 = field("f1", uint8(), false); auto f2 = field("f2", utf8()); @@ -457,17 +457,17 @@ TEST(TestStructType, GetChildByName) { StructType struct_type({f0, f1, f2, f3}); std::shared_ptr result; - result = struct_type.GetChildByName("f1"); + result = struct_type.GetFieldByName("f1"); ASSERT_EQ(f1, result); - result = struct_type.GetChildByName("f3"); + result = struct_type.GetFieldByName("f3"); ASSERT_EQ(f3, result); - result = struct_type.GetChildByName("not-found"); + result = struct_type.GetFieldByName("not-found"); ASSERT_EQ(result, nullptr); } -TEST(TestStructType, GetChildIndex) { +TEST(TestStructType, GetFieldIndex) { auto f0 = field("f0", int32()); auto f1 = field("f1", uint8(), false); auto f2 = field("f2", utf8()); @@ -475,11 +475,21 @@ TEST(TestStructType, GetChildIndex) { StructType struct_type({f0, f1, f2, f3}); - ASSERT_EQ(0, struct_type.GetChildIndex(f0->name())); - ASSERT_EQ(1, struct_type.GetChildIndex(f1->name())); - ASSERT_EQ(2, struct_type.GetChildIndex(f2->name())); - ASSERT_EQ(3, struct_type.GetChildIndex(f3->name())); - ASSERT_EQ(-1, struct_type.GetChildIndex("not-found")); + ASSERT_EQ(0, struct_type.GetFieldIndex(f0->name())); + ASSERT_EQ(1, struct_type.GetFieldIndex(f1->name())); + ASSERT_EQ(2, struct_type.GetFieldIndex(f2->name())); + ASSERT_EQ(3, struct_type.GetFieldIndex(f3->name())); + ASSERT_EQ(-1, struct_type.GetFieldIndex("not-found")); +} + +TEST(TestStructType, GetFieldIndexDuplicates) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", int64()); + auto f2 = field("f1", utf8()); + StructType struct_type({f0, f1, f2}); + + ASSERT_EQ(0, struct_type.GetFieldIndex("f0")); + ASSERT_EQ(-1, struct_type.GetFieldIndex("f1")); } TEST(TestDictionaryType, Equals) { diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 753cb65ff26da..ee7fda7c8c8f4 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -232,18 +232,37 @@ std::string StructType::ToString() const { return s.str(); } -std::shared_ptr StructType::GetChildByName(const std::string& name) const { - int i = GetChildIndex(name); +std::shared_ptr StructType::GetFieldByName(const std::string& name) const { + int i = GetFieldIndex(name); return i == -1 ? nullptr : children_[i]; } -int StructType::GetChildIndex(const std::string& name) const { +int StructType::GetFieldIndex(const std::string& name) const { if (children_.size() > 0 && name_to_index_.size() == 0) { for (size_t i = 0; i < children_.size(); ++i) { name_to_index_[children_[i]->name()] = static_cast(i); } } + if (name_to_index_.size() < children_.size()) { + // There are duplicate field names. 
Refuse to guess + int counts = 0; + int last_observed_index = -1; + for (size_t i = 0; i < children_.size(); ++i) { + if (children_[i]->name() == name) { + ++counts; + last_observed_index = static_cast(i); + } + } + + if (counts == 1) { + return last_observed_index; + } else { + // Duplicate or not found + return -1; + } + } + auto it = name_to_index_.find(name); if (it == name_to_index_.end()) { return -1; @@ -252,6 +271,14 @@ int StructType::GetChildIndex(const std::string& name) const { } } +std::shared_ptr StructType::GetChildByName(const std::string& name) const { + return GetFieldByName(name); +} + +int StructType::GetChildIndex(const std::string& name) const { + return GetFieldIndex(name); +} + // ---------------------------------------------------------------------- // DictionaryType diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 8f6cfd6ced4ff..95b5189de0343 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -516,9 +516,16 @@ class ARROW_EXPORT StructType : public NestedType { std::string name() const override { return "struct"; } /// Returns null if name not found + std::shared_ptr GetFieldByName(const std::string& name) const; + + /// Returns -1 if name not found or if there are multiple fields having the + /// same name + int GetFieldIndex(const std::string& name) const; + + ARROW_DEPRECATED("Use GetFieldByName") std::shared_ptr GetChildByName(const std::string& name) const; - /// Returns -1 if name not found + ARROW_DEPRECATED("Use GetChildIndex") int GetChildIndex(const std::string& name) const; private: diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 61517e4f09d21..f4629af0617fb 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -276,8 +276,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CStructType" arrow::StructType"(CDataType): CStructType(const vector[shared_ptr[CField]]& fields) - shared_ptr[CField] GetChildByName(const c_string& name) - int GetChildIndex(const c_string& name) + shared_ptr[CField] GetFieldByName(const c_string& name) + int GetFieldIndex(const c_string& name) cdef cppclass CUnionType" arrow::UnionType"(CDataType): CUnionType(const vector[shared_ptr[CField]]& fields, diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 3e628263ba36f..d829d6a0c50ad 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -65,7 +65,8 @@ cdef class StructType(DataType): cdef: const CStructType* struct_type - cdef Field child_by_name(self, name) + cdef Field field(self, int i) + cdef Field field_by_name(self, name) cdef class DictionaryType(DataType): diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index a2a133beb43f6..fd3f58072d452 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -470,7 +470,7 @@ cdef class StructValue(ArrayValue): int index type = self.type.type - index = type.GetChildIndex(tobytes(key)) + index = type.GetFieldIndex(tobytes(key)) if index < 0: raise KeyError(key) diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index af2d1139c43fe..729c76e1471f5 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -231,9 +231,12 @@ def test_list_type(): def test_struct_type(): - fields = [pa.field('a', pa.int64()), - pa.field('a', pa.int32()), - pa.field('b', pa.int32())] + fields = [ + # Duplicate field name on purpose + pa.field('a', pa.int64()), + pa.field('a', pa.int32()), + 
pa.field('b', pa.int32()) + ] ty = pa.struct(fields) assert len(ty) == ty.num_children == 3 @@ -243,11 +246,17 @@ def test_struct_type(): with pytest.raises(IndexError): assert ty[3] - assert ty['a'] == ty[1] assert ty['b'] == ty[2] + + # Duplicate + with pytest.raises(KeyError): + ty['a'] + + # Not found with pytest.raises(KeyError): ty['c'] + # Neither integer nor string with pytest.raises(TypeError): ty[None] diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index d367a8a85673f..29b2a1ea3c9a0 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -213,13 +213,19 @@ cdef class StructType(DataType): DataType.init(self, type) self.struct_type = type.get() - cdef Field child_by_name(self, name): + cdef Field field(self, int i): + """ + Alias for child(i) + """ + return self.child(i) + + cdef Field field_by_name(self, name): """ Access a child field by its name rather than the column index. """ cdef shared_ptr[CField] field - field = self.struct_type.GetChildByName(tobytes(name)) + field = self.struct_type.GetFieldByName(tobytes(name)) if field == nullptr: raise KeyError(name) @@ -234,7 +240,7 @@ cdef class StructType(DataType): def __getitem__(self, i): if isinstance(i, six.string_types): - return self.child_by_name(i) + return self.field_by_name(i) elif isinstance(i, six.integer_types): return self.child(i) else: From cca9d2866508030f0db6999ff3ce6d39be393bb9 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 19 Dec 2018 23:15:00 +0100 Subject: [PATCH 085/328] ARROW-3620: [Python] Document pa.cpu_count() in Sphinx API docs Author: Antoine Pitrou Closes #3224 from pitrou/ARROW-3620-document-cpu-count and squashes the following commits: 15fda9ba ARROW-3620: Document pa.cpu_count() in Sphinx API docs --- docs/source/python/api.rst | 47 +++++++++++++++++++++++--------------- docs/source/python/csv.rst | 4 ++++ python/pyarrow/lib.pyx | 7 +++--- 3 files changed, 37 insertions(+), 21 deletions(-) diff --git a/docs/source/python/api.rst b/docs/source/python/api.rst index 064a3e9740543..40ccb68c36f38 100644 --- a/docs/source/python/api.rst +++ b/docs/source/python/api.rst @@ -282,21 +282,8 @@ Serialization and IPC SerializedPyObject SerializationContext -.. _api.feather: - -Feather Format -~~~~~~~~~~~~~~ - -.. currentmodule:: pyarrow.feather - .. _api.memory_pool: -.. autosummary:: - :toctree: generated/ - - read_feather - write_feather - Memory Pools ------------ @@ -329,8 +316,8 @@ Type Classes .. _api.plasma: -In-Memory Object Store ----------------------- +Plasma In-Memory Object Store +----------------------------- .. autosummary:: :toctree: generated/ @@ -354,12 +341,27 @@ CSV Files ConvertOptions read_csv -.. currentmodule:: pyarrow.parquet +.. _api.feather: + +Feather Files +------------- + +.. currentmodule:: pyarrow.feather + +.. autosummary:: + :toctree: generated/ + + read_feather + write_feather + +.. currentmodule:: pyarrow .. _api.parquet: -Apache Parquet --------------- +Parquet Files +------------- + +.. currentmodule:: pyarrow.parquet .. autosummary:: :toctree: generated/ @@ -377,6 +379,15 @@ Apache Parquet .. currentmodule:: pyarrow +Multi-Threading +--------------- + +.. autosummary:: + :toctree: generated/ + + cpu_count + set_cpu_count + Using with C extensions ----------------------- diff --git a/docs/source/python/csv.rst b/docs/source/python/csv.rst index f1bcea9e24795..17023b1610d48 100644 --- a/docs/source/python/csv.rst +++ b/docs/source/python/csv.rst @@ -86,3 +86,7 @@ overhead of reading CSV files. 
Performance options can be controlled through the :class:`ReadOptions` class. Multi-threaded reading is the default for highest performance, distributing the workload efficiently over all available cores. + +.. note:: + The number of threads to use concurrently is automatically inferred by Arrow + and can be inspected using the :func:`~pyarrow.cpu_count()` function. diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 9c661dbc3554a..3fe879a319668 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -42,9 +42,10 @@ def cpu_count(): Return the number of threads to use in parallel operations. The number of threads is determined at startup by inspecting the - OMP_NUM_THREADS and OMP_THREAD_LIMIT environment variables. If neither - is present, it will default to the number of hardware threads on the - system. It can be modified at runtime by calling set_cpu_count(). + ``OMP_NUM_THREADS`` and ``OMP_THREAD_LIMIT`` environment variables. + If neither is present, it will default to the number of hardware threads + on the system. It can be modified at runtime by calling + :func:`set_cpu_count()`. """ return GetCpuThreadPoolCapacity() From 1a8c8f0b2aae01fe8357980f1f4e5c879941c0eb Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 19 Dec 2018 18:01:19 -0700 Subject: [PATCH 086/328] ARROW-4038: [Rust] Implement boolean AND, OR, NOT array ops - Implements boolean AND, OR, NOT operations in `array_ops` - Removes all uses of `unwrap()` in array_ops and replaces with `?` - Improve error messages Author: Andy Grove Closes #3189 from andygrove/ARROW-4038 and squashes the following commits: 69518d7 add tests a38d9a9 add docs for all array_ops and add explicit handling for case where both sides are null 661e2af improve error message 36b9171 Implement boolean AND, OR, NOT operations, remove unwraps and improve error message --- rust/src/array_ops.rs | 175 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 169 insertions(+), 6 deletions(-) diff --git a/rust/src/array_ops.rs b/rust/src/array_ops.rs index e73a858e951b1..59145754f0248 100644 --- a/rust/src/array_ops.rs +++ b/rust/src/array_ops.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Defines primitive computations on arrays + use std::ops::{Add, Div, Mul, Sub}; use num::Zero; @@ -25,6 +27,7 @@ use crate::datatypes; use crate::datatypes::ArrowNumericType; use crate::error::{ArrowError, Result}; +/// Perform `left + right` operation on two arrays. If either left or right value is null then the result is also null. pub fn add(left: &PrimitiveArray, right: &PrimitiveArray) -> Result> where T: datatypes::ArrowNumericType, @@ -37,6 +40,7 @@ where math_op(left, right, |a, b| Ok(a + b)) } +/// Perform `left - right` operation on two arrays. If either left or right value is null then the result is also null. pub fn subtract(left: &PrimitiveArray, right: &PrimitiveArray) -> Result> where T: datatypes::ArrowNumericType, @@ -49,6 +53,7 @@ where math_op(left, right, |a, b| Ok(a - b)) } +/// Perform `left * right` operation on two arrays. If either left or right value is null then the result is also null. pub fn multiply(left: &PrimitiveArray, right: &PrimitiveArray) -> Result> where T: datatypes::ArrowNumericType, @@ -61,6 +66,8 @@ where math_op(left, right, |a, b| Ok(a * b)) } +/// Perform `left / right` operation on two arrays. If either left or right value is null then the result is also null. 
+/// If any right hand value is zero then the result of this operation will be `Err(ArrowError::DivideByZero)`. pub fn divide(left: &PrimitiveArray, right: &PrimitiveArray) -> Result> where T: datatypes::ArrowNumericType, @@ -79,6 +86,8 @@ where }) } +/// Helper function to perform math lambda function on values from two arrays. If either left or +/// right value is null then the output value is also null, so `1 + null` is `null`. fn math_op( left: &PrimitiveArray, right: &PrimitiveArray, @@ -90,16 +99,16 @@ where { if left.len() != right.len() { return Err(ArrowError::ComputeError( - "Cannot perform math operation on two batches of different length".to_string(), + "Cannot perform math operation on arrays of different length".to_string(), )); } let mut b = PrimitiveArrayBuilder::::new(left.len()); for i in 0..left.len() { let index = i; if left.is_null(i) || right.is_null(i) { - b.push_null().unwrap(); + b.push_null()?; } else { - b.push(op(left.value(index), right.value(index))?).unwrap(); + b.push(op(left.value(index), right.value(index))?)?; } } Ok(b.finish()) @@ -121,6 +130,7 @@ where min_max_helper(array, |a, b| a > b) } +/// Helper function to perform min/max lambda function on values from a numeric array. fn min_max_helper(array: &PrimitiveArray, cmp: F) -> Option where T: ArrowNumericType, @@ -145,6 +155,7 @@ where n } +/// Perform `left == right` operation on two arrays. pub fn eq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result where T: ArrowNumericType, @@ -152,6 +163,7 @@ where bool_op(left, right, |a, b| a == b) } +/// Perform `left != right` operation on two arrays. pub fn neq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result where T: ArrowNumericType, @@ -159,50 +171,59 @@ where bool_op(left, right, |a, b| a != b) } +/// Perform `left < right` operation on two arrays. Null values are less than non-null values. pub fn lt(left: &PrimitiveArray, right: &PrimitiveArray) -> Result where T: ArrowNumericType, { bool_op(left, right, |a, b| match (a, b) { + (None, None) => false, (None, _) => true, (_, None) => false, (Some(aa), Some(bb)) => aa < bb, }) } +/// Perform `left <= right` operation on two arrays. Null values are less than non-null values. pub fn lt_eq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result where T: ArrowNumericType, { bool_op(left, right, |a, b| match (a, b) { + (None, None) => true, (None, _) => true, (_, None) => false, (Some(aa), Some(bb)) => aa <= bb, }) } +/// Perform `left > right` operation on two arrays. Non-null values are greater than null values. pub fn gt(left: &PrimitiveArray, right: &PrimitiveArray) -> Result where T: ArrowNumericType, { bool_op(left, right, |a, b| match (a, b) { + (None, None) => false, (None, _) => false, (_, None) => true, (Some(aa), Some(bb)) => aa > bb, }) } +/// Perform `left >= right` operation on two arrays. Non-null values are greater than null values. pub fn gt_eq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result where T: ArrowNumericType, { bool_op(left, right, |a, b| match (a, b) { + (None, None) => true, (None, _) => false, (_, None) => true, (Some(aa), Some(bb)) => aa >= bb, }) } +/// Helper function to perform boolean lambda function on values from two arrays. 
fn bool_op(left: &PrimitiveArray, right: &PrimitiveArray, op: F) -> Result where T: ArrowNumericType, @@ -210,7 +231,7 @@ where { if left.len() != right.len() { return Err(ArrowError::ComputeError( - "Cannot perform math operation on two batches of different length".to_string(), + "Cannot perform math operation on arrays of different length".to_string(), )); } let mut b = BooleanArray::builder(left.len()); @@ -226,7 +247,56 @@ where } else { Some(right.value(index)) }; - b.push(op(l, r)).unwrap(); + b.push(op(l, r))?; + } + Ok(b.finish()) +} + +/// Perform `AND` operation on two arrays. If either left or right value is null then the result is also null. +pub fn and(left: &BooleanArray, right: &BooleanArray) -> Result { + if left.len() != right.len() { + return Err(ArrowError::ComputeError( + "Cannot perform boolean operation on arrays of different length".to_string(), + )); + } + let mut b = BooleanArray::builder(left.len()); + for i in 0..left.len() { + if left.is_null(i) || right.is_null(i) { + b.push_null()?; + } else { + b.push(left.value(i) && right.value(i))?; + } + } + Ok(b.finish()) +} + +/// Perform `OR` operation on two arrays. If either left or right value is null then the result is also null. +pub fn or(left: &BooleanArray, right: &BooleanArray) -> Result { + if left.len() != right.len() { + return Err(ArrowError::ComputeError( + "Cannot perform boolean operation on arrays of different length".to_string(), + )); + } + let mut b = BooleanArray::builder(left.len()); + for i in 0..left.len() { + if left.is_null(i) || right.is_null(i) { + b.push_null()?; + } else { + b.push(left.value(i) || right.value(i))?; + } + } + Ok(b.finish()) +} + +/// Perform unary `NOT` operation on an arrays. If value is null then the result is also null. +pub fn not(left: &BooleanArray) -> Result { + let mut b = BooleanArray::builder(left.len()); + for i in 0..left.len() { + if left.is_null(i) { + b.push_null()?; + } else { + b.push(!left.value(i))?; + } } Ok(b.finish()) } @@ -256,7 +326,7 @@ mod tests { .err() .expect("should have failed due to different lengths"); assert_eq!( - "ComputeError(\"Cannot perform math operation on two batches of different length\")", + "ComputeError(\"Cannot perform math operation on arrays of different length\")", format!("{:?}", e) ); } @@ -365,6 +435,16 @@ mod tests { assert_eq!(true, c.value(4)); } + #[test] + fn test_primitive_array_lt_nulls() { + let a = Int32Array::from(vec![None, None, Some(1)]); + let b = Int32Array::from(vec![None, Some(1), None]); + let c = lt(&a, &b).unwrap(); + assert_eq!(false, c.value(0)); + assert_eq!(true, c.value(1)); + assert_eq!(false, c.value(2)); + } + #[test] fn test_primitive_array_lt_eq() { let a = Int32Array::from(vec![8, 8, 8, 8, 8]); @@ -377,6 +457,16 @@ mod tests { assert_eq!(true, c.value(4)); } + #[test] + fn test_primitive_array_lt_eq_nulls() { + let a = Int32Array::from(vec![None, None, Some(1)]); + let b = Int32Array::from(vec![None, Some(1), None]); + let c = lt_eq(&a, &b).unwrap(); + assert_eq!(true, c.value(0)); + assert_eq!(true, c.value(1)); + assert_eq!(false, c.value(2)); + } + #[test] fn test_primitive_array_gt() { let a = Int32Array::from(vec![8, 8, 8, 8, 8]); @@ -389,6 +479,16 @@ mod tests { assert_eq!(false, c.value(4)); } + #[test] + fn test_primitive_array_gt_nulls() { + let a = Int32Array::from(vec![None, None, Some(1)]); + let b = Int32Array::from(vec![None, Some(1), None]); + let c = gt(&a, &b).unwrap(); + assert_eq!(false, c.value(0)); + assert_eq!(false, c.value(1)); + assert_eq!(true, c.value(2)); + } + 
#[test] fn test_primitive_array_gt_eq() { let a = Int32Array::from(vec![8, 8, 8, 8, 8]); @@ -401,6 +501,16 @@ mod tests { assert_eq!(false, c.value(4)); } + #[test] + fn test_primitive_array_gt_eq_nulls() { + let a = Int32Array::from(vec![None, None, Some(1)]); + let b = Int32Array::from(vec![None, Some(1), None]); + let c = gt_eq(&a, &b).unwrap(); + assert_eq!(true, c.value(0)); + assert_eq!(false, c.value(1)); + assert_eq!(true, c.value(2)); + } + #[test] fn test_buffer_array_min_max() { let a = Int32Array::from(vec![5, 6, 7, 8, 9]); @@ -415,4 +525,57 @@ mod tests { assert_eq!(9, max(&a).unwrap()); } + #[test] + fn test_bool_array_and() { + let a = BooleanArray::from(vec![false, false, true, true]); + let b = BooleanArray::from(vec![false, true, false, true]); + let c = and(&a, &b).unwrap(); + assert_eq!(false, c.value(0)); + assert_eq!(false, c.value(1)); + assert_eq!(false, c.value(2)); + assert_eq!(true, c.value(3)); + } + + #[test] + fn test_bool_array_or() { + let a = BooleanArray::from(vec![false, false, true, true]); + let b = BooleanArray::from(vec![false, true, false, true]); + let c = or(&a, &b).unwrap(); + assert_eq!(false, c.value(0)); + assert_eq!(true, c.value(1)); + assert_eq!(true, c.value(2)); + assert_eq!(true, c.value(3)); + } + + #[test] + fn test_bool_array_or_nulls() { + let a = BooleanArray::from(vec![None, Some(false), None, Some(false)]); + let b = BooleanArray::from(vec![None, None, Some(false), Some(false)]); + let c = or(&a, &b).unwrap(); + assert_eq!(true, c.is_null(0)); + assert_eq!(true, c.is_null(1)); + assert_eq!(true, c.is_null(2)); + assert_eq!(false, c.is_null(3)); + } + + #[test] + fn test_bool_array_not() { + let a = BooleanArray::from(vec![false, false, true, true]); + let c = not(&a).unwrap(); + assert_eq!(true, c.value(0)); + assert_eq!(true, c.value(1)); + assert_eq!(false, c.value(2)); + assert_eq!(false, c.value(3)); + } + + #[test] + fn test_bool_array_and_nulls() { + let a = BooleanArray::from(vec![None, Some(false), None, Some(false)]); + let b = BooleanArray::from(vec![None, None, Some(false), Some(false)]); + let c = and(&a, &b).unwrap(); + assert_eq!(true, c.is_null(0)); + assert_eq!(true, c.is_null(1)); + assert_eq!(true, c.is_null(2)); + assert_eq!(false, c.is_null(3)); + } } From 729cc3d3f31ebeeab9a86ec0ed59cf4000802135 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Thu, 20 Dec 2018 18:27:01 +0900 Subject: [PATCH 087/328] ARROW-4085: [GLib] Use "field" for struct data type Because C++ API is changed to use "field" by ARROW-3545. 
Author: Kouhei Sutou Closes #3229 from kou/glib-use-field and squashes the following commits: c078e31f Use "field" for struct data type --- c_glib/arrow-glib/composite-data-type.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c_glib/arrow-glib/composite-data-type.cpp b/c_glib/arrow-glib/composite-data-type.cpp index a4d3d843617a0..599506f269c8c 100644 --- a/c_glib/arrow-glib/composite-data-type.cpp +++ b/c_glib/arrow-glib/composite-data-type.cpp @@ -230,7 +230,7 @@ garrow_struct_data_type_get_field_by_name(GArrowStructDataType *data_type, auto arrow_struct_data_type = std::static_pointer_cast(arrow_data_type); - auto arrow_field = arrow_struct_data_type->GetChildByName(name); + auto arrow_field = arrow_struct_data_type->GetFieldByName(name); if (arrow_field) { return garrow_field_new_raw(&arrow_field); } else { @@ -256,7 +256,7 @@ garrow_struct_data_type_get_field_index(GArrowStructDataType *data_type, auto arrow_struct_data_type = std::static_pointer_cast(arrow_data_type); - return arrow_struct_data_type->GetChildIndex(name); + return arrow_struct_data_type->GetFieldIndex(name); } From c39db631f74e617b5317a64997364ea61c82c5f1 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 20 Dec 2018 08:36:54 -0600 Subject: [PATCH 088/328] ARROW-4082: [C++] Allow RelWithDebInfo, improve FindClangTools SetupCxxFlags.cmake does not list "RELWITHDEBINFO" in the final flag setup, so cmake will error out if that build config is selected. It's handy for quick debugging without switching your python build etc over to "DEBUG". FindClangTools.cmake could check the version of 'clang-format' (no version suffix) to see if it satisfies a version requirement. Also the doccomment lists the incorrect variable name for the hint path Author: Benjamin Kietzman Closes #3227 from bkietz/ARROW-4082-tweak-cmake and squashes the following commits: 15526cf01 allow RelWithDebInfo, improve FindClangTools --- cpp/README.md | 6 ++++++ cpp/cmake_modules/FindClangTools.cmake | 29 ++++++++++++++++++++++---- cpp/cmake_modules/SetupCxxFlags.cmake | 1 + 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/cpp/README.md b/cpp/README.md index 5940db1f44301..b602bef1c7710 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -428,6 +428,12 @@ You may find the required packages at http://releases.llvm.org/download.html or use the Debian/Ubuntu APT repositories on https://apt.llvm.org/. On macOS with [Homebrew][1] you can get it via `brew install llvm@6`. +Depending on how you installed clang-format, the build system may not be able +to find it. You can provide an explicit path to your LLVM installation (or the +root path for the clang tools) with the environment variable +`$CLANG_TOOLS_PATH` or by passing `-DClangTools_PATH=$PATH_TO_CLANG_TOOLS` when +invoking CMake. 
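As a concrete illustration of the two options just described (the LLVM install prefix below is only an example and will differ per platform and package manager):

    # hint via environment variable
    export CLANG_TOOLS_PATH=/usr/local/opt/llvm@6/bin
    cmake ..

    # or hint via a CMake variable
    cmake -DClangTools_PATH=/usr/local/opt/llvm@6/bin ..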
+ ## Checking for ABI and API stability To build ABI compliance reports, you need to install the two tools diff --git a/cpp/cmake_modules/FindClangTools.cmake b/cpp/cmake_modules/FindClangTools.cmake index 2ddf7880ceb43..62ee8c3b6dd4a 100644 --- a/cpp/cmake_modules/FindClangTools.cmake +++ b/cpp/cmake_modules/FindClangTools.cmake @@ -20,7 +20,7 @@ # Variables used by this module, they can change the default behaviour and need # to be set before calling find_package: # -# ClangToolsBin_HOME - +# ClangTools_PATH - # When set, this path is inspected instead of standard library binary locations # to find clang-tidy and clang-format # @@ -75,10 +75,11 @@ if (CLANG_FORMAT_VERSION) ) # If not found yet, search alternative locations - if (("${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND") AND APPLE) + if ("${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND") + STRING(REGEX REPLACE "^([0-9]+)\\.[0-9]+" "\\1" CLANG_FORMAT_MAJOR_VERSION "${CLANG_FORMAT_VERSION}") + STRING(REGEX REPLACE "^[0-9]+\\.([0-9]+)" "\\1" CLANG_FORMAT_MINOR_VERSION "${CLANG_FORMAT_VERSION}") + if (APPLE) # Homebrew ships older LLVM versions in /usr/local/opt/llvm@version/ - STRING(REGEX REPLACE "^([0-9]+)\\.[0-9]+" "\\1" CLANG_FORMAT_MAJOR_VERSION "${CLANG_FORMAT_VERSION}") - STRING(REGEX REPLACE "^[0-9]+\\.([0-9]+)" "\\1" CLANG_FORMAT_MINOR_VERSION "${CLANG_FORMAT_VERSION}") if ("${CLANG_FORMAT_MINOR_VERSION}" STREQUAL "0") find_program(CLANG_FORMAT_BIN NAMES clang-format @@ -102,7 +103,27 @@ if (CLANG_FORMAT_VERSION) NO_DEFAULT_PATH ) endif() + else() + # try searching for "clang-format" and check the version + find_program(CLANG_FORMAT_BIN + NAMES clang-format + PATHS + ${ClangTools_PATH} + $ENV{CLANG_TOOLS_PATH} + /usr/local/bin /usr/bin + NO_DEFAULT_PATH + ) + if (NOT ("${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND")) + execute_process(COMMAND ${CLANG_FORMAT_BIN} "-version" + OUTPUT_VARIABLE CLANG_FORMAT_FOUND_VERSION_MESSAGE + OUTPUT_STRIP_TRAILING_WHITESPACE) + if (NOT ("${CLANG_FORMAT_FOUND_VERSION_MESSAGE}" MATCHES "^clang-format version ${CLANG_FORMAT_MAJOR_VERSION}\\.${CLANG_FORMAT_MINOR_VERSION}.*")) + set(CLANG_FORMAT_BIN "CLANG_FORMAT_BIN-NOTFOUND") + endif() + endif() + endif() endif() + else() find_program(CLANG_FORMAT_BIN NAMES clang-format-4.0 diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 61fd14ca2cf46..11608350c5f7a 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -365,6 +365,7 @@ message("Configured for ${CMAKE_BUILD_TYPE} build (set with cmake -DCMAKE_BUILD_ if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_FLAGS_DEBUG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_DEBUG}") +elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "RELWITHDEBINFO") elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "FASTDEBUG") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_FLAGS_FASTDEBUG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_FASTDEBUG}") From ce9c6e3914274dcaf7806159ea5373e0cb632727 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Thu, 20 Dec 2018 08:51:31 -0600 Subject: [PATCH 089/328] ARROW-4084: [C++] Make Status static method support variadic arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Static constructors like `Status::Invalid` now supports variadic arguments à la `Status::Invalid("my", variable, "error message: ", i)`. 
- A new macro was added `ARROW_RETURN_IF(cond, status)` which replaces the previous `ARROW_RETURN_IF_FALSE` but also adds branch prediction hints. Note that only gandiva was refactored with this macro as otherwise the code review would have exploded. - Fixed a bug in memory map implementations not checking the return code of `mmap` and `mremap`. Author: François Saint-Jacques Closes #3228 from fsaintjacques/ARROW-4084-variadic-status-message and squashes the following commits: a877ab994 Travis 890df68f9 Remove gandiva expect string message testing 71ecbae7d Use perfect forwarding. 774bf9387 Add missing string header bf5cdfe06 Removed code printing in status 1d1db49c4 Reformat d9fcad919 ARROW-4084: Make Status static method support variadic arguments --- cpp/src/arrow/adapters/orc/adapter.cc | 22 +-- cpp/src/arrow/array.cc | 60 +++---- cpp/src/arrow/array/builder_binary.cc | 21 +-- cpp/src/arrow/array/builder_nested.cc | 12 +- cpp/src/arrow/builder.cc | 5 +- cpp/src/arrow/compute/kernels/cast.cc | 51 +++--- cpp/src/arrow/compute/kernels/hash.cc | 8 +- cpp/src/arrow/csv/converter.cc | 23 +-- cpp/src/arrow/csv/parser.cc | 4 +- cpp/src/arrow/csv/reader.cc | 6 +- .../arrow/dbi/hiveserver2/hiveserver2-test.cc | 6 +- cpp/src/arrow/dbi/hiveserver2/service.cc | 4 +- .../arrow/dbi/hiveserver2/thrift-internal.cc | 6 +- cpp/src/arrow/flight/internal.cc | 11 +- cpp/src/arrow/gpu/cuda_arrow_ipc.cc | 5 +- cpp/src/arrow/gpu/cuda_common.h | 16 +- cpp/src/arrow/io/file-test.cc | 4 +- cpp/src/arrow/io/file.cc | 4 +- cpp/src/arrow/io/hdfs-internal.cc | 8 +- cpp/src/arrow/io/hdfs.cc | 34 ++-- cpp/src/arrow/ipc/dictionary.cc | 8 +- cpp/src/arrow/ipc/feather.cc | 4 +- cpp/src/arrow/ipc/json-integration-test.cc | 10 +- cpp/src/arrow/ipc/json-internal.cc | 42 ++--- cpp/src/arrow/ipc/json-internal.h | 63 +++---- cpp/src/arrow/ipc/json-simple.cc | 42 ++--- cpp/src/arrow/ipc/message.cc | 31 ++-- cpp/src/arrow/ipc/metadata-internal.cc | 8 +- cpp/src/arrow/ipc/reader.cc | 29 +-- cpp/src/arrow/memory_pool.cc | 20 +-- cpp/src/arrow/python/arrow_to_pandas.cc | 114 +++++------- cpp/src/arrow/python/common.h | 6 +- cpp/src/arrow/python/decimal.cc | 8 +- cpp/src/arrow/python/helpers.cc | 24 +-- cpp/src/arrow/python/inference.cc | 17 +- cpp/src/arrow/python/numpy-internal.h | 5 +- cpp/src/arrow/python/numpy_convert.cc | 12 +- cpp/src/arrow/python/numpy_to_arrow.cc | 19 +- cpp/src/arrow/python/python_to_arrow.cc | 15 +- cpp/src/arrow/python/serialize.cc | 7 +- cpp/src/arrow/python/util/datetime.h | 4 +- cpp/src/arrow/record_batch.cc | 26 ++- cpp/src/arrow/status.cc | 1 + cpp/src/arrow/status.h | 155 +++++++++------- cpp/src/arrow/table.cc | 52 ++---- cpp/src/arrow/util/compression_brotli.cc | 4 +- cpp/src/arrow/util/compression_lz4.cc | 16 +- cpp/src/arrow/util/compression_snappy.cc | 6 +- cpp/src/arrow/util/compression_zlib.cc | 54 ++---- cpp/src/arrow/util/compression_zstd.cc | 4 +- cpp/src/arrow/util/decimal.cc | 20 +-- cpp/src/arrow/util/decimal.h | 5 +- cpp/src/arrow/util/io-util.cc | 54 +++--- cpp/src/arrow/util/string_builder.h | 51 ++++++ cpp/src/gandiva/date_utils.cc | 19 +- cpp/src/gandiva/engine.cc | 29 ++- cpp/src/gandiva/expr_validator.cc | 168 ++++++++---------- cpp/src/gandiva/filter.cc | 59 +++--- cpp/src/gandiva/like_holder.cc | 37 ++-- cpp/src/gandiva/llvm_generator.cc | 38 ++-- cpp/src/gandiva/projector.cc | 120 +++++-------- cpp/src/gandiva/regex_util.cc | 14 +- cpp/src/gandiva/selection_vector.cc | 82 ++++----- .../tests/projector_build_validation_test.cc | 13 +- cpp/src/parquet/arrow/reader.cc | 11 +- 
cpp/src/parquet/arrow/schema.cc | 28 ++- cpp/src/parquet/arrow/writer.cc | 11 +- cpp/src/plasma/io.cc | 10 +- 68 files changed, 763 insertions(+), 1122 deletions(-) create mode 100644 cpp/src/arrow/util/string_builder.h diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index de803d5ba6f03..01fc09afb0c92 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -206,11 +206,7 @@ Status GetArrowType(const liborc::Type* type, std::shared_ptr* out) { *out = union_(fields, type_codes); break; } - default: { - std::stringstream ss; - ss << "Unknown Orc type kind: " << kind; - return Status::Invalid(ss.str()); - } + default: { return Status::Invalid("Unknown Orc type kind: ", kind); } } return Status::OK(); } @@ -346,11 +342,9 @@ class ORCFileReader::Impl { } Status SelectStripe(liborc::RowReaderOptions* opts, int64_t stripe) { - if (stripe < 0 || stripe >= NumberOfStripes()) { - std::stringstream ss; - ss << "Out of bounds stripe: " << stripe; - return Status::Invalid(ss.str()); - } + ARROW_RETURN_IF(stripe < 0 || stripe >= NumberOfStripes(), + Status::Invalid("Out of bounds stripe: ", stripe)); + opts->range(stripes_[stripe].offset, stripes_[stripe].length); return Status::OK(); } @@ -359,9 +353,7 @@ class ORCFileReader::Impl { const std::vector& include_indices) { std::list include_indices_list; for (auto it = include_indices.begin(); it != include_indices.end(); ++it) { - if (*it < 0) { - return Status::Invalid("Negative field index"); - } + ARROW_RETURN_IF(*it < 0, Status::Invalid("Negative field index")); include_indices_list.push_back(*it); } opts->includeTypes(include_indices_list); @@ -455,9 +447,7 @@ class ORCFileReader::Impl { case liborc::DECIMAL: return AppendDecimalBatch(type, batch, offset, length, builder); default: - std::stringstream ss; - ss << "Not implemented type kind: " << kind; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Not implemented type kind: ", kind); } } diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index d07c27fe15906..66a685b45d315 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -638,9 +638,8 @@ Status DictionaryArray::FromArrays(const std::shared_ptr& type, is_valid = ValidateDictionaryIndices(indices, upper_bound); break; default: - std::stringstream ss; - ss << "Categorical index type not supported: " << indices->type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Categorical index type not supported: ", + indices->type()->ToString()); } if (!is_valid.ok()) { @@ -740,12 +739,11 @@ struct ValidateVisitor { Status Visit(const NullArray&) { return Status::OK(); } Status Visit(const PrimitiveArray& array) { - if (array.data()->buffers.size() != 2) { - return Status::Invalid("number of buffers was != 2"); - } - if (array.values() == nullptr) { - return Status::Invalid("values was null"); - } + ARROW_RETURN_IF(array.data()->buffers.size() != 2, + Status::Invalid("number of buffers was != 2")); + + ARROW_RETURN_IF(array.values() == nullptr, Status::Invalid("values was null")); + return Status::OK(); } @@ -776,10 +774,8 @@ struct ValidateVisitor { return Status::Invalid("value_offsets_ was null"); } if (value_offsets->size() / static_cast(sizeof(int32_t)) < array.length()) { - std::stringstream ss; - ss << "offset buffer size (bytes): " << value_offsets->size() - << " isn't large enough for length: " << array.length(); - return Status::Invalid(ss.str()); + return 
Status::Invalid("offset buffer size (bytes): ", value_offsets->size(), + " isn't large enough for length: ", array.length()); } if (!array.values()) { @@ -788,17 +784,13 @@ struct ValidateVisitor { const int32_t last_offset = array.value_offset(array.length()); if (array.values()->length() != last_offset) { - std::stringstream ss; - ss << "Final offset invariant not equal to values length: " << last_offset - << "!=" << array.values()->length(); - return Status::Invalid(ss.str()); + return Status::Invalid("Final offset invariant not equal to values length: ", + last_offset, "!=", array.values()->length()); } const Status child_valid = ValidateArray(*array.values()); if (!child_valid.ok()) { - std::stringstream ss; - ss << "Child array invalid: " << child_valid.ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Child array invalid: ", child_valid.ToString()); } int32_t prev_offset = array.value_offset(0); @@ -808,18 +800,14 @@ struct ValidateVisitor { for (int64_t i = 1; i <= array.length(); ++i) { int32_t current_offset = array.value_offset(i); if (array.IsNull(i - 1) && current_offset != prev_offset) { - std::stringstream ss; - ss << "Offset invariant failure at: " << i - << " inconsistent value_offsets for null slot" << current_offset - << "!=" << prev_offset; - return Status::Invalid(ss.str()); + return Status::Invalid("Offset invariant failure at: ", i, + " inconsistent value_offsets for null slot", + current_offset, "!=", prev_offset); } if (current_offset < prev_offset) { - std::stringstream ss; - ss << "Offset invariant failure: " << i - << " inconsistent offset for non-null slot: " << current_offset << "<" - << prev_offset; - return Status::Invalid(ss.str()); + return Status::Invalid("Offset invariant failure: ", i, + " inconsistent offset for non-null slot: ", current_offset, + "<", prev_offset); } prev_offset = current_offset; } @@ -842,18 +830,14 @@ struct ValidateVisitor { for (int i = 0; i < array.num_fields(); ++i) { auto it = array.field(i); if (it->length() != array_length) { - std::stringstream ss; - ss << "Length is not equal from field " << it->type()->ToString() - << " at position {" << idx << "}"; - return Status::Invalid(ss.str()); + return Status::Invalid("Length is not equal from field ", + it->type()->ToString(), " at position [", idx, "]"); } const Status child_valid = ValidateArray(*it); if (!child_valid.ok()) { - std::stringstream ss; - ss << "Child array invalid: " << child_valid.ToString() << " at position {" - << idx << "}"; - return Status::Invalid(ss.str()); + return Status::Invalid("Child array invalid: ", child_valid.ToString(), + " at position [", idx, "}"); } ++idx; } diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index ad6ba11a484d1..8739859310b10 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -59,21 +59,18 @@ Status BinaryBuilder::Resize(int64_t capacity) { } Status BinaryBuilder::ReserveData(int64_t elements) { - if (value_data_length() + elements > value_data_capacity()) { - if (value_data_length() + elements > kBinaryMemoryLimit) { - return Status::CapacityError( - "Cannot reserve capacity larger than 2^31 - 1 for binary"); - } - RETURN_NOT_OK(value_data_builder_.Reserve(elements)); - } - return Status::OK(); + const int64_t size = value_data_length() + elements; + ARROW_RETURN_IF( + size > kBinaryMemoryLimit, + Status::CapacityError("Cannot reserve capacity larger than 2^31 - 1 for binary")); + + return (size > 
value_data_capacity()) ? value_data_builder_.Reserve(elements) + : Status::OK(); } Status BinaryBuilder::AppendOverflow(int64_t num_bytes) { - std::stringstream ss; - ss << "BinaryArray cannot contain more than " << kBinaryMemoryLimit << " bytes, have " - << num_bytes; - return Status::CapacityError(ss.str()); + return Status::CapacityError("BinaryArray cannot contain more than ", + kBinaryMemoryLimit, " bytes, have ", num_bytes); } Status BinaryBuilder::FinishInternal(std::shared_ptr* out) { diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index e73324323af3d..87c302a82cfe6 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -58,13 +58,11 @@ Status ListBuilder::AppendValues(const int32_t* offsets, int64_t length, } Status ListBuilder::AppendNextOffset() { - int64_t num_values = value_builder_->length(); - if (ARROW_PREDICT_FALSE(num_values > kListMaximumElements)) { - std::stringstream ss; - ss << "ListArray cannot contain more then INT32_MAX - 1 child elements," - << " have " << num_values; - return Status::CapacityError(ss.str()); - } + const int64_t num_values = value_builder_->length(); + ARROW_RETURN_IF( + num_values > kListMaximumElements, + Status::CapacityError("ListArray cannot contain more then 2^31 - 1 child elements,", + " have ", num_values)); return offsets_builder_.Append(static_cast(num_values)); } diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index ff2b453bb4494..2072edc936a3c 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -93,9 +93,8 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, } default: { - std::stringstream ss; - ss << "MakeBuilder: cannot construct builder for type " << type->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("MakeBuilder: cannot construct builder for type ", + type->ToString()); } } } diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index b148486bd212f..2ce0702f20c32 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -508,11 +508,9 @@ void ShiftTime(FunctionContext* ctx, const CastOptions& options, const bool is_m out_data[i] = static_cast(in_data[i] / factor); } } else { -#define RAISE_INVALID_CAST(VAL) \ - std::stringstream ss; \ - ss << "Casting from " << input.type->ToString() << " to " << output->type->ToString() \ - << " would lose data: " << VAL; \ - ctx->SetStatus(Status::Invalid(ss.str())); +#define RAISE_INVALID_CAST(VAL) \ + ctx->SetStatus(Status::Invalid("Casting from ", input.type->ToString(), " to ", \ + output->type->ToString(), " would lose data: ", VAL)); if (input.null_count != 0) { internal::BitmapReader bit_reader(input.buffers[0]->data(), input.offset, @@ -795,9 +793,8 @@ struct CastFunctor< UnpackFixedSizeBinaryDictionary(ctx, indices, dictionary, output); break; default: - std::stringstream ss; - ss << "Invalid index type: " << indices.type()->ToString(); - ctx->SetStatus(Status::Invalid(ss.str())); + ctx->SetStatus( + Status::Invalid("Invalid index type: ", indices.type()->ToString())); return; } } @@ -874,9 +871,8 @@ struct CastFunctor(ctx, indices, dictionary, output))); break; default: - std::stringstream ss; - ss << "Invalid index type: " << indices.type()->ToString(); - ctx->SetStatus(Status::Invalid(ss.str())); + ctx->SetStatus( + Status::Invalid("Invalid index type: ", indices.type()->ToString())); return; } } @@ -932,9 +928,8 @@ struct 
CastFunctor(indices, dictionary, out); break; default: - std::stringstream ss; - ss << "Invalid index type: " << indices.type()->ToString(); - ctx->SetStatus(Status::Invalid(ss.str())); + ctx->SetStatus( + Status::Invalid("Invalid index type: ", indices.type()->ToString())); return; } } @@ -960,9 +955,8 @@ struct CastFunctor> { auto str = input_array.GetView(i); if (!converter(str.data(), str.length(), out_data)) { - std::stringstream ss; - ss << "Failed to cast String '" << str << "' into " << output->type->ToString(); - ctx->SetStatus(Status(StatusCode::Invalid, ss.str())); + ctx->SetStatus(Status::Invalid("Failed to cast String '", str, "' into ", + output->type->ToString())); return; } } @@ -991,10 +985,9 @@ struct CastFunctortype->ToString(); - ctx->SetStatus(Status(StatusCode::Invalid, ss.str())); + ctx->SetStatus(Status::Invalid("Failed to cast String '", + input_array.GetString(i), "' into ", + output->type->ToString())); return; } @@ -1029,9 +1022,8 @@ struct CastFunctor { const auto str = input_array.GetView(i); if (!converter(str.data(), str.length(), out_data)) { - std::stringstream ss; - ss << "Failed to cast String '" << str << "' into " << output->type->ToString(); - ctx->SetStatus(Status(StatusCode::Invalid, ss.str())); + ctx->SetStatus(Status::Invalid("Failed to cast String '", str, "' into ", + output->type->ToString())); return; } } @@ -1123,9 +1115,8 @@ static Status AllocateIfNotPreallocated(FunctionContext* ctx, const ArrayData& i if (!(is_primitive(type_id) || type_id == Type::FIXED_SIZE_BINARY || type_id == Type::DECIMAL)) { - std::stringstream ss; - ss << "Cannot pre-allocate memory for type: " << out->type->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Cannot pre-allocate memory for type: ", + out->type->ToString()); } if (type_id != Type::NA) { @@ -1400,10 +1391,8 @@ Status GetCastFunction(const DataType& in_type, const std::shared_ptr& break; } if (*kernel == nullptr) { - std::stringstream ss; - ss << "No cast implemented from " << in_type.ToString() << " to " - << out_type->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("No cast implemented from ", in_type.ToString(), " to ", + out_type->ToString()); } return Status::OK(); } diff --git a/cpp/src/arrow/compute/kernels/hash.cc b/cpp/src/arrow/compute/kernels/hash.cc index c057ea5736139..0513fe1f6ad4f 100644 --- a/cpp/src/arrow/compute/kernels/hash.cc +++ b/cpp/src/arrow/compute/kernels/hash.cc @@ -56,11 +56,9 @@ namespace compute { namespace { -#define CHECK_IMPLEMENTED(KERNEL, FUNCNAME, TYPE) \ - if (!KERNEL) { \ - std::stringstream ss; \ - ss << FUNCNAME << " not implemented for " << type->ToString(); \ - return Status::NotImplemented(ss.str()); \ +#define CHECK_IMPLEMENTED(KERNEL, FUNCNAME, TYPE) \ + if (!KERNEL) { \ + return Status::NotImplemented(FUNCNAME, " not implemented for ", type->ToString()); \ } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc index 8a249a68c07ec..1018f8553860e 100644 --- a/cpp/src/arrow/csv/converter.cc +++ b/cpp/src/arrow/csv/converter.cc @@ -40,10 +40,9 @@ namespace { Status GenericConversionError(const std::shared_ptr& type, const uint8_t* data, uint32_t size) { - std::stringstream ss; - ss << "CSV conversion error to " << type->ToString() << ": invalid value '" - << std::string(reinterpret_cast(data), size) << "'"; - return Status::Invalid(ss.str()); + return Status::Invalid("CSV conversion error 
to ", type->ToString(), + ": invalid value '", + std::string(reinterpret_cast(data), size), "'"); } inline bool IsWhitespace(uint8_t c) { @@ -214,9 +213,8 @@ class VarSizeBinaryConverter : public ConcreteConverter { auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status { if (CheckUTF8 && ARROW_PREDICT_FALSE(!util::ValidateUTF8(data, size))) { - std::stringstream ss; - ss << "CSV conversion error to " << type_->ToString() << ": invalid UTF8 data"; - return Status::Invalid(ss.str()); + return Status::Invalid("CSV conversion error to ", type_->ToString(), + ": invalid UTF8 data"); } builder.UnsafeAppend(data, size); return Status::OK(); @@ -256,10 +254,8 @@ Status FixedSizeBinaryConverter::Convert(const BlockParser& parser, int32_t col_ auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status { if (ARROW_PREDICT_FALSE(size != byte_width)) { - std::stringstream ss; - ss << "CSV conversion error to " << type_->ToString() << ": got a " << size - << "-byte long string"; - return Status::Invalid(ss.str()); + return Status::Invalid("CSV conversion error to ", type_->ToString(), ": got a ", + size, "-byte long string"); } return builder.Append(data); }; @@ -410,9 +406,8 @@ Status Converter::Make(const std::shared_ptr& type, break; default: { - std::stringstream ss; - ss << "CSV conversion to " << type->ToString() << " is not supported"; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("CSV conversion to ", type->ToString(), + " is not supported"); } #undef CONVERTER_CASE diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc index fe7f841f58328..b1d175adfb582 100644 --- a/cpp/src/arrow/csv/parser.cc +++ b/cpp/src/arrow/csv/parser.cc @@ -30,9 +30,7 @@ namespace arrow { namespace csv { static Status ParseError(const char* message) { - std::stringstream ss; - ss << "CSV parse error: " << message; - return Status::Invalid(ss.str()); + return Status::Invalid("CSV parse error: ", message); } static Status MismatchingColumns(int32_t expected, int32_t actual) { diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc index b2a6b7b430ad0..efd61167b71a5 100644 --- a/cpp/src/arrow/csv/reader.cc +++ b/cpp/src/arrow/csv/reader.cc @@ -355,10 +355,8 @@ class ThreadedTableReader : public BaseTableReader { chunk_size, &parsed_size)); if (parsed_size != chunk_size) { DCHECK_EQ(parsed_size, chunk_size); - std::stringstream ss; - ss << "Chunker and parser disagree on block size: " << chunk_size << " vs " - << parsed_size; - return Status::Invalid(ss.str()); + return Status::Invalid("Chunker and parser disagree on block size: ", + chunk_size, " vs ", parsed_size); } RETURN_NOT_OK(ProcessData(parser, chunk_index)); // Keep chunk buffer alive within closure and release it at the end diff --git a/cpp/src/arrow/dbi/hiveserver2/hiveserver2-test.cc b/cpp/src/arrow/dbi/hiveserver2/hiveserver2-test.cc index 7022ff017f48e..a7749161c4676 100644 --- a/cpp/src/arrow/dbi/hiveserver2/hiveserver2-test.cc +++ b/cpp/src/arrow/dbi/hiveserver2/hiveserver2-test.cc @@ -97,10 +97,8 @@ Status Wait(const std::unique_ptr& op, if (op_state == state) { return Status::OK(); } else { - std::stringstream ss; - ss << "Failed to reach state '" << OperationStateToString(state) << "' after " - << retries << " retries."; - return Status::IOError(ss.str()); + return Status::IOError("Failed to reach state '", OperationStateToString(state), + "' after ", retries, " retries"); } } diff --git a/cpp/src/arrow/dbi/hiveserver2/service.cc 
b/cpp/src/arrow/dbi/hiveserver2/service.cc index e2d3f2a21bf37..502a8a284b86f 100644 --- a/cpp/src/arrow/dbi/hiveserver2/service.cc +++ b/cpp/src/arrow/dbi/hiveserver2/service.cc @@ -92,9 +92,7 @@ Service::Service(const string& host, int port, int conn_timeout, Status Service::Open() { if (impl_->protocol_version < hs2::TProtocolVersion::HIVE_CLI_SERVICE_PROTOCOL_V6) { - std::stringstream ss; - ss << "Unsupported protocol: " << impl_->protocol_version; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unsupported protocol: ", impl_->protocol_version); } impl_->socket.reset(new TSocket(host_, port_)); diff --git a/cpp/src/arrow/dbi/hiveserver2/thrift-internal.cc b/cpp/src/arrow/dbi/hiveserver2/thrift-internal.cc index d154e143ba290..171eae36816e0 100644 --- a/cpp/src/arrow/dbi/hiveserver2/thrift-internal.cc +++ b/cpp/src/arrow/dbi/hiveserver2/thrift-internal.cc @@ -204,11 +204,7 @@ Status TStatusToStatus(const hs2::TStatus& tstatus) { return Status::IOError(tstatus.errorMessage); case hs2::TStatusCode::INVALID_HANDLE_STATUS: return Status::Invalid("Invalid handle"); - default: { - std::stringstream ss; - ss << "Unknown TStatusCode " << tstatus.statusCode; - return Status::UnknownError(ss.str()); - } + default: { return Status::UnknownError("Unknown TStatusCode ", tstatus.statusCode); } } } diff --git a/cpp/src/arrow/flight/internal.cc b/cpp/src/arrow/flight/internal.cc index 796e6095cdb7f..b4c6b2addcc11 100644 --- a/cpp/src/arrow/flight/internal.cc +++ b/cpp/src/arrow/flight/internal.cc @@ -37,16 +37,13 @@ Status FromGrpcStatus(const grpc::Status& grpc_status) { if (grpc_status.ok()) { return Status::OK(); } - std::stringstream ss; if (grpc_status.error_code() == grpc::StatusCode::UNIMPLEMENTED) { - ss << "gRPC returned unimplemented error, with message: " - << grpc_status.error_message(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("gRPC returned unimplemented error, with message: ", + grpc_status.error_message()); } else { - ss << "gRPC failed with error code " << grpc_status.error_code() - << " and message: " << grpc_status.error_message(); - return Status::IOError(ss.str()); + return Status::IOError("gRPC failed with error code ", grpc_status.error_code(), + " and message: ", grpc_status.error_message()); } } diff --git a/cpp/src/arrow/gpu/cuda_arrow_ipc.cc b/cpp/src/arrow/gpu/cuda_arrow_ipc.cc index 03256a1f52c70..b4d8744cb0bd0 100644 --- a/cpp/src/arrow/gpu/cuda_arrow_ipc.cc +++ b/cpp/src/arrow/gpu/cuda_arrow_ipc.cc @@ -82,9 +82,8 @@ Status ReadMessage(CudaBufferReader* reader, MemoryPool* pool, RETURN_NOT_OK(AllocateBuffer(pool, message_length, &metadata)); RETURN_NOT_OK(reader->Read(message_length, &bytes_read, metadata->mutable_data())); if (bytes_read != message_length) { - std::stringstream ss; - ss << "Expected " << message_length << " metadata bytes, but only got " << bytes_read; - return Status::IOError(ss.str()); + return Status::IOError("Expected ", message_length, " metadata bytes, but only got ", + bytes_read); } return ipc::Message::ReadFrom(metadata, reader, out); diff --git a/cpp/src/arrow/gpu/cuda_common.h b/cpp/src/arrow/gpu/cuda_common.h index a53dd220adda0..2b630c8114325 100644 --- a/cpp/src/arrow/gpu/cuda_common.h +++ b/cpp/src/arrow/gpu/cuda_common.h @@ -34,15 +34,13 @@ namespace cuda { (void)ret; \ } while (0) -#define CU_RETURN_NOT_OK(STMT) \ - do { \ - CUresult ret = (STMT); \ - if (ret != CUDA_SUCCESS) { \ - std::stringstream ss; \ - ss << "Cuda Driver API call in " << __FILE__ << " at line " << __LINE__ 
\ - << " failed with code " << ret << ": " << #STMT; \ - return Status::IOError(ss.str()); \ - } \ +#define CU_RETURN_NOT_OK(STMT) \ + do { \ + CUresult ret = (STMT); \ + if (ret != CUDA_SUCCESS) { \ + return Status::IOError("Cuda Driver API call in ", __FILE__, " at line ", \ + __LINE__, " failed with code ", ret, ": ", #STMT); \ + } \ } while (0) } // namespace cuda diff --git a/cpp/src/arrow/io/file-test.cc b/cpp/src/arrow/io/file-test.cc index 4d710d3470f5c..6d780c0940eba 100644 --- a/cpp/src/arrow/io/file-test.cc +++ b/cpp/src/arrow/io/file-test.cc @@ -460,9 +460,7 @@ class MyMemoryPool : public MemoryPool { *ptr = reinterpret_cast(std::realloc(*ptr, new_size)); if (*ptr == NULL) { - std::stringstream ss; - ss << "realloc of size " << new_size << " failed"; - return Status::OutOfMemory(ss.str()); + return Status::OutOfMemory("realloc of size ", new_size, " failed"); } return Status::OK(); diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc index 869d8e3720766..0398d5a1f9e80 100644 --- a/cpp/src/arrow/io/file.cc +++ b/cpp/src/arrow/io/file.cc @@ -479,9 +479,7 @@ class MemoryMappedFile::MemoryMap : public MutableBuffer { void* result = mmap(nullptr, static_cast(initial_size), prot_flags_, map_mode_, file_->fd(), 0); if (result == MAP_FAILED) { - std::stringstream ss; - ss << "Memory mapping file failed: " << std::strerror(errno); - return Status::IOError(ss.str()); + return Status::IOError("Memory mapping file failed: ", std::strerror(errno)); } size_ = capacity_ = initial_size; data_ = mutable_data_ = static_cast(result); diff --git a/cpp/src/arrow/io/hdfs-internal.cc b/cpp/src/arrow/io/hdfs-internal.cc index c8be5164cfa78..c273ab45f634f 100644 --- a/cpp/src/arrow/io/hdfs-internal.cc +++ b/cpp/src/arrow/io/hdfs-internal.cc @@ -218,9 +218,7 @@ static arrow::Status try_dlopen(std::vector potential_paths, const cha } if (out_handle == NULL) { - std::stringstream ss; - ss << "Unable to load " << name; - return arrow::Status::IOError(ss.str()); + return arrow::Status::IOError("Unable to load ", name); } return arrow::Status::OK(); @@ -243,9 +241,7 @@ static arrow::Status try_dlopen(std::vector potential_paths, const cha } if (out_handle == NULL) { - std::stringstream ss; - ss << "Unable to load " << name; - return arrow::Status::IOError(ss.str()); + return arrow::Status::IOError("Unable to load ", name); } return arrow::Status::OK(); diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index 030b84853da60..3e9b804ca233c 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -57,13 +57,11 @@ std::string TranslateErrno(int error_code) { } // namespace -#define CHECK_FAILURE(RETURN_VALUE, WHAT) \ - do { \ - if (RETURN_VALUE == -1) { \ - std::stringstream ss; \ - ss << "HDFS " << WHAT << " failed, errno: " << TranslateErrno(errno); \ - return Status::IOError(ss.str()); \ - } \ +#define CHECK_FAILURE(RETURN_VALUE, WHAT) \ + do { \ + if (RETURN_VALUE == -1) { \ + return Status::IOError("HDFS ", WHAT, " failed, errno: ", TranslateErrno(errno)); \ + } \ } while (0) static constexpr int kDefaultHdfsBufferSize = 1 << 16; @@ -466,10 +464,8 @@ class HadoopFileSystem::HadoopFileSystemImpl { if ((errno == 0) || (errno == ENOENT && Exists(path))) { num_entries = 0; } else { - std::stringstream ss; - ss << "HDFS list directory of " << path - << " failed, errno: " << TranslateErrno(errno); - return Status::IOError(ss.str()); + return Status::IOError("HDFS list directory failed, errno: ", + TranslateErrno(errno)); } } @@ -492,14 +488,9 @@ class 
HadoopFileSystem::HadoopFileSystemImpl { hdfsFile handle = driver_->OpenFile(fs_, path.c_str(), O_RDONLY, buffer_size, 0, 0); if (handle == nullptr) { - std::stringstream ss; - if (!Exists(path)) { - ss << "HDFS file does not exist: " << path; - } else { - // TODO(wesm): determine other causes of failure - ss << "HDFS path exists, but opening file failed: " << path; - } - return Status::IOError(ss.str()); + const char* msg = !Exists(path) ? "HDFS file does not exist: " + : "HDFS path exists, but opening file failed: "; + return Status::IOError(msg, path); } // std::make_shared does not work with private ctors @@ -521,10 +512,7 @@ class HadoopFileSystem::HadoopFileSystemImpl { static_cast(default_block_size)); if (handle == nullptr) { - // TODO(wesm): determine cause of failure - std::stringstream ss; - ss << "Unable to open file " << path; - return Status::IOError(ss.str()); + return Status::IOError("Unable to open file ", path); } // std::make_shared does not work with private ctors diff --git a/cpp/src/arrow/ipc/dictionary.cc b/cpp/src/arrow/ipc/dictionary.cc index 488bb75b9d75f..aa0d9085f5a8f 100644 --- a/cpp/src/arrow/ipc/dictionary.cc +++ b/cpp/src/arrow/ipc/dictionary.cc @@ -34,9 +34,7 @@ Status DictionaryMemo::GetDictionary(int64_t id, std::shared_ptr* dictionary) const { auto it = id_to_dictionary_.find(id); if (it == id_to_dictionary_.end()) { - std::stringstream ss; - ss << "Dictionary with id " << id << " not found"; - return Status::KeyError(ss.str()); + return Status::KeyError("Dictionary with id ", id, " not found"); } *dictionary = it->second; return Status::OK(); @@ -70,9 +68,7 @@ bool DictionaryMemo::HasDictionaryId(int64_t id) const { Status DictionaryMemo::AddDictionary(int64_t id, const std::shared_ptr& dictionary) { if (HasDictionaryId(id)) { - std::stringstream ss; - ss << "Dictionary with id " << id << " already exists"; - return Status::KeyError(ss.str()); + return Status::KeyError("Dictionary with id ", id, " already exists"); } intptr_t address = reinterpret_cast(dictionary.get()); id_to_dictionary_[id] = dictionary; diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index ebdb335fa57f7..b0ab62c678c72 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -642,9 +642,7 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { Status LoadArrayMetadata(const Array& values, ArrayMetadata* meta) { if (!(is_primitive(values.type_id()) || is_binary_like(values.type_id()))) { - std::stringstream ss; - ss << "Array is not primitive type: " << values.type()->ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Array is not primitive type: ", values.type()->ToString()); } meta->type = ToFlatbufferType(values.type_id()); diff --git a/cpp/src/arrow/ipc/json-integration-test.cc b/cpp/src/arrow/ipc/json-integration-test.cc index 914cdb66599f4..fe69a53a944c7 100644 --- a/cpp/src/arrow/ipc/json-integration-test.cc +++ b/cpp/src/arrow/ipc/json-integration-test.cc @@ -170,10 +170,8 @@ static Status ValidateArrowVsJson(const std::string& arrow_path, const int arrow_nbatches = arrow_reader->num_record_batches(); if (json_nbatches != arrow_nbatches) { - std::stringstream ss; - ss << "Different number of record batches: " << json_nbatches << " (JSON) vs " - << arrow_nbatches << " (Arrow)"; - return Status::Invalid(ss.str()); + return Status::Invalid("Different number of record batches: ", json_nbatches, + " (JSON) vs ", arrow_nbatches, " (Arrow)"); } std::shared_ptr arrow_batch; @@ -231,9 +229,7 @@ Status 
RunCommand(const std::string& json_path, const std::string& arrow_path, return ValidateArrowVsJson(arrow_path, json_path); } else { - std::stringstream ss; - ss << "Unknown command: " << command; - return Status::Invalid(ss.str()); + return Status::Invalid("Unknown command: ", command); } } diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index d5a5dd9f397db..05e547506c596 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -633,9 +633,7 @@ static Status GetInteger(const rj::Value::ConstObject& json_type, *type = is_signed ? int64() : uint64(); break; default: - std::stringstream ss; - ss << "Invalid bit width: " << bit_width; - return Status::Invalid(ss.str()); + return Status::Invalid("Invalid bit width: ", bit_width); } return Status::OK(); } @@ -654,9 +652,7 @@ static Status GetFloatingPoint(const RjObject& json_type, } else if (precision == "HALF") { *type = float16(); } else { - std::stringstream ss; - ss << "Invalid precision: " << precision; - return Status::Invalid(ss.str()); + return Status::Invalid("Invalid precision: ", precision); } return Status::OK(); } @@ -693,9 +689,7 @@ static Status GetDate(const RjObject& json_type, std::shared_ptr* type } else if (unit_str == "MILLISECOND") { *type = date64(); } else { - std::stringstream ss; - ss << "Invalid date unit: " << unit_str; - return Status::Invalid(ss.str()); + return Status::Invalid("Invalid date unit: ", unit_str); } return Status::OK(); } @@ -718,9 +712,7 @@ static Status GetTime(const RjObject& json_type, std::shared_ptr* type } else if (unit_str == "NANOSECOND") { *type = time64(TimeUnit::NANO); } else { - std::stringstream ss; - ss << "Invalid time unit: " << unit_str; - return Status::Invalid(ss.str()); + return Status::Invalid("Invalid time unit: ", unit_str); } const auto& fw_type = checked_cast(**type); @@ -749,9 +741,7 @@ static Status GetTimestamp(const RjObject& json_type, std::shared_ptr* } else if (unit_str == "NANOSECOND") { unit = TimeUnit::NANO; } else { - std::stringstream ss; - ss << "Invalid time unit: " << unit_str; - return Status::Invalid(ss.str()); + return Status::Invalid("Invalid time unit: ", unit_str); } const auto& it_tz = json_type.FindMember("timezone"); @@ -778,9 +768,7 @@ static Status GetUnion(const RjObject& json_type, } else if (mode_str == "DENSE") { mode = UnionMode::DENSE; } else { - std::stringstream ss; - ss << "Invalid union mode: " << mode_str; - return Status::Invalid(ss.str()); + return Status::Invalid("Invalid union mode: ", mode_str); } const auto& it_type_codes = json_type.FindMember("typeIds"); @@ -838,9 +826,7 @@ static Status GetType(const RjObject& json_type, } else if (type_name == "union") { return GetUnion(json_type, children, type); } else { - std::stringstream ss; - ss << "Unrecognized type name: " << type_name; - return Status::Invalid(ss.str()); + return Status::Invalid("Unrecognized type name: ", type_name); } return Status::OK(); } @@ -1235,10 +1221,8 @@ class ArrayReader { const auto& json_children_arr = json_children->value.GetArray(); if (type.num_children() != static_cast(json_children_arr.Size())) { - std::stringstream ss; - ss << "Expected " << type.num_children() << " children, but got " - << json_children_arr.Size(); - return Status::Invalid(ss.str()); + return Status::Invalid("Expected ", type.num_children(), " children, but got ", + json_children_arr.Size()); } for (int i = 0; i < static_cast(json_children_arr.Size()); ++i) { @@ -1342,9 +1326,7 @@ static Status 
ReadDictionary(const RjObject& obj, const DictionaryTypeMap& id_to auto it = id_to_field.find(id); if (it == id_to_field.end()) { - std::stringstream ss; - ss << "No dictionary with id " << id; - return Status::Invalid(ss.str()); + return Status::Invalid("No dictionary with id ", id); } std::vector> fields = {it->second}; @@ -1489,9 +1471,7 @@ Status ReadArray(MemoryPool* pool, const rj::Value& json_array, const Schema& sc } if (result == nullptr) { - std::stringstream ss; - ss << "Field named " << name << " not found in schema"; - return Status::KeyError(ss.str()); + return Status::KeyError("Field named ", name, " not found in schema"); } return ReadArray(pool, json_array, result->type(), array); diff --git a/cpp/src/arrow/ipc/json-internal.h b/cpp/src/arrow/ipc/json-internal.h index 5516e2dd72a2e..c8c724968f67c 100644 --- a/cpp/src/arrow/ipc/json-internal.h +++ b/cpp/src/arrow/ipc/json-internal.h @@ -49,56 +49,39 @@ using RjWriter = rj::Writer; using RjArray = rj::Value::ConstArray; using RjObject = rj::Value::ConstObject; -#define RETURN_NOT_FOUND(TOK, NAME, PARENT) \ - if (NAME == (PARENT).MemberEnd()) { \ - std::stringstream ss; \ - ss << "field " << TOK << " not found"; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_FOUND(TOK, NAME, PARENT) \ + if (NAME == (PARENT).MemberEnd()) { \ + return Status::Invalid("field ", TOK, " not found"); \ } -#define RETURN_NOT_STRING(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsString()) { \ - std::stringstream ss; \ - ss << "field was not a string" \ - << " line " << __LINE__; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_STRING(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsString()) { \ + return Status::Invalid("field was not a string line ", __LINE__); \ } -#define RETURN_NOT_BOOL(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsBool()) { \ - std::stringstream ss; \ - ss << "field was not a boolean" \ - << " line " << __LINE__; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_BOOL(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsBool()) { \ + return Status::Invalid("field was not a boolean line ", __LINE__); \ } -#define RETURN_NOT_INT(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsInt()) { \ - std::stringstream ss; \ - ss << "field was not an int" \ - << " line " << __LINE__; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_INT(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsInt()) { \ + return Status::Invalid("field was not an int line ", __LINE__); \ } -#define RETURN_NOT_ARRAY(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsArray()) { \ - std::stringstream ss; \ - ss << "field was not an array" \ - << " line " << __LINE__; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_ARRAY(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsArray()) { \ + return Status::Invalid("field was not an array line ", __LINE__); \ } -#define RETURN_NOT_OBJECT(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsObject()) { \ - std::stringstream ss; \ - ss << "field was not an object" \ - << " line " << __LINE__; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_OBJECT(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsObject()) { \ + return 
Status::Invalid("field was not an object line ", __LINE__); \ } namespace arrow { diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc index a8d120036e4f5..d812f841d9353 100644 --- a/cpp/src/arrow/ipc/json-simple.cc +++ b/cpp/src/arrow/ipc/json-simple.cc @@ -41,9 +41,7 @@ using ::arrow::internal::checked_cast; static constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag; static Status JSONTypeError(const char* expected_type, rj::Type json_type) { - std::stringstream ss; - ss << "Expected " << expected_type << " or null, got type " << json_type; - return Status::Invalid(ss.str()); + return Status::Invalid("Expected ", expected_type, " or null, got type ", json_type); } class Converter { @@ -184,9 +182,8 @@ class IntegerConverter final : public ConcreteConverter> if (v == v64) { return builder_->Append(v); } else { - std::stringstream ss; - ss << "Value " << v64 << " out of bounds for " << this->type_->ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Value ", v64, " out of bounds for ", + this->type_->ToString()); } } else { return JSONTypeError("signed int", json_obj.GetType()); @@ -203,9 +200,8 @@ class IntegerConverter final : public ConcreteConverter> if (v == v64) { return builder_->Append(v); } else { - std::stringstream ss; - ss << "Value " << v64 << " out of bounds for " << this->type_->ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Value ", v64, " out of bounds for ", + this->type_->ToString()); } return builder_->Append(v); } else { @@ -272,10 +268,8 @@ class DecimalConverter final : public ConcreteConverter { auto view = util::string_view(json_obj.GetString(), json_obj.GetStringLength()); RETURN_NOT_OK(Decimal128::FromString(view, &d, &precision, &scale)); if (scale != decimal_type_->scale()) { - std::stringstream ss; - ss << "Invalid scale for decimal: expected " << decimal_type_->scale() << ", got " - << scale; - return Status::Invalid(ss.str()); + return Status::Invalid("Invalid scale for decimal: expected ", + decimal_type_->scale(), ", got ", scale); } return builder_->Append(d); } @@ -390,10 +384,8 @@ class StructConverter final : public ConcreteConverter { auto size = json_obj.Size(); auto expected_size = static_cast(type_->num_children()); if (size != expected_size) { - std::stringstream ss; - ss << "Expected array of size " << expected_size << ", got array of size " - << size; - return Status::Invalid(ss.str()); + return Status::Invalid("Expected array of size ", expected_size, + ", got array of size ", size); } for (uint32_t i = 0; i < size; ++i) { RETURN_NOT_OK(child_converters_[i]->AppendValue(json_obj[i])); @@ -414,9 +406,8 @@ class StructConverter final : public ConcreteConverter { } } if (remaining > 0) { - std::stringstream ss; - ss << "Unexpected members in JSON object for type " << type_->ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Unexpected members in JSON object for type ", + type_->ToString()); } return builder_->Append(); } @@ -460,9 +451,8 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::STRING, StringConverter) SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter) default: { - std::stringstream ss; - ss << "JSON conversion to " << type->ToString() << " not implemented"; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("JSON conversion to ", type->ToString(), + " not implemented"); } } @@ -481,10 +471,8 @@ Status ArrayFromJSON(const std::shared_ptr& type, 
rj::Document json_doc; json_doc.Parse(json_string.data(), json_string.length()); if (json_doc.HasParseError()) { - std::stringstream ss; - ss << "JSON parse error at offset " << json_doc.GetErrorOffset() << ": " - << GetParseError_En(json_doc.GetParseError()); - return Status::Invalid(ss.str()); + return Status::Invalid("JSON parse error at offset ", json_doc.GetErrorOffset(), ": ", + GetParseError_En(json_doc.GetParseError())); } // The JSON document should be an array, append it diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 724e6255cbddb..8adf4a8b66038 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -153,10 +153,8 @@ Status Message::ReadFrom(const std::shared_ptr& metadata, io::InputStrea std::shared_ptr body; RETURN_NOT_OK(stream->Read(body_length, &body)); if (body->size() < body_length) { - std::stringstream ss; - ss << "Expected to be able to read " << body_length << " bytes for message body, got " - << body->size(); - return Status::IOError(ss.str()); + return Status::IOError("Expected to be able to read ", body_length, + " bytes for message body, got ", body->size()); } return Message::Open(metadata, body, out); @@ -171,10 +169,8 @@ Status Message::ReadFrom(const int64_t offset, const std::shared_ptr& me std::shared_ptr body; RETURN_NOT_OK(file->ReadAt(offset, body_length, &body)); if (body->size() < body_length) { - std::stringstream ss; - ss << "Expected to be able to read " << body_length << " bytes for message body, got " - << body->size(); - return Status::IOError(ss.str()); + return Status::IOError("Expected to be able to read ", body_length, + " bytes for message body, got ", body->size()); } return Message::Open(metadata, body, out); @@ -238,19 +234,16 @@ Status ReadMessage(int64_t offset, int32_t metadata_length, io::RandomAccessFile RETURN_NOT_OK(file->ReadAt(offset, metadata_length, &buffer)); if (buffer->size() < metadata_length) { - std::stringstream ss; - ss << "Expected to read " << metadata_length << " metadata bytes but got " - << buffer->size(); - return Status::Invalid(ss.str()); + return Status::Invalid("Expected to read ", metadata_length, + " metadata bytes but got ", buffer->size()); } int32_t flatbuffer_size = *reinterpret_cast(buffer->data()); if (flatbuffer_size + static_cast(sizeof(int32_t)) > metadata_length) { - std::stringstream ss; - ss << "flatbuffer size " << metadata_length << " invalid. File offset: " << offset - << ", metadata length: " << metadata_length; - return Status::Invalid(ss.str()); + return Status::Invalid("flatbuffer size ", metadata_length, + " invalid. 
File offset: ", offset, + ", metadata length: ", metadata_length); } auto metadata = SliceBuffer(buffer, 4, buffer->size() - 4); @@ -303,10 +296,8 @@ Status ReadMessage(io::InputStream* file, std::unique_ptr* message) { std::shared_ptr metadata; RETURN_NOT_OK(file->Read(message_length, &metadata)); if (metadata->size() != message_length) { - std::stringstream ss; - ss << "Expected to read " << message_length << " metadata bytes, but " - << "only read " << metadata->size(); - return Status::Invalid(ss.str()); + return Status::Invalid("Expected to read ", message_length, " metadata bytes, but ", + "only read ", metadata->size()); } return Message::ReadFrom(metadata, file, message); diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index ef189c8ae617a..1d4c80c2946b1 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -443,9 +443,7 @@ static Status TypeToFlatbuffer(FBB& fbb, const DataType& type, return UnionToFlatBuffer(fbb, *value_type, children, dictionary_memo, offset); default: *out_type = flatbuf::Type_NONE; // Make clang-tidy happy - std::stringstream ss; - ss << "Unable to convert type: " << type.ToString() << std::endl; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unable to convert type: ", type.ToString()); } return Status::OK(); } @@ -483,9 +481,7 @@ static Status TensorTypeToFlatbuffer(FBB& fbb, const DataType& type, break; default: *out_type = flatbuf::Type_NONE; // Make clang-tidy happy - std::stringstream ss; - ss << "Unable to convert type: " << type.ToString() << std::endl; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unable to convert type: ", type.ToString()); } return Status::OK(); } diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 65f5d963e88db..b2c26767be4e9 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -225,9 +225,7 @@ class ArrayLoader { const int num_children = type.num_children(); if (num_children != 1) { - std::stringstream ss; - ss << "Wrong number of children: " << num_children; - return Status::Invalid(ss.str()); + return Status::Invalid("Wrong number of children: ", num_children); } return LoadChildren(type.children()); @@ -343,9 +341,7 @@ Status ReadDictionary(const Buffer& metadata, const DictionaryTypeMap& dictionar int64_t id = *dictionary_id = dictionary_batch->id(); auto it = dictionary_types.find(id); if (it == dictionary_types.end()) { - std::stringstream ss; - ss << "Do not have type metadata for dictionary with id: " << id; - return Status::KeyError(ss.str()); + return Status::KeyError("Do not have type metadata for dictionary with id: ", id); } std::vector> fields = {it->second}; @@ -372,10 +368,8 @@ static Status ReadMessageAndValidate(MessageReader* reader, Message::Type expect RETURN_NOT_OK(reader->ReadNextMessage(message)); if (!(*message) && !allow_null) { - std::stringstream ss; - ss << "Expected " << FormatMessageType(expected_type) - << " message in stream, was null or length 0"; - return Status::Invalid(ss.str()); + return Status::Invalid("Expected ", FormatMessageType(expected_type), + " message in stream, was null or length 0"); } if ((*message) == nullptr) { @@ -383,10 +377,9 @@ static Status ReadMessageAndValidate(MessageReader* reader, Message::Type expect } if ((*message)->type() != expected_type) { - std::stringstream ss; - ss << "Message not expected type: " << FormatMessageType(expected_type) - << ", was: " << 
(*message)->type(); - return Status::IOError(ss.str()); + return Status::IOError( + "Message not expected type: ", FormatMessageType(expected_type), + ", was: ", (*message)->type()); } return Status::OK(); } @@ -512,9 +505,7 @@ class RecordBatchFileReader::RecordBatchFileReaderImpl { int magic_size = static_cast(strlen(kArrowMagicBytes)); if (footer_offset_ <= magic_size * 2 + 4) { - std::stringstream ss; - ss << "File is too small: " << footer_offset_; - return Status::Invalid(ss.str()); + return Status::Invalid("File is too small: ", footer_offset_); } std::shared_ptr buffer; @@ -523,9 +514,7 @@ class RecordBatchFileReader::RecordBatchFileReaderImpl { const int64_t expected_footer_size = magic_size + sizeof(int32_t); if (buffer->size() < expected_footer_size) { - std::stringstream ss; - ss << "Unable to read " << expected_footer_size << "from end of file"; - return Status::Invalid(ss.str()); + return Status::Invalid("Unable to read ", expected_footer_size, "from end of file"); } if (memcmp(buffer->data() + sizeof(int32_t), kArrowMagicBytes, magic_size)) { diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc index d62db32b062ac..fb5beacf0f863 100644 --- a/cpp/src/arrow/memory_pool.cc +++ b/cpp/src/arrow/memory_pool.cc @@ -55,31 +55,23 @@ Status AllocateAligned(int64_t size, uint8_t** out) { *out = reinterpret_cast(_aligned_malloc(static_cast(size), kAlignment)); if (!*out) { - std::stringstream ss; - ss << "malloc of size " << size << " failed"; - return Status::OutOfMemory(ss.str()); + return Status::OutOfMemory("malloc of size ", size, " failed"); } #elif defined(ARROW_JEMALLOC) *out = reinterpret_cast(mallocx( std::max(static_cast(size), kAlignment), MALLOCX_ALIGN(kAlignment))); if (*out == NULL) { - std::stringstream ss; - ss << "malloc of size " << size << " failed"; - return Status::OutOfMemory(ss.str()); + return Status::OutOfMemory("malloc of size ", size, " failed"); } #else const int result = posix_memalign(reinterpret_cast(out), kAlignment, static_cast(size)); if (result == ENOMEM) { - std::stringstream ss; - ss << "malloc of size " << size << " failed"; - return Status::OutOfMemory(ss.str()); + return Status::OutOfMemory("malloc of size ", size, " failed"); } if (result == EINVAL) { - std::stringstream ss; - ss << "invalid alignment parameter: " << kAlignment; - return Status::Invalid(ss.str()); + return Status::Invalid("invalid alignment parameter: ", kAlignment); } #endif return Status::OK(); @@ -118,10 +110,8 @@ class DefaultMemoryPool : public MemoryPool { *ptr = reinterpret_cast( rallocx(*ptr, static_cast(new_size), MALLOCX_ALIGN(kAlignment))); if (*ptr == NULL) { - std::stringstream ss; - ss << "realloc of size " << new_size << " failed"; *ptr = previous_ptr; - return Status::OutOfMemory(ss.str()); + return Status::OutOfMemory("realloc of size ", new_size, " failed"); } #else // Note: We cannot use realloc() here as it doesn't guarantee alignment. 
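The hunks above and below all follow the convention introduced by this patch: message fragments are handed straight to the Status factory instead of being assembled in a std::stringstream, and early error returns go through ARROW_RETURN_IF. A minimal sketch of that convention, assuming the macro is reachable via arrow/status.h; LookupName and its arguments are hypothetical, and only Status and ARROW_RETURN_IF come from this patch:

#include <cstdint>
#include <string>
#include <vector>

#include "arrow/status.h"

// Hypothetical helper, only to illustrate the new calling convention.
arrow::Status LookupName(const std::vector<std::string>& names, int64_t index,
                         std::string* out) {
  using arrow::Status;
  // Condition first, Status to return second; the macro wraps the condition in a
  // branch-prediction hint so the error path is treated as unlikely.
  ARROW_RETURN_IF(index < 0 || index >= static_cast<int64_t>(names.size()),
                  Status::Invalid("Out of bounds index: ", index, " for ",
                                  names.size(), " names"));
  // Variadic factories stringify and concatenate their arguments internally.
  if (names[index].empty()) {
    return Status::KeyError("Name with index ", index, " is empty");
  }
  *out = names[index];
  return Status::OK();
}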
diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 3e04f2727ed51..29d64355bdaed 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -414,9 +414,7 @@ inline Status ConvertBinaryLike(PandasOptions options, const ChunkedArray& data, *out_values = WrapBytes::Wrap(view.data(), view.length()); if (*out_values == nullptr) { PyErr_Clear(); - std::stringstream ss; - ss << "Wrapping " << view << " failed"; - return Status::UnknownError(ss.str()); + return Status::UnknownError("Wrapping ", view, " failed"); } } ++out_values; @@ -773,18 +771,16 @@ class ObjectBlock : public PandasBlock { CONVERTLISTSLIKE_CASE(ListType, LIST) CONVERTLISTSLIKE_CASE(NullType, NA) default: { - std::stringstream ss; - ss << "Not implemented type for conversion from List to Pandas ObjectBlock: " - << list_type->value_type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented( + "Not implemented type for conversion from List to Pandas ObjectBlock: ", + list_type->value_type()->ToString()); } } } else if (type == Type::STRUCT) { RETURN_NOT_OK(ConvertStruct(options_, data, out_buffer)); } else { - std::stringstream ss; - ss << "Unsupported type for object array output: " << col->type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unsupported type for object array output: ", + col->type()->ToString()); } placement_data_[rel_placement] = abs_placement; @@ -810,10 +806,9 @@ class IntBlock : public PandasBlock { const ChunkedArray& data = *col->data().get(); if (type != ARROW_TYPE) { - std::stringstream ss; - ss << "Cannot write Arrow data of type " << col->type()->ToString(); - ss << " to a Pandas int" << sizeof(C_TYPE) << " block."; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Cannot write Arrow data of type ", + col->type()->ToString(), " to a Pandas int", + sizeof(C_TYPE), " block"); } ConvertIntegerNoNullsSameType(options_, data, out_buffer); @@ -841,10 +836,9 @@ class Float16Block : public PandasBlock { Type::type type = col->type()->id(); if (type != Type::HALF_FLOAT) { - std::stringstream ss; - ss << "Cannot write Arrow data of type " << col->type()->ToString(); - ss << " to a Pandas float16 block."; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Cannot write Arrow data of type ", + col->type()->ToString(), + " to a Pandas float16 block"); } npy_half* out_buffer = @@ -866,10 +860,9 @@ class Float32Block : public PandasBlock { Type::type type = col->type()->id(); if (type != Type::FLOAT) { - std::stringstream ss; - ss << "Cannot write Arrow data of type " << col->type()->ToString(); - ss << " to a Pandas float32 block."; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Cannot write Arrow data of type ", + col->type()->ToString(), + " to a Pandas float32 block"); } float* out_buffer = reinterpret_cast(block_data_) + rel_placement * num_rows_; @@ -922,10 +915,9 @@ class Float64Block : public PandasBlock { ConvertNumericNullable(data, NAN, out_buffer); break; default: - std::stringstream ss; - ss << "Cannot write Arrow data of type " << col->type()->ToString(); - ss << " to a Pandas float64 block."; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Cannot write Arrow data of type ", + col->type()->ToString(), + " to a Pandas float64 block"); } #undef INTEGER_CASE @@ -945,10 +937,9 @@ class BoolBlock : public PandasBlock { Type::type type = 
col->type()->id(); if (type != Type::BOOL) { - std::stringstream ss; - ss << "Cannot write Arrow data of type " << col->type()->ToString(); - ss << " to a Pandas boolean block."; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Cannot write Arrow data of type ", + col->type()->ToString(), + " to a Pandas boolean block"); } uint8_t* out_buffer = @@ -1006,10 +997,9 @@ class DatetimeBlock : public PandasBlock { return Status::NotImplemented("Unsupported time unit"); } } else { - std::stringstream ss; - ss << "Cannot write Arrow data of type " << col->type()->ToString(); - ss << " to a Pandas datetime block."; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Cannot write Arrow data of type ", + col->type()->ToString(), + " to a Pandas datetime block."); } placement_data_[rel_placement] = abs_placement; @@ -1075,9 +1065,8 @@ class CategoricalBlock : public PandasBlock { const T* values = arr.raw_values(); for (int64_t i = 0; i < arr.length(); ++i) { if (arr.IsValid(i) && (values[i] < 0 || values[i] >= dict_length)) { - std::stringstream ss; - ss << "Out of bounds dictionary index: " << static_cast(values[i]); - return Status::Invalid(ss.str()); + return Status::Invalid("Out of bounds dictionary index: ", + static_cast(values[i])); } } return Status::OK(); @@ -1088,16 +1077,15 @@ class CategoricalBlock : public PandasBlock { RETURN_NOT_OK(AllocateNDArrayFromIndices(npy_type, indices_first)); } else { if (options_.zero_copy_only) { - std::stringstream ss; if (needs_copy_) { - ss << "Need to allocate categorical memory, " - << "but only zero-copy conversions allowed."; - } else { - ss << "Needed to copy " << data.num_chunks() << " chunks with " - << indices_first->null_count() - << " indices nulls, but zero_copy_only was True"; + return Status::Invalid("Need to allocate categorical memory, but ", + "only zero-copy conversions " + "allowed"); } - return Status::Invalid(ss.str()); + + return Status::Invalid("Needed to copy ", data.num_chunks(), " chunks with ", + indices_first->null_count(), + " indices nulls, but zero_copy_only was True"); } RETURN_NOT_OK(AllocateNDArray(npy_type, 1)); @@ -1155,10 +1143,8 @@ class CategoricalBlock : public PandasBlock { RETURN_NOT_OK(WriteIndices(converted_col)); break; default: { - std::stringstream ss; - ss << "Categorical index type not supported: " - << dict_type.index_type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Categorical index type not supported: ", + dict_type.index_type()->ToString()); } } @@ -1349,10 +1335,8 @@ static Status GetPandasBlockType(const Column& col, const PandasOptions& options case Type::LIST: { auto list_type = std::static_pointer_cast(col.type()); if (!ListTypeSupported(*list_type->value_type())) { - std::stringstream ss; - ss << "Not implemented type for list in DataFrameBlock: " - << list_type->value_type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Not implemented type for list in DataFrameBlock: ", + list_type->value_type()->ToString()); } *output_type = PandasBlock::OBJECT; } break; @@ -1360,10 +1344,9 @@ static Status GetPandasBlockType(const Column& col, const PandasOptions& options *output_type = PandasBlock::CATEGORICAL; break; default: - std::stringstream ss; - ss << "No known equivalent Pandas block for Arrow data of type "; - ss << col.type()->ToString() << " is known."; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented( + "No known equivalent Pandas 
block for Arrow data of type ", + col.type()->ToString(), " is known."); } return Status::OK(); } @@ -1657,10 +1640,8 @@ class ArrowDeserializer { if (data_.num_chunks() == 1 && data_.null_count() == 0) { return ConvertValuesZeroCopy(options_, npy_type, data_.chunk(0)); } else if (options_.zero_copy_only) { - std::stringstream ss; - ss << "Needed to copy " << data_.num_chunks() << " chunks with " - << data_.null_count() << " nulls, but zero_copy_only was True"; - return Status::Invalid(ss.str()); + return Status::Invalid("Needed to copy ", data_.num_chunks(), " chunks with ", + data_.null_count(), " nulls, but zero_copy_only was True"); } RETURN_NOT_OK(AllocateOutput(npy_type)); @@ -1751,10 +1732,8 @@ class ArrowDeserializer { if (data_.num_chunks() == 1 && data_.null_count() == 0) { return ConvertValuesZeroCopy(options_, traits::npy_type, data_.chunk(0)); } else if (options_.zero_copy_only) { - std::stringstream ss; - ss << "Needed to copy " << data_.num_chunks() << " chunks with " - << data_.null_count() << " nulls, but zero_copy_only was True"; - return Status::Invalid(ss.str()); + return Status::Invalid("Needed to copy ", data_.num_chunks(), " chunks with ", + data_.null_count(), " nulls, but zero_copy_only was True"); } if (data_.null_count() > 0) { @@ -1854,9 +1833,8 @@ class ArrowDeserializer { CONVERTVALUES_LISTSLIKE_CASE(Decimal128Type, DECIMAL) CONVERTVALUES_LISTSLIKE_CASE(ListType, LIST) default: { - std::stringstream ss; - ss << "Not implemented type for lists: " << list_type->value_type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Not implemented type for lists: ", + list_type->value_type()->ToString()); } } #undef CONVERTVALUES_LISTSLIKE_CASE diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h index 6587bd328f3fb..6e41beddd1b72 100644 --- a/cpp/src/arrow/python/common.h +++ b/cpp/src/arrow/python/common.h @@ -215,10 +215,8 @@ struct PyBytesView { this->ref.reset(); return Status::OK(); } else { - std::stringstream ss; - ss << "Expected " << expected_msg << ", got a '" << Py_TYPE(obj)->tp_name - << "' object"; - return Status::TypeError(ss.str()); + return Status::TypeError("Expected ", expected_msg, ", got a '", + Py_TYPE(obj)->tp_name, "' object"); } } diff --git a/cpp/src/arrow/python/decimal.cc b/cpp/src/arrow/python/decimal.cc index 051f31faacacf..8db7c01b9ab8b 100644 --- a/cpp/src/arrow/python/decimal.cc +++ b/cpp/src/arrow/python/decimal.cc @@ -125,11 +125,9 @@ Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arr const int32_t scale = arrow_type.scale(); if (ARROW_PREDICT_FALSE(inferred_precision > precision)) { - std::stringstream buf; - buf << "Decimal type with precision " << inferred_precision - << " does not fit into precision inferred from first array element: " - << precision; - return Status::Invalid(buf.str()); + return Status::Invalid( + "Decimal type with precision ", inferred_precision, + " does not fit into precision inferred from first array element: ", precision); } if (scale != inferred_scale) { diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc index 2f43db6505c67..28ed1a6c364dc 100644 --- a/cpp/src/arrow/python/helpers.cc +++ b/cpp/src/arrow/python/helpers.cc @@ -164,11 +164,10 @@ namespace { Status IntegerOverflowStatus(PyObject* obj, const std::string& overflow_message) { if (overflow_message.empty()) { - std::stringstream ss; std::string obj_as_stdstring; RETURN_NOT_OK(PyObject_StdStringStr(obj, &obj_as_stdstring)); - ss << 
"Value " << obj_as_stdstring << " too large to fit in C integer type"; - return Status::Invalid(ss.str()); + return Status::Invalid("Value ", obj_as_stdstring, + " too large to fit in C integer type"); } else { return Status::Invalid(overflow_message); } @@ -299,13 +298,10 @@ bool PandasObjectIsNull(PyObject* obj) { } Status InvalidValue(PyObject* obj, const std::string& why) { - std::stringstream ss; - std::string obj_as_str; RETURN_NOT_OK(internal::PyObject_StdStringStr(obj, &obj_as_str)); - ss << "Could not convert " << obj_as_str << " with type " << Py_TYPE(obj)->tp_name - << ": " << why; - return Status::Invalid(ss.str()); + return Status::Invalid("Could not convert ", obj_as_str, " with type ", + Py_TYPE(obj)->tp_name, ": ", why); } Status UnboxIntegerAsInt64(PyObject* obj, int64_t* out) { @@ -355,10 +351,8 @@ Status IntegerScalarToDoubleSafe(PyObject* obj, double* out) { constexpr int64_t kDoubleMin = -(1LL << 53); if (value < kDoubleMin || value > kDoubleMax) { - std::stringstream ss; - ss << "Integer value " << value << " is outside of the range exactly" - << " representable by a IEEE 754 double precision value"; - return Status::Invalid(ss.str()); + return Status::Invalid("Integer value ", value, " is outside of the range exactly", + " representable by a IEEE 754 double precision value"); } *out = static_cast(value); return Status::OK(); @@ -372,10 +366,8 @@ Status IntegerScalarToFloat32Safe(PyObject* obj, float* out) { constexpr int64_t kFloatMin = -(1LL << 24); if (value < kFloatMin || value > kFloatMax) { - std::stringstream ss; - ss << "Integer value " << value << " is outside of the range exactly" - << " representable by a IEEE 754 single precision value"; - return Status::Invalid(ss.str()); + return Status::Invalid("Integer value ", value, " is outside of the range exactly", + " representable by a IEEE 754 single precision value"); } *out = static_cast(value); return Status::OK(); diff --git a/cpp/src/arrow/python/inference.cc b/cpp/src/arrow/python/inference.cc index 0f1d85ead2a16..c9db5f4f28531 100644 --- a/cpp/src/arrow/python/inference.cc +++ b/cpp/src/arrow/python/inference.cc @@ -58,10 +58,9 @@ class NumPyDtypeUnifier { NumPyDtypeUnifier() : current_type_num_(-1), current_dtype_(NULLPTR) {} Status InvalidMix(int new_dtype) { - std::stringstream ss; - ss << "Cannot mix NumPy dtypes " << GetNumPyTypeName(current_type_num_) << " and " - << GetNumPyTypeName(new_dtype); - return Status::Invalid(ss.str()); + return Status::Invalid("Cannot mix NumPy dtypes ", + GetNumPyTypeName(current_type_num_), " and ", + GetNumPyTypeName(new_dtype)); } int Observe_BOOL(PyArray_Descr* descr, int dtype) { return INVALID; } @@ -250,9 +249,7 @@ class NumPyDtypeUnifier { action = Observe_DATETIME(descr); break; default: - std::stringstream ss; - ss << "Unsupported numpy type " << GetNumPyTypeName(dtype) << std::endl; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unsupported numpy type ", GetNumPyTypeName(dtype)); } if (action == INVALID) { @@ -480,10 +477,8 @@ class TypeInferrer { } else if (PyBytes_Check(key_obj)) { key = internal::PyBytes_AsStdString(key_obj); } else { - std::stringstream ss; - ss << "Expected dict key of type str or bytes, got '" << Py_TYPE(key_obj)->tp_name - << "'"; - return Status::TypeError(ss.str()); + return Status::TypeError("Expected dict key of type str or bytes, got '", + Py_TYPE(key_obj)->tp_name, "'"); } // Get or create visitor for this key auto it = struct_inferrers_.find(key); diff --git a/cpp/src/arrow/python/numpy-internal.h 
b/cpp/src/arrow/python/numpy-internal.h index 463795a2109f0..6954e35c3e199 100644 --- a/cpp/src/arrow/python/numpy-internal.h +++ b/cpp/src/arrow/python/numpy-internal.h @@ -143,9 +143,8 @@ inline Status VisitNumpyArrayInline(PyArrayObject* arr, VISITOR* visitor) { TYPE_VISIT_INLINE(DATETIME); TYPE_VISIT_INLINE(OBJECT); } - std::stringstream ss; - ss << "NumPy type not implemented: " << GetNumPyTypeName(PyArray_TYPE(arr)); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("NumPy type not implemented: ", + GetNumPyTypeName(PyArray_TYPE(arr))); } #undef TYPE_VISIT_INLINE diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc index d95e337a4870d..c73e0bc15c9c5 100644 --- a/cpp/src/arrow/python/numpy_convert.cc +++ b/cpp/src/arrow/python/numpy_convert.cc @@ -92,9 +92,7 @@ Status GetTensorType(PyObject* dtype, std::shared_ptr* out) { TO_ARROW_TYPE_CASE(FLOAT32, float32); TO_ARROW_TYPE_CASE(FLOAT64, float64); default: { - std::stringstream ss; - ss << "Unsupported numpy type " << descr->type_num << std::endl; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unsupported numpy type ", descr->type_num); } } return Status::OK(); @@ -119,9 +117,7 @@ Status GetNumPyType(const DataType& type, int* type_num) { NUMPY_TYPE_CASE(FLOAT, FLOAT32); NUMPY_TYPE_CASE(DOUBLE, FLOAT64); default: { - std::stringstream ss; - ss << "Unsupported tensor type: " << type.ToString() << std::endl; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unsupported tensor type: ", type.ToString()); } } #undef NUMPY_TYPE_CASE @@ -181,9 +177,7 @@ Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr* out) { } } break; default: { - std::stringstream ss; - ss << "Unsupported numpy type " << descr->type_num << std::endl; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unsupported numpy type ", descr->type_num); } } diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index da288d3c6868e..461a085722243 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -283,9 +283,8 @@ class NumPyConverter { } Status TypeNotImplemented(std::string type_name) { - std::stringstream ss; - ss << "NumPyConverter doesn't implement <" << type_name << "> conversion. "; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("NumPyConverter doesn't implement <", type_name, + "> conversion. 
"); } MemoryPool* pool_; @@ -574,9 +573,8 @@ Status NumPyConverter::Visit(const FixedSizeBinaryType& type) { auto byte_width = type.byte_width(); if (itemsize_ != byte_width) { - std::stringstream ss; - ss << "Got bytestring of length " << itemsize_ << " (expected " << byte_width << ")"; - return Status::Invalid(ss.str()); + return Status::Invalid("Got bytestring of length ", itemsize_, " (expected ", + byte_width, ")"); } FixedSizeBinaryBuilder builder(::arrow::fixed_size_binary(byte_width), pool_); @@ -651,9 +649,8 @@ Status NumPyConverter::Visit(const StringType& type) { if (ARROW_PREDICT_TRUE(util::ValidateUTF8(data, itemsize_))) { return builder.Append(data, itemsize_); } else { - std::stringstream ss; - ss << "Encountered non-UTF8 binary value: " << HexEncode(data, itemsize_); - return Status::Invalid(ss.str()); + return Status::Invalid("Encountered non-UTF8 binary value: ", + HexEncode(data, itemsize_)); } } else { return AppendUTF32(reinterpret_cast(data), itemsize_, byteorder, @@ -697,9 +694,7 @@ Status NumPyConverter::Visit(const StructType& type) { for (auto field : type.children()) { PyObject* tup = PyDict_GetItemString(dtype_->fields, field->name().c_str()); if (tup == NULL) { - std::stringstream ss; - ss << "Missing field '" << field->name() << "' in struct array"; - return Status::TypeError(ss.str()); + return Status::TypeError("Missing field '", field->name(), "' in struct array"); } PyArray_Descr* sub_dtype = reinterpret_cast(PyTuple_GET_ITEM(tup, 0)); diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index a77cebc7e7d50..f5e6a5776071d 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -402,10 +402,7 @@ class TimestampConverter : public TypedConverter type; RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &type)); if (type->id() != Type::TIMESTAMP) { - std::ostringstream ss; - ss << "Expected np.datetime64 but got: "; - ss << type->ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Expected np.datetime64 but got: ", type->ToString()); } const TimestampType& ttype = checked_cast(*type); if (unit_ != ttype.unit()) { @@ -705,10 +702,7 @@ Status ListConverter::AppendNdarrayItem(PyObject* obj) { return value_converter_->AppendSingleVirtual(obj); } default: { - std::stringstream ss; - ss << "Unknown list item type: "; - ss << value_type_->ToString(); - return Status::TypeError(ss.str()); + return Status::TypeError("Unknown list item type: ", value_type_->ToString()); } } } @@ -911,9 +905,8 @@ Status GetConverter(const std::shared_ptr& type, bool from_pandas, new StructConverter(from_pandas, strict_conversions)); break; default: - std::stringstream ss; - ss << "Sequence converter for type " << type->ToString() << " not implemented"; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Sequence converter for type ", type->ToString(), + " not implemented"); } return Status::OK(); } diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc index 7911557ee73e0..ca94369be5157 100644 --- a/cpp/src/arrow/python/serialize.cc +++ b/cpp/src/arrow/python/serialize.cc @@ -407,10 +407,9 @@ Status CallCustomCallback(PyObject* context, PyObject* method_name, PyObject* el PyObject** result) { *result = NULL; if (context == Py_None) { - std::stringstream ss; - ss << "error while calling callback on " << internal::PyObject_StdStringRepr(elem) - << ": handler not registered"; - return Status::SerializationError(ss.str()); 
+ return Status::SerializationError("error while calling callback on ", + internal::PyObject_StdStringRepr(elem), + ": handler not registered"); } else { *result = PyObject_CallMethodObjArgs(context, method_name, elem, NULL); return PassPyError(); diff --git a/cpp/src/arrow/python/util/datetime.h b/cpp/src/arrow/python/util/datetime.h index 7350deadcc67f..dc462972c57b7 100644 --- a/cpp/src/arrow/python/util/datetime.h +++ b/cpp/src/arrow/python/util/datetime.h @@ -199,9 +199,7 @@ static inline Status PyTime_convert_int(int64_t val, const TimeUnit::type unit, switch (unit) { case TimeUnit::NANO: if (val % 1000 != 0) { - std::stringstream ss; - ss << "Value " << val << " has non-zero nanoseconds"; - return Status::Invalid(ss.str()); + return Status::Invalid("Value ", val, " has non-zero nanoseconds"); } val /= 1000; // fall through diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 33287c19ffdde..baaf5cb17500f 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -95,16 +95,13 @@ class SimpleRecordBatch : public RecordBatch { DCHECK(column != nullptr); if (!field->type()->Equals(column->type())) { - std::stringstream ss; - ss << "Column data type " << field->type()->name() - << " does not match field data type " << column->type()->name(); - return Status::Invalid(ss.str()); + return Status::Invalid("Column data type ", field->type()->name(), + " does not match field data type ", column->type()->name()); } if (column->length() != num_rows_) { - std::stringstream ss; - ss << "Added column's length must match record batch's length. Expected length " - << num_rows_ << " but got length " << column->length(); - return Status::Invalid(ss.str()); + return Status::Invalid( + "Added column's length must match record batch's length. 
Expected length ", + num_rows_, " but got length ", column->length()); } std::shared_ptr new_schema; @@ -229,17 +226,14 @@ Status RecordBatch::Validate() const { auto arr_shared = this->column_data(i); const ArrayData& arr = *arr_shared; if (arr.length != num_rows_) { - std::stringstream ss; - ss << "Number of rows in column " << i << " did not match batch: " << arr.length - << " vs " << num_rows_; - return Status::Invalid(ss.str()); + return Status::Invalid("Number of rows in column ", i, + " did not match batch: ", arr.length, " vs ", num_rows_); } const auto& schema_type = *schema_->field(i)->type(); if (!arr.type->Equals(schema_type)) { - std::stringstream ss; - ss << "Column " << i << " type not match schema: " << arr.type->ToString() << " vs " - << schema_type.ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Column ", i, + " type not match schema: ", arr.type->ToString(), " vs ", + schema_type.ToString()); } } return Status::OK(); diff --git a/cpp/src/arrow/status.cc b/cpp/src/arrow/status.cc index 8be8b36d13bd8..db7f087149017 100644 --- a/cpp/src/arrow/status.cc +++ b/cpp/src/arrow/status.cc @@ -13,6 +13,7 @@ #include "arrow/status.h" #include +#include namespace arrow { diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h index e3632a6d5f62e..12975afcc8100 100644 --- a/cpp/src/arrow/status.h +++ b/cpp/src/arrow/status.h @@ -25,34 +25,41 @@ #endif #include "arrow/util/macros.h" +#include "arrow/util/string_builder.h" #include "arrow/util/visibility.h" #ifdef ARROW_EXTRA_ERROR_CONTEXT -/// \brief Propagate any non-successful Status to the caller -#define ARROW_RETURN_NOT_OK(s) \ - do { \ - ::arrow::Status _s = (s); \ - if (ARROW_PREDICT_FALSE(!_s.ok())) { \ - std::stringstream ss; \ - ss << __FILE__ << ":" << __LINE__ << " code: " << #s << "\n" << _s.message(); \ - return ::arrow::Status(_s.code(), ss.str()); \ - } \ +/// \brief Return with given status if condition is met. +#define ARROW_RETURN_IF(condition, status) \ + do { \ + if (ARROW_PREDICT_FALSE(condition)) { \ + ::arrow::Status _s = (status); \ + std::stringstream ss; \ + ss << __FILE__ << ":" << __LINE__ << " : " << _s.message(); \ + return ::arrow::Status(_s.code(), ss.str()); \ + } \ } while (0) #else -/// \brief Propagate any non-successful Status to the caller -#define ARROW_RETURN_NOT_OK(s) \ - do { \ - ::arrow::Status _s = (s); \ - if (ARROW_PREDICT_FALSE(!_s.ok())) { \ - return _s; \ - } \ - } while (false) +#define ARROW_RETURN_IF(condition, status) \ + do { \ + if (ARROW_PREDICT_FALSE(condition)) { \ + return (status); \ + } \ + } while (0) #endif // ARROW_EXTRA_ERROR_CONTEXT +/// \brief Propagate any non-successful Status to the caller +#define ARROW_RETURN_NOT_OK(status) \ + do { \ + ::arrow::Status __s = (status); \ + ARROW_RETURN_IF(!__s.ok(), __s); \ + \ + } while (false) + #define RETURN_NOT_OK_ELSE(s, else_) \ do { \ ::arrow::Status _s = (s); \ @@ -62,17 +69,6 @@ } \ } while (false) -#define ARROW_RETURN_FAILURE_IF_FALSE(condition, status) \ - do { \ - if (!(condition)) { \ - Status _status = (status); \ - std::stringstream ss; \ - ss << __FILE__ << ":" << __LINE__ << " code: " << _status.CodeAsString() << " \n " \ - << _status.message(); \ - return ::arrow::Status(_status.code(), ss.str()); \ - } \ - } while (0) - // This is an internal-use macro and should not be used in public headers. 
#ifndef RETURN_NOT_OK #define RETURN_NOT_OK(s) ARROW_RETURN_NOT_OK(s) @@ -149,84 +145,119 @@ class ARROW_EXPORT Status { static Status OK() { return Status(); } /// Return a success status with a specific message - static Status OK(const std::string& msg) { return Status(StatusCode::OK, msg); } + template + static Status OK(Args&&... args) { + return Status(StatusCode::OK, util::StringBuilder(std::forward(args)...)); + } /// Return an error status for out-of-memory conditions - static Status OutOfMemory(const std::string& msg) { - return Status(StatusCode::OutOfMemory, msg); + template + static Status OutOfMemory(Args&&... args) { + return Status(StatusCode::OutOfMemory, + util::StringBuilder(std::forward(args)...)); } /// Return an error status for failed key lookups (e.g. column name in a table) - static Status KeyError(const std::string& msg) { - return Status(StatusCode::KeyError, msg); + template + static Status KeyError(Args&&... args) { + return Status(StatusCode::KeyError, util::StringBuilder(std::forward(args)...)); } /// Return an error status for type errors (such as mismatching data types) - static Status TypeError(const std::string& msg) { - return Status(StatusCode::TypeError, msg); + template + static Status TypeError(Args&&... args) { + return Status(StatusCode::TypeError, + util::StringBuilder(std::forward(args)...)); } /// Return an error status for unknown errors - static Status UnknownError(const std::string& msg) { - return Status(StatusCode::UnknownError, msg); + template + static Status UnknownError(Args&&... args) { + return Status(StatusCode::UnknownError, + util::StringBuilder(std::forward(args)...)); } /// Return an error status when an operation or a combination of operation and /// data types is unimplemented - static Status NotImplemented(const std::string& msg) { - return Status(StatusCode::NotImplemented, msg); + template + static Status NotImplemented(Args&&... args) { + return Status(StatusCode::NotImplemented, + util::StringBuilder(std::forward(args)...)); } /// Return an error status for invalid data (for example a string that fails parsing) - static Status Invalid(const std::string& msg) { - return Status(StatusCode::Invalid, msg); + template + static Status Invalid(Args&&... args) { + return Status(StatusCode::Invalid, util::StringBuilder(std::forward(args)...)); } /// Return an error status when a container's capacity would exceed its limits - static Status CapacityError(const std::string& msg) { - return Status(StatusCode::CapacityError, msg); + template + static Status CapacityError(Args&&... args) { + return Status(StatusCode::CapacityError, + util::StringBuilder(std::forward(args)...)); } /// Return an error status when some IO-related operation failed - static Status IOError(const std::string& msg) { - return Status(StatusCode::IOError, msg); + template + static Status IOError(Args&&... args) { + return Status(StatusCode::IOError, util::StringBuilder(std::forward(args)...)); } /// Return an error status when some (de)serialization operation failed - static Status SerializationError(const std::string& msg) { - return Status(StatusCode::SerializationError, msg); + template + static Status SerializationError(Args&&... args) { + return Status(StatusCode::SerializationError, + util::StringBuilder(std::forward(args)...)); } - static Status RError(const std::string& msg) { return Status(StatusCode::RError, msg); } + template + static Status RError(Args&&... 
args) { + return Status(StatusCode::RError, util::StringBuilder(std::forward(args)...)); + } - static Status PlasmaObjectExists(const std::string& msg) { - return Status(StatusCode::PlasmaObjectExists, msg); + template + static Status PlasmaObjectExists(Args&&... args) { + return Status(StatusCode::PlasmaObjectExists, + util::StringBuilder(std::forward(args)...)); } - static Status PlasmaObjectNonexistent(const std::string& msg) { - return Status(StatusCode::PlasmaObjectNonexistent, msg); + template + static Status PlasmaObjectNonexistent(Args&&... args) { + return Status(StatusCode::PlasmaObjectNonexistent, + util::StringBuilder(std::forward(args)...)); } - static Status PlasmaObjectAlreadySealed(const std::string& msg) { - return Status(StatusCode::PlasmaObjectAlreadySealed, msg); + template + static Status PlasmaObjectAlreadySealed(Args&&... args) { + return Status(StatusCode::PlasmaObjectAlreadySealed, + util::StringBuilder(std::forward(args)...)); } - static Status PlasmaStoreFull(const std::string& msg) { - return Status(StatusCode::PlasmaStoreFull, msg); + template + static Status PlasmaStoreFull(Args&&... args) { + return Status(StatusCode::PlasmaStoreFull, + util::StringBuilder(std::forward(args)...)); } static Status StillExecuting() { return Status(StatusCode::StillExecuting, ""); } - static Status CodeGenError(const std::string& msg) { - return Status(StatusCode::CodeGenError, msg); + template + static Status CodeGenError(Args&&... args) { + return Status(StatusCode::CodeGenError, + util::StringBuilder(std::forward(args)...)); } - static Status ExpressionValidationError(const std::string& msg) { - return Status(StatusCode::ExpressionValidationError, msg); + template + static Status ExpressionValidationError(Args&&... args) { + return Status(StatusCode::ExpressionValidationError, + util::StringBuilder(std::forward(args)...)); } - static Status ExecutionError(const std::string& msg) { - return Status(StatusCode::ExecutionError, msg); + template + static Status ExecutionError(Args&&... args) { + return Status(StatusCode::ExecutionError, + util::StringBuilder(std::forward(args)...)); } /// Return true iff the status indicates success. diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 1f3d927ddd62b..d232ac35e30c7 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -234,10 +234,8 @@ Status Column::ValidateData() { for (int i = 0; i < data_->num_chunks(); ++i) { std::shared_ptr type = data_->chunk(i)->type(); if (!this->type()->Equals(type)) { - std::stringstream ss; - ss << "In chunk " << i << " expected type " << this->type()->ToString() - << " but saw " << type->ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("In chunk ", i, " expected type ", this->type()->ToString(), + " but saw ", type->ToString()); } } return Status::OK(); @@ -301,10 +299,9 @@ class SimpleTable : public Table { DCHECK(col != nullptr); if (col->length() != num_rows_) { - std::stringstream ss; - ss << "Added column's length must match table's length. Expected length " - << num_rows_ << " but got length " << col->length(); - return Status::Invalid(ss.str()); + return Status::Invalid( + "Added column's length must match table's length. Expected length ", num_rows_, + " but got length ", col->length()); } std::shared_ptr new_schema; @@ -319,10 +316,9 @@ class SimpleTable : public Table { DCHECK(col != nullptr); if (col->length() != num_rows_) { - std::stringstream ss; - ss << "Added column's length must match table's length. 
Expected length " - << num_rows_ << " but got length " << col->length(); - return Status::Invalid(ss.str()); + return Status::Invalid( + "Added column's length must match table's length. Expected length ", num_rows_, + " but got length ", col->length()); } std::shared_ptr new_schema; @@ -363,15 +359,11 @@ class SimpleTable : public Table { for (int i = 0; i < num_columns(); ++i) { const Column* col = columns_[i].get(); if (col == nullptr) { - std::stringstream ss; - ss << "Column " << i << " was null"; - return Status::Invalid(ss.str()); + return Status::Invalid("Column ", i, " was null"); } if (!col->field()->Equals(*schema_->field(i))) { - std::stringstream ss; - ss << "Column field " << i << " named " << col->name() - << " is inconsistent with schema"; - return Status::Invalid(ss.str()); + return Status::Invalid("Column field ", i, " named ", col->name(), + " is inconsistent with schema"); } } @@ -379,10 +371,8 @@ class SimpleTable : public Table { for (int i = 0; i < num_columns(); ++i) { const Column* col = columns_[i].get(); if (col->length() != num_rows_) { - std::stringstream ss; - ss << "Column " << i << " named " << col->name() << " expected length " - << num_rows_ << " but got length " << col->length(); - return Status::Invalid(ss.str()); + return Status::Invalid("Column ", i, " named ", col->name(), " expected length ", + num_rows_, " but got length ", col->length()); } } return Status::OK(); @@ -414,11 +404,9 @@ Status Table::FromRecordBatches(const std::shared_ptr& schema, for (int i = 0; i < nbatches; ++i) { if (!batches[i]->schema()->Equals(*schema, false)) { - std::stringstream ss; - ss << "Schema at index " << static_cast(i) << " was different: \n" - << schema->ToString() << "\nvs\n" - << batches[i]->schema()->ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Schema at index ", static_cast(i), + " was different: \n", schema->ToString(), "\nvs\n", + batches[i]->schema()->ToString()); } } @@ -458,11 +446,9 @@ Status ConcatenateTables(const std::vector>& tables, for (int i = 1; i < ntables; ++i) { if (!tables[i]->schema()->Equals(*schema, false)) { - std::stringstream ss; - ss << "Schema at index " << static_cast(i) << " was different: \n" - << schema->ToString() << "\nvs\n" - << tables[i]->schema()->ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Schema at index ", static_cast(i), + " was different: \n", schema->ToString(), "\nvs\n", + tables[i]->schema()->ToString()); } } diff --git a/cpp/src/arrow/util/compression_brotli.cc b/cpp/src/arrow/util/compression_brotli.cc index 89d099d6a6067..3d75253e11d9f 100644 --- a/cpp/src/arrow/util/compression_brotli.cc +++ b/cpp/src/arrow/util/compression_brotli.cc @@ -81,9 +81,7 @@ class BrotliDecompressor : public Decompressor { Status BrotliError(const char* msg) { return Status::IOError(msg); } Status BrotliError(BrotliDecoderErrorCode code, const char* prefix_msg) { - std::stringstream ss; - ss << prefix_msg << BrotliDecoderErrorString(code); - return Status::IOError(ss.str()); + return Status::IOError(prefix_msg, BrotliDecoderErrorString(code)); } BrotliDecoderState* state_ = nullptr; diff --git a/cpp/src/arrow/util/compression_lz4.cc b/cpp/src/arrow/util/compression_lz4.cc index 97fd46ab6c587..d157ba6176054 100644 --- a/cpp/src/arrow/util/compression_lz4.cc +++ b/cpp/src/arrow/util/compression_lz4.cc @@ -31,6 +31,10 @@ namespace arrow { namespace util { +static Status LZ4Error(LZ4F_errorCode_t ret, const char* prefix_msg) { + return Status::IOError(prefix_msg, 
LZ4F_getErrorName(ret)); +} + // ---------------------------------------------------------------------- // Lz4 decompressor implementation @@ -79,12 +83,6 @@ class LZ4Decompressor : public Decompressor { bool IsFinished() override { return finished_; } protected: - Status LZ4Error(LZ4F_errorCode_t ret, const char* prefix_msg) { - std::stringstream ss; - ss << prefix_msg << LZ4F_getErrorName(ret); - return Status::IOError(ss.str()); - } - LZ4F_dctx* ctx_ = nullptr; bool finished_; }; @@ -125,12 +123,6 @@ class LZ4Compressor : public Compressor { bool* should_retry) override; protected: - Status LZ4Error(LZ4F_errorCode_t ret, const char* prefix_msg) { - std::stringstream ss; - ss << prefix_msg << LZ4F_getErrorName(ret); - return Status::IOError(ss.str()); - } - LZ4F_cctx* ctx_ = nullptr; LZ4F_preferences_t prefs_; bool first_time_; diff --git a/cpp/src/arrow/util/compression_snappy.cc b/cpp/src/arrow/util/compression_snappy.cc index 1b483e5855209..058593fe13d4e 100644 --- a/cpp/src/arrow/util/compression_snappy.cc +++ b/cpp/src/arrow/util/compression_snappy.cc @@ -57,10 +57,8 @@ Status SnappyCodec::Decompress(int64_t input_len, const uint8_t* input, return Status::IOError("Corrupt snappy compressed data."); } if (output_buffer_len < static_cast(decompressed_size)) { - std::stringstream ss; - ss << "Output buffer size (" << output_buffer_len << ") must be " << decompressed_size - << " or larger."; - return Status::Invalid(ss.str()); + return Status::Invalid("Output buffer size (", output_buffer_len, ") must be ", + decompressed_size, " or larger."); } if (output_len) { *output_len = static_cast(decompressed_size); diff --git a/cpp/src/arrow/util/compression_zlib.cc b/cpp/src/arrow/util/compression_zlib.cc index 686dffa640940..dfda317e3bf36 100644 --- a/cpp/src/arrow/util/compression_zlib.cc +++ b/cpp/src/arrow/util/compression_zlib.cc @@ -76,6 +76,10 @@ static int DecompressionWindowBitsForFormat(GZipCodec::Format format) { } } +static Status ZlibErrorPrefix(const char* prefix_msg, const char* msg) { + return Status::IOError(prefix_msg, (msg) ? 
msg : "(unknown error)"); +} + // ---------------------------------------------------------------------- // gzip decompressor implementation @@ -142,14 +146,7 @@ class GZipDecompressor : public Decompressor { protected: Status ZlibError(const char* prefix_msg) { - std::stringstream ss; - ss << prefix_msg; - if (stream_.msg && *stream_.msg) { - ss << stream_.msg; - } else { - ss << "(unknown error)"; - } - return Status::IOError(ss.str()); + return ZlibErrorPrefix(prefix_msg, stream_.msg); } z_stream stream_; @@ -197,14 +194,7 @@ class GZipCompressor : public Compressor { protected: Status ZlibError(const char* prefix_msg) { - std::stringstream ss; - ss << prefix_msg; - if (stream_.msg && *stream_.msg) { - ss << stream_.msg; - } else { - ss << "(unknown error)"; - } - return Status::IOError(ss.str()); + return ZlibErrorPrefix(prefix_msg, stream_.msg); } z_stream stream_; @@ -344,9 +334,7 @@ class GZipCodec::GZipCodecImpl { int window_bits = CompressionWindowBitsForFormat(format_); if ((ret = deflateInit2(&stream_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, window_bits, kGZipDefaultCompressionLevel, Z_DEFAULT_STRATEGY)) != Z_OK) { - std::stringstream ss; - ss << "zlib deflateInit failed: " << std::string(stream_.msg); - return Status::IOError(ss.str()); + return ZlibErrorPrefix("zlib deflateInit failed: ", stream_.msg); } compressor_initialized_ = true; return Status::OK(); @@ -367,9 +355,7 @@ class GZipCodec::GZipCodecImpl { // Initialize to run either deflate or zlib/gzip format int window_bits = DecompressionWindowBitsForFormat(format_); if ((ret = inflateInit2(&stream_, window_bits)) != Z_OK) { - std::stringstream ss; - ss << "zlib inflateInit failed: " << std::string(stream_.msg); - return Status::IOError(ss.str()); + return ZlibErrorPrefix("zlib inflateInit failed: ", stream_.msg); } decompressor_initialized_ = true; return Status::OK(); @@ -401,9 +387,7 @@ class GZipCodec::GZipCodecImpl { // Reset the stream for this block if (inflateReset(&stream_) != Z_OK) { - std::stringstream ss; - ss << "zlib inflateReset failed: " << std::string(stream_.msg); - return Status::IOError(ss.str()); + return ZlibErrorPrefix("zlib inflateReset failed: ", stream_.msg); } int ret = 0; @@ -425,18 +409,13 @@ class GZipCodec::GZipCodecImpl { if (ret == Z_STREAM_END || ret != Z_OK) break; // Failure, buffer was too small - std::stringstream ss; - ss << "Too small a buffer passed to GZipCodec. InputLength=" << input_length - << " OutputLength=" << output_buffer_length; - return Status::IOError(ss.str()); + return Status::IOError("Too small a buffer passed to GZipCodec. 
InputLength=", + input_length, " OutputLength=", output_buffer_length); } // Failure for some other reason if (ret != Z_STREAM_END) { - std::stringstream ss; - ss << "GZipCodec failed: "; - if (stream_.msg != NULL) ss << stream_.msg; - return Status::IOError(ss.str()); + return ZlibErrorPrefix("GZipCodec failed: ", stream_.msg); } if (output_length) { @@ -475,15 +454,12 @@ class GZipCodec::GZipCodecImpl { // small return Status::IOError("zlib deflate failed, output buffer too small"); } - std::stringstream ss; - ss << "zlib deflate failed: " << stream_.msg; - return Status::IOError(ss.str()); + + return ZlibErrorPrefix("zlib deflate failed: ", stream_.msg); } if (deflateReset(&stream_) != Z_OK) { - std::stringstream ss; - ss << "zlib deflateReset failed: " << std::string(stream_.msg); - return Status::IOError(ss.str()); + return ZlibErrorPrefix("zlib deflateReset failed: ", stream_.msg); } // Actual output length diff --git a/cpp/src/arrow/util/compression_zstd.cc b/cpp/src/arrow/util/compression_zstd.cc index 083cae99b9730..de9df8fc9492e 100644 --- a/cpp/src/arrow/util/compression_zstd.cc +++ b/cpp/src/arrow/util/compression_zstd.cc @@ -36,9 +36,7 @@ namespace util { constexpr int kZSTDDefaultCompressionLevel = 1; static Status ZSTDError(size_t ret, const char* prefix_msg) { - std::stringstream ss; - ss << prefix_msg << ZSTD_getErrorName(ret); - return Status::IOError(ss.str()); + return Status::IOError(prefix_msg, ZSTD_getErrorName(ret)); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index c47ac82e8ce3c..f6e110561b275 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -345,9 +345,7 @@ Status Decimal128::FromString(const util::string_view& s, Decimal128* out, DecimalComponents dec; if (!ParseDecimalComponents(s.data(), s.size(), &dec)) { - std::stringstream ss; - ss << "The string '" << s << "' is not a valid decimal number"; - return Status::Invalid(ss.str()); + return Status::Invalid("The string '", s, "' is not a valid decimal number"); } std::string exponent_value = dec.exponent_sign + dec.exponent_digits; @@ -878,11 +876,9 @@ Status Decimal128::Rescale(int32_t original_scale, int32_t new_scale, // Fail if we overflow or truncate if (ARROW_PREDICT_FALSE(rescale_would_cause_data_loss)) { - std::stringstream buf; - buf << "Rescaling decimal value " << ToString(original_scale) - << " from original scale of " << original_scale << " to new scale of " - << new_scale << " would cause data loss"; - return Status::Invalid(buf.str()); + return Status::Invalid("Rescaling decimal value ", ToString(original_scale), + " from original scale of ", original_scale, + " to new scale of ", new_scale, " would cause data loss"); } return Status::OK(); @@ -909,11 +905,9 @@ Status Decimal128::FromBigEndian(const uint8_t* bytes, int32_t length, Decimal12 int64_t high, low; if (length < kMinDecimalBytes || length > kMaxDecimalBytes) { - std::ostringstream stream; - stream << "Length of byte array passed to Decimal128::FromBigEndian "; - stream << "was " << length << ", but must be between "; - stream << kMinDecimalBytes << " and " << kMaxDecimalBytes; - return Status::Invalid(stream.str()); + return Status::Invalid("Length of byte array passed to Decimal128::FromBigEndian ", + "was ", length, ", but must be between ", kMinDecimalBytes, + " and ", kMaxDecimalBytes); } // Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the diff --git 
a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h index fe76d25eb41d0..f59a4a42abed6 100644 --- a/cpp/src/arrow/util/decimal.h +++ b/cpp/src/arrow/util/decimal.h @@ -149,9 +149,8 @@ class ARROW_EXPORT Decimal128 { constexpr auto max_value = std::numeric_limits::max(); const auto& self = *this; if (self < min_value || self > max_value) { - std::stringstream buf; - buf << "Invalid cast from Decimal128 to " << sizeof(T) << " byte integer"; - return Status::Invalid(buf.str()); + return Status::Invalid("Invalid cast from Decimal128 to ", sizeof(T), + " byte integer"); } *out = static_cast(low_bits_); return Status::OK(); diff --git a/cpp/src/arrow/util/io-util.cc b/cpp/src/arrow/util/io-util.cc index 74ad80691da94..5d67fe87fa0e5 100644 --- a/cpp/src/arrow/util/io-util.cc +++ b/cpp/src/arrow/util/io-util.cc @@ -113,10 +113,8 @@ static inline Status CheckFileOpResult(int ret, int errno_actual, const PlatformFilename& file_name, const char* opname) { if (ret == -1) { - std::stringstream ss; - ss << "Failed to " << opname << " file: " << file_name.string(); - ss << " , error: " << std::strerror(errno_actual); - return Status::IOError(ss.str()); + return Status::IOError("Failed to ", opname, " file: ", file_name.string(), + " , error: ", std::strerror(errno_actual)); } return Status::OK(); } @@ -232,12 +230,18 @@ Status CreatePipe(int fd[2]) { #endif if (ret == -1) { - return Status::IOError(std::string("Error creating pipe: ") + - std::string(strerror(errno))); + return Status::IOError("Error creating pipe: ", std::strerror(errno)); } return Status::OK(); } +static Status StatusFromErrno(const char* prefix) { +#ifdef _WIN32 + errno = __map_mman_error(GetLastError(), EPERM); +#endif + return Status::IOError(prefix, std::strerror(errno)); +} + // // Compatible way to remap a memory map // @@ -251,18 +255,12 @@ Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes, HANDLE fm, h; if (!UnmapViewOfFile(addr)) { - errno = __map_mman_error(GetLastError(), EPERM); - std::stringstream ss; - ss << "UnmapViewOfFile failed: " << std::strerror(errno); - return Status::IOError(ss.str()); + return StatusFromErrno("UnmapViewOfFile failed: "); } h = reinterpret_cast(_get_osfhandle(fildes)); if (h == INVALID_HANDLE_VALUE) { - errno = __map_mman_error(GetLastError(), EPERM); - std::stringstream ss; - ss << "cannot get file handle: " << std::strerror(errno); - return Status::IOError(ss.str()); + return StatusFromErrno("Cannot get file handle: "); } LONG new_size_low = static_cast(new_size & 0xFFFFFFFFL); @@ -272,18 +270,12 @@ Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes, SetEndOfFile(h); fm = CreateFileMapping(h, NULL, PAGE_READWRITE, 0, 0, ""); if (fm == NULL) { - errno = __map_mman_error(GetLastError(), EPERM); - std::stringstream ss; - ss << "mremap failed: " << std::strerror(errno); - return Status::IOError(ss.str()); + return StatusFromErrno("CreateFileMapping failed: "); } *new_addr = MapViewOfFile(fm, FILE_MAP_WRITE, 0, 0, new_size); CloseHandle(fm); if (new_addr == NULL) { - errno = __map_mman_error(GetLastError(), EPERM); - std::stringstream ss; - ss << "mremap failed: " << std::strerror(errno); - return Status::IOError(ss.str()); + return StatusFromErrno("MapViewOfFile failed: "); } return Status::OK(); #else @@ -291,26 +283,26 @@ Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes, // we have to close the mmap first, truncate the file to the new size // and recreate the mmap if (munmap(addr, old_size) == -1) { - 
std::stringstream ss; - ss << "munmap failed: " << std::strerror(errno); - return Status::IOError(ss.str()); + return StatusFromErrno("munmap failed: "); } if (ftruncate(fildes, new_size) == -1) { - std::stringstream ss; - ss << "cannot truncate file: " << std::strerror(errno); - return Status::IOError(ss.str()); + return StatusFromErrno("ftruncate failed: "); } // we set READ / WRITE flags on the new map, since we could only have // unlarged a RW map in the first place *new_addr = mmap(NULL, new_size, PROT_READ | PROT_WRITE, MAP_SHARED, fildes, 0); + if (*new_addr == MAP_FAILED) { + return StatusFromErrno("mmap failed: "); + } return Status::OK(); #else if (ftruncate(fildes, new_size) == -1) { - std::stringstream ss; - ss << "file truncate failed: " << std::strerror(errno); - return Status::IOError(ss.str()); + return StatusFromErrno("ftruncate failed: "); } *new_addr = mremap(addr, old_size, new_size, MREMAP_MAYMOVE); + if (*new_addr == MAP_FAILED) { + return StatusFromErrno("mremap failed: "); + } return Status::OK(); #endif #endif diff --git a/cpp/src/arrow/util/string_builder.h b/cpp/src/arrow/util/string_builder.h new file mode 100644 index 0000000000000..7b3e10742a9a9 --- /dev/null +++ b/cpp/src/arrow/util/string_builder.h @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. template + +#ifndef ARROW_UTIL_STRING_BUILDER_H +#define ARROW_UTIL_STRING_BUILDER_H + +#include +#include +#include + +namespace arrow { +namespace util { + +template +void StringBuilderRecursive(std::stringstream& stream, Head&& head) { + stream << head; +} + +template +void StringBuilderRecursive(std::stringstream& stream, Head&& head, Tail&&... tail) { + StringBuilderRecursive(stream, std::forward(head)); + StringBuilderRecursive(stream, std::forward(tail)...); +} + +template +std::string StringBuilder(Args&&... 
args) { + std::stringstream stream; + + StringBuilderRecursive(stream, std::forward(args)...); + + return stream.str(); +} + +} // namespace util +} // namespace arrow + +#endif // ARROW_UTIL_STRING_BUILDER_H diff --git a/cpp/src/gandiva/date_utils.cc b/cpp/src/gandiva/date_utils.cc index 2686b193500ff..8a7e1f03fbd20 100644 --- a/cpp/src/gandiva/date_utils.cc +++ b/cpp/src/gandiva/date_utils.cc @@ -75,11 +75,8 @@ Status DateUtils::ToInternalFormat(const std::string& format, buffer.str(""); continue; } else { - if (buffer.str().length() > 0) { - std::stringstream err_msg; - err_msg << "Invalid date format string '" << format << "' at position " << i; - return Status::Invalid(err_msg.str()); - } + ARROW_RETURN_IF(buffer.str().length() > 0, + Status::Invalid("Invalid date format string '", format, "'")); is_in_quoted_text = true; continue; @@ -156,10 +153,7 @@ Status DateUtils::ToInternalFormat(const std::string& format, } } } else { - // no potential matches found - std::stringstream err_msg; - err_msg << "Invalid date format string '" << format << "' at position " << i; - return Status::Invalid(err_msg.str()); + return Status::Invalid("Invalid date format string '", format, "'"); } } @@ -170,11 +164,10 @@ Status DateUtils::ToInternalFormat(const std::string& format, if (exactMatches.size() == 1 && exactMatches[0].length() == buffer.str().length()) { builder << sql_date_format_to_boost_map_[exactMatches[0]]; } else { - // we didn't successfully parse the entire string + // Format partially parsed int64_t pos = format.length() - buffer.str().length(); - std::stringstream err_msg; - err_msg << "Invalid date format string '" << format << "' at position " << pos; - return Status::Invalid(err_msg.str()); + return Status::Invalid("Invalid date format string '", format, "' at position ", + pos); } } std::string final_pattern = builder.str(); diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc index 59884c5b4ad44..da7a6d886c0e0 100644 --- a/cpp/src/gandiva/engine.cc +++ b/cpp/src/gandiva/engine.cc @@ -103,12 +103,11 @@ Status Engine::LoadPreCompiledIRFiles(const std::string& byte_code_file_path) { /// Read from file into memory buffer. llvm::ErrorOr> buffer_or_error = llvm::MemoryBuffer::getFile(byte_code_file_path); - if (!buffer_or_error) { - std::stringstream ss; - ss << "Could not load module from IR " << byte_code_file_path << ": " - << buffer_or_error.getError().message(); - return Status::CodeGenError(ss.str()); - } + ARROW_RETURN_IF( + !buffer_or_error, + Status::CodeGenError("Could not load module from IR ", byte_code_file_path, ": ", + buffer_or_error.getError().message())); + std::unique_ptr buffer = move(buffer_or_error.get()); /// Parse the IR module. @@ -123,15 +122,11 @@ Status Engine::LoadPreCompiledIRFiles(const std::string& byte_code_file_path) { } std::unique_ptr ir_module = move(module_or_error.get()); - /// Verify the IR module - if (llvm::verifyModule(*ir_module, &llvm::errs())) { - return Status::CodeGenError("verify of IR Module failed"); - } + ARROW_RETURN_IF(llvm::verifyModule(*ir_module, &llvm::errs()), + Status::CodeGenError("verify of IR Module failed")); + ARROW_RETURN_IF(llvm::Linker::linkModules(*module_, move(ir_module)), + Status::CodeGenError("failed to link IR Modules")); - // Link this to the primary module. 
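
For illustration only, not part of the patch: arrow::util::StringBuilder, added above in cpp/src/arrow/util/string_builder.h, streams each argument into a std::stringstream and returns the accumulated std::string; the variadic Status factories are thin wrappers around it. A minimal usage sketch:

    #include <iostream>
    #include <string>

    #include "arrow/util/string_builder.h"

    int main() {
      // Arguments of mixed types are stringified and concatenated in order.
      std::string msg =
          arrow::util::StringBuilder("Expected length ", 10, " but got length ", 7);
      std::cout << msg << std::endl;  // prints: Expected length 10 but got length 7
      return 0;
    }
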
- if (llvm::Linker::linkModules(*module_, move(ir_module))) { - return Status::CodeGenError("failed to link IR Modules"); - } return Status::OK(); } @@ -197,13 +192,13 @@ Status Engine::FinalizeModule(bool optimise_ir, bool dump_ir) { } } - if (llvm::verifyModule(*module_, &llvm::errs())) { - return Status::CodeGenError("verify of module failed after optimisation passes"); - } + ARROW_RETURN_IF(llvm::verifyModule(*module_, &llvm::errs()), + Status::CodeGenError("Module verification failed after optimizer")); // do the compilation execution_engine_->finalizeObject(); module_finalized_ = true; + return Status::OK(); } diff --git a/cpp/src/gandiva/expr_validator.cc b/cpp/src/gandiva/expr_validator.cc index 3f5d63745f942..43de9d7a053f8 100644 --- a/cpp/src/gandiva/expr_validator.cc +++ b/cpp/src/gandiva/expr_validator.cc @@ -24,133 +24,114 @@ namespace gandiva { Status ExprValidator::Validate(const ExpressionPtr& expr) { - if (expr == nullptr) { - return Status::ExpressionValidationError("Expression cannot be null."); - } + ARROW_RETURN_IF(expr == nullptr, + Status::ExpressionValidationError("Expression cannot be null")); + Node& root = *expr->root(); - Status status = root.Accept(*this); - if (!status.ok()) { - return status; - } - // validate return type matches - // no need to check if type is supported - // since root type has been validated. - if (!root.return_type()->Equals(*expr->result()->type())) { - std::stringstream ss; - ss << "Return type of root node " << root.return_type()->name() - << " does not match that of expression " << *expr->result()->type(); - return Status::ExpressionValidationError(ss.str()); - } + ARROW_RETURN_NOT_OK(root.Accept(*this)); + + // Ensure the root's return type matches the expression's return type. Type + // support validation is not required because root type is already supported. + ARROW_RETURN_IF(!root.return_type()->Equals(*expr->result()->type()), + Status::ExpressionValidationError("Return type of root node ", + root.return_type()->name(), + " does not match that of expression ", + expr->result()->type()->name())); + return Status::OK(); } Status ExprValidator::Visit(const FieldNode& node) { auto llvm_type = types_->IRType(node.return_type()->id()); - if (llvm_type == nullptr) { - std::stringstream ss; - ss << "Field " << node.field()->name() << " has unsupported data type " - << node.return_type()->name(); - return Status::ExpressionValidationError(ss.str()); - } + ARROW_RETURN_IF(llvm_type == nullptr, + Status::ExpressionValidationError("Field ", node.field()->name(), + " has unsupported data type ", + node.return_type()->name())); + // Ensure that field is found in schema auto field_in_schema_entry = field_map_.find(node.field()->name()); + ARROW_RETURN_IF(field_in_schema_entry == field_map_.end(), + Status::ExpressionValidationError("Field ", node.field()->name(), + " not in schema.")); - // validate that field is in schema. - if (field_in_schema_entry == field_map_.end()) { - std::stringstream ss; - ss << "Field " << node.field()->name() << " not in schema."; - return Status::ExpressionValidationError(ss.str()); - } - + // Ensure that the found field matches the definition in the schema. FieldPtr field_in_schema = field_in_schema_entry->second; - // validate that field matches the definition in schema. 
- if (!field_in_schema->Equals(node.field())) { - std::stringstream ss; - ss << "Field definition in schema " << field_in_schema->ToString() - << " different from field in expression " << node.field()->ToString(); - return Status::ExpressionValidationError(ss.str()); - } + ARROW_RETURN_IF(!field_in_schema->Equals(node.field()), + Status::ExpressionValidationError( + "Field definition in schema ", field_in_schema->ToString(), + " different from field in expression ", node.field()->ToString())); + return Status::OK(); } Status ExprValidator::Visit(const FunctionNode& node) { auto desc = node.descriptor(); FunctionSignature signature(desc->name(), desc->params(), desc->return_type()); + const NativeFunction* native_function = registry_.LookupSignature(signature); - if (native_function == nullptr) { - std::stringstream ss; - ss << "Function " << signature.ToString() << " not supported yet. "; - return Status::ExpressionValidationError(ss.str()); - } + ARROW_RETURN_IF(native_function == nullptr, + Status::ExpressionValidationError("Function ", signature.ToString(), + " not supported yet. ")); for (auto& child : node.children()) { - Status status = child->Accept(*this); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(child->Accept(*this)); } + return Status::OK(); } Status ExprValidator::Visit(const IfNode& node) { - Status status = node.condition()->Accept(*this); - ARROW_RETURN_NOT_OK(status); - status = node.then_node()->Accept(*this); - ARROW_RETURN_NOT_OK(status); - status = node.else_node()->Accept(*this); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(node.condition()->Accept(*this)); + ARROW_RETURN_NOT_OK(node.then_node()->Accept(*this)); + ARROW_RETURN_NOT_OK(node.else_node()->Accept(*this)); auto if_node_ret_type = node.return_type(); auto then_node_ret_type = node.then_node()->return_type(); auto else_node_ret_type = node.else_node()->return_type(); - if (!if_node_ret_type->Equals(*then_node_ret_type)) { - std::stringstream ss; - ss << "Return type of if " << *if_node_ret_type << " and then " << *then_node_ret_type - << " not matching."; - return Status::ExpressionValidationError(ss.str()); - } + // Then-branch return type must match. + ARROW_RETURN_IF(!if_node_ret_type->Equals(*then_node_ret_type), + Status::ExpressionValidationError( + "Return type of if ", if_node_ret_type->ToString(), " and then ", + then_node_ret_type->ToString(), " not matching.")); - if (!if_node_ret_type->Equals(*else_node_ret_type)) { - std::stringstream ss; - ss << "Return type of if " << *if_node_ret_type << " and else " << *else_node_ret_type - << " not matching."; - return Status::ExpressionValidationError(ss.str()); - } + // Else-branch return type must match. 
+ ARROW_RETURN_IF(!if_node_ret_type->Equals(*else_node_ret_type), + Status::ExpressionValidationError( + "Return type of if ", if_node_ret_type->ToString(), " and else ", + else_node_ret_type->ToString(), " not matching.")); return Status::OK(); } Status ExprValidator::Visit(const LiteralNode& node) { auto llvm_type = types_->IRType(node.return_type()->id()); - if (llvm_type == nullptr) { - std::stringstream ss; - ss << "Value " << node.holder() << " has unsupported data type " - << node.return_type()->name(); - return Status::ExpressionValidationError(ss.str()); - } + ARROW_RETURN_IF(llvm_type == nullptr, + Status::ExpressionValidationError("Value ", node.holder(), + " has unsupported data type ", + node.return_type()->name())); + return Status::OK(); } Status ExprValidator::Visit(const BooleanNode& node) { - Status status; - - if (node.children().size() < 2) { - std::stringstream ss; - ss << "Boolean expression has " << node.children().size() - << " children, expected atleast two"; - return Status::ExpressionValidationError(ss.str()); - } + ARROW_RETURN_IF( + node.children().size() < 2, + Status::ExpressionValidationError("Boolean expression has ", node.children().size(), + " children, expected at least two")); for (auto& child : node.children()) { - if (!child->return_type()->Equals(arrow::boolean())) { - std::stringstream ss; - ss << "Boolean expression has a child with return type " - << child->return_type()->name() << ", expected return type boolean"; - return Status::ExpressionValidationError(ss.str()); - } - - status = child->Accept(*this); - ARROW_RETURN_NOT_OK(status); + const auto bool_type = arrow::boolean(); + const auto ret_type = child->return_type(); + + ARROW_RETURN_IF(!ret_type->Equals(bool_type), + Status::ExpressionValidationError( + "Boolean expression has a child with return type ", + ret_type->ToString(), ", expected return type boolean")); + + ARROW_RETURN_NOT_OK(child->Accept(*this)); } + return Status::OK(); } @@ -178,18 +159,13 @@ Status ExprValidator::Visit(const InExpressionNode& node) { Status ExprValidator::ValidateInExpression(size_t number_of_values, DataTypePtr in_expr_return_type, DataTypePtr type_of_values) { - if (static_cast(number_of_values) == 0) { - std::stringstream ss; - ss << "IN Expression needs a non-empty constant list to match."; - return Status::ExpressionValidationError(ss.str()); - } - - if (!in_expr_return_type->Equals(type_of_values)) { - std::stringstream ss; - ss << "Evaluation expression for IN clause returns " << in_expr_return_type - << " values are of type" << type_of_values; - return Status::ExpressionValidationError(ss.str()); - } + ARROW_RETURN_IF(number_of_values == 0, + Status::ExpressionValidationError( + "IN Expression needs a non-empty constant list to match.")); + ARROW_RETURN_IF(!in_expr_return_type->Equals(type_of_values), + Status::ExpressionValidationError( + "Evaluation expression for IN clause returns ", in_expr_return_type, + ", values are of type ", type_of_values)); return Status::OK(); } diff --git a/cpp/src/gandiva/filter.cc b/cpp/src/gandiva/filter.cc index 7a24d9554ef3f..6075e2574559b 100644 --- a/cpp/src/gandiva/filter.cc +++ b/cpp/src/gandiva/filter.cc @@ -40,32 +40,28 @@ Filter::Filter(std::unique_ptr llvm_generator, SchemaPtr schema, Status Filter::Make(SchemaPtr schema, ConditionPtr condition, std::shared_ptr configuration, std::shared_ptr* filter) { - ARROW_RETURN_FAILURE_IF_FALSE(schema != nullptr, - Status::Invalid("schema cannot be null")); - ARROW_RETURN_FAILURE_IF_FALSE(condition != nullptr, - 
Status::Invalid("condition cannot be null")); - ARROW_RETURN_FAILURE_IF_FALSE(configuration != nullptr, - Status::Invalid("configuration cannot be null")); + ARROW_RETURN_IF(schema == nullptr, Status::Invalid("Schema cannot be null")); + ARROW_RETURN_IF(condition == nullptr, Status::Invalid("Condition cannot be null")); + ARROW_RETURN_IF(configuration == nullptr, + Status::Invalid("Configuration cannot be null")); + static Cache> cache; FilterCacheKey cache_key(schema, configuration, *(condition.get())); - std::shared_ptr cachedFilter = cache.GetModule(cache_key); + auto cachedFilter = cache.GetModule(cache_key); if (cachedFilter != nullptr) { *filter = cachedFilter; return Status::OK(); } + // Build LLVM generator, and generate code for the specified expression std::unique_ptr llvm_gen; - Status status = LLVMGenerator::Make(configuration, &llvm_gen); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(LLVMGenerator::Make(configuration, &llvm_gen)); // Run the validation on the expression. // Return if the expression is invalid since we will not be able to process further. ExprValidator expr_validator(llvm_gen->types(), schema); - status = expr_validator.Validate(condition); - ARROW_RETURN_NOT_OK(status); - - status = llvm_gen->Build({condition}); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(expr_validator.Validate(condition)); + ARROW_RETURN_NOT_OK(llvm_gen->Build({condition})); // Instantiate the filter with the completely built llvm generator *filter = std::make_shared(std::move(llvm_gen), schema, configuration); @@ -76,42 +72,33 @@ Status Filter::Make(SchemaPtr schema, ConditionPtr condition, Status Filter::Evaluate(const arrow::RecordBatch& batch, std::shared_ptr out_selection) { - if (!batch.schema()->Equals(*schema_)) { - return Status::Invalid("Schema in RecordBatch must match the schema in Make()"); - } - if (batch.num_rows() == 0) { - return Status::Invalid("RecordBatch must be non-empty."); - } - if (out_selection == nullptr) { - return Status::Invalid("out_selection must be non-null."); - } - if (out_selection->GetMaxSlots() < batch.num_rows()) { - std::stringstream ss; - ss << "out_selection has " << out_selection->GetMaxSlots() - << " slots, which is less than the batch size " << batch.num_rows(); - return Status::Invalid(ss.str()); - } + const auto num_rows = batch.num_rows(); + ARROW_RETURN_IF(!batch.schema()->Equals(*schema_), + Status::Invalid("RecordBatch schema must expected filter schema")); + ARROW_RETURN_IF(num_rows == 0, Status::Invalid("RecordBatch must be non-empty.")); + ARROW_RETURN_IF(out_selection == nullptr, + Status::Invalid("out_selection must be non-null.")); + ARROW_RETURN_IF(out_selection->GetMaxSlots() < num_rows, + Status::Invalid("Output selection vector capacity too small")); // Allocate three local_bitmaps (one for output, one for validity, one to compute the // intersection). - LocalBitMapsHolder bitmaps(batch.num_rows(), 3 /*local_bitmaps*/); + LocalBitMapsHolder bitmaps(num_rows, 3 /*local_bitmaps*/); int64_t bitmap_size = bitmaps.GetLocalBitMapSize(); auto validity = std::make_shared(bitmaps.GetLocalBitMap(0), bitmap_size); auto value = std::make_shared(bitmaps.GetLocalBitMap(1), bitmap_size); - auto array_data = - arrow::ArrayData::Make(arrow::boolean(), batch.num_rows(), {validity, value}); + auto array_data = arrow::ArrayData::Make(arrow::boolean(), num_rows, {validity, value}); // Execute the expression(s). 
- auto status = llvm_generator_->Execute(batch, {array_data}); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(llvm_generator_->Execute(batch, {array_data})); // Compute the intersection of the value and validity. auto result = bitmaps.GetLocalBitMap(2); BitMapAccumulator::IntersectBitMaps( - result, {bitmaps.GetLocalBitMap(0), bitmaps.GetLocalBitMap((1))}, batch.num_rows()); + result, {bitmaps.GetLocalBitMap(0), bitmaps.GetLocalBitMap((1))}, num_rows); - return out_selection->PopulateFromBitMap(result, bitmap_size, batch.num_rows() - 1); + return out_selection->PopulateFromBitMap(result, bitmap_size, num_rows - 1); } } // namespace gandiva diff --git a/cpp/src/gandiva/like_holder.cc b/cpp/src/gandiva/like_holder.cc index d659b22c46e34..051b75b7dc137 100644 --- a/cpp/src/gandiva/like_holder.cc +++ b/cpp/src/gandiva/like_holder.cc @@ -50,39 +50,40 @@ const FunctionNode LikeHolder::TryOptimize(const FunctionNode& node) { } } - // didn't hit any of the optimisation paths. return original. + // Could not optimize, return original node. return node; } +static bool IsArrowStringLiteral(arrow::Type::type type) { + return type == arrow::Type::STRING || type == arrow::Type::BINARY; +} + Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr* holder) { - if (node.children().size() != 2) { - return Status::Invalid("'like' function requires two parameters"); - } + ARROW_RETURN_IF(node.children().size() != 2, + Status::Invalid("'like' function requires two parameters")); auto literal = dynamic_cast(node.children().at(1).get()); - if (literal == nullptr) { - return Status::Invalid("'like' function requires a literal as the second parameter"); - } + ARROW_RETURN_IF( + literal == nullptr, + Status::Invalid("'like' function requires a literal as the second parameter")); auto literal_type = literal->return_type()->id(); - if (literal_type != arrow::Type::STRING && literal_type != arrow::Type::BINARY) { - return Status::Invalid( - "'like' function requires a string literal as the second parameter"); - } - auto pattern = boost::get(literal->holder()); - return Make(pattern, holder); + ARROW_RETURN_IF( + !IsArrowStringLiteral(literal_type), + Status::Invalid( + "'like' function requires a string literal as the second parameter")); + + return Make(boost::get(literal->holder()), holder); } Status LikeHolder::Make(const std::string& sql_pattern, std::shared_ptr* holder) { std::string pcre_pattern; - auto status = RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern)); - if (!lholder->regex_.ok()) { - return Status::Invalid("building re2 regex failed for pattern " + pcre_pattern); - } + ARROW_RETURN_IF(!lholder->regex_.ok(), + Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed")); *holder = lholder; return Status::OK(); diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index 82d0386cfb9f3..50f147b2fc7dd 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -44,10 +44,10 @@ LLVMGenerator::LLVMGenerator() Status LLVMGenerator::Make(std::shared_ptr config, std::unique_ptr* llvm_generator) { std::unique_ptr llvmgen_obj(new LLVMGenerator()); - Status status = Engine::Make(config, &(llvmgen_obj->engine_)); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(Engine::Make(config, &(llvmgen_obj->engine_))); *llvm_generator = 
std::move(llvmgen_obj); + return Status::OK(); } @@ -57,33 +57,29 @@ Status LLVMGenerator::Add(const ExpressionPtr expr, const FieldDescriptorPtr out // decompose the expression to separate out value and validities. ExprDecomposer decomposer(function_registry_, annotator_); ValueValidityPairPtr value_validity; - auto status = decomposer.Decompose(*expr->root(), &value_validity); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(decomposer.Decompose(*expr->root(), &value_validity)); // Generate the IR function for the decomposed expression. llvm::Function* ir_function = nullptr; - status = CodeGenExprValue(value_validity->value_expr(), output, idx, &ir_function); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK( + CodeGenExprValue(value_validity->value_expr(), output, idx, &ir_function)); std::unique_ptr compiled_expr( new CompiledExpr(value_validity, output, ir_function)); compiled_exprs_.push_back(std::move(compiled_expr)); + return Status::OK(); } /// Build and optimise module for projection expression. Status LLVMGenerator::Build(const ExpressionVector& exprs) { - Status status; - for (auto& expr : exprs) { auto output = annotator_.AddOutputFieldDescriptor(expr->result()); - status = Add(expr, output); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(Add(expr, output)); } - // optimise, compile and finalize the module - status = engine_->FinalizeModule(optimise_ir_, dump_ir_); - ARROW_RETURN_NOT_OK(status); + // Optimize, compile and finalize the module + ARROW_RETURN_NOT_OK(engine_->FinalizeModule(optimise_ir_, dump_ir_)); // setup the jit functions for each expression. for (auto& compiled_expr : compiled_exprs_) { @@ -91,6 +87,7 @@ Status LLVMGenerator::Build(const ExpressionVector& exprs) { EvalFunc fn = reinterpret_cast(engine_->CompiledFunction(ir_func)); compiled_expr->set_jit_function(fn); } + return Status::OK(); } @@ -107,13 +104,15 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, EvalFunc jit_function = compiled_expr->jit_function(); jit_function(eval_batch->GetBufferArray(), eval_batch->GetLocalBitMapArray(), (int64_t)eval_batch->GetExecutionContext(), record_batch.num_rows()); - // check for execution errors - if (eval_batch->GetExecutionContext()->has_error()) { - return Status::ExecutionError(eval_batch->GetExecutionContext()->get_error()); - } + + ARROW_RETURN_IF( + eval_batch->GetExecutionContext()->has_error(), + Status::ExecutionError(eval_batch->GetExecutionContext()->get_error())); + // generate validity vectors. 
ComputeBitMapsForExpr(*compiled_expr, *eval_batch); } + return Status::OK(); } @@ -233,8 +232,8 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, FieldDescriptorPtr out engine_->AddFunctionToCompile(func_name); *fn = llvm::Function::Create(prototype, llvm::GlobalValue::ExternalLinkage, func_name, module()); - ARROW_RETURN_FAILURE_IF_FALSE((*fn != nullptr), - Status::CodeGenError("Error creating function.")); + ARROW_RETURN_IF((*fn == nullptr), Status::CodeGenError("Error creating function.")); + // Name the arguments llvm::Function::arg_iterator args = (*fn)->arg_begin(); llvm::Value* arg_addrs = &*args; @@ -396,6 +395,7 @@ llvm::Value* LLVMGenerator::AddFunctionCall(const std::string& full_name, value = ir_builder()->CreateCall(fn, args, full_name); DCHECK(value->getType() == ret_type); } + return value; } diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index 40fdc201133a4..d5902fc72f16d 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -45,12 +45,10 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, std::shared_ptr configuration, std::shared_ptr* projector) { - ARROW_RETURN_FAILURE_IF_FALSE(schema != nullptr, - Status::Invalid("schema cannot be null")); - ARROW_RETURN_FAILURE_IF_FALSE(!exprs.empty(), - Status::Invalid("expressions need to be non-empty")); - ARROW_RETURN_FAILURE_IF_FALSE(configuration != nullptr, - Status::Invalid("configuration cannot be null")); + ARROW_RETURN_IF(schema == nullptr, Status::Invalid("Schema cannot be null")); + ARROW_RETURN_IF(exprs.empty(), Status::Invalid("Expressions cannot be empty")); + ARROW_RETURN_IF(configuration == nullptr, + Status::Invalid("Configuration cannot be null")); // see if equivalent projector was already built static Cache> cache; @@ -63,23 +61,21 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, // Build LLVM generator, and generate code for the specified expressions std::unique_ptr llvm_gen; - Status status = LLVMGenerator::Make(configuration, &llvm_gen); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(LLVMGenerator::Make(configuration, &llvm_gen)); // Run the validation on the expressions. // Return if any of the expression is invalid since // we will not be able to process further. ExprValidator expr_validator(llvm_gen->types(), schema); for (auto& expr : exprs) { - status = expr_validator.Validate(expr); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(expr_validator.Validate(expr)); } - status = llvm_gen->Build(exprs); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(llvm_gen->Build(exprs)); // save the output field types. Used for validation at Evaluate() time. 
std::vector output_fields; + output_fields.reserve(exprs.size()); for (auto& expr : exprs) { output_fields.push_back(expr->result()); } @@ -94,86 +90,70 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, Status Projector::Evaluate(const arrow::RecordBatch& batch, const ArrayDataVector& output_data_vecs) { - Status status = ValidateEvaluateArgsCommon(batch); - ARROW_RETURN_NOT_OK(status); - - if (output_data_vecs.size() != output_fields_.size()) { - std::stringstream ss; - ss << "number of buffers for output_data_vecs is " << output_data_vecs.size() - << ", expected " << output_fields_.size(); - return Status::Invalid(ss.str()); - } + ARROW_RETURN_NOT_OK(ValidateEvaluateArgsCommon(batch)); + ARROW_RETURN_IF( + output_data_vecs.size() != output_fields_.size(), + Status::Invalid("Number of output buffers must match number of fields")); int idx = 0; for (auto& array_data : output_data_vecs) { + const auto output_field = output_fields_[idx]; if (array_data == nullptr) { - std::stringstream ss; - ss << "array for output field " << output_fields_[idx]->name() << "is null."; - return Status::Invalid(ss.str()); + return Status::Invalid("Output array for field ", output_field->name(), + " should not be null"); } - Status status = - ValidateArrayDataCapacity(*array_data, *(output_fields_[idx]), batch.num_rows()); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK( + ValidateArrayDataCapacity(*array_data, *output_field, batch.num_rows())); ++idx; } + return llvm_generator_->Execute(batch, output_data_vecs); } Status Projector::Evaluate(const arrow::RecordBatch& batch, arrow::MemoryPool* pool, arrow::ArrayVector* output) { - Status status = ValidateEvaluateArgsCommon(batch); - ARROW_RETURN_NOT_OK(status); - - if (output == nullptr) { - return Status::Invalid("output must be non-null."); - } - - if (pool == nullptr) { - return Status::Invalid("memory pool must be non-null."); - } + ARROW_RETURN_NOT_OK(ValidateEvaluateArgsCommon(batch)); + ARROW_RETURN_IF(output == nullptr, Status::Invalid("Output must be non-null.")); + ARROW_RETURN_IF(pool == nullptr, Status::Invalid("Memory pool must be non-null.")); // Allocate the output data vecs. ArrayDataVector output_data_vecs; + output_data_vecs.reserve(output_fields_.size()); for (auto& field : output_fields_) { ArrayDataPtr output_data; - status = AllocArrayData(field->type(), batch.num_rows(), pool, &output_data); - ARROW_RETURN_NOT_OK(status); - + ARROW_RETURN_NOT_OK( + AllocArrayData(field->type(), batch.num_rows(), pool, &output_data)); output_data_vecs.push_back(output_data); } // Execute the expression(s). - status = llvm_generator_->Execute(batch, output_data_vecs); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(llvm_generator_->Execute(batch, output_data_vecs)); // Create and return array arrays. 
output->clear(); for (auto& array_data : output_data_vecs) { output->push_back(arrow::MakeArray(array_data)); } + return Status::OK(); } // TODO : handle variable-len vectors Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, arrow::MemoryPool* pool, ArrayDataPtr* array_data) { - if (!arrow::is_primitive(type->id())) { - return Status::Invalid("Unsupported output data type " + type->ToString()); - } + ARROW_RETURN_IF(!arrow::is_primitive(type->id()), + Status::Invalid("Unsupported output data type ", type)); - arrow::Status astatus; std::shared_ptr null_bitmap; - int64_t size = arrow::BitUtil::BytesForBits(num_records); - astatus = arrow::AllocateBuffer(pool, size, &null_bitmap); - ARROW_RETURN_NOT_OK(astatus); + int64_t bitmap_bytes = arrow::BitUtil::BytesForBits(num_records); + ARROW_RETURN_NOT_OK(arrow::AllocateBuffer(pool, bitmap_bytes, &null_bitmap)); std::shared_ptr data; const auto& fw_type = dynamic_cast(*type); int64_t data_len = arrow::BitUtil::BytesForBits(num_records * fw_type.bit_width()); - astatus = arrow::AllocateBuffer(pool, data_len, &data); - ARROW_RETURN_NOT_OK(astatus); + ARROW_RETURN_NOT_OK(arrow::AllocateBuffer(pool, data_len, &data)); // Valgrind detects unitialized memory at byte level. Boolean types use bits // and can leave buffer memory uninitialized in the last byte. @@ -186,47 +166,33 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, } Status Projector::ValidateEvaluateArgsCommon(const arrow::RecordBatch& batch) { - if (!batch.schema()->Equals(*schema_)) { - return Status::Invalid("Schema in RecordBatch must match the schema in Make()"); - } - if (batch.num_rows() == 0) { - return Status::Invalid("RecordBatch must be non-empty."); - } + ARROW_RETURN_IF(!batch.schema()->Equals(*schema_), + Status::Invalid("Schema in RecordBatch must match schema in Make()")); + ARROW_RETURN_IF(batch.num_rows() == 0, + Status::Invalid("RecordBatch must be non-empty.")); + return Status::OK(); } Status Projector::ValidateArrayDataCapacity(const arrow::ArrayData& array_data, const arrow::Field& field, int64_t num_records) { - // verify that there are atleast two buffers (validity and data). - if (array_data.buffers.size() < 2) { - std::stringstream ss; - ss << "number of buffers for output field " << field.name() << "is " - << array_data.buffers.size() << ", must have minimum 2."; - return Status::Invalid(ss.str()); - } + ARROW_RETURN_IF(array_data.buffers.size() < 2, + Status::Invalid("ArrayData must have at least 2 buffers")); - // verify size of bitmap buffer. int64_t min_bitmap_len = arrow::BitUtil::BytesForBits(num_records); int64_t bitmap_len = array_data.buffers[0]->capacity(); - if (bitmap_len < min_bitmap_len) { - std::stringstream ss; - ss << "bitmap buffer for output field " << field.name() << "has size " << bitmap_len - << ", must have minimum size " << min_bitmap_len; - return Status::Invalid(ss.str()); - } + ARROW_RETURN_IF(bitmap_len < min_bitmap_len, + Status::Invalid("Bitmap buffer too small for ", field.name())); // verify size of data buffer. 
// TODO : handle variable-len vectors const auto& fw_type = dynamic_cast(*field.type()); int64_t min_data_len = arrow::BitUtil::BytesForBits(num_records * fw_type.bit_width()); int64_t data_len = array_data.buffers[1]->capacity(); - if (data_len < min_data_len) { - std::stringstream ss; - ss << "data buffer for output field " << field.name() << " has size " << data_len - << ", must have minimum size " << min_data_len; - return Status::Invalid(ss.str()); - } + ARROW_RETURN_IF(data_len < min_data_len, + Status::Invalid("Data buffer too small for ", field.name())); + return Status::OK(); } diff --git a/cpp/src/gandiva/regex_util.cc b/cpp/src/gandiva/regex_util.cc index 893af095a3dd2..1d3860615d57f 100644 --- a/cpp/src/gandiva/regex_util.cc +++ b/cpp/src/gandiva/regex_util.cc @@ -38,20 +38,16 @@ Status RegexUtil::SqlLikePatternToPcre(const std::string& sql_pattern, char esca if (cur == escape_char) { // escape char must be followed by '_', '%' or the escape char itself. ++idx; - if (idx == sql_pattern.size()) { - std::stringstream msg; - msg << "unexpected escape char at the end of pattern " << sql_pattern; - return Status::Invalid(msg.str()); - } + ARROW_RETURN_IF( + idx == sql_pattern.size(), + Status::Invalid("Unexpected escape char at the end of pattern ", sql_pattern)); cur = sql_pattern.at(idx); if (cur == '_' || cur == '%' || cur == escape_char) { pcre_pattern += cur; } else { - std::stringstream msg; - msg << "invalid escape sequence in pattern " << sql_pattern << " at offset " - << idx; - return Status::Invalid(msg.str()); + return Status::Invalid("Invalid escape sequence in pattern ", sql_pattern, + " at offset ", idx); } } else if (cur == '_') { pcre_pattern += '.'; diff --git a/cpp/src/gandiva/selection_vector.cc b/cpp/src/gandiva/selection_vector.cc index 9266ca7fe1056..f89b80c2b510f 100644 --- a/cpp/src/gandiva/selection_vector.cc +++ b/cpp/src/gandiva/selection_vector.cc @@ -28,22 +28,15 @@ namespace gandiva { Status SelectionVector::PopulateFromBitMap(const uint8_t* bitmap, int64_t bitmap_size, int64_t max_bitmap_index) { - if (bitmap_size % 8 != 0) { - std::stringstream ss; - ss << "bitmap size " << bitmap_size << " must be padded to 64-bit size"; - return Status::Invalid(ss.str()); - } - if (max_bitmap_index < 0) { - std::stringstream ss; - ss << "max bitmap index " << max_bitmap_index << " must be positive"; - return Status::Invalid(ss.str()); - } - if (static_cast(max_bitmap_index) > GetMaxSupportedValue()) { - std::stringstream ss; - ss << "max_bitmap_index " << max_bitmap_index << " must be <= maxSupportedValue " - << GetMaxSupportedValue() << " in selection vector"; - return Status::Invalid(ss.str()); - } + const uint64_t max_idx = static_cast(max_bitmap_index); + ARROW_RETURN_IF(bitmap_size % 8, Status::Invalid("Bitmap size ", bitmap_size, + " must be aligned to 64-bit size")); + ARROW_RETURN_IF(max_bitmap_index < 0, + Status::Invalid("Max bitmap index must be positive")); + ARROW_RETURN_IF( + max_idx > GetMaxSupportedValue(), + Status::Invalid("max_bitmap_index ", max_idx, " must be <= maxSupportedValue ", + GetMaxSupportedValue(), " in selection vector")); int64_t max_slots = GetMaxSlots(); @@ -64,9 +57,9 @@ Status SelectionVector::PopulateFromBitMap(const uint8_t* bitmap, int64_t bitmap break; } - if (selection_idx >= max_slots) { - return Status::Invalid("selection vector has no remaining slots"); - } + ARROW_RETURN_IF(selection_idx >= max_slots, + Status::Invalid("selection vector has no remaining slots")); + SetIndex(selection_idx, pos_in_bitmap); ++selection_idx; 
@@ -81,60 +74,54 @@ Status SelectionVector::PopulateFromBitMap(const uint8_t* bitmap, int64_t bitmap Status SelectionVector::MakeInt16(int64_t max_slots, std::shared_ptr buffer, std::shared_ptr* selection_vector) { - auto status = SelectionVectorInt16::ValidateBuffer(max_slots, buffer); - ARROW_RETURN_NOT_OK(status); - + ARROW_RETURN_NOT_OK(SelectionVectorInt16::ValidateBuffer(max_slots, buffer)); *selection_vector = std::make_shared(max_slots, buffer); + return Status::OK(); } Status SelectionVector::MakeInt16(int64_t max_slots, arrow::MemoryPool* pool, std::shared_ptr* selection_vector) { std::shared_ptr buffer; - auto status = SelectionVectorInt16::AllocateBuffer(max_slots, pool, &buffer); - ARROW_RETURN_NOT_OK(status); - + ARROW_RETURN_NOT_OK(SelectionVectorInt16::AllocateBuffer(max_slots, pool, &buffer)); *selection_vector = std::make_shared(max_slots, buffer); + return Status::OK(); } Status SelectionVector::MakeInt32(int64_t max_slots, std::shared_ptr buffer, std::shared_ptr* selection_vector) { - auto status = SelectionVectorInt32::ValidateBuffer(max_slots, buffer); - ARROW_RETURN_NOT_OK(status); - + ARROW_RETURN_NOT_OK(SelectionVectorInt32::ValidateBuffer(max_slots, buffer)); *selection_vector = std::make_shared(max_slots, buffer); + return Status::OK(); } Status SelectionVector::MakeInt32(int64_t max_slots, arrow::MemoryPool* pool, std::shared_ptr* selection_vector) { std::shared_ptr buffer; - auto status = SelectionVectorInt32::AllocateBuffer(max_slots, pool, &buffer); - ARROW_RETURN_NOT_OK(status); - + ARROW_RETURN_NOT_OK(SelectionVectorInt32::AllocateBuffer(max_slots, pool, &buffer)); *selection_vector = std::make_shared(max_slots, buffer); + return Status::OK(); } Status SelectionVector::MakeInt64(int64_t max_slots, std::shared_ptr buffer, std::shared_ptr* selection_vector) { - auto status = SelectionVectorInt64::ValidateBuffer(max_slots, buffer); - ARROW_RETURN_NOT_OK(status); - + ARROW_RETURN_NOT_OK(SelectionVectorInt64::ValidateBuffer(max_slots, buffer)); *selection_vector = std::make_shared(max_slots, buffer); + return Status::OK(); } Status SelectionVector::MakeInt64(int64_t max_slots, arrow::MemoryPool* pool, std::shared_ptr* selection_vector) { std::shared_ptr buffer; - auto status = SelectionVectorInt64::AllocateBuffer(max_slots, pool, &buffer); - ARROW_RETURN_NOT_OK(status); - + ARROW_RETURN_NOT_OK(SelectionVectorInt64::AllocateBuffer(max_slots, pool, &buffer)); *selection_vector = std::make_shared(max_slots, buffer); + return Status::OK(); } @@ -142,8 +129,7 @@ template Status SelectionVectorImpl::AllocateBuffer( int64_t max_slots, arrow::MemoryPool* pool, std::shared_ptr* buffer) { auto buffer_len = max_slots * sizeof(C_TYPE); - auto astatus = arrow::AllocateBuffer(pool, buffer_len, buffer); - ARROW_RETURN_NOT_OK(astatus); + ARROW_RETURN_NOT_OK(arrow::AllocateBuffer(pool, buffer_len, buffer)); return Status::OK(); } @@ -151,19 +137,13 @@ Status SelectionVectorImpl::AllocateBuffer( template Status SelectionVectorImpl::ValidateBuffer( int64_t max_slots, std::shared_ptr buffer) { - // verify buffer is mutable - if (!buffer->is_mutable()) { - return Status::Invalid("buffer for selection vector must be mutable"); - } + ARROW_RETURN_IF(!buffer->is_mutable(), + Status::Invalid("buffer for selection vector must be mutable")); + + const int64_t min_len = max_slots * sizeof(C_TYPE); + ARROW_RETURN_IF(buffer->size() < min_len, + Status::Invalid("Buffer for selection vector is too small")); - // verify size of buffer. 
- int64_t min_len = max_slots * sizeof(C_TYPE); - if (buffer->size() < min_len) { - std::stringstream ss; - ss << "buffer for selection_data has size " << buffer->size() - << ", must have minimum size " << min_len; - return Status::Invalid(ss.str()); - } return Status::OK(); } diff --git a/cpp/src/gandiva/tests/projector_build_validation_test.cc b/cpp/src/gandiva/tests/projector_build_validation_test.cc index ddcb729b3bfee..18f02957fd479 100644 --- a/cpp/src/gandiva/tests/projector_build_validation_test.cc +++ b/cpp/src/gandiva/tests/projector_build_validation_test.cc @@ -191,8 +191,6 @@ TEST_F(TestProjector, TestIfNotMatchingReturnType) { std::shared_ptr projector; Status status = Projector::Make(schema, {expr}, &projector); EXPECT_TRUE(status.IsExpressionValidationError()); - std::string expected_error = "Return type of if bool and then int32 not matching."; - EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); } TEST_F(TestProjector, TestElseNotMatchingReturnType) { @@ -218,8 +216,6 @@ TEST_F(TestProjector, TestElseNotMatchingReturnType) { std::shared_ptr projector; Status status = Projector::Make(schema, {expr}, &projector); EXPECT_TRUE(status.IsExpressionValidationError()); - std::string expected_error = "Return type of if int32 and else bool not matching."; - EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); } TEST_F(TestProjector, TestElseNotSupportedType) { @@ -245,8 +241,7 @@ TEST_F(TestProjector, TestElseNotSupportedType) { std::shared_ptr projector; Status status = Projector::Make(schema, {expr}, &projector); EXPECT_TRUE(status.IsExpressionValidationError()); - std::string expected_error = "Field c has unsupported data type list"; - EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); + EXPECT_EQ(status.code(), StatusCode::ExpressionValidationError); } TEST_F(TestProjector, TestAndMinChildren) { @@ -266,8 +261,6 @@ TEST_F(TestProjector, TestAndMinChildren) { std::shared_ptr projector; Status status = Projector::Make(schema, {expr}, &projector); EXPECT_TRUE(status.IsExpressionValidationError()); - std::string expected_error = "Boolean expression has 1 children, expected atleast two"; - EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); } TEST_F(TestProjector, TestAndBooleanArgType) { @@ -289,10 +282,6 @@ TEST_F(TestProjector, TestAndBooleanArgType) { std::shared_ptr projector; Status status = Projector::Make(schema, {expr}, &projector); EXPECT_TRUE(status.IsExpressionValidationError()); - std::string expected_error = - "Boolean expression has a child with return type int32, expected return type " - "boolean"; - EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); } } // namespace gandiva diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 7830b6abc75d1..b5905fddff489 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -690,10 +690,8 @@ Status FileReader::GetRecordBatchReader(const std::vector& row_group_indice int max_num = num_row_groups(); for (auto row_group_index : row_group_indices) { if (row_group_index < 0 || row_group_index >= max_num) { - std::ostringstream ss; - ss << "Some index in row_group_indices is " << row_group_index - << ", which is either < 0 or >= num_row_groups(" << max_num << ")"; - return Status::Invalid(ss.str()); + return Status::Invalid("Some index in row_group_indices is ", row_group_index, + ", which is either < 0 or >= num_row_groups(", max_num, ")"); } } @@ -1495,9 +1493,8 @@ 
Status PrimitiveImpl::NextBatch(int64_t records_to_read, TRANSFER_CASE(TIME32, ::arrow::Time32Type, Int32Type) TRANSFER_CASE(TIME64, ::arrow::Time64Type, Int64Type) default: - std::stringstream ss; - ss << "No support for reading columns of type " << field_->type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("No support for reading columns of type ", + field_->type()->ToString()); } DCHECK_NE(result.kind(), Datum::NONE); diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index af9fbc91a5042..fed0e59dfa330 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -80,10 +80,9 @@ static Status FromFLBA(const PrimitiveNode& node, std::shared_ptr* ou *out = MakeDecimal128Type(node); break; default: - std::stringstream ss; - ss << "Unhandled logical type " << LogicalTypeToString(node.logical_type()) - << " for fixed-length binary array"; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unhandled logical type ", + LogicalTypeToString(node.logical_type()), + " for fixed-length binary array"); } return Status::OK(); @@ -122,10 +121,9 @@ static Status FromInt32(const PrimitiveNode& node, std::shared_ptr* o *out = MakeDecimal128Type(node); break; default: - std::stringstream ss; - ss << "Unhandled logical type " << LogicalTypeToString(node.logical_type()) - << " for INT32"; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unhandled logical type ", + LogicalTypeToString(node.logical_type()), + " for INT32"); } return Status::OK(); } @@ -154,10 +152,9 @@ static Status FromInt64(const PrimitiveNode& node, std::shared_ptr* o *out = ::arrow::time64(::arrow::TimeUnit::MICRO); break; default: - std::stringstream ss; - ss << "Unhandled logical type " << LogicalTypeToString(node.logical_type()) - << " for INT64"; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unhandled logical type ", + LogicalTypeToString(node.logical_type()), + " for INT64"); } return Status::OK(); } @@ -613,10 +610,9 @@ Status FieldToNode(const std::shared_ptr& field, } default: { // TODO: DENSE_UNION, SPARE_UNION, JSON_SCALAR, DECIMAL_TEXT, VARCHAR - std::stringstream ss; - ss << "Unhandled type for Arrow to Parquet schema conversion: "; - ss << field->type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented( + "Unhandled type for Arrow to Parquet schema conversion: ", + field->type()->ToString()); } } PARQUET_CATCH_NOT_OK(*out = diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index bce9f37026c97..a8153cac1ebea 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -676,10 +676,8 @@ Status ArrowColumnWriter::WriteTimestampsCoerce(const bool truncated_timestamps_ auto DivideBy = [&](const int64_t factor) { for (int64_t i = 0; i < array.length(); i++) { if (!truncated_timestamps_allowed && !data.IsNull(i) && (values[i] % factor != 0)) { - std::stringstream ss; - ss << "Casting from " << type.ToString() << " to " << target_type->ToString() - << " would lose data: " << values[i]; - return Status::Invalid(ss.str()); + return Status::Invalid("Casting from ", type.ToString(), " to ", + target_type->ToString(), " would lose data: ", values[i]); } buffer[i] = values[i] / factor; } @@ -950,9 +948,8 @@ Status ArrowColumnWriter::Write(const Array& data) { default: break; } - std::stringstream ss; - ss << "Data type not supported as list value: " << 
values_array->type()->ToString();
-    return Status::NotImplemented(ss.str());
+    return Status::NotImplemented("Data type not supported as list value: ",
+                                  values_array->type()->ToString());
 }
 
 }  // namespace
 
diff --git a/cpp/src/plasma/io.cc b/cpp/src/plasma/io.cc
index d63ceb6da24da..d2794e89d3ac0 100644
--- a/cpp/src/plasma/io.cc
+++ b/cpp/src/plasma/io.cc
@@ -49,7 +49,7 @@ Status WriteBytes(int fd, uint8_t* cursor, size_t length) {
       if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) {
         continue;
       }
-      return Status::IOError(std::string(strerror(errno)));
+      return Status::IOError(strerror(errno));
     } else if (nbytes == 0) {
       return Status::IOError("Encountered unexpected EOF");
     }
@@ -80,7 +80,7 @@ Status ReadBytes(int fd, uint8_t* cursor, size_t length) {
       if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) {
         continue;
       }
-      return Status::IOError(std::string(strerror(errno)));
+      return Status::IOError(strerror(errno));
     } else if (0 == nbytes) {
       return Status::IOError("Encountered unexpected EOF");
     }
@@ -171,12 +171,12 @@ Status ConnectIpcSocketRetry(const std::string& pathname, int num_retries,
     *fd = ConnectIpcSock(pathname);
     --num_retries;
   }
+
   // If we could not connect to the socket, exit.
   if (*fd == -1) {
-    std::stringstream ss;
-    ss << "Could not connect to socket " << pathname;
-    return Status::IOError(ss.str());
+    return Status::IOError("Could not connect to socket ", pathname);
   }
+
   return Status::OK();
 }

From 2ab97bc3f9885fa95e8ad51aa3b119a5435440c2 Mon Sep 17 00:00:00 2001
From: Kousuke Saruta
Date: Thu, 20 Dec 2018 11:38:39 -0600
Subject: [PATCH 090/328] ARROW-4089: [Plasma] The tutorial is wrong regarding the parameter type of PlasmaClient.Create

Plasma's tutorial says that the data type of one parameter is the address of a `uint8_t*`, but it's actually the address of a `std::shared_ptr<Buffer>`.

```
uint8_t* data; <------------------------------- wrong data type here.

// Create a Plasma object by specifying its ID and size.
ARROW_CHECK_OK(client.Create(object_id, data_size, NULL, 0, &data));
```

Author: Kousuke Saruta

Closes #3235 from sarutak/fix-plasma-tutorial and squashes the following commits:

a780a27cf Fix the data type of the pointer in the plasma's tutorial
---
 cpp/apidoc/tutorials/plasma.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/apidoc/tutorials/plasma.md b/cpp/apidoc/tutorials/plasma.md
index b9046d50bc922..40c5a10603e71 100644
--- a/cpp/apidoc/tutorials/plasma.md
+++ b/cpp/apidoc/tutorials/plasma.md
@@ -182,7 +182,7 @@ was written by the `Create` command.
 int64_t data_size = 100;
 // The address of the buffer allocated by the Plasma store will be written at
 // this address.
-uint8_t* data;
+std::shared_ptr<Buffer> data;
 // Create a Plasma object by specifying its ID and size.
 ARROW_CHECK_OK(client.Create(object_id, data_size, NULL, 0, &data));
 ```
@@ -194,7 +194,7 @@ metadata (as raw bytes) and the fourth argument is the size of the metadata.
 // Create a Plasma object with metadata.
 int64_t data_size = 100;
 std::string metadata = "{'author': 'john'}";
-uint8_t* data;
+std::shared_ptr<Buffer> data;
 client.Create(object_id, data_size, (uint8_t*) metadata.data(), metadata.size(), &data);
 ```

From 398466e629bad593e72def8c892b030958a58a1a Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Thu, 20 Dec 2018 14:04:18 -0600
Subject: [PATCH 091/328] ARROW-4079: [C++] Add machine benchmark

Right now there is a single memory latency benchmark.
Its output looks like this, showing the different cache levels up to main memory (this is on a CPU with 16 MB L3 cache): ``` ------------------------------------------------------------------ Benchmark Time CPU Iterations ------------------------------------------------------------------ BM_memory_latency/2048 2 ns 2 ns 406878405 548.706M items/s BM_memory_latency/4096 2 ns 2 ns 395414303 557.74M items/s BM_memory_latency/8192 2 ns 2 ns 394141916 560.264M items/s BM_memory_latency/16384 2 ns 2 ns 401410292 535.202M items/s BM_memory_latency/32768 2 ns 2 ns 381828811 525.377M items/s BM_memory_latency/65536 4 ns 4 ns 189027575 262.929M items/s BM_memory_latency/131072 5 ns 5 ns 150798287 209.01M items/s BM_memory_latency/262144 5 ns 5 ns 129287045 185.606M items/s BM_memory_latency/524288 7 ns 7 ns 96543517 132.663M items/s BM_memory_latency/1048576 11 ns 11 ns 66380535 89.0397M items/s BM_memory_latency/2097152 12 ns 12 ns 55003164 76.6384M items/s BM_memory_latency/4194304 13 ns 13 ns 51559443 70.9488M items/s BM_memory_latency/8388608 28 ns 28 ns 25813875 33.6881M items/s BM_memory_latency/16777216 66 ns 66 ns 10463216 14.4577M items/s BM_memory_latency/33554432 90 ns 90 ns 7743594 10.5434M items/s ``` Author: Antoine Pitrou Closes #3225 from pitrou/ARROW-4079-machine-benchmark and squashes the following commits: 55f6de696 ARROW-4079: Add machine benchmark --- cpp/src/arrow/util/CMakeLists.txt | 1 + cpp/src/arrow/util/machine-benchmark.cc | 70 +++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 cpp/src/arrow/util/machine-benchmark.cc diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index b13b2f367b022..ee64a32915f09 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -70,5 +70,6 @@ ADD_ARROW_BENCHMARK(decimal-benchmark) ADD_ARROW_BENCHMARK(hashing-benchmark) ADD_ARROW_BENCHMARK(int-util-benchmark) ADD_ARROW_BENCHMARK(lazy-benchmark) +ADD_ARROW_BENCHMARK(machine-benchmark) ADD_ARROW_BENCHMARK(number-parsing-benchmark) ADD_ARROW_BENCHMARK(utf8-util-benchmark) diff --git a/cpp/src/arrow/util/machine-benchmark.cc b/cpp/src/arrow/util/machine-benchmark.cc new file mode 100644 index 0000000000000..ad3f413e7f0fd --- /dev/null +++ b/cpp/src/arrow/util/machine-benchmark.cc @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Non-Arrow system benchmarks, provided for convenience. + +#include +#include +#include +#include +#include +#include + +#include "benchmark/benchmark.h" + +namespace arrow { + +// Generate a vector of indices such as following the indices describes +// a path over the whole vector. The path is randomized to avoid triggering +// automatic prefetching in the CPU. 
+std::vector RandomPath(int32_t size) { + std::default_random_engine gen(42); + std::vector indices(size); + + for (int32_t i = 0; i < size; ++i) { + indices[i] = i; + } + std::shuffle(indices.begin(), indices.end(), gen); + std::vector path(size, -999999); + int32_t prev; + prev = indices[size - 1]; + for (int32_t i = 0; i < size; ++i) { + int32_t next = indices[i]; + path[prev] = next; + prev = next; + } + return path; +} + +// Cache / main memory latency, depending on the working set size +static void BM_memory_latency(benchmark::State& state) { + const auto niters = static_cast(state.range(0)); + const std::vector path = RandomPath(niters / 4); + + int32_t total = 0; + int32_t index = 0; + for (auto _ : state) { + total += index; + index = path[index]; + } + benchmark::DoNotOptimize(total); + state.SetItemsProcessed(state.iterations()); +} + +BENCHMARK(BM_memory_latency)->RangeMultiplier(2)->Range(2 << 10, 2 << 24); + +} // namespace arrow From ff293196baa53a2608178b6d3768cb93f964f9f4 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 20 Dec 2018 16:18:29 -0600 Subject: [PATCH 092/328] ARROW-4087: [C++] Make CSV spellings of null values configurable Interestingly, there is no noticeable slowdown when reading CSV files (even though the trie is significantly slower than the hard-coded function in microbenchmarks). Author: Antoine Pitrou Closes #3236 from pitrou/ARROW-4087-csv-configure-nulls and squashes the following commits: 9a7596ddc ARROW-4087: Make CSV spellings of null values configurable --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/csv/converter-test.cc | 51 ++++- cpp/src/arrow/csv/converter.cc | 121 ++-------- cpp/src/arrow/csv/converter.h | 2 +- cpp/src/arrow/csv/options.cc | 9 +- cpp/src/arrow/csv/options.h | 3 + cpp/src/arrow/test-util.h | 12 +- cpp/src/arrow/util/CMakeLists.txt | 2 + cpp/src/arrow/util/hashing-benchmark.cc | 2 + cpp/src/arrow/util/trie-benchmark.cc | 221 ++++++++++++++++++ cpp/src/arrow/util/trie-test.cc | 283 ++++++++++++++++++++++++ cpp/src/arrow/util/trie.cc | 209 +++++++++++++++++ cpp/src/arrow/util/trie.h | 245 ++++++++++++++++++++ python/pyarrow/_csv.pyx | 18 +- python/pyarrow/includes/libarrow.pxd | 1 + python/pyarrow/tests/test_csv.py | 38 +++- 16 files changed, 1103 insertions(+), 115 deletions(-) create mode 100644 cpp/src/arrow/util/trie-benchmark.cc create mode 100644 cpp/src/arrow/util/trie-test.cc create mode 100644 cpp/src/arrow/util/trie.cc create mode 100644 cpp/src/arrow/util/trie.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 8dd2ac082db0a..f2a811247287b 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -110,6 +110,7 @@ set(ARROW_SRCS util/key_value_metadata.cc util/task-group.cc util/thread-pool.cc + util/trie.cc util/utf8.cc ) diff --git a/cpp/src/arrow/csv/converter-test.cc b/cpp/src/arrow/csv/converter-test.cc index 2534541d3154a..ea12c0b66a94b 100644 --- a/cpp/src/arrow/csv/converter-test.cc +++ b/cpp/src/arrow/csv/converter-test.cc @@ -176,13 +176,30 @@ TEST(IntegerConversion, Basics) { } TEST(IntegerConversion, Nulls) { - AssertConversion(int8(), {"12,34\n", ",-128\n"}, - {{12, 0}, {34, -128}}, - {{true, false}, {true, true}}); + AssertConversion(int8(), {"12,N/A\n", ",-128\n"}, + {{12, 0}, {0, -128}}, + {{true, false}, {false, true}}); AssertConversionAllNulls(int8()); } +TEST(IntegerConversion, CustomNulls) { + auto options = ConvertOptions::Defaults(); + options.null_values = {"xxx", "zzz"}; + + AssertConversion(int8(), {"12,xxx\n", "zzz,-128\n"}, + 
{{12, 0}, {0, -128}}, {{true, false}, {false, true}}, + options); + + AssertConversionError(int8(), {",xxx,N/A\n"}, {0, 2}, options); + + // Duplicate nulls allowed + options.null_values = {"xxx", "zzz", "xxx"}; + AssertConversion(int8(), {"12,xxx\n", "zzz,-128\n"}, + {{12, 0}, {0, -128}}, {{true, false}, {false, true}}, + options); +} + TEST(IntegerConversion, Whitespace) { AssertConversion(int32(), {" 12,34 \n", " 56 ,78\n"}, {{12, 56}, {34, 78}}); @@ -203,6 +220,15 @@ TEST(FloatingPointConversion, Nulls) { AssertConversionAllNulls(float64()); } +TEST(FloatingPointConversion, CustomNulls) { + auto options = ConvertOptions::Defaults(); + options.null_values = {"xxx", "zzz"}; + + AssertConversion(float32(), {"1.5,xxx\n", "zzz,-1e10\n"}, + {{1.5, 0.}, {0., -1e10f}}, + {{true, false}, {false, true}}, options); +} + TEST(FloatingPointConversion, Whitespace) { AssertConversion(float64(), {" 12,34.5\n", " 0 ,-1e100 \n"}, {{12., 0.}, {34.5, -1e100}}); @@ -220,6 +246,15 @@ TEST(BooleanConversion, Nulls) { {{true, true}, {false, true}}); } +TEST(BooleanConversion, CustomNulls) { + auto options = ConvertOptions::Defaults(); + options.null_values = {"xxx", "zzz"}; + + AssertConversion(boolean(), {"true,xxx\n", "zzz,0\n"}, + {{true, false}, {false, false}}, + {{true, false}, {false, true}}, options); +} + TEST(TimestampConversion, Basics) { auto type = timestamp(TimeUnit::SECOND); @@ -243,6 +278,16 @@ TEST(TimestampConversion, Nulls) { {{true}, {false}, {false}}); } +TEST(TimestampConversion, CustomNulls) { + auto options = ConvertOptions::Defaults(); + options.null_values = {"xxx", "zzz"}; + + auto type = timestamp(TimeUnit::MILLI); + AssertConversion(type, {"1970-01-01 00:01:00,xxx,zzz\n"}, + {{60000}, {0}, {0}}, + {{true}, {false}, {false}}, options); +} + TEST(DecimalConversion, NotImplemented) { std::shared_ptr converter; ASSERT_RAISES(NotImplemented, diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc index 1018f8553860e..22be7d6e58f3b 100644 --- a/cpp/src/arrow/csv/converter.cc +++ b/cpp/src/arrow/csv/converter.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include "arrow/builder.h" #include "arrow/csv/parser.h" @@ -29,12 +30,15 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/parsing.h" // IWYU pragma: keep +#include "arrow/util/trie.h" #include "arrow/util/utf8.h" namespace arrow { namespace csv { using internal::StringConverter; +using internal::Trie; +using internal::TrieBuilder; namespace { @@ -57,115 +61,28 @@ class ConcreteConverter : public Converter { using Converter::Converter; protected: - Status Initialize() override { return Status::OK(); } + Status Initialize() override; inline bool IsNull(const uint8_t* data, uint32_t size, bool quoted); + + Trie null_trie_; }; -// Recognize various spellings of null values. The list of possible spellings -// is taken from Pandas read_csv() documentation. 
+Status ConcreteConverter::Initialize() { + // TODO no need to build a separate Trie for each Converter instance + TrieBuilder builder; + for (const auto& s : options_.null_values) { + RETURN_NOT_OK(builder.Append(s, true /* allow_duplicates */)); + } + null_trie_ = builder.Finish(); + return Status::OK(); +} + bool ConcreteConverter::IsNull(const uint8_t* data, uint32_t size, bool quoted) { if (quoted) { return false; } - if (size == 0) { - return true; - } - // No 1-character null value exists - if (size == 1) { - return false; - } - - // XXX if the CSV parser guaranteed enough excess bytes at the end of the - // parsed area, we wouldn't need to always check size before comparing characters. - - auto chars = reinterpret_cast(data); - auto first = chars[0]; - auto second = chars[1]; - switch (first) { - case 'N': { - // "NA", "N/A", "NaN", "NULL" - if (size == 2) { - return second == 'A'; - } - auto third = chars[2]; - if (size == 3) { - return (second == '/' && third == 'A') || (second == 'a' && third == 'N'); - } - if (size == 4) { - return (second == 'U' && third == 'L' && chars[3] == 'L'); - } - return false; - } - case 'n': { - // "n/a", "nan", "null" - if (size == 2) { - return false; - } - auto third = chars[2]; - if (size == 3) { - return (second == '/' && third == 'a') || (second == 'a' && third == 'n'); - } - if (size == 4) { - return (second == 'u' && third == 'l' && chars[3] == 'l'); - } - return false; - } - case '1': { - // '1.#IND', '1.#QNAN' - if (size == 6) { - // '#' is the most unlikely char here, check it first - return (chars[2] == '#' && chars[1] == '.' && chars[3] == 'I' && - chars[4] == 'N' && chars[5] == 'D'); - } - if (size == 7) { - return (chars[2] == '#' && chars[1] == '.' && chars[3] == 'Q' && - chars[4] == 'N' && chars[5] == 'A' && chars[6] == 'N'); - } - return false; - } - case '-': { - switch (second) { - case 'N': - // "-NaN" - return (size == 4 && chars[2] == 'a' && chars[3] == 'N'); - case 'n': - // "-nan" - return (size == 4 && chars[2] == 'a' && chars[3] == 'n'); - case '1': - // "-1.#IND", "-1.#QNAN" - if (size == 7) { - return (chars[3] == '#' && chars[2] == '.' && chars[4] == 'I' && - chars[5] == 'N' && chars[6] == 'D'); - } - if (size == 8) { - return (chars[3] == '#' && chars[2] == '.' 
&& chars[4] == 'Q' && - chars[5] == 'N' && chars[6] == 'A' && chars[7] == 'N'); - } - return false; - default: - return false; - } - } - case '#': { - // "#N/A", "#N/A N/A", "#NA" - if (size < 3 || chars[1] != 'N') { - return false; - } - auto third = chars[2]; - if (size == 3) { - return third == 'A'; - } - if (size == 4) { - return third == '/' && chars[3] == 'A'; - } - if (size == 8) { - return std::memcmp(data + 2, "/A N/A", 5) == 0; - } - return false; - } - default: - return false; - } + return null_trie_.Find(util::string_view(reinterpret_cast(data), size)) >= + 0; } ///////////////////////////////////////////////////////////////////////// diff --git a/cpp/src/arrow/csv/converter.h b/cpp/src/arrow/csv/converter.h index 38ade1d21a846..d64fe695d0a26 100644 --- a/cpp/src/arrow/csv/converter.h +++ b/cpp/src/arrow/csv/converter.h @@ -57,7 +57,7 @@ class ARROW_EXPORT Converter { virtual Status Initialize() = 0; - ConvertOptions options_; + const ConvertOptions options_; MemoryPool* pool_; std::shared_ptr type_; }; diff --git a/cpp/src/arrow/csv/options.cc b/cpp/src/arrow/csv/options.cc index fccf0b67db98c..01e687b8342a3 100644 --- a/cpp/src/arrow/csv/options.cc +++ b/cpp/src/arrow/csv/options.cc @@ -22,7 +22,14 @@ namespace csv { ParseOptions ParseOptions::Defaults() { return ParseOptions(); } -ConvertOptions ConvertOptions::Defaults() { return ConvertOptions(); } +ConvertOptions ConvertOptions::Defaults() { + auto options = ConvertOptions(); + // The default list of possible null spellings is taken from Pandas' read_csv(). + options.null_values = {"", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", + "-NaN", "-nan", "1.#IND", "1.#QNAN", "N/A", "NA", + "NULL", "NaN", "n/a", "nan", "null"}; + return options; +} ReadOptions ReadOptions::Defaults() { return ReadOptions(); } diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h index 10232d45e8df4..2b4653ccdce81 100644 --- a/cpp/src/arrow/csv/options.h +++ b/cpp/src/arrow/csv/options.h @@ -22,6 +22,7 @@ #include #include #include +#include #include "arrow/util/visibility.h" @@ -66,6 +67,8 @@ struct ARROW_EXPORT ConvertOptions { bool check_utf8 = true; // Optional per-column types (disabling type inference on those columns) std::unordered_map> column_types; + // Recognized spellings for null values + std::vector null_values; static ConvertOptions Defaults(); }; diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 7fe7685f5a39f..33321633090af 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -69,12 +69,12 @@ ASSERT_EQ((message), s.ToString()); \ } while (false) -#define ASSERT_OK(expr) \ - do { \ - ::arrow::Status s = (expr); \ - if (!s.ok()) { \ - FAIL() << "'" STRINGIFY(expr) "' failed with " << s.ToString(); \ - } \ +#define ASSERT_OK(expr) \ + do { \ + ::arrow::Status _s = (expr); \ + if (!_s.ok()) { \ + FAIL() << "'" STRINGIFY(expr) "' failed with " << _s.ToString(); \ + } \ } while (false) #define ASSERT_OK_NO_THROW(expr) ASSERT_NO_THROW(ASSERT_OK(expr)) diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index ee64a32915f09..b02dc113c5459 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -62,6 +62,7 @@ ADD_ARROW_TEST(rle-encoding-test) ADD_ARROW_TEST(stl-util-test) ADD_ARROW_TEST(task-group-test) ADD_ARROW_TEST(thread-pool-test) +ADD_ARROW_TEST(trie-test) ADD_ARROW_TEST(utf8-util-test) ADD_ARROW_BENCHMARK(bit-util-benchmark) @@ -72,4 +73,5 @@ ADD_ARROW_BENCHMARK(int-util-benchmark) 
ADD_ARROW_BENCHMARK(lazy-benchmark) ADD_ARROW_BENCHMARK(machine-benchmark) ADD_ARROW_BENCHMARK(number-parsing-benchmark) +ADD_ARROW_BENCHMARK(trie-benchmark) ADD_ARROW_BENCHMARK(utf8-util-benchmark) diff --git a/cpp/src/arrow/util/hashing-benchmark.cc b/cpp/src/arrow/util/hashing-benchmark.cc index 7d91f0f536ac1..09d00afd5fea4 100644 --- a/cpp/src/arrow/util/hashing-benchmark.cc +++ b/cpp/src/arrow/util/hashing-benchmark.cc @@ -74,6 +74,7 @@ static void BM_HashIntegers(benchmark::State& state) { // NOLINT non-const refe benchmark::DoNotOptimize(total); } state.SetBytesProcessed(2 * state.iterations() * values.size() * sizeof(int64_t)); + state.SetItemsProcessed(2 * state.iterations() * values.size()); } static void BenchmarkStringHashing(benchmark::State& state, // NOLINT non-const reference @@ -92,6 +93,7 @@ static void BenchmarkStringHashing(benchmark::State& state, // NOLINT non-const benchmark::DoNotOptimize(total); } state.SetBytesProcessed(2 * state.iterations() * total_size); + state.SetItemsProcessed(2 * state.iterations() * values.size()); } static void BM_HashSmallStrings(benchmark::State& state) { // NOLINT non-const reference diff --git a/cpp/src/arrow/util/trie-benchmark.cc b/cpp/src/arrow/util/trie-benchmark.cc new file mode 100644 index 0000000000000..acc2892689ff4 --- /dev/null +++ b/cpp/src/arrow/util/trie-benchmark.cc @@ -0,0 +1,221 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "benchmark/benchmark.h" + +#include +#include +#include + +#include "arrow/status.h" +#include "arrow/test-util.h" +#include "arrow/util/trie.h" + +namespace arrow { +namespace internal { + +static inline bool InlinedNullLookup(util::string_view s) { + // An inlined version of trie lookup for a specific set of strings + // (see AllNulls()) + auto size = s.length(); + auto data = s.data(); + if (size == 0) { + return false; + } + if (size == 1) { + return false; + } + + auto chars = reinterpret_cast(data); + auto first = chars[0]; + auto second = chars[1]; + switch (first) { + case 'N': { + // "NA", "N/A", "NaN", "NULL" + if (size == 2) { + return second == 'A'; + } + auto third = chars[2]; + if (size == 3) { + return (second == '/' && third == 'A') || (second == 'a' && third == 'N'); + } + if (size == 4) { + return (second == 'U' && third == 'L' && chars[3] == 'L'); + } + return false; + } + case 'n': { + // "n/a", "nan", "null" + if (size == 2) { + return false; + } + auto third = chars[2]; + if (size == 3) { + return (second == '/' && third == 'a') || (second == 'a' && third == 'n'); + } + if (size == 4) { + return (second == 'u' && third == 'l' && chars[3] == 'l'); + } + return false; + } + case '1': { + // '1.#IND', '1.#QNAN' + if (size == 6) { + // '#' is the most unlikely char here, check it first + return (chars[2] == '#' && chars[1] == '.' && chars[3] == 'I' && + chars[4] == 'N' && chars[5] == 'D'); + } + if (size == 7) { + return (chars[2] == '#' && chars[1] == '.' && chars[3] == 'Q' && + chars[4] == 'N' && chars[5] == 'A' && chars[6] == 'N'); + } + return false; + } + case '-': { + switch (second) { + case 'N': + // "-NaN" + return (size == 4 && chars[2] == 'a' && chars[3] == 'N'); + case 'n': + // "-nan" + return (size == 4 && chars[2] == 'a' && chars[3] == 'n'); + case '1': + // "-1.#IND", "-1.#QNAN" + if (size == 7) { + return (chars[3] == '#' && chars[2] == '.' && chars[4] == 'I' && + chars[5] == 'N' && chars[6] == 'D'); + } + if (size == 8) { + return (chars[3] == '#' && chars[2] == '.' 
&& chars[4] == 'Q' && + chars[5] == 'N' && chars[6] == 'A' && chars[7] == 'N'); + } + return false; + default: + return false; + } + } + case '#': { + // "#N/A", "#N/A N/A", "#NA" + if (size < 3 || chars[1] != 'N') { + return false; + } + auto third = chars[2]; + if (size == 3) { + return third == 'A'; + } + if (size == 4) { + return third == '/' && chars[3] == 'A'; + } + if (size == 8) { + return std::memcmp(data + 2, "/A N/A", 5) == 0; + } + return false; + } + default: + return false; + } +} + +std::vector AllNulls() { + return {"#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", + "1.#QNAN", "N/A", "NA", "NULL", "NaN", "n/a", "nan", "null"}; +} + +Trie MakeNullsTrie() { + auto nulls = AllNulls(); + + TrieBuilder builder; + for (const auto& str : AllNulls()) { + ABORT_NOT_OK(builder.Append(str)); + } + return builder.Finish(); +} + +std::vector Expand(const std::vector& base, size_t n) { + std::vector result; + result.reserve(n); + + while (true) { + for (const auto& v : base) { + result.push_back(v); + if (result.size() == n) { + return result; + } + } + } +} + +static void BenchmarkTrieLookups(benchmark::State& state, // NOLINT non-const reference + const std::vector& strings) { + Trie trie = MakeNullsTrie(); + int32_t total = 0; + + auto lookups = Expand(strings, 100); + + for (auto _ : state) { + for (const auto& s : lookups) { + total += trie.Find(s); + } + } + benchmark::DoNotOptimize(total); + state.SetItemsProcessed(state.iterations() * lookups.size()); +} + +static void BenchmarkInlinedTrieLookups( + benchmark::State& state, // NOLINT non-const reference + const std::vector& strings) { + int32_t total = 0; + + auto lookups = Expand(strings, 100); + + for (auto _ : state) { + for (const auto& s : lookups) { + total += InlinedNullLookup(s); + } + } + benchmark::DoNotOptimize(total); + state.SetItemsProcessed(state.iterations() * lookups.size()); +} + +static void BM_TrieLookupFound(benchmark::State& state) { // NOLINT non-const reference + BenchmarkTrieLookups(state, {"N/A", "null", "-1.#IND", "N/A"}); +} + +static void BM_TrieLookupNotFound( + benchmark::State& state) { // NOLINT non-const reference + BenchmarkTrieLookups(state, {"None", "1.0", "", "abc"}); +} + +static void BM_InlinedTrieLookupFound( + benchmark::State& state) { // NOLINT non-const reference + BenchmarkInlinedTrieLookups(state, {"N/A", "null", "-1.#IND", "N/A"}); +} + +static void BM_InlinedTrieLookupNotFound( + benchmark::State& state) { // NOLINT non-const reference + BenchmarkInlinedTrieLookups(state, {"None", "1.0", "", "abc"}); +} + +static const int kRepetitions = 2; + +BENCHMARK(BM_TrieLookupFound)->Repetitions(kRepetitions); +BENCHMARK(BM_TrieLookupNotFound)->Repetitions(kRepetitions); +BENCHMARK(BM_InlinedTrieLookupFound)->Repetitions(kRepetitions); +BENCHMARK(BM_InlinedTrieLookupNotFound)->Repetitions(kRepetitions); + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/util/trie-test.cc b/cpp/src/arrow/util/trie-test.cc new file mode 100644 index 0000000000000..33eefa9d9335f --- /dev/null +++ b/cpp/src/arrow/util/trie-test.cc @@ -0,0 +1,283 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/test-util.h" +#include "arrow/util/trie.h" + +namespace arrow { +namespace internal { + +TEST(SmallString, Basics) { + using SS = SmallString<5>; + { + SS s; + ASSERT_EQ(s.length(), 0); + ASSERT_EQ(util::string_view(s), util::string_view("")); + ASSERT_EQ(s, ""); + ASSERT_NE(s, "x"); + ASSERT_EQ(sizeof(s), 6); + } + { + SS s("abc"); + ASSERT_EQ(s.length(), 3); + ASSERT_EQ(util::string_view(s), util::string_view("abc")); + ASSERT_EQ(std::memcmp(s.data(), "abc", 3), 0); + ASSERT_EQ(s, "abc"); + ASSERT_NE(s, "ab"); + } +} + +TEST(SmallString, Assign) { + using SS = SmallString<5>; + auto s = SS(); + + s = util::string_view("abc"); + ASSERT_EQ(s.length(), 3); + ASSERT_EQ(util::string_view(s), util::string_view("abc")); + ASSERT_EQ(std::memcmp(s.data(), "abc", 3), 0); + ASSERT_EQ(s, "abc"); + ASSERT_NE(s, "ab"); + + s = std::string("ghijk"); + ASSERT_EQ(s.length(), 5); + ASSERT_EQ(util::string_view(s), util::string_view("ghijk")); + ASSERT_EQ(std::memcmp(s.data(), "ghijk", 5), 0); + ASSERT_EQ(s, "ghijk"); + ASSERT_NE(s, ""); + + s = SS("xy"); + ASSERT_EQ(s.length(), 2); + ASSERT_EQ(util::string_view(s), util::string_view("xy")); + ASSERT_EQ(std::memcmp(s.data(), "xy", 2), 0); + ASSERT_EQ(s, "xy"); + ASSERT_NE(s, "xyz"); +} + +TEST(SmallString, Substr) { + using SS = SmallString<5>; + { + auto s = SS(); + ASSERT_EQ(s.substr(0), ""); + ASSERT_EQ(s.substr(0, 2), ""); + } + { + auto s = SS("abcd"); + ASSERT_EQ(s.substr(0), "abcd"); + ASSERT_EQ(s.substr(1), "bcd"); + ASSERT_EQ(s.substr(4), ""); + ASSERT_EQ(s.substr(0, 0), ""); + ASSERT_EQ(s.substr(0, 3), "abc"); + ASSERT_EQ(s.substr(0, 4), "abcd"); + ASSERT_EQ(s.substr(1, 0), ""); + ASSERT_EQ(s.substr(1, 2), "bc"); + ASSERT_EQ(s.substr(4, 0), ""); + ASSERT_EQ(s.substr(4, 1), ""); + } +} + +static std::vector AllNulls() { + return {"#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", + "1.#QNAN", "N/A", "NA", "NULL", "NaN", "n/a", "nan", "null"}; +} + +static void TestTrieContents(const Trie& trie, const std::vector& entries) { + std::unordered_map control; + auto n_entries = static_cast(entries.size()); + + // Build control container + for (int32_t i = 0; i < n_entries; ++i) { + auto p = control.insert({entries[i], i}); + ASSERT_TRUE(p.second); + } + + // Check all existing entries in trie + for (int32_t i = 0; i < n_entries; ++i) { + ASSERT_EQ(i, trie.Find(entries[i])) << "for string '" << entries[i] << "'"; + } + + auto CheckNotExists = [&control, &trie](const std::string& s) { + auto p = control.find(s); + if (p == control.end()) { + ASSERT_EQ(-1, trie.Find(s)) << "for string '" << s << "'"; + } + }; + + // Check potentially non-existing strings + CheckNotExists(""); + CheckNotExists("X"); + CheckNotExists("abcdefxxxxxxxxxxxxxxx"); + + // Check potentially non-existing variations of existing entries + for (const auto& e : entries) { + CheckNotExists(e + "X"); + if (e.size() > 0) { + CheckNotExists(e.substr(0, 1)); + auto prefix = e.substr(0, e.size() - 1); + CheckNotExists(prefix); + CheckNotExists(prefix 
+ "X"); + auto split_at = e.size() / 2; + CheckNotExists(e.substr(0, split_at) + 'x' + e.substr(split_at + 1)); + } + } +} + +static void TestTrieContents(const std::vector& entries) { + TrieBuilder builder; + for (const auto& s : entries) { + ASSERT_OK(builder.Append(s)); + } + const Trie trie = builder.Finish(); + ASSERT_OK(trie.Validate()); + + TestTrieContents(trie, entries); +} + +TEST(Trie, Empty) { + TrieBuilder builder; + const Trie trie = builder.Finish(); + ASSERT_OK(trie.Validate()); + + ASSERT_EQ(-1, trie.Find("")); + ASSERT_EQ(-1, trie.Find("x")); +} + +TEST(Trie, EmptyString) { + TrieBuilder builder; + ASSERT_OK(builder.Append("")); + const Trie trie = builder.Finish(); + ASSERT_OK(trie.Validate()); + + ASSERT_EQ(0, trie.Find("")); + ASSERT_EQ(-1, trie.Find("x")); +} + +TEST(Trie, Basics1) { + TestTrieContents({"abc", "de", "f"}); + TestTrieContents({"abc", "de", "f", ""}); +} + +TEST(Trie, Basics2) { + TestTrieContents({"a", "abc", "abcd", "abcdef"}); + TestTrieContents({"", "a", "abc", "abcd", "abcdef"}); +} + +TEST(Trie, Basics3) { + TestTrieContents({"abcd", "ab", "a"}); + TestTrieContents({"abcd", "ab", "a", ""}); +} + +TEST(Trie, LongStrings) { + TestTrieContents({"abcdefghijklmnopqr", "abcdefghijklmnoprq", "defghijklmnopqrst"}); + TestTrieContents({"abcdefghijklmnopqr", "abcdefghijklmnoprq", "abcde"}); +} + +TEST(Trie, NullChars) { + const std::string empty; + const std::string nul(1, '\x00'); + std::string a, b, c, d; + a = "x" + nul + "y"; + b = "x" + nul + "z"; + c = nul + "y"; + d = nul; + ASSERT_EQ(a.length(), 3); + ASSERT_EQ(d.length(), 1); + + TestTrieContents({a, b, c, d}); + TestTrieContents({a, b, c}); + TestTrieContents({a, b, c, d, ""}); + TestTrieContents({a, b, c, ""}); + TestTrieContents({d, c, b, a}); + TestTrieContents({c, b, a}); + TestTrieContents({d, c, b, a, ""}); + TestTrieContents({c, b, a, ""}); +} + +TEST(Trie, NegativeChars) { + // Test with characters >= 0x80 (to check the absence of sign issues) + TestTrieContents({"\x7f\x80\x81\xff", "\x7f\x80\x81", "\x7f\xff\x81", "\xff\x80\x81"}); +} + +TEST(Trie, CSVNulls) { TestTrieContents(AllNulls()); } + +TEST(Trie, Duplicates) { + { + TrieBuilder builder; + ASSERT_OK(builder.Append("ab")); + ASSERT_OK(builder.Append("abc")); + ASSERT_RAISES(Invalid, builder.Append("abc")); + ASSERT_OK(builder.Append("abcd")); + ASSERT_RAISES(Invalid, builder.Append("ab")); + ASSERT_OK(builder.Append("abcde")); + const Trie trie = builder.Finish(); + + TestTrieContents(trie, {"ab", "abc", "abcd", "abcde"}); + } + { + // With allow_duplicates = true + TrieBuilder builder; + ASSERT_OK(builder.Append("ab", true)); + ASSERT_OK(builder.Append("abc", true)); + ASSERT_OK(builder.Append("abc", true)); + ASSERT_OK(builder.Append("abcd", true)); + ASSERT_OK(builder.Append("ab", true)); + ASSERT_OK(builder.Append("abcde", true)); + const Trie trie = builder.Finish(); + + TestTrieContents(trie, {"ab", "abc", "abcd", "abcde"}); + } +} + +TEST(Trie, CapacityError) { + // A trie uses 16-bit indices into various internal structures and + // therefore has limited size available. 
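+  // The nested loops below append up to ~124^3 distinct short strings; with
+  // only 16-bit indices available internally, Append() is expected to return
+  // a CapacityError long before the loops complete.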
+ TrieBuilder builder; + uint8_t first, second, third; + bool had_capacity_error = false; + uint8_t s[] = "\x00\x00\x00\x00"; + + for (first = 1; first < 125; ++first) { + s[0] = first; + for (second = 1; second < 125; ++second) { + s[1] = second; + for (third = 1; third < 125; ++third) { + s[2] = third; + auto st = builder.Append(reinterpret_cast(s)); + if (st.IsCapacityError()) { + DCHECK_GE(first, 2); + had_capacity_error = true; + break; + } else { + ASSERT_OK(st); + } + } + } + } + ASSERT_TRUE(had_capacity_error) << "Should have produced CapacityError"; +} + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/util/trie.cc b/cpp/src/arrow/util/trie.cc new file mode 100644 index 0000000000000..eaa02b7c5352e --- /dev/null +++ b/cpp/src/arrow/util/trie.cc @@ -0,0 +1,209 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/util/trie.h" + +#include +#include + +#include "arrow/util/logging.h" + +namespace arrow { +namespace internal { + +Status Trie::Validate() const { + const auto n_nodes = static_cast(nodes_.size()); + if (size_ > n_nodes) { + return Status::Invalid("Number of entries larger than number of nodes"); + } + for (const auto& node : nodes_) { + if (node.found_index_ >= size_) { + return Status::Invalid("Found index >= size"); + } + if (node.child_lookup_ != -1 && + node.child_lookup_ * 256 > + static_cast(lookup_table_.size() - 256)) { + return Status::Invalid("Child lookup base doesn't point to 256 valid indices"); + } + } + for (const auto index : lookup_table_) { + if (index >= n_nodes) { + return Status::Invalid("Child lookup index out of bounds"); + } + } + return Status::OK(); +} + +void Trie::Dump(const Node* node, const std::string& indent) const { + std::cerr << "[\"" << node->substring_ << "\"]"; + if (node->found_index_ >= 0) { + std::cerr << " *"; + } + std::cerr << "\n"; + if (node->child_lookup_ >= 0) { + auto child_indent = indent + " "; + std::cerr << child_indent << "|\n"; + for (fast_index_type i = 0; i < 256; ++i) { + auto child_index = lookup_table_[node->child_lookup_ * 256 + i]; + if (child_index >= 0) { + const Node* child = &nodes_[child_index]; + std::cerr << child_indent << "|-> '" << static_cast(i) << "' (" << i + << ") -> "; + Dump(child, child_indent); + } + } + } +} + +void Trie::Dump() const { Dump(&nodes_[0], ""); } + +TrieBuilder::TrieBuilder() { trie_.nodes_.push_back(Trie::Node{-1, -1, ""}); } + +Status TrieBuilder::AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node) { + if (parent->child_lookup_ == -1) { + RETURN_NOT_OK(ExtendLookupTable(&parent->child_lookup_)); + } + auto parent_lookup = parent->child_lookup_ * 256 + ch; + + DCHECK_EQ(trie_.lookup_table_[parent_lookup], -1); + if (trie_.nodes_.size() >= static_cast(kMaxIndex)) { + return 
Status::CapacityError("Trie out of bounds"); + } + trie_.nodes_.push_back(std::move(node)); + trie_.lookup_table_[parent_lookup] = static_cast(trie_.nodes_.size() - 1); + return Status::OK(); +} + +Status TrieBuilder::CreateChildNode(Trie::Node* parent, uint8_t ch, + util::string_view substring) { + const auto kMaxSubstringLength = Trie::kMaxSubstringLength; + + while (substring.length() > kMaxSubstringLength) { + // Substring doesn't fit in node => create intermediate node + auto mid_node = Trie::Node{-1, -1, substring.substr(0, kMaxSubstringLength)}; + RETURN_NOT_OK(AppendChildNode(parent, ch, std::move(mid_node))); + // Recurse + parent = &trie_.nodes_.back(); + ch = static_cast(substring[kMaxSubstringLength]); + substring = substring.substr(kMaxSubstringLength + 1); + } + + // Create final matching node + auto child_node = Trie::Node{trie_.size_, -1, substring}; + RETURN_NOT_OK(AppendChildNode(parent, ch, std::move(child_node))); + ++trie_.size_; + return Status::OK(); +} + +Status TrieBuilder::CreateChildNode(Trie::Node* parent, char ch, + util::string_view substring) { + return CreateChildNode(parent, static_cast(ch), substring); +} + +Status TrieBuilder::ExtendLookupTable(index_type* out_index) { + auto cur_size = trie_.lookup_table_.size(); + auto cur_index = cur_size / 256; + if (cur_index > static_cast(kMaxIndex)) { + return Status::CapacityError("Trie out of bounds"); + } + trie_.lookup_table_.resize(cur_size + 256, -1); + *out_index = static_cast(cur_index); + return Status::OK(); +} + +Status TrieBuilder::SplitNode(fast_index_type node_index, fast_index_type split_at) { + Trie::Node* node = &trie_.nodes_[node_index]; + + DCHECK_LT(split_at, node->substring_length()); + + // Before: + // {node} -> [...] + // After: + // {node} -> [c] -> {out_node} -> [...] 
+ auto child_node = Trie::Node{node->found_index_, node->child_lookup_, + node->substring_.substr(split_at + 1)}; + auto ch = node->substring_[split_at]; + node->child_lookup_ = -1; + node->found_index_ = -1; + node->substring_ = node->substring_.substr(0, split_at); + RETURN_NOT_OK(AppendChildNode(node, ch, std::move(child_node))); + + return Status::OK(); +} + +Status TrieBuilder::Append(util::string_view s, bool allow_duplicate) { + // Find or create node for string + fast_index_type node_index = 0; + fast_index_type pos = 0; + fast_index_type remaining = static_cast(s.length()); + + while (true) { + Trie::Node* node = &trie_.nodes_[node_index]; + const auto substring_length = node->substring_length(); + const auto substring_data = node->substring_data(); + + for (fast_index_type i = 0; i < substring_length; ++i) { + if (remaining == 0) { + // New string too short => need to split node + RETURN_NOT_OK(SplitNode(node_index, i)); + // Current node matches exactly + node = &trie_.nodes_[node_index]; + node->found_index_ = trie_.size_++; + return Status::OK(); + } + if (s[pos] != substring_data[i]) { + // Mismatching substring => need to split node + RETURN_NOT_OK(SplitNode(node_index, i)); + // Create new node for mismatching char + node = &trie_.nodes_[node_index]; + return CreateChildNode(node, s[pos], s.substr(pos + 1)); + } + ++pos; + --remaining; + } + if (remaining == 0) { + // Node matches exactly + if (node->found_index_ >= 0) { + if (allow_duplicate) { + return Status::OK(); + } else { + return Status::Invalid("Duplicate entry in trie"); + } + } + node->found_index_ = trie_.size_++; + return Status::OK(); + } + // Lookup child using next input character + if (node->child_lookup_ == -1) { + // Need to extend lookup table for this node + RETURN_NOT_OK(ExtendLookupTable(&node->child_lookup_)); + } + auto c = static_cast(s[pos++]); + --remaining; + node_index = trie_.lookup_table_[node->child_lookup_ * 256 + c]; + if (node_index == -1) { + // Child not found => need to create child node + return CreateChildNode(node, c, s.substr(pos)); + } + node = &trie_.nodes_[node_index]; + } +} + +Trie TrieBuilder::Finish() { return std::move(trie_); } + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/util/trie.h b/cpp/src/arrow/util/trie.h new file mode 100644 index 0000000000000..3e82bfd8ee28f --- /dev/null +++ b/cpp/src/arrow/util/trie.h @@ -0,0 +1,245 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_UTIL_TRIE_H +#define ARROW_UTIL_TRIE_H + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/string_view.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace internal { + +// A non-zero-terminated small string class. +// std::string usually has a small string optimization +// (see review at https://shaharmike.com/cpp/std-string/) +// but this one allows tight control and optimization of memory layout. +template +class SmallString { + public: + SmallString() : length_(0) {} + + template + SmallString(const T& v) { // NOLINT implicit constructor + *this = util::string_view(v); + } + + SmallString& operator=(const util::string_view s) { +#ifndef NDEBUG + CheckSize(s.size()); +#endif + length_ = static_cast(s.size()); + std::memcpy(data_, s.data(), length_); + return *this; + } + + SmallString& operator=(const std::string& s) { + *this = util::string_view(s); + return *this; + } + + SmallString& operator=(const char* s) { + *this = util::string_view(s); + return *this; + } + + explicit operator util::string_view() const { + return util::string_view(data_, length_); + } + + const char* data() const { return data_; } + size_t length() const { return length_; } + bool empty() const { return length_ == 0; } + char operator[](size_t pos) const { +#ifdef NDEBUG + assert(pos <= length_); +#endif + return data_[pos]; + } + + SmallString substr(size_t pos) const { + return SmallString(util::string_view(*this).substr(pos)); + } + + SmallString substr(size_t pos, size_t count) const { + return SmallString(util::string_view(*this).substr(pos, count)); + } + + template + bool operator==(T&& other) const { + return util::string_view(*this) == util::string_view(std::forward(other)); + } + + template + bool operator!=(T&& other) const { + return util::string_view(*this) != util::string_view(std::forward(other)); + } + + protected: + uint8_t length_; + char data_[N]; + +#ifndef NDEBUG + void CheckSize(size_t n) { assert(n <= N); } +#endif +}; + +template +std::ostream& operator<<(std::ostream& os, const SmallString& str) { + return os << util::string_view(str); +} + +// A trie class for byte strings, optimized for small sets of short strings. +// This class is immutable by design, use a TrieBuilder to construct it. 
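+//
+// Illustrative usage, based on the API declared below:
+//
+//   TrieBuilder builder;
+//   RETURN_NOT_OK(builder.Append("null"));
+//   RETURN_NOT_OK(builder.Append("N/A"));
+//   Trie trie = builder.Finish();
+//   int32_t index = trie.Find("N/A");  // index of the matching entry, or -1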
+class ARROW_EXPORT Trie { + using index_type = int16_t; + using fast_index_type = int_fast16_t; + + public: + Trie() : size_(0) {} + Trie(Trie&&) = default; + Trie& operator=(Trie&&) = default; + + int32_t Find(util::string_view s) const { + const Node* node = &nodes_[0]; + fast_index_type pos = 0; + fast_index_type remaining = static_cast(s.length()); + + while (remaining > 0) { + auto substring_length = node->substring_length(); + if (substring_length > 0) { + auto substring_data = node->substring_data(); + if (remaining < substring_length) { + // Input too short + return -1; + } + for (fast_index_type i = 0; i < substring_length; ++i) { + if (s[pos++] != substring_data[i]) { + // Mismatching substring + return -1; + } + --remaining; + } + if (remaining == 0) { + // Matched node exactly + return node->found_index_; + } + } + // Lookup child using next input character + if (node->child_lookup_ == -1) { + // Input too long + return -1; + } + auto c = static_cast(s[pos++]); + --remaining; + auto child_index = lookup_table_[node->child_lookup_ * 256 + c]; + if (child_index == -1) { + // Child not found + return -1; + } + node = &nodes_[child_index]; + } + + // Input exhausted + if (node->substring_.empty()) { + // Matched node exactly + return node->found_index_; + } else { + return -1; + } + } + + Status Validate() const; + + void Dump() const; + + protected: + static constexpr size_t kNodeSize = 16; + static constexpr auto kMaxSubstringLength = + kNodeSize - 2 * sizeof(index_type) - sizeof(int8_t); + + struct Node { + // If this node is a valid end of string, index of found string, otherwise -1 + index_type found_index_; + // Base index for child lookup in lookup_table_ (-1 if no child nodes) + index_type child_lookup_; + // The substring for this node. 
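+    // Its length is capped at kMaxSubstringLength so that sizeof(Node) stays at kNodeSize.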
+ SmallString substring_; + + fast_index_type substring_length() const { + return static_cast(substring_.length()); + } + const char* substring_data() const { return substring_.data(); } + }; + + static_assert(sizeof(Node) == kNodeSize, "Unexpected node size"); + + ARROW_DISALLOW_COPY_AND_ASSIGN(Trie); + + void Dump(const Node* node, const std::string& indent) const; + + // Node table: entry 0 is the root node + std::vector nodes_; + + // Indexed lookup structure: gives index in node table, or -1 if not found + std::vector lookup_table_; + + // Number of entries + index_type size_; + + friend class TrieBuilder; +}; + +class ARROW_EXPORT TrieBuilder { + using index_type = Trie::index_type; + using fast_index_type = Trie::fast_index_type; + + public: + TrieBuilder(); + Status Append(util::string_view s, bool allow_duplicate = false); + Trie Finish(); + + protected: + // Extend the lookup table by 256 entries, return the index of the new span + Status ExtendLookupTable(index_type* out_lookup_index); + // Split the node given by the index at the substring index `split_at` + Status SplitNode(fast_index_type node_index, fast_index_type split_at); + // Append an already constructed child node to the parent + Status AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node); + // Create a matching child node from this parent + Status CreateChildNode(Trie::Node* parent, uint8_t ch, util::string_view substring); + Status CreateChildNode(Trie::Node* parent, char ch, util::string_view substring); + + Trie trie_; + + static constexpr auto kMaxIndex = std::numeric_limits::max(); +}; + +} // namespace internal +} // namespace arrow + +#endif // ARROW_UTIL_TRIE_H diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx index 91d1b08deefad..db8104659884b 100644 --- a/python/pyarrow/_csv.pyx +++ b/python/pyarrow/_csv.pyx @@ -252,6 +252,9 @@ cdef class ConvertOptions: column_types: dict, optional Map column names to column types (disabling type inference on those columns). + null_values: list, optional + A sequence of strings that denote nulls in the data + (defaults are appropriate in most cases). """ cdef: CCSVConvertOptions options @@ -259,12 +262,14 @@ cdef class ConvertOptions: # Avoid mistakingly creating attributes __slots__ = () - def __init__(self, check_utf8=None, column_types=None): + def __init__(self, check_utf8=None, column_types=None, null_values=None): self.options = CCSVConvertOptions.Defaults() if check_utf8 is not None: self.check_utf8 = check_utf8 if column_types is not None: self.column_types = column_types + if null_values is not None: + self.null_values = null_values @property def check_utf8(self): @@ -306,6 +311,17 @@ cdef class ConvertOptions: assert typ != NULL self.options.column_types[tobytes(k)] = typ + @property + def null_values(self): + """ + A sequence of strings that denote nulls in the data. 
+ """ + return [frombytes(x) for x in self.options.null_values] + + @null_values.setter + def null_values(self, value): + self.options.null_values = [tobytes(x) for x in value] + cdef _get_reader(input_file, shared_ptr[InputStream]* out): use_memory_map = False diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index f4629af0617fb..7ce03bf6eb80c 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -954,6 +954,7 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil: cdef cppclass CCSVConvertOptions" arrow::csv::ConvertOptions": c_bool check_utf8 unordered_map[c_string, shared_ptr[CDataType]] column_types + vector[c_string] null_values @staticmethod CCSVConvertOptions Defaults() diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index c5816de8a4203..14ba999fea77b 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -151,9 +151,17 @@ def test_convert_options(): with pytest.raises(TypeError): opts.column_types = 0 - opts = cls(check_utf8=False, column_types={'a': pa.null()}) + assert isinstance(opts.null_values, list) + assert '' in opts.null_values + assert 'N/A' in opts.null_values + opts.null_values = ['xxx', 'yyy'] + assert opts.null_values == ['xxx', 'yyy'] + + opts = cls(check_utf8=False, column_types={'a': pa.null()}, + null_values=['xxx', 'yyy']) assert opts.check_utf8 is False assert opts.column_types == {'a': pa.null()} + assert opts.null_values == ['xxx', 'yyy'] class BaseTestCSVRead: @@ -233,6 +241,34 @@ def test_simple_timestamps(self): 'b': [datetime(1970, 1, 1), datetime(1989, 7, 14)], } + def test_custom_nulls(self): + # Infer nulls with custom values + opts = ConvertOptions(null_values=['Xxx', 'Zzz']) + rows = b"a,b,c,d\nZzz,Xxx,1,2\nXxx,#N/A,,Zzz\n" + table = self.read_bytes(rows, convert_options=opts) + schema = pa.schema([('a', pa.null()), + ('b', pa.string()), + ('c', pa.string()), + ('d', pa.int64())]) + assert table.schema == schema + assert table.to_pydict() == { + 'a': [None, None], + 'b': [u"Xxx", u"#N/A"], + 'c': [u"1", u""], + 'd': [2, None], + } + + opts = ConvertOptions(null_values=[]) + rows = b"a,b\n#N/A,\n" + table = self.read_bytes(rows, convert_options=opts) + schema = pa.schema([('a', pa.string()), + ('b', pa.string())]) + assert table.schema == schema + assert table.to_pydict() == { + 'a': [u"#N/A"], + 'b': [u""], + } + def test_column_types(self): # Ask for specific column types in ConvertOptions opts = ConvertOptions(column_types={'b': 'float32', From 1a86ab51d8ee86e132645c9671f5355774b8f71b Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 20 Dec 2018 16:19:42 -0600 Subject: [PATCH 093/328] ARROW-3982: [C++] Allow "binary" input in simple JSON format Since rapidjson doesn't validate UTF8 by default, we can represent arbitrary binary bytes in the JSON input (bytes < 0x20 have to be represented as unicode escapes). 
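As an illustrative sketch (not part of the patch below), the new behavior can be exercised through the ArrayFromJSON test helper introduced in ARROW-3303, with binary() being the usual Arrow type factory:

    // Bytes >= 0x20 may appear literally inside the JSON string; bytes below
    // 0x20 must be written as \uXXXX escapes.
    auto array = ArrayFromJSON(binary(), "[\"\\u0000\\u001f\", \"\xff\x9f\", null]");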
Author: Antoine Pitrou Closes #3222 from pitrou/ARROW-3982-json-simple-binary and squashes the following commits: 5aaa5edc8 ARROW-3982: Allow "binary" input in simple JSON format --- cpp/src/arrow/ipc/json-simple-test.cc | 40 ++++++++++++++++++++++++ cpp/src/arrow/ipc/json-simple.cc | 45 +++++++++++++++++++++++++-- cpp/src/arrow/pretty_print-test.cc | 11 ++----- 3 files changed, 84 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/ipc/json-simple-test.cc b/cpp/src/arrow/ipc/json-simple-test.cc index 84a2210157f53..2e80a0ca85822 100644 --- a/cpp/src/arrow/ipc/json-simple-test.cc +++ b/cpp/src/arrow/ipc/json-simple-test.cc @@ -289,6 +289,7 @@ TEST(TestDouble, Errors) { } TEST(TestString, Basics) { + // String type std::shared_ptr type = utf8(); std::shared_ptr expected, actual; @@ -300,6 +301,20 @@ TEST(TestString, Basics) { s += '\x00'; s += "char"; AssertJSONArray(type, "[\"\", \"some\\u0000char\"]", {"", s}); + // UTF8 sequence in string + AssertJSONArray(type, "[\"\xc3\xa9\"]", {"\xc3\xa9"}); + + // Binary type + type = binary(); + AssertJSONArray(type, "[\"\", \"foo\", null]", + {true, true, false}, {"", "foo", ""}); + // Arbitrary binary (non-UTF8) sequence in string + s = "\xff\x9f"; + AssertJSONArray(type, "[\"" + s + "\"]", {s}); + // Bytes < 0x20 can be represented as JSON unicode escapes + s = '\x00'; + s += "\x1f"; + AssertJSONArray(type, "[\"\\u0000\\u001f\"]", {s}); } TEST(TestString, Errors) { @@ -310,6 +325,31 @@ TEST(TestString, Errors) { ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[]]", &array)); } +TEST(TestFixedSizeBinary, Basics) { + std::shared_ptr type = fixed_size_binary(3); + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[\"foo\", \"bar\"]", + {"foo", "bar"}); + AssertJSONArray(type, "[null, \"foo\"]", + {false, true}, {"", "foo"}); + // Arbitrary binary (non-UTF8) sequence in string + std::string s = "\xff\x9f\xcc"; + AssertJSONArray(type, "[\"" + s + "\"]", {s}); +} + +TEST(TestFixedSizeBinary, Errors) { + std::shared_ptr type = fixed_size_binary(3); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[]]", &array)); + // Invalid length + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"\"]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"abcd\"]", &array)); +} + TEST(TestDecimal, Basics) { std::shared_ptr type = decimal(10, 4); std::shared_ptr expected, actual; diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc index d812f841d9353..7a78fe4986cd5 100644 --- a/cpp/src/arrow/ipc/json-simple.cc +++ b/cpp/src/arrow/ipc/json-simple.cc @@ -41,7 +41,8 @@ using ::arrow::internal::checked_cast; static constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag; static Status JSONTypeError(const char* expected_type, rj::Type json_type) { - return Status::Invalid("Expected ", expected_type, " or null, got type ", json_type); + return Status::Invalid("Expected ", expected_type, " or null, got JSON type ", + json_type); } class Converter { @@ -91,7 +92,6 @@ class ConcreteConverter : public Converter { }; // TODO : dates and times? -// TODO : binary / fixed size binary? 
// ------------------------------------------------------------------------ // Converter for null arrays @@ -284,7 +284,7 @@ class DecimalConverter final : public ConcreteConverter { }; // ------------------------------------------------------------------------ -// Converter for string arrays +// Converter for binary and string arrays class StringConverter final : public ConcreteConverter { public: @@ -313,6 +313,43 @@ class StringConverter final : public ConcreteConverter { std::shared_ptr builder_; }; +// ------------------------------------------------------------------------ +// Converter for fixed-size binary arrays + +class FixedSizeBinaryConverter final + : public ConcreteConverter { + public: + explicit FixedSizeBinaryConverter(const std::shared_ptr& type) { + this->type_ = type; + builder_ = std::make_shared(type, default_memory_pool()); + } + + Status AppendNull() override { return builder_->AppendNull(); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return AppendNull(); + } + if (json_obj.IsString()) { + auto view = util::string_view(json_obj.GetString(), json_obj.GetStringLength()); + if (view.length() != static_cast(builder_->byte_width())) { + std::stringstream ss; + ss << "Invalid string length " << view.length() << " in JSON input for " + << this->type_->ToString(); + return Status::Invalid(ss.str()); + } + return builder_->Append(view); + } else { + return JSONTypeError("string", json_obj.GetType()); + } + } + + std::shared_ptr builder() override { return builder_; } + + protected: + std::shared_ptr builder_; +}; + // ------------------------------------------------------------------------ // Converter for list arrays @@ -449,6 +486,8 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::LIST, ListConverter) SIMPLE_CONVERTER_CASE(Type::STRUCT, StructConverter) SIMPLE_CONVERTER_CASE(Type::STRING, StringConverter) + SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter) + SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter) SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter) default: { return Status::NotImplemented("JSON conversion to ", type->ToString(), diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc index a1acfb81aeff1..8696efc735b8a 100644 --- a/cpp/src/arrow/pretty_print-test.cc +++ b/cpp/src/arrow/pretty_print-test.cc @@ -277,18 +277,11 @@ TEST_F(TestPrettyPrint, ListType) { TEST_F(TestPrettyPrint, FixedSizeBinaryType) { std::vector is_valid = {true, true, false, true, false}; - std::vector values = {"foo", "bar", "baz"}; - std::shared_ptr array; auto type = fixed_size_binary(3); - FixedSizeBinaryBuilder builder(type); - - ASSERT_OK(builder.Append(values[0])); - ASSERT_OK(builder.Append(values[1])); - ASSERT_OK(builder.Append(values[2])); - ASSERT_OK(builder.Finish(&array)); + auto array = ArrayFromJSON(type, "[\"foo\", \"bar\", null, \"baz\"]"); - static const char* ex = "[\n 666F6F,\n 626172,\n 62617A\n]"; + static const char* ex = "[\n 666F6F,\n 626172,\n null,\n 62617A\n]"; CheckArray(*array, {0, 10}, ex); static const char* ex_2 = " [\n 666F6F,\n ...\n 62617A\n ]"; CheckArray(*array, {2, 1}, ex_2); From 700bd40afab973d00229a43dff5ce764ed996873 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Thu, 20 Dec 2018 16:55:09 -0600 Subject: [PATCH 094/328] ARROW-4052: [C++] Linker errors with glog and gflags After #3196, a potential bug appears. 
If we use glog installed instead of downloading one at build time and the installed glog is linked to gflags, linker error can be occurred. I modified ThirdpartyToolchain.cmake to add a dependency from glog to gflag. Author: Kousuke Saruta Closes #3234 from sarutak/ARROW-4052 and squashes the following commits: 3c65cbee6 Modified ThirdpartyToolchain.cmake to add a dependency from glog to gflag --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 112 +++++++++++--------- 1 file changed, 61 insertions(+), 51 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 3381b5cda16b4..d8b34862eeaab 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -534,56 +534,11 @@ message(STATUS "double-conversion include dir: ${DOUBLE_CONVERSION_INCLUDE_DIR}" message(STATUS "double-conversion static library: ${DOUBLE_CONVERSION_STATIC_LIB}") # ---------------------------------------------------------------------- -# Google gtest & gflags - -if(ARROW_BUILD_TESTS OR ARROW_GANDIVA_BUILD_TESTS - OR ARROW_BUILD_BENCHMARKS) - if("${GTEST_HOME}" STREQUAL "") - if(APPLE) - set(GTEST_CMAKE_CXX_FLAGS "-fPIC -DGTEST_USE_OWN_TR1_TUPLE=1 -Wno-unused-value -Wno-ignored-attributes") - elseif(NOT MSVC) - set(GTEST_CMAKE_CXX_FLAGS "-fPIC") - endif() - string(TOUPPER ${CMAKE_BUILD_TYPE} UPPERCASE_BUILD_TYPE) - set(GTEST_CMAKE_CXX_FLAGS "${EP_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}} ${GTEST_CMAKE_CXX_FLAGS}") - - set(GTEST_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/googletest_ep-prefix/src/googletest_ep") - set(GTEST_INCLUDE_DIR "${GTEST_PREFIX}/include") - set(GTEST_STATIC_LIB - "${GTEST_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(GTEST_MAIN_STATIC_LIB - "${GTEST_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest_main${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(GTEST_VENDORED 1) - set(GTEST_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DCMAKE_INSTALL_PREFIX=${GTEST_PREFIX} - -DCMAKE_CXX_FLAGS=${GTEST_CMAKE_CXX_FLAGS}) - if (MSVC AND NOT ARROW_USE_STATIC_CRT) - set(GTEST_CMAKE_ARGS ${GTEST_CMAKE_ARGS} -Dgtest_force_shared_crt=ON) - endif() - - ExternalProject_Add(googletest_ep - URL ${GTEST_SOURCE_URL} - BUILD_BYPRODUCTS ${GTEST_STATIC_LIB} ${GTEST_MAIN_STATIC_LIB} - CMAKE_ARGS ${GTEST_CMAKE_ARGS} - ${EP_LOG_OPTIONS}) - else() - find_package(GTest REQUIRED) - set(GTEST_VENDORED 0) - endif() - - message(STATUS "GTest include dir: ${GTEST_INCLUDE_DIR}") - message(STATUS "GTest static library: ${GTEST_STATIC_LIB}") - include_directories(SYSTEM ${GTEST_INCLUDE_DIR}) - ADD_THIRDPARTY_LIB(gtest - STATIC_LIB ${GTEST_STATIC_LIB}) - ADD_THIRDPARTY_LIB(gtest_main - STATIC_LIB ${GTEST_MAIN_STATIC_LIB}) - - if(GTEST_VENDORED) - add_dependencies(gtest_static googletest_ep) - add_dependencies(gtest_main_static googletest_ep) - endif() +# gflags +if(ARROW_BUILD_TESTS OR + ARROW_BUILD_BENCHMARKS OR + (ARROW_USE_GLOG AND GLOG_HOME)) # gflags (formerly Googleflags) command line parsing if("${GFLAGS_HOME}" STREQUAL "") set(GFLAGS_CMAKE_CXX_FLAGS ${EP_CXX_FLAGS}) @@ -636,6 +591,57 @@ if(ARROW_BUILD_TESTS OR ARROW_GANDIVA_BUILD_TESTS endif() endif() +# ---------------------------------------------------------------------- +# Google gtest + +if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) + if("${GTEST_HOME}" STREQUAL "") + if(APPLE) + set(GTEST_CMAKE_CXX_FLAGS "-fPIC -DGTEST_USE_OWN_TR1_TUPLE=1 -Wno-unused-value -Wno-ignored-attributes") + elseif(NOT MSVC) + set(GTEST_CMAKE_CXX_FLAGS 
"-fPIC") + endif() + string(TOUPPER ${CMAKE_BUILD_TYPE} UPPERCASE_BUILD_TYPE) + set(GTEST_CMAKE_CXX_FLAGS "${EP_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}} ${GTEST_CMAKE_CXX_FLAGS}") + + set(GTEST_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/googletest_ep-prefix/src/googletest_ep") + set(GTEST_INCLUDE_DIR "${GTEST_PREFIX}/include") + set(GTEST_STATIC_LIB + "${GTEST_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(GTEST_MAIN_STATIC_LIB + "${GTEST_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest_main${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(GTEST_VENDORED 1) + set(GTEST_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_INSTALL_PREFIX=${GTEST_PREFIX} + -DCMAKE_CXX_FLAGS=${GTEST_CMAKE_CXX_FLAGS}) + if (MSVC AND NOT ARROW_USE_STATIC_CRT) + set(GTEST_CMAKE_ARGS ${GTEST_CMAKE_ARGS} -Dgtest_force_shared_crt=ON) + endif() + + ExternalProject_Add(googletest_ep + URL ${GTEST_SOURCE_URL} + BUILD_BYPRODUCTS ${GTEST_STATIC_LIB} ${GTEST_MAIN_STATIC_LIB} + CMAKE_ARGS ${GTEST_CMAKE_ARGS} + ${EP_LOG_OPTIONS}) + else() + find_package(GTest REQUIRED) + set(GTEST_VENDORED 0) + endif() + + message(STATUS "GTest include dir: ${GTEST_INCLUDE_DIR}") + message(STATUS "GTest static library: ${GTEST_STATIC_LIB}") + include_directories(SYSTEM ${GTEST_INCLUDE_DIR}) + ADD_THIRDPARTY_LIB(gtest + STATIC_LIB ${GTEST_STATIC_LIB}) + ADD_THIRDPARTY_LIB(gtest_main + STATIC_LIB ${GTEST_MAIN_STATIC_LIB}) + + if(GTEST_VENDORED) + add_dependencies(gtest_static googletest_ep) + add_dependencies(gtest_main_static googletest_ep) + endif() +endif() + if(ARROW_BUILD_BENCHMARKS) if("$ENV{GBENCHMARK_HOME}" STREQUAL "") if(CMAKE_VERSION VERSION_LESS 3.6) @@ -1506,10 +1512,14 @@ if (ARROW_USE_GLOG) message(STATUS "Glog static library: ${GLOG_STATIC_LIB}") include_directories(SYSTEM ${GLOG_INCLUDE_DIR}) - ADD_THIRDPARTY_LIB(glog - STATIC_LIB ${GLOG_STATIC_LIB}) if (GLOG_VENDORED) + ADD_THIRDPARTY_LIB(glog + STATIC_LIB ${GLOG_STATIC_LIB}) add_dependencies(glog_static glog_ep) + else() + ADD_THIRDPARTY_LIB(glog + STATIC_LIB ${GLOG_STATIC_LIB} + DEPS gflags_static) endif() endif() From c6d97c59ef047cc9d5e2836b1945df26cd7c4622 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Thu, 20 Dec 2018 18:19:29 -0600 Subject: [PATCH 095/328] ARROW-4093: [C++] Fix wrong suggested method name Author: Kouhei Sutou Closes #3238 from kou/cpp-fix-typo and squashes the following commits: b5b880af9 Fix wrong suggested method name --- cpp/src/arrow/type.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 95b5189de0343..eb00f43caa172 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -525,7 +525,7 @@ class ARROW_EXPORT StructType : public NestedType { ARROW_DEPRECATED("Use GetFieldByName") std::shared_ptr GetChildByName(const std::string& name) const; - ARROW_DEPRECATED("Use GetChildIndex") + ARROW_DEPRECATED("Use GetFieldIndex") int GetChildIndex(const std::string& name) const; private: From 747590afc84481f61ead4d4c14e25ff9b79213f6 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Sat, 22 Dec 2018 00:31:47 +0900 Subject: [PATCH 096/328] ARROW-4011: [Gandiva] Install irhelpers.bc and use it If we don't install irhelpers.bc, users need to keep build directory that has irhelpers.bc. 
Author: Kouhei Sutou Closes #3232 from kou/gandiva-use-installed-bc and squashes the following commits: 5a0c6228 Adjust irhelper.bc path in Java 829212c4 Adjust irhelper.bc path in Java ea9c6b36 Adjust irhelper.bc path in Java cb3d473b Adjust irhelper.bc path in Java ab60eda9 Remove "gandiva_" prefix and put built file to current binary dir 934e258c Add "gandiva_" prefix 7ff4cf24 Define GANDIVA_BYTE_COMPILE_FILE_PATH for all build ad615b4b Install irhelpers.bc and use it --- cpp/src/gandiva/CMakeLists.txt | 9 +- cpp/src/gandiva/bc_file_path.cc.in | 2 +- cpp/src/gandiva/engine_llvm_test.cc | 5 +- cpp/src/gandiva/llvm_generator_test.cc | 7 +- cpp/src/gandiva/tests/binary_test.cc | 2 +- cpp/src/gandiva/tests/boolean_expr_test.cc | 12 +-- cpp/src/gandiva/tests/date_time_test.cc | 16 ++-- cpp/src/gandiva/tests/filter_test.cc | 20 ++-- cpp/src/gandiva/tests/hash_test.cc | 6 +- cpp/src/gandiva/tests/huge_table_test.cc | 4 +- cpp/src/gandiva/tests/if_expr_test.cc | 12 +-- cpp/src/gandiva/tests/in_expr_test.cc | 6 +- cpp/src/gandiva/tests/literal_test.cc | 12 +-- cpp/src/gandiva/tests/micro_benchmarks.cc | 16 ++-- cpp/src/gandiva/tests/null_validity_test.cc | 6 +- .../tests/projector_build_validation_test.cc | 22 ++--- cpp/src/gandiva/tests/projector_test.cc | 95 ++++++------------- cpp/src/gandiva/tests/test_util.h | 6 ++ cpp/src/gandiva/tests/utf8_test.cc | 19 ++-- java/gandiva/pom.xml | 2 +- 20 files changed, 131 insertions(+), 148 deletions(-) diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 23ad93e201e71..d28c372a9e6ab 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -27,14 +27,18 @@ find_package(LLVM) # Set the path where the byte-code files will be installed. set(GANDIVA_BC_INSTALL_DIR - ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/gandiva) + ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/gandiva) set(GANDIVA_BC_FILE_NAME irhelpers.bc) set(GANDIVA_BC_INSTALL_PATH ${GANDIVA_BC_INSTALL_DIR}/${GANDIVA_BC_FILE_NAME}) -set(GANDIVA_BC_OUTPUT_PATH ${BUILD_OUTPUT_ROOT_DIRECTORY}/${GANDIVA_BC_FILE_NAME}) +set(GANDIVA_BC_OUTPUT_PATH ${CMAKE_CURRENT_BINARY_DIR}/${GANDIVA_BC_FILE_NAME}) +install(FILES + ${GANDIVA_BC_OUTPUT_PATH} + DESTINATION ${GANDIVA_BC_INSTALL_DIR}) set(BC_FILE_PATH_CC "${CMAKE_CURRENT_BINARY_DIR}/bc_file_path.cc") configure_file(bc_file_path.cc.in ${BC_FILE_PATH_CC}) +add_definitions(-DGANDIVA_BYTE_COMPILE_FILE_PATH="${GANDIVA_BC_OUTPUT_PATH}") set(SRC_FILES annotator.cc bitmap_accumulator.cc @@ -59,7 +63,6 @@ set(SRC_FILES annotator.cc selection_vector.cc tree_expr_builder.cc to_date_holder.cc - ${SHARED_HELPER_FILES} ${BC_FILE_PATH_CC}) set(GANDIVA_SHARED_PRIVATE_LINK_LIBS diff --git a/cpp/src/gandiva/bc_file_path.cc.in b/cpp/src/gandiva/bc_file_path.cc.in index d6b4e342b6714..54e81ca2bfa18 100644 --- a/cpp/src/gandiva/bc_file_path.cc.in +++ b/cpp/src/gandiva/bc_file_path.cc.in @@ -18,6 +18,6 @@ namespace gandiva { // Path to the byte-code file. 
-extern const char kByteCodeFilePath[] = "${GANDIVA_BC_OUTPUT_PATH}"; +extern const char kByteCodeFilePath[] = "${GANDIVA_BC_INSTALL_PATH}"; } // namespace gandiva diff --git a/cpp/src/gandiva/engine_llvm_test.cc b/cpp/src/gandiva/engine_llvm_test.cc index fe4f82e19320c..627c385f97363 100644 --- a/cpp/src/gandiva/engine_llvm_test.cc +++ b/cpp/src/gandiva/engine_llvm_test.cc @@ -19,6 +19,7 @@ #include #include "gandiva/llvm_types.h" +#include "gandiva/tests/test_util.h" namespace gandiva { @@ -100,7 +101,7 @@ llvm::Function* TestEngine::BuildVecAdd(Engine* engine, LLVMTypes* types) { TEST_F(TestEngine, TestAddUnoptimised) { std::unique_ptr engine; - Status status = Engine::Make(ConfigurationBuilder::DefaultConfiguration(), &engine); + auto status = Engine::Make(TestConfiguration(), &engine); EXPECT_TRUE(status.ok()) << status.message(); LLVMTypes types(*engine->context()); llvm::Function* ir_func = BuildVecAdd(engine.get(), &types); @@ -115,7 +116,7 @@ TEST_F(TestEngine, TestAddUnoptimised) { TEST_F(TestEngine, TestAddOptimised) { std::unique_ptr engine; - Status status = Engine::Make(ConfigurationBuilder::DefaultConfiguration(), &engine); + auto status = Engine::Make(TestConfiguration(), &engine); EXPECT_TRUE(status.ok()) << status.message(); LLVMTypes types(*engine->context()); llvm::Function* ir_func = BuildVecAdd(engine.get(), &types); diff --git a/cpp/src/gandiva/llvm_generator_test.cc b/cpp/src/gandiva/llvm_generator_test.cc index 818c7912150a9..fed6339314850 100644 --- a/cpp/src/gandiva/llvm_generator_test.cc +++ b/cpp/src/gandiva/llvm_generator_test.cc @@ -26,6 +26,7 @@ #include "gandiva/expression.h" #include "gandiva/func_descriptor.h" #include "gandiva/function_registry.h" +#include "gandiva/tests/test_util.h" namespace gandiva { @@ -39,8 +40,7 @@ class TestLLVMGenerator : public ::testing::Test { // Verify that a valid pc function exists for every function in the registry. TEST_F(TestLLVMGenerator, VerifyPCFunctions) { std::unique_ptr generator; - Status status = - LLVMGenerator::Make(ConfigurationBuilder::DefaultConfiguration(), &generator); + auto status = LLVMGenerator::Make(TestConfiguration(), &generator); EXPECT_TRUE(status.ok()) << status.message(); llvm::Module* module = generator->module(); @@ -54,8 +54,7 @@ TEST_F(TestLLVMGenerator, VerifyPCFunctions) { TEST_F(TestLLVMGenerator, TestAdd) { // Setup LLVM generator to do an arithmetic add of two vectors std::unique_ptr generator; - Status status = - LLVMGenerator::Make(ConfigurationBuilder::DefaultConfiguration(), &generator); + auto status = LLVMGenerator::Make(TestConfiguration(), &generator); EXPECT_TRUE(status.ok()); Annotator annotator; diff --git a/cpp/src/gandiva/tests/binary_test.cc b/cpp/src/gandiva/tests/binary_test.cc index d5d99db910b9d..6ac3c5155196e 100644 --- a/cpp/src/gandiva/tests/binary_test.cc +++ b/cpp/src/gandiva/tests/binary_test.cc @@ -61,7 +61,7 @@ TEST_F(TestBinary, TestSimple) { // Build a projector for the expressions. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/tests/boolean_expr_test.cc b/cpp/src/gandiva/tests/boolean_expr_test.cc index 3351ab3ccf3ff..9226f357159c6 100644 --- a/cpp/src/gandiva/tests/boolean_expr_test.cc +++ b/cpp/src/gandiva/tests/boolean_expr_test.cc @@ -60,7 +60,7 @@ TEST_F(TestBooleanExpr, SimpleAnd) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // FALSE_VALID && ? => FALSE_VALID @@ -133,7 +133,7 @@ TEST_F(TestBooleanExpr, SimpleOr) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // TRUE_VALID && ? => TRUE_VALID @@ -210,7 +210,7 @@ TEST_F(TestBooleanExpr, AndThree) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); int num_records = 8; @@ -257,7 +257,7 @@ TEST_F(TestBooleanExpr, OrThree) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); int num_records = 8; @@ -317,7 +317,7 @@ TEST_F(TestBooleanExpr, BooleanAndInsideIf) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); int num_records = 4; @@ -368,7 +368,7 @@ TEST_F(TestBooleanExpr, IfInsideBooleanAnd) { // Build a projector for the expressions. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); int num_records = 4; diff --git a/cpp/src/gandiva/tests/date_time_test.cc b/cpp/src/gandiva/tests/date_time_test.cc index 3914558d716c7..643b8c8dda3ce 100644 --- a/cpp/src/gandiva/tests/date_time_test.cc +++ b/cpp/src/gandiva/tests/date_time_test.cc @@ -73,7 +73,8 @@ TEST_F(TestProjector, TestIsNull) { auto isnotnull_expr = TreeExprBuilder::MakeExpression("isnotnull", {t0}, b0); std::shared_ptr projector; - Status status = Projector::Make(schema, {isnull_expr, isnotnull_expr}, &projector); + auto status = Projector::Make(schema, {isnull_expr, isnotnull_expr}, + TestConfiguration(), &projector); ASSERT_TRUE(status.ok()); int num_records = 4; @@ -126,8 +127,9 @@ TEST_F(TestProjector, TestDateTime) { auto ts2day_expr = TreeExprBuilder::MakeExpression("extractDay", {field2}, field_day); std::shared_ptr projector; - Status status = Projector::Make( - schema, {date2year_expr, date2month_expr, ts2month_expr, ts2day_expr}, &projector); + auto status = Projector::Make( + schema, {date2year_expr, date2month_expr, ts2month_expr, ts2day_expr}, + TestConfiguration(), &projector); ASSERT_TRUE(status.ok()); struct tm y1970; @@ -196,7 +198,8 @@ TEST_F(TestProjector, TestTime) { TreeExprBuilder::MakeExpression("extractHour", {field0}, field_hour); std::shared_ptr projector; - Status status = Projector::Make(schema, {time2min_expr, time2hour_expr}, &projector); + auto status = Projector::Make(schema, {time2min_expr, time2hour_expr}, + TestConfiguration(), &projector); ASSERT_TRUE(status.ok()); // create input data @@ -264,7 +267,7 @@ TEST_F(TestProjector, TestTimestampDiff) { std::shared_ptr projector; auto exprs = {diff_secs_expr, diff_mins_expr, diff_hours_expr, diff_days_expr, diff_weeks_expr, diff_months_expr, diff_quarters_expr, diff_years_expr}; - Status status = Projector::Make(schema, exprs, &projector); + auto status = Projector::Make(schema, exprs, TestConfiguration(), &projector); ASSERT_TRUE(status.ok()); struct tm y1970; @@ -337,7 +340,8 @@ TEST_F(TestProjector, TestMonthsBetween) { TreeExprBuilder::MakeExpression("months_between", {f0, f1}, output); std::shared_ptr projector; - Status status = Projector::Make(schema, {months_between_expr}, &projector); + auto status = + Projector::Make(schema, {months_between_expr}, TestConfiguration(), &projector); std::cout << status.message(); ASSERT_TRUE(status.ok()); diff --git a/cpp/src/gandiva/tests/filter_test.cc b/cpp/src/gandiva/tests/filter_test.cc index f95cdcc3fef9c..ee60388d5dc1f 100644 --- a/cpp/src/gandiva/tests/filter_test.cc +++ b/cpp/src/gandiva/tests/filter_test.cc @@ -50,14 +50,15 @@ TEST_F(TestFilter, TestFilterCache) { auto less_than_10 = TreeExprBuilder::MakeFunction("less_than", {sum_func, literal_10}, arrow::boolean()); auto condition = TreeExprBuilder::MakeCondition(less_than_10); + auto configuration = TestConfiguration(); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, configuration, &filter); EXPECT_TRUE(status.ok()); // same schema and condition, should return the same filter as above. 
std::shared_ptr cached_filter; - status = Filter::Make(schema, condition, &cached_filter); + status = Filter::Make(schema, condition, configuration, &cached_filter); EXPECT_TRUE(status.ok()); EXPECT_TRUE(cached_filter.get() == filter.get()); @@ -65,7 +66,8 @@ TEST_F(TestFilter, TestFilterCache) { auto field2 = field("f2", int32()); auto different_schema = arrow::schema({field0, field1, field2}); std::shared_ptr should_be_new_filter; - status = Filter::Make(different_schema, condition, &should_be_new_filter); + status = + Filter::Make(different_schema, condition, configuration, &should_be_new_filter); EXPECT_TRUE(status.ok()); EXPECT_TRUE(cached_filter.get() != should_be_new_filter.get()); @@ -74,7 +76,7 @@ TEST_F(TestFilter, TestFilterCache) { "greater_than", {sum_func, literal_10}, arrow::boolean()); auto new_condition = TreeExprBuilder::MakeCondition(greater_than_10); std::shared_ptr should_be_new_filter1; - status = Filter::Make(schema, new_condition, &should_be_new_filter1); + status = Filter::Make(schema, new_condition, configuration, &should_be_new_filter1); EXPECT_TRUE(status.ok()); EXPECT_TRUE(cached_filter.get() != should_be_new_filter1.get()); } @@ -96,7 +98,7 @@ TEST_F(TestFilter, TestSimple) { auto condition = TreeExprBuilder::MakeCondition(less_than_10); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -134,7 +136,7 @@ TEST_F(TestFilter, TestSimpleCustomConfig) { std::shared_ptr config = config_builder.build(); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -168,7 +170,7 @@ TEST_F(TestFilter, TestZeroCopy) { auto condition = TreeExprBuilder::MakeCondition("isnotnull", {field0}); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -208,7 +210,7 @@ TEST_F(TestFilter, TestZeroCopyNegative) { auto condition = TreeExprBuilder::MakeCondition("isnotnull", {field0}); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -265,7 +267,7 @@ TEST_F(TestFilter, TestSimpleSVInt32) { auto condition = TreeExprBuilder::MakeCondition(less_than_10); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/tests/hash_test.cc b/cpp/src/gandiva/tests/hash_test.cc index 96f92284a5ca1..afaa885dfe26b 100644 --- a/cpp/src/gandiva/tests/hash_test.cc +++ b/cpp/src/gandiva/tests/hash_test.cc @@ -61,7 +61,8 @@ TEST_F(TestHash, TestSimple) { // Build a projector for the expression. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr_0, expr_1}, &projector); + auto status = + Projector::Make(schema, {expr_0, expr_1}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -113,7 +114,8 @@ TEST_F(TestHash, TestBuf) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr_0, expr_1}, &projector); + auto status = + Projector::Make(schema, {expr_0, expr_1}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/tests/huge_table_test.cc b/cpp/src/gandiva/tests/huge_table_test.cc index bffcb1994707f..cecf290a1439f 100644 --- a/cpp/src/gandiva/tests/huge_table_test.cc +++ b/cpp/src/gandiva/tests/huge_table_test.cc @@ -58,7 +58,7 @@ TEST_F(DISABLED_TestHugeProjector, SimpleTestSumHuge) { // Build expression auto sum_expr = TreeExprBuilder::MakeExpression("add", {field0, field1}, field_sum); std::shared_ptr projector; - Status status = Projector::Make(schema, {sum_expr}, &projector); + auto status = Projector::Make(schema, {sum_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -136,7 +136,7 @@ TEST_F(DISABLED_TestHugeFilter, TestSimpleHugeFilter) { auto condition = TreeExprBuilder::MakeCondition(less_than_50); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // prepare input record batch diff --git a/cpp/src/gandiva/tests/if_expr_test.cc b/cpp/src/gandiva/tests/if_expr_test.cc index 93b35673b9467..54b6d43b4df1c 100644 --- a/cpp/src/gandiva/tests/if_expr_test.cc +++ b/cpp/src/gandiva/tests/if_expr_test.cc @@ -61,7 +61,7 @@ TEST_F(TestIfExpr, TestSimple) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -110,7 +110,7 @@ TEST_F(TestIfExpr, TestSimpleArithmetic) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -165,7 +165,7 @@ TEST_F(TestIfExpr, TestNested) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -228,7 +228,7 @@ TEST_F(TestIfExpr, TestNestedInIf) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -296,7 +296,7 @@ TEST_F(TestIfExpr, TestNestedInCondition) { // Build a projector for the expressions. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -353,7 +353,7 @@ TEST_F(TestIfExpr, TestBigNested) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/tests/in_expr_test.cc b/cpp/src/gandiva/tests/in_expr_test.cc index 13ef97cfb8814..2103874cb1e2c 100644 --- a/cpp/src/gandiva/tests/in_expr_test.cc +++ b/cpp/src/gandiva/tests/in_expr_test.cc @@ -51,7 +51,7 @@ TEST_F(TestIn, TestInSimple) { auto condition = TreeExprBuilder::MakeCondition(in_expr); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -88,7 +88,7 @@ TEST_F(TestIn, TestInString) { auto condition = TreeExprBuilder::MakeCondition(in_expr); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -125,7 +125,7 @@ TEST_F(TestIn, TestInStringValidationError) { auto condition = TreeExprBuilder::MakeCondition(in_expr); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.IsExpressionValidationError()); std::string expected_error = "Evaluation expression for IN clause returns "; diff --git a/cpp/src/gandiva/tests/literal_test.cc b/cpp/src/gandiva/tests/literal_test.cc index ced66452a2d45..53323cb4e7cbb 100644 --- a/cpp/src/gandiva/tests/literal_test.cc +++ b/cpp/src/gandiva/tests/literal_test.cc @@ -88,8 +88,8 @@ TEST_F(TestLiteral, TestSimpleArithmetic) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = - Projector::Make(schema, {expr_a, expr_b, expr_c, expr_d, expr_e}, &projector); + auto status = Projector::Make(schema, {expr_a, expr_b, expr_c, expr_d, expr_e}, + TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -133,7 +133,7 @@ TEST_F(TestLiteral, TestLiteralHash) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); auto res1 = field("a", int64()); @@ -142,7 +142,7 @@ TEST_F(TestLiteral, TestLiteralHash) { // Build a projector for the expressions. std::shared_ptr projector1; - status = Projector::Make(schema, {expr1}, &projector1); + status = Projector::Make(schema, {expr1}, TestConfiguration(), &projector1); EXPECT_TRUE(status.ok()) << status.message(); EXPECT_TRUE(projector.get() != projector1.get()); } @@ -165,7 +165,7 @@ TEST_F(TestLiteral, TestNullLiteral) { // Build a projector for the expressions. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -207,7 +207,7 @@ TEST_F(TestLiteral, TestNullLiteralInIf) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/tests/micro_benchmarks.cc b/cpp/src/gandiva/tests/micro_benchmarks.cc index 7d844eb378bf8..ce86bf0612402 100644 --- a/cpp/src/gandiva/tests/micro_benchmarks.cc +++ b/cpp/src/gandiva/tests/micro_benchmarks.cc @@ -56,7 +56,7 @@ static void TimedTestAdd3(benchmark::State& state) { auto sum_expr = TreeExprBuilder::MakeExpression(sum, field_sum); std::shared_ptr projector; - ASSERT_OK(Projector::Make(schema, {sum_expr}, &projector)); + ASSERT_OK(Projector::Make(schema, {sum_expr}, TestConfiguration(), &projector)); Int64DataGenerator data_generator; ProjectEvaluator evaluator(projector); @@ -99,7 +99,7 @@ static void TimedTestBigNested(benchmark::State& state) { // Build a projector for the expressions. std::shared_ptr projector; - ASSERT_OK(Projector::Make(schema, {expr}, &projector)); + ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector)); BoundedInt32DataGenerator data_generator(250); ProjectEvaluator evaluator(projector); @@ -122,7 +122,7 @@ static void TimedTestExtractYear(benchmark::State& state) { auto expr = TreeExprBuilder::MakeExpression("extractYear", {field0}, field_res); std::shared_ptr projector; - ASSERT_OK(Projector::Make(schema, {expr}, &projector)); + ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector)); Int64DataGenerator data_generator; ProjectEvaluator evaluator(projector); @@ -149,7 +149,7 @@ static void TimedTestFilterAdd2(benchmark::State& state) { auto condition = TreeExprBuilder::MakeCondition(less_than); std::shared_ptr filter; - ASSERT_OK(Filter::Make(schema, condition, &filter)); + ASSERT_OK(Filter::Make(schema, condition, TestConfiguration(), &filter)); Int64DataGenerator data_generator; FilterEvaluator evaluator(filter); @@ -173,7 +173,7 @@ static void TimedTestFilterLike(benchmark::State& state) { auto condition = TreeExprBuilder::MakeCondition(like_yellow); std::shared_ptr filter; - ASSERT_OK(Filter::Make(schema, condition, &filter)); + ASSERT_OK(Filter::Make(schema, condition, TestConfiguration(), &filter)); FastUtf8DataGenerator data_generator(32); FilterEvaluator evaluator(filter); @@ -199,7 +199,7 @@ static void TimedTestAllocs(benchmark::State& state) { auto expr = TreeExprBuilder::MakeExpression(length, field_res); std::shared_ptr projector; - ASSERT_OK(Projector::Make(schema, {expr}, &projector)); + ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector)); FastUtf8DataGenerator data_generator(64); ProjectEvaluator evaluator(projector); @@ -237,7 +237,7 @@ static void TimedTestMultiOr(benchmark::State& state) { // Build a projector for the expressions. 
std::shared_ptr projector; - ASSERT_OK(Projector::Make(schema, {expr}, &projector)); + ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector)); FastUtf8DataGenerator data_generator(250); ProjectEvaluator evaluator(projector); @@ -269,7 +269,7 @@ static void TimedTestInExpr(benchmark::State& state) { // Build a projector for the expressions. std::shared_ptr projector; - ASSERT_OK(Projector::Make(schema, {expr}, &projector)); + ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector)); FastUtf8DataGenerator data_generator(250); ProjectEvaluator evaluator(projector); diff --git a/cpp/src/gandiva/tests/null_validity_test.cc b/cpp/src/gandiva/tests/null_validity_test.cc index 06cfdc08ba906..0374b68d46288 100644 --- a/cpp/src/gandiva/tests/null_validity_test.cc +++ b/cpp/src/gandiva/tests/null_validity_test.cc @@ -60,7 +60,7 @@ TEST_F(TestNullValidity, TestFunc) { auto condition = TreeExprBuilder::MakeCondition(less_than_10); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -111,7 +111,7 @@ TEST_F(TestNullValidity, TestIfElse) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -148,7 +148,7 @@ TEST_F(TestNullValidity, TestUtf8) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/tests/projector_build_validation_test.cc b/cpp/src/gandiva/tests/projector_build_validation_test.cc index 18f02957fd479..6c4eef53ded68 100644 --- a/cpp/src/gandiva/tests/projector_build_validation_test.cc +++ b/cpp/src/gandiva/tests/projector_build_validation_test.cc @@ -50,7 +50,7 @@ TEST_F(TestProjector, TestNonExistentFunction) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {lt_expr}, &projector); + auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); std::string expected_error = "Function bool non_existent_function(float, float) not supported yet."; @@ -71,7 +71,7 @@ TEST_F(TestProjector, TestNotMatchingDataType) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {lt_expr}, &projector); + auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); std::string expected_error = "Return type of root node float does not match that of expression bool"; @@ -92,7 +92,7 @@ TEST_F(TestProjector, TestNotSupportedDataType) { // Build a projector for the expressions. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {lt_expr}, &projector); + auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); std::string expected_error = "Field f0 has unsupported data type list"; EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); @@ -113,7 +113,7 @@ TEST_F(TestProjector, TestIncorrectSchemaMissingField) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {lt_expr}, &projector); + auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); std::string expected_error = "Field f2 not in schema"; EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); @@ -135,7 +135,7 @@ TEST_F(TestProjector, TestIncorrectSchemaTypeNotMatching) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {lt_expr}, &projector); + auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); std::string expected_error = "Field definition in schema f2: int32 different from field in expression f2: float"; @@ -166,7 +166,7 @@ TEST_F(TestProjector, TestIfNotSupportedFunction) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); } @@ -189,7 +189,7 @@ TEST_F(TestProjector, TestIfNotMatchingReturnType) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); } @@ -214,7 +214,7 @@ TEST_F(TestProjector, TestElseNotMatchingReturnType) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); } @@ -239,7 +239,7 @@ TEST_F(TestProjector, TestElseNotSupportedType) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); EXPECT_EQ(status.code(), StatusCode::ExpressionValidationError); } @@ -259,7 +259,7 @@ TEST_F(TestProjector, TestAndMinChildren) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); } @@ -280,7 +280,7 @@ TEST_F(TestProjector, TestAndBooleanArgType) { // Build a projector for the expressions. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); } diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc index 61d9dc3ad1629..1aeb43b49b0dc 100644 --- a/cpp/src/gandiva/tests/projector_test.cc +++ b/cpp/src/gandiva/tests/projector_test.cc @@ -50,14 +50,17 @@ TEST_F(TestProjector, TestProjectCache) { auto sub_expr = TreeExprBuilder::MakeExpression("subtract", {field0, field1}, field_sub); + auto configuration = TestConfiguration(); + std::shared_ptr projector; - Status status = Projector::Make(schema, {sum_expr, sub_expr}, &projector); + auto status = Projector::Make(schema, {sum_expr, sub_expr}, configuration, &projector); EXPECT_TRUE(status.ok()); // everything is same, should return the same projector. auto schema_same = arrow::schema({field0, field1}); std::shared_ptr cached_projector; - status = Projector::Make(schema_same, {sum_expr, sub_expr}, &cached_projector); + status = Projector::Make(schema_same, {sum_expr, sub_expr}, configuration, + &cached_projector); EXPECT_TRUE(status.ok()); EXPECT_TRUE(cached_projector.get() == projector.get()); @@ -65,14 +68,14 @@ TEST_F(TestProjector, TestProjectCache) { auto field2 = field("f2", int32()); auto different_schema = arrow::schema({field0, field1, field2}); std::shared_ptr should_be_new_projector; - status = - Projector::Make(different_schema, {sum_expr, sub_expr}, &should_be_new_projector); + status = Projector::Make(different_schema, {sum_expr, sub_expr}, configuration, + &should_be_new_projector); EXPECT_TRUE(status.ok()); EXPECT_TRUE(cached_projector.get() != should_be_new_projector.get()); // expression list is different should return a new projector. 
std::shared_ptr should_be_new_projector1; - status = Projector::Make(schema, {sum_expr}, &should_be_new_projector1); + status = Projector::Make(schema, {sum_expr}, configuration, &should_be_new_projector1); EXPECT_TRUE(status.ok()); EXPECT_TRUE(cached_projector.get() != should_be_new_projector1.get()); } @@ -90,12 +93,13 @@ TEST_F(TestProjector, TestProjectCacheFieldNames) { auto sum_expr_01 = TreeExprBuilder::MakeExpression("add", {field0, field1}, sum_01); std::shared_ptr projector_01; - Status status = Projector::Make(schema, {sum_expr_01}, &projector_01); + auto status = + Projector::Make(schema, {sum_expr_01}, TestConfiguration(), &projector_01); EXPECT_TRUE(status.ok()); auto sum_expr_12 = TreeExprBuilder::MakeExpression("add", {field1, field2}, sum_12); std::shared_ptr projector_12; - status = Projector::Make(schema, {sum_expr_12}, &projector_12); + status = Projector::Make(schema, {sum_expr_12}, TestConfiguration(), &projector_12); EXPECT_TRUE(status.ok()); // add(f0, f1) != add(f1, f2) @@ -111,14 +115,16 @@ TEST_F(TestProjector, TestProjectCacheDouble) { auto literal0 = TreeExprBuilder::MakeLiteral(d0); auto expr0 = TreeExprBuilder::MakeExpression(literal0, res); + auto configuration = TestConfiguration(); + std::shared_ptr projector0; - auto status = Projector::Make(schema, {expr0}, &projector0); + auto status = Projector::Make(schema, {expr0}, configuration, &projector0); EXPECT_TRUE(status.ok()) << status.message(); auto literal1 = TreeExprBuilder::MakeLiteral(d1); auto expr1 = TreeExprBuilder::MakeExpression(literal1, res); std::shared_ptr projector1; - status = Projector::Make(schema, {expr1}, &projector1); + status = Projector::Make(schema, {expr1}, configuration, &projector1); EXPECT_TRUE(status.ok()) << status.message(); EXPECT_TRUE(projector0.get() != projector1.get()); @@ -134,13 +140,13 @@ TEST_F(TestProjector, TestProjectCacheFloat) { auto literal0 = TreeExprBuilder::MakeLiteral(f0); auto expr0 = TreeExprBuilder::MakeExpression(literal0, res); std::shared_ptr projector0; - auto status = Projector::Make(schema, {expr0}, &projector0); + auto status = Projector::Make(schema, {expr0}, TestConfiguration(), &projector0); EXPECT_TRUE(status.ok()) << status.message(); auto literal1 = TreeExprBuilder::MakeLiteral(f1); auto expr1 = TreeExprBuilder::MakeExpression(literal1, res); std::shared_ptr projector1; - status = Projector::Make(schema, {expr1}, &projector1); + status = Projector::Make(schema, {expr1}, TestConfiguration(), &projector1); EXPECT_TRUE(status.ok()) << status.message(); EXPECT_TRUE(projector0.get() != projector1.get()); @@ -162,50 +168,8 @@ TEST_F(TestProjector, TestIntSumSub) { TreeExprBuilder::MakeExpression("subtract", {field0, field1}, field_sub); std::shared_ptr projector; - Status status = Projector::Make(schema, {sum_expr, sub_expr}, &projector); - EXPECT_TRUE(status.ok()); - - // Create a row-batch with some sample data - int num_records = 4; - auto array0 = MakeArrowArrayInt32({1, 2, 3, 4}, {true, true, true, false}); - auto array1 = MakeArrowArrayInt32({11, 13, 15, 17}, {true, true, false, true}); - // expected output - auto exp_sum = MakeArrowArrayInt32({12, 15, 0, 0}, {true, true, false, false}); - auto exp_sub = MakeArrowArrayInt32({-10, -11, 0, 0}, {true, true, false, false}); - - // prepare input record batch - auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1}); - - // Evaluate expression - arrow::ArrayVector outputs; - status = projector->Evaluate(*in_batch, pool_, &outputs); - EXPECT_TRUE(status.ok()); - - // Validate 
results - EXPECT_ARROW_ARRAY_EQUALS(exp_sum, outputs.at(0)); - EXPECT_ARROW_ARRAY_EQUALS(exp_sub, outputs.at(1)); -} - -TEST_F(TestProjector, TestIntSumSubCustomConfig) { - // schema for input fields - auto field0 = field("f0", int32()); - auto field1 = field("f2", int32()); - auto schema = arrow::schema({field0, field1}); - - // output fields - auto field_sum = field("add", int32()); - auto field_sub = field("subtract", int32()); - - // Build expression - auto sum_expr = TreeExprBuilder::MakeExpression("add", {field0, field1}, field_sum); - auto sub_expr = - TreeExprBuilder::MakeExpression("subtract", {field0, field1}, field_sub); - - std::shared_ptr projector; - ConfigurationBuilder config_builder; - std::shared_ptr config = config_builder.build(); - - Status status = Projector::Make(schema, {sum_expr, sub_expr}, config, &projector); + auto status = + Projector::Make(schema, {sum_expr, sub_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -257,8 +221,9 @@ static void TestArithmeticOpsForType(arrow::MemoryPool* pool) { auto lt_expr = TreeExprBuilder::MakeExpression("less_than", {field0, field1}, field_lt); std::shared_ptr projector; - Status status = Projector::Make( - schema, {sum_expr, sub_expr, mul_expr, div_expr, eq_expr, lt_expr}, &projector); + auto status = + Projector::Make(schema, {sum_expr, sub_expr, mul_expr, div_expr, eq_expr, lt_expr}, + TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -344,9 +309,9 @@ TEST_F(TestProjector, TestExtendedMath) { TreeExprBuilder::MakeExpression("power", {field0, field1}, field_power); std::shared_ptr projector; - Status status = Projector::Make( + auto status = Projector::Make( schema, {cbrt_expr, exp_expr, log_expr, log10_expr, logb_expr, power_expr}, - &projector); + TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -412,7 +377,7 @@ TEST_F(TestProjector, TestFloatLessThan) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {lt_expr}, &projector); + auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -447,7 +412,7 @@ TEST_F(TestProjector, TestIsNotNull) { // Build a projector for the expressions. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {myexpr}, &projector); + auto status = Projector::Make(schema, {myexpr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -480,7 +445,7 @@ TEST_F(TestProjector, TestZeroCopy) { auto cast_expr = TreeExprBuilder::MakeExpression("castFLOAT4", {field0}, res); std::shared_ptr projector; - Status status = Projector::Make(schema, {cast_expr}, &projector); + auto status = Projector::Make(schema, {cast_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -527,7 +492,7 @@ TEST_F(TestProjector, TestZeroCopyNegative) { auto cast_expr = TreeExprBuilder::MakeExpression("castFLOAT4", {field0}, res); std::shared_ptr projector; - Status status = Projector::Make(schema, {cast_expr}, &projector); + auto status = Projector::Make(schema, {cast_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -597,7 +562,7 @@ TEST_F(TestProjector, TestDivideZero) { auto div_expr = TreeExprBuilder::MakeExpression("divide", {field0, field1}, field_div); std::shared_ptr projector; - Status status = Projector::Make(schema, {div_expr}, &projector); + auto status = Projector::Make(schema, {div_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -646,7 +611,7 @@ TEST_F(TestProjector, TestModZero) { auto mod_expr = TreeExprBuilder::MakeExpression("mod", {field0, field1}, field_div); std::shared_ptr projector; - Status status = Projector::Make(schema, {mod_expr}, &projector); + auto status = Projector::Make(schema, {mod_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/tests/test_util.h b/cpp/src/gandiva/tests/test_util.h index d24448727bd83..72b45b124b8dd 100644 --- a/cpp/src/gandiva/tests/test_util.h +++ b/cpp/src/gandiva/tests/test_util.h @@ -73,6 +73,12 @@ static ArrayPtr MakeArrowTypeArray(const std::shared_ptr& type, EXPECT_TRUE((a)->Equals(b)) << "expected array: " << (a)->ToString() \ << " actual array: " << (b)->ToString(); +std::shared_ptr TestConfiguration() { + auto builder = ConfigurationBuilder(); + builder.set_byte_code_file_path(GANDIVA_BYTE_COMPILE_FILE_PATH); + return builder.build(); +} + } // namespace gandiva #endif // GANDIVA_TEST_UTIL_H diff --git a/cpp/src/gandiva/tests/utf8_test.cc b/cpp/src/gandiva/tests/utf8_test.cc index 8b09b72f32d03..925ceea836280 100644 --- a/cpp/src/gandiva/tests/utf8_test.cc +++ b/cpp/src/gandiva/tests/utf8_test.cc @@ -67,7 +67,8 @@ TEST_F(TestUtf8, TestSimple) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr_a, expr_b, expr_c}, &projector); + auto status = + Projector::Make(schema, {expr_a, expr_b, expr_c}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -113,7 +114,7 @@ TEST_F(TestUtf8, TestLiteral) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -155,7 +156,7 @@ TEST_F(TestUtf8, TestNullLiteral) { // Build a projector for the expressions. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -197,7 +198,7 @@ TEST_F(TestUtf8, TestLike) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -245,7 +246,7 @@ TEST_F(TestUtf8, TestBeginsEnds) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr1, expr2}, &projector); + auto status = Projector::Make(schema, {expr1, expr2}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -291,7 +292,7 @@ TEST_F(TestUtf8, TestInternalAllocs) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -334,7 +335,7 @@ TEST_F(TestUtf8, TestCastDate) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -389,7 +390,7 @@ TEST_F(TestUtf8, TestToDateNoError) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -444,7 +445,7 @@ TEST_F(TestUtf8, TestToDateError) { // Build a projector for the expressions. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data diff --git a/java/gandiva/pom.xml b/java/gandiva/pom.xml index 39752e2d36913..d365eb9193ac1 100644 --- a/java/gandiva/pom.xml +++ b/java/gandiva/pom.xml @@ -133,7 +133,7 @@ - ${gandiva.cpp.build.dir} + ${gandiva.cpp.build.dir}/../src/gandiva irhelpers.bc From 48dd1724ddf2354391f6b7b3fdb043ab780b2c27 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Dec 2018 10:36:55 -0600 Subject: [PATCH 097/328] ARROW-2970: [Python] Support conversions of NumPy string arrays requiring chunked binary output Author: Wes McKinney Closes #3240 from wesm/ARROW-2970 and squashes the following commits: 8b04eb3c4 Make the test data a bit more diverse 60d35f0e4 Use internal::ChunkedBinaryBuilder for converting NumPy string/binary array to Arrow --- cpp/src/arrow/python/CMakeLists.txt | 2 ++ cpp/src/arrow/python/numpy_to_arrow.cc | 12 ++++++++--- python/pyarrow/tests/test_array.py | 30 ++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index cccbf09d4fb4d..0f037ad4b0571 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -77,6 +77,8 @@ ADD_ARROW_LIB(arrow_python EXTRA_INCLUDES "${ARROW_PYTHON_INCLUDES}" ) +add_dependencies(arrow_python ${ARROW_PYTHON_LIBRARIES}) + foreach(LIB_TARGET ${ARROW_PYTHON_LIBRARIES}) target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_PYTHON_EXPORTING) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 461a085722243..aa28b6e870834 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -534,8 +534,11 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* d return Status::OK(); } +// Create 16MB chunks for binary data +constexpr int32_t kBinaryChunksize = 1 << 24; + Status NumPyConverter::Visit(const BinaryType& type) { - BinaryBuilder builder(pool_); + ::arrow::internal::ChunkedBinaryBuilder builder(kBinaryChunksize, pool_); auto data = reinterpret_cast(PyArray_DATA(arr_)); @@ -564,9 +567,12 @@ Status NumPyConverter::Visit(const BinaryType& type) { } } - std::shared_ptr result; + ArrayVector result; RETURN_NOT_OK(builder.Finish(&result)); - return PushArray(result->data()); + for (auto arr : result) { + RETURN_NOT_OK(PushArray(arr->data())); + } + return Status::OK(); } Status NumPyConverter::Visit(const FixedSizeBinaryType& type) { diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 95a60435e3460..352c8558c881b 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -1268,3 +1268,33 @@ def test_array_from_numpy_str_utf8(): with pytest.raises(ValueError): pa.array(vec, pa.string(), mask=np.array([False])) + + +@pytest.mark.large_memory +def test_numpy_string_overflow_to_chunked(): + # ARROW-3762 + + # 2^31 + 1 bytes + values = [b'x'] + + # Make 10 unique 1MB strings then repeat then 2048 times + unique_strings = { + i: b'x' * ((1 << 20) - 1) + str(i % 10).encode('utf8') + for i in range(10) + } + values += [unique_strings[i % 10] for i in range(1 << 11)] + + arr = np.array(values) + arrow_arr = pa.array(arr) + + assert isinstance(arrow_arr, pa.ChunkedArray) + + # Split up into 16MB chunks. 
128 * 16 = 2048, so 129 + assert arrow_arr.num_chunks == 129 + + value_index = 0 + for i in range(arrow_arr.num_chunks): + chunk = arrow_arr.chunk(i) + for val in chunk: + assert val.as_py() == values[value_index] + value_index += 1 From 7ebd7b3aaa5646af8bf9707a590daf29d384cf1d Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Sat, 22 Dec 2018 21:09:48 +0100 Subject: [PATCH 098/328] ARROW-4105: [Rust] Add rust-toolchain to enforce user to use nightly toolchain for building The Rust binding needs to be built by nightly toolchain so if we supply rust-toolchain file, user can build without changing the toolchain explicitly. Author: Kousuke Saruta Closes #3247 from sarutak/add-rust-toolchain and squashes the following commits: 6ab619b8 Add rust-toolchain to rat_exclude_files.txt c3fb2aba Add rust-toolchain to enforce to use nightly toolchain for building --- dev/release/rat_exclude_files.txt | 1 + rust/rust-toolchain | 1 + 2 files changed, 2 insertions(+) create mode 100644 rust/rust-toolchain diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 66d62c6257570..bcb474b79b060 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -179,3 +179,4 @@ r/README.Rmd r/man/*.Rd .gitattributes rust/test/data/*.csv +rust/rust-toolchain diff --git a/rust/rust-toolchain b/rust/rust-toolchain new file mode 100644 index 0000000000000..07ade694b1a3c --- /dev/null +++ b/rust/rust-toolchain @@ -0,0 +1 @@ +nightly \ No newline at end of file From b23cedd12f7638cf7d6c042970090e248de95f80 Mon Sep 17 00:00:00 2001 From: Chao Sun Date: Sat, 22 Dec 2018 21:56:16 +0100 Subject: [PATCH 099/328] ARROW-4075: [Rust] Reuse array builder after calling finish() Currently a buffer/array builder is consumed after `finish()` is called. This may not be very convenient as one may want to use the same builder for multiple arrays. This changes the behavior of it to reset the builder instead. Author: Chao Sun Closes #3221 from sunchao/ARROW-4075 and squashes the following commits: 49f6c4c6 ARROW-4075: Reuse array builder after calling finish() --- rust/src/builder.rs | 172 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 137 insertions(+), 35 deletions(-) diff --git a/rust/src/builder.rs b/rust/src/builder.rs index d5d222d006fe8..a4c8666233877 100644 --- a/rust/src/builder.rs +++ b/rust/src/builder.rs @@ -60,7 +60,7 @@ pub trait BufferBuilderTrait { fn reserve(&mut self, n: usize) -> Result<()>; fn push(&mut self, v: T::Native) -> Result<()>; fn push_slice(&mut self, slice: &[T::Native]) -> Result<()>; - fn finish(self) -> Buffer; + fn finish(&mut self) -> Buffer; } impl BufferBuilderTrait for BufferBuilder { @@ -114,9 +114,11 @@ impl BufferBuilderTrait for BufferBuilder { self.write_bytes(slice.to_byte_slice(), array_slots) } - /// Consumes this builder and returns an immutable `Buffer`. - default fn finish(self) -> Buffer { - self.buffer.freeze() + /// Reset this builder and returns an immutable `Buffer`. + default fn finish(&mut self) -> Buffer { + let buf = ::std::mem::replace(&mut self.buffer, MutableBuffer::new(0)); + self.len = 0; + buf.freeze() } } @@ -196,13 +198,15 @@ impl BufferBuilderTrait for BufferBuilder { Ok(()) } - /// Consumes this and returns an immutable `Buffer`. - fn finish(mut self) -> Buffer { + /// Reset this builder and returns an immutable `Buffer`. + fn finish(&mut self) -> Buffer { // `push` does not update the buffer's `len` so do it before `freeze` is called. 
let new_buffer_len = bit_util::ceil(self.len, 8); debug_assert!(new_buffer_len >= self.buffer.len()); - self.buffer.resize(new_buffer_len).unwrap(); - self.buffer.freeze() + let mut buf = ::std::mem::replace(&mut self.buffer, MutableBuffer::new(0)); + self.len = 0; + buf.resize(new_buffer_len).unwrap(); + buf.freeze() } } @@ -211,15 +215,25 @@ pub trait ArrayBuilder { /// The type of array that this builder creates type ArrayType: Array; - /// Returns the builder as an owned `Any` type so that it can be `downcast` to a specific - /// implementation before calling it's `finish` method - fn into_any(self) -> Box; - /// Returns the number of array slots in the builder fn len(&self) -> usize; /// Builds the array - fn finish(self) -> Self::ArrayType; + fn finish(&mut self) -> Self::ArrayType; + + /// Returns the builder as an non-mutable `Any` reference. + /// + /// This is most useful when one wants to call non-mutable APIs on a specific builder + /// type. In this case, one can first cast this into a `Any`, and then use + /// `downcast_ref` to get a reference on the specific builder. + fn as_any(&self) -> &Any; + + /// Returns the builder as an mutable `Any` reference. + /// + /// This is most useful when one wants to call mutable APIs on a specific builder + /// type. In this case, one can first cast this into a `Any`, and then use + /// `downcast_mut` to get a reference on the specific builder. + fn as_any_mut(&mut self) -> &mut Any; } /// Array builder for fixed-width primitive types @@ -243,10 +257,14 @@ pub type Float64Builder = PrimitiveArrayBuilder; impl ArrayBuilder for PrimitiveArrayBuilder { type ArrayType = PrimitiveArray; - /// Returns the builder as an owned `Any` type so that it can be `downcast` to a specific - /// implementation before calling it's `finish` method - fn into_any(self) -> Box { - Box::new(self) + /// Returns the builder as an non-mutable `Any` reference. + fn as_any(&self) -> &Any { + self + } + + /// Returns the builder as an mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut Any { + self } /// Returns the number of array slots in the builder @@ -254,8 +272,8 @@ impl ArrayBuilder for PrimitiveArrayBuilder { self.values_builder.len } - /// Builds the PrimitiveArray - fn finish(self) -> PrimitiveArray { + /// Builds the `PrimitiveArray` and reset this builder. + fn finish(&mut self) -> PrimitiveArray { let len = self.len(); let null_bit_buffer = self.bitmap_builder.finish(); let data = ArrayData::builder(T::get_data_type()) @@ -341,10 +359,14 @@ where { type ArrayType = ListArray; - /// Returns the builder as an owned `Any` type so that it can be `downcast` to a specific - /// implementation before calling it's `finish` method. - fn into_any(self) -> Box { - Box::new(self) + /// Returns the builder as an non-mutable `Any` reference. + fn as_any(&self) -> &Any { + self + } + + /// Returns the builder as an mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut Any { + self } /// Returns the number of array slots in the builder @@ -352,22 +374,25 @@ where self.len } - /// Builds the `ListArray` - fn finish(self) -> ListArray { + /// Builds the `ListArray` and reset this builder. 
+ fn finish(&mut self) -> ListArray { let len = self.len(); + self.len = 0; let values_arr = self .values_builder - .into_any() - .downcast::() + .as_any_mut() + .downcast_mut::() .unwrap() .finish(); let values_data = values_arr.data(); + let offset_buffer = self.offsets_builder.finish(); let null_bit_buffer = self.bitmap_builder.finish(); + self.offsets_builder.push(0).unwrap(); let data = ArrayData::builder(DataType::List(Box::new(values_data.data_type().clone()))) .len(len) .null_count(len - bit_util::count_set_bits(null_bit_buffer.data())) - .add_buffer(self.offsets_builder.finish()) + .add_buffer(offset_buffer) .add_child_data(values_data) .null_bit_buffer(null_bit_buffer) .build(); @@ -403,10 +428,14 @@ pub struct BinaryArrayBuilder { impl ArrayBuilder for BinaryArrayBuilder { type ArrayType = BinaryArray; - /// Returns the builder as an owned `Any` type so that it can be `downcast` to a specific - /// implementation before calling it's `finish` method. - fn into_any(self) -> Box { - Box::new(self) + /// Returns the builder as an non-mutable `Any` reference. + fn as_any(&self) -> &Any { + self + } + + /// Returns the builder as an mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut Any { + self } /// Returns the number of array slots in the builder @@ -414,8 +443,8 @@ impl ArrayBuilder for BinaryArrayBuilder { self.builder.len() } - /// Builds the `BinaryArray` - fn finish(self) -> BinaryArray { + /// Builds the `BinaryArray` and reset this builder. + fn finish(&mut self) -> BinaryArray { BinaryArray::from(self.builder.finish()) } } @@ -462,7 +491,7 @@ mod tests { #[test] fn test_builder_i32_empty() { - let b = Int32BufferBuilder::new(5); + let mut b = Int32BufferBuilder::new(5); assert_eq!(0, b.len()); assert_eq!(16, b.capacity()); let a = b.finish(); @@ -500,6 +529,27 @@ mod tests { assert_eq!(80, a.len()); } + #[test] + fn test_builder_finish() { + let mut b = Int32BufferBuilder::new(5); + assert_eq!(16, b.capacity()); + for i in 0..10 { + b.push(i).unwrap(); + } + let mut a = b.finish(); + assert_eq!(40, a.len()); + assert_eq!(0, b.len()); + assert_eq!(0, b.capacity()); + + // Try build another buffer after cleaning up. 
+ for i in 0..20 { + b.push(i).unwrap() + } + assert_eq!(32, b.capacity()); + a = b.finish(); + assert_eq!(80, a.len()); + } + #[test] fn test_reserve() { let mut b = UInt8BufferBuilder::new(2); @@ -702,6 +752,20 @@ mod tests { } } + #[test] + fn test_primitive_array_builder_finish() { + let mut builder = Int32Builder::new(5); + builder.push_slice(&[2, 4, 6, 8]).unwrap(); + let mut arr = builder.finish(); + assert_eq!(4, arr.len()); + assert_eq!(0, builder.len()); + + builder.push_slice(&[1, 3, 5, 7, 9]).unwrap(); + arr = builder.finish(); + assert_eq!(5, arr.len()); + assert_eq!(0, builder.len()); + } + #[test] fn test_list_array_builder() { let values_builder = Int32Builder::new(10); @@ -768,6 +832,27 @@ mod tests { assert_eq!(3, list_array.value_length(2)); } + #[test] + fn test_list_array_builder_finish() { + let values_builder = Int32Array::builder(5); + let mut builder = ListArrayBuilder::new(values_builder); + + builder.values().push_slice(&[1, 2, 3]).unwrap(); + builder.append(true).unwrap(); + builder.values().push_slice(&[4, 5, 6]).unwrap(); + builder.append(true).unwrap(); + + let mut arr = builder.finish(); + assert_eq!(2, arr.len()); + assert_eq!(0, builder.len()); + + builder.values().push_slice(&[7, 8, 9]).unwrap(); + builder.append(true).unwrap(); + arr = builder.finish(); + assert_eq!(1, arr.len()); + assert_eq!(0, builder.len()); + } + #[test] fn test_list_list_array_builder() { let primitive_builder = Int32Builder::new(10); @@ -857,6 +942,23 @@ mod tests { assert_eq!(5, binary_array.value_length(2)); } + #[test] + fn test_binary_array_builder_finish() { + let mut builder = BinaryArrayBuilder::new(10); + + builder.push_string("hello").unwrap(); + builder.push_string("world").unwrap(); + + let mut arr = builder.finish(); + assert_eq!(2, arr.len()); + assert_eq!(0, builder.len()); + + builder.push_string("arrow").unwrap(); + arr = builder.finish(); + assert_eq!(1, arr.len()); + assert_eq!(0, builder.len()); + } + #[test] fn test_binary_array_builder_push_string() { let mut builder = BinaryArrayBuilder::new(20); From ddc5e9a721451d8492dfdf797402b2ab7e5e3845 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 22 Dec 2018 23:23:08 +0100 Subject: [PATCH 100/328] ARROW-4106: [Python] Tests fail to run because hypothesis update broke its API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: Krisztián Szűcs Closes #3250 from kszucs/ARROW-4106 and squashes the following commits: d87cc14c don't use defines_strategy --- python/pyarrow/tests/strategies.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py index bc8ded2e896d0..c95b75b270e56 100644 --- a/python/pyarrow/tests/strategies.py +++ b/python/pyarrow/tests/strategies.py @@ -89,43 +89,35 @@ metadata = st.dictionaries(st.text(), st.text()) -@st.defines_strategy def fields(type_strategy=primitive_types): return st.builds(pa.field, name=custom_text, type=type_strategy, nullable=st.booleans(), metadata=metadata) -@st.defines_strategy def list_types(item_strategy=primitive_types): return st.builds(pa.list_, item_strategy) -@st.defines_strategy def struct_types(item_strategy=primitive_types): return st.builds(pa.struct, st.lists(fields(item_strategy))) -@st.defines_strategy def complex_types(inner_strategy=primitive_types): return list_types(inner_strategy) | struct_types(inner_strategy) -@st.defines_strategy def nested_list_types(item_strategy=primitive_types): return 
st.recursive(item_strategy, list_types) -@st.defines_strategy def nested_struct_types(item_strategy=primitive_types): return st.recursive(item_strategy, struct_types) -@st.defines_strategy def nested_complex_types(inner_strategy=primitive_types): return st.recursive(inner_strategy, complex_types) -@st.defines_strategy def schemas(type_strategy=primitive_types): return st.builds(pa.schema, st.lists(fields(type_strategy))) From ffc8877aa6c2d80418cb805076fc0545e6b0204c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sun, 23 Dec 2018 00:43:38 +0100 Subject: [PATCH 101/328] ARROW-4101: [C++] Identity BinaryType cast MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: Krisztián Szűcs Author: François Saint-Jacques Closes #3245 from fsaintjacques/ARROW-4101-cast-identity and squashes the following commits: 4bb2fb7b parametrize 4319bace ARROW-4101: Identity BinaryType cast --- cpp/src/arrow/compute/kernels/cast.cc | 4 +++- python/pyarrow/tests/test_array.py | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 2ce0702f20c32..7976ef0beffc6 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -1258,7 +1258,9 @@ class CastKernel : public UnaryKernel { FN(TimestampType, Date64Type); \ FN(TimestampType, Int64Type); -#define BINARY_CASES(FN, IN_TYPE) FN(BinaryType, StringType); +#define BINARY_CASES(FN, IN_TYPE) \ + FN(BinaryType, BinaryType); \ + FN(BinaryType, StringType); #define STRING_CASES(FN, IN_TYPE) \ FN(StringType, StringType); \ diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 352c8558c881b..3d3402139cb43 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -755,6 +755,26 @@ def test_cast_date64_to_int(): assert result.equals(expected) +@pytest.mark.parametrize(('ty', 'values'), [ + ('bool', [True, False, True]), + ('uint8', range(0, 255)), + ('int8', range(0, 128)), + ('uint16', range(0, 10)), + ('int16', range(0, 10)), + ('uint32', range(0, 10)), + ('int32', range(0, 10)), + ('uint64', range(0, 10)), + ('int64', range(0, 10)), + ('float', [0.0, 0.1, 0.2]), + ('double', [0.0, 0.1, 0.2]), + ('string', ['a', 'b', 'c']), + ('binary', [b'a', b'b', b'c']) +]) +def test_cast_identities(ty, values): + arr = pa.array(values, type=ty) + assert arr.cast(ty).equals(arr) + + pickle_test_parametrize = pytest.mark.parametrize( ('data', 'typ'), [ From e179dda432e1f67020a0c832a11fc496eec67e7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sun, 23 Dec 2018 00:59:06 +0100 Subject: [PATCH 102/328] ARROW-4098: [Python] Deprecate open_file/open_stream top level APIs in favor of using ipc namespace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will mean some user code will have to change (e.g. https://github.com/apache/spark/blob/8edae94fa7ec1a1cc2c69e0924da0da85d4aac83/python/pyspark/serializers.py#L240) but it is the most maintainable option for the long term. 
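For downstream projects the migration is mechanical: swap the top-level helpers for their `pyarrow.ipc` counterparts. A minimal before/after sketch (the tiny in-memory stream built here is only to keep the example self-contained):

    import pyarrow as pa

    batch = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], ['f0'])
    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchStreamWriter(sink, batch.schema)
    writer.write_batch(batch)
    writer.close()
    source = sink.getvalue()

    # reader = pa.open_stream(source)    # deprecated, now emits a warning
    reader = pa.ipc.open_stream(source)   # preferred; pa.ipc.open_file() for the file format
    table = reader.read_all()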
We should not remove the deprecated APIs until we are confident that at least our open source downstream dependencies are taken care of Author: Krisztián Szűcs Author: Wes McKinney Closes #3244 from wesm/ARROW-4098 and squashes the following commits: ec3c54be update ipc doc 9017e7ff remove accidentally committed file 36b6a861 Fix up API docs 7ed5343e Deprecate pyarrow.open_stream/open_file in favor of ipc-namespaced versions --- docs/source/python/api.rst | 4 ++-- docs/source/python/ipc.rst | 10 ++++---- python/pyarrow/__init__.py | 22 ++++++++++++++++++ python/pyarrow/tests/test_ipc.py | 40 ++++++++++++++++---------------- 4 files changed, 49 insertions(+), 27 deletions(-) diff --git a/docs/source/python/api.rst b/docs/source/python/api.rst index 40ccb68c36f38..0bad76ff0bf63 100644 --- a/docs/source/python/api.rst +++ b/docs/source/python/api.rst @@ -259,14 +259,14 @@ Serialization and IPC .. autosummary:: :toctree: generated/ + ipc.open_file + ipc.open_stream Message MessageReader RecordBatchFileReader RecordBatchFileWriter RecordBatchStreamReader RecordBatchStreamWriter - open_file - open_stream read_message read_record_batch get_record_batch_size diff --git a/docs/source/python/ipc.rst b/docs/source/python/ipc.rst index 3f7e787cd0c2f..812d843b0df56 100644 --- a/docs/source/python/ipc.rst +++ b/docs/source/python/ipc.rst @@ -84,11 +84,11 @@ particular stream. Now we can do: Now ``buf`` contains the complete stream as an in-memory byte buffer. We can read such a stream with :class:`~pyarrow.RecordBatchStreamReader` or the -convenience function ``pyarrow.open_stream``: +convenience function ``pyarrow.ipc.open_stream``: .. ipython:: python - reader = pa.open_stream(buf) + reader = pa.ipc.open_stream(buf) reader.schema batches = [b for b in reader] @@ -125,11 +125,11 @@ The :class:`~pyarrow.RecordBatchFileWriter` has the same API as The difference between :class:`~pyarrow.RecordBatchFileReader` and :class:`~pyarrow.RecordBatchStreamReader` is that the input source must have a ``seek`` method for random access. The stream reader only requires read -operations. We can also use the ``pyarrow.open_file`` method to open a file: +operations. We can also use the ``pyarrow.ipc.open_file`` method to open a file: .. ipython:: python - reader = pa.open_file(buf) + reader = pa.ipc.open_file(buf) Because we have access to the entire payload, we know the number of record batches in the file, and can read any at random: @@ -149,7 +149,7 @@ DataFrame output: .. 
ipython:: python - df = pa.open_file(buf).read_pandas() + df = pa.ipc.open_file(buf).read_pandas() df[:5] Arbitrary Object Serialization diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 7f0a371b4bfd2..0d1c1bef87a1c 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -146,6 +146,28 @@ def parse_git(root, **kwargs): open_stream, open_file, serialize_pandas, deserialize_pandas) +import pyarrow.ipc as ipc + + +def open_stream(source): + """ + pyarrow.open_stream deprecated since 0.12, use pyarrow.ipc.open_stream + """ + import warnings + warnings.warn("pyarrow.open_stream is deprecated, please use " + "pyarrow.ipc.open_stream") + return ipc.open_stream(source) + + +def open_file(source): + """ + pyarrow.open_file deprecated since 0.12, use pyarrow.ipc.open_file + """ + import warnings + warnings.warn("pyarrow.open_file is deprecated, please use " + "pyarrow.ipc.open_file") + return ipc.open_file(source) + localfs = LocalFileSystem.get_instance() diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index 0fb66f8fa4d43..67a91b9ddd440 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -80,7 +80,7 @@ def _check_roundtrip(self, as_table=False): _, batches = self.write_batches(as_table=as_table) file_contents = pa.BufferReader(self.get_source()) - reader = pa.open_file(file_contents) + reader = pa.ipc.open_file(file_contents) assert reader.num_record_batches == len(batches) @@ -121,7 +121,7 @@ def stream_fixture(): def test_empty_file(): buf = b'' with pytest.raises(pa.ArrowInvalid): - pa.open_file(pa.BufferReader(buf)) + pa.ipc.open_file(pa.BufferReader(buf)) def test_file_simple_roundtrip(file_fixture): @@ -142,7 +142,7 @@ def test_file_read_all(sink_factory): _, batches = fixture.write_batches() file_contents = pa.BufferReader(fixture.get_source()) - reader = pa.open_file(file_contents) + reader = pa.ipc.open_file(file_contents) result = reader.read_all() expected = pa.Table.from_batches(batches) @@ -154,8 +154,8 @@ def test_open_file_from_buffer(file_fixture): _, batches = file_fixture.write_batches() source = file_fixture.get_source() - reader1 = pa.open_file(source) - reader2 = pa.open_file(pa.BufferReader(source)) + reader1 = pa.ipc.open_file(source) + reader2 = pa.ipc.open_file(pa.BufferReader(source)) reader3 = pa.RecordBatchFileReader(source) result1 = reader1.read_all() @@ -170,7 +170,7 @@ def test_file_read_pandas(file_fixture): frames, _ = file_fixture.write_batches() file_contents = pa.BufferReader(file_fixture.get_source()) - reader = pa.open_file(file_contents) + reader = pa.ipc.open_file(file_contents) result = reader.read_pandas() expected = pd.concat(frames) @@ -189,8 +189,8 @@ def test_file_pathlib(file_fixture, tmpdir): with open(path, 'wb') as f: f.write(source) - t1 = pa.open_file(pathlib.Path(path)).read_all() - t2 = pa.open_file(pa.OSFile(path)).read_all() + t1 = pa.ipc.open_file(pathlib.Path(path)).read_all() + t2 = pa.ipc.open_file(pa.OSFile(path)).read_all() assert t1.equals(t2) @@ -198,7 +198,7 @@ def test_file_pathlib(file_fixture, tmpdir): def test_empty_stream(): buf = io.BytesIO(b'') with pytest.raises(pa.ArrowInvalid): - pa.open_stream(buf) + pa.ipc.open_stream(buf) def test_stream_categorical_roundtrip(stream_fixture): @@ -213,7 +213,7 @@ def test_stream_categorical_roundtrip(stream_fixture): writer.write_batch(pa.RecordBatch.from_pandas(df)) writer.close() - table = (pa.open_stream(pa.BufferReader(stream_fixture.get_source())) + table = 
(pa.ipc.open_stream(pa.BufferReader(stream_fixture.get_source())) .read_all()) assert_frame_equal(table.to_pandas(), df) @@ -223,8 +223,8 @@ def test_open_stream_from_buffer(stream_fixture): _, batches = stream_fixture.write_batches() source = stream_fixture.get_source() - reader1 = pa.open_stream(source) - reader2 = pa.open_stream(pa.BufferReader(source)) + reader1 = pa.ipc.open_stream(source) + reader2 = pa.ipc.open_stream(pa.BufferReader(source)) reader3 = pa.RecordBatchStreamReader(source) result1 = reader1.read_all() @@ -250,7 +250,7 @@ def test_stream_write_dispatch(stream_fixture): writer.write(batch) writer.close() - table = (pa.open_stream(pa.BufferReader(stream_fixture.get_source())) + table = (pa.ipc.open_stream(pa.BufferReader(stream_fixture.get_source())) .read_all()) assert_frame_equal(table.to_pandas(), pd.concat([df, df], ignore_index=True)) @@ -271,7 +271,7 @@ def test_stream_write_table_batches(stream_fixture): writer.write_table(table, chunksize=15) writer.close() - batches = list(pa.open_stream(stream_fixture.get_source())) + batches = list(pa.ipc.open_stream(stream_fixture.get_source())) assert list(map(len, batches)) == [10, 15, 5, 10] result_table = pa.Table.from_batches(batches) @@ -283,7 +283,7 @@ def test_stream_write_table_batches(stream_fixture): def test_stream_simple_roundtrip(stream_fixture): _, batches = stream_fixture.write_batches() file_contents = pa.BufferReader(stream_fixture.get_source()) - reader = pa.open_stream(file_contents) + reader = pa.ipc.open_stream(file_contents) assert reader.schema.equals(batches[0].schema) @@ -301,7 +301,7 @@ def test_stream_simple_roundtrip(stream_fixture): def test_stream_read_all(stream_fixture): _, batches = stream_fixture.write_batches() file_contents = pa.BufferReader(stream_fixture.get_source()) - reader = pa.open_stream(file_contents) + reader = pa.ipc.open_stream(file_contents) result = reader.read_all() expected = pa.Table.from_batches(batches) @@ -311,7 +311,7 @@ def test_stream_read_all(stream_fixture): def test_stream_read_pandas(stream_fixture): frames, _ = stream_fixture.write_batches() file_contents = stream_fixture.get_source() - reader = pa.open_stream(file_contents) + reader = pa.ipc.open_stream(file_contents) result = reader.read_pandas() expected = pd.concat(frames) @@ -393,7 +393,7 @@ def run(self): connection, client_address = self._sock.accept() try: source = connection.makefile(mode='rb') - reader = pa.open_stream(source) + reader = pa.ipc.open_stream(source) self._schema = reader.schema if self._do_read_all: self._table = reader.read_all() @@ -494,7 +494,7 @@ def test_ipc_stream_no_batches(): writer.close() source = sink.getvalue() - reader = pa.open_stream(source) + reader = pa.ipc.open_stream(source) result = reader.read_all() assert result.schema.equals(table.schema) @@ -636,7 +636,7 @@ def write_file(batch, sink): def read_file(source): - reader = pa.open_file(source) + reader = pa.ipc.open_file(source) return [reader.get_batch(i) for i in range(reader.num_record_batches)] From 6578089472958b20126d5c56fe8f8737b02b5544 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 23 Dec 2018 01:37:13 +0100 Subject: [PATCH 103/328] ARROW-2592: [Python] Add "ignore_metadata" option to Table.to_pandas This option circumvents the index reconstruction logic if there is `'pandas'` metadata. This can also be achieved using `table.cast(table.schema.remove_metadata()).to_pandas()`, but this makes it more obvious and discoverable to users. 
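A small sketch of the two equivalent spellings (the DataFrame is arbitrary; in both cases the stored index metadata is ignored, so a default RangeIndex comes back):

    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({'a': [1, 2, 3]}, index=['one', 'two', 'three'])
    table = pa.Table.from_pandas(df)

    # existing workaround: strip the schema metadata before converting
    expected = table.cast(table.schema.remove_metadata()).to_pandas()

    # new, more discoverable option
    result = table.to_pandas(ignore_metadata=True)

    assert result.equals(expected)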
A user had an issue reading a Parquet file with some old metadata that we are no longer able to correctly process. Author: Wes McKinney Closes #3239 from wesm/ARROW-2592 and squashes the following commits: 82ac7a01 Unit test for ignore_metadata option 6c4246ef Test stub 8cf45a7a Add ignore_metadata option to Table.to_pandas --- python/pyarrow/pandas_compat.py | 6 ++++-- python/pyarrow/table.pxi | 16 ++++++++++++---- python/pyarrow/tests/test_convert_pandas.py | 11 +++++++++++ 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 0eebcf6e1eec3..6acca0c35cf40 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -548,7 +548,8 @@ def _make_datetimetz(tz): # Converting pyarrow.Table efficiently to pandas.DataFrame -def table_to_blockmanager(options, table, memory_pool, categories=None): +def table_to_blockmanager(options, table, memory_pool, categories=None, + ignore_metadata=False): from pyarrow.compat import DatetimeTZDtype index_columns = [] @@ -560,7 +561,8 @@ def table_to_blockmanager(options, table, memory_pool, categories=None): row_count = table.num_rows metadata = schema.metadata - has_pandas_metadata = metadata is not None and b'pandas' in metadata + has_pandas_metadata = (not ignore_metadata and metadata is not None + and b'pandas' in metadata) if has_pandas_metadata: pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8')) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 4d52f26e749fc..29a784d60f5a8 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -890,7 +890,7 @@ cdef class RecordBatch: def to_pandas(self, MemoryPool memory_pool=None, categories=None, bint strings_to_categorical=False, bint zero_copy_only=False, bint integer_object_nulls=False, bint date_as_object=False, - bint use_threads=True): + bint use_threads=True, bint ignore_metadata=False): """ Convert the arrow::RecordBatch to a pandas DataFrame @@ -911,6 +911,9 @@ cdef class RecordBatch: Cast dates to objects use_threads: boolean, default True Whether to parallelize the conversion using multiple threads + ignore_metadata : boolean, default False + If True, do not use the 'pandas' metadata to reconstruct the + DataFrame index, if present Returns ------- @@ -921,7 +924,8 @@ cdef class RecordBatch: strings_to_categorical=strings_to_categorical, zero_copy_only=zero_copy_only, integer_object_nulls=integer_object_nulls, - date_as_object=date_as_object, use_threads=use_threads + date_as_object=date_as_object, use_threads=use_threads, + ignore_metadata=ignore_metadata ) @classmethod @@ -1385,7 +1389,7 @@ cdef class Table: def to_pandas(self, MemoryPool memory_pool=None, categories=None, bint strings_to_categorical=False, bint zero_copy_only=False, bint integer_object_nulls=False, bint date_as_object=False, - bint use_threads=True): + bint use_threads=True, bint ignore_metadata=False): """ Convert the arrow::Table to a pandas DataFrame @@ -1406,6 +1410,9 @@ cdef class Table: Cast dates to objects use_threads: boolean, default True Whether to parallelize the conversion using multiple threads + ignore_metadata : boolean, default False + If True, do not use the 'pandas' metadata to reconstruct the + DataFrame index, if present Returns ------- @@ -1422,7 +1429,8 @@ cdef class Table: use_threads=use_threads) mgr = pdcompat.table_to_blockmanager(options, self, memory_pool, - categories) + categories, + ignore_metadata=ignore_metadata) return pd.DataFrame(mgr) def 
to_pydict(self): diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 41bcae83db516..12214847f3e53 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -376,6 +376,17 @@ def test_metadata_with_mixed_types(self): assert data_column['pandas_type'] == 'bytes' assert data_column['numpy_type'] == 'object' + def test_ignore_metadata(self): + df = pd.DataFrame({'a': [1, 2, 3], 'b': ['foo', 'bar', 'baz']}, + index=['one', 'two', 'three']) + table = pa.Table.from_pandas(df) + + result = table.to_pandas(ignore_metadata=True) + expected = (table.cast(table.schema.remove_metadata()) + .to_pandas()) + + assert result.equals(expected) + def test_list_metadata(self): df = pd.DataFrame({'data': [[1], [2, 3, 4], [5] * 7]}) schema = pa.schema([pa.field('data', type=pa.list_(pa.int64()))]) From 79d8bf2de3c4d7f6e17d6bea5d5d477310e58668 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sun, 23 Dec 2018 16:16:46 +0100 Subject: [PATCH 104/328] =?UTF-8?q?ARROW-4107:=20[Python]=C2=A0Use=20ninja?= =?UTF-8?q?=20in=20pyarrow=20manylinux1=20build?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: Uwe L. Korn Closes #3253 from xhochy/ARROW-4107 and squashes the following commits: 6ed02454 ARROW-4107:  Use ninja in pyarrow manylinux1 build --- python/manylinux1/build_arrow.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/python/manylinux1/build_arrow.sh b/python/manylinux1/build_arrow.sh index b1d8f8588dfc5..902bcb3eff360 100755 --- a/python/manylinux1/build_arrow.sh +++ b/python/manylinux1/build_arrow.sh @@ -35,6 +35,7 @@ cd /arrow/python # PyArrow build configuration export PYARROW_BUILD_TYPE='release' +export PYARROW_CMAKE_GENERATOR='Ninja' export PYARROW_WITH_ORC=1 export PYARROW_WITH_PARQUET=1 export PYARROW_WITH_PLASMA=1 From 6b798875c0e5a328e007f7ce634a8b4ce50eb553 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 24 Dec 2018 10:12:30 -0600 Subject: [PATCH 105/328] ARROW-4109: [Packaging] Missing glog dependency from arrow-cpp conda recipe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up of https://github.com/apache/arrow/pull/3234 Crossbow builds: [kszucs/crossbow/build-386](https://github.com/kszucs/crossbow/branches/all?utf8=%E2%9C%93&query=build-386) Author: Krisztián Szűcs Closes #3255 from kszucs/conda_recipe_glogs and squashes the following commits: ed110abb6 add glog to arrow-cpp --- dev/tasks/conda-recipes/arrow-cpp/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml index 725fd2291e75a..129136e2580ea 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml +++ b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml @@ -44,6 +44,7 @@ requirements: - rapidjson - zlib - glog + - gflags - snappy - brotli - zstd From 385c4384eb0dcc384b443f24765c64e9d6d88d28 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Mon, 24 Dec 2018 10:26:37 -0600 Subject: [PATCH 106/328] ARROW-3938: [Packaging] Stop to refer java/pom.xml to get version information Author: Kouhei Sutou Closes #3259 from kou/stop-to-refer-pom-xml-for-version and squashes the following commits: 3dce0a035 Stop to refer java/pom.xml to get version information --- c_glib/Makefile.am | 3 +- c_glib/configure.ac | 8 +- c_glib/doc/parquet-glib/parquet-glib-docs.xml | 4 + c_glib/meson.build | 3 +- c_glib/test/test-cuda.rb | 2 +- 
cpp/CMakeLists.txt | 8 +- cpp/src/gandiva/CMakeLists.txt | 2 + cpp/src/plasma/CMakeLists.txt | 1 + dev/release/00-prepare.sh | 92 ++++++++++++++++--- matlab/CMakeLists.txt | 6 +- python/setup.py | 30 +----- r/DESCRIPTION | 2 +- ruby/red-arrow-cuda/.gitignore | 2 - .../red-arrow-cuda/lib/arrow-cuda/version.rb | 19 ++-- ruby/red-arrow-cuda/red-arrow-cuda.gemspec | 6 +- ruby/red-arrow-cuda/test/helper.rb | 3 - ruby/red-arrow-cuda/version.rb | 71 -------------- ruby/red-arrow/.gitignore | 2 - .../red-arrow/lib/arrow/version.rb | 11 ++- ruby/red-arrow/red-arrow.gemspec | 6 +- ruby/red-arrow/test/helper.rb | 2 - ruby/red-arrow/version.rb | 71 -------------- ruby/red-gandiva/.gitignore | 2 - ruby/red-gandiva/lib/gandiva/version.rb | 26 ++++++ ruby/red-gandiva/red-gandiva.gemspec | 6 +- ruby/red-gandiva/test/helper.rb | 3 - ruby/red-gandiva/version.rb | 71 -------------- ruby/red-parquet/.gitignore | 2 - ruby/red-parquet/lib/parquet/version.rb | 26 ++++++ ruby/red-parquet/red-parquet.gemspec | 6 +- ruby/red-parquet/test/helper.rb | 3 - ruby/red-parquet/version.rb | 71 -------------- ruby/red-plasma/.gitignore | 2 - ruby/red-plasma/lib/plasma/version.rb | 26 ++++++ ruby/red-plasma/red-plasma.gemspec | 6 +- ruby/red-plasma/test/helper.rb | 3 - ruby/red-plasma/version.rb | 71 -------------- 37 files changed, 212 insertions(+), 466 deletions(-) rename c_glib/tool/get-version.py => ruby/red-arrow-cuda/lib/arrow-cuda/version.rb (69%) mode change 100755 => 100644 delete mode 100644 ruby/red-arrow-cuda/version.rb rename c_glib/tool/Makefile.am => ruby/red-arrow/lib/arrow/version.rb (80%) delete mode 100644 ruby/red-arrow/version.rb create mode 100644 ruby/red-gandiva/lib/gandiva/version.rb delete mode 100644 ruby/red-gandiva/version.rb create mode 100644 ruby/red-parquet/lib/parquet/version.rb delete mode 100644 ruby/red-parquet/version.rb create mode 100644 ruby/red-plasma/lib/plasma/version.rb delete mode 100644 ruby/red-plasma/version.rb diff --git a/c_glib/Makefile.am b/c_glib/Makefile.am index 149894c8241c2..53bb57e411b0c 100644 --- a/c_glib/Makefile.am +++ b/c_glib/Makefile.am @@ -24,8 +24,7 @@ SUBDIRS = \ parquet-glib \ plasma-glib \ doc \ - example \ - tool + example EXTRA_DIST = \ Gemfile \ diff --git a/c_glib/configure.ac b/c_glib/configure.ac index a6d8ed8e1d185..c63bfffa1d7f8 100644 --- a/c_glib/configure.ac +++ b/c_glib/configure.ac @@ -17,12 +17,7 @@ AC_PREREQ(2.65) -m4_define([arrow_glib_version], - m4_esyscmd(grep "^ " "$(dirname $0)/../java/pom.xml" | \ - sed -E \ - -e 's/(^ )//g' \ - -e 's/(<\/version>$)//g' | \ - tr -d '\n')) +m4_define([arrow_glib_version], 0.12.0-SNAPSHOT) AC_INIT([arrow-glib], arrow_glib_version, [https://issues.apache.org/jira/browse/ARROW], @@ -283,7 +278,6 @@ AC_CONFIG_FILES([ doc/plasma-glib/entities.xml example/Makefile example/lua/Makefile - tool/Makefile ]) AC_OUTPUT diff --git a/c_glib/doc/parquet-glib/parquet-glib-docs.xml b/c_glib/doc/parquet-glib/parquet-glib-docs.xml index 0f2c30ba7863f..4485a6765cb6b 100644 --- a/c_glib/doc/parquet-glib/parquet-glib-docs.xml +++ b/c_glib/doc/parquet-glib/parquet-glib-docs.xml @@ -57,6 +57,10 @@ Index of deprecated API + + Index of new symbols in 0.12.0 + + Index of new symbols in 0.11.0 diff --git a/c_glib/meson.build b/c_glib/meson.build index 194421c13d316..c2cf36c5d7c02 100644 --- a/c_glib/meson.build +++ b/c_glib/meson.build @@ -23,8 +23,7 @@ project('arrow-glib', 'c', 'cpp', 'cpp_std=c++11', ]) -python = find_program('python', 'python3', 'python2') -version = run_command(python, 
'tool/get-version.py').stdout().strip() +version = '0.12.0-SNAPSHOT' if version.endswith('-SNAPSHOT') version_numbers = version.split('-')[0].split('.') version_tag = version.split('-')[1] diff --git a/c_glib/test/test-cuda.rb b/c_glib/test/test-cuda.rb index 32d486ef8ba97..ae915307b70f0 100644 --- a/c_glib/test/test-cuda.rb +++ b/c_glib/test/test-cuda.rb @@ -58,7 +58,7 @@ def test_export Arrow = GI.load("Arrow") ArrowCUDA = GI.load("ArrowCUDA") -manager = ArrowCUDA::ADeviceManager.new +manager = ArrowCUDA::DeviceManager.new context = manager.get_context(0) serialized_handle = #{serialized_handle.to_s.dump} handle = ArrowCUDA::IPCMemoryHandle.new(serialized_handle) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1672245924fb5..006b406ba0762 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -18,12 +18,8 @@ cmake_minimum_required(VERSION 3.2) message(STATUS "Building using CMake version: ${CMAKE_VERSION}") -# Extract Arrow version number -file(READ "${CMAKE_CURRENT_SOURCE_DIR}/../java/pom.xml" POM_XML) -string(REGEX MATCHALL - "\n [^<]+" ARROW_VERSION_TAG "${POM_XML}") -string(REGEX REPLACE - "(\n |)" "" ARROW_VERSION "${ARROW_VERSION_TAG}") +set(ARROW_VERSION "0.12.0-SNAPSHOT") + string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}") diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index d28c372a9e6ab..b574c67af3811 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +set(GANDIVA_VERSION "${ARROW_VERSION}") + # For "make gandiva" to build everything Gandiva-related add_custom_target(gandiva-all) add_custom_target(gandiva) diff --git a/cpp/src/plasma/CMakeLists.txt b/cpp/src/plasma/CMakeLists.txt index a71acf8ae43d8..2be5740bdd670 100644 --- a/cpp/src/plasma/CMakeLists.txt +++ b/cpp/src/plasma/CMakeLists.txt @@ -23,6 +23,7 @@ add_dependencies(plasma-all plasma plasma-tests plasma-benchmarks) # For the moment, Plasma is versioned like Arrow project(plasma VERSION "${ARROW_BASE_VERSION}") +set(PLASMA_VERSION "${ARROW_VERSION}") find_package(Threads) diff --git a/dev/release/00-prepare.sh b/dev/release/00-prepare.sh index 5ff4ddc8f28a6..35d1998496fe0 100755 --- a/dev/release/00-prepare.sh +++ b/dev/release/00-prepare.sh @@ -21,6 +21,78 @@ set -e SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +update_versions() { + local version=$1 + + cd "${SOURCE_DIR}/../../cpp" + sed -i.bak -r -e \ + "s/^set\(ARROW_VERSION \".+\"\)/set(ARROW_VERSION \"${version}\")/" \ + CMakeLists.txt + rm -f CMakeLists.txt.bak + git add CMakeLists.txt + cd - + + cd "${SOURCE_DIR}/../../c_glib" + sed -i.bak -r -e \ + "s/^m4_define\(\[arrow_glib_version\], .+\)/m4_define([arrow_glib_version], ${version})/" \ + configure.ac + sed -i.bak -r -e \ + "s/^version = '.+'/version = '${version}'/" \ + meson.build + rm -f configure.ac.bak meson.build.bak + git add configure.ac meson.build + cd - + + # We can enable this when Arrow JS uses the same version. 
+ # cd "${SOURCE_DIR}/../../js" + # sed -i.bak -r -e \ + # "s/^ \"version\": \".+\"/ \"version\": \"${version}\"/" \ + # package.json + # rm -f package.json + # git add package.json + # cd - + + cd "${SOURCE_DIR}/../../matlab" + sed -i.bak -r -e \ + "s/^set\(MLARROW_VERSION \".+\"\)/set(MLARROW_VERSION \"${version}\")/" \ + CMakeLists.txt + rm -f CMakeLists.txt.bak + git add CMakeLists.txt + cd - + + cd "${SOURCE_DIR}/../../python" + sed -i.bak -r -e \ + "s/^default_version: '.+'/default_version = '${version}'/" \ + setup.py + rm -f setup.py.bak + git add setup.py + cd - + + cd "${SOURCE_DIR}/../../r" + sed -i.bak -r -e \ + "s/^Version: .+/Version: ${version}/" \ + DESCRIPTION + rm -f DESCRIPTION.bak + git add DESCRIPTION + cd - + + cd "${SOURCE_DIR}/../../ruby" + sed -i.bak -r -e \ + "s/^ VERSION = \".+\"/ VERSION = \"${version}\"/g" \ + */*/*/version.rb + rm -f */*/*/version.rb.bak + git add */*/*/version.rb + cd - + + cd "${SOURCE_DIR}/../../rust" + sed -i.bak -r -e \ + "s/^version = \".+\"/version = \"${version}\"/g" \ + Cargo.toml + rm -f Cargo.toml.bak + git add Cargo.toml + cd - +} + if [ "$#" -eq 2 ]; then version=$1 nextVersion=$2 @@ -43,14 +115,19 @@ if [ "$#" -eq 2 ]; then echo "prepare release ${version} on tag ${tag} then reset to version ${nextVersionSNAPSHOT}" - cd "${SOURCE_DIR}/../../java" + update_versions "${version}" + git commit -m "[Release] Update versions for ${version}" + cd "${SOURCE_DIR}/../../java" mvn release:clean mvn release:prepare -Dtag=${tag} -DreleaseVersion=${version} -DautoVersionSubmodules -DdevelopmentVersion=${nextVersionSNAPSHOT} - cd - - echo "Updating .deb package names for $nextVersion" + echo "Updating versions for ${nextVersionSNAPSHOT}" + update_versions "${nextVersionSNAPSHOT}" + git commit -m "[Release] Update versions for ${nextVersionSNAPSHOT}" + + echo "Updating .deb package names for ${nextVersion}" deb_lib_suffix=$(echo $version | sed -r -e 's/^[0-9]+\.([0-9]+)\.[0-9]+$/\1/') next_deb_lib_suffix=$(echo $nextVersion | sed -r -e 's/^[0-9]+\.([0-9]+)\.[0-9]+$/\1/') cd $SOURCE_DIR/../tasks/linux-packages/ @@ -76,15 +153,6 @@ if [ "$#" -eq 2 ]; then git commit -m "[Release] Update .deb package names for $nextVersion" cd - - echo "prepare release ${version} in Rust crate" - - cd "${SOURCE_DIR}/../../rust" - sed -i.bak -r -e "s/version = \"$version\"/version = \"$nextVersion\"/g" Cargo.toml - rm -f Cargo.toml.bak - git add Cargo.toml - git commit -m "[Release] Update Rust Cargo.toml version for $nextVersion" - cd - - echo "Finish staging binary artifacts by running: sh dev/release/01-perform.sh" else diff --git a/matlab/CMakeLists.txt b/matlab/CMakeLists.txt index 897086637beaf..a6371d1dee4fa 100755 --- a/matlab/CMakeLists.txt +++ b/matlab/CMakeLists.txt @@ -18,7 +18,11 @@ cmake_minimum_required(VERSION 3.2) set(CMAKE_CXX_STANDARD 11) -project(mlarrow) +set(MLARROW_VERSION "0.12.0-SNAPSHOT") +string(REGEX MATCH + "^[0-9]+\\.[0-9]+\\.[0-9]+" MLARROW_BASE_VERSION "${MLARROW_VERSION}") + +project(mlarrow VERSION "${MLARROW_BASE_VERSION}") # Grab CMAKE Modules from the CPP interface set(CPP_CMAKE_MODULES "${CMAKE_SOURCE_DIR}/../cpp/cmake_modules") diff --git a/python/setup.py b/python/setup.py index b8d192ddaec45..742851918c124 100755 --- a/python/setup.py +++ b/python/setup.py @@ -483,39 +483,15 @@ def _move_shared_libs_unix(build_prefix, build_lib, lib_name): # If the event of not running from a git clone (e.g. 
from a git archive # or a Python sdist), see if we can set the version number ourselves +default_version = '0.12.0-SNAPSHOT' if (not os.path.exists('../.git') and not os.environ.get('SETUPTOOLS_SCM_PRETEND_VERSION')): if os.path.exists('PKG-INFO'): # We're probably in a Python sdist, setuptools_scm will handle fine pass - elif os.path.exists('../java/pom.xml'): - # We're probably in a git archive - import xml.etree.ElementTree as ET - tree = ET.parse('../java/pom.xml') - version_tag = list(tree.getroot().findall( - '{http://maven.apache.org/POM/4.0.0}version'))[0] - use_setuptools_scm = False - os.environ['SETUPTOOLS_SCM_PRETEND_VERSION'] = \ - version_tag.text.replace("-SNAPSHOT", "a0") else: - raise RuntimeError("""\ - No reliable source available to get Arrow version. - - This is either because you copied the python/ directory yourself - outside of a git clone or source archive, or because you ran - `pip install` on the python/ directory. - - * Recommended workaround: first run `python sdist`, then - `pip install` the resulting source distribution. - - * If you're looking for an editable (in-place) install, - `python setup.py develop` should work fine in place of - `pip install -e .`. - - * If you really want to `pip install` the python/ directory, - set the SETUPTOOLS_SCM_PRETEND_VERSION environment variable - to force the Arrow version to the given value. - """) + os.environ['SETUPTOOLS_SCM_PRETEND_VERSION'] = \ + default_version.replace('-SNAPSHOT', 'a0') def parse_git(root, **kwargs): diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 5f93c83f236eb..10c28c3e7c42e 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -1,6 +1,6 @@ Package: arrow Title: R Integration to 'Apache' 'Arrow' -Version: 0.0.0.9000 +Version: 0.12.0-SNAPSHOT Authors@R: c( person("Romain", "François", email = "romain@rstudio.com", role = c("aut", "cre")), person("Javier", "Luraschi", email = "javier@rstudio.com", role = c("ctb")), diff --git a/ruby/red-arrow-cuda/.gitignore b/ruby/red-arrow-cuda/.gitignore index 3ec5511596306..779545d9026f1 100644 --- a/ruby/red-arrow-cuda/.gitignore +++ b/ruby/red-arrow-cuda/.gitignore @@ -15,6 +15,4 @@ # specific language governing permissions and limitations # under the License. -/lib/arrow-cuda/version.rb - /pkg/ diff --git a/c_glib/tool/get-version.py b/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb old mode 100755 new mode 100644 similarity index 69% rename from c_glib/tool/get-version.py rename to ruby/red-arrow-cuda/lib/arrow-cuda/version.rb index aacea6da3e865..6426d2db7a471 --- a/c_glib/tool/get-version.py +++ b/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb @@ -1,5 +1,3 @@ -#!/usr/bin/env python -# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,13 +15,12 @@ # specific language governing permissions and limitations # under the License. 
-import os -import re +module ArrowCUDA + VERSION = "0.12.0-SNAPSHOT" -root = os.environ.get("MESON_SOURCE_ROOT", ".") -pom_xml = os.path.join(root, "..", "java", "pom.xml") -with open(pom_xml) as pom: - version_tag = re.search('^ (.+)', - pom.read(), - re.MULTILINE) - print(version_tag.group(1)) + module Version + numbers, TAG = VERSION.split("-") + MAJOR, MINOR, MICRO = numbers.split(".").collect(&:to_i) + STRING = VERSION + end +end diff --git a/ruby/red-arrow-cuda/red-arrow-cuda.gemspec b/ruby/red-arrow-cuda/red-arrow-cuda.gemspec index b2ee982945605..0c593ff37aa3a 100644 --- a/ruby/red-arrow-cuda/red-arrow-cuda.gemspec +++ b/ruby/red-arrow-cuda/red-arrow-cuda.gemspec @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -require_relative "version" +require_relative "lib/arrow-cuda/version" Gem::Specification.new do |spec| spec.name = "red-arrow-cuda" @@ -25,9 +25,9 @@ Gem::Specification.new do |spec| ArrowCUDA::Version::MAJOR.to_s, ArrowCUDA::Version::MINOR.to_s, ArrowCUDA::Version::MICRO.to_s, - # "beta1", + ArrowCUDA::Version::TAG, ] - spec.version = version_components.join(".") + spec.version = version_components.compact.join(".") spec.homepage = "https://arrow.apache.org/" spec.authors = ["Apache Arrow Developers"] spec.email = ["dev@arrow.apache.org"] diff --git a/ruby/red-arrow-cuda/test/helper.rb b/ruby/red-arrow-cuda/test/helper.rb index 4d018332677ec..045eb10eea5d0 100644 --- a/ruby/red-arrow-cuda/test/helper.rb +++ b/ruby/red-arrow-cuda/test/helper.rb @@ -15,9 +15,6 @@ # specific language governing permissions and limitations # under the License. -require_relative "../../red-arrow/version" -require_relative "../version" - require "arrow-cuda" require "test-unit" diff --git a/ruby/red-arrow-cuda/version.rb b/ruby/red-arrow-cuda/version.rb deleted file mode 100644 index c8bbbc7165f29..0000000000000 --- a/ruby/red-arrow-cuda/version.rb +++ /dev/null @@ -1,71 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -require "pathname" - -version_rb_path = Pathname.new(__FILE__) -base_dir = version_rb_path.dirname -pom_xml_path = base_dir.join("..", "..", "java", "pom.xml") -lib_version_rb_path = base_dir.join("lib", "arrow-cuda", "version.rb") - -need_update = false -if not lib_version_rb_path.exist? - need_update = true -elsif version_rb_path.mtime > lib_version_rb_path.mtime - need_update = true -elsif pom_xml_path.exist? 
and pom_xml_path.mtime > lib_version_rb_path.mtime - need_update = true -end - -if need_update - version = pom_xml_path.read.scan(/^ (.+?)<\/version>/)[0][0] - major, minor, micro, tag = version.split(/[.-]/) - lib_version_rb_path.open("w") do |lib_version_rb| - lib_version_rb.puts(<<-RUBY) -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -module ArrowCUDA - module Version - MAJOR = #{major} - MINOR = #{minor} - MICRO = #{micro} - TAG = #{tag ? tag.dump : nil} - STRING = #{version.dump} - end - - VERSION = Version::STRING -end - RUBY - end -end - -require_relative "lib/arrow-cuda/version" diff --git a/ruby/red-arrow/.gitignore b/ruby/red-arrow/.gitignore index 9fcc9cdc16527..779545d9026f1 100644 --- a/ruby/red-arrow/.gitignore +++ b/ruby/red-arrow/.gitignore @@ -15,6 +15,4 @@ # specific language governing permissions and limitations # under the License. -/lib/arrow/version.rb - /pkg/ diff --git a/c_glib/tool/Makefile.am b/ruby/red-arrow/lib/arrow/version.rb similarity index 80% rename from c_glib/tool/Makefile.am rename to ruby/red-arrow/lib/arrow/version.rb index 5d7498b957520..8ff0779f0851f 100644 --- a/c_glib/tool/Makefile.am +++ b/ruby/red-arrow/lib/arrow/version.rb @@ -15,5 +15,12 @@ # specific language governing permissions and limitations # under the License. -EXTRA_DIST = \ - get-version.py +module Arrow + VERSION = "0.12.0-SNAPSHOT" + + module Version + numbers, TAG = VERSION.split("-") + MAJOR, MINOR, MICRO = numbers.split(".").collect(&:to_i) + STRING = VERSION + end +end diff --git a/ruby/red-arrow/red-arrow.gemspec b/ruby/red-arrow/red-arrow.gemspec index 3f0f68aa332cf..9db755fc67ccc 100644 --- a/ruby/red-arrow/red-arrow.gemspec +++ b/ruby/red-arrow/red-arrow.gemspec @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -require_relative "version" +require_relative "lib/arrow/version" Gem::Specification.new do |spec| spec.name = "red-arrow" @@ -25,9 +25,9 @@ Gem::Specification.new do |spec| Arrow::Version::MAJOR.to_s, Arrow::Version::MINOR.to_s, Arrow::Version::MICRO.to_s, - # "beta1", + Arrow::Version::TAG, ] - spec.version = version_components.join(".") + spec.version = version_components.compact.join(".") spec.homepage = "https://arrow.apache.org/" spec.authors = ["Apache Arrow Developers"] spec.email = ["dev@arrow.apache.org"] diff --git a/ruby/red-arrow/test/helper.rb b/ruby/red-arrow/test/helper.rb index 2aa868bfa7c01..12f12d3a192e9 100644 --- a/ruby/red-arrow/test/helper.rb +++ b/ruby/red-arrow/test/helper.rb @@ -15,8 +15,6 @@ # specific language governing permissions and limitations # under the License. 
-require_relative "../version" - require "arrow" require "pathname" diff --git a/ruby/red-arrow/version.rb b/ruby/red-arrow/version.rb deleted file mode 100644 index e8f043f897d1f..0000000000000 --- a/ruby/red-arrow/version.rb +++ /dev/null @@ -1,71 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -require "pathname" - -version_rb_path = Pathname.new(__FILE__) -base_dir = version_rb_path.dirname -pom_xml_path = base_dir.join("..", "..", "java", "pom.xml") -lib_version_rb_path = base_dir.join("lib", "arrow", "version.rb") - -need_update = false -if not lib_version_rb_path.exist? - need_update = true -elsif version_rb_path.mtime > lib_version_rb_path.mtime - need_update = true -elsif pom_xml_path.exist? and pom_xml_path.mtime > lib_version_rb_path.mtime - need_update = true -end - -if need_update - version = pom_xml_path.read.scan(/^ (.+?)<\/version>/)[0][0] - major, minor, micro, tag = version.split(/[.-]/) - lib_version_rb_path.open("w") do |lib_version_rb| - lib_version_rb.puts(<<-RUBY) -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -module Arrow - module Version - MAJOR = #{major} - MINOR = #{minor} - MICRO = #{micro} - TAG = #{tag ? tag.dump : nil} - STRING = #{version.dump} - end - - VERSION = Version::STRING -end - RUBY - end -end - -require_relative "lib/arrow/version" diff --git a/ruby/red-gandiva/.gitignore b/ruby/red-gandiva/.gitignore index 99c64a5d3dd52..779545d9026f1 100644 --- a/ruby/red-gandiva/.gitignore +++ b/ruby/red-gandiva/.gitignore @@ -15,6 +15,4 @@ # specific language governing permissions and limitations # under the License. -/lib/gandiva/version.rb - /pkg/ diff --git a/ruby/red-gandiva/lib/gandiva/version.rb b/ruby/red-gandiva/lib/gandiva/version.rb new file mode 100644 index 0000000000000..dbdaf36857bd8 --- /dev/null +++ b/ruby/red-gandiva/lib/gandiva/version.rb @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Gandiva + VERSION = "0.12.0-SNAPSHOT" + + module Version + numbers, TAG = VERSION.split("-") + MAJOR, MINOR, MICRO = numbers.split(".").collect(&:to_i) + STRING = VERSION + end +end diff --git a/ruby/red-gandiva/red-gandiva.gemspec b/ruby/red-gandiva/red-gandiva.gemspec index 7f84faf2ec035..857559e021183 100644 --- a/ruby/red-gandiva/red-gandiva.gemspec +++ b/ruby/red-gandiva/red-gandiva.gemspec @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -require_relative "version" +require_relative "lib/gandiva/version" Gem::Specification.new do |spec| spec.name = "red-gandiva" @@ -25,9 +25,9 @@ Gem::Specification.new do |spec| Gandiva::Version::MAJOR.to_s, Gandiva::Version::MINOR.to_s, Gandiva::Version::MICRO.to_s, - # "beta1", + Gandiva::Version::TAG, ] - spec.version = version_components.join(".") + spec.version = version_components.compact.join(".") spec.homepage = "https://arrow.apache.org/" spec.authors = ["Apache Arrow Developers"] spec.email = ["dev@arrow.apache.org"] diff --git a/ruby/red-gandiva/test/helper.rb b/ruby/red-gandiva/test/helper.rb index 2f4e7dc46b1e3..9c291f7aebf42 100644 --- a/ruby/red-gandiva/test/helper.rb +++ b/ruby/red-gandiva/test/helper.rb @@ -15,9 +15,6 @@ # specific language governing permissions and limitations # under the License. -require_relative "../../red-arrow/version" -require_relative "../version" - require "gandiva" require "test-unit" diff --git a/ruby/red-gandiva/version.rb b/ruby/red-gandiva/version.rb deleted file mode 100644 index ba769796accad..0000000000000 --- a/ruby/red-gandiva/version.rb +++ /dev/null @@ -1,71 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -require "pathname" - -version_rb_path = Pathname.new(__FILE__) -base_dir = version_rb_path.dirname -pom_xml_path = base_dir.join("..", "..", "java", "pom.xml") -lib_version_rb_path = base_dir.join("lib", "gandiva", "version.rb") - -need_update = false -if not lib_version_rb_path.exist? - need_update = true -elsif version_rb_path.mtime > lib_version_rb_path.mtime - need_update = true -elsif pom_xml_path.exist? 
and pom_xml_path.mtime > lib_version_rb_path.mtime - need_update = true -end - -if need_update - version = pom_xml_path.read.scan(/^ (.+?)<\/version>/)[0][0] - major, minor, micro, tag = version.split(/[.-]/) - lib_version_rb_path.open("w") do |lib_version_rb| - lib_version_rb.puts(<<-RUBY) -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -module Gandiva - module Version - MAJOR = #{major} - MINOR = #{minor} - MICRO = #{micro} - TAG = #{tag ? tag.dump : nil} - STRING = #{version.dump} - end - - VERSION = Version::STRING -end - RUBY - end -end - -require_relative "lib/gandiva/version" diff --git a/ruby/red-parquet/.gitignore b/ruby/red-parquet/.gitignore index 542f54c56a5ca..779545d9026f1 100644 --- a/ruby/red-parquet/.gitignore +++ b/ruby/red-parquet/.gitignore @@ -15,6 +15,4 @@ # specific language governing permissions and limitations # under the License. -/lib/parquet/version.rb - /pkg/ diff --git a/ruby/red-parquet/lib/parquet/version.rb b/ruby/red-parquet/lib/parquet/version.rb new file mode 100644 index 0000000000000..997a92e4c321d --- /dev/null +++ b/ruby/red-parquet/lib/parquet/version.rb @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Parquet + VERSION = "0.12.0-SNAPSHOT" + + module Version + numbers, TAG = VERSION.split("-") + MAJOR, MINOR, MICRO = numbers.split(".").collect(&:to_i) + STRING = VERSION + end +end diff --git a/ruby/red-parquet/red-parquet.gemspec b/ruby/red-parquet/red-parquet.gemspec index 491648b7af97f..7688dcb5708f9 100644 --- a/ruby/red-parquet/red-parquet.gemspec +++ b/ruby/red-parquet/red-parquet.gemspec @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. 
-require_relative "version" +require_relative "lib/parquet/version" Gem::Specification.new do |spec| spec.name = "red-parquet" @@ -25,9 +25,9 @@ Gem::Specification.new do |spec| Parquet::Version::MAJOR.to_s, Parquet::Version::MINOR.to_s, Parquet::Version::MICRO.to_s, - # "beta1", + Parquet::Version::TAG, ] - spec.version = version_components.join(".") + spec.version = version_components.compact.join(".") spec.homepage = "https://arrow.apache.org/" spec.authors = ["Apache Arrow Developers"] spec.email = ["dev@arrow.apache.org"] diff --git a/ruby/red-parquet/test/helper.rb b/ruby/red-parquet/test/helper.rb index 43013ab5686d6..169d1df424ea7 100644 --- a/ruby/red-parquet/test/helper.rb +++ b/ruby/red-parquet/test/helper.rb @@ -15,9 +15,6 @@ # specific language governing permissions and limitations # under the License. -require_relative "../../red-arrow/version" -require_relative "../version" - require "parquet" require "tempfile" diff --git a/ruby/red-parquet/version.rb b/ruby/red-parquet/version.rb deleted file mode 100644 index 06045167e9495..0000000000000 --- a/ruby/red-parquet/version.rb +++ /dev/null @@ -1,71 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -require "pathname" - -version_rb_path = Pathname.new(__FILE__) -base_dir = version_rb_path.dirname -pom_xml_path = base_dir.join("..", "..", "java", "pom.xml") -lib_version_rb_path = base_dir.join("lib", "parquet", "version.rb") - -need_update = false -if not lib_version_rb_path.exist? - need_update = true -elsif version_rb_path.mtime > lib_version_rb_path.mtime - need_update = true -elsif pom_xml_path.exist? and pom_xml_path.mtime > lib_version_rb_path.mtime - need_update = true -end - -if need_update - version = pom_xml_path.read.scan(/^ (.+?)<\/version>/)[0][0] - major, minor, micro, tag = version.split(/[.-]/) - lib_version_rb_path.open("w") do |lib_version_rb| - lib_version_rb.puts(<<-RUBY) -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -module Parquet - module Version - MAJOR = #{major} - MINOR = #{minor} - MICRO = #{micro} - TAG = #{tag ? 
tag.dump : nil} - STRING = #{version.dump} - end - - VERSION = Version::STRING -end - RUBY - end -end - -require_relative "lib/parquet/version" diff --git a/ruby/red-plasma/.gitignore b/ruby/red-plasma/.gitignore index bd50ff8187f6d..779545d9026f1 100644 --- a/ruby/red-plasma/.gitignore +++ b/ruby/red-plasma/.gitignore @@ -15,6 +15,4 @@ # specific language governing permissions and limitations # under the License. -/lib/plasma/version.rb - /pkg/ diff --git a/ruby/red-plasma/lib/plasma/version.rb b/ruby/red-plasma/lib/plasma/version.rb new file mode 100644 index 0000000000000..e88f2def82ec1 --- /dev/null +++ b/ruby/red-plasma/lib/plasma/version.rb @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Plasma + VERSION = "0.12.0-SNAPSHOT" + + module Version + numbers, TAG = VERSION.split("-") + MAJOR, MINOR, MICRO = numbers.split(".").collect(&:to_i) + STRING = VERSION + end +end diff --git a/ruby/red-plasma/red-plasma.gemspec b/ruby/red-plasma/red-plasma.gemspec index 53b4d1ec0dade..09b4a551ab571 100644 --- a/ruby/red-plasma/red-plasma.gemspec +++ b/ruby/red-plasma/red-plasma.gemspec @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -require_relative "version" +require_relative "lib/plasma/version" Gem::Specification.new do |spec| spec.name = "red-plasma" @@ -25,9 +25,9 @@ Gem::Specification.new do |spec| Plasma::Version::MAJOR.to_s, Plasma::Version::MINOR.to_s, Plasma::Version::MICRO.to_s, - # "beta1", + Plasma::Version::TAG, ] - spec.version = version_components.join(".") + spec.version = version_components.compact.join(".") spec.homepage = "https://arrow.apache.org/" spec.authors = ["Apache Arrow Developers"] spec.email = ["dev@arrow.apache.org"] diff --git a/ruby/red-plasma/test/helper.rb b/ruby/red-plasma/test/helper.rb index d66d43ecc94c0..255cad2870044 100644 --- a/ruby/red-plasma/test/helper.rb +++ b/ruby/red-plasma/test/helper.rb @@ -15,9 +15,6 @@ # specific language governing permissions and limitations # under the License. -require_relative "../../red-arrow/version" -require_relative "../version" - require "plasma" require "tempfile" diff --git a/ruby/red-plasma/version.rb b/ruby/red-plasma/version.rb deleted file mode 100644 index 015aac9594d26..0000000000000 --- a/ruby/red-plasma/version.rb +++ /dev/null @@ -1,71 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -require "pathname" - -version_rb_path = Pathname.new(__FILE__) -base_dir = version_rb_path.dirname -pom_xml_path = base_dir.join("..", "..", "java", "pom.xml") -lib_version_rb_path = base_dir.join("lib", "plasma", "version.rb") - -need_update = false -if not lib_version_rb_path.exist? - need_update = true -elsif version_rb_path.mtime > lib_version_rb_path.mtime - need_update = true -elsif pom_xml_path.exist? and pom_xml_path.mtime > lib_version_rb_path.mtime - need_update = true -end - -if need_update - version = pom_xml_path.read.scan(/^ (.+?)<\/version>/)[0][0] - major, minor, micro, tag = version.split(/[.-]/) - lib_version_rb_path.open("w") do |lib_version_rb| - lib_version_rb.puts(<<-RUBY) -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -module Plasma - module Version - MAJOR = #{major} - MINOR = #{minor} - MICRO = #{micro} - TAG = #{tag ? tag.dump : nil} - STRING = #{version.dump} - end - - VERSION = Version::STRING -end - RUBY - end -end - -require_relative "lib/plasma/version" From cd543b9756d602ebabda749c60a14d629db7a35a Mon Sep 17 00:00:00 2001 From: Tanya Schlusser Date: Mon, 24 Dec 2018 15:19:26 -0600 Subject: [PATCH 107/328] ARROW-2504: [Website] Add ApacheCon NA link Place a 234x60 link in the navbar next to the Apache Software Foundation link. 
Screenshot for full width: ![image](https://user-images.githubusercontent.com/7432951/48995065-1213f700-f10c-11e8-944f-e5d26f1bfe8b.png) Screenshot for medium width: ![image](https://user-images.githubusercontent.com/7432951/48995076-1b04c880-f10c-11e8-8abf-a3d2ef204596.png) Disappears in small width -- screenshot: ![image](https://user-images.githubusercontent.com/7432951/48995108-31128900-f10c-11e8-96a7-066e377081fa.png) Author: Tanya Schlusser Closes #3030 from tanyaschlusser/master and squashes the following commits: 77c6cd323 ARROW-2504: Add ApacheCon NA link --- site/_includes/header.html | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/site/_includes/header.html b/site/_includes/header.html index e0f23ecd24e30..5344501acfe29 100644 --- a/site/_includes/header.html +++ b/site/_includes/header.html @@ -77,9 +77,14 @@ - - - + From cfaea429d0f2d3d9baa2a10d6da759ffd0f9d7f8 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Dec 2018 16:49:22 -0600 Subject: [PATCH 108/328] PARQUET-1481: [C++] Throw exception when encountering bad Thrift metadata in RecordReader Author: Wes McKinney Closes #3242 from wesm/PARQUET-1481 and squashes the following commits: b074227ba Add test case with example corrupt data file 59400a2f1 Throw exception when encountering bad Thrift metadata in RecordReader --- .../parquet/arrow/arrow-reader-writer-test.cc | 29 ++++++++++++++----- cpp/src/parquet/arrow/record_reader.cc | 8 +++-- cpp/submodules/parquet-testing | 2 +- 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index 4e62a22c350ff..bb9763224f3ba 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -2291,21 +2291,34 @@ TEST(TestImpalaConversion, ArrowTimestampToImpalaTimestamp) { ASSERT_EQ(expected, calculated); } -TEST(TestArrowReaderAdHoc, Int96BadMemoryAccess) { - // PARQUET-995 +void TryReadDataFile(const std::string& testing_file_path, bool should_succeed = true) { std::string dir_string(test::get_data_dir()); std::stringstream ss; - ss << dir_string << "/" - << "alltypes_plain.parquet"; + ss << dir_string << "/" << testing_file_path; auto path = ss.str(); auto pool = ::arrow::default_memory_pool(); std::unique_ptr arrow_reader; - ASSERT_NO_THROW( - arrow_reader.reset(new FileReader(pool, ParquetFileReader::OpenFile(path, false)))); - std::shared_ptr<::arrow::Table> table; - ASSERT_OK_NO_THROW(arrow_reader->ReadTable(&table)); + try { + arrow_reader.reset(new FileReader(pool, ParquetFileReader::OpenFile(path, false))); + std::shared_ptr<::arrow::Table> table; + ASSERT_OK(arrow_reader->ReadTable(&table)); + } catch (const ParquetException& e) { + if (should_succeed) { + FAIL() << "Exception thrown when reading file: " << e.what(); + } + } +} + +TEST(TestArrowReaderAdHoc, Int96BadMemoryAccess) { + // PARQUET-995 + TryReadDataFile("alltypes_plain.parquet"); +} + +TEST(TestArrowReaderAdHoc, CorruptedSchema) { + // PARQUET-1481 + TryReadDataFile("bad_data/PARQUET-1481.parquet", false /* should_succeed */); } class TestArrowReaderAdHocSparkAndHvr diff --git a/cpp/src/parquet/arrow/record_reader.cc b/cpp/src/parquet/arrow/record_reader.cc index d1bf2c5cdfdc6..4a988dacdd9aa 100644 --- a/cpp/src/parquet/arrow/record_reader.cc +++ b/cpp/src/parquet/arrow/record_reader.cc @@ -850,8 +850,12 @@ std::shared_ptr RecordReader::Make(const ColumnDescriptor* descr, case Type::FIXED_LEN_BYTE_ARRAY: return 
std::shared_ptr( new RecordReader(new TypedRecordReader(descr, pool))); - default: - DCHECK(false); + default: { + // PARQUET-1481: This can occur if the file is corrupt + std::stringstream ss; + ss << "Invalid physical column type: " << static_cast(descr->physical_type()); + throw ParquetException(ss.str()); + } } // Unreachable code, but supress compiler warning return nullptr; diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index 92a8e6c2efdce..8eb0213c49175 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit 92a8e6c2efdce1925c605d6313994db2c94478fb +Subproject commit 8eb0213c491752c9bbb1b884fcbb21deb548e464 From 49f93e0dc06023d664ecc82b625ad4d72f0fc0cd Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 26 Dec 2018 10:43:21 -0600 Subject: [PATCH 109/328] ARROW-4114: [C++] Add python to requirements list for running on ubuntu Author: Micah Kornfield Closes #3260 from emkornfield/update_build_instructions and squashes the following commits: 80c112b25 Add python to requirements list for running on ubuntu --- cpp/README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/README.md b/cpp/README.md index b602bef1c7710..7e92648dc37aa 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -33,6 +33,10 @@ Building Arrow requires: * CMake 3.2 or higher * Boost +Testing arrow with ctest requires: + +* python + On Ubuntu/Debian you can install the requirements with: ```shell @@ -43,7 +47,8 @@ sudo apt-get install \ libboost-dev \ libboost-filesystem-dev \ libboost-regex-dev \ - libboost-system-dev + libboost-system-dev \ + python ``` On Alpine Linux: From 91c585d54b635212c78106790cf0ebed020fc758 Mon Sep 17 00:00:00 2001 From: Praveen Date: Wed, 26 Dec 2018 11:27:16 -0600 Subject: [PATCH 110/328] ARROW-4100: [Gandiva][C++] Fix regex for special character dot. Make dot a special character that needs to be escaped, else it does not match the sql standards. Author: Praveen Closes #3241 from praveenbingo/regex and squashes the following commits: 7792fec23 ARROW-4100: Add more valgrind suppressions. 12fb046e2 ARROW-4050: Fix valgrind suppressions. e97d38375 ARROW-4050: Fix regex for special character dot. 
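The change hinges on how Gandiva translates a SQL LIKE pattern into a PCRE regex in RegexUtil::SqlLikePatternToPcre: every PCRE metacharacter in the pattern must be escaped so that it matches literally, and because '.' was missing from the metacharacter set, the SQL pattern "abc." compiled to the regex "abc." and wrongly matched "abcd". The standalone C++ sketch below illustrates that translation under simplified assumptions; it ignores the escape_char argument of the real function, SqlLikeToPcre is a hypothetical name, and this is not the Gandiva implementation itself.

#include <iostream>
#include <set>
#include <string>

std::string SqlLikeToPcre(const std::string& sql_pattern) {
  // PCRE metacharacters that must be matched literally in a SQL LIKE pattern.
  // '.' is the character added by ARROW-4100; without it, "abc." would
  // compile to the regex "abc." and match "abcd".
  static const std::set<char> pcre_specials = {'[', ']', '(', ')', '|', '^', '-', '+',
                                               '*', '?', '{', '}', '$', '\\', '.'};
  std::string pcre;
  for (char c : sql_pattern) {
    if (c == '%') {
      pcre += ".*";  // SQL '%' matches any run of characters
    } else if (c == '_') {
      pcre += '.';   // SQL '_' matches exactly one character
    } else if (pcre_specials.count(c) != 0) {
      pcre += '\\';  // escape the metacharacter so it is matched literally
      pcre += c;
    } else {
      pcre += c;
    }
  }
  return pcre;
}

int main() {
  std::cout << SqlLikeToPcre("abc.") << std::endl;      // prints abc\.
  std::cout << SqlLikeToPcre("xy 123z%") << std::endl;  // prints xy 123z.*
  return 0;
}

With '.' in the metacharacter set, "abc." becomes "abc\." and matches only the literal four characters, which is what the new TestDot case in like_holder_test.cc below verifies.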
--- cpp/src/gandiva/CMakeLists.txt | 1 + cpp/src/gandiva/like_holder_test.cc | 10 ++++++++++ cpp/src/gandiva/regex_util.cc | 2 +- cpp/valgrind.supp | 16 ++++++++++++++-- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index b574c67af3811..6b67c8699c511 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -168,6 +168,7 @@ ADD_GANDIVA_TEST(selection_vector_test) ADD_GANDIVA_TEST(lru_cache_test) ADD_GANDIVA_TEST(to_date_holder_test) ADD_GANDIVA_TEST(simple_arena_test) +ADD_GANDIVA_TEST(like_holder_test) if (ARROW_GANDIVA_JAVA) add_subdirectory(jni) diff --git a/cpp/src/gandiva/like_holder_test.cc b/cpp/src/gandiva/like_holder_test.cc index 3e3cd37c4fed1..d0ce8bb595021 100644 --- a/cpp/src/gandiva/like_holder_test.cc +++ b/cpp/src/gandiva/like_holder_test.cc @@ -84,6 +84,16 @@ TEST_F(TestLikeHolder, TestRegexEscape) { EXPECT_EQ(res, "%hello_abc.def#"); } +TEST_F(TestLikeHolder, TestDot) { + std::shared_ptr like_holder; + + auto status = LikeHolder::Make("abc.", &like_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& like = *like_holder; + EXPECT_FALSE(like("abcd")); +} + TEST_F(TestLikeHolder, TestOptimise) { // optimise for 'starts_with' auto fnode = LikeHolder::TryOptimize(BuildLike("xy 123z%")); diff --git a/cpp/src/gandiva/regex_util.cc b/cpp/src/gandiva/regex_util.cc index 1d3860615d57f..abdd579d1f5e4 100644 --- a/cpp/src/gandiva/regex_util.cc +++ b/cpp/src/gandiva/regex_util.cc @@ -20,7 +20,7 @@ namespace gandiva { const std::set RegexUtil::pcre_regex_specials_ = { - '[', ']', '(', ')', '|', '^', '-', '+', '*', '?', '{', '}', '$', '\\'}; + '[', ']', '(', ')', '|', '^', '-', '+', '*', '?', '{', '}', '$', '\\', '.'}; Status RegexUtil::SqlLikePatternToPcre(const std::string& sql_pattern, char escape_char, std::string& pcre_pattern) { diff --git a/cpp/valgrind.supp b/cpp/valgrind.supp index d8bc8fb28f2d5..08076aade4d9e 100644 --- a/cpp/valgrind.supp +++ b/cpp/valgrind.supp @@ -25,11 +25,23 @@ :Conditional jump or move depends on uninitialised value(s) Memcheck:Cond ... - fun:_ZN3re23RE2C1E* + fun:*re2*RE2* } { :Use of uninitialised value of size 8 Memcheck:Value8 ... - fun:_ZN3re23RE2C1E* + fun:*re2*RE2* +} +{ + :Conditional jump or move depends on uninitialised value(s) + Memcheck:Cond + ... + fun:*re2*Prog* +} +{ + :Use of uninitialised value of size 8 + Memcheck:Value8 + ... + fun:*re2*Prog* } From 2849f46fcc203e4c9c5e09b3065ffb92cd133dce Mon Sep 17 00:00:00 2001 From: Pindikura Ravindra Date: Wed, 26 Dec 2018 13:44:49 -0600 Subject: [PATCH 111/328] ARROW-4115: [Gandiva] zero-init boolean data bufs Author: Pindikura Ravindra Closes #3263 from pravindra/arrow-4115 and squashes the following commits: d6b7834e3 ARROW-4115: zero-init boolean data bufs --- cpp/src/gandiva/projector.cc | 6 +++--- cpp/src/gandiva/tests/projector_test.cc | 9 +++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index d5902fc72f16d..4cb352f2ad3c1 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -155,10 +155,10 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, int64_t data_len = arrow::BitUtil::BytesForBits(num_records * fw_type.bit_width()); ARROW_RETURN_NOT_OK(arrow::AllocateBuffer(pool, data_len, &data)); - // Valgrind detects unitialized memory at byte level. 
Boolean types use bits - // and can leave buffer memory uninitialized in the last byte. + // This is not strictly required but valgrind gets confused and detects this + // as uninitialized memory access. See arrow::util::SetBitTo(). if (type->id() == arrow::Type::BOOL) { - data->mutable_data()[data_len - 1] = 0; + memset(data->mutable_data(), 0, data_len); } *array_data = arrow::ArrayData::Make(type, num_records, {null_bitmap, data}); diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc index 1aeb43b49b0dc..33cdce07ae6f7 100644 --- a/cpp/src/gandiva/tests/projector_test.cc +++ b/cpp/src/gandiva/tests/projector_test.cc @@ -227,10 +227,11 @@ static void TestArithmeticOpsForType(arrow::MemoryPool* pool) { EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data - int num_records = 4; - std::vector input0 = {1, 2, 53, 84}; - std::vector input1 = {10, 15, 23, 84}; - std::vector validity = {true, true, true, true}; + int num_records = 12; + std::vector input0 = {1, 2, 53, 84, 5, 15, 0, 1, 52, 83, 4, 120}; + std::vector input1 = {10, 15, 23, 84, 4, 51, 68, 9, 16, 18, 19, 37}; + std::vector validity = {true, true, true, true, true, true, + true, true, true, true, true, true}; auto array0 = MakeArrowArray(input0, validity); auto array1 = MakeArrowArray(input1, validity); From 46ecbb64e13d942803a21b23e5d7b7eff46bc752 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 26 Dec 2018 14:50:40 -0600 Subject: [PATCH 112/328] ARROW-4103: [Docs] Move documentation build instructions from source/python/development.rst to docs/README.md Author: Wes McKinney Closes #3243 from wesm/ARROW-4103 and squashes the following commits: 6873ac1c0 Direct user to project build instructions in docs/README.md --- docs/README.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 docs/README.md diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000000000..e20b59df109cb --- /dev/null +++ b/docs/README.md @@ -0,0 +1,30 @@ + + +# Apache Arrow Documentation + +This directory contains source files for building the main project +documentation. This includes the [Arrow columnar format specification][2]. + +Instructions for building the documentation site are found in +[docs/source/python/development.rst][1]. The build depends on the API +documentation for some of the project subcomponents. + +[1]: https://github.com/apache/arrow/blob/master/docs/source/python/development.rst#building-the-documentation +[2]: https://github.com/apache/arrow/tree/master/docs/source/format \ No newline at end of file From 0c2f3541efc86923fc2aff30efe664fb48ba1efd Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 26 Dec 2018 18:00:15 -0600 Subject: [PATCH 113/328] ARROW-4116: [Python] Add warning to development instructions to avoid virtualenv when using Anaconda/miniconda Author: Wes McKinney Closes #3264 from wesm/ARROW-4116 and squashes the following commits: 1a2d8c590 Add warning to avoid virtualenv when using Anaconda/miniconda --- docs/source/python/development.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/python/development.rst b/docs/source/python/development.rst index 1dcfda862817f..63e6051a7b864 100644 --- a/docs/source/python/development.rst +++ b/docs/source/python/development.rst @@ -113,6 +113,13 @@ about our build toolchain: Using pip ~~~~~~~~~ +.. 
warning:: + + If you installed Python using the Anaconda distribution or `Miniconda + `_, you cannot currently use ``virtualenv`` + to manage your development. Please follow the conda-based development + instructions instead. + On macOS, install all dependencies through Homebrew that are required for building Arrow C++: From 9c76600af968d6f22642ae06fab13d16813fc009 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Thu, 27 Dec 2018 09:52:36 +0900 Subject: [PATCH 114/328] ARROW-4112: [Packaging] Add support for Gandiva .deb Author: Kouhei Sutou Closes #3258 from kou/linux-packages-gandiva and squashes the following commits: fa621931 Add support for Gandiva .deb/.rpm --- dev/release/00-prepare.sh | 2 +- dev/release/rat_exclude_files.txt | 8 ++ .../apt/debian-stretch/Dockerfile | 4 + .../apt/ubuntu-bionic/Dockerfile | 1 + .../apt/ubuntu-cosmic/Dockerfile | 1 + .../apt/ubuntu-xenial/Dockerfile | 1 + .../linux-packages/debian.ubuntu-trusty/rules | 1 - dev/tasks/linux-packages/debian/control | 82 +++++++++++++++++++ .../debian/gir1.2-gandiva-1.0.install | 1 + .../debian/libgandiva-dev.install | 3 + .../debian/libgandiva-glib-dev.install | 5 ++ .../debian/libgandiva-glib-doc.doc-base | 9 ++ .../debian/libgandiva-glib-doc.install | 1 + .../debian/libgandiva-glib-doc.links | 3 + .../debian/libgandiva-glib12.install | 1 + .../debian/libgandiva12.install | 2 + dev/tasks/linux-packages/debian/rules | 3 +- dev/tasks/linux-packages/yum/arrow.spec.in | 3 +- .../linux-packages/yum/centos-6/Dockerfile | 6 +- dev/tasks/tasks.yml | 26 ++++++ 20 files changed, 155 insertions(+), 8 deletions(-) create mode 100644 dev/tasks/linux-packages/debian/gir1.2-gandiva-1.0.install create mode 100644 dev/tasks/linux-packages/debian/libgandiva-dev.install create mode 100644 dev/tasks/linux-packages/debian/libgandiva-glib-dev.install create mode 100644 dev/tasks/linux-packages/debian/libgandiva-glib-doc.doc-base create mode 100644 dev/tasks/linux-packages/debian/libgandiva-glib-doc.install create mode 100644 dev/tasks/linux-packages/debian/libgandiva-glib-doc.links create mode 100644 dev/tasks/linux-packages/debian/libgandiva-glib12.install create mode 100644 dev/tasks/linux-packages/debian/libgandiva12.install diff --git a/dev/release/00-prepare.sh b/dev/release/00-prepare.sh index 35d1998496fe0..141882e22566a 100755 --- a/dev/release/00-prepare.sh +++ b/dev/release/00-prepare.sh @@ -136,7 +136,7 @@ if [ "$#" -eq 2 ]; then ${target} \ $(echo $target | sed -e "s/${deb_lib_suffix}/${next_deb_lib_suffix}/") done - deb_lib_suffix_substitute_pattern="s/(lib(arrow|parquet)[-a-z]*)${deb_lib_suffix}/\\1${next_deb_lib_suffix}/g" + deb_lib_suffix_substitute_pattern="s/(lib(arrow|gandiva|parquet|plasma)[-a-z]*)${deb_lib_suffix}/\\1${next_deb_lib_suffix}/g" sed -i.bak -r -e "${deb_lib_suffix_substitute_pattern}" debian*/control rm -f debian*/control.bak git add debian*/control diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index bcb474b79b060..7674e2fee0f29 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -72,6 +72,7 @@ dev/tasks/linux-packages/debian/compat dev/tasks/linux-packages/debian/control dev/tasks/linux-packages/debian/gir1.2-arrow-1.0.install dev/tasks/linux-packages/debian/gir1.2-arrow-cuda-1.0.install +dev/tasks/linux-packages/debian/gir1.2-gandiva-1.0.install dev/tasks/linux-packages/debian/gir1.2-parquet-1.0.install dev/tasks/linux-packages/debian/gir1.2-plasma-1.0.install dev/tasks/linux-packages/debian/libarrow-dev.install @@ -87,6 +88,13 @@ 
dev/tasks/linux-packages/debian/libarrow-cuda12.install dev/tasks/linux-packages/debian/libarrow-python-dev.install dev/tasks/linux-packages/debian/libarrow-python12.install dev/tasks/linux-packages/debian/libarrow12.install +dev/tasks/linux-packages/debian/libgandiva-dev.install +dev/tasks/linux-packages/debian/libgandiva-glib-dev.install +dev/tasks/linux-packages/debian/libgandiva-glib-doc.doc-base +dev/tasks/linux-packages/debian/libgandiva-glib-doc.install +dev/tasks/linux-packages/debian/libgandiva-glib-doc.links +dev/tasks/linux-packages/debian/libgandiva-glib12.install +dev/tasks/linux-packages/debian/libgandiva12.install dev/tasks/linux-packages/debian/libparquet-dev.install dev/tasks/linux-packages/debian/libparquet-glib-dev.install dev/tasks/linux-packages/debian/libparquet-glib-doc.doc-base diff --git a/dev/tasks/linux-packages/apt/debian-stretch/Dockerfile b/dev/tasks/linux-packages/apt/debian-stretch/Dockerfile index 4dde574cbf95d..70cefaabf262e 100644 --- a/dev/tasks/linux-packages/apt/debian-stretch/Dockerfile +++ b/dev/tasks/linux-packages/apt/debian-stretch/Dockerfile @@ -22,6 +22,9 @@ ENV DEBIAN_FRONTEND noninteractive ARG DEBUG RUN sed -i'' -e 's/main$/main contrib non-free/g' /etc/apt/sources.list +RUN \ + echo "deb http://deb.debian.org/debian stretch-backports main" > \ + /etc/apt/sources.list.d/backports.list RUN \ quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ @@ -30,6 +33,7 @@ RUN \ autoconf-archive \ bison \ build-essential \ + clang-6.0 \ cmake \ debhelper\ devscripts \ diff --git a/dev/tasks/linux-packages/apt/ubuntu-bionic/Dockerfile b/dev/tasks/linux-packages/apt/ubuntu-bionic/Dockerfile index 5d3c9ba2932ed..68de4d569a663 100644 --- a/dev/tasks/linux-packages/apt/ubuntu-bionic/Dockerfile +++ b/dev/tasks/linux-packages/apt/ubuntu-bionic/Dockerfile @@ -28,6 +28,7 @@ RUN \ autoconf-archive \ bison \ build-essential \ + clang-6.0 \ cmake \ debhelper\ devscripts \ diff --git a/dev/tasks/linux-packages/apt/ubuntu-cosmic/Dockerfile b/dev/tasks/linux-packages/apt/ubuntu-cosmic/Dockerfile index 519d058d4b2e3..0d871eaa2635d 100644 --- a/dev/tasks/linux-packages/apt/ubuntu-cosmic/Dockerfile +++ b/dev/tasks/linux-packages/apt/ubuntu-cosmic/Dockerfile @@ -28,6 +28,7 @@ RUN \ autoconf-archive \ bison \ build-essential \ + clang-6.0 \ cmake \ debhelper\ devscripts \ diff --git a/dev/tasks/linux-packages/apt/ubuntu-xenial/Dockerfile b/dev/tasks/linux-packages/apt/ubuntu-xenial/Dockerfile index 17cb27713f08c..c7c5b1e09ece1 100644 --- a/dev/tasks/linux-packages/apt/ubuntu-xenial/Dockerfile +++ b/dev/tasks/linux-packages/apt/ubuntu-xenial/Dockerfile @@ -28,6 +28,7 @@ RUN \ autoconf-archive \ bison \ build-essential \ + clang-6.0 \ cmake \ debhelper\ devscripts \ diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/rules b/dev/tasks/linux-packages/debian.ubuntu-trusty/rules index 6f2ffdc416906..4eb26772df00c 100755 --- a/dev/tasks/linux-packages/debian.ubuntu-trusty/rules +++ b/dev/tasks/linux-packages/debian.ubuntu-trusty/rules @@ -22,7 +22,6 @@ override_dh_auto_configure: --builddirectory=cpp_build \ -- \ -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) \ - -DARROW_BUILD_TESTS=OFF \ -DARROW_ORC=ON \ -DARROW_PARQUET=ON \ -DARROW_PLASMA=ON diff --git a/dev/tasks/linux-packages/debian/control b/dev/tasks/linux-packages/debian/control index b5c696363798f..579c2e47bb520 100644 --- a/dev/tasks/linux-packages/debian/control +++ b/dev/tasks/linux-packages/debian/control @@ -5,6 +5,7 @@ Maintainer: Kouhei Sutou Build-Depends: autoconf-archive, bison, + clang-6.0, cmake, debhelper (>= 
9.20160115), dh-autoreconf, @@ -102,6 +103,33 @@ Description: Apache Arrow is a data processing library for analysis . This package provides C++ header files for CUDA support. +Package: libgandiva12 +Section: libs +Architecture: any +Multi-Arch: same +Pre-Depends: ${misc:Pre-Depends} +Depends: + ${misc:Depends}, + ${shlibs:Depends}, + libarrow12 (= ${binary:Version}) +Description: Gandiva is a toolset for compiling and evaluating expressions + on Arrow Data. + . + This package provides C++ library files. + +Package: libgandiva-dev +Section: libdevel +Architecture: any +Multi-Arch: same +Depends: + ${misc:Depends}, + libarrow-dev (= ${binary:Version}), + libgandiva12 (= ${binary:Version}) +Description: Gandiva is a toolset for compiling and evaluating expressions + on Arrow Data. + . + This package provides C++ header files. + Package: libplasma12 Section: libs Architecture: any @@ -252,6 +280,60 @@ Description: Apache Arrow is a data processing library for analysis . This package provides GLib based header files for CUDA support. +Package: libgandiva-glib12 +Section: libs +Architecture: any +Multi-Arch: same +Pre-Depends: ${misc:Pre-Depends} +Depends: + ${misc:Depends}, + ${shlibs:Depends}, + libarrow-glib12 (= ${binary:Version}), + libgandiva12 (= ${binary:Version}) +Description: Gandiva is a toolset for compiling and evaluating expressions + on Arrow Data. + . + This package provides GLib based library files. + +Package: gir1.2-gandiva-1.0 +Section: introspection +Architecture: any +Multi-Arch: same +Depends: + ${gir:Depends}, + ${misc:Depends} +Description: Gandiva is a toolset for compiling and evaluating expressions + on Arrow Data. + . + This package provides GObject Introspection typelib files. + +Package: libgandiva-glib-dev +Section: libdevel +Architecture: any +Multi-Arch: same +Depends: + ${misc:Depends}, + libgandiva-dev (= ${binary:Version}), + libarrow-glib-dev (= ${binary:Version}), + libgandiva-glib12 (= ${binary:Version}), + gir1.2-gandiva-1.0 (= ${binary:Version}) +Description: Gandiva is a toolset for compiling and evaluating expressions + on Arrow Data. + . + This package provides GLib based header files. + +Package: libgandiva-glib-doc +Section: doc +Architecture: all +Multi-Arch: foreign +Depends: + ${misc:Depends} +Recommends: libglib2.0-doc +Description: Gandiva is a toolset for compiling and evaluating expressions + on Arrow Data. + . + This package provides documentations. 
+ Package: libplasma-glib12 Section: libs Architecture: any diff --git a/dev/tasks/linux-packages/debian/gir1.2-gandiva-1.0.install b/dev/tasks/linux-packages/debian/gir1.2-gandiva-1.0.install new file mode 100644 index 0000000000000..0433b367a24c8 --- /dev/null +++ b/dev/tasks/linux-packages/debian/gir1.2-gandiva-1.0.install @@ -0,0 +1 @@ +usr/lib/*/girepository-1.0/Gandiva-1.0.typelib diff --git a/dev/tasks/linux-packages/debian/libgandiva-dev.install b/dev/tasks/linux-packages/debian/libgandiva-dev.install new file mode 100644 index 0000000000000..1e5d264378e69 --- /dev/null +++ b/dev/tasks/linux-packages/debian/libgandiva-dev.install @@ -0,0 +1,3 @@ +usr/lib/*/libgandiva.a +usr/lib/*/libgandiva.so +usr/lib/*/pkgconfig/gandiva.pc diff --git a/dev/tasks/linux-packages/debian/libgandiva-glib-dev.install b/dev/tasks/linux-packages/debian/libgandiva-glib-dev.install new file mode 100644 index 0000000000000..4189dac66ed90 --- /dev/null +++ b/dev/tasks/linux-packages/debian/libgandiva-glib-dev.install @@ -0,0 +1,5 @@ +usr/include/gandiva-glib/ +usr/lib/*/libgandiva-glib.a +usr/lib/*/libgandiva-glib.so +usr/lib/*/pkgconfig/gandiva-glib.pc +usr/share/gir-1.0/Gandiva-1.0.gir diff --git a/dev/tasks/linux-packages/debian/libgandiva-glib-doc.doc-base b/dev/tasks/linux-packages/debian/libgandiva-glib-doc.doc-base new file mode 100644 index 0000000000000..bed6a124c5e08 --- /dev/null +++ b/dev/tasks/linux-packages/debian/libgandiva-glib-doc.doc-base @@ -0,0 +1,9 @@ +Document: gandiva-glib +Title: Gandiva GLib Reference Manual +Author: The Apache Software Foundation +Abstract: Gandiva GLib is a toolset for compiling and evaluating expressions on Arrow Data that uses GLib. +Section: Programming + +Format: HTML +Index: /usr/share/doc/libarrow-glib-doc/gandiva-glib/index.html +Files: /usr/share/doc/libarrow-glib-doc/gandiva-glib/*.html diff --git a/dev/tasks/linux-packages/debian/libgandiva-glib-doc.install b/dev/tasks/linux-packages/debian/libgandiva-glib-doc.install new file mode 100644 index 0000000000000..54d2d066c275a --- /dev/null +++ b/dev/tasks/linux-packages/debian/libgandiva-glib-doc.install @@ -0,0 +1 @@ +usr/share/doc/libarrow-glib-doc/gandiva-glib/ diff --git a/dev/tasks/linux-packages/debian/libgandiva-glib-doc.links b/dev/tasks/linux-packages/debian/libgandiva-glib-doc.links new file mode 100644 index 0000000000000..291b004ed717a --- /dev/null +++ b/dev/tasks/linux-packages/debian/libgandiva-glib-doc.links @@ -0,0 +1,3 @@ +usr/share/doc/libglib2.0-doc/glib usr/share/doc/libgandiva-glib-doc/glib +usr/share/doc/libglib2.0-doc/gobject usr/share/doc/libgandiva-glib-doc/gobject +usr/share/doc/libarrow-glib-doc/gandiva-glib usr/share/gtk-doc/html/gandiva-glib diff --git a/dev/tasks/linux-packages/debian/libgandiva-glib12.install b/dev/tasks/linux-packages/debian/libgandiva-glib12.install new file mode 100644 index 0000000000000..6257fd43823c0 --- /dev/null +++ b/dev/tasks/linux-packages/debian/libgandiva-glib12.install @@ -0,0 +1 @@ +usr/lib/*/libgandiva-glib.so.* diff --git a/dev/tasks/linux-packages/debian/libgandiva12.install b/dev/tasks/linux-packages/debian/libgandiva12.install new file mode 100644 index 0000000000000..38a05876db6e6 --- /dev/null +++ b/dev/tasks/linux-packages/debian/libgandiva12.install @@ -0,0 +1,2 @@ +usr/lib/*/libgandiva.so.* +usr/lib/*/gandiva/ diff --git a/dev/tasks/linux-packages/debian/rules b/dev/tasks/linux-packages/debian/rules index f3cc2a045c1ee..d82f306cd2656 100755 --- a/dev/tasks/linux-packages/debian/rules +++ b/dev/tasks/linux-packages/debian/rules @@ 
-24,12 +24,13 @@ override_dh_auto_configure: --builddirectory=cpp_build \ -- \ -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) \ - -DARROW_BUILD_TESTS=OFF \ -DARROW_PYTHON=ON \ -DARROW_BOOST_USE_SHARED=ON \ -DARROW_ORC=ON \ -DARROW_PARQUET=ON \ -DARROW_PLASMA=ON \ + -DARROW_GANDIVA=ON \ + -DARROW_GANDIVA_JAVA=OFF \ -DPROTOBUF_HOME=/usr \ -DARROW_PROTOBUF_USE_SHARED=ON \ -DPythonInterp_FIND_VERSION=ON \ diff --git a/dev/tasks/linux-packages/yum/arrow.spec.in b/dev/tasks/linux-packages/yum/arrow.spec.in index ad60dfbdde18e..568477e90d6d3 100644 --- a/dev/tasks/linux-packages/yum/arrow.spec.in +++ b/dev/tasks/linux-packages/yum/arrow.spec.in @@ -75,8 +75,7 @@ cd cpp/build %if %{use_parquet} -DARROW_PARQUET=ON \ %endif - -DARROW_PLASMA=ON \ - -DARROW_BUILD_TESTS=OFF + -DARROW_PLASMA=ON make %{?_smp_mflags} cd - diff --git a/dev/tasks/linux-packages/yum/centos-6/Dockerfile b/dev/tasks/linux-packages/yum/centos-6/Dockerfile index 8143b99efd180..c7de92296767a 100644 --- a/dev/tasks/linux-packages/yum/centos-6/Dockerfile +++ b/dev/tasks/linux-packages/yum/centos-6/Dockerfile @@ -20,14 +20,13 @@ FROM centos:6 ARG DEBUG ENV \ - SRPM_DOWNLOAD_URL=http://vault.centos.org/7.4.1708/os/Source/SPackages \ + SRPM_DOWNLOAD_URL=http://vault.centos.org/7.6.1810/os/Source/SPackages \ LIBARCHIVE_SRPM_BASE=libarchive-3.1.2-10.el7_2.src.rpm RUN \ quiet=$([ "${DEBUG}" = "yes" ] || echo "--quiet") && \ yum update -y ${quiet} && \ yum install -y ${quiet} \ - centos-release-scl \ epel-release && \ yum install -y \ autoconf268 \ @@ -43,9 +42,10 @@ RUN \ ~/rpmbuild/SPECS/libarchive.spec && \ yum install -y ${quiet} ~/rpmbuild/RPMS/*/libarchive-3.*.rpm && \ rm -rf ${LIBARCHIVE_SRPM_BASE} ~/rpmbuild/ && \ + yum install -y ${quiet} \ + centos-release-scl && \ yum install -y ${quiet} \ boost-devel \ - centos-release-scl \ cmake3 \ devtoolset-6 \ git \ diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index ea104d507eec1..52bbc577e6f1b 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -274,6 +274,7 @@ tasks: - apache-arrow_{no_rc_version}.orig.tar.gz - gir1.2-arrow-1.0_{no_rc_version}-1_amd64.deb - gir1.2-arrow-cuda-1.0_{no_rc_version}-1_amd64.deb + - gir1.2-gandiva-1.0_{no_rc_version}-1_amd64.deb - gir1.2-parquet-1.0_{no_rc_version}-1_amd64.deb - gir1.2-plasma-1.0_{no_rc_version}-1_amd64.deb - libarrow-dev_{no_rc_version}-1_amd64.deb @@ -292,6 +293,13 @@ tasks: - libarrow-python12_{no_rc_version}-1_amd64.deb - libarrow12-dbgsym_{no_rc_version}-1_amd64.deb - libarrow12_{no_rc_version}-1_amd64.deb + - libgandiva-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-doc_{no_rc_version}-1_all.deb + - libgandiva-glib12-dbgsym_{no_rc_version}-1_amd64.deb + - libgandiva-glib12_{no_rc_version}-1_amd64.deb + - libgandiva12-dbgsym_{no_rc_version}-1_amd64.deb + - libgandiva12_{no_rc_version}-1_amd64.deb - libparquet-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-doc_{no_rc_version}-1_all.deb @@ -356,6 +364,7 @@ tasks: - apache-arrow_{no_rc_version}.orig.tar.gz - gir1.2-arrow-1.0_{no_rc_version}-1_amd64.deb - gir1.2-arrow-cuda-1.0_{no_rc_version}-1_amd64.deb + - gir1.2-gandiva-1.0_{no_rc_version}-1_amd64.deb - gir1.2-parquet-1.0_{no_rc_version}-1_amd64.deb - gir1.2-plasma-1.0_{no_rc_version}-1_amd64.deb - libarrow-dev_{no_rc_version}-1_amd64.deb @@ -369,6 +378,11 @@ tasks: - libarrow-python-dev_{no_rc_version}-1_amd64.deb - libarrow-python12_{no_rc_version}-1_amd64.deb - libarrow12_{no_rc_version}-1_amd64.deb + - 
libgandiva-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-doc_{no_rc_version}-1_all.deb + - libgandiva-glib12_{no_rc_version}-1_amd64.deb + - libgandiva12_{no_rc_version}-1_amd64.deb - libparquet-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-doc_{no_rc_version}-1_all.deb @@ -396,6 +410,7 @@ tasks: - apache-arrow_{no_rc_version}.orig.tar.gz - gir1.2-arrow-1.0_{no_rc_version}-1_amd64.deb - gir1.2-arrow-cuda-1.0_{no_rc_version}-1_amd64.deb + - gir1.2-gandiva-1.0_{no_rc_version}-1_amd64.deb - gir1.2-parquet-1.0_{no_rc_version}-1_amd64.deb - gir1.2-plasma-1.0_{no_rc_version}-1_amd64.deb - libarrow-dev_{no_rc_version}-1_amd64.deb @@ -409,6 +424,11 @@ tasks: - libarrow-python-dev_{no_rc_version}-1_amd64.deb - libarrow-python12_{no_rc_version}-1_amd64.deb - libarrow12_{no_rc_version}-1_amd64.deb + - libgandiva-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-doc_{no_rc_version}-1_all.deb + - libgandiva-glib12_{no_rc_version}-1_amd64.deb + - libgandiva12_{no_rc_version}-1_amd64.deb - libparquet-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-doc_{no_rc_version}-1_all.deb @@ -436,6 +456,7 @@ tasks: - apache-arrow_{no_rc_version}.orig.tar.gz - gir1.2-arrow-1.0_{no_rc_version}-1_amd64.deb - gir1.2-arrow-cuda-1.0_{no_rc_version}-1_amd64.deb + - gir1.2-gandiva-1.0_{no_rc_version}-1_amd64.deb - gir1.2-parquet-1.0_{no_rc_version}-1_amd64.deb - gir1.2-plasma-1.0_{no_rc_version}-1_amd64.deb - libarrow-dev_{no_rc_version}-1_amd64.deb @@ -449,6 +470,11 @@ tasks: - libarrow-python-dev_{no_rc_version}-1_amd64.deb - libarrow-python12_{no_rc_version}-1_amd64.deb - libarrow12_{no_rc_version}-1_amd64.deb + - libgandiva-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-doc_{no_rc_version}-1_all.deb + - libgandiva-glib12_{no_rc_version}-1_amd64.deb + - libgandiva12_{no_rc_version}-1_amd64.deb - libparquet-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-doc_{no_rc_version}-1_all.deb From abde663b215295c051ae46f8a4e2bcceec081a2f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 27 Dec 2018 10:24:00 +0900 Subject: [PATCH 115/328] ARROW-4078: [CI] Detect changes in docs/ directory and build the Linux Python entry if so Author: Wes McKinney Closes #3266 from wesm/ARROW-4078 and squashes the following commits: 395c4969 Detect changes in docs/ directory and build the Linux Python entry if so --- .travis.yml | 2 +- ci/detect-changes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 10300c9b6e287..99ff24aaacc97 100644 --- a/.travis.yml +++ b/.travis.yml @@ -106,7 +106,7 @@ matrix: # TODO(wesm): Run the benchmarks outside of Travis # - ARROW_TRAVIS_PYTHON_BENCHMARKS=1 before_script: - - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ]; then exit; fi + - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ] && [ $ARROW_CI_DOCS_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh - $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh diff --git a/ci/detect-changes.py b/ci/detect-changes.py index e9a647c5e6d9c..102dc56396c45 100644 --- a/ci/detect-changes.py +++ b/ci/detect-changes.py @@ -26,7 +26,7 @@ perr = functools.partial(print, file=sys.stderr) -LANGUAGE_TOPICS = ['c_glib', 'cpp', 'go', 'java', 
'js', 'python', +LANGUAGE_TOPICS = ['c_glib', 'cpp', 'docs', 'go', 'java', 'js', 'python', 'r', 'ruby', 'rust'] ALL_TOPICS = LANGUAGE_TOPICS + ['integration', 'site', 'dev'] From 5904eea4cc2f422c14c8ef9d1ac323718ff765ea Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 26 Dec 2018 22:38:00 -0600 Subject: [PATCH 116/328] ARROW-3324: [Python] Destroy temporary metadata builder classes more eagerly when building files to reduce memory usage Destroy RowGroupMetadataBuilder after each row group is completed Author: Wes McKinney Closes #3261 from tanyaschlusser/ARROW-3324 and squashes the following commits: 5f3876706 Refine case a bit 4f2bdcdce Destroy RowGroupMetadataBuilder object after completing a row group to reduce memory usage --- cpp/src/parquet/metadata-test.cc | 2 +- cpp/src/parquet/metadata.cc | 67 +++++++++++++------------------- cpp/src/parquet/metadata.h | 25 ++++++------ python/scripts/test_leak.py | 66 ++++++++++++++++++++++++------- 4 files changed, 93 insertions(+), 67 deletions(-) diff --git a/cpp/src/parquet/metadata-test.cc b/cpp/src/parquet/metadata-test.cc index bcf911eab8b26..826ac4d6a504f 100644 --- a/cpp/src/parquet/metadata-test.cc +++ b/cpp/src/parquet/metadata-test.cc @@ -59,7 +59,6 @@ TEST(Metadata, TestBuildAccess) { auto f_builder = FileMetaDataBuilder::Make(&schema, props); auto rg1_builder = f_builder->AppendRowGroup(); - auto rg2_builder = f_builder->AppendRowGroup(); // Write the metadata // rowgroup1 metadata @@ -75,6 +74,7 @@ TEST(Metadata, TestBuildAccess) { rg1_builder->Finish(1024); // rowgroup2 metadata + auto rg2_builder = f_builder->AppendRowGroup(); col1_builder = rg2_builder->NextColumnChunk(); col2_builder = rg2_builder->NextColumnChunk(); // column metadata diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 22cfbdb91aa73..6ac53c58afed4 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -115,7 +115,6 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } possible_stats_ = nullptr; } - ~ColumnChunkMetaDataImpl() {} // column chunk inline int64_t file_offset() const { return column_->file_offset; } @@ -197,13 +196,13 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { }; std::unique_ptr ColumnChunkMetaData::Make( - const uint8_t* metadata, const ColumnDescriptor* descr, + const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version) { return std::unique_ptr( new ColumnChunkMetaData(metadata, descr, writer_version)); } -ColumnChunkMetaData::ColumnChunkMetaData(const uint8_t* metadata, +ColumnChunkMetaData::ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version) : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( @@ -272,7 +271,6 @@ class RowGroupMetaData::RowGroupMetaDataImpl { const SchemaDescriptor* schema, const ApplicationVersion* writer_version) : row_group_(row_group), schema_(schema), writer_version_(writer_version) {} - ~RowGroupMetaDataImpl() {} inline int num_columns() const { return static_cast(row_group_->columns.size()); } @@ -289,9 +287,8 @@ class RowGroupMetaData::RowGroupMetaDataImpl { << " columns, requested metadata for column: " << i; throw ParquetException(ss.str()); } - return ColumnChunkMetaData::Make( - reinterpret_cast(&row_group_->columns[i]), schema_->Column(i), - writer_version_); + return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), + writer_version_); } private: @@ -301,14 +298,13 @@ class RowGroupMetaData::RowGroupMetaDataImpl { }; 
std::unique_ptr RowGroupMetaData::Make( - const uint8_t* metadata, const SchemaDescriptor* schema, + const void* metadata, const SchemaDescriptor* schema, const ApplicationVersion* writer_version) { return std::unique_ptr( new RowGroupMetaData(metadata, schema, writer_version)); } -RowGroupMetaData::RowGroupMetaData(const uint8_t* metadata, - const SchemaDescriptor* schema, +RowGroupMetaData::RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, const ApplicationVersion* writer_version) : impl_{std::unique_ptr(new RowGroupMetaDataImpl( reinterpret_cast(metadata), schema, writer_version))} { @@ -332,10 +328,11 @@ class FileMetaData::FileMetaDataImpl { public: FileMetaDataImpl() : metadata_len_(0) {} - explicit FileMetaDataImpl(const uint8_t* metadata, uint32_t* metadata_len) + explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len) : metadata_len_(0) { metadata_.reset(new format::FileMetaData); - DeserializeThriftMsg(metadata, metadata_len, metadata_.get()); + DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, + metadata_.get()); metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { @@ -348,7 +345,6 @@ class FileMetaData::FileMetaDataImpl { InitColumnOrders(); InitKeyValueMetadata(); } - ~FileMetaDataImpl() {} inline uint32_t size() const { return metadata_len_; } inline int num_columns() const { return schema_.num_columns(); } @@ -375,9 +371,7 @@ class FileMetaData::FileMetaDataImpl { << " row groups, requested metadata for row group: " << i; throw ParquetException(ss.str()); } - return RowGroupMetaData::Make( - reinterpret_cast(&metadata_->row_groups[i]), &schema_, - &writer_version_); + return RowGroupMetaData::Make(&metadata_->row_groups[i], &schema_, &writer_version_); } const SchemaDescriptor* schema() const { return &schema_; } @@ -429,13 +423,13 @@ class FileMetaData::FileMetaDataImpl { std::shared_ptr key_value_metadata_; }; -std::shared_ptr FileMetaData::Make(const uint8_t* metadata, +std::shared_ptr FileMetaData::Make(const void* metadata, uint32_t* metadata_len) { // This FileMetaData ctor is private, not compatible with std::make_shared return std::shared_ptr(new FileMetaData(metadata, metadata_len)); } -FileMetaData::FileMetaData(const uint8_t* metadata, uint32_t* metadata_len) +FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len) : impl_{std::unique_ptr( new FileMetaDataImpl(metadata, metadata_len))} {} @@ -606,11 +600,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { Init(column_chunk); } - ~ColumnChunkMetaDataBuilderImpl() {} - - const uint8_t* contents() const { - return reinterpret_cast(column_chunk_); - } + const void* contents() const { return column_chunk_; } // column chunk void set_file_path(const std::string& val) { column_chunk_->__set_file_path(val); } @@ -699,7 +689,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { std::unique_ptr ColumnChunkMetaDataBuilder::Make( const std::shared_ptr& props, const ColumnDescriptor* column, - uint8_t* contents) { + void* contents) { return std::unique_ptr( new ColumnChunkMetaDataBuilder(props, column, contents)); } @@ -717,14 +707,14 @@ ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder( ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder( const std::shared_ptr& props, const ColumnDescriptor* column, - uint8_t* contents) + void* contents) : impl_{std::unique_ptr( new ColumnChunkMetaDataBuilderImpl( props, column, reinterpret_cast(contents)))} {} 
ColumnChunkMetaDataBuilder::~ColumnChunkMetaDataBuilder() {} -const uint8_t* ColumnChunkMetaDataBuilder::contents() const { return impl_->contents(); } +const void* ColumnChunkMetaDataBuilder::contents() const { return impl_->contents(); } void ColumnChunkMetaDataBuilder::set_file_path(const std::string& path) { impl_->set_file_path(path); @@ -754,12 +744,11 @@ void ColumnChunkMetaDataBuilder::SetStatistics(bool is_signed, class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { public: explicit RowGroupMetaDataBuilderImpl(const std::shared_ptr& props, - const SchemaDescriptor* schema, uint8_t* contents) + const SchemaDescriptor* schema, void* contents) : properties_(props), schema_(schema), current_column_(0) { row_group_ = reinterpret_cast(contents); InitializeColumns(schema->num_columns()); } - ~RowGroupMetaDataBuilderImpl() {} ColumnChunkMetaDataBuilder* NextColumnChunk() { if (!(current_column_ < num_columns())) { @@ -770,8 +759,7 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { } auto column = schema_->Column(current_column_); auto column_builder = ColumnChunkMetaDataBuilder::Make( - properties_, column, - reinterpret_cast(&row_group_->columns[current_column_++])); + properties_, column, &row_group_->columns[current_column_++]); auto column_builder_ptr = column_builder.get(); column_builders_.push_back(std::move(column_builder)); return column_builder_ptr; @@ -820,14 +808,14 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { std::unique_ptr RowGroupMetaDataBuilder::Make( const std::shared_ptr& props, const SchemaDescriptor* schema_, - uint8_t* contents) { + void* contents) { return std::unique_ptr( new RowGroupMetaDataBuilder(props, schema_, contents)); } RowGroupMetaDataBuilder::RowGroupMetaDataBuilder( const std::shared_ptr& props, const SchemaDescriptor* schema_, - uint8_t* contents) + void* contents) : impl_{std::unique_ptr( new RowGroupMetaDataBuilderImpl(props, schema_, contents))} {} @@ -861,16 +849,12 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); } - ~FileMetaDataBuilderImpl() {} RowGroupMetaDataBuilder* AppendRowGroup() { - auto row_group = std::unique_ptr(new format::RowGroup()); - auto row_group_builder = RowGroupMetaDataBuilder::Make( - properties_, schema_, reinterpret_cast(row_group.get())); - RowGroupMetaDataBuilder* row_group_ptr = row_group_builder.get(); - row_group_builders_.push_back(std::move(row_group_builder)); - row_groups_.push_back(std::move(row_group)); - return row_group_ptr; + row_groups_.emplace_back(new format::RowGroup); + current_row_group_builder_ = + RowGroupMetaDataBuilder::Make(properties_, schema_, row_groups_.back().get()); + return current_row_group_builder_.get(); } std::unique_ptr Finish() { @@ -939,7 +923,8 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { private: const std::shared_ptr properties_; std::vector> row_groups_; - std::vector> row_group_builders_; + + std::unique_ptr current_row_group_builder_; const SchemaDescriptor* schema_; std::shared_ptr key_value_metadata_; }; diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 25f4d4cd8cbdf..209c75a6ffbce 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -93,7 +93,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { public: // API convenience to get a MetaData accessor static std::unique_ptr Make( - const uint8_t* metadata, const ColumnDescriptor* descr, + const void* 
metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version = NULLPTR); ~ColumnChunkMetaData(); @@ -119,7 +119,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { int64_t total_uncompressed_size() const; private: - explicit ColumnChunkMetaData(const uint8_t* metadata, const ColumnDescriptor* descr, + explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version = NULLPTR); // PIMPL Idiom class ColumnChunkMetaDataImpl; @@ -130,7 +130,7 @@ class PARQUET_EXPORT RowGroupMetaData { public: // API convenience to get a MetaData accessor static std::unique_ptr Make( - const uint8_t* metadata, const SchemaDescriptor* schema, + const void* metadata, const SchemaDescriptor* schema, const ApplicationVersion* writer_version = NULLPTR); ~RowGroupMetaData(); @@ -144,7 +144,7 @@ class PARQUET_EXPORT RowGroupMetaData { std::unique_ptr ColumnChunk(int i) const; private: - explicit RowGroupMetaData(const uint8_t* metadata, const SchemaDescriptor* schema, + explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, const ApplicationVersion* writer_version = NULLPTR); // PIMPL Idiom class RowGroupMetaDataImpl; @@ -156,7 +156,7 @@ class FileMetaDataBuilder; class PARQUET_EXPORT FileMetaData { public: // API convenience to get a MetaData accessor - static std::shared_ptr Make(const uint8_t* serialized_metadata, + static std::shared_ptr Make(const void* serialized_metadata, uint32_t* metadata_len); ~FileMetaData(); @@ -182,7 +182,7 @@ class PARQUET_EXPORT FileMetaData { private: friend FileMetaDataBuilder; - explicit FileMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len); + explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len); // PIMPL Idiom FileMetaData(); @@ -199,7 +199,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { static std::unique_ptr Make( const std::shared_ptr& props, const ColumnDescriptor* column, - uint8_t* contents); + void* contents); ~ColumnChunkMetaDataBuilder(); @@ -217,7 +217,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { bool dictionary_fallback); // The metadata contents, suitable for passing to ColumnChunkMetaData::Make - const uint8_t* contents() const; + const void* contents() const; // For writing metadata at end of column chunk void WriteTo(OutputStream* sink); @@ -226,7 +226,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { explicit ColumnChunkMetaDataBuilder(const std::shared_ptr& props, const ColumnDescriptor* column); explicit ColumnChunkMetaDataBuilder(const std::shared_ptr& props, - const ColumnDescriptor* column, uint8_t* contents); + const ColumnDescriptor* column, void* contents); // PIMPL Idiom class ColumnChunkMetaDataBuilderImpl; std::unique_ptr impl_; @@ -237,7 +237,7 @@ class PARQUET_EXPORT RowGroupMetaDataBuilder { // API convenience to get a MetaData reader static std::unique_ptr Make( const std::shared_ptr& props, const SchemaDescriptor* schema_, - uint8_t* contents); + void* contents); ~RowGroupMetaDataBuilder(); @@ -253,7 +253,7 @@ class PARQUET_EXPORT RowGroupMetaDataBuilder { private: explicit RowGroupMetaDataBuilder(const std::shared_ptr& props, - const SchemaDescriptor* schema_, uint8_t* contents); + const SchemaDescriptor* schema_, void* contents); // PIMPL Idiom class RowGroupMetaDataBuilderImpl; std::unique_ptr impl_; @@ -268,9 +268,10 @@ class PARQUET_EXPORT FileMetaDataBuilder { ~FileMetaDataBuilder(); + // The prior RowGroupMetaDataBuilder (if any) is destroyed RowGroupMetaDataBuilder* 
AppendRowGroup(); - // commit the metadata + // Complete the Thrift structure std::unique_ptr Finish(); private: diff --git a/python/scripts/test_leak.py b/python/scripts/test_leak.py index e3de56b28a168..d3984a89ef754 100644 --- a/python/scripts/test_leak.py +++ b/python/scripts/test_leak.py @@ -19,29 +19,49 @@ import pyarrow as pa import numpy as np +import pandas as pd +import pandas.util.testing as tm import memory_profiler import gc import io +MEGABYTE = 1 << 20 -def leak(): + +def assert_does_not_leak(f, iterations=10, check_interval=1, tolerance=5): + gc.collect() + baseline = memory_profiler.memory_usage()[0] + for i in range(iterations): + f() + if i % check_interval == 0: + gc.collect() + usage = memory_profiler.memory_usage()[0] + diff = usage - baseline + print("{0}: {1}\r".format(i, diff), end="") + if diff > tolerance: + raise Exception("Memory increased by {0} megabytes after {1} " + "iterations".format(diff, i + 1)) + gc.collect() + usage = memory_profiler.memory_usage()[0] + diff = usage - baseline + print("\nMemory increased by {0} megabytes after {1} " + "iterations".format(diff, iterations)) + + +def test_leak1(): data = [pa.array(np.concatenate([np.random.randn(100000)] * 1000))] table = pa.Table.from_arrays(data, ['foo']) - while True: - print('calling to_pandas') - print('memory_usage: {0}'.format(memory_profiler.memory_usage())) - table.to_pandas() - gc.collect() -# leak() + def func(): + table.to_pandas() + assert_does_not_leak(func) -def leak2(): +def test_leak2(): data = [pa.array(np.concatenate([np.random.randn(100000)] * 10))] table = pa.Table.from_arrays(data, ['foo']) - while True: - print('calling to_pandas') - print('memory_usage: {0}'.format(memory_profiler.memory_usage())) + + def func(): df = table.to_pandas() batch = pa.RecordBatch.from_pandas(df) @@ -55,7 +75,27 @@ def leak2(): reader = pa.open_file(buf_reader) reader.read_all() - gc.collect() + assert_does_not_leak(func, iterations=50, tolerance=50) + + +def test_leak3(): + import pyarrow.parquet as pq + + df = pd.DataFrame({'a{0}'.format(i): [1, 2, 3, 4] + for i in range(50)}) + table = pa.Table.from_pandas(df, preserve_index=False) + + writer = pq.ParquetWriter('leak_test_' + tm.rands(5) + '.parquet', + table.schema) + + def func(): + writer.write_table(table, row_group_size=len(table)) + + # This does not "leak" per se but we do want to have this use as little + # memory as possible + assert_does_not_leak(func, iterations=500, + check_interval=50, tolerance=20) -leak2() +if __name__ == '__main__': + test_leak3() From a536529a624b793ffa18c3c39581fdf777e85f8f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 27 Dec 2018 17:11:58 +0100 Subject: [PATCH 117/328] ARROW-4102: [C++] Return common IdentityCast when casting to equal type I also added some code to make it easier to write cast tests in JSON. As one issue with the JSON parser -- we have a number of tests in cast-test.cc that check that values that are in null positions are ignored. 
We might augment the parser to be able to pass both values and validity bitmap as separate JSON strings Author: Wes McKinney Closes #3265 from wesm/ARROW-4102 and squashes the following commits: 8c27ba2a Fix bad memory access 9c52297f Add various identity cast tests, verify that fixed_size_binary identity casts work now --- cpp/src/arrow/compute/kernels/cast-test.cc | 118 ++++++++++++--------- cpp/src/arrow/compute/kernels/cast.cc | 27 +++-- cpp/src/arrow/ipc/json-simple.cc | 5 + python/pyarrow/tests/test_array.py | 3 +- 4 files changed, 95 insertions(+), 58 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/cast-test.cc b/cpp/src/arrow/compute/kernels/cast-test.cc index 4c3992868ef6d..781e0af87a825 100644 --- a/cpp/src/arrow/compute/kernels/cast-test.cc +++ b/cpp/src/arrow/compute/kernels/cast-test.cc @@ -51,6 +51,10 @@ using std::vector; namespace arrow { namespace compute { +static std::vector> kNumericTypes = { + uint8(), int8(), uint16(), int16(), uint32(), + int32(), uint64(), int64(), float32(), float64()}; + static void AssertBufferSame(const Array& left, const Array& right, int buffer_index) { ASSERT_EQ(left.data()->buffers[buffer_index].get(), right.data()->buffers[buffer_index].get()); @@ -81,8 +85,10 @@ class TestCast : public ComputeFixture, public TestBase { void CheckZeroCopy(const Array& input, const shared_ptr& out_type) { shared_ptr result; ASSERT_OK(Cast(&ctx_, input, out_type, {}, &result)); - AssertBufferSame(input, *result, 0); - AssertBufferSame(input, *result, 1); + ASSERT_EQ(input.data()->buffers.size(), result->data()->buffers.size()); + for (size_t i = 0; i < input.data()->buffers.size(); ++i) { + AssertBufferSame(input, *result, static_cast(i)); + } } template @@ -106,15 +112,25 @@ class TestCast : public ComputeFixture, public TestBase { CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options); } } -}; -TEST_F(TestCast, SameTypeZeroCopy) { - vector is_valid = {true, false, true, true, true}; - vector v1 = {0, 1, 2, 3, 4}; + void CheckCaseJSON(const shared_ptr& in_type, + const shared_ptr& out_type, const std::string& in_json, + const std::string& expected_json, + const CastOptions& options = CastOptions()) { + shared_ptr input = ArrayFromJSON(in_type, in_json); + shared_ptr expected = ArrayFromJSON(out_type, expected_json); + DCHECK_EQ(input->length(), expected->length()); + CheckPass(*input, *expected, out_type, options); - shared_ptr arr; - ArrayFromVector(int32(), is_valid, v1, &arr); + // Check a sliced variant + if (input->length() > 1) { + CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options); + } + } +}; +TEST_F(TestCast, SameTypeZeroCopy) { + shared_ptr arr = ArrayFromJSON(int32(), "[0, null, 2, 3, 4]"); shared_ptr result; ASSERT_OK(Cast(&this->ctx_, *arr, int32(), {}, &result)); @@ -124,20 +140,16 @@ TEST_F(TestCast, SameTypeZeroCopy) { TEST_F(TestCast, ToBoolean) { CastOptions options; + for (auto type : kNumericTypes) { + CheckCaseJSON(type, boolean(), "[0, null, 127, 1, 0]", + "[false, null, true, true, false]"); + } - vector is_valid = {true, false, true, true, true}; - - // int8, should suffice for other integers - vector v1 = {0, 1, 127, -1, 0}; - vector e1 = {false, true, true, true, false}; - CheckCase(int8(), v1, is_valid, boolean(), e1, - options); - - // floating point - vector v2 = {1.0, 0, 0, -1.0, 5.0}; - vector e2 = {true, false, false, true, true}; - CheckCase(float64(), v2, is_valid, boolean(), e2, - options); + // Check negative numbers + CheckCaseJSON(int8(), boolean(), "[0, null, 127, -1, 0]", + 
"[false, null, true, true, false]"); + CheckCaseJSON(float64(), boolean(), "[0, null, 127, -1, 0]", + "[false, null, true, true, false]"); } TEST_F(TestCast, ToIntUpcast) { @@ -648,36 +660,6 @@ TEST_F(TestCast, TimeToCompatible) { options); } -TEST_F(TestCast, PrimitiveZeroCopy) { - shared_ptr arr; - - ArrayFromVector(uint8(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, uint8()); - ArrayFromVector(int8(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, int8()); - - ArrayFromVector(uint16(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, uint16()); - ArrayFromVector(int16(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, int16()); - - ArrayFromVector(uint32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, uint32()); - ArrayFromVector(int32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, int32()); - - ArrayFromVector(uint64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, uint64()); - ArrayFromVector(int64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, int64()); - - ArrayFromVector(float32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, float32()); - - ArrayFromVector(float64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, float64()); -} - TEST_F(TestCast, DateToCompatible) { CastOptions options; @@ -1193,5 +1175,39 @@ TEST_F(TestCast, ListToList) { CheckPass(*float64_list_array, *int64_list_array, int64_list_array->type(), options); } +TEST_F(TestCast, IdentityCasts) { + // ARROW-4102 + auto CheckIdentityCast = [this](std::shared_ptr type, + const std::string& json) { + auto arr = ArrayFromJSON(type, json); + CheckZeroCopy(*arr, type); + }; + + CheckIdentityCast(null(), "[null, null, null]"); + CheckIdentityCast(boolean(), "[false, true, null, false]"); + + for (auto type : kNumericTypes) { + CheckIdentityCast(type, "[1, 2, null, 4]"); + } + CheckIdentityCast(binary(), "[\"foo\", \"bar\"]"); + CheckIdentityCast(utf8(), "[\"foo\", \"bar\"]"); + CheckIdentityCast(fixed_size_binary(3), "[\"foo\", \"bar\"]"); + + CheckIdentityCast(list(int8()), "[[1, 2], [null], [], [3]]"); + + CheckIdentityCast(time32(TimeUnit::MILLI), "[1, 2, 3, 4]"); + CheckIdentityCast(time64(TimeUnit::MICRO), "[1, 2, 3, 4]"); + CheckIdentityCast(date32(), "[1, 2, 3, 4]"); + CheckIdentityCast(date64(), "[86400000, 0]"); + CheckIdentityCast(timestamp(TimeUnit::SECOND), "[1, 2, 3, 4]"); + + { + auto dict_type = dictionary(int8(), ArrayFromJSON(int8(), "[1, 2, 3]")); + auto dict_indices = ArrayFromJSON(int8(), "[0, 1, 2, 0, null, 2]"); + auto dict_array = std::make_shared(dict_type, dict_indices); + CheckZeroCopy(*dict_array, dict_type); + } +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 7976ef0beffc6..15746d4c9965e 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -99,6 +99,8 @@ struct is_zero_copy_cast { static constexpr bool value = false; }; +// TODO(wesm): ARROW-4110; this is no longer needed, but may be useful if we +// ever _do_ want to generate identity cast kernels at compile time template struct is_zero_copy_cast< O, I, @@ -1143,6 +1145,17 @@ static Status AllocateIfNotPreallocated(FunctionContext* ctx, const ArrayData& i return Status::OK(); } +class IdentityCast : public UnaryKernel { + public: + IdentityCast() {} + + Status Call(FunctionContext* ctx, const Datum& input, Datum* out) override { + DCHECK_EQ(input.kind(), Datum::ARRAY); + out->value = 
input.array()->Copy(); + return Status::OK(); + } +}; + class CastKernel : public UnaryKernel { public: CastKernel(const CastOptions& options, const CastFunction& func, bool is_zero_copy, @@ -1188,6 +1201,8 @@ class CastKernel : public UnaryKernel { std::shared_ptr out_type_; }; +// TODO(wesm): ARROW-4110 Do not generate cases that could return IdentityCast + #define CAST_CASE(InType, OutType) \ case OutType::type_id: \ is_zero_copy = is_zero_copy_cast::value; \ @@ -1233,12 +1248,10 @@ class CastKernel : public UnaryKernel { FN(Int64Type, Date64Type); #define DATE32_CASES(FN, IN_TYPE) \ - FN(Date32Type, Date32Type); \ FN(Date32Type, Date64Type); \ FN(Date32Type, Int32Type); #define DATE64_CASES(FN, IN_TYPE) \ - FN(Date64Type, Date64Type); \ FN(Date64Type, Date32Type); \ FN(Date64Type, Int64Type); @@ -1258,12 +1271,9 @@ class CastKernel : public UnaryKernel { FN(TimestampType, Date64Type); \ FN(TimestampType, Int64Type); -#define BINARY_CASES(FN, IN_TYPE) \ - FN(BinaryType, BinaryType); \ - FN(BinaryType, StringType); +#define BINARY_CASES(FN, IN_TYPE) FN(BinaryType, StringType); #define STRING_CASES(FN, IN_TYPE) \ - FN(StringType, StringType); \ FN(StringType, BooleanType); \ FN(StringType, UInt8Type); \ FN(StringType, Int8Type); \ @@ -1365,6 +1375,11 @@ Status GetListCastFunc(const DataType& in_type, const std::shared_ptr& Status GetCastFunction(const DataType& in_type, const std::shared_ptr& out_type, const CastOptions& options, std::unique_ptr* kernel) { + if (in_type.Equals(out_type)) { + *kernel = std::unique_ptr(new IdentityCast); + return Status::OK(); + } + switch (in_type.id()) { CAST_FUNCTION_CASE(NullType); CAST_FUNCTION_CASE(BooleanType); diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc index 7a78fe4986cd5..047788ce0f5de 100644 --- a/cpp/src/arrow/ipc/json-simple.cc +++ b/cpp/src/arrow/ipc/json-simple.cc @@ -474,7 +474,12 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::INT8, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::INT16, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::INT32, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::TIME32, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::DATE32, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::INT64, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::TIME64, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::TIMESTAMP, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::DATE64, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::UINT8, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::UINT16, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::UINT32, IntegerConverter) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 3d3402139cb43..17ff9c625871a 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -768,7 +768,8 @@ def test_cast_date64_to_int(): ('float', [0.0, 0.1, 0.2]), ('double', [0.0, 0.1, 0.2]), ('string', ['a', 'b', 'c']), - ('binary', [b'a', b'b', b'c']) + ('binary', [b'a', b'b', b'c']), + (pa.binary(3), [b'abc', b'bcd', b'cde']) ]) def test_cast_identities(ty, values): arr = pa.array(values, type=ty) From 6781c2da8915f99eaa8438cce25329152a0defc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 27 Dec 2018 17:26:55 +0100 Subject: [PATCH 118/328] ARROW-4088: [Python] Table.from_batches() fails when passed a schema with metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: Krisztián Szűcs Closes #3256 from kszucs/ARROW-4088 and 
squashes the following commits: b2698995 turn off check_metadata cf5c0829 propagate check_metadata to Schema's fields --- cpp/src/arrow/type-test.cc | 20 ++++++++++++-------- cpp/src/arrow/type.cc | 12 +++++++----- cpp/src/arrow/type.h | 4 ++-- python/pyarrow/tests/test_schema.py | 14 ++++++++++++++ 4 files changed, 35 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc index 5b758d7a129fd..ec82e0a5dbbf9 100644 --- a/cpp/src/arrow/type-test.cc +++ b/cpp/src/arrow/type-test.cc @@ -58,6 +58,7 @@ TEST(TestField, Equals) { ASSERT_TRUE(f0.Equals(f0_other)); ASSERT_FALSE(f0.Equals(f0_nn)); ASSERT_FALSE(f0.Equals(f0_with_meta)); + ASSERT_TRUE(f0.Equals(f0_with_meta, false)); } TEST(TestField, TestMetadataConstruction) { @@ -200,28 +201,31 @@ TEST_F(TestSchema, GetFieldIndex) { } TEST_F(TestSchema, TestMetadataConstruction) { - auto f0 = field("f0", int32()); - auto f1 = field("f1", uint8(), false); - auto f2 = field("f2", utf8()); auto metadata0 = key_value_metadata({{"foo", "bar"}, {"bizz", "buzz"}}); auto metadata1 = key_value_metadata({{"foo", "baz"}}); - auto schema0 = ::arrow::schema({f0, f1, f2}, metadata0); - ASSERT_TRUE(metadata0->Equals(*schema0->metadata())); + auto f0 = field("f0", int32()); + auto f1 = field("f1", uint8(), false); + auto f2 = field("f2", utf8(), true); + auto f3 = field("f2", utf8(), true, metadata1->Copy()); + auto schema0 = ::arrow::schema({f0, f1, f2}, metadata0); auto schema1 = ::arrow::schema({f0, f1, f2}, metadata1); - ASSERT_TRUE(metadata1->Equals(*schema1->metadata())); - auto schema2 = ::arrow::schema({f0, f1, f2}, metadata0->Copy()); - ASSERT_TRUE(metadata0->Equals(*schema2->metadata())); + auto schema3 = ::arrow::schema({f0, f1, f3}, metadata0->Copy()); + ASSERT_TRUE(metadata0->Equals(*schema0->metadata())); + ASSERT_TRUE(metadata1->Equals(*schema1->metadata())); + ASSERT_TRUE(metadata0->Equals(*schema2->metadata())); ASSERT_TRUE(schema0->Equals(*schema2)); ASSERT_FALSE(schema0->Equals(*schema1)); ASSERT_FALSE(schema2->Equals(*schema1)); + ASSERT_FALSE(schema2->Equals(*schema3)); // don't check metadata ASSERT_TRUE(schema0->Equals(*schema1, false)); ASSERT_TRUE(schema2->Equals(*schema1, false)); + ASSERT_TRUE(schema2->Equals(*schema3, false)); } TEST_F(TestSchema, TestAddMetadata) { diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index ee7fda7c8c8f4..a8372b96132bd 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -65,13 +65,15 @@ std::vector> Field::Flatten() const { return flattened; } -bool Field::Equals(const Field& other) const { +bool Field::Equals(const Field& other, bool check_metadata) const { if (this == &other) { return true; } if (this->name_ == other.name_ && this->nullable_ == other.nullable_ && this->type_->Equals(*other.type_.get())) { - if (this->HasMetadata() && other.HasMetadata()) { + if (!check_metadata) { + return true; + } else if (this->HasMetadata() && other.HasMetadata()) { return metadata_->Equals(*other.metadata_); } else if (!this->HasMetadata() && !other.HasMetadata()) { return true; @@ -82,8 +84,8 @@ bool Field::Equals(const Field& other) const { return false; } -bool Field::Equals(const std::shared_ptr& other) const { - return Equals(*other.get()); +bool Field::Equals(const std::shared_ptr& other, bool check_metadata) const { + return Equals(*other.get(), check_metadata); } std::string Field::ToString() const { @@ -333,7 +335,7 @@ bool Schema::Equals(const Schema& other, bool check_metadata) const { return false; } for (int i = 0; i < num_fields(); ++i) 
{ - if (!field(i)->Equals(*other.field(i).get())) { + if (!field(i)->Equals(*other.field(i).get(), check_metadata)) { return false; } } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index eb00f43caa172..0758ced80ad0c 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -265,8 +265,8 @@ class ARROW_EXPORT Field { std::vector> Flatten() const; - bool Equals(const Field& other) const; - bool Equals(const std::shared_ptr& other) const; + bool Equals(const Field& other, bool check_metadata = true) const; + bool Equals(const std::shared_ptr& other, bool check_metadata = true) const; /// \brief Return a string representation ot the field std::string ToString() const; diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index 5385c3c8c41d9..8549d61c3456f 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -334,6 +334,20 @@ def test_schema_equals(): assert not sch1.equals(sch3) +def test_schema_equals_propagates_check_metadata(): + # ARROW-4088 + schema1 = pa.schema([ + pa.field('foo', pa.int32()), + pa.field('bar', pa.string()) + ]) + schema2 = pa.schema([ + pa.field('foo', pa.int32()), + pa.field('bar', pa.string(), metadata={'a': 'alpha'}), + ]) + assert not schema1.equals(schema2) + assert schema1.equals(schema2, check_metadata=False) + + def test_schema_equality_operators(): fields = [ pa.field('foo', pa.int32()), From 0696eb591f4707377067b53ecdc9be1dbc4c6a34 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Thu, 27 Dec 2018 18:11:30 +0100 Subject: [PATCH 119/328] ARROW-3932: [Python] Include Benchmarks.md in Sphinx docs Author: Uwe L. Korn Closes #3249 from xhochy/ARROW-3932 and squashes the following commits: 8e969c1b Link to Sphinx documentation for benchmarks 06c3b8d0 ARROW-3932: Include Benchmarks.md in Sphinx docs --- docs/Benchmarks.md | 29 ----------------- docs/source/python/benchmarks.rst | 53 +++++++++++++++++++++++++++++++ docs/source/python/index.rst | 1 + python/README-benchmarks.md | 47 --------------------------- python/README.md | 3 ++ 5 files changed, 57 insertions(+), 76 deletions(-) delete mode 100644 docs/Benchmarks.md create mode 100644 docs/source/python/benchmarks.rst delete mode 100644 python/README-benchmarks.md diff --git a/docs/Benchmarks.md b/docs/Benchmarks.md deleted file mode 100644 index c84bf0dc1eb62..0000000000000 --- a/docs/Benchmarks.md +++ /dev/null @@ -1,29 +0,0 @@ - -## Benchmark Requirements - -The benchmarks are run using [asv][1] which is also their only requirement. - -## Running the benchmarks - -To run the benchmarks, call `asv run --python=same`. You cannot use the -plain `asv run` command at the moment as asv cannot handle python packages -in subdirectories of a repository. - -[1]: https://asv.readthedocs.org/ diff --git a/docs/source/python/benchmarks.rst b/docs/source/python/benchmarks.rst new file mode 100644 index 0000000000000..6c3144ae58637 --- /dev/null +++ b/docs/source/python/benchmarks.rst @@ -0,0 +1,53 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. 
software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Benchmarks +========== + +The ``pyarrow`` package comes with a suite of benchmarks meant to +run with `asv`_. You'll need to install the ``asv`` package first +(``pip install asv`` or ``conda install -c conda-forge asv``). + +The benchmarks are run using `asv`_ which is also their only requirement. + +Running the benchmarks +---------------------- + +To run the benchmarks, call ``asv run --python=same``. You cannot use the +plain ``asv run`` command at the moment as asv cannot handle python packages +in subdirectories of a repository. + +Running with arbitrary revisions +-------------------------------- + +ASV allows to store results and generate graphs of the benchmarks over +the project's evolution. For this you have the latest development version of ASV: + +.. code:: + + pip install git+https://github.com/airspeed-velocity/asv + +Now you should be ready to run ``asv run`` or whatever other command +suits your needs. + +Compatibility +------------- + +We only expect the benchmarking setup to work with Python 3.6 or later, +on a Unix-like system. + +.. asv:: https://asv.readthedocs.org/ diff --git a/docs/source/python/index.rst b/docs/source/python/index.rst index cf691e37eaa25..fe04a73f32ef2 100644 --- a/docs/source/python/index.rst +++ b/docs/source/python/index.rst @@ -47,3 +47,4 @@ files into Arrow structures. api development getting_involved + benchmarks diff --git a/python/README-benchmarks.md b/python/README-benchmarks.md deleted file mode 100644 index 77901f3f020bb..0000000000000 --- a/python/README-benchmarks.md +++ /dev/null @@ -1,47 +0,0 @@ - - -# Benchmarks - -The `pyarrow` package comes with a suite of benchmarks meant to -run with [ASV](https://asv.readthedocs.io). You'll need to install -the `asv` package first (`pip install asv`). - -## Running with your local tree - -When developing, the simplest and fastest way to run the benchmark suite -against your local changes is to use the `asv dev` command. This will -use your current Python interpreter and environment. - -## Running with arbitrary revisions - -ASV allows to store results and generate graphs of the benchmarks over -the project's evolution. For this you have the latest development version of ASV: - -```shell -pip install git+https://github.com/airspeed-velocity/asv -``` - -Now you should be ready to run `asv run` or whatever other command -suits your needs. - -## Compatibility - -We only expect the benchmarking setup to work with Python 3.6 or later, -on a Unix-like system. diff --git a/python/README.md b/python/README.md index ce696939929f9..ce7bdde999eed 100644 --- a/python/README.md +++ b/python/README.md @@ -76,6 +76,8 @@ pytest pyarrow --help and look for the "custom options" section. +For running the benchmarks, see the [Sphinx documentation][5]. 
+ ### Building the documentation ```bash @@ -86,3 +88,4 @@ python setup.py build_sphinx -s ../docs/source [2]: https://github.com/apache/arrow/blob/master/docs/source/python/development.rst [3]: https://github.com/pandas-dev/pandas [4]: https://docs.pytest.org/en/latest/ +[5]: https://arrow.apache.org/docs/latest/python/benchmarks.html From 9b03947c4369cb1b4d82022df00629baf2b6eb00 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 27 Dec 2018 12:17:50 -0600 Subject: [PATCH 120/328] ARROW-3928: [Python] Deduplicate Python objects when converting binary, string, date, time types to object arrays MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds a `deduplicate_objects` option to all of the `to_pandas` methods. It works with string types, date types (when `date_as_object=True`), and time types. I also made it so that `ScalarMemoTable` can be used with `string_view`, for more efficient memoization in this case. I made the default for `deduplicate_objects` is True. When the ratio of unique strings to the length of the array is low, not only does this use drastically less memory, it is also faster. I will write some benchmarks to show where the "crossover point" is when the overhead of hashing makes things slower. Let's consider a simple case where we have 10,000,000 strings of length 10, but only 1000 unique values: ``` In [50]: import pandas.util.testing as tm In [51]: unique_values = [tm.rands(10) for i in range(1000)] In [52]: values = unique_values * 10000 In [53]: arr = pa.array(values) In [54]: timeit arr.to_pandas() 236 ms ± 1.69 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) In [55]: timeit arr.to_pandas(deduplicate_objects=False) 730 ms ± 12.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) ``` Almost 3 times faster in this case. The different in memory use is even more drastic ``` In [44]: unique_values = [tm.rands(10) for i in range(1000)] In [45]: values = unique_values * 10000 In [46]: arr = pa.array(values) In [49]: %memit result11 = arr.to_pandas() peak memory: 1505.89 MiB, increment: 76.27 MiB In [50]: %memit result12 = arr.to_pandas(deduplicate_objects=False) peak memory: 2202.29 MiB, increment: 696.11 MiB ``` As you can see, this is a huge problem. If our bug reports about Parquet memory use problems are any indication, users have been suffering from this issue for a long time. When the strings are mostly unique, then things are slower as expected, the peak memory use is higher because of the hash table ``` In [17]: unique_values = [tm.rands(10) for i in range(500000)] In [18]: values = unique_values * 2 In [19]: arr = pa.array(values) In [20]: timeit result = arr.to_pandas() 177 ms ± 574 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) In [21]: timeit result = arr.to_pandas(deduplicate_objects=False) 70.1 ms ± 783 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) In [42]: %memit result8 = arr.to_pandas() peak memory: 644.39 MiB, increment: 92.23 MiB In [43]: %memit result9 = arr.to_pandas(deduplicate_objects=False) peak memory: 610.85 MiB, increment: 58.41 MiB ``` In real world work, many duplicated strings is the most common use case. Given the massive memory use and moderate performance improvements, it makes sense to have this enabled by default. 
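A minimal usage sketch of the new keyword (hypothetical data; only the `deduplicate_objects` option described above is assumed, and it defaults to `True`):

```python
import pyarrow as pa

# Many repeated strings: the common real-world case described above
arr = pa.array(["foo", "bar", "foo"] * 1000000)

# Default behavior: equal strings are memoized, so repeated values share a
# single Python object and peak memory stays low.
deduplicated = arr.to_pandas()

# Opt out to materialize one Python object per element (the old behavior).
materialized = arr.to_pandas(deduplicate_objects=False)
```
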
Author: Wes McKinney Closes #3257 from wesm/ARROW-3928 and squashes the following commits: d9a88700 Prettier output a00b51c7 Add benchmarks for object deduplication ca88b963 Add Python unit tests, deduplicate for date and time types also when converting to Python objects 7a7873b8 First working iteration of string deduplication when calling to_pandas --- cpp/src/arrow/python/arrow_to_pandas.cc | 286 +++++++++++--------- cpp/src/arrow/python/arrow_to_pandas.h | 41 +-- cpp/src/arrow/type.cc | 7 +- cpp/src/arrow/type.h | 9 +- cpp/src/arrow/type_traits.h | 5 + cpp/src/arrow/util/hashing.h | 21 +- python/benchmarks/convert_pandas.py | 22 ++ python/pyarrow/array.pxi | 91 ++++--- python/pyarrow/compat.py | 6 +- python/pyarrow/includes/libarrow.pxd | 9 +- python/pyarrow/lib.pxd | 14 +- python/pyarrow/pandas_compat.py | 5 +- python/pyarrow/table.pxi | 160 +---------- python/pyarrow/tests/test_convert_pandas.py | 85 ++++++ 14 files changed, 409 insertions(+), 352 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 29d64355bdaed..b532bfb705acd 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -36,9 +36,11 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/hashing.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/parallel.h" +#include "arrow/util/string_view.h" #include "arrow/visitor_inline.h" #include "arrow/compute/api.h" @@ -75,21 +77,21 @@ template struct WrapBytes {}; template <> -struct WrapBytes { +struct WrapBytes { static inline PyObject* Wrap(const char* data, int64_t length) { return PyUnicode_FromStringAndSize(data, length); } }; template <> -struct WrapBytes { +struct WrapBytes { static inline PyObject* Wrap(const char* data, int64_t length) { return PyBytes_FromStringAndSize(data, length); } }; template <> -struct WrapBytes { +struct WrapBytes { static inline PyObject* Wrap(const char* data, int64_t length) { return PyBytes_FromStringAndSize(data, length); } @@ -216,7 +218,7 @@ class PandasBlock { CATEGORICAL }; - PandasBlock(PandasOptions options, int64_t num_rows, int num_columns) + PandasBlock(const PandasOptions& options, int64_t num_rows, int num_columns) : num_rows_(num_rows), num_columns_(num_columns), options_(options) {} virtual ~PandasBlock() {} @@ -301,8 +303,8 @@ inline const T* GetPrimitiveValues(const Array& arr) { } template -inline void ConvertIntegerWithNulls(PandasOptions options, const ChunkedArray& data, - double* out_values) { +inline void ConvertIntegerWithNulls(const PandasOptions& options, + const ChunkedArray& data, double* out_values) { for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = *data.chunk(c); const T* in_values = GetPrimitiveValues(arr); @@ -315,8 +317,8 @@ inline void ConvertIntegerWithNulls(PandasOptions options, const ChunkedArray& d } template -inline void ConvertIntegerNoNullsSameType(PandasOptions options, const ChunkedArray& data, - T* out_values) { +inline void ConvertIntegerNoNullsSameType(const PandasOptions& options, + const ChunkedArray& data, T* out_values) { for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = *data.chunk(c); if (arr.length() > 0) { @@ -328,8 +330,8 @@ inline void ConvertIntegerNoNullsSameType(PandasOptions options, const ChunkedAr } template -inline void ConvertIntegerNoNullsCast(PandasOptions options, const ChunkedArray& data, - OutType* out_values) { +inline void 
ConvertIntegerNoNullsCast(const PandasOptions& options, + const ChunkedArray& data, OutType* out_values) { for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = *data.chunk(c); const InType* in_values = GetPrimitiveValues(arr); @@ -339,8 +341,8 @@ inline void ConvertIntegerNoNullsCast(PandasOptions options, const ChunkedArray& } } -static Status ConvertBooleanWithNulls(PandasOptions options, const ChunkedArray& data, - PyObject** out_values) { +static Status ConvertBooleanWithNulls(const PandasOptions& options, + const ChunkedArray& data, PyObject** out_values) { PyAcquireGIL lock; for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = checked_cast(*data.chunk(c)); @@ -363,7 +365,7 @@ static Status ConvertBooleanWithNulls(PandasOptions options, const ChunkedArray& return Status::OK(); } -static void ConvertBooleanNoNulls(PandasOptions options, const ChunkedArray& data, +static void ConvertBooleanNoNulls(const PandasOptions& options, const ChunkedArray& data, uint8_t* out_values) { for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = checked_cast(*data.chunk(c)); @@ -373,57 +375,106 @@ static void ConvertBooleanNoNulls(PandasOptions options, const ChunkedArray& dat } } -template -static Status ConvertIntegerObjects(PandasOptions options, const ChunkedArray& data, - PyObject** out_values) { - PyAcquireGIL lock; - constexpr bool is_signed = std::is_signed::value; - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = *data.chunk(c); - const auto* in_values = GetPrimitiveValues(arr); - - for (int i = 0; i < arr.length(); ++i) { - if (arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values++ = Py_None; - } else { - *out_values++ = is_signed ? PyLong_FromLongLong(in_values[i]) - : PyLong_FromUnsignedLongLong(in_values[i]); - RETURN_IF_PYERROR(); - } +// Generic Array -> PyObject** converter that handles object deduplication, if +// requested +template +inline Status WriteArrayObjects(const ArrayType& arr, WriteValue&& write_func, + PyObject** out_values) { + const bool has_nulls = arr.null_count() > 0; + for (int64_t i = 0; i < arr.length(); ++i) { + if (has_nulls && arr.IsNull(i)) { + Py_INCREF(Py_None); + *out_values = Py_None; + } else { + RETURN_NOT_OK(write_func(arr.GetView(i), out_values)); } + ++out_values; } return Status::OK(); } -template -inline Status ConvertBinaryLike(PandasOptions options, const ChunkedArray& data, - PyObject** out_values) { +template +struct MemoizationTraits { + using Scalar = typename T::c_type; +}; + +template +struct MemoizationTraits> { + // For binary, we memoize string_view as a scalar value to avoid having to + // unnecessarily copy the memory into the memo table data structure + using Scalar = util::string_view; +}; + +template +inline Status ConvertAsPyObjects(const PandasOptions& options, const ChunkedArray& data, + WrapFunction&& wrap_func, PyObject** out_values) { using ArrayType = typename TypeTraits::ArrayType; + using Scalar = typename MemoizationTraits::Scalar; + PyAcquireGIL lock; + ::arrow::internal::ScalarMemoTable memo_table; + std::vector unique_values; + int32_t memo_size = 0; + + auto WrapMemoized = [&](const Scalar& value, PyObject** out_values) { + int32_t memo_index = memo_table.GetOrInsert(value); + if (memo_index == memo_size) { + // New entry + RETURN_NOT_OK(wrap_func(value, out_values)); + unique_values.push_back(*out_values); + ++memo_size; + } else { + // Duplicate entry + Py_INCREF(unique_values[memo_index]); + *out_values = unique_values[memo_index]; + } + return Status::OK(); + }; + + auto 
WrapUnmemoized = [&](const Scalar& value, PyObject** out_values) { + return wrap_func(value, out_values); + }; + for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = checked_cast(*data.chunk(c)); - - const bool has_nulls = data.null_count() > 0; - for (int64_t i = 0; i < arr.length(); ++i) { - if (has_nulls && arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values = Py_None; - } else { - auto view = arr.GetView(i); - *out_values = WrapBytes::Wrap(view.data(), view.length()); - if (*out_values == nullptr) { - PyErr_Clear(); - return Status::UnknownError("Wrapping ", view, " failed"); - } - } - ++out_values; + if (options.deduplicate_objects) { + RETURN_NOT_OK(WriteArrayObjects(arr, WrapMemoized, out_values)); + } else { + RETURN_NOT_OK(WriteArrayObjects(arr, WrapUnmemoized, out_values)); } + out_values += arr.length(); } return Status::OK(); } -inline Status ConvertNulls(PandasOptions options, const ChunkedArray& data, +template +static Status ConvertIntegerObjects(const PandasOptions& options, + const ChunkedArray& data, PyObject** out_values) { + using T = typename Type::c_type; + auto WrapValue = [](T value, PyObject** out) { + *out = std::is_signed::value ? PyLong_FromLongLong(value) + : PyLong_FromUnsignedLongLong(value); + RETURN_IF_PYERROR(); + return Status::OK(); + }; + return ConvertAsPyObjects(options, data, WrapValue, out_values); +} + +template +inline Status ConvertBinaryLike(const PandasOptions& options, const ChunkedArray& data, + PyObject** out_values) { + auto WrapValue = [](const util::string_view& view, PyObject** out) { + *out = WrapBytes::Wrap(view.data(), view.length()); + if (*out == nullptr) { + PyErr_Clear(); + return Status::UnknownError("Wrapping ", view, " failed"); + } + return Status::OK(); + }; + return ConvertAsPyObjects(options, data, WrapValue, out_values); +} + +inline Status ConvertNulls(const PandasOptions& options, const ChunkedArray& data, PyObject** out_values) { PyAcquireGIL lock; for (int c = 0; c < data.num_chunks(); c++) { @@ -439,7 +490,7 @@ inline Status ConvertNulls(PandasOptions options, const ChunkedArray& data, return Status::OK(); } -inline Status ConvertStruct(PandasOptions options, const ChunkedArray& data, +inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& data, PyObject** out_values) { PyAcquireGIL lock; if (data.num_chunks() <= 0) { @@ -503,7 +554,8 @@ inline Status ConvertStruct(PandasOptions options, const ChunkedArray& data, } template -inline Status ConvertListsLike(PandasOptions options, const std::shared_ptr& col, +inline Status ConvertListsLike(const PandasOptions& options, + const std::shared_ptr& col, PyObject** out_values) { const ChunkedArray& data = *col->data().get(); const auto& list_type = checked_cast(*col->type()); @@ -604,69 +656,40 @@ inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* out_values) } } -template -static Status ConvertDates(PandasOptions options, const ChunkedArray& data, +template +static Status ConvertDates(const PandasOptions& options, const ChunkedArray& data, PyObject** out_values) { - using ArrayType = typename TypeTraits::ArrayType; - - PyAcquireGIL lock; - OwnedRef date_ref; - - PyDateTime_IMPORT; - - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = checked_cast(*data.chunk(c)); - auto type = std::dynamic_pointer_cast(arr.type()); - DCHECK(type); - - const DateUnit unit = type->unit(); - - for (int64_t i = 0; i < arr.length(); ++i) { - if (arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values++ = Py_None; - } else { - 
RETURN_NOT_OK(PyDate_from_int(arr.Value(i), unit, out_values++)); - RETURN_IF_PYERROR(); - } - } + { + PyAcquireGIL lock; + PyDateTime_IMPORT; } - - return Status::OK(); + auto WrapValue = [](typename Type::c_type value, PyObject** out) { + RETURN_NOT_OK(PyDate_from_int(value, Type::UNIT, out)); + RETURN_IF_PYERROR(); + return Status::OK(); + }; + return ConvertAsPyObjects(options, data, WrapValue, out_values); } -template -static Status ConvertTimes(PandasOptions options, const ChunkedArray& data, +template +static Status ConvertTimes(const PandasOptions& options, const ChunkedArray& data, PyObject** out_values) { - using ArrayType = typename TypeTraits::ArrayType; - - PyAcquireGIL lock; - OwnedRef time_ref; - - PyDateTime_IMPORT; - - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = checked_cast(*data.chunk(c)); - auto type = std::dynamic_pointer_cast(arr.type()); - DCHECK(type); - - const TimeUnit::type unit = type->unit(); - - for (int64_t i = 0; i < arr.length(); ++i) { - if (arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values++ = Py_None; - } else { - RETURN_NOT_OK(PyTime_from_int(arr.Value(i), unit, out_values++)); - RETURN_IF_PYERROR(); - } - } + { + PyAcquireGIL lock; + PyDateTime_IMPORT; } - return Status::OK(); + const TimeUnit::type unit = checked_cast(*data.type()).unit(); + + auto WrapValue = [unit](typename Type::c_type value, PyObject** out) { + RETURN_NOT_OK(PyTime_from_int(value, unit, out)); + RETURN_IF_PYERROR(); + return Status::OK(); + }; + return ConvertAsPyObjects(options, data, WrapValue, out_values); } -static Status ConvertDecimals(PandasOptions options, const ChunkedArray& data, +static Status ConvertDecimals(const PandasOptions& options, const ChunkedArray& data, PyObject** out_values) { PyAcquireGIL lock; OwnedRef decimal; @@ -715,21 +738,21 @@ class ObjectBlock : public PandasBlock { if (type == Type::BOOL) { RETURN_NOT_OK(ConvertBooleanWithNulls(options_, data, out_buffer)); } else if (type == Type::UINT8) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::INT8) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::UINT16) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::INT16) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::UINT32) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::INT32) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::UINT64) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::INT64) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::BINARY) { RETURN_NOT_OK(ConvertBinaryLike(options_, data, out_buffer)); } else if (type == Type::STRING) { @@ -1009,7 +1032,8 @@ class DatetimeBlock : public PandasBlock { class DatetimeTZBlock : public 
DatetimeBlock { public: - DatetimeTZBlock(PandasOptions options, const std::string& timezone, int64_t num_rows) + DatetimeTZBlock(const PandasOptions& options, const std::string& timezone, + int64_t num_rows) : DatetimeBlock(options, num_rows, 1), timezone_(timezone) {} // Like Categorical, the internal ndarray is 1-dimensional @@ -1038,7 +1062,8 @@ class DatetimeTZBlock : public DatetimeBlock { class CategoricalBlock : public PandasBlock { public: - explicit CategoricalBlock(PandasOptions options, MemoryPool* pool, int64_t num_rows) + explicit CategoricalBlock(const PandasOptions& options, MemoryPool* pool, + int64_t num_rows) : PandasBlock(options, num_rows, 1), pool_(pool), needs_copy_(false) {} Status Allocate() override { @@ -1235,7 +1260,7 @@ class CategoricalBlock : public PandasBlock { bool needs_copy_; }; -Status MakeBlock(PandasOptions options, PandasBlock::type type, int64_t num_rows, +Status MakeBlock(const PandasOptions& options, PandasBlock::type type, int64_t num_rows, int num_columns, std::shared_ptr* block) { #define BLOCK_CASE(NAME, TYPE) \ case PandasBlock::NAME: \ @@ -1518,7 +1543,7 @@ class DataFrameBlockCreator { class ArrowDeserializer { public: - ArrowDeserializer(PandasOptions options, const std::shared_ptr& col, + ArrowDeserializer(const PandasOptions& options, const std::shared_ptr& col, PyObject* py_ref) : col_(col), data_(*col->data().get()), options_(options), py_ref_(py_ref) {} @@ -1532,7 +1557,7 @@ class ArrowDeserializer { } template - Status ConvertValuesZeroCopy(PandasOptions options, int npy_type, + Status ConvertValuesZeroCopy(const PandasOptions& options, int npy_type, const std::shared_ptr& arr) { typedef typename internal::arrow_traits::T T; @@ -1738,9 +1763,7 @@ class ArrowDeserializer { if (data_.null_count() > 0) { if (options_.integer_object_nulls) { - using c_type = typename Type::c_type; - - return VisitObjects(ConvertIntegerObjects); + return VisitObjects(ConvertIntegerObjects); } else { RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); @@ -1878,15 +1901,16 @@ class ArrowDeserializer { PyObject* result_; }; -Status ConvertArrayToPandas(PandasOptions options, const std::shared_ptr& arr, - PyObject* py_ref, PyObject** out) { +Status ConvertArrayToPandas(const PandasOptions& options, + const std::shared_ptr& arr, PyObject* py_ref, + PyObject** out) { static std::string dummy_name = "dummy"; auto field = std::make_shared(dummy_name, arr->type()); auto col = std::make_shared(field, arr); return ConvertColumnToPandas(options, col, py_ref, out); } -Status ConvertChunkedArrayToPandas(PandasOptions options, +Status ConvertChunkedArrayToPandas(const PandasOptions& options, const std::shared_ptr& ca, PyObject* py_ref, PyObject** out) { static std::string dummy_name = "dummy"; @@ -1895,19 +1919,21 @@ Status ConvertChunkedArrayToPandas(PandasOptions options, return ConvertColumnToPandas(options, col, py_ref, out); } -Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr& col, - PyObject* py_ref, PyObject** out) { +Status ConvertColumnToPandas(const PandasOptions& options, + const std::shared_ptr& col, PyObject* py_ref, + PyObject** out) { ArrowDeserializer converter(options, col, py_ref); return converter.Convert(out); } -Status ConvertTableToPandas(PandasOptions options, const std::shared_ptr
& table, - MemoryPool* pool, PyObject** out) { +Status ConvertTableToPandas(const PandasOptions& options, + const std::shared_ptr
& table, MemoryPool* pool, + PyObject** out) { return ConvertTableToPandas(options, std::unordered_set(), table, pool, out); } -Status ConvertTableToPandas(PandasOptions options, +Status ConvertTableToPandas(const PandasOptions& options, const std::unordered_set& categorical_columns, const std::shared_ptr
& table, MemoryPool* pool, PyObject** out) { diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h index 753bf4823566b..20bad40971020 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.h +++ b/cpp/src/arrow/python/arrow_to_pandas.h @@ -43,32 +43,32 @@ namespace py { struct PandasOptions { /// If true, we will convert all string columns to categoricals - bool strings_to_categorical; - bool zero_copy_only; - bool integer_object_nulls; - bool date_as_object; - bool use_threads; - - PandasOptions() - : strings_to_categorical(false), - zero_copy_only(false), - integer_object_nulls(false), - date_as_object(false), - use_threads(false) {} + bool strings_to_categorical = false; + bool zero_copy_only = false; + bool integer_object_nulls = false; + bool date_as_object = false; + bool use_threads = false; + + /// \brief If true, do not create duplicate PyObject versions of equal + /// objects. This only applies to immutable objects like strings or datetime + /// objects + bool deduplicate_objects = false; }; ARROW_PYTHON_EXPORT -Status ConvertArrayToPandas(PandasOptions options, const std::shared_ptr& arr, - PyObject* py_ref, PyObject** out); +Status ConvertArrayToPandas(const PandasOptions& options, + const std::shared_ptr& arr, PyObject* py_ref, + PyObject** out); ARROW_PYTHON_EXPORT -Status ConvertChunkedArrayToPandas(PandasOptions options, +Status ConvertChunkedArrayToPandas(const PandasOptions& options, const std::shared_ptr& col, PyObject* py_ref, PyObject** out); ARROW_PYTHON_EXPORT -Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr& col, - PyObject* py_ref, PyObject** out); +Status ConvertColumnToPandas(const PandasOptions& options, + const std::shared_ptr& col, PyObject* py_ref, + PyObject** out); // Convert a whole table as efficiently as possible to a pandas.DataFrame. // @@ -77,15 +77,16 @@ Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr& table, - MemoryPool* pool, PyObject** out); +Status ConvertTableToPandas(const PandasOptions& options, + const std::shared_ptr
& table, MemoryPool* pool, + PyObject** out); /// Convert a whole table as efficiently as possible to a pandas.DataFrame. /// /// Explicitly name columns that should be a categorical /// This option is only used on conversions that are applied to a table. ARROW_PYTHON_EXPORT -Status ConvertTableToPandas(PandasOptions options, +Status ConvertTableToPandas(const PandasOptions& options, const std::unordered_set& categorical_columns, const std::shared_ptr
& table, MemoryPool* pool, PyObject** out); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index a8372b96132bd..cd57e2dfb2119 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -137,12 +137,11 @@ std::string FixedSizeBinaryType::ToString() const { // ---------------------------------------------------------------------- // Date types -DateType::DateType(Type::type type_id, DateUnit unit) - : FixedWidthType(type_id), unit_(unit) {} +DateType::DateType(Type::type type_id) : FixedWidthType(type_id) {} -Date32Type::Date32Type() : DateType(Type::DATE32, DateUnit::DAY) {} +Date32Type::Date32Type() : DateType(Type::DATE32) {} -Date64Type::Date64Type() : DateType(Type::DATE64, DateUnit::MILLI) {} +Date64Type::Date64Type() : DateType(Type::DATE64) {} std::string Date64Type::ToString() const { return std::string("date64[ms]"); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 0758ced80ad0c..6c3643c6344c8 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -600,17 +600,17 @@ enum class DateUnit : char { DAY = 0, MILLI = 1 }; /// \brief Base type class for date data class ARROW_EXPORT DateType : public FixedWidthType { public: - DateUnit unit() const { return unit_; } + virtual DateUnit unit() const = 0; protected: - DateType(Type::type type_id, DateUnit unit); - DateUnit unit_; + explicit DateType(Type::type type_id); }; /// Concrete type class for 32-bit date data (as number of days since UNIX epoch) class ARROW_EXPORT Date32Type : public DateType { public: static constexpr Type::type type_id = Type::DATE32; + static constexpr DateUnit UNIT = DateUnit::DAY; using c_type = int32_t; @@ -622,12 +622,14 @@ class ARROW_EXPORT Date32Type : public DateType { std::string ToString() const override; std::string name() const override { return "date32"; } + DateUnit unit() const override { return UNIT; } }; /// Concrete type class for 64-bit date data (as number of milliseconds since UNIX epoch) class ARROW_EXPORT Date64Type : public DateType { public: static constexpr Type::type type_id = Type::DATE64; + static constexpr DateUnit UNIT = DateUnit::MILLI; using c_type = int64_t; @@ -639,6 +641,7 @@ class ARROW_EXPORT Date64Type : public DateType { std::string ToString() const override; std::string name() const override { return "date64"; } + DateUnit unit() const override { return UNIT; } }; struct TimeUnit { diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index da5cf25f5eed1..b89f52f2da661 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -371,6 +371,11 @@ template using enable_if_boolean = typename std::enable_if::value>::type; +template +using enable_if_binary_like = + typename std::enable_if::value || + std::is_base_of::value>::type; + template using enable_if_fixed_size_binary = typename std::enable_if::value>::type; diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h index 76724b2a30035..3dde0beeb194e 100644 --- a/cpp/src/arrow/util/hashing.h +++ b/cpp/src/arrow/util/hashing.h @@ -102,6 +102,18 @@ struct ScalarHelper +struct ScalarHelper< + Scalar, AlgNum, + typename std::enable_if::value>::type> + : public ScalarHelperBase { + // ScalarHelper specialization for util::string_view + + static hash_t ComputeHash(const util::string_view& value) { + return ComputeStringHash(value.data(), static_cast(value.size())); + } +}; + template struct ScalarHelper::value>::type> @@ -332,7 +344,7 @@ class ScalarMemoTable { explicit ScalarMemoTable(int64_t entries = 0) : 
hash_table_(static_cast(entries)) {} - int32_t Get(const Scalar value) const { + int32_t Get(const Scalar& value) const { auto cmp_func = [value](const Payload* payload) -> bool { return ScalarHelper::CompareScalars(payload->value, value); }; @@ -346,7 +358,7 @@ class ScalarMemoTable { } template - int32_t GetOrInsert(const Scalar value, Func1&& on_found, Func2&& on_not_found) { + int32_t GetOrInsert(const Scalar& value, Func1&& on_found, Func2&& on_not_found) { auto cmp_func = [value](const Payload* payload) -> bool { return ScalarHelper::CompareScalars(value, payload->value); }; @@ -364,7 +376,7 @@ class ScalarMemoTable { return memo_index; } - int32_t GetOrInsert(const Scalar value) { + int32_t GetOrInsert(const Scalar& value) { return GetOrInsert(value, [](int32_t i) {}, [](int32_t i) {}); } @@ -389,6 +401,7 @@ class ScalarMemoTable { Scalar value; int32_t memo_index; }; + using HashTableType = HashTableTemplateType; using HashTableEntry = typename HashTableType::Entry; HashTableType hash_table_; @@ -621,9 +634,11 @@ class BinaryMemoTable { struct Payload { int32_t memo_index; }; + using HashTableType = HashTable; using HashTableEntry = typename HashTable::Entry; HashTableType hash_table_; + std::vector offsets_; std::string values_; diff --git a/python/benchmarks/convert_pandas.py b/python/benchmarks/convert_pandas.py index 244b3dcc84713..bb8d7102ea783 100644 --- a/python/benchmarks/convert_pandas.py +++ b/python/benchmarks/convert_pandas.py @@ -17,6 +17,8 @@ import numpy as np import pandas as pd +import pandas.util.testing as tm + import pyarrow as pa @@ -50,6 +52,26 @@ def time_to_series(self, n, dtype): self.arrow_data.to_pandas() +class ToPandasStrings(object): + + param_names = ('uniqueness', 'total') + params = ((0.001, 0.01, 0.1, 0.5), (1000000,)) + string_length = 25 + + def setup(self, uniqueness, total): + nunique = int(total * uniqueness) + unique_values = [tm.rands(self.string_length) for i in range(nunique)] + values = unique_values * (total // nunique) + self.arr = pa.array(values, type=pa.string()) + self.table = pa.Table.from_arrays([self.arr], ['f0']) + + def time_to_pandas_dedup(self, *args): + self.arr.to_pandas() + + def time_to_pandas_no_dedup(self, *args): + self.arr.to_pandas(deduplicate_objects=False) + + class ZeroCopyPandasRead(object): def setup(self): diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index b86872f7ea98d..ef95efe71b33c 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -339,7 +339,61 @@ def _restore_array(data): return pyarrow_wrap_array(MakeArray(ad)) -cdef class Array: +cdef class _PandasConvertible: + + def to_pandas(self, categories=None, bint strings_to_categorical=False, + bint zero_copy_only=False, bint integer_object_nulls=False, + bint date_as_object=False, + bint use_threads=True, + bint deduplicate_objects=True, + bint ignore_metadata=False): + """ + Convert to a pandas-compatible NumPy array or DataFrame, as appropriate + + Parameters + ---------- + strings_to_categorical : boolean, default False + Encode string (UTF8) and binary types to pandas.Categorical + categories: list, default empty + List of fields that should be returned as pandas.Categorical. 
Only + applies to table-like data structures + zero_copy_only : boolean, default False + Raise an ArrowException if this function call would require copying + the underlying data + integer_object_nulls : boolean, default False + Cast integers with nulls to objects + date_as_object : boolean, default False + Cast dates to objects + use_threads: boolean, default True + Whether to parallelize the conversion using multiple threads + deduplicate_objects : boolean, default False + Do not create multiple copies Python objects when created, to save + on memory use. Conversion will be slower + ignore_metadata : boolean, default False + If True, do not use the 'pandas' metadata to reconstruct the + DataFrame index, if present + + Returns + ------- + NumPy array or DataFrame depending on type of object + """ + cdef: + PyObject* out + PandasOptions options + + options = PandasOptions( + strings_to_categorical=strings_to_categorical, + zero_copy_only=zero_copy_only, + integer_object_nulls=integer_object_nulls, + date_as_object=date_as_object, + use_threads=use_threads, + deduplicate_objects=deduplicate_objects) + + return self._to_pandas(options, categories=categories, + ignore_metadata=ignore_metadata) + + +cdef class Array(_PandasConvertible): def __init__(self): raise TypeError("Do not call {}'s constructor directly, use one of " @@ -602,42 +656,13 @@ cdef class Array: return pyarrow_wrap_array(result) - def to_pandas(self, bint strings_to_categorical=False, - bint zero_copy_only=False, bint integer_object_nulls=False, - bint date_as_object=False): - """ - Convert to a NumPy array object suitable for use in pandas. - - Parameters - ---------- - strings_to_categorical : boolean, default False - Encode string (UTF8) and binary types to pandas.Categorical - zero_copy_only : boolean, default False - Raise an ArrowException if this function call would require copying - the underlying data - integer_object_nulls : boolean, default False - Cast integers with nulls to objects - date_as_object : boolean, default False - Cast dates to objects - - See also - -------- - Column.to_pandas - Table.to_pandas - RecordBatch.to_pandas - """ + def _to_pandas(self, options, **kwargs): cdef: PyObject* out - PandasOptions options + PandasOptions c_options = options - options = PandasOptions( - strings_to_categorical=strings_to_categorical, - zero_copy_only=zero_copy_only, - integer_object_nulls=integer_object_nulls, - date_as_object=date_as_object, - use_threads=False) with nogil: - check_status(ConvertArrayToPandas(options, self.sp_array, + check_status(ConvertArrayToPandas(c_options, self.sp_array, self, &out)) return wrap_array_output(out) diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py index 068d5607de813..ee924ed388ff1 100644 --- a/python/pyarrow/compat.py +++ b/python/pyarrow/compat.py @@ -192,11 +192,15 @@ def _iterate_python_module_paths(package_name): for finder in sys.meta_path: try: spec = finder.find_spec(absolute_name, None) - except AttributeError: + except (AttributeError, TypeError): # On Travis (Python 3.5) the above produced: # AttributeError: 'VendorImporter' object has no # attribute 'find_spec' + # + # ARROW-4117: When running "asv dev", TypeError is raised + # due to the meta-importer spec = None + if spec is not None: break diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 7ce03bf6eb80c..cc77ff432967f 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1064,20 +1064,20 @@ cdef extern from 
"arrow/python/api.h" namespace "arrow::py" nogil: CStatus TensorToNdarray(const shared_ptr[CTensor]& tensor, object base, PyObject** out) - CStatus ConvertArrayToPandas(PandasOptions options, + CStatus ConvertArrayToPandas(const PandasOptions& options, const shared_ptr[CArray]& arr, object py_ref, PyObject** out) - CStatus ConvertChunkedArrayToPandas(PandasOptions options, + CStatus ConvertChunkedArrayToPandas(const PandasOptions& options, const shared_ptr[CChunkedArray]& arr, object py_ref, PyObject** out) - CStatus ConvertColumnToPandas(PandasOptions options, + CStatus ConvertColumnToPandas(const PandasOptions& options, const shared_ptr[CColumn]& arr, object py_ref, PyObject** out) CStatus ConvertTableToPandas( - PandasOptions options, + const PandasOptions& options, const unordered_set[c_string]& categorical_columns, const shared_ptr[CTable]& table, CMemoryPool* pool, @@ -1110,6 +1110,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: c_bool integer_object_nulls c_bool date_as_object c_bool use_threads + c_bool deduplicate_objects cdef extern from "arrow/python/api.h" namespace 'arrow::py' nogil: diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index d829d6a0c50ad..8cd8f401a2749 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -179,7 +179,11 @@ cdef class FixedSizeBinaryValue(ArrayValue): pass -cdef class Array: +cdef class _PandasConvertible: + pass + + +cdef class Array(_PandasConvertible): cdef: shared_ptr[CArray] sp_array CArray* ap @@ -306,7 +310,7 @@ cdef object box_scalar(DataType type, int64_t index) -cdef class ChunkedArray: +cdef class ChunkedArray(_PandasConvertible): cdef: shared_ptr[CChunkedArray] sp_chunked_array CChunkedArray* chunked_array @@ -315,7 +319,7 @@ cdef class ChunkedArray: cdef getitem(self, int64_t i) -cdef class Column: +cdef class Column(_PandasConvertible): cdef: shared_ptr[CColumn] sp_column CColumn* column @@ -323,7 +327,7 @@ cdef class Column: cdef void init(self, const shared_ptr[CColumn]& column) -cdef class Table: +cdef class Table(_PandasConvertible): cdef: shared_ptr[CTable] sp_table CTable* table @@ -331,7 +335,7 @@ cdef class Table: cdef void init(self, const shared_ptr[CTable]& table) -cdef class RecordBatch: +cdef class RecordBatch(_PandasConvertible): cdef: shared_ptr[CRecordBatch] sp_batch CRecordBatch* batch diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 6acca0c35cf40..a5d8621590f13 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -548,7 +548,7 @@ def _make_datetimetz(tz): # Converting pyarrow.Table efficiently to pandas.DataFrame -def table_to_blockmanager(options, table, memory_pool, categories=None, +def table_to_blockmanager(options, table, categories=None, ignore_metadata=False): from pyarrow.compat import DatetimeTZDtype @@ -624,7 +624,8 @@ def table_to_blockmanager(options, table, memory_pool, categories=None, block_table.schema.get_field_index(raw_name) ) - blocks = _table_to_blocks(options, block_table, memory_pool, categories) + blocks = _table_to_blocks(options, block_table, pa.default_memory_pool(), + categories) # Construct the row index if len(index_arrays) > 1: diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 29a784d60f5a8..59680ed87aa38 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -28,7 +28,7 @@ else: import pyarrow.pandas_compat as pdcompat -cdef class ChunkedArray: +cdef class ChunkedArray(_PandasConvertible): """ Array backed via one or more memory 
chunks. @@ -145,43 +145,14 @@ cdef class ChunkedArray: return result - def to_pandas(self, bint strings_to_categorical=False, - bint zero_copy_only=False, bint integer_object_nulls=False, - bint date_as_object=False): - """ - Convert the arrow::ChunkedArray to an array object suitable for use - in pandas - - Parameters - ---------- - strings_to_categorical : boolean, default False - Encode string (UTF8) and binary types to pandas.Categorical - zero_copy_only : boolean, default False - Raise an ArrowException if this function call would require copying - the underlying data - integer_object_nulls : boolean, default False - Cast integers with nulls to objects - date_as_object : boolean, default False - Cast dates to objects - - See also - -------- - Column.to_pandas - """ + def _to_pandas(self, options, **kwargs): cdef: PyObject* out - PandasOptions options - - options = PandasOptions( - strings_to_categorical=strings_to_categorical, - zero_copy_only=zero_copy_only, - integer_object_nulls=integer_object_nulls, - date_as_object=date_as_object, - use_threads=False) + PandasOptions c_options = options with nogil: check_status(libarrow.ConvertChunkedArrayToPandas( - options, + c_options, self.sp_chunked_array, self, &out)) @@ -385,7 +356,7 @@ def column(object field_or_name, arr): return pyarrow_wrap_column(sp_column) -cdef class Column: +cdef class Column(_PandasConvertible): """ Named vector of elements of equal type. @@ -497,33 +468,8 @@ cdef class Column: return [pyarrow_wrap_column(col) for col in flattened] - def to_pandas(self, bint strings_to_categorical=False, - bint zero_copy_only=False, bint integer_object_nulls=False, - bint date_as_object=False): - """ - Convert the arrow::Column to a pandas.Series - - Parameters - ---------- - strings_to_categorical : boolean, default False - Encode string (UTF8) and binary types to pandas.Categorical - zero_copy_only : boolean, default False - Raise an ArrowException if this function call would require copying - the underlying data - integer_object_nulls : boolean, default False - Cast integers with nulls to objects - date_as_object : boolean, default False - Cast dates to objects - - Returns - ------- - pandas.Series - """ - values = self.data.to_pandas( - strings_to_categorical=strings_to_categorical, - zero_copy_only=zero_copy_only, - date_as_object=date_as_object, - integer_object_nulls=integer_object_nulls) + def _to_pandas(self, options, **kwargs): + values = self.data._to_pandas(options) result = pd.Series(values, name=self.name) if isinstance(self.type, TimestampType): @@ -685,7 +631,7 @@ cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema): schema.reset(new CSchema(c_fields, c_meta)) -cdef class RecordBatch: +cdef class RecordBatch(_PandasConvertible): """ Batch of rows of columns of equal length @@ -887,46 +833,8 @@ cdef class RecordBatch: entries.append((name, column)) return OrderedDict(entries) - def to_pandas(self, MemoryPool memory_pool=None, categories=None, - bint strings_to_categorical=False, bint zero_copy_only=False, - bint integer_object_nulls=False, bint date_as_object=False, - bint use_threads=True, bint ignore_metadata=False): - """ - Convert the arrow::RecordBatch to a pandas DataFrame - - Parameters - ---------- - memory_pool: MemoryPool, optional - Specific memory pool to use to allocate casted columns - categories: list, default empty - List of columns that should be returned as pandas.Categorical - strings_to_categorical : boolean, default False - Encode string (UTF8) and binary types to 
pandas.Categorical - zero_copy_only : boolean, default False - Raise an ArrowException if this function call would require copying - the underlying data - integer_object_nulls : boolean, default False - Cast integers with nulls to objects - date_as_object : boolean, default False - Cast dates to objects - use_threads: boolean, default True - Whether to parallelize the conversion using multiple threads - ignore_metadata : boolean, default False - If True, do not use the 'pandas' metadata to reconstruct the - DataFrame index, if present - - Returns - ------- - pandas.DataFrame - """ - return Table.from_batches([self]).to_pandas( - memory_pool=memory_pool, categories=categories, - strings_to_categorical=strings_to_categorical, - zero_copy_only=zero_copy_only, - integer_object_nulls=integer_object_nulls, - date_as_object=date_as_object, use_threads=use_threads, - ignore_metadata=ignore_metadata - ) + def _to_pandas(self, options, **kwargs): + return Table.from_batches([self])._to_pandas(options, **kwargs) @classmethod def from_pandas(cls, df, Schema schema=None, bint preserve_index=True, @@ -1031,7 +939,7 @@ def table_to_blocks(PandasOptions options, Table table, return PyObject_to_object(result_obj) -cdef class Table: +cdef class Table(_PandasConvertible): """ A collection of top-level named, equal length Arrow arrays. @@ -1386,50 +1294,8 @@ cdef class Table: return result - def to_pandas(self, MemoryPool memory_pool=None, categories=None, - bint strings_to_categorical=False, bint zero_copy_only=False, - bint integer_object_nulls=False, bint date_as_object=False, - bint use_threads=True, bint ignore_metadata=False): - """ - Convert the arrow::Table to a pandas DataFrame - - Parameters - ---------- - memory_pool: MemoryPool, optional - Specific memory pool to use to allocate casted columns - categories: list, default empty - List of columns that should be returned as pandas.Categorical - strings_to_categorical : boolean, default False - Encode string (UTF8) and binary types to pandas.Categorical - zero_copy_only : boolean, default False - Raise an ArrowException if this function call would require copying - the underlying data - integer_object_nulls : boolean, default False - Cast integers with nulls to objects - date_as_object : boolean, default False - Cast dates to objects - use_threads: boolean, default True - Whether to parallelize the conversion using multiple threads - ignore_metadata : boolean, default False - If True, do not use the 'pandas' metadata to reconstruct the - DataFrame index, if present - - Returns - ------- - pandas.DataFrame - """ - cdef: - PandasOptions options - - options = PandasOptions( - strings_to_categorical=strings_to_categorical, - zero_copy_only=zero_copy_only, - integer_object_nulls=integer_object_nulls, - date_as_object=date_as_object, - use_threads=use_threads) - - mgr = pdcompat.table_to_blockmanager(options, self, memory_pool, - categories, + def _to_pandas(self, options, categories=None, ignore_metadata=False): + mgr = pdcompat.table_to_blockmanager(options, self, categories, ignore_metadata=ignore_metadata) return pd.DataFrame(mgr) diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 12214847f3e53..8d8b65b2240b8 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -2316,6 +2316,91 @@ def test_convert_unsupported_type_error_message(): pa.Table.from_pandas(df) +# ---------------------------------------------------------------------- +# Test 
object deduplication in to_pandas + + +def _generate_dedup_example(nunique, repeats): + unique_values = [tm.rands(10) for i in range(nunique)] + return unique_values * repeats + + +def _assert_nunique(obj, expected): + assert len({id(x) for x in obj}) == expected + + +def test_to_pandas_deduplicate_strings_array_types(): + nunique = 100 + repeats = 10 + values = _generate_dedup_example(nunique, repeats) + + for arr in [pa.array(values, type=pa.binary()), + pa.array(values, type=pa.utf8()), + pa.chunked_array([values, values]), + pa.column('foo', [values, values])]: + _assert_nunique(arr.to_pandas(), nunique) + _assert_nunique(arr.to_pandas(deduplicate_objects=False), len(arr)) + + +def test_to_pandas_deduplicate_strings_table_types(): + nunique = 100 + repeats = 10 + values = _generate_dedup_example(nunique, repeats) + + arr = pa.array(values) + rb = pa.RecordBatch.from_arrays([arr], ['foo']) + tbl = pa.Table.from_batches([rb]) + + for obj in [rb, tbl]: + _assert_nunique(obj.to_pandas()['foo'], nunique) + _assert_nunique(obj.to_pandas(deduplicate_objects=False)['foo'], + len(obj)) + + +def test_to_pandas_deduplicate_integers_as_objects(): + nunique = 100 + repeats = 10 + + # Python automatically interns smaller integers + unique_values = list(np.random.randint(10000000, 1000000000, size=nunique)) + unique_values[nunique // 2] = None + + arr = pa.array(unique_values * repeats) + + _assert_nunique(arr.to_pandas(integer_object_nulls=True), nunique) + _assert_nunique(arr.to_pandas(integer_object_nulls=True, + deduplicate_objects=False), + # Account for None + (nunique - 1) * repeats + 1) + + +def test_to_pandas_deduplicate_date_time(): + nunique = 100 + repeats = 10 + + unique_values = list(range(nunique)) + + cases = [ + # raw type, array type, to_pandas options + ('int32', 'date32', {'date_as_object': True}), + ('int64', 'date64', {'date_as_object': True}), + ('int32', 'time32[ms]', {}), + ('int64', 'time64[us]', {}) + ] + + for raw_type, array_type, pandas_options in cases: + raw_arr = pa.array(unique_values * repeats, type=raw_type) + casted_arr = raw_arr.cast(array_type) + + _assert_nunique(casted_arr.to_pandas(**pandas_options), + nunique) + _assert_nunique(casted_arr.to_pandas(deduplicate_objects=False, + **pandas_options), + len(casted_arr)) + + +# --------------------------------------------------------------------- + def test_table_from_pandas_keeps_column_order_of_dataframe(): df1 = pd.DataFrame(OrderedDict([ ('partition', [0, 0, 1, 1]), From 83a4e979271535b74de9870289cf99d02f6eb16b Mon Sep 17 00:00:00 2001 From: Chao Sun Date: Thu, 27 Dec 2018 12:36:54 -0600 Subject: [PATCH 121/328] ARROW-4080: [Rust] Improving lengthy build times in Appveyor This tries to cut the build times by skipping: 1. build for stable (it doesn't seem too useful). 1. benchmarks in travis 2. build for dev profiles in windows CI - now we only build with release profiles. 
Author: Chao Sun Closes #3231 from sunchao/ARROW-4080 and squashes the following commits: f5956404 Disable some flaky doctests 60f8b7d2 ARROW-4080: Improving lengthy build times in Appveyor --- .travis.yml | 1 - ci/rust-build-main.bat | 20 -------------------- ci/travis_script_rust.sh | 1 - rust/src/parquet/column/mod.rs | 4 ++-- rust/src/parquet/file/mod.rs | 8 ++++---- 5 files changed, 6 insertions(+), 28 deletions(-) diff --git a/.travis.yml b/.travis.yml index 99ff24aaacc97..b37194f8f2414 100644 --- a/.travis.yml +++ b/.travis.yml @@ -282,7 +282,6 @@ matrix: - if [ $ARROW_CI_RUST_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_cargo.sh script: - - RUSTUP_TOOLCHAIN=stable $TRAVIS_BUILD_DIR/ci/travis_script_rust.sh || true - RUSTUP_TOOLCHAIN=nightly $TRAVIS_BUILD_DIR/ci/travis_script_rust.sh after_success: - pushd ${TRAVIS_BUILD_DIR}/rust diff --git a/ci/rust-build-main.bat b/ci/rust-build-main.bat index e338f7e172e6e..6ef451204d45a 100644 --- a/ci/rust-build-main.bat +++ b/ci/rust-build-main.bat @@ -22,33 +22,13 @@ git submodule update --init || exit /B set PARQUET_TEST_DATA=%CD%\cpp\submodules\parquet-testing\data pushd rust -@echo =================================== -@echo Build with stable toolchain -@echo =================================== - -rustup default stable -rustup show -cargo build --target %TARGET% -cargo build --target %TARGET% --release -@echo Test (debug) -@echo ------------ -cargo test --target %TARGET% -@echo -@echo Test (release) -@echo -------------- -cargo test --target %TARGET% --release - @echo =================================== @echo Build with nightly toolchain @echo =================================== rustup default nightly rustup show -cargo build --target %TARGET% || exit /B cargo build --target %TARGET% --release || exit /B -@echo Test (debug) -@echo ------------ -cargo test --target %TARGET% || exit /B @echo @echo Test (release) @echo -------------- diff --git a/ci/travis_script_rust.sh b/ci/travis_script_rust.sh index 4b09bc22e4c20..af61dd39446ff 100755 --- a/ci/travis_script_rust.sh +++ b/ci/travis_script_rust.sh @@ -36,7 +36,6 @@ cargo rustc -- -D warnings cargo build cargo test -cargo bench cargo run --example builders cargo run --example dynamic_types cargo run --example read_csv diff --git a/rust/src/parquet/column/mod.rs b/rust/src/parquet/column/mod.rs index 09c4bde51f771..4ced32e28cbb9 100644 --- a/rust/src/parquet/column/mod.rs +++ b/rust/src/parquet/column/mod.rs @@ -35,7 +35,7 @@ //! The example uses column writer and reader APIs to write raw values, definition and //! repetition levels and read them to verify write/read correctness. //! -//! ```rust +//! ```rust,no_run //! use std::{fs, path::Path, rc::Rc}; //! //! use arrow::parquet::{ @@ -48,7 +48,7 @@ //! schema::parser::parse_message_type, //! }; //! -//! let path = Path::new("target/debug/examples/column_sample.parquet"); +//! let path = Path::new("/path/to/column_sample.parquet"); //! //! // Writing data using column writer API. //! diff --git a/rust/src/parquet/file/mod.rs b/rust/src/parquet/file/mod.rs index ebaebbad0bb6f..38fe8fa9b15b1 100644 --- a/rust/src/parquet/file/mod.rs +++ b/rust/src/parquet/file/mod.rs @@ -26,7 +26,7 @@ //! //! # Example of writing a new file //! -//! ```rust +//! ```rust,no_run //! use std::{fs, path::Path, rc::Rc}; //! //! use arrow::parquet::{ @@ -37,7 +37,7 @@ //! schema::parser::parse_message_type, //! }; //! -//! let path = Path::new("target/debug/examples/sample.parquet"); +//! 
let path = Path::new("/path/to/sample.parquet"); //! //! let message_type = " //! message schema { @@ -61,11 +61,11 @@ //! ``` //! # Example of reading an existing file //! -//! ```rust +//! ```rust,no_run //! use arrow::parquet::file::reader::{FileReader, SerializedFileReader}; //! use std::{fs::File, path::Path}; //! -//! let path = Path::new("target/debug/examples/sample.parquet"); +//! let path = Path::new("/path/to/sample.parquet"); //! if let Ok(file) = File::open(&path) { //! let file = File::open(&path).unwrap(); //! let reader = SerializedFileReader::new(file).unwrap(); From 0a631dbadb81a95c599ab68a2fd0801144d59f52 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Fri, 28 Dec 2018 00:18:31 -0600 Subject: [PATCH 122/328] ARROW-4113: [R] Fix version number Author: Kouhei Sutou Closes #3278 from kou/r-fix-package-version and squashes the following commits: 17fe7da6 Remove R from allow_failures 50377004 Fix version number --- .travis.yml | 1 - dev/release/00-prepare.sh | 41 ++++++++++++++++++++++++++------------- r/DESCRIPTION | 2 +- 3 files changed, 28 insertions(+), 16 deletions(-) diff --git a/.travis.yml b/.travis.yml index b37194f8f2414..059daeef8fd14 100644 --- a/.travis.yml +++ b/.travis.yml @@ -48,7 +48,6 @@ matrix: fast_finish: true allow_failures: - jdk: oraclejdk9 - - language: r include: - name: "Lint C++, Python, R" os: linux diff --git a/dev/release/00-prepare.sh b/dev/release/00-prepare.sh index 141882e22566a..47ef760b86b9e 100755 --- a/dev/release/00-prepare.sh +++ b/dev/release/00-prepare.sh @@ -22,7 +22,20 @@ set -e SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" update_versions() { - local version=$1 + local base_version=$1 + local next_version=$2 + local type=$3 + + case ${type} in + release) + version=${base_version} + r_version=${base_version} + ;; + snapshot) + version=${next_version}-SNAPSHOT + r_version=${base_version}.9000 + ;; + esac cd "${SOURCE_DIR}/../../cpp" sed -i.bak -r -e \ @@ -70,7 +83,7 @@ update_versions() { cd "${SOURCE_DIR}/../../r" sed -i.bak -r -e \ - "s/^Version: .+/Version: ${version}/" \ + "s/^Version: .+/Version: ${r_version}/" \ DESCRIPTION rm -f DESCRIPTION.bak git add DESCRIPTION @@ -95,8 +108,8 @@ update_versions() { if [ "$#" -eq 2 ]; then version=$1 - nextVersion=$2 - nextVersionSNAPSHOT=${nextVersion}-SNAPSHOT + next_version=$2 + next_version_snapshot=${next_version}-SNAPSHOT tag=apache-arrow-${version} echo "Updating changelog for $version" @@ -113,23 +126,23 @@ if [ "$#" -eq 2 ]; then git commit -m "[Release] Update .deb/.rpm changelogs for $version" cd - - echo "prepare release ${version} on tag ${tag} then reset to version ${nextVersionSNAPSHOT}" + echo "prepare release ${version} on tag ${tag} then reset to version ${next_version_snapshot}" - update_versions "${version}" + update_versions "${version}" "${next_version}" "release" git commit -m "[Release] Update versions for ${version}" cd "${SOURCE_DIR}/../../java" mvn release:clean - mvn release:prepare -Dtag=${tag} -DreleaseVersion=${version} -DautoVersionSubmodules -DdevelopmentVersion=${nextVersionSNAPSHOT} + mvn release:prepare -Dtag=${tag} -DreleaseVersion=${version} -DautoVersionSubmodules -DdevelopmentVersion=${next_version_snapshot} cd - - echo "Updating versions for ${nextVersionSNAPSHOT}" - update_versions "${nextVersionSNAPSHOT}" - git commit -m "[Release] Update versions for ${nextVersionSNAPSHOT}" + echo "Updating versions for ${next_version_snapshot}" + update_versions "${version}" "${next_version}" "snapshot" + git commit -m "[Release] Update 
versions for ${next_version_snapshot}" - echo "Updating .deb package names for ${nextVersion}" + echo "Updating .deb package names for ${next_version}" deb_lib_suffix=$(echo $version | sed -r -e 's/^[0-9]+\.([0-9]+)\.[0-9]+$/\1/') - next_deb_lib_suffix=$(echo $nextVersion | sed -r -e 's/^[0-9]+\.([0-9]+)\.[0-9]+$/\1/') + next_deb_lib_suffix=$(echo $next_version | sed -r -e 's/^[0-9]+\.([0-9]+)\.[0-9]+$/\1/') cd $SOURCE_DIR/../tasks/linux-packages/ for target in debian*/lib*${deb_lib_suffix}.install; do git mv \ @@ -150,12 +163,12 @@ if [ "$#" -eq 2 ]; then sed -i.bak -r -e "${deb_lib_suffix_substitute_pattern}" rat_exclude_files.txt rm -f rat_exclude_files.txt.bak git add rat_exclude_files.txt - git commit -m "[Release] Update .deb package names for $nextVersion" + git commit -m "[Release] Update .deb package names for $next_version" cd - echo "Finish staging binary artifacts by running: sh dev/release/01-perform.sh" else - echo "Usage: $0 " + echo "Usage: $0 " exit fi diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 10c28c3e7c42e..45e0f83dcbd0a 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -1,6 +1,6 @@ Package: arrow Title: R Integration to 'Apache' 'Arrow' -Version: 0.12.0-SNAPSHOT +Version: 0.11.0.9000 Authors@R: c( person("Romain", "François", email = "romain@rstudio.com", role = c("aut", "cre")), person("Javier", "Luraschi", email = "javier@rstudio.com", role = c("ctb")), From 68daba2ba7390d0afee072aa00271a60d8ad4b07 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 28 Dec 2018 15:56:55 +0100 Subject: [PATCH 123/328] ARROW-3020: [C++/Python] Allow empty arrow::Table objects to be written as empty Parquet row groups While it's unclear how useful this is, it at least preserves the intent of the user if they decide to call `write_table` with an empty table Author: Wes McKinney Closes #3269 from wesm/ARROW-3020 and squashes the following commits: b8c0cc2d Revert changes to CMakeLists.txt 12b92cf6 Allow empty arrow::Table objects to be written as empty Parquet row groups, and read back --- cpp/src/parquet/arrow/writer.cc | 30 ++++++++++++++++++++-------- python/pyarrow/_parquet.pyx | 13 ++++++------ python/pyarrow/tests/test_parquet.py | 18 +++++++++++++++++ 3 files changed, 46 insertions(+), 15 deletions(-) diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index a8153cac1ebea..a5c0a62994b1b 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -312,6 +312,10 @@ class ArrowColumnWriter { Status Write(const Array& data); Status Write(const ChunkedArray& data, int64_t offset, const int64_t size) { + if (data.length() == 0) { + return Status::OK(); + } + int64_t absolute_position = 0; int chunk_index = 0; int64_t chunk_offset = 0; @@ -1134,22 +1138,32 @@ Status WriteFileMetaData(const FileMetaData& file_metadata, namespace {} // namespace Status FileWriter::WriteTable(const Table& table, int64_t chunk_size) { - if (chunk_size <= 0) { + if (chunk_size <= 0 && table.num_rows() > 0) { return Status::Invalid("chunk size per row_group must be greater than 0"); } else if (chunk_size > impl_->properties().max_row_group_length()) { chunk_size = impl_->properties().max_row_group_length(); } - for (int chunk = 0; chunk * chunk_size < table.num_rows(); chunk++) { - int64_t offset = chunk * chunk_size; - int64_t size = std::min(chunk_size, table.num_rows() - offset); - - RETURN_NOT_OK_ELSE(NewRowGroup(size), PARQUET_IGNORE_NOT_OK(Close())); + auto WriteRowGroup = [&](int64_t offset, int64_t size) { + RETURN_NOT_OK(NewRowGroup(size)); 
for (int i = 0; i < table.num_columns(); i++) { auto chunked_data = table.column(i)->data(); - RETURN_NOT_OK_ELSE(WriteColumnChunk(chunked_data, offset, size), - PARQUET_IGNORE_NOT_OK(Close())); + RETURN_NOT_OK(WriteColumnChunk(chunked_data, offset, size)); } + return Status::OK(); + }; + + if (table.num_rows() == 0) { + // Append a row group with 0 rows + RETURN_NOT_OK_ELSE(WriteRowGroup(0, 0), PARQUET_IGNORE_NOT_OK(Close())); + return Status::OK(); + } + + for (int chunk = 0; chunk * chunk_size < table.num_rows(); chunk++) { + int64_t offset = chunk * chunk_size; + RETURN_NOT_OK_ELSE( + WriteRowGroup(offset, std::min(chunk_size, table.num_rows() - offset)), + PARQUET_IGNORE_NOT_OK(Close())); } return Status::OK(); } diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 2e92bac9a74d8..fcecaf5680e42 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -909,17 +909,16 @@ cdef class ParquetWriter: check_status(self.sink.get().Close()) def write_table(self, Table table, row_group_size=None): - cdef CTable* ctable = table.table + cdef: + CTable* ctable = table.table + int64_t c_row_group_size if row_group_size is None or row_group_size == -1: - if ctable.num_rows() > 0: - row_group_size = ctable.num_rows() - else: - row_group_size = 1 + c_row_group_size = ctable.num_rows() elif row_group_size == 0: raise ValueError('Row group size cannot be 0') - - cdef int64_t c_row_group_size = row_group_size + else: + c_row_group_size = row_group_size with nogil: check_status(self.writer.get() diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 82c80e9e09d13..9f05170bdbeba 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -2251,6 +2251,24 @@ def test_merging_parquet_tables_with_different_pandas_metadata(tempdir): writer.write_table(table2) +def test_empty_row_groups(tempdir): + # ARROW-3020 + table = pa.Table.from_arrays([pa.array([], type='int32')], ['f0']) + + path = tempdir / 'empty_row_groups.parquet' + + num_groups = 3 + with pq.ParquetWriter(path, table.schema) as writer: + for i in range(num_groups): + writer.write_table(table) + + reader = pq.ParquetFile(path) + assert reader.metadata.num_row_groups == num_groups + + for i in range(num_groups): + assert reader.read_row_group(i).equals(table) + + def test_writing_empty_lists(): # ARROW-2591: [Python] Segmentation fault issue in pq.write_table arr1 = pa.array([[], []], pa.list_(pa.int32())) From 8ed97cc15a2eff95dad28d3f5dce5af944f02ea3 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 28 Dec 2018 16:07:08 +0100 Subject: [PATCH 124/328] ARROW-4129: [Python] Fix syntax problem in benchmark docs Author: Uwe L. Korn Closes #3282 from xhochy/ARROW-4129 and squashes the following commits: 2430f156 ARROW-4129: Fix syntax problem in benchmark docs --- docs/source/python/benchmarks.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/python/benchmarks.rst b/docs/source/python/benchmarks.rst index 6c3144ae58637..7672294a4eddf 100644 --- a/docs/source/python/benchmarks.rst +++ b/docs/source/python/benchmarks.rst @@ -50,4 +50,4 @@ Compatibility We only expect the benchmarking setup to work with Python 3.6 or later, on a Unix-like system. -.. asv:: https://asv.readthedocs.org/ +.. 
_asv: https://asv.readthedocs.org/ From 7074889602a2279cfa2440697040a946628f5b56 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Sun, 30 Dec 2018 09:56:05 +0900 Subject: [PATCH 125/328] ARROW-4132: [GLib] Add more GArrowTable constructors Author: Kouhei Sutou Closes #3285 from kou/glib-table-new and squashes the following commits: 8bab8046 Add more GArrowTable constructors --- c_glib/arrow-glib/composite-array.h | 2 + c_glib/arrow-glib/orc-file-reader.h | 4 +- c_glib/arrow-glib/table.cpp | 204 +++++++++++++++++++++++++++- c_glib/arrow-glib/table.h | 33 ++++- c_glib/arrow-glib/version.h.in | 23 ++++ c_glib/test/test-table.rb | 61 +++++++-- 6 files changed, 310 insertions(+), 17 deletions(-) diff --git a/c_glib/arrow-glib/composite-array.h b/c_glib/arrow-glib/composite-array.h index c634dbfc3b006..10432e2e56ba3 100644 --- a/c_glib/arrow-glib/composite-array.h +++ b/c_glib/arrow-glib/composite-array.h @@ -130,8 +130,10 @@ GArrowStructArray *garrow_struct_array_new(GArrowDataType *data_type, GArrowArray *garrow_struct_array_get_field(GArrowStructArray *array, gint i); +#ifndef GARROW_DISABLE_DEPRECATED GARROW_DEPRECATED_IN_0_10_FOR(garrow_struct_array_flatten) GList *garrow_struct_array_get_fields(GArrowStructArray *array); +#endif GARROW_AVAILABLE_IN_0_10 GList *garrow_struct_array_flatten(GArrowStructArray *array, GError **error); diff --git a/c_glib/arrow-glib/orc-file-reader.h b/c_glib/arrow-glib/orc-file-reader.h index 9b2dbadefe43a..97cf1efa92ff7 100644 --- a/c_glib/arrow-glib/orc-file-reader.h +++ b/c_glib/arrow-glib/orc-file-reader.h @@ -39,7 +39,7 @@ garrow_orc_file_reader_new(GArrowSeekableInputStream *file, GError **error); #ifndef GARROW_DISABLE_DEPRECATED -G_GNUC_DEPRECATED_FOR(garrow_orc_file_reader_set_field_indices) +GARROW_DEPRECATED_IN_0_12_FOR(garrow_orc_file_reader_set_field_indices) void garrow_orc_file_reader_set_field_indexes(GArrowORCFileReader *reader, const gint *field_indexes, @@ -50,7 +50,7 @@ garrow_orc_file_reader_set_field_indices(GArrowORCFileReader *reader, const gint *field_indices, guint n_field_indices); #ifndef GARROW_DISABLE_DEPRECATED -G_GNUC_DEPRECATED_FOR(garrow_orc_file_reader_get_field_indices) +GARROW_DEPRECATED_IN_0_12_FOR(garrow_orc_file_reader_get_field_indices) const gint * garrow_orc_file_reader_get_field_indexes(GArrowORCFileReader *reader, guint *n_field_indexes); diff --git a/c_glib/arrow-glib/table.cpp b/c_glib/arrow-glib/table.cpp index f9e1b951a3658..b889eb2c9da23 100644 --- a/c_glib/arrow-glib/table.cpp +++ b/c_glib/arrow-glib/table.cpp @@ -21,8 +21,10 @@ # include #endif +#include #include #include +#include #include #include @@ -133,22 +135,218 @@ garrow_table_class_init(GArrowTableClass *klass) * @columns: (element-type GArrowColumn): The columns of the table. * * Returns: A newly created #GArrowTable. + * + * Deprecated: 0.12.0: Use garrow_table_new_values() instead. */ GArrowTable * garrow_table_new(GArrowSchema *schema, GList *columns) { + auto arrow_schema = garrow_schema_get_raw(schema); std::vector> arrow_columns; for (GList *node = columns; node; node = node->next) { - GArrowColumn *column = GARROW_COLUMN(node->data); + auto column = GARROW_COLUMN(node->data); arrow_columns.push_back(garrow_column_get_raw(column)); } - auto arrow_table = - arrow::Table::Make(garrow_schema_get_raw(schema), arrow_columns); + auto arrow_table = arrow::Table::Make(arrow_schema, arrow_columns); return garrow_table_new_raw(&arrow_table); } +/** + * garrow_table_new_values: (skip) + * @schema: The schema of the table. 
+ * @values: The values of the table. All values must be instance of the + * same class. Available classes are #GArrowColumn, #GArrowArray and + * #GArrowRecordBatch. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GArrowTable or %NULL on error. + * + * Since: 0.12.0 + */ +GArrowTable * +garrow_table_new_values(GArrowSchema *schema, + GList *values, + GError **error) +{ + const auto context = "[table][new][values]"; + auto arrow_schema = garrow_schema_get_raw(schema); + std::vector> arrow_columns; + std::vector> arrow_arrays; + std::vector> arrow_record_batches; + for (GList *node = values; node; node = node->next) { + if (GARROW_IS_COLUMN(node->data)) { + auto column = GARROW_COLUMN(node->data); + arrow_columns.push_back(garrow_column_get_raw(column)); + } else if (GARROW_IS_ARRAY(node->data)) { + auto array = GARROW_ARRAY(node->data); + arrow_arrays.push_back(garrow_array_get_raw(array)); + } else if (GARROW_IS_RECORD_BATCH(node->data)) { + auto record_batch = GARROW_RECORD_BATCH(node->data); + arrow_record_batches.push_back(garrow_record_batch_get_raw(record_batch)); + } else { + g_set_error(error, + GARROW_ERROR, + GARROW_ERROR_INVALID, + "%s: %s", + context, + "value must be one of " + "GArrowColumn, GArrowArray and GArrowRecordBatch"); + return NULL; + } + } + + size_t n_types = 0; + if (!arrow_columns.empty()) { + ++n_types; + } + if (!arrow_arrays.empty()) { + ++n_types; + } + if (!arrow_record_batches.empty()) { + ++n_types; + } + if (n_types > 1) { + g_set_error(error, + GARROW_ERROR, + GARROW_ERROR_INVALID, + "%s: %s", + context, + "all values must be the same objects of " + "GArrowColumn, GArrowArray or GArrowRecordBatch"); + return NULL; + } + + if (!arrow_columns.empty()) { + auto arrow_table = arrow::Table::Make(arrow_schema, arrow_columns); + auto status = arrow_table->Validate(); + if (garrow_error_check(error, status, context)) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } + } else if (!arrow_arrays.empty()) { + auto arrow_table = arrow::Table::Make(arrow_schema, arrow_arrays); + auto status = arrow_table->Validate(); + if (garrow_error_check(error, status, context)) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } + } else { + std::shared_ptr arrow_table; + auto status = arrow::Table::FromRecordBatches(arrow_schema, + arrow_record_batches, + &arrow_table); + if (garrow_error_check(error, status, context)) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } + } +} + +/** + * garrow_table_new_columns: + * @schema: The schema of the table. + * @columns: (array length=n_columns): The columns of the table. + * @n_columns: The number of columns. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GArrowTable or %NULL on error. 
+ * + * Since: 0.12.0 + */ +GArrowTable * +garrow_table_new_columns(GArrowSchema *schema, + GArrowColumn **columns, + gsize n_columns, + GError **error) +{ + auto arrow_schema = garrow_schema_get_raw(schema); + std::vector> arrow_columns; + for (gsize i = 0; i < n_columns; ++i) { + arrow_columns.push_back(garrow_column_get_raw(columns[i])); + } + + auto arrow_table = arrow::Table::Make(arrow_schema, arrow_columns); + auto status = arrow_table->Validate(); + if (garrow_error_check(error, status, "[table][new][columns]")) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } +} + +/** + * garrow_table_new_arrays: + * @schema: The schema of the table. + * @arrays: (array length=n_arrays): The arrays of the table. + * @n_arrays: The number of arrays. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GArrowTable or %NULL on error. + * + * Since: 0.12.0 + */ +GArrowTable * +garrow_table_new_arrays(GArrowSchema *schema, + GArrowArray **arrays, + gsize n_arrays, + GError **error) +{ + auto arrow_schema = garrow_schema_get_raw(schema); + std::vector> arrow_arrays; + for (gsize i = 0; i < n_arrays; ++i) { + arrow_arrays.push_back(garrow_array_get_raw(arrays[i])); + } + + auto arrow_table = arrow::Table::Make(arrow_schema, arrow_arrays); + auto status = arrow_table->Validate(); + if (garrow_error_check(error, status, "[table][new][arrays]")) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } +} + +/** + * garrow_table_new_record_batches: + * @schema: The schema of the table. + * @record_batches: (array length=n_record_batches): The record batches + * that have data for the table. + * @n_record_batches: The number of record batches. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GArrowTable or %NULL on error. + * + * Since: 0.12.0 + */ +GArrowTable * +garrow_table_new_record_batches(GArrowSchema *schema, + GArrowRecordBatch **record_batches, + gsize n_record_batches, + GError **error) +{ + auto arrow_schema = garrow_schema_get_raw(schema); + std::vector> arrow_record_batches; + for (gsize i = 0; i < n_record_batches; ++i) { + auto arrow_record_batch = garrow_record_batch_get_raw(record_batches[i]); + arrow_record_batches.push_back(arrow_record_batch); + } + + std::shared_ptr arrow_table; + auto status = arrow::Table::FromRecordBatches(arrow_schema, + arrow_record_batches, + &arrow_table); + if (garrow_error_check(error, status, "[table][new][record-batches]")) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } +} + /** * garrow_table_equal: * @table: A #GArrowTable. 
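The new constructors give C callers a more type-safe alternative to garrow_table_new(). As a rough usage sketch (not part of this patch; "schema", "visible_array" and "valid_array" are assumed to have been built beforehand with the GLib Arrow API), garrow_table_new_arrays() would be called like this:

    GError *error = NULL;
    /* One array per field in the schema built earlier. */
    GArrowArray *arrays[] = {visible_array, valid_array};
    GArrowTable *table = garrow_table_new_arrays(schema, arrays, 2, &error);
    if (table == NULL) {
      /* The new constructors validate the table and report failures via GError. */
      g_print("failed to create table: %s\n", error->message);
      g_error_free(error);
    }

The same pattern applies to garrow_table_new_columns() and garrow_table_new_record_batches(); only the element type and the count argument change.
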
diff --git a/c_glib/arrow-glib/table.h b/c_glib/arrow-glib/table.h index ef7b0f5c289ce..bde2535033c7d 100644 --- a/c_glib/arrow-glib/table.h +++ b/c_glib/arrow-glib/table.h @@ -20,7 +20,9 @@ #pragma once #include +#include #include +#include G_BEGIN_DECLS @@ -35,8 +37,35 @@ struct _GArrowTableClass GObjectClass parent_class; }; -GArrowTable *garrow_table_new (GArrowSchema *schema, - GList *columns); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_table_new_values) +GArrowTable * +garrow_table_new(GArrowSchema *schema, + GList *columns); +#endif +GARROW_AVAILABLE_IN_0_12 +GArrowTable * +garrow_table_new_values(GArrowSchema *schema, + GList *values, + GError **error); +GARROW_AVAILABLE_IN_0_12 +GArrowTable * +garrow_table_new_columns(GArrowSchema *schema, + GArrowColumn **columns, + gsize n_columns, + GError **error); +GARROW_AVAILABLE_IN_0_12 +GArrowTable * +garrow_table_new_arrays(GArrowSchema *schema, + GArrowArray **arrays, + gsize n_arrays, + GError **error); +GARROW_AVAILABLE_IN_0_12 +GArrowTable * +garrow_table_new_record_batches(GArrowSchema *schema, + GArrowRecordBatch **record_batches, + gsize n_record_batches, + GError **error); gboolean garrow_table_equal (GArrowTable *table, GArrowTable *other_table); diff --git a/c_glib/arrow-glib/version.h.in b/c_glib/arrow-glib/version.h.in index eb734250e2352..501827d06e054 100644 --- a/c_glib/arrow-glib/version.h.in +++ b/c_glib/arrow-glib/version.h.in @@ -110,6 +110,15 @@ # define GARROW_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor) #endif +/** + * GARROW_VERSION_0_12: + * + * You can use this macro value for compile time API version check. + * + * Since: 0.12.0 + */ +#define GARROW_VERSION_0_12 G_ENCODE_VERSION(0, 12) + /** * GARROW_VERSION_0_10: * @@ -166,6 +175,20 @@ #define GARROW_AVAILABLE_IN_ALL +#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_0_12 +# define GARROW_DEPRECATED_IN_0_12 GARROW_DEPRECATED +# define GARROW_DEPRECATED_IN_0_12_FOR(function) GARROW_DEPRECATED_FOR(function) +#else +# define GARROW_DEPRECATED_IN_0_12 +# define GARROW_DEPRECATED_IN_0_12_FOR(function) +#endif + +#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_0_12 +# define GARROW_AVAILABLE_IN_0_12 GARROW_UNAVAILABLE(0, 12) +#else +# define GARROW_AVAILABLE_IN_0_12 +#endif + #if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_0_10 # define GARROW_DEPRECATED_IN_0_10 GARROW_DEPRECATED # define GARROW_DEPRECATED_IN_0_10_FOR(function) GARROW_DEPRECATED_FOR(function) diff --git a/c_glib/test/test-table.rb b/c_glib/test/test-table.rb index 4394ad1353e7d..871e0d7c5ffd4 100644 --- a/c_glib/test/test-table.rb +++ b/c_glib/test/test-table.rb @@ -17,21 +17,19 @@ class TestTable < Test::Unit::TestCase include Helper::Buildable + include Helper::Omittable sub_test_case(".new") do - def test_columns - fields = [ + def setup + @fields = [ Arrow::Field.new("visible", Arrow::BooleanDataType.new), Arrow::Field.new("valid", Arrow::BooleanDataType.new), ] - schema = Arrow::Schema.new(fields) - columns = [ - Arrow::Column.new(fields[0], build_boolean_array([true])), - Arrow::Column.new(fields[1], build_boolean_array([false])), - ] - table = Arrow::Table.new(schema, columns) + @schema = Arrow::Schema.new(@fields) + end - data = table.n_columns.times.collect do |i| + def dump_table(table) + table.n_columns.times.collect do |i| column = table.get_column(i) values = [] column.data.chunks.each do |chunk| @@ -44,11 +42,54 @@ def test_columns values, ] end + end + + def test_columns + columns = [ + Arrow::Column.new(@fields[0], build_boolean_array([true])), 
+ Arrow::Column.new(@fields[1], build_boolean_array([false])), + ] + table = Arrow::Table.new(@schema, columns) assert_equal([ ["visible", [true]], ["valid", [false]], ], - data) + dump_table(table)) + end + + def test_arrays + require_gi_bindings(3, 3, 1) + arrays = [ + build_boolean_array([true]), + build_boolean_array([false]), + ] + table = Arrow::Table.new(@schema, arrays) + assert_equal([ + ["visible", [true]], + ["valid", [false]], + ], + dump_table(table)) + end + + def test_record_batches + require_gi_bindings(3, 3, 1) + record_batches = [ + build_record_batch({ + "visible" => build_boolean_array([true]), + "valid" => build_boolean_array([false]) + }), + build_record_batch({ + "visible" => build_boolean_array([false]), + "valid" => build_boolean_array([true]) + }), + ] + table = Arrow::Table.new(@schema, record_batches) + + assert_equal([ + ["visible", [true, false]], + ["valid", [false, true]], + ], + dump_table(table)) end end From 7b122024303973c5594efc6eb6c77bf17fe1570e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 31 Dec 2018 10:43:58 -0600 Subject: [PATCH 126/328] ARROW-4135: [Python] Can't reload a pandas dataframe containing a list of datetime.time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reproduced via 0.11.1 Author: Krisztián Szűcs Closes #3289 from kszucs/ARROW-4135 and squashes the following commits: 70ca3e08 missign arg 511808bd forgot to uncomment 3093957f test --- python/pyarrow/tests/test_parquet.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 9f05170bdbeba..3a6c84678eba2 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -921,6 +921,14 @@ def _assert_unsupported(array): _assert_unsupported(a7) +def test_list_of_datetime_time_roundtrip(): + # ARROW-4135 + times = pd.to_datetime(['09:00', '09:30', '10:00', '10:30', '11:00', + '11:30', '12:00']) + df = pd.DataFrame({'time': [times.time]}) + _roundtrip_pandas_dataframe(df, write_kwargs={}) + + def test_large_list_records(): # This was fixed in PARQUET-1100 From 8d792b1c196dcb5f745cb48313558a9a35baccc2 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Mon, 31 Dec 2018 10:50:30 -0600 Subject: [PATCH 127/328] PARQUET-1484: [C++] Improve memory usage of FileMetaDataBuilder Author: Deepak Majeti Closes #3277 from majetideepak/PARQUET-1484 and squashes the following commits: 212e5230 PARQUET-1484: Improve memory usage of FileMetaDataBuilder --- cpp/src/parquet/metadata.cc | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 6ac53c58afed4..f05918d9fd7f0 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -851,23 +851,19 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { } RowGroupMetaDataBuilder* AppendRowGroup() { - row_groups_.emplace_back(new format::RowGroup); + row_groups_.emplace_back(); current_row_group_builder_ = - RowGroupMetaDataBuilder::Make(properties_, schema_, row_groups_.back().get()); + RowGroupMetaDataBuilder::Make(properties_, schema_, &row_groups_.back()); return current_row_group_builder_.get(); } std::unique_ptr Finish() { int64_t total_rows = 0; - std::vector row_groups; - for (auto row_group = row_groups_.begin(); row_group != row_groups_.end(); - row_group++) { - auto rowgroup = *((*row_group).get()); - row_groups.push_back(rowgroup); - total_rows += 
rowgroup.num_rows; + for (auto row_group : row_groups_) { + total_rows += row_group.num_rows; } metadata_->__set_num_rows(total_rows); - metadata_->__set_row_groups(row_groups); + metadata_->__set_row_groups(row_groups_); if (key_value_metadata_) { metadata_->key_value_metadata.clear(); @@ -922,7 +918,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { private: const std::shared_ptr properties_; - std::vector> row_groups_; + std::vector row_groups_; std::unique_ptr current_row_group_builder_; const SchemaDescriptor* schema_; From 8c26b77120e592b10453aca1ab419c30e378dd7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 31 Dec 2018 10:52:08 -0600 Subject: [PATCH 128/328] ARROW-4134: [Packaging] Properly setup timezone in docker tests to prevent ORC adapter's abort MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Python ORC tests were failing because of unset timezone. Crossbow tests: [kszucs/crossbow/build-388](https://github.com/kszucs/crossbow/branches/all?utf8=%E2%9C%93&query=build-388) Author: Krisztián Szűcs Closes #3288 from kszucs/ARROW-4134 and squashes the following commits: 4f502625 setup timezone in tha base cpp image --- c_glib/Dockerfile | 2 -- cpp/Dockerfile | 4 +++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/c_glib/Dockerfile b/c_glib/Dockerfile index 5d64a5f154f62..7c8e412bca6f4 100644 --- a/c_glib/Dockerfile +++ b/c_glib/Dockerfile @@ -17,9 +17,7 @@ FROM arrow:cpp -ENV DEBIAN_FRONTEND=noninteractive RUN apt-get -q install --no-install-recommends -y \ - tzdata \ ruby-dev \ pkg-config \ autoconf-archive \ diff --git a/cpp/Dockerfile b/cpp/Dockerfile index 84c00b91cc405..4e5a4e4bc1cb2 100644 --- a/cpp/Dockerfile +++ b/cpp/Dockerfile @@ -18,7 +18,8 @@ FROM ubuntu:18.04 # install build essentials -RUN apt-get update -y -q && \ +RUN export DEBIAN_FRONTEND=noninteractive && \ + apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ ca-certificates \ ccache \ @@ -27,6 +28,7 @@ RUN apt-get update -y -q && \ git \ ninja-build \ pkg-config \ + tzdata \ wget # install conda and required packages From 71ccba9b217a7af922d8a69be21ed4db205af741 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 31 Dec 2018 17:46:42 -0600 Subject: [PATCH 129/328] ARROW-4128: [C++] Update style guide to reflect NULLPTR and doxygen Author: Micah Kornfield Author: emkornfield Closes #3284 from emkornfield/update_style_guide and squashes the following commits: b90a669b allow anonymous namespaces d0446107 update style guide to reflect NULLPTR and doxygen --- cpp/README.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cpp/README.md b/cpp/README.md index 7e92648dc37aa..2724ff44d8ac1 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -322,9 +322,13 @@ This requires [Doxygen](http://www.doxygen.org) to be installed. ## Development -This project follows [Google's C++ Style Guide][3] with minor exceptions. We do -not encourage anonymous namespaces and we relax the line length restriction to -90 characters. +This project follows [Google's C++ Style Guide][3] with minor exceptions: + + * We relax the line length restriction to 90 characters. + * We use the NULLPTR macro defined in `src/arrow/util/macros.h` to + support building C++/CLI (ARROW-1134) + * We use doxygen style comments ("///") instead of line comments ("//") + in header files. 
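To make the two new exceptions concrete, a header declaration following these conventions could look roughly like the sketch below (an illustrative example, not taken from the Arrow sources; the function and its parameters are hypothetical, and the include path is inferred from the macro's stated location):

    #include "arrow/util/macros.h"  // NULLPTR is defined in src/arrow/util/macros.h

    /// \brief Count the non-null entries in a primitive buffer.
    ///
    /// \param[in] values the values to scan, may be NULLPTR when length is 0
    /// \param[in] length the number of entries in values
    /// \return the number of valid (non-null) entries
    int64_t CountValid(const int32_t* values = NULLPTR, int64_t length = 0);

Using NULLPTR instead of nullptr keeps the headers consumable when building C++/CLI (ARROW-1134), and the "///" comments are picked up by Doxygen when the API documentation is generated.
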
### Memory Pools From 9376d85c409f4b9b272297b3acb6a0f70dcedc32 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 1 Jan 2019 13:34:25 -0600 Subject: [PATCH 130/328] ARROW-3910: [Python] Set date_as_objects=True as default in to_pandas methods This does not add a deprecation warning primarily because it's a bit difficult to do (we would need to check the data types whether it's a date -- or in the case of a table, if any field is a date--, and then warn if so). `True` is the correct option though in order to accurately roundtrip data to and from pandas. Some users might have some workarounds floating around, but this is sufficiently advanced stuff already. With this patch, date data round trips with no special options ``` In [2]: import pyarrow as pa In [3]: import datetime In [4]: arr = pa.array([datetime.date(2000, 1, 1), None]) In [5]: arr Out[5]: [ 10957, null ] In [6]: arr.to_pandas() Out[6]: array([datetime.date(2000, 1, 1), None], dtype=object) In [7]: pa.array(arr.to_pandas()) Out[7]: [ 10957, null ] ``` If others strongly feel it's worth going to the effort of raising a deprecation warning, please chime in. Author: Wes McKinney Closes #3272 from wesm/ARROW-3910 and squashes the following commits: 308afe56 Add Windows makefile for Sphinx, add section about date conversions to pandas.rst f77c2967 Set date_as_objects=True as default in to_pandas methods --- docs/make.bat | 52 ++++++++++++ docs/source/building.rst | 71 ++++++++++++++++ docs/source/index.rst | 6 ++ docs/source/python/development.rst | 50 ----------- docs/source/python/pandas.rst | 68 ++++++++++++++- python/pyarrow/array.pxi | 6 +- python/pyarrow/tests/test_convert_pandas.py | 94 ++++++++------------- 7 files changed, 231 insertions(+), 116 deletions(-) create mode 100644 docs/make.bat create mode 100644 docs/source/building.rst diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000000000..36f2086c20b3f --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,52 @@ +@rem Licensed to the Apache Software Foundation (ASF) under one +@rem or more contributor license agreements. See the NOTICE file +@rem distributed with this work for additional information +@rem regarding copyright ownership. The ASF licenses this file +@rem to you under the Apache License, Version 2.0 (the +@rem "License"); you may not use this file except in compliance +@rem with the License. You may obtain a copy of the License at +@rem +@rem http://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, +@rem software distributed under the License is distributed on an +@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +@rem KIND, either express or implied. See the License for the +@rem specific language governing permissions and limitations +@rem under the License. + +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/docs/source/building.rst b/docs/source/building.rst new file mode 100644 index 0000000000000..0fb4486db89c3 --- /dev/null +++ b/docs/source/building.rst @@ -0,0 +1,71 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Building the Documentation +========================== + +Prerequisites +------------- + +The documentation build process uses `Doxygen `_ and +`Sphinx `_ along with a few extensions. + +If you're using Conda, the required software can be installed in a single line: + +.. code-block:: shell + + conda install -c conda-forge --file ci/conda_env_sphinx.yml + +Otherwise, you'll first need to install `Doxygen `_ +yourself (for example from your distribution's official repositories, if +using Linux). Then you can install the Python-based requirements with the +following command: + +.. code-block:: shell + + pip install -r docs/requirements.txt + +Building +-------- + +.. note:: + + If you are building the documentation on Windows, not all sections + may build properly. + +These two steps are mandatory and must be executed in order. + +#. Process the C++ API using Doxygen + + .. code-block:: shell + + pushd cpp/apidoc + doxygen + popd + +#. Build the complete documentation using Sphinx + + .. code-block:: shell + + pushd docs + make html + popd + +After these steps are completed, the documentation is rendered in HTML +format in ``docs/_build/html``. In particular, you can point your browser +at ``docs/_build/html/index.html`` to read the docs and review any changes +you made. diff --git a/docs/source/index.rst b/docs/source/index.rst index fa6c683d14ecb..2b367b33823a2 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -40,3 +40,9 @@ messaging and interprocess communication. cpp/index python/index + +.. toctree:: + :maxdepth: 2 + :caption: Other Topics + + building diff --git a/docs/source/python/development.rst b/docs/source/python/development.rst index 63e6051a7b864..ba8cfef721441 100644 --- a/docs/source/python/development.rst +++ b/docs/source/python/development.rst @@ -364,53 +364,3 @@ Getting ``python-test.exe`` to run is a bit tricky because your set PYTHONHOME=%CONDA_PREFIX% Now ``python-test.exe`` or simply ``ctest`` (to run all tests) should work. - -Building the Documentation -========================== - -Prerequisites -------------- - -The documentation build process uses `Doxygen `_ and -`Sphinx `_ along with a few extensions. - -If you're using Conda, the required software can be installed in a single line: - -.. 
code-block:: shell - - conda install -c conda-forge --file ci/conda_env_sphinx.yml - -Otherwise, you'll first need to install `Doxygen `_ -yourself (for example from your distribution's official repositories, if -using Linux). Then you can install the Python-based requirements with the -following command: - -.. code-block:: shell - - pip install -r docs/requirements.txt - -Building --------- - -These two steps are mandatory and must be executed in order. - -#. Process the C++ API using Doxygen - - .. code-block:: shell - - pushd cpp/apidoc - doxygen - popd - -#. Build the complete documentation using Sphinx - - .. code-block:: shell - - pushd docs - make html - popd - -After these steps are completed, the documentation is rendered in HTML -format in ``docs/_build/html``. In particular, you can point your browser -at ``docs/_build/html/index.html`` to read the docs and review any changes -you made. diff --git a/docs/source/python/pandas.rst b/docs/source/python/pandas.rst index 16b4ff6926809..dbc5e77e83bff 100644 --- a/docs/source/python/pandas.rst +++ b/docs/source/python/pandas.rst @@ -29,6 +29,13 @@ to them. (such as a different type system, and support for null values) that this is a separate topic from :ref:`numpy_interop`. +To follow examples in this document, make sure to run: + +.. ipython:: python + + import pandas as pd + import pyarrow as pa + DataFrames ---------- @@ -120,5 +127,64 @@ Arrow -> pandas Conversion +-------------------------------------+--------------------------------------------------------+ | ``TIMESTAMP(unit=*)`` | ``pd.Timestamp`` (``np.datetime64[ns]``) | +-------------------------------------+--------------------------------------------------------+ -| ``DATE`` | ``pd.Timestamp`` (``np.datetime64[ns]``) | +| ``DATE`` | ``object``(with ``datetime.date`` objects) | +-------------------------------------+--------------------------------------------------------+ + +Categorical types +~~~~~~~~~~~~~~~~~ + +TODO + +Datetime (Timestamp) types +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +TODO + +Date types +~~~~~~~~~~ + +While dates can be handled using the ``datetime64[ns]`` type in +pandas, some systems work with object arrays of Python's built-in +``datetime.date`` object: + +.. ipython:: python + + from datetime import date + s = pd.Series([date(2018, 12, 31), None, date(2000, 1, 1)]) + s + +When converting to an Arrow array, the ``date32`` type will be used by +default: + +.. ipython:: python + + arr = pa.array(s) + arr.type + arr[0] + +To use the 64-bit ``date64``, specify this explicitly: + +.. ipython:: python + + arr = pa.array(s, type='date64') + arr.type + +When converting back with ``to_pandas``, object arrays of +``datetime.date`` objects are returned: + +.. ipython:: python + + arr.to_pandas() + +If you want to use NumPy's ``datetime64`` dtype instead, pass +``date_as_object=False``: + +.. 
ipython:: python + + s2 = pd.Series(arr.to_pandas(date_as_object=False)) + s2.dtype + +Time types +~~~~~~~~~~ + +TODO diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index ef95efe71b33c..54d0e92cd5561 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -343,10 +343,8 @@ cdef class _PandasConvertible: def to_pandas(self, categories=None, bint strings_to_categorical=False, bint zero_copy_only=False, bint integer_object_nulls=False, - bint date_as_object=False, - bint use_threads=True, - bint deduplicate_objects=True, - bint ignore_metadata=False): + bint date_as_object=True, bint use_threads=True, + bint deduplicate_objects=True, bint ignore_metadata=False): """ Convert to a pandas-compatible NumPy array or DataFrame, as appropriate diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 8d8b65b2240b8..3e89f5eb4ff70 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -912,7 +912,7 @@ class MyDate(date): result = table.to_pandas() expected_df = pd.DataFrame( - {"date": np.array(["2000-01-01"], dtype="datetime64[ns]")} + {"date": np.array([date(2000, 1, 1)], dtype=object)} ) tm.assert_frame_equal(expected_df, result) @@ -962,7 +962,7 @@ def test_pandas_datetime_to_date64_failures(self, mask): with pytest.raises(pa.ArrowInvalid, match=expected_msg): pa.Array.from_pandas(s, type=pa.date64(), mask=mask) - def test_array_date_as_object(self): + def test_array_types_date_as_object(self): data = [date(2000, 1, 1), None, date(1970, 1, 1), @@ -972,58 +972,23 @@ def test_array_date_as_object(self): '1970-01-01', '2040-02-26'], dtype='datetime64') - arr = pa.array(data) - assert arr.equals(pa.array(expected)) - - result = arr.to_pandas() - assert result.dtype == expected.dtype - npt.assert_array_equal(arr.to_pandas(), expected) - - result = arr.to_pandas(date_as_object=True) - expected = expected.astype(object) - assert result.dtype == expected.dtype - npt.assert_array_equal(result, expected) - - def test_chunked_array_convert_date_as_object(self): - data = [date(2000, 1, 1), - None, - date(1970, 1, 1), - date(2040, 2, 26)] - expected = np.array(['2000-01-01', - None, - '1970-01-01', - '2040-02-26'], dtype='datetime64') - carr = pa.chunked_array([data]) - - result = carr.to_pandas() - assert result.dtype == expected.dtype - npt.assert_array_equal(carr.to_pandas(), expected) - - result = carr.to_pandas(date_as_object=True) - expected = expected.astype(object) - assert result.dtype == expected.dtype - npt.assert_array_equal(result, expected) + objects = [ + # The second value is the expected value for date_as_object=False + (pa.array(data), expected), + (pa.chunked_array([data]), expected), + (pa.column('date', [data]), expected.astype('M8[ns]'))] - def test_column_convert_date_as_object(self): - data = [date(2000, 1, 1), - None, - date(1970, 1, 1), - date(2040, 2, 26)] - expected = np.array(['2000-01-01', - None, - '1970-01-01', - '2040-02-26'], dtype='datetime64') - - arr = pa.array(data) - column = pa.column('date', arr) + assert objects[0][0].equals(pa.array(expected)) - result = column.to_pandas() - npt.assert_array_equal(column.to_pandas(), expected) + for obj, expected_datetime64 in objects: + result = obj.to_pandas() + expected_obj = expected.astype(object) + assert result.dtype == expected_obj.dtype + npt.assert_array_equal(result, expected_obj) - result = column.to_pandas(date_as_object=True) - expected = expected.astype(object) - assert result.dtype 
== expected.dtype - npt.assert_array_equal(result, expected) + result = obj.to_pandas(date_as_object=False) + assert result.dtype == expected_datetime64.dtype + npt.assert_array_equal(result, expected_datetime64) def test_table_convert_date_as_object(self): df = pd.DataFrame({ @@ -1034,8 +999,8 @@ def test_table_convert_date_as_object(self): table = pa.Table.from_pandas(df, preserve_index=False) - df_datetime = table.to_pandas() - df_object = table.to_pandas(date_as_object=True) + df_datetime = table.to_pandas(date_as_object=False) + df_object = table.to_pandas() tm.assert_frame_equal(df.astype('datetime64[ns]'), df_datetime, check_dtype=True) @@ -1055,9 +1020,7 @@ def test_date_infer(self): assert table.schema.equals(expected_schema) result = table.to_pandas() - expected = df.copy() - expected['date'] = pd.to_datetime(df['date']) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, df) def test_date_mask(self): arr = np.array([date(2017, 4, 3), date(2017, 4, 4)], @@ -1094,18 +1057,27 @@ def test_date_objects_typed(self): # Test converting back to pandas colnames = ['date32', 'date64'] table = pa.Table.from_arrays([a32, a64], colnames) - table_pandas = table.to_pandas() ex_values = (np.array(['2017-04-03', '2017-04-04', '2017-04-04', '2017-04-05'], - dtype='datetime64[D]') - .astype('datetime64[ns]')) + dtype='datetime64[D]')) ex_values[1] = pd.NaT.value - expected_pandas = pd.DataFrame({'date32': ex_values, - 'date64': ex_values}, + + ex_datetime64ns = ex_values.astype('datetime64[ns]') + expected_pandas = pd.DataFrame({'date32': ex_datetime64ns, + 'date64': ex_datetime64ns}, columns=colnames) + table_pandas = table.to_pandas(date_as_object=False) tm.assert_frame_equal(table_pandas, expected_pandas) + table_pandas_objects = table.to_pandas() + ex_objects = ex_values.astype('object') + expected_pandas_objects = pd.DataFrame({'date32': ex_objects, + 'date64': ex_objects}, + columns=colnames) + tm.assert_frame_equal(table_pandas_objects, + expected_pandas_objects) + def test_dates_from_integers(self): t1 = pa.date32() t2 = pa.date64() From 12912741c2cbb33fad2965ee3abc4d3b47a63515 Mon Sep 17 00:00:00 2001 From: Chao Sun Date: Thu, 3 Jan 2019 08:32:40 +0100 Subject: [PATCH 131/328] ARROW-4137: [Rust] Move parquet code into a separate crate This moves the parquet related code into a separate sub-crate that depends on arrow. 
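In practice the split mainly changes how downstream code names the crate: the Parquet modules are no longer reached through the arrow crate. A minimal reading sketch after the move, based on the example in the crate's module documentation (file/mod.rs) as updated in this diff, with a placeholder file path:

    // Downstream crates now depend on the new "parquet" crate directly
    // (declared in their Cargo.toml) instead of reaching it through "arrow".
    use std::{fs::File, path::Path};

    use parquet::file::reader::{FileReader, SerializedFileReader};

    fn main() {
        let path = Path::new("/path/to/sample.parquet");
        let file = File::open(&path).unwrap();
        let reader = SerializedFileReader::new(file).unwrap();
        // Iterate over the file record by record, as in the module docs.
        let mut iter = reader.get_row_iter(None).unwrap();
        while let Some(record) = iter.next() {
            println!("{}", record);
        }
    }

Previously the same imports were written as use arrow::parquet::...; as the path renames below show, only the crate prefix changes.
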
Author: Chao Sun Author: Kouhei Sutou Closes #3291 from sunchao/ARROW-4137 and squashes the following commits: b2bcc1cd Add support for version update on release process bbeaaba2 Fix rustfmt 0545fd95 ARROW-4137: Move parquet code into a separate crate --- dev/release/00-prepare.sh | 6 +-- rust/Cargo.toml | 16 ++----- rust/parquet/Cargo.toml | 45 +++++++++++++++++++ rust/{ => parquet}/build.rs | 0 rust/{src/parquet => parquet/src}/basic.rs | 2 +- .../parquet => parquet/src}/column/mod.rs | 2 +- .../parquet => parquet/src}/column/page.rs | 8 ++-- .../parquet => parquet/src}/column/reader.rs | 22 ++++----- .../parquet => parquet/src}/column/writer.rs | 26 +++++------ .../parquet => parquet/src}/compression.rs | 8 ++-- .../{src/parquet => parquet/src}/data_type.rs | 4 +- .../src}/encodings/decoding.rs | 16 +++---- .../src}/encodings/encoding.rs | 20 ++++----- .../src}/encodings/levels.rs | 10 ++--- .../parquet => parquet/src}/encodings/mod.rs | 0 .../parquet => parquet/src}/encodings/rle.rs | 6 +-- rust/{src/parquet => parquet/src}/errors.rs | 0 .../parquet => parquet/src}/file/metadata.rs | 8 ++-- rust/{src/parquet => parquet/src}/file/mod.rs | 4 +- .../src}/file/properties.rs | 6 +-- .../parquet => parquet/src}/file/reader.rs | 20 ++++----- .../src}/file/statistics.rs | 6 +-- .../parquet => parquet/src}/file/writer.rs | 24 +++++----- .../parquet/mod.rs => parquet/src/lib.rs} | 7 +++ .../parquet => parquet/src}/record/api.rs | 10 ++--- .../parquet => parquet/src}/record/mod.rs | 0 .../parquet => parquet/src}/record/reader.rs | 20 ++++----- .../parquet => parquet/src}/record/triplet.rs | 18 ++++---- .../parquet => parquet/src}/schema/mod.rs | 2 +- .../parquet => parquet/src}/schema/parser.rs | 8 ++-- .../parquet => parquet/src}/schema/printer.rs | 14 +++--- .../parquet => parquet/src}/schema/types.rs | 8 ++-- .../src}/util/bit_packing.rs | 0 .../parquet => parquet/src}/util/bit_util.rs | 4 +- .../parquet => parquet/src}/util/hash_util.rs | 2 +- rust/{src/parquet => parquet/src}/util/io.rs | 4 +- .../parquet => parquet/src}/util/memory.rs | 0 rust/{src/parquet => parquet/src}/util/mod.rs | 0 .../src}/util/test_common.rs | 4 +- rust/src/lib.rs | 1 - 40 files changed, 199 insertions(+), 162 deletions(-) create mode 100644 rust/parquet/Cargo.toml rename rust/{ => parquet}/build.rs (100%) rename rust/{src/parquet => parquet/src}/basic.rs (99%) rename rust/{src/parquet => parquet/src}/column/mod.rs (99%) rename rust/{src/parquet => parquet/src}/column/page.rs (97%) rename rust/{src/parquet => parquet/src}/column/reader.rs (99%) rename rust/{src/parquet => parquet/src}/column/writer.rs (98%) rename rust/{src/parquet => parquet/src}/compression.rs (97%) rename rust/{src/parquet => parquet/src}/data_type.rs (99%) rename rust/{src/parquet => parquet/src}/encodings/decoding.rs (99%) rename rust/{src/parquet => parquet/src}/encodings/encoding.rs (98%) rename rust/{src/parquet => parquet/src}/encodings/levels.rs (98%) rename rust/{src/parquet => parquet/src}/encodings/mod.rs (100%) rename rust/{src/parquet => parquet/src}/encodings/rle.rs (99%) rename rust/{src/parquet => parquet/src}/errors.rs (100%) rename rust/{src/parquet => parquet/src}/file/metadata.rs (99%) rename rust/{src/parquet => parquet/src}/file/mod.rs (96%) rename rust/{src/parquet => parquet/src}/file/properties.rs (99%) rename rust/{src/parquet => parquet/src}/file/reader.rs (98%) rename rust/{src/parquet => parquet/src}/file/statistics.rs (99%) rename rust/{src/parquet => parquet/src}/file/writer.rs (98%) rename rust/{src/parquet/mod.rs 
=> parquet/src/lib.rs} (87%) rename rust/{src/parquet => parquet/src}/record/api.rs (99%) rename rust/{src/parquet => parquet/src}/record/mod.rs (100%) rename rust/{src/parquet => parquet/src}/record/reader.rs (98%) rename rust/{src/parquet => parquet/src}/record/triplet.rs (97%) rename rust/{src/parquet => parquet/src}/schema/mod.rs (98%) rename rust/{src/parquet => parquet/src}/schema/parser.rs (99%) rename rust/{src/parquet => parquet/src}/schema/printer.rs (97%) rename rust/{src/parquet => parquet/src}/schema/types.rs (99%) rename rust/{src/parquet => parquet/src}/util/bit_packing.rs (100%) rename rust/{src/parquet => parquet/src}/util/bit_util.rs (99%) rename rust/{src/parquet => parquet/src}/util/hash_util.rs (99%) rename rust/{src/parquet => parquet/src}/util/io.rs (98%) rename rust/{src/parquet => parquet/src}/util/memory.rs (100%) rename rust/{src/parquet => parquet/src}/util/mod.rs (100%) rename rust/{src/parquet => parquet/src}/util/test_common.rs (98%) diff --git a/dev/release/00-prepare.sh b/dev/release/00-prepare.sh index 47ef760b86b9e..20d9ab8fce651 100755 --- a/dev/release/00-prepare.sh +++ b/dev/release/00-prepare.sh @@ -100,9 +100,9 @@ update_versions() { cd "${SOURCE_DIR}/../../rust" sed -i.bak -r -e \ "s/^version = \".+\"/version = \"${version}\"/g" \ - Cargo.toml - rm -f Cargo.toml.bak - git add Cargo.toml + Cargo.toml parquet/Cargo.toml + rm -f Cargo.toml.bak parquet/Cargo.toml.bak + git add Cargo.toml parquet/Cargo.toml cd - } diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 49e8a9d9c8470..1bf64d73ade5e 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow" -version = "0.11.0" +version = "0.12.0-SNAPSHOT" description = "Rust implementation of Apache Arrow" homepage = "https://github.com/apache/arrow" repository = "https://github.com/apache/arrow" @@ -42,17 +42,6 @@ serde_derive = "1.0.80" serde_json = "1.0.13" rand = "0.5" csv = "1.0.0" -parquet-format = "2.5.0" -quick-error = "1.2.2" -byteorder = "1" -thrift = "0.0.4" -snap = "0.2" -brotli = "2.5" -flate2 = "1.0.2" -lz4 = "1.23" -zstd = "0.4" -chrono = "0.4" -num-bigint = "0.2" num = "0.2" [dev-dependencies] @@ -66,3 +55,6 @@ harness = false [[bench]] name = "builder" harness = false + +[workspace] +members = ["parquet"] \ No newline at end of file diff --git a/rust/parquet/Cargo.toml b/rust/parquet/Cargo.toml new file mode 100644 index 0000000000000..aa7eac224c0cf --- /dev/null +++ b/rust/parquet/Cargo.toml @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "parquet" +version = "0.12.0-SNAPSHOT" +license = "Apache-2.0" +description = "Apache Parquet implementation in Rust" +authors = ["Apache Arrow "] +keywords = [ "arrow", "parquet", "hadoop" ] +readme = "README.md" +build = "build.rs" +edition = "2018" + +[dependencies] +parquet-format = "2.5.0" +quick-error = "1.2.2" +byteorder = "1" +thrift = "0.0.4" +snap = "0.2" +brotli = "2.5" +flate2 = "1.0.2" +lz4 = "1.23" +zstd = "0.4" +chrono = "0.4" +num-bigint = "0.2" +arrow = { path = ".." } + +[dev-dependencies] +lazy_static = "1" +rand = "0.5" \ No newline at end of file diff --git a/rust/build.rs b/rust/parquet/build.rs similarity index 100% rename from rust/build.rs rename to rust/parquet/build.rs diff --git a/rust/src/parquet/basic.rs b/rust/parquet/src/basic.rs similarity index 99% rename from rust/src/parquet/basic.rs rename to rust/parquet/src/basic.rs index 22e16347dc00f..8b1be49659bc0 100644 --- a/rust/src/parquet/basic.rs +++ b/rust/parquet/src/basic.rs @@ -22,7 +22,7 @@ use std::{convert, fmt, result, str}; use parquet_format as parquet; -use crate::parquet::errors::ParquetError; +use crate::errors::ParquetError; // ---------------------------------------------------------------------- // Types from the Thrift definition diff --git a/rust/src/parquet/column/mod.rs b/rust/parquet/src/column/mod.rs similarity index 99% rename from rust/src/parquet/column/mod.rs rename to rust/parquet/src/column/mod.rs index 4ced32e28cbb9..9a72199d940f2 100644 --- a/rust/src/parquet/column/mod.rs +++ b/rust/parquet/src/column/mod.rs @@ -38,7 +38,7 @@ //! ```rust,no_run //! use std::{fs, path::Path, rc::Rc}; //! -//! use arrow::parquet::{ +//! use parquet::{ //! column::{reader::ColumnReader, writer::ColumnWriter}, //! file::{ //! properties::WriterProperties, diff --git a/rust/src/parquet/column/page.rs b/rust/parquet/src/column/page.rs similarity index 97% rename from rust/src/parquet/column/page.rs rename to rust/parquet/src/column/page.rs index 115037cba0bd5..9e0c76fb83cbd 100644 --- a/rust/src/parquet/column/page.rs +++ b/rust/parquet/src/column/page.rs @@ -17,10 +17,10 @@ //! Contains Parquet Page definitions and page reader interface. -use crate::parquet::basic::{Encoding, PageType}; -use crate::parquet::errors::Result; -use crate::parquet::file::{metadata::ColumnChunkMetaData, statistics::Statistics}; -use crate::parquet::util::memory::ByteBufferPtr; +use crate::basic::{Encoding, PageType}; +use crate::errors::Result; +use crate::file::{metadata::ColumnChunkMetaData, statistics::Statistics}; +use crate::util::memory::ByteBufferPtr; /// Parquet Page definition. 
/// diff --git a/rust/src/parquet/column/reader.rs b/rust/parquet/src/column/reader.rs similarity index 99% rename from rust/src/parquet/column/reader.rs rename to rust/parquet/src/column/reader.rs index f3dde31ab9a14..d327c50879ea8 100644 --- a/rust/src/parquet/column/reader.rs +++ b/rust/parquet/src/column/reader.rs @@ -24,15 +24,15 @@ use std::{ }; use super::page::{Page, PageReader}; -use crate::parquet::basic::*; -use crate::parquet::data_type::*; -use crate::parquet::encodings::{ +use crate::basic::*; +use crate::data_type::*; +use crate::encodings::{ decoding::{get_decoder, Decoder, DictDecoder, PlainDecoder}, levels::LevelDecoder, }; -use crate::parquet::errors::{ParquetError, Result}; -use crate::parquet::schema::types::ColumnDescPtr; -use crate::parquet::util::memory::ByteBufferPtr; +use crate::errors::{ParquetError, Result}; +use crate::schema::types::ColumnDescPtr; +use crate::util::memory::ByteBufferPtr; /// Column reader for a Parquet type. pub enum ColumnReader { @@ -490,14 +490,14 @@ mod tests { use rand::distributions::range::SampleRange; use std::{collections::VecDeque, rc::Rc, vec::IntoIter}; - use crate::parquet::basic::Type as PhysicalType; - use crate::parquet::column::page::Page; - use crate::parquet::encodings::{ + use crate::basic::Type as PhysicalType; + use crate::column::page::Page; + use crate::encodings::{ encoding::{get_encoder, DictEncoder, Encoder}, levels::{max_buffer_size, LevelEncoder}, }; - use crate::parquet::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; - use crate::parquet::util::{ + use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; + use crate::util::{ memory::{ByteBufferPtr, MemTracker, MemTrackerPtr}, test_common::random_numbers_range, }; diff --git a/rust/src/parquet/column/writer.rs b/rust/parquet/src/column/writer.rs similarity index 98% rename from rust/src/parquet/column/writer.rs rename to rust/parquet/src/column/writer.rs index 4798d9ad17927..26bd7c5aac778 100644 --- a/rust/src/parquet/column/writer.rs +++ b/rust/parquet/src/column/writer.rs @@ -19,21 +19,21 @@ use std::{cmp, collections::VecDeque, mem, rc::Rc}; -use crate::parquet::basic::{Compression, Encoding, PageType, Type}; -use crate::parquet::column::page::{CompressedPage, Page, PageWriteSpec, PageWriter}; -use crate::parquet::compression::{create_codec, Codec}; -use crate::parquet::data_type::*; -use crate::parquet::encodings::{ +use crate::basic::{Compression, Encoding, PageType, Type}; +use crate::column::page::{CompressedPage, Page, PageWriteSpec, PageWriter}; +use crate::compression::{create_codec, Codec}; +use crate::data_type::*; +use crate::encodings::{ encoding::{get_encoder, DictEncoder, Encoder}, levels::{max_buffer_size, LevelEncoder}, }; -use crate::parquet::errors::{ParquetError, Result}; -use crate::parquet::file::{ +use crate::errors::{ParquetError, Result}; +use crate::file::{ metadata::ColumnChunkMetaData, properties::{WriterProperties, WriterPropertiesPtr, WriterVersion}, }; -use crate::parquet::schema::types::ColumnDescPtr; -use crate::parquet::util::memory::{ByteBufferPtr, MemTracker}; +use crate::schema::types::ColumnDescPtr; +use crate::util::memory::{ByteBufferPtr, MemTracker}; /// Column writer for a Parquet type. 
pub enum ColumnWriter { @@ -802,15 +802,15 @@ mod tests { use rand::distributions::range::SampleRange; - use crate::parquet::column::{ + use crate::column::{ page::PageReader, reader::{get_column_reader, get_typed_column_reader, ColumnReaderImpl}, }; - use crate::parquet::file::{ + use crate::file::{ properties::WriterProperties, reader::SerializedPageReader, writer::SerializedPageWriter, }; - use crate::parquet::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; - use crate::parquet::util::{ + use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; + use crate::util::{ io::{FileSink, FileSource}, test_common::{get_temp_file, random_numbers_range}, }; diff --git a/rust/src/parquet/compression.rs b/rust/parquet/src/compression.rs similarity index 97% rename from rust/src/parquet/compression.rs rename to rust/parquet/src/compression.rs index 3690cca032361..3644ffcc54272 100644 --- a/rust/src/parquet/compression.rs +++ b/rust/parquet/src/compression.rs @@ -23,7 +23,7 @@ //! # Example //! //! ```rust -//! use arrow::parquet::{basic::Compression, compression::create_codec}; +//! use parquet::{basic::Compression, compression::create_codec}; //! //! let mut codec = match create_codec(Compression::SNAPPY) { //! Ok(Some(codec)) => codec, @@ -48,8 +48,8 @@ use lz4; use snap::{decompress_len, max_compress_len, Decoder, Encoder}; use zstd; -use crate::parquet::basic::Compression as CodecType; -use crate::parquet::errors::{ParquetError, Result}; +use crate::basic::Compression as CodecType; +use crate::errors::{ParquetError, Result}; /// Parquet compression codec interface. pub trait Codec { @@ -250,7 +250,7 @@ impl Codec for ZSTDCodec { mod tests { use super::*; - use crate::parquet::util::test_common::*; + use crate::util::test_common::*; fn test_roundtrip(c: CodecType, data: &Vec) { let mut c1 = create_codec(c).unwrap().unwrap(); diff --git a/rust/src/parquet/data_type.rs b/rust/parquet/src/data_type.rs similarity index 99% rename from rust/src/parquet/data_type.rs rename to rust/parquet/src/data_type.rs index 26bdebd71bc8b..bfe0889cf71c4 100644 --- a/rust/src/parquet/data_type.rs +++ b/rust/parquet/src/data_type.rs @@ -22,8 +22,8 @@ use std::mem; use byteorder::{BigEndian, ByteOrder}; -use crate::parquet::basic::Type; -use crate::parquet::util::memory::{ByteBuffer, ByteBufferPtr}; +use crate::basic::Type; +use crate::util::memory::{ByteBuffer, ByteBufferPtr}; /// Rust representation for logical type INT96, value is backed by an array of `u32`. /// The type only takes 12 bytes, without extra padding. 
diff --git a/rust/src/parquet/encodings/decoding.rs b/rust/parquet/src/encodings/decoding.rs similarity index 99% rename from rust/src/parquet/encodings/decoding.rs rename to rust/parquet/src/encodings/decoding.rs index c6a6fd49ee336..f0e93fe1abea7 100644 --- a/rust/src/parquet/encodings/decoding.rs +++ b/rust/parquet/src/encodings/decoding.rs @@ -23,11 +23,11 @@ use super::rle::RleDecoder; use byteorder::{ByteOrder, LittleEndian}; -use crate::parquet::basic::*; -use crate::parquet::data_type::*; -use crate::parquet::errors::{ParquetError, Result}; -use crate::parquet::schema::types::ColumnDescPtr; -use crate::parquet::util::{ +use crate::basic::*; +use crate::data_type::*; +use crate::errors::{ParquetError, Result}; +use crate::schema::types::ColumnDescPtr; +use crate::util::{ bit_util::BitReader, memory::{ByteBuffer, ByteBufferPtr}, }; @@ -865,10 +865,8 @@ mod tests { use std::{mem, rc::Rc}; - use crate::parquet::schema::types::{ - ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType, - }; - use crate::parquet::util::{bit_util::set_array_bit, memory::MemTracker, test_common::RandGen}; + use crate::schema::types::{ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType}; + use crate::util::{bit_util::set_array_bit, memory::MemTracker, test_common::RandGen}; #[test] fn test_get_decoders() { diff --git a/rust/src/parquet/encodings/encoding.rs b/rust/parquet/src/encodings/encoding.rs similarity index 98% rename from rust/src/parquet/encodings/encoding.rs rename to rust/parquet/src/encodings/encoding.rs index cecb03cb540a9..e1d674cc6ca2c 100644 --- a/rust/src/parquet/encodings/encoding.rs +++ b/rust/parquet/src/encodings/encoding.rs @@ -19,12 +19,12 @@ use std::{cmp, io::Write, marker::PhantomData, mem, slice}; -use crate::parquet::basic::*; -use crate::parquet::data_type::*; -use crate::parquet::encodings::rle::RleEncoder; -use crate::parquet::errors::{ParquetError, Result}; -use crate::parquet::schema::types::ColumnDescPtr; -use crate::parquet::util::{ +use crate::basic::*; +use crate::data_type::*; +use crate::encodings::rle::RleEncoder; +use crate::errors::{ParquetError, Result}; +use crate::schema::types::ColumnDescPtr; +use crate::util::{ bit_util::{log2, num_required_bits, BitWriter}, hash_util, memory::{Buffer, ByteBuffer, ByteBufferPtr, MemTrackerPtr}, @@ -988,11 +988,9 @@ mod tests { use std::rc::Rc; - use crate::parquet::decoding::{get_decoder, Decoder, DictDecoder, PlainDecoder}; - use crate::parquet::schema::types::{ - ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType, - }; - use crate::parquet::util::{memory::MemTracker, test_common::RandGen}; + use crate::decoding::{get_decoder, Decoder, DictDecoder, PlainDecoder}; + use crate::schema::types::{ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType}; + use crate::util::{memory::MemTracker, test_common::RandGen}; const TEST_SET_SIZE: usize = 1024; diff --git a/rust/src/parquet/encodings/levels.rs b/rust/parquet/src/encodings/levels.rs similarity index 98% rename from rust/src/parquet/encodings/levels.rs rename to rust/parquet/src/encodings/levels.rs index ec65198ce55f0..29c92ddcdba9b 100644 --- a/rust/src/parquet/encodings/levels.rs +++ b/rust/parquet/src/encodings/levels.rs @@ -19,10 +19,10 @@ use std::{cmp, mem}; use super::rle::{RleDecoder, RleEncoder}; -use crate::parquet::basic::Encoding; -use crate::parquet::data_type::AsBytes; -use crate::parquet::errors::{ParquetError, Result}; -use crate::parquet::util::{ +use crate::basic::Encoding; +use crate::data_type::AsBytes; +use 
crate::errors::{ParquetError, Result}; +use crate::util::{ bit_util::{ceil, log2, BitReader, BitWriter}, memory::ByteBufferPtr, }; @@ -267,7 +267,7 @@ impl LevelDecoder { mod tests { use super::*; - use crate::parquet::util::test_common::random_numbers_range; + use crate::util::test_common::random_numbers_range; fn test_internal_roundtrip(enc: Encoding, levels: &[i16], max_level: i16, v2: bool) { let size = max_buffer_size(enc, max_level, levels.len()); diff --git a/rust/src/parquet/encodings/mod.rs b/rust/parquet/src/encodings/mod.rs similarity index 100% rename from rust/src/parquet/encodings/mod.rs rename to rust/parquet/src/encodings/mod.rs diff --git a/rust/src/parquet/encodings/rle.rs b/rust/parquet/src/encodings/rle.rs similarity index 99% rename from rust/src/parquet/encodings/rle.rs rename to rust/parquet/src/encodings/rle.rs index 5b56c2a250495..1a8b6e5c7c6b8 100644 --- a/rust/src/parquet/encodings/rle.rs +++ b/rust/parquet/src/encodings/rle.rs @@ -20,8 +20,8 @@ use std::{ mem::{size_of, transmute_copy}, }; -use crate::parquet::errors::{ParquetError, Result}; -use crate::parquet::util::{ +use crate::errors::{ParquetError, Result}; +use crate::util::{ bit_util::{self, BitReader, BitWriter}, memory::ByteBufferPtr, }; @@ -520,7 +520,7 @@ mod tests { thread_rng, Rng, SeedableRng, }; - use crate::parquet::util::memory::ByteBufferPtr; + use crate::util::memory::ByteBufferPtr; const MAX_WIDTH: usize = 32; diff --git a/rust/src/parquet/errors.rs b/rust/parquet/src/errors.rs similarity index 100% rename from rust/src/parquet/errors.rs rename to rust/parquet/src/errors.rs diff --git a/rust/src/parquet/file/metadata.rs b/rust/parquet/src/file/metadata.rs similarity index 99% rename from rust/src/parquet/file/metadata.rs rename to rust/parquet/src/file/metadata.rs index 7f2442506f67f..06507fdcad2a8 100644 --- a/rust/src/parquet/file/metadata.rs +++ b/rust/parquet/src/file/metadata.rs @@ -37,10 +37,10 @@ use std::rc::Rc; use parquet_format::{ColumnChunk, ColumnMetaData, RowGroup}; -use crate::parquet::basic::{ColumnOrder, Compression, Encoding, Type}; -use crate::parquet::errors::{ParquetError, Result}; -use crate::parquet::file::statistics::{self, Statistics}; -use crate::parquet::schema::types::{ +use crate::basic::{ColumnOrder, Compression, Encoding, Type}; +use crate::errors::{ParquetError, Result}; +use crate::file::statistics::{self, Statistics}; +use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor, Type as SchemaType, TypePtr, }; diff --git a/rust/src/parquet/file/mod.rs b/rust/parquet/src/file/mod.rs similarity index 96% rename from rust/src/parquet/file/mod.rs rename to rust/parquet/src/file/mod.rs index 38fe8fa9b15b1..407a97d5d6e5e 100644 --- a/rust/src/parquet/file/mod.rs +++ b/rust/parquet/src/file/mod.rs @@ -29,7 +29,7 @@ //! ```rust,no_run //! use std::{fs, path::Path, rc::Rc}; //! -//! use arrow::parquet::{ +//! use parquet::{ //! file::{ //! properties::WriterProperties, //! writer::{FileWriter, SerializedFileWriter}, @@ -62,7 +62,7 @@ //! # Example of reading an existing file //! //! ```rust,no_run -//! use arrow::parquet::file::reader::{FileReader, SerializedFileReader}; +//! use parquet::file::reader::{FileReader, SerializedFileReader}; //! use std::{fs::File, path::Path}; //! //! 
let path = Path::new("/path/to/sample.parquet"); diff --git a/rust/src/parquet/file/properties.rs b/rust/parquet/src/file/properties.rs similarity index 99% rename from rust/src/parquet/file/properties.rs rename to rust/parquet/src/file/properties.rs index 911ec55733490..47b232e6fab04 100644 --- a/rust/src/parquet/file/properties.rs +++ b/rust/parquet/src/file/properties.rs @@ -20,7 +20,7 @@ //! # Usage //! //! ```rust -//! use arrow::parquet::{ +//! use parquet::{ //! basic::{Compression, Encoding}, //! file::properties::*, //! schema::types::ColumnPath, @@ -50,8 +50,8 @@ use std::{collections::HashMap, rc::Rc}; -use crate::parquet::basic::{Compression, Encoding}; -use crate::parquet::schema::types::ColumnPath; +use crate::basic::{Compression, Encoding}; +use crate::schema::types::ColumnPath; const DEFAULT_PAGE_SIZE: usize = 1024 * 1024; const DEFAULT_WRITE_BATCH_SIZE: usize = 1024; diff --git a/rust/src/parquet/file/reader.rs b/rust/parquet/src/file/reader.rs similarity index 98% rename from rust/src/parquet/file/reader.rs rename to rust/parquet/src/file/reader.rs index c2e5dd176dac5..747fbbc64f82e 100644 --- a/rust/src/parquet/file/reader.rs +++ b/rust/parquet/src/file/reader.rs @@ -32,17 +32,17 @@ use parquet_format::{ }; use thrift::protocol::TCompactInputProtocol; -use crate::parquet::basic::{ColumnOrder, Compression, Encoding, Type}; -use crate::parquet::column::{ +use crate::basic::{ColumnOrder, Compression, Encoding, Type}; +use crate::column::{ page::{Page, PageReader}, reader::{ColumnReader, ColumnReaderImpl}, }; -use crate::parquet::compression::{create_codec, Codec}; -use crate::parquet::errors::{ParquetError, Result}; -use crate::parquet::file::{metadata::*, statistics, FOOTER_SIZE, PARQUET_MAGIC}; -use crate::parquet::record::reader::RowIter; -use crate::parquet::schema::types::{self, SchemaDescriptor, Type as SchemaType}; -use crate::parquet::util::{io::FileSource, memory::ByteBufferPtr}; +use crate::compression::{create_codec, Codec}; +use crate::errors::{ParquetError, Result}; +use crate::file::{metadata::*, statistics, FOOTER_SIZE, PARQUET_MAGIC}; +use crate::record::reader::RowIter; +use crate::schema::types::{self, SchemaDescriptor, Type as SchemaType}; +use crate::util::{io::FileSource, memory::ByteBufferPtr}; // ---------------------------------------------------------------------- // APIs for file & row group readers @@ -545,8 +545,8 @@ mod tests { use parquet_format::TypeDefinedOrder; - use crate::parquet::basic::SortOrder; - use crate::parquet::util::test_common::{get_temp_file, get_test_file, get_test_path}; + use crate::basic::SortOrder; + use crate::util::test_common::{get_temp_file, get_test_file, get_test_path}; #[test] fn test_file_reader_metadata_size_smaller_than_footer() { diff --git a/rust/src/parquet/file/statistics.rs b/rust/parquet/src/file/statistics.rs similarity index 99% rename from rust/src/parquet/file/statistics.rs rename to rust/parquet/src/file/statistics.rs index ff4d731857f16..03831bbc72bf7 100644 --- a/rust/src/parquet/file/statistics.rs +++ b/rust/parquet/src/file/statistics.rs @@ -21,7 +21,7 @@ //! actual min and max values from statistics, see below: //! //! ```rust -//! use arrow::parquet::file::statistics::Statistics; +//! use parquet::file::statistics::Statistics; //! //! let stats = Statistics::int32(Some(1), Some(10), None, 3, true); //! 
assert_eq!(stats.null_count(), 3); @@ -42,8 +42,8 @@ use std::{cmp, fmt}; use byteorder::{ByteOrder, LittleEndian}; use parquet_format::Statistics as TStatistics; -use crate::parquet::basic::Type; -use crate::parquet::data_type::*; +use crate::basic::Type; +use crate::data_type::*; // Macro to generate methods create Statistics. macro_rules! statistics_new_func { diff --git a/rust/src/parquet/file/writer.rs b/rust/parquet/src/file/writer.rs similarity index 98% rename from rust/src/parquet/file/writer.rs rename to rust/parquet/src/file/writer.rs index 1e0c11641f9a4..e000842f3895f 100644 --- a/rust/src/parquet/file/writer.rs +++ b/rust/parquet/src/file/writer.rs @@ -28,18 +28,18 @@ use byteorder::{ByteOrder, LittleEndian}; use parquet_format as parquet; use thrift::protocol::{TCompactOutputProtocol, TOutputProtocol}; -use crate::parquet::basic::PageType; -use crate::parquet::column::{ +use crate::basic::PageType; +use crate::column::{ page::{CompressedPage, Page, PageWriteSpec, PageWriter}, writer::{get_column_writer, ColumnWriter}, }; -use crate::parquet::errors::{ParquetError, Result}; -use crate::parquet::file::{ +use crate::errors::{ParquetError, Result}; +use crate::file::{ metadata::*, properties::WriterPropertiesPtr, statistics::to_thrift as statistics_to_thrift, FOOTER_SIZE, PARQUET_MAGIC, }; -use crate::parquet::schema::types::{self, SchemaDescPtr, SchemaDescriptor, TypePtr}; -use crate::parquet::util::io::{FileSink, Position}; +use crate::schema::types::{self, SchemaDescPtr, SchemaDescriptor, TypePtr}; +use crate::util::io::{FileSink, Position}; // ---------------------------------------------------------------------- // APIs for file & row group writers @@ -512,16 +512,16 @@ mod tests { use std::{error::Error, io::Cursor}; - use crate::parquet::basic::{Compression, Encoding, Repetition, Type}; - use crate::parquet::column::page::PageReader; - use crate::parquet::compression::{create_codec, Codec}; - use crate::parquet::file::{ + use crate::basic::{Compression, Encoding, Repetition, Type}; + use crate::column::page::PageReader; + use crate::compression::{create_codec, Codec}; + use crate::file::{ properties::WriterProperties, reader::{FileReader, SerializedFileReader, SerializedPageReader}, statistics::{from_thrift, to_thrift, Statistics}, }; - use crate::parquet::record::RowAccessor; - use crate::parquet::util::{memory::ByteBufferPtr, test_common::get_temp_file}; + use crate::record::RowAccessor; + use crate::util::{memory::ByteBufferPtr, test_common::get_temp_file}; #[test] fn test_file_writer_error_after_close() { diff --git a/rust/src/parquet/mod.rs b/rust/parquet/src/lib.rs similarity index 87% rename from rust/src/parquet/mod.rs rename to rust/parquet/src/lib.rs index 58cc7b13df6d6..75c56f5054f19 100644 --- a/rust/src/parquet/mod.rs +++ b/rust/parquet/src/lib.rs @@ -15,6 +15,13 @@ // specific language governing permissions and limitations // under the License. 
+#![feature(type_ascription)] +#![feature(rustc_private)] +#![feature(specialization)] +#![feature(try_from)] +#![allow(dead_code)] +#![allow(non_camel_case_types)] + #[macro_use] pub mod errors; pub mod basic; diff --git a/rust/src/parquet/record/api.rs b/rust/parquet/src/record/api.rs similarity index 99% rename from rust/src/parquet/record/api.rs rename to rust/parquet/src/record/api.rs index d6e3ec19b76f6..d0be43ad730ed 100644 --- a/rust/src/parquet/record/api.rs +++ b/rust/parquet/src/record/api.rs @@ -22,10 +22,10 @@ use std::fmt; use chrono::{Local, TimeZone}; use num_bigint::{BigInt, Sign}; -use crate::parquet::basic::{LogicalType, Type as PhysicalType}; -use crate::parquet::data_type::{ByteArray, Decimal, Int96}; -use crate::parquet::errors::{ParquetError, Result}; -use crate::parquet::schema::types::ColumnDescPtr; +use crate::basic::{LogicalType, Type as PhysicalType}; +use crate::data_type::{ByteArray, Decimal, Int96}; +use crate::errors::{ParquetError, Result}; +use crate::schema::types::ColumnDescPtr; /// Macro as a shortcut to generate 'not yet implemented' panic error. macro_rules! nyi { @@ -689,7 +689,7 @@ mod tests { use chrono; use std::rc::Rc; - use crate::parquet::schema::types::{ColumnDescriptor, ColumnPath, PrimitiveTypeBuilder}; + use crate::schema::types::{ColumnDescriptor, ColumnPath, PrimitiveTypeBuilder}; /// Creates test column descriptor based on provided type parameters. macro_rules! make_column_descr { diff --git a/rust/src/parquet/record/mod.rs b/rust/parquet/src/record/mod.rs similarity index 100% rename from rust/src/parquet/record/mod.rs rename to rust/parquet/src/record/mod.rs diff --git a/rust/src/parquet/record/reader.rs b/rust/parquet/src/record/reader.rs similarity index 98% rename from rust/src/parquet/record/reader.rs rename to rust/parquet/src/record/reader.rs index d9f3d6fea1978..e1d3c964eca3a 100644 --- a/rust/src/parquet/record/reader.rs +++ b/rust/parquet/src/record/reader.rs @@ -20,14 +20,14 @@ use std::{collections::HashMap, fmt, rc::Rc}; -use crate::parquet::basic::{LogicalType, Repetition}; -use crate::parquet::errors::{ParquetError, Result}; -use crate::parquet::file::reader::{FileReader, RowGroupReader}; -use crate::parquet::record::{ +use crate::basic::{LogicalType, Repetition}; +use crate::errors::{ParquetError, Result}; +use crate::file::reader::{FileReader, RowGroupReader}; +use crate::record::{ api::{make_list, make_map, make_row, Field, Row}, triplet::TripletIter, }; -use crate::parquet::schema::types::{ColumnPath, SchemaDescPtr, SchemaDescriptor, Type, TypePtr}; +use crate::schema::types::{ColumnPath, SchemaDescPtr, SchemaDescriptor, Type, TypePtr}; /// Default batch size for a reader const DEFAULT_BATCH_SIZE: usize = 1024; @@ -709,11 +709,11 @@ impl Iterator for ReaderIter { mod tests { use super::*; - use crate::parquet::errors::{ParquetError, Result}; - use crate::parquet::file::reader::{FileReader, SerializedFileReader}; - use crate::parquet::record::api::{Field, Row}; - use crate::parquet::schema::parser::parse_message_type; - use crate::parquet::util::test_common::get_test_file; + use crate::errors::{ParquetError, Result}; + use crate::file::reader::{FileReader, SerializedFileReader}; + use crate::record::api::{Field, Row}; + use crate::schema::parser::parse_message_type; + use crate::util::test_common::get_test_file; // Convenient macros to assemble row, list, map, and group. 
diff --git a/rust/src/parquet/record/triplet.rs b/rust/parquet/src/record/triplet.rs similarity index 97% rename from rust/src/parquet/record/triplet.rs rename to rust/parquet/src/record/triplet.rs index fadcbbce9ba5b..6ec7799ccb03c 100644 --- a/rust/src/parquet/record/triplet.rs +++ b/rust/parquet/src/record/triplet.rs @@ -15,12 +15,12 @@ // specific language governing permissions and limitations // under the License. -use crate::parquet::basic::Type as PhysicalType; -use crate::parquet::column::reader::{get_typed_column_reader, ColumnReader, ColumnReaderImpl}; -use crate::parquet::data_type::*; -use crate::parquet::errors::{ParquetError, Result}; -use crate::parquet::record::api::Field; -use crate::parquet::schema::types::ColumnDescPtr; +use crate::basic::Type as PhysicalType; +use crate::column::reader::{get_typed_column_reader, ColumnReader, ColumnReaderImpl}; +use crate::data_type::*; +use crate::errors::{ParquetError, Result}; +use crate::record::api::Field; +use crate::schema::types::ColumnDescPtr; /// Macro to generate simple functions that cover all types of triplet iterator. /// $func is a function of a typed triplet iterator and $token is a either {`ref`} or @@ -353,9 +353,9 @@ impl TypedTripletIter { mod tests { use super::*; - use crate::parquet::file::reader::{FileReader, SerializedFileReader}; - use crate::parquet::schema::types::ColumnPath; - use crate::parquet::util::test_common::get_test_file; + use crate::file::reader::{FileReader, SerializedFileReader}; + use crate::schema::types::ColumnPath; + use crate::util::test_common::get_test_file; #[test] #[should_panic(expected = "Expected positive batch size, found: 0")] diff --git a/rust/src/parquet/schema/mod.rs b/rust/parquet/src/schema/mod.rs similarity index 98% rename from rust/src/parquet/schema/mod.rs rename to rust/parquet/src/schema/mod.rs index 5319504964627..351ce97337178 100644 --- a/rust/src/parquet/schema/mod.rs +++ b/rust/parquet/src/schema/mod.rs @@ -20,7 +20,7 @@ //! # Example //! //! ```rust -//! use arrow::parquet::{ +//! use parquet::{ //! basic::{LogicalType, Repetition, Type as PhysicalType}, //! schema::{parser, printer, types::Type}, //! }; diff --git a/rust/src/parquet/schema/parser.rs b/rust/parquet/src/schema/parser.rs similarity index 99% rename from rust/src/parquet/schema/parser.rs rename to rust/parquet/src/schema/parser.rs index 2890c84a755ba..955c6c9830223 100644 --- a/rust/src/parquet/schema/parser.rs +++ b/rust/parquet/src/schema/parser.rs @@ -22,7 +22,7 @@ //! # Example //! //! ```rust -//! use arrow::parquet::schema::parser::parse_message_type; +//! use parquet::schema::parser::parse_message_type; //! //! let message_type = " //! message spark_schema { @@ -44,9 +44,9 @@ use std::rc::Rc; -use crate::parquet::basic::{LogicalType, Repetition, Type as PhysicalType}; -use crate::parquet::errors::{ParquetError, Result}; -use crate::parquet::schema::types::{Type, TypePtr}; +use crate::basic::{LogicalType, Repetition, Type as PhysicalType}; +use crate::errors::{ParquetError, Result}; +use crate::schema::types::{Type, TypePtr}; /// Parses message type as string into a Parquet [`Type`](`::schema::types::Type`) which, /// for example, could be used to extract individual columns. 
Returns Parquet general diff --git a/rust/src/parquet/schema/printer.rs b/rust/parquet/src/schema/printer.rs similarity index 97% rename from rust/src/parquet/schema/printer.rs rename to rust/parquet/src/schema/printer.rs index d61f116eb9e70..87c3683d9237d 100644 --- a/rust/src/parquet/schema/printer.rs +++ b/rust/parquet/src/schema/printer.rs @@ -21,7 +21,7 @@ //! # Example //! //! ```rust -//! use arrow::parquet::{ +//! use parquet::{ //! file::reader::{FileReader, SerializedFileReader}, //! schema::printer::{print_file_metadata, print_parquet_metadata, print_schema}, //! }; @@ -45,11 +45,9 @@ use std::{fmt, io}; -use crate::parquet::basic::{LogicalType, Type as PhysicalType}; -use crate::parquet::file::metadata::{ - ColumnChunkMetaData, FileMetaData, ParquetMetaData, RowGroupMetaData, -}; -use crate::parquet::schema::types::Type; +use crate::basic::{LogicalType, Type as PhysicalType}; +use crate::file::metadata::{ColumnChunkMetaData, FileMetaData, ParquetMetaData, RowGroupMetaData}; +use crate::schema::types::Type; /// Prints Parquet metadata [`ParquetMetaData`](`::file::metadata::ParquetMetaData`) /// information. @@ -260,8 +258,8 @@ mod tests { use std::rc::Rc; - use crate::parquet::basic::{Repetition, Type as PhysicalType}; - use crate::parquet::schema::{parser::parse_message_type, types::Type}; + use crate::basic::{Repetition, Type as PhysicalType}; + use crate::schema::{parser::parse_message_type, types::Type}; fn assert_print_parse_message(message: Type) { let mut s = String::new(); diff --git a/rust/src/parquet/schema/types.rs b/rust/parquet/src/schema/types.rs similarity index 99% rename from rust/src/parquet/schema/types.rs rename to rust/parquet/src/schema/types.rs index 90c767c093055..30ee9f60e1a3e 100644 --- a/rust/src/parquet/schema/types.rs +++ b/rust/parquet/src/schema/types.rs @@ -21,8 +21,8 @@ use std::{collections::HashMap, convert::From, fmt, rc::Rc}; use parquet_format::SchemaElement; -use crate::parquet::basic::{LogicalType, Repetition, Type as PhysicalType}; -use crate::parquet::errors::{ParquetError, Result}; +use crate::basic::{LogicalType, Repetition, Type as PhysicalType}; +use crate::errors::{ParquetError, Result}; // ---------------------------------------------------------------------- // Parquet Type definitions @@ -512,7 +512,7 @@ impl ColumnPath { /// Returns string representation of this column path. 
/// ```rust - /// use arrow::parquet::schema::types::ColumnPath; + /// use parquet::schema::types::ColumnPath; /// /// let path = ColumnPath::new(vec!["a".to_string(), "b".to_string(), "c".to_string()]); /// assert_eq!(&path.string(), "a.b.c"); @@ -1014,7 +1014,7 @@ mod tests { use std::error::Error; - use crate::parquet::schema::parser::parse_message_type; + use crate::schema::parser::parse_message_type; #[test] fn test_primitive_type() { diff --git a/rust/src/parquet/util/bit_packing.rs b/rust/parquet/src/util/bit_packing.rs similarity index 100% rename from rust/src/parquet/util/bit_packing.rs rename to rust/parquet/src/util/bit_packing.rs diff --git a/rust/src/parquet/util/bit_util.rs b/rust/parquet/src/util/bit_util.rs similarity index 99% rename from rust/src/parquet/util/bit_util.rs rename to rust/parquet/src/util/bit_util.rs index 9dbb9a32333d2..ae680ecca4735 100644 --- a/rust/src/parquet/util/bit_util.rs +++ b/rust/parquet/src/util/bit_util.rs @@ -20,8 +20,8 @@ use std::{ mem::{size_of, transmute_copy}, }; -use crate::parquet::errors::{ParquetError, Result}; -use crate::parquet::util::{bit_packing::unpack32, memory::ByteBufferPtr}; +use crate::errors::{ParquetError, Result}; +use crate::util::{bit_packing::unpack32, memory::ByteBufferPtr}; /// Reads `$size` of bytes from `$src`, and reinterprets them as type `$ty`, in /// little-endian order. `$ty` must implement the `Default` trait. Otherwise this won't diff --git a/rust/src/parquet/util/hash_util.rs b/rust/parquet/src/util/hash_util.rs similarity index 99% rename from rust/src/parquet/util/hash_util.rs rename to rust/parquet/src/util/hash_util.rs index c7bffef8bbf34..b4685fbd004da 100644 --- a/rust/src/parquet/util/hash_util.rs +++ b/rust/parquet/src/util/hash_util.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::parquet::data_type::AsBytes; +use crate::data_type::AsBytes; /// Computes hash value for `data`, with a seed value `seed`. /// The data type `T` must implement the `AsBytes` trait. diff --git a/rust/src/parquet/util/io.rs b/rust/parquet/src/util/io.rs similarity index 98% rename from rust/src/parquet/util/io.rs rename to rust/parquet/src/util/io.rs index 8724e67c2dbe7..d667c8e817a91 100644 --- a/rust/src/parquet/util/io.rs +++ b/rust/parquet/src/util/io.rs @@ -17,7 +17,7 @@ use std::{cmp, fs::File, io::*, sync::Mutex}; -use crate::parquet::file::reader::ParquetReader; +use crate::file::reader::ParquetReader; // ---------------------------------------------------------------------- // Read/Write wrappers for `File`. 
@@ -132,7 +132,7 @@ impl<'a> Position for Cursor<&'a mut Vec> { mod tests { use super::*; - use crate::parquet::util::test_common::{get_temp_file, get_test_file}; + use crate::util::test_common::{get_temp_file, get_test_file}; #[test] fn test_io_read_fully() { diff --git a/rust/src/parquet/util/memory.rs b/rust/parquet/src/util/memory.rs similarity index 100% rename from rust/src/parquet/util/memory.rs rename to rust/parquet/src/util/memory.rs diff --git a/rust/src/parquet/util/mod.rs b/rust/parquet/src/util/mod.rs similarity index 100% rename from rust/src/parquet/util/mod.rs rename to rust/parquet/src/util/mod.rs diff --git a/rust/src/parquet/util/test_common.rs b/rust/parquet/src/util/test_common.rs similarity index 98% rename from rust/src/parquet/util/test_common.rs rename to rust/parquet/src/util/test_common.rs index f9b1af4a5cef4..ad315a6aa4f69 100644 --- a/rust/src/parquet/util/test_common.rs +++ b/rust/parquet/src/util/test_common.rs @@ -21,8 +21,8 @@ use rand::{ }; use std::{env, fs, io::Write, path::PathBuf, str::FromStr}; -use crate::parquet::data_type::*; -use crate::parquet::util::memory::ByteBufferPtr; +use crate::data_type::*; +use crate::util::memory::ByteBufferPtr; /// Random generator of data type `T` values and sequences. pub trait RandGen { diff --git a/rust/src/lib.rs b/rust/src/lib.rs index d5708b10504c4..199159e369b5f 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -32,7 +32,6 @@ pub mod csv; pub mod datatypes; pub mod error; pub mod memory; -pub mod parquet; pub mod record_batch; pub mod tensor; pub mod util; From c71d27fe55ca2a273f194c860b59074b0c998a74 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Thu, 3 Jan 2019 18:47:25 +0900 Subject: [PATCH 132/328] ARROW-4141: [Ruby] Add support for creating schema from raw Ruby objects The followings should be implemented by follow-up works: * Arrow::TimestampDataType.new(unit: ...) * Arrow::Time32DataType.new(unit: ...) * Arrow::Time64DataType.new(unit: ...) * Arrow::DecimalDataType.new(precision: ..., scale: ...) * Arrow::SparseUnionDataType.new(fields: ..., type_codes: ...) * Arrow::DenseUnionDataType.new(fields: ..., type_codes: ...) * Arrow::DictionaryDataType.new(fields: ..., type_codes: ...) 
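The forms that this patch already supports look like the following (a usage sketch assembled from the tests and API documentation added below; it assumes Arrow GLib and red-arrow are installed and that `require "arrow"` loads the library):

```ruby
require "arrow"

# Arrow::Field objects and raw field descriptions (Hash) can be mixed freely.
schema = Arrow::Schema.new([
  Arrow::Field.new("count", :uint32),
  {name: "visible", type: :boolean},
])

# A Hash of name => data type description also works, including nested types.
schema = Arrow::Schema.new("count" => :uint32,
                           :tags => {
                             type: :list,
                             field: {name: "tag", type: :string},
                           })
```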
Author: Kouhei Sutou Closes #3293 from kou/ruby-schema-new and squashes the following commits: d251ba9d Add .yardopts to rat exclude files 169b8656 Add support for creating schema from raw Ruby objects --- dev/release/rat_exclude_files.txt | 1 + ruby/red-arrow/.gitignore | 2 + ruby/red-arrow/.yardopts | 6 + ruby/red-arrow/README.md | 2 +- ruby/red-arrow/Rakefile | 4 + ruby/red-arrow/lib/arrow/data-type.rb | 110 +++++++++++++++++-- ruby/red-arrow/lib/arrow/field.rb | 99 +++++++++++++++-- ruby/red-arrow/lib/arrow/list-data-type.rb | 68 ++++++++++++ ruby/red-arrow/lib/arrow/loader.rb | 1 + ruby/red-arrow/lib/arrow/schema.rb | 71 ++++++++++++ ruby/red-arrow/lib/arrow/struct-data-type.rb | 104 ++++++++++++++++++ ruby/red-arrow/red-arrow.gemspec | 2 + ruby/red-arrow/test/test-data-type.rb | 47 ++++++++ ruby/red-arrow/test/test-field.rb | 71 ++++++++++++ ruby/red-arrow/test/test-list-data-type.rb | 43 ++++++++ ruby/red-arrow/test/test-schema.rb | 88 ++++++++++++--- ruby/red-arrow/test/test-struct-data-type.rb | 96 +++++++++++++--- 17 files changed, 765 insertions(+), 50 deletions(-) create mode 100644 ruby/red-arrow/.yardopts create mode 100644 ruby/red-arrow/lib/arrow/list-data-type.rb create mode 100644 ruby/red-arrow/test/test-data-type.rb create mode 100644 ruby/red-arrow/test/test-field.rb create mode 100644 ruby/red-arrow/test/test-list-data-type.rb diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 7674e2fee0f29..1086793630b7d 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -186,5 +186,6 @@ r/README.md r/README.Rmd r/man/*.Rd .gitattributes +ruby/red-arrow/.yardopts rust/test/data/*.csv rust/rust-toolchain diff --git a/ruby/red-arrow/.gitignore b/ruby/red-arrow/.gitignore index 779545d9026f1..68e4b5c7b5de0 100644 --- a/ruby/red-arrow/.gitignore +++ b/ruby/red-arrow/.gitignore @@ -15,4 +15,6 @@ # specific language governing permissions and limitations # under the License. +/.yardoc/ +/doc/reference/ /pkg/ diff --git a/ruby/red-arrow/.yardopts b/ruby/red-arrow/.yardopts new file mode 100644 index 0000000000000..67159b1dc2f3a --- /dev/null +++ b/ruby/red-arrow/.yardopts @@ -0,0 +1,6 @@ +--output-dir doc/reference +--markup markdown +--no-private +lib/**/*.rb +- +doc/text/* diff --git a/ruby/red-arrow/README.md b/ruby/red-arrow/README.md index a6798dd90551f..95ec396fae5b1 100644 --- a/ruby/red-arrow/README.md +++ b/ruby/red-arrow/README.md @@ -39,7 +39,7 @@ Note that the Apache Arrow GLib packages are "unofficial". "Official" packages w Install Red Arrow after you install Apache Arrow GLib: -```text +```console % gem install red-arrow ``` diff --git a/ruby/red-arrow/Rakefile b/ruby/red-arrow/Rakefile index 96851afb9f9f7..a3ece36b732ac 100644 --- a/ruby/red-arrow/Rakefile +++ b/ruby/red-arrow/Rakefile @@ -19,6 +19,7 @@ require "rubygems" require "bundler/gem_helper" +require "yard" base_dir = File.join(__dir__) @@ -37,3 +38,6 @@ task :test do end task default: :test + +YARD::Rake::YardocTask.new do |task| +end diff --git a/ruby/red-arrow/lib/arrow/data-type.rb b/ruby/red-arrow/lib/arrow/data-type.rb index dad74fb40dc83..03960e47debca 100644 --- a/ruby/red-arrow/lib/arrow/data-type.rb +++ b/ruby/red-arrow/lib/arrow/data-type.rb @@ -18,21 +18,117 @@ module Arrow class DataType class << self + # Creates a new suitable {Arrow::DataType}. + # + # @overload resolve(data_type) + # + # Returns the given data type itself. This is convenient to + # use this method as {Arrow::DataType} converter. 
+ # + # @param data_type [Arrow::DataType] The data type. + # + # @return [Arrow::DataType] The given data type itself. + # + # @overload resolve(name, *arguments) + # + # Creates a suitable data type from type name. For example, + # you can create {Arrow::BooleanDataType} from `:boolean`. + # + # @param name [String, Symbol] The type name of the data type. + # + # @param arguments [::Array] The additional information of the + # data type. + # + # For example, {Arrow::TimestampDataType} needs unit as + # additional information. + # + # @example Create a boolean data type + # Arrow::DataType.resolve(:boolean) + # + # @example Create a milliseconds unit timestamp data type + # Arrow::DataType.resolve(:timestamp, :milli) + # + # @overload resolve(description) + # + # Creates a suitable data type from data type description. + # + # Data type description is a raw `Hash`. Data type description + # must have `:type` value. `:type` is the type of the data type. + # + # If the type needs additional information, you need to + # specify it. See constructor document what information is + # needed. For example, {Arrow::ListDataType#initialize} needs + # `:field` value. + # + # @param description [Hash] The description of the data type. + # + # @option description [String, Symbol] :type The type name of + # the data type. + # + # @example Create a boolean data type + # Arrow::DataType.resolve(type: :boolean) + # + # @example Create a list data type + # Arrow::DataType.resolve(type: :list, + # field: {name: "visible", type: :boolean}) def resolve(data_type) case data_type when DataType data_type when String, Symbol - data_type_name = data_type.to_s.capitalize.gsub(/\AUint/, "UInt") - data_type_class_name = "#{data_type_name}DataType" - unless Arrow.const_defined?(data_type_class_name) - raise ArgumentError, "invalid data type: #{data_typeinspect}" + resolve_class(data_type).new + when ::Array + type, *arguments = data_type + resolve_class(type).new(*arguments) + when Hash + type = nil + description = {} + data_type.each do |key, value| + key = key.to_sym + case key + when :type + type = value + else + description[key] = value + end + end + if type.nil? + message = + "data type description must have :type value: #{data_type.inspect}" + raise ArgumentError, message + end + data_type_class = resolve_class(type) + if description.empty? + data_type_class.new + else + data_type_class.new(description) end - data_type_class = Arrow.const_get(data_type_class_name) - data_type_class.new else - raise ArgumentError, "invalid data type: #{data_type.inspect}" + message = + "data type must be " + + "Arrow::DataType, String, Symbol, [String, ...], [Symbol, ...] 
" + + "{type: String, ...} or {type: Symbol, ...}: #{data_type.inspect}" + raise ArgumentError, message + end + end + + private + def resolve_class(data_type) + data_type_name = data_type.to_s.capitalize.gsub(/\AUint/, "UInt") + data_type_class_name = "#{data_type_name}DataType" + unless Arrow.const_defined?(data_type_class_name) + available_types = [] + Arrow.constants.each do |name| + if name.to_s.end_with?("DataType") + available_types << name.to_s.gsub(/DataType\z/, "").downcase.to_sym + end + end + message = + "unknown type: #{data_type.inspect}: " + + "available types: #{available_types.inspect}" + raise ArgumentError, message end + Arrow.const_get(data_type_class_name) end end end diff --git a/ruby/red-arrow/lib/arrow/field.rb b/ruby/red-arrow/lib/arrow/field.rb index be5865fd5564c..8c7c8eaa005cb 100644 --- a/ruby/red-arrow/lib/arrow/field.rb +++ b/ruby/red-arrow/lib/arrow/field.rb @@ -19,16 +19,99 @@ module Arrow class Field alias_method :initialize_raw, :initialize private :initialize_raw - def initialize(name, data_type) - case data_type - when String, Symbol - data_type_name = data_type.to_s.capitalize.gsub(/\AUint/, "UInt") - data_type_class_name = "#{data_type_name}DataType" - if Arrow.const_defined?(data_type_class_name) - data_type_class = Arrow.const_get(data_type_class_name) - data_type = data_type_class.new + + # Creates a new {Arrow::Field}. + # + # @overload initialize(name, data_type) + # + # @param name [String, Symbol] The name of the field. + # + # @param data_type [Arrow::DataType, Hash, String, Symbol] The + # data type of the field. + # + # You can specify data type as a description by `Hash`. + # + # See {Arrow::DataType.resolve} how to specify data type + # description. + # + # @example Create a field with {Arrow::DataType}s + # Arrow::Field.new("visible", Arrow::BooleanDataType.new) + # + # @example Create a field with data type description + # Arrow::Field.new("visible", :boolean) + # + # @example Create a field with name as `Symbol` + # Arrow::Field.new(:visible, :boolean) + # + # @overload initialize(description) + # + # @param description [Hash] The description of the field. + # + # Field description is a raw `Hash`. Field description must + # have `:name` and `:data_type` values. `:name` is the name of + # the field. `:data_type` is the data type of the field. You + # can use {Arrow::DataType} or data type description as + # `:data_type` value. + # + # See {Arrow::DataType.resolve} how to specify data type + # description. + # + # There is a shortcut for convenience. If field description + # doesn't have `:data_type`, all keys except `:name` are + # processes as data type description. For example, the + # following field descrptions are the same: + # + # ```ruby + # {name: "visible", data_type: {type: :boolean}} + # {name: "visible", type: :boolean} # Shortcut version + # ``` + # + # @option description [String, Symbol] :name The name of the field. + # + # @option description [Arrow::DataType, Hash] :data_type The + # data type of the field. You can specify data type description + # by `Hash`. + # + # See {Arrow::DataType.resolve} how to specify data type + # description. 
+ # + # @example Create a field with {Arrow::DataType}s + # Arrow::Field.new(name: "visible", + # data_type: Arrow::BooleanDataType.new) + # + # @example Create a field with data type description + # Arrow::Field.new(name: "visible", data_type: {type: :boolean} + # + # @example Create a field with shortcut form + # Arrow::Field.new(name: "visible", type: :boolean) + def initialize(*args) + n_args = args.size + case n_args + when 1 + description = args[0] + name = nil + data_type = nil + data_type_description = {} + description.each do |key, value| + key = key.to_sym + case key + when :name + name = value + when :data_type + data_type = DataType.resolve(value) + else + data_type_description[key] = value + end end + data_type ||= DataType.resolve(data_type_description) + when 2 + name = args[0] + data_type = DataType.resolve(args[1]) + else + message = "wrong number of arguments (given, #{n_args}, expected 1..2)" + raise ArgumentError, message end + initialize_raw(name, data_type) end end diff --git a/ruby/red-arrow/lib/arrow/list-data-type.rb b/ruby/red-arrow/lib/arrow/list-data-type.rb new file mode 100644 index 0000000000000..c097da4e881e8 --- /dev/null +++ b/ruby/red-arrow/lib/arrow/list-data-type.rb @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Arrow + class ListDataType + alias_method :initialize_raw, :initialize + private :initialize_raw + + # Creates a new {Arrow::ListDataType}. + # + # @overload initialize(field) + # + # @param field [Arrow::Field, Hash] The field of the list data + # type. You can also specify field description by `Hash`. + # + # See {Arrow::Field.new} how to specify field description. + # + # @example Create a list data type with {Arrow::Field} + # visible_field = Arrow::Field.new("visible", :boolean) + # Arrow::ListDataType.new(visible_field) + # + # @example Create a list data type with field description + # Arrow::ListDataType.new(name: "visible", type: :boolean) + # + # @overload initialize(description) + # + # @param description [Hash] The description of the list data + # type. It must have `:field` value. + # + # @option description [Arrow::Field, Hash] :field The field of + # the list data type. You can also specify field description + # by `Hash`. + # + # See {Arrow::Field.new} how to specify field description. 
+ # + # @example Create a list data type with {Arrow::Field} + # visible_field = Arrow::Field.new("visible", :boolean) + # Arrow::ListDataType.new(field: visible_field) + # + # @example Create a list data type with field description + # Arrow::ListDataType.new(field: {name: "visible", type: :boolean}) + def initialize(field) + if field.is_a?(Hash) and field.key?(:field) + description = field + field = description[:field] + end + if field.is_a?(Hash) + field_description = field + field = Field.new(field_description) + end + initialize_raw(field) + end + end +end diff --git a/ruby/red-arrow/lib/arrow/loader.rb b/ruby/red-arrow/lib/arrow/loader.rb index 2092e461c1786..cea98e9a8578e 100644 --- a/ruby/red-arrow/lib/arrow/loader.rb +++ b/ruby/red-arrow/lib/arrow/loader.rb @@ -45,6 +45,7 @@ def require_libraries require "arrow/date64-array-builder" require "arrow/field" require "arrow/file-output-stream" + require "arrow/list-data-type" require "arrow/path-extension" require "arrow/record" require "arrow/record-batch" diff --git a/ruby/red-arrow/lib/arrow/schema.rb b/ruby/red-arrow/lib/arrow/schema.rb index 2e6bad29e6506..ecc3324b8a311 100644 --- a/ruby/red-arrow/lib/arrow/schema.rb +++ b/ruby/red-arrow/lib/arrow/schema.rb @@ -21,6 +21,77 @@ module Arrow class Schema include FieldContainable + alias_method :initialize_raw, :initialize + private :initialize_raw + + # Creates a new {Arrow::Schema}. + # + # @overload initialize(fields) + # + # @param fields [::Array] The fields of the + # schema. You can mix {Arrow::Field} and field description in + # the fields. + # + # See {Arrow::Field.new} how to specify field description. + # + # @example Create a schema with {Arrow::Field}s + # visible_field = Arrow::Field.new("visible", :boolean) + # Arrow::Schema.new([visible_field]) + # + # @example Create a schema with field descriptions + # visible_field_description = { + # name: "visible", + # data_type: :boolean, + # } + # Arrow::Schema.new([visible_field_description]) + # + # @example Create a schema with {Arrow::Field}s and field descriptions + # fields = [ + # Arrow::Field.new("visible", :boolean), + # { + # name: "count", + # type: :int32, + # }, + # ] + # Arrow::Schema.new(fields) + # + # @overload initialize(fields) + # + # @param fields [Hash{String, Symbol => Arrow::DataType, Hash}] + # The pairs of field name and field data type of the schema. + # You can mix {Arrow::DataType} and data description for field + # data type. + # + # See {Arrow::DataType.new} how to specify data type description. 
+ # + # @example Create a schema with fields + # fields = { + # "visible" => Arrow::BooleanDataType.new, + # :count => :int32, + # :tags => { + # type: :list, + # field: { + # name: "tag", + # type: :string, + # }, + # }, + # } + # Arrow::Schema.new(fields) + def initialize(fields) + case fields + when ::Array + fields = fields.collect do |field| + field = Field.new(field) unless field.is_a?(Field) + field + end + when Hash + fields = fields.collect do |name, data_type| + Field.new(name, data_type) + end + end + initialize_raw(fields) + end + alias_method :[], :find_field end end diff --git a/ruby/red-arrow/lib/arrow/struct-data-type.rb b/ruby/red-arrow/lib/arrow/struct-data-type.rb index 7a59f1f620b81..ad810115d62ad 100644 --- a/ruby/red-arrow/lib/arrow/struct-data-type.rb +++ b/ruby/red-arrow/lib/arrow/struct-data-type.rb @@ -21,6 +21,110 @@ module Arrow class StructDataType include FieldContainable + alias_method :initialize_raw, :initialize + private :initialize_raw + + # Creates a new {Arrow::StructDataType}. + # + # @overload initialize(fields) + # + # @param fields [::Array] The fields of the + # struct data type. You can also specify field description as + # a field. You can mix {Arrow::Field} and field description. + # + # See {Arrow::Field.new} how to specify field description. + # + # @example Create a struct data type with {Arrow::Field}s + # visible_field = Arrow::Field.new("visible", :boolean) + # count_field = Arrow::Field.new("count", :int32) + # Arrow::StructDataType.new([visible_field, count_field]) + # + # @example Create a struct data type with field descriptions + # field_descriptions = [ + # {name: "visible", type: :boolean}, + # {name: "count", type: :int32}, + # ] + # Arrow::StructDataType.new(field_descriptions) + # + # @example Create a struct data type with {Arrow::Field} and field description + # fields = [ + # Arrow::Field.new("visible", :boolean), + # {name: "count", type: :int32}, + # ] + # Arrow::StructDataType.new(fields) + # + # @overload initialize(fields) + # + # @param fields [Hash{String, Symbol => Arrow::DataType, Hash}] + # The pairs of field name and field data type of the struct + # data type. You can also specify data type description by + # `Hash`. You can mix {Arrow::DataType} and data type description. + # + # See {Arrow::DataType.resolve} how to specify data type + # description. + # + # @example Create a struct data type with {Arrow::DataType}s + # fields = { + # "visible" => Arrow::BooleanDataType.new, + # "count" => Arrow::Int32DataType.new, + # } + # Arrow::StructDataType.new(fields) + # + # @example Create a struct data type with data type descriptions + # fields = { + # "visible" => :boolean, + # "count" => {type: :int32}, + # } + # Arrow::StructDataType.new(fields) + # + # @example Create a struct data type with {Arrow::DataType} and data type description + # fields = { + # "visible" => Arrow::BooleanDataType.new, + # "count" => {type: :int32}, + # } + # Arrow::StructDataType.new(fields) + # + # @overload initialize(description) + # + # @param description [Hash] The description of the struct data + # type. It must have `:fields` value. + # + # @option description + # [::Array, + # Hash{String, Symbol => Arrow::DataType, Hash, String, Symbol}] + # :fields The fields of the struct data type. 
+ # + # @example Create a struct data type with {Arrow::Field} and field description + # fields = [ + # Arrow::Field.new("visible", :boolean), + # {name: "count", type: :int32}, + # ] + # Arrow::StructDataType.new(fields: fields) + # + # @example Create a struct data type with {Arrow::DataType} and data type description + # fields = { + # "visible" => Arrow::BooleanDataType.new, + # "count" => {type: :int32}, + # } + # Arrow::StructDataType.new(fields: fields) + def initialize(fields) + if fields.is_a?(Hash) and fields.key?(:fields) + description = fields + fields = description[:fields] + end + if fields.is_a?(Hash) + fields = fields.collect do |name, data_type| + Field.new(name, data_type) + end + else + fields = fields.collect do |field| + field = Field.new(field) unless field.is_a?(Field) + field + end + end + initialize_raw(fields) + end + alias_method :[], :find_field end end diff --git a/ruby/red-arrow/red-arrow.gemspec b/ruby/red-arrow/red-arrow.gemspec index 9db755fc67ccc..8e79c75dcaff2 100644 --- a/ruby/red-arrow/red-arrow.gemspec +++ b/ruby/red-arrow/red-arrow.gemspec @@ -51,7 +51,9 @@ Gem::Specification.new do |spec| spec.add_development_dependency("bundler") spec.add_development_dependency("rake") + spec.add_development_dependency("redcarpet") spec.add_development_dependency("test-unit") + spec.add_development_dependency("yard") spec.metadata["msys2_mingw_dependencies"] = "apache-arrow" end diff --git a/ruby/red-arrow/test/test-data-type.rb b/ruby/red-arrow/test/test-data-type.rb new file mode 100644 index 0000000000000..c9dbfc6f11b6f --- /dev/null +++ b/ruby/red-arrow/test/test-data-type.rb @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class DataTypeTest < Test::Unit::TestCase + sub_test_case(".resolve") do + test("DataType") do + assert_equal(Arrow::BooleanDataType.new, + Arrow::DataType.resolve(Arrow::BooleanDataType.new)) + end + + test("String") do + assert_equal(Arrow::BooleanDataType.new, + Arrow::DataType.resolve("boolean")) + end + + test("Symbol") do + assert_equal(Arrow::BooleanDataType.new, + Arrow::DataType.resolve(:boolean)) + end + + test("Array") do + field = Arrow::Field.new(:visible, :boolean) + assert_equal(Arrow::ListDataType.new(field), + Arrow::DataType.resolve([:list, field])) + end + + test("Hash") do + field = Arrow::Field.new(:visible, :boolean) + assert_equal(Arrow::ListDataType.new(field), + Arrow::DataType.resolve(type: :list, field: field)) + end + end +end diff --git a/ruby/red-arrow/test/test-field.rb b/ruby/red-arrow/test/test-field.rb new file mode 100644 index 0000000000000..9be2068ea544b --- /dev/null +++ b/ruby/red-arrow/test/test-field.rb @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class FieldTest < Test::Unit::TestCase + sub_test_case(".new") do + test("String, Arrow::DataType") do + assert_equal("visible: bool", + Arrow::Field.new("visible", Arrow::BooleanDataType.new).to_s) + end + + test("Symbol, Arrow::DataType") do + assert_equal("visible: bool", + Arrow::Field.new(:visible, Arrow::BooleanDataType.new).to_s) + end + + test("String, Symbol") do + assert_equal("visible: bool", + Arrow::Field.new(:visible, :boolean).to_s) + end + + test("String, Hash") do + assert_equal("visible: bool", + Arrow::Field.new(:visible, type: :boolean).to_s) + end + + test("description: String") do + assert_equal("visible: bool", + Arrow::Field.new(name: "visible", + data_type: :boolean).to_s) + end + + test("description: Symbol") do + assert_equal("visible: bool", + Arrow::Field.new(name: :visible, + data_type: :boolean).to_s) + end + + test("description: shortcut") do + assert_equal("visible: bool", + Arrow::Field.new(name: :visible, + type: :boolean).to_s) + end + + test("Hash: shortcut: additional") do + description = { + name: :tags, + type: :list, + field: { + name: "tag", + type: :string, + }, + } + assert_equal("tags: list", + Arrow::Field.new(description).to_s) + end + end +end diff --git a/ruby/red-arrow/test/test-list-data-type.rb b/ruby/red-arrow/test/test-list-data-type.rb new file mode 100644 index 0000000000000..cca6ca3914b2b --- /dev/null +++ b/ruby/red-arrow/test/test-list-data-type.rb @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class ListDataTypeTest < Test::Unit::TestCase + sub_test_case(".new") do + test("Arrow::Field") do + field = Arrow::Field.new(:tag, :string) + assert_equal("list", + Arrow::ListDataType.new(field).to_s) + end + + test("Hash") do + assert_equal("list", + Arrow::ListDataType.new(name: "tag", type: :string).to_s) + end + + test("field: Arrow::Field") do + field = Arrow::Field.new(:tag, :string) + assert_equal("list", + Arrow::ListDataType.new(field: field).to_s) + end + + test("field: Hash") do + field_description = {name: "tag", type: :string} + assert_equal("list", + Arrow::ListDataType.new(field: field_description).to_s) + end + end +end diff --git a/ruby/red-arrow/test/test-schema.rb b/ruby/red-arrow/test/test-schema.rb index 2f989cf19f2ec..6cfbbb117d94d 100644 --- a/ruby/red-arrow/test/test-schema.rb +++ b/ruby/red-arrow/test/test-schema.rb @@ -19,31 +19,85 @@ class SchemaTest < Test::Unit::TestCase def setup @count_field = Arrow::Field.new("count", :uint32) @visible_field = Arrow::Field.new("visible", :boolean) - @schema = Arrow::Schema.new([@count_field, @visible_field]) end - sub_test_case("#[]") do - test("[String]") do - assert_equal([@count_field, @visible_field], - [@schema["count"], @schema["visible"]]) + sub_test_case(".new") do + test("[Arrow::Field]") do + fields = [ + @count_field, + @visible_field, + ] + assert_equal("count: uint32\n" + + "visible: bool", + Arrow::Schema.new(fields).to_s) end - test("[Symbol]") do - assert_equal([@count_field, @visible_field], - [@schema[:count], @schema[:visible]]) + test("[Arrow::Field, Hash]") do + fields = [ + @count_field, + {name: "visible", type: :boolean}, + ] + assert_equal("count: uint32\n" + + "visible: bool", + Arrow::Schema.new(fields).to_s) end - test("[Integer]") do - assert_equal([@count_field, @visible_field], - [@schema[0], @schema[1]]) + test("{String, Symbol => Arrow::DataType}") do + fields = { + "count" => Arrow::UInt32DataType.new, + :visible => :boolean, + } + assert_equal("count: uint32\n" + + "visible: bool", + Arrow::Schema.new(fields).to_s) end - test("[invalid]") do - invalid = [] - message = "field name or index must be String, Symbol or Integer" - message << ": <#{invalid.inspect}>" - assert_raise(ArgumentError.new(message)) do - @schema[invalid] + test("{String, Symbol => Hash}") do + fields = { + "count" => {type: :uint32}, + :tags => { + type: :list, + field: { + name: "tag", + type: :string, + }, + }, + } + assert_equal("count: uint32\n" + + "tags: list", + Arrow::Schema.new(fields).to_s) + end + end + + sub_test_case("instance methods") do + def setup + super + @schema = Arrow::Schema.new([@count_field, @visible_field]) + end + + sub_test_case("#[]") do + test("[String]") do + assert_equal([@count_field, @visible_field], + [@schema["count"], @schema["visible"]]) + end + + test("[Symbol]") do + assert_equal([@count_field, @visible_field], + [@schema[:count], @schema[:visible]]) + end + + test("[Integer]") do + assert_equal([@count_field, @visible_field], + [@schema[0], @schema[1]]) + end + + test("[invalid]") do + invalid = [] + message = "field name or index must be String, Symbol or Integer" + message << ": <#{invalid.inspect}>" + assert_raise(ArgumentError.new(message)) do + @schema[invalid] + end end end end diff --git a/ruby/red-arrow/test/test-struct-data-type.rb b/ruby/red-arrow/test/test-struct-data-type.rb index c802c44731072..d106e38b1d841 100644 --- a/ruby/red-arrow/test/test-struct-data-type.rb +++ b/ruby/red-arrow/test/test-struct-data-type.rb @@ -19,31 +19,93 @@ class StructDataTypeTest < 
Test::Unit::TestCase def setup @count_field = Arrow::Field.new("count", :uint32) @visible_field = Arrow::Field.new("visible", :boolean) - @data_type = Arrow::StructDataType.new([@count_field, @visible_field]) end - sub_test_case("#[]") do - test("[String]") do - assert_equal([@count_field, @visible_field], - [@data_type["count"], @data_type["visible"]]) + sub_test_case(".new") do + test("[Arrow::Field]") do + fields = [ + @count_field, + @visible_field, + ] + assert_equal("struct", + Arrow::StructDataType.new(fields).to_s) end - test("[Symbol]") do - assert_equal([@count_field, @visible_field], - [@data_type[:count], @data_type[:visible]]) + test("[Hash]") do + fields = [ + {name: "count", data_type: :uint32}, + {name: "visible", data_type: :boolean}, + ] + assert_equal("struct", + Arrow::StructDataType.new(fields).to_s) end - test("[Integer]") do - assert_equal([@count_field, @visible_field], - [@data_type[0], @data_type[1]]) + test("[Arrow::Field, Hash]") do + fields = [ + @count_field, + {name: "visible", data_type: :boolean}, + ] + assert_equal("struct", + Arrow::StructDataType.new(fields).to_s) end - test("[invalid]") do - invalid = [] - message = "field name or index must be String, Symbol or Integer" - message << ": <#{invalid.inspect}>" - assert_raise(ArgumentError.new(message)) do - @data_type[invalid] + test("{Arrow::DataType}") do + fields = { + "count" => Arrow::UInt32DataType.new, + "visible" => Arrow::BooleanDataType.new, + } + assert_equal("struct", + Arrow::StructDataType.new(fields).to_s) + end + + test("{Hash}") do + fields = { + "count" => {type: :uint32}, + "visible" => {type: :boolean}, + } + assert_equal("struct", + Arrow::StructDataType.new(fields).to_s) + end + + test("{String, Symbol}") do + fields = { + "count" => "uint32", + "visible" => :boolean, + } + assert_equal("struct", + Arrow::StructDataType.new(fields).to_s) + end + end + + sub_test_case("instance methods") do + def setup + super + @data_type = Arrow::StructDataType.new([@count_field, @visible_field]) + end + + sub_test_case("#[]") do + test("[String]") do + assert_equal([@count_field, @visible_field], + [@data_type["count"], @data_type["visible"]]) + end + + test("[Symbol]") do + assert_equal([@count_field, @visible_field], + [@data_type[:count], @data_type[:visible]]) + end + + test("[Integer]") do + assert_equal([@count_field, @visible_field], + [@data_type[0], @data_type[1]]) + end + + test("[invalid]") do + invalid = [] + message = "field name or index must be String, Symbol or Integer" + message << ": <#{invalid.inspect}>" + assert_raise(ArgumentError.new(message)) do + @data_type[invalid] + end end end end From 6ca8fcdeccc54a80ce90711441a41ec6ffbd216b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 3 Jan 2019 16:35:11 +0100 Subject: [PATCH 133/328] ARROW-4148: [CI/Python] Disable ORC on nightly Alpine builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nightly Python Alpine builds were [failing](https://travis-ci.org/kszucs/crossbow/builds/474545492) because PYARROW_WITH_ORC is enabled by default, but the underlying cpp image doesn't build against ORC. 
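For a local pyarrow build against a C++ toolchain without ORC, the equivalent override would look roughly like this (a sketch only; the nightly image applies the same setting via `ENV` in the Dockerfile change below):

```shell
# Skip the ORC extension when the underlying Arrow C++ build has no ORC support
export PYARROW_WITH_ORC=0
python setup.py build_ext --inplace
```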
Crossbow builds: - ~[kszucs/crossbow/build-391](https://github.com/kszucs/crossbow/branches/all?utf8=%E2%9C%93&query=391)~ - [kszucs/crossbow/build-393](https://github.com/kszucs/crossbow/branches/all?utf8=%E2%9C%93&query=393) [GREEN] Author: Krisztián Szűcs Closes #3297 from kszucs/ARROW-4148 and squashes the following commits: 16e162e1 install dependencies from requirements.txt as well 6cd864f9 disable PYARROW_WITH_ORC --- python/Dockerfile.alpine | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/Dockerfile.alpine b/python/Dockerfile.alpine index ba0f2eb23f549..96362197a0343 100644 --- a/python/Dockerfile.alpine +++ b/python/Dockerfile.alpine @@ -30,9 +30,13 @@ RUN export PYTHON_MAJOR=${PYTHON_VERSION:0:1} && \ ADD python/requirements.txt \ python/requirements-test.txt \ /arrow/python/ -RUN pip install -r /arrow/python/requirements-test.txt cython +RUN pip install \ + -r /arrow/python/requirements.txt \ + -r /arrow/python/requirements-test.txt \ + cython ENV ARROW_PYTHON=ON \ + PYARROW_WITH_ORC=0 \ PYARROW_WITH_PARQUET=0 # build and test From 7f1fbf83284b745ee9215f6722e114ee467bdeb8 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 3 Jan 2019 21:52:57 +0100 Subject: [PATCH 134/328] ARROW-4009: [CI] Run Valgrind and C++ code coverage in different builds Also fix a couple ASAN / UBSAN issues (not all of them). Author: Antoine Pitrou Closes #3302 from pitrou/ARROW-4009-separate-codecov-valgrind and squashes the following commits: 7dacf9e1 Mark function inline 06372930 Use clang on non-coverage C++ job eca54b69 ARROW-4009: Run Valgrind and C++ code coverage in different builds --- .travis.yml | 30 +++- cpp/CMakeLists.txt | 8 + cpp/cmake_modules/san-config.cmake | 24 +-- cpp/src/arrow/array-test.cc | 13 +- cpp/src/arrow/array/builder_binary.h | 1 + cpp/src/arrow/array/builder_nested.h | 1 + cpp/src/arrow/buffer-builder.h | 205 +++++++++++++++++++++++++ cpp/src/arrow/buffer-test.cc | 1 + cpp/src/arrow/buffer.cc | 21 +-- cpp/src/arrow/buffer.h | 168 +------------------- cpp/src/arrow/io/readahead.cc | 2 + cpp/src/arrow/memory_pool.cc | 79 ++++++---- cpp/src/arrow/test-util.cc | 16 +- cpp/src/arrow/util/thread-pool-test.cc | 2 +- cpp/src/parquet/bloom_filter.h | 1 + 15 files changed, 324 insertions(+), 248 deletions(-) create mode 100644 cpp/src/arrow/buffer-builder.h diff --git a/.travis.yml b/.travis.yml index 059daeef8fd14..837b4cfef30db 100644 --- a/.travis.yml +++ b/.travis.yml @@ -61,13 +61,35 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh script: - $TRAVIS_BUILD_DIR/ci/travis_lint.sh - - name: "C++ unit tests, code coverage with gcc 4.8" + - name: "C++ unit tests w/ Valgrind, clang 6.0" + language: cpp + os: linux + env: + - ARROW_TRAVIS_VALGRIND=1 + - ARROW_TRAVIS_USE_TOOLCHAIN=1 + - ARROW_TRAVIS_PLASMA=1 + - ARROW_TRAVIS_ORC=1 + - ARROW_TRAVIS_PARQUET=1 + - ARROW_TRAVIS_GANDIVA=1 + - ARROW_BUILD_WARNING_LEVEL=CHECKIN + - CC="clang-6.0" + - CXX="clang++-6.0" + before_script: + - if [ $ARROW_CI_CPP_AFFECTED != "1" ]; then exit; fi + - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh + - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh + # If either C++ or Python changed, we must install the C++ libraries + - git submodule update --init + - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh + script: + - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh || travis_terminate 1 + # Separating Valgrind and C++ coverage makes individual jobs shorter + - name: "C++ unit tests w/ gcc 4.8, coverage" compiler: gcc language: cpp os: linux jdk: openjdk8 env: - 
- ARROW_TRAVIS_VALGRIND=1 - ARROW_TRAVIS_USE_TOOLCHAIN=1 - ARROW_TRAVIS_PLASMA=1 - ARROW_TRAVIS_ORC=1 @@ -87,7 +109,7 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh || travis_terminate 1 - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh || travis_terminate 1 - $TRAVIS_BUILD_DIR/ci/travis_upload_cpp_coverage.sh || travis_terminate 1 - - name: "Python 2.7 and 3.6 unit tests, coverage with gcc 4.8" + - name: "Python 2.7 and 3.6 unit tests w/ Valgrind, gcc 4.8, coverage" compiler: gcc language: cpp os: linux @@ -98,10 +120,10 @@ matrix: - ARROW_TRAVIS_USE_TOOLCHAIN=1 - ARROW_TRAVIS_COVERAGE=1 - ARROW_TRAVIS_PYTHON_DOCS=1 - - ARROW_BUILD_WARNING_LEVEL=CHECKIN - ARROW_TRAVIS_PYTHON_JVM=1 - ARROW_TRAVIS_PYTHON_GANDIVA=1 - ARROW_TRAVIS_OPTIONAL_INSTALL=1 + - ARROW_BUILD_WARNING_LEVEL=CHECKIN # TODO(wesm): Run the benchmarks outside of Travis # - ARROW_TRAVIS_PYTHON_BENCHMARKS=1 before_script: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 006b406ba0762..aba1a59618bb0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -111,6 +111,14 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Run the test suite using valgrind --tool=memcheck" OFF) + option(ARROW_USE_ASAN + "Enable Address Sanitizer checks" + OFF) + + option(ARROW_USE_TSAN + "Enable Thread Sanitizer checks" + OFF) + option(ARROW_BUILD_TESTS "Build the Arrow googletest unit tests, default OFF" OFF) diff --git a/cpp/cmake_modules/san-config.cmake b/cpp/cmake_modules/san-config.cmake index f2de9cf1f7553..22a9b0c8098a0 100644 --- a/cpp/cmake_modules/san-config.cmake +++ b/cpp/cmake_modules/san-config.cmake @@ -22,19 +22,6 @@ if (${ARROW_USE_ASAN}) ("${COMPILER_FAMILY}" STREQUAL "gcc" AND "${COMPILER_VERSION}" VERSION_GREATER "4.8"))) message(SEND_ERROR "Cannot use ASAN without clang or gcc >= 4.8") endif() - - # If UBSAN is also enabled, and we're on clang < 3.5, ensure static linking is - # enabled. Otherwise, we run into https://llvm.org/bugs/show_bug.cgi?id=18211 - if("${ARROW_USE_UBSAN}" AND - "${COMPILER_FAMILY}" STREQUAL "clang" AND - "${COMPILER_VERSION}" VERSION_LESS "3.5") - if("${ARROW_LINK}" STREQUAL "a") - message("Using static linking for ASAN+UBSAN build") - set(ARROW_LINK "s") - elseif("${ARROW_LINK}" STREQUAL "d") - message(SEND_ERROR "Cannot use dynamic linking when ASAN and UBSAN are both enabled") - endif() - endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -DADDRESS_SANITIZER") endif() @@ -49,7 +36,7 @@ if (${ARROW_USE_UBSAN}) ("${COMPILER_FAMILY}" STREQUAL "gcc" AND "${COMPILER_VERSION}" VERSION_GREATER "4.9"))) message(SEND_ERROR "Cannot use UBSAN without clang or gcc >= 4.9") endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined -fno-sanitize=alignment,vptr -fno-sanitize-recover") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined -fno-sanitize=alignment,vptr -fno-sanitize-recover=all") endif () # Flag to enable thread sanitizer (clang or gcc 4.8) @@ -101,14 +88,7 @@ if ("${ARROW_USE_UBSAN}" OR "${ARROW_USE_ASAN}" OR "${ARROW_USE_TSAN}") # GCC 4.8 and 4.9 (latest as of this writing) don't allow you to specify a # sanitizer blacklist. if("${COMPILER_FAMILY}" STREQUAL "clang") - # Require clang 3.4 or newer; clang 3.3 has issues with TSAN and pthread - # symbol interception. - if("${COMPILER_VERSION}" VERSION_LESS "3.4") - message(SEND_ERROR "Must use clang 3.4 or newer to run a sanitizer build." - " Detected unsupported version ${COMPILER_VERSION}." 
- " Try using clang from $NATIVE_TOOLCHAIN/.") - endif() - add_definitions("-fsanitize-blacklist=${BUILD_SUPPORT_DIR}/sanitize-blacklist.txt") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize-blacklist=${BUILD_SUPPORT_DIR}/sanitize-blacklist.txt") else() message(WARNING "GCC does not support specifying a sanitizer blacklist. Known sanitizer check failures will not be suppressed.") endif() diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index bdb7eda118d51..ccdaad58c681a 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -30,6 +30,7 @@ #include #include "arrow/array.h" +#include "arrow/buffer-builder.h" #include "arrow/buffer.h" #include "arrow/builder.h" #include "arrow/ipc/test-common.h" @@ -761,22 +762,22 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendValuesLazyIter) { auto& draws = this->draws_; auto& valid_bytes = this->valid_bytes_; - auto doubler = [&draws](int64_t index) { return draws[index] * 2; }; - auto lazy_iter = internal::MakeLazyRange(doubler, size); + auto halve = [&draws](int64_t index) { return draws[index] / 2; }; + auto lazy_iter = internal::MakeLazyRange(halve, size); ASSERT_OK(this->builder_->AppendValues(lazy_iter.begin(), lazy_iter.end(), valid_bytes.begin())); - std::vector doubled; - transform(draws.begin(), draws.end(), back_inserter(doubled), - [](T in) { return in * 2; }); + std::vector halved; + transform(draws.begin(), draws.end(), back_inserter(halved), + [](T in) { return in / 2; }); std::shared_ptr result; FinishAndCheckPadding(this->builder_.get(), &result); std::shared_ptr expected; ASSERT_OK( - this->builder_->AppendValues(doubled.data(), doubled.size(), valid_bytes.data())); + this->builder_->AppendValues(halved.data(), halved.size(), valid_bytes.data())); FinishAndCheckPadding(this->builder_.get(), &expected); ASSERT_TRUE(expected->Equals(result)); diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 7c101bdffc5e4..324279daa4a6e 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -25,6 +25,7 @@ #include "arrow/array.h" #include "arrow/array/builder_base.h" +#include "arrow/buffer-builder.h" #include "arrow/status.h" #include "arrow/type_traits.h" #include "arrow/util/macros.h" diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 863e6fef06f7d..19b0ad81b5a16 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -21,6 +21,7 @@ #include #include "arrow/array/builder_base.h" +#include "arrow/buffer-builder.h" namespace arrow { diff --git a/cpp/src/arrow/buffer-builder.h b/cpp/src/arrow/buffer-builder.h new file mode 100644 index 0000000000000..dafa3ee611f04 --- /dev/null +++ b/cpp/src/arrow/buffer-builder.h @@ -0,0 +1,205 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_BUFFER_BUILDER_H +#define ARROW_BUFFER_BUILDER_H + +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/status.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// Buffer builder classes + +/// \class BufferBuilder +/// \brief A class for incrementally building a contiguous chunk of in-memory data +class ARROW_EXPORT BufferBuilder { + public: + explicit BufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) + : pool_(pool), data_(NULLPTR), capacity_(0), size_(0) {} + + /// \brief Resize the buffer to the nearest multiple of 64 bytes + /// + /// \param elements the new capacity of the of the builder. Will be rounded + /// up to a multiple of 64 bytes for padding + /// \param shrink_to_fit if new capacity is smaller than the existing size, + /// reallocate internal buffer. Set to false to avoid reallocations when + /// shrinking the builder. + /// \return Status + Status Resize(const int64_t elements, bool shrink_to_fit = true) { + // Resize(0) is a no-op + if (elements == 0) { + return Status::OK(); + } + int64_t old_capacity = capacity_; + + if (buffer_ == NULLPTR) { + ARROW_RETURN_NOT_OK(AllocateResizableBuffer(pool_, elements, &buffer_)); + } else { + ARROW_RETURN_NOT_OK(buffer_->Resize(elements, shrink_to_fit)); + } + capacity_ = buffer_->capacity(); + data_ = buffer_->mutable_data(); + if (capacity_ > old_capacity) { + memset(data_ + old_capacity, 0, capacity_ - old_capacity); + } + return Status::OK(); + } + + /// \brief Ensure that builder can accommodate the additional number of bytes + /// without the need to perform allocations + /// + /// \param size number of additional bytes to make space for + /// \return Status + Status Reserve(const int64_t size) { return Resize(size_ + size, false); } + + /// \brief Append the given data to the buffer + /// + /// The buffer is automatically expanded if necessary. + Status Append(const void* data, int64_t length) { + if (capacity_ < length + size_) { + int64_t new_capacity = BitUtil::NextPower2(length + size_); + ARROW_RETURN_NOT_OK(Resize(new_capacity)); + } + UnsafeAppend(data, length); + return Status::OK(); + } + + /// \brief Append the given data to the buffer + /// + /// The buffer is automatically expanded if necessary. + template + Status Append(const std::array& data) { + constexpr auto nbytes = static_cast(NBYTES); + if (capacity_ < nbytes + size_) { + int64_t new_capacity = BitUtil::NextPower2(nbytes + size_); + ARROW_RETURN_NOT_OK(Resize(new_capacity)); + } + + if (nbytes > 0) { + std::copy(data.cbegin(), data.cend(), data_ + size_); + size_ += nbytes; + } + return Status::OK(); + } + + // Advance pointer and zero out memory + Status Advance(const int64_t length) { + if (capacity_ < length + size_) { + int64_t new_capacity = BitUtil::NextPower2(length + size_); + ARROW_RETURN_NOT_OK(Resize(new_capacity)); + } + if (length > 0) { + memset(data_ + size_, 0, static_cast(length)); + size_ += length; + } + return Status::OK(); + } + + // Unsafe methods don't check existing size + void UnsafeAppend(const void* data, int64_t length) { + if (length > 0) { + memcpy(data_ + size_, data, static_cast(length)); + size_ += length; + } + } + + /// \brief Return result of builder as a Buffer object. 
+ /// + /// The builder is reset and can be reused afterwards. + /// + /// \param[out] out the finalized Buffer object + /// \param shrink_to_fit if the buffer size is smaller than its capacity, + /// reallocate to fit more tightly in memory. Set to false to avoid + /// a reallocation, at the expense of potentially more memory consumption. + /// \return Status + Status Finish(std::shared_ptr* out, bool shrink_to_fit = true) { + ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit)); + *out = buffer_; + Reset(); + return Status::OK(); + } + + void Reset() { + buffer_ = NULLPTR; + capacity_ = size_ = 0; + } + + int64_t capacity() const { return capacity_; } + int64_t length() const { return size_; } + const uint8_t* data() const { return data_; } + + protected: + std::shared_ptr buffer_; + MemoryPool* pool_; + uint8_t* data_; + int64_t capacity_; + int64_t size_; +}; + +/// \brief A BufferBuilder subclass with convenience methods to append typed data +template +class ARROW_EXPORT TypedBufferBuilder : public BufferBuilder { + public: + explicit TypedBufferBuilder(MemoryPool* pool) : BufferBuilder(pool) {} + + Status Append(T arithmetic_value) { + static_assert(std::is_arithmetic::value, + "Convenience buffer append only supports arithmetic types"); + return BufferBuilder::Append(reinterpret_cast(&arithmetic_value), + sizeof(T)); + } + + Status Append(const T* arithmetic_values, int64_t num_elements) { + static_assert(std::is_arithmetic::value, + "Convenience buffer append only supports arithmetic types"); + return BufferBuilder::Append(reinterpret_cast(arithmetic_values), + num_elements * sizeof(T)); + } + + void UnsafeAppend(T arithmetic_value) { + static_assert(std::is_arithmetic::value, + "Convenience buffer append only supports arithmetic types"); + BufferBuilder::UnsafeAppend(reinterpret_cast(&arithmetic_value), sizeof(T)); + } + + void UnsafeAppend(const T* arithmetic_values, int64_t num_elements) { + static_assert(std::is_arithmetic::value, + "Convenience buffer append only supports arithmetic types"); + BufferBuilder::UnsafeAppend(reinterpret_cast(arithmetic_values), + num_elements * sizeof(T)); + } + + const T* data() const { return reinterpret_cast(data_); } + int64_t length() const { return size_ / sizeof(T); } + int64_t capacity() const { return capacity_ / sizeof(T); } +}; + +} // namespace arrow + +#endif // ARROW_BUFFER_BUILDER_H diff --git a/cpp/src/arrow/buffer-test.cc b/cpp/src/arrow/buffer-test.cc index 4d16f7f9c277d..5d01515078c05 100644 --- a/cpp/src/arrow/buffer-test.cc +++ b/cpp/src/arrow/buffer-test.cc @@ -26,6 +26,7 @@ #include +#include "arrow/buffer-builder.h" #include "arrow/buffer.h" #include "arrow/memory_pool.h" #include "arrow/status.h" diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc index 01bb0c34968d3..9534d2b94f3e4 100644 --- a/cpp/src/arrow/buffer.cc +++ b/cpp/src/arrow/buffer.cc @@ -126,25 +126,18 @@ class PoolBuffer : public ResizableBuffer { } Status Resize(const int64_t new_size, bool shrink_to_fit = true) override { - if (!shrink_to_fit || (new_size > size_)) { - RETURN_NOT_OK(Reserve(new_size)); - } else { - // Buffer is not growing, so shrink to the requested size without + if (mutable_data_ && shrink_to_fit && new_size <= size_) { + // Buffer is non-null and is not growing, so shrink to the requested size without // excess space. int64_t new_capacity = BitUtil::RoundUpToMultipleOf64(new_size); if (capacity_ != new_capacity) { // Buffer hasn't got yet the requested size. 
- if (new_size == 0) { - pool_->Free(mutable_data_, capacity_); - capacity_ = 0; - mutable_data_ = nullptr; - data_ = nullptr; - } else { - RETURN_NOT_OK(pool_->Reallocate(capacity_, new_capacity, &mutable_data_)); - data_ = mutable_data_; - capacity_ = new_capacity; - } + RETURN_NOT_OK(pool_->Reallocate(capacity_, new_capacity, &mutable_data_)); + data_ = mutable_data_; + capacity_ = new_capacity; } + } else { + RETURN_NOT_OK(Reserve(new_size)); } size_ = new_size; diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 6b2ad1bbefc7f..f0042897b027b 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -19,7 +19,6 @@ #define ARROW_BUFFER_H #include -#include #include #include #include @@ -29,7 +28,6 @@ #include "arrow/memory_pool.h" #include "arrow/status.h" -#include "arrow/util/bit-util.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" @@ -110,7 +108,10 @@ class ARROW_EXPORT Buffer { #ifndef NDEBUG CheckMutable(); #endif - memset(mutable_data_ + size_, 0, static_cast(capacity_ - size_)); + // A zero-capacity buffer can have a null data pointer + if (capacity_ != 0) { + memset(mutable_data_ + size_, 0, static_cast(capacity_ - size_)); + } } /// \brief Construct a new buffer that owns its memory from a std::string @@ -391,167 +392,6 @@ Status AllocateEmptyBitmap(int64_t length, std::shared_ptr* out); /// @} -// ---------------------------------------------------------------------- -// Buffer builder classes - -/// \class BufferBuilder -/// \brief A class for incrementally building a contiguous chunk of in-memory data -class ARROW_EXPORT BufferBuilder { - public: - explicit BufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) - : pool_(pool), data_(NULLPTR), capacity_(0), size_(0) {} - - /// \brief Resize the buffer to the nearest multiple of 64 bytes - /// - /// \param elements the new capacity of the of the builder. Will be rounded - /// up to a multiple of 64 bytes for padding - /// \param shrink_to_fit if new capacity is smaller than the existing size, - /// reallocate internal buffer. Set to false to avoid reallocations when - /// shrinking the builder. - /// \return Status - Status Resize(const int64_t elements, bool shrink_to_fit = true) { - // Resize(0) is a no-op - if (elements == 0) { - return Status::OK(); - } - int64_t old_capacity = capacity_; - - if (buffer_ == NULLPTR) { - ARROW_RETURN_NOT_OK(AllocateResizableBuffer(pool_, elements, &buffer_)); - } else { - ARROW_RETURN_NOT_OK(buffer_->Resize(elements, shrink_to_fit)); - } - capacity_ = buffer_->capacity(); - data_ = buffer_->mutable_data(); - if (capacity_ > old_capacity) { - memset(data_ + old_capacity, 0, capacity_ - old_capacity); - } - return Status::OK(); - } - - /// \brief Ensure that builder can accommodate the additional number of bytes - /// without the need to perform allocations - /// - /// \param size number of additional bytes to make space for - /// \return Status - Status Reserve(const int64_t size) { return Resize(size_ + size, false); } - - /// \brief Append the given data to the buffer - /// - /// The buffer is automatically expanded if necessary. - Status Append(const void* data, int64_t length) { - if (capacity_ < length + size_) { - int64_t new_capacity = BitUtil::NextPower2(length + size_); - ARROW_RETURN_NOT_OK(Resize(new_capacity)); - } - UnsafeAppend(data, length); - return Status::OK(); - } - - /// \brief Append the given data to the buffer - /// - /// The buffer is automatically expanded if necessary. 
- template - Status Append(const std::array& data) { - constexpr auto nbytes = static_cast(NBYTES); - if (capacity_ < nbytes + size_) { - int64_t new_capacity = BitUtil::NextPower2(nbytes + size_); - ARROW_RETURN_NOT_OK(Resize(new_capacity)); - } - - std::copy(data.cbegin(), data.cend(), data_ + size_); - size_ += nbytes; - return Status::OK(); - } - - // Advance pointer and zero out memory - Status Advance(const int64_t length) { - if (capacity_ < length + size_) { - int64_t new_capacity = BitUtil::NextPower2(length + size_); - ARROW_RETURN_NOT_OK(Resize(new_capacity)); - } - memset(data_ + size_, 0, static_cast(length)); - size_ += length; - return Status::OK(); - } - - // Unsafe methods don't check existing size - void UnsafeAppend(const void* data, int64_t length) { - memcpy(data_ + size_, data, static_cast(length)); - size_ += length; - } - - /// \brief Return result of builder as a Buffer object. - /// - /// The builder is reset and can be reused afterwards. - /// - /// \param[out] out the finalized Buffer object - /// \param shrink_to_fit if the buffer size is smaller than its capacity, - /// reallocate to fit more tightly in memory. Set to false to avoid - /// a reallocation, at the expense of potentially more memory consumption. - /// \return Status - Status Finish(std::shared_ptr* out, bool shrink_to_fit = true) { - ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit)); - *out = buffer_; - Reset(); - return Status::OK(); - } - - void Reset() { - buffer_ = NULLPTR; - capacity_ = size_ = 0; - } - - int64_t capacity() const { return capacity_; } - int64_t length() const { return size_; } - const uint8_t* data() const { return data_; } - - protected: - std::shared_ptr buffer_; - MemoryPool* pool_; - uint8_t* data_; - int64_t capacity_; - int64_t size_; -}; - -/// \brief A BufferBuilder subclass with convenience methods to append typed data -template -class ARROW_EXPORT TypedBufferBuilder : public BufferBuilder { - public: - explicit TypedBufferBuilder(MemoryPool* pool) : BufferBuilder(pool) {} - - Status Append(T arithmetic_value) { - static_assert(std::is_arithmetic::value, - "Convenience buffer append only supports arithmetic types"); - return BufferBuilder::Append(reinterpret_cast(&arithmetic_value), - sizeof(T)); - } - - Status Append(const T* arithmetic_values, int64_t num_elements) { - static_assert(std::is_arithmetic::value, - "Convenience buffer append only supports arithmetic types"); - return BufferBuilder::Append(reinterpret_cast(arithmetic_values), - num_elements * sizeof(T)); - } - - void UnsafeAppend(T arithmetic_value) { - static_assert(std::is_arithmetic::value, - "Convenience buffer append only supports arithmetic types"); - BufferBuilder::UnsafeAppend(reinterpret_cast(&arithmetic_value), sizeof(T)); - } - - void UnsafeAppend(const T* arithmetic_values, int64_t num_elements) { - static_assert(std::is_arithmetic::value, - "Convenience buffer append only supports arithmetic types"); - BufferBuilder::UnsafeAppend(reinterpret_cast(arithmetic_values), - num_elements * sizeof(T)); - } - - const T* data() const { return reinterpret_cast(data_); } - int64_t length() const { return size_ / sizeof(T); } - int64_t capacity() const { return capacity_ / sizeof(T); } -}; - } // namespace arrow #endif // ARROW_BUFFER_H diff --git a/cpp/src/arrow/io/readahead.cc b/cpp/src/arrow/io/readahead.cc index 89db6a66e8c8d..4222f87a5ca3b 100644 --- a/cpp/src/arrow/io/readahead.cc +++ b/cpp/src/arrow/io/readahead.cc @@ -162,11 +162,13 @@ class ReadaheadSpooler::Impl { int64_t bytes_read; 
RETURN_NOT_OK(AllocateResizableBuffer( pool_, read_size_ + buf->left_padding + buf->right_padding, &buffer)); + DCHECK_NE(buffer->mutable_data(), nullptr); RETURN_NOT_OK( raw_->Read(read_size_, &bytes_read, buffer->mutable_data() + buf->left_padding)); if (bytes_read < read_size_) { // Got a short read RETURN_NOT_OK(buffer->Resize(bytes_read + buf->left_padding + buf->right_padding)); + DCHECK_NE(buffer->mutable_data(), nullptr); } // Zero padding areas memset(buffer->mutable_data(), 0, buf->left_padding); diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc index fb5beacf0f863..abf36fcac15e1 100644 --- a/cpp/src/arrow/memory_pool.cc +++ b/cpp/src/arrow/memory_pool.cc @@ -40,6 +40,14 @@ namespace arrow { constexpr size_t kAlignment = 64; namespace { + +#ifdef ARROW_JEMALLOC +inline size_t FixAllocationSize(int64_t size) { + // mallocx() and rallocx() don't support 0-sized allocations + return std::max(static_cast(size), kAlignment); +} +#endif + // Allocate memory according to the alignment requirements for Arrow // (as of May 2016 64 bytes) Status AllocateAligned(int64_t size, uint8_t** out) { @@ -58,8 +66,8 @@ Status AllocateAligned(int64_t size, uint8_t** out) { return Status::OutOfMemory("malloc of size ", size, " failed"); } #elif defined(ARROW_JEMALLOC) - *out = reinterpret_cast(mallocx( - std::max(static_cast(size), kAlignment), MALLOCX_ALIGN(kAlignment))); + *out = reinterpret_cast( + mallocx(FixAllocationSize(size), MALLOCX_ALIGN(kAlignment))); if (*out == NULL) { return Status::OutOfMemory("malloc of size ", size, " failed"); } @@ -76,6 +84,42 @@ Status AllocateAligned(int64_t size, uint8_t** out) { #endif return Status::OK(); } + +Status ReallocateAligned(int64_t old_size, int64_t new_size, uint8_t** ptr) { +#ifdef ARROW_JEMALLOC + uint8_t* previous_ptr = *ptr; + if (new_size < 0) { + return Status::Invalid("negative realloc size"); + } + if (static_cast(new_size) >= std::numeric_limits::max()) { + return Status::CapacityError("realloc overflows size_t"); + } + *ptr = reinterpret_cast( + rallocx(*ptr, FixAllocationSize(new_size), MALLOCX_ALIGN(kAlignment))); + if (*ptr == NULL) { + *ptr = previous_ptr; + return Status::OutOfMemory("realloc of size ", new_size, " failed"); + } +#else + // Note: We cannot use realloc() here as it doesn't guarantee alignment. + + // Allocate new chunk + uint8_t* out = nullptr; + RETURN_NOT_OK(AllocateAligned(new_size, &out)); + DCHECK(out); + // Copy contents and release old memory chunk + memcpy(out, *ptr, static_cast(std::min(new_size, old_size))); +#ifdef _WIN32 + _aligned_free(*ptr); +#else + std::free(*ptr); +#endif // defined(_MSC_VER) + *ptr = out; +#endif // defined(ARROW_JEMALLOC) + + return Status::OK(); +} + } // namespace MemoryPool::MemoryPool() {} @@ -99,36 +143,7 @@ class DefaultMemoryPool : public MemoryPool { } Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override { -#ifdef ARROW_JEMALLOC - uint8_t* previous_ptr = *ptr; - if (new_size < 0) { - return Status::Invalid("negative realloc size"); - } - if (static_cast(new_size) >= std::numeric_limits::max()) { - return Status::CapacityError("realloc overflows size_t"); - } - *ptr = reinterpret_cast( - rallocx(*ptr, static_cast(new_size), MALLOCX_ALIGN(kAlignment))); - if (*ptr == NULL) { - *ptr = previous_ptr; - return Status::OutOfMemory("realloc of size ", new_size, " failed"); - } -#else - // Note: We cannot use realloc() here as it doesn't guarantee alignment. 
- - // Allocate new chunk - uint8_t* out = nullptr; - RETURN_NOT_OK(AllocateAligned(new_size, &out)); - DCHECK(out); - // Copy contents and release old memory chunk - memcpy(out, *ptr, static_cast(std::min(new_size, old_size))); -#ifdef _WIN32 - _aligned_free(*ptr); -#else - std::free(*ptr); -#endif // defined(_MSC_VER) - *ptr = out; -#endif // defined(ARROW_JEMALLOC) + RETURN_NOT_OK(ReallocateAligned(old_size, new_size, ptr)); stats_.UpdateAllocatedBytes(new_size - old_size); return Status::OK(); diff --git a/cpp/src/arrow/test-util.cc b/cpp/src/arrow/test-util.cc index 8c5f36417f881..617c53978f619 100644 --- a/cpp/src/arrow/test-util.cc +++ b/cpp/src/arrow/test-util.cc @@ -303,17 +303,23 @@ void AssertZeroPadded(const Array& array) { for (const auto& buffer : array.data()->buffers) { if (buffer) { const int64_t padding = buffer->capacity() - buffer->size(); - std::vector zeros(padding); - ASSERT_EQ(0, memcmp(buffer->data() + buffer->size(), zeros.data(), padding)); + if (padding > 0) { + std::vector zeros(padding); + ASSERT_EQ(0, memcmp(buffer->data() + buffer->size(), zeros.data(), padding)); + } } } } void TestInitialized(const Array& array) { for (const auto& buffer : array.data()->buffers) { - if (buffer) { - std::vector zeros(buffer->capacity()); - throw_away = memcmp(buffer->data(), zeros.data(), buffer->size()); + if (buffer && buffer->capacity() > 0) { + int total = 0; + auto data = buffer->data(); + for (int64_t i = 0; i < buffer->size(); ++i) { + total ^= data[i]; + } + throw_away = total; } } } diff --git a/cpp/src/arrow/util/thread-pool-test.cc b/cpp/src/arrow/util/thread-pool-test.cc index 6d7b9e230f080..22a8db21fd280 100644 --- a/cpp/src/arrow/util/thread-pool-test.cc +++ b/cpp/src/arrow/util/thread-pool-test.cc @@ -298,7 +298,7 @@ TEST_F(TestThreadPool, Submit) { // Test fork safety on Unix -#if !(defined(_WIN32) || defined(ARROW_VALGRIND)) +#if !(defined(_WIN32) || defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER)) TEST_F(TestThreadPool, ForkSafety) { pid_t child_pid; int child_status; diff --git a/cpp/src/parquet/bloom_filter.h b/cpp/src/parquet/bloom_filter.h index 918780e04971a..0078051b49735 100644 --- a/cpp/src/parquet/bloom_filter.h +++ b/cpp/src/parquet/bloom_filter.h @@ -21,6 +21,7 @@ #include #include +#include "arrow/util/bit-util.h" #include "arrow/util/logging.h" #include "parquet/exception.h" #include "parquet/hasher.h" From 1ff79785e62855d003f4b5f0c054cbfd155160c1 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Fri, 4 Jan 2019 11:40:45 -0600 Subject: [PATCH 135/328] ARROW-4150: [C++] Ensure allocated buffers have non-null data pointer We would originally give 0-size buffers a null data pointer, but passing a null pointer to certain library functions such as memcpy() yields undefined behaviour. Also, a null pointer is a common indication that an error or bug occurred. 
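Editorial note: the approach taken in the memory_pool.cc hunk below is to hand out a static, aligned sentinel area for zero-byte requests instead of a null pointer. The standalone sketch that follows illustrates that pattern outside of Arrow's MemoryPool API; the AllocateAligned/Free functions here are simplified stand-ins (no jemalloc or Windows _aligned_malloc paths), not the code added by this patch.

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>

namespace {

constexpr size_t kAlignment = 64;

// Static, aligned, non-null area handed out for 0-byte requests so that the
// returned pointer can always be passed to memcpy()/memset() with length 0.
alignas(kAlignment) uint8_t zero_size_area[1];

uint8_t* AllocateAligned(int64_t size) {
  if (size == 0) {
    return zero_size_area;  // never return nullptr
  }
  // std::aligned_alloc (C++17) requires the size to be a multiple of the
  // alignment, so round up.  The real allocator has more platform paths.
  const size_t rounded =
      (static_cast<size_t>(size) + kAlignment - 1) / kAlignment * kAlignment;
  return static_cast<uint8_t*>(std::aligned_alloc(kAlignment, rounded));
}

void Free(uint8_t* ptr, int64_t size) {
  if (ptr == zero_size_area) {
    assert(size == 0);  // nothing was actually allocated
    return;
  }
  std::free(ptr);
}

}  // namespace

int main() {
  uint8_t* p = AllocateAligned(0);
  assert(p != nullptr);    // the property this patch guarantees
  uint8_t dst[1];
  std::memcpy(dst, p, 0);  // well-defined; would be undefined behaviour if p were null
  Free(p, 0);
  return 0;
}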
Author: Antoine Pitrou Closes #3309 from pitrou/ARROW-4150-non-null-buffer-data and squashes the following commits: d9f1b03bf ARROW-4150: Ensure allocated buffers have non-null data pointer --- cpp/src/arrow/buffer-test.cc | 59 ++++++++++++++++++++++++++++++++++++ cpp/src/arrow/memory_pool.cc | 51 +++++++++++++++++++++---------- 2 files changed, 94 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/buffer-test.cc b/cpp/src/arrow/buffer-test.cc index 5d01515078c05..7c54e136195f3 100644 --- a/cpp/src/arrow/buffer-test.cc +++ b/cpp/src/arrow/buffer-test.cc @@ -177,6 +177,65 @@ TEST(TestBuffer, SliceMutableBuffer) { ASSERT_TRUE(slice->Equals(expected)); } +template +void TestZeroSizeAllocateBuffer(MemoryPool* pool, AllocateFunction&& allocate_func) { + auto allocated_bytes = pool->bytes_allocated(); + { + std::shared_ptr buffer; + + ASSERT_OK(allocate_func(pool, 0, &buffer)); + ASSERT_EQ(buffer->size(), 0); + // Even 0-sized buffers should not have a null data pointer + ASSERT_NE(buffer->data(), nullptr); + ASSERT_EQ(buffer->mutable_data(), buffer->data()); + + ASSERT_GE(pool->bytes_allocated(), allocated_bytes); + } + ASSERT_EQ(pool->bytes_allocated(), allocated_bytes); +} + +TEST(TestAllocateBuffer, ZeroSize) { + MemoryPool* pool = default_memory_pool(); + auto allocate_func = [](MemoryPool* pool, int64_t size, std::shared_ptr* out) { + return AllocateBuffer(pool, size, out); + }; + TestZeroSizeAllocateBuffer(pool, allocate_func); +} + +TEST(TestAllocateResizableBuffer, ZeroSize) { + MemoryPool* pool = default_memory_pool(); + auto allocate_func = [](MemoryPool* pool, int64_t size, std::shared_ptr* out) { + std::shared_ptr res; + RETURN_NOT_OK(AllocateResizableBuffer(pool, size, &res)); + *out = res; + return Status::OK(); + }; + TestZeroSizeAllocateBuffer(pool, allocate_func); +} + +TEST(TestAllocateResizableBuffer, ZeroResize) { + MemoryPool* pool = default_memory_pool(); + auto allocated_bytes = pool->bytes_allocated(); + { + std::shared_ptr buffer; + + ASSERT_OK(AllocateResizableBuffer(pool, 1000, &buffer)); + ASSERT_EQ(buffer->size(), 1000); + ASSERT_NE(buffer->data(), nullptr); + ASSERT_EQ(buffer->mutable_data(), buffer->data()); + + ASSERT_GE(pool->bytes_allocated(), allocated_bytes + 1000); + + ASSERT_OK(buffer->Resize(0)); + ASSERT_NE(buffer->data(), nullptr); + ASSERT_EQ(buffer->mutable_data(), buffer->data()); + + ASSERT_GE(pool->bytes_allocated(), allocated_bytes); + ASSERT_LT(pool->bytes_allocated(), allocated_bytes + 1000); + } + ASSERT_EQ(pool->bytes_allocated(), allocated_bytes); +} + TEST(TestBufferBuilder, ResizeReserve) { const std::string data = "some data"; auto data_ptr = data.c_str(); diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc index abf36fcac15e1..3e0366a19da41 100644 --- a/cpp/src/arrow/memory_pool.cc +++ b/cpp/src/arrow/memory_pool.cc @@ -41,12 +41,9 @@ constexpr size_t kAlignment = 64; namespace { -#ifdef ARROW_JEMALLOC -inline size_t FixAllocationSize(int64_t size) { - // mallocx() and rallocx() don't support 0-sized allocations - return std::max(static_cast(size), kAlignment); -} -#endif +// A static piece of memory for 0-size allocations, so as to return +// an aligned non-null pointer. 
+alignas(kAlignment) static uint8_t zero_size_area[1]; // Allocate memory according to the alignment requirements for Arrow // (as of May 2016 64 bytes) @@ -55,6 +52,10 @@ Status AllocateAligned(int64_t size, uint8_t** out) { if (size < 0) { return Status::Invalid("negative malloc size"); } + if (size == 0) { + *out = zero_size_area; + return Status::OK(); + } if (static_cast(size) >= std::numeric_limits::max()) { return Status::CapacityError("malloc size overflows size_t"); } @@ -67,7 +68,7 @@ Status AllocateAligned(int64_t size, uint8_t** out) { } #elif defined(ARROW_JEMALLOC) *out = reinterpret_cast( - mallocx(FixAllocationSize(size), MALLOCX_ALIGN(kAlignment))); + mallocx(static_cast(size), MALLOCX_ALIGN(kAlignment))); if (*out == NULL) { return Status::OutOfMemory("malloc of size ", size, " failed"); } @@ -85,9 +86,32 @@ Status AllocateAligned(int64_t size, uint8_t** out) { return Status::OK(); } +void DeallocateAligned(uint8_t* ptr, int64_t size) { + if (ptr == zero_size_area) { + DCHECK_EQ(size, 0); + } else { +#ifdef _WIN32 + _aligned_free(ptr); +#elif defined(ARROW_JEMALLOC) + dallocx(ptr, MALLOCX_ALIGN(kAlignment)); +#else + std::free(ptr); +#endif + } +} + Status ReallocateAligned(int64_t old_size, int64_t new_size, uint8_t** ptr) { -#ifdef ARROW_JEMALLOC uint8_t* previous_ptr = *ptr; + if (previous_ptr == zero_size_area) { + DCHECK_EQ(old_size, 0); + return AllocateAligned(new_size, ptr); + } + if (new_size == 0) { + DeallocateAligned(previous_ptr, old_size); + *ptr = zero_size_area; + return Status::OK(); + } +#ifdef ARROW_JEMALLOC if (new_size < 0) { return Status::Invalid("negative realloc size"); } @@ -95,7 +119,7 @@ Status ReallocateAligned(int64_t old_size, int64_t new_size, uint8_t** ptr) { return Status::CapacityError("realloc overflows size_t"); } *ptr = reinterpret_cast( - rallocx(*ptr, FixAllocationSize(new_size), MALLOCX_ALIGN(kAlignment))); + rallocx(*ptr, static_cast(new_size), MALLOCX_ALIGN(kAlignment))); if (*ptr == NULL) { *ptr = previous_ptr; return Status::OutOfMemory("realloc of size ", new_size, " failed"); @@ -152,13 +176,8 @@ class DefaultMemoryPool : public MemoryPool { int64_t bytes_allocated() const override { return stats_.bytes_allocated(); } void Free(uint8_t* buffer, int64_t size) override { -#ifdef _WIN32 - _aligned_free(buffer); -#elif defined(ARROW_JEMALLOC) - dallocx(buffer, MALLOCX_ALIGN(kAlignment)); -#else - std::free(buffer); -#endif + DeallocateAligned(buffer, size); + stats_.UpdateAllocatedBytes(-size); } From 95f6ecfb9115659af3577693589ce4f9ae10eea3 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Fri, 4 Jan 2019 11:43:21 -0600 Subject: [PATCH 136/328] ARROW-4152: [GLib] Remove an example to show Torch integration Because Torch is not in active development. 
Author: Kouhei Sutou Closes #3303 from kou/glib-remove-torch-example and squashes the following commits: b29a96390 Remove an example to show Torch integration --- .travis.yml | 1 - c_glib/example/lua/Makefile.am | 1 - c_glib/example/lua/README.md | 5 - c_glib/example/lua/stream-to-torch-tensor.lua | 101 ------------------ ci/travis_before_script_c_glib.sh | 18 +--- ci/travis_script_c_glib.sh | 17 +-- 6 files changed, 6 insertions(+), 137 deletions(-) delete mode 100644 c_glib/example/lua/stream-to-torch-tensor.lua diff --git a/.travis.yml b/.travis.yml index 837b4cfef30db..f14f7e4785948 100644 --- a/.travis.yml +++ b/.travis.yml @@ -256,7 +256,6 @@ matrix: - ARROW_TRAVIS_USE_VENDORED_BOOST=1 - ARROW_TRAVIS_PARQUET=1 - ARROW_TRAVIS_PLASMA=1 - - BUILD_TORCH_EXAMPLE=no - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9" before_script: - if [ $ARROW_CI_RUBY_AFFECTED != "1" ]; then exit; fi diff --git a/c_glib/example/lua/Makefile.am b/c_glib/example/lua/Makefile.am index 86bdbed8a0228..9019d24741c1a 100644 --- a/c_glib/example/lua/Makefile.am +++ b/c_glib/example/lua/Makefile.am @@ -20,6 +20,5 @@ dist_lua_example_DATA = \ README.md \ read-batch.lua \ read-stream.lua \ - stream-to-torch-tensor.lua \ write-batch.lua \ write-stream.lua diff --git a/c_glib/example/lua/README.md b/c_glib/example/lua/README.md index e7e3351fef148..7d388d46acb33 100644 --- a/c_glib/example/lua/README.md +++ b/c_glib/example/lua/README.md @@ -48,8 +48,3 @@ Here are example codes in this directory: * `read-stream.lua`: It shows how to read Arrow array from file in stream mode. - - * `stream-to-torch-tensor.lua`: It shows how to read Arrow array - from file in stream mode and convert it to - [Torch](http://torch.ch/)'s - [`Tensor` object](http://torch7.readthedocs.io/en/rtd/tensor/index.html). diff --git a/c_glib/example/lua/stream-to-torch-tensor.lua b/c_glib/example/lua/stream-to-torch-tensor.lua deleted file mode 100644 index fc765e3c96872..0000000000000 --- a/c_glib/example/lua/stream-to-torch-tensor.lua +++ /dev/null @@ -1,101 +0,0 @@ --- Licensed to the Apache Software Foundation (ASF) under one --- or more contributor license agreements. See the NOTICE file --- distributed with this work for additional information --- regarding copyright ownership. The ASF licenses this file --- to you under the Apache License, Version 2.0 (the --- "License"); you may not use this file except in compliance --- with the License. You may obtain a copy of the License at --- --- http://www.apache.org/licenses/LICENSE-2.0 --- --- Unless required by applicable law or agreed to in writing, --- software distributed under the License is distributed on an --- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY --- KIND, either express or implied. See the License for the --- specific language governing permissions and limitations --- under the License. 
- -local lgi = require 'lgi' -local Arrow = lgi.Arrow - -local torch = require 'torch' - -Arrow.Array.torch_types = function(self) - return nil -end - -Arrow.Array.to_torch = function(self) - local types = self:torch_types() - if not types then - return nil - end - - local storage_type = types[1] - local tensor_type = types[2] - - local size = self:get_length() - local storage = storage_type(size) - if not storage then - return nil - end - - for i = 1, size do - storage[i] = self:get_value(i - 1) - end - return tensor_type(storage) -end - -Arrow.UInt8Array.torch_types = function(self) - return {torch.ByteStorage, torch.ByteTensor} -end - -Arrow.Int8Array.torch_types = function(self) - return {torch.CharStorage, torch.CharTensor} -end - -Arrow.Int16Array.torch_types = function(self) - return {torch.ShortStorage, torch.ShortTensor} -end - -Arrow.Int32Array.torch_types = function(self) - return {torch.IntStorage, torch.IntTensor} -end - -Arrow.Int64Array.torch_types = function(self) - return {torch.LongStorage, torch.LongTensor} -end - -Arrow.FloatArray.torch_types = function(self) - return {torch.FloatStorage, torch.FloatTensor} -end - -Arrow.DoubleArray.torch_types = function(self) - return {torch.DoubleStorage, torch.DoubleTensor} -end - - -local input_path = arg[1] or "/tmp/stream.arrow"; - -local input = Arrow.MemoryMappedInputStream.new(input_path) -local reader = Arrow.RecordBatchStreamReader.new(input) - -local i = 0 -while true do - local record_batch = reader:read_next_record_batch() - if not record_batch then - break - end - - print(string.rep("=", 40)) - print("record-batch["..i.."]:") - for j = 0, record_batch:get_n_columns() - 1 do - local column = record_batch:get_column(j) - local column_name = record_batch:get_column_name(j) - print(" "..column_name..":") - print(column:to_torch()) - end - - i = i + 1 -end - -input:close() diff --git a/ci/travis_before_script_c_glib.sh b/ci/travis_before_script_c_glib.sh index 7cd1c2a064396..e8dd0cdc80d2e 100755 --- a/ci/travis_before_script_c_glib.sh +++ b/ci/travis_before_script_c_glib.sh @@ -44,22 +44,8 @@ gem install test-unit gobject-introspection if [ $TRAVIS_OS_NAME = "osx" ]; then sudo env PKG_CONFIG_PATH=$PKG_CONFIG_PATH luarocks install lgi else - if [ $BUILD_TORCH_EXAMPLE = "yes" ]; then - git clone \ - --quiet \ - --depth 1 \ - --recursive \ - https://github.com/torch/distro.git ~/torch - pushd ~/torch - ./install-deps > /dev/null - echo "yes" | ./install.sh > /dev/null - . ~/torch/install/bin/torch-activate - popd - luarocks install lgi - else - sudo apt install -y -qq luarocks - sudo luarocks install lgi - fi + sudo apt install -y -qq luarocks + sudo luarocks install lgi fi pushd $ARROW_C_GLIB_DIR diff --git a/ci/travis_script_c_glib.sh b/ci/travis_script_c_glib.sh index adecc5c742967..c42a047ddf445 100755 --- a/ci/travis_script_c_glib.sh +++ b/ci/travis_script_c_glib.sh @@ -32,19 +32,10 @@ arrow_c_glib_run_test() export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$arrow_c_glib_lib_dir/pkgconfig pushd example/lua - if [ "$BUILD_TORCH_EXAMPLE" = "yes" ]; then - . 
~/torch/install/bin/torch-activate - luajit write-batch.lua - luajit read-batch.lua - luajit write-stream.lua - luajit read-stream.lua - luajit stream-to-torch-tensor.lua - else - lua write-batch.lua - lua read-batch.lua - lua write-stream.lua - lua read-stream.lua - fi + lua write-batch.lua + lua read-batch.lua + lua write-stream.lua + lua read-stream.lua popd } From 1e9a23612d258cd51a20b9eccf7a13bd5be52007 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 4 Jan 2019 11:53:57 -0600 Subject: [PATCH 137/328] ARROW-4149: [CI/C++] Parquet test misses ZSTD compression codec in CMake 3.2 nightly builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Parquet Zstd tests were enabled regardless `ARROW_WITH_ZSTD` which can be set to [OFF](https://github.com/apache/arrow/blob/master/cpp/CMakeLists.txt#L271) depending CMake's version. Crossbow build: - ~[kszucs/crossbow/build-392](https://github.com/kszucs/crossbow/branches/all?utf8=%E2%9C%93&query=build-392)~ - [kszucs/crossbow/build-395](https://github.com/kszucs/crossbow/branches/all?utf8=%E2%9C%93&query=build-395) Author: Krisztián Szűcs Closes #3299 from kszucs/ARROW-4149 and squashes the following commits: 525ef76f1 lint b29bda570 disable more tests 54e6437fe only run Zstd tests if ARROW_WITH_ZSTD is set --- cpp/src/arrow/io/compressed-test.cc | 16 ++++++++++------ cpp/src/parquet/CMakeLists.txt | 5 +++++ cpp/src/parquet/column_writer-test.cc | 14 +++++++++----- cpp/src/parquet/file-deserialize-test.cc | 8 +++++--- cpp/src/parquet/file-serialize-test.cc | 2 ++ 5 files changed, 31 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/io/compressed-test.cc b/cpp/src/arrow/io/compressed-test.cc index 507302f384c0b..a099fbb316a65 100644 --- a/cpp/src/arrow/io/compressed-test.cc +++ b/cpp/src/arrow/io/compressed-test.cc @@ -199,12 +199,14 @@ TEST_P(CompressedInputStreamTest, InvalidData) { INSTANTIATE_TEST_CASE_P(TestGZipInputStream, CompressedInputStreamTest, ::testing::Values(Compression::GZIP)); -INSTANTIATE_TEST_CASE_P(TestZSTDInputStream, CompressedInputStreamTest, - ::testing::Values(Compression::ZSTD)); - INSTANTIATE_TEST_CASE_P(TestBrotliInputStream, CompressedInputStreamTest, ::testing::Values(Compression::BROTLI)); +#ifdef ARROW_WITH_ZSTD +INSTANTIATE_TEST_CASE_P(TestZSTDInputStream, CompressedInputStreamTest, + ::testing::Values(Compression::ZSTD)); +#endif + class CompressedOutputStreamTest : public ::testing::TestWithParam { protected: Compression::type GetCompression() { return GetParam(); } @@ -235,11 +237,13 @@ TEST_P(CompressedOutputStreamTest, RandomData) { INSTANTIATE_TEST_CASE_P(TestGZipOutputStream, CompressedOutputStreamTest, ::testing::Values(Compression::GZIP)); -INSTANTIATE_TEST_CASE_P(TestZSTDOutputStream, CompressedOutputStreamTest, - ::testing::Values(Compression::ZSTD)); - INSTANTIATE_TEST_CASE_P(TestBrotliOutputStream, CompressedOutputStreamTest, ::testing::Values(Compression::BROTLI)); +#ifdef ARROW_WITH_ZSTD +INSTANTIATE_TEST_CASE_P(TestZSTDOutputStream, CompressedOutputStreamTest, + ::testing::Values(Compression::ZSTD)); +#endif + } // namespace io } // namespace arrow diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 4eb8f68a2ba98..f6796726fce90 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -282,3 +282,8 @@ ADD_ARROW_BENCHMARK(encoding-benchmark PREFIX "parquet" LABELS "parquet-benchmarks" EXTRA_LINK_LIBS ${PARQUET_BENCHMARK_LINK_LIBRARIES}) + +# Required for tests, the 
ExternalProject for zstd does not build on CMake < 3.7 +if (ARROW_WITH_ZSTD) + add_definitions(-DARROW_WITH_ZSTD) +endif() diff --git a/cpp/src/parquet/column_writer-test.cc b/cpp/src/parquet/column_writer-test.cc index 4416e3d18e9ad..28a18b1008ac8 100644 --- a/cpp/src/parquet/column_writer-test.cc +++ b/cpp/src/parquet/column_writer-test.cc @@ -349,11 +349,6 @@ TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithLz4Compression) { LARGE_SIZE); } -TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithZstdCompression) { - this->TestRequiredWithSettings(Encoding::PLAIN, Compression::ZSTD, false, false, - LARGE_SIZE); -} - TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStats) { this->TestRequiredWithSettings(Encoding::PLAIN, Compression::UNCOMPRESSED, false, true, LARGE_SIZE); @@ -379,10 +374,19 @@ TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndLz4Compression) { LARGE_SIZE); } +// The ExternalProject for zstd does not build on CMake < 3.7, so we do not +// require it here +#ifdef ARROW_WITH_ZSTD +TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithZstdCompression) { + this->TestRequiredWithSettings(Encoding::PLAIN, Compression::ZSTD, false, false, + LARGE_SIZE); +} + TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndZstdCompression) { this->TestRequiredWithSettings(Encoding::PLAIN, Compression::ZSTD, false, true, LARGE_SIZE); } +#endif TYPED_TEST(TestPrimitiveWriter, Optional) { // Optional and non-repeated, with definition levels diff --git a/cpp/src/parquet/file-deserialize-test.cc b/cpp/src/parquet/file-deserialize-test.cc index 17dfe387fd6e0..f1c17240439fb 100644 --- a/cpp/src/parquet/file-deserialize-test.cc +++ b/cpp/src/parquet/file-deserialize-test.cc @@ -176,9 +176,11 @@ TEST_F(TestPageSerde, TestFailLargePageHeaders) { } TEST_F(TestPageSerde, Compression) { - Compression::type codec_types[5] = {Compression::GZIP, Compression::SNAPPY, - Compression::BROTLI, Compression::LZ4, - Compression::ZSTD}; + std::vector codec_types = {Compression::GZIP, Compression::SNAPPY, + Compression::BROTLI, Compression::LZ4}; +#ifdef ARROW_WITH_ZSTD + codec_types.push_back(Compression::ZSTD); +#endif const int32_t num_rows = 32; // dummy value data_page_header_.num_values = num_rows; diff --git a/cpp/src/parquet/file-serialize-test.cc b/cpp/src/parquet/file-serialize-test.cc index 750faa20e2454..88dd657603184 100644 --- a/cpp/src/parquet/file-serialize-test.cc +++ b/cpp/src/parquet/file-serialize-test.cc @@ -301,9 +301,11 @@ TYPED_TEST(TestSerialize, SmallFileLz4) { ASSERT_NO_FATAL_FAILURE(this->FileSerializeTest(Compression::LZ4)); } +#ifdef ARROW_WITH_ZSTD TYPED_TEST(TestSerialize, SmallFileZstd) { ASSERT_NO_FATAL_FAILURE(this->FileSerializeTest(Compression::ZSTD)); } +#endif } // namespace test From cc9e228dd4a9b3403d52de07f134603a824b3354 Mon Sep 17 00:00:00 2001 From: "minmin.fmm" Date: Fri, 4 Jan 2019 12:03:44 -0600 Subject: [PATCH 138/328] ARROW-4122: [C++] Initialize class members based on codebase static analysis Author: minmin.fmm Closes #3267 from micafan/fix_cpp_uninit_ctor and squashes the following commits: 71a86c5e3 fix lint error 270a992c5 fix cpp uninit ctor --- cpp/src/arrow/array/builder_dict.cc | 3 ++- cpp/src/arrow/io/compressed.cc | 2 +- cpp/src/arrow/io/hdfs.cc | 2 +- cpp/src/arrow/ipc/feather-internal.h | 2 +- cpp/src/arrow/ipc/feather.cc | 1 + cpp/src/arrow/ipc/reader.cc | 4 +++- cpp/src/arrow/python/arrow_to_pandas.cc | 5 ++++- cpp/src/arrow/util/bit-stream-utils.h | 7 ++++++- cpp/src/arrow/util/compression_zlib.cc | 2 +- cpp/src/plasma/client.cc | 2 +- 10 files changed, 
21 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index e534c3cadb14b..89939597f1e8b 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -161,7 +161,7 @@ DictionaryBuilder::~DictionaryBuilder() {} template DictionaryBuilder::DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool) - : ArrayBuilder(type, pool), byte_width_(-1), values_builder_(pool) { + : ArrayBuilder(type, pool), delta_offset_(0), byte_width_(-1), values_builder_(pool) { DCHECK_EQ(T::type_id, type->id()) << "inconsistent type passed to DictionaryBuilder"; } @@ -175,6 +175,7 @@ template <> DictionaryBuilder::DictionaryBuilder( const std::shared_ptr& type, MemoryPool* pool) : ArrayBuilder(type, pool), + delta_offset_(0), byte_width_(checked_cast(*type).byte_width()) {} template diff --git a/cpp/src/arrow/io/compressed.cc b/cpp/src/arrow/io/compressed.cc index e5fd6b4adf4c7..1311dbc246634 100644 --- a/cpp/src/arrow/io/compressed.cc +++ b/cpp/src/arrow/io/compressed.cc @@ -44,7 +44,7 @@ namespace io { class CompressedOutputStream::Impl { public: Impl(MemoryPool* pool, Codec* codec, const std::shared_ptr& raw) - : pool_(pool), raw_(raw), codec_(codec), is_open_(true) {} + : pool_(pool), raw_(raw), codec_(codec), is_open_(true), compressed_pos_(0) {} ~Impl() { DCHECK(Close().ok()); } diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index 3e9b804ca233c..0a50d3dcdcd90 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -336,7 +336,7 @@ static void SetPathInfo(const hdfsFileInfo* input, HdfsPathInfo* out) { // Private implementation class HadoopFileSystem::HadoopFileSystemImpl { public: - HadoopFileSystemImpl() {} + HadoopFileSystemImpl() : driver_(NULLPTR), port_(0), fs_(NULLPTR) {} Status Connect(const HdfsConnectionConfig* config) { if (config->driver == HdfsDriver::LIBHDFS3) { diff --git a/cpp/src/arrow/ipc/feather-internal.h b/cpp/src/arrow/ipc/feather-internal.h index 90512dd117238..2aa04b2db72ba 100644 --- a/cpp/src/arrow/ipc/feather-internal.h +++ b/cpp/src/arrow/ipc/feather-internal.h @@ -119,7 +119,7 @@ class ARROW_EXPORT TableBuilder { class ARROW_EXPORT TableMetadata { public: - TableMetadata() {} + TableMetadata() : table_(NULLPTR) {} ~TableMetadata() = default; Status Open(const std::shared_ptr& buffer) { diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index b0ab62c678c72..d28bf7512999a 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -180,6 +180,7 @@ ColumnBuilder::ColumnBuilder(TableBuilder* parent, const std::string& name) fbb_ = &parent->fbb(); name_ = name; type_ = ColumnType::PRIMITIVE; + meta_time_.unit = TimeUnit::SECOND; } flatbuffers::Offset ColumnBuilder::CreateColumnMetadata() { diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index b2c26767be4e9..59a322a64338a 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -499,7 +499,9 @@ Status RecordBatchStreamReader::ReadNext(std::shared_ptr* batch) { class RecordBatchFileReader::RecordBatchFileReaderImpl { public: - RecordBatchFileReaderImpl() { dictionary_memo_ = std::make_shared(); } + RecordBatchFileReaderImpl() : file_(NULLPTR), footer_offset_(0), footer_(NULLPTR) { + dictionary_memo_ = std::make_shared(); + } Status ReadFooter() { int magic_size = static_cast(strlen(kArrowMagicBytes)); diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 
b532bfb705acd..8aa0bf74b7b27 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -1064,7 +1064,10 @@ class CategoricalBlock : public PandasBlock { public: explicit CategoricalBlock(const PandasOptions& options, MemoryPool* pool, int64_t num_rows) - : PandasBlock(options, num_rows, 1), pool_(pool), needs_copy_(false) {} + : PandasBlock(options, num_rows, 1), + pool_(pool), + ordered_(false), + needs_copy_(false) {} Status Allocate() override { return Status::NotImplemented( diff --git a/cpp/src/arrow/util/bit-stream-utils.h b/cpp/src/arrow/util/bit-stream-utils.h index ff215e488b4a3..ae62a7ff1e2b3 100644 --- a/cpp/src/arrow/util/bit-stream-utils.h +++ b/cpp/src/arrow/util/bit-stream-utils.h @@ -110,7 +110,12 @@ class BitReader { memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes); } - BitReader() : buffer_(NULL), max_bytes_(0) {} + BitReader() + : buffer_(NULL), + max_bytes_(0), + buffered_values_(0), + byte_offset_(0), + bit_offset_(0) {} void Reset(const uint8_t* buffer, int buffer_len) { buffer_ = buffer; diff --git a/cpp/src/arrow/util/compression_zlib.cc b/cpp/src/arrow/util/compression_zlib.cc index dfda317e3bf36..736b0ab4f1524 100644 --- a/cpp/src/arrow/util/compression_zlib.cc +++ b/cpp/src/arrow/util/compression_zlib.cc @@ -85,7 +85,7 @@ static Status ZlibErrorPrefix(const char* prefix_msg, const char* msg) { class GZipDecompressor : public Decompressor { public: - GZipDecompressor() : initialized_(false) {} + GZipDecompressor() : initialized_(false), finished_(false) {} ~GZipDecompressor() override { if (initialized_) { diff --git a/cpp/src/plasma/client.cc b/cpp/src/plasma/client.cc index 8d153585c3d4e..f08d6efd71ee7 100644 --- a/cpp/src/plasma/client.cc +++ b/cpp/src/plasma/client.cc @@ -261,7 +261,7 @@ class PlasmaClient::Impl : public std::enable_shared_from_thisRelease(object_id_)); } -PlasmaClient::Impl::Impl() { +PlasmaClient::Impl::Impl() : store_conn_(0), store_capacity_(0) { #ifdef PLASMA_CUDA DCHECK_OK(CudaDeviceManager::GetInstance(&manager_)); #endif From c569a4c5684938230bf99e6b20b401322760089d Mon Sep 17 00:00:00 2001 From: Antonio Cavallo Date: Fri, 4 Jan 2019 12:13:54 -0600 Subject: [PATCH 139/328] ARROW-4127: [Documentation][Python] Add instructions to build with Docker Author: Antonio Cavallo Closes #3281 from cav71/documentation and squashes the following commits: a1c5dab21 Add Docker documentation build instructions --- docs/source/building.rst | 21 +++++++++++++++++++++ docs/source/python/development.rst | 5 +++++ 2 files changed, 26 insertions(+) diff --git a/docs/source/building.rst b/docs/source/building.rst index 0fb4486db89c3..dfa857498cf80 100644 --- a/docs/source/building.rst +++ b/docs/source/building.rst @@ -15,6 +15,8 @@ .. specific language governing permissions and limitations .. under the License. +.. _building-docs: + Building the Documentation ========================== @@ -69,3 +71,22 @@ After these steps are completed, the documentation is rendered in HTML format in ``docs/_build/html``. In particular, you can point your browser at ``docs/_build/html/index.html`` to read the docs and review any changes you made. + + +.. _building-docker: + +Building with Docker +-------------------- + +You can use Docker to build the documentation: + +.. 
code-block:: shell + + docker-compose build cpp + docker-compose build python + docker-compose build docs + docker-compose run docs + +The final output is located under [#]_:: + + docs/_build/html diff --git a/docs/source/python/development.rst b/docs/source/python/development.rst index ba8cfef721441..0bc1c62b4af18 100644 --- a/docs/source/python/development.rst +++ b/docs/source/python/development.rst @@ -364,3 +364,8 @@ Getting ``python-test.exe`` to run is a bit tricky because your set PYTHONHOME=%CONDA_PREFIX% Now ``python-test.exe`` or simply ``ctest`` (to run all tests) should work. + +Building the Documentation +========================== + +See :ref:`building-docs` for instructions to build the HTML documentation. From 7405406928ac0e3ab03bf2091173563ed54d2a07 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 4 Jan 2019 19:15:35 +0100 Subject: [PATCH 140/328] ARROW-4156: [C++] Don't use object libs with Xcode Author: Uwe L. Korn Closes #3308 from xhochy/ARROW-4156 and squashes the following commits: 1c76769d ARROW-4156: Don't use object libs with Xcode --- cpp/cmake_modules/BuildUtils.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 7c1db679bf23e..77db28e2aab28 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -119,9 +119,11 @@ function(ADD_ARROW_LIB LIB_NAME) set(BUILD_STATIC ${ARROW_BUILD_STATIC}) endif() - if(MSVC) + if(MSVC OR (CMAKE_GENERATOR STREQUAL Xcode)) # MSVC needs to compile C++ separately for each library kind (shared and static) # because of dllexport declarations + # The Xcode generator doesn't reliably work with Xcode as target names are not + # guessed correctly. set(LIB_DEPS ${ARG_SOURCES}) set(EXTRA_DEPS ${ARG_DEPENDENCIES}) From 161d00fbeeb2f1992da8d8ac0e96fb14de51b646 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 4 Jan 2019 12:36:15 -0600 Subject: [PATCH 141/328] ARROW-4157: [C++] Fix clang documentation warnings on Ubuntu 18.04 I also added an option `ARROW_USE_CCACHE` to turn ccache on and off Author: Wes McKinney Closes #3310 from wesm/doc-fixes and squashes the following commits: e6c2f203f Fix clang documentation warnings on Ubuntu 18.04 --- cpp/CMakeLists.txt | 18 ++++-- cpp/src/arrow/gpu/cuda_context.h | 16 ++--- cpp/src/arrow/gpu/cuda_memory.h | 2 +- cpp/src/arrow/python/serialize.cc | 100 +++++++++++++++--------------- 4 files changed, 71 insertions(+), 65 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index aba1a59618bb0..3d2b698b8ff25 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -81,12 +81,6 @@ if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1" OR INFER_FOUND) set(CMAKE_EXPORT_COMPILE_COMMANDS 1) endif() -find_program(CCACHE_FOUND ccache) -if(CCACHE_FOUND) - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_FOUND}) - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_FOUND}) -endif(CCACHE_FOUND) - # ---------------------------------------------------------------------- # cmake options @@ -115,6 +109,10 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Enable Address Sanitizer checks" OFF) + option(ARROW_USE_CCACHE + "Use ccache when compiling (if available)" + ON) + option(ARROW_USE_TSAN "Enable Thread Sanitizer checks" OFF) @@ -349,6 +347,14 @@ that have not been built" OFF) endif() +if (ARROW_USE_CCACHE) + find_program(CCACHE_FOUND ccache) + if(CCACHE_FOUND) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE 
${CCACHE_FOUND}) + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_FOUND}) + endif(CCACHE_FOUND) +endif() + if (ARROW_OPTIONAL_INSTALL) # Don't make the "install" target depend on the "all" target set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY true) diff --git a/cpp/src/arrow/gpu/cuda_context.h b/cpp/src/arrow/gpu/cuda_context.h index 9a67cea8975d1..300106214b488 100644 --- a/cpp/src/arrow/gpu/cuda_context.h +++ b/cpp/src/arrow/gpu/cuda_context.h @@ -37,23 +37,23 @@ class ARROW_EXPORT CudaDeviceManager { static Status GetInstance(CudaDeviceManager** manager); /// \brief Get the CUDA driver context for a particular device - /// \param[in] device_number + /// \param[in] device_number the CUDA device /// \param[out] out cached context - Status GetContext(int gpu_number, std::shared_ptr* ctx); + Status GetContext(int device_number, std::shared_ptr* out); /// \brief Get the shared CUDA driver context for a particular device - /// \param[in] device_number + /// \param[in] device_number the CUDA device /// \param[in] handle CUDA context handler created by another library /// \param[out] out shared context Status GetSharedContext(int device_number, void* handle, std::shared_ptr* out); /// \brief Allocate host memory with fast access to given GPU device - /// \param[in] device_number + /// \param[in] device_number the CUDA device /// \param[in] nbytes number of bytes /// \param[out] out the allocated buffer Status AllocateHost(int device_number, int64_t nbytes, - std::shared_ptr* buffer); + std::shared_ptr* out); Status FreeHost(void* data, int64_t nbytes); @@ -98,15 +98,15 @@ class ARROW_EXPORT CudaContext : public std::enable_shared_from_this* buffer); + std::shared_ptr* out); /// \brief Close memory mapped with IPC buffer /// \param[in] buffer a CudaBuffer referencing /// \return Status - Status CloseIpcBuffer(CudaBuffer* buf); + Status CloseIpcBuffer(CudaBuffer* buffer); /// \brief Block until the all device tasks are completed. Status Synchronize(void); diff --git a/cpp/src/arrow/gpu/cuda_memory.h b/cpp/src/arrow/gpu/cuda_memory.h index c8f80837cd9df..64fa02d789325 100644 --- a/cpp/src/arrow/gpu/cuda_memory.h +++ b/cpp/src/arrow/gpu/cuda_memory.h @@ -207,7 +207,7 @@ class ARROW_EXPORT CudaBufferWriter : public io::WritableFile { }; /// \brief Allocate CUDA-accessible memory on CPU host -/// \param[in] device_number +/// \param[in] device_number device to expose host memory /// \param[in] size number of bytes /// \param[out] out the allocated buffer /// \return Status diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc index ca94369be5157..38ab238e9a2e1 100644 --- a/cpp/src/arrow/python/serialize.cc +++ b/cpp/src/arrow/python/serialize.cc @@ -55,8 +55,8 @@ using internal::checked_cast; namespace py { -/// A Sequence is a heterogeneous collections of elements. It can contain -/// scalar Python types, lists, tuples, dictionaries and tensors. +// A Sequence is a heterogeneous collections of elements. It can contain +// scalar Python types, lists, tuples, dictionaries and tensors. 
class SequenceBuilder { public: explicit SequenceBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) @@ -81,7 +81,7 @@ class SequenceBuilder { dict_offsets_({0}), set_offsets_({0}) {} - /// Appending a none to the sequence + // Appending a none to the sequence Status AppendNone() { RETURN_NOT_OK(offsets_.Append(0)); RETURN_NOT_OK(types_.Append(0)); @@ -106,90 +106,90 @@ class SequenceBuilder { return out->Append(val); } - /// Appending a boolean to the sequence + // Appending a boolean to the sequence Status AppendBool(const bool data) { return AppendPrimitive(data, &bool_tag_, &bools_); } - /// Appending a python 2 int64_t to the sequence + // Appending a python 2 int64_t to the sequence Status AppendPy2Int64(const int64_t data) { return AppendPrimitive(data, &py2_int_tag_, &py2_ints_); } - /// Appending an int64_t to the sequence + // Appending an int64_t to the sequence Status AppendInt64(const int64_t data) { return AppendPrimitive(data, &int_tag_, &ints_); } - /// Append a list of bytes to the sequence + // Append a list of bytes to the sequence Status AppendBytes(const uint8_t* data, int32_t length) { RETURN_NOT_OK(Update(bytes_.length(), &bytes_tag_)); return bytes_.Append(data, length); } - /// Appending a string to the sequence + // Appending a string to the sequence Status AppendString(const char* data, int32_t length) { RETURN_NOT_OK(Update(strings_.length(), &string_tag_)); return strings_.Append(data, length); } - /// Appending a half_float to the sequence + // Appending a half_float to the sequence Status AppendHalfFloat(const npy_half data) { return AppendPrimitive(data, &half_float_tag_, &half_floats_); } - /// Appending a float to the sequence + // Appending a float to the sequence Status AppendFloat(const float data) { return AppendPrimitive(data, &float_tag_, &floats_); } - /// Appending a double to the sequence + // Appending a double to the sequence Status AppendDouble(const double data) { return AppendPrimitive(data, &double_tag_, &doubles_); } - /// Appending a Date64 timestamp to the sequence + // Appending a Date64 timestamp to the sequence Status AppendDate64(const int64_t timestamp) { return AppendPrimitive(timestamp, &date64_tag_, &date64s_); } - /// Appending a tensor to the sequence - /// - /// \param tensor_index Index of the tensor in the object. + // Appending a tensor to the sequence + // + // \param tensor_index Index of the tensor in the object. Status AppendTensor(const int32_t tensor_index) { RETURN_NOT_OK(Update(tensor_indices_.length(), &tensor_tag_)); return tensor_indices_.Append(tensor_index); } - /// Appending a numpy ndarray to the sequence - /// - /// \param tensor_index Index of the tensor in the object. + // Appending a numpy ndarray to the sequence + // + // \param tensor_index Index of the tensor in the object. Status AppendNdarray(const int32_t ndarray_index) { RETURN_NOT_OK(Update(ndarray_indices_.length(), &ndarray_tag_)); return ndarray_indices_.Append(ndarray_index); } - /// Appending a buffer to the sequence - /// - /// \param buffer_index Indes of the buffer in the object. + // Appending a buffer to the sequence + // + // \param buffer_index Indes of the buffer in the object. Status AppendBuffer(const int32_t buffer_index) { RETURN_NOT_OK(Update(buffer_indices_.length(), &buffer_tag_)); return buffer_indices_.Append(buffer_index); } - /// Add a sublist to the sequence. The data contained in the sublist will be - /// specified in the "Finish" method. 
- /// - /// To construct l = [[11, 22], 33, [44, 55]] you would for example run - /// list = ListBuilder(); - /// list.AppendList(2); - /// list.Append(33); - /// list.AppendList(2); - /// list.Finish([11, 22, 44, 55]); - /// list.Finish(); + // Add a sublist to the sequence. The data contained in the sublist will be + // specified in the "Finish" method. + // + // To construct l = [[11, 22], 33, [44, 55]] you would for example run + // list = ListBuilder(); + // list.AppendList(2); + // list.Append(33); + // list.AppendList(2); + // list.Finish([11, 22, 44, 55]); + // list.Finish(); - /// \param size - /// The size of the sublist + // \param size + // The size of the sublist Status AppendList(Py_ssize_t size) { int32_t offset; RETURN_NOT_OK(internal::CastSize(list_offsets_.back() + size, &offset)); @@ -256,8 +256,8 @@ class SequenceBuilder { return Status::OK(); } - /// Finish building the sequence and return the result. - /// Input arrays may be nullptr + // Finish building the sequence and return the result. + // Input arrays may be nullptr Status Finish(const Array* list_data, const Array* tuple_data, const Array* dict_data, const Array* set_data, std::shared_ptr* out) { fields_.resize(num_tags_); @@ -356,28 +356,28 @@ class SequenceBuilder { std::vector type_ids_; }; -/// Constructing dictionaries of key/value pairs. Sequences of -/// keys and values are built separately using a pair of -/// SequenceBuilders. The resulting Arrow representation -/// can be obtained via the Finish method. +// Constructing dictionaries of key/value pairs. Sequences of +// keys and values are built separately using a pair of +// SequenceBuilders. The resulting Arrow representation +// can be obtained via the Finish method. class DictBuilder { public: explicit DictBuilder(MemoryPool* pool = nullptr) : keys_(pool), vals_(pool) {} - /// Builder for the keys of the dictionary + // Builder for the keys of the dictionary SequenceBuilder& keys() { return keys_; } - /// Builder for the values of the dictionary + // Builder for the values of the dictionary SequenceBuilder& vals() { return vals_; } - /// Construct an Arrow StructArray representing the dictionary. - /// Contains a field "keys" for the keys and "vals" for the values. - /// \param val_list_data - /// List containing the data from nested lists in the value - /// list of the dictionary - /// - /// \param val_dict_data - /// List containing the data from nested dictionaries in the - /// value list of the dictionary + // Construct an Arrow StructArray representing the dictionary. + // Contains a field "keys" for the keys and "vals" for the values. 
+ // \param val_list_data + // List containing the data from nested lists in the value + // list of the dictionary + // + // \param val_dict_data + // List containing the data from nested dictionaries in the + // value list of the dictionary Status Finish(const Array* key_tuple_data, const Array* key_dict_data, const Array* val_list_data, const Array* val_tuple_data, const Array* val_dict_data, const Array* val_set_data, From c322aecd82c93f96a6d8b8852c8336a750ebfbb1 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 4 Jan 2019 13:38:28 -0600 Subject: [PATCH 142/328] ARROW-4158: Allow committers to set ARROW_GITHUB_API_TOKEN for merge script, better debugging output Before this the error message printed in rate limit scenario was simply `url` (from the `KeyError`) Author: Wes McKinney Closes #3311 from wesm/ARROW-4158 and squashes the following commits: ca4b16e04 Better debugging output from merge PR script, add option to use GitHub API token for GET requests --- dev/README.md | 9 +++++++-- dev/merge_arrow_pr.py | 43 ++++++++++++++++++++++++++++++------------- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/dev/README.md b/dev/README.md index 98aeef6d9a4d8..ead36d3747e76 100644 --- a/dev/README.md +++ b/dev/README.md @@ -28,17 +28,22 @@ https://gitbox.apache.org/setup/ to be able to push to GitHub as the main remote. * How to merge a Pull request: -have an apache and apache-github remote setup + ``` -git remote add apache-github https://github.com/apache/arrow.git git remote add apache git@github.com:apache/arrow.git ``` + run the following command + ``` dev/merge_arrow_pr.py ``` +This uses the GitHub REST API; if you encounter rate limit issues, you may set +a `ARROW_GITHUB_API_TOKEN` environment variable to use a Personal Access Token. + Note: + * The directory name of your Arrow git clone must be called arrow * Without jira-python installed you'll have to close the JIRA manually diff --git a/dev/merge_arrow_pr.py b/dev/merge_arrow_pr.py index 8539d5d3401fd..3d6ca31476ee3 100755 --- a/dev/merge_arrow_pr.py +++ b/dev/merge_arrow_pr.py @@ -24,8 +24,17 @@ # This utility assumes you already have a local Arrow git clone and that you # have added remotes corresponding to both (i) the Github Apache Arrow mirror # and (ii) the apache git repo. 
+# +# There are several pieces of authorization possibly needed via environment +# variables +# +# JIRA_USERNAME: your Apache JIRA id +# JIRA_PASSWORD: your Apache JIRA password +# ARROW_GITHUB_API_TOKEN: a GitHub API token to use for API requests (to avoid +# rate limiting) import os +import pprint import re import subprocess import sys @@ -48,12 +57,10 @@ BRANCH_PREFIX = "PR_TOOL" JIRA_API_BASE = "https://issues.apache.org/jira" - -def get_json(url): - req = requests.get(url) +def get_json(url, headers=None): + req = requests.get(url, headers=headers) return req.json() - def run_cmd(cmd): if isinstance(cmd, six.string_types): cmd = cmd.split(' ') @@ -192,8 +199,15 @@ def __init__(self, project_name): self.github_api = ("https://api.github.com/repos/apache/{0}" .format(project_name)) + token = os.environ.get('ARROW_GITHUB_API_TOKEN', None) + if token: + self.headers = {'Authorization': 'token {0}'.format(token)} + else: + self.headers = None + def get_pr_data(self, number): - return get_json("%s/pulls/%s" % (self.github_api, number)) + return get_json("%s/pulls/%s" % (self.github_api, number), + headers=self.headers) class CommandInput(object): @@ -225,13 +239,16 @@ def __init__(self, cmd, github_api, git_remote, jira_con, number): self.con = jira_con self.number = number self._pr_data = github_api.get_pr_data(number) - self.url = self._pr_data["url"] - self.title = self._pr_data["title"] - - self.body = self._pr_data["body"] - self.target_ref = self._pr_data["base"]["ref"] - self.user_login = self._pr_data["user"]["login"] - self.base_ref = self._pr_data["head"]["ref"] + try: + self.url = self._pr_data["url"] + self.title = self._pr_data["title"] + self.body = self._pr_data["body"] + self.target_ref = self._pr_data["base"]["ref"] + self.user_login = self._pr_data["user"]["login"] + self.base_ref = self._pr_data["head"]["ref"] + except KeyError: + pprint.pprint(self._pr_data) + raise self.description = "%s/%s" % (self.user_login, self.base_ref) self.jira_issue = self._get_jira() @@ -435,4 +452,4 @@ def get_version_json(version_str): try: cli() except Exception as e: - print(e.args[0]) + raise From 4057b5f2f1402026c5853e53a038db8371650fbd Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 4 Jan 2019 15:17:13 -0600 Subject: [PATCH 143/328] PARQUET-690: [C++] Reuse Thrift resources when serializing metadata structures This patch should yield fewer memory allocations on the Parquet write path, using the same approach from Apache Impala. Before we were allocating a new buffer for each Thrift object serialization. 
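A toy sketch of that reuse pattern, with purely hypothetical names (a `ReusableSerializer` and `Sink` standing in for the real `ThriftSerializer` and Parquet `OutputStream`; no actual Thrift is involved): a single object owns one growable buffer that is cleared, not reallocated, for every message it writes.

```cpp
// Hypothetical sketch of the buffer-reuse pattern; not the Parquet API.
#include <cstdint>
#include <string>
#include <vector>

struct Sink {  // stand-in for an output stream
  std::string out;
  void Write(const uint8_t* data, size_t size) {
    out.append(reinterpret_cast<const char*>(data), size);
  }
};

class ReusableSerializer {
 public:
  explicit ReusableSerializer(size_t initial_capacity = 1024) {
    buffer_.reserve(initial_capacity);
  }

  // Serialize one message into the shared buffer, then flush it to the sink.
  template <typename Msg>
  size_t Serialize(const Msg& msg, Sink* sink) {
    buffer_.clear();  // keeps capacity: no allocation on the hot path
    msg.AppendTo(&buffer_);
    sink->Write(buffer_.data(), buffer_.size());
    return buffer_.size();
  }

 private:
  std::vector<uint8_t> buffer_;  // reused across all messages
};

struct PageHeaderLike {  // hypothetical stand-in for a page header message
  std::string payload;
  void AppendTo(std::vector<uint8_t>* buf) const {
    buf->insert(buf->end(), payload.begin(), payload.end());
  }
};

int main() {
  Sink sink;
  ReusableSerializer serializer;  // constructed once, e.g. per page writer
  for (const char* p : {"header-1", "header-2", "header-3"}) {
    serializer.Serialize(PageHeaderLike{p}, &sink);  // buffer reused each time
  }
  return sink.out.size() == 24 ? 0 : 1;
}
```

The pre-patch `SerializeThriftMsg` helper, by contrast, constructed a fresh Thrift memory buffer on every call.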
Since a ColumnChunk generally will contain many data page headers, this is a bit wasteful Author: Wes McKinney Closes #3268 from wesm/PARQUET-690 and squashes the following commits: a5303f826 Fix lint issues 47de8356c Reuse Thrift resources when serializing metadata structures --- cpp/src/parquet/column_writer.cc | 9 ++- cpp/src/parquet/file-deserialize-test.cc | 4 +- cpp/src/parquet/metadata.cc | 6 +- cpp/src/parquet/thrift.h | 93 +++++++++++++++--------- 4 files changed, 69 insertions(+), 43 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 37fce9c036b31..dfb65f1969777 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -141,6 +141,7 @@ class SerializedPageWriter : public PageWriter { total_uncompressed_size_(0), total_compressed_size_(0) { compressor_ = GetCodecFromArrow(codec); + thrift_serializer_.reset(new ThriftSerializer); } int64_t WriteDictionaryPage(const DictionaryPage& page) override { @@ -171,8 +172,7 @@ class SerializedPageWriter : public PageWriter { if (dictionary_page_offset_ == 0) { dictionary_page_offset_ = start_pos; } - int64_t header_size = - SerializeThriftMsg(&page_header, sizeof(format::PageHeader), sink_); + int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_); sink_->Write(compressed_data->data(), compressed_data->size()); total_uncompressed_size_ += uncompressed_size + header_size; @@ -237,8 +237,7 @@ class SerializedPageWriter : public PageWriter { data_page_offset_ = start_pos; } - int64_t header_size = - SerializeThriftMsg(&page_header, sizeof(format::PageHeader), sink_); + int64_t header_size = thrift_serializer_->Serialize(&page_header, sink_); sink_->Write(compressed_data->data(), compressed_data->size()); total_uncompressed_size_ += uncompressed_size + header_size; @@ -270,6 +269,8 @@ class SerializedPageWriter : public PageWriter { int64_t total_uncompressed_size_; int64_t total_compressed_size_; + std::unique_ptr thrift_serializer_; + // Compression codec to use. 
std::unique_ptr<::arrow::util::Codec> compressor_; }; diff --git a/cpp/src/parquet/file-deserialize-test.cc b/cpp/src/parquet/file-deserialize-test.cc index f1c17240439fb..4db338b4bcb54 100644 --- a/cpp/src/parquet/file-deserialize-test.cc +++ b/cpp/src/parquet/file-deserialize-test.cc @@ -85,8 +85,8 @@ class TestPageSerde : public ::testing::Test { page_header_.compressed_page_size = compressed_size; page_header_.type = format::PageType::DATA_PAGE; - ASSERT_NO_THROW( - SerializeThriftMsg(&page_header_, max_serialized_len, out_stream_.get())); + ThriftSerializer serializer; + ASSERT_NO_THROW(serializer.Serialize(&page_header_, out_stream_.get())); } void ResetStream() { out_stream_.reset(new InMemoryOutputStream); } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index f05918d9fd7f0..cc0bfec6321cd 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -361,7 +361,8 @@ class FileMetaData::FileMetaDataImpl { const ApplicationVersion& writer_version() const { return writer_version_; } void WriteTo(OutputStream* dst) const { - SerializeThriftMsg(metadata_.get(), 1024, dst); + ThriftSerializer serializer; + serializer.Serialize(metadata_.get(), dst); } std::unique_ptr RowGroup(int i) { @@ -667,7 +668,8 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { } void WriteTo(OutputStream* sink) { - SerializeThriftMsg(column_chunk_, sizeof(format::ColumnChunk), sink); + ThriftSerializer serializer; + serializer.Serialize(column_chunk_, sink); } const ColumnDescriptor* descr() const { return column_; } diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index 9c665acfac4ff..1afd9bf436550 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -#ifndef PARQUET_THRIFT_UTIL_H -#define PARQUET_THRIFT_UTIL_H +#pragma once #include "arrow/util/windows_compatibility.h" @@ -28,6 +27,7 @@ #else #include #endif +#include // TCompactProtocol requires some #defines to work right. #define SIGNED_RIGHT_SHIFT_IS 1 @@ -105,18 +105,18 @@ static inline format::CompressionCodec::type ToThrift(Compression::type type) { // ---------------------------------------------------------------------- // Thrift struct serialization / deserialization utilities +using ThriftBuffer = apache::thrift::transport::TMemoryBuffer; + // Deserialize a thrift message from buf/len. buf/len must at least contain // all the bytes needed to store the thrift message. On return, len will be // set to the actual length of the header. template inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg) { // Deserialize msg bytes into c++ thrift msg using memory transport. - shared_ptr tmem_transport( - new apache::thrift::transport::TMemoryBuffer(const_cast(buf), *len)); - apache::thrift::protocol::TCompactProtocolFactoryT< - apache::thrift::transport::TMemoryBuffer> - tproto_factory; - shared_ptr tproto = + shared_ptr tmem_transport( + new ThriftBuffer(const_cast(buf), *len)); + apache::thrift::protocol::TCompactProtocolFactoryT tproto_factory; + shared_ptr tproto = // tproto_factory.getProtocol(tmem_transport); try { deserialized_msg->read(tproto.get()); @@ -129,34 +129,57 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali *len = *len - bytes_left; } -// Serialize obj into a buffer. The result is returned as a string. 
-// The arguments are the object to be serialized and -// the expected size of the serialized object -template -inline int64_t SerializeThriftMsg(T* obj, uint32_t len, OutputStream* out) { - shared_ptr mem_buffer( - new apache::thrift::transport::TMemoryBuffer(len)); - apache::thrift::protocol::TCompactProtocolFactoryT< - apache::thrift::transport::TMemoryBuffer> - tproto_factory; - shared_ptr tproto = - tproto_factory.getProtocol(mem_buffer); - try { - mem_buffer->resetBuffer(); - obj->write(tproto.get()); - } catch (std::exception& e) { - std::stringstream ss; - ss << "Couldn't serialize thrift: " << e.what() << "\n"; - throw ParquetException(ss.str()); +/// Utility class to serialize thrift objects to a binary format. This object +/// should be reused if possible to reuse the underlying memory. +/// Note: thrift will encode NULLs into the serialized buffer so it is not valid +/// to treat it as a string. +class ThriftSerializer { + public: + explicit ThriftSerializer(int initial_buffer_size = 1024) + : mem_buffer_(new ThriftBuffer(initial_buffer_size)) { + apache::thrift::protocol::TCompactProtocolFactoryT factory; + protocol_ = factory.getProtocol(mem_buffer_); } - uint8_t* out_buffer; - uint32_t out_length; - mem_buffer->getBuffer(&out_buffer, &out_length); - out->Write(out_buffer, out_length); - return out_length; -} + /// Serialize obj into a memory buffer. The result is returned in buffer/len. The + /// memory returned is owned by this object and will be invalid when another object + /// is serialized. + template + void SerializeToBuffer(const T* obj, uint32_t* len, uint8_t** buffer) { + SerializeObject(obj); + mem_buffer_->getBuffer(buffer, len); + } -} // namespace parquet + template + void SerializeToString(const T* obj, std::string* result) { + SerializeObject(obj); + *result = mem_buffer_->getBufferAsString(); + } + + template + int64_t Serialize(const T* obj, OutputStream* out) { + uint8_t* out_buffer; + uint32_t out_length; + SerializeToBuffer(obj, &out_length, &out_buffer); + out->Write(out_buffer, out_length); + return static_cast(out_length); + } -#endif // PARQUET_THRIFT_UTIL_H + private: + template + void SerializeObject(const T* obj) { + try { + mem_buffer_->resetBuffer(); + obj->write(protocol_.get()); + } catch (std::exception& e) { + std::stringstream ss; + ss << "Couldn't serialize thrift: " << e.what() << "\n"; + throw ParquetException(ss.str()); + } + } + + shared_ptr mem_buffer_; + shared_ptr protocol_; +}; + +} // namespace parquet From fba4f32001386b2ed593a69ec6d546a104eb45ba Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Fri, 4 Jan 2019 15:36:41 -0600 Subject: [PATCH 144/328] ARROW-3760: [R] Support Arrow CSV reader MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The main entry point is the `csv_read()` function; all it does is create a `csv::TableReader` with the `csv_table_reader()` generic and then `$Read()` from it. As in #2947 for the feather format, `csv_table_reader` is a generic with methods for: - arrow::io::InputStream: calls the TableReader factory with the other options - character and fs_path: depending on the `mmap` option (TRUE by default) it opens the file with `mmap_open()` or `file_open()` and then calls the other method.
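A sketch of the lower-level path those generics expose, spelling out the option helpers added in this patch (the values shown are just the documented defaults, and the sample data is illustrative only); the one-shot `csv_read()` call is what the reprex below shows:

``` r
library(arrow)

tf <- tempfile()
write.csv(iris, tf, row.names = FALSE)

# Build the three option objects explicitly, then read through the TableReader.
reader <- csv_table_reader(
  tf,
  read_options    = csv_read_options(use_threads = TRUE, block_size = 1048576L),
  parse_options   = csv_parse_options(delimiter = ","),
  convert_options = csv_convert_options(check_utf8 = TRUE)
)
tab <- reader$Read()   # an arrow::Table
as_tibble(tab)
```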
``` r library(arrow) tf <- tempfile() readr::write_csv(iris, tf) tab1 <- csv_read(tf) tab1 #> arrow::Table as_tibble(tab1) #> # A tibble: 150 x 5 #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species #> #> 1 5.1 3.5 1.4 0.2 setosa #> 2 4.9 3 1.4 0.2 setosa #> 3 4.7 3.2 1.3 0.2 setosa #> 4 4.6 3.1 1.5 0.2 setosa #> 5 5 3.6 1.4 0.2 setosa #> 6 5.4 3.9 1.7 0.4 setosa #> 7 4.6 3.4 1.4 0.3 setosa #> 8 5 3.4 1.5 0.2 setosa #> 9 4.4 2.9 1.4 0.2 setosa #> 10 4.9 3.1 1.5 0.1 setosa #> # … with 140 more rows ``` Created on 2018-11-13 by the [reprex package](https://reprex.tidyverse.org) (v0.2.1.9000) Author: Romain Francois Closes #2949 from romainfrancois/ARROW-3760/csv_reader and squashes the following commits: 951e9f58b s/csv_read/read_csv_arrow/ 7770ec54c not using readr:: at this point bb13a76e0 rebase 83b51621a s/file_open/ReadableFile/ 959020c91 No need to special use mmap for file path method 6e740037d going through CharacterVector makes sure this is a character vector 258550143 line breaks for readability 0ab839783 linting 09187e63b Expose arrow::csv::TableReader, functions csv_table_reader() + csv_read() --- r/DESCRIPTION | 1 + r/NAMESPACE | 11 ++ r/R/RcppExports.R | 20 ++++ r/R/csv.R | 182 +++++++++++++++++++++++++++++ r/man/csv_convert_options.Rd | 14 +++ r/man/csv_parse_options.Rd | 33 ++++++ r/man/csv_read_options.Rd | 16 +++ r/man/csv_table_reader.Rd | 24 ++++ r/man/read_csv_arrow.Rd | 14 +++ r/src/RcppExports.cpp | 63 ++++++++++ r/src/arrow_types.h | 1 + r/src/csv.cpp | 76 ++++++++++++ r/tests/testthat/test-arrow-csv-.R | 33 ++++++ 13 files changed, 488 insertions(+) create mode 100644 r/R/csv.R create mode 100644 r/man/csv_convert_options.Rd create mode 100644 r/man/csv_parse_options.Rd create mode 100644 r/man/csv_read_options.Rd create mode 100644 r/man/csv_table_reader.Rd create mode 100644 r/man/read_csv_arrow.Rd create mode 100644 r/src/csv.cpp create mode 100644 r/tests/testthat/test-arrow-csv-.R diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 45e0f83dcbd0a..a2632973134b9 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -55,6 +55,7 @@ Collate: 'array.R' 'buffer.R' 'compute.R' + 'csv.R' 'dictionary.R' 'feather.R' 'io.R' diff --git a/r/NAMESPACE b/r/NAMESPACE index 65d60d846f4cb..8846defbd8e65 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -39,6 +39,11 @@ S3method(buffer,default) S3method(buffer,integer) S3method(buffer,numeric) S3method(buffer,raw) +S3method(csv_table_reader,"arrow::csv::TableReader") +S3method(csv_table_reader,"arrow::io::InputStream") +S3method(csv_table_reader,character) +S3method(csv_table_reader,default) +S3method(csv_table_reader,fs_path) S3method(length,"arrow::Array") S3method(names,"arrow::RecordBatch") S3method(print,"arrow-enum") @@ -92,6 +97,10 @@ export(boolean) export(buffer) export(cast_options) export(chunked_array) +export(csv_convert_options) +export(csv_parse_options) +export(csv_read_options) +export(csv_table_reader) export(date32) export(date64) export(decimal) @@ -111,6 +120,7 @@ export(mmap_open) export(null) export(print.integer64) export(read_arrow) +export(read_csv_arrow) export(read_feather) export(read_message) export(read_record_batch) @@ -141,6 +151,7 @@ importFrom(glue,glue) importFrom(purrr,map) importFrom(purrr,map2) importFrom(purrr,map_int) +importFrom(rlang,abort) importFrom(rlang,dots_n) importFrom(rlang,list2) importFrom(rlang,warn) diff --git a/r/R/RcppExports.R b/r/R/RcppExports.R index 0310eab2027b9..55b9ab33ebf98 100644 --- a/r/R/RcppExports.R +++ b/r/R/RcppExports.R @@ -193,6 +193,26 @@ Table__cast <- 
function(table, schema, options) { .Call(`_arrow_Table__cast`, table, schema, options) } +csv___ReadOptions__initialize <- function(options) { + .Call(`_arrow_csv___ReadOptions__initialize`, options) +} + +csv___ParseOptions__initialize <- function(options) { + .Call(`_arrow_csv___ParseOptions__initialize`, options) +} + +csv___ConvertOptions__initialize <- function(options) { + .Call(`_arrow_csv___ConvertOptions__initialize`, options) +} + +csv___TableReader__Make <- function(input, read_options, parse_options, convert_options) { + .Call(`_arrow_csv___TableReader__Make`, input, read_options, parse_options, convert_options) +} + +csv___TableReader__Read <- function(table_reader) { + .Call(`_arrow_csv___TableReader__Read`, table_reader) +} + shared_ptr_is_null <- function(xp) { .Call(`_arrow_shared_ptr_is_null`, xp) } diff --git a/r/R/csv.R b/r/R/csv.R new file mode 100644 index 0000000000000..bad87559c05e5 --- /dev/null +++ b/r/R/csv.R @@ -0,0 +1,182 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include R6.R + +`arrow::csv::TableReader` <- R6Class("arrow::csv::TableReader", inherit = `arrow::Object`, + public = list( + Read = function() shared_ptr(`arrow::Table`, csv___TableReader__Read(self)) + ) +) + +`arrow::csv::ReadOptions` <- R6Class("arrow::csv::ReadOptions", inherit = `arrow::Object`) +`arrow::csv::ParseOptions` <- R6Class("arrow::csv::ParseOptions", inherit = `arrow::Object`) +`arrow::csv::ConvertOptions` <- R6Class("arrow::csv::ConvertOptions", inherit = `arrow::Object`) + +#' read options for the csv reader +#' +#' @param use_threads Whether to use the global CPU thread pool +#' @param block_size Block size we request from the IO layer; also determines the size of chunks when use_threads is `TRUE` +#' +#' @export +csv_read_options <- function(use_threads = TRUE, block_size = 1048576L) { + shared_ptr(`arrow::csv::ReadOptions`, csv___ReadOptions__initialize( + list( + use_threads = use_threads, + block_size = block_size + ) + )) +} + +#' Parsing options +#' +#' @param delimiter Field delimiter +#' @param quoting Whether quoting is used +#' @param quote_char Quoting character (if `quoting` is `TRUE`) +#' @param double_quote Whether a quote inside a value is double-quoted +#' @param escaping Whether escaping is used +#' @param escape_char Escaping character (if `escaping` is `TRUE`) +#' @param newlines_in_values Whether values are allowed to contain CR (`0x0d``) and LF (`0x0a``) characters +#' @param ignore_empty_lines Whether empty lines are ignored. 
If false, an empty line represents +#' @param header_rows Number of header rows to skip (including the first row containing column names) +#' +#' @export +csv_parse_options <- function( + delimiter = ",", quoting = TRUE, quote_char = '"', + double_quote = TRUE, escaping = FALSE, escape_char = '\\', + newlines_in_values = FALSE, ignore_empty_lines = TRUE, + header_rows = 1L +){ + shared_ptr(`arrow::csv::ParseOptions`, csv___ParseOptions__initialize( + list( + delimiter = delimiter, + quoting = quoting, + quote_char = quote_char, + double_quote = double_quote, + escaping = escaping, + escape_char = escape_char, + newlines_in_values = newlines_in_values, + ignore_empty_lines = ignore_empty_lines, + header_rows = header_rows + ) + )) +} + +#' Conversion Options for the csv reader +#' +#' @param check_utf8 Whether to check UTF8 validity of string columns +#' +#' @export +csv_convert_options <- function(check_utf8 = TRUE){ + shared_ptr(`arrow::csv::ConvertOptions`, csv___ConvertOptions__initialize( + list( + check_utf8 = check_utf8 + ) + )) +} + +#' CSV table reader +#' +#' @param file file +#' @param read_options, see [csv_read_options()] +#' @param parse_options, see [csv_parse_options()] +#' @param convert_options, see [csv_convert_options()] +#' @param ... additional parameters. +#' +#' @export +csv_table_reader <- function(file, + read_options = csv_read_options(), + parse_options = csv_parse_options(), + convert_options = csv_convert_options(), + ... +){ + UseMethod("csv_table_reader") +} + +#' @importFrom rlang abort +#' @export +csv_table_reader.default <- function(file, + read_options = csv_read_options(), + parse_options = csv_parse_options(), + convert_options = csv_convert_options(), + ... +) { + abort("unsupported") +} + +#' @export +`csv_table_reader.character` <- function(file, + read_options = csv_read_options(), + parse_options = csv_parse_options(), + convert_options = csv_convert_options(), + ... +){ + csv_table_reader(fs::path_abs(file), + read_options = read_options, + parse_options = parse_options, + convert_options = convert_options, + ... + ) +} + +#' @export +`csv_table_reader.fs_path` <- function(file, + read_options = csv_read_options(), + parse_options = csv_parse_options(), + convert_options = csv_convert_options(), + ... +){ + csv_table_reader(ReadableFile(file), + read_options = read_options, + parse_options = parse_options, + convert_options = convert_options, + ... + ) +} + +#' @export +`csv_table_reader.arrow::io::InputStream` <- function(file, + read_options = csv_read_options(), + parse_options = csv_parse_options(), + convert_options = csv_convert_options(), + ... +){ + shared_ptr(`arrow::csv::TableReader`, + csv___TableReader__Make(file, read_options, parse_options, convert_options) + ) +} + +#' @export +`csv_table_reader.arrow::csv::TableReader` <- function(file, + read_options = csv_read_options(), + parse_options = csv_parse_options(), + convert_options = csv_convert_options(), + ... +){ + file +} + +#' Read csv file into an arrow::Table +#' +#' Use arrow::csv::TableReader from [csv_table_reader()] +#' +#' @param ... Used to construct an arrow::csv::TableReader +#' @export +read_csv_arrow <- function(...) 
{ + csv_table_reader(...)$Read() +} + diff --git a/r/man/csv_convert_options.Rd b/r/man/csv_convert_options.Rd new file mode 100644 index 0000000000000..323c6e01970ca --- /dev/null +++ b/r/man/csv_convert_options.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/csv.R +\name{csv_convert_options} +\alias{csv_convert_options} +\title{Conversion Options for the csv reader} +\usage{ +csv_convert_options(check_utf8 = TRUE) +} +\arguments{ +\item{check_utf8}{Whether to check UTF8 validity of string columns} +} +\description{ +Conversion Options for the csv reader +} diff --git a/r/man/csv_parse_options.Rd b/r/man/csv_parse_options.Rd new file mode 100644 index 0000000000000..9540771437f75 --- /dev/null +++ b/r/man/csv_parse_options.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/csv.R +\name{csv_parse_options} +\alias{csv_parse_options} +\title{Parsing options} +\usage{ +csv_parse_options(delimiter = ",", quoting = TRUE, + quote_char = "\\"", double_quote = TRUE, escaping = FALSE, + escape_char = "\\\\", newlines_in_values = FALSE, + ignore_empty_lines = TRUE, header_rows = 1L) +} +\arguments{ +\item{delimiter}{Field delimiter} + +\item{quoting}{Whether quoting is used} + +\item{quote_char}{Quoting character (if \code{quoting} is \code{TRUE})} + +\item{double_quote}{Whether a quote inside a value is double-quoted} + +\item{escaping}{Whether escaping is used} + +\item{escape_char}{Escaping character (if \code{escaping} is \code{TRUE})} + +\item{newlines_in_values}{Whether values are allowed to contain CR (\code{0x0d``) and LF (}0x0a``) characters} + +\item{ignore_empty_lines}{Whether empty lines are ignored. If false, an empty line represents} + +\item{header_rows}{Number of header rows to skip (including the first row containing column names)} +} +\description{ +Parsing options +} diff --git a/r/man/csv_read_options.Rd b/r/man/csv_read_options.Rd new file mode 100644 index 0000000000000..3fa2d8ccbf2f2 --- /dev/null +++ b/r/man/csv_read_options.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/csv.R +\name{csv_read_options} +\alias{csv_read_options} +\title{read options for the csv reader} +\usage{ +csv_read_options(use_threads = TRUE, block_size = 1048576L) +} +\arguments{ +\item{use_threads}{Whether to use the global CPU thread pool} + +\item{block_size}{Block size we request from the IO layer; also determines the size of chunks when use_threads is \code{TRUE}} +} +\description{ +read options for the csv reader +} diff --git a/r/man/csv_table_reader.Rd b/r/man/csv_table_reader.Rd new file mode 100644 index 0000000000000..029cd0b5923c2 --- /dev/null +++ b/r/man/csv_table_reader.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/csv.R +\name{csv_table_reader} +\alias{csv_table_reader} +\title{CSV table reader} +\usage{ +csv_table_reader(file, read_options = csv_read_options(), + parse_options = csv_parse_options(), + convert_options = csv_convert_options(), ...) 
+} +\arguments{ +\item{file}{file} + +\item{read_options, }{see \code{\link[=csv_read_options]{csv_read_options()}}} + +\item{parse_options, }{see \code{\link[=csv_parse_options]{csv_parse_options()}}} + +\item{convert_options, }{see \code{\link[=csv_convert_options]{csv_convert_options()}}} + +\item{...}{additional parameters.} +} +\description{ +CSV table reader +} diff --git a/r/man/read_csv_arrow.Rd b/r/man/read_csv_arrow.Rd new file mode 100644 index 0000000000000..4cdca91246b5b --- /dev/null +++ b/r/man/read_csv_arrow.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/csv.R +\name{read_csv_arrow} +\alias{read_csv_arrow} +\title{Read csv file into an arrow::Table} +\usage{ +read_csv_arrow(...) +} +\arguments{ +\item{...}{Used to construct an arrow::csv::TableReader} +} +\description{ +Use arrow::csv::TableReader from \code{\link[=csv_table_reader]{csv_table_reader()}} +} diff --git a/r/src/RcppExports.cpp b/r/src/RcppExports.cpp index e5a784eb70c23..c752afba1c258 100644 --- a/r/src/RcppExports.cpp +++ b/r/src/RcppExports.cpp @@ -558,6 +558,64 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// csv___ReadOptions__initialize +std::shared_ptr csv___ReadOptions__initialize(List_ options); +RcppExport SEXP _arrow_csv___ReadOptions__initialize(SEXP optionsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< List_ >::type options(optionsSEXP); + rcpp_result_gen = Rcpp::wrap(csv___ReadOptions__initialize(options)); + return rcpp_result_gen; +END_RCPP +} +// csv___ParseOptions__initialize +std::shared_ptr csv___ParseOptions__initialize(List_ options); +RcppExport SEXP _arrow_csv___ParseOptions__initialize(SEXP optionsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< List_ >::type options(optionsSEXP); + rcpp_result_gen = Rcpp::wrap(csv___ParseOptions__initialize(options)); + return rcpp_result_gen; +END_RCPP +} +// csv___ConvertOptions__initialize +std::shared_ptr csv___ConvertOptions__initialize(List_ options); +RcppExport SEXP _arrow_csv___ConvertOptions__initialize(SEXP optionsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< List_ >::type options(optionsSEXP); + rcpp_result_gen = Rcpp::wrap(csv___ConvertOptions__initialize(options)); + return rcpp_result_gen; +END_RCPP +} +// csv___TableReader__Make +std::shared_ptr csv___TableReader__Make(const std::shared_ptr& input, const std::shared_ptr& read_options, const std::shared_ptr& parse_options, const std::shared_ptr& convert_options); +RcppExport SEXP _arrow_csv___TableReader__Make(SEXP inputSEXP, SEXP read_optionsSEXP, SEXP parse_optionsSEXP, SEXP convert_optionsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::shared_ptr& >::type input(inputSEXP); + Rcpp::traits::input_parameter< const std::shared_ptr& >::type read_options(read_optionsSEXP); + Rcpp::traits::input_parameter< const std::shared_ptr& >::type parse_options(parse_optionsSEXP); + Rcpp::traits::input_parameter< const std::shared_ptr& >::type convert_options(convert_optionsSEXP); + rcpp_result_gen = Rcpp::wrap(csv___TableReader__Make(input, read_options, parse_options, convert_options)); + return rcpp_result_gen; +END_RCPP +} +// csv___TableReader__Read +std::shared_ptr csv___TableReader__Read(const std::shared_ptr& table_reader); +RcppExport 
SEXP _arrow_csv___TableReader__Read(SEXP table_readerSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::shared_ptr& >::type table_reader(table_readerSEXP); + rcpp_result_gen = Rcpp::wrap(csv___TableReader__Read(table_reader)); + return rcpp_result_gen; +END_RCPP +} // shared_ptr_is_null bool shared_ptr_is_null(SEXP xp); RcppExport SEXP _arrow_shared_ptr_is_null(SEXP xpSEXP) { @@ -2200,6 +2258,11 @@ static const R_CallMethodDef CallEntries[] = { {"_arrow_ChunkedArray__cast", (DL_FUNC) &_arrow_ChunkedArray__cast, 3}, {"_arrow_RecordBatch__cast", (DL_FUNC) &_arrow_RecordBatch__cast, 3}, {"_arrow_Table__cast", (DL_FUNC) &_arrow_Table__cast, 3}, + {"_arrow_csv___ReadOptions__initialize", (DL_FUNC) &_arrow_csv___ReadOptions__initialize, 1}, + {"_arrow_csv___ParseOptions__initialize", (DL_FUNC) &_arrow_csv___ParseOptions__initialize, 1}, + {"_arrow_csv___ConvertOptions__initialize", (DL_FUNC) &_arrow_csv___ConvertOptions__initialize, 1}, + {"_arrow_csv___TableReader__Make", (DL_FUNC) &_arrow_csv___TableReader__Make, 4}, + {"_arrow_csv___TableReader__Read", (DL_FUNC) &_arrow_csv___TableReader__Read, 1}, {"_arrow_shared_ptr_is_null", (DL_FUNC) &_arrow_shared_ptr_is_null, 1}, {"_arrow_unique_ptr_is_null", (DL_FUNC) &_arrow_unique_ptr_is_null, 1}, {"_arrow_Int8__initialize", (DL_FUNC) &_arrow_Int8__initialize, 0}, diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index dba7a91c21e33..6fef7997dbfa7 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -22,6 +22,7 @@ #undef Free #include #include +#include #include #include #include diff --git a/r/src/csv.cpp b/r/src/csv.cpp new file mode 100644 index 0000000000000..0e1d09fb65e8b --- /dev/null +++ b/r/src/csv.cpp @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow_types.h" + +using namespace Rcpp; + +// [[Rcpp::export]] +std::shared_ptr csv___ReadOptions__initialize(List_ options) { + auto res = + std::make_shared(arrow::csv::ReadOptions::Defaults()); + res->use_threads = options["use_threads"]; + res->block_size = options["block_size"]; + return res; +} + +inline char get_char(CharacterVector x) { return CHAR(STRING_ELT(x, 0))[0]; } + +// [[Rcpp::export]] +std::shared_ptr csv___ParseOptions__initialize(List_ options) { + auto res = + std::make_shared(arrow::csv::ParseOptions::Defaults()); + res->delimiter = get_char(options["delimiter"]); + res->quoting = options["quoting"]; + res->quote_char = get_char(options["quote_char"]); + res->double_quote = options["double_quote"]; + res->escape_char = get_char(options["escape_char"]); + res->newlines_in_values = options["newlines_in_values"]; + res->header_rows = options["header_rows"]; + res->ignore_empty_lines = options["ignore_empty_lines"]; + return res; +} + +// [[Rcpp::export]] +std::shared_ptr csv___ConvertOptions__initialize( + List_ options) { + auto res = std::make_shared( + arrow::csv::ConvertOptions::Defaults()); + res->check_utf8 = options["check_utf8"]; + return res; +} + +// [[Rcpp::export]] +std::shared_ptr csv___TableReader__Make( + const std::shared_ptr& input, + const std::shared_ptr& read_options, + const std::shared_ptr& parse_options, + const std::shared_ptr& convert_options) { + std::shared_ptr table_reader; + STOP_IF_NOT_OK(arrow::csv::TableReader::Make(arrow::default_memory_pool(), input, + *read_options, *parse_options, + *convert_options, &table_reader)); + return table_reader; +} + +// [[Rcpp::export]] +std::shared_ptr csv___TableReader__Read( + const std::shared_ptr& table_reader) { + std::shared_ptr table; + STOP_IF_NOT_OK(table_reader->Read(&table)); + return table; +} diff --git a/r/tests/testthat/test-arrow-csv-.R b/r/tests/testthat/test-arrow-csv-.R new file mode 100644 index 0000000000000..2afd0622821ae --- /dev/null +++ b/r/tests/testthat/test-arrow-csv-.R @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +context("arrow::csv::TableReader") + +test_that("Can read csv file", { + tf <- local_tempfile() + write.csv(iris, tf, row.names = FALSE, quote = FALSE) + + tab1 <- read_csv_arrow(tf) + tab2 <- read_csv_arrow(mmap_open(tf)) + tab3 <- read_csv_arrow(ReadableFile(tf)) + + iris$Species <- as.character(iris$Species) + tab0 <- table(iris) + expect_equal(tab0, tab1) + expect_equal(tab0, tab2) + expect_equal(tab0, tab3) +}) From fa37ea335546c12768939db8f1974696edeb2b2b Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Sat, 5 Jan 2019 06:15:43 +0100 Subject: [PATCH 145/328] =?UTF-8?q?[Documentation]=C2=A0Fix=20syntax=20err?= =?UTF-8?q?or=20in=20building.rst=20(#3313)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/source/building.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/source/building.rst b/docs/source/building.rst index dfa857498cf80..2a0e9fb6bf9c7 100644 --- a/docs/source/building.rst +++ b/docs/source/building.rst @@ -87,6 +87,4 @@ You can use Docker to build the documentation: docker-compose build docs docker-compose run docs -The final output is located under [#]_:: - - docs/_build/html +The final output is located under ``docs/_build/html``. From a4f4808e274e46ce71b08188071d3e2db230c82e Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Sat, 5 Jan 2019 18:55:26 +0900 Subject: [PATCH 146/328] ARROW-4153: [GLib] Add builder_append_value() for consistency Because we use builder_append_values() for multiple values. builder_append() is deprecated. Author: Kouhei Sutou Closes #3304 from kou/glib-builder-append-value and squashes the following commits: e93c0419 Add builder_append_value() for consistency --- c_glib/arrow-cuda-glib/cuda.cpp | 2 +- c_glib/arrow-glib/array-builder.cpp | 626 ++++++++++++++++++++++++-- c_glib/arrow-glib/array-builder.h | 160 +++++++ c_glib/arrow-glib/codec.cpp | 2 +- c_glib/arrow-glib/orc-file-reader.cpp | 3 +- c_glib/example/build.c | 6 +- c_glib/test/helper/buildable.rb | 8 +- c_glib/test/test-array.rb | 18 +- c_glib/test/test-binary-array.rb | 10 +- c_glib/test/test-boolean-array.rb | 14 +- c_glib/test/test-date32-array.rb | 14 +- c_glib/test/test-date64-array.rb | 14 +- c_glib/test/test-decimal-array.rb | 4 +- c_glib/test/test-double-array.rb | 14 +- c_glib/test/test-float-array.rb | 14 +- c_glib/test/test-int16-array.rb | 14 +- c_glib/test/test-int32-array.rb | 14 +- c_glib/test/test-int64-array.rb | 14 +- c_glib/test/test-int8-array.rb | 14 +- c_glib/test/test-list-array.rb | 14 +- c_glib/test/test-string-array.rb | 6 +- c_glib/test/test-struct-array.rb | 12 +- c_glib/test/test-uint16-array.rb | 14 +- c_glib/test/test-uint32-array.rb | 14 +- c_glib/test/test-uint64-array.rb | 14 +- c_glib/test/test-uint8-array.rb | 14 +- 26 files changed, 873 insertions(+), 180 deletions(-) diff --git a/c_glib/arrow-cuda-glib/cuda.cpp b/c_glib/arrow-cuda-glib/cuda.cpp index 3f82f8fa806cb..9679cc0ff7fd8 100644 --- a/c_glib/arrow-cuda-glib/cuda.cpp +++ b/c_glib/arrow-cuda-glib/cuda.cpp @@ -648,7 +648,7 @@ garrow_cuda_ipc_memory_handle_new(const guint8 *data, * * Returns: (transfer full): A newly created #GArrowBuffer on success, * %NULL on error. The buffer has serialized @handle. The serialized - * @handle can be deserialized by garrow_gpu_cuda_ipc_memory_handle_new() + * @handle can be deserialized by garrow_cuda_ipc_memory_handle_new() * in other process. 
* * Since: 0.8.0 diff --git a/c_glib/arrow-glib/array-builder.cpp b/c_glib/arrow-glib/array-builder.cpp index a5c75790de939..4b61bfaf7fab9 100644 --- a/c_glib/arrow-glib/array-builder.cpp +++ b/c_glib/arrow-glib/array-builder.cpp @@ -29,10 +29,10 @@ template gboolean -garrow_array_builder_append(GArrowArrayBuilder *builder, - VALUE value, - GError **error, - const gchar *context) +garrow_array_builder_append_value(GArrowArrayBuilder *builder, + VALUE value, + GError **error, + const gchar *context) { auto arrow_builder = static_cast(garrow_array_builder_get_raw(builder)); @@ -446,17 +446,38 @@ garrow_boolean_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_boolean_array_builder_append_value() instead. */ gboolean garrow_boolean_array_builder_append(GArrowBooleanArrayBuilder *builder, gboolean value, GError **error) { - return garrow_array_builder_append + return garrow_boolean_array_builder_append_value(builder, value, error); +} + +/** + * garrow_boolean_array_builder_append_value: + * @builder: A #GArrowBooleanArrayBuilder. + * @value: A boolean value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_boolean_array_builder_append_value(GArrowBooleanArrayBuilder *builder, + gboolean value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), static_cast(value), error, - "[boolean-array-builder][append]"); + "[boolean-array-builder][append-value]"); } /** @@ -583,17 +604,38 @@ garrow_int_array_builder_new(void) * Returns: %TRUE on success, %FALSE if there was an error. * * Since: 0.6.0 + * + * Deprecated: 0.12.0: + * Use garrow_int_array_builder_append_value() instead. */ gboolean garrow_int_array_builder_append(GArrowIntArrayBuilder *builder, gint64 value, GError **error) { - return garrow_array_builder_append + return garrow_int_array_builder_append_value(builder, value, error); +} + +/** + * garrow_int_array_builder_append_value: + * @builder: A #GArrowIntArrayBuilder. + * @value: A int value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_int_array_builder_append_value(GArrowIntArrayBuilder *builder, + gint64 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[int-array-builder][append]"); + "[int-array-builder][append-value]"); } /** @@ -718,17 +760,38 @@ garrow_uint_array_builder_new(void) * Returns: %TRUE on success, %FALSE if there was an error. * * Since: 0.8.0 + * + * Deprecated: 0.12.0: + * Use garrow_uint_array_builder_append_value() instead. */ gboolean garrow_uint_array_builder_append(GArrowUIntArrayBuilder *builder, guint64 value, GError **error) { - return garrow_array_builder_append + return garrow_uint_array_builder_append_value(builder, value, error); +} + +/** + * garrow_uint_array_builder_append_value: + * @builder: A #GArrowUIntArrayBuilder. + * @value: A unsigned int value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. 
+ * + * Since: 0.12.0 + */ +gboolean +garrow_uint_array_builder_append_value(GArrowUIntArrayBuilder *builder, + guint64 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[uint-array-builder][append]"); + "[uint-array-builder][append-value]"); } /** @@ -848,17 +911,38 @@ garrow_int8_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_int8_array_builder_append_value() instead. */ gboolean garrow_int8_array_builder_append(GArrowInt8ArrayBuilder *builder, gint8 value, GError **error) { - return garrow_array_builder_append + return garrow_int8_array_builder_append_value(builder, value, error); +} + +/** + * garrow_int8_array_builder_append_value: + * @builder: A #GArrowInt8ArrayBuilder. + * @value: A int8 value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_int8_array_builder_append_value(GArrowInt8ArrayBuilder *builder, + gint8 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[int8-array-builder][append]"); + "[int8-array-builder][append-value]"); } /** @@ -976,17 +1060,38 @@ garrow_uint8_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_uint8_array_builder_append_value() instead. */ gboolean garrow_uint8_array_builder_append(GArrowUInt8ArrayBuilder *builder, guint8 value, GError **error) { - return garrow_array_builder_append + return garrow_uint8_array_builder_append_value(builder, value, error); +} + +/** + * garrow_uint8_array_builder_append_value: + * @builder: A #GArrowUInt8ArrayBuilder. + * @value: An uint8 value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_uint8_array_builder_append_value(GArrowUInt8ArrayBuilder *builder, + guint8 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[uint8-array-builder][append]"); + "[uint8-array-builder][append-value]"); } /** @@ -1104,17 +1209,38 @@ garrow_int16_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_int16_array_builder_append_value() instead. */ gboolean garrow_int16_array_builder_append(GArrowInt16ArrayBuilder *builder, gint16 value, GError **error) { - return garrow_array_builder_append + return garrow_int16_array_builder_append_value(builder, value, error); +} + +/** + * garrow_int16_array_builder_append_value: + * @builder: A #GArrowInt16ArrayBuilder. + * @value: A int16 value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. 
+ * + * Since: 0.12.0 + */ +gboolean +garrow_int16_array_builder_append_value(GArrowInt16ArrayBuilder *builder, + gint16 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[int16-array-builder][append]"); + "[int16-array-builder][append-value]"); } /** @@ -1232,17 +1358,38 @@ garrow_uint16_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_uint16_array_builder_append_value() instead. */ gboolean garrow_uint16_array_builder_append(GArrowUInt16ArrayBuilder *builder, guint16 value, GError **error) { - return garrow_array_builder_append + return garrow_uint16_array_builder_append_value(builder, value, error); +} + +/** + * garrow_uint16_array_builder_append_value: + * @builder: A #GArrowUInt16ArrayBuilder. + * @value: An uint16 value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_uint16_array_builder_append_value(GArrowUInt16ArrayBuilder *builder, + guint16 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[uint16-array-builder][append]"); + "[uint16-array-builder][append-value]"); } /** @@ -1360,17 +1507,38 @@ garrow_int32_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_int32_array_builder_append_value() instead. */ gboolean garrow_int32_array_builder_append(GArrowInt32ArrayBuilder *builder, gint32 value, GError **error) { - return garrow_array_builder_append + return garrow_int32_array_builder_append_value(builder, value, error); +} + +/** + * garrow_int32_array_builder_append_value: + * @builder: A #GArrowInt32ArrayBuilder. + * @value: A int32 value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_int32_array_builder_append_value(GArrowInt32ArrayBuilder *builder, + gint32 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[int32-array-builder][append]"); + "[int32-array-builder][append-value]"); } /** @@ -1488,17 +1656,38 @@ garrow_uint32_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_uint32_array_builder_append_value() instead. */ gboolean garrow_uint32_array_builder_append(GArrowUInt32ArrayBuilder *builder, guint32 value, GError **error) { - return garrow_array_builder_append + return garrow_uint32_array_builder_append_value(builder, value, error); +} + +/** + * garrow_uint32_array_builder_append_value: + * @builder: A #GArrowUInt32ArrayBuilder. + * @value: An uint32 value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. 
+ * + * Since: 0.12.0 + */ +gboolean +garrow_uint32_array_builder_append_value(GArrowUInt32ArrayBuilder *builder, + guint32 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[uint32-array-builder][append]"); + "[uint32-array-builder][append-value]"); } /** @@ -1616,17 +1805,38 @@ garrow_int64_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_int64_array_builder_append_value() instead. */ gboolean garrow_int64_array_builder_append(GArrowInt64ArrayBuilder *builder, gint64 value, GError **error) { - return garrow_array_builder_append + return garrow_int64_array_builder_append_value(builder, value, error); +} + +/** + * garrow_int64_array_builder_append_value: + * @builder: A #GArrowInt64ArrayBuilder. + * @value: A int64 value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_int64_array_builder_append_value(GArrowInt64ArrayBuilder *builder, + gint64 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[int64-array-builder][append]"); + "[int64-array-builder][append-value]"); } /** @@ -1744,17 +1954,38 @@ garrow_uint64_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_uint64_array_builder_append_value() instead. */ gboolean garrow_uint64_array_builder_append(GArrowUInt64ArrayBuilder *builder, guint64 value, GError **error) { - return garrow_array_builder_append + return garrow_uint64_array_builder_append_value(builder, value, error); +} + +/** + * garrow_uint64_array_builder_append_value: + * @builder: A #GArrowUInt64ArrayBuilder. + * @value: An uint64 value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_uint64_array_builder_append_value(GArrowUInt64ArrayBuilder *builder, + guint64 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[uint64-array-builder][append]"); + "[uint64-array-builder][append-value]"); } /** @@ -1872,17 +2103,38 @@ garrow_float_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_float_array_builder_append_value() instead. */ gboolean garrow_float_array_builder_append(GArrowFloatArrayBuilder *builder, gfloat value, GError **error) { - return garrow_array_builder_append + return garrow_float_array_builder_append_value(builder, value, error); +} + +/** + * garrow_float_array_builder_append_value: + * @builder: A #GArrowFloatArrayBuilder. + * @value: A float value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. 
+ * + * Since: 0.12.0 + */ +gboolean +garrow_float_array_builder_append_value(GArrowFloatArrayBuilder *builder, + gfloat value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[float-array-builder][append]"); + "[float-array-builder][append-value]"); } /** @@ -2000,17 +2252,38 @@ garrow_double_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_double_array_builder_append_value() instead. */ gboolean garrow_double_array_builder_append(GArrowDoubleArrayBuilder *builder, gdouble value, GError **error) { - return garrow_array_builder_append + return garrow_double_array_builder_append_value(builder, value, error); +} + +/** + * garrow_double_array_builder_append_value: + * @builder: A #GArrowDoubleArrayBuilder. + * @value: A double value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_double_array_builder_append_value(GArrowDoubleArrayBuilder *builder, + gdouble value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[double-array-builder][append]"); + "[double-array-builder][append-value]"); } /** @@ -2129,19 +2402,44 @@ garrow_binary_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_binary_array_builder_append_value() instead. */ gboolean garrow_binary_array_builder_append(GArrowBinaryArrayBuilder *builder, const guint8 *value, gint32 length, GError **error) +{ + return garrow_binary_array_builder_append_value(builder, value, length, error); +} + +/** + * garrow_binary_array_builder_append_value: + * @builder: A #GArrowBinaryArrayBuilder. + * @value: (array length=length): A binary value. + * @length: A value length. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_binary_array_builder_append_value(GArrowBinaryArrayBuilder *builder, + const guint8 *value, + gint32 length, + GError **error) { auto arrow_builder = static_cast( garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); auto status = arrow_builder->Append(value, length); - return garrow_error_check(error, status, "[binary-array-builder][append]"); + return garrow_error_check(error, + status, + "[binary-array-builder][append-value]"); } /** @@ -2197,11 +2495,32 @@ garrow_string_array_builder_new(void) * @error: (nullable): Return location for a #GError or %NULL. * * Returns: %TRUE on success, %FALSE if there was an error. + * + * Deprecated: 0.12.0: + * Use garrow_string_array_builder_append_value() instead. */ gboolean garrow_string_array_builder_append(GArrowStringArrayBuilder *builder, const gchar *value, GError **error) +{ + return garrow_string_array_builder_append_value(builder, value, error); +} + +/** + * garrow_string_array_builder_append_value: + * @builder: A #GArrowStringArrayBuilder. + * @value: A string value. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. 
+ * + * Since: 0.12.0 + */ +gboolean +garrow_string_array_builder_append_value(GArrowStringArrayBuilder *builder, + const gchar *value, + GError **error) { auto arrow_builder = static_cast( @@ -2209,7 +2528,9 @@ garrow_string_array_builder_append(GArrowStringArrayBuilder *builder, auto status = arrow_builder->Append(value, static_cast(strlen(value))); - return garrow_error_check(error, status, "[string-array-builder][append]"); + return garrow_error_check(error, + status, + "[string-array-builder][append-value]"); } /** @@ -2290,17 +2611,38 @@ garrow_date32_array_builder_new(void) * Returns: %TRUE on success, %FALSE if there was an error. * * Since: 0.7.0 + * + * Deprecated: 0.12.0: + * Use garrow_date32_array_builder_append_value() instead. */ gboolean garrow_date32_array_builder_append(GArrowDate32ArrayBuilder *builder, gint32 value, GError **error) { - return garrow_array_builder_append + return garrow_date32_array_builder_append_value(builder, value, error); +} + +/** + * garrow_date32_array_builder_append_value: + * @builder: A #GArrowDate32ArrayBuilder. + * @value: The number of days since UNIX epoch in signed 32bit integer. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_date32_array_builder_append_value(GArrowDate32ArrayBuilder *builder, + gint32 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[date32-array-builder][append]"); + "[date32-array-builder][append-value]"); } /** @@ -2425,17 +2767,38 @@ garrow_date64_array_builder_new(void) * Returns: %TRUE on success, %FALSE if there was an error. * * Since: 0.7.0 + * + * Deprecated: 0.12.0: + * Use garrow_date64_array_builder_append_value() instead. */ gboolean garrow_date64_array_builder_append(GArrowDate64ArrayBuilder *builder, gint64 value, GError **error) { - return garrow_array_builder_append + return garrow_date64_array_builder_append_value(builder, value, error); +} + +/** + * garrow_date64_array_builder_append_value: + * @builder: A #GArrowDate64ArrayBuilder. + * @value: The number of milliseconds since UNIX epoch in signed 64bit integer. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_date64_array_builder_append_value(GArrowDate64ArrayBuilder *builder, + gint64 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[date64-array-builder][append]"); + "[date64-array-builder][append-value]"); } /** @@ -2562,17 +2925,38 @@ garrow_timestamp_array_builder_new(GArrowTimestampDataType *data_type) * Returns: %TRUE on success, %FALSE if there was an error. * * Since: 0.7.0 + * + * Deprecated: 0.12.0: + * Use garrow_timestamp_array_builder_append_value() instead. */ gboolean garrow_timestamp_array_builder_append(GArrowTimestampArrayBuilder *builder, gint64 value, GError **error) { - return garrow_array_builder_append + return garrow_timestamp_array_builder_append_value(builder, value, error); +} + +/** + * garrow_timestamp_array_builder_append_value: + * @builder: A #GArrowTimestampArrayBuilder. + * @value: The number of milliseconds since UNIX epoch in signed 64bit integer. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. 
+ * + * Since: 0.12.0 + */ +gboolean +garrow_timestamp_array_builder_append_value(GArrowTimestampArrayBuilder *builder, + gint64 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[timestamp-array-builder][append]"); + "[timestamp-array-builder][append-value]"); } /** @@ -2699,17 +3083,38 @@ garrow_time32_array_builder_new(GArrowTime32DataType *data_type) * Returns: %TRUE on success, %FALSE if there was an error. * * Since: 0.7.0 + * + * Deprecated: 0.12.0: + * Use garrow_time32_array_builder_append_value() instead. */ gboolean garrow_time32_array_builder_append(GArrowTime32ArrayBuilder *builder, gint32 value, GError **error) { - return garrow_array_builder_append + return garrow_time32_array_builder_append_value(builder, value, error); +} + +/** + * garrow_time32_array_builder_append_value: + * @builder: A #GArrowTime32ArrayBuilder. + * @value: The number of days since UNIX epoch in signed 32bit integer. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_time32_array_builder_append_value(GArrowTime32ArrayBuilder *builder, + gint32 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[time32-array-builder][append]"); + "[time32-array-builder][append-value]"); } /** @@ -2836,17 +3241,38 @@ garrow_time64_array_builder_new(GArrowTime64DataType *data_type) * Returns: %TRUE on success, %FALSE if there was an error. * * Since: 0.7.0 + * + * Deprecated: 0.12.0: + * Use garrow_time64_array_builder_append_value() instead. */ gboolean garrow_time64_array_builder_append(GArrowTime64ArrayBuilder *builder, gint64 value, GError **error) { - return garrow_array_builder_append + return garrow_time64_array_builder_append_value(builder, value, error); +} + +/** + * garrow_time64_array_builder_append_value: + * @builder: A #GArrowTime64ArrayBuilder. + * @value: The number of milliseconds since UNIX epoch in signed 64bit integer. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_time64_array_builder_append_value(GArrowTime64ArrayBuilder *builder, + gint64 value, + GError **error) +{ + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), value, error, - "[time64-array-builder][append]"); + "[time64-array-builder][append-value]"); } /** @@ -3047,17 +3473,72 @@ garrow_list_array_builder_new(GArrowListDataType *data_type, * g_object_unref(array); * } * ]| + * + * Deprecated: 0.12.0: + * Use garrow_list_array_builder_append_value() instead. */ gboolean garrow_list_array_builder_append(GArrowListArrayBuilder *builder, GError **error) +{ + return garrow_list_array_builder_append_value(builder, error); +} + +/** + * garrow_list_array_builder_append_value: + * @builder: A #GArrowListArrayBuilder. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * It appends a new list element. To append a new list element, you + * need to call this function then append list element values to + * `value_builder`. `value_builder` is the #GArrowArrayBuilder + * specified to constructor. You can get `value_builder` by + * garrow_list_array_builder_get_value_builder(). 
+ * + * |[ + * GArrowInt8ArrayBuilder *value_builder; + * GArrowListArrayBuilder *builder; + * + * value_builder = garrow_int8_array_builder_new(); + * builder = garrow_list_array_builder_new(value_builder, NULL); + * + * // Start 0th list element: [1, 0, -1] + * garrow_list_array_builder_append(builder, NULL); + * garrow_int8_array_builder_append(value_builder, 1); + * garrow_int8_array_builder_append(value_builder, 0); + * garrow_int8_array_builder_append(value_builder, -1); + * + * // Start 1st list element: [-29, 29] + * garrow_list_array_builder_append(builder, NULL); + * garrow_int8_array_builder_append(value_builder, -29); + * garrow_int8_array_builder_append(value_builder, 29); + * + * { + * // [[1, 0, -1], [-29, 29]] + * GArrowArray *array = garrow_array_builder_finish(builder); + * // Now, builder is needless. + * g_object_unref(builder); + * g_object_unref(value_builder); + * + * // Use array... + * g_object_unref(array); + * } + * ]| + * + * Since: 0.12.0 + */ +gboolean +garrow_list_array_builder_append_value(GArrowListArrayBuilder *builder, + GError **error) { auto arrow_builder = static_cast( garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); auto status = arrow_builder->Append(); - return garrow_error_check(error, status, "[list-array-builder][append]"); + return garrow_error_check(error, status, "[list-array-builder][append-value]"); } /** @@ -3195,17 +3676,49 @@ garrow_struct_array_builder_new(GArrowStructDataType *data_type, * |[ * // TODO * ]| + * + * Deprecated: 0.12.0: + * Use garrow_struct_array_builder_append_value() instead. */ gboolean garrow_struct_array_builder_append(GArrowStructArrayBuilder *builder, GError **error) +{ + return garrow_struct_array_builder_append_value(builder, error); +} + +/** + * garrow_struct_array_builder_append_value: + * @builder: A #GArrowStructArrayBuilder. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * It appends a new struct element. To append a new struct element, + * you need to call this function then append struct element field + * values to all `field_builder`s. `field_value`s are the + * #GArrowArrayBuilder specified to constructor. You can get + * `field_builder` by garrow_struct_array_builder_get_field_builder() + * or garrow_struct_array_builder_get_field_builders(). + * + * |[ + * // TODO + * ]| + * + * Since: 0.12.0 + */ +gboolean +garrow_struct_array_builder_append_value(GArrowStructArrayBuilder *builder, + GError **error) { auto arrow_builder = static_cast( garrow_array_builder_get_raw(GARROW_ARRAY_BUILDER(builder))); auto status = arrow_builder->Append(); - return garrow_error_check(error, status, "[struct-array-builder][append]"); + return garrow_error_check(error, + status, + "[struct-array-builder][append-value]"); } /** @@ -3315,18 +3828,39 @@ garrow_decimal128_array_builder_new(GArrowDecimalDataType *data_type) * Returns: %TRUE on success, %FALSE if there was an error. * * Since: 0.10.0 + * + * Deprecated: 0.12.0: + * Use garrow_decimal128_array_builder_append_value() instead. */ gboolean garrow_decimal128_array_builder_append(GArrowDecimal128ArrayBuilder *builder, GArrowDecimal128 *value, GError **error) +{ + return garrow_decimal128_array_builder_append_value(builder, value, error); +} + +/** + * garrow_decimal128_array_builder_append_value: + * @builder: A #GArrowDecimal128ArrayBuilder. + * @value: A decimal value. + * @error: (nullable): Return location for a #GError or %NULL. 
+ * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.12.0 + */ +gboolean +garrow_decimal128_array_builder_append_value(GArrowDecimal128ArrayBuilder *builder, + GArrowDecimal128 *value, + GError **error) { auto arrow_decimal = garrow_decimal128_get_raw(value); - return garrow_array_builder_append + return garrow_array_builder_append_value (GARROW_ARRAY_BUILDER(builder), *arrow_decimal, error, - "[decimal128-array-builder][append]"); + "[decimal128-array-builder][append-value]"); } G_END_DECLS diff --git a/c_glib/arrow-glib/array-builder.h b/c_glib/arrow-glib/array-builder.h index db340b70ab77c..1ddc0266f4993 100644 --- a/c_glib/arrow-glib/array-builder.h +++ b/c_glib/arrow-glib/array-builder.h @@ -90,9 +90,16 @@ GType garrow_boolean_array_builder_get_type(void) G_GNUC_CONST; GArrowBooleanArrayBuilder *garrow_boolean_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_boolean_array_builder_append_value) gboolean garrow_boolean_array_builder_append(GArrowBooleanArrayBuilder *builder, gboolean value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_boolean_array_builder_append_value(GArrowBooleanArrayBuilder *builder, + gboolean value, + GError **error); gboolean garrow_boolean_array_builder_append_values(GArrowBooleanArrayBuilder *builder, const gboolean *values, gint64 values_length, @@ -150,9 +157,16 @@ GType garrow_int_array_builder_get_type(void) G_GNUC_CONST; GArrowIntArrayBuilder *garrow_int_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_int_array_builder_append_value) gboolean garrow_int_array_builder_append(GArrowIntArrayBuilder *builder, gint64 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_int_array_builder_append_value(GArrowIntArrayBuilder *builder, + gint64 value, + GError **error); gboolean garrow_int_array_builder_append_values(GArrowIntArrayBuilder *builder, const gint64 *values, gint64 values_length, @@ -179,9 +193,16 @@ struct _GArrowUIntArrayBuilderClass GArrowUIntArrayBuilder *garrow_uint_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_uint_array_builder_append_value) gboolean garrow_uint_array_builder_append(GArrowUIntArrayBuilder *builder, guint64 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_uint_array_builder_append_value(GArrowUIntArrayBuilder *builder, + guint64 value, + GError **error); gboolean garrow_uint_array_builder_append_values(GArrowUIntArrayBuilder *builder, const guint64 *values, gint64 values_length, @@ -239,9 +260,16 @@ GType garrow_int8_array_builder_get_type(void) G_GNUC_CONST; GArrowInt8ArrayBuilder *garrow_int8_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_int8_array_builder_append_value) gboolean garrow_int8_array_builder_append(GArrowInt8ArrayBuilder *builder, gint8 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_int8_array_builder_append_value(GArrowInt8ArrayBuilder *builder, + gint8 value, + GError **error); gboolean garrow_int8_array_builder_append_values(GArrowInt8ArrayBuilder *builder, const gint8 *values, gint64 values_length, @@ -299,9 +327,16 @@ GType garrow_uint8_array_builder_get_type(void) G_GNUC_CONST; GArrowUInt8ArrayBuilder *garrow_uint8_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_uint8_array_builder_append_value) gboolean 
garrow_uint8_array_builder_append(GArrowUInt8ArrayBuilder *builder, guint8 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_uint8_array_builder_append_value(GArrowUInt8ArrayBuilder *builder, + guint8 value, + GError **error); gboolean garrow_uint8_array_builder_append_values(GArrowUInt8ArrayBuilder *builder, const guint8 *values, gint64 values_length, @@ -359,9 +394,16 @@ GType garrow_int16_array_builder_get_type(void) G_GNUC_CONST; GArrowInt16ArrayBuilder *garrow_int16_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_int16_array_builder_append_value) gboolean garrow_int16_array_builder_append(GArrowInt16ArrayBuilder *builder, gint16 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_int16_array_builder_append_value(GArrowInt16ArrayBuilder *builder, + gint16 value, + GError **error); gboolean garrow_int16_array_builder_append_values(GArrowInt16ArrayBuilder *builder, const gint16 *values, gint64 values_length, @@ -419,9 +461,16 @@ GType garrow_uint16_array_builder_get_type(void) G_GNUC_CONST; GArrowUInt16ArrayBuilder *garrow_uint16_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_uint16_array_builder_append_value) gboolean garrow_uint16_array_builder_append(GArrowUInt16ArrayBuilder *builder, guint16 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_uint16_array_builder_append_value(GArrowUInt16ArrayBuilder *builder, + guint16 value, + GError **error); gboolean garrow_uint16_array_builder_append_values(GArrowUInt16ArrayBuilder *builder, const guint16 *values, gint64 values_length, @@ -479,9 +528,16 @@ GType garrow_int32_array_builder_get_type(void) G_GNUC_CONST; GArrowInt32ArrayBuilder *garrow_int32_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_int32_array_builder_append_value) gboolean garrow_int32_array_builder_append(GArrowInt32ArrayBuilder *builder, gint32 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_int32_array_builder_append_value(GArrowInt32ArrayBuilder *builder, + gint32 value, + GError **error); gboolean garrow_int32_array_builder_append_values(GArrowInt32ArrayBuilder *builder, const gint32 *values, gint64 values_length, @@ -539,9 +595,16 @@ GType garrow_uint32_array_builder_get_type(void) G_GNUC_CONST; GArrowUInt32ArrayBuilder *garrow_uint32_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_uint32_array_builder_append_value) gboolean garrow_uint32_array_builder_append(GArrowUInt32ArrayBuilder *builder, guint32 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_uint32_array_builder_append_value(GArrowUInt32ArrayBuilder *builder, + guint32 value, + GError **error); gboolean garrow_uint32_array_builder_append_values(GArrowUInt32ArrayBuilder *builder, const guint32 *values, gint64 values_length, @@ -599,9 +662,16 @@ GType garrow_int64_array_builder_get_type(void) G_GNUC_CONST; GArrowInt64ArrayBuilder *garrow_int64_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_int64_array_builder_append_value) gboolean garrow_int64_array_builder_append(GArrowInt64ArrayBuilder *builder, gint64 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_int64_array_builder_append_value(GArrowInt64ArrayBuilder *builder, + gint64 value, + GError **error); gboolean 
garrow_int64_array_builder_append_values(GArrowInt64ArrayBuilder *builder, const gint64 *values, gint64 values_length, @@ -659,9 +729,16 @@ GType garrow_uint64_array_builder_get_type(void) G_GNUC_CONST; GArrowUInt64ArrayBuilder *garrow_uint64_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_uint64_array_builder_append_value) gboolean garrow_uint64_array_builder_append(GArrowUInt64ArrayBuilder *builder, guint64 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_uint64_array_builder_append_value(GArrowUInt64ArrayBuilder *builder, + guint64 value, + GError **error); gboolean garrow_uint64_array_builder_append_values(GArrowUInt64ArrayBuilder *builder, const guint64 *values, gint64 values_length, @@ -719,9 +796,16 @@ GType garrow_float_array_builder_get_type(void) G_GNUC_CONST; GArrowFloatArrayBuilder *garrow_float_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_float_array_builder_append_value) gboolean garrow_float_array_builder_append(GArrowFloatArrayBuilder *builder, gfloat value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_float_array_builder_append_value(GArrowFloatArrayBuilder *builder, + gfloat value, + GError **error); gboolean garrow_float_array_builder_append_values(GArrowFloatArrayBuilder *builder, const gfloat *values, gint64 values_length, @@ -779,9 +863,16 @@ GType garrow_double_array_builder_get_type(void) G_GNUC_CONST; GArrowDoubleArrayBuilder *garrow_double_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_double_array_builder_append_value) gboolean garrow_double_array_builder_append(GArrowDoubleArrayBuilder *builder, gdouble value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_double_array_builder_append_value(GArrowDoubleArrayBuilder *builder, + gdouble value, + GError **error); gboolean garrow_double_array_builder_append_values(GArrowDoubleArrayBuilder *builder, const gdouble *values, gint64 values_length, @@ -839,10 +930,18 @@ GType garrow_binary_array_builder_get_type(void) G_GNUC_CONST; GArrowBinaryArrayBuilder *garrow_binary_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_binary_array_builder_append_value) gboolean garrow_binary_array_builder_append(GArrowBinaryArrayBuilder *builder, const guint8 *value, gint32 length, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_binary_array_builder_append_value(GArrowBinaryArrayBuilder *builder, + const guint8 *value, + gint32 length, + GError **error); gboolean garrow_binary_array_builder_append_null(GArrowBinaryArrayBuilder *builder, GError **error); @@ -891,9 +990,16 @@ GType garrow_string_array_builder_get_type(void) G_GNUC_CONST; GArrowStringArrayBuilder *garrow_string_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_string_array_builder_append_value) gboolean garrow_string_array_builder_append(GArrowStringArrayBuilder *builder, const gchar *value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_string_array_builder_append_value(GArrowStringArrayBuilder *builder, + const gchar *value, + GError **error); gboolean garrow_string_array_builder_append_values(GArrowStringArrayBuilder *builder, const gchar **values, gint64 values_length, @@ -946,9 +1052,16 @@ GType garrow_date32_array_builder_get_type(void) G_GNUC_CONST; GArrowDate32ArrayBuilder *garrow_date32_array_builder_new(void); 
+#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_date32_array_builder_append_value) gboolean garrow_date32_array_builder_append(GArrowDate32ArrayBuilder *builder, gint32 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_date32_array_builder_append_value(GArrowDate32ArrayBuilder *builder, + gint32 value, + GError **error); gboolean garrow_date32_array_builder_append_values(GArrowDate32ArrayBuilder *builder, const gint32 *values, gint64 values_length, @@ -1006,9 +1119,16 @@ GType garrow_date64_array_builder_get_type(void) G_GNUC_CONST; GArrowDate64ArrayBuilder *garrow_date64_array_builder_new(void); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_date64_array_builder_append_value) gboolean garrow_date64_array_builder_append(GArrowDate64ArrayBuilder *builder, gint64 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_date64_array_builder_append_value(GArrowDate64ArrayBuilder *builder, + gint64 value, + GError **error); gboolean garrow_date64_array_builder_append_values(GArrowDate64ArrayBuilder *builder, const gint64 *values, gint64 values_length, @@ -1067,9 +1187,16 @@ GType garrow_timestamp_array_builder_get_type(void) G_GNUC_CONST; GArrowTimestampArrayBuilder * garrow_timestamp_array_builder_new(GArrowTimestampDataType *data_type); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_timestamp_array_builder_append_value) gboolean garrow_timestamp_array_builder_append(GArrowTimestampArrayBuilder *builder, gint64 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_timestamp_array_builder_append_value(GArrowTimestampArrayBuilder *builder, + gint64 value, + GError **error); gboolean garrow_timestamp_array_builder_append_values(GArrowTimestampArrayBuilder *builder, const gint64 *values, gint64 values_length, @@ -1127,9 +1254,16 @@ GType garrow_time32_array_builder_get_type(void) G_GNUC_CONST; GArrowTime32ArrayBuilder *garrow_time32_array_builder_new(GArrowTime32DataType *data_type); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_time32_array_builder_append_value) gboolean garrow_time32_array_builder_append(GArrowTime32ArrayBuilder *builder, gint32 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_time32_array_builder_append_value(GArrowTime32ArrayBuilder *builder, + gint32 value, + GError **error); gboolean garrow_time32_array_builder_append_values(GArrowTime32ArrayBuilder *builder, const gint32 *values, gint64 values_length, @@ -1187,9 +1321,16 @@ GType garrow_time64_array_builder_get_type(void) G_GNUC_CONST; GArrowTime64ArrayBuilder *garrow_time64_array_builder_new(GArrowTime64DataType *data_type); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_time64_array_builder_append_value) gboolean garrow_time64_array_builder_append(GArrowTime64ArrayBuilder *builder, gint64 value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_time64_array_builder_append_value(GArrowTime64ArrayBuilder *builder, + gint64 value, + GError **error); gboolean garrow_time64_array_builder_append_values(GArrowTime64ArrayBuilder *builder, const gint64 *values, gint64 values_length, @@ -1248,8 +1389,14 @@ GType garrow_list_array_builder_get_type(void) G_GNUC_CONST; GArrowListArrayBuilder *garrow_list_array_builder_new(GArrowListDataType *data_type, GError **error); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_list_array_builder_append_value) gboolean 
garrow_list_array_builder_append(GArrowListArrayBuilder *builder, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_list_array_builder_append_value(GArrowListArrayBuilder *builder, + GError **error); gboolean garrow_list_array_builder_append_null(GArrowListArrayBuilder *builder, GError **error); @@ -1301,8 +1448,14 @@ GType garrow_struct_array_builder_get_type(void) G_GNUC_CONST; GArrowStructArrayBuilder *garrow_struct_array_builder_new(GArrowStructDataType *data_type, GError **error); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_struct_array_builder_append_value) gboolean garrow_struct_array_builder_append(GArrowStructArrayBuilder *builder, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_struct_array_builder_append_value(GArrowStructArrayBuilder *builder, + GError **error); gboolean garrow_struct_array_builder_append_null(GArrowStructArrayBuilder *builder, GError **error); @@ -1324,8 +1477,15 @@ struct _GArrowDecimal128ArrayBuilderClass GArrowDecimal128ArrayBuilder *garrow_decimal128_array_builder_new(GArrowDecimalDataType *data_type); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_decimal128_array_builder_append_value) gboolean garrow_decimal128_array_builder_append(GArrowDecimal128ArrayBuilder *builder, GArrowDecimal128 *value, GError **error); +#endif +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_decimal128_array_builder_append_value(GArrowDecimal128ArrayBuilder *builder, + GArrowDecimal128 *value, + GError **error); G_END_DECLS diff --git a/c_glib/arrow-glib/codec.cpp b/c_glib/arrow-glib/codec.cpp index 45863878e9c7e..7f06fabde74e8 100644 --- a/c_glib/arrow-glib/codec.cpp +++ b/c_glib/arrow-glib/codec.cpp @@ -119,7 +119,7 @@ garrow_codec_class_init(GArrowCodecClass *klass) /** * garrow_codec_new: - * @type: A #GArrowCodompressionType. + * @type: A #GArrowCompressionType. * @error: (nullable): Return location for a #GError or %NULL. * * Returns: A newly created #GArrowCodec on success, %NULL on error. diff --git a/c_glib/arrow-glib/orc-file-reader.cpp b/c_glib/arrow-glib/orc-file-reader.cpp index bde3cfc8fa04f..31905a2f9fea1 100644 --- a/c_glib/arrow-glib/orc-file-reader.cpp +++ b/c_glib/arrow-glib/orc-file-reader.cpp @@ -199,8 +199,7 @@ garrow_orc_file_reader_new(GArrowSeekableInputStream *input, * Since: 0.10.0 * * Deprecated: 0.12.0: - * Use garrow_orc_file_reader_set_field_indices() instead. - * + * Use garrow_orc_file_reader_set_field_indices() instead. 
*/ void garrow_orc_file_reader_set_field_indexes(GArrowORCFileReader *reader, diff --git a/c_glib/example/build.c b/c_glib/example/build.c index 8c6cf74d74815..9b2d58d2b2bba 100644 --- a/c_glib/example/build.c +++ b/c_glib/example/build.c @@ -33,13 +33,13 @@ main(int argc, char **argv) builder = garrow_int32_array_builder_new(); if (success) { - success = garrow_int32_array_builder_append(builder, 29, &error); + success = garrow_int32_array_builder_append_value(builder, 29, &error); } if (success) { - success = garrow_int32_array_builder_append(builder, 2929, &error); + success = garrow_int32_array_builder_append_value(builder, 2929, &error); } if (success) { - success = garrow_int32_array_builder_append(builder, 292929, &error); + success = garrow_int32_array_builder_append_value(builder, 292929, &error); } if (!success) { g_print("failed to append: %s\n", error->message); diff --git a/c_glib/test/helper/buildable.rb b/c_glib/test/helper/buildable.rb index d6d1ff89b6a3e..f3ae709512eeb 100644 --- a/c_glib/test/helper/buildable.rb +++ b/c_glib/test/helper/buildable.rb @@ -135,20 +135,20 @@ def append_to_builder(builder, value) data_type = builder.value_data_type case data_type when Arrow::ListDataType - builder.append + builder.append_value value_builder = builder.value_builder value.each do |v| append_to_builder(value_builder, v) end when Arrow::StructDataType - builder.append + builder.append_value value.each do |name, v| field_index = data_type.get_field_index(name) field_builder = builder.get_field_builder(field_index) append_to_builder(field_builder, v) end else - builder.append(value) + builder.append_value(value) end end end @@ -179,7 +179,7 @@ def build_array(builder, values) if value.nil? builder.append_null else - builder.append(value) + builder.append_value(value) end end builder.finish diff --git a/c_glib/test/test-array.rb b/c_glib/test/test-array.rb index 12fba7346c36f..3befde3c7a9bb 100644 --- a/c_glib/test/test-array.rb +++ b/c_glib/test/test-array.rb @@ -42,7 +42,7 @@ def test_equal_range def test_is_null builder = Arrow::BooleanArrayBuilder.new builder.append_null - builder.append(true) + builder.append_value(true) array = builder.finish assert_equal([true, false], array.length.times.collect {|i| array.null?(i)}) @@ -51,7 +51,7 @@ def test_is_null def test_is_valid builder = Arrow::BooleanArrayBuilder.new builder.append_null - builder.append(true) + builder.append_value(true) array = builder.finish assert_equal([false, true], array.length.times.collect {|i| array.valid?(i)}) @@ -59,7 +59,7 @@ def test_is_valid def test_length builder = Arrow::BooleanArrayBuilder.new - builder.append(true) + builder.append_value(true) array = builder.finish assert_equal(1, array.length) end @@ -75,10 +75,10 @@ def test_n_nulls def test_null_bitmap builder = Arrow::BooleanArrayBuilder.new builder.append_null - builder.append(true) - builder.append(false) + builder.append_value(true) + builder.append_value(false) builder.append_null - builder.append(false) + builder.append_value(false) array = builder.finish assert_equal(0b10110, array.null_bitmap.data.to_s.unpack("c*")[0]) end @@ -97,9 +97,9 @@ def test_value_type def test_slice builder = Arrow::BooleanArrayBuilder.new - builder.append(true) - builder.append(false) - builder.append(true) + builder.append_value(true) + builder.append_value(false) + builder.append_value(true) array = builder.finish sub_array = array.slice(1, 2) assert_equal([false, true], diff --git a/c_glib/test/test-binary-array.rb b/c_glib/test/test-binary-array.rb index 
2dfd9cfbaaf14..0dcaf4eef60c5 100644 --- a/c_glib/test/test-binary-array.rb +++ b/c_glib/test/test-binary-array.rb @@ -32,7 +32,7 @@ def test_new def test_value data = "\x00\x01\x02" builder = Arrow::BinaryArrayBuilder.new - builder.append(data) + builder.append_value(data) array = builder.finish assert_equal(data, array.get_value(0).to_s) end @@ -41,8 +41,8 @@ def test_buffer data1 = "\x00\x01\x02" data2 = "\x03\x04\x05" builder = Arrow::BinaryArrayBuilder.new - builder.append(data1) - builder.append(data2) + builder.append_value(data1) + builder.append_value(data2) array = builder.finish assert_equal(data1 + data2, array.buffer.data.to_s) end @@ -51,8 +51,8 @@ def test_offsets_buffer data1 = "\x00\x01" data2 = "\x02\x03\x04" builder = Arrow::BinaryArrayBuilder.new - builder.append(data1) - builder.append(data2) + builder.append_value(data1) + builder.append_value(data2) array = builder.finish byte_per_offset = 4 assert_equal([0, 2, 5].pack("l*"), diff --git a/c_glib/test/test-boolean-array.rb b/c_glib/test/test-boolean-array.rb index ac07ec995ea32..e8c7e5efe2fc5 100644 --- a/c_glib/test/test-boolean-array.rb +++ b/c_glib/test/test-boolean-array.rb @@ -29,16 +29,16 @@ def test_new def test_buffer builder = Arrow::BooleanArrayBuilder.new - builder.append(true) - builder.append(false) - builder.append(true) + builder.append_value(true) + builder.append_value(false) + builder.append_value(true) array = builder.finish assert_equal([0b101].pack("C*"), array.buffer.data.to_s) end def test_value builder = Arrow::BooleanArrayBuilder.new - builder.append(true) + builder.append_value(true) array = builder.finish assert_equal(true, array.get_value(0)) end @@ -46,9 +46,9 @@ def test_value def test_values require_gi_bindings(3, 3, 1) builder = Arrow::BooleanArrayBuilder.new - builder.append(true) - builder.append(false) - builder.append(true) + builder.append_value(true) + builder.append_value(false) + builder.append_value(true) array = builder.finish assert_equal([true, false, true], array.values) end diff --git a/c_glib/test/test-date32-array.rb b/c_glib/test/test-date32-array.rb index f1425693f381e..09ef78650bd59 100644 --- a/c_glib/test/test-date32-array.rb +++ b/c_glib/test/test-date32-array.rb @@ -34,9 +34,9 @@ def test_buffer after_epoch = 17406 # 2017-08-28 builder = Arrow::Date32ArrayBuilder.new - builder.append(0) - builder.append(after_epoch) - builder.append(before_epoch) + builder.append_value(0) + builder.append_value(after_epoch) + builder.append_value(before_epoch) array = builder.finish assert_equal([0, after_epoch, before_epoch].pack("l*"), array.buffer.data.to_s) @@ -46,7 +46,7 @@ def test_value after_epoch = 17406 # 2017-08-28 builder = Arrow::Date32ArrayBuilder.new - builder.append(after_epoch) + builder.append_value(after_epoch) array = builder.finish assert_equal(after_epoch, array.get_value(0)) end @@ -56,9 +56,9 @@ def test_values after_epoch = 17406 # 2017-08-28 builder = Arrow::Date32ArrayBuilder.new - builder.append(0) - builder.append(after_epoch) - builder.append(before_epoch) + builder.append_value(0) + builder.append_value(after_epoch) + builder.append_value(before_epoch) array = builder.finish assert_equal([0, after_epoch, before_epoch], array.values) end diff --git a/c_glib/test/test-date64-array.rb b/c_glib/test/test-date64-array.rb index 1ea9f5a6a0545..4d9f189196fc8 100644 --- a/c_glib/test/test-date64-array.rb +++ b/c_glib/test/test-date64-array.rb @@ -34,9 +34,9 @@ def test_buffer after_epoch = 1503878400000 # 2017-08-28T00:00:00Z builder = 
Arrow::Date64ArrayBuilder.new - builder.append(0) - builder.append(after_epoch) - builder.append(before_epoch) + builder.append_value(0) + builder.append_value(after_epoch) + builder.append_value(before_epoch) array = builder.finish assert_equal([0, after_epoch, before_epoch].pack("q*"), array.buffer.data.to_s) @@ -46,7 +46,7 @@ def test_value after_epoch = 1503878400000 # 2017-08-28T00:00:00Z builder = Arrow::Date64ArrayBuilder.new - builder.append(after_epoch) + builder.append_value(after_epoch) array = builder.finish assert_equal(after_epoch, array.get_value(0)) end @@ -56,9 +56,9 @@ def test_values after_epoch = 1503878400000 # 2017-08-28T00:00:00Z builder = Arrow::Date64ArrayBuilder.new - builder.append(0) - builder.append(after_epoch) - builder.append(before_epoch) + builder.append_value(0) + builder.append_value(after_epoch) + builder.append_value(before_epoch) array = builder.finish assert_equal([0, after_epoch, before_epoch], array.values) end diff --git a/c_glib/test/test-decimal-array.rb b/c_glib/test/test-decimal-array.rb index a65e10037659a..a5eb28253d95f 100644 --- a/c_glib/test/test-decimal-array.rb +++ b/c_glib/test/test-decimal-array.rb @@ -20,7 +20,7 @@ def test_format_value data_type = Arrow::DecimalDataType.new(8,2) builder = Arrow::Decimal128ArrayBuilder.new(data_type) decimal = Arrow::Decimal128.new("23423445") - builder.append(decimal) + builder.append_value(decimal) array = builder.finish assert_equal("234234.45", array.format_value(0)) end @@ -29,7 +29,7 @@ def test_value data_type = Arrow::DecimalDataType.new(8,2) builder = Arrow::Decimal128ArrayBuilder.new(data_type) decimal = Arrow::Decimal128.new("23423445") - builder.append(decimal) + builder.append_value(decimal) array = builder.finish assert_equal("234234.45", array.get_value(0).to_string_scale(array.value_data_type.scale)) diff --git a/c_glib/test/test-double-array.rb b/c_glib/test/test-double-array.rb index 1213a5dfe53d6..020ed8f079960 100644 --- a/c_glib/test/test-double-array.rb +++ b/c_glib/test/test-double-array.rb @@ -29,16 +29,16 @@ def test_new def test_buffer builder = Arrow::DoubleArrayBuilder.new - builder.append(-1.1) - builder.append(2.2) - builder.append(-4.4) + builder.append_value(-1.1) + builder.append_value(2.2) + builder.append_value(-4.4) array = builder.finish assert_equal([-1.1, 2.2, -4.4].pack("d*"), array.buffer.data.to_s) end def test_value builder = Arrow::DoubleArrayBuilder.new - builder.append(1.5) + builder.append_value(1.5) array = builder.finish assert_in_delta(1.5, array.get_value(0)) end @@ -46,9 +46,9 @@ def test_value def test_values require_gi_bindings(3, 1, 7) builder = Arrow::DoubleArrayBuilder.new - builder.append(1.5) - builder.append(3) - builder.append(4.5) + builder.append_value(1.5) + builder.append_value(3) + builder.append_value(4.5) array = builder.finish assert_equal([1.5, 3.0, 4.5], array.values) end diff --git a/c_glib/test/test-float-array.rb b/c_glib/test/test-float-array.rb index c8e1b4d864c08..c2a71a0dd39db 100644 --- a/c_glib/test/test-float-array.rb +++ b/c_glib/test/test-float-array.rb @@ -29,16 +29,16 @@ def test_new def test_buffer builder = Arrow::FloatArrayBuilder.new - builder.append(-1.1) - builder.append(2.2) - builder.append(-4.4) + builder.append_value(-1.1) + builder.append_value(2.2) + builder.append_value(-4.4) array = builder.finish assert_equal([-1.1, 2.2, -4.4].pack("f*"), array.buffer.data.to_s) end def test_value builder = Arrow::FloatArrayBuilder.new - builder.append(1.5) + builder.append_value(1.5) array = builder.finish 
assert_in_delta(1.5, array.get_value(0)) end @@ -46,9 +46,9 @@ def test_value def test_values require_gi_bindings(3, 1, 7) builder = Arrow::FloatArrayBuilder.new - builder.append(1.5) - builder.append(3) - builder.append(4.5) + builder.append_value(1.5) + builder.append_value(3) + builder.append_value(4.5) array = builder.finish assert_equal([1.5, 3.0, 4.5], array.values) end diff --git a/c_glib/test/test-int16-array.rb b/c_glib/test/test-int16-array.rb index 13646e0d5b818..e0efb68019b24 100644 --- a/c_glib/test/test-int16-array.rb +++ b/c_glib/test/test-int16-array.rb @@ -29,16 +29,16 @@ def test_new def test_buffer builder = Arrow::Int16ArrayBuilder.new - builder.append(-1) - builder.append(2) - builder.append(-4) + builder.append_value(-1) + builder.append_value(2) + builder.append_value(-4) array = builder.finish assert_equal([-1, 2, -4].pack("s*"), array.buffer.data.to_s) end def test_value builder = Arrow::Int16ArrayBuilder.new - builder.append(-1) + builder.append_value(-1) array = builder.finish assert_equal(-1, array.get_value(0)) end @@ -46,9 +46,9 @@ def test_value def test_values require_gi_bindings(3, 1, 7) builder = Arrow::Int16ArrayBuilder.new - builder.append(-1) - builder.append(2) - builder.append(-4) + builder.append_value(-1) + builder.append_value(2) + builder.append_value(-4) array = builder.finish assert_equal([-1, 2, -4], array.values) end diff --git a/c_glib/test/test-int32-array.rb b/c_glib/test/test-int32-array.rb index d1579a8eba881..9827e532bf154 100644 --- a/c_glib/test/test-int32-array.rb +++ b/c_glib/test/test-int32-array.rb @@ -28,25 +28,25 @@ def test_new def test_buffer builder = Arrow::Int32ArrayBuilder.new - builder.append(-1) - builder.append(2) - builder.append(-4) + builder.append_value(-1) + builder.append_value(2) + builder.append_value(-4) array = builder.finish assert_equal([-1, 2, -4].pack("l*"), array.buffer.data.to_s) end def test_value builder = Arrow::Int32ArrayBuilder.new - builder.append(-1) + builder.append_value(-1) array = builder.finish assert_equal(-1, array.get_value(0)) end def test_values builder = Arrow::Int32ArrayBuilder.new - builder.append(-1) - builder.append(2) - builder.append(-4) + builder.append_value(-1) + builder.append_value(2) + builder.append_value(-4) array = builder.finish assert_equal([-1, 2, -4], array.values) end diff --git a/c_glib/test/test-int64-array.rb b/c_glib/test/test-int64-array.rb index 5d9c37a55c084..39a74d34e23fa 100644 --- a/c_glib/test/test-int64-array.rb +++ b/c_glib/test/test-int64-array.rb @@ -28,25 +28,25 @@ def test_new def test_buffer builder = Arrow::Int64ArrayBuilder.new - builder.append(-1) - builder.append(2) - builder.append(-4) + builder.append_value(-1) + builder.append_value(2) + builder.append_value(-4) array = builder.finish assert_equal([-1, 2, -4].pack("q*"), array.buffer.data.to_s) end def test_value builder = Arrow::Int64ArrayBuilder.new - builder.append(-1) + builder.append_value(-1) array = builder.finish assert_equal(-1, array.get_value(0)) end def test_values builder = Arrow::Int64ArrayBuilder.new - builder.append(-1) - builder.append(2) - builder.append(-4) + builder.append_value(-1) + builder.append_value(2) + builder.append_value(-4) array = builder.finish assert_equal([-1, 2, -4], array.values) end diff --git a/c_glib/test/test-int8-array.rb b/c_glib/test/test-int8-array.rb index e17c10c53611e..46fe591a575c2 100644 --- a/c_glib/test/test-int8-array.rb +++ b/c_glib/test/test-int8-array.rb @@ -28,25 +28,25 @@ def test_new def test_buffer builder = 
Arrow::Int8ArrayBuilder.new - builder.append(-1) - builder.append(2) - builder.append(-4) + builder.append_value(-1) + builder.append_value(2) + builder.append_value(-4) array = builder.finish assert_equal([-1, 2, -4].pack("c*"), array.buffer.data.to_s) end def test_value builder = Arrow::Int8ArrayBuilder.new - builder.append(-1) + builder.append_value(-1) array = builder.finish assert_equal(-1, array.get_value(0)) end def test_values builder = Arrow::Int8ArrayBuilder.new - builder.append(-1) - builder.append(2) - builder.append(-4) + builder.append_value(-1) + builder.append_value(2) + builder.append_value(-4) array = builder.finish assert_equal([-1, 2, -4], array.values) end diff --git a/c_glib/test/test-list-array.rb b/c_glib/test/test-list-array.rb index 14f84067ac525..271d32236acbd 100644 --- a/c_glib/test/test-list-array.rb +++ b/c_glib/test/test-list-array.rb @@ -38,14 +38,14 @@ def test_value builder = Arrow::ListArrayBuilder.new(data_type) value_builder = builder.value_builder - builder.append - value_builder.append(-29) - value_builder.append(29) + builder.append_value + value_builder.append_value(-29) + value_builder.append_value(29) - builder.append - value_builder.append(-1) - value_builder.append(0) - value_builder.append(1) + builder.append_value + value_builder.append_value(-1) + value_builder.append_value(0) + value_builder.append_value(1) array = builder.finish value = array.get_value(1) diff --git a/c_glib/test/test-string-array.rb b/c_glib/test/test-string-array.rb index a9edb0ae49152..61459edbb8059 100644 --- a/c_glib/test/test-string-array.rb +++ b/c_glib/test/test-string-array.rb @@ -31,15 +31,15 @@ def test_new def test_value builder = Arrow::StringArrayBuilder.new - builder.append("Hello") + builder.append_value("Hello") array = builder.finish assert_equal("Hello", array.get_string(0)) end def test_buffer builder = Arrow::StringArrayBuilder.new - builder.append("Hello") - builder.append("World") + builder.append_value("Hello") + builder.append_value("World") array = builder.finish assert_equal("HelloWorld", array.buffer.data.to_s) end diff --git a/c_glib/test/test-struct-array.rb b/c_glib/test/test-struct-array.rb index 78760a9b30984..af7e299d8b7ce 100644 --- a/c_glib/test/test-struct-array.rb +++ b/c_glib/test/test-struct-array.rb @@ -58,13 +58,13 @@ def test_flatten data_type = Arrow::StructDataType.new(fields) builder = Arrow::StructArrayBuilder.new(data_type) - builder.append - builder.get_field_builder(0).append(-29) - builder.get_field_builder(1).append(true) + builder.append_value + builder.get_field_builder(0).append_value(-29) + builder.get_field_builder(1).append_value(true) - builder.append - builder.field_builders[0].append(2) - builder.field_builders[1].append(false) + builder.append_value + builder.field_builders[0].append_value(2) + builder.field_builders[1].append_value(false) array = builder.finish values = array.length.times.collect do |i| diff --git a/c_glib/test/test-uint16-array.rb b/c_glib/test/test-uint16-array.rb index 1362c8e7ff507..baa6934e4f4e2 100644 --- a/c_glib/test/test-uint16-array.rb +++ b/c_glib/test/test-uint16-array.rb @@ -29,16 +29,16 @@ def test_new def test_buffer builder = Arrow::UInt16ArrayBuilder.new - builder.append(1) - builder.append(2) - builder.append(4) + builder.append_value(1) + builder.append_value(2) + builder.append_value(4) array = builder.finish assert_equal([1, 2, 4].pack("S*"), array.buffer.data.to_s) end def test_value builder = Arrow::UInt16ArrayBuilder.new - builder.append(1) + builder.append_value(1) 
array = builder.finish assert_equal(1, array.get_value(0)) end @@ -46,9 +46,9 @@ def test_value def test_values require_gi_bindings(3, 1, 7) builder = Arrow::UInt16ArrayBuilder.new - builder.append(1) - builder.append(2) - builder.append(4) + builder.append_value(1) + builder.append_value(2) + builder.append_value(4) array = builder.finish assert_equal([1, 2, 4], array.values) end diff --git a/c_glib/test/test-uint32-array.rb b/c_glib/test/test-uint32-array.rb index 01b3edb353ff2..b9efb4cf00403 100644 --- a/c_glib/test/test-uint32-array.rb +++ b/c_glib/test/test-uint32-array.rb @@ -29,16 +29,16 @@ def test_new def test_buffer builder = Arrow::UInt32ArrayBuilder.new - builder.append(1) - builder.append(2) - builder.append(4) + builder.append_value(1) + builder.append_value(2) + builder.append_value(4) array = builder.finish assert_equal([1, 2, 4].pack("L*"), array.buffer.data.to_s) end def test_value builder = Arrow::UInt32ArrayBuilder.new - builder.append(1) + builder.append_value(1) array = builder.finish assert_equal(1, array.get_value(0)) end @@ -46,9 +46,9 @@ def test_value def test_values require_gi_bindings(3, 1, 7) builder = Arrow::UInt32ArrayBuilder.new - builder.append(1) - builder.append(2) - builder.append(4) + builder.append_value(1) + builder.append_value(2) + builder.append_value(4) array = builder.finish assert_equal([1, 2, 4], array.values) end diff --git a/c_glib/test/test-uint64-array.rb b/c_glib/test/test-uint64-array.rb index a002af269293c..b4275cefdd9b8 100644 --- a/c_glib/test/test-uint64-array.rb +++ b/c_glib/test/test-uint64-array.rb @@ -29,16 +29,16 @@ def test_new def test_buffer builder = Arrow::UInt64ArrayBuilder.new - builder.append(1) - builder.append(2) - builder.append(4) + builder.append_value(1) + builder.append_value(2) + builder.append_value(4) array = builder.finish assert_equal([1, 2, 4].pack("Q*"), array.buffer.data.to_s) end def test_value builder = Arrow::UInt64ArrayBuilder.new - builder.append(1) + builder.append_value(1) array = builder.finish assert_equal(1, array.get_value(0)) end @@ -46,9 +46,9 @@ def test_value def test_values require_gi_bindings(3, 1, 7) builder = Arrow::UInt64ArrayBuilder.new - builder.append(1) - builder.append(2) - builder.append(4) + builder.append_value(1) + builder.append_value(2) + builder.append_value(4) array = builder.finish assert_equal([1, 2, 4], array.values) end diff --git a/c_glib/test/test-uint8-array.rb b/c_glib/test/test-uint8-array.rb index 9137e53be70e5..08dfb3064cccb 100644 --- a/c_glib/test/test-uint8-array.rb +++ b/c_glib/test/test-uint8-array.rb @@ -28,25 +28,25 @@ def test_new def test_buffer builder = Arrow::UInt8ArrayBuilder.new - builder.append(1) - builder.append(2) - builder.append(4) + builder.append_value(1) + builder.append_value(2) + builder.append_value(4) array = builder.finish assert_equal([1, 2, 4].pack("C*"), array.buffer.data.to_s) end def test_value builder = Arrow::UInt8ArrayBuilder.new - builder.append(1) + builder.append_value(1) array = builder.finish assert_equal(1, array.get_value(0)) end def test_values builder = Arrow::UInt8ArrayBuilder.new - builder.append(1) - builder.append(2) - builder.append(4) + builder.append_value(1) + builder.append_value(2) + builder.append_value(4) array = builder.finish assert_equal([1, 2, 4], array.values) end From 857deae933478970b4fc0ff55fab61f32a5c6e4f Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Sat, 5 Jan 2019 20:23:22 +0900 Subject: [PATCH 147/328] ARROW-4154: [GLib] Add GArrowDecimal128DataType garrow_decimal_data_type_new() is 
deprecated. Author: Kouhei Sutou Closes #3305 from kou/glib-decimal128-data-type and squashes the following commits: b51b7a19 Use decimal128 4823eea6 Add GArrowDecimal128DataType --- c_glib/arrow-glib/Makefile.am | 6 +- c_glib/arrow-glib/array-builder.cpp | 6 +- c_glib/arrow-glib/array-builder.h | 5 +- c_glib/arrow-glib/basic-array.cpp | 4 +- c_glib/arrow-glib/basic-data-type.cpp | 61 +++++++++++++++---- c_glib/arrow-glib/basic-data-type.h | 28 +++++++-- .../{decimal.cpp => decimal128.cpp} | 2 +- c_glib/arrow-glib/{decimal.h => decimal128.h} | 0 .../{decimal.hpp => decimal128.hpp} | 2 +- c_glib/arrow-glib/meson.build | 6 +- c_glib/arrow-glib/orc-file-reader.h | 2 + c_glib/doc/arrow-glib/arrow-glib-docs.xml | 2 +- ...imal-array.rb => test-decimal128-array.rb} | 6 +- ...a-type.rb => test-decimal128-data-type.rb} | 10 +-- .../{test-decimal.rb => test-decimal128.rb} | 0 15 files changed, 99 insertions(+), 41 deletions(-) rename c_glib/arrow-glib/{decimal.cpp => decimal128.cpp} (99%) rename c_glib/arrow-glib/{decimal.h => decimal128.h} (100%) rename c_glib/arrow-glib/{decimal.hpp => decimal128.hpp} (96%) rename c_glib/test/{test-decimal-array.rb => test-decimal128-array.rb} (89%) rename c_glib/test/{test-decimal-data-type.rb => test-decimal128-data-type.rb} (80%) rename c_glib/test/{test-decimal.rb => test-decimal128.rb} (100%) diff --git a/c_glib/arrow-glib/Makefile.am b/c_glib/arrow-glib/Makefile.am index bf97168eb81d7..a296595571438 100644 --- a/c_glib/arrow-glib/Makefile.am +++ b/c_glib/arrow-glib/Makefile.am @@ -59,7 +59,7 @@ libarrow_glib_la_headers = \ composite-array.h \ composite-data-type.h \ data-type.h \ - decimal.h \ + decimal128.h \ error.h \ field.h \ gobject-type.h \ @@ -110,7 +110,7 @@ libarrow_glib_la_sources = \ column.cpp \ composite-array.cpp \ composite-data-type.cpp \ - decimal.cpp \ + decimal128.cpp \ error.cpp \ field.cpp \ record-batch.cpp \ @@ -155,7 +155,7 @@ libarrow_glib_la_cpp_headers = \ codec.hpp \ column.hpp \ data-type.hpp \ - decimal.hpp \ + decimal128.hpp \ error.hpp \ field.hpp \ record-batch.hpp \ diff --git a/c_glib/arrow-glib/array-builder.cpp b/c_glib/arrow-glib/array-builder.cpp index 4b61bfaf7fab9..5f2d4119ce6a2 100644 --- a/c_glib/arrow-glib/array-builder.cpp +++ b/c_glib/arrow-glib/array-builder.cpp @@ -23,9 +23,9 @@ #include #include +#include #include #include -#include template gboolean @@ -3803,14 +3803,14 @@ garrow_decimal128_array_builder_class_init(GArrowDecimal128ArrayBuilderClass *kl /** * garrow_decimal128_array_builder_new: - * @data_type: #GArrowDecimalDataType for the decimal. + * @data_type: #GArrowDecimal128DataType for the decimal. * * Returns: A newly created #GArrowDecimal128ArrayBuilder. 
* * Since: 0.10.0 */ GArrowDecimal128ArrayBuilder * -garrow_decimal128_array_builder_new(GArrowDecimalDataType *data_type) +garrow_decimal128_array_builder_new(GArrowDecimal128DataType *data_type) { auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); auto builder = garrow_array_builder_new(arrow_data_type, diff --git a/c_glib/arrow-glib/array-builder.h b/c_glib/arrow-glib/array-builder.h index 1ddc0266f4993..b2ad6f4bfd3fd 100644 --- a/c_glib/arrow-glib/array-builder.h +++ b/c_glib/arrow-glib/array-builder.h @@ -20,8 +20,7 @@ #pragma once #include -#include -#include +#include G_BEGIN_DECLS @@ -1475,7 +1474,7 @@ struct _GArrowDecimal128ArrayBuilderClass GArrowArrayBuilderClass parent_class; }; -GArrowDecimal128ArrayBuilder *garrow_decimal128_array_builder_new(GArrowDecimalDataType *data_type); +GArrowDecimal128ArrayBuilder *garrow_decimal128_array_builder_new(GArrowDecimal128DataType *data_type); #ifndef GARROW_DISABLE_DEPRECATED GARROW_DEPRECATED_IN_0_12_FOR(garrow_decimal128_array_builder_append_value) diff --git a/c_glib/arrow-glib/basic-array.cpp b/c_glib/arrow-glib/basic-array.cpp index fef43a0285e25..9aebd9cb8957a 100644 --- a/c_glib/arrow-glib/basic-array.cpp +++ b/c_glib/arrow-glib/basic-array.cpp @@ -22,12 +22,12 @@ #endif #include +#include #include #include -#include +#include #include #include -#include #include diff --git a/c_glib/arrow-glib/basic-data-type.cpp b/c_glib/arrow-glib/basic-data-type.cpp index cd3aa97679b5d..2a599963ee3aa 100644 --- a/c_glib/arrow-glib/basic-data-type.cpp +++ b/c_glib/arrow-glib/basic-data-type.cpp @@ -85,7 +85,9 @@ G_BEGIN_DECLS * #GArrowTime64DataType is a class for the number of microseconds or * nanoseconds since midnight in 64-bit signed integer data type. * - * #GArrowDecimalDataType is a class for 128-bit decimal data type. + * #GArrowDecimalDataType is a base class for decimal data type. + * + * #GArrowDecimal128DataType is a class for 128-bit decimal data type. */ typedef struct GArrowDataTypePrivate_ { @@ -1040,9 +1042,9 @@ garrow_time64_data_type_new(GArrowTimeUnit unit, GError **error) } -G_DEFINE_TYPE(GArrowDecimalDataType, - garrow_decimal_data_type, - GARROW_TYPE_DATA_TYPE) +G_DEFINE_ABSTRACT_TYPE(GArrowDecimalDataType, + garrow_decimal_data_type, + GARROW_TYPE_DATA_TYPE) static void garrow_decimal_data_type_init(GArrowDecimalDataType *object) @@ -1062,18 +1064,16 @@ garrow_decimal_data_type_class_init(GArrowDecimalDataTypeClass *klass) * Returns: The newly created decimal data type. * * Since: 0.10.0 + * + * Deprecate: 0.12.0: + * Use garrow_decimal128_data_type_new() instead. 
*/ GArrowDecimalDataType * garrow_decimal_data_type_new(gint32 precision, gint32 scale) { - auto arrow_data_type = arrow::decimal(precision, scale); - - GArrowDecimalDataType *data_type = - GARROW_DECIMAL_DATA_TYPE(g_object_new(GARROW_TYPE_DECIMAL_DATA_TYPE, - "data-type", &arrow_data_type, - NULL)); - return data_type; + auto decimal128_data_type = garrow_decimal128_data_type_new(precision, scale); + return GARROW_DECIMAL_DATA_TYPE(decimal128_data_type); } /** @@ -1112,6 +1112,43 @@ garrow_decimal_data_type_get_scale(GArrowDecimalDataType *decimal_data_type) return arrow_decimal_type->scale(); } + +G_DEFINE_TYPE(GArrowDecimal128DataType, + garrow_decimal128_data_type, + GARROW_TYPE_DECIMAL_DATA_TYPE) + +static void +garrow_decimal128_data_type_init(GArrowDecimal128DataType *object) +{ +} + +static void +garrow_decimal128_data_type_class_init(GArrowDecimal128DataTypeClass *klass) +{ +} + +/** + * garrow_decimal128_data_type_new: + * @precision: The precision of decimal data. + * @scale: The scale of decimal data. + * + * Returns: The newly created 128-bit decimal data type. + * + * Since: 0.12.0 + */ +GArrowDecimal128DataType * +garrow_decimal128_data_type_new(gint32 precision, + gint32 scale) +{ + auto arrow_data_type = arrow::decimal(precision, scale); + + auto data_type = + GARROW_DECIMAL128_DATA_TYPE(g_object_new(GARROW_TYPE_DECIMAL128_DATA_TYPE, + "data-type", &arrow_data_type, + NULL)); + return data_type; +} + G_END_DECLS GArrowDataType * @@ -1199,7 +1236,7 @@ garrow_data_type_new_raw(std::shared_ptr *arrow_data_type) type = GARROW_TYPE_DICTIONARY_DATA_TYPE; break; case arrow::Type::type::DECIMAL: - type = GARROW_TYPE_DECIMAL_DATA_TYPE; + type = GARROW_TYPE_DECIMAL128_DATA_TYPE; break; default: type = GARROW_TYPE_DATA_TYPE; diff --git a/c_glib/arrow-glib/basic-data-type.h b/c_glib/arrow-glib/basic-data-type.h index 45fddba34d4bc..ef41f1dbcfa0b 100644 --- a/c_glib/arrow-glib/basic-data-type.h +++ b/c_glib/arrow-glib/basic-data-type.h @@ -19,9 +19,9 @@ #pragma once -#include +#include #include -#include +#include G_BEGIN_DECLS @@ -651,6 +651,7 @@ GArrowTime64DataType *garrow_time64_data_type_new (GArrowTimeUnit unit, #define GARROW_TYPE_DECIMAL_DATA_TYPE (garrow_decimal_data_type_get_type()) +/* TODO: Delivered from GArrowFixedSizeBinaryDataType. 
*/ G_DECLARE_DERIVABLE_TYPE(GArrowDecimalDataType, garrow_decimal_data_type, GARROW, @@ -661,9 +662,28 @@ struct _GArrowDecimalDataTypeClass GArrowDataTypeClass parent_class; }; -GArrowDecimalDataType *garrow_decimal_data_type_new (gint32 precision, - gint32 scale); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_decimal128_data_type_new) +GArrowDecimalDataType * +garrow_decimal_data_type_new(gint32 precision, gint32 scale); +#endif gint32 garrow_decimal_data_type_get_precision(GArrowDecimalDataType *decimal_data_type); gint32 garrow_decimal_data_type_get_scale(GArrowDecimalDataType *decimal_data_type); + +#define GARROW_TYPE_DECIMAL128_DATA_TYPE (garrow_decimal128_data_type_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowDecimal128DataType, + garrow_decimal128_data_type, + GARROW, + DECIMAL128_DATA_TYPE, + GArrowDecimalDataType) +struct _GArrowDecimal128DataTypeClass +{ + GArrowDecimalDataTypeClass parent_class; +}; + +GARROW_AVAILABLE_IN_0_12 +GArrowDecimal128DataType * +garrow_decimal128_data_type_new(gint32 precision, gint32 scale); + G_END_DECLS diff --git a/c_glib/arrow-glib/decimal.cpp b/c_glib/arrow-glib/decimal128.cpp similarity index 99% rename from c_glib/arrow-glib/decimal.cpp rename to c_glib/arrow-glib/decimal128.cpp index 34eb417a96105..e30eb7ee58638 100644 --- a/c_glib/arrow-glib/decimal.cpp +++ b/c_glib/arrow-glib/decimal128.cpp @@ -21,7 +21,7 @@ # include #endif -#include +#include #include G_BEGIN_DECLS diff --git a/c_glib/arrow-glib/decimal.h b/c_glib/arrow-glib/decimal128.h similarity index 100% rename from c_glib/arrow-glib/decimal.h rename to c_glib/arrow-glib/decimal128.h diff --git a/c_glib/arrow-glib/decimal.hpp b/c_glib/arrow-glib/decimal128.hpp similarity index 96% rename from c_glib/arrow-glib/decimal.hpp rename to c_glib/arrow-glib/decimal128.hpp index ce56cfe0bd062..84bf47e409f50 100644 --- a/c_glib/arrow-glib/decimal.hpp +++ b/c_glib/arrow-glib/decimal128.hpp @@ -23,7 +23,7 @@ #include -#include +#include GArrowDecimal128 *garrow_decimal128_new_raw(std::shared_ptr *arrow_decimal128); std::shared_ptr garrow_decimal128_get_raw(GArrowDecimal128 *decimal); diff --git a/c_glib/arrow-glib/meson.build b/c_glib/arrow-glib/meson.build index d962ec103175e..14126bee8d784 100644 --- a/c_glib/arrow-glib/meson.build +++ b/c_glib/arrow-glib/meson.build @@ -27,7 +27,7 @@ sources = files( 'column.cpp', 'composite-array.cpp', 'composite-data-type.cpp', - 'decimal.cpp', + 'decimal128.cpp', 'error.cpp', 'field.cpp', 'record-batch.cpp', @@ -77,7 +77,7 @@ c_headers = files( 'composite-array.h', 'composite-data-type.h', 'data-type.h', - 'decimal.h', + 'decimal128.h', 'error.h', 'field.h', 'gobject-type.h', @@ -128,7 +128,7 @@ cpp_headers = files( 'codec.hpp', 'column.hpp', 'data-type.hpp', - 'decimal.hpp', + 'decimal128.hpp', 'error.hpp', 'field.hpp', 'record-batch.hpp', diff --git a/c_glib/arrow-glib/orc-file-reader.h b/c_glib/arrow-glib/orc-file-reader.h index 97cf1efa92ff7..9551d52e0fd55 100644 --- a/c_glib/arrow-glib/orc-file-reader.h +++ b/c_glib/arrow-glib/orc-file-reader.h @@ -45,6 +45,7 @@ garrow_orc_file_reader_set_field_indexes(GArrowORCFileReader *reader, const gint *field_indexes, guint n_field_indexes); #endif +GARROW_AVAILABLE_IN_0_12 void garrow_orc_file_reader_set_field_indices(GArrowORCFileReader *reader, const gint *field_indices, @@ -55,6 +56,7 @@ const gint * garrow_orc_file_reader_get_field_indexes(GArrowORCFileReader *reader, guint *n_field_indexes); #endif +GARROW_AVAILABLE_IN_0_12 const gint * 
garrow_orc_file_reader_get_field_indices(GArrowORCFileReader *reader, guint *n_field_indices); diff --git a/c_glib/doc/arrow-glib/arrow-glib-docs.xml b/c_glib/doc/arrow-glib/arrow-glib-docs.xml index 17b75005ff97a..f9f01fe23e4de 100644 --- a/c_glib/doc/arrow-glib/arrow-glib-docs.xml +++ b/c_glib/doc/arrow-glib/arrow-glib-docs.xml @@ -53,7 +53,7 @@ Decimal - + Tensor diff --git a/c_glib/test/test-decimal-array.rb b/c_glib/test/test-decimal128-array.rb similarity index 89% rename from c_glib/test/test-decimal-array.rb rename to c_glib/test/test-decimal128-array.rb index a5eb28253d95f..132ceb7788585 100644 --- a/c_glib/test/test-decimal-array.rb +++ b/c_glib/test/test-decimal128-array.rb @@ -15,9 +15,9 @@ # specific language governing permissions and limitations # under the License. -class TestDecimalArray < Test::Unit::TestCase +class TestDecimal128Array < Test::Unit::TestCase def test_format_value - data_type = Arrow::DecimalDataType.new(8,2) + data_type = Arrow::Decimal128DataType.new(8, 2) builder = Arrow::Decimal128ArrayBuilder.new(data_type) decimal = Arrow::Decimal128.new("23423445") builder.append_value(decimal) @@ -26,7 +26,7 @@ def test_format_value end def test_value - data_type = Arrow::DecimalDataType.new(8,2) + data_type = Arrow::Decimal128DataType.new(8, 2) builder = Arrow::Decimal128ArrayBuilder.new(data_type) decimal = Arrow::Decimal128.new("23423445") builder.append_value(decimal) diff --git a/c_glib/test/test-decimal-data-type.rb b/c_glib/test/test-decimal128-data-type.rb similarity index 80% rename from c_glib/test/test-decimal-data-type.rb rename to c_glib/test/test-decimal128-data-type.rb index 04bfe78f925c0..27a31e28309cd 100644 --- a/c_glib/test/test-decimal-data-type.rb +++ b/c_glib/test/test-decimal128-data-type.rb @@ -15,24 +15,24 @@ # specific language governing permissions and limitations # under the License. 
-class TestDecimalDataType < Test::Unit::TestCase +class TestDecimal128DataType < Test::Unit::TestCase def test_type - data_type = Arrow::DecimalDataType.new(2, 0) + data_type = Arrow::Decimal128DataType.new(2, 0) assert_equal(Arrow::Type::DECIMAL, data_type.id) end def test_to_s - data_type = Arrow::DecimalDataType.new(2, 0) + data_type = Arrow::Decimal128DataType.new(2, 0) assert_equal("decimal(2, 0)", data_type.to_s) end def test_precision - data_type = Arrow::DecimalDataType.new(8, 2) + data_type = Arrow::Decimal128DataType.new(8, 2) assert_equal(8, data_type.precision) end def test_scale - data_type = Arrow::DecimalDataType.new(8, 2) + data_type = Arrow::Decimal128DataType.new(8, 2) assert_equal(2, data_type.scale) end end diff --git a/c_glib/test/test-decimal.rb b/c_glib/test/test-decimal128.rb similarity index 100% rename from c_glib/test/test-decimal.rb rename to c_glib/test/test-decimal128.rb From 46b1bc764ade2ac776a94255e4ca0467f375ee4e Mon Sep 17 00:00:00 2001 From: Chao Sun Date: Sat, 5 Jan 2019 12:16:04 -0600 Subject: [PATCH 148/328] ARROW-4160: [Rust] Add README and executable files to parquet Author: Chao Sun Closes #3314 from sunchao/ARROW-4160 and squashes the following commits: 9d215df22 ARROW-4160: Add README and executable files to parquet --- rust/parquet/Cargo.toml | 4 +- rust/parquet/README.md | 111 +++++++++++++++++++++++++ rust/parquet/src/bin/parquet-read.rs | 87 +++++++++++++++++++ rust/parquet/src/bin/parquet-schema.rs | 88 ++++++++++++++++++++ 4 files changed, 289 insertions(+), 1 deletion(-) create mode 100644 rust/parquet/README.md create mode 100644 rust/parquet/src/bin/parquet-read.rs create mode 100644 rust/parquet/src/bin/parquet-schema.rs diff --git a/rust/parquet/Cargo.toml b/rust/parquet/Cargo.toml index aa7eac224c0cf..7478992327ddc 100644 --- a/rust/parquet/Cargo.toml +++ b/rust/parquet/Cargo.toml @@ -17,9 +17,11 @@ [package] name = "parquet" -version = "0.12.0-SNAPSHOT" +version = "0.5.0-SNAPSHOT" license = "Apache-2.0" description = "Apache Parquet implementation in Rust" +homepage = "https://github.com/apache/arrow" +repository = "https://github.com/apache/arrow" authors = ["Apache Arrow "] keywords = [ "arrow", "parquet", "hadoop" ] readme = "README.md" diff --git a/rust/parquet/README.md b/rust/parquet/README.md new file mode 100644 index 0000000000000..e9238ffba8a13 --- /dev/null +++ b/rust/parquet/README.md @@ -0,0 +1,111 @@ + + +# An Apache Parquet implementation in Rust + +## Usage +Add this to your Cargo.toml: +```toml +[dependencies] +parquet = "0.4" +``` + +and this to your crate root: +```rust +extern crate parquet; +``` + +Example usage of reading data: +```rust +use std::fs::File; +use std::path::Path; +use parquet::file::reader::{FileReader, SerializedFileReader}; + +let file = File::open(&Path::new("/path/to/file")).unwrap(); +let reader = SerializedFileReader::new(file).unwrap(); +let mut iter = reader.get_row_iter(None).unwrap(); +while let Some(record) = iter.next() { + println!("{}", record); +} +``` +See [crate documentation](https://docs.rs/crate/parquet/0.4.2) on available API. + +## Supported Parquet Version +- Parquet-format 2.4.0 + +To update Parquet format to a newer version, check if [parquet-format](https://github.com/sunchao/parquet-format-rs) +version is available. Then simply update version of `parquet-format` crate in Cargo.toml. 
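+
+For example, assuming the `parquet-format` crate is published on crates.io, the latest
+available release can be checked from the command line before bumping the version in
+`Cargo.toml` (the query below is only a sketch of that workflow):
+
+```shell
+# List the most recent published parquet-format release on crates.io
+cargo search parquet-format --limit 1
+```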
+ +## Features +- [X] All encodings supported +- [X] All compression codecs supported +- [X] Read support + - [X] Primitive column value readers + - [X] Row record reader + - [ ] Arrow record reader +- [X] Statistics support +- [X] Write support + - [X] Primitive column value writers + - [ ] Row record writer + - [ ] Arrow record writer +- [ ] Predicate pushdown +- [ ] Parquet format 2.5 support +- [ ] HDFS support + +## Requirements +- Rust nightly + +See [Working with nightly Rust](https://github.com/rust-lang-nursery/rustup.rs/blob/master/README.md#working-with-nightly-rust) +to install nightly toolchain and set it as default. + +## Build +Run `cargo build` or `cargo build --release` to build in release mode. +Some features take advantage of SSE4.2 instructions, which can be +enabled by adding `RUSTFLAGS="-C target-feature=+sse4.2"` before the +`cargo build` command. + +## Test +Run `cargo test` for unit tests. + +## Binaries +The following binaries are provided (use `cargo install` to install them): +- **parquet-schema** for printing Parquet file schema and metadata. +`Usage: parquet-schema [verbose]`, where `file-path` is the path to a Parquet file, +and optional `verbose` is the boolean flag that allows to print full metadata or schema only +(when not specified only schema will be printed). + +- **parquet-read** for reading records from a Parquet file. +`Usage: parquet-read [num-records]`, where `file-path` is the path to a Parquet file, +and `num-records` is the number of records to read from a file (when not specified all records will +be printed). + +If you see `Library not loaded` error, please make sure `LD_LIBRARY_PATH` is set properly: +``` +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$(rustc --print sysroot)/lib +``` + +## Benchmarks +Run `cargo bench` for benchmarks. + +## Docs +To build documentation, run `cargo doc --no-deps`. +To compile and view in the browser, run `cargo doc --no-deps --open`. + +## License +Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0. diff --git a/rust/parquet/src/bin/parquet-read.rs b/rust/parquet/src/bin/parquet-read.rs new file mode 100644 index 0000000000000..c86b26e3e7a4d --- /dev/null +++ b/rust/parquet/src/bin/parquet-read.rs @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Binary file to read data from a Parquet file. +//! +//! # Install +//! +//! `parquet-read` can be installed using `cargo`: +//! ``` +//! cargo install parquet +//! ``` +//! After this `parquet-read` should be globally available: +//! ``` +//! parquet-read XYZ.parquet +//! ``` +//! +//! The binary can also be built from the source code and run as follows: +//! ``` +//! cargo run --bin parquet-read XYZ.parquet +//! ``` +//! +//! # Usage +//! +//! ``` +//! 
parquet-read [num-records] +//! ``` +//! where `file-path` is the path to a Parquet file and `num-records` is the optional +//! numeric option that allows to specify number of records to read from a file. +//! When not provided, all records are read. +//! +//! Note that `parquet-read` reads full file schema, no projection or filtering is +//! applied. + +extern crate parquet; + +use std::{env, fs::File, path::Path, process}; + +use parquet::file::reader::{FileReader, SerializedFileReader}; + +fn main() { + let args: Vec = env::args().collect(); + if args.len() != 2 && args.len() != 3 { + println!("Usage: parquet-read [num-records]"); + process::exit(1); + } + + let mut num_records: Option = None; + if args.len() == 3 { + match args[2].parse() { + Ok(value) => num_records = Some(value), + Err(e) => panic!("Error when reading value for [num-records], {}", e), + } + } + + let path = Path::new(&args[1]); + let file = File::open(&path).unwrap(); + let parquet_reader = SerializedFileReader::new(file).unwrap(); + + // Use full schema as projected schema + let mut iter = parquet_reader.get_row_iter(None).unwrap(); + + let mut start = 0; + let end = num_records.unwrap_or(0); + let all_records = num_records.is_none(); + + while all_records || start < end { + match iter.next() { + Some(row) => println!("{}", row), + None => break, + } + start += 1; + } +} diff --git a/rust/parquet/src/bin/parquet-schema.rs b/rust/parquet/src/bin/parquet-schema.rs new file mode 100644 index 0000000000000..2eaf7652ae9d6 --- /dev/null +++ b/rust/parquet/src/bin/parquet-schema.rs @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Binary file to print the schema and metadata of a Parquet file. +//! +//! # Install +//! +//! `parquet-schema` can be installed using `cargo`: +//! ``` +//! cargo install parquet +//! ``` +//! After this `parquet-schema` should be globally available: +//! ``` +//! parquet-schema XYZ.parquet +//! ``` +//! +//! The binary can also be built from the source code and run as follows: +//! ``` +//! cargo run --bin parquet-schema XYZ.parquet +//! ``` +//! +//! # Usage +//! +//! ``` +//! parquet-schema [verbose] +//! ``` +//! where `file-path` is the path to a Parquet file and `verbose` is the optional boolean +//! flag that allows to print schema only, when set to `false` (default behaviour when +//! not provided), or print full file metadata, when set to `true`. 
+ +extern crate parquet; + +use std::{env, fs::File, path::Path, process}; + +use parquet::{ + file::reader::{FileReader, SerializedFileReader}, + schema::printer::{print_file_metadata, print_parquet_metadata}, +}; + +fn main() { + let args: Vec = env::args().collect(); + if args.len() != 2 && args.len() != 3 { + println!("Usage: parquet-schema [verbose]"); + process::exit(1); + } + let path = Path::new(&args[1]); + let mut verbose = false; + if args.len() == 3 { + match args[2].parse() { + Ok(b) => verbose = b, + Err(e) => panic!( + "Error when reading value for [verbose] (expected either 'true' or 'false'): {}", + e + ), + } + } + let file = match File::open(&path) { + Err(e) => panic!("Error when opening file {}: {}", path.display(), e), + Ok(f) => f, + }; + match SerializedFileReader::new(file) { + Err(e) => panic!("Error when parsing Parquet file: {}", e), + Ok(parquet_reader) => { + let metadata = parquet_reader.metadata(); + println!("Metadata for file: {}", &args[1]); + println!(""); + if verbose { + print_parquet_metadata(&mut std::io::stdout(), &metadata); + } else { + print_file_metadata(&mut std::io::stdout(), &metadata.file_metadata()); + } + } + } +} From 66f0d39a1c9ddd5e9de85ff7bfc8c13601372050 Mon Sep 17 00:00:00 2001 From: Binyang2014 Date: Sun, 6 Jan 2019 02:31:10 +0800 Subject: [PATCH 149/328] [Documentation][C++] Change build conda create command for Windows developer (#3316) [Documentation][C++] Change build conda create command for Windows developer --- cpp/apidoc/Windows.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/apidoc/Windows.md b/cpp/apidoc/Windows.md index 5199c2fdbfa59..8a724d0342be7 100644 --- a/cpp/apidoc/Windows.md +++ b/cpp/apidoc/Windows.md @@ -38,10 +38,11 @@ Launch cmd.exe and run following commands: conda config --add channels conda-forge ``` -Now, you can bootstrap a build environment +Now, you can bootstrap a build environment (call from the root directory of the +Arrow codebase): ```shell -conda create -n arrow-dev cmake git boost-cpp flatbuffers rapidjson cmake thrift-cpp snappy zlib brotli gflags lz4-c zstd -c conda-forge +conda create -n arrow-dev --file=ci\conda_env_cpp.yml ``` > **Note:** Make sure to get the `conda-forge` build of `gflags` as the From 5723adad7ad80c95ba8fcb55d40186d6a29edb74 Mon Sep 17 00:00:00 2001 From: Jeffrey Wong Date: Sat, 5 Jan 2019 12:33:09 -0600 Subject: [PATCH 150/328] ARROW-3731: MVP to read parquet in R library I am contributing to [Arrow 3731](https://issues.apache.org/jira/browse/ARROW-3731). This PR has the minimum functionality to read parquet files into an arrow::Table, which can then be converted to a tibble. Multiple parquet files can be read inside `lapply`, and then concatenated at the end. Steps to compile 1) Build arrow and parquet c++ projects 2) In R run `devtools::load_all()` What I could use help with: The biggest challenge for me is my lack of experience with pkg-config. The R library has a `configure` file which uses pkg-config to figure out what c++ libraries to link to. Currently, `configure` looks up the Arrow project and links to -larrow only. We need it to also link to -lparquet. 
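As a rough sketch of what the configure step is expected to do once both libraries are discoverable (assuming `arrow.pc` and `parquet.pc` are both installed somewhere pkg-config can find them), the compile and link flags would come from a query along these lines:

```shell
# Illustrative only: ask pkg-config for the combined flags of both modules
pkg-config --cflags arrow parquet
pkg-config --libs arrow parquet
```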
I do not know how to modify pkg-config's metadata to let it know to link to both -larrow and -lparquet Author: Jeffrey Wong Author: Romain Francois Author: jeffwong-nflx Closes #3230 from jeffwong-nflx/master and squashes the following commits: c67fa3d36 Merge pull request #3 from jeffwong-nflx/cleanup 1df3026cb don't hard code -larrow and -lparquet 8ccaa5172 cleanup 75ba5c9ae add contributor 56adad2ae Merge pull request #2 from romainfrancois/3731/parquet-2 7d6e64df2 read_parquet() only reading one parquet file, and gains a `as_tibble` argument e936b4400 need parquet on travis too ff260c587 header was too commented, renamed to parquet.cpp 9e1897f80 styling etc ... 456c5d260 read parquet files 22d89dd23 hardcode -larrow and -lparquet --- .travis.yml | 2 ++ r/DESCRIPTION | 2 ++ r/NAMESPACE | 1 + r/R/RcppExports.R | 4 +++ r/R/parquet.R | 33 +++++++++++++++++++++++ r/README.Rmd | 2 +- r/README.md | 61 ++++++++++++------------------------------- r/configure | 4 +-- r/man/read_parquet.Rd | 21 +++++++++++++++ r/src/RcppExports.cpp | 12 +++++++++ r/src/parquet.cpp | 37 ++++++++++++++++++++++++++ 11 files changed, 131 insertions(+), 48 deletions(-) create mode 100644 r/R/parquet.R create mode 100644 r/man/read_parquet.Rd create mode 100644 r/src/parquet.cpp diff --git a/.travis.yml b/.travis.yml index f14f7e4785948..916ccf460ecf8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -326,6 +326,8 @@ matrix: language: r cache: packages latex: false + env: + - ARROW_TRAVIS_PARQUET=1 before_install: # Have to copy-paste this here because of how R's build steps work - eval `python $TRAVIS_BUILD_DIR/ci/detect-changes.py` diff --git a/r/DESCRIPTION b/r/DESCRIPTION index a2632973134b9..5303a877f9e26 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -4,6 +4,7 @@ Version: 0.11.0.9000 Authors@R: c( person("Romain", "François", email = "romain@rstudio.com", role = c("aut", "cre")), person("Javier", "Luraschi", email = "javier@rstudio.com", role = c("ctb")), + person("Jeffrey", "Wong", email = "jeffreyw@netflix.com", role = c("ctb")), person("Apache Arrow", email = "dev@arrow.apache.org", role = c("aut", "cph")) ) Description: R Integration to 'Apache' 'Arrow'. @@ -62,6 +63,7 @@ Collate: 'memory_pool.R' 'message.R' 'on_exit.R' + 'parquet.R' 'read_record_batch.R' 'read_table.R' 'reexports-bit64.R' diff --git a/r/NAMESPACE b/r/NAMESPACE index 8846defbd8e65..f8f6384dce1f8 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -123,6 +123,7 @@ export(read_arrow) export(read_csv_arrow) export(read_feather) export(read_message) +export(read_parquet) export(read_record_batch) export(read_schema) export(read_table) diff --git a/r/R/RcppExports.R b/r/R/RcppExports.R index 55b9ab33ebf98..c6fe8719f4e89 100644 --- a/r/R/RcppExports.R +++ b/r/R/RcppExports.R @@ -637,6 +637,10 @@ ipc___ReadMessage <- function(stream) { .Call(`_arrow_ipc___ReadMessage`, stream) } +read_parquet_file <- function(filename) { + .Call(`_arrow_read_parquet_file`, filename) +} + RecordBatch__num_columns <- function(x) { .Call(`_arrow_RecordBatch__num_columns`, x) } diff --git a/r/R/parquet.R b/r/R/parquet.R new file mode 100644 index 0000000000000..141da7bd04b2c --- /dev/null +++ b/r/R/parquet.R @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Read parquet file from disk +#' +#' @param file a file path +#' @param as_tibble should the [arrow::Table][arrow__Table] be converted to a tibble. +#' @param ... currently ignored +#' +#' @return a [arrow::Table][arrow__Table], or a data frame if `as_tibble` is `TRUE`. +#' +#' @export +read_parquet <- function(file, as_tibble = TRUE, ...) { + tab <- shared_ptr(`arrow::Table`, read_parquet_file(f)) + if (isTRUE(as_tibble)) { + tab <- as_tibble(tab) + } + tab +} diff --git a/r/README.Rmd b/r/README.Rmd index 2c51d01c0f00f..9f0f39fef5352 100644 --- a/r/README.Rmd +++ b/r/README.Rmd @@ -25,7 +25,7 @@ git clone https://github.com/apache/arrow.git cd arrow/cpp && mkdir release && cd release # It is important to statically link to boost libraries -cmake .. -DCMAKE_BUILD_TYPE=Release -DARROW_BOOST_USE_SHARED:BOOL=Off +cmake .. -DARROW_PARQUET=ON -DCMAKE_BUILD_TYPE=Release -DARROW_BOOST_USE_SHARED:BOOL=Off make install ``` diff --git a/r/README.md b/r/README.md index 868fdff0a06e0..987d0c24a185b 100644 --- a/r/README.md +++ b/r/README.md @@ -14,7 +14,7 @@ git clone https://github.com/apache/arrow.git cd arrow/cpp && mkdir release && cd release # It is important to statically link to boost libraries -cmake .. -DCMAKE_BUILD_TYPE=Release -DARROW_BOOST_USE_SHARED:BOOL=Off +cmake .. -DARROW_PARQUET=ON -DCMAKE_BUILD_TYPE=Release -DARROW_BOOST_USE_SHARED:BOOL=Off make install ``` @@ -38,48 +38,19 @@ tf <- tempfile() #> # A tibble: 10 x 2 #> x y #> -#> 1 1 -0.255 -#> 2 2 -0.162 -#> 3 3 -0.614 -#> 4 4 -0.322 -#> 5 5 0.0693 -#> 6 6 -0.920 -#> 7 7 -1.08 -#> 8 8 0.658 -#> 9 9 0.821 -#> 10 10 0.539 -arrow::write_arrow(tib, tf) - -# read it back with pyarrow -pa <- import("pyarrow") -as_tibble(pa$open_file(tf)$read_pandas()) -#> # A tibble: 10 x 2 -#> x y -#> -#> 1 1 -0.255 -#> 2 2 -0.162 -#> 3 3 -0.614 -#> 4 4 -0.322 -#> 5 5 0.0693 -#> 6 6 -0.920 -#> 7 7 -1.08 -#> 8 8 0.658 -#> 9 9 0.821 -#> 10 10 0.539 -``` - -## Development - -### Code style - -We use Google C++ style in our C++ code. 
Check for style errors with - -``` -./lint.sh -``` - -You can fix the style issues with - +#> 1 1 0.0855 +#> 2 2 -1.68 +#> 3 3 -0.0294 +#> 4 4 -0.124 +#> 5 5 0.0675 +#> 6 6 1.64 +#> 7 7 1.54 +#> 8 8 -0.0209 +#> 9 9 -0.982 +#> 10 10 0.349 +# arrow::write_arrow(tib, tf) + +# # read it back with pyarrow +# pa <- import("pyarrow") +# as_tibble(pa$open_file(tf)$read_pandas()) ``` -./lint.sh --fix -``` \ No newline at end of file diff --git a/r/configure b/r/configure index 28f6a73ac7ef5..c17fd4c2ef624 100755 --- a/r/configure +++ b/r/configure @@ -26,13 +26,13 @@ # R CMD INSTALL --configure-vars='INCLUDE_DIR=/.../include LIB_DIR=/.../lib' # Library settings -PKG_CONFIG_NAME="arrow" +PKG_CONFIG_NAME="arrow parquet" PKG_DEB_NAME="arrow" PKG_RPM_NAME="arrow" PKG_CSW_NAME="arrow" PKG_BREW_NAME="apache-arrow" PKG_TEST_HEADER="" -PKG_LIBS="-larrow" +PKG_LIBS="-larrow -lparquet" # Use pkg-config if available pkg-config --version >/dev/null 2>&1 diff --git a/r/man/read_parquet.Rd b/r/man/read_parquet.Rd new file mode 100644 index 0000000000000..c29e18bca5baf --- /dev/null +++ b/r/man/read_parquet.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parquet.R +\name{read_parquet} +\alias{read_parquet} +\title{Read parquet file from disk} +\usage{ +read_parquet(file, as_tibble = TRUE, ...) +} +\arguments{ +\item{file}{a file path} + +\item{as_tibble}{should the \link[=arrow__Table]{arrow::Table} be converted to a tibble.} + +\item{...}{currently ignored} +} +\value{ +a \link[=arrow__Table]{arrow::Table}, or a data frame if \code{as_tibble} is \code{TRUE}. +} +\description{ +Read parquet file from disk +} diff --git a/r/src/RcppExports.cpp b/r/src/RcppExports.cpp index c752afba1c258..1e8fed1867655 100644 --- a/r/src/RcppExports.cpp +++ b/r/src/RcppExports.cpp @@ -1779,6 +1779,17 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// read_parquet_file +std::shared_ptr read_parquet_file(std::string filename); +RcppExport SEXP _arrow_read_parquet_file(SEXP filenameSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< std::string >::type filename(filenameSEXP); + rcpp_result_gen = Rcpp::wrap(read_parquet_file(filename)); + return rcpp_result_gen; +END_RCPP +} // RecordBatch__num_columns int RecordBatch__num_columns(const std::shared_ptr& x); RcppExport SEXP _arrow_RecordBatch__num_columns(SEXP xSEXP) { @@ -2369,6 +2380,7 @@ static const R_CallMethodDef CallEntries[] = { {"_arrow_ipc___MessageReader__Open", (DL_FUNC) &_arrow_ipc___MessageReader__Open, 1}, {"_arrow_ipc___MessageReader__ReadNextMessage", (DL_FUNC) &_arrow_ipc___MessageReader__ReadNextMessage, 1}, {"_arrow_ipc___ReadMessage", (DL_FUNC) &_arrow_ipc___ReadMessage, 1}, + {"_arrow_read_parquet_file", (DL_FUNC) &_arrow_read_parquet_file, 1}, {"_arrow_RecordBatch__num_columns", (DL_FUNC) &_arrow_RecordBatch__num_columns, 1}, {"_arrow_RecordBatch__num_rows", (DL_FUNC) &_arrow_RecordBatch__num_rows, 1}, {"_arrow_RecordBatch__schema", (DL_FUNC) &_arrow_RecordBatch__schema, 1}, diff --git a/r/src/parquet.cpp b/r/src/parquet.cpp new file mode 100644 index 0000000000000..859bd4826e7c2 --- /dev/null +++ b/r/src/parquet.cpp @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include + +// [[Rcpp::export]] +std::shared_ptr read_parquet_file(std::string filename) { + std::shared_ptr infile; + PARQUET_THROW_NOT_OK( + arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool(), &infile)); + + std::unique_ptr reader; + PARQUET_THROW_NOT_OK( + parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + std::shared_ptr table; + PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); + + return table; +} From 91a72ac7e1361b78167a703ecd5dddb85b621159 Mon Sep 17 00:00:00 2001 From: jlapacik Date: Sat, 5 Jan 2019 14:19:52 -0600 Subject: [PATCH 151/328] ARROW-4130: [Go] offset not used when accessing binary array Closes https://github.com/apache/arrow/issues/3270 . Author: jlapacik Closes #3283 from jlapacik/fix/go-binary-slice and squashes the following commits: 5cf6a4f03 assign slice value in out of bounds tests 0666c0ed4 allocate new slice for each test case 9b5a00057 remove single letter variable b46f8412d ARROW-4130: offset not used when accessing binary array --- go/arrow/array/binary.go | 38 +++- go/arrow/array/binary_test.go | 343 ++++++++++++++++++++++++++++++++++ 2 files changed, 376 insertions(+), 5 deletions(-) diff --git a/go/arrow/array/binary.go b/go/arrow/array/binary.go index 0b89b7e5817cc..850fb09b4a81a 100644 --- a/go/arrow/array/binary.go +++ b/go/arrow/array/binary.go @@ -38,7 +38,13 @@ func NewBinaryData(data *Data) *Binary { } // Value returns the slice at index i. This value should not be mutated. -func (a *Binary) Value(i int) []byte { return a.valueBytes[a.valueOffsets[i]:a.valueOffsets[i+1]] } +func (a *Binary) Value(i int) []byte { + if i < 0 || i >= a.array.data.length { + panic("arrow/array: index out of range") + } + idx := a.array.data.offset + i + return a.valueBytes[a.valueOffsets[idx]:a.valueOffsets[idx+1]] +} // ValueString returns the string at index i without performing additional allocations. // The string is only valid for the lifetime of the Binary array. 
@@ -47,10 +53,32 @@ func (a *Binary) ValueString(i int) string { return *(*string)(unsafe.Pointer(&b)) } -func (a *Binary) ValueOffset(i int) int { return int(a.valueOffsets[i]) } -func (a *Binary) ValueLen(i int) int { return int(a.valueOffsets[i+1] - a.valueOffsets[i]) } -func (a *Binary) ValueOffsets() []int32 { return a.valueOffsets } -func (a *Binary) ValueBytes() []byte { return a.valueBytes } +func (a *Binary) ValueOffset(i int) int { + if i < 0 || i >= a.array.data.length { + panic("arrow/array: index out of range") + } + return int(a.valueOffsets[a.array.data.offset+i]) +} + +func (a *Binary) ValueLen(i int) int { + if i < 0 || i >= a.array.data.length { + panic("arrow/array: index out of range") + } + beg := a.array.data.offset + i + return int(a.valueOffsets[beg+1] - a.valueOffsets[beg]) +} + +func (a *Binary) ValueOffsets() []int32 { + beg := a.array.data.offset + end := beg + a.array.data.length + 1 + return a.valueOffsets[beg:end] +} + +func (a *Binary) ValueBytes() []byte { + beg := a.array.data.offset + end := beg + a.array.data.length + return a.valueBytes[a.valueOffsets[beg]:a.valueOffsets[end]] +} func (a *Binary) setData(data *Data) { if len(data.buffers) != 3 { diff --git a/go/arrow/array/binary_test.go b/go/arrow/array/binary_test.go index 87d1b58c47d14..2af45dee60f76 100644 --- a/go/arrow/array/binary_test.go +++ b/go/arrow/array/binary_test.go @@ -17,6 +17,7 @@ package array import ( + "reflect" "testing" "github.com/stretchr/testify/assert" @@ -62,3 +63,345 @@ func TestBinary(t *testing.T) { b.Release() } + +func TestBinarySliceData(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + values := []string{"a", "bc", "def", "g", "hijk", "lm", "n", "opq", "rs", "tu"} + + b := NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + defer b.Release() + + for _, v := range values { + b.AppendString(v) + } + + arr := b.NewArray().(*Binary) + defer arr.Release() + + if got, want := arr.Len(), len(values); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + vs := make([]string, arr.Len()) + + for i := range vs { + vs[i] = arr.ValueString(i) + } + + if got, want := vs, values; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + + tests := []struct { + interval [2]int64 + want []string + }{ + { + interval: [2]int64{0, 0}, + want: []string{}, + }, + { + interval: [2]int64{0, 5}, + want: []string{"a", "bc", "def", "g", "hijk"}, + }, + { + interval: [2]int64{0, 10}, + want: []string{"a", "bc", "def", "g", "hijk", "lm", "n", "opq", "rs", "tu"}, + }, + { + interval: [2]int64{5, 10}, + want: []string{"lm", "n", "opq", "rs", "tu"}, + }, + { + interval: [2]int64{10, 10}, + want: []string{}, + }, + { + interval: [2]int64{2, 7}, + want: []string{"def", "g", "hijk", "lm", "n"}, + }, + } + + for _, tc := range tests { + t.Run("", func(t *testing.T) { + + slice := NewSlice(arr, tc.interval[0], tc.interval[1]).(*Binary) + defer slice.Release() + + if got, want := slice.Len(), len(tc.want); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + vs := make([]string, slice.Len()) + + for i := range vs { + vs[i] = slice.ValueString(i) + } + + if got, want := vs, tc.want; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + }) + } +} + +func TestBinarySliceDataWithNull(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + values := []string{"a", "bc", "", "", "hijk", "lm", "", "opq", "", "tu"} + valids := []bool{true, 
true, false, false, true, true, true, true, false, true} + + b := NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + defer b.Release() + + b.AppendStringValues(values, valids) + + arr := b.NewArray().(*Binary) + defer arr.Release() + + if got, want := arr.Len(), len(values); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.NullN(), 3; got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + vs := make([]string, arr.Len()) + + for i := range vs { + vs[i] = arr.ValueString(i) + } + + if got, want := vs, values; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + + tests := []struct { + interval [2]int64 + nulls int + want []string + }{ + { + interval: [2]int64{0, 2}, + nulls: 0, + want: []string{"a", "bc"}, + }, + { + interval: [2]int64{0, 3}, + nulls: 1, + want: []string{"a", "bc", ""}, + }, + { + interval: [2]int64{0, 4}, + nulls: 2, + want: []string{"a", "bc", "", ""}, + }, + { + interval: [2]int64{4, 8}, + nulls: 0, + want: []string{"hijk", "lm", "", "opq"}, + }, + { + interval: [2]int64{2, 9}, + nulls: 3, + want: []string{"", "", "hijk", "lm", "", "opq", ""}, + }, + } + + for _, tc := range tests { + t.Run("", func(t *testing.T) { + + slice := NewSlice(arr, tc.interval[0], tc.interval[1]).(*Binary) + defer slice.Release() + + if got, want := slice.Len(), len(tc.want); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := slice.NullN(), tc.nulls; got != want { + t.Errorf("got=%d, want=%d", got, want) + } + + vs := make([]string, slice.Len()) + + for i := range vs { + vs[i] = slice.ValueString(i) + } + + if got, want := vs, tc.want; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + }) + } +} + +func TestBinarySliceOutOfBounds(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + values := []string{"a", "bc", "def", "g", "hijk", "lm", "n", "opq", "rs", "tu"} + + b := NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + defer b.Release() + + for _, v := range values { + b.AppendString(v) + } + + arr := b.NewArray().(*Binary) + defer arr.Release() + + slice := NewSlice(arr, 3, 8).(*Binary) + defer slice.Release() + + tests := []struct { + index int + panic bool + }{ + { + index: -1, + panic: true, + }, + { + index: 5, + panic: true, + }, + { + index: 0, + panic: false, + }, + { + index: 4, + panic: false, + }, + } + + for _, tc := range tests { + t.Run("", func(t *testing.T) { + + var val string + + if tc.panic { + defer func() { + e := recover() + if e == nil { + t.Fatalf("this should have panicked, but did not; slice value %q", val) + } + if got, want := e.(string), "arrow/array: index out of range"; got != want { + t.Fatalf("invalid error. 
got=%q, want=%q", got, want) + } + }() + } else { + defer func() { + if e := recover(); e != nil { + t.Fatalf("unexpected panic: %v", e) + } + }() + } + + val = slice.ValueString(tc.index) + }) + } +} + +func TestBinaryValueOffset(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + values := []string{"a", "bc", "", "", "hijk", "lm", "", "opq", "", "tu"} + valids := []bool{true, true, false, false, true, true, true, true, false, true} + + b := NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + defer b.Release() + + b.AppendStringValues(values, valids) + + arr := b.NewArray().(*Binary) + defer arr.Release() + + slice := NewSlice(arr, 2, 9).(*Binary) + defer slice.Release() + + offset := 3 + vs := values[2:9] + + for i, v := range vs { + assert.Equal(t, offset, slice.ValueOffset(i)) + offset += len(v) + } +} + +func TestBinaryValueLen(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + values := []string{"a", "bc", "", "", "hijk", "lm", "", "opq", "", "tu"} + valids := []bool{true, true, false, false, true, true, true, true, false, true} + + b := NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + defer b.Release() + + b.AppendStringValues(values, valids) + + arr := b.NewArray().(*Binary) + defer arr.Release() + + slice := NewSlice(arr, 2, 9).(*Binary) + defer slice.Release() + + vs := values[2:9] + + for i, v := range vs { + assert.Equal(t, len(v), slice.ValueLen(i)) + } +} + +func TestBinaryValueOffsets(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + values := []string{"a", "bc", "", "", "hijk", "lm", "", "opq", "", "tu"} + valids := []bool{true, true, false, false, true, true, true, true, false, true} + + b := NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + defer b.Release() + + b.AppendStringValues(values, valids) + + arr := b.NewArray().(*Binary) + defer arr.Release() + + assert.Equal(t, []int32{0, 1, 3, 3, 3, 7, 9, 9, 12, 12, 14}, arr.ValueOffsets()) + + slice := NewSlice(arr, 2, 9).(*Binary) + defer slice.Release() + + assert.Equal(t, []int32{3, 3, 3, 7, 9, 9, 12, 12}, slice.ValueOffsets()) +} + +func TestBinaryValueBytes(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + values := []string{"a", "bc", "", "", "hijk", "lm", "", "opq", "", "tu"} + valids := []bool{true, true, false, false, true, true, true, true, false, true} + + b := NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + defer b.Release() + + b.AppendStringValues(values, valids) + + arr := b.NewArray().(*Binary) + defer arr.Release() + + assert.Equal(t, []byte{'a', 'b', 'c', 'h', 'i', 'j', 'k', 'l', 'm', 'o', 'p', 'q', 't', 'u'}, arr.ValueBytes()) + + slice := NewSlice(arr, 2, 9).(*Binary) + defer slice.Release() + + assert.Equal(t, []byte{'h', 'i', 'j', 'k', 'l', 'm', 'o', 'p', 'q'}, slice.ValueBytes()) +} From 489534046290db2f607c59bf57f32d888e8109ed Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Sat, 5 Jan 2019 15:48:20 -0700 Subject: [PATCH 152/328] ARROW-4155: [Rust] Implement array_ops::sum() for PrimitiveArray Adds the ability to return the sum of column Author: Neville Dipale Closes #3317 from nevi-me/rust/array-ops and squashes the following commits: bba0788 fix if-else branch 62a91f3 add condition to check null values on sum 9ca0034 cargo fmt a23d240 : Implement array_ops::sum() for PrimitiveArray --- rust/src/array_ops.rs | 53 +++++++++++++++++++++++++++++++++++++++++++ 
rust/src/datatypes.rs | 33 +++++++++++++++++---------- 2 files changed, 74 insertions(+), 12 deletions(-) diff --git a/rust/src/array_ops.rs b/rust/src/array_ops.rs index 59145754f0248..517111ba76a45 100644 --- a/rust/src/array_ops.rs +++ b/rust/src/array_ops.rs @@ -155,6 +155,35 @@ where n } +/// Returns the sum of values in the array. +/// +/// Returns `None` if the array is empty or only contains null values. +pub fn sum(array: &PrimitiveArray) -> Option +where + T: ArrowNumericType, + T::Native: Add, +{ + let mut n: T::Native = T::default_value(); + // iteratively track whether all values are null (or array is empty) + let mut all_nulls = true; + let data = array.data(); + for i in 0..data.len() { + if data.is_null(i) { + continue; + } + if all_nulls { + all_nulls = false; + } + let m = array.value(i); + n = n + m; + } + if all_nulls { + None + } else { + Some(n) + } +} + /// Perform `left == right` operation on two arrays. pub fn eq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result where @@ -399,6 +428,30 @@ mod tests { assert_eq!(13, c.value(2)); } + #[test] + fn test_primitive_array_sum() { + let a = Int32Array::from(vec![1, 2, 3, 4, 5]); + assert_eq!(15, sum(&a).unwrap()); + } + + #[test] + fn test_primitive_array_float_sum() { + let a = Float64Array::from(vec![1.1, 2.2, 3.3, 4.4, 5.5]); + assert_eq!(16.5, sum(&a).unwrap()); + } + + #[test] + fn test_primitive_array_sum_with_nulls() { + let a = Int32Array::from(vec![None, Some(2), Some(3), None, Some(5)]); + assert_eq!(10, sum(&a).unwrap()); + } + + #[test] + fn test_primitive_array_sum_all_nulls() { + let a = Int32Array::from(vec![None, None, None]); + assert_eq!(None, sum(&a)); + } + #[test] fn test_primitive_array_eq() { let a = Int32Array::from(vec![8, 8, 8, 8, 8]); diff --git a/rust/src/datatypes.rs b/rust/src/datatypes.rs index 36cb818cdfc7a..49e06eb0969b2 100644 --- a/rust/src/datatypes.rs +++ b/rust/src/datatypes.rs @@ -83,10 +83,15 @@ pub trait ArrowPrimitiveType: 'static { /// Returns the bit width of this primitive type. fn get_bit_width() -> usize; + + /// Returns a default value of this primitive type. + /// + /// This is useful for aggregate array ops like `sum()`, `mean()`. + fn default_value() -> Self::Native; } macro_rules! make_type { - ($name:ident, $native_ty:ty, $data_ty:path, $bit_width:expr) => { + ($name:ident, $native_ty:ty, $data_ty:path, $bit_width:expr, $default_val:expr) => { impl ArrowNativeType for $native_ty {} pub struct $name {} @@ -101,21 +106,25 @@ macro_rules! 
make_type { fn get_bit_width() -> usize { $bit_width } + + fn default_value() -> Self::Native { + $default_val + } } }; } -make_type!(BooleanType, bool, DataType::Boolean, 1); -make_type!(Int8Type, i8, DataType::Int8, 8); -make_type!(Int16Type, i16, DataType::Int16, 16); -make_type!(Int32Type, i32, DataType::Int32, 32); -make_type!(Int64Type, i64, DataType::Int64, 64); -make_type!(UInt8Type, u8, DataType::UInt8, 8); -make_type!(UInt16Type, u16, DataType::UInt16, 16); -make_type!(UInt32Type, u32, DataType::UInt32, 32); -make_type!(UInt64Type, u64, DataType::UInt64, 64); -make_type!(Float32Type, f32, DataType::Float32, 32); -make_type!(Float64Type, f64, DataType::Float64, 64); +make_type!(BooleanType, bool, DataType::Boolean, 1, false); +make_type!(Int8Type, i8, DataType::Int8, 8, 0i8); +make_type!(Int16Type, i16, DataType::Int16, 16, 0i16); +make_type!(Int32Type, i32, DataType::Int32, 32, 0i32); +make_type!(Int64Type, i64, DataType::Int64, 64, 0i64); +make_type!(UInt8Type, u8, DataType::UInt8, 8, 0u8); +make_type!(UInt16Type, u16, DataType::UInt16, 16, 0u16); +make_type!(UInt32Type, u32, DataType::UInt32, 32, 0u32); +make_type!(UInt64Type, u64, DataType::UInt64, 64, 0u64); +make_type!(Float32Type, f32, DataType::Float32, 32, 0.0f32); +make_type!(Float64Type, f64, DataType::Float64, 64, 0.0f64); /// A subtype of primitive type that represents numeric values. pub trait ArrowNumericType: ArrowPrimitiveType {} From 601498f7169f2340b393bccba1d0a0e0b65d1562 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Sun, 6 Jan 2019 20:53:13 +0900 Subject: [PATCH 153/328] ARROW-4161: [GLib] Add PlasmaClientOptions Author: Kouhei Sutou Closes #3315 from kou/glib-plasma-client-new-options and squashes the following commits: 73eff12a Add support for Plasma::ClientOptions ed52a8ab Add PlasmaClientOptions --- c_glib/plasma-glib/client.cpp | 137 +++++++++++++++++- c_glib/plasma-glib/client.h | 21 +++ .../test/plasma/test-plasma-client-options.rb | 31 ++++ c_glib/test/plasma/test-plasma-client.rb | 3 +- .../test/plasma/test-plasma-created-object.rb | 2 +- .../plasma/test-plasma-referred-object.rb | 2 +- ruby/red-plasma/lib/plasma/client.rb | 13 +- ruby/red-plasma/test/test-plasma-client.rb | 24 ++- 8 files changed, 223 insertions(+), 10 deletions(-) create mode 100644 c_glib/test/plasma/test-plasma-client-options.rb diff --git a/c_glib/plasma-glib/client.cpp b/c_glib/plasma-glib/client.cpp index 9591a0a714f27..2038ea61f042a 100644 --- a/c_glib/plasma-glib/client.cpp +++ b/c_glib/plasma-glib/client.cpp @@ -39,6 +39,9 @@ G_BEGIN_DECLS * @title: Client related classes * @include: plasma-glib/plasma-glib.h * + * #GPlasmaClientOptions is a class for customizing plasma store + * connection. + * * #GPlasmaClientCreateOptions is a class for customizing object creation. * * #GPlasmaClient is a class for an interface with a plasma store. 
@@ -46,6 +49,131 @@ G_BEGIN_DECLS * Since: 0.12.0 */ +typedef struct GPlasmaClientCreatePrivate_ { + gint n_retries; +} GPlasmaClientOptionsPrivate; + +enum { + PROP_N_RETRIES = 1 +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GPlasmaClientOptions, + gplasma_client_options, + G_TYPE_OBJECT) + +#define GPLASMA_CLIENT_OPTIONS_GET_PRIVATE(object) \ + static_cast( \ + gplasma_client_options_get_instance_private( \ + GPLASMA_CLIENT_OPTIONS(object))) + +static void +gplasma_client_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GPLASMA_CLIENT_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_N_RETRIES: + priv->n_retries = g_value_get_int(value); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gplasma_client_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GPLASMA_CLIENT_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_N_RETRIES: + g_value_set_int(value, priv->n_retries); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gplasma_client_options_init(GPlasmaClientOptions *object) +{ +} + +static void +gplasma_client_options_class_init(GPlasmaClientOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->set_property = gplasma_client_options_set_property; + gobject_class->get_property = gplasma_client_options_get_property; + + GParamSpec *spec; + spec = g_param_spec_int("n-retries", + "N retries", + "The number of retries to connect plasma store. " + "-1 means that the system default value is used.", + -1, + G_MAXINT, + -1, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT)); + g_object_class_install_property(gobject_class, PROP_N_RETRIES, spec); +} + +/** + * gplasma_client_options_new: + * + * Returns: A newly created #GPlasmaClientOptions. + * + * Since: 0.12.0 + */ +GPlasmaClientOptions * +gplasma_client_options_new(void) +{ + auto options = g_object_new(GPLASMA_TYPE_CLIENT_OPTIONS, + NULL); + return GPLASMA_CLIENT_OPTIONS(options); +} + +/** + * gplasma_client_options_set_n_retries: + * @options: A #GPlasmaClientOptions. + * @n_retries: The number of retires on connect. + * + * Since: 0.12.0 + */ +void +gplasma_client_options_set_n_retries(GPlasmaClientOptions *options, + gint n_retries) +{ + auto priv = GPLASMA_CLIENT_OPTIONS_GET_PRIVATE(options); + priv->n_retries = n_retries; +} + +/** + * gplasma_client_options_get_n_retries: + * @options: A #GPlasmaClientOptions. + * + * Returns: The number of retries on connect. + * + * Since: 0.12.0 + */ +gint +gplasma_client_options_get_n_retries(GPlasmaClientOptions *options) +{ + auto priv = GPLASMA_CLIENT_OPTIONS_GET_PRIVATE(options); + return priv->n_retries; +} + + typedef struct GPlasmaClientCreateOptionsPrivate_ { guint8 *metadata; gsize metadata_size; @@ -182,6 +310,7 @@ gplasma_client_create_options_get_metadata(GPlasmaClientCreateOptions *options, return priv->metadata; } + typedef struct GPlasmaClientPrivate_ { plasma::PlasmaClient *client; bool disconnected; @@ -262,6 +391,7 @@ gplasma_client_class_init(GPlasmaClientClass *klass) /** * gplasma_client_new: * @store_socket_name: The name of the UNIX domain socket. + * @options: (nullable): The options to custom how to connect to plasma store. * @error: (nullable): Return location for a #GError or %NULL. 
* * Returns: (nullable): A newly created #GPlasmaClient on success, @@ -271,10 +401,15 @@ gplasma_client_class_init(GPlasmaClientClass *klass) */ GPlasmaClient * gplasma_client_new(const gchar *store_socket_name, + GPlasmaClientOptions *options, GError **error) { auto plasma_client = new plasma::PlasmaClient(); - auto status = plasma_client->Connect(store_socket_name, ""); + int n_retries = -1; + if (options) { + n_retries = gplasma_client_options_get_n_retries(options); + } + auto status = plasma_client->Connect(store_socket_name, "", 0, n_retries); if (garrow_error_check(error, status, "[plasma][client][new]")) { return gplasma_client_new_raw(plasma_client); } else { diff --git a/c_glib/plasma-glib/client.h b/c_glib/plasma-glib/client.h index 34b0ba22e3188..2cb983e14e970 100644 --- a/c_glib/plasma-glib/client.h +++ b/c_glib/plasma-glib/client.h @@ -23,6 +23,26 @@ G_BEGIN_DECLS +#define GPLASMA_TYPE_CLIENT_OPTIONS (gplasma_client_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GPlasmaClientOptions, + gplasma_client_options, + GPLASMA, + CLIENT_OPTIONS, + GObject) + +struct _GPlasmaClientOptionsClass +{ + GObjectClass parent_class; +}; + +GPlasmaClientOptions *gplasma_client_options_new(void); +void +gplasma_client_options_set_n_retries(GPlasmaClientOptions *options, + gint n_retries); +gint +gplasma_client_options_get_n_retries(GPlasmaClientOptions *options); + + #define GPLASMA_TYPE_CLIENT_CREATE_OPTIONS \ (gplasma_client_create_options_get_type()) G_DECLARE_DERIVABLE_TYPE(GPlasmaClientCreateOptions, @@ -59,6 +79,7 @@ struct _GPlasmaClientClass }; GPlasmaClient *gplasma_client_new(const gchar *store_socket_name, + GPlasmaClientOptions *options, GError **error); GPlasmaCreatedObject * gplasma_client_create(GPlasmaClient *client, diff --git a/c_glib/test/plasma/test-plasma-client-options.rb b/c_glib/test/plasma/test-plasma-client-options.rb new file mode 100644 index 0000000000000..abe6fd3ce46ff --- /dev/null +++ b/c_glib/test/plasma/test-plasma-client-options.rb @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestPlasmaClientOptions < Test::Unit::TestCase + include Helper::Omittable + + def setup + omit("Plasma is required") unless defined?(::Plasma) + @options = Plasma::ClientOptions.new + end + + test("n_retries") do + assert_equal(-1, @options.n_retries) + @options.n_retries = 10 + assert_equal(10, @options.n_retries) + end +end diff --git a/c_glib/test/plasma/test-plasma-client.rb b/c_glib/test/plasma/test-plasma-client.rb index 6caf09f02570c..a57d1fc5944e9 100644 --- a/c_glib/test/plasma/test-plasma-client.rb +++ b/c_glib/test/plasma/test-plasma-client.rb @@ -23,7 +23,8 @@ def setup omit("Plasma is required") unless defined?(::Plasma) @store = Helper::PlasmaStore.new @store.start - @client = Plasma::Client.new(@store.socket_path) + @options = Plasma::ClientOptions.new + @client = Plasma::Client.new(@store.socket_path, @options) @id = Plasma::ObjectID.new("Hello") @data = "World" @options = Plasma::ClientCreateOptions.new diff --git a/c_glib/test/plasma/test-plasma-created-object.rb b/c_glib/test/plasma/test-plasma-created-object.rb index 54d6774790abe..9025ff4ac22d9 100644 --- a/c_glib/test/plasma/test-plasma-created-object.rb +++ b/c_glib/test/plasma/test-plasma-created-object.rb @@ -21,7 +21,7 @@ def setup omit("Plasma is required") unless defined?(::Plasma) @store = Helper::PlasmaStore.new @store.start - @client = Plasma::Client.new(@store.socket_path) + @client = Plasma::Client.new(@store.socket_path, nil) @id = Plasma::ObjectID.new("Hello") @data = "World" diff --git a/c_glib/test/plasma/test-plasma-referred-object.rb b/c_glib/test/plasma/test-plasma-referred-object.rb index f55c0b13c5603..a74641ed5dcd3 100644 --- a/c_glib/test/plasma/test-plasma-referred-object.rb +++ b/c_glib/test/plasma/test-plasma-referred-object.rb @@ -21,7 +21,7 @@ def setup omit("Plasma is required") unless defined?(::Plasma) @store = Helper::PlasmaStore.new @store.start - @client = Plasma::Client.new(@store.socket_path) + @client = Plasma::Client.new(@store.socket_path, nil) @id = Plasma::ObjectID.new("Hello") @data = "World" diff --git a/ruby/red-plasma/lib/plasma/client.rb b/ruby/red-plasma/lib/plasma/client.rb index 464ef8c336fd9..d32ded6ff60b4 100644 --- a/ruby/red-plasma/lib/plasma/client.rb +++ b/ruby/red-plasma/lib/plasma/client.rb @@ -18,9 +18,18 @@ module Plasma class Client alias_method :initialize_raw, :initialize - def initialize(socket_path) + private :initialize_raw + def initialize(socket_path, options=nil) socket_path = socket_path.to_path if socket_path.respond_to?(:to_path) - initialize_raw(socket_path) + if options + options_raw = options + options = ClientOptions.new + options_raw.each do |key, value| + setter = "#{key}=" + options.__send__(setter, value) if options.respond_to?(setter) + end + end + initialize_raw(socket_path, options) end end end diff --git a/ruby/red-plasma/test/test-plasma-client.rb b/ruby/red-plasma/test/test-plasma-client.rb index e7f8dbdba42e0..de76fb9d36e8b 100644 --- a/ruby/red-plasma/test/test-plasma-client.rb +++ b/ruby/red-plasma/test/test-plasma-client.rb @@ -20,15 +20,31 @@ def setup @store = nil @store = Helper::PlasmaStore.new @store.start + @id = Plasma::ObjectID.new("Hello") + @data = "World" end def teardown @store.stop if @store end - def test_new - assert_nothing_raised do - Plasma::Client.new(Pathname(@store.socket_path)) - end + def test_new_pathname + client = Plasma::Client.new(Pathname(@store.socket_path)) + object = client.create(@id, @data.bytesize, nil) + object.data.set_data(0, @data) + object.seal + + object = client.refer_object(@id, 
-1) + assert_equal(@data, object.data.data.to_s) + end + + def test_new_options + client = Plasma::Client.new(@store.socket_path, n_retries: 1) + object = client.create(@id, @data.bytesize, nil) + object.data.set_data(0, @data) + object.seal + + object = client.refer_object(@id, -1) + assert_equal(@data, object.data.data.to_s) end end From fa6e4238fdce81a17c1957ffbc8cd7defdbc3831 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Sun, 6 Jan 2019 21:21:15 +0900 Subject: [PATCH 154/328] ARROW-4162: [Ruby] Add support for creating data types from description Author: Kouhei Sutou Closes #3318 from kou/ruby-data-type-new-by-description and squashes the following commits: e7231e69 Add support for creating data types from description --- .../lib/arrow/decimal128-data-type.rb | 69 ++++++++++++ .../lib/arrow/dense-union-data-type.rb | 90 +++++++++++++++ .../lib/arrow/dictionary-data-type.rb | 106 ++++++++++++++++++ ruby/red-arrow/lib/arrow/loader.rb | 7 ++ .../lib/arrow/sparse-union-data-type.rb | 90 +++++++++++++++ ruby/red-arrow/lib/arrow/time32-data-type.rb | 61 ++++++++++ ruby/red-arrow/lib/arrow/time64-data-type.rb | 61 ++++++++++ .../lib/arrow/timestamp-data-type.rb | 57 ++++++++++ .../test/test-decimal128-data-type.rb | 31 +++++ .../test/test-dense-union-data-type.rb | 41 +++++++ .../test/test-dictionary-data-type.rb | 40 +++++++ .../test/test-sparse-union-data-type.rb | 41 +++++++ ruby/red-arrow/test/test-time32-data-type.rb | 42 +++++++ ruby/red-arrow/test/test-time64-data-type.rb | 42 +++++++ .../test/test-timestamp-data-type.rb | 42 +++++++ 15 files changed, 820 insertions(+) create mode 100644 ruby/red-arrow/lib/arrow/decimal128-data-type.rb create mode 100644 ruby/red-arrow/lib/arrow/dense-union-data-type.rb create mode 100644 ruby/red-arrow/lib/arrow/dictionary-data-type.rb create mode 100644 ruby/red-arrow/lib/arrow/sparse-union-data-type.rb create mode 100644 ruby/red-arrow/lib/arrow/time32-data-type.rb create mode 100644 ruby/red-arrow/lib/arrow/time64-data-type.rb create mode 100644 ruby/red-arrow/lib/arrow/timestamp-data-type.rb create mode 100644 ruby/red-arrow/test/test-decimal128-data-type.rb create mode 100644 ruby/red-arrow/test/test-dense-union-data-type.rb create mode 100644 ruby/red-arrow/test/test-dictionary-data-type.rb create mode 100644 ruby/red-arrow/test/test-sparse-union-data-type.rb create mode 100644 ruby/red-arrow/test/test-time32-data-type.rb create mode 100644 ruby/red-arrow/test/test-time64-data-type.rb create mode 100644 ruby/red-arrow/test/test-timestamp-data-type.rb diff --git a/ruby/red-arrow/lib/arrow/decimal128-data-type.rb b/ruby/red-arrow/lib/arrow/decimal128-data-type.rb new file mode 100644 index 0000000000000..c97944bf8db76 --- /dev/null +++ b/ruby/red-arrow/lib/arrow/decimal128-data-type.rb @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +module Arrow + class Decimal128DataType + alias_method :initialize_raw, :initialize + private :initialize_raw + + # Creates a new {Arrow::Decimal128DataType}. + # + # @overload initialize(precision, scale) + # + # @param precision [Integer] The precision of the decimal data + # type. It's the number of digits including the number of + # digits after the decimal point. + # + # @param scale [Integer] The scale of the decimal data + # type. It's the number of digits after the decimal point. + # + # @example Create a decimal data type for "XXXXXX.YY" decimal + # Arrow::Decimal128DataType.new(8, 2) + # + # @overload initialize(description) + # + # @param description [Hash] The description of the decimal data + # type. It must have `:precision` and `:scale` values. + # + # @option description [Integer] :precision The precision of the + # decimal data type. It's the number of digits including the + # number of digits after the decimal point. + # + # @option description [Integer] :scale The scale of the decimal + # data type. It's the number of digits after the decimal + # point. + # + # @example Create a decimal data type for "XXXXXX.YY" decimal + # Arrow::Decimal128DataType.new(precision: 8, + # scale: 2) + def initialize(*args) + n_args = args.size + case n_args + when 1 + description = args[0] + precision = description[:precision] + scale = description[:scale] + when 2 + precision, scale = args + else + message = "wrong number of arguments (given, #{n_args}, expected 1..2)" + raise ArgumentError, message + end + initialize_raw(precision, scale) + end + end +end diff --git a/ruby/red-arrow/lib/arrow/dense-union-data-type.rb b/ruby/red-arrow/lib/arrow/dense-union-data-type.rb new file mode 100644 index 0000000000000..740b31331c964 --- /dev/null +++ b/ruby/red-arrow/lib/arrow/dense-union-data-type.rb @@ -0,0 +1,90 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Arrow + class DenseUnionDataType + alias_method :initialize_raw, :initialize + private :initialize_raw + + # Creates a new {Arrow::DenseUnionDataType}. + # + # @overload initialize(fields, type_codes) + # + # @param fields [::Array] The fields of the + # dense union data type. You can mix {Arrow::Field} and field + # description in the fields. + # + # See {Arrow::Field.new} how to specify field description. + # + # @param type_codes [::Array] The IDs that indicates + # corresponding fields. 
+ # + # @example Create a dense union data type for {2: visible, 9: count} + # fields = [ + # Arrow::Field.new("visible", :boolean), + # { + # name: "count", + # type: :int32, + # }, + # ] + # Arrow::DenseUnionDataType.new(fields, [2, 9]) + # + # @overload initialize(description) + # + # @param description [Hash] The description of the dense union + # data type. It must have `:fields` and `:type_codes` values. + # + # @option description [::Array] :fields The + # fields of the dense union data type. You can mix + # {Arrow::Field} and field description in the fields. + # + # See {Arrow::Field.new} how to specify field description. + # + # @option description [::Array] :type_codes The IDs + # that indicates corresponding fields. + # + # @example Create a dense union data type for {2: visible, 9: count} + # fields = [ + # Arrow::Field.new("visible", :boolean), + # { + # name: "count", + # type: :int32, + # }, + # ] + # Arrow::DenseUnionDataType.new(fields: fields, + # type_codes: [2, 9]) + def initialize(*args) + n_args = args.size + case n_args + when 1 + description = args[0] + fields = description[:fields] + type_codes = description[:type_codes] + when 2 + fields, type_codes = args + else + message = "wrong number of arguments (given, #{n_args}, expected 1..2)" + raise ArgumentError, message + end + fields = fields.collect do |field| + field = Field.new(field) unless field.is_a?(Field) + field + end + initialize_raw(fields, type_codes) + end + end +end diff --git a/ruby/red-arrow/lib/arrow/dictionary-data-type.rb b/ruby/red-arrow/lib/arrow/dictionary-data-type.rb new file mode 100644 index 0000000000000..e799fdfac799e --- /dev/null +++ b/ruby/red-arrow/lib/arrow/dictionary-data-type.rb @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Arrow + class DictionaryDataType + alias_method :initialize_raw, :initialize + private :initialize_raw + + # Creates a new {Arrow::DictionaryDataType}. + # + # @overload initialize(index_data_type, dictionary, ordered) + # + # @param index_data_type [Arrow::DataType, Hash, String, Symbol] + # The index data type of the dictionary data type. It must be + # signed integer data types. Here are available signed integer + # data types: + # + # * Arrow::Int8DataType + # * Arrow::Int16DataType + # * Arrow::Int32DataType + # * Arrow::Int64DataType + # + # You can specify data type as a description by `Hash`. + # + # See {Arrow::DataType.resolve} how to specify data type + # description. + # + # @param dictionary [Arrow::Array] The real values of the + # dictionary data type. + # + # @param ordered [Boolean] Whether dictionary contents are + # ordered or not. 
+ # + # @example Create a dictionary data type for {0: "Hello", 1: "World"} + # index_data_type = :int8 + # dictionary = Arrow::StringArray.new(["Hello", "World"]) + # ordered = true + # Arrow::DictionaryDataType.new(index_data_type, + # dictionary, + # ordered) + # + # @overload initialize(description) + # + # @param description [Hash] The description of the dictionary + # data type. It must have `:index_data_type`, `:dictionary` + # and `:ordered` values. + # + # @option description [Arrow::DataType, Hash, String, Symbol] + # :index_data_type The index data type of the dictionary data + # type. It must be signed integer data types. Here are + # available signed integer data types: + # + # * Arrow::Int8DataType + # * Arrow::Int16DataType + # * Arrow::Int32DataType + # * Arrow::Int64DataType + # + # You can specify data type as a description by `Hash`. + # + # See {Arrow::DataType.resolve} how to specify data type + # description. + # + # @option description [Arrow::Array] :dictionary The real values + # of the dictionary data type. + # + # @option description [Boolean] :ordered Whether dictionary + # contents are ordered or not. + # + # @example Create a dictionary data type for {0: "Hello", 1: "World"} + # dictionary = Arrow::StringArray.new(["Hello", "World"]) + # Arrow::DictionaryDataType.new(index_data_type: :int8, + # dictionary: dictionary, + # ordered: true) + def initialize(*args) + n_args = args.size + case n_args + when 1 + description = args[0] + index_data_type = description[:index_data_type] + dictionary = description[:dictionary] + ordered = description[:ordered] + when 3 + index_data_type, dictionary, ordered = args + else + message = "wrong number of arguments (given, #{n_args}, expected 1 or 3)" + raise ArgumentError, message + end + index_data_type = DataType.resolve(index_data_type) + initialize_raw(index_data_type, dictionary, ordered) + end + end +end diff --git a/ruby/red-arrow/lib/arrow/loader.rb b/ruby/red-arrow/lib/arrow/loader.rb index cea98e9a8578e..8747476222955 100644 --- a/ruby/red-arrow/lib/arrow/loader.rb +++ b/ruby/red-arrow/lib/arrow/loader.rb @@ -43,6 +43,9 @@ def require_libraries require "arrow/date32-array-builder" require "arrow/date64-array" require "arrow/date64-array-builder" + require "arrow/decimal128-data-type" + require "arrow/dense-union-data-type" + require "arrow/dictionary-data-type" require "arrow/field" require "arrow/file-output-stream" require "arrow/list-data-type" @@ -54,6 +57,7 @@ def require_libraries require "arrow/rolling-window" require "arrow/schema" require "arrow/slicer" + require "arrow/sparse-union-data-type" require "arrow/struct-array" require "arrow/struct-data-type" require "arrow/table" @@ -63,8 +67,11 @@ def require_libraries require "arrow/table-loader" require "arrow/table-saver" require "arrow/tensor" + require "arrow/time32-data-type" + require "arrow/time64-data-type" require "arrow/timestamp-array" require "arrow/timestamp-array-builder" + require "arrow/timestamp-data-type" require "arrow/writable" end diff --git a/ruby/red-arrow/lib/arrow/sparse-union-data-type.rb b/ruby/red-arrow/lib/arrow/sparse-union-data-type.rb new file mode 100644 index 0000000000000..fb0ddf0909165 --- /dev/null +++ b/ruby/red-arrow/lib/arrow/sparse-union-data-type.rb @@ -0,0 +1,90 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Arrow + class SparseUnionDataType + alias_method :initialize_raw, :initialize + private :initialize_raw + + # Creates a new {Arrow::SparseUnionDataType}. + # + # @overload initialize(fields, type_codes) + # + # @param fields [::Array] The fields of the + # sparse union data type. You can mix {Arrow::Field} and field + # description in the fields. + # + # See {Arrow::Field.new} how to specify field description. + # + # @param type_codes [::Array] The IDs that indicates + # corresponding fields. + # + # @example Create a sparse union data type for {2: visible, 9: count} + # fields = [ + # Arrow::Field.new("visible", :boolean), + # { + # name: "count", + # type: :int32, + # }, + # ] + # Arrow::SparseUnionDataType.new(fields, [2, 9]) + # + # @overload initialize(description) + # + # @param description [Hash] The description of the sparse union + # data type. It must have `:fields` and `:type_codes` values. + # + # @option description [::Array] :fields The + # fields of the sparse union data type. You can mix + # {Arrow::Field} and field description in the fields. + # + # See {Arrow::Field.new} how to specify field description. + # + # @option description [::Array] :type_codes The IDs + # that indicates corresponding fields. + # + # @example Create a sparse union data type for {2: visible, 9: count} + # fields = [ + # Arrow::Field.new("visible", :boolean), + # { + # name: "count", + # type: :int32, + # }, + # ] + # Arrow::SparseUnionDataType.new(fields: fields, + # type_codes: [2, 9]) + def initialize(*args) + n_args = args.size + case n_args + when 1 + description = args[0] + fields = description[:fields] + type_codes = description[:type_codes] + when 2 + fields, type_codes = args + else + message = "wrong number of arguments (given, #{n_args}, expected 1..2)" + raise ArgumentError, message + end + fields = fields.collect do |field| + field = Field.new(field) unless field.is_a?(Field) + field + end + initialize_raw(fields, type_codes) + end + end +end diff --git a/ruby/red-arrow/lib/arrow/time32-data-type.rb b/ruby/red-arrow/lib/arrow/time32-data-type.rb new file mode 100644 index 0000000000000..9e8d955494338 --- /dev/null +++ b/ruby/red-arrow/lib/arrow/time32-data-type.rb @@ -0,0 +1,61 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +module Arrow + class Time32DataType + alias_method :initialize_raw, :initialize + private :initialize_raw + + # Creates a new {Arrow::Time32DataType}. + # + # @overload initialize(unit) + # + # @param unit [Arrow::TimeUnit, Symbol] The unit of the + # time32 data type. + # + # The unit must be second or millisecond. + # + # @example Create a time32 data type with {Arrow::TimeUnit} + # Arrow::Time32DataType.new(Arrow::TimeUnit::MILLI) + # + # @example Create a time32 data type with Symbol + # Arrow::Time32DataType.new(:milli) + # + # @overload initialize(description) + # + # @param description [Hash] The description of the time32 data + # type. It must have `:unit` value. + # + # @option description [Arrow::TimeUnit, Symbol] :unit The unit of + # the time32 data type. + # + # The unit must be second or millisecond. + # + # @example Create a time32 data type with {Arrow::TimeUnit} + # Arrow::Time32DataType.new(unit: Arrow::TimeUnit::MILLI) + # + # @example Create a time32 data type with Symbol + # Arrow::Time32DataType.new(unit: :milli) + def initialize(unit) + if unit.is_a?(Hash) + description = unit + unit = description[:unit] + end + initialize_raw(unit) + end + end +end diff --git a/ruby/red-arrow/lib/arrow/time64-data-type.rb b/ruby/red-arrow/lib/arrow/time64-data-type.rb new file mode 100644 index 0000000000000..ca31a561b43c4 --- /dev/null +++ b/ruby/red-arrow/lib/arrow/time64-data-type.rb @@ -0,0 +1,61 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Arrow + class Time64DataType + alias_method :initialize_raw, :initialize + private :initialize_raw + + # Creates a new {Arrow::Time64DataType}. + # + # @overload initialize(unit) + # + # @param unit [Arrow::TimeUnit, Symbol] The unit of the + # time64 data type. + # + # The unit must be microsecond or nanosecond. + # + # @example Create a time64 data type with {Arrow::TimeUnit} + # Arrow::Time64DataType.new(Arrow::TimeUnit::NANO) + # + # @example Create a time64 data type with Symbol + # Arrow::Time64DataType.new(:nano) + # + # @overload initialize(description) + # + # @param description [Hash] The description of the time64 data + # type. It must have `:unit` value. + # + # @option description [Arrow::TimeUnit, Symbol] :unit The unit of + # the time64 data type. + # + # The unit must be microsecond or nanosecond. 
+ # + # @example Create a time64 data type with {Arrow::TimeUnit} + # Arrow::Time64DataType.new(unit: Arrow::TimeUnit::NANO) + # + # @example Create a time64 data type with Symbol + # Arrow::Time64DataType.new(unit: :nano) + def initialize(unit) + if unit.is_a?(Hash) + description = unit + unit = description[:unit] + end + initialize_raw(unit) + end + end +end diff --git a/ruby/red-arrow/lib/arrow/timestamp-data-type.rb b/ruby/red-arrow/lib/arrow/timestamp-data-type.rb new file mode 100644 index 0000000000000..86ed3e00eadd1 --- /dev/null +++ b/ruby/red-arrow/lib/arrow/timestamp-data-type.rb @@ -0,0 +1,57 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Arrow + class TimestampDataType + alias_method :initialize_raw, :initialize + private :initialize_raw + + # Creates a new {Arrow::TimestampDataType}. + # + # @overload initialize(unit) + # + # @param unit [Arrow::TimeUnit, Symbol] The unit of the + # timestamp data type. + # + # @example Create a timestamp data type with {Arrow::TimeUnit} + # Arrow::TimestampDataType.new(Arrow::TimeUnit::MILLI) + # + # @example Create a timestamp data type with Symbol + # Arrow::TimestampDataType.new(:milli) + # + # @overload initialize(description) + # + # @param description [Hash] The description of the timestamp data + # type. It must have `:unit` value. + # + # @option description [Arrow::TimeUnit, Symbol] :unit The unit of + # the timestamp data type. + # + # @example Create a timestamp data type with {Arrow::TimeUnit} + # Arrow::TimestampDataType.new(unit: Arrow::TimeUnit::MILLI) + # + # @example Create a timestamp data type with Symbol + # Arrow::TimestampDataType.new(unit: :milli) + def initialize(unit) + if unit.is_a?(Hash) + description = unit + unit = description[:unit] + end + initialize_raw(unit) + end + end +end diff --git a/ruby/red-arrow/test/test-decimal128-data-type.rb b/ruby/red-arrow/test/test-decimal128-data-type.rb new file mode 100644 index 0000000000000..6cdd22fff8ea8 --- /dev/null +++ b/ruby/red-arrow/test/test-decimal128-data-type.rb @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +class Decimal128DataTypeTest < Test::Unit::TestCase + sub_test_case(".new") do + test("ordered arguments") do + assert_equal("decimal(8, 2)", + Arrow::Decimal128DataType.new(8, 2).to_s) + end + + test("description") do + assert_equal("decimal(8, 2)", + Arrow::Decimal128DataType.new(precision: 8, + scale: 2).to_s) + end + end +end diff --git a/ruby/red-arrow/test/test-dense-union-data-type.rb b/ruby/red-arrow/test/test-dense-union-data-type.rb new file mode 100644 index 0000000000000..96699e52e45d9 --- /dev/null +++ b/ruby/red-arrow/test/test-dense-union-data-type.rb @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class DenseUnionDataTypeTest < Test::Unit::TestCase + sub_test_case(".new") do + def setup + @fields = [ + Arrow::Field.new("visible", :boolean), + { + name: "count", + type: :int32, + }, + ] + end + + test("ordered arguments") do + assert_equal("union[dense]", + Arrow::DenseUnionDataType.new(@fields, [2, 9]).to_s) + end + + test("description") do + assert_equal("union[dense]", + Arrow::DenseUnionDataType.new(fields: @fields, + type_codes: [2, 9]).to_s) + end + end +end diff --git a/ruby/red-arrow/test/test-dictionary-data-type.rb b/ruby/red-arrow/test/test-dictionary-data-type.rb new file mode 100644 index 0000000000000..be9cd6f301035 --- /dev/null +++ b/ruby/red-arrow/test/test-dictionary-data-type.rb @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
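+
+# A quick sketch of the two construction styles exercised by the
+# assertions below: positional arguments and a description Hash are
+# expected to build equivalent dictionary data types.
+#
+#   dictionary = Arrow::StringArray.new(["Hello", "World"])
+#   Arrow::DictionaryDataType.new(:int8, dictionary, true)
+#   Arrow::DictionaryDataType.new(index_data_type: :int8,
+#                                 dictionary: dictionary,
+#                                 ordered: true)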
+ +class DictionaryDataTypeTest < Test::Unit::TestCase + sub_test_case(".new") do + def setup + @index_data_type = :int8 + @dictionary = Arrow::StringArray.new(["Hello", "World"]) + @ordered = true + end + + test("ordered arguments") do + assert_equal("dictionary", + Arrow::DictionaryDataType.new(@index_data_type, + @dictionary, + @ordered).to_s) + end + + test("description") do + assert_equal("dictionary", + Arrow::DictionaryDataType.new(index_data_type: @index_data_type, + dictionary: @dictionary, + ordered: @ordered).to_s) + end + end +end diff --git a/ruby/red-arrow/test/test-sparse-union-data-type.rb b/ruby/red-arrow/test/test-sparse-union-data-type.rb new file mode 100644 index 0000000000000..4159b42268da9 --- /dev/null +++ b/ruby/red-arrow/test/test-sparse-union-data-type.rb @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class SparseUnionDataTypeTest < Test::Unit::TestCase + sub_test_case(".new") do + def setup + @fields = [ + Arrow::Field.new("visible", :boolean), + { + name: "count", + type: :int32, + }, + ] + end + + test("ordered arguments") do + assert_equal("union[sparse]", + Arrow::SparseUnionDataType.new(@fields, [2, 9]).to_s) + end + + test("description") do + assert_equal("union[sparse]", + Arrow::SparseUnionDataType.new(fields: @fields, + type_codes: [2, 9]).to_s) + end + end +end diff --git a/ruby/red-arrow/test/test-time32-data-type.rb b/ruby/red-arrow/test/test-time32-data-type.rb new file mode 100644 index 0000000000000..26f17359a1223 --- /dev/null +++ b/ruby/red-arrow/test/test-time32-data-type.rb @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
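+
+# A minimal sketch of the unit handling checked below: the unit may be
+# an Arrow::TimeUnit value or a Symbol, passed positionally or via a
+# description Hash with a :unit key (second and millisecond only).
+#
+#   Arrow::Time32DataType.new(:milli).to_s        # => "time32[ms]"
+#   Arrow::Time32DataType.new(unit: :milli).to_s  # => "time32[ms]"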
+ +class Time32DataTypeTest < Test::Unit::TestCase + sub_test_case(".new") do + test("Arrow::TimeUnit") do + assert_equal("time32[ms]", + Arrow::Time32DataType.new(Arrow::TimeUnit::MILLI).to_s) + end + + test("Symbol") do + assert_equal("time32[ms]", + Arrow::Time32DataType.new(:milli).to_s) + end + + test("unit: Arrow::TimeUnit") do + data_type = Arrow::Time32DataType.new(unit: Arrow::TimeUnit::MILLI) + assert_equal("time32[ms]", + data_type.to_s) + end + + test("unit: Symbol") do + data_type = Arrow::Time32DataType.new(unit: :milli) + assert_equal("time32[ms]", + data_type.to_s) + end + end +end diff --git a/ruby/red-arrow/test/test-time64-data-type.rb b/ruby/red-arrow/test/test-time64-data-type.rb new file mode 100644 index 0000000000000..a5f34175398ca --- /dev/null +++ b/ruby/red-arrow/test/test-time64-data-type.rb @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class Time64DataTypeTest < Test::Unit::TestCase + sub_test_case(".new") do + test("Arrow::TimeUnit") do + assert_equal("time64[ns]", + Arrow::Time64DataType.new(Arrow::TimeUnit::NANO).to_s) + end + + test("Symbol") do + assert_equal("time64[ns]", + Arrow::Time64DataType.new(:nano).to_s) + end + + test("unit: Arrow::TimeUnit") do + data_type = Arrow::Time64DataType.new(unit: Arrow::TimeUnit::NANO) + assert_equal("time64[ns]", + data_type.to_s) + end + + test("unit: Symbol") do + data_type = Arrow::Time64DataType.new(unit: :nano) + assert_equal("time64[ns]", + data_type.to_s) + end + end +end diff --git a/ruby/red-arrow/test/test-timestamp-data-type.rb b/ruby/red-arrow/test/test-timestamp-data-type.rb new file mode 100644 index 0000000000000..f8ccd3d8bb8b4 --- /dev/null +++ b/ruby/red-arrow/test/test-timestamp-data-type.rb @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
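+
+# Same pattern as the time32/time64 tests: the timestamp unit may be an
+# Arrow::TimeUnit value or a Symbol, given directly or in a description
+# Hash (mirroring the assertions below).
+#
+#   Arrow::TimestampDataType.new(:milli).to_s        # => "timestamp[ms]"
+#   Arrow::TimestampDataType.new(unit: :milli).to_s  # => "timestamp[ms]"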
+ +class TimestampDataTypeTest < Test::Unit::TestCase + sub_test_case(".new") do + test("Arrow::TimeUnit") do + assert_equal("timestamp[ms]", + Arrow::TimestampDataType.new(Arrow::TimeUnit::MILLI).to_s) + end + + test("Symbol") do + assert_equal("timestamp[ms]", + Arrow::TimestampDataType.new(:milli).to_s) + end + + test("unit: Arrow::TimeUnit") do + data_type = Arrow::TimestampDataType.new(unit: Arrow::TimeUnit::MILLI) + assert_equal("timestamp[ms]", + data_type.to_s) + end + + test("unit: Symbol") do + data_type = Arrow::TimestampDataType.new(unit: :milli) + assert_equal("timestamp[ms]", + data_type.to_s) + end + end +end From b95628f2980fd800efe73ab0e4778dd209f7596c Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Mon, 7 Jan 2019 08:54:59 +0900 Subject: [PATCH 155/328] ARROW-4166: [Ruby] Add support for saving to and loading from buffer Author: Kouhei Sutou Closes #3320 from kou/ruby-table-io-buffer and squashes the following commits: 7025e765 Add support for saving to and loading from buffer --- ruby/red-arrow/lib/arrow/table-loader.rb | 46 ++++-- ruby/red-arrow/lib/arrow/table-saver.rb | 66 +++++---- ruby/red-arrow/test/test-table.rb | 139 ++++++++++++------ .../lib/parquet/arrow-table-loadable.rb | 7 +- .../lib/parquet/arrow-table-savable.rb | 6 +- ruby/red-parquet/test/test-arrow-table.rb | 8 +- 6 files changed, 177 insertions(+), 95 deletions(-) diff --git a/ruby/red-arrow/lib/arrow/table-loader.rb b/ruby/red-arrow/lib/arrow/table-loader.rb index a6ce9a1029bb3..9bfd41042768a 100644 --- a/ruby/red-arrow/lib/arrow/table-loader.rb +++ b/ruby/red-arrow/lib/arrow/table-loader.rb @@ -18,14 +18,14 @@ module Arrow class TableLoader class << self - def load(path, options={}) - new(path, options).load + def load(output, options={}) + new(output, options).load end end - def initialize(path, options={}) - path = path.to_path if path.respond_to?(:to_path) - @path = path + def initialize(output, options={}) + output = output.to_path if output.respond_to?(:to_path) + @output = output @options = options fill_options end @@ -50,7 +50,7 @@ def load __send__(custom_load_method) else # For backward compatibility. 
- __send__(custom_load_method, @path) + __send__(custom_load_method, @output) end end @@ -60,11 +60,15 @@ def fill_options return end - extension = PathExtension.new(@path) - info = extension.extract + if @output.is_a?(Buffer) + info = {} + else + extension = PathExtension.new(@output) + info = extension.extract + end format = info[:format] @options = @options.dup - if respond_to?("load_as_#{format}", true) + if format and respond_to?("load_as_#{format}", true) @options[:format] ||= format.to_sym else @options[:format] ||= :arrow @@ -74,6 +78,14 @@ def fill_options end end + def open_input_stream + if @output.is_a?(Buffer) + BufferInputStream.new(@output) + else + MemoryMappedInputStream.new(@output) + end + end + def load_raw(input, reader) schema = reader.schema chunked_arrays = [] @@ -100,7 +112,7 @@ def load_as_arrow RecordBatchStreamReader, ] reader_class_candidates.each do |reader_class_candidate| - input = MemoryMappedInputStream.new(@path) + input = open_input_stream begin reader = reader_class_candidate.new(input) rescue Arrow::Error @@ -114,20 +126,20 @@ def load_as_arrow end def load_as_batch - input = MemoryMappedInputStream.new(@path) + input = open_input_stream reader = RecordBatchFileReader.new(input) load_raw(input, reader) end def load_as_stream - input = MemoryMappedInputStream.new(@path) + input = open_input_stream reader = RecordBatchStreamReader.new(input) load_raw(input, reader) end if Arrow.const_defined?(:ORCFileReader) def load_as_orc - input = MemoryMappedInputStream.new(@path) + input = open_input_stream reader = ORCFileReader.new(input) field_indexes = @options[:field_indexes] reader.set_field_indexes(field_indexes) if field_indexes @@ -140,11 +152,15 @@ def load_as_orc def load_as_csv options = @options.dup options.delete(:format) - CSVLoader.load(Pathname.new(@path), options) + if @output.is_a?(Buffer) + CSVLoader.load(@output.data.to_s, options) + else + CSVLoader.load(Pathname.new(@output), options) + end end def load_as_feather - input = MemoryMappedInputStream.new(@path) + input = open_input_stream reader = FeatherFileReader.new(input) table = reader.read table.instance_variable_set(:@input, input) diff --git a/ruby/red-arrow/lib/arrow/table-saver.rb b/ruby/red-arrow/lib/arrow/table-saver.rb index 99e6e490532c1..817cc548717d8 100644 --- a/ruby/red-arrow/lib/arrow/table-saver.rb +++ b/ruby/red-arrow/lib/arrow/table-saver.rb @@ -18,15 +18,15 @@ module Arrow class TableSaver class << self - def save(table, path, options={}) - new(table, path, options).save + def save(table, output, options={}) + new(table, output, options).save end end - def initialize(table, path, options={}) + def initialize(table, output, options={}) @table = table - path = path.to_path if path.respond_to?(:to_path) - @path = path + output = output.to_path if output.respond_to?(:to_path) + @output = output @options = options fill_options end @@ -51,7 +51,7 @@ def save __send__(custom_save_method) else # For backward compatibility. 
- __send__(custom_save_method, @path) + __send__(custom_save_method, @output) end end @@ -61,11 +61,15 @@ def fill_options return end - extension = PathExtension.new(@path) - info = extension.extract + if @output.is_a?(Buffer) + info = {} + else + extension = PathExtension.new(@output) + info = extension.extract + end format = info[:format] @options = @options.dup - if respond_to?("save_as_#{format}", true) + if format and respond_to?("save_as_#{format}", true) @options[:format] ||= format.to_sym else @options[:format] ||= :arrow @@ -75,8 +79,30 @@ def fill_options end end + def open_raw_output_stream(&block) + if @output.is_a?(Buffer) + BufferOutputStream.open(@output, &block) + else + FileOutputStream.open(@output, false, &block) + end + end + + def open_output_stream(&block) + compression = @options[:compression] + if compression + codec = Codec.new(compression) + open_raw_output_stream do |raw_output| + CompressedOutputStream.open(codec, raw_output) do |output| + yield(output) + end + end + else + open_raw_output_stream(&block) + end + end + def save_raw(writer_class) - FileOutputStream.open(@path, false) do |output| + open_output_stream do |output| writer_class.open(output, @table.schema) do |writer| writer.write_table(@table) end @@ -95,24 +121,8 @@ def save_as_stream save_raw(RecordBatchStreamWriter) end - def open_output - compression = @options[:compression] - if compression - codec = Codec.new(compression) - FileOutputStream.open(@path, false) do |raw_output| - CompressedOutputStream.open(codec, raw_output) do |output| - yield(output) - end - end - else - ::File.open(@path, "w") do |output| - yield(output) - end - end - end - def save_as_csv - open_output do |output| + open_output_stream do |output| csv = CSV.new(output) names = @table.schema.fields.collect(&:name) csv << names @@ -125,7 +135,7 @@ def save_as_csv end def save_as_feather - FileOutputStream.open(@path, false) do |output| + open_output_stream do |output| FeatherFileWriter.open(output) do |writer| writer.write(@table) end diff --git a/ruby/red-arrow/test/test-table.rb b/ruby/red-arrow/test/test-table.rb index 1576f779ce3b6..2876f762f00bd 100644 --- a/ruby/red-arrow/test/test-table.rb +++ b/ruby/red-arrow/test/test-table.rb @@ -395,83 +395,128 @@ def setup end sub_test_case("#save and .load") do - sub_test_case(":format") do - test("default") do - file = Tempfile.new(["red-arrow", ".arrow"]) - @table.save(file.path) - assert_equal(@table, Arrow::Table.load(file.path)) + module SaveLoadFormatTests + def test_default + output = create_output(".arrow") + @table.save(output) + assert_equal(@table, Arrow::Table.load(output)) end - test(":batch") do - file = Tempfile.new(["red-arrow", ".arrow"]) - @table.save(file.path, :format => :batch) - assert_equal(@table, Arrow::Table.load(file.path, :format => :batch)) + def test_batch + output = create_output(".arrow") + @table.save(output, format: :batch) + assert_equal(@table, Arrow::Table.load(output, format: :batch)) end - test(":stream") do - file = Tempfile.new(["red-arrow", ".arrow"]) - @table.save(file.path, :format => :stream) - assert_equal(@table, Arrow::Table.load(file.path, :format => :stream)) + def test_stream + output = create_output(".arrow") + @table.save(output, format: :stream) + assert_equal(@table, Arrow::Table.load(output, format: :stream)) end - test(":csv") do - file = Tempfile.new(["red-arrow", ".csv"]) - @table.save(file.path, :format => :csv) + def test_csv + output = create_output(".csv") + @table.save(output, format: :csv) assert_equal(@table, - 
Arrow::Table.load(file.path, - :format => :csv, - :schema => @table.schema)) + Arrow::Table.load(output, + format: :csv, + schema: @table.schema)) end - test("csv.gz") do - file = Tempfile.new(["red-arrow", ".csv.gz"]) - @table.save(file.path) + def test_csv_gz + output = create_output(".csv.gz") + @table.save(output, + format: :csv, + compression: :gzip) assert_equal(@table, - Arrow::Table.load(file.path, - :format => :csv, - :compression => :gzip, - :schema => @table.schema)) + Arrow::Table.load(output, + format: :csv, + compression: :gzip, + schema: @table.schema)) end + end + + sub_test_case("path") do + sub_test_case(":format") do + include SaveLoadFormatTests - sub_test_case("load: auto detect") do - test("batch") do - file = Tempfile.new(["red-arrow", ".arrow"]) - @table.save(file.path, :format => :batch) - assert_equal(@table, Arrow::Table.load(file.path)) + def create_output(extension) + @file = Tempfile.new(["red-arrow", extension]) + @file.path end - test("stream") do - file = Tempfile.new(["red-arrow", ".arrow"]) - @table.save(file.path, :format => :stream) - assert_equal(@table, Arrow::Table.load(file.path)) + sub_test_case("save: auto detect") do + test("csv") do + output = create_output(".csv") + @table.save(output) + assert_equal(@table, + Arrow::Table.load(output, + format: :csv, + schema: @table.schema)) + end + + test("csv.gz") do + output = create_output(".csv.gz") + @table.save(output) + assert_equal(@table, + Arrow::Table.load(output, + format: :csv, + compression: :gzip, + schema: @table.schema)) + end end - test("csv") do - path = fixture_path("with-header.csv") - assert_equal(<<-TABLE, Arrow::Table.load(path, skip_lines: /^#/).to_s) + sub_test_case("load: auto detect") do + test("batch") do + output = create_output(".arrow") + @table.save(output, format: :batch) + assert_equal(@table, Arrow::Table.load(output)) + end + + test("stream") do + output = create_output(".arrow") + @table.save(output, format: :stream) + assert_equal(@table, Arrow::Table.load(output)) + end + + test("csv") do + path = fixture_path("with-header.csv") + table = Arrow::Table.load(path, skip_lines: /^\#/) + assert_equal(<<-TABLE, table.to_s) name score 0 alice 10 1 bob 29 2 chris -1 - TABLE - end + TABLE + end - test("csv.gz") do - file = Tempfile.new(["red-arrow", ".csv.gz"]) - Zlib::GzipWriter.wrap(file) do |gz| - gz.write(<<-CSV) + test("csv.gz") do + file = Tempfile.new(["red-arrow", ".csv.gz"]) + Zlib::GzipWriter.wrap(file) do |gz| + gz.write(<<-CSV) name,score alice,10 bob,29 chris,-1 - CSV - end - assert_equal(<<-TABLE, Arrow::Table.load(file.path).to_s) + CSV + end + assert_equal(<<-TABLE, Arrow::Table.load(file.path).to_s) name score 0 alice 10 1 bob 29 2 chris -1 TABLE + end + end + end + end + + sub_test_case("Buffer") do + sub_test_case(":format") do + include SaveLoadFormatTests + + def create_output(extension) + Arrow::ResizableBuffer.new(1024) end end end diff --git a/ruby/red-parquet/lib/parquet/arrow-table-loadable.rb b/ruby/red-parquet/lib/parquet/arrow-table-loadable.rb index 4df527bb8da3b..e3aa1ce0a67bf 100644 --- a/ruby/red-parquet/lib/parquet/arrow-table-loadable.rb +++ b/ruby/red-parquet/lib/parquet/arrow-table-loadable.rb @@ -19,9 +19,12 @@ module Parquet module ArrowTableLoadable private def load_as_parquet - reader = Parquet::ArrowFileReader.new(@path) + input = open_input_stream + reader = Parquet::ArrowFileReader.new(input) reader.use_threads = (@options[:use_threads] != false) - reader.read_table + table = reader.read_table + table.instance_variable_set(:@input, 
input) + table end end end diff --git a/ruby/red-parquet/lib/parquet/arrow-table-savable.rb b/ruby/red-parquet/lib/parquet/arrow-table-savable.rb index 5d96d5f58ec00..7667381867d9a 100644 --- a/ruby/red-parquet/lib/parquet/arrow-table-savable.rb +++ b/ruby/red-parquet/lib/parquet/arrow-table-savable.rb @@ -20,8 +20,10 @@ module ArrowTableSavable private def save_as_parquet chunk_size = @options[:chunk_size] || 1024 # TODO - Parquet::ArrowFileWriter.open(@table.schema, @path) do |writer| - writer.write_table(@table, chunk_size) + open_output_stream do |output| + Parquet::ArrowFileWriter.open(@table.schema, output) do |writer| + writer.write_table(@table, chunk_size) + end end end end diff --git a/ruby/red-parquet/test/test-arrow-table.rb b/ruby/red-parquet/test/test-arrow-table.rb index 258b4173948c3..1a565b64451a8 100644 --- a/ruby/red-parquet/test/test-arrow-table.rb +++ b/ruby/red-parquet/test/test-arrow-table.rb @@ -40,9 +40,15 @@ def setup @table = Arrow::Table.new(schema, [@count_column, @visible_column]) end - def test_save_load + def test_save_load_path tempfile = Tempfile.open(["red-parquet", ".parquet"]) @table.save(tempfile.path) assert_equal(@table, Arrow::Table.load(tempfile.path)) end + + def test_save_load_buffer + buffer = Arrow::ResizableBuffer.new(1024) + @table.save(buffer, format: :parquet) + assert_equal(@table, Arrow::Table.load(buffer, format: :parquet)) + end end From 5fad19185fd224e464c21b00d0cb6fdd04d65b0a Mon Sep 17 00:00:00 2001 From: Chao Sun Date: Mon, 7 Jan 2019 09:07:52 +0900 Subject: [PATCH 156/328] ARROW-4171: [Rust] fix parquet crate release version Author: Chao Sun Closes #3324 from sunchao/ARROW-4171 and squashes the following commits: 9a9fc00e ARROW-4171: fix parquet crate release version --- rust/parquet/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/parquet/Cargo.toml b/rust/parquet/Cargo.toml index 7478992327ddc..e0272ab4f09e1 100644 --- a/rust/parquet/Cargo.toml +++ b/rust/parquet/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet" -version = "0.5.0-SNAPSHOT" +version = "0.12.0-SNAPSHOT" license = "Apache-2.0" description = "Apache Parquet implementation in Rust" homepage = "https://github.com/apache/arrow" From 00026303d4419a457ab3e01126b05b5aacefee8a Mon Sep 17 00:00:00 2001 From: "Bruno P. Kinoshita" Date: Sun, 6 Jan 2019 21:20:25 +1300 Subject: [PATCH 157/328] Fix link to Intel SIMD docs --- docs/source/format/Layout.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/format/Layout.rst b/docs/source/format/Layout.rst index 868a99b34f8d0..69cbf0654900a 100644 --- a/docs/source/format/Layout.rst +++ b/docs/source/format/Layout.rst @@ -659,6 +659,6 @@ Apache Drill Documentation - `Value Vectors`_ .. _least-significant bit (LSB) numbering: https://en.wikipedia.org/wiki/Bit_numbering .. _Intel performance guide: https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors .. _Endianness: https://en.wikipedia.org/wiki/Endianness -.. _SIMD: https://software.intel.com/en-us/node/600110 +.. _SIMD: https://software.intel.com/en-us/cpp-compiler-developer-guide-and-reference-introduction-to-the-simd-data-layout-templates .. _Parquet: https://parquet.apache.org/documentation/latest/ .. 
_Value Vectors: https://drill.apache.org/docs/value-vectors/ From 1eec9e8195716573b04bbe9416d0be2ed3430261 Mon Sep 17 00:00:00 2001 From: Yosuke Shiro Date: Mon, 7 Jan 2019 11:32:35 +0900 Subject: [PATCH 158/328] ARROW-4168: [GLib] Use property to keep GArrowDataType passed in garrow_field_new() This is follow-up of https://github.com/apache/arrow/pull/3197#pullrequestreview-186349753 Author: Yosuke Shiro Author: Kouhei Sutou Closes #3322 from shiro615/glib-use-property-to-keep-data-type and squashes the following commits: 2135b583 Remove needless new lines cc85b1ef Fix indent 46844bc1 Use {class_name}_data_type to unify argument names a6af562a Reuse GARROW_DATA_TYPE(list_data_type) 77117f08 Call g_object_unref() for GArrowDataType cae21658 Use property to keep GArrowDataType in garrow_field_new() --- c_glib/arrow-glib/column.cpp | 5 +- c_glib/arrow-glib/composite-data-type.cpp | 99 ++++++++++++----------- c_glib/arrow-glib/composite-data-type.h | 24 +++--- c_glib/arrow-glib/field.cpp | 67 ++++++++------- c_glib/arrow-glib/field.hpp | 3 +- c_glib/arrow-glib/schema.cpp | 18 ++++- c_glib/gandiva-glib/node.cpp | 1 - 7 files changed, 123 insertions(+), 94 deletions(-) diff --git a/c_glib/arrow-glib/column.cpp b/c_glib/arrow-glib/column.cpp index e3e964f557659..68694b3d67903 100644 --- a/c_glib/arrow-glib/column.cpp +++ b/c_glib/arrow-glib/column.cpp @@ -322,7 +322,10 @@ garrow_column_get_field(GArrowColumn *column) } else { const auto arrow_column = garrow_column_get_raw(column); auto arrow_field = arrow_column->field(); - return garrow_field_new_raw(&arrow_field); + auto data_type = garrow_column_get_data_type(column); + auto field = garrow_field_new_raw(&arrow_field, data_type); + g_object_unref(data_type); + return field; } } diff --git a/c_glib/arrow-glib/composite-data-type.cpp b/c_glib/arrow-glib/composite-data-type.cpp index 599506f269c8c..8046d2e23a31a 100644 --- a/c_glib/arrow-glib/composite-data-type.cpp +++ b/c_glib/arrow-glib/composite-data-type.cpp @@ -92,15 +92,13 @@ garrow_list_data_type_new(GArrowField *field) GArrowField * garrow_list_data_type_get_value_field(GArrowListDataType *list_data_type) { - auto arrow_data_type = - garrow_data_type_get_raw(GARROW_DATA_TYPE(list_data_type)); + auto data_type = GARROW_DATA_TYPE(list_data_type); + auto arrow_data_type = garrow_data_type_get_raw(data_type); auto arrow_list_data_type = static_cast(arrow_data_type.get()); auto arrow_field = arrow_list_data_type->value_field(); - auto field = garrow_field_new_raw(&arrow_field); - - return field; + return garrow_field_new_raw(&arrow_field, data_type); } @@ -143,22 +141,22 @@ garrow_struct_data_type_new(GList *fields) /** * garrow_struct_data_type_get_n_fields: - * @data_type: A #GArrowStructDataType. + * @struct_data_type: A #GArrowStructDataType. * * Returns: The number of fields of the struct data type. * * Since: 0.12.0 */ gint -garrow_struct_data_type_get_n_fields(GArrowStructDataType *data_type) +garrow_struct_data_type_get_n_fields(GArrowStructDataType *struct_data_type) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(struct_data_type)); return arrow_data_type->num_children(); } /** * garrow_struct_data_type_get_fields: - * @data_type: A #GArrowStructDataType. + * @struct_data_type: A #GArrowStructDataType. * * Returns: (transfer full) (element-type GArrowField): * The fields of the struct data type. 
@@ -166,21 +164,23 @@ garrow_struct_data_type_get_n_fields(GArrowStructDataType *data_type) * Since: 0.12.0 */ GList * -garrow_struct_data_type_get_fields(GArrowStructDataType *data_type) +garrow_struct_data_type_get_fields(GArrowStructDataType *struct_data_type) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto data_type = GARROW_DATA_TYPE(struct_data_type); + auto arrow_data_type = garrow_data_type_get_raw(data_type); auto arrow_fields = arrow_data_type->children(); GList *fields = NULL; for (auto arrow_field : arrow_fields) { - fields = g_list_prepend(fields, garrow_field_new_raw(&arrow_field)); + fields = g_list_prepend(fields, + garrow_field_new_raw(&arrow_field, data_type)); } return g_list_reverse(fields); } /** * garrow_struct_data_type_get_field: - * @data_type: A #GArrowStructDataType. + * @struct_data_type: A #GArrowStructDataType. * @i: The index of the target field. * * Returns: (transfer full) (nullable): @@ -189,10 +189,11 @@ garrow_struct_data_type_get_fields(GArrowStructDataType *data_type) * Since: 0.12.0 */ GArrowField * -garrow_struct_data_type_get_field(GArrowStructDataType *data_type, +garrow_struct_data_type_get_field(GArrowStructDataType *struct_data_type, gint i) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto data_type = GARROW_DATA_TYPE(struct_data_type); + auto arrow_data_type = garrow_data_type_get_raw(data_type); if (i < 0) { i += arrow_data_type->num_children(); @@ -206,7 +207,7 @@ garrow_struct_data_type_get_field(GArrowStructDataType *data_type, auto arrow_field = arrow_data_type->child(i); if (arrow_field) { - return garrow_field_new_raw(&arrow_field); + return garrow_field_new_raw(&arrow_field, data_type); } else { return NULL; } @@ -214,7 +215,7 @@ garrow_struct_data_type_get_field(GArrowStructDataType *data_type, /** * garrow_struct_data_type_get_field_by_name: - * @data_type: A #GArrowStructDataType. + * @struct_data_type: A #GArrowStructDataType. * @name: The name of the target field. * * Returns: (transfer full) (nullable): @@ -223,16 +224,17 @@ garrow_struct_data_type_get_field(GArrowStructDataType *data_type, * Since: 0.12.0 */ GArrowField * -garrow_struct_data_type_get_field_by_name(GArrowStructDataType *data_type, +garrow_struct_data_type_get_field_by_name(GArrowStructDataType *struct_data_type, const gchar *name) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto data_type = GARROW_DATA_TYPE(struct_data_type); + auto arrow_data_type = garrow_data_type_get_raw(data_type); auto arrow_struct_data_type = std::static_pointer_cast(arrow_data_type); auto arrow_field = arrow_struct_data_type->GetFieldByName(name); if (arrow_field) { - return garrow_field_new_raw(&arrow_field); + return garrow_field_new_raw(&arrow_field, data_type); } else { return NULL; } @@ -240,7 +242,7 @@ garrow_struct_data_type_get_field_by_name(GArrowStructDataType *data_type, /** * garrow_struct_data_type_get_field_index: - * @data_type: A #GArrowStructDataType. + * @struct_data_type: A #GArrowStructDataType. * @name: The name of the target field. 
* * Returns: The index of the target index in the struct data type @@ -249,10 +251,10 @@ garrow_struct_data_type_get_field_by_name(GArrowStructDataType *data_type, * Since: 0.12.0 */ gint -garrow_struct_data_type_get_field_index(GArrowStructDataType *data_type, +garrow_struct_data_type_get_field_index(GArrowStructDataType *struct_data_type, const gchar *name) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(struct_data_type)); auto arrow_struct_data_type = std::static_pointer_cast(arrow_data_type); @@ -276,22 +278,22 @@ garrow_union_data_type_class_init(GArrowUnionDataTypeClass *klass) /** * garrow_union_data_type_get_n_fields: - * @data_type: A #GArrowUnionDataType. + * @union_data_type: A #GArrowUnionDataType. * * Returns: The number of fields of the union data type. * * Since: 0.12.0 */ gint -garrow_union_data_type_get_n_fields(GArrowUnionDataType *data_type) +garrow_union_data_type_get_n_fields(GArrowUnionDataType *union_data_type) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(union_data_type)); return arrow_data_type->num_children(); } /** * garrow_union_data_type_get_fields: - * @data_type: A #GArrowUnionDataType. + * @union_data_type: A #GArrowUnionDataType. * * Returns: (transfer full) (element-type GArrowField): * The fields of the union data type. @@ -299,21 +301,23 @@ garrow_union_data_type_get_n_fields(GArrowUnionDataType *data_type) * Since: 0.12.0 */ GList * -garrow_union_data_type_get_fields(GArrowUnionDataType *data_type) +garrow_union_data_type_get_fields(GArrowUnionDataType *union_data_type) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto data_type = GARROW_DATA_TYPE(union_data_type); + auto arrow_data_type = garrow_data_type_get_raw(data_type); auto arrow_fields = arrow_data_type->children(); GList *fields = NULL; for (auto arrow_field : arrow_fields) { - fields = g_list_prepend(fields, garrow_field_new_raw(&arrow_field)); + fields = g_list_prepend(fields, + garrow_field_new_raw(&arrow_field, data_type)); } return g_list_reverse(fields); } /** * garrow_union_data_type_get_field: - * @data_type: A #GArrowUnionDataType. + * @union_data_type: A #GArrowUnionDataType. * @i: The index of the target field. * * Returns: (transfer full) (nullable): @@ -322,10 +326,11 @@ garrow_union_data_type_get_fields(GArrowUnionDataType *data_type) * Since: 0.12.0 */ GArrowField * -garrow_union_data_type_get_field(GArrowUnionDataType *data_type, - gint i) +garrow_union_data_type_get_field(GArrowUnionDataType *union_data_type, + gint i) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto data_type = GARROW_DATA_TYPE(union_data_type); + auto arrow_data_type = garrow_data_type_get_raw(data_type); if (i < 0) { i += arrow_data_type->num_children(); @@ -339,7 +344,7 @@ garrow_union_data_type_get_field(GArrowUnionDataType *data_type, auto arrow_field = arrow_data_type->child(i); if (arrow_field) { - return garrow_field_new_raw(&arrow_field); + return garrow_field_new_raw(&arrow_field, data_type); } else { return NULL; } @@ -347,7 +352,7 @@ garrow_union_data_type_get_field(GArrowUnionDataType *data_type, /** * garrow_union_data_type_get_type_codes: - * @data_type: A #GArrowUnionDataType. + * @union_data_type: A #GArrowUnionDataType. * @n_type_codes: (out): The number of type codes. 
* * Returns: (transfer full) (array length=n_type_codes): @@ -358,10 +363,10 @@ garrow_union_data_type_get_field(GArrowUnionDataType *data_type, * Since: 0.12.0 */ guint8 * -garrow_union_data_type_get_type_codes(GArrowUnionDataType *data_type, +garrow_union_data_type_get_type_codes(GArrowUnionDataType *union_data_type, gsize *n_type_codes) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(union_data_type)); auto arrow_union_data_type = std::static_pointer_cast(arrow_data_type); @@ -515,16 +520,16 @@ garrow_dictionary_data_type_new(GArrowDataType *index_data_type, /** * garrow_dictionary_data_type_get_index_data_type: - * @data_type: The #GArrowDictionaryDataType. + * @dictionary_data_type: The #GArrowDictionaryDataType. * * Returns: (transfer full): The #GArrowDataType of index. * * Since: 0.8.0 */ GArrowDataType * -garrow_dictionary_data_type_get_index_data_type(GArrowDictionaryDataType *data_type) +garrow_dictionary_data_type_get_index_data_type(GArrowDictionaryDataType *dictionary_data_type) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(dictionary_data_type)); auto arrow_dictionary_data_type = std::static_pointer_cast(arrow_data_type); auto arrow_index_data_type = arrow_dictionary_data_type->index_type(); @@ -533,16 +538,16 @@ garrow_dictionary_data_type_get_index_data_type(GArrowDictionaryDataType *data_t /** * garrow_dictionary_data_type_get_dictionary: - * @data_type: The #GArrowDictionaryDataType. + * @dictionary_data_type: The #GArrowDictionaryDataType. * * Returns: (transfer full): The dictionary as #GArrowArray. * * Since: 0.8.0 */ GArrowArray * -garrow_dictionary_data_type_get_dictionary(GArrowDictionaryDataType *data_type) +garrow_dictionary_data_type_get_dictionary(GArrowDictionaryDataType *dictionary_data_type) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(dictionary_data_type)); auto arrow_dictionary_data_type = std::static_pointer_cast(arrow_data_type); auto arrow_dictionary = arrow_dictionary_data_type->dictionary(); @@ -551,16 +556,16 @@ garrow_dictionary_data_type_get_dictionary(GArrowDictionaryDataType *data_type) /** * garrow_dictionary_data_type_is_ordered: - * @data_type: The #GArrowDictionaryDataType. + * @dictionary_data_type: The #GArrowDictionaryDataType. * * Returns: Whether dictionary contents are ordered or not. 
* * Since: 0.8.0 */ gboolean -garrow_dictionary_data_type_is_ordered(GArrowDictionaryDataType *data_type) +garrow_dictionary_data_type_is_ordered(GArrowDictionaryDataType *dictionary_data_type) { - auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(dictionary_data_type)); auto arrow_dictionary_data_type = std::static_pointer_cast(arrow_data_type); return arrow_dictionary_data_type->ordered(); diff --git a/c_glib/arrow-glib/composite-data-type.h b/c_glib/arrow-glib/composite-data-type.h index 25e1ac3d94929..f60a9cdeb6911 100644 --- a/c_glib/arrow-glib/composite-data-type.h +++ b/c_glib/arrow-glib/composite-data-type.h @@ -83,17 +83,17 @@ struct _GArrowStructDataTypeClass GArrowStructDataType *garrow_struct_data_type_new (GList *fields); gint -garrow_struct_data_type_get_n_fields(GArrowStructDataType *data_type); +garrow_struct_data_type_get_n_fields(GArrowStructDataType *struct_data_type); GList * -garrow_struct_data_type_get_fields(GArrowStructDataType *data_type); +garrow_struct_data_type_get_fields(GArrowStructDataType *struct_data_type); GArrowField * -garrow_struct_data_type_get_field(GArrowStructDataType *data_type, +garrow_struct_data_type_get_field(GArrowStructDataType *struct_data_type, gint i); GArrowField * -garrow_struct_data_type_get_field_by_name(GArrowStructDataType *data_type, +garrow_struct_data_type_get_field_by_name(GArrowStructDataType *struct_data_type, const gchar *name); gint -garrow_struct_data_type_get_field_index(GArrowStructDataType *data_type, +garrow_struct_data_type_get_field_index(GArrowStructDataType *struct_data_type, const gchar *name); @@ -109,14 +109,14 @@ struct _GArrowUnionDataTypeClass }; gint -garrow_union_data_type_get_n_fields(GArrowUnionDataType *data_type); +garrow_union_data_type_get_n_fields(GArrowUnionDataType *union_data_type); GList * -garrow_union_data_type_get_fields(GArrowUnionDataType *data_type); +garrow_union_data_type_get_fields(GArrowUnionDataType *union_data_type); GArrowField * -garrow_union_data_type_get_field(GArrowUnionDataType *data_type, +garrow_union_data_type_get_field(GArrowUnionDataType *union_data_type, gint i); guint8 * -garrow_union_data_type_get_type_codes(GArrowUnionDataType *data_type, +garrow_union_data_type_get_type_codes(GArrowUnionDataType *union_data_type, gsize *n_type_codes); @@ -172,11 +172,11 @@ garrow_dictionary_data_type_new(GArrowDataType *index_data_type, GArrowArray *dictionary, gboolean ordered); GArrowDataType * -garrow_dictionary_data_type_get_index_data_type(GArrowDictionaryDataType *data_type); +garrow_dictionary_data_type_get_index_data_type(GArrowDictionaryDataType *dictionary_data_type); GArrowArray * -garrow_dictionary_data_type_get_dictionary(GArrowDictionaryDataType *data_type); +garrow_dictionary_data_type_get_dictionary(GArrowDictionaryDataType *dictionary_data_type); gboolean -garrow_dictionary_data_type_is_ordered(GArrowDictionaryDataType *data_type); +garrow_dictionary_data_type_is_ordered(GArrowDictionaryDataType *dictionary_data_type); G_END_DECLS diff --git a/c_glib/arrow-glib/field.cpp b/c_glib/arrow-glib/field.cpp index b989d288ec30f..d74053af48f05 100644 --- a/c_glib/arrow-glib/field.cpp +++ b/c_glib/arrow-glib/field.cpp @@ -37,11 +37,12 @@ G_BEGIN_DECLS typedef struct GArrowFieldPrivate_ { std::shared_ptr field; + GArrowDataType *data_type; } GArrowFieldPrivate; enum { - PROP_0, - PROP_FIELD + PROP_FIELD = 1, + PROP_DATA_TYPE }; G_DEFINE_TYPE_WITH_PRIVATE(GArrowField, @@ -54,11 +55,22 @@ 
G_DEFINE_TYPE_WITH_PRIVATE(GArrowField, GARROW_FIELD(obj))) static void -garrow_field_finalize(GObject *object) +garrow_field_dispose(GObject *object) { - GArrowFieldPrivate *priv; + auto priv = GARROW_FIELD_GET_PRIVATE(object); - priv = GARROW_FIELD_GET_PRIVATE(object); + if (priv->data_type) { + g_object_unref(priv->data_type); + priv->data_type = nullptr; + } + + G_OBJECT_CLASS(garrow_field_parent_class)->dispose(object); +} + +static void +garrow_field_finalize(GObject *object) +{ + auto priv = GARROW_FIELD_GET_PRIVATE(object); priv->field = nullptr; @@ -80,19 +92,9 @@ garrow_field_set_property(GObject *object, priv->field = *static_cast *>(g_value_get_pointer(value)); break; - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + case PROP_DATA_TYPE: + priv->data_type = GARROW_DATA_TYPE(g_value_dup_object(value)); break; - } -} - -static void -garrow_field_get_property(GObject *object, - guint prop_id, - GValue *value, - GParamSpec *pspec) -{ - switch (prop_id) { default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -107,21 +109,27 @@ garrow_field_init(GArrowField *object) static void garrow_field_class_init(GArrowFieldClass *klass) { - GObjectClass *gobject_class; - GParamSpec *spec; - - gobject_class = G_OBJECT_CLASS(klass); + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = garrow_field_dispose; gobject_class->finalize = garrow_field_finalize; gobject_class->set_property = garrow_field_set_property; - gobject_class->get_property = garrow_field_get_property; + GParamSpec *spec; spec = g_param_spec_pointer("field", "Field", "The raw std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); g_object_class_install_property(gobject_class, PROP_FIELD, spec); + + spec = g_param_spec_object("data-type", + "Data type", + "The data type", + GARROW_TYPE_DATA_TYPE, + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DATA_TYPE, spec); } /** @@ -137,7 +145,7 @@ garrow_field_new(const gchar *name, { auto arrow_data_type = garrow_data_type_get_raw(data_type); auto arrow_field = std::make_shared(name, arrow_data_type); - return garrow_field_new_raw(&arrow_field); + return garrow_field_new_raw(&arrow_field, data_type); } /** @@ -157,7 +165,7 @@ garrow_field_new_full(const gchar *name, std::make_shared(name, garrow_data_type_get_raw(data_type), nullable); - return garrow_field_new_raw(&arrow_field); + return garrow_field_new_raw(&arrow_field, data_type); } /** @@ -177,14 +185,13 @@ garrow_field_get_name(GArrowField *field) * garrow_field_get_data_type: * @field: A #GArrowField. * - * Returns: (transfer full): The data type of the field. + * Returns: (transfer none): The data type of the field. 
*/ GArrowDataType * garrow_field_get_data_type(GArrowField *field) { - const auto arrow_field = garrow_field_get_raw(field); - auto type = arrow_field->type(); - return garrow_data_type_new_raw(&type); + auto priv = GARROW_FIELD_GET_PRIVATE(field); + return priv->data_type; } /** @@ -233,10 +240,12 @@ garrow_field_to_string(GArrowField *field) G_END_DECLS GArrowField * -garrow_field_new_raw(std::shared_ptr *arrow_field) +garrow_field_new_raw(std::shared_ptr *arrow_field, + GArrowDataType *data_type) { auto field = GARROW_FIELD(g_object_new(GARROW_TYPE_FIELD, "field", arrow_field, + "data-type", data_type, NULL)); return field; } diff --git a/c_glib/arrow-glib/field.hpp b/c_glib/arrow-glib/field.hpp index e130ad5992409..f8d0d46c97ab4 100644 --- a/c_glib/arrow-glib/field.hpp +++ b/c_glib/arrow-glib/field.hpp @@ -23,5 +23,6 @@ #include -GArrowField *garrow_field_new_raw(std::shared_ptr *arrow_field); +GArrowField *garrow_field_new_raw(std::shared_ptr *arrow_field, + GArrowDataType *data_type); std::shared_ptr garrow_field_get_raw(GArrowField *field); diff --git a/c_glib/arrow-glib/schema.cpp b/c_glib/arrow-glib/schema.cpp index 1affaaede766b..64332419e0972 100644 --- a/c_glib/arrow-glib/schema.cpp +++ b/c_glib/arrow-glib/schema.cpp @@ -21,6 +21,7 @@ # include #endif +#include #include #include #include @@ -173,7 +174,11 @@ garrow_schema_get_field(GArrowSchema *schema, guint i) { const auto arrow_schema = garrow_schema_get_raw(schema); auto arrow_field = arrow_schema->field(i); - return garrow_field_new_raw(&arrow_field); + auto arrow_data_type = arrow_field->type(); + auto data_type = garrow_data_type_new_raw(&arrow_data_type); + auto field = garrow_field_new_raw(&arrow_field, data_type); + g_object_unref(data_type); + return field; } /** @@ -192,7 +197,11 @@ garrow_schema_get_field_by_name(GArrowSchema *schema, if (arrow_field == nullptr) { return NULL; } else { - return garrow_field_new_raw(&arrow_field); + auto arrow_data_type = arrow_field->type(); + auto data_type = garrow_data_type_new_raw(&arrow_data_type); + auto field = garrow_field_new_raw(&arrow_field, data_type); + g_object_unref(data_type); + return field; } } @@ -223,7 +232,10 @@ garrow_schema_get_fields(GArrowSchema *schema) GList *fields = NULL; for (auto arrow_field : arrow_schema->fields()) { - GArrowField *field = garrow_field_new_raw(&arrow_field); + auto arrow_data_type = arrow_field->type(); + auto data_type = garrow_data_type_new_raw(&arrow_data_type); + auto field = garrow_field_new_raw(&arrow_field, data_type); + g_object_unref(data_type); fields = g_list_prepend(fields, field); } diff --git a/c_glib/gandiva-glib/node.cpp b/c_glib/gandiva-glib/node.cpp index 709836524d848..2c68cbeabe330 100644 --- a/c_glib/gandiva-glib/node.cpp +++ b/c_glib/gandiva-glib/node.cpp @@ -1200,7 +1200,6 @@ ggandiva_field_node_new_raw(std::shared_ptr *gandiva_node, "field", field, "return-type", return_type, NULL); - g_object_unref(return_type); return GGANDIVA_FIELD_NODE(field_node); } From 16460d3b90f194c1212ec0b709b2a8171360ef54 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Mon, 7 Jan 2019 10:32:00 +0100 Subject: [PATCH 159/328] ARROW-4173: Fix JIRA library name in error message Author: Kouhei Sutou Closes #3326 from kou/dev-fix-jira-library-name and squashes the following commits: a16654dc Fix JIRA library name in error message --- dev/merge_arrow_pr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/merge_arrow_pr.py b/dev/merge_arrow_pr.py index 3d6ca31476ee3..5a926f5f6d17a 100755 --- 
a/dev/merge_arrow_pr.py +++ b/dev/merge_arrow_pr.py @@ -47,8 +47,8 @@ try: import jira.client except ImportError: - print("Could not find jira-python library. " - "Run 'sudo pip install jira-python' to install.") + print("Could not find jira library. " + "Run 'sudo pip install jira' to install.") print("Exiting without trying to close the associated JIRA.") sys.exit(1) From 84e10b69a8043f507eabc7b3f224a265baa33a1a Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Mon, 7 Jan 2019 19:58:39 +0900 Subject: [PATCH 160/328] ARROW-4174: [Ruby] Add support for building composite array from raw Ruby objects Author: Kouhei Sutou Closes #3327 from kou/ruby-array-builder and squashes the following commits: 20e5874c Add support for old GObject Introspection 36b993ba Add support for building composite array from raw Ruby objects --- c_glib/arrow-glib/array-builder.cpp | 21 +++ c_glib/arrow-glib/array-builder.h | 3 + c_glib/arrow-glib/decimal128.cpp | 18 +++ c_glib/arrow-glib/decimal128.h | 4 + ruby/red-arrow/lib/arrow/array.rb | 8 +- .../lib/arrow/decimal128-array-builder.rb | 64 ++++++++ ruby/red-arrow/lib/arrow/field.rb | 2 +- .../red-arrow/lib/arrow/list-array-builder.rb | 86 +++++++++++ ruby/red-arrow/lib/arrow/loader.rb | 5 +- .../lib/arrow/struct-array-builder.rb | 129 ++++++++++++++++ .../test/test-decimal128-array-builder.rb | 95 ++++++++++++ ruby/red-arrow/test/test-decimal128-array.rb | 38 +++++ .../red-arrow/test/test-list-array-builder.rb | 62 ++++++++ ruby/red-arrow/test/test-list-array.rb | 32 ++++ .../test/test-struct-array-builder.rb | 145 ++++++++++++++++++ ruby/red-arrow/test/test-struct-array.rb | 21 +++ 16 files changed, 728 insertions(+), 5 deletions(-) create mode 100644 ruby/red-arrow/lib/arrow/decimal128-array-builder.rb create mode 100644 ruby/red-arrow/lib/arrow/list-array-builder.rb create mode 100644 ruby/red-arrow/lib/arrow/struct-array-builder.rb create mode 100644 ruby/red-arrow/test/test-decimal128-array-builder.rb create mode 100644 ruby/red-arrow/test/test-decimal128-array.rb create mode 100644 ruby/red-arrow/test/test-list-array-builder.rb create mode 100644 ruby/red-arrow/test/test-list-array.rb create mode 100644 ruby/red-arrow/test/test-struct-array-builder.rb diff --git a/c_glib/arrow-glib/array-builder.cpp b/c_glib/arrow-glib/array-builder.cpp index 5f2d4119ce6a2..095c68d87689d 100644 --- a/c_glib/arrow-glib/array-builder.cpp +++ b/c_glib/arrow-glib/array-builder.cpp @@ -3863,6 +3863,27 @@ garrow_decimal128_array_builder_append_value(GArrowDecimal128ArrayBuilder *build "[decimal128-array-builder][append-value]"); } +/** + * garrow_decimal128_array_builder_append_null: + * @builder: A #GArrowDecimal128ArrayBuilder. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * It appends a new NULL element. 
+ * + * Since: 0.12.0 + */ +gboolean +garrow_decimal128_array_builder_append_null(GArrowDecimal128ArrayBuilder *builder, + GError **error) +{ + return garrow_array_builder_append_null + (GARROW_ARRAY_BUILDER(builder), + error, + "[decimal128-array-builder][append-null]"); +} + G_END_DECLS GArrowArrayBuilder * diff --git a/c_glib/arrow-glib/array-builder.h b/c_glib/arrow-glib/array-builder.h index b2ad6f4bfd3fd..bc0a99429b8f1 100644 --- a/c_glib/arrow-glib/array-builder.h +++ b/c_glib/arrow-glib/array-builder.h @@ -1486,5 +1486,8 @@ GARROW_AVAILABLE_IN_0_12 gboolean garrow_decimal128_array_builder_append_value(GArrowDecimal128ArrayBuilder *builder, GArrowDecimal128 *value, GError **error); +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_decimal128_array_builder_append_null(GArrowDecimal128ArrayBuilder *builder, + GError **error); G_END_DECLS diff --git a/c_glib/arrow-glib/decimal128.cpp b/c_glib/arrow-glib/decimal128.cpp index e30eb7ee58638..d87a5019c1203 100644 --- a/c_glib/arrow-glib/decimal128.cpp +++ b/c_glib/arrow-glib/decimal128.cpp @@ -136,6 +136,24 @@ garrow_decimal128_new_integer(const gint64 data) return garrow_decimal128_new_raw(&arrow_decimal); } +/** + * garrow_decimal128_equal: + * @decimal: A #GArrowDecimal128. + * @other_decimal: A #GArrowDecimal128 to be compared. + * + * Returns: %TRUE if both of them is the same value, %FALSE otherwise. + * + * Since: 0.12.0 + */ +gboolean +garrow_decimal128_equal(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal) +{ + const auto arrow_decimal = garrow_decimal128_get_raw(decimal); + const auto arrow_other_decimal = garrow_decimal128_get_raw(other_decimal); + return *arrow_decimal == *arrow_other_decimal; +} + /** * garrow_decimal128_to_string_scale: * @decimal: A #GArrowDecimal128. diff --git a/c_glib/arrow-glib/decimal128.h b/c_glib/arrow-glib/decimal128.h index 918cf3d49b4d2..e8fa59980cd94 100644 --- a/c_glib/arrow-glib/decimal128.h +++ b/c_glib/arrow-glib/decimal128.h @@ -20,6 +20,7 @@ #pragma once #include +#include G_BEGIN_DECLS @@ -37,6 +38,9 @@ struct _GArrowDecimal128Class GArrowDecimal128 *garrow_decimal128_new_string(const gchar *data); GArrowDecimal128 *garrow_decimal128_new_integer(const gint64 data); +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_decimal128_equal(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal); gchar *garrow_decimal128_to_string_scale(GArrowDecimal128 *decimal, gint32 scale); gchar *garrow_decimal128_to_string(GArrowDecimal128 *decimal); diff --git a/ruby/red-arrow/lib/arrow/array.rb b/ruby/red-arrow/lib/arrow/array.rb index 049224154dca3..359e70e007bdd 100644 --- a/ruby/red-arrow/lib/arrow/array.rb +++ b/ruby/red-arrow/lib/arrow/array.rb @@ -21,12 +21,14 @@ class Array class << self def new(*args) - return super if args.size != 1 - builder_class_name = "#{name}Builder" if const_defined?(builder_class_name) builder_class = const_get(builder_class_name) - builder_class.build(*args) + if args.size == builder_class.method(:build).arity + builder_class.build(*args) + else + super + end else super end diff --git a/ruby/red-arrow/lib/arrow/decimal128-array-builder.rb b/ruby/red-arrow/lib/arrow/decimal128-array-builder.rb new file mode 100644 index 0000000000000..9a849d487571e --- /dev/null +++ b/ruby/red-arrow/lib/arrow/decimal128-array-builder.rb @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +require "bigdecimal" + +module Arrow + class Decimal128ArrayBuilder + class << self + def build(data_type, values) + builder = new(data_type) + builder.build(values) + end + end + + alias_method :append_value_raw, :append_value + def append_value(value) + case value + when nil + return append_null + when String + value = Decimal128.new(value) + when Float + value = Decimal128.new(value.to_s) + when BigDecimal + value = Decimal128.new(value.to_s) + end + append_value_raw(value) + end + + def append_values(values, is_valids=nil) + if is_valids + is_valids.each_with_index do |is_valid, i| + if is_valid + append_value(values[i]) + else + append_null + end + end + else + values.each do |value| + if value.nil? + append_null + else + append_value(value) + end + end + end + end + end +end diff --git a/ruby/red-arrow/lib/arrow/field.rb b/ruby/red-arrow/lib/arrow/field.rb index 8c7c8eaa005cb..599ff30975985 100644 --- a/ruby/red-arrow/lib/arrow/field.rb +++ b/ruby/red-arrow/lib/arrow/field.rb @@ -108,7 +108,7 @@ def initialize(*args) name = args[0] data_type = DataType.resolve(args[1]) else - message = "wrong number of arguments (given, #{n_args}, expected 1..2)" + message = "wrong number of arguments (given #{n_args}, expected 1..2)" raise ArgumentError, message end diff --git a/ruby/red-arrow/lib/arrow/list-array-builder.rb b/ruby/red-arrow/lib/arrow/list-array-builder.rb new file mode 100644 index 0000000000000..aa093c2de9b5c --- /dev/null +++ b/ruby/red-arrow/lib/arrow/list-array-builder.rb @@ -0,0 +1,86 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Arrow + class ListArrayBuilder + class << self + def build(data_type, values) + builder = new(data_type) + builder.build(values) + end + end + + alias_method :append_value_raw, :append_value + + # @overload append_value + # + # Starts appending a list record. You also need to append list + # value by {#value_builder}. + # + # @overload append_value(list) + # + # Appends a list record including list value. + # + # @param value [nil, ::Array] The list value of the record. + # + # If this is `nil`, the list record is null. + # + # If this is `Array`, it's the list value of the record. 
+ # + # @since 0.12.0 + def append_value(*args) + n_args = args.size + + case n_args + when 0 + append_value_raw + when 1 + value = args[0] + case value + when nil + append_null + when ::Array + append_value_raw + @value_builder ||= value_builder + @value_builder.append_values(value, nil) + else + message = "list value must be nil or Array: #{value.inspect}" + raise ArgumentError, message + end + else + message = "wrong number of arguments (given #{n_args}, expected 0..1)" + raise ArgumentError, message + end + end + + def append_values(lists, is_valids=nil) + if is_valids + is_valids.each_with_index do |is_valid, i| + if is_valid + append_value(lists[i]) + else + append_null + end + end + else + lists.each do |list| + append_value(list) + end + end + end + end +end diff --git a/ruby/red-arrow/lib/arrow/loader.rb b/ruby/red-arrow/lib/arrow/loader.rb index 8747476222955..acd2573e3218f 100644 --- a/ruby/red-arrow/lib/arrow/loader.rb +++ b/ruby/red-arrow/lib/arrow/loader.rb @@ -43,11 +43,13 @@ def require_libraries require "arrow/date32-array-builder" require "arrow/date64-array" require "arrow/date64-array-builder" + require "arrow/decimal128-array-builder" require "arrow/decimal128-data-type" require "arrow/dense-union-data-type" require "arrow/dictionary-data-type" require "arrow/field" require "arrow/file-output-stream" + require "arrow/list-array-builder" require "arrow/list-data-type" require "arrow/path-extension" require "arrow/record" @@ -59,6 +61,7 @@ def require_libraries require "arrow/slicer" require "arrow/sparse-union-data-type" require "arrow/struct-array" + require "arrow/struct-array-builder" require "arrow/struct-data-type" require "arrow/table" require "arrow/table-formatter" @@ -101,7 +104,7 @@ def load_method_info(info, klass, method_name) end super(info, klass, method_name) else - super + super end end end diff --git a/ruby/red-arrow/lib/arrow/struct-array-builder.rb b/ruby/red-arrow/lib/arrow/struct-array-builder.rb new file mode 100644 index 0000000000000..883ce84da7de7 --- /dev/null +++ b/ruby/red-arrow/lib/arrow/struct-array-builder.rb @@ -0,0 +1,129 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Arrow + class StructArrayBuilder + class << self + def build(data_type, values) + builder = new(data_type) + builder.build(values) + end + end + + def [](index_or_name) + find_field_builder(index_or_name) + end + + def find_field_builder(index_or_name) + case index_or_name + when String, Symbol + name = index_or_name + (@name_to_builder ||= build_name_to_builder)[name.to_s] + else + index = index_or_name + cached_field_builders[index] + end + end + + alias_method :append_value_raw, :append_value + + # @overload append_value + # + # Starts appending a struct record. You need to append values of + # fields. 
+ # + # @overload append_value(value) + # + # Appends a struct record including values of fields. + # + # @param value [nil, ::Array, Hash] The struct record value. + # + # If this is `nil`, the struct record is null. + # + # If this is `Array` or `Hash`, they are values of fields. + # + # @since 0.12.0 + def append_value(*args) + n_args = args.size + + case n_args + when 0 + append_value_raw + when 1 + value = args[0] + case value + when nil + append_null + when ::Array + append_value_raw + value.each_with_index do |sub_value, i| + self[i].append_value(sub_value) + end + when Hash + append_value_raw + value.each do |name, sub_value| + self[name].append_value(sub_value) + end + else + message = "struct value must be nil, Array or Hash: #{value.inspect}" + raise ArgumentError, message + end + else + message = "wrong number of arguments (given #{n_args}, expected 0..1)" + raise ArgumentError, message + end + end + + def append_values(values, is_valids=nil) + if is_valids + is_valids.each_with_index do |is_valid, i| + if is_valid + append_value(values[i]) + else + append_null + end + end + else + values.each do |value| + append_value(value) + end + end + end + + alias_method :append_null_raw, :append_null + def append_null + append_null_raw + cached_field_builders.each do |builder| + builder.append_null + end + end + + private + def cached_field_builders + @field_builders ||= field_builders + end + + def build_name_to_builder + name_to_builder = {} + builders = cached_field_builders + value_data_type.fields.each_with_index do |field, i| + name_to_builder[field.name] = builders[i] + end + name_to_builder + end + end +end diff --git a/ruby/red-arrow/test/test-decimal128-array-builder.rb b/ruby/red-arrow/test/test-decimal128-array-builder.rb new file mode 100644 index 0000000000000..841846490b792 --- /dev/null +++ b/ruby/red-arrow/test/test-decimal128-array-builder.rb @@ -0,0 +1,95 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class Decimal128ArrayBuilderTest < Test::Unit::TestCase + def setup + @data_type = Arrow::Decimal128DataType.new(8, 2) + @builder = Arrow::Decimal128ArrayBuilder.new(@data_type) + end + + sub_test_case("#append_value") do + test("nil") do + @builder.append_value(nil) + array = @builder.finish + assert_equal(nil, array[0]) + end + + test("Arrow::Decimal128") do + @builder.append_value(Arrow::Decimal128.new("10.1")) + array = @builder.finish + assert_equal(Arrow::Decimal128.new("10.1"), + array[0]) + end + + test("String") do + @builder.append_value("10.1") + array = @builder.finish + assert_equal(Arrow::Decimal128.new("10.1"), + array[0]) + end + + test("Float") do + @builder.append_value(10.1) + array = @builder.finish + assert_equal(Arrow::Decimal128.new("10.1"), + array[0]) + end + + test("BigDecimal") do + @builder.append_value(BigDecimal("10.1")) + array = @builder.finish + assert_equal(Arrow::Decimal128.new("10.1"), + array[0]) + end + end + + sub_test_case("#append_values") do + test("mixed") do + @builder.append_values([ + Arrow::Decimal128.new("10.1"), + nil, + "10.1", + 10.1, + BigDecimal("10.1"), + ]) + array = @builder.finish + assert_equal([ + Arrow::Decimal128.new("10.1"), + nil, + Arrow::Decimal128.new("10.1"), + Arrow::Decimal128.new("10.1"), + Arrow::Decimal128.new("10.1"), + ], + array.to_a) + end + + test("is_valids") do + @builder.append_values([ + Arrow::Decimal128.new("10.1"), + nil, + Arrow::Decimal128.new("10.1"), + ]) + array = @builder.finish + assert_equal([ + Arrow::Decimal128.new("10.1"), + nil, + Arrow::Decimal128.new("10.1"), + ], + array.to_a) + end + end +end diff --git a/ruby/red-arrow/test/test-decimal128-array.rb b/ruby/red-arrow/test/test-decimal128-array.rb new file mode 100644 index 0000000000000..9162be8b4cf13 --- /dev/null +++ b/ruby/red-arrow/test/test-decimal128-array.rb @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class Decimal128ArrayTest < Test::Unit::TestCase + sub_test_case(".new") do + test("build") do + data_type = Arrow::Decimal128DataType.new(8, 2) + values = [ + 10.1, + nil, + "10.1", + BigDecimal("10.1"), + ] + array = Arrow::Decimal128Array.new(data_type, values) + assert_equal([ + Arrow::Decimal128.new("10.1"), + nil, + Arrow::Decimal128.new("10.1"), + Arrow::Decimal128.new("10.1"), + ], + array.to_a) + end + end +end diff --git a/ruby/red-arrow/test/test-list-array-builder.rb b/ruby/red-arrow/test/test-list-array-builder.rb new file mode 100644 index 0000000000000..e36f2c8340be4 --- /dev/null +++ b/ruby/red-arrow/test/test-list-array-builder.rb @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class ListArrayBuilderTest < Test::Unit::TestCase + def setup + @data_type = Arrow::ListDataType.new(name: "visible", type: :boolean) + @builder = Arrow::ListArrayBuilder.new(@data_type) + end + + sub_test_case("#append_value") do + test("nil") do + @builder.append_value(nil) + array = @builder.finish + assert_equal(nil, array[0]) + end + + test("Array") do + @builder.append_value([true, false, true]) + array = @builder.finish + assert_equal([true, false, true], array[0].to_a) + end + end + + sub_test_case("#append_values") do + test("[nil, Array]") do + @builder.append_values([[false], nil, [true, false, true]]) + array = @builder.finish + assert_equal([ + [false], + nil, + [true, false, true], + ], + array.collect {|list| list ? list.to_a : nil}) + end + + test("is_valids") do + @builder.append_values([[false], [true, true], [true, false, true]], + [true, false, true]) + array = @builder.finish + assert_equal([ + [false], + nil, + [true, false, true], + ], + array.collect {|list| list ? list.to_a : nil}) + end + end +end diff --git a/ruby/red-arrow/test/test-list-array.rb b/ruby/red-arrow/test/test-list-array.rb new file mode 100644 index 0000000000000..c1f762492e4ef --- /dev/null +++ b/ruby/red-arrow/test/test-list-array.rb @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class ListArrayTest < Test::Unit::TestCase + sub_test_case(".new") do + test("build") do + data_type = Arrow::ListDataType.new(name: "visible", type: :boolean) + values = [ + [true, false], + nil, + [false, true, false], + ] + array = Arrow::ListArray.new(data_type, values) + assert_equal(values, + array.collect {|value| value ? value.to_a : nil}) + end + end +end diff --git a/ruby/red-arrow/test/test-struct-array-builder.rb b/ruby/red-arrow/test/test-struct-array-builder.rb new file mode 100644 index 0000000000000..205564c816c30 --- /dev/null +++ b/ruby/red-arrow/test/test-struct-array-builder.rb @@ -0,0 +1,145 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class StructArrayBuilderTest < Test::Unit::TestCase + def setup + @data_type = Arrow::StructDataType.new(visible: {type: :boolean}, + count: {type: :uint64}) + @builder = Arrow::StructArrayBuilder.new(@data_type) + end + + sub_test_case("#append_value") do + test("nil") do + @builder.append_value(nil) + array = @builder.finish + assert_equal([ + [nil], + [nil], + ], + [ + array[0].to_a, + array[1].to_a, + ]) + end + + test("Array") do + @builder.append_value([true, 1]) + array = @builder.finish + assert_equal([ + [true], + [1], + ], + [ + array[0].to_a, + array[1].to_a, + ]) + end + + test("Hash") do + @builder.append_value(count: 1, visible: true) + array = @builder.finish + assert_equal([ + [true], + [1], + ], + [ + array[0].to_a, + array[1].to_a, + ]) + end + end + + sub_test_case("#append_values") do + test("[nil]") do + @builder.append_values([nil]) + array = @builder.finish + assert_equal([ + [nil], + [nil], + ], + [ + array[0].to_a, + array[1].to_a, + ]) + end + + test("[Array]") do + @builder.append_values([[true, 1]]) + array = @builder.finish + assert_equal([ + [true], + [1], + ], + [ + array[0].to_a, + array[1].to_a, + ]) + end + + test("[Hash]") do + @builder.append_values([{count: 1, visible: true}]) + array = @builder.finish + assert_equal([ + [true], + [1], + ], + [ + array[0].to_a, + array[1].to_a, + ]) + end + + test("[nil, Array, Hash]") do + @builder.append_values([ + nil, + [true, 1], + {count: 2, visible: false}, + ]) + array = @builder.finish + assert_equal([ + [nil, true, false], + [nil, 1, 2], + ], + [ + array[0].to_a, + array[1].to_a, + ]) + end + + test("is_valids") do + @builder.append_values([ + [true, 1], + [false, 2], + [true, 3], + ], + [ + true, + false, + true, + ]) + array = @builder.finish + assert_equal([ + [true, nil, true], + [1, nil, 3], + ], + [ + array[0].to_a, + array[1].to_a, + ]) + end + end +end diff --git a/ruby/red-arrow/test/test-struct-array.rb b/ruby/red-arrow/test/test-struct-array.rb index 1957db4d1fd5a..986b0a9db1696 100644 --- a/ruby/red-arrow/test/test-struct-array.rb +++ b/ruby/red-arrow/test/test-struct-array.rb @@ -16,6 +16,27 @@ # under the License. class StructArrayTest < Test::Unit::TestCase + sub_test_case(".new") do + test("build") do + data_type = Arrow::StructDataType.new(visible: :boolean, + count: :uint64) + values = [ + [true, 1], + nil, + [false, 2], + ] + array = Arrow::StructArray.new(data_type, values) + assert_equal([ + [true, nil, false], + [1, nil, 2], + ], + [ + array[0].to_a, + array[1].to_a, + ]) + end + end + test("#[]") do type = Arrow::StructDataType.new([ Arrow::Field.new("field1", :boolean), From ed1d60d0e459108b23ce4ff9bc9129a005058ece Mon Sep 17 00:00:00 2001 From: Siddharth Dave Date: Mon, 7 Jan 2019 10:42:50 -0600 Subject: [PATCH 161/328] ARROW-3544: [Gandiva] [C++] Create function registry in multiple compilation units to reduce build times Refactored function_registry into separate files. 
The function signatures are now split across 3 different files viz. function_registry.cc, function_registry_binaryfn.cc & function_registry_unaryfn.cc. This approach reduces the build times on my setup with little refactoring. I looked into tensorflow as well, as Wes had suggested; however, tensorflow heavily utilizes a template-based approach, which is quite different from this & may require quite a bit of refactoring in gandiva. Author: Siddharth Dave Closes #3051 from siddharthdave/master and squashes the following commits: 1d75ac7f6 ARROW-3544: Extremely long compile time for function_registry.cc in release mode on clang 6 removed individual classes for each registry. removed unused header files. cleaned up code. ab93602f3 ARROW-3544: Extremely long compile time for function_registry.cc in release mode on clang 6 fixed lint errors with make format 8161eddb5 ARROW-3544: Extremely long compile time for function_registry.cc in release mode on clang 6 added a note in function_registry_common.h that it's for internal use. 60954a038 ARROW-3544: Extremely long compile time for function_registry.cc in release mode on clang 6 > Replaced STRINGIFY with ARROW_STRINGIFY 241b6340c ARROW-3544: Extremely long compile time for function_registry.cc in release mode on clang 6 > Yet another refactor of macros > removed redundant comments faeffeef4 ARROW-3544: Extremely long compile time for function_registry.cc in release mode on clang 6 1. incorporated review comments suggested by ravindra 2. refactored code a bit & removed unused includes etc. 508b7835e ARROW-3544: Extremely long compile time for function_registry.cc in release mode on clang 6 Fixed make check-format errors. b8176dd40 ARROW-3544: Extremely long compile time for function_registry.cc in release mode on clang 6 Incorporated review comments: 1. removed duplicate code/macros & moved it into a new header file function_registry_common.h 2. added separate classes for holding different function types 3. during initialization, the map is populated by individually populating from these classes into 1 common map. 1788fb32e ARROW-3544: Extremely long compile time for function_registry.cc in release mode on clang 6 Fixing whitespace issue reported during 'make lint' 2cdb6df58 ARROW-3544: Extremely long compile time for function_registry.cc in release mode on clang 6 Refactored function_registry into separate files. The function signatures are now split across 3 different files viz. function_registry.cc, function_registry_binaryfn.cc & function_registry_unaryfn.cc. This approach reduces the build times on my setup with little refactoring.
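For readers skimming the long diff below, here is a minimal, self-contained sketch of the pattern this patch introduces, assuming simplified names: the stand-in NativeFunction struct, the sample entries, and the two-slice list are illustrative only, while the real per-category getters and InitPCMap() appear in the function_registry*.cc changes that follow. Each function category lives in its own translation unit and returns its slice of the registry; the central registry concatenates the slices and indexes them once, so no single .cc file has to expand every signature macro.

#include <string>
#include <unordered_map>
#include <vector>

// Simplified stand-in for gandiva::NativeFunction; the real class also carries
// the full signature, null-handling kind and pre-compiled function name.
struct NativeFunction {
  std::string name;
};

// One getter per compilation unit; only that .cc file expands its macros.
static std::vector<NativeFunction> GetArithmeticFunctionRegistry() {
  return {{"add_int32_int32"}, {"subtract_int32_int32"}};
}
static std::vector<NativeFunction> GetMathOpsFunctionRegistry() {
  return {{"cbrt_float64"}, {"log10_float64"}};
}

// Central registry: concatenate the slices, then build one lookup map
// (mirroring what InitPCMap() does in the patch below).
class FunctionRegistry {
 public:
  FunctionRegistry() {
    for (auto getter : {&GetArithmeticFunctionRegistry, &GetMathOpsFunctionRegistry}) {
      auto slice = getter();
      functions_.insert(functions_.end(), slice.begin(), slice.end());
    }
    // Pointers into functions_ stay valid because the vector is complete here.
    for (auto& fn : functions_) {
      by_name_.emplace(fn.name, &fn);
    }
  }

  const NativeFunction* Lookup(const std::string& name) const {
    auto it = by_name_.find(name);
    return it == by_name_.end() ? nullptr : it->second;
  }

 private:
  std::vector<NativeFunction> functions_;
  std::unordered_map<std::string, const NativeFunction*> by_name_;
};

int main() {
  FunctionRegistry registry;
  return registry.Lookup("add_int32_int32") != nullptr ? 0 : 1;
}

The gain is purely a build-time one: the heavy macro expansion is spread across several smaller compilation units instead of one very large function_registry.cc, while lookup behaviour stays the same.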
--- cpp/src/arrow/test-util.h | 48 +- cpp/src/arrow/util/macros.h | 2 + cpp/src/gandiva/CMakeLists.txt | 6 + cpp/src/gandiva/function_registry.cc | 440 ++---------------- cpp/src/gandiva/function_registry.h | 25 +- .../gandiva/function_registry_arithmetic.cc | 78 ++++ .../gandiva/function_registry_arithmetic.h | 30 ++ cpp/src/gandiva/function_registry_common.h | 218 +++++++++ cpp/src/gandiva/function_registry_datetime.cc | 65 +++ cpp/src/gandiva/function_registry_datetime.h | 30 ++ cpp/src/gandiva/function_registry_hash.cc | 53 +++ cpp/src/gandiva/function_registry_hash.h | 30 ++ cpp/src/gandiva/function_registry_math_ops.cc | 67 +++ cpp/src/gandiva/function_registry_math_ops.h | 30 ++ cpp/src/gandiva/function_registry_string.cc | 50 ++ cpp/src/gandiva/function_registry_string.h | 30 ++ .../function_registry_timestamp_arithmetic.cc | 81 ++++ .../function_registry_timestamp_arithmetic.h | 30 ++ cpp/src/gandiva/native_function.h | 4 +- 19 files changed, 862 insertions(+), 455 deletions(-) create mode 100644 cpp/src/gandiva/function_registry_arithmetic.cc create mode 100644 cpp/src/gandiva/function_registry_arithmetic.h create mode 100644 cpp/src/gandiva/function_registry_common.h create mode 100644 cpp/src/gandiva/function_registry_datetime.cc create mode 100644 cpp/src/gandiva/function_registry_datetime.h create mode 100644 cpp/src/gandiva/function_registry_hash.cc create mode 100644 cpp/src/gandiva/function_registry_hash.h create mode 100644 cpp/src/gandiva/function_registry_math_ops.cc create mode 100644 cpp/src/gandiva/function_registry_math_ops.h create mode 100644 cpp/src/gandiva/function_registry_string.cc create mode 100644 cpp/src/gandiva/function_registry_string.h create mode 100644 cpp/src/gandiva/function_registry_timestamp_arithmetic.cc create mode 100644 cpp/src/gandiva/function_registry_timestamp_arithmetic.h diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 33321633090af..aa7c73e59ac54 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -46,35 +46,33 @@ #include "arrow/util/macros.h" #include "arrow/util/visibility.h" -#define STRINGIFY(x) #x - -#define ASSERT_RAISES(ENUM, expr) \ - do { \ - ::arrow::Status s = (expr); \ - if (!s.Is##ENUM()) { \ - FAIL() << "Expected '" STRINGIFY(expr) "' to fail with " STRINGIFY( \ - ENUM) ", but got " \ - << s.ToString(); \ - } \ +#define ASSERT_RAISES(ENUM, expr) \ + do { \ + ::arrow::Status s = (expr); \ + if (!s.Is##ENUM()) { \ + FAIL() << "Expected '" ARROW_STRINGIFY(expr) "' to fail with " ARROW_STRINGIFY( \ + ENUM) ", but got " \ + << s.ToString(); \ + } \ } while (false) -#define ASSERT_RAISES_WITH_MESSAGE(ENUM, message, expr) \ - do { \ - ::arrow::Status s = (expr); \ - if (!s.Is##ENUM()) { \ - FAIL() << "Expected '" STRINGIFY(expr) "' to fail with " STRINGIFY( \ - ENUM) ", but got " \ - << s.ToString(); \ - } \ - ASSERT_EQ((message), s.ToString()); \ +#define ASSERT_RAISES_WITH_MESSAGE(ENUM, message, expr) \ + do { \ + ::arrow::Status s = (expr); \ + if (!s.Is##ENUM()) { \ + FAIL() << "Expected '" ARROW_STRINGIFY(expr) "' to fail with " ARROW_STRINGIFY( \ + ENUM) ", but got " \ + << s.ToString(); \ + } \ + ASSERT_EQ((message), s.ToString()); \ } while (false) -#define ASSERT_OK(expr) \ - do { \ - ::arrow::Status _s = (expr); \ - if (!_s.ok()) { \ - FAIL() << "'" STRINGIFY(expr) "' failed with " << _s.ToString(); \ - } \ +#define ASSERT_OK(expr) \ + do { \ + ::arrow::Status _s = (expr); \ + if (!_s.ok()) { \ + FAIL() << "'" ARROW_STRINGIFY(expr) "' failed with " << _s.ToString(); \ + } \ 
} while (false) #define ASSERT_OK_NO_THROW(expr) ASSERT_NO_THROW(ASSERT_OK(expr)) diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h index 1d188820837fc..f4c58f4030afd 100644 --- a/cpp/src/arrow/util/macros.h +++ b/cpp/src/arrow/util/macros.h @@ -18,6 +18,8 @@ #ifndef ARROW_UTIL_MACROS_H #define ARROW_UTIL_MACROS_H +#define ARROW_STRINGIFY(x) #x + // From Google gutil #ifndef ARROW_DISALLOW_COPY_AND_ASSIGN #define ARROW_DISALLOW_COPY_AND_ASSIGN(TypeName) \ diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 6b67c8699c511..90fe7cf8c9c57 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -55,6 +55,12 @@ set(SRC_FILES annotator.cc exported_funcs_registry.cc filter.cc function_registry.cc + function_registry_arithmetic.cc + function_registry_datetime.cc + function_registry_hash.cc + function_registry_math_ops.cc + function_registry_string.cc + function_registry_timestamp_arithmetic.cc function_signature.cc gdv_function_stubs.cc llvm_generator.cc diff --git a/cpp/src/gandiva/function_registry.cc b/cpp/src/gandiva/function_registry.cc index 3928fbeb0edb3..83d80b4988690 100644 --- a/cpp/src/gandiva/function_registry.cc +++ b/cpp/src/gandiva/function_registry.cc @@ -16,7 +16,15 @@ // under the License. #include "gandiva/function_registry.h" - +#include "gandiva/function_registry_arithmetic.h" +#include "gandiva/function_registry_datetime.h" +#include "gandiva/function_registry_hash.h" +#include "gandiva/function_registry_math_ops.h" +#include "gandiva/function_registry_string.h" +#include "gandiva/function_registry_timestamp_arithmetic.h" + +#include +#include #include namespace gandiva { @@ -35,424 +43,46 @@ using arrow::uint32; using arrow::uint64; using arrow::uint8; using arrow::utf8; +using std::iterator; using std::vector; -#define STRINGIFY(a) #a - -// Binary functions that : -// - have the same input type for both params -// - output type is same as the input type -// - NULL handling is of type NULL_IF_NULL -// -// The pre-compiled fn name includes the base name & input type names. eg. add_int32_int32 -#define BINARY_SYMMETRIC_SAFE_NULL_IF_NULL(NAME, TYPE) \ - NativeFunction(#NAME, DataTypeVector{TYPE(), TYPE()}, TYPE(), kResultNullIfNull, \ - STRINGIFY(NAME##_##TYPE##_##TYPE)) - -// Binary functions that : -// - have the same input type for both params -// - NULL handling is of type NULL_IINTERNAL -// - can return error. -// -// The pre-compiled fn name includes the base name & input type names. eg. add_int32_int32 -#define BINARY_UNSAFE_NULL_IF_NULL(NAME, IN_TYPE, OUT_TYPE) \ - NativeFunction(#NAME, DataTypeVector{IN_TYPE(), IN_TYPE()}, OUT_TYPE(), \ - kResultNullIfNull, STRINGIFY(NAME##_##IN_TYPE##_##IN_TYPE), \ - NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors) - -#define BINARY_SYMMETRIC_UNSAFE_NULL_IF_NULL(NAME, TYPE) \ - BINARY_UNSAFE_NULL_IF_NULL(NAME, TYPE, TYPE) - -// Binary functions that : -// - have different input types, or output type -// - NULL handling is of type NULL_IF_NULL -// -// The pre-compiled fn name includes the base name & input type names. eg. 
mod_int64_int32 -#define BINARY_GENERIC_SAFE_NULL_IF_NULL(NAME, IN_TYPE1, IN_TYPE2, OUT_TYPE) \ - NativeFunction(#NAME, DataTypeVector{IN_TYPE1(), IN_TYPE2()}, OUT_TYPE(), \ - kResultNullIfNull, STRINGIFY(NAME##_##IN_TYPE1##_##IN_TYPE2)) - -// Binary functions that : -// - have the same input type -// - output type is boolean -// - NULL handling is of type NULL_IF_NULL -// -// The pre-compiled fn name includes the base name & input type names. -// eg. equal_int32_int32 -#define BINARY_RELATIONAL_SAFE_NULL_IF_NULL(NAME, TYPE) \ - NativeFunction(#NAME, DataTypeVector{TYPE(), TYPE()}, boolean(), kResultNullIfNull, \ - STRINGIFY(NAME##_##TYPE##_##TYPE)) - -// Unary functions that : -// - NULL handling is of type NULL_IF_NULL -// -// The pre-compiled fn name includes the base name & input type name. eg. castFloat_int32 -#define UNARY_SAFE_NULL_IF_NULL(NAME, IN_TYPE, OUT_TYPE) \ - NativeFunction(#NAME, DataTypeVector{IN_TYPE()}, OUT_TYPE(), kResultNullIfNull, \ - STRINGIFY(NAME##_##IN_TYPE)) - -// Unary functions that : -// - NULL handling is of type NULL_NEVER -// -// The pre-compiled fn name includes the base name & input type name. eg. isnull_int32 -#define UNARY_SAFE_NULL_NEVER_BOOL(NAME, TYPE) \ - NativeFunction(#NAME, DataTypeVector{TYPE()}, boolean(), kResultNullNever, \ - STRINGIFY(NAME##_##TYPE)) - -// Unary functions that : -// - NULL handling is of type NULL_INTERNAL -// -// The pre-compiled fn name includes the base name & input type name. eg. castFloat_int32 -#define UNARY_UNSAFE_NULL_IF_NULL(NAME, IN_TYPE, OUT_TYPE) \ - NativeFunction(#NAME, DataTypeVector{IN_TYPE()}, OUT_TYPE(), kResultNullIfNull, \ - STRINGIFY(NAME##_##IN_TYPE), \ - NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors) - -// Binary functions that : -// - NULL handling is of type NULL_NEVER -// -// The pre-compiled fn name includes the base name & input type names, -// eg. is_distinct_from_int32_int32 -#define BINARY_SAFE_NULL_NEVER_BOOL(NAME, TYPE) \ - NativeFunction(#NAME, DataTypeVector{TYPE(), TYPE()}, boolean(), kResultNullNever, \ - STRINGIFY(NAME##_##TYPE##_##TYPE)) - -// Extract functions (used with data/time types) that : -// - NULL handling is of type NULL_IF_NULL -// -// The pre-compiled fn name includes the base name & input type name. eg. extractYear_date -#define EXTRACT_SAFE_NULL_IF_NULL(NAME, TYPE) \ - NativeFunction(#NAME, DataTypeVector{TYPE()}, int64(), kResultNullIfNull, \ - STRINGIFY(NAME##_##TYPE)) - -// Hash32 functions that : -// - NULL handling is of type NULL_NEVER -// -// The pre-compiled fn name includes the base name & input type name. hash32_int8 -#define HASH32_SAFE_NULL_NEVER(NAME, TYPE) \ - NativeFunction(#NAME, DataTypeVector{TYPE()}, int32(), kResultNullNever, \ - STRINGIFY(NAME##_##TYPE)) - -// Hash32 functions that : -// - NULL handling is of type NULL_NEVER -// -// The pre-compiled fn name includes the base name & input type name. hash32_int8 -#define HASH64_SAFE_NULL_NEVER(NAME, TYPE) \ - NativeFunction(#NAME, DataTypeVector{TYPE()}, int64(), kResultNullNever, \ - STRINGIFY(NAME##_##TYPE)) - -// Hash32 functions with seed that : -// - NULL handling is of type NULL_NEVER -// -// The pre-compiled fn name includes the base name & input type name. 
hash32WithSeed_int8 -#define HASH32_SEED_SAFE_NULL_NEVER(NAME, TYPE) \ - NativeFunction(#NAME, DataTypeVector{TYPE(), int32()}, int32(), kResultNullNever, \ - STRINGIFY(NAME##WithSeed_##TYPE)) - -// Hash64 functions with seed that : -// - NULL handling is of type NULL_NEVER -// -// The pre-compiled fn name includes the base name & input type name. hash32WithSeed_int8 -#define HASH64_SEED_SAFE_NULL_NEVER(NAME, TYPE) \ - NativeFunction(#NAME, DataTypeVector{TYPE(), int64()}, int64(), kResultNullNever, \ - STRINGIFY(NAME##WithSeed_##TYPE)) - -// Iterate the inner macro over all numeric types -#define NUMERIC_TYPES(INNER, NAME) \ - INNER(NAME, int8), INNER(NAME, int16), INNER(NAME, int32), INNER(NAME, int64), \ - INNER(NAME, uint8), INNER(NAME, uint16), INNER(NAME, uint32), INNER(NAME, uint64), \ - INNER(NAME, float32), INNER(NAME, float64) - -// Iterate the inner macro over numeric and date/time types -#define NUMERIC_DATE_TYPES(INNER, NAME) \ - NUMERIC_TYPES(INNER, NAME), DATE_TYPES(INNER, NAME), TIME_TYPES(INNER, NAME) - -// Iterate the inner macro over all date types -#define DATE_TYPES(INNER, NAME) INNER(NAME, date64), INNER(NAME, timestamp) - -// Iterate the inner macro over all time types -#define TIME_TYPES(INNER, NAME) INNER(NAME, time32) - -// Iterate the inner macro over all data types -#define VAR_LEN_TYPES(INNER, NAME) INNER(NAME, utf8), INNER(NAME, binary) - -// Iterate the inner macro over all numeric types, date types and bool type -#define NUMERIC_BOOL_DATE_TYPES(INNER, NAME) \ - NUMERIC_DATE_TYPES(INNER, NAME), INNER(NAME, boolean) - -// Iterate the inner macro over all numeric types, date types, bool and varlen types -#define NUMERIC_BOOL_DATE_VAR_LEN_TYPES(INNER, NAME) \ - NUMERIC_BOOL_DATE_TYPES(INNER, NAME), VAR_LEN_TYPES(INNER, NAME) - -// list of registered native functions. 
-NativeFunction FunctionRegistry::pc_registry_[] = { - // Arithmetic operations - NUMERIC_TYPES(BINARY_SYMMETRIC_SAFE_NULL_IF_NULL, add), - NUMERIC_TYPES(BINARY_SYMMETRIC_SAFE_NULL_IF_NULL, subtract), - NUMERIC_TYPES(BINARY_SYMMETRIC_SAFE_NULL_IF_NULL, multiply), - NUMERIC_TYPES(BINARY_SYMMETRIC_UNSAFE_NULL_IF_NULL, divide), - BINARY_GENERIC_SAFE_NULL_IF_NULL(mod, int64, int32, int32), - BINARY_GENERIC_SAFE_NULL_IF_NULL(mod, int64, int64, int64), - NUMERIC_BOOL_DATE_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, equal), - NUMERIC_BOOL_DATE_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, not_equal), - NUMERIC_DATE_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, less_than), - NUMERIC_DATE_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, less_than_or_equal_to), - NUMERIC_DATE_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, greater_than), - NUMERIC_DATE_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, greater_than_or_equal_to), - UNARY_SAFE_NULL_IF_NULL(not, boolean, boolean), - - // cast operations - UNARY_SAFE_NULL_IF_NULL(castBIGINT, int32, int64), - UNARY_SAFE_NULL_IF_NULL(castFLOAT4, int32, float32), - UNARY_SAFE_NULL_IF_NULL(castFLOAT4, int64, float32), - UNARY_SAFE_NULL_IF_NULL(castFLOAT8, int32, float64), - UNARY_SAFE_NULL_IF_NULL(castFLOAT8, int64, float64), - UNARY_SAFE_NULL_IF_NULL(castFLOAT8, float32, float64), - UNARY_SAFE_NULL_IF_NULL(castDATE, int64, date64), - - // extended math ops - UNARY_SAFE_NULL_IF_NULL(cbrt, int32, float64), - UNARY_SAFE_NULL_IF_NULL(cbrt, int64, float64), - UNARY_SAFE_NULL_IF_NULL(cbrt, uint32, float64), - UNARY_SAFE_NULL_IF_NULL(cbrt, uint64, float64), - UNARY_SAFE_NULL_IF_NULL(cbrt, float32, float64), - UNARY_SAFE_NULL_IF_NULL(cbrt, float64, float64), - - UNARY_SAFE_NULL_IF_NULL(exp, int32, float64), - UNARY_SAFE_NULL_IF_NULL(exp, int64, float64), - UNARY_SAFE_NULL_IF_NULL(exp, uint32, float64), - UNARY_SAFE_NULL_IF_NULL(exp, uint64, float64), - UNARY_SAFE_NULL_IF_NULL(exp, float32, float64), - UNARY_SAFE_NULL_IF_NULL(exp, float64, float64), - - UNARY_SAFE_NULL_IF_NULL(log, int32, float64), - UNARY_SAFE_NULL_IF_NULL(log, int64, float64), - UNARY_SAFE_NULL_IF_NULL(log, uint32, float64), - UNARY_SAFE_NULL_IF_NULL(log, uint64, float64), - UNARY_SAFE_NULL_IF_NULL(log, float32, float64), - UNARY_SAFE_NULL_IF_NULL(log, float64, float64), - - UNARY_SAFE_NULL_IF_NULL(log10, int32, float64), - UNARY_SAFE_NULL_IF_NULL(log10, int64, float64), - UNARY_SAFE_NULL_IF_NULL(log10, uint32, float64), - UNARY_SAFE_NULL_IF_NULL(log10, uint64, float64), - UNARY_SAFE_NULL_IF_NULL(log10, float32, float64), - UNARY_SAFE_NULL_IF_NULL(log10, float64, float64), - - BINARY_UNSAFE_NULL_IF_NULL(log, int32, float64), - BINARY_UNSAFE_NULL_IF_NULL(log, int64, float64), - BINARY_UNSAFE_NULL_IF_NULL(log, uint32, float64), - BINARY_UNSAFE_NULL_IF_NULL(log, uint64, float64), - BINARY_UNSAFE_NULL_IF_NULL(log, float32, float64), - BINARY_UNSAFE_NULL_IF_NULL(log, float64, float64), - - BINARY_SYMMETRIC_SAFE_NULL_IF_NULL(power, float64), - - // nullable never operations - NUMERIC_BOOL_DATE_TYPES(UNARY_SAFE_NULL_NEVER_BOOL, isnull), - NUMERIC_BOOL_DATE_TYPES(UNARY_SAFE_NULL_NEVER_BOOL, isnotnull), - NUMERIC_TYPES(UNARY_SAFE_NULL_NEVER_BOOL, isnumeric), - - // nullable never binary operations - NUMERIC_BOOL_DATE_TYPES(BINARY_SAFE_NULL_NEVER_BOOL, is_distinct_from), - NUMERIC_BOOL_DATE_TYPES(BINARY_SAFE_NULL_NEVER_BOOL, is_not_distinct_from), - - // date/timestamp operations - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractMillennium), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractCentury), - 
DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractDecade), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractYear), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractDoy), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractQuarter), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractMonth), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractWeek), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractDow), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractDay), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractHour), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractMinute), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractSecond), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractEpoch), - - BINARY_GENERIC_SAFE_NULL_IF_NULL(months_between, date64, date64, float64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(months_between, timestamp, timestamp, float64), - - // date_trunc operations on date/timestamp - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Millennium), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Century), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Decade), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Year), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Quarter), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Month), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Week), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Day), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Hour), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Minute), - DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, date_trunc_Second), - - // time operations - TIME_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractHour), - TIME_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractMinute), - TIME_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractSecond), - - // timestamp diff operations - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffSecond, timestamp, timestamp, int32), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffMinute, timestamp, timestamp, int32), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffHour, timestamp, timestamp, int32), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffDay, timestamp, timestamp, int32), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffWeek, timestamp, timestamp, int32), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffMonth, timestamp, timestamp, int32), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffQuarter, timestamp, timestamp, int32), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampdiffYear, timestamp, timestamp, int32), - - // timestamp add int32 operations - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddSecond, timestamp, int32, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddMinute, timestamp, int32, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddHour, timestamp, int32, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddDay, timestamp, int32, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddWeek, timestamp, int32, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddMonth, timestamp, int32, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddQuarter, timestamp, int32, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddYear, timestamp, int32, timestamp), - // date add int32 operations - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddSecond, date64, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddMinute, date64, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddHour, date64, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddDay, date64, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddWeek, date64, int32, date64), - 
BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddMonth, date64, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddQuarter, date64, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddYear, date64, int32, date64), - - // timestamp add int64 operations - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddSecond, timestamp, int64, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddMinute, timestamp, int64, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddHour, timestamp, int64, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddDay, timestamp, int64, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddWeek, timestamp, int64, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddMonth, timestamp, int64, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddQuarter, timestamp, int64, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddYear, timestamp, int64, timestamp), - // date add int64 operations - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddSecond, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddMinute, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddHour, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddDay, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddWeek, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddMonth, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddQuarter, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(timestampaddYear, date64, int64, date64), - - // date_add(date64, int32), date_add(timestamp, int32) - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_add, date64, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(add, date64, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_add, timestamp, int32, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(add, timestamp, int32, timestamp), - - // date_add(date64, int64), date_add(timestamp, int64) - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_add, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(add, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_add, timestamp, int64, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(add, timestamp, int64, timestamp), - - // date_add(int32, date64), date_add(int32, timestamp) - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_add, int32, date64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(add, int32, date64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_add, int32, timestamp, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(add, int32, timestamp, timestamp), - - // date_add(int64, date64), date_add(int64, timestamp) - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_add, int64, date64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(add, int64, date64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_add, int64, timestamp, timestamp), - BINARY_GENERIC_SAFE_NULL_IF_NULL(add, int64, timestamp, timestamp), - - // date_sub(date64, int32), subtract and date_diff - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_sub, date64, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(subtract, date64, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_diff, date64, int32, date64), - // date_sub(timestamp, int32), subtract and date_diff - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_sub, timestamp, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(subtract, timestamp, int32, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_diff, timestamp, int32, date64), - - // date_sub(date64, int64), subtract and date_diff - 
BINARY_GENERIC_SAFE_NULL_IF_NULL(date_sub, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(subtract, date64, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_diff, date64, int64, date64), - // date_sub(timestamp, int64), subtract and date_diff - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_sub, timestamp, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(subtract, timestamp, int64, date64), - BINARY_GENERIC_SAFE_NULL_IF_NULL(date_diff, timestamp, int64, date64), - - // hash functions - NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SAFE_NULL_NEVER, hash), - NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SAFE_NULL_NEVER, hash32), - NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SAFE_NULL_NEVER, hash32AsDouble), - NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SEED_SAFE_NULL_NEVER, hash32), - NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SEED_SAFE_NULL_NEVER, hash32AsDouble), - - NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SAFE_NULL_NEVER, hash64), - NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SAFE_NULL_NEVER, hash64AsDouble), - NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SEED_SAFE_NULL_NEVER, hash64), - NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SEED_SAFE_NULL_NEVER, hash64AsDouble), - - // utf8/binary operations - UNARY_SAFE_NULL_IF_NULL(octet_length, utf8, int32), - UNARY_SAFE_NULL_IF_NULL(octet_length, binary, int32), - UNARY_SAFE_NULL_IF_NULL(bit_length, utf8, int32), - UNARY_SAFE_NULL_IF_NULL(bit_length, binary, int32), - UNARY_UNSAFE_NULL_IF_NULL(char_length, utf8, int32), - UNARY_UNSAFE_NULL_IF_NULL(length, utf8, int32), - UNARY_UNSAFE_NULL_IF_NULL(lengthUtf8, binary, int32), - - VAR_LEN_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, equal), - VAR_LEN_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, not_equal), - VAR_LEN_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, less_than), - VAR_LEN_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, less_than_or_equal_to), - VAR_LEN_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, greater_than), - VAR_LEN_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, greater_than_or_equal_to), +FunctionRegistry::iterator FunctionRegistry::begin() const { + return &(*pc_registry_.begin()); +} - BINARY_RELATIONAL_SAFE_NULL_IF_NULL(starts_with, utf8), - BINARY_RELATIONAL_SAFE_NULL_IF_NULL(ends_with, utf8), +FunctionRegistry::iterator FunctionRegistry::end() const { + return &(*pc_registry_.end()); +} - NativeFunction("upper", DataTypeVector{utf8()}, utf8(), kResultNullIfNull, - "upper_utf8", NativeFunction::kNeedsContext), +std::vector FunctionRegistry::pc_registry_; - NativeFunction("like", DataTypeVector{utf8(), utf8()}, boolean(), kResultNullIfNull, - "gdv_fn_like_utf8_utf8", NativeFunction::kNeedsFunctionHolder), +SignatureMap FunctionRegistry::pc_registry_map_ = InitPCMap(); - NativeFunction("castDATE", DataTypeVector{utf8()}, date64(), kResultNullIfNull, - "castDATE_utf8", - NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), +SignatureMap FunctionRegistry::InitPCMap() { + SignatureMap map; - NativeFunction("to_date", DataTypeVector{utf8(), utf8(), int32()}, date64(), - kResultNullInternal, "gdv_fn_to_date_utf8_utf8_int32", - NativeFunction::kNeedsContext | NativeFunction::kNeedsFunctionHolder | - NativeFunction::kCanReturnErrors), -}; // namespace gandiva + auto v1 = GetArithmeticFunctionRegistry(); + pc_registry_.insert(std::end(pc_registry_), v1.begin(), v1.end()); -FunctionRegistry::iterator FunctionRegistry::begin() const { - return std::begin(pc_registry_); -} + auto v2 = GetDateTimeFunctionRegistry(); + pc_registry_.insert(std::end(pc_registry_), v2.begin(), v2.end()); -FunctionRegistry::iterator 
FunctionRegistry::end() const { - return std::end(pc_registry_); -} + auto v3 = GetHashFunctionRegistry(); + pc_registry_.insert(std::end(pc_registry_), v3.begin(), v3.end()); -FunctionRegistry::SignatureMap FunctionRegistry::pc_registry_map_ = InitPCMap(); + auto v4 = GetMathOpsFunctionRegistry(); + pc_registry_.insert(std::end(pc_registry_), v4.begin(), v4.end()); -FunctionRegistry::SignatureMap FunctionRegistry::InitPCMap() { - SignatureMap map; + auto v5 = GetStringFunctionRegistry(); + pc_registry_.insert(std::end(pc_registry_), v5.begin(), v5.end()); - int num_entries = static_cast(sizeof(pc_registry_) / sizeof(NativeFunction)); - for (int i = 0; i < num_entries; i++) { - const NativeFunction* entry = &pc_registry_[i]; + auto v6 = GetDateTimeArithmeticFunctionRegistry(); + pc_registry_.insert(std::end(pc_registry_), v6.begin(), v6.end()); - DCHECK(map.find(&entry->signature()) == map.end()); - map[&entry->signature()] = entry; - // printf("%s -> %s\n", entry->signature().ToString().c_str(), - // entry->pc_name().c_str()); + for (auto& elem : pc_registry_) { + map.insert(std::make_pair(&(elem.signature()), &elem)); } + return map; } diff --git a/cpp/src/gandiva/function_registry.h b/cpp/src/gandiva/function_registry.h index 0f74089fc6d8e..810bf2d3eb338 100644 --- a/cpp/src/gandiva/function_registry.h +++ b/cpp/src/gandiva/function_registry.h @@ -18,8 +18,8 @@ #ifndef GANDIVA_FUNCTION_REGISTRY_H #define GANDIVA_FUNCTION_REGISTRY_H -#include - +#include +#include "gandiva/function_registry_common.h" #include "gandiva/gandiva_aliases.h" #include "gandiva/native_function.h" @@ -37,28 +37,9 @@ class FunctionRegistry { iterator end() const; private: - struct KeyHash { - std::size_t operator()(const FunctionSignature* k) const { return k->Hash(); } - }; - - struct KeyEquals { - bool operator()(const FunctionSignature* s1, const FunctionSignature* s2) const { - return *s1 == *s2; - } - }; - - static DataTypePtr time32() { return arrow::time32(arrow::TimeUnit::MILLI); } - - static DataTypePtr time64() { return arrow::time64(arrow::TimeUnit::MICRO); } - - static DataTypePtr timestamp() { return arrow::timestamp(arrow::TimeUnit::MILLI); } - - typedef std::unordered_map - SignatureMap; static SignatureMap InitPCMap(); - static NativeFunction pc_registry_[]; + static std::vector pc_registry_; static SignatureMap pc_registry_map_; }; diff --git a/cpp/src/gandiva/function_registry_arithmetic.cc b/cpp/src/gandiva/function_registry_arithmetic.cc new file mode 100644 index 0000000000000..800bc493f0019 --- /dev/null +++ b/cpp/src/gandiva/function_registry_arithmetic.cc @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "gandiva/function_registry_arithmetic.h" +#include "gandiva/function_registry_common.h" + +namespace gandiva { + +#define BINARY_SYMMETRIC_FN(name) NUMERIC_TYPES(BINARY_SYMMETRIC_SAFE_NULL_IF_NULL, name) + +#define BINARY_RELATIONAL_BOOL_FN(name) \ + NUMERIC_BOOL_DATE_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, name) + +#define BINARY_RELATIONAL_BOOL_DATE_FN(name) \ + NUMERIC_DATE_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, name) + +#define UNARY_OCTET_LEN_FN(name) \ + UNARY_SAFE_NULL_IF_NULL(name, utf8, int32), UNARY_SAFE_NULL_IF_NULL(name, binary, int32) + +#define UNARY_CAST_TO_FLOAT64(name) UNARY_SAFE_NULL_IF_NULL(castFLOAT8, name, float64) + +#define UNARY_CAST_TO_FLOAT32(name) UNARY_SAFE_NULL_IF_NULL(castFLOAT4, name, float32) + +std::vector GetArithmeticFunctionRegistry() { + static std::vector arithmetic_fn_registry_ = { + UNARY_SAFE_NULL_IF_NULL(not, boolean, boolean), + UNARY_SAFE_NULL_IF_NULL(castBIGINT, int32, int64), + + UNARY_CAST_TO_FLOAT32(int32), + UNARY_CAST_TO_FLOAT32(int64), + + UNARY_CAST_TO_FLOAT64(int32), + UNARY_CAST_TO_FLOAT64(int64), + UNARY_CAST_TO_FLOAT64(float32), + + UNARY_SAFE_NULL_IF_NULL(castDATE, int64, date64), + + BINARY_SYMMETRIC_FN(add), + BINARY_SYMMETRIC_FN(subtract), + BINARY_SYMMETRIC_FN(multiply), + + NUMERIC_TYPES(BINARY_SYMMETRIC_UNSAFE_NULL_IF_NULL, divide), + BINARY_GENERIC_SAFE_NULL_IF_NULL(mod, int64, int32, int32), + BINARY_GENERIC_SAFE_NULL_IF_NULL(mod, int64, int64, int64), + + BINARY_RELATIONAL_BOOL_FN(equal), + BINARY_RELATIONAL_BOOL_FN(not_equal), + + BINARY_RELATIONAL_BOOL_DATE_FN(less_than), + BINARY_RELATIONAL_BOOL_DATE_FN(less_than_or_equal_to), + BINARY_RELATIONAL_BOOL_DATE_FN(greater_than), + BINARY_RELATIONAL_BOOL_DATE_FN(greater_than_or_equal_to), + + UNARY_OCTET_LEN_FN(octet_length), + UNARY_OCTET_LEN_FN(bit_length), + + UNARY_UNSAFE_NULL_IF_NULL(char_length, utf8, int32), + UNARY_UNSAFE_NULL_IF_NULL(length, utf8, int32), + UNARY_UNSAFE_NULL_IF_NULL(lengthUtf8, binary, int32)}; + + return arithmetic_fn_registry_; +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/function_registry_arithmetic.h b/cpp/src/gandiva/function_registry_arithmetic.h new file mode 100644 index 0000000000000..e98a4e7b5b1b4 --- /dev/null +++ b/cpp/src/gandiva/function_registry_arithmetic.h @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef GANDIVA_FUNCTION_REGISTRY_ARITHMETIC_H +#define GANDIVA_FUNCTION_REGISTRY_ARITHMETIC_H + +#include +#include "gandiva/native_function.h" + +namespace gandiva { + +std::vector GetArithmeticFunctionRegistry(); + +} // namespace gandiva + +#endif // GANDIVA_FUNCTION_REGISTRY_ARITHMETIC_H diff --git a/cpp/src/gandiva/function_registry_common.h b/cpp/src/gandiva/function_registry_common.h new file mode 100644 index 0000000000000..78babce9a7dbf --- /dev/null +++ b/cpp/src/gandiva/function_registry_common.h @@ -0,0 +1,218 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef GANDIVA_FUNCTION_REGISTRY_COMMON_H +#define GANDIVA_FUNCTION_REGISTRY_COMMON_H + +#include +#include +#include + +#include "gandiva/arrow.h" +#include "gandiva/function_signature.h" +#include "gandiva/gandiva_aliases.h" +#include "gandiva/native_function.h" + +/* This is a private file, intended for internal use by gandiva & must not be included + * directly. + */ +namespace gandiva { + +using arrow::binary; +using arrow::boolean; +using arrow::date64; +using arrow::float32; +using arrow::float64; +using arrow::int16; +using arrow::int32; +using arrow::int64; +using arrow::int8; +using arrow::uint16; +using arrow::uint32; +using arrow::uint64; +using arrow::uint8; +using arrow::utf8; +using std::vector; + +inline DataTypePtr time32() { return arrow::time32(arrow::TimeUnit::MILLI); } + +inline DataTypePtr time64() { return arrow::time64(arrow::TimeUnit::MICRO); } + +inline DataTypePtr timestamp() { return arrow::timestamp(arrow::TimeUnit::MILLI); } + +struct KeyHash { + std::size_t operator()(const FunctionSignature* k) const { return k->Hash(); } +}; + +struct KeyEquals { + bool operator()(const FunctionSignature* s1, const FunctionSignature* s2) const { + return *s1 == *s2; + } +}; + +typedef std::unordered_map + SignatureMap; + +// Binary functions that : +// - have the same input type for both params +// - output type is same as the input type +// - NULL handling is of type NULL_IF_NULL +// +// The pre-compiled fn name includes the base name & input type names. eg. add_int32_int32 +#define BINARY_SYMMETRIC_SAFE_NULL_IF_NULL(NAME, TYPE) \ + NativeFunction(#NAME, DataTypeVector{TYPE(), TYPE()}, TYPE(), kResultNullIfNull, \ + ARROW_STRINGIFY(NAME##_##TYPE##_##TYPE)) + +// Binary functions that : +// - have the same input type for both params +// - NULL handling is of type NULL_IINTERNAL +// - can return error. +// +// The pre-compiled fn name includes the base name & input type names. eg. 
add_int32_int32 +#define BINARY_UNSAFE_NULL_IF_NULL(NAME, IN_TYPE, OUT_TYPE) \ + NativeFunction(#NAME, DataTypeVector{IN_TYPE(), IN_TYPE()}, OUT_TYPE(), \ + kResultNullIfNull, ARROW_STRINGIFY(NAME##_##IN_TYPE##_##IN_TYPE), \ + NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors) + +#define BINARY_SYMMETRIC_UNSAFE_NULL_IF_NULL(NAME, TYPE) \ + BINARY_UNSAFE_NULL_IF_NULL(NAME, TYPE, TYPE) + +// Binary functions that : +// - have different input types, or output type +// - NULL handling is of type NULL_IF_NULL +// +// The pre-compiled fn name includes the base name & input type names. eg. mod_int64_int32 +#define BINARY_GENERIC_SAFE_NULL_IF_NULL(NAME, IN_TYPE1, IN_TYPE2, OUT_TYPE) \ + NativeFunction(#NAME, DataTypeVector{IN_TYPE1(), IN_TYPE2()}, OUT_TYPE(), \ + kResultNullIfNull, ARROW_STRINGIFY(NAME##_##IN_TYPE1##_##IN_TYPE2)) + +// Binary functions that : +// - have the same input type +// - output type is boolean +// - NULL handling is of type NULL_IF_NULL +// +// The pre-compiled fn name includes the base name & input type names. +// eg. equal_int32_int32 +#define BINARY_RELATIONAL_SAFE_NULL_IF_NULL(NAME, TYPE) \ + NativeFunction(#NAME, DataTypeVector{TYPE(), TYPE()}, boolean(), kResultNullIfNull, \ + ARROW_STRINGIFY(NAME##_##TYPE##_##TYPE)) + +// Unary functions that : +// - NULL handling is of type NULL_IF_NULL +// +// The pre-compiled fn name includes the base name & input type name. eg. castFloat_int32 +#define UNARY_SAFE_NULL_IF_NULL(NAME, IN_TYPE, OUT_TYPE) \ + NativeFunction(#NAME, DataTypeVector{IN_TYPE()}, OUT_TYPE(), kResultNullIfNull, \ + ARROW_STRINGIFY(NAME##_##IN_TYPE)) + +// Unary functions that : +// - NULL handling is of type NULL_NEVER +// +// The pre-compiled fn name includes the base name & input type name. eg. isnull_int32 +#define UNARY_SAFE_NULL_NEVER_BOOL(NAME, TYPE) \ + NativeFunction(#NAME, DataTypeVector{TYPE()}, boolean(), kResultNullNever, \ + ARROW_STRINGIFY(NAME##_##TYPE)) + +// Unary functions that : +// - NULL handling is of type NULL_INTERNAL +// +// The pre-compiled fn name includes the base name & input type name. eg. castFloat_int32 +#define UNARY_UNSAFE_NULL_IF_NULL(NAME, IN_TYPE, OUT_TYPE) \ + NativeFunction(#NAME, DataTypeVector{IN_TYPE()}, OUT_TYPE(), kResultNullIfNull, \ + ARROW_STRINGIFY(NAME##_##IN_TYPE), \ + NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors) + +// Binary functions that : +// - NULL handling is of type NULL_NEVER +// +// The pre-compiled fn name includes the base name & input type names, +// eg. is_distinct_from_int32_int32 +#define BINARY_SAFE_NULL_NEVER_BOOL(NAME, TYPE) \ + NativeFunction(#NAME, DataTypeVector{TYPE(), TYPE()}, boolean(), kResultNullNever, \ + ARROW_STRINGIFY(NAME##_##TYPE##_##TYPE)) + +// Extract functions (used with data/time types) that : +// - NULL handling is of type NULL_IF_NULL +// +// The pre-compiled fn name includes the base name & input type name. eg. extractYear_date +#define EXTRACT_SAFE_NULL_IF_NULL(NAME, TYPE) \ + NativeFunction(#NAME, DataTypeVector{TYPE()}, int64(), kResultNullIfNull, \ + ARROW_STRINGIFY(NAME##_##TYPE)) + +// Hash32 functions that : +// - NULL handling is of type NULL_NEVER +// +// The pre-compiled fn name includes the base name & input type name. 
hash32_int8 +#define HASH32_SAFE_NULL_NEVER(NAME, TYPE) \ + NativeFunction(#NAME, DataTypeVector{TYPE()}, int32(), kResultNullNever, \ + ARROW_STRINGIFY(NAME##_##TYPE)) + +// Hash32 functions that : +// - NULL handling is of type NULL_NEVER +// +// The pre-compiled fn name includes the base name & input type name. hash32_int8 +#define HASH64_SAFE_NULL_NEVER(NAME, TYPE) \ + NativeFunction(#NAME, DataTypeVector{TYPE()}, int64(), kResultNullNever, \ + ARROW_STRINGIFY(NAME##_##TYPE)) + +// Hash32 functions with seed that : +// - NULL handling is of type NULL_NEVER +// +// The pre-compiled fn name includes the base name & input type name. hash32WithSeed_int8 +#define HASH32_SEED_SAFE_NULL_NEVER(NAME, TYPE) \ + NativeFunction(#NAME, DataTypeVector{TYPE(), int32()}, int32(), kResultNullNever, \ + ARROW_STRINGIFY(NAME##WithSeed_##TYPE)) + +// Hash64 functions with seed that : +// - NULL handling is of type NULL_NEVER +// +// The pre-compiled fn name includes the base name & input type name. hash32WithSeed_int8 +#define HASH64_SEED_SAFE_NULL_NEVER(NAME, TYPE) \ + NativeFunction(#NAME, DataTypeVector{TYPE(), int64()}, int64(), kResultNullNever, \ + ARROW_STRINGIFY(NAME##WithSeed_##TYPE)) + +// Iterate the inner macro over all numeric types +#define NUMERIC_TYPES(INNER, NAME) \ + INNER(NAME, int8), INNER(NAME, int16), INNER(NAME, int32), INNER(NAME, int64), \ + INNER(NAME, uint8), INNER(NAME, uint16), INNER(NAME, uint32), INNER(NAME, uint64), \ + INNER(NAME, float32), INNER(NAME, float64) + +// Iterate the inner macro over numeric and date/time types +#define NUMERIC_DATE_TYPES(INNER, NAME) \ + NUMERIC_TYPES(INNER, NAME), DATE_TYPES(INNER, NAME), TIME_TYPES(INNER, NAME) + +// Iterate the inner macro over all date types +#define DATE_TYPES(INNER, NAME) INNER(NAME, date64), INNER(NAME, timestamp) + +// Iterate the inner macro over all time types +#define TIME_TYPES(INNER, NAME) INNER(NAME, time32) + +// Iterate the inner macro over all data types +#define VAR_LEN_TYPES(INNER, NAME) INNER(NAME, utf8), INNER(NAME, binary) + +// Iterate the inner macro over all numeric types, date types and bool type +#define NUMERIC_BOOL_DATE_TYPES(INNER, NAME) \ + NUMERIC_DATE_TYPES(INNER, NAME), INNER(NAME, boolean) + +// Iterate the inner macro over all numeric types, date types, bool and varlen types +#define NUMERIC_BOOL_DATE_VAR_LEN_TYPES(INNER, NAME) \ + NUMERIC_BOOL_DATE_TYPES(INNER, NAME), VAR_LEN_TYPES(INNER, NAME) + +} // namespace gandiva + +#endif diff --git a/cpp/src/gandiva/function_registry_datetime.cc b/cpp/src/gandiva/function_registry_datetime.cc new file mode 100644 index 0000000000000..145b7d39395b4 --- /dev/null +++ b/cpp/src/gandiva/function_registry_datetime.cc @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "gandiva/function_registry_datetime.h" +#include "gandiva/function_registry_common.h" + +namespace gandiva { + +#define DATE_EXTRACTION_FNS(name) \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Millennium), \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Century), \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Decade), \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Year), \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Quarter), \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Month), \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Week), \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Day), \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Hour), \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Minute), \ + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Second) + +#define TIME_EXTRACTION_FNS(name) \ + TIME_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Hour), \ + TIME_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Minute), \ + TIME_TYPES(EXTRACT_SAFE_NULL_IF_NULL, name##Second) + +std::vector GetDateTimeFunctionRegistry() { + static std::vector date_time_fn_registry_ = { + DATE_EXTRACTION_FNS(extract), + DATE_EXTRACTION_FNS(date_trunc_), + + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractDoy), + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractDow), + DATE_TYPES(EXTRACT_SAFE_NULL_IF_NULL, extractEpoch), + + TIME_EXTRACTION_FNS(extract), + + NativeFunction("castDATE", DataTypeVector{utf8()}, date64(), kResultNullIfNull, + "castDATE_utf8", + NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), + + NativeFunction("to_date", DataTypeVector{utf8(), utf8(), int32()}, date64(), + kResultNullInternal, "gdv_fn_to_date_utf8_utf8_int32", + NativeFunction::kNeedsContext | + NativeFunction::kNeedsFunctionHolder | + NativeFunction::kCanReturnErrors)}; + + return date_time_fn_registry_; +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/function_registry_datetime.h b/cpp/src/gandiva/function_registry_datetime.h new file mode 100644 index 0000000000000..c9b88942215d8 --- /dev/null +++ b/cpp/src/gandiva/function_registry_datetime.h @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef GANDIVA_FUNCTION_REGISTRY_DATE_TIME_H +#define GANDIVA_FUNCTION_REGISTRY_DATE_TIME_H + +#include +#include "gandiva/native_function.h" + +namespace gandiva { + +std::vector GetDateTimeFunctionRegistry(); + +} // namespace gandiva + +#endif // GANDIVA_FUNCTION_REGISTRY_DATE_TIME_H diff --git a/cpp/src/gandiva/function_registry_hash.cc b/cpp/src/gandiva/function_registry_hash.cc new file mode 100644 index 0000000000000..a163a230eaca3 --- /dev/null +++ b/cpp/src/gandiva/function_registry_hash.cc @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/function_registry_hash.h" +#include "gandiva/function_registry_common.h" + +namespace gandiva { + +#define HASH32_SAFE_NULL_NEVER_FN(name) \ + NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SAFE_NULL_NEVER, name) + +#define HASH32_SEED_SAFE_NULL_NEVER_FN(name) \ + NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH32_SEED_SAFE_NULL_NEVER, name) + +#define HASH64_SAFE_NULL_NEVER_FN(name) \ + NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SAFE_NULL_NEVER, name) + +#define HASH64_SEED_SAFE_NULL_NEVER_FN(name) \ + NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH64_SEED_SAFE_NULL_NEVER, name) + +std::vector GetHashFunctionRegistry() { + static std::vector hash_fn_registry_ = { + HASH32_SAFE_NULL_NEVER_FN(hash), + HASH32_SAFE_NULL_NEVER_FN(hash32), + HASH32_SAFE_NULL_NEVER_FN(hash32AsDouble), + + HASH32_SEED_SAFE_NULL_NEVER_FN(hash32), + HASH32_SEED_SAFE_NULL_NEVER_FN(hash32AsDouble), + + HASH64_SAFE_NULL_NEVER_FN(hash64), + HASH64_SAFE_NULL_NEVER_FN(hash64AsDouble), + + HASH64_SEED_SAFE_NULL_NEVER_FN(hash64), + HASH64_SEED_SAFE_NULL_NEVER_FN(hash64AsDouble)}; + + return hash_fn_registry_; +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/function_registry_hash.h b/cpp/src/gandiva/function_registry_hash.h new file mode 100644 index 0000000000000..dc02cb21e37b5 --- /dev/null +++ b/cpp/src/gandiva/function_registry_hash.h @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef GANDIVA_FUNCTION_REGISTRY_HASH_H +#define GANDIVA_FUNCTION_REGISTRY_HASH_H + +#include +#include "gandiva/native_function.h" + +namespace gandiva { + +std::vector GetHashFunctionRegistry(); + +} // namespace gandiva + +#endif // GANDIVA_FUNCTION_REGISTRY_HASH_H diff --git a/cpp/src/gandiva/function_registry_math_ops.cc b/cpp/src/gandiva/function_registry_math_ops.cc new file mode 100644 index 0000000000000..31b4b13119a86 --- /dev/null +++ b/cpp/src/gandiva/function_registry_math_ops.cc @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/function_registry_math_ops.h" +#include "gandiva/function_registry_common.h" + +namespace gandiva { + +#define MATH_UNARY_OPS(name) \ + UNARY_SAFE_NULL_IF_NULL(name, int32, float64), \ + UNARY_SAFE_NULL_IF_NULL(name, int64, float64), \ + UNARY_SAFE_NULL_IF_NULL(name, uint32, float64), \ + UNARY_SAFE_NULL_IF_NULL(name, uint64, float64), \ + UNARY_SAFE_NULL_IF_NULL(name, float32, float64), \ + UNARY_SAFE_NULL_IF_NULL(name, float64, float64) + +#define MATH_BINARY_UNSAFE(name) \ + BINARY_UNSAFE_NULL_IF_NULL(name, int32, float64), \ + BINARY_UNSAFE_NULL_IF_NULL(name, int64, float64), \ + BINARY_UNSAFE_NULL_IF_NULL(name, uint32, float64), \ + BINARY_UNSAFE_NULL_IF_NULL(name, uint64, float64), \ + BINARY_UNSAFE_NULL_IF_NULL(name, float32, float64), \ + BINARY_UNSAFE_NULL_IF_NULL(name, float64, float64) + +#define UNARY_SAFE_NULL_NEVER_BOOL_FN(name) \ + NUMERIC_BOOL_DATE_TYPES(UNARY_SAFE_NULL_NEVER_BOOL, name) + +#define BINARY_SAFE_NULL_NEVER_BOOL_FN(name) \ + NUMERIC_BOOL_DATE_TYPES(BINARY_SAFE_NULL_NEVER_BOOL, name) + +std::vector GetMathOpsFunctionRegistry() { + static std::vector math_fn_registry_ = { + MATH_UNARY_OPS(cbrt), + MATH_UNARY_OPS(exp), + MATH_UNARY_OPS(log), + MATH_UNARY_OPS(log10), + + MATH_BINARY_UNSAFE(log), + + BINARY_SYMMETRIC_SAFE_NULL_IF_NULL(power, float64), + + UNARY_SAFE_NULL_NEVER_BOOL_FN(isnull), + UNARY_SAFE_NULL_NEVER_BOOL_FN(isnotnull), + + NUMERIC_TYPES(UNARY_SAFE_NULL_NEVER_BOOL, isnumeric), + + BINARY_SAFE_NULL_NEVER_BOOL_FN(is_distinct_from), + BINARY_SAFE_NULL_NEVER_BOOL_FN(is_not_distinct_from)}; + + return math_fn_registry_; +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/function_registry_math_ops.h b/cpp/src/gandiva/function_registry_math_ops.h new file mode 100644 index 0000000000000..0204ffc8809ac --- /dev/null +++ b/cpp/src/gandiva/function_registry_math_ops.h @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef GANDIVA_FUNCTION_REGISTRY_MATHOPS_H +#define GANDIVA_FUNCTION_REGISTRY_MATHOPS_H + +#include +#include "gandiva/native_function.h" + +namespace gandiva { + +std::vector GetMathOpsFunctionRegistry(); + +} // namespace gandiva + +#endif // GANDIVA_FUNCTION_REGISTRY_MATHOPS_H diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc new file mode 100644 index 0000000000000..c97925af9cbb3 --- /dev/null +++ b/cpp/src/gandiva/function_registry_string.cc @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/function_registry_string.h" +#include "gandiva/function_registry_common.h" + +namespace gandiva { + +#define BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN(name) \ + VAR_LEN_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, name) + +#define BINARY_RELATIONAL_SAFE_NULL_IF_NULL_UTF8_FN(name) \ + BINARY_RELATIONAL_SAFE_NULL_IF_NULL(name, utf8) + +std::vector GetStringFunctionRegistry() { + static std::vector string_fn_registry_ = { + BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN(equal), + BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN(not_equal), + BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN(less_than), + BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN(less_than_or_equal_to), + BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN(greater_than), + BINARY_RELATIONAL_SAFE_NULL_IF_NULL_FN(greater_than_or_equal_to), + + BINARY_RELATIONAL_SAFE_NULL_IF_NULL_UTF8_FN(starts_with), + BINARY_RELATIONAL_SAFE_NULL_IF_NULL_UTF8_FN(ends_with), + + NativeFunction("upper", DataTypeVector{utf8()}, utf8(), kResultNullIfNull, + "upper_utf8", NativeFunction::kNeedsContext), + + NativeFunction("like", DataTypeVector{utf8(), utf8()}, boolean(), kResultNullIfNull, + "gdv_fn_like_utf8_utf8", NativeFunction::kNeedsFunctionHolder)}; + + return string_fn_registry_; +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/function_registry_string.h b/cpp/src/gandiva/function_registry_string.h new file mode 100644 index 0000000000000..c9217893e5c0b --- /dev/null +++ b/cpp/src/gandiva/function_registry_string.h @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef GANDIVA_FUNCTION_REGISTRY_STRING_H +#define GANDIVA_FUNCTION_REGISTRY_STRING_H + +#include +#include "gandiva/native_function.h" + +namespace gandiva { + +std::vector GetStringFunctionRegistry(); + +} // namespace gandiva + +#endif // GANDIVA_FUNCTION_REGISTRY_STRING_H diff --git a/cpp/src/gandiva/function_registry_timestamp_arithmetic.cc b/cpp/src/gandiva/function_registry_timestamp_arithmetic.cc new file mode 100644 index 0000000000000..7af76909b7d8f --- /dev/null +++ b/cpp/src/gandiva/function_registry_timestamp_arithmetic.cc @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/function_registry_timestamp_arithmetic.h" +#include "gandiva/function_registry_common.h" + +namespace gandiva { + +#define TIMESTAMP_ADD_FNS(name) \ + BINARY_GENERIC_SAFE_NULL_IF_NULL(name, timestamp, int32, timestamp), \ + BINARY_GENERIC_SAFE_NULL_IF_NULL(name, date64, int32, date64), \ + BINARY_GENERIC_SAFE_NULL_IF_NULL(name, timestamp, int64, timestamp), \ + BINARY_GENERIC_SAFE_NULL_IF_NULL(name, date64, int64, date64) + +#define TIMESTAMP_DIFF_FN(name) \ + BINARY_GENERIC_SAFE_NULL_IF_NULL(name, timestamp, timestamp, int32) + +#define DATE_ADD_FNS(name) \ + BINARY_GENERIC_SAFE_NULL_IF_NULL(name, date64, int32, date64), \ + BINARY_GENERIC_SAFE_NULL_IF_NULL(name, timestamp, int32, timestamp), \ + BINARY_GENERIC_SAFE_NULL_IF_NULL(name, date64, int64, date64), \ + BINARY_GENERIC_SAFE_NULL_IF_NULL(name, timestamp, int64, timestamp), \ + BINARY_GENERIC_SAFE_NULL_IF_NULL(name, int32, date64, date64), \ + BINARY_GENERIC_SAFE_NULL_IF_NULL(name, int32, timestamp, timestamp), \ + BINARY_GENERIC_SAFE_NULL_IF_NULL(name, int64, date64, date64), \ + BINARY_GENERIC_SAFE_NULL_IF_NULL(name, int64, timestamp, timestamp) + +#define DATE_DIFF_FNS(name) \ + BINARY_GENERIC_SAFE_NULL_IF_NULL(name, date64, int32, date64), \ + BINARY_GENERIC_SAFE_NULL_IF_NULL(name, timestamp, int32, date64), \ + BINARY_GENERIC_SAFE_NULL_IF_NULL(name, date64, int64, date64), \ + BINARY_GENERIC_SAFE_NULL_IF_NULL(name, timestamp, int64, date64) + +std::vector GetDateTimeArithmeticFunctionRegistry() { + static std::vector datetime_fn_registry_ = { + BINARY_GENERIC_SAFE_NULL_IF_NULL(months_between, date64, date64, float64), + BINARY_GENERIC_SAFE_NULL_IF_NULL(months_between, timestamp, timestamp, float64), + + TIMESTAMP_DIFF_FN(timestampdiffSecond), + TIMESTAMP_DIFF_FN(timestampdiffMinute), + TIMESTAMP_DIFF_FN(timestampdiffHour), + TIMESTAMP_DIFF_FN(timestampdiffDay), + TIMESTAMP_DIFF_FN(timestampdiffWeek), + TIMESTAMP_DIFF_FN(timestampdiffMonth), + TIMESTAMP_DIFF_FN(timestampdiffQuarter), + TIMESTAMP_DIFF_FN(timestampdiffYear), + + 
TIMESTAMP_ADD_FNS(timestampaddSecond), + TIMESTAMP_ADD_FNS(timestampaddMinute), + TIMESTAMP_ADD_FNS(timestampaddHour), + TIMESTAMP_ADD_FNS(timestampaddDay), + TIMESTAMP_ADD_FNS(timestampaddWeek), + TIMESTAMP_ADD_FNS(timestampaddMonth), + TIMESTAMP_ADD_FNS(timestampaddQuarter), + TIMESTAMP_ADD_FNS(timestampaddYear), + + DATE_ADD_FNS(date_add), + DATE_ADD_FNS(add), + + DATE_DIFF_FNS(date_sub), + DATE_DIFF_FNS(subtract), + DATE_DIFF_FNS(date_diff)}; + + return datetime_fn_registry_; +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/function_registry_timestamp_arithmetic.h b/cpp/src/gandiva/function_registry_timestamp_arithmetic.h new file mode 100644 index 0000000000000..f1b97093663ba --- /dev/null +++ b/cpp/src/gandiva/function_registry_timestamp_arithmetic.h @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef GANDIVA_FUNCTION_REGISTRY_TIMESTAMP_ARITHMETIC_H +#define GANDIVA_FUNCTION_REGISTRY_TIMESTAMP_ARITHMETIC_H + +#include +#include "gandiva/native_function.h" + +namespace gandiva { + +std::vector GetDateTimeArithmeticFunctionRegistry(); + +} // namespace gandiva + +#endif // GANDIVA_FUNCTION_REGISTRY_TIMESTAMP_ARITHMETIC_H diff --git a/cpp/src/gandiva/native_function.h b/cpp/src/gandiva/native_function.h index 7a250e01cb619..5b130a9313c5b 100644 --- a/cpp/src/gandiva/native_function.h +++ b/cpp/src/gandiva/native_function.h @@ -52,7 +52,6 @@ class NativeFunction { bool NeedsFunctionHolder() const { return (flags_ & kNeedsFunctionHolder) != 0; } bool CanReturnErrors() const { return (flags_ & kCanReturnErrors) != 0; } - private: NativeFunction(const std::string& base_name, const DataTypeVector& param_types, DataTypePtr ret_type, const ResultNullableType& result_nullable_type, const std::string& pc_name, int32_t flags = 0) @@ -61,6 +60,7 @@ class NativeFunction { result_nullable_type_(result_nullable_type), pc_name_(pc_name) {} + private: FunctionSignature signature_; /// attributes @@ -69,8 +69,6 @@ class NativeFunction { /// pre-compiled function name. std::string pc_name_; - - friend class FunctionRegistry; }; } // end namespace gandiva From 1aecb987790bb78c084a2c8f4ce224acc2dfd13b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 7 Jan 2019 20:14:39 +0100 Subject: [PATCH 162/328] ARROW-4179: [Python] Use more public API to determine whether a test has a pytest mark or not There was an internal API change in pytest 4.1.0 that resulted in our pytest configuration logic failing to check if marks were set or not on unit tests. 
I confirmed that the following fixes the problem both on pytest 4.0.x (where things still worked) and 4.1.0 (when things broke) Author: Wes McKinney Closes #3333 from wesm/ARROW-4179 and squashes the following commits: 646c1cb2 Use iter_markers to get a list of marks for each unit test since the behavior of item.obj changed --- python/pyarrow/tests/conftest.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 3c092cfb60247..daaba59d4d35e 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -146,6 +146,8 @@ def pytest_collection_modifyitems(config, items): def pytest_runtest_setup(item): only_set = False + item_marks = {mark.name: mark for mark in item.iter_markers()} + for group in groups: flag = '--{0}'.format(group) only_flag = '--only-{0}'.format(group) @@ -154,7 +156,7 @@ def pytest_runtest_setup(item): if item.config.getoption(only_flag): only_set = True - elif getattr(item.obj, group, None): + elif group in item_marks: is_enabled = (item.config.getoption(flag) or item.config.getoption(enable_flag)) is_disabled = item.config.getoption(disable_flag) @@ -165,8 +167,7 @@ def pytest_runtest_setup(item): skip_item = True for group in groups: only_flag = '--only-{0}'.format(group) - if (getattr(item.obj, group, False) and - item.config.getoption(only_flag)): + if group in item_marks and item.config.getoption(only_flag): skip_item = False if skip_item: From 72405a1fa3c8548b6d331f9049fed74aefd1e9ae Mon Sep 17 00:00:00 2001 From: Chao Sun Date: Tue, 8 Jan 2019 05:25:47 +0900 Subject: [PATCH 163/328] ARROW-4151: [Rust] Restructure project directories Author: Chao Sun Closes #3325 from sunchao/ARROW-4151 and squashes the following commits: 869dc69f Fix cargo coverage c24223ec Fix CI scripts 0672fa18 Fix 00-prepare.sh 0dbbb08a ARROW-4151: Restructure project directories --- .travis.yml | 2 +- ci/travis_script_rust.sh | 8 +-- dev/release/00-prepare.sh | 6 +-- dev/release/rat_exclude_files.txt | 2 +- rust/Cargo.toml | 46 ++--------------- rust/arrow/Cargo.toml | 57 ++++++++++++++++++++++ rust/{ => arrow}/README.md | 0 rust/{ => arrow}/benches/array_from_vec.rs | 0 rust/{ => arrow}/benches/builder.rs | 0 rust/{ => arrow}/examples/builders.rs | 0 rust/{ => arrow}/examples/dynamic_types.rs | 0 rust/{ => arrow}/examples/read_csv.rs | 0 rust/{ => arrow}/src/array.rs | 0 rust/{ => arrow}/src/array_data.rs | 0 rust/{ => arrow}/src/array_ops.rs | 0 rust/{ => arrow}/src/bitmap.rs | 0 rust/{ => arrow}/src/buffer.rs | 0 rust/{ => arrow}/src/builder.rs | 0 rust/{ => arrow}/src/csv/mod.rs | 0 rust/{ => arrow}/src/csv/reader.rs | 0 rust/{ => arrow}/src/datatypes.rs | 0 rust/{ => arrow}/src/error.rs | 0 rust/{ => arrow}/src/lib.rs | 0 rust/{ => arrow}/src/memory.rs | 0 rust/{ => arrow}/src/mod.rs | 0 rust/{ => arrow}/src/record_batch.rs | 0 rust/{ => arrow}/src/tensor.rs | 0 rust/{ => arrow}/src/util/bit_util.rs | 0 rust/{ => arrow}/src/util/mod.rs | 0 rust/{ => arrow}/src/util/test_util.rs | 0 rust/{ => arrow}/test/data/null_test.csv | 0 rust/{ => arrow}/test/data/uk_cities.csv | 0 rust/parquet/Cargo.toml | 2 +- 33 files changed, 71 insertions(+), 52 deletions(-) create mode 100644 rust/arrow/Cargo.toml rename rust/{ => arrow}/README.md (100%) rename rust/{ => arrow}/benches/array_from_vec.rs (100%) rename rust/{ => arrow}/benches/builder.rs (100%) rename rust/{ => arrow}/examples/builders.rs (100%) rename rust/{ => arrow}/examples/dynamic_types.rs (100%) rename rust/{ => 
arrow}/examples/read_csv.rs (100%) rename rust/{ => arrow}/src/array.rs (100%) rename rust/{ => arrow}/src/array_data.rs (100%) rename rust/{ => arrow}/src/array_ops.rs (100%) rename rust/{ => arrow}/src/bitmap.rs (100%) rename rust/{ => arrow}/src/buffer.rs (100%) rename rust/{ => arrow}/src/builder.rs (100%) rename rust/{ => arrow}/src/csv/mod.rs (100%) rename rust/{ => arrow}/src/csv/reader.rs (100%) rename rust/{ => arrow}/src/datatypes.rs (100%) rename rust/{ => arrow}/src/error.rs (100%) rename rust/{ => arrow}/src/lib.rs (100%) rename rust/{ => arrow}/src/memory.rs (100%) rename rust/{ => arrow}/src/mod.rs (100%) rename rust/{ => arrow}/src/record_batch.rs (100%) rename rust/{ => arrow}/src/tensor.rs (100%) rename rust/{ => arrow}/src/util/bit_util.rs (100%) rename rust/{ => arrow}/src/util/mod.rs (100%) rename rust/{ => arrow}/src/util/test_util.rs (100%) rename rust/{ => arrow}/test/data/null_test.csv (100%) rename rust/{ => arrow}/test/data/uk_cities.csv (100%) diff --git a/.travis.yml b/.travis.yml index 916ccf460ecf8..ffbb691f652f5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -307,7 +307,7 @@ matrix: - pushd ${TRAVIS_BUILD_DIR}/rust # Run coverage for codecov.io - mkdir -p target/kcov - - RUST_BACKTRACE=1 RUSTUP_TOOLCHAIN=stable cargo coverage --verbose + - RUST_BACKTRACE=1 RUSTUP_TOOLCHAIN=nightly cargo coverage --verbose - bash <(curl -s https://codecov.io/bash) || echo "Codecov did not collect coverage reports" - name: Go language: go diff --git a/ci/travis_script_rust.sh b/ci/travis_script_rust.sh index af61dd39446ff..8e3c8c3906b24 100755 --- a/ci/travis_script_rust.sh +++ b/ci/travis_script_rust.sh @@ -31,11 +31,11 @@ rustup show # raises on any formatting errors cargo +stable fmt --all -- --check -# raises on any warnings -cargo rustc -- -D warnings - -cargo build +RUSTFLAGS="-D warnings" cargo build cargo test + +# run examples +cd arrow cargo run --example builders cargo run --example dynamic_types cargo run --example read_csv diff --git a/dev/release/00-prepare.sh b/dev/release/00-prepare.sh index 20d9ab8fce651..1c233a35c21ef 100755 --- a/dev/release/00-prepare.sh +++ b/dev/release/00-prepare.sh @@ -100,9 +100,9 @@ update_versions() { cd "${SOURCE_DIR}/../../rust" sed -i.bak -r -e \ "s/^version = \".+\"/version = \"${version}\"/g" \ - Cargo.toml parquet/Cargo.toml - rm -f Cargo.toml.bak parquet/Cargo.toml.bak - git add Cargo.toml parquet/Cargo.toml + arrow/Cargo.toml parquet/Cargo.toml + rm -f arrow/Cargo.toml.bak parquet/Cargo.toml.bak + git add arrow/Cargo.toml parquet/Cargo.toml cd - } diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 1086793630b7d..720b19d894ace 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -187,5 +187,5 @@ r/README.Rmd r/man/*.Rd .gitattributes ruby/red-arrow/.yardopts -rust/test/data/*.csv +rust/arrow/test/data/*.csv rust/rust-toolchain diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 1bf64d73ade5e..abfb71ada7951 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -15,46 +15,8 @@ # specific language governing permissions and limitations # under the License. 
-[package] -name = "arrow" -version = "0.12.0-SNAPSHOT" -description = "Rust implementation of Apache Arrow" -homepage = "https://github.com/apache/arrow" -repository = "https://github.com/apache/arrow" -authors = ["Apache Arrow "] -license = "Apache-2.0" -keywords = [ "arrow" ] -include = [ - "src/**/*.rs", - "Cargo.toml", -] -edition = "2018" - -[lib] -name = "arrow" -path = "src/lib.rs" - -[dependencies] -bytes = "0.4" -libc = "0.2" -serde = { version = "1.0.80", features = ["alloc", "rc"] } -serde_derive = "1.0.80" -serde_json = "1.0.13" -rand = "0.5" -csv = "1.0.0" -num = "0.2" - -[dev-dependencies] -criterion = "0.2" -lazy_static = "1" - -[[bench]] -name = "array_from_vec" -harness = false - -[[bench]] -name = "builder" -harness = false - [workspace] -members = ["parquet"] \ No newline at end of file +members = [ + "arrow", + "parquet", +] \ No newline at end of file diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml new file mode 100644 index 0000000000000..77e8d53fa55b5 --- /dev/null +++ b/rust/arrow/Cargo.toml @@ -0,0 +1,57 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "arrow" +version = "0.12.0-SNAPSHOT" +description = "Rust implementation of Apache Arrow" +homepage = "https://github.com/apache/arrow" +repository = "https://github.com/apache/arrow" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = [ "arrow" ] +include = [ + "src/**/*.rs", + "Cargo.toml", +] +edition = "2018" + +[lib] +name = "arrow" +path = "src/lib.rs" + +[dependencies] +bytes = "0.4" +libc = "0.2" +serde = { version = "1.0.80", features = ["alloc", "rc"] } +serde_derive = "1.0.80" +serde_json = "1.0.13" +rand = "0.5" +csv = "1.0.0" +num = "0.2" + +[dev-dependencies] +criterion = "0.2" +lazy_static = "1" + +[[bench]] +name = "array_from_vec" +harness = false + +[[bench]] +name = "builder" +harness = false \ No newline at end of file diff --git a/rust/README.md b/rust/arrow/README.md similarity index 100% rename from rust/README.md rename to rust/arrow/README.md diff --git a/rust/benches/array_from_vec.rs b/rust/arrow/benches/array_from_vec.rs similarity index 100% rename from rust/benches/array_from_vec.rs rename to rust/arrow/benches/array_from_vec.rs diff --git a/rust/benches/builder.rs b/rust/arrow/benches/builder.rs similarity index 100% rename from rust/benches/builder.rs rename to rust/arrow/benches/builder.rs diff --git a/rust/examples/builders.rs b/rust/arrow/examples/builders.rs similarity index 100% rename from rust/examples/builders.rs rename to rust/arrow/examples/builders.rs diff --git a/rust/examples/dynamic_types.rs b/rust/arrow/examples/dynamic_types.rs similarity index 100% rename from rust/examples/dynamic_types.rs rename to rust/arrow/examples/dynamic_types.rs diff --git a/rust/examples/read_csv.rs b/rust/arrow/examples/read_csv.rs similarity index 100% rename from rust/examples/read_csv.rs rename to rust/arrow/examples/read_csv.rs diff --git a/rust/src/array.rs b/rust/arrow/src/array.rs similarity index 100% rename from rust/src/array.rs rename to rust/arrow/src/array.rs diff --git a/rust/src/array_data.rs b/rust/arrow/src/array_data.rs similarity index 100% rename from rust/src/array_data.rs rename to rust/arrow/src/array_data.rs diff --git a/rust/src/array_ops.rs b/rust/arrow/src/array_ops.rs similarity index 100% rename from rust/src/array_ops.rs rename to rust/arrow/src/array_ops.rs diff --git a/rust/src/bitmap.rs b/rust/arrow/src/bitmap.rs similarity index 100% rename from rust/src/bitmap.rs rename to rust/arrow/src/bitmap.rs diff --git a/rust/src/buffer.rs b/rust/arrow/src/buffer.rs similarity index 100% rename from rust/src/buffer.rs rename to rust/arrow/src/buffer.rs diff --git a/rust/src/builder.rs b/rust/arrow/src/builder.rs similarity index 100% rename from rust/src/builder.rs rename to rust/arrow/src/builder.rs diff --git a/rust/src/csv/mod.rs b/rust/arrow/src/csv/mod.rs similarity index 100% rename from rust/src/csv/mod.rs rename to rust/arrow/src/csv/mod.rs diff --git a/rust/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs similarity index 100% rename from rust/src/csv/reader.rs rename to rust/arrow/src/csv/reader.rs diff --git a/rust/src/datatypes.rs b/rust/arrow/src/datatypes.rs similarity index 100% rename from rust/src/datatypes.rs rename to rust/arrow/src/datatypes.rs diff --git a/rust/src/error.rs b/rust/arrow/src/error.rs similarity index 100% rename from rust/src/error.rs rename to rust/arrow/src/error.rs diff --git a/rust/src/lib.rs b/rust/arrow/src/lib.rs similarity index 100% rename from rust/src/lib.rs rename to rust/arrow/src/lib.rs diff --git a/rust/src/memory.rs b/rust/arrow/src/memory.rs similarity 
index 100% rename from rust/src/memory.rs rename to rust/arrow/src/memory.rs diff --git a/rust/src/mod.rs b/rust/arrow/src/mod.rs similarity index 100% rename from rust/src/mod.rs rename to rust/arrow/src/mod.rs diff --git a/rust/src/record_batch.rs b/rust/arrow/src/record_batch.rs similarity index 100% rename from rust/src/record_batch.rs rename to rust/arrow/src/record_batch.rs diff --git a/rust/src/tensor.rs b/rust/arrow/src/tensor.rs similarity index 100% rename from rust/src/tensor.rs rename to rust/arrow/src/tensor.rs diff --git a/rust/src/util/bit_util.rs b/rust/arrow/src/util/bit_util.rs similarity index 100% rename from rust/src/util/bit_util.rs rename to rust/arrow/src/util/bit_util.rs diff --git a/rust/src/util/mod.rs b/rust/arrow/src/util/mod.rs similarity index 100% rename from rust/src/util/mod.rs rename to rust/arrow/src/util/mod.rs diff --git a/rust/src/util/test_util.rs b/rust/arrow/src/util/test_util.rs similarity index 100% rename from rust/src/util/test_util.rs rename to rust/arrow/src/util/test_util.rs diff --git a/rust/test/data/null_test.csv b/rust/arrow/test/data/null_test.csv similarity index 100% rename from rust/test/data/null_test.csv rename to rust/arrow/test/data/null_test.csv diff --git a/rust/test/data/uk_cities.csv b/rust/arrow/test/data/uk_cities.csv similarity index 100% rename from rust/test/data/uk_cities.csv rename to rust/arrow/test/data/uk_cities.csv diff --git a/rust/parquet/Cargo.toml b/rust/parquet/Cargo.toml index e0272ab4f09e1..3cb4f05052315 100644 --- a/rust/parquet/Cargo.toml +++ b/rust/parquet/Cargo.toml @@ -40,7 +40,7 @@ lz4 = "1.23" zstd = "0.4" chrono = "0.4" num-bigint = "0.2" -arrow = { path = ".." } +arrow = { path = "../arrow" } [dev-dependencies] lazy_static = "1" From b92b1f5b08a64004c8b35db24a34ac71de7bd0e3 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 7 Jan 2019 14:33:51 -0600 Subject: [PATCH 164/328] ARROW-4125: [Python] Don't fail ASV if Plasma extension is not built (e.g. on Windows) I would guess I'm the first person to try to run the benchmark suite on Windows! Author: Wes McKinney Closes #3271 from wesm/benchmark-no-plasma and squashes the following commits: c99b76fae flake c7ede9fd4 Revert whitespace change 4938932d2 Check for ImportError in benchmarks/plasma.py 008ae7b98 Don't fail ASV if Plasma extension is not built (e.g. on Windows) --- python/benchmarks/plasma.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/benchmarks/plasma.py b/python/benchmarks/plasma.py index 398ec72561255..90a284515315c 100644 --- a/python/benchmarks/plasma.py +++ b/python/benchmarks/plasma.py @@ -18,7 +18,12 @@ import numpy as np import timeit -import pyarrow.plasma as plasma +try: + import pyarrow.plasma as plasma +except ImportError: + # TODO(wesm): These are not asv benchmarks, so we can just fail + # silently here + pass class SimplePlasmaThroughput(object): From 134081bea48d48307ed08b2e638fa40a3415ba77 Mon Sep 17 00:00:00 2001 From: jlapacik Date: Mon, 7 Jan 2019 15:47:33 -0600 Subject: [PATCH 165/328] ARROW-4126: [Go] offset not used when accessing boolean array Closes https://github.com/apache/arrow/issues/3273 . 
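Before this change, Boolean.Value(i) tested bit i of the value buffer without adding the array's data offset, so a Boolean obtained through array.NewSlice read bits from the start of the parent buffer instead of from the slice's own window. A minimal sketch of the failure mode (illustration only, not part of this patch; it uses the same builder and slice APIs exercised by the new boolean_test.go below):

    package main

    import (
        "fmt"

        "github.com/apache/arrow/go/arrow/array"
        "github.com/apache/arrow/go/arrow/memory"
    )

    func main() {
        pool := memory.NewGoAllocator()

        b := array.NewBooleanBuilder(pool)
        defer b.Release()
        b.AppendValues([]bool{true, false, false, true, true}, nil)

        arr := b.NewArray().(*array.Boolean)
        defer arr.Release()

        // The slice [2, 5) shares arr's buffers and records a data offset of 2.
        slice := array.NewSlice(arr, 2, 5).(*array.Boolean)
        defer slice.Release()

        // With the offset applied this prints "false" (arr.Value(2)); the old
        // code ignored the offset and read bit 0 of the shared buffer ("true").
        fmt.Println(slice.Value(0))
    }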
Author: jlapacik Closes #3275 from jlapacik/fix/go-boolean-slice and squashes the following commits: 67c5d739a assign slice value in out of bounds tests 9e3ac33dd allocate new slice for each test case 6901d09f1 ARROW-4126: offset not used when accessing boolean array --- go/arrow/array/boolean.go | 7 +- go/arrow/array/boolean_test.go | 260 +++++++++++++++++++++++++++++++++ 2 files changed, 266 insertions(+), 1 deletion(-) create mode 100644 go/arrow/array/boolean_test.go diff --git a/go/arrow/array/boolean.go b/go/arrow/array/boolean.go index 19a692345e357..68de951e0ce8c 100644 --- a/go/arrow/array/boolean.go +++ b/go/arrow/array/boolean.go @@ -45,7 +45,12 @@ func NewBooleanData(data *Data) *Boolean { return a } -func (a *Boolean) Value(i int) bool { return bitutil.BitIsSet(a.values, i) } +func (a *Boolean) Value(i int) bool { + if i < 0 || i >= a.array.data.length { + panic("arrow/array: index out of range") + } + return bitutil.BitIsSet(a.values, a.array.data.offset+i) +} func (a *Boolean) String() string { o := new(strings.Builder) diff --git a/go/arrow/array/boolean_test.go b/go/arrow/array/boolean_test.go new file mode 100644 index 0000000000000..e6f4b9bf2bc51 --- /dev/null +++ b/go/arrow/array/boolean_test.go @@ -0,0 +1,260 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package array_test + +import ( + "reflect" + "testing" + + "github.com/apache/arrow/go/arrow/array" + "github.com/apache/arrow/go/arrow/memory" +) + +func TestBooleanSliceData(t *testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + values := []bool{true, false, true, true, true, true, true, false, true, false} + + b := array.NewBooleanBuilder(pool) + defer b.Release() + + for _, v := range values { + b.Append(v) + } + + arr := b.NewArray().(*array.Boolean) + defer arr.Release() + + if got, want := arr.Len(), len(values); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + vs := make([]bool, arr.Len()) + + for i := range vs { + vs[i] = arr.Value(i) + } + + if got, want := vs, values; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + + tests := []struct { + interval [2]int64 + want []bool + }{ + { + interval: [2]int64{0, 0}, + want: []bool{}, + }, + { + interval: [2]int64{10, 10}, + want: []bool{}, + }, + { + interval: [2]int64{0, 5}, + want: []bool{true, false, true, true, true}, + }, + { + interval: [2]int64{5, 10}, + want: []bool{true, true, false, true, false}, + }, + { + interval: [2]int64{2, 7}, + want: []bool{true, true, true, true, true}, + }, + } + + for _, tc := range tests { + t.Run("", func(t *testing.T) { + + slice := array.NewSlice(arr, tc.interval[0], tc.interval[1]).(*array.Boolean) + defer slice.Release() + + if got, want := slice.Len(), len(tc.want); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + vs := make([]bool, slice.Len()) + + for i := range vs { + vs[i] = slice.Value(i) + } + + if got, want := vs, tc.want; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + }) + } +} + +func TestBooleanSliceDataWithNull(t *testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + values := []bool{true, false, true, false, false, false, true, false, true, false} + valids := []bool{true, false, true, true, true, true, true, false, true, true} + + b := array.NewBooleanBuilder(pool) + defer b.Release() + + b.AppendValues(values, valids) + + arr := b.NewArray().(*array.Boolean) + defer arr.Release() + + if got, want := arr.Len(), len(valids); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.NullN(), 2; got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + vs := make([]bool, arr.Len()) + + for i := range vs { + vs[i] = arr.Value(i) + } + + if got, want := vs, values; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + + tests := []struct { + interval [2]int64 + nulls int + want []bool + }{ + { + interval: [2]int64{2, 9}, + nulls: 1, + want: []bool{true, false, false, false, true, false, true}, + }, + { + interval: [2]int64{0, 7}, + nulls: 1, + want: []bool{true, false, true, false, false, false, true}, + }, + { + interval: [2]int64{1, 8}, + nulls: 2, + want: []bool{false, true, false, false, false, true, false}, + }, + { + interval: [2]int64{2, 7}, + nulls: 0, + want: []bool{true, false, false, false, true}, + }, + } + + for _, tc := range tests { + t.Run("", func(t *testing.T) { + + slice := array.NewSlice(arr, tc.interval[0], tc.interval[1]).(*array.Boolean) + defer slice.Release() + + if got, want := slice.NullN(), tc.nulls; got != want { + t.Errorf("got=%d, want=%d", got, want) + } + + if got, want := slice.Len(), len(tc.want); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + vs := make([]bool, slice.Len()) + + for 
i := range vs { + vs[i] = slice.Value(i) + } + + if got, want := vs, tc.want; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + }) + } +} + +func TestBooleanSliceOutOfBounds(t *testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + values := []bool{true, false, true, false, true, false, true, false, true, false} + + b := array.NewBooleanBuilder(pool) + defer b.Release() + + for _, v := range values { + b.Append(v) + } + + arr := b.NewArray().(*array.Boolean) + defer arr.Release() + + slice := array.NewSlice(arr, 3, 8).(*array.Boolean) + defer slice.Release() + + tests := []struct { + index int + panic bool + }{ + { + index: -1, + panic: true, + }, + { + index: 5, + panic: true, + }, + { + index: 0, + panic: false, + }, + { + index: 4, + panic: false, + }, + } + + for _, tc := range tests { + t.Run("", func(t *testing.T) { + + var val bool + + if tc.panic { + defer func() { + e := recover() + if e == nil { + t.Fatalf("this should have panicked, but did not; slice value %v", val) + } + if got, want := e.(string), "arrow/array: index out of range"; got != want { + t.Fatalf("invalid error. got=%q, want=%q", got, want) + } + }() + } else { + defer func() { + if e := recover(); e != nil { + t.Fatalf("unexpected panic: %v", e) + } + }() + } + + val = slice.Value(tc.index) + }) + } +} From 0eadd412eb1bf10ebd7ec6babcd18a6852fb82a2 Mon Sep 17 00:00:00 2001 From: Chao Sun Date: Mon, 7 Jan 2019 19:33:09 -0700 Subject: [PATCH 166/328] ARROW-3665: [Rust] Implement StructArrayBuilder This implements `StructArrayBuilder` which can be used to build struct arrays. There are some trickness in terms of being able to store child builders of different types. A natural way is to box them into `ArrayBuilder` trait and store in a vector. But, this makes it impossible to cast them into specific type in `field_builder()`. To solve the above issue, this maintains two references to each input builder instance, one in `Box` type and another in `Box` type. The former is used for casting mentioned above, while the latter is used for calling general methods on an builder, such as `len()` and `finish()`. To enable this, this also changed `ArrayBuilder::finish` to return a `ArrayRef` instead of a specific array. The old `finish` method is implemented on each specific array builder, so one can obtain, say, `Int32Array` from a `Int32ArrayBuilder`. Author: Chao Sun Closes #3276 from sunchao/ARROW-3665 and squashes the following commits: 8fa3b61 Rename to_any to into_box_any 8aac785 Add append 227f3b2 Add from_schema 0e8e669 Fix rustfmt issues 5f59518 ARROW-3665: Implement StructArrayBuilder --- rust/arrow/examples/builders.rs | 2 +- rust/arrow/src/array.rs | 12 +- rust/arrow/src/array_ops.rs | 2 +- rust/arrow/src/bitmap.rs | 2 +- rust/arrow/src/builder.rs | 465 ++++++++++++++++++++++++++++---- rust/arrow/src/csv/reader.rs | 18 +- rust/arrow/src/datatypes.rs | 2 +- 7 files changed, 439 insertions(+), 64 deletions(-) diff --git a/rust/arrow/examples/builders.rs b/rust/arrow/examples/builders.rs index 5273558d966e0..92f45ce67d981 100644 --- a/rust/arrow/examples/builders.rs +++ b/rust/arrow/examples/builders.rs @@ -18,7 +18,7 @@ ///! 
Many builders are available to easily create different types of arrow arrays extern crate arrow; -use arrow::builder::{ArrayBuilder, Int32Builder}; +use arrow::builder::Int32Builder; fn main() { // Primitive Arrays diff --git a/rust/arrow/src/array.rs b/rust/arrow/src/array.rs index 251dd35eea150..5184b66426399 100644 --- a/rust/arrow/src/array.rs +++ b/rust/arrow/src/array.rs @@ -568,11 +568,17 @@ impl From for BinaryArray { "BinaryArray can only be created from List arrays, mismatched data types." ); - let data = ArrayData::builder(DataType::Utf8) + let mut builder = ArrayData::builder(DataType::Utf8) .len(v.len()) .add_buffer(v.data().buffers()[0].clone()) - .add_buffer(v.data().child_data()[0].buffers()[0].clone()) - .build(); + .add_buffer(v.data().child_data()[0].buffers()[0].clone()); + if let Some(bitmap) = v.data().null_bitmap() { + builder = builder + .null_count(v.data().null_count()) + .null_bit_buffer(bitmap.bits.clone()) + } + + let data = builder.build(); Self::from(data) } } diff --git a/rust/arrow/src/array_ops.rs b/rust/arrow/src/array_ops.rs index 517111ba76a45..69637094942cf 100644 --- a/rust/arrow/src/array_ops.rs +++ b/rust/arrow/src/array_ops.rs @@ -22,7 +22,7 @@ use std::ops::{Add, Div, Mul, Sub}; use num::Zero; use crate::array::{Array, BooleanArray, PrimitiveArray}; -use crate::builder::{ArrayBuilder, PrimitiveArrayBuilder}; +use crate::builder::PrimitiveArrayBuilder; use crate::datatypes; use crate::datatypes::ArrowNumericType; use crate::error::{ArrowError, Result}; diff --git a/rust/arrow/src/bitmap.rs b/rust/arrow/src/bitmap.rs index 3d5a77f78a51e..b5771c2b171c8 100644 --- a/rust/arrow/src/bitmap.rs +++ b/rust/arrow/src/bitmap.rs @@ -20,7 +20,7 @@ use crate::util::bit_util; #[derive(PartialEq, Debug)] pub struct Bitmap { - bits: Buffer, + pub(crate) bits: Buffer, } impl Bitmap { diff --git a/rust/arrow/src/builder.rs b/rust/arrow/src/builder.rs index a4c8666233877..b762c516331eb 100644 --- a/rust/arrow/src/builder.rs +++ b/rust/arrow/src/builder.rs @@ -22,6 +22,7 @@ use std::any::Any; use std::io::Write; use std::marker::PhantomData; use std::mem; +use std::sync::Arc; use crate::array::*; use crate::array_data::ArrayData; @@ -211,15 +212,12 @@ impl BufferBuilderTrait for BufferBuilder { } /// Trait for dealing with different array builders at runtime -pub trait ArrayBuilder { - /// The type of array that this builder creates - type ArrayType: Array; - +pub trait ArrayBuilder: Any { /// Returns the number of array slots in the builder fn len(&self) -> usize; /// Builds the array - fn finish(&mut self) -> Self::ArrayType; + fn finish(&mut self) -> ArrayRef; /// Returns the builder as an non-mutable `Any` reference. /// @@ -234,6 +232,9 @@ pub trait ArrayBuilder { /// type. In this case, one can first cast this into a `Any`, and then use /// `downcast_mut` to get a reference on the specific builder. fn as_any_mut(&mut self) -> &mut Any; + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box; } /// Array builder for fixed-width primitive types @@ -255,8 +256,6 @@ pub type Float32Builder = PrimitiveArrayBuilder; pub type Float64Builder = PrimitiveArrayBuilder; impl ArrayBuilder for PrimitiveArrayBuilder { - type ArrayType = PrimitiveArray; - /// Returns the builder as an non-mutable `Any` reference. fn as_any(&self) -> &Any { self @@ -267,22 +266,19 @@ impl ArrayBuilder for PrimitiveArrayBuilder { self } + /// Returns the boxed builder as a box of `Any`. 
+ fn into_box_any(self: Box) -> Box { + self + } + /// Returns the number of array slots in the builder fn len(&self) -> usize { self.values_builder.len } - /// Builds the `PrimitiveArray` and reset this builder. - fn finish(&mut self) -> PrimitiveArray { - let len = self.len(); - let null_bit_buffer = self.bitmap_builder.finish(); - let data = ArrayData::builder(T::get_data_type()) - .len(len) - .null_count(len - bit_util::count_set_bits(null_bit_buffer.data())) - .add_buffer(self.values_builder.finish()) - .null_bit_buffer(null_bit_buffer) - .build(); - PrimitiveArray::::from(data) + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) } } @@ -329,6 +325,23 @@ impl PrimitiveArrayBuilder { self.values_builder.push_slice(v)?; Ok(()) } + + /// Builds the `PrimitiveArray` and reset this builder. + pub fn finish(&mut self) -> PrimitiveArray { + let len = self.len(); + let null_bit_buffer = self.bitmap_builder.finish(); + let null_count = len - bit_util::count_set_bits(null_bit_buffer.data()); + let mut builder = ArrayData::builder(T::get_data_type()) + .len(len) + .add_buffer(self.values_builder.finish()); + if null_count > 0 { + builder = builder + .null_count(null_count) + .null_bit_buffer(null_bit_buffer); + } + let data = builder.build(); + PrimitiveArray::::from(data) + } } /// Array builder for `ListArray` @@ -357,8 +370,6 @@ impl ArrayBuilder for ListArrayBuilder where T: 'static, { - type ArrayType = ListArray; - /// Returns the builder as an non-mutable `Any` reference. fn as_any(&self) -> &Any { self @@ -369,13 +380,45 @@ where self } + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + /// Returns the number of array slots in the builder fn len(&self) -> usize { self.len } + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } +} + +impl ListArrayBuilder +where + T: 'static, +{ + /// Returns the child array builder as a mutable reference. + /// + /// This mutable reference can be used to push values into the child array builder, + /// but you must call `append` to delimit each distinct list value. + pub fn values(&mut self) -> &mut T { + &mut self.values_builder + } + + /// Finish the current variable-length list array slot + pub fn append(&mut self, is_valid: bool) -> Result<()> { + self.offsets_builder + .push(self.values_builder.len() as i32)?; + self.bitmap_builder.push(is_valid)?; + self.len += 1; + Ok(()) + } + /// Builds the `ListArray` and reset this builder. - fn finish(&mut self) -> ListArray { + pub fn finish(&mut self) -> ListArray { let len = self.len(); self.len = 0; let values_arr = self @@ -401,33 +444,12 @@ where } } -impl ListArrayBuilder { - /// Returns the child array builder as a mutable reference. - /// - /// This mutable reference can be used to push values into the child array builder, - /// but you must call `append` to delimit each distinct list value. 
- pub fn values(&mut self) -> &mut T { - &mut self.values_builder - } - - /// Finish the current variable-length list array slot - pub fn append(&mut self, is_valid: bool) -> Result<()> { - self.offsets_builder - .push(self.values_builder.len() as i32)?; - self.bitmap_builder.push(is_valid)?; - self.len += 1; - Ok(()) - } -} - /// Array builder for `BinaryArray` pub struct BinaryArrayBuilder { builder: ListArrayBuilder, } impl ArrayBuilder for BinaryArrayBuilder { - type ArrayType = BinaryArray; - /// Returns the builder as an non-mutable `Any` reference. fn as_any(&self) -> &Any { self @@ -438,14 +460,19 @@ impl ArrayBuilder for BinaryArrayBuilder { self } + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + /// Returns the number of array slots in the builder fn len(&self) -> usize { self.builder.len() } - /// Builds the `BinaryArray` and reset this builder. - fn finish(&mut self) -> BinaryArray { - BinaryArray::from(self.builder.finish()) + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) } } @@ -481,6 +508,179 @@ impl BinaryArrayBuilder { pub fn append(&mut self, is_valid: bool) -> Result<()> { self.builder.append(is_valid) } + + /// Append a null value to the array. + pub fn append_null(&mut self) -> Result<()> { + self.append(false) + } + + /// Builds the `BinaryArray` and reset this builder. + pub fn finish(&mut self) -> BinaryArray { + BinaryArray::from(self.builder.finish()) + } +} + +/// Array builder for Struct types. +/// +/// Note that callers should make sure that methods of all the child field builders are +/// properly called to maintain the consistency of the data structure. +pub struct StructArrayBuilder { + fields: Vec, + field_anys: Vec>, + field_builders: Vec>, + bitmap_builder: BooleanBufferBuilder, + len: usize, +} + +impl ArrayBuilder for StructArrayBuilder { + /// Returns the number of array slots in the builder. + /// + /// Note that this always return the first child field builder's length, and it is + /// the caller's responsibility to maintain the consistency that all the child field + /// builder should have the equal number of elements. + fn len(&self) -> usize { + self.len + } + + /// Builds the array. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Returns the builder as an non-mutable `Any` reference. + /// + /// This is most useful when one wants to call non-mutable APIs on a specific builder + /// type. In this case, one can first cast this into a `Any`, and then use + /// `downcast_ref` to get a reference on the specific builder. + fn as_any(&self) -> &Any { + self + } + + /// Returns the builder as an mutable `Any` reference. + /// + /// This is most useful when one wants to call mutable APIs on a specific builder + /// type. In this case, one can first cast this into a `Any`, and then use + /// `downcast_mut` to get a reference on the specific builder. + fn as_any_mut(&mut self) -> &mut Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } +} + +impl StructArrayBuilder { + pub fn new(fields: Vec, builders: Vec>) -> Self { + let mut field_anys = Vec::with_capacity(builders.len()); + let mut field_builders = Vec::with_capacity(builders.len()); + + // Create and maintain two references for each of the input builder. 
We need the + // extra `Any` reference because we need to cast the builder to a specific type + // in `field_builder()` by calling `downcast_mut`. + for f in builders.into_iter() { + let raw_f = Box::into_raw(f); + let raw_f_copy = raw_f; + unsafe { + field_anys.push(Box::from_raw(raw_f).into_box_any()); + field_builders.push(Box::from_raw(raw_f_copy)); + } + } + + Self { + fields, + field_anys, + field_builders, + bitmap_builder: BooleanBufferBuilder::new(0), + len: 0, + } + } + + pub fn from_schema(schema: Schema, capacity: usize) -> Self { + let fields = schema.fields(); + let mut builders = Vec::with_capacity(fields.len()); + for f in schema.fields() { + builders.push(Self::from_field(f.clone(), capacity)); + } + Self::new(schema.fields, builders) + } + + fn from_field(f: Field, capacity: usize) -> Box { + match f.data_type() { + DataType::Boolean => Box::new(BooleanBuilder::new(capacity)), + DataType::Int8 => Box::new(Int8Builder::new(capacity)), + DataType::Int16 => Box::new(Int16Builder::new(capacity)), + DataType::Int32 => Box::new(Int32Builder::new(capacity)), + DataType::Int64 => Box::new(Int64Builder::new(capacity)), + DataType::UInt8 => Box::new(UInt8Builder::new(capacity)), + DataType::UInt16 => Box::new(UInt16Builder::new(capacity)), + DataType::UInt32 => Box::new(UInt32Builder::new(capacity)), + DataType::UInt64 => Box::new(UInt64Builder::new(capacity)), + DataType::Float32 => Box::new(Float32Builder::new(capacity)), + DataType::Float64 => Box::new(Float64Builder::new(capacity)), + DataType::Utf8 => Box::new(BinaryArrayBuilder::new(capacity)), + DataType::Struct(fields) => { + let schema = Schema::new(fields.clone()); + Box::new(Self::from_schema(schema, capacity)) + } + t @ _ => panic!("Data type {:?} is not currently supported", t), + } + } + + /// Returns a mutable reference to the child field builder at index `i`. + /// Result will be `None` if the input type `T` provided doesn't match the actual + /// field builder's type. + pub fn field_builder(&mut self, i: usize) -> Option<&mut T> { + self.field_anys[i].downcast_mut::() + } + + /// Returns the number of fields for the struct this builder is building. + pub fn num_fields(&self) -> usize { + self.field_builders.len() + } + + /// Appends an element (either null or non-null) to the struct. The actual elements + /// should be appended for each child sub-array in a consistent way. + pub fn append(&mut self, is_valid: bool) -> Result<()> { + self.bitmap_builder.push(is_valid)?; + self.len += 1; + Ok(()) + } + + /// Appends a null element to the struct. + pub fn append_null(&mut self) -> Result<()> { + self.append(false) + } + + /// Builds the `StructArray` and reset this builder. + pub fn finish(&mut self) -> StructArray { + let mut child_data = Vec::with_capacity(self.field_builders.len()); + for f in &mut self.field_builders { + let arr = f.finish(); + child_data.push(arr.data()); + } + + let null_bit_buffer = self.bitmap_builder.finish(); + let null_count = self.len - bit_util::count_set_bits(null_bit_buffer.data()); + let mut builder = ArrayData::builder(DataType::Struct(self.fields.clone())) + .len(self.len) + .child_data(child_data); + if null_count > 0 { + builder = builder + .null_count(null_count) + .null_bit_buffer(null_bit_buffer); + } + StructArray::from(builder.build()) + } +} + +impl Drop for StructArrayBuilder { + fn drop(&mut self) { + // To avoid double drop on the field array builders. 
+ let builders = ::std::mem::replace(&mut self.field_builders, Vec::new()); + ::std::mem::forget(builders); + } } #[cfg(test)] @@ -488,6 +688,7 @@ mod tests { use super::*; use crate::array::Array; + use crate::bitmap::Bitmap; #[test] fn test_builder_i32_empty() { @@ -983,4 +1184,178 @@ mod tests { assert_eq!(5, binary_array.value_offset(2)); assert_eq!(5, binary_array.value_length(2)); } + + #[test] + fn test_struct_array_builder() { + let string_builder = BinaryArrayBuilder::new(4); + let int_builder = Int32Builder::new(4); + + let mut fields = Vec::new(); + let mut field_builders = Vec::new(); + fields.push(Field::new("f1", DataType::Utf8, false)); + field_builders.push(Box::new(string_builder) as Box); + fields.push(Field::new("f2", DataType::Int32, false)); + field_builders.push(Box::new(int_builder) as Box); + + let mut builder = StructArrayBuilder::new(fields, field_builders); + assert_eq!(2, builder.num_fields()); + + let string_builder = builder + .field_builder::(0) + .expect("builder at field 0 should be binary builder"); + string_builder.push_string("joe").unwrap(); + string_builder.append_null().unwrap(); + string_builder.append_null().unwrap(); + string_builder.push_string("mark").unwrap(); + + let int_builder = builder + .field_builder::(1) + .expect("builder at field 1 should be int builder"); + int_builder.push(1).unwrap(); + int_builder.push(2).unwrap(); + int_builder.push_null().unwrap(); + int_builder.push(4).unwrap(); + + builder.append(true).unwrap(); + builder.append(true).unwrap(); + builder.append_null().unwrap(); + builder.append(true).unwrap(); + + let arr = builder.finish(); + + let struct_data = arr.data(); + assert_eq!(4, struct_data.len()); + assert_eq!(1, struct_data.null_count()); + assert_eq!( + &Some(Bitmap::from(Buffer::from(&[11_u8]))), + struct_data.null_bitmap() + ); + + let expected_string_data = ArrayData::builder(DataType::Utf8) + .len(4) + .null_count(2) + .null_bit_buffer(Buffer::from(&[9_u8])) + .add_buffer(Buffer::from(&[0, 3, 3, 3, 7].to_byte_slice())) + .add_buffer(Buffer::from("joemark".as_bytes())) + .build(); + + let expected_int_data = ArrayData::builder(DataType::Int32) + .len(4) + .null_count(1) + .null_bit_buffer(Buffer::from(&[11_u8])) + .add_buffer(Buffer::from(&[1, 2, 0, 4].to_byte_slice())) + .build(); + + assert_eq!(expected_string_data, arr.column(0).data()); + + // TODO: implement equality for ArrayData + assert_eq!(expected_int_data.len(), arr.column(1).data().len()); + assert_eq!( + expected_int_data.null_count(), + arr.column(1).data().null_count() + ); + assert_eq!( + expected_int_data.null_bitmap(), + arr.column(1).data().null_bitmap() + ); + let expected_value_buf = expected_int_data.buffers()[0].clone(); + let actual_value_buf = arr.column(1).data().buffers()[0].clone(); + for i in 0..expected_int_data.len() { + if !expected_int_data.is_null(i) { + assert_eq!( + expected_value_buf.data()[i * 4..(i + 1) * 4], + actual_value_buf.data()[i * 4..(i + 1) * 4] + ); + } + } + } + + #[test] + fn test_struct_array_builder_finish() { + let int_builder = Int32Builder::new(10); + let bool_builder = BooleanBuilder::new(10); + + let mut fields = Vec::new(); + let mut field_builders = Vec::new(); + fields.push(Field::new("f1", DataType::Int32, false)); + field_builders.push(Box::new(int_builder) as Box); + fields.push(Field::new("f2", DataType::Boolean, false)); + field_builders.push(Box::new(bool_builder) as Box); + + let mut builder = StructArrayBuilder::new(fields, field_builders); + builder + .field_builder::(0) + .unwrap() + 
.push_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + .unwrap(); + builder + .field_builder::(1) + .unwrap() + .push_slice(&[ + false, true, false, true, false, true, false, true, false, true, + ]) + .unwrap(); + + let arr = builder.finish(); + assert_eq!(10, arr.len()); + assert_eq!(0, builder.len()); + + builder + .field_builder::(0) + .unwrap() + .push_slice(&[1, 3, 5, 7, 9]) + .unwrap(); + builder + .field_builder::(1) + .unwrap() + .push_slice(&[false, true, false, true, false]) + .unwrap(); + + let arr = builder.finish(); + assert_eq!(5, arr.len()); + assert_eq!(0, builder.len()); + } + + #[test] + fn test_struct_array_builder_from_schema() { + let mut fields = Vec::new(); + fields.push(Field::new("f1", DataType::Float32, false)); + fields.push(Field::new("f2", DataType::Utf8, false)); + let mut sub_fields = Vec::new(); + sub_fields.push(Field::new("g1", DataType::Int32, false)); + sub_fields.push(Field::new("g2", DataType::Boolean, false)); + let struct_type = DataType::Struct(sub_fields); + fields.push(Field::new("f3", struct_type, false)); + + let mut builder = StructArrayBuilder::from_schema(Schema::new(fields), 5); + assert_eq!(3, builder.num_fields()); + assert!(builder.field_builder::(0).is_some()); + assert!(builder.field_builder::(1).is_some()); + assert!(builder.field_builder::(2).is_some()); + } + + #[test] + #[should_panic(expected = "Data type List(Int64) is not currently supported")] + fn test_struct_array_builder_from_schema_unsupported_type() { + let mut fields = Vec::new(); + fields.push(Field::new("f1", DataType::Int16, false)); + let list_type = DataType::List(Box::new(DataType::Int64)); + fields.push(Field::new("f2", list_type, false)); + + let _ = StructArrayBuilder::from_schema(Schema::new(fields), 5); + } + + #[test] + fn test_struct_array_builder_field_builder_type_mismatch() { + let int_builder = Int32Builder::new(10); + + let mut fields = Vec::new(); + let mut field_builders = Vec::new(); + fields.push(Field::new("f1", DataType::Int32, false)); + field_builders.push(Box::new(int_builder) as Box); + + let mut builder = StructArrayBuilder::new(fields, field_builders); + assert!(builder.field_builder::(0).is_none()); + } + } diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs index b9c46fc3217cc..10be0abe96081 100644 --- a/rust/arrow/src/csv/reader.rs +++ b/rust/arrow/src/csv/reader.rs @@ -46,7 +46,7 @@ use std::sync::Arc; use csv as csv_crate; -use crate::array::{ArrayRef, BinaryArray}; +use crate::array::ArrayRef; use crate::builder::*; use crate::datatypes::*; use crate::error::{ArrowError, Result}; @@ -138,20 +138,14 @@ impl Reader { &DataType::Float32 => self.build_primitive_array::(rows, i), &DataType::Float64 => self.build_primitive_array::(rows, i), &DataType::Utf8 => { - let values_builder: UInt8Builder = UInt8Builder::new(rows.len()); - let mut list_builder = ListArrayBuilder::new(values_builder); + let mut builder = BinaryArrayBuilder::new(rows.len()); for row_index in 0..rows.len() { match rows[row_index].get(*i) { - Some(s) => { - list_builder.values().push_slice(s.as_bytes()).unwrap(); - list_builder.append(true).unwrap(); - } - _ => { - list_builder.append(false).unwrap(); - } + Some(s) => builder.push_string(s).unwrap(), + _ => builder.append(false).unwrap(), } } - Ok(Arc::new(BinaryArray::from(list_builder.finish())) as ArrayRef) + Ok(Arc::new(builder.finish()) as ArrayRef) } other => Err(ArrowError::ParseError(format!( "Unsupported data type {:?}", @@ -196,7 +190,7 @@ impl Reader { _ => builder.push_null()?, } } - 
Ok(Arc::new(builder.finish()) as ArrayRef) + Ok(Arc::new(builder.finish())) } } diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index 49e06eb0969b2..0627b4523a1ce 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -350,7 +350,7 @@ impl fmt::Display for Field { /// layout. #[derive(Serialize, Deserialize, Debug, Clone)] pub struct Schema { - fields: Vec, + pub(crate) fields: Vec, } impl Schema { From a1ea48b51982e9ac13b28728edf8e009527eea2e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 7 Jan 2019 20:14:13 -0700 Subject: [PATCH 167/328] ARROW-4185: [Rust] Change directory before running Rust examples on Windows Author: Andy Grove Closes #3341 from andygrove/ARROW-4185 and squashes the following commits: c9fa73e Change directory before running Rust examples --- ci/rust-build-main.bat | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/rust-build-main.bat b/ci/rust-build-main.bat index 6ef451204d45a..ac5c9e7589245 100644 --- a/ci/rust-build-main.bat +++ b/ci/rust-build-main.bat @@ -36,6 +36,7 @@ cargo test --target %TARGET% --release || exit /B @echo @echo Run example (release) @echo --------------------- +cd arrow cargo run --example builders --target %TARGET% --release || exit /B cargo run --example dynamic_types --target %TARGET% --release || exit /B cargo run --example read_csv --target %TARGET% --release || exit /B From 1143942bc5264d89a343031e522ffc5aa7abf7b3 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 7 Jan 2019 22:51:10 -0700 Subject: [PATCH 168/328] ARROW-4042: [Rust] Rename BinaryArray::get_value to value This PR makes BinaryArray consistent with PrimitiveArray (and with the C++ implementation) Author: Andy Grove Closes #3343 from andygrove/ARROW-4042 and squashes the following commits: 861d09e Rename BinaryArray::get_value to value for consistency with PrimitiveArray --- rust/arrow/examples/read_csv.rs | 2 +- rust/arrow/src/array.rs | 16 ++++++++-------- rust/arrow/src/builder.rs | 12 ++++++------ rust/arrow/src/csv/reader.rs | 2 +- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/rust/arrow/examples/read_csv.rs b/rust/arrow/examples/read_csv.rs index 147d2f9c23845..fd15e333bcfc9 100644 --- a/rust/arrow/examples/read_csv.rs +++ b/rust/arrow/examples/read_csv.rs @@ -59,7 +59,7 @@ fn main() { .unwrap(); for i in 0..batch.num_rows() { - let city_name: String = String::from_utf8(city.get_value(i).to_vec()).unwrap(); + let city_name: String = String::from_utf8(city.value(i).to_vec()).unwrap(); println!( "City: {}, Latitude: {}, Longitude: {}", diff --git a/rust/arrow/src/array.rs b/rust/arrow/src/array.rs index 5184b66426399..f8272eb007db6 100644 --- a/rust/arrow/src/array.rs +++ b/rust/arrow/src/array.rs @@ -470,7 +470,7 @@ pub struct BinaryArray { impl BinaryArray { /// Returns the element at index `i` as a byte slice. - pub fn get_value(&self, i: usize) -> &[u8] { + pub fn value(&self, i: usize) -> &[u8] { assert!(i < self.data.len(), "BinaryArray out of bounds access"); let offset = i.checked_add(self.data.offset()).unwrap(); unsafe { @@ -486,7 +486,7 @@ impl BinaryArray { /// /// Note this doesn't do any bound checking, for performance reason. 
pub fn get_string(&self, i: usize) -> String { - let slice = self.get_value(i); + let slice = self.value(i); unsafe { String::from_utf8_unchecked(Vec::from(slice)) } } @@ -951,13 +951,13 @@ mod tests { let binary_array = BinaryArray::from(array_data); assert_eq!(3, binary_array.len()); assert_eq!(0, binary_array.null_count()); - assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.get_value(0)); + assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0)); assert_eq!("hello", binary_array.get_string(0)); - assert_eq!([] as [u8; 0], binary_array.get_value(1)); + assert_eq!([] as [u8; 0], binary_array.value(1)); assert_eq!("", binary_array.get_string(1)); assert_eq!( [b'p', b'a', b'r', b'q', b'u', b'e', b't'], - binary_array.get_value(2) + binary_array.value(2) ); assert_eq!("parquet", binary_array.get_string(2)); assert_eq!(5, binary_array.value_offset(2)); @@ -977,7 +977,7 @@ mod tests { let binary_array = BinaryArray::from(array_data); assert_eq!( [b'p', b'a', b'r', b'q', b'u', b'e', b't'], - binary_array.get_value(1) + binary_array.value(1) ); assert_eq!("parquet", binary_array.get_string(1)); assert_eq!(5, binary_array.value_offset(0)); @@ -1019,7 +1019,7 @@ mod tests { assert_eq!(binary_array1.len(), binary_array2.len()); assert_eq!(binary_array1.null_count(), binary_array2.null_count()); for i in 0..binary_array1.len() { - assert_eq!(binary_array1.get_value(i), binary_array2.get_value(i)); + assert_eq!(binary_array1.value(i), binary_array2.value(i)); assert_eq!(binary_array1.get_string(i), binary_array2.get_string(i)); assert_eq!(binary_array1.value_offset(i), binary_array2.value_offset(i)); assert_eq!(binary_array1.value_length(i), binary_array2.value_length(i)); @@ -1082,7 +1082,7 @@ mod tests { .add_buffer(Buffer::from(&values[..])) .build(); let binary_array = BinaryArray::from(array_data); - binary_array.get_value(4); + binary_array.value(4); } #[test] diff --git a/rust/arrow/src/builder.rs b/rust/arrow/src/builder.rs index b762c516331eb..a0bb43c7dee53 100644 --- a/rust/arrow/src/builder.rs +++ b/rust/arrow/src/builder.rs @@ -1133,11 +1133,11 @@ mod tests { assert_eq!(3, binary_array.len()); assert_eq!(0, binary_array.null_count()); - assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.get_value(0)); + assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0)); assert_eq!("hello", binary_array.get_string(0)); - assert_eq!([] as [u8; 0], binary_array.get_value(1)); + assert_eq!([] as [u8; 0], binary_array.value(1)); assert_eq!("", binary_array.get_string(1)); - assert_eq!([b'w', b'o', b'r', b'l', b'd'], binary_array.get_value(2)); + assert_eq!([b'w', b'o', b'r', b'l', b'd'], binary_array.value(2)); assert_eq!("world", binary_array.get_string(2)); assert_eq!(5, binary_array.value_offset(2)); assert_eq!(5, binary_array.value_length(2)); @@ -1175,11 +1175,11 @@ mod tests { assert_eq!(3, binary_array.len()); assert_eq!(0, binary_array.null_count()); - assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.get_value(0)); + assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0)); assert_eq!("hello", binary_array.get_string(0)); - assert_eq!([] as [u8; 0], binary_array.get_value(1)); + assert_eq!([] as [u8; 0], binary_array.value(1)); assert_eq!("", binary_array.get_string(1)); - assert_eq!([b'w', b'o', b'r', b'l', b'd'], binary_array.get_value(2)); + assert_eq!([b'w', b'o', b'r', b'l', b'd'], binary_array.value(2)); assert_eq!("world", binary_array.get_string(2)); assert_eq!(5, binary_array.value_offset(2)); assert_eq!(5, binary_array.value_length(2)); 
diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs index 10be0abe96081..57c7dde1b250d 100644 --- a/rust/arrow/src/csv/reader.rs +++ b/rust/arrow/src/csv/reader.rs @@ -231,7 +231,7 @@ mod tests { .downcast_ref::() .unwrap(); - let city_name: String = String::from_utf8(city.get_value(13).to_vec()).unwrap(); + let city_name: String = String::from_utf8(city.value(13).to_vec()).unwrap(); assert_eq!("Aberdeen, Aberdeen City, UK", city_name); } From 2057859744cb2ada93fc97838e09eb954963dc00 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 8 Jan 2019 11:03:17 +0100 Subject: [PATCH 169/328] ARROW-4188: [Rust] Move Rust README to top level rust directory Author: Andy Grove Closes #3342 from andygrove/ARROW-4188 and squashes the following commits: fedcd7bc split README between top level and arrow level b68f77cb Merge branch 'master' into ARROW-4188 e6dbd87f add badges back f2ee7e05 Move Rust README to top level rust directory --- rust/README.md | 50 ++++++++++++++++++++++++++++++++++++++++++++ rust/arrow/README.md | 22 ------------------- 2 files changed, 50 insertions(+), 22 deletions(-) create mode 100644 rust/README.md diff --git a/rust/README.md b/rust/README.md new file mode 100644 index 0000000000000..8fe7885de068c --- /dev/null +++ b/rust/README.md @@ -0,0 +1,50 @@ + + +# Native Rust implementation of Apache Arrow + +## The Rust implementation of Arrow consists of the following crates + +- Arrow [(README)](arrow/README.md) +- Parquet [(README)](parquet/README.md) + +## Run Tests + +Parquet support in Arrow requires data to test against, this data is in a +git submodule. To pull down this data run the following: + +```bash +git submodule update --init +``` + +The data can then be found in `cpp/submodules/parquet_testing/data`. +Create a new environment variable called `PARQUET_TEST_DATA` to point +to this location and then `cargo test` as usual. + +## Code Formatting + +Our CI uses `rustfmt` to check code formatting. Although the project is +built and tested against nightly rust we use the stable version of +`rustfmt`. So before submitting a PR be sure to run the following +and check for lint issues: + +```bash +cargo +stable fmt --all -- --check +``` + diff --git a/rust/arrow/README.md b/rust/arrow/README.md index cbfd4dd684a0f..9df2dd2e9e26f 100644 --- a/rust/arrow/README.md +++ b/rust/arrow/README.md @@ -57,28 +57,6 @@ cargo run --example dynamic_types cargo run --example read_csv ``` -## Run Tests - -Parquet support in Arrow requires data to test against, this data is in a -git submodule. To pull down this data run the following: - -```bash -git submodule update --init -``` - -The data can then be found in `cpp/submodules/parquet_testing/data`. -Create a new environment variable called `PARQUET_TEST_DATA` to point -to this location and then `cargo test` as usual. - -Our CI uses `rustfmt` to check code formatting. Although the project is -built and tested against nightly rust we use the stable version of -`rustfmt`. 
So before submitting a PR be sure to run the following -and check for lint issues: - -```bash -cargo +stable fmt --all -- --check -``` - # Publishing to crates.io An Arrow committer can publish this crate after an official project release has From 55848a36edb5ea5e0765068ef5f09d07d09d4898 Mon Sep 17 00:00:00 2001 From: Pindikura Ravindra Date: Tue, 8 Jan 2019 16:13:18 +0530 Subject: [PATCH 170/328] ARROW-4104: [Java] fix a race condition in AllocationManager (#3246) --- .../java/org/apache/arrow/memory/AllocationManager.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java index 687674f951b89..c10d246013290 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java @@ -230,7 +230,7 @@ public boolean transferBalance(final BufferLedger target) { // since two balance transfers out from the allocator manager could cause incorrect // accounting, we need to ensure // that this won't happen by synchronizing on the allocator manager instance. - synchronized (this) { + synchronized (AllocationManager.this) { if (owningLedger != this) { return true; } @@ -310,7 +310,7 @@ public int decrement(int decrement) { allocator.assertOpen(); final int outcome; - synchronized (this) { + synchronized (AllocationManager.this) { outcome = bufRefCnt.addAndGet(-decrement); if (outcome == 0) { lDestructionTime = System.nanoTime(); @@ -411,7 +411,7 @@ public int getSize() { * @return Amount of accounted(owned) memory associated with this ledger. */ public int getAccountedSize() { - synchronized (this) { + synchronized (AllocationManager.this) { if (owningLedger == this) { return size; } else { From 8704f8bd98f1edcf1f9ecc51d6fb3b4b5b4ecb88 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Tue, 8 Jan 2019 22:32:13 +0900 Subject: [PATCH 171/328] ARROW-4183: [Ruby] Add Arrow::Struct as an element of Arrow::StructArray Returning Arrow::Array by Arrow::StructArray#[] is deprecated. It'll return Arrow::Struct in the next release. It's for consistency. All Arrow::Array#[] implementations should return an element. 
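A brief usage sketch of the new interface, assembled from the added tests (the struct definition and values below are only illustrative):

    data_type = Arrow::StructDataType.new(visible: {type: :boolean},
                                          count: {type: :uint64})
    array = Arrow::StructArray.new(data_type, [[true, 1], [false, 2]])

    array.find_field(:count).to_a  # => [1, 2]        (column access; replaces #[])
    record = array.get_value(0)    # => Arrow::Struct (row access)
    record.visible                 # => true
    record.to_h                    # => {"visible" => true, "count" => 1}

For now, #[] still returns the field array but emits a deprecation warning pointing at #find_field.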
Author: Kouhei Sutou Closes #3338 from kou/ruby-struct and squashes the following commits: a0561954 Add Arrow::Struct as an element of Arrow::StructArray --- .../lib/arrow/struct-array-builder.rb | 9 ++- ruby/red-arrow/lib/arrow/struct-array.rb | 34 ++++++++ ruby/red-arrow/lib/arrow/struct.rb | 68 ++++++++++++++++ .../test/test-struct-array-builder.rb | 47 +++++++---- ruby/red-arrow/test/test-struct-array.rb | 58 +++++++++---- ruby/red-arrow/test/test-struct.rb | 81 +++++++++++++++++++ 6 files changed, 263 insertions(+), 34 deletions(-) create mode 100644 ruby/red-arrow/lib/arrow/struct.rb create mode 100644 ruby/red-arrow/test/test-struct.rb diff --git a/ruby/red-arrow/lib/arrow/struct-array-builder.rb b/ruby/red-arrow/lib/arrow/struct-array-builder.rb index 883ce84da7de7..52f75aab46d35 100644 --- a/ruby/red-arrow/lib/arrow/struct-array-builder.rb +++ b/ruby/red-arrow/lib/arrow/struct-array-builder.rb @@ -73,13 +73,20 @@ def append_value(*args) value.each_with_index do |sub_value, i| self[i].append_value(sub_value) end + when Arrow::Struct + append_value_raw + value.values.each_with_index do |sub_value, i| + self[i].append_value(sub_value) + end when Hash append_value_raw value.each do |name, sub_value| self[name].append_value(sub_value) end else - message = "struct value must be nil, Array or Hash: #{value.inspect}" + message = + "struct value must be nil, Array, " + + "Arrow::Struct or Hash: #{value.inspect}" raise ArgumentError, message end else diff --git a/ruby/red-arrow/lib/arrow/struct-array.rb b/ruby/red-arrow/lib/arrow/struct-array.rb index 4f9834c5d330f..e55a507868f1a 100644 --- a/ruby/red-arrow/lib/arrow/struct-array.rb +++ b/ruby/red-arrow/lib/arrow/struct-array.rb @@ -15,10 +15,44 @@ # specific language governing permissions and limitations # under the License. +require "arrow/struct" + module Arrow class StructArray def [](i) + warn("Use #{self.class}\#find_field instead. " + + "This will returns Arrow::Struct instead of Arrow::Array " + + "since 0.13.0.") get_field(i) end + + def get_value(i) + Struct.new(self, i) + end + + def find_field(index_or_name) + case index_or_name + when String, Symbol + name = index_or_name + (@name_to_field ||= build_name_to_field)[name.to_s] + else + index = index_or_name + cached_fields[index] + end + end + + private + def cached_fields + @fields ||= fields + end + + def build_name_to_field + name_to_field = {} + field_arrays = cached_fields + value_data_type.fields.each_with_index do |field, i| + name_to_field[field.name] = field_arrays[i] + end + name_to_field + end end end diff --git a/ruby/red-arrow/lib/arrow/struct.rb b/ruby/red-arrow/lib/arrow/struct.rb new file mode 100644 index 0000000000000..4ae12b871e49e --- /dev/null +++ b/ruby/red-arrow/lib/arrow/struct.rb @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +module Arrow + class Struct + attr_accessor :index + def initialize(array, index) + @array = array + @index = index + end + + def [](field_name_or_field_index) + field = @array.find_field(field_name_or_field_index) + return nil if field.nil? + field[@index] + end + + def fields + @array.value_data_type.fields + end + + def values + @array.fields.collect do |field| + field[@index] + end + end + + def to_a + values + end + + def to_h + attributes = {} + field_arrays = @array.fields + fields.each_with_index do |field, i| + attributes[field.name] = field_arrays[i][@index] + end + attributes + end + + def respond_to_missing?(name, include_private) + return true if @array.find_field(name) + super + end + + def method_missing(name, *args, &block) + if args.empty? + field = @array.find_field(name) + return field[@index] if field + end + super + end + end +end diff --git a/ruby/red-arrow/test/test-struct-array-builder.rb b/ruby/red-arrow/test/test-struct-array-builder.rb index 205564c816c30..42e1ded78e318 100644 --- a/ruby/red-arrow/test/test-struct-array-builder.rb +++ b/ruby/red-arrow/test/test-struct-array-builder.rb @@ -31,8 +31,8 @@ def setup [nil], ], [ - array[0].to_a, - array[1].to_a, + array.find_field(0).to_a, + array.find_field(1).to_a, ]) end @@ -44,8 +44,23 @@ def setup [1], ], [ - array[0].to_a, - array[1].to_a, + array.find_field(0).to_a, + array.find_field(1).to_a, + ]) + end + + test("Arrow::Struct") do + source_array = Arrow::StructArray.new(@data_type, [[true, 1]]) + struct = source_array.get_value(0) + @builder.append_value(struct) + array = @builder.finish + assert_equal([ + [true], + [1], + ], + [ + array.find_field(0).to_a, + array.find_field(1).to_a, ]) end @@ -57,8 +72,8 @@ def setup [1], ], [ - array[0].to_a, - array[1].to_a, + array.find_field(0).to_a, + array.find_field(1).to_a, ]) end end @@ -72,8 +87,8 @@ def setup [nil], ], [ - array[0].to_a, - array[1].to_a, + array.find_field(0).to_a, + array.find_field(1).to_a, ]) end @@ -85,8 +100,8 @@ def setup [1], ], [ - array[0].to_a, - array[1].to_a, + array.find_field(0).to_a, + array.find_field(1).to_a, ]) end @@ -98,8 +113,8 @@ def setup [1], ], [ - array[0].to_a, - array[1].to_a, + array.find_field(0).to_a, + array.find_field(1).to_a, ]) end @@ -115,8 +130,8 @@ def setup [nil, 1, 2], ], [ - array[0].to_a, - array[1].to_a, + array.find_field(0).to_a, + array.find_field(1).to_a, ]) end @@ -137,8 +152,8 @@ def setup [1, nil, 3], ], [ - array[0].to_a, - array[1].to_a, + array.find_field(0).to_a, + array.find_field(1).to_a, ]) end end diff --git a/ruby/red-arrow/test/test-struct-array.rb b/ruby/red-arrow/test/test-struct-array.rb index 986b0a9db1696..5a00434713a33 100644 --- a/ruby/red-arrow/test/test-struct-array.rb +++ b/ruby/red-arrow/test/test-struct-array.rb @@ -31,27 +31,51 @@ class StructArrayTest < Test::Unit::TestCase [1, nil, 2], ], [ - array[0].to_a, - array[1].to_a, + array.find_field(0).to_a, + array.find_field(1).to_a, ]) end end - test("#[]") do - type = Arrow::StructDataType.new([ - Arrow::Field.new("field1", :boolean), - Arrow::Field.new("field2", :uint64), - ]) - builder = Arrow::StructArrayBuilder.new(type) - builder.append - builder.get_field_builder(0).append(true) - builder.get_field_builder(1).append(1) - builder.append - builder.get_field_builder(0).append(false) - builder.get_field_builder(1).append(2) - array = builder.finish + sub_test_case("instance methods") do + def setup + @data_type = 
Arrow::StructDataType.new(visible: {type: :boolean}, + count: {type: :uint64}) + @values = [ + [true, 1], + [false, 2], + ] + @array = Arrow::StructArray.new(@data_type, @values) + end - assert_equal([[true, false], [1, 2]], - [array[0].to_a, array[1].to_a]) + test("#[]") do + notify("TODO: Returns Arrow::Struct instead.") + assert_equal([[true, false], [1, 2]], + [@array[0].to_a, @array[1].to_a]) + end + + sub_test_case("#find_field") do + test("Integer") do + assert_equal([ + [true, false], + [1, 2], + ], + [ + @array.find_field(0).to_a, + @array.find_field(1).to_a, + ]) + end + + test("String, Symbol") do + assert_equal([ + [true, false], + [1, 2], + ], + [ + @array.find_field("visible").to_a, + @array.find_field(:count).to_a, + ]) + end + end end end diff --git a/ruby/red-arrow/test/test-struct.rb b/ruby/red-arrow/test/test-struct.rb new file mode 100644 index 0000000000000..412549c7dfb34 --- /dev/null +++ b/ruby/red-arrow/test/test-struct.rb @@ -0,0 +1,81 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class StructTest < Test::Unit::TestCase + def setup + @data_type = Arrow::StructDataType.new(visible: {type: :boolean}, + count: {type: :uint64}) + @values = [ + [true, 1], + [false, 2], + ] + @array = Arrow::StructArray.new(@data_type, @values) + @struct = @array.get_value(0) + end + + sub_test_case("#[]") do + test("Integer") do + assert_equal(true, @struct[0]) + end + + test("String") do + assert_equal(true, @struct["visible"]) + end + + test("Symbol") do + assert_equal(true, @struct[:visible]) + end + end + + test("#fields") do + assert_equal(@data_type.fields, + @struct.fields) + end + + test("#values") do + assert_equal([true, 1], + @struct.values) + end + + test("#to_a") do + assert_equal([true, 1], + @struct.to_a) + end + + test("#to_h") do + assert_equal({ + "visible" => true, + "count" => 1, + }, + @struct.to_h) + end + + test("#respond_to_missing?") do + assert_equal([ + true, + false, + ], + [ + @struct.respond_to?(:visible), + @struct.respond_to?(:nonexistent), + ]) + end + + test("#method_missing?") do + assert_equal(1, @struct.count) + end +end From af07f75c1f692d1ed4cea93d358ff1acda6a1771 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Tue, 8 Jan 2019 06:45:13 -0700 Subject: [PATCH 172/328] ARROW-4060: [Rust] Add parquet arrow converter. This is the first step of adding an arrow reader and writer for parquet-rs. This commit contains a converter which converts parquet schema to arrow schema. Copied from this pr https://github.com/sunchao/parquet-rs/pull/185. 
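A minimal sketch of calling the converter from outside the crate, assuming the module layout added here (parquet_to_arrow_schema re-exported from reader/schema.rs as parquet::reader::parquet_to_arrow_schema); the message type string is just an example, not test data from the patch:

    use std::rc::Rc;

    use parquet::reader::parquet_to_arrow_schema;
    use parquet::schema::{parser::parse_message_type, types::SchemaDescriptor};

    fn main() {
        let message_type = "
            message example {
                REQUIRED BOOLEAN flag;
                OPTIONAL BINARY name (UTF8);
            }
        ";
        let parquet_group_type = parse_message_type(message_type).unwrap();
        let parquet_schema = Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type)));

        // REQUIRED/OPTIONAL map to the Arrow field's nullability; BOOLEAN and
        // BINARY (UTF8) map to DataType::Boolean and DataType::Utf8.
        let arrow_schema = parquet_to_arrow_schema(parquet_schema).unwrap();
        println!("{:?}", arrow_schema.fields());
    }

Physical/logical type combinations the converter does not yet handle surface as ParquetError::ArrowError rather than panicking.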
Author: Renjie Liu Closes #3279 from liurenjie1024/rust-arrow-schema-converter and squashes the following commits: 1bfa00f Resolve conflict 8806b16 Add parquet arrow converter --- rust/parquet/src/errors.rs | 6 + rust/parquet/src/lib.rs | 1 + rust/parquet/src/reader/mod.rs | 25 + rust/parquet/src/reader/schema.rs | 779 ++++++++++++++++++++++++++++++ rust/parquet/src/schema/types.rs | 14 +- 5 files changed, 824 insertions(+), 1 deletion(-) create mode 100644 rust/parquet/src/reader/mod.rs create mode 100644 rust/parquet/src/reader/schema.rs diff --git a/rust/parquet/src/errors.rs b/rust/parquet/src/errors.rs index a5532c1eb66dc..abfbda9dba9f2 100644 --- a/rust/parquet/src/errors.rs +++ b/rust/parquet/src/errors.rs @@ -50,6 +50,12 @@ quick_error! { display("EOF: {}", message) description(message) } + /// Arrow error. + /// Returned when reading into arrow or writing from arrow. + ArrowError(message: String) { + display("Arrow: {}", message) + description(message) + } } } diff --git a/rust/parquet/src/lib.rs b/rust/parquet/src/lib.rs index 75c56f5054f19..cad85ecde317c 100644 --- a/rust/parquet/src/lib.rs +++ b/rust/parquet/src/lib.rs @@ -37,5 +37,6 @@ pub mod column; pub mod compression; mod encodings; pub mod file; +pub mod reader; pub mod record; pub mod schema; diff --git a/rust/parquet/src/reader/mod.rs b/rust/parquet/src/reader/mod.rs new file mode 100644 index 0000000000000..fe580c5e92b37 --- /dev/null +++ b/rust/parquet/src/reader/mod.rs @@ -0,0 +1,25 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [Apache Arrow](http://arrow.apache.org/) is a cross-language development platform for +//! in-memory data. +//! +//! This mod provides API for converting between arrow and parquet. + +pub mod schema; + +pub use self::schema::{parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns}; diff --git a/rust/parquet/src/reader/schema.rs b/rust/parquet/src/reader/schema.rs new file mode 100644 index 0000000000000..68fd867a821cd --- /dev/null +++ b/rust/parquet/src/reader/schema.rs @@ -0,0 +1,779 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Provides API for converting parquet schema to arrow schema and vice versa. +//! +//! The main interfaces for converting parquet schema to arrow schema are +//! `parquet_to_arrow_schema` and `parquet_to_arrow_schema_by_columns`. +//! +//! The interfaces for converting arrow schema to parquet schema is coming. + +use std::{collections::HashSet, rc::Rc}; + +use crate::basic::{LogicalType, Repetition, Type as PhysicalType}; +use crate::errors::{ParquetError::ArrowError, Result}; +use crate::schema::types::{SchemaDescPtr, Type, TypePtr}; + +use arrow::datatypes::{DataType, Field, Schema}; + +/// Convert parquet schema to arrow schema. +pub fn parquet_to_arrow_schema(parquet_schema: SchemaDescPtr) -> Result { + parquet_to_arrow_schema_by_columns(parquet_schema.clone(), 0..parquet_schema.columns().len()) +} + +/// Convert parquet schema to arrow schema, only preserving some leaf columns. +pub fn parquet_to_arrow_schema_by_columns( + parquet_schema: SchemaDescPtr, + column_indices: T, +) -> Result +where + T: IntoIterator, +{ + let mut base_nodes = Vec::new(); + let mut base_nodes_set = HashSet::new(); + let mut leaves = HashSet::new(); + + for c in column_indices { + let column = parquet_schema.column(c).self_type() as *const Type; + let root = parquet_schema.get_column_root_ptr(c); + let root_raw_ptr = root.clone().as_ref() as *const Type; + + leaves.insert(column); + if !base_nodes_set.contains(&root_raw_ptr) { + base_nodes.push(root); + base_nodes_set.insert(root_raw_ptr); + } + } + + let leaves = Rc::new(leaves); + base_nodes + .into_iter() + .map(|t| ParquetTypeConverter::new(t, leaves.clone()).to_field()) + .collect::>>>() + .map(|result| result.into_iter().filter_map(|f| f).collect::>()) + .map(|fields| Schema::new(fields)) +} + +/// This struct is used to group methods and data structures used to convert parquet +/// schema together. +struct ParquetTypeConverter { + schema: TypePtr, + /// This is the columns that need to be converted to arrow schema. + columns_to_convert: Rc>, +} + +impl ParquetTypeConverter { + fn new(schema: TypePtr, columns_to_convert: Rc>) -> Self { + Self { + schema, + columns_to_convert, + } + } + + fn clone_with_schema(&self, other: TypePtr) -> Self { + Self { + schema: other, + columns_to_convert: self.columns_to_convert.clone(), + } + } +} + +impl ParquetTypeConverter { + // Public interfaces. + + /// Converts parquet schema to arrow data type. + /// + /// This function discards schema name. + /// + /// If this schema is a primitive type and not included in the leaves, the result is + /// Ok(None). + /// + /// If this schema is a group type and none of its children is reserved in the + /// conversion, the result is Ok(None). + fn to_data_type(&self) -> Result> { + match self.schema.as_ref() { + Type::PrimitiveType { .. } => self.to_primitive_type(), + Type::GroupType { .. } => self.to_group_type(), + } + } + + /// Converts parquet schema to arrow field. + /// + /// This method is roughly the same as + /// [`to_data_type`](`ParquetTypeConverter::to_data_type`), except it reserves schema + /// name. + fn to_field(&self) -> Result> { + self.to_data_type() + .map(|opt| opt.map(|dt| Field::new(self.schema.name(), dt, self.is_nullable()))) + } + + // Utility functions. + + /// Checks whether this schema is nullable. 
+ fn is_nullable(&self) -> bool { + let basic_info = self.schema.get_basic_info(); + if basic_info.has_repetition() { + match basic_info.repetition() { + Repetition::OPTIONAL => true, + Repetition::REPEATED => true, + Repetition::REQUIRED => false, + } + } else { + false + } + } + + fn is_repeated(&self) -> bool { + let basic_info = self.schema.get_basic_info(); + + basic_info.has_repetition() && basic_info.repetition() == Repetition::REPEATED + } + + fn is_self_included(&self) -> bool { + self.columns_to_convert + .contains(&(self.schema.as_ref() as *const Type)) + } + + // Functions for primitive types. + + /// Entry point for converting parquet primitive type to arrow type. + /// + /// This function takes care of repetition. + fn to_primitive_type(&self) -> Result> { + if self.is_self_included() { + self.to_primitive_type_inner().map(|dt| { + if self.is_repeated() { + Some(DataType::List(Box::new(dt))) + } else { + Some(dt) + } + }) + } else { + Ok(None) + } + } + + /// Converting parquet primitive type to arrow data type. + fn to_primitive_type_inner(&self) -> Result { + match self.schema.get_physical_type() { + PhysicalType::BOOLEAN => Ok(DataType::Boolean), + PhysicalType::INT32 => self.to_int32(), + PhysicalType::INT64 => self.to_int64(), + PhysicalType::FLOAT => Ok(DataType::Float32), + PhysicalType::DOUBLE => Ok(DataType::Float64), + PhysicalType::BYTE_ARRAY => self.to_byte_array(), + other => Err(ArrowError(format!( + "Unable to convert parquet type {}", + other + ))), + } + } + + fn to_int32(&self) -> Result { + match self.schema.get_basic_info().logical_type() { + LogicalType::NONE => Ok(DataType::Int32), + LogicalType::UINT_8 => Ok(DataType::UInt8), + LogicalType::UINT_16 => Ok(DataType::UInt16), + LogicalType::UINT_32 => Ok(DataType::UInt32), + LogicalType::INT_8 => Ok(DataType::Int8), + LogicalType::INT_16 => Ok(DataType::Int16), + LogicalType::INT_32 => Ok(DataType::Int32), + other => Err(ArrowError(format!( + "Unable to convert parquet logical type {}", + other + ))), + } + } + + fn to_int64(&self) -> Result { + match self.schema.get_basic_info().logical_type() { + LogicalType::NONE => Ok(DataType::Int64), + LogicalType::INT_64 => Ok(DataType::Int64), + LogicalType::UINT_64 => Ok(DataType::UInt64), + other => Err(ArrowError(format!( + "Unable to convert parquet logical type {}", + other + ))), + } + } + + fn to_byte_array(&self) -> Result { + match self.schema.get_basic_info().logical_type() { + LogicalType::UTF8 => Ok(DataType::Utf8), + other => Err(ArrowError(format!( + "Unable to convert parquet logical type {}", + other + ))), + } + } + + // Functions for group types. + + /// Entry point for converting parquet group type. + /// + /// This function takes care of logical type and repetition. + fn to_group_type(&self) -> Result> { + if self.is_repeated() { + self.to_struct() + .map(|opt| opt.map(|dt| DataType::List(Box::new(dt)))) + } else { + match self.schema.get_basic_info().logical_type() { + LogicalType::LIST => self.to_list(), + _ => self.to_struct(), + } + } + } + + /// Converts a parquet group type to arrow struct. + fn to_struct(&self) -> Result> { + match self.schema.as_ref() { + Type::PrimitiveType { .. 
} => panic!( + "{:?} is a struct type, and can't be processed as primitive.", + self.schema + ), + Type::GroupType { + basic_info: _, + fields, + } => fields + .iter() + .map(|field_ptr| self.clone_with_schema(field_ptr.clone()).to_field()) + .collect::>>>() + .map(|result| result.into_iter().filter_map(|f| f).collect::>()) + .map(|fields| { + if fields.is_empty() { + None + } else { + Some(DataType::Struct(fields)) + } + }), + } + } + + /// Converts a parquet list to arrow list. + /// + /// To fully understand this algorithm, please refer to + /// [parquet doc](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md). + fn to_list(&self) -> Result> { + match self.schema.as_ref() { + Type::PrimitiveType { .. } => panic!( + "{:?} is a list type and can't be processed as primitive.", + self.schema + ), + Type::GroupType { + basic_info: _, + fields, + } if fields.len() == 1 => { + let list_item = fields.first().unwrap(); + let item_converter = self.clone_with_schema(list_item.clone()); + + let item_type = match list_item.as_ref() { + Type::PrimitiveType { .. } => { + if item_converter.is_repeated() { + item_converter.to_primitive_type_inner().map(|dt| Some(dt)) + } else { + Err(ArrowError( + "Primitive element type of list must be repeated.".to_string(), + )) + } + } + Type::GroupType { + basic_info: _, + fields, + } => { + if fields.len() > 1 { + item_converter.to_struct() + } else if fields.len() == 1 + && list_item.name() != "array" + && list_item.name() != format!("{}_tuple", self.schema.name()) + { + let nested_item = fields.first().unwrap(); + let nested_item_converter = self.clone_with_schema(nested_item.clone()); + + nested_item_converter.to_data_type() + } else { + item_converter.to_struct() + } + } + }; + + item_type.map(|opt| opt.map(|dt| DataType::List(Box::new(dt)))) + } + _ => Err(ArrowError( + "Group element type of list can only contain one field.".to_string(), + )), + } + } +} + +#[cfg(test)] +mod tests { + use std::rc::Rc; + + use crate::schema::{parser::parse_message_type, types::SchemaDescriptor}; + + use arrow::datatypes::{DataType, Field}; + + use super::{parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns}; + + #[test] + fn test_flat_primitives() { + let message_type = " + message test_schema { + REQUIRED BOOLEAN boolean; + REQUIRED INT32 int8 (INT_8); + REQUIRED INT32 int16 (INT_16); + REQUIRED INT32 int32; + REQUIRED INT64 int64 ; + OPTIONAL DOUBLE double; + OPTIONAL FLOAT float; + OPTIONAL BINARY string (UTF8); + } + "; + let parquet_group_type = parse_message_type(message_type).unwrap(); + + let parquet_schema = SchemaDescriptor::new(Rc::new(parquet_group_type)); + let converted_arrow_schema = parquet_to_arrow_schema(Rc::new(parquet_schema)).unwrap(); + + let arrow_fields = vec![ + Field::new("boolean", DataType::Boolean, false), + Field::new("int8", DataType::Int8, false), + Field::new("int16", DataType::Int16, false), + Field::new("int32", DataType::Int32, false), + Field::new("int64", DataType::Int64, false), + Field::new("double", DataType::Float64, true), + Field::new("float", DataType::Float32, true), + Field::new("string", DataType::Utf8, true), + ]; + + assert_eq!(&arrow_fields, converted_arrow_schema.fields()); + } + + #[test] + fn test_duplicate_fields() { + let message_type = " + message test_schema { + REQUIRED BOOLEAN boolean; + REQUIRED INT32 int8 (INT_8); + } + "; + + let parquet_group_type = parse_message_type(message_type).unwrap(); + + let parquet_schema = Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type))); + let 
converted_arrow_schema = parquet_to_arrow_schema(parquet_schema.clone()).unwrap(); + + let arrow_fields = vec![ + Field::new("boolean", DataType::Boolean, false), + Field::new("int8", DataType::Int8, false), + ]; + assert_eq!(&arrow_fields, converted_arrow_schema.fields()); + + let converted_arrow_schema = + parquet_to_arrow_schema_by_columns(parquet_schema.clone(), vec![0usize, 1usize]) + .unwrap(); + assert_eq!(&arrow_fields, converted_arrow_schema.fields()); + } + + #[test] + fn test_parquet_lists() { + let mut arrow_fields = Vec::new(); + + // LIST encoding example taken from parquet-format/LogicalTypes.md + let message_type = " + message test_schema { + REQUIRED GROUP my_list (LIST) { + REPEATED GROUP list { + OPTIONAL BINARY element (UTF8); + } + } + OPTIONAL GROUP my_list (LIST) { + REPEATED GROUP list { + REQUIRED BINARY element (UTF8); + } + } + OPTIONAL GROUP array_of_arrays (LIST) { + REPEATED GROUP list { + REQUIRED GROUP element (LIST) { + REPEATED GROUP list { + REQUIRED INT32 element; + } + } + } + } + OPTIONAL GROUP my_list (LIST) { + REPEATED GROUP element { + REQUIRED BINARY str (UTF8); + } + } + OPTIONAL GROUP my_list (LIST) { + REPEATED INT32 element; + } + OPTIONAL GROUP my_list (LIST) { + REPEATED GROUP element { + REQUIRED BINARY str (UTF8); + REQUIRED INT32 num; + } + } + OPTIONAL GROUP my_list (LIST) { + REPEATED GROUP array { + REQUIRED BINARY str (UTF8); + } + + } + OPTIONAL GROUP my_list (LIST) { + REPEATED GROUP my_list_tuple { + REQUIRED BINARY str (UTF8); + } + } + REPEATED INT32 name; + } + "; + + // // List (list non-null, elements nullable) + // required group my_list (LIST) { + // repeated group list { + // optional binary element (UTF8); + // } + // } + { + arrow_fields.push(Field::new( + "my_list", + DataType::List(Box::new(DataType::Utf8)), + false, + )); + } + + // // List (list nullable, elements non-null) + // optional group my_list (LIST) { + // repeated group list { + // required binary element (UTF8); + // } + // } + { + arrow_fields.push(Field::new( + "my_list", + DataType::List(Box::new(DataType::Utf8)), + true, + )); + } + + // Element types can be nested structures. 
For example, a list of lists: + // + // // List> + // optional group array_of_arrays (LIST) { + // repeated group list { + // required group element (LIST) { + // repeated group list { + // required int32 element; + // } + // } + // } + // } + { + let arrow_inner_list = DataType::List(Box::new(DataType::Int32)); + arrow_fields.push(Field::new( + "array_of_arrays", + DataType::List(Box::new(arrow_inner_list)), + true, + )); + } + + // // List (list nullable, elements non-null) + // optional group my_list (LIST) { + // repeated group element { + // required binary str (UTF8); + // }; + // } + { + arrow_fields.push(Field::new( + "my_list", + DataType::List(Box::new(DataType::Utf8)), + true, + )); + } + + // // List (nullable list, non-null elements) + // optional group my_list (LIST) { + // repeated int32 element; + // } + { + arrow_fields.push(Field::new( + "my_list", + DataType::List(Box::new(DataType::Int32)), + true, + )); + } + + // // List> (nullable list, non-null elements) + // optional group my_list (LIST) { + // repeated group element { + // required binary str (UTF8); + // required int32 num; + // }; + // } + { + let arrow_struct = DataType::Struct(vec![ + Field::new("str", DataType::Utf8, false), + Field::new("num", DataType::Int32, false), + ]); + arrow_fields.push(Field::new( + "my_list", + DataType::List(Box::new(arrow_struct)), + true, + )); + } + + // // List> (nullable list, non-null elements) + // optional group my_list (LIST) { + // repeated group array { + // required binary str (UTF8); + // }; + // } + // Special case: group is named array + { + let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); + arrow_fields.push(Field::new( + "my_list", + DataType::List(Box::new(arrow_struct)), + true, + )); + } + + // // List> (nullable list, non-null elements) + // optional group my_list (LIST) { + // repeated group my_list_tuple { + // required binary str (UTF8); + // }; + // } + // Special case: group named ends in _tuple + { + let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); + arrow_fields.push(Field::new( + "my_list", + DataType::List(Box::new(arrow_struct)), + true, + )); + } + + // One-level encoding: Only allows required lists with required cells + // repeated value_type name + { + arrow_fields.push(Field::new( + "name", + DataType::List(Box::new(DataType::Int32)), + true, + )); + } + + let parquet_group_type = parse_message_type(message_type).unwrap(); + + let parquet_schema = Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type))); + let converted_arrow_schema = parquet_to_arrow_schema(parquet_schema.clone()).unwrap(); + let converted_fields = converted_arrow_schema.fields(); + + assert_eq!(arrow_fields.len(), converted_fields.len()); + for i in 0..arrow_fields.len() { + assert_eq!(arrow_fields[i], converted_fields[i]); + } + } + + #[test] + fn test_nested_schema() { + let mut arrow_fields = Vec::new(); + { + let group1_fields = vec![ + Field::new("leaf1", DataType::Boolean, false), + Field::new("leaf2", DataType::Int32, false), + ]; + let group1_struct = Field::new("group1", DataType::Struct(group1_fields), false); + arrow_fields.push(group1_struct); + + let leaf3_field = Field::new("leaf3", DataType::Int64, false); + arrow_fields.push(leaf3_field); + } + + let message_type = " + message test_schema { + REQUIRED GROUP group1 { + REQUIRED BOOLEAN leaf1; + REQUIRED INT32 leaf2; + } + REQUIRED INT64 leaf3; + } + "; + let parquet_group_type = parse_message_type(message_type).unwrap(); + + let 
parquet_schema = Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type))); + let converted_arrow_schema = parquet_to_arrow_schema(parquet_schema.clone()).unwrap(); + let converted_fields = converted_arrow_schema.fields(); + + assert_eq!(arrow_fields.len(), converted_fields.len()); + for i in 0..arrow_fields.len() { + assert_eq!(arrow_fields[i], converted_fields[i]); + } + } + + #[test] + fn test_nested_schema_partial() { + let mut arrow_fields = Vec::new(); + { + let group1_fields = vec![Field::new("leaf1", DataType::Int64, false)]; + let group1 = Field::new("group1", DataType::Struct(group1_fields), false); + arrow_fields.push(group1); + + let group2_fields = vec![Field::new("leaf4", DataType::Int64, false)]; + let group2 = Field::new("group2", DataType::Struct(group2_fields), false); + arrow_fields.push(group2); + + arrow_fields.push(Field::new("leaf5", DataType::Int64, false)); + } + + let message_type = " + message test_schema { + REQUIRED GROUP group1 { + REQUIRED INT64 leaf1; + REQUIRED INT64 leaf2; + } + REQUIRED GROUP group2 { + REQUIRED INT64 leaf3; + REQUIRED INT64 leaf4; + } + REQUIRED INT64 leaf5; + } + "; + let parquet_group_type = parse_message_type(message_type).unwrap(); + + // Expected partial arrow schema (columns 0, 3, 4): + // required group group1 { + // required int64 leaf1; + // } + // required group group2 { + // required int64 leaf4; + // } + // required int64 leaf5; + + let parquet_schema = Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type))); + let converted_arrow_schema = + parquet_to_arrow_schema_by_columns(parquet_schema.clone(), vec![0, 3, 4]).unwrap(); + let converted_fields = converted_arrow_schema.fields(); + + assert_eq!(arrow_fields.len(), converted_fields.len()); + for i in 0..arrow_fields.len() { + assert_eq!(arrow_fields[i], converted_fields[i]); + } + } + + #[test] + fn test_nested_schema_partial_ordering() { + let mut arrow_fields = Vec::new(); + { + let group2_fields = vec![Field::new("leaf4", DataType::Int64, false)]; + let group2 = Field::new("group2", DataType::Struct(group2_fields), false); + arrow_fields.push(group2); + + arrow_fields.push(Field::new("leaf5", DataType::Int64, false)); + + let group1_fields = vec![Field::new("leaf1", DataType::Int64, false)]; + let group1 = Field::new("group1", DataType::Struct(group1_fields), false); + arrow_fields.push(group1); + } + + let message_type = " + message test_schema { + REQUIRED GROUP group1 { + REQUIRED INT64 leaf1; + REQUIRED INT64 leaf2; + } + REQUIRED GROUP group2 { + REQUIRED INT64 leaf3; + REQUIRED INT64 leaf4; + } + REQUIRED INT64 leaf5; + } + "; + let parquet_group_type = parse_message_type(message_type).unwrap(); + + // Expected partial arrow schema (columns 3, 4, 0): + // required group group1 { + // required int64 leaf1; + // } + // required group group2 { + // required int64 leaf4; + // } + // required int64 leaf5; + + let parquet_schema = Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type))); + let converted_arrow_schema = + parquet_to_arrow_schema_by_columns(parquet_schema.clone(), vec![3, 4, 0]).unwrap(); + let converted_fields = converted_arrow_schema.fields(); + + assert_eq!(arrow_fields.len(), converted_fields.len()); + for i in 0..arrow_fields.len() { + assert_eq!(arrow_fields[i], converted_fields[i]); + } + } + + #[test] + fn test_repeated_nested_schema() { + let mut arrow_fields = Vec::new(); + { + arrow_fields.push(Field::new("leaf1", DataType::Int32, true)); + + let inner_group_list = Field::new( + "innerGroup", + 
DataType::List(Box::new(DataType::Struct(vec![Field::new( + "leaf3", + DataType::Int32, + true, + )]))), + true, + ); + + let outer_group_list = Field::new( + "outerGroup", + DataType::List(Box::new(DataType::Struct(vec![ + Field::new("leaf2", DataType::Int32, true), + inner_group_list, + ]))), + true, + ); + arrow_fields.push(outer_group_list); + } + + let message_type = " + message test_schema { + OPTIONAL INT32 leaf1; + REPEATED GROUP outerGroup { + OPTIONAL INT32 leaf2; + REPEATED GROUP innerGroup { + OPTIONAL INT32 leaf3; + } + } + } + "; + let parquet_group_type = parse_message_type(message_type).unwrap(); + + let parquet_schema = Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type))); + let converted_arrow_schema = parquet_to_arrow_schema(parquet_schema.clone()).unwrap(); + let converted_fields = converted_arrow_schema.fields(); + + assert_eq!(arrow_fields.len(), converted_fields.len()); + for i in 0..arrow_fields.len() { + assert_eq!(arrow_fields[i], converted_fields[i]); + } + } +} diff --git a/rust/parquet/src/schema/types.rs b/rust/parquet/src/schema/types.rs index 30ee9f60e1a3e..aa314d6100183 100644 --- a/rust/parquet/src/schema/types.rs +++ b/rust/parquet/src/schema/types.rs @@ -741,19 +741,31 @@ impl SchemaDescriptor { /// Returns column root [`Type`](`::schema::types::Type`) for a field position. pub fn get_column_root(&self, i: usize) -> &Type { + let result = self.column_root_of(i); + result.as_ref() + } + + /// Returns column root [`Type`](`::schema::types::Type`) pointer for a field position. + pub fn get_column_root_ptr(&self, i: usize) -> TypePtr { + let result = self.column_root_of(i); + result.clone() + } + + fn column_root_of(&self, i: usize) -> &Rc { assert!( i < self.leaves.len(), "Index out of bound: {} not in [0, {})", i, self.leaves.len() ); + let result = self.leaf_to_base.get(&i); assert!( result.is_some(), "Expected a value for index {} but found None", i ); - result.unwrap().as_ref() + result.unwrap() } /// Returns schema as [`Type`](`::schema::types::Type`). 
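For reference, a minimal usage sketch of the schema-conversion API introduced in the patch above. It mirrors the unit tests in schema.rs; the external import paths (`parquet::arrow::schema`, `parquet::schema::parser`, `parquet::schema::types`) and the example message type are assumptions for illustration, not part of the patch.

use std::rc::Rc;

// Assumed external paths; the patch itself only shows crate-internal `use super::...` imports.
use parquet::arrow::schema::{parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns};
use parquet::schema::parser::parse_message_type;
use parquet::schema::types::SchemaDescriptor;

fn main() {
    // A small, hypothetical Parquet message type with two leaf columns.
    let message_type = "
        message example {
            REQUIRED INT32 id;
            OPTIONAL BINARY name (UTF8);
        }
    ";
    let parquet_group_type = parse_message_type(message_type).unwrap();
    let parquet_schema = Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type)));

    // Convert every leaf column into an Arrow schema ...
    let full_schema = parquet_to_arrow_schema(parquet_schema.clone()).unwrap();
    assert_eq!(full_schema.fields().len(), 2);

    // ... or only a subset of the leaf columns (here column 1, i.e. "name").
    let partial_schema =
        parquet_to_arrow_schema_by_columns(parquet_schema.clone(), vec![1usize]).unwrap();
    assert_eq!(partial_schema.fields().len(), 1);
}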
From 4f2f53336f2293eea33235e86e41aa9f08e98a1a Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 8 Jan 2019 15:02:14 +0100 Subject: [PATCH 173/328] ARROW-4178: [C++] Fix TSan and UBSan errors Author: Antoine Pitrou Closes #3334 from pitrou/ARROW-4178-tsan-ubsan-fixes and squashes the following commits: b836f733 ARROW-4178: Fix TSan and UBSan errors --- cpp/build-support/run-test.sh | 4 ++ cpp/build-support/tsan-suppressions.txt | 19 +++++++ cpp/build-support/ubsan-suppressions.txt | 16 ++++++ cpp/src/arrow/compare.cc | 10 +++- cpp/src/arrow/compute/kernels/cast.cc | 1 + cpp/src/arrow/csv/column-builder.cc | 4 +- cpp/src/arrow/io/file-test.cc | 4 +- cpp/src/arrow/io/readahead-test.cc | 56 +++++++++++++++++-- cpp/src/arrow/util/bit-stream-utils.h | 3 +- cpp/src/arrow/util/bit-util-test.cc | 2 + cpp/src/arrow/util/decimal-test.cc | 4 +- cpp/src/arrow/util/decimal.cc | 16 ++++-- cpp/src/arrow/util/int-util.h | 16 ++++++ cpp/src/arrow/util/macros.h | 9 +++ cpp/src/arrow/util/parsing.h | 5 +- cpp/src/arrow/util/thread-pool-test.cc | 3 +- cpp/src/parquet/arrow/reader.cc | 70 +++++++++++++++--------- cpp/src/parquet/bloom_filter.h | 4 +- cpp/src/parquet/column_reader-test.cc | 4 +- cpp/src/parquet/encoding-internal.h | 7 ++- cpp/src/parquet/types.h | 3 +- cpp/src/parquet/util/memory.cc | 7 ++- cpp/src/parquet/util/memory.h | 1 + 23 files changed, 213 insertions(+), 55 deletions(-) create mode 100644 cpp/build-support/tsan-suppressions.txt create mode 100644 cpp/build-support/ubsan-suppressions.txt diff --git a/cpp/build-support/run-test.sh b/cpp/build-support/run-test.sh index 656ab7bd3b805..6b1c09efb4d8d 100755 --- a/cpp/build-support/run-test.sh +++ b/cpp/build-support/run-test.sh @@ -80,6 +80,10 @@ function setup_sanitizers() { TSAN_OPTIONS="$TSAN_OPTIONS history_size=7" export TSAN_OPTIONS + UBSAN_OPTIONS="$UBSAN_OPTIONS print_stacktrace=1" + UBSAN_OPTIONS="$UBSAN_OPTIONS suppressions=$ROOT/build-support/ubsan-suppressions.txt" + export UBSAN_OPTIONS + # Enable leak detection even under LLVM 3.4, where it was disabled by default. # This flag only takes effect when running an ASAN build. # ASAN_OPTIONS="$ASAN_OPTIONS detect_leaks=1" diff --git a/cpp/build-support/tsan-suppressions.txt b/cpp/build-support/tsan-suppressions.txt new file mode 100644 index 0000000000000..ce897c8591188 --- /dev/null +++ b/cpp/build-support/tsan-suppressions.txt @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
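+# Each suppression is one entry per line of the form '<report_type>:<name_pattern>';
+# the entry below silences TSan's thread-leak report from the CUDA driver library.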
+ +# Thread leak in CUDA +thread:libcuda.so diff --git a/cpp/build-support/ubsan-suppressions.txt b/cpp/build-support/ubsan-suppressions.txt new file mode 100644 index 0000000000000..13a83393a9124 --- /dev/null +++ b/cpp/build-support/ubsan-suppressions.txt @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 2f4f5d16364f1..efc8ad82faf93 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -324,7 +324,15 @@ static bool IsEqualPrimitive(const PrimitiveArray& left, const PrimitiveArray& r right_data = right.values()->data() + right.offset() * byte_width; } - if (left.null_count() > 0) { + if (byte_width == 0) { + // Special case 0-width data, as the data pointers may be null + for (int64_t i = 0; i < left.length(); ++i) { + if (left.IsNull(i) != right.IsNull(i)) { + return false; + } + } + return true; + } else if (left.null_count() > 0) { for (int64_t i = 0; i < left.length(); ++i) { const bool left_null = left.IsNull(i); const bool right_null = right.IsNull(i); diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 15746d4c9965e..092aebc8c3d2e 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -404,6 +404,7 @@ struct is_float_truncate< template struct CastFunctor::value>::type> { + ARROW_DISABLE_UBSAN("float-cast-overflow") void operator()(FunctionContext* ctx, const CastOptions& options, const ArrayData& input, ArrayData* output) { using in_type = typename I::c_type; diff --git a/cpp/src/arrow/csv/column-builder.cc b/cpp/src/arrow/csv/column-builder.cc index 28cbad47580e8..1f37046798fd7 100644 --- a/cpp/src/arrow/csv/column-builder.cc +++ b/cpp/src/arrow/csv/column-builder.cc @@ -305,12 +305,12 @@ Status InferringColumnBuilder::TryConvertChunk(size_t chunk_index) { void InferringColumnBuilder::Insert(int64_t block_index, const std::shared_ptr& parser) { - DCHECK_NE(converter_, nullptr); - // Create a slot for the new chunk and spawn a task to convert it size_t chunk_index = static_cast(block_index); { std::lock_guard lock(mutex_); + + DCHECK_NE(converter_, nullptr); if (chunks_.size() <= chunk_index) { chunks_.resize(chunk_index + 1); } diff --git a/cpp/src/arrow/io/file-test.cc b/cpp/src/arrow/io/file-test.cc index 6d780c0940eba..f329ae9d504e5 100644 --- a/cpp/src/arrow/io/file-test.cc +++ b/cpp/src/arrow/io/file-test.cc @@ -468,10 +468,10 @@ class MyMemoryPool : public MemoryPool { int64_t bytes_allocated() const override { return -1; } - int64_t num_allocations() const { return num_allocations_; } + int64_t num_allocations() const { return num_allocations_.load(); } private: - int64_t num_allocations_; + std::atomic num_allocations_; }; TEST_F(TestReadableFile, 
CustomMemoryPool) { diff --git a/cpp/src/arrow/io/readahead-test.cc b/cpp/src/arrow/io/readahead-test.cc index b7f404f666983..6575e898590d8 100644 --- a/cpp/src/arrow/io/readahead-test.cc +++ b/cpp/src/arrow/io/readahead-test.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +46,51 @@ using internal::checked_cast; namespace io { namespace internal { +class LockedInputStream : public InputStream { + public: + explicit LockedInputStream(const std::shared_ptr& stream) + : stream_(stream) {} + + Status Close() override { + std::lock_guard lock(mutex_); + return stream_->Close(); + } + + bool closed() const override { + std::lock_guard lock(mutex_); + return stream_->closed(); + } + + Status Tell(int64_t* position) const override { + std::lock_guard lock(mutex_); + return stream_->Tell(position); + } + + Status Read(int64_t nbytes, int64_t* bytes_read, void* buffer) override { + std::lock_guard lock(mutex_); + return stream_->Read(nbytes, bytes_read, buffer); + } + + Status Read(int64_t nbytes, std::shared_ptr* out) override { + std::lock_guard lock(mutex_); + return stream_->Read(nbytes, out); + } + + bool supports_zero_copy() const override { + std::lock_guard lock(mutex_); + return stream_->supports_zero_copy(); + } + + util::string_view Peek(int64_t nbytes) const override { + std::lock_guard lock(mutex_); + return stream_->Peek(nbytes); + } + + protected: + std::shared_ptr stream_; + mutable std::mutex mutex_; +}; + static void sleep_for(double seconds) { std::this_thread::sleep_for( std::chrono::nanoseconds(static_cast(seconds * 1e9))); @@ -57,13 +103,13 @@ static void busy_wait(double seconds, std::function predicate) { } } -std::shared_ptr DataReader(const std::string& data) { +std::shared_ptr DataReader(const std::string& data) { std::shared_ptr buffer; ABORT_NOT_OK(Buffer::FromString(data, &buffer)); - return std::make_shared(buffer); + return std::make_shared(std::make_shared(buffer)); } -static int64_t WaitForPosition(const RandomAccessFile& file, int64_t expected, +static int64_t WaitForPosition(const FileInterface& file, int64_t expected, double seconds = 0.2) { int64_t pos = -1; busy_wait(seconds, [&]() -> bool { @@ -73,12 +119,12 @@ static int64_t WaitForPosition(const RandomAccessFile& file, int64_t expected, return pos; } -static void AssertEventualPosition(const RandomAccessFile& file, int64_t expected) { +static void AssertEventualPosition(const FileInterface& file, int64_t expected) { int64_t pos = WaitForPosition(file, expected); ASSERT_EQ(pos, expected) << "File didn't reach expected position"; } -static void AssertPosition(const RandomAccessFile& file, int64_t expected) { +static void AssertPosition(const FileInterface& file, int64_t expected) { int64_t pos = -1; ABORT_NOT_OK(file.Tell(&pos)); ASSERT_EQ(pos, expected) << "File didn't reach expected position"; diff --git a/cpp/src/arrow/util/bit-stream-utils.h b/cpp/src/arrow/util/bit-stream-utils.h index ae62a7ff1e2b3..ad86ee87c9fda 100644 --- a/cpp/src/arrow/util/bit-stream-utils.h +++ b/cpp/src/arrow/util/bit-stream-utils.h @@ -397,7 +397,8 @@ inline bool BitReader::GetVlqInt(int32_t* v) { } inline bool BitWriter::PutZigZagVlqInt(int32_t v) { - uint32_t u = (v << 1) ^ (v >> 31); + // Note negative left shift is undefined + uint32_t u = (static_cast(v) << 1) ^ (v >> 31); return PutVlqInt(u); } diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc index 5f181e9b7b14c..b12e2ecf9eef9 100644 --- a/cpp/src/arrow/util/bit-util-test.cc +++ 
b/cpp/src/arrow/util/bit-util-test.cc @@ -756,7 +756,9 @@ static void TestZigZag(int32_t v) { TEST(BitStreamUtil, ZigZag) { TestZigZag(0); TestZigZag(1); + TestZigZag(1234); TestZigZag(-1); + TestZigZag(-1234); TestZigZag(std::numeric_limits::max()); TestZigZag(-std::numeric_limits::max()); } diff --git a/cpp/src/arrow/util/decimal-test.cc b/cpp/src/arrow/util/decimal-test.cc index 94c270280ea3c..5925d98d9d8d5 100644 --- a/cpp/src/arrow/util/decimal-test.cc +++ b/cpp/src/arrow/util/decimal-test.cc @@ -417,8 +417,8 @@ TEST(Decimal128Test, TestFromBigEndian) { auto negated = -value; little_endian = negated.ToBytes(); std::reverse(little_endian.begin(), little_endian.end()); - // Convert all of the bytes since we have to include the sign bit - ASSERT_OK(Decimal128::FromBigEndian(little_endian.data(), 16, &out)); + // The sign bit is looked up in the MSB + ASSERT_OK(Decimal128::FromBigEndian(little_endian.data() + 15 - ii, ii + 1, &out)); ASSERT_EQ(negated, out); // Take the complement and convert to big endian diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index f6e110561b275..c980e2a9e773c 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -29,11 +29,15 @@ #include "arrow/status.h" #include "arrow/util/bit-util.h" #include "arrow/util/decimal.h" +#include "arrow/util/int-util.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" namespace arrow { +using internal::SafeLeftShift; +using internal::SafeSignedAdd; + static const Decimal128 ScaleMultipliers[] = { Decimal128(0LL), Decimal128(10LL), @@ -405,7 +409,7 @@ Decimal128& Decimal128::Negate() { low_bits_ = ~low_bits_ + 1; high_bits_ = ~high_bits_; if (low_bits_ == 0) { - ++high_bits_; + high_bits_ = SafeSignedAdd(high_bits_, 1); } return *this; } @@ -414,9 +418,9 @@ Decimal128& Decimal128::Abs() { return *this < 0 ? 
Negate() : *this; } Decimal128& Decimal128::operator+=(const Decimal128& right) { const uint64_t sum = low_bits_ + right.low_bits_; - high_bits_ += right.high_bits_; + high_bits_ = SafeSignedAdd(high_bits_, right.high_bits_); if (sum < low_bits_) { - ++high_bits_; + high_bits_ = SafeSignedAdd(high_bits_, 1); } low_bits_ = sum; return *this; @@ -454,7 +458,7 @@ Decimal128& Decimal128::operator&=(const Decimal128& right) { Decimal128& Decimal128::operator<<=(uint32_t bits) { if (bits != 0) { if (bits < 64) { - high_bits_ <<= bits; + high_bits_ = SafeLeftShift(high_bits_, bits); high_bits_ |= (low_bits_ >> (64 - bits)); low_bits_ <<= bits; } else if (bits < 128) { @@ -925,7 +929,7 @@ Status Decimal128::FromBigEndian(const uint8_t* bytes, int32_t length, Decimal12 } else { high = -1 * (is_negative && length < kMaxDecimalBytes); // Shift left enough bits to make room for the incoming int64_t - high <<= high_bits_offset * CHAR_BIT; + high = SafeLeftShift(high, high_bits_offset * CHAR_BIT); // Preserve the upper bits by inplace OR-ing the int64_t high |= high_bits; } @@ -943,7 +947,7 @@ Status Decimal128::FromBigEndian(const uint8_t* bytes, int32_t length, Decimal12 // Sign extend the low bits if necessary low = -1 * (is_negative && length < 8); // Shift left enough bits to make room for the incoming int64_t - low <<= low_bits_offset * CHAR_BIT; + low = SafeLeftShift(low, low_bits_offset * CHAR_BIT); // Preserve the upper bits by inplace OR-ing the int64_t low |= low_bits; } diff --git a/cpp/src/arrow/util/int-util.h b/cpp/src/arrow/util/int-util.h index 66d389e5f40cf..d3ae09f75cfa6 100644 --- a/cpp/src/arrow/util/int-util.h +++ b/cpp/src/arrow/util/int-util.h @@ -19,6 +19,7 @@ #define ARROW_UTIL_INT_UTIL_H #include +#include #include "arrow/util/visibility.h" @@ -67,6 +68,21 @@ template ARROW_EXPORT void TransposeInts(const InputInt* source, OutputInt* dest, int64_t length, const int32_t* transpose_map); +/// Signed addition with well-defined behaviour on overflow (as unsigned) +template +SignedInt SafeSignedAdd(SignedInt u, SignedInt v) { + using UnsignedInt = typename std::make_unsigned::type; + return static_cast(static_cast(u) + + static_cast(v)); +} + +/// Signed left shift with well-defined behaviour on negative numbers or overflow +template +SignedInt SafeLeftShift(SignedInt u, Shift shift) { + using UnsignedInt = typename std::make_unsigned::type; + return static_cast(static_cast(u) << shift); +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h index f4c58f4030afd..ab258252695ab 100644 --- a/cpp/src/arrow/util/macros.h +++ b/cpp/src/arrow/util/macros.h @@ -113,6 +113,15 @@ #endif #endif // !defined(MANUALLY_ALIGNED_STRUCT) +// ---------------------------------------------------------------------- +// Convenience macro disabling a particular UBSan check in a function + +#if defined(__clang__) +#define ARROW_DISABLE_UBSAN(feature) __attribute__((no_sanitize(feature))) +#else +#define ARROW_DISABLE_UBSAN(feature) +#endif + // ---------------------------------------------------------------------- // From googletest // (also in parquet-cpp) diff --git a/cpp/src/arrow/util/parsing.h b/cpp/src/arrow/util/parsing.h index 46d0f7c322b46..23e7061ac8738 100644 --- a/cpp/src/arrow/util/parsing.h +++ b/cpp/src/arrow/util/parsing.h @@ -335,7 +335,10 @@ class StringToSignedIntConverterMixin { if (ARROW_PREDICT_FALSE(unsigned_value > max_negative)) { return false; } - *out = static_cast(-static_cast(unsigned_value)); + // To avoid both 
compiler warnings (with unsigned negation) + // and undefined behaviour (with signed negation overflow), + // use the expanded formula for 2's complement negation. + *out = static_cast(~unsigned_value + 1); } else { if (ARROW_PREDICT_FALSE(unsigned_value > max_positive)) { return false; diff --git a/cpp/src/arrow/util/thread-pool-test.cc b/cpp/src/arrow/util/thread-pool-test.cc index 22a8db21fd280..c0deb20ccdde1 100644 --- a/cpp/src/arrow/util/thread-pool-test.cc +++ b/cpp/src/arrow/util/thread-pool-test.cc @@ -298,7 +298,8 @@ TEST_F(TestThreadPool, Submit) { // Test fork safety on Unix -#if !(defined(_WIN32) || defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER)) +#if !(defined(_WIN32) || defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER) || \ + defined(THREAD_SANITIZER)) TEST_F(TestThreadPool, ForkSafety) { pid_t child_pid; int child_status; diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index b5905fddff489..58c703f7fe068 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -29,6 +29,7 @@ #include "arrow/api.h" #include "arrow/util/bit-util.h" +#include "arrow/util/int-util.h" #include "arrow/util/logging.h" #include "arrow/util/thread-pool.h" @@ -76,6 +77,8 @@ namespace parquet { namespace arrow { using ::arrow::BitUtil::BytesForBits; +using ::arrow::BitUtil::FromBigEndian; +using ::arrow::internal::SafeLeftShift; template using ArrayType = typename ::arrow::TypeTraits::ArrayType; @@ -1098,8 +1101,6 @@ struct TransferFunctor< }; static uint64_t BytesToInteger(const uint8_t* bytes, int32_t start, int32_t stop) { - using ::arrow::BitUtil::FromBigEndian; - const int32_t length = stop - start; DCHECK_GE(length, 0); @@ -1155,37 +1156,54 @@ static constexpr int32_t kMaxDecimalBytes = 16; /// \brief Convert a sequence of big-endian bytes to one int64_t (high bits) and one /// uint64_t (low bits). -static void BytesToIntegerPair(const uint8_t* bytes, - const int32_t total_number_of_bytes_used, int64_t* high, - uint64_t* low) { - DCHECK_GE(total_number_of_bytes_used, kMinDecimalBytes); - DCHECK_LE(total_number_of_bytes_used, kMaxDecimalBytes); - - /// Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the - /// sign bit. - const bool is_negative = static_cast(bytes[0]) < 0; +static void BytesToIntegerPair(const uint8_t* bytes, const int32_t length, + int64_t* out_high, uint64_t* out_low) { + DCHECK_GE(length, kMinDecimalBytes); + DCHECK_LE(length, kMaxDecimalBytes); - /// Sign extend the low bits if necessary - *low = UINT64_MAX * (is_negative && total_number_of_bytes_used < 8); - *high = -1 * (is_negative && total_number_of_bytes_used < kMaxDecimalBytes); + // XXX This code is copied from Decimal::FromBigEndian - /// Stop byte of the high bytes - const int32_t high_bits_offset = std::max(0, total_number_of_bytes_used - 8); + int64_t high, low; + + // Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the + // sign bit. + const bool is_negative = static_cast(bytes[0]) < 0; - /// Shift left enough bits to make room for the incoming int64_t - *high <<= high_bits_offset * CHAR_BIT; + // 1. 
Extract the high bytes + // Stop byte of the high bytes + const int32_t high_bits_offset = std::max(0, length - 8); + const auto high_bits = BytesToInteger(bytes, 0, high_bits_offset); - /// Preserve the upper bits by inplace OR-ing the int64_t - *high |= BytesToInteger(bytes, 0, high_bits_offset); + if (high_bits_offset == 8) { + // Avoid undefined shift by 64 below + high = high_bits; + } else { + high = -1 * (is_negative && length < kMaxDecimalBytes); + // Shift left enough bits to make room for the incoming int64_t + high = SafeLeftShift(high, high_bits_offset * CHAR_BIT); + // Preserve the upper bits by inplace OR-ing the int64_t + high |= high_bits; + } - /// Stop byte of the low bytes - const int32_t low_bits_offset = std::min(total_number_of_bytes_used, 8); + // 2. Extract the low bytes + // Stop byte of the low bytes + const int32_t low_bits_offset = std::min(length, 8); + const auto low_bits = BytesToInteger(bytes, high_bits_offset, length); - /// Shift left enough bits to make room for the incoming uint64_t - *low <<= low_bits_offset * CHAR_BIT; + if (low_bits_offset == 8) { + // Avoid undefined shift by 64 below + low = low_bits; + } else { + // Sign extend the low bits if necessary + low = -1 * (is_negative && length < 8); + // Shift left enough bits to make room for the incoming int64_t + low = SafeLeftShift(low, low_bits_offset * CHAR_BIT); + // Preserve the upper bits by inplace OR-ing the int64_t + low |= low_bits; + } - /// Preserve the upper bits by inplace OR-ing the uint64_t - *low |= BytesToInteger(bytes, high_bits_offset, total_number_of_bytes_used); + *out_high = high; + *out_low = static_cast(low); } static inline void RawBytesToDecimalBytes(const uint8_t* value, int32_t byte_width, diff --git a/cpp/src/parquet/bloom_filter.h b/cpp/src/parquet/bloom_filter.h index 0078051b49735..a66fc8d1b080c 100644 --- a/cpp/src/parquet/bloom_filter.h +++ b/cpp/src/parquet/bloom_filter.h @@ -155,11 +155,13 @@ class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter { static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) { DCHECK(fpp > 0.0 && fpp < 1.0); const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8)); - uint32_t num_bits = static_cast(m); + uint32_t num_bits; // Handle overflow. 
if (m < 0 || m > kMaximumBloomFilterBytes << 3) { num_bits = static_cast(kMaximumBloomFilterBytes << 3); + } else { + num_bits = static_cast(m); } // Round up to lower bound diff --git a/cpp/src/parquet/column_reader-test.cc b/cpp/src/parquet/column_reader-test.cc index 60f2be2362510..0475ca591de02 100644 --- a/cpp/src/parquet/column_reader-test.cc +++ b/cpp/src/parquet/column_reader-test.cc @@ -102,7 +102,7 @@ class TestPrimitiveReader : public ::testing::Test { &vresult[0] + total_values_read, &values_read)); total_values_read += static_cast(values_read); batch_actual += batch; - batch_size = std::max(batch_size * 2, 4096); + batch_size = std::min(1 << 24, std::max(batch_size * 2, 4096)); } while (batch > 0); ASSERT_EQ(num_levels_, batch_actual); @@ -147,7 +147,7 @@ class TestPrimitiveReader : public ::testing::Test { total_values_read += batch - static_cast(null_count); batch_actual += batch; levels_actual += static_cast(levels_read); - batch_size = std::max(batch_size * 2, 4096); + batch_size = std::min(1 << 24, std::max(batch_size * 2, 4096)); } while ((batch > 0) || (levels_read > 0)); ASSERT_EQ(num_levels_, levels_actual); diff --git a/cpp/src/parquet/encoding-internal.h b/cpp/src/parquet/encoding-internal.h index e2dfc2380ddcf..8fbfb402a7fb1 100644 --- a/cpp/src/parquet/encoding-internal.h +++ b/cpp/src/parquet/encoding-internal.h @@ -83,7 +83,10 @@ inline int DecodePlain(const uint8_t* data, int64_t data_size, int num_values, if (data_size < bytes_to_decode) { ParquetException::EofException(); } - memcpy(out, data, bytes_to_decode); + // If bytes_to_decode == 0, data could be null + if (bytes_to_decode > 0) { + memcpy(out, data, bytes_to_decode); + } return bytes_to_decode; } @@ -382,7 +385,7 @@ template inline void DictionaryDecoder::SetDict(Decoder* dictionary) { int num_dictionary_values = dictionary->values_left(); dictionary_.Resize(num_dictionary_values); - dictionary->Decode(&dictionary_[0], num_dictionary_values); + dictionary->Decode(dictionary_.data(), num_dictionary_values); } template <> diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 1812f5547abc2..2bc51e7dc7902 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -160,7 +160,8 @@ struct ByteArray { }; inline bool operator==(const ByteArray& left, const ByteArray& right) { - return left.len == right.len && 0 == std::memcmp(left.ptr, right.ptr, left.len); + return left.len == right.len && + (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0); } inline bool operator!=(const ByteArray& left, const ByteArray& right) { diff --git a/cpp/src/parquet/util/memory.cc b/cpp/src/parquet/util/memory.cc index 6251f1c85c085..b3f83bdfdfd32 100644 --- a/cpp/src/parquet/util/memory.cc +++ b/cpp/src/parquet/util/memory.cc @@ -233,8 +233,11 @@ void InMemoryOutputStream::Write(const uint8_t* data, int64_t length) { PARQUET_THROW_NOT_OK(buffer_->Resize(new_capacity)); capacity_ = new_capacity; } - memcpy(Head(), data, length); - size_ += length; + // If length == 0, data may be null + if (length > 0) { + memcpy(Head(), data, length); + size_ += length; + } } int64_t InMemoryOutputStream::Tell() { return size_; } diff --git a/cpp/src/parquet/util/memory.h b/cpp/src/parquet/util/memory.h index 8677e6b9dacbc..d63ed84dd7ead 100644 --- a/cpp/src/parquet/util/memory.h +++ b/cpp/src/parquet/util/memory.h @@ -66,6 +66,7 @@ class PARQUET_EXPORT Vector { void Swap(Vector& v); inline T& operator[](int64_t i) const { return data_[i]; } + T* data() { return data_; } const T* data() const { return 
data_; } private: From d6ddcbf1566be6afb0e123589adfb5e5d60e3a4c Mon Sep 17 00:00:00 2001 From: Pindikura Ravindra Date: Tue, 8 Jan 2019 09:32:38 -0600 Subject: [PATCH 174/328] ARROW-3701: [Gandiva] add op for decimal 128 The code changes are complete. However, the perf in the non-fast code path is slow - I'll debug and fix that. Author: Pindikura Ravindra Author: praveenbingo Closes #2942 from pravindra/decimal2 and squashes the following commits: 0f7e78a76 ARROW-3701: off gandiva tests in py 2.7 613524602 ARROW-3701: fix format error c0fddfbc6 ARROW-3701: fix python unresolved symbol db8581162 ARROW-3701: added a comment regarding structs. 194c4377a ARROW-3701: revert surefire version 5d07b79e2 ARROW-3701: Address review comments 36691c1c7 ARROW-3701: add benchmark for large decimals 75f7ac9d4 ARROW-3701: misc cleanups 59db4603d ARROW-3701: Fix java checkstyle issue 8a227ec9c ARROW-3701: Workaround for jni JIT issue 9cbd4ab59 ARROW-3701: switch to surefire 2.19 for dbg ecaff4631 ARROW-3701: Enable decimal tests 54a210511 ARROW-3701: Support for decimal literal and null b76a3ec1b ARROW-3701: First decimal function --- .travis.yml | 3 +- cpp/src/arrow/util/decimal-test.cc | 104 +++++ cpp/src/arrow/util/decimal.cc | 97 ++++- cpp/src/arrow/util/decimal.h | 19 + cpp/src/gandiva/CMakeLists.txt | 4 + cpp/src/gandiva/arrow.h | 11 + cpp/src/gandiva/decimal_full.h | 75 ++++ cpp/src/gandiva/decimal_ir.cc | 405 ++++++++++++++++++ cpp/src/gandiva/decimal_ir.h | 171 ++++++++ cpp/src/gandiva/decimal_type_util.cc | 80 ++++ cpp/src/gandiva/decimal_type_util.h | 90 ++++ cpp/src/gandiva/decimal_type_util_test.cc | 58 +++ cpp/src/gandiva/engine.cc | 9 +- cpp/src/gandiva/engine.h | 2 + cpp/src/gandiva/expression_registry.cc | 4 +- cpp/src/gandiva/function_ir_builder.cc | 81 ++++ cpp/src/gandiva/function_ir_builder.h | 64 +++ cpp/src/gandiva/function_registry.cc | 19 +- .../gandiva/function_registry_arithmetic.cc | 2 + cpp/src/gandiva/function_registry_common.h | 1 + cpp/src/gandiva/function_signature.h | 18 +- cpp/src/gandiva/jni/CMakeLists.txt | 2 +- .../gandiva/jni/expression_registry_helper.cc | 7 +- cpp/src/gandiva/jni/jni_common.cc | 6 + cpp/src/gandiva/literal_holder.h | 5 +- cpp/src/gandiva/llvm_generator.cc | 168 +++++--- cpp/src/gandiva/llvm_generator.h | 9 +- cpp/src/gandiva/llvm_types.cc | 1 + cpp/src/gandiva/llvm_types.h | 25 +- cpp/src/gandiva/lvalue.h | 35 +- cpp/src/gandiva/precompiled/CMakeLists.txt | 12 +- cpp/src/gandiva/precompiled/decimal_ops.cc | 219 ++++++++++ cpp/src/gandiva/precompiled/decimal_ops.h | 37 ++ .../gandiva/precompiled/decimal_ops_test.cc | 75 ++++ .../gandiva/precompiled/decimal_wrapper.cc | 43 ++ cpp/src/gandiva/projector.cc | 6 +- cpp/src/gandiva/proto/Types.proto | 8 + cpp/src/gandiva/tests/CMakeLists.txt | 8 +- cpp/src/gandiva/tests/decimal_single_test.cc | 224 ++++++++++ cpp/src/gandiva/tests/decimal_test.cc | 237 ++++++++++ cpp/src/gandiva/tests/generate_data.h | 20 + cpp/src/gandiva/tests/micro_benchmarks.cc | 126 +++++- cpp/src/gandiva/tests/test_util.h | 14 + cpp/src/gandiva/tests/timed_evaluate.h | 4 +- cpp/src/gandiva/tree_expr_builder.cc | 10 + cpp/src/gandiva/tree_expr_builder.h | 3 + cpp/valgrind.supp | 6 + java/gandiva/pom.xml | 7 +- .../evaluator/ConfigurationBuilder.java | 32 -- .../gandiva/evaluator/DecimalTypeUtil.java | 86 ++++ .../gandiva/evaluator/ExpressionRegistry.java | 5 +- .../arrow/gandiva/evaluator/Filter.java | 16 +- .../arrow/gandiva/evaluator/JniLoader.java | 148 +++++++ .../arrow/gandiva/evaluator/JniWrapper.java | 93 +--- 
.../arrow/gandiva/evaluator/Projector.java | 20 +- .../arrow/gandiva/expression/DecimalNode.java | 54 +++ .../arrow/gandiva/expression/TreeBuilder.java | 4 + .../gandiva/evaluator/BaseEvaluatorTest.java | 15 + .../evaluator/DecimalTypeUtilTest.java | 89 ++++ .../evaluator/ProjectorDecimalTest.java | 157 +++++++ python/pyarrow/gandiva.pyx | 10 + 61 files changed, 3128 insertions(+), 235 deletions(-) create mode 100644 cpp/src/gandiva/decimal_full.h create mode 100644 cpp/src/gandiva/decimal_ir.cc create mode 100644 cpp/src/gandiva/decimal_ir.h create mode 100644 cpp/src/gandiva/decimal_type_util.cc create mode 100644 cpp/src/gandiva/decimal_type_util.h create mode 100644 cpp/src/gandiva/decimal_type_util_test.cc create mode 100644 cpp/src/gandiva/function_ir_builder.cc create mode 100644 cpp/src/gandiva/function_ir_builder.h create mode 100644 cpp/src/gandiva/precompiled/decimal_ops.cc create mode 100644 cpp/src/gandiva/precompiled/decimal_ops.h create mode 100644 cpp/src/gandiva/precompiled/decimal_ops_test.cc create mode 100644 cpp/src/gandiva/precompiled/decimal_wrapper.cc create mode 100644 cpp/src/gandiva/tests/decimal_single_test.cc create mode 100644 cpp/src/gandiva/tests/decimal_test.cc create mode 100644 java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtil.java create mode 100644 java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniLoader.java create mode 100644 java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/DecimalNode.java create mode 100644 java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtilTest.java create mode 100644 java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorDecimalTest.java diff --git a/.travis.yml b/.travis.yml index ffbb691f652f5..8532cc7f3b662 100644 --- a/.travis.yml +++ b/.travis.yml @@ -121,7 +121,6 @@ matrix: - ARROW_TRAVIS_COVERAGE=1 - ARROW_TRAVIS_PYTHON_DOCS=1 - ARROW_TRAVIS_PYTHON_JVM=1 - - ARROW_TRAVIS_PYTHON_GANDIVA=1 - ARROW_TRAVIS_OPTIONAL_INSTALL=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN # TODO(wesm): Run the benchmarks outside of Travis @@ -138,6 +137,8 @@ matrix: - export PLASMA_VALGRIND=0 - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 || travis_terminate 1 - export PLASMA_VALGRIND=1 + # Gandiva tests are not enabled with python 2.7 + - ARROW_TRAVIS_PYTHON_GANDIVA=1 - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 || travis_terminate 1 - $TRAVIS_BUILD_DIR/ci/travis_upload_cpp_coverage.sh - name: "[OS X] C++ w/ XCode 8.3" diff --git a/cpp/src/arrow/util/decimal-test.cc b/cpp/src/arrow/util/decimal-test.cc index 5925d98d9d8d5..73ac48cf88f20 100644 --- a/cpp/src/arrow/util/decimal-test.cc +++ b/cpp/src/arrow/util/decimal-test.cc @@ -466,4 +466,108 @@ TEST(Decimal128Test, TestToInteger) { ASSERT_RAISES(Invalid, invalid_int64.ToInteger(&out2)); } +TEST(Decimal128Test, GetWholeAndFraction) { + Decimal128 value("123456"); + Decimal128 whole; + Decimal128 fraction; + int32_t out; + + value.GetWholeAndFraction(0, &whole, &fraction); + ASSERT_OK(whole.ToInteger(&out)); + ASSERT_EQ(123456, out); + ASSERT_OK(fraction.ToInteger(&out)); + ASSERT_EQ(0, out); + + value.GetWholeAndFraction(1, &whole, &fraction); + ASSERT_OK(whole.ToInteger(&out)); + ASSERT_EQ(12345, out); + ASSERT_OK(fraction.ToInteger(&out)); + ASSERT_EQ(6, out); + + value.GetWholeAndFraction(5, &whole, &fraction); + ASSERT_OK(whole.ToInteger(&out)); + ASSERT_EQ(1, out); + ASSERT_OK(fraction.ToInteger(&out)); + ASSERT_EQ(23456, out); + + value.GetWholeAndFraction(7, &whole, &fraction); + 
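+  // With scale 7 (more digits than in 123456), the whole part is 0 and the fraction carries the entire value.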
ASSERT_OK(whole.ToInteger(&out)); + ASSERT_EQ(0, out); + ASSERT_OK(fraction.ToInteger(&out)); + ASSERT_EQ(123456, out); +} + +TEST(Decimal128Test, GetWholeAndFractionNegative) { + Decimal128 value("-123456"); + Decimal128 whole; + Decimal128 fraction; + int32_t out; + + value.GetWholeAndFraction(0, &whole, &fraction); + ASSERT_OK(whole.ToInteger(&out)); + ASSERT_EQ(-123456, out); + ASSERT_OK(fraction.ToInteger(&out)); + ASSERT_EQ(0, out); + + value.GetWholeAndFraction(1, &whole, &fraction); + ASSERT_OK(whole.ToInteger(&out)); + ASSERT_EQ(-12345, out); + ASSERT_OK(fraction.ToInteger(&out)); + ASSERT_EQ(-6, out); + + value.GetWholeAndFraction(5, &whole, &fraction); + ASSERT_OK(whole.ToInteger(&out)); + ASSERT_EQ(-1, out); + ASSERT_OK(fraction.ToInteger(&out)); + ASSERT_EQ(-23456, out); + + value.GetWholeAndFraction(7, &whole, &fraction); + ASSERT_OK(whole.ToInteger(&out)); + ASSERT_EQ(0, out); + ASSERT_OK(fraction.ToInteger(&out)); + ASSERT_EQ(-123456, out); +} + +TEST(Decimal128Test, IncreaseScale) { + Decimal128 result; + int32_t out; + + result = Decimal128("1234").IncreaseScaleBy(3); + ASSERT_OK(result.ToInteger(&out)); + ASSERT_EQ(1234000, out); + + result = Decimal128("-1234").IncreaseScaleBy(3); + ASSERT_OK(result.ToInteger(&out)); + ASSERT_EQ(-1234000, out); +} + +TEST(Decimal128Test, ReduceScaleAndRound) { + Decimal128 result; + int32_t out; + + result = Decimal128("123456").ReduceScaleBy(1, false); + ASSERT_OK(result.ToInteger(&out)); + ASSERT_EQ(12345, out); + + result = Decimal128("123456").ReduceScaleBy(1, true); + ASSERT_OK(result.ToInteger(&out)); + ASSERT_EQ(12346, out); + + result = Decimal128("123451").ReduceScaleBy(1, true); + ASSERT_OK(result.ToInteger(&out)); + ASSERT_EQ(12345, out); + + result = Decimal128("-123789").ReduceScaleBy(2, true); + ASSERT_OK(result.ToInteger(&out)); + ASSERT_EQ(-1238, out); + + result = Decimal128("-123749").ReduceScaleBy(2, true); + ASSERT_OK(result.ToInteger(&out)); + ASSERT_EQ(-1237, out); + + result = Decimal128("-123750").ReduceScaleBy(2, true); + ASSERT_OK(result.ToInteger(&out)); + ASSERT_EQ(-1238, out); +} + } // namespace arrow diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index c980e2a9e773c..8d6c06931a8f8 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -39,7 +39,7 @@ using internal::SafeLeftShift; using internal::SafeSignedAdd; static const Decimal128 ScaleMultipliers[] = { - Decimal128(0LL), + Decimal128(1LL), Decimal128(10LL), Decimal128(100LL), Decimal128(1000LL), @@ -79,6 +79,47 @@ static const Decimal128 ScaleMultipliers[] = { Decimal128(542101086242752217LL, 68739955140067328ULL), Decimal128(5421010862427522170LL, 687399551400673280ULL)}; +static const Decimal128 ScaleMultipliersHalf[] = { + Decimal128(0ULL), + Decimal128(5ULL), + Decimal128(50ULL), + Decimal128(500ULL), + Decimal128(5000ULL), + Decimal128(50000ULL), + Decimal128(500000ULL), + Decimal128(5000000ULL), + Decimal128(50000000ULL), + Decimal128(500000000ULL), + Decimal128(5000000000ULL), + Decimal128(50000000000ULL), + Decimal128(500000000000ULL), + Decimal128(5000000000000ULL), + Decimal128(50000000000000ULL), + Decimal128(500000000000000ULL), + Decimal128(5000000000000000ULL), + Decimal128(50000000000000000ULL), + Decimal128(500000000000000000ULL), + Decimal128(5000000000000000000ULL), + Decimal128(2LL, 13106511852580896768ULL), + Decimal128(27LL, 1937910009842106368ULL), + Decimal128(271LL, 932356024711512064ULL), + Decimal128(2710LL, 9323560247115120640ULL), + Decimal128(27105LL, 
1001882102603448320ULL), + Decimal128(271050LL, 10018821026034483200ULL), + Decimal128(2710505LL, 7954489891797073920ULL), + Decimal128(27105054LL, 5757922623132532736ULL), + Decimal128(271050543LL, 2238994010196672512ULL), + Decimal128(2710505431LL, 3943196028257173504ULL), + Decimal128(27105054312LL, 2538472135152631808ULL), + Decimal128(271050543121LL, 6937977277816766464ULL), + Decimal128(2710505431213LL, 14039540557039009792ULL), + Decimal128(27105054312137LL, 11268197054423236608ULL), + Decimal128(271050543121376LL, 2001506101975056384ULL), + Decimal128(2710505431213761LL, 1568316946041012224ULL), + Decimal128(27105054312137610LL, 15683169460410122240ULL), + Decimal128(271050543121376108LL, 9257742014424809472ULL), + Decimal128(2710505431213761085LL, 343699775700336640ULL)}; + static constexpr uint64_t kIntMask = 0xFFFFFFFF; static constexpr auto kCarryBit = static_cast(1) << static_cast(32); @@ -888,6 +929,60 @@ Status Decimal128::Rescale(int32_t original_scale, int32_t new_scale, return Status::OK(); } +void Decimal128::GetWholeAndFraction(int scale, Decimal128* whole, + Decimal128* fraction) const { + DCHECK_GE(scale, 0); + DCHECK_LE(scale, 38); + + Decimal128 multiplier(ScaleMultipliers[scale]); + DCHECK_OK(Divide(multiplier, whole, fraction)); +} + +const Decimal128& Decimal128::GetScaleMultiplier(int32_t scale) { + DCHECK_GE(scale, 0); + DCHECK_LE(scale, 38); + + return ScaleMultipliers[scale]; +} + +Decimal128 Decimal128::IncreaseScaleBy(int32_t increase_by) const { + DCHECK_GE(increase_by, 0); + DCHECK_LE(increase_by, 38); + + return (*this) * ScaleMultipliers[increase_by]; +} + +Decimal128 Decimal128::ReduceScaleBy(int32_t reduce_by, bool round) const { + DCHECK_GE(reduce_by, 0); + DCHECK_LE(reduce_by, 38); + + Decimal128 divisor(ScaleMultipliers[reduce_by]); + Decimal128 result; + Decimal128 remainder; + DCHECK_OK(Divide(divisor, &result, &remainder)); + if (round) { + auto divisor_half = ScaleMultipliersHalf[reduce_by]; + if (remainder.Abs() >= divisor_half) { + if (result > 0) { + result += 1; + } else { + result -= 1; + } + } + } + return result; +} + +int32_t Decimal128::CountLeadingBinaryZeros() const { + DCHECK_GE(*this, Decimal128(0)); + + if (high_bits_ == 0) { + return BitUtil::CountLeadingZeros(low_bits_) + 64; + } else { + return BitUtil::CountLeadingZeros(static_cast(high_bits_)); + } +} + // Helper function used by Decimal128::FromBigEndian static inline uint64_t UInt64FromBigEndian(const uint8_t* bytes, int32_t length) { // We don't bounds check the length here because this is called by diff --git a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h index f59a4a42abed6..5734fa0d5a57a 100644 --- a/cpp/src/arrow/util/decimal.h +++ b/cpp/src/arrow/util/decimal.h @@ -139,9 +139,28 @@ class ARROW_EXPORT Decimal128 { /// \return error status if the length is an invalid value static Status FromBigEndian(const uint8_t* data, int32_t length, Decimal128* out); + /// \brief seperate the integer and fractional parts for the given scale. + void GetWholeAndFraction(int32_t scale, Decimal128* whole, Decimal128* fraction) const; + + /// \brief Scale multiplier for given scale value. + static const Decimal128& GetScaleMultiplier(int32_t scale); + /// \brief Convert Decimal128 from one scale to another Status Rescale(int32_t original_scale, int32_t new_scale, Decimal128* out) const; + /// \brief Scale up. + Decimal128 IncreaseScaleBy(int32_t increase_by) const; + + /// \brief Scale down. 
+ /// - If 'round' is true, the right-most digits are dropped and the result value is + /// rounded up (+1 for +ve, -1 for -ve) based on the value of the dropped digits + /// (>= 10^reduce_by / 2). + /// - If 'round' is false, the right-most digits are simply dropped. + Decimal128 ReduceScaleBy(int32_t reduce_by, bool round = true) const; + + /// \brief count the number of leading binary zeroes. + int32_t CountLeadingBinaryZeros() const; + /// \brief Convert to a signed integer template > Status ToInteger(T* out) const { diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 90fe7cf8c9c57..e743b0e041cb8 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -46,6 +46,8 @@ set(SRC_FILES annotator.cc bitmap_accumulator.cc configuration.cc context_helper.cc + decimal_ir.cc + decimal_type_util.cc engine.cc date_utils.cc expr_decomposer.cc @@ -54,6 +56,7 @@ set(SRC_FILES annotator.cc expression_registry.cc exported_funcs_registry.cc filter.cc + function_ir_builder.cc function_registry.cc function_registry_arithmetic.cc function_registry_datetime.cc @@ -175,6 +178,7 @@ ADD_GANDIVA_TEST(lru_cache_test) ADD_GANDIVA_TEST(to_date_holder_test) ADD_GANDIVA_TEST(simple_arena_test) ADD_GANDIVA_TEST(like_holder_test) +ADD_GANDIVA_TEST(decimal_type_util_test) if (ARROW_GANDIVA_JAVA) add_subdirectory(jni) diff --git a/cpp/src/gandiva/arrow.h b/cpp/src/gandiva/arrow.h index ea283523a56dc..cc2bd9a10294b 100644 --- a/cpp/src/gandiva/arrow.h +++ b/cpp/src/gandiva/arrow.h @@ -35,6 +35,9 @@ using ArrayPtr = std::shared_ptr; using DataTypePtr = std::shared_ptr; using DataTypeVector = std::vector; +using Decimal128TypePtr = std::shared_ptr; +using Decimal128TypeVector = std::vector; + using FieldPtr = std::shared_ptr; using FieldVector = std::vector; @@ -48,6 +51,14 @@ using ArrayDataVector = std::vector; using Status = arrow::Status; using StatusCode = arrow::StatusCode; +static inline bool is_decimal_128(DataTypePtr type) { + if (type->id() == arrow::Type::DECIMAL) { + auto decimal_type = arrow::internal::checked_cast(type.get()); + return decimal_type->byte_width() == 16; + } else { + return false; + } +} } // namespace gandiva #endif // GANDIVA_EXPR_ARROW_H diff --git a/cpp/src/gandiva/decimal_full.h b/cpp/src/gandiva/decimal_full.h new file mode 100644 index 0000000000000..3b84da1c03584 --- /dev/null +++ b/cpp/src/gandiva/decimal_full.h @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef DECIMAL_FULL_H +#define DECIMAL_FULL_H + +#include +#include +#include +#include "arrow/util/decimal.h" + +namespace gandiva { + +using Decimal128 = arrow::Decimal128; + +/// Represents a 128-bit decimal value along with its precision and scale. 
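+/// For example, with precision 5 and scale 2 the value 123.45 is held as the unscaled integer 12345.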
+class Decimal128Full { + public: + Decimal128Full(int64_t high_bits, uint64_t low_bits, int32_t precision, int32_t scale) + : value_(high_bits, low_bits), precision_(precision), scale_(scale) {} + + Decimal128Full(std::string value, int32_t precision, int32_t scale) + : value_(value), precision_(precision), scale_(scale) {} + + Decimal128Full(const Decimal128& value, int32_t precision, int32_t scale) + : value_(value), precision_(precision), scale_(scale) {} + + Decimal128Full(int32_t precision, int32_t scale) + : value_(0), precision_(precision), scale_(scale) {} + + uint32_t scale() const { return scale_; } + + uint32_t precision() const { return precision_; } + + const arrow::Decimal128& value() const { return value_; } + + inline std::string ToString() const { + return value_.ToString(0) + "," + std::to_string(precision_) + "," + + std::to_string(scale_); + } + + friend std::ostream& operator<<(std::ostream& os, const Decimal128Full& dec) { + os << dec.ToString(); + return os; + } + + private: + Decimal128 value_; + + int32_t precision_; + int32_t scale_; +}; + +inline bool operator==(const Decimal128Full& left, const Decimal128Full& right) { + return left.value() == right.value() && left.precision() == right.precision() && + left.scale() == right.scale(); +} + +} // namespace gandiva + +#endif // DECIMAL_FULL_H diff --git a/cpp/src/gandiva/decimal_ir.cc b/cpp/src/gandiva/decimal_ir.cc new file mode 100644 index 0000000000000..38b35a64b293f --- /dev/null +++ b/cpp/src/gandiva/decimal_ir.cc @@ -0,0 +1,405 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "arrow/status.h" +#include "gandiva/decimal_ir.h" +#include "gandiva/decimal_type_util.h" + +// Algorithms adapted from Apache Impala + +namespace gandiva { + +#define ADD_TRACE_32(msg, value) \ + if (enable_ir_traces_) { \ + AddTrace32(msg, value); \ + } +#define ADD_TRACE_128(msg, value) \ + if (enable_ir_traces_) { \ + AddTrace128(msg, value); \ + } + +const char* DecimalIR::kScaleMultipliersName = "gandivaScaleMultipliers"; + +/// Populate globals required by decimal IR. +/// TODO: can this be done just once ? +void DecimalIR::AddGlobals(Engine* engine) { + auto types = engine->types(); + + // populate vector : [ 1, 10, 100, 1000, ..] 
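  // (Editor's note) The loop below builds the i128 constants 10^0 .. 10^kMaxPrecision
  // by appending a '0' to a decimal string each iteration and letting
  // llvm::ConstantInt::get parse it in base 10; a plain C++ sketch of the same table:
  //   __int128 v = 1;
  //   for (int i = 0; i <= 38; ++i) { table[i] = v; v *= 10; }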
+ std::string value = "1"; + std::vector scale_multipliers; + for (int i = 0; i < DecimalTypeUtil::kMaxPrecision + 1; ++i) { + auto multiplier = + llvm::ConstantInt::get(llvm::Type::getInt128Ty(*engine->context()), value, 10); + scale_multipliers.push_back(multiplier); + value.append("0"); + } + + auto array_type = + llvm::ArrayType::get(types->i128_type(), DecimalTypeUtil::kMaxPrecision + 1); + auto initializer = llvm::ConstantArray::get( + array_type, llvm::ArrayRef(scale_multipliers)); + + auto globalScaleMultipliers = new llvm::GlobalVariable( + *engine->module(), array_type, true /*constant*/, + llvm::GlobalValue::LinkOnceAnyLinkage, initializer, kScaleMultipliersName); + globalScaleMultipliers->setAlignment(16); +} + +// Lookup intrinsic functions +void DecimalIR::InitializeIntrinsics() { + sadd_with_overflow_fn_ = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::sadd_with_overflow, types()->i128_type()); + DCHECK_NE(sadd_with_overflow_fn_, nullptr); + + smul_with_overflow_fn_ = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::smul_with_overflow, types()->i128_type()); + DCHECK_NE(smul_with_overflow_fn_, nullptr); + + i128_with_overflow_struct_type_ = + sadd_with_overflow_fn_->getFunctionType()->getReturnType(); +} + +// CPP: return kScaleMultipliers[scale] +llvm::Value* DecimalIR::GetScaleMultiplier(llvm::Value* scale) { + auto const_array = module()->getGlobalVariable(kScaleMultipliersName); + auto ptr = ir_builder()->CreateGEP(const_array, {types()->i32_constant(0), scale}); + return ir_builder()->CreateLoad(ptr); +} + +// CPP: x <= y ? y : x +llvm::Value* DecimalIR::GetHigherScale(llvm::Value* x_scale, llvm::Value* y_scale) { + llvm::Value* le = ir_builder()->CreateICmpSLE(x_scale, y_scale); + return ir_builder()->CreateSelect(le, y_scale, x_scale); +} + +// CPP: return (increase_scale_by <= 0) ? +// in_value : in_value * GetScaleMultiplier(increase_scale_by) +llvm::Value* DecimalIR::IncreaseScale(llvm::Value* in_value, + llvm::Value* increase_scale_by) { + llvm::Value* le_zero = + ir_builder()->CreateICmpSLE(increase_scale_by, types()->i32_constant(0)); + // then block + auto then_lambda = [&] { return in_value; }; + + // else block + auto else_lambda = [&] { + llvm::Value* multiplier = GetScaleMultiplier(increase_scale_by); + return ir_builder()->CreateMul(in_value, multiplier); + }; + + return BuildIfElse(le_zero, types()->i128_type(), then_lambda, else_lambda); +} + +// CPP: return (increase_scale_by <= 0) ? +// {in_value,false} : {in_value * GetScaleMultiplier(increase_scale_by),true} +// +// The return value also indicates if there was an overflow while increasing the scale. +DecimalIR::ValueWithOverflow DecimalIR::IncreaseScaleWithOverflowCheck( + llvm::Value* in_value, llvm::Value* increase_scale_by) { + llvm::Value* le_zero = + ir_builder()->CreateICmpSLE(increase_scale_by, types()->i32_constant(0)); + + // then block + auto then_lambda = [&] { + ValueWithOverflow ret{in_value, types()->false_constant()}; + return ret.AsStruct(this); + }; + + // else block + auto else_lambda = [&] { + llvm::Value* multiplier = GetScaleMultiplier(increase_scale_by); + return ir_builder()->CreateCall(smul_with_overflow_fn_, {in_value, multiplier}); + }; + + auto ir_struct = + BuildIfElse(le_zero, i128_with_overflow_struct_type_, then_lambda, else_lambda); + return ValueWithOverflow::MakeFromStruct(this, ir_struct); +} + +// CPP: return (reduce_scale_by <= 0) ? 
+// in_value : in_value / GetScaleMultiplier(reduce_scale_by) +// +// ReduceScale cannot cause an overflow. +llvm::Value* DecimalIR::ReduceScale(llvm::Value* in_value, llvm::Value* reduce_scale_by) { + auto le_zero = ir_builder()->CreateICmpSLE(reduce_scale_by, types()->i32_constant(0)); + // then block + auto then_lambda = [&] { return in_value; }; + + // else block + auto else_lambda = [&] { + // TODO : handle rounding. + llvm::Value* multiplier = GetScaleMultiplier(reduce_scale_by); + return ir_builder()->CreateSDiv(in_value, multiplier); + }; + + return BuildIfElse(le_zero, types()->i128_type(), then_lambda, else_lambda); +} + +/// @brief Fast-path for add +/// Adjust x and y to the same scale, and add them. +llvm::Value* DecimalIR::AddFastPath(const ValueFull& x, const ValueFull& y) { + auto higher_scale = GetHigherScale(x.scale(), y.scale()); + ADD_TRACE_32("AddFastPath : higher_scale", higher_scale); + + // CPP : x_scaled = IncreaseScale(x_value, higher_scale - x_scale) + auto x_delta = ir_builder()->CreateSub(higher_scale, x.scale()); + auto x_scaled = IncreaseScale(x.value(), x_delta); + ADD_TRACE_128("AddFastPath : x_scaled", x_scaled); + + // CPP : y_scaled = IncreaseScale(y_value, higher_scale - y_scale) + auto y_delta = ir_builder()->CreateSub(higher_scale, y.scale()); + auto y_scaled = IncreaseScale(y.value(), y_delta); + ADD_TRACE_128("AddFastPath : y_scaled", y_scaled); + + auto sum = ir_builder()->CreateAdd(x_scaled, y_scaled); + ADD_TRACE_128("AddFastPath : sum", sum); + return sum; +} + +// @brief Add with overflow check. +/// Adjust x and y to the same scale, add them, and reduce sum to output scale. +/// If there is an overflow, the sum is set to 0. +DecimalIR::ValueWithOverflow DecimalIR::AddWithOverflowCheck(const ValueFull& x, + const ValueFull& y, + const ValueFull& out) { + auto higher_scale = GetHigherScale(x.scale(), y.scale()); + ADD_TRACE_32("AddWithOverflowCheck : higher_scale", higher_scale); + + // CPP : x_scaled = IncreaseScale(x_value, higher_scale - x.scale()) + auto x_delta = ir_builder()->CreateSub(higher_scale, x.scale()); + auto x_scaled = IncreaseScaleWithOverflowCheck(x.value(), x_delta); + ADD_TRACE_128("AddWithOverflowCheck : x_scaled", x_scaled.value()); + + // CPP : y_scaled = IncreaseScale(y_value, higher_scale - y_scale) + auto y_delta = ir_builder()->CreateSub(higher_scale, y.scale()); + auto y_scaled = IncreaseScaleWithOverflowCheck(y.value(), y_delta); + ADD_TRACE_128("AddWithOverflowCheck : y_scaled", y_scaled.value()); + + // CPP : sum = x_scaled + y_scaled + auto sum_ir_struct = ir_builder()->CreateCall(sadd_with_overflow_fn_, + {x_scaled.value(), y_scaled.value()}); + auto sum = ValueWithOverflow::MakeFromStruct(this, sum_ir_struct); + ADD_TRACE_128("AddWithOverflowCheck : sum", sum.value()); + + // CPP : overflow ? 0 : sum / GetScaleMultiplier(max_scale - out_scale) + auto overflow = GetCombinedOverflow({x_scaled, y_scaled, sum}); + ADD_TRACE_32("AddWithOverflowCheck : overflow", overflow); + auto then_lambda = [&] { + // if there is an overflow, the value returned won't be used. so, save the division. + return types()->i128_constant(0); + }; + auto else_lambda = [&] { + auto reduce_scale_by = ir_builder()->CreateSub(higher_scale, out.scale()); + return ReduceScale(sum.value(), reduce_scale_by); + }; + auto sum_descaled = + BuildIfElse(overflow, types()->i128_type(), then_lambda, else_lambda); + return ValueWithOverflow(sum_descaled, overflow); +} + +// This is pretty complex, so use CPP fns. 
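// (Editor's note) AddLarge falls back to the precompiled C++ implementation: each i128
// argument is split into an (i64 high, i64 low) pair via ValueSplit, passed along with
// its precision and scale to add_large_decimal128_decimal128, and the returned
// {i64, i64} struct is recombined into an i128 result.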
+llvm::Value* DecimalIR::AddLarge(const ValueFull& x, const ValueFull& y, + const ValueFull& out) { + std::vector args; + + auto x_split = ValueSplit::MakeFromInt128(this, x.value()); + args.push_back(x_split.high()); + args.push_back(x_split.low()); + args.push_back(x.precision()); + args.push_back(x.scale()); + + auto y_split = ValueSplit::MakeFromInt128(this, y.value()); + args.push_back(y_split.high()); + args.push_back(y_split.low()); + args.push_back(y.precision()); + args.push_back(y.scale()); + + args.push_back(out.precision()); + args.push_back(out.scale()); + + auto split = ir_builder()->CreateCall( + module()->getFunction("add_large_decimal128_decimal128"), args); + + auto sum = ValueSplit::MakeFromStruct(this, split).AsInt128(this); + ADD_TRACE_128("AddLarge : sum", sum); + return sum; +} + +/// The output scale/precision cannot be arbitary values. The algo here depends on them +/// to be the same as computed in DecimalTypeSql. +/// TODO: enforce this. +Status DecimalIR::BuildAdd() { + // Create fn prototype : + // int128_t + // add_decimal128_decimal128(int128_t x_value, int32_t x_precision, int32_t x_scale, + // int128_t y_value, int32_t y_precision, int32_t y_scale + // int32_t out_precision, int32_t out_scale) + auto i32 = types()->i32_type(); + auto i128 = types()->i128_type(); + auto function = BuildFunction("add_decimal128_decimal128", i128, + { + {"x_value", i128}, + {"x_precision", i32}, + {"x_scale", i32}, + {"y_value", i128}, + {"y_precision", i32}, + {"y_scale", i32}, + {"out_precision", i32}, + {"out_scale", i32}, + }); + + auto arg_iter = function->arg_begin(); + ValueFull x(&arg_iter[0], &arg_iter[1], &arg_iter[2]); + ValueFull y(&arg_iter[3], &arg_iter[4], &arg_iter[5]); + ValueFull out(nullptr, &arg_iter[6], &arg_iter[7]); + + auto entry = llvm::BasicBlock::Create(*context(), "entry", function); + ir_builder()->SetInsertPoint(entry); + + // CPP : + // if (out_precision < 38) { + // return AddFastPath(x, y) + // } else { + // ret = AddWithOverflowCheck(x, y) + // if (ret.overflow) + // return AddLarge(x, y) + // else + // return ret.value; + // } + llvm::Value* lt_max_precision = ir_builder()->CreateICmpSLT( + out.precision(), types()->i32_constant(DecimalTypeUtil::kMaxPrecision)); + auto then_lambda = [&] { + // fast-path add + return AddFastPath(x, y); + }; + auto else_lambda = [&] { + if (kUseOverflowIntrinsics) { + // do the add and check if there was overflow + auto ret = AddWithOverflowCheck(x, y, out); + + // if there is an overflow, switch to the AddLarge codepath. + return BuildIfElse(ret.overflow(), types()->i128_type(), + [&] { return AddLarge(x, y, out); }, + [&] { return ret.value(); }); + } else { + return AddLarge(x, y, out); + } + }; + auto value = + BuildIfElse(lt_max_precision, types()->i128_type(), then_lambda, else_lambda); + + // store result to out + ir_builder()->CreateRet(value); + return Status::OK(); +} + +Status DecimalIR::AddFunctions(Engine* engine) { + auto decimal_ir = std::make_shared(engine); + + // Populate global variables used by decimal operations. + decimal_ir->AddGlobals(engine); + + // Lookup intrinsic functions + decimal_ir->InitializeIntrinsics(); + + // build "add" + return decimal_ir->BuildAdd(); +} + +// Do an bitwise-or of all the overflow bits. 
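// (Editor's note) For the overflow-checked add above, this computes
// x_scaled.overflow | y_scaled.overflow | sum.overflow.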
+llvm::Value* DecimalIR::GetCombinedOverflow( + std::vector vec) { + llvm::Value* res = types()->false_constant(); + for (auto& val : vec) { + res = ir_builder()->CreateOr(res, val.overflow()); + } + return res; +} + +DecimalIR::ValueSplit DecimalIR::ValueSplit::MakeFromInt128(DecimalIR* decimal_ir, + llvm::Value* in) { + auto builder = decimal_ir->ir_builder(); + auto types = decimal_ir->types(); + + auto high = builder->CreateLShr(in, types->i128_constant(64)); + high = builder->CreateTrunc(high, types->i64_type()); + auto low = builder->CreateTrunc(in, types->i64_type()); + return ValueSplit(high, low); +} + +/// Convert IR struct {%i64, %i64} to cpp class ValueSplit +DecimalIR::ValueSplit DecimalIR::ValueSplit::MakeFromStruct(DecimalIR* decimal_ir, + llvm::Value* dstruct) { + auto builder = decimal_ir->ir_builder(); + auto high = builder->CreateExtractValue(dstruct, 0); + auto low = builder->CreateExtractValue(dstruct, 1); + return DecimalIR::ValueSplit(high, low); +} + +llvm::Value* DecimalIR::ValueSplit::AsInt128(DecimalIR* decimal_ir) const { + auto builder = decimal_ir->ir_builder(); + auto types = decimal_ir->types(); + + auto value = builder->CreateSExt(high_, types->i128_type()); + value = builder->CreateShl(value, types->i128_constant(64)); + value = builder->CreateAdd(value, builder->CreateZExt(low_, types->i128_type())); + return value; +} + +/// Convert IR struct {%i128, %i1} to cpp class ValueWithOverflow +DecimalIR::ValueWithOverflow DecimalIR::ValueWithOverflow::MakeFromStruct( + DecimalIR* decimal_ir, llvm::Value* dstruct) { + auto builder = decimal_ir->ir_builder(); + auto value = builder->CreateExtractValue(dstruct, 0); + auto overflow = builder->CreateExtractValue(dstruct, 1); + return DecimalIR::ValueWithOverflow(value, overflow); +} + +/// Convert to IR struct {%i128, %i1} +llvm::Value* DecimalIR::ValueWithOverflow::AsStruct(DecimalIR* decimal_ir) const { + auto builder = decimal_ir->ir_builder(); + + auto undef = llvm::UndefValue::get(decimal_ir->i128_with_overflow_struct_type_); + auto struct_val = builder->CreateInsertValue(undef, value(), 0); + return builder->CreateInsertValue(struct_val, overflow(), 1); +} + +/// debug traces +void DecimalIR::AddTrace(const std::string& fmt, std::vector args) { + DCHECK(enable_ir_traces_); + + auto ir_str = ir_builder()->CreateGlobalStringPtr(fmt); + args.insert(args.begin(), ir_str); + ir_builder()->CreateCall(module()->getFunction("printf"), args, "trace"); +} + +void DecimalIR::AddTrace32(const std::string& msg, llvm::Value* value) { + AddTrace("DECIMAL_IR_TRACE:: " + msg + " %d\n", {value}); +} + +void DecimalIR::AddTrace128(const std::string& msg, llvm::Value* value) { + // convert i128 into two i64s for printing + auto split = ValueSplit::MakeFromInt128(this, value); + AddTrace("DECIMAL_IR_TRACE:: " + msg + " %llx:%llx (%lld:%llu)\n", + {split.high(), split.low(), split.high(), split.low()}); +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/decimal_ir.h b/cpp/src/gandiva/decimal_ir.h new file mode 100644 index 0000000000000..fae762c362d94 --- /dev/null +++ b/cpp/src/gandiva/decimal_ir.h @@ -0,0 +1,171 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef GANDIVA_DECIMAL_ADD_IR_BUILDER_H +#define GANDIVA_DECIMAL_ADD_IR_BUILDER_H + +#include +#include +#include + +#include "gandiva/function_ir_builder.h" + +namespace gandiva { + +/// @brief Decimal IR functions +class DecimalIR : public FunctionIRBuilder { + public: + explicit DecimalIR(Engine* engine) + : FunctionIRBuilder(engine), enable_ir_traces_(false) {} + + /// Build decimal IR functions and add them to the engine. + static Status AddFunctions(Engine* engine); + + void EnableTraces() { enable_ir_traces_ = true; } + + private: + /// The intrinsic fn for divide with small divisors is about 10x slower, so not + /// using these. + static const bool kUseOverflowIntrinsics = false; + + // Holder for an i128 value, along with its with scale and precision. + class ValueFull { + public: + ValueFull(llvm::Value* value, llvm::Value* precision, llvm::Value* scale) + : value_(value), precision_(precision), scale_(scale) {} + + llvm::Value* value() const { return value_; } + llvm::Value* precision() const { return precision_; } + llvm::Value* scale() const { return scale_; } + + private: + llvm::Value* value_; + llvm::Value* precision_; + llvm::Value* scale_; + }; + + // Holder for an i128 value, and a boolean indicating overflow. + class ValueWithOverflow { + public: + ValueWithOverflow(llvm::Value* value, llvm::Value* overflow) + : value_(value), overflow_(overflow) {} + + // Make from IR struct + static ValueWithOverflow MakeFromStruct(DecimalIR* decimal_ir, llvm::Value* dstruct); + + // Build a corresponding IR struct + llvm::Value* AsStruct(DecimalIR* decimal_ir) const; + + llvm::Value* value() const { return value_; } + llvm::Value* overflow() const { return overflow_; } + + private: + llvm::Value* value_; + llvm::Value* overflow_; + }; + + // Holder for an i128 value that is split into two i64s + class ValueSplit { + public: + ValueSplit(llvm::Value* high, llvm::Value* low) : high_(high), low_(low) {} + + // Make from i128 value + static ValueSplit MakeFromInt128(DecimalIR* decimal_ir, llvm::Value* in); + + // Make from IR struct + static ValueSplit MakeFromStruct(DecimalIR* decimal_ir, llvm::Value* dstruct); + + // Combine the two parts into an i128 + llvm::Value* AsInt128(DecimalIR* decimal_ir) const; + + llvm::Value* high() const { return high_; } + llvm::Value* low() const { return low_; } + + private: + llvm::Value* high_; + llvm::Value* low_; + }; + + // Add global variables to the module. + static void AddGlobals(Engine* engine); + + // Initialize intrinsic functions that are used by decimal operations. + void InitializeIntrinsics(); + + // Create IR builder for decimal add function. + static Status MakeAdd(Engine* engine, std::shared_ptr* out); + + // Get the multiplier for specified scale (i.e 10^scale) + llvm::Value* GetScaleMultiplier(llvm::Value* scale); + + // Get the higher of the two scales + llvm::Value* GetHigherScale(llvm::Value* x_scale, llvm::Value* y_scale); + + // Increase scale of 'in_value' by 'increase_scale_by'. + // - If 'increase_scale_by' is <= 0, does nothing. 
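  // - Otherwise, multiplies by 10^increase_scale_by without an overflow check
  //   (editor's note; see IncreaseScaleWithOverflowCheck below for the checked variant).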
+ llvm::Value* IncreaseScale(llvm::Value* in_value, llvm::Value* increase_scale_by); + + // Similar to IncreaseScale. but, also check if there is overflow. + ValueWithOverflow IncreaseScaleWithOverflowCheck(llvm::Value* in_value, + llvm::Value* increase_scale_by); + + // Reduce scale of 'in_value' by 'reduce_scale_by'. + // - If 'reduce_scale_by' is <= 0, does nothing. + llvm::Value* ReduceScale(llvm::Value* in_value, llvm::Value* reduce_scale_by); + + // Fast path of add: guaranteed no overflow + llvm::Value* AddFastPath(const ValueFull& x, const ValueFull& y); + + // Similar to AddFastPath, but check if there's an overflow. + ValueWithOverflow AddWithOverflowCheck(const ValueFull& x, const ValueFull& y, + const ValueFull& out); + + // Do addition of large integers (both positive and negative). + llvm::Value* AddLarge(const ValueFull& x, const ValueFull& y, const ValueFull& out); + + // Get the combined overflow (logical or). + llvm::Value* GetCombinedOverflow(std::vector values); + + // Build the function for adding decimals. + Status BuildAdd(); + + // Add a trace in IR code. + void AddTrace(const std::string& fmt, std::vector args); + + // Add a trace msg along with a 32-bit integer. + void AddTrace32(const std::string& msg, llvm::Value* value); + + // Add a trace msg along with a 128-bit integer. + void AddTrace128(const std::string& msg, llvm::Value* value); + + // name of the global variable having the array of scale multipliers. + static const char* kScaleMultipliersName; + + // Intrinsic functions + llvm::Function* sadd_with_overflow_fn_; + llvm::Function* smul_with_overflow_fn_; + + // struct { i128: value, i1: overflow} + llvm::Type* i128_with_overflow_struct_type_; + + // if set to true, ir traces are enabled. Useful for debugging. + bool enable_ir_traces_; +}; + +} // namespace gandiva + +#endif // GANDIVA_FUNCTION_IR_BUILDER_H diff --git a/cpp/src/gandiva/decimal_type_util.cc b/cpp/src/gandiva/decimal_type_util.cc new file mode 100644 index 0000000000000..0ebfe661ce63d --- /dev/null +++ b/cpp/src/gandiva/decimal_type_util.cc @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/decimal_type_util.h" +#include "gandiva/logging.h" + +namespace gandiva { + +constexpr int32_t DecimalTypeUtil::kMaxDecimal32Precision; +constexpr int32_t DecimalTypeUtil::kMaxDecimal64Precision; +constexpr int32_t DecimalTypeUtil::kMaxPrecision; + +constexpr int32_t DecimalTypeUtil::kMaxScale; +constexpr int32_t DecimalTypeUtil::kMinAdjustedScale; + +#define DCHECK_TYPE(type) \ + { \ + DCHECK_GE(type->scale(), 0); \ + DCHECK_LE(type->precision(), kMaxPrecision); \ + } + +// Implementation of decimal rules. 
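// (Editor's worked example) For kOpAdd with DECIMAL(30, 10) and DECIMAL(30, 10):
// result_scale = max(10, 10) = 10 and
// result_precision = max(30 - 10, 30 - 10) + 10 + 1 = 31, i.e. DECIMAL(31, 10).
// When the computed precision exceeds kMaxPrecision (38), MakeAdjustedType caps the
// precision and reduces the scale, but not below kMinAdjustedScale (6), which matches
// the cases exercised in decimal_type_util_test.cc.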
+Status DecimalTypeUtil::GetResultType(Op op, const Decimal128TypeVector& in_types, + Decimal128TypePtr* out_type) { + DCHECK_EQ(in_types.size(), 2); + + *out_type = nullptr; + auto t1 = in_types[0]; + auto t2 = in_types[1]; + DCHECK_TYPE(t1); + DCHECK_TYPE(t2); + + int32_t s1 = t1->scale(); + int32_t s2 = t2->scale(); + int32_t p1 = t1->precision(); + int32_t p2 = t2->precision(); + int32_t result_scale; + int32_t result_precision; + + switch (op) { + case kOpAdd: + case kOpSubtract: + result_scale = std::max(s1, s2); + result_precision = std::max(p1 - s1, p2 - s2) + result_scale + 1; + break; + + case kOpMultiply: + result_scale = s1 + s2; + result_precision = p1 + p2 + 1; + break; + + case kOpDivide: + result_scale = std::max(kMinAdjustedScale, s1 + p2 + 1); + result_precision = p1 - s1 + s2 + result_scale; + break; + + case kOpMod: + result_scale = std::max(s1, s2); + result_precision = std::min(p1 - s1, p2 - s2) + result_scale; + break; + } + *out_type = MakeAdjustedType(result_precision, result_scale); + return Status::OK(); +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/decimal_type_util.h b/cpp/src/gandiva/decimal_type_util.h new file mode 100644 index 0000000000000..2c095c159bba0 --- /dev/null +++ b/cpp/src/gandiva/decimal_type_util.h @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Adapted from Apache Impala + +#ifndef GANDIVA_DECIMAL_TYPE_SQL_H +#define GANDIVA_DECIMAL_TYPE_SQL_H + +#include +#include + +#include "gandiva/arrow.h" + +namespace gandiva { + +/// @brief Handles conversion of scale/precision for operations on decimal types. +/// TODO : do validations for all of these. +class DecimalTypeUtil { + public: + enum Op { + kOpAdd, + kOpSubtract, + kOpMultiply, + kOpDivide, + kOpMod, + }; + + /// The maximum precision representable by a 4-byte decimal + static constexpr int32_t kMaxDecimal32Precision = 9; + + /// The maximum precision representable by a 8-byte decimal + static constexpr int32_t kMaxDecimal64Precision = 18; + + /// The maximum precision representable by a 16-byte decimal + static constexpr int32_t kMaxPrecision = 38; + + // The maximum scale representable. + static constexpr int32_t kMaxScale = kMaxPrecision; + + // When operating on decimal inputs, the integer part of the output can exceed the + // max precision. In such cases, the scale can be reduced, upto a minimum of + // kMinAdjustedScale. + // * There is no strong reason for 6, but both SQLServer and Impala use 6 too. + static constexpr int32_t kMinAdjustedScale = 6; + + // For specified operation and input scale/precision, determine the output + // scale/precision. 
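  // (Editor's note) e.g. for kOpAdd: out_scale = max(s1, s2) and
  // out_precision = max(p1 - s1, p2 - s2) + out_scale + 1, adjusted so that
  // out_precision never exceeds kMaxPrecision.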
+ static Status GetResultType(Op op, const Decimal128TypeVector& in_types, + Decimal128TypePtr* out_type); + + static Decimal128TypePtr MakeType(int32_t precision, int32_t scale); + + private: + static Decimal128TypePtr MakeAdjustedType(int32_t precision, int32_t scale); +}; + +inline Decimal128TypePtr DecimalTypeUtil::MakeType(int32_t precision, int32_t scale) { + return std::dynamic_pointer_cast( + arrow::decimal(precision, scale)); +} + +// Reduce the scale if possible so that precision stays <= kMaxPrecision +inline Decimal128TypePtr DecimalTypeUtil::MakeAdjustedType(int32_t precision, + int32_t scale) { + if (precision > kMaxPrecision) { + int32_t min_scale = std::min(scale, kMinAdjustedScale); + int32_t delta = precision - kMaxPrecision; + precision = kMaxPrecision; + scale = std::max(scale - delta, min_scale); + } + return MakeType(precision, scale); +} + +} // namespace gandiva + +#endif // GANDIVA_DECIMAL_TYPE_SQL_H diff --git a/cpp/src/gandiva/decimal_type_util_test.cc b/cpp/src/gandiva/decimal_type_util_test.cc new file mode 100644 index 0000000000000..a593990638af5 --- /dev/null +++ b/cpp/src/gandiva/decimal_type_util_test.cc @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Adapted from Apache Impala + +#include + +#include "gandiva/decimal_type_util.h" +#include "tests/test_util.h" + +namespace gandiva { + +#define DECIMAL_TYPE(p, s) DecimalTypeUtil::MakeType(p, s) + +Decimal128TypePtr DoOp(DecimalTypeUtil::Op op, Decimal128TypePtr d1, + Decimal128TypePtr d2) { + Decimal128TypePtr ret_type; + EXPECT_OK(DecimalTypeUtil::GetResultType(op, {d1, d2}, &ret_type)); + return ret_type; +} + +TEST(DecimalResultTypes, Basic) { + EXPECT_ARROW_TYPE_EQUALS( + DECIMAL_TYPE(31, 10), + DoOp(DecimalTypeUtil::kOpAdd, DECIMAL_TYPE(30, 10), DECIMAL_TYPE(30, 10))); + + EXPECT_ARROW_TYPE_EQUALS( + DECIMAL_TYPE(32, 6), + DoOp(DecimalTypeUtil::kOpAdd, DECIMAL_TYPE(30, 6), DECIMAL_TYPE(30, 5))); + + EXPECT_ARROW_TYPE_EQUALS( + DECIMAL_TYPE(38, 9), + DoOp(DecimalTypeUtil::kOpAdd, DECIMAL_TYPE(30, 10), DECIMAL_TYPE(38, 10))); + + EXPECT_ARROW_TYPE_EQUALS( + DECIMAL_TYPE(38, 9), + DoOp(DecimalTypeUtil::kOpAdd, DECIMAL_TYPE(38, 10), DECIMAL_TYPE(38, 38))); + + EXPECT_ARROW_TYPE_EQUALS( + DECIMAL_TYPE(38, 6), + DoOp(DecimalTypeUtil::kOpAdd, DECIMAL_TYPE(38, 10), DECIMAL_TYPE(38, 2))); +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc index da7a6d886c0e0..9aaafea8e498e 100644 --- a/cpp/src/gandiva/engine.cc +++ b/cpp/src/gandiva/engine.cc @@ -39,6 +39,7 @@ #include #include #include +#include "gandiva/decimal_ir.h" #include "gandiva/exported_funcs_registry.h" namespace gandiva { @@ -94,6 +95,10 @@ Status Engine::Make(std::shared_ptr config, auto status = engine_obj->LoadPreCompiledIRFiles(config->byte_code_file_path()); ARROW_RETURN_NOT_OK(status); + // Add decimal functions + status = DecimalIR::AddFunctions(engine_obj.get()); + ARROW_RETURN_NOT_OK(status); + *engine = std::move(engine_obj); return Status::OK(); } @@ -183,7 +188,7 @@ Status Engine::FinalizeModule(bool optimise_ir, bool dump_ir) { // run the optimiser llvm::PassManagerBuilder pass_builder; - pass_builder.OptLevel = 2; + pass_builder.OptLevel = 3; pass_builder.populateModulePassManager(*pass_manager); pass_manager->run(*module_); @@ -222,7 +227,7 @@ void Engine::DumpIR(std::string prefix) { std::string str; llvm::raw_string_ostream stream(str); - module_->print(stream, NULL); + module_->print(stream, nullptr); std::cout << "====" << prefix << "===" << str << "\n"; } diff --git a/cpp/src/gandiva/engine.h b/cpp/src/gandiva/engine.h index f377ebc38d3ef..16b5a56ebdb36 100644 --- a/cpp/src/gandiva/engine.h +++ b/cpp/src/gandiva/engine.h @@ -37,6 +37,8 @@ namespace gandiva { +class FunctionIRBuilder; + /// \brief LLVM Execution engine wrapper. 
class Engine { public: diff --git a/cpp/src/gandiva/expression_registry.cc b/cpp/src/gandiva/expression_registry.cc index fb5a45e779926..1a087c96f33bd 100644 --- a/cpp/src/gandiva/expression_registry.cc +++ b/cpp/src/gandiva/expression_registry.cc @@ -136,10 +136,12 @@ void ExpressionRegistry::AddArrowTypesToVector(arrow::Type::type& type, case arrow::Type::type::NA: vector.push_back(arrow::null()); break; + case arrow::Type::type::DECIMAL: + vector.push_back(arrow::decimal(0, 0)); + break; case arrow::Type::type::FIXED_SIZE_BINARY: case arrow::Type::type::MAP: case arrow::Type::type::INTERVAL: - case arrow::Type::type::DECIMAL: case arrow::Type::type::LIST: case arrow::Type::type::STRUCT: case arrow::Type::type::UNION: diff --git a/cpp/src/gandiva/function_ir_builder.cc b/cpp/src/gandiva/function_ir_builder.cc new file mode 100644 index 0000000000000..194273933cd15 --- /dev/null +++ b/cpp/src/gandiva/function_ir_builder.cc @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/function_ir_builder.h" + +namespace gandiva { + +llvm::Value* FunctionIRBuilder::BuildIfElse(llvm::Value* condition, + llvm::Type* return_type, + std::function then_func, + std::function else_func) { + llvm::IRBuilder<>* builder = ir_builder(); + llvm::Function* function = builder->GetInsertBlock()->getParent(); + DCHECK_NE(function, nullptr); + + // Create blocks for the then, else and merge cases. + llvm::BasicBlock* then_bb = llvm::BasicBlock::Create(*context(), "then", function); + llvm::BasicBlock* else_bb = llvm::BasicBlock::Create(*context(), "else", function); + llvm::BasicBlock* merge_bb = llvm::BasicBlock::Create(*context(), "merge", function); + + builder->CreateCondBr(condition, then_bb, else_bb); + + // Emit the then block. + builder->SetInsertPoint(then_bb); + auto then_value = then_func(); + builder->CreateBr(merge_bb); + + // refresh then_bb for phi (could have changed due to code generation of then_value). + then_bb = builder->GetInsertBlock(); + + // Emit the else block. + builder->SetInsertPoint(else_bb); + auto else_value = else_func(); + builder->CreateBr(merge_bb); + + // refresh else_bb for phi (could have changed due to code generation of else_value). + else_bb = builder->GetInsertBlock(); + + // Emit the merge block. 
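  // (Editor's note) The phi node below selects then_value or else_value depending on
  // which predecessor block control arrives from; then_bb/else_bb were re-read above
  // because emitting the lambdas may itself have created new basic blocks.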
+ builder->SetInsertPoint(merge_bb); + llvm::PHINode* result_value = builder->CreatePHI(return_type, 2, "res_value"); + result_value->addIncoming(then_value, then_bb); + result_value->addIncoming(else_value, else_bb); + return result_value; +} + +llvm::Function* FunctionIRBuilder::BuildFunction(const std::string& function_name, + llvm::Type* return_type, + std::vector in_args) { + std::vector arg_types; + for (auto& arg : in_args) { + arg_types.push_back(arg.type); + } + auto prototype = llvm::FunctionType::get(return_type, arg_types, false /*isVarArg*/); + auto function = llvm::Function::Create(prototype, llvm::GlobalValue::ExternalLinkage, + function_name, module()); + + uint32_t i = 0; + for (auto& fn_arg : function->args()) { + DCHECK_LT(i, in_args.size()); + fn_arg.setName(in_args[i].name); + ++i; + } + return function; +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/function_ir_builder.h b/cpp/src/gandiva/function_ir_builder.h new file mode 100644 index 0000000000000..7d6003a62d5bf --- /dev/null +++ b/cpp/src/gandiva/function_ir_builder.h @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef GANDIVA_FUNCTION_IR_BUILDER_H +#define GANDIVA_FUNCTION_IR_BUILDER_H + +#include +#include +#include +#include + +#include "gandiva/engine.h" +#include "gandiva/gandiva_aliases.h" +#include "gandiva/llvm_types.h" + +namespace gandiva { + +/// @brief Base class for building IR functions. +class FunctionIRBuilder { + public: + explicit FunctionIRBuilder(Engine* engine) : engine_(engine) {} + virtual ~FunctionIRBuilder() = default; + + protected: + LLVMTypes* types() { return engine_->types(); } + llvm::Module* module() { return engine_->module(); } + llvm::LLVMContext* context() { return engine_->context(); } + llvm::IRBuilder<>* ir_builder() { return engine_->ir_builder(); } + + /// Build an if-else block. + llvm::Value* BuildIfElse(llvm::Value* condition, llvm::Type* return_type, + std::function then_func, + std::function else_func); + + struct NamedArg { + std::string name; + llvm::Type* type; + }; + + /// Build llvm fn. 
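  /// (Editor's note) Creates an extern-linkage function with the given name, return
  /// type and named arguments; DecimalIR::BuildAdd uses this to declare
  /// add_decimal128_decimal128(x_value, x_precision, x_scale, ...).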
+ llvm::Function* BuildFunction(const std::string& function_name, llvm::Type* return_type, + std::vector in_args); + + private: + Engine* engine_; +}; + +} // namespace gandiva + +#endif // GANDIVA_FUNCTION_IR_BUILDER_H diff --git a/cpp/src/gandiva/function_registry.cc b/cpp/src/gandiva/function_registry.cc index 83d80b4988690..452cb6339954c 100644 --- a/cpp/src/gandiva/function_registry.cc +++ b/cpp/src/gandiva/function_registry.cc @@ -29,23 +29,6 @@ namespace gandiva { -using arrow::binary; -using arrow::boolean; -using arrow::date64; -using arrow::float32; -using arrow::float64; -using arrow::int16; -using arrow::int32; -using arrow::int64; -using arrow::int8; -using arrow::uint16; -using arrow::uint32; -using arrow::uint64; -using arrow::uint8; -using arrow::utf8; -using std::iterator; -using std::vector; - FunctionRegistry::iterator FunctionRegistry::begin() const { return &(*pc_registry_.begin()); } @@ -89,7 +72,7 @@ SignatureMap FunctionRegistry::InitPCMap() { const NativeFunction* FunctionRegistry::LookupSignature( const FunctionSignature& signature) const { auto got = pc_registry_map_.find(&signature); - return got == pc_registry_map_.end() ? NULL : got->second; + return got == pc_registry_map_.end() ? nullptr : got->second; } } // namespace gandiva diff --git a/cpp/src/gandiva/function_registry_arithmetic.cc b/cpp/src/gandiva/function_registry_arithmetic.cc index 800bc493f0019..c5a798cb4e235 100644 --- a/cpp/src/gandiva/function_registry_arithmetic.cc +++ b/cpp/src/gandiva/function_registry_arithmetic.cc @@ -57,6 +57,8 @@ std::vector GetArithmeticFunctionRegistry() { BINARY_GENERIC_SAFE_NULL_IF_NULL(mod, int64, int32, int32), BINARY_GENERIC_SAFE_NULL_IF_NULL(mod, int64, int64, int64), + BINARY_SYMMETRIC_SAFE_NULL_IF_NULL(add, decimal128), + BINARY_RELATIONAL_BOOL_FN(equal), BINARY_RELATIONAL_BOOL_FN(not_equal), diff --git a/cpp/src/gandiva/function_registry_common.h b/cpp/src/gandiva/function_registry_common.h index 78babce9a7dbf..3ae065a14769d 100644 --- a/cpp/src/gandiva/function_registry_common.h +++ b/cpp/src/gandiva/function_registry_common.h @@ -53,6 +53,7 @@ inline DataTypePtr time32() { return arrow::time32(arrow::TimeUnit::MILLI); } inline DataTypePtr time64() { return arrow::time64(arrow::TimeUnit::MICRO); } inline DataTypePtr timestamp() { return arrow::timestamp(arrow::TimeUnit::MILLI); } +inline DataTypePtr decimal128() { return arrow::decimal(0, 0); } struct KeyHash { std::size_t operator()(const FunctionSignature* k) const { return k->Hash(); } diff --git a/cpp/src/gandiva/function_signature.h b/cpp/src/gandiva/function_signature.h index e5dff245b158f..ee82abc367e20 100644 --- a/cpp/src/gandiva/function_signature.h +++ b/cpp/src/gandiva/function_signature.h @@ -56,10 +56,22 @@ class FunctionSignature { std::string ToString() const; private: - // TODO : for some of the types, this shouldn't match type specific data. eg. for - // decimals, this shouldn't match precision/scale. bool DataTypeEquals(const DataTypePtr left, const DataTypePtr right) const { - return left->Equals(right); + if (left->id() == right->id()) { + switch (left->id()) { + case arrow::Type::DECIMAL: { + // For decimal types, the precision/scale isn't part of the signature. 
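        // (Editor's note) e.g. decimal(38, 10) and decimal(20, 2) resolve to the same
        // signature; only the byte width (16 bytes for decimal128) has to match.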
+ auto dleft = arrow::internal::checked_cast(left.get()); + auto dright = arrow::internal::checked_cast(right.get()); + return (dleft != NULL) && (dright != NULL) && + (dleft->byte_width() == dright->byte_width()); + } + default: + return left->Equals(right); + } + } else { + return false; + } } std::string base_name_; diff --git a/cpp/src/gandiva/jni/CMakeLists.txt b/cpp/src/gandiva/jni/CMakeLists.txt index a07d3903a75ac..afc7fadeed4ad 100644 --- a/cpp/src/gandiva/jni/CMakeLists.txt +++ b/cpp/src/gandiva/jni/CMakeLists.txt @@ -78,5 +78,5 @@ add_dependencies(gandiva ${GANDIVA_JNI_LIBRARIES}) # statically linked stdc++ has conflicts with stdc++ loaded by other libraries. if (NOT APPLE) set_target_properties(gandiva_jni_shared PROPERTIES - LINK_FLAGS "-Wl,--version-script=${CMAKE_SOURCE_DIR}/src/gandiva/jni/symbols.map") + LINK_FLAGS "-Wl,--no-as-needed -Wl,--version-script=${CMAKE_SOURCE_DIR}/src/gandiva/jni/symbols.map") endif() diff --git a/cpp/src/gandiva/jni/expression_registry_helper.cc b/cpp/src/gandiva/jni/expression_registry_helper.cc index 5227329db472a..b5c6880a25cf1 100644 --- a/cpp/src/gandiva/jni/expression_registry_helper.cc +++ b/cpp/src/gandiva/jni/expression_registry_helper.cc @@ -121,10 +121,15 @@ void ArrowToProtobuf(DataTypePtr type, types::ExtGandivaType* gandiva_data_type) case arrow::Type::type::NA: gandiva_data_type->set_type(types::GandivaType::NONE); break; + case arrow::Type::type::DECIMAL: { + gandiva_data_type->set_type(types::GandivaType::DECIMAL); + gandiva_data_type->set_precision(0); + gandiva_data_type->set_scale(0); + break; + } case arrow::Type::type::FIXED_SIZE_BINARY: case arrow::Type::type::MAP: case arrow::Type::type::INTERVAL: - case arrow::Type::type::DECIMAL: case arrow::Type::type::LIST: case arrow::Type::type::STRUCT: case arrow::Type::type::UNION: diff --git a/cpp/src/gandiva/jni/jni_common.cc b/cpp/src/gandiva/jni/jni_common.cc index 639ad361f4a8a..7ad0d6d6ff449 100644 --- a/cpp/src/gandiva/jni/jni_common.cc +++ b/cpp/src/gandiva/jni/jni_common.cc @@ -381,6 +381,12 @@ NodePtr ProtoTypeToNode(const types::TreeNode& node) { return TreeExprBuilder::MakeBinaryLiteral(node.binarynode().value()); } + if (node.has_decimalnode()) { + std::string value = node.decimalnode().value(); + gandiva::Decimal128Full literal(value, node.decimalnode().precision(), + node.decimalnode().scale()); + return TreeExprBuilder::MakeDecimalLiteral(literal); + } std::cerr << "Unknown node type in protobuf\n"; return nullptr; } diff --git a/cpp/src/gandiva/literal_holder.h b/cpp/src/gandiva/literal_holder.h index 0a65ea2c3e249..ad6afcea1f413 100644 --- a/cpp/src/gandiva/literal_holder.h +++ b/cpp/src/gandiva/literal_holder.h @@ -22,11 +22,14 @@ #include +#include +#include "gandiva/decimal_full.h" + namespace gandiva { using LiteralHolder = boost::variant; + uint16_t, uint32_t, uint64_t, std::string, Decimal128Full>; } // namespace gandiva diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index 50f147b2fc7dd..9ddbe93fa68ff 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -399,6 +399,17 @@ llvm::Value* LLVMGenerator::AddFunctionCall(const std::string& full_name, return value; } +std::shared_ptr LLVMGenerator::BuildDecimalLValue(llvm::Value* value, + DataTypePtr arrow_type) { + // only decimals of size 128-bit supported. 
+ DCHECK(is_decimal_128(arrow_type)); + auto decimal_type = + arrow::internal::checked_cast(arrow_type.get()); + return std::make_shared(value, nullptr, + types()->i32_constant(decimal_type->precision()), + types()->i32_constant(decimal_type->scale())); +} + #define ADD_VISITOR_TRACE(...) \ if (generator_->enable_ir_traces_) { \ generator_->AddTrace(__VA_ARGS__); \ @@ -422,20 +433,33 @@ LLVMGenerator::Visitor::Visitor(LLVMGenerator* generator, llvm::Function* functi void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueDex& dex) { llvm::IRBuilder<>* builder = ir_builder(); - llvm::Value* slot_ref = GetBufferReference(dex.DataIdx(), kBufferTypeData, dex.Field()); - llvm::Value* slot_value; - if (dex.FieldType()->id() == arrow::Type::BOOL) { - slot_value = generator_->GetPackedBitValue(slot_ref, loop_var_); - } else { - llvm::Value* slot_offset = builder->CreateGEP(slot_ref, loop_var_); - slot_value = builder->CreateLoad(slot_offset, dex.FieldName()); - } + std::shared_ptr lvalue; + + switch (dex.FieldType()->id()) { + case arrow::Type::BOOL: + slot_value = generator_->GetPackedBitValue(slot_ref, loop_var_); + lvalue = std::make_shared(slot_value); + break; + case arrow::Type::DECIMAL: { + auto slot_offset = builder->CreateGEP(slot_ref, loop_var_); + slot_value = builder->CreateLoad(slot_offset, dex.FieldName()); + lvalue = generator_->BuildDecimalLValue(slot_value, dex.FieldType()); + break; + } + + default: { + auto slot_offset = builder->CreateGEP(slot_ref, loop_var_); + slot_value = builder->CreateLoad(slot_offset, dex.FieldName()); + lvalue = std::make_shared(slot_value); + break; + } + } ADD_VISITOR_TRACE("visit fixed-len data vector " + dex.FieldName() + " value %T", slot_value); - result_.reset(new LValue(slot_value)); + result_ = lvalue; } void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueDex& dex) { @@ -572,6 +596,19 @@ void LLVMGenerator::Visitor::Visit(const LiteralDex& dex) { value = types->i64_constant(boost::get(dex.holder())); break; + case arrow::Type::DECIMAL: { + // build code for struct + auto decimal_value = boost::get(dex.holder()); + auto int_value = + llvm::ConstantInt::get(llvm::Type::getInt128Ty(*generator_->context()), + decimal_value.value().ToIntegerString(), 10); + auto type = arrow::decimal(decimal_value.precision(), decimal_value.scale()); + auto lvalue = generator_->BuildDecimalLValue(int_value, type); + // set it as the l-value and return. + result_ = lvalue; + return; + } + default: DCHECK(0); } @@ -589,13 +626,14 @@ void LLVMGenerator::Visitor::Visit(const NonNullableFuncDex& dex) { auto params = BuildParams(dex.function_holder().get(), dex.args(), false, native_function->NeedsContext()); + auto arrow_return_type = dex.func_descriptor()->return_type(); if (native_function->CanReturnErrors()) { // slow path : if a function can return errors, skip invoking the function // unless all of the input args are valid. Otherwise, it can cause spurious errors. llvm::IRBuilder<>* builder = ir_builder(); LLVMTypes* types = generator_->types(); - auto arrow_type_id = native_function->signature().ret_type()->id(); + auto arrow_type_id = arrow_return_type->id(); auto result_type = types->IRType(arrow_type_id); // Build combined validity of the args. 
@@ -609,7 +647,7 @@ void LLVMGenerator::Visitor::Visit(const NonNullableFuncDex& dex) { auto then_lambda = [&] { ADD_VISITOR_TRACE("fn " + function_name + " can return errors : all args valid, invoke fn"); - return BuildFunctionCall(native_function, ¶ms); + return BuildFunctionCall(native_function, arrow_return_type, ¶ms); }; // else block @@ -624,10 +662,10 @@ void LLVMGenerator::Visitor::Visit(const NonNullableFuncDex& dex) { return std::make_shared(else_value, else_value_len); }; - result_ = BuildIfElse(is_valid, then_lambda, else_lambda, result_type); + result_ = BuildIfElse(is_valid, then_lambda, else_lambda, arrow_return_type); } else { // fast path : invoke function without computing validities. - result_ = BuildFunctionCall(native_function, ¶ms); + result_ = BuildFunctionCall(native_function, arrow_return_type, ¶ms); } } @@ -639,7 +677,8 @@ void LLVMGenerator::Visitor::Visit(const NullableNeverFuncDex& dex) { auto params = BuildParams(dex.function_holder().get(), dex.args(), true, native_function->NeedsContext()); - result_ = BuildFunctionCall(native_function, ¶ms); + auto arrow_return_type = dex.func_descriptor()->return_type(); + result_ = BuildFunctionCall(native_function, arrow_return_type, ¶ms); } void LLVMGenerator::Visitor::Visit(const NullableInternalFuncDex& dex) { @@ -659,7 +698,8 @@ void LLVMGenerator::Visitor::Visit(const NullableInternalFuncDex& dex) { new llvm::AllocaInst(types->i8_type(), 0, "result_valid", entry_block_); params.push_back(result_valid_ptr); - result_ = BuildFunctionCall(native_function, ¶ms); + auto arrow_return_type = dex.func_descriptor()->return_type(); + result_ = BuildFunctionCall(native_function, arrow_return_type, ¶ms); // load the result validity and truncate to i1. llvm::Value* result_valid_i8 = builder->CreateLoad(result_valid_ptr); @@ -672,7 +712,6 @@ void LLVMGenerator::Visitor::Visit(const NullableInternalFuncDex& dex) { void LLVMGenerator::Visitor::Visit(const IfDex& dex) { ADD_VISITOR_TRACE("visit IfExpression"); llvm::IRBuilder<>* builder = ir_builder(); - LLVMTypes* types = generator_->types(); // Evaluate condition. LValuePtr if_condition = BuildValueAndValidity(dex.condition_vv()); @@ -714,9 +753,8 @@ void LLVMGenerator::Visitor::Visit(const IfDex& dex) { }; // build the if-else condition. - auto result_type = types->IRType(dex.result_type()->id()); - result_ = BuildIfElse(validAndMatched, then_lambda, else_lambda, result_type); - if (result_type == types->i8_ptr_type()) { + result_ = BuildIfElse(validAndMatched, then_lambda, else_lambda, dex.result_type()); + if (arrow::is_binary_like(dex.result_type()->id())) { ADD_VISITOR_TRACE("IfElse result length %T", result_->length()); } ADD_VISITOR_TRACE("IfElse result value %T", result_->data()); @@ -906,7 +944,7 @@ void LLVMGenerator::Visitor::VisitInExpression(const InExprDexBase& dex) { LValuePtr LLVMGenerator::Visitor::BuildIfElse(llvm::Value* condition, std::function then_func, std::function else_func, - llvm::Type* result_type) { + DataTypePtr result_type) { llvm::IRBuilder<>* builder = ir_builder(); llvm::LLVMContext* context = generator_->context(); LLVMTypes* types = generator_->types(); @@ -936,17 +974,31 @@ LValuePtr LLVMGenerator::Visitor::BuildIfElse(llvm::Value* condition, // Emit the merge block. 
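  // (Editor's note) After the phi on the data value, the merged result is wrapped
  // according to the arrow result type: STRING additionally gets a length phi, DECIMAL
  // goes through BuildDecimalLValue so precision/scale are carried along, and all other
  // types become a plain LValue.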
builder->SetInsertPoint(merge_bb); - llvm::PHINode* result_value = builder->CreatePHI(result_type, 2, "res_value"); + auto llvm_type = types->IRType(result_type->id()); + llvm::PHINode* result_value = builder->CreatePHI(llvm_type, 2, "res_value"); result_value->addIncoming(then_lvalue->data(), then_bb); result_value->addIncoming(else_lvalue->data(), else_bb); - llvm::PHINode* result_length = nullptr; - if (result_type == types->i8_ptr_type()) { - result_length = builder->CreatePHI(types->i32_type(), 2, "res_length"); - result_length->addIncoming(then_lvalue->length(), then_bb); - result_length->addIncoming(else_lvalue->length(), else_bb); + LValuePtr ret; + switch (result_type->id()) { + case arrow::Type::STRING: { + llvm::PHINode* result_length; + result_length = builder->CreatePHI(types->i32_type(), 2, "res_length"); + result_length->addIncoming(then_lvalue->length(), then_bb); + result_length->addIncoming(else_lvalue->length(), else_bb); + ret = std::make_shared(result_value, result_length); + break; + } + + case arrow::Type::DECIMAL: + ret = generator_->BuildDecimalLValue(result_value, result_type); + break; + + default: + ret = std::make_shared(result_value); + break; } - return std::make_shared(result_value, result_length); + return ret; } LValuePtr LLVMGenerator::Visitor::BuildValueAndValidity(const ValueValidityPair& pair) { @@ -963,25 +1015,46 @@ LValuePtr LLVMGenerator::Visitor::BuildValueAndValidity(const ValueValidityPair& } LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, + DataTypePtr arrow_return_type, std::vector* params) { - auto arrow_return_type = func->signature().ret_type()->id(); - auto llvm_return_type = generator_->types()->IRType(arrow_return_type); - - // add extra arg for return length for variable len return types (alloced on stack). - llvm::AllocaInst* result_len_ptr = nullptr; - if (arrow::is_binary_like(arrow_return_type)) { - result_len_ptr = new llvm::AllocaInst(generator_->types()->i32_type(), 0, - "result_len", entry_block_); - params->push_back(result_len_ptr); - has_arena_allocs_ = true; - } + auto types = generator_->types(); + auto arrow_return_type_id = arrow_return_type->id(); + auto llvm_return_type = types->IRType(arrow_return_type_id); + + if (arrow_return_type_id == arrow::Type::DECIMAL) { + // For decimal fns, the output precision/scale are passed along as parameters. + // + // convert from this : + // out = add_decimal(v1, p1, s1, v2, p2, s2) + // to: + // out = add_decimal(v1, p1, s1, v2, p2, s2, out_p, out_s) + + // Append the out_precision and out_scale + auto ret_lvalue = generator_->BuildDecimalLValue(nullptr, arrow_return_type); + params->push_back(ret_lvalue->precision()); + params->push_back(ret_lvalue->scale()); + + // Make the function call + auto out = generator_->AddFunctionCall(func->pc_name(), llvm_return_type, *params); + ret_lvalue->set_data(out); + return ret_lvalue; + } else { + // add extra arg for return length for variable len return types (alloced on stack). + llvm::AllocaInst* result_len_ptr = nullptr; + if (arrow::is_binary_like(arrow_return_type_id)) { + result_len_ptr = new llvm::AllocaInst(generator_->types()->i32_type(), 0, + "result_len", entry_block_); + params->push_back(result_len_ptr); + has_arena_allocs_ = true; + } - // Make the function call - llvm::IRBuilder<>* builder = ir_builder(); - auto value = generator_->AddFunctionCall(func->pc_name(), llvm_return_type, *params); - auto value_len = - (result_len_ptr == nullptr) ? 
nullptr : builder->CreateLoad(result_len_ptr); - return std::make_shared(value, value_len); + // Make the function call + llvm::IRBuilder<>* builder = ir_builder(); + auto value = generator_->AddFunctionCall(func->pc_name(), llvm_return_type, *params); + auto value_len = + (result_len_ptr == nullptr) ? nullptr : builder->CreateLoad(result_len_ptr); + return std::make_shared(value, value_len); + } } std::vector LLVMGenerator::Visitor::BuildParams( @@ -1007,12 +1080,9 @@ std::vector LLVMGenerator::Visitor::BuildParams( DexPtr value_expr = pair->value_expr(); value_expr->Accept(*this); LValue& result_ref = *result(); - params.push_back(result_ref.data()); - // build length (for var len data types) - if (result_ref.length() != nullptr) { - params.push_back(result_ref.length()); - } + // append all the parameters corresponding to this LValue. + result_ref.AppendFunctionParams(¶ms); // build validity. if (with_validity) { diff --git a/cpp/src/gandiva/llvm_generator.h b/cpp/src/gandiva/llvm_generator.h index 49f209d280d13..937e5acc87b2e 100644 --- a/cpp/src/gandiva/llvm_generator.h +++ b/cpp/src/gandiva/llvm_generator.h @@ -119,12 +119,13 @@ class LLVMGenerator { bool with_validity, bool with_context); // Generate code to onvoke a function call. - LValuePtr BuildFunctionCall(const NativeFunction* func, + LValuePtr BuildFunctionCall(const NativeFunction* func, DataTypePtr arrow_return_type, std::vector* params); // Generate code for an if-else condition. LValuePtr BuildIfElse(llvm::Value* condition, std::function then_func, - std::function else_func, llvm::Type* result_type); + std::function else_func, + DataTypePtr arrow_return_type); // Switch to the entry_block and get reference of the validity/value/offsets buffer llvm::Value* GetBufferReference(int idx, BufferType buffer_type, FieldPtr field); @@ -184,6 +185,10 @@ class LLVMGenerator { void ClearPackedBitValueIfFalse(llvm::Value* bitmap, llvm::Value* position, llvm::Value* value); + // Generate code to build a DecimalLValue with specified value/precision/scale. + std::shared_ptr BuildDecimalLValue(llvm::Value* value, + DataTypePtr arrow_type); + /// Generate code to make a function call (to a pre-compiled IR function) which takes /// 'args' and has a return type 'ret_type'. 
llvm::Value* AddFunctionCall(const std::string& full_name, llvm::Type* ret_type, diff --git a/cpp/src/gandiva/llvm_types.cc b/cpp/src/gandiva/llvm_types.cc index 0b89d96e3fb02..18ff627a5651f 100644 --- a/cpp/src/gandiva/llvm_types.cc +++ b/cpp/src/gandiva/llvm_types.cc @@ -40,6 +40,7 @@ LLVMTypes::LLVMTypes(llvm::LLVMContext& context) : context_(context) { {arrow::Type::type::TIMESTAMP, i64_type()}, {arrow::Type::type::STRING, i8_ptr_type()}, {arrow::Type::type::BINARY, i8_ptr_type()}, + {arrow::Type::type::DECIMAL, i128_type()}, }; } diff --git a/cpp/src/gandiva/llvm_types.h b/cpp/src/gandiva/llvm_types.h index dab47d059f7f2..9cf4dd5d1c850 100644 --- a/cpp/src/gandiva/llvm_types.h +++ b/cpp/src/gandiva/llvm_types.h @@ -43,6 +43,8 @@ class LLVMTypes { llvm::Type* i64_type() { return llvm::Type::getInt64Ty(context_); } + llvm::Type* i128_type() { return llvm::Type::getInt128Ty(context_); } + llvm::Type* float_type() { return llvm::Type::getFloatTy(context_); } llvm::Type* double_type() { return llvm::Type::getDoubleTy(context_); } @@ -53,12 +55,19 @@ class LLVMTypes { llvm::PointerType* i64_ptr_type() { return llvm::PointerType::get(i64_type(), 0); } - llvm::PointerType* ptr_type(llvm::Type* base_type) { - return llvm::PointerType::get(base_type, 0); + llvm::PointerType* i128_ptr_type() { return llvm::PointerType::get(i128_type(), 0); } + + llvm::StructType* i128_split_type() { + // struct with high/low bits (see decimal_ops.cc:DecimalSplit) + return llvm::StructType::get(context_, {i64_type(), i64_type()}, false); } llvm::Type* void_type() { return llvm::Type::getVoidTy(context_); } + llvm::PointerType* ptr_type(llvm::Type* base_type) { + return llvm::PointerType::get(base_type, 0); + } + llvm::Constant* true_constant() { return llvm::ConstantInt::get(context_, llvm::APInt(1, 1)); } @@ -87,6 +96,18 @@ class LLVMTypes { return llvm::ConstantInt::get(context_, llvm::APInt(64, val)); } + llvm::Constant* i128_constant(int64_t val) { + return llvm::ConstantInt::get(context_, llvm::APInt(128, val)); + } + + llvm::Constant* i128_zero() { + return llvm::ConstantInt::get(context_, llvm::APInt(128, 0)); + } + + llvm::Constant* i128_one() { + return llvm::ConstantInt::get(context_, llvm::APInt(128, 1)); + } + llvm::Constant* float_constant(float val) { return llvm::ConstantFP::get(float_type(), val); } diff --git a/cpp/src/gandiva/lvalue.h b/cpp/src/gandiva/lvalue.h index 2ff03dcdd9c56..ce5040f6c37a6 100644 --- a/cpp/src/gandiva/lvalue.h +++ b/cpp/src/gandiva/lvalue.h @@ -18,9 +18,11 @@ #ifndef GANDIVA_LVALUE_H #define GANDIVA_LVALUE_H -#include "arrow/util/macros.h" +#include #include +#include "arrow/util/macros.h" +#include "gandiva/logging.h" namespace gandiva { @@ -30,17 +32,48 @@ class LValue { explicit LValue(llvm::Value* data, llvm::Value* length = NULLPTR, llvm::Value* validity = NULLPTR) : data_(data), length_(length), validity_(validity) {} + virtual ~LValue() = default; llvm::Value* data() { return data_; } llvm::Value* length() { return length_; } llvm::Value* validity() { return validity_; } + void set_data(llvm::Value* data) { data_ = data; } + + // Append the params required when passing this as a function parameter. 
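  // (Editor's note) The base LValue contributes its data (plus length for variable-len
  // types); DecimalLValue below additionally appends precision and scale, which is how
  // decimal arguments become the (value, precision, scale) triples expected by the
  // precompiled decimal functions.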
+ virtual void AppendFunctionParams(std::vector* params) { + params->push_back(data_); + if (length_ != NULLPTR) { + params->push_back(length_); + } + } + private: llvm::Value* data_; llvm::Value* length_; llvm::Value* validity_; }; +class DecimalLValue : public LValue { + public: + DecimalLValue(llvm::Value* data, llvm::Value* validity, llvm::Value* precision, + llvm::Value* scale) + : LValue(data, NULLPTR, validity), precision_(precision), scale_(scale) {} + + llvm::Value* precision() { return precision_; } + llvm::Value* scale() { return scale_; } + + void AppendFunctionParams(std::vector* params) override { + LValue::AppendFunctionParams(params); + params->push_back(precision_); + params->push_back(scale_); + } + + private: + llvm::Value* precision_; + llvm::Value* scale_; +}; + } // namespace gandiva #endif // GANDIVA_LVALUE_H diff --git a/cpp/src/gandiva/precompiled/CMakeLists.txt b/cpp/src/gandiva/precompiled/CMakeLists.txt index 21a74bd4916ee..eab0b9007b09e 100644 --- a/cpp/src/gandiva/precompiled/CMakeLists.txt +++ b/cpp/src/gandiva/precompiled/CMakeLists.txt @@ -20,12 +20,16 @@ project(gandiva) set(PRECOMPILED_SRCS arithmetic_ops.cc bitmap.cc + decimal_ops.cc + decimal_wrapper.cc extended_math_ops.cc hash.cc print.cc string_ops.cc time.cc - timestamp_arithmetic.cc) + timestamp_arithmetic.cc + ../../arrow/status.cc + ../../arrow/util/decimal.cc) # Create bitcode for each of the source files. foreach(SRC_FILE ${PRECOMPILED_SRCS}) @@ -35,7 +39,10 @@ foreach(SRC_FILE ${PRECOMPILED_SRCS}) add_custom_command( OUTPUT ${BC_FILE} COMMAND ${CLANG_EXECUTABLE} - -std=c++11 -emit-llvm -O2 -c ${ABSOLUTE_SRC} -o ${BC_FILE} + -std=c++11 -emit-llvm + -DNDEBUG # DCHECK macros not implemented in precompiled code + -fno-use-cxa-atexit # Workaround for unresolved __dso_handle + -O3 -c ${ABSOLUTE_SRC} -o ${BC_FILE} -I${CMAKE_SOURCE_DIR}/src DEPENDS ${SRC_FILE}) list(APPEND BC_FILES ${BC_FILE}) @@ -77,4 +84,5 @@ if (ARROW_BUILD_TESTS) add_precompiled_unit_test(string_ops_test.cc string_ops.cc ../context_helper.cc) add_precompiled_unit_test(arithmetic_ops_test.cc arithmetic_ops.cc ../context_helper.cc) add_precompiled_unit_test(extended_math_ops_test.cc extended_math_ops.cc ../context_helper.cc) + add_precompiled_unit_test(decimal_ops_test.cc decimal_ops.cc ../decimal_type_util.cc) endif() diff --git a/cpp/src/gandiva/precompiled/decimal_ops.cc b/cpp/src/gandiva/precompiled/decimal_ops.cc new file mode 100644 index 0000000000000..57cb83e222367 --- /dev/null +++ b/cpp/src/gandiva/precompiled/decimal_ops.cc @@ -0,0 +1,219 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
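The DecimalLValue added to lvalue.h above is what lets BuildParams stay agnostic of how many IR values a single logical argument expands to: a scalar contributes only its value, a varlen value also contributes its length, and a decimal additionally contributes precision and scale for the precompiled wrapper. Below is a minimal, self-contained sketch of that expansion pattern; it uses plain integers as stand-ins for llvm::Value*, and the "Sketch" class names are illustrative rather than part of the patch.

#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in for llvm::Value*; the real generator appends IR values instead.
using Param = int64_t;

class LValueSketch {
 public:
  explicit LValueSketch(Param data, Param length = -1) : data_(data), length_(length) {}
  virtual ~LValueSketch() = default;

  // Scalars push one param; varlen values also push their length.
  virtual void AppendFunctionParams(std::vector<Param>* params) {
    params->push_back(data_);
    if (length_ >= 0) {
      params->push_back(length_);
    }
  }

 private:
  Param data_;
  Param length_;
};

class DecimalLValueSketch : public LValueSketch {
 public:
  DecimalLValueSketch(Param value, Param precision, Param scale)
      : LValueSketch(value), precision_(precision), scale_(scale) {}

  // Decimals additionally push precision and scale so the precompiled
  // function can interpret the 128-bit value correctly.
  void AppendFunctionParams(std::vector<Param>* params) override {
    LValueSketch::AppendFunctionParams(params);
    params->push_back(precision_);
    params->push_back(scale_);
  }

 private:
  Param precision_;
  Param scale_;
};

int main() {
  std::vector<Param> params;
  DecimalLValueSketch dec(/*value=*/101, /*precision=*/38, /*scale=*/6);
  dec.AppendFunctionParams(&params);  // params is now {101, 38, 6}
  for (Param p : params) std::cout << p << ' ';
  std::cout << std::endl;
  return 0;
}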
+ +// Alogrithms adapted from Apache Impala + +#include "gandiva/precompiled/decimal_ops.h" + +#include + +#include "gandiva/decimal_type_util.h" +#include "gandiva/logging.h" + +namespace gandiva { +namespace decimalops { + +static Decimal128 CheckAndIncreaseScale(Decimal128 in, int32_t delta) { + return (delta <= 0) ? in : in.IncreaseScaleBy(delta); +} + +static Decimal128 CheckAndReduceScale(Decimal128 in, int32_t delta) { + return (delta <= 0) ? in : in.ReduceScaleBy(delta); +} + +/// Adjust x and y to the same scale, and add them. +static Decimal128 AddFastPath(const Decimal128Full& x, const Decimal128Full& y, + int32_t out_scale) { + auto higher_scale = std::max(x.scale(), y.scale()); + + auto x_scaled = CheckAndIncreaseScale(x.value(), higher_scale - x.scale()); + auto y_scaled = CheckAndIncreaseScale(y.value(), higher_scale - y.scale()); + return x_scaled + y_scaled; +} + +/// Add x and y, caller has ensured there can be no overflow. +static Decimal128 AddNoOverflow(const Decimal128Full& x, const Decimal128Full& y, + int32_t out_scale) { + auto higher_scale = std::max(x.scale(), y.scale()); + auto sum = AddFastPath(x, y, out_scale); + return CheckAndReduceScale(sum, higher_scale - out_scale); +} + +/// Both x_value and y_value must be >= 0 +static Decimal128 AddLargePositive(const Decimal128Full& x, const Decimal128Full& y, + int32_t out_scale) { + DCHECK_GE(x.value(), 0); + DCHECK_GE(y.value(), 0); + + // separate out whole/fractions. + Decimal128 x_left, x_right, y_left, y_right; + x.value().GetWholeAndFraction(x.scale(), &x_left, &x_right); + y.value().GetWholeAndFraction(y.scale(), &y_left, &y_right); + + // Adjust fractional parts to higher scale. + auto higher_scale = std::max(x.scale(), y.scale()); + auto x_right_scaled = CheckAndIncreaseScale(x_right, higher_scale - x.scale()); + auto y_right_scaled = CheckAndIncreaseScale(y_right, higher_scale - y.scale()); + + Decimal128 right; + Decimal128 carry_to_left; + auto multiplier = Decimal128::GetScaleMultiplier(higher_scale); + if (x_right_scaled >= multiplier - y_right_scaled) { + right = x_right_scaled - (multiplier - y_right_scaled); + carry_to_left = 1; + } else { + right = x_right_scaled + y_right_scaled; + carry_to_left = 0; + } + right = CheckAndReduceScale(right, higher_scale - out_scale); + + auto left = x_left + y_left + carry_to_left; + return (left * Decimal128::GetScaleMultiplier(out_scale)) + right; +} + +/// x_value and y_value cannot be 0, and one must be positive and the other negative. +static Decimal128 AddLargeNegative(const Decimal128Full& x, const Decimal128Full& y, + int32_t out_scale) { + DCHECK_NE(x.value(), 0); + DCHECK_NE(y.value(), 0); + DCHECK((x.value() < 0 && y.value() > 0) || (x.value() > 0 && y.value() < 0)); + + // separate out whole/fractions. + Decimal128 x_left, x_right, y_left, y_right; + x.value().GetWholeAndFraction(x.scale(), &x_left, &x_right); + y.value().GetWholeAndFraction(y.scale(), &y_left, &y_right); + + // Adjust fractional parts to higher scale. + auto higher_scale = std::max(x.scale(), y.scale()); + x_right = CheckAndIncreaseScale(x_right, higher_scale - x.scale()); + y_right = CheckAndIncreaseScale(y_right, higher_scale - y.scale()); + + // Overflow not possible because one is +ve and the other is -ve. + auto left = x_left + y_left; + auto right = x_right + y_right; + + // If the whole and fractional parts have different signs, then we need to make the + // fractional part have the same sign as the whole part. 
If either left or right is + // zero, then nothing needs to be done. + if (left < 0 && right > 0) { + left += 1; + right -= Decimal128::GetScaleMultiplier(higher_scale); + } else if (left > 0 && right < 0) { + left -= 1; + right += Decimal128::GetScaleMultiplier(higher_scale); + } + right = CheckAndReduceScale(right, higher_scale - out_scale); + return (left * Decimal128::GetScaleMultiplier(out_scale)) + right; +} + +static Decimal128 AddLarge(const Decimal128Full& x, const Decimal128Full& y, + int32_t out_scale) { + if (x.value() >= 0 && y.value() >= 0) { + // both positive or 0 + return AddLargePositive(x, y, out_scale); + } else if (x.value() <= 0 && y.value() <= 0) { + // both negative or 0 + Decimal128Full x_neg(-x.value(), x.precision(), x.scale()); + Decimal128Full y_neg(-y.value(), y.precision(), y.scale()); + return -AddLargePositive(x_neg, y_neg, out_scale); + } else { + // one positive and the other negative + return AddLargeNegative(x, y, out_scale); + } +} + +// Suppose we have a number that requires x bits to be represented and we scale it up by +// 10^scale_by. Let's say now y bits are required to represent it. This function returns +// the maximum possible y - x for a given 'scale_by'. +inline int32_t MaxBitsRequiredIncreaseAfterScaling(int32_t scale_by) { + // We rely on the following formula: + // bits_required(x * 10^y) <= bits_required(x) + floor(log2(10^y)) + 1 + // We precompute floor(log2(10^x)) + 1 for x = 0, 1, 2...75, 76 + DCHECK_GE(scale_by, 0); + DCHECK_LE(scale_by, 76); + static const int32_t floor_log2_plus_one[] = { + 0, 4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37, 40, 44, 47, 50, + 54, 57, 60, 64, 67, 70, 74, 77, 80, 84, 87, 90, 94, 97, 100, 103, + 107, 110, 113, 117, 120, 123, 127, 130, 133, 137, 140, 143, 147, 150, 153, 157, + 160, 163, 167, 170, 173, 177, 180, 183, 187, 190, 193, 196, 200, 203, 206, 210, + 213, 216, 220, 223, 226, 230, 233, 236, 240, 243, 246, 250, 253}; + return floor_log2_plus_one[scale_by]; +} + +// If we have a number with 'num_lz' leading zeros, and we scale it up by 10^scale_by, +// this function returns the minimum number of leading zeros the result can have. +inline int32_t MinLeadingZerosAfterScaling(int32_t num_lz, int32_t scale_by) { + DCHECK_GE(scale_by, 0); + DCHECK_LE(scale_by, 76); + int32_t result = num_lz - MaxBitsRequiredIncreaseAfterScaling(scale_by); + return result; +} + +// Returns the maximum possible number of bits required to represent num * 10^scale_by. +inline int32_t MaxBitsRequiredAfterScaling(const Decimal128Full& num, int32_t scale_by) { + auto value = num.value(); + auto value_abs = value.Abs(); + + int32_t num_occupied = 128 - value_abs.CountLeadingBinaryZeros(); + DCHECK_GE(scale_by, 0); + DCHECK_LE(scale_by, 76); + return num_occupied + MaxBitsRequiredIncreaseAfterScaling(scale_by); +} + +// Returns the minimum number of leading zero x or y would have after one of them gets +// scaled up to match the scale of the other one. 
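// As a concrete check of the bound computed by MaxBitsRequiredIncreaseAfterScaling
// above: scaling by 10^3 multiplies a value by 1000, and floor(log2(1000)) + 1 =
// 9 + 1 = 10, matching floor_log2_plus_one[3]. For instance, 999 fits in 10 bits
// while 999 * 1000 = 999000 needs 20 bits, an increase of exactly 10, so the bound
// returned for scale_by = 3 is tight.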
+inline int32_t MinLeadingZeros(const Decimal128Full& x, const Decimal128Full& y) { + auto x_value = x.value(); + auto x_value_abs = x_value.Abs(); + + auto y_value = y.value(); + auto y_value_abs = y_value.Abs(); + + int32_t x_lz = x_value_abs.CountLeadingBinaryZeros(); + int32_t y_lz = y_value_abs.CountLeadingBinaryZeros(); + if (x.scale() < y.scale()) { + x_lz = MinLeadingZerosAfterScaling(x_lz, y.scale() - x.scale()); + } else if (x.scale() > y.scale()) { + y_lz = MinLeadingZerosAfterScaling(y_lz, x.scale() - y.scale()); + } + return std::min(x_lz, y_lz); +} + +Decimal128 Add(const Decimal128Full& x, const Decimal128Full& y, int32_t out_precision, + int32_t out_scale) { + if (out_precision < DecimalTypeUtil::kMaxPrecision) { + // fast-path add + return AddFastPath(x, y, out_scale); + } else { + int32_t min_lz = MinLeadingZeros(x, y); + if (min_lz >= 3) { + // If both numbers have at least MIN_LZ leading zeros, we can add them directly + // without the risk of overflow. + // We want the result to have at least 2 leading zeros, which ensures that it fits + // into the maximum decimal because 2^126 - 1 < 10^38 - 1. If both x and y have at + // least 3 leading zeros, then we are guaranteed that the result will have at lest 2 + // leading zeros. + return AddNoOverflow(x, y, out_scale); + } else { + // slower-version : add whole/fraction parts separately, and then, combine. + return AddLarge(x, y, out_scale); + } + } +} + +} // namespace decimalops +} // namespace gandiva diff --git a/cpp/src/gandiva/precompiled/decimal_ops.h b/cpp/src/gandiva/precompiled/decimal_ops.h new file mode 100644 index 0000000000000..25f094e4a8faa --- /dev/null +++ b/cpp/src/gandiva/precompiled/decimal_ops.h @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef DECIMAL_SQL_H +#define DECIMAL_SQL_H + +#include +#include +#include "gandiva/decimal_full.h" + +namespace gandiva { +namespace decimalops { + +/// Return the sum of 'x' and 'y'. +/// out_precision and out_scale are passed along for efficiency, they must match +/// the rules in DecimalTypeSql::GetResultType. +Decimal128 Add(const Decimal128Full& x, const Decimal128Full& y, int32_t out_precision, + int32_t out_scale); + +} // namespace decimalops +} // namespace gandiva + +#endif // DECIMAL_SQL_H diff --git a/cpp/src/gandiva/precompiled/decimal_ops_test.cc b/cpp/src/gandiva/precompiled/decimal_ops_test.cc new file mode 100644 index 0000000000000..7daf734509b20 --- /dev/null +++ b/cpp/src/gandiva/precompiled/decimal_ops_test.cc @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "arrow/test-util.h" +#include "gandiva/decimal_type_util.h" +#include "gandiva/precompiled/decimal_ops.h" +#include "gandiva/precompiled/types.h" + +namespace gandiva { + +class TestDecimalSql : public ::testing::Test { + protected: + static void AddAndVerify(const Decimal128Full& x, const Decimal128Full& y, + const Decimal128Full& expected); +}; + +#define EXPECT_DECIMAL_EQ(x, y, expected, actual) \ + EXPECT_EQ(expected, actual) << (x).ToString() << " + " << (y).ToString() \ + << " expected : " << expected.ToString() << " actual " \ + << actual.ToString() + +void TestDecimalSql::AddAndVerify(const Decimal128Full& x, const Decimal128Full& y, + const Decimal128Full& expected) { + auto t1 = std::make_shared(x.precision(), x.scale()); + auto t2 = std::make_shared(y.precision(), y.scale()); + + Decimal128TypePtr out_type; + EXPECT_OK(DecimalTypeUtil::GetResultType(DecimalTypeUtil::kOpAdd, {t1, t2}, &out_type)); + + auto out_value = decimalops::Add(x, y, out_type->precision(), out_type->scale()); + EXPECT_DECIMAL_EQ(x, y, expected, + Decimal128Full(out_value, out_type->precision(), out_type->scale())); +} + +TEST_F(TestDecimalSql, Add) { + // fast-path + AddAndVerify(Decimal128Full{"201", 30, 3}, // x + Decimal128Full{"301", 30, 3}, // y + Decimal128Full{"502", 31, 3}); // expected + + // max precision + AddAndVerify(Decimal128Full{"09999999999999999999999999999999000000", 38, 5}, // x + Decimal128Full{"100", 38, 7}, // y + Decimal128Full{"99999999999999999999999999999990000010", 38, 6}); + + // Both -ve + AddAndVerify(Decimal128Full{"-201", 30, 3}, // x + Decimal128Full{"-301", 30, 2}, // y + Decimal128Full{"-3211", 32, 3}); // expected + + // -ve and max precision + AddAndVerify(Decimal128Full{"-09999999999999999999999999999999000000", 38, 5}, // x + Decimal128Full{"-100", 38, 7}, // y + Decimal128Full{"-99999999999999999999999999999990000010", 38, 6}); +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/precompiled/decimal_wrapper.cc b/cpp/src/gandiva/precompiled/decimal_wrapper.cc new file mode 100644 index 0000000000000..fdc751f7fe87c --- /dev/null +++ b/cpp/src/gandiva/precompiled/decimal_wrapper.cc @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/precompiled/decimal_ops.h" +#include "gandiva/precompiled/types.h" + +extern "C" { + +/// TODO : Passing around structs in IR can be fragile due to c-abi compatibility issues. +/// This seems to work for now, but will need to revisit if we hit issues. +struct DecimalSplit { + int64_t high_bits; + uint64_t low_bits; +}; + +FORCE_INLINE +DecimalSplit add_large_decimal128_decimal128(int64_t x_high, uint64_t x_low, + int32_t x_precision, int32_t x_scale, + int64_t y_high, uint64_t y_low, + int32_t y_precision, int32_t y_scale, + int32_t out_precision, int32_t out_scale) { + gandiva::Decimal128Full x(x_high, x_low, x_precision, x_scale); + gandiva::Decimal128Full y(y_high, y_low, y_precision, y_scale); + + arrow::Decimal128 out = gandiva::decimalops::Add(x, y, out_precision, out_scale); + return DecimalSplit{out.high_bits(), out.low_bits()}; +} + +} // extern "C" diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index 4cb352f2ad3c1..8fc5b8c446927 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -143,7 +143,8 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, arrow::MemoryPool* p // TODO : handle variable-len vectors Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, arrow::MemoryPool* pool, ArrayDataPtr* array_data) { - ARROW_RETURN_IF(!arrow::is_primitive(type->id()), + const auto* fw_type = dynamic_cast(type.get()); + ARROW_RETURN_IF(fw_type == nullptr, Status::Invalid("Unsupported output data type ", type)); std::shared_ptr null_bitmap; @@ -151,8 +152,7 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, ARROW_RETURN_NOT_OK(arrow::AllocateBuffer(pool, bitmap_bytes, &null_bitmap)); std::shared_ptr data; - const auto& fw_type = dynamic_cast(*type); - int64_t data_len = arrow::BitUtil::BytesForBits(num_records * fw_type.bit_width()); + int64_t data_len = arrow::BitUtil::BytesForBits(num_records * fw_type->bit_width()); ARROW_RETURN_NOT_OK(arrow::AllocateBuffer(pool, data_len, &data)); // This is not strictly required but valgrind gets confused and detects this diff --git a/cpp/src/gandiva/proto/Types.proto b/cpp/src/gandiva/proto/Types.proto index ac19d0f1c1919..7474065f68b73 100644 --- a/cpp/src/gandiva/proto/Types.proto +++ b/cpp/src/gandiva/proto/Types.proto @@ -146,6 +146,13 @@ message BinaryNode { optional bytes value = 1; } +message DecimalNode { + optional string value = 1; + optional int32 precision = 2; + optional int32 scale = 3; +} + + message TreeNode { optional FieldNode fieldNode = 1; optional FunctionNode fnNode = 2; @@ -164,6 +171,7 @@ message TreeNode { optional DoubleNode doubleNode = 16; optional StringNode stringNode = 17; optional BinaryNode binaryNode = 18; + optional DecimalNode decimalNode = 19; } message ExpressionRoot { diff --git a/cpp/src/gandiva/tests/CMakeLists.txt b/cpp/src/gandiva/tests/CMakeLists.txt index 9558fc0757f7b..b47e5fd5add59 100644 --- a/cpp/src/gandiva/tests/CMakeLists.txt +++ b/cpp/src/gandiva/tests/CMakeLists.txt @@ -27,11 +27,17 @@ ADD_GANDIVA_TEST(to_string_test) ADD_GANDIVA_TEST(hash_test) ADD_GANDIVA_TEST(in_expr_test) ADD_GANDIVA_TEST(null_validity_test) +ADD_GANDIVA_TEST(decimal_test) +ADD_GANDIVA_TEST(decimal_single_test) ADD_GANDIVA_TEST(projector_test_static SOURCES projector_test.cc USE_STATIC_LINKING) -ADD_BENCHMARK(micro_benchmarks +ADD_GANDIVA_TEST(decimal_single_test_static + SOURCES 
decimal_single_test.cc + USE_STATIC_LINKING) + +ADD_ARROW_BENCHMARK(micro_benchmarks PREFIX "gandiva" EXTRA_LINK_LIBS gandiva_static) diff --git a/cpp/src/gandiva/tests/decimal_single_test.cc b/cpp/src/gandiva/tests/decimal_single_test.cc new file mode 100644 index 0000000000000..728ccb7f79f4c --- /dev/null +++ b/cpp/src/gandiva/tests/decimal_single_test.cc @@ -0,0 +1,224 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include +#include "arrow/memory_pool.h" +#include "arrow/status.h" + +#include "gandiva/decimal_full.h" +#include "gandiva/decimal_type_util.h" +#include "gandiva/projector.h" +#include "gandiva/tests/test_util.h" +#include "gandiva/tree_expr_builder.h" + +using arrow::Decimal128; + +namespace gandiva { + +#define EXPECT_DECIMAL_SUM_EQUALS(x, y, expected, actual) \ + EXPECT_EQ(expected, actual) << (x).ToString() << " + " << (y).ToString() \ + << " expected : " << (expected).ToString() \ + << " actual : " << (actual).ToString(); + +Decimal128Full decimal_literal(const char* value, int precision, int scale) { + std::string value_string = std::string(value); + return Decimal128Full(value_string, precision, scale); +} + +class TestDecimalOps : public ::testing::Test { + public: + void SetUp() { pool_ = arrow::default_memory_pool(); } + + ArrayPtr MakeDecimalVector(const Decimal128Full& in); + void AddAndVerify(const Decimal128Full& x, const Decimal128Full& y, + const Decimal128Full& expected); + + protected: + arrow::MemoryPool* pool_; +}; + +ArrayPtr TestDecimalOps::MakeDecimalVector(const Decimal128Full& in) { + std::vector ret; + + Decimal128 decimal_value = in.value(); + + auto decimal_type = std::make_shared(in.precision(), in.scale()); + return MakeArrowArrayDecimal(decimal_type, {decimal_value}, {true}); +} + +void TestDecimalOps::AddAndVerify(const Decimal128Full& x, const Decimal128Full& y, + const Decimal128Full& expected) { + auto x_type = std::make_shared(x.precision(), x.scale()); + auto y_type = std::make_shared(y.precision(), y.scale()); + auto field_x = field("x", x_type); + auto field_y = field("y", y_type); + auto schema = arrow::schema({field_x, field_y}); + + Decimal128TypePtr output_type; + auto status = DecimalTypeUtil::GetResultType(DecimalTypeUtil::kOpAdd, {x_type, y_type}, + &output_type); + EXPECT_OK(status); + + // output fields + auto res = field("res", output_type); + + // build expression : x + y + auto expr = TreeExprBuilder::MakeExpression("add", {field_x, field_y}, res); + + // Build a projector for the expression. 
+ std::shared_ptr projector; + status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + EXPECT_OK(status); + + // Create a row-batch with some sample data + auto array_a = MakeDecimalVector(x); + auto array_b = MakeDecimalVector(y); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, 1 /*num_records*/, {array_a, array_b}); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_OK(status); + + // Validate results + auto out_array = dynamic_cast(outputs[0].get()); + const Decimal128 out_value(out_array->GetValue(0)); + + auto dtype = dynamic_cast(out_array->type().get()); + std::string value_string = out_value.ToString(0); + Decimal128Full actual{value_string, dtype->precision(), dtype->scale()}; + + EXPECT_DECIMAL_SUM_EQUALS(x, y, expected, actual); +} + +TEST_F(TestDecimalOps, TestAdd) { + // fast-path + AddAndVerify(decimal_literal("201", 30, 3), // x + decimal_literal("301", 30, 3), // y + decimal_literal("502", 31, 3)); // expected + + AddAndVerify(decimal_literal("201", 30, 3), // x + decimal_literal("301", 30, 2), // y + decimal_literal("3211", 32, 3)); // expected + + AddAndVerify(decimal_literal("201", 30, 3), // x + decimal_literal("301", 30, 4), // y + decimal_literal("2311", 32, 4)); // expected + + // max precision, but no overflow + AddAndVerify(decimal_literal("201", 38, 3), // x + decimal_literal("301", 38, 3), // y + decimal_literal("502", 38, 3)); // expected + + AddAndVerify(decimal_literal("201", 38, 3), // x + decimal_literal("301", 38, 2), // y + decimal_literal("3211", 38, 3)); // expected + + AddAndVerify(decimal_literal("201", 38, 3), // x + decimal_literal("301", 38, 4), // y + decimal_literal("2311", 38, 4)); // expected + + AddAndVerify(decimal_literal("201", 38, 3), // x + decimal_literal("301", 38, 7), // y + decimal_literal("201030", 38, 6)); // expected + + AddAndVerify(decimal_literal("1201", 38, 3), // x + decimal_literal("1801", 38, 3), // y + decimal_literal("3002", 38, 3)); // carry-over from fractional + + // max precision + AddAndVerify(decimal_literal("09999999999999999999999999999999000000", 38, 5), // x + decimal_literal("100", 38, 7), // y + decimal_literal("99999999999999999999999999999990000010", 38, 6)); + + AddAndVerify(decimal_literal("-09999999999999999999999999999999000000", 38, 5), // x + decimal_literal("100", 38, 7), // y + decimal_literal("-99999999999999999999999999999989999990", 38, 6)); + + AddAndVerify(decimal_literal("09999999999999999999999999999999000000", 38, 5), // x + decimal_literal("-100", 38, 7), // y + decimal_literal("99999999999999999999999999999989999990", 38, 6)); + + AddAndVerify(decimal_literal("-09999999999999999999999999999999000000", 38, 5), // x + decimal_literal("-100", 38, 7), // y + decimal_literal("-99999999999999999999999999999990000010", 38, 6)); + + AddAndVerify(decimal_literal("09999999999999999999999999999999999999", 38, 6), // x + decimal_literal("89999999999999999999999999999999999999", 38, 7), // y + decimal_literal("18999999999999999999999999999999999999", 38, 6)); + + // Both -ve + AddAndVerify(decimal_literal("-201", 30, 3), // x + decimal_literal("-301", 30, 2), // y + decimal_literal("-3211", 32, 3)); // expected + + AddAndVerify(decimal_literal("-201", 38, 3), // x + decimal_literal("-301", 38, 4), // y + decimal_literal("-2311", 38, 4)); // expected + + // Mix of +ve and -ve + AddAndVerify(decimal_literal("-201", 30, 3), // x + decimal_literal("301", 30, 2), // y + 
decimal_literal("2809", 32, 3)); // expected + + AddAndVerify(decimal_literal("-201", 38, 3), // x + decimal_literal("301", 38, 4), // y + decimal_literal("-1709", 38, 4)); // expected + + AddAndVerify(decimal_literal("201", 38, 3), // x + decimal_literal("-301", 38, 7), // y + decimal_literal("200970", 38, 6)); // expected + + AddAndVerify(decimal_literal("-1901", 38, 4), // x + decimal_literal("1801", 38, 4), // y + decimal_literal("-100", 38, 4)); // expected + + AddAndVerify(decimal_literal("1801", 38, 4), // x + decimal_literal("-1901", 38, 4), // y + decimal_literal("-100", 38, 4)); // expected + + // rounding +ve + AddAndVerify(decimal_literal("1000999", 38, 6), // x + decimal_literal("10000999", 38, 7), // y + decimal_literal("2001099", 38, 6)); + + AddAndVerify(decimal_literal("1000999", 38, 6), // x + decimal_literal("10000995", 38, 7), // y + decimal_literal("2001099", 38, 6)); + + AddAndVerify(decimal_literal("1000999", 38, 6), // x + decimal_literal("10000992", 38, 7), // y + decimal_literal("2001098", 38, 6)); + + // rounding -ve + AddAndVerify(decimal_literal("-1000999", 38, 6), // x + decimal_literal("-10000999", 38, 7), // y + decimal_literal("-2001099", 38, 6)); + + AddAndVerify(decimal_literal("-1000999", 38, 6), // x + decimal_literal("-10000995", 38, 7), // y + decimal_literal("-2001099", 38, 6)); + + AddAndVerify(decimal_literal("-1000999", 38, 6), // x + decimal_literal("-10000992", 38, 7), // y + decimal_literal("-2001098", 38, 6)); +} +} // namespace gandiva diff --git a/cpp/src/gandiva/tests/decimal_test.cc b/cpp/src/gandiva/tests/decimal_test.cc new file mode 100644 index 0000000000000..f048fd275a61b --- /dev/null +++ b/cpp/src/gandiva/tests/decimal_test.cc @@ -0,0 +1,237 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/util/decimal.h" + +#include "gandiva/decimal_type_util.h" +#include "gandiva/projector.h" +#include "gandiva/tests/test_util.h" +#include "gandiva/tree_expr_builder.h" + +using arrow::Decimal128; + +namespace gandiva { + +class TestDecimal : public ::testing::Test { + public: + void SetUp() { pool_ = arrow::default_memory_pool(); } + + std::vector MakeDecimalVector(std::vector values, + int32_t scale); + + protected: + arrow::MemoryPool* pool_; +}; + +std::vector TestDecimal::MakeDecimalVector(std::vector values, + int32_t scale) { + std::vector ret; + for (auto str : values) { + Decimal128 str_value; + int32_t str_precision; + int32_t str_scale; + + auto status = Decimal128::FromString(str, &str_value, &str_precision, &str_scale); + DCHECK_OK(status); + + Decimal128 scaled_value; + status = str_value.Rescale(str_scale, scale, &scaled_value); + ret.push_back(scaled_value); + } + return ret; +} + +TEST_F(TestDecimal, TestSimple) { + // schema for input fields + constexpr int32_t precision = 36; + constexpr int32_t scale = 18; + auto decimal_type = std::make_shared(precision, scale); + auto field_a = field("a", decimal_type); + auto field_b = field("b", decimal_type); + auto field_c = field("c", decimal_type); + auto schema = arrow::schema({field_a, field_b, field_c}); + + Decimal128TypePtr add2_type; + auto status = DecimalTypeUtil::GetResultType(DecimalTypeUtil::kOpAdd, + {decimal_type, decimal_type}, &add2_type); + + Decimal128TypePtr output_type; + status = DecimalTypeUtil::GetResultType(DecimalTypeUtil::kOpAdd, + {add2_type, decimal_type}, &output_type); + + // output fields + auto res = field("res0", output_type); + + // build expression : a + b + c + auto node_a = TreeExprBuilder::MakeField(field_a); + auto node_b = TreeExprBuilder::MakeField(field_b); + auto node_c = TreeExprBuilder::MakeField(field_c); + auto add2 = TreeExprBuilder::MakeFunction("add", {node_a, node_b}, add2_type); + auto add3 = TreeExprBuilder::MakeFunction("add", {add2, node_c}, output_type); + auto expr = TreeExprBuilder::MakeExpression(add3, res); + + // Build a projector for the expression. 
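// For the precision-36, scale-18 inputs above, the add rule (result scale =
// max(s1, s2), result precision = result scale + max(p1 - s1, p2 - s2) + 1)
// makes add2_type decimal(37, 18) and output_type decimal(38, 18), which is why
// each intermediate node in the a + b + c tree carries its own result type.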
+ std::shared_ptr projector; + status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + DCHECK_OK(status); + + // Create a row-batch with some sample data + int num_records = 4; + auto array_a = + MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"1", "2", "3", "4"}, scale), + {false, true, true, true}); + auto array_b = + MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"2", "3", "4", "5"}, scale), + {false, true, true, true}); + auto array_c = + MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"3", "4", "5", "6"}, scale), + {true, true, true, true}); + + // prepare input record batch + auto in_batch = + arrow::RecordBatch::Make(schema, num_records, {array_a, array_b, array_c}); + + auto expected = + MakeArrowArrayDecimal(output_type, MakeDecimalVector({"6", "9", "12", "15"}, scale), + {false, true, true, true}); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + DCHECK_OK(status); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(expected, outputs[0]); +} + +TEST_F(TestDecimal, TestLiteral) { + // schema for input fields + constexpr int32_t precision = 36; + constexpr int32_t scale = 18; + auto decimal_type = std::make_shared(precision, scale); + auto field_a = field("a", decimal_type); + auto schema = arrow::schema({ + field_a, + }); + + Decimal128TypePtr add2_type; + auto status = DecimalTypeUtil::GetResultType(DecimalTypeUtil::kOpAdd, + {decimal_type, decimal_type}, &add2_type); + + // output fields + auto res = field("res0", add2_type); + + // build expression : a + b + c + auto node_a = TreeExprBuilder::MakeField(field_a); + static std::string decimal_point_six = "6"; + Decimal128Full literal(decimal_point_six, 2, 1); + auto node_b = TreeExprBuilder::MakeDecimalLiteral(literal); + auto add2 = TreeExprBuilder::MakeFunction("add", {node_a, node_b}, add2_type); + auto expr = TreeExprBuilder::MakeExpression(add2, res); + + // Build a projector for the expression. + std::shared_ptr projector; + status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + DCHECK_OK(status); + + // Create a row-batch with some sample data + int num_records = 4; + auto array_a = + MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"1", "2", "3", "4"}, scale), + {false, true, true, true}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a}); + + auto expected = MakeArrowArrayDecimal( + add2_type, MakeDecimalVector({"1.6", "2.6", "3.6", "4.6"}, scale), + {false, true, true, true}); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + DCHECK_OK(status); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(expected, outputs[0]); +} + +TEST_F(TestDecimal, TestIfElse) { + // schema for input fields + constexpr int32_t precision = 36; + constexpr int32_t scale = 18; + auto decimal_type = std::make_shared(precision, scale); + auto field_a = field("a", decimal_type); + auto field_b = field("b", decimal_type); + auto field_c = field("c", arrow::boolean()); + auto schema = arrow::schema({field_a, field_b, field_c}); + + // output fields + auto field_result = field("res", decimal_type); + + // build expression. 
+ // if (c) + // a + // else + // b + auto node_a = TreeExprBuilder::MakeField(field_a); + auto node_b = TreeExprBuilder::MakeField(field_b); + auto node_c = TreeExprBuilder::MakeField(field_c); + auto if_node = TreeExprBuilder::MakeIf(node_c, node_a, node_b, decimal_type); + + auto expr = TreeExprBuilder::MakeExpression(if_node, field_result); + + // Build a projector for the expressions. + std::shared_ptr projector; + Status status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + DCHECK_OK(status); + + // Create a row-batch with some sample data + int num_records = 4; + auto array_a = + MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"1", "2", "3", "4"}, scale), + {false, true, true, true}); + auto array_b = + MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"2", "3", "4", "5"}, scale), + {true, true, true, true}); + + auto array_c = MakeArrowArrayBool({true, false, true, false}, {true, true, true, true}); + + // expected output + auto exp = + MakeArrowArrayDecimal(decimal_type, MakeDecimalVector({"0", "3", "3", "5"}, scale), + {false, true, true, true}); + + // prepare input record batch + auto in_batch = + arrow::RecordBatch::Make(schema, num_records, {array_a, array_b, array_c}); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + DCHECK_OK(status); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/tests/generate_data.h b/cpp/src/gandiva/tests/generate_data.h index 01665b8ee17c5..398057510cb08 100644 --- a/cpp/src/gandiva/tests/generate_data.h +++ b/cpp/src/gandiva/tests/generate_data.h @@ -19,6 +19,8 @@ #include #include +#include "arrow/util/decimal.h" + #ifndef GANDIVA_GENERATE_DATA_H #define GANDIVA_GENERATE_DATA_H @@ -79,6 +81,24 @@ class Int64DataGenerator : public DataGenerator { Random random_; }; +class Decimal128DataGenerator : public DataGenerator { + public: + explicit Decimal128DataGenerator(bool large) : large_(large) {} + + arrow::Decimal128 GenerateData() { + uint64_t low = random_.next(); + int64_t high = random_.next(); + if (large_) { + high += (1ull << 62); + } + return arrow::Decimal128(high, low); + } + + protected: + bool large_; + Random random_; +}; + class FastUtf8DataGenerator : public DataGenerator { public: explicit FastUtf8DataGenerator(int max_len) : max_len_(max_len), cur_char_('a') {} diff --git a/cpp/src/gandiva/tests/micro_benchmarks.cc b/cpp/src/gandiva/tests/micro_benchmarks.cc index ce86bf0612402..e0794a233a2ce 100644 --- a/cpp/src/gandiva/tests/micro_benchmarks.cc +++ b/cpp/src/gandiva/tests/micro_benchmarks.cc @@ -19,6 +19,7 @@ #include "arrow/memory_pool.h" #include "arrow/status.h" #include "benchmark/benchmark.h" +#include "gandiva/decimal_type_util.h" #include "gandiva/projector.h" #include "gandiva/tests/test_util.h" #include "gandiva/tests/timed_evaluate.h" @@ -31,10 +32,6 @@ using arrow::int32; using arrow::int64; using arrow::utf8; -// TODO : the base numbers are from a mac. they need to be caliberated -// for the hardware used by travis. 
-float tolerance_ratio = 6.0; - static void TimedTestAdd3(benchmark::State& state) { // schema for input fields auto field0 = field("f0", int64()); @@ -280,6 +277,119 @@ static void TimedTestInExpr(benchmark::State& state) { ASSERT_OK(status); } +static void DoDecimalAdd3(benchmark::State& state, int32_t precision, int32_t scale, + bool large = false) { + // schema for input fields + auto decimal_type = std::make_shared(precision, scale); + auto field0 = field("f0", decimal_type); + auto field1 = field("f1", decimal_type); + auto field2 = field("f2", decimal_type); + auto schema = arrow::schema({field0, field1, field2}); + + Decimal128TypePtr add2_type; + auto status = DecimalTypeUtil::GetResultType(DecimalTypeUtil::kOpAdd, + {decimal_type, decimal_type}, &add2_type); + + Decimal128TypePtr output_type; + status = DecimalTypeUtil::GetResultType(DecimalTypeUtil::kOpAdd, + {add2_type, decimal_type}, &output_type); + + // output field + auto field_sum = field("add", output_type); + + // Build expression + auto part_sum = TreeExprBuilder::MakeFunction( + "add", {TreeExprBuilder::MakeField(field1), TreeExprBuilder::MakeField(field2)}, + add2_type); + auto sum = TreeExprBuilder::MakeFunction( + "add", {TreeExprBuilder::MakeField(field0), part_sum}, output_type); + + auto sum_expr = TreeExprBuilder::MakeExpression(sum, field_sum); + + std::shared_ptr projector; + status = Projector::Make(schema, {sum_expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()); + + Decimal128DataGenerator data_generator(large); + ProjectEvaluator evaluator(projector); + + status = TimedEvaluate( + schema, evaluator, data_generator, arrow::default_memory_pool(), 1 * MILLION, + 16 * THOUSAND, state); + ASSERT_OK(status); +} + +static void DoDecimalAdd2(benchmark::State& state, int32_t precision, int32_t scale, + bool large = false) { + // schema for input fields + auto decimal_type = std::make_shared(precision, scale); + auto field0 = field("f0", decimal_type); + auto field1 = field("f1", decimal_type); + auto schema = arrow::schema({field0, field1}); + + Decimal128TypePtr output_type; + auto status = DecimalTypeUtil::GetResultType( + DecimalTypeUtil::kOpAdd, {decimal_type, decimal_type}, &output_type); + + // output field + auto field_sum = field("add", output_type); + + // Build expression + auto sum = TreeExprBuilder::MakeExpression("add", {field0, field1}, field_sum); + + std::shared_ptr projector; + status = Projector::Make(schema, {sum}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()); + + Decimal128DataGenerator data_generator(large); + ProjectEvaluator evaluator(projector); + + status = TimedEvaluate( + schema, evaluator, data_generator, arrow::default_memory_pool(), 1 * MILLION, + 16 * THOUSAND, state); + ASSERT_OK(status); +} + +static void DecimalAdd2Fast(benchmark::State& state) { + // use lesser precision to test the fast-path + DoDecimalAdd2(state, DecimalTypeUtil::kMaxPrecision - 6, 18); +} + +static void DecimalAdd2LeadingZeroes(benchmark::State& state) { + // use max precision to test the large-integer-path + DoDecimalAdd2(state, DecimalTypeUtil::kMaxPrecision, 6); +} + +static void DecimalAdd2LeadingZeroesWithDiv(benchmark::State& state) { + // use max precision to test the large-integer-path + DoDecimalAdd2(state, DecimalTypeUtil::kMaxPrecision, 18); +} + +static void DecimalAdd2Large(benchmark::State& state) { + // use max precision to test the large-integer-path + DoDecimalAdd2(state, DecimalTypeUtil::kMaxPrecision, 18, true); +} + +static void 
DecimalAdd3Fast(benchmark::State& state) { + // use lesser precision to test the fast-path + DoDecimalAdd3(state, DecimalTypeUtil::kMaxPrecision - 6, 18); +} + +static void DecimalAdd3LeadingZeroes(benchmark::State& state) { + // use max precision to test the large-integer-path + DoDecimalAdd3(state, DecimalTypeUtil::kMaxPrecision, 6); +} + +static void DecimalAdd3LeadingZeroesWithDiv(benchmark::State& state) { + // use max precision to test the large-integer-path + DoDecimalAdd3(state, DecimalTypeUtil::kMaxPrecision, 18); +} + +static void DecimalAdd3Large(benchmark::State& state) { + // use max precision to test the large-integer-path + DoDecimalAdd3(state, DecimalTypeUtil::kMaxPrecision, 18, true); +} + BENCHMARK(TimedTestAdd3)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(TimedTestBigNested)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(TimedTestBigNested)->MinTime(1.0)->Unit(benchmark::kMicrosecond); @@ -289,5 +399,13 @@ BENCHMARK(TimedTestFilterLike)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(TimedTestAllocs)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(TimedTestMultiOr)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(TimedTestInExpr)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd2Fast)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd2LeadingZeroes)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd2LeadingZeroesWithDiv)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd2Large)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd3Fast)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd3LeadingZeroes)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd3LeadingZeroesWithDiv)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(DecimalAdd3Large)->MinTime(1.0)->Unit(benchmark::kMicrosecond); } // namespace gandiva diff --git a/cpp/src/gandiva/tests/test_util.h b/cpp/src/gandiva/tests/test_util.h index 72b45b124b8dd..0e0e27a0c9aa4 100644 --- a/cpp/src/gandiva/tests/test_util.h +++ b/cpp/src/gandiva/tests/test_util.h @@ -21,6 +21,7 @@ #include #include "arrow/test-util.h" #include "gandiva/arrow.h" +#include "gandiva/configuration.h" #ifndef GANDIVA_TEST_UTIL_H #define GANDIVA_TEST_UTIL_H @@ -46,6 +47,14 @@ static ArrayPtr MakeArrowArray(std::vector values) { return out; } +template +static ArrayPtr MakeArrowArray(const std::shared_ptr& type, + std::vector values, std::vector validity) { + ArrayPtr out; + arrow::ArrayFromVector(type, validity, values, &out); + return out; +} + template static ArrayPtr MakeArrowTypeArray(const std::shared_ptr& type, const std::vector& values, @@ -68,11 +77,16 @@ static ArrayPtr MakeArrowTypeArray(const std::shared_ptr& type, #define MakeArrowArrayFloat64 MakeArrowArray #define MakeArrowArrayUtf8 MakeArrowArray #define MakeArrowArrayBinary MakeArrowArray +#define MakeArrowArrayDecimal MakeArrowArray #define EXPECT_ARROW_ARRAY_EQUALS(a, b) \ EXPECT_TRUE((a)->Equals(b)) << "expected array: " << (a)->ToString() \ << " actual array: " << (b)->ToString(); +#define EXPECT_ARROW_TYPE_EQUALS(a, b) \ + EXPECT_TRUE((a)->Equals(b)) << "expected type: " << (a)->ToString() \ + << " actual type: " << (b)->ToString(); + std::shared_ptr TestConfiguration() { auto builder = ConfigurationBuilder(); builder.set_byte_code_file_path(GANDIVA_BYTE_COMPILE_FILE_PATH); diff --git a/cpp/src/gandiva/tests/timed_evaluate.h b/cpp/src/gandiva/tests/timed_evaluate.h index dab47c2f218be..9db7d88d2a226 100644 --- 
a/cpp/src/gandiva/tests/timed_evaluate.h +++ b/cpp/src/gandiva/tests/timed_evaluate.h @@ -100,7 +100,9 @@ Status TimedEvaluate(SchemaPtr schema, BaseEvaluator& evaluator, for (int col = 0; col < num_fields; col++) { std::vector data = GenerateData(batch_size, data_generator); std::vector validity(batch_size, true); - ArrayPtr col_data = MakeArrowArray(data, validity); + ArrayPtr col_data = + MakeArrowArray(schema->field(col)->type(), data, validity); + columns.push_back(col_data); } diff --git a/cpp/src/gandiva/tree_expr_builder.cc b/cpp/src/gandiva/tree_expr_builder.cc index 86a2824075497..23a49e2b7929a 100644 --- a/cpp/src/gandiva/tree_expr_builder.cc +++ b/cpp/src/gandiva/tree_expr_builder.cc @@ -19,6 +19,7 @@ #include +#include "gandiva/decimal_type_util.h" #include "gandiva/gandiva_aliases.h" #include "gandiva/node.h" @@ -49,6 +50,11 @@ NodePtr TreeExprBuilder::MakeBinaryLiteral(const std::string& value) { return std::make_shared(arrow::binary(), LiteralHolder(value), false); } +NodePtr TreeExprBuilder::MakeDecimalLiteral(const Decimal128Full& value) { + return std::make_shared(arrow::decimal(value.precision(), value.scale()), + LiteralHolder(value), false); +} + NodePtr TreeExprBuilder::MakeNull(DataTypePtr data_type) { static const std::string empty; @@ -92,6 +98,10 @@ NodePtr TreeExprBuilder::MakeNull(DataTypePtr data_type) { return std::make_shared(data_type, LiteralHolder((int64_t)0), true); case arrow::Type::TIMESTAMP: return std::make_shared(data_type, LiteralHolder((int64_t)0), true); + case arrow::Type::DECIMAL: { + Decimal128Full literal(0, 0); + return std::make_shared(data_type, LiteralHolder(literal), true); + } default: return nullptr; } diff --git a/cpp/src/gandiva/tree_expr_builder.h b/cpp/src/gandiva/tree_expr_builder.h index cd261c8bf978d..ae5f7fb9df3fd 100644 --- a/cpp/src/gandiva/tree_expr_builder.h +++ b/cpp/src/gandiva/tree_expr_builder.h @@ -23,7 +23,9 @@ #include #include +#include "arrow/type.h" #include "gandiva/condition.h" +#include "gandiva/decimal_full.h" #include "gandiva/expression.h" namespace gandiva { @@ -45,6 +47,7 @@ class TreeExprBuilder { static NodePtr MakeLiteral(double value); static NodePtr MakeStringLiteral(const std::string& value); static NodePtr MakeBinaryLiteral(const std::string& value); + static NodePtr MakeDecimalLiteral(const Decimal128Full& value); /// \brief create a node on a null literal. /// returns null if data_type is null or if it's not a supported datatype. diff --git a/cpp/valgrind.supp b/cpp/valgrind.supp index 08076aade4d9e..8d2d5da904bab 100644 --- a/cpp/valgrind.supp +++ b/cpp/valgrind.supp @@ -21,6 +21,12 @@ Memcheck:Cond fun:*CastFunctor*BooleanType* } +{ + :Conditional jump or move depends on uninitialised value(s) + Memcheck:Cond + ... 
+ fun:*llvm*PassManager* +} { :Conditional jump or move depends on uninitialised value(s) Memcheck:Cond diff --git a/java/gandiva/pom.xml b/java/gandiva/pom.xml index d365eb9193ac1..285ea861f9795 100644 --- a/java/gandiva/pom.xml +++ b/java/gandiva/pom.xml @@ -29,7 +29,7 @@ 2.5.0 18.0 true - ../../cpp/debug + ../../cpp/debug/debug @@ -68,6 +68,11 @@ 2.10 test + + net.java.dev.jna + jna + 4.5.0 + diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ConfigurationBuilder.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ConfigurationBuilder.java index 96788b39e08ec..46deee95fa717 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ConfigurationBuilder.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ConfigurationBuilder.java @@ -17,8 +17,6 @@ package org.apache.arrow.gandiva.evaluator; -import org.apache.arrow.gandiva.exceptions.GandivaException; - /** * Used to construct gandiva configuration objects. */ @@ -26,16 +24,6 @@ public class ConfigurationBuilder { private String byteCodeFilePath = ""; - private static volatile long defaultConfiguration = 0L; - - /** - * Ctor - ensure that gandiva is loaded. - * @throws GandivaException - if library cannot be loaded. - */ - public ConfigurationBuilder() throws GandivaException { - JniWrapper.getInstance(); - } - public ConfigurationBuilder withByteCodeFilePath(final String byteCodeFilePath) { this.byteCodeFilePath = byteCodeFilePath; return this; @@ -45,26 +33,6 @@ public String getByteCodeFilePath() { return byteCodeFilePath; } - /** - * Get the default configuration to invoke gandiva. - * @return default configuration - * @throws GandivaException if unable to get native builder instance. - */ - static long getDefaultConfiguration() throws GandivaException { - if (defaultConfiguration == 0L) { - synchronized (ConfigurationBuilder.class) { - if (defaultConfiguration == 0L) { - String defaultByteCodeFilePath = JniWrapper.getInstance().getByteCodeFilePath(); - - defaultConfiguration = new ConfigurationBuilder() - .withByteCodeFilePath(defaultByteCodeFilePath) - .buildConfigInstance(); - } - } - } - return defaultConfiguration; - } - public native long buildConfigInstance(); public native void releaseConfigInstance(long configId); diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtil.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtil.java new file mode 100644 index 0000000000000..37dd0f61056b0 --- /dev/null +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtil.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.gandiva.evaluator; + +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.ArrowType.Decimal; + +public class DecimalTypeUtil { + + public enum OperationType { + ADD, + SUBTRACT, + MULTIPLY, + DIVIDE, + MOD + } + + private static final int MIN_ADJUSTED_SCALE = 6; + /// The maximum precision representable by a 16-byte decimal + private static final int MAX_PRECISION = 38; + + public static Decimal getResultTypeForOperation(OperationType operation, Decimal operand1, Decimal + operand2) { + int s1 = operand1.getScale(); + int s2 = operand2.getScale(); + int p1 = operand1.getPrecision(); + int p2 = operand2.getPrecision(); + int resultScale = 0; + int resultPrecision = 0; + switch (operation) { + case ADD: + case SUBTRACT: + resultScale = Math.max(operand1.getScale(), operand2.getScale()); + resultPrecision = resultScale + Math.max(operand1.getPrecision() - operand1.getScale(), + operand2.getPrecision() - operand2.getScale()) + 1; + break; + case MULTIPLY: + resultScale = s1 + s2; + resultPrecision = p1 + p2 + 1; + break; + case DIVIDE: + resultScale = + Math.max(MIN_ADJUSTED_SCALE, operand1.getScale() + operand2.getPrecision() + 1); + resultPrecision = + operand1.getPrecision() - operand1.getScale() + operand2.getScale() + resultScale; + break; + case MOD: + resultScale = Math.max(operand1.getScale(), operand2.getScale()); + resultPrecision = Math.min(operand1.getPrecision() - operand1.getScale(), + operand2.getPrecision() - operand2.getScale()) + + resultScale; + break; + default: + throw new RuntimeException("Needs support"); + } + return adjustScaleIfNeeded(resultPrecision, resultScale); + } + + private static Decimal adjustScaleIfNeeded(int precision, int scale) { + if (precision > MAX_PRECISION) { + int minScale = Math.min(scale, MIN_ADJUSTED_SCALE); + int delta = precision - MAX_PRECISION; + precision = MAX_PRECISION; + scale = Math.max(scale - delta, minScale); + } + return new Decimal(precision, scale); + } + +} + diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java index 9c41c1942e9b3..b9986791850a7 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java @@ -70,7 +70,7 @@ public static ExpressionRegistry getInstance() throws GandivaException { synchronized (ExpressionRegistry.class) { if (INSTANCE == null) { // ensure library is setup. 
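As a worked instance of the rules in DecimalTypeUtil above: adding decimal(38, 5) and decimal(38, 7) operands gives scale max(5, 7) = 7 and precision 7 + max(38 - 5, 38 - 7) + 1 = 41. Since that exceeds MAX_PRECISION, adjustScaleIfNeeded caps the precision at 38 and lowers the scale by the delta of 3, bounded below by min(7, 6) = 6, so the result type is decimal(38, 6), the same type exercised by the max-precision cases in the C++ tests above.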
- JniWrapper.getInstance(); + JniLoader.getInstance(); Set typesFromGandiva = getSupportedTypesFromGandiva(); Set functionsFromGandiva = getSupportedFunctionsFromGandiva(); INSTANCE = new ExpressionRegistry(typesFromGandiva, functionsFromGandiva); @@ -173,10 +173,11 @@ private static ArrowType getArrowType(ExtGandivaType type) { BIT_WIDTH_64); case GandivaType.NONE_VALUE: return new ArrowType.Null(); + case GandivaType.DECIMAL_VALUE: + return new ArrowType.Decimal(0,0); case GandivaType.FIXED_SIZE_BINARY_VALUE: case GandivaType.MAP_VALUE: case GandivaType.INTERVAL_VALUE: - case GandivaType.DECIMAL_VALUE: case GandivaType.DICTIONARY_VALUE: case GandivaType.LIST_VALUE: case GandivaType.STRUCT_VALUE: diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Filter.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Filter.java index 25904d3dc1d76..46508b1f97a34 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Filter.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Filter.java @@ -43,11 +43,13 @@ public class Filter { private static final Logger logger = LoggerFactory.getLogger(Filter.class); + private final JniWrapper wrapper; private final long moduleId; private final Schema schema; private boolean closed; - private Filter(long moduleId, Schema schema) { + private Filter(JniWrapper wrapper, long moduleId, Schema schema) { + this.wrapper = wrapper; this.moduleId = moduleId; this.schema = schema; this.closed = false; @@ -63,7 +65,7 @@ private Filter(long moduleId, Schema schema) { * @return A native filter object that can be used to invoke on a RecordBatch */ public static Filter make(Schema schema, Condition condition) throws GandivaException { - return make(schema, condition, ConfigurationBuilder.getDefaultConfiguration()); + return make(schema, condition, JniLoader.getDefaultConfiguration()); } /** @@ -81,11 +83,11 @@ public static Filter make(Schema schema, Condition condition, long configuration // Invoke the JNI layer to create the LLVM module representing the filter. 
GandivaTypes.Condition conditionBuf = condition.toProtobuf(); GandivaTypes.Schema schemaBuf = ArrowTypeHelper.arrowSchemaToProtobuf(schema); - JniWrapper gandivaBridge = JniWrapper.getInstance(); - long moduleId = gandivaBridge.buildFilter(schemaBuf.toByteArray(), + JniWrapper wrapper = JniLoader.getInstance().getWrapper(); + long moduleId = wrapper.buildFilter(schemaBuf.toByteArray(), conditionBuf.toByteArray(), configurationId); logger.info("Created module for the projector with id {}", moduleId); - return new Filter(moduleId, schema); + return new Filter(wrapper, moduleId, schema); } /** @@ -144,7 +146,7 @@ private void evaluate(int numRows, List buffers, List buf bufSizes[idx++] = bufLayout.getSize(); } - int numRecords = JniWrapper.getInstance().evaluateFilter(this.moduleId, numRows, + int numRecords = wrapper.evaluateFilter(this.moduleId, numRows, bufAddrs, bufSizes, selectionVector.getType().getNumber(), selectionVector.getBuffer().memoryAddress(), selectionVector.getBuffer().capacity()); @@ -161,7 +163,7 @@ public void close() throws GandivaException { return; } - JniWrapper.getInstance().closeFilter(this.moduleId); + wrapper.closeFilter(this.moduleId); this.closed = true; } } diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniLoader.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniLoader.java new file mode 100644 index 0000000000000..3491b283e5dd5 --- /dev/null +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniLoader.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.gandiva.evaluator; + +import static java.util.UUID.randomUUID; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; + +import org.apache.arrow.gandiva.exceptions.GandivaException; + +import com.sun.jna.NativeLibrary; + +/** + * This class handles loading of the jni library, and acts as a bridge for the native functions. 
+ */ +class JniLoader { + private static final String LIBRARY_NAME = "gandiva_jni"; + private static final String IRHELPERS_BC = "irhelpers.bc"; + + private static volatile JniLoader INSTANCE; + private static volatile long defaultConfiguration = 0L; + + private final String byteCodeFilePath; + private final JniWrapper wrapper; + + private JniLoader(String byteCodeFilePath) { + this.byteCodeFilePath = byteCodeFilePath; + this.wrapper = new JniWrapper(); + } + + static JniLoader getInstance() throws GandivaException { + if (INSTANCE == null) { + synchronized (JniLoader.class) { + if (INSTANCE == null) { + INSTANCE = setupInstance(); + } + } + } + return INSTANCE; + } + + private static JniLoader setupInstance() throws GandivaException { + try { + String tempDir = System.getProperty("java.io.tmpdir"); + loadGandivaLibraryFromJar(tempDir); + File byteCodeFile = moveFileFromJarToTemp(tempDir, IRHELPERS_BC); + return new JniLoader(byteCodeFile.getAbsolutePath()); + } catch (IOException ioException) { + throw new GandivaException("unable to create native instance", ioException); + } + } + + private static void loadGandivaLibraryFromJar(final String tmpDir) + throws IOException, GandivaException { + final String libraryToLoad = System.mapLibraryName(LIBRARY_NAME); + final File libraryFile = moveFileFromJarToTemp(tmpDir, libraryToLoad); + // This is required to load the library with RT_GLOBAL flags. Otherwise, the symbols in the + // libgandiva.so aren't visible to the JIT. + NativeLibrary.getInstance(libraryFile.getAbsolutePath()); + System.load(libraryFile.getAbsolutePath()); + } + + + private static File moveFileFromJarToTemp(final String tmpDir, String libraryToLoad) + throws IOException, GandivaException { + final File temp = setupFile(tmpDir, libraryToLoad); + try (final InputStream is = JniLoader.class.getClassLoader() + .getResourceAsStream(libraryToLoad)) { + if (is == null) { + throw new GandivaException(libraryToLoad + " was not found inside JAR."); + } else { + Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); + } + } + return temp; + } + + private static File setupFile(String tmpDir, String libraryToLoad) + throws IOException, GandivaException { + // accommodate multiple processes running with gandiva jar. + // length should be ok since uuid is only 36 characters. + final String randomizeFileName = libraryToLoad + randomUUID(); + final File temp = new File(tmpDir, randomizeFileName); + if (temp.exists() && !temp.delete()) { + throw new GandivaException("File: " + temp.getAbsolutePath() + + " already exists and cannot be removed."); + } + if (!temp.createNewFile()) { + throw new GandivaException("File: " + temp.getAbsolutePath() + + " could not be created."); + } + temp.deleteOnExit(); + return temp; + } + + /** + * Returns the byte code file path extracted from jar. + */ + public String getByteCodeFilePath() { + return byteCodeFilePath; + } + + /** + * Returns the jni wrapper. + */ + JniWrapper getWrapper() throws GandivaException { + return wrapper; + } + + /** + * Get the default configuration to invoke gandiva. + * @return default configuration + * @throws GandivaException if unable to get native builder instance. 
+ */ + static long getDefaultConfiguration() throws GandivaException { + if (defaultConfiguration == 0L) { + synchronized (ConfigurationBuilder.class) { + if (defaultConfiguration == 0L) { + String defaultByteCodeFilePath = JniLoader.getInstance().getByteCodeFilePath(); + + defaultConfiguration = new ConfigurationBuilder() + .withByteCodeFilePath(defaultByteCodeFilePath) + .buildConfigInstance(); + } + } + } + return defaultConfiguration; + } +} diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniWrapper.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniWrapper.java index eea42f6976ce4..f00b0fbb9151a 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniWrapper.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniWrapper.java @@ -17,100 +17,15 @@ package org.apache.arrow.gandiva.evaluator; -import static java.util.UUID.randomUUID; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.nio.file.Files; -import java.nio.file.StandardCopyOption; - import org.apache.arrow.gandiva.exceptions.GandivaException; /** * This class is implemented in JNI. This provides the Java interface - * to invoke functions in JNI + * to invoke functions in JNI. + * This file is used to generated the .h files required for jni. Avoid all + * external dependencies in this file. */ -class JniWrapper { - private static final String LIBRARY_NAME = "gandiva_jni"; - private static final String IRHELPERS_BC = "irhelpers.bc"; - - private static volatile JniWrapper INSTANCE; - - private final String byteCodeFilePath; - - private JniWrapper(String byteCodeFilePath) { - this.byteCodeFilePath = byteCodeFilePath; - } - - static JniWrapper getInstance() throws GandivaException { - if (INSTANCE == null) { - synchronized (JniWrapper.class) { - if (INSTANCE == null) { - INSTANCE = setupInstance(); - } - } - } - return INSTANCE; - } - - private static JniWrapper setupInstance() throws GandivaException { - try { - String tempDir = System.getProperty("java.io.tmpdir"); - loadGandivaLibraryFromJar(tempDir); - File byteCodeFile = moveFileFromJarToTemp(tempDir, IRHELPERS_BC); - return new JniWrapper(byteCodeFile.getAbsolutePath()); - } catch (IOException ioException) { - throw new GandivaException("unable to create native instance", ioException); - } - } - - private static void loadGandivaLibraryFromJar(final String tmpDir) - throws IOException, GandivaException { - final String libraryToLoad = System.mapLibraryName(LIBRARY_NAME); - final File libraryFile = moveFileFromJarToTemp(tmpDir, libraryToLoad); - System.load(libraryFile.getAbsolutePath()); - } - - - private static File moveFileFromJarToTemp(final String tmpDir, String libraryToLoad) - throws IOException, GandivaException { - final File temp = setupFile(tmpDir, libraryToLoad); - try (final InputStream is = JniWrapper.class.getClassLoader() - .getResourceAsStream(libraryToLoad)) { - if (is == null) { - throw new GandivaException(libraryToLoad + " was not found inside JAR."); - } else { - Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); - } - } - return temp; - } - - private static File setupFile(String tmpDir, String libraryToLoad) - throws IOException, GandivaException { - // accommodate multiple processes running with gandiva jar. - // length should be ok since uuid is only 36 characters. 
- final String randomizeFileName = libraryToLoad + randomUUID(); - final File temp = new File(tmpDir, randomizeFileName); - if (temp.exists() && !temp.delete()) { - throw new GandivaException("File: " + temp.getAbsolutePath() + - " already exists and cannot be removed."); - } - if (!temp.createNewFile()) { - throw new GandivaException("File: " + temp.getAbsolutePath() + - " could not be created."); - } - temp.deleteOnExit(); - return temp; - } - - /** - * Returns the byte code file path extracted from jar. - */ - public String getByteCodeFilePath() { - return byteCodeFilePath; - } +public class JniWrapper { /** * Generates the projector module to evaluate the expressions with diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java index d7578936b3d83..af1a4ca539cc4 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java @@ -46,12 +46,14 @@ public class Projector { private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(Projector.class); + private JniWrapper wrapper; private final long moduleId; private final Schema schema; private final int numExprs; private boolean closed; - private Projector(long moduleId, Schema schema, int numExprs) { + private Projector(JniWrapper wrapper, long moduleId, Schema schema, int numExprs) { + this.wrapper = wrapper; this.moduleId = moduleId; this.schema = schema; this.numExprs = numExprs; @@ -71,7 +73,7 @@ private Projector(long moduleId, Schema schema, int numExprs) { */ public static Projector make(Schema schema, List exprs) throws GandivaException { - return make(schema, exprs, ConfigurationBuilder.getDefaultConfiguration()); + return make(schema, exprs, JniLoader.getDefaultConfiguration()); } /** @@ -96,11 +98,11 @@ public static Projector make(Schema schema, List exprs, long // Invoke the JNI layer to create the LLVM module representing the expressions GandivaTypes.Schema schemaBuf = ArrowTypeHelper.arrowSchemaToProtobuf(schema); - JniWrapper gandivaBridge = JniWrapper.getInstance(); - long moduleId = gandivaBridge.buildProjector(schemaBuf.toByteArray(), builder.build() - .toByteArray(), configurationId); + JniWrapper wrapper = JniLoader.getInstance().getWrapper(); + long moduleId = wrapper.buildProjector(schemaBuf.toByteArray(), + builder.build().toByteArray(), configurationId); logger.info("Created module for the projector with id {}", moduleId); - return new Projector(moduleId, schema, exprs.size()); + return new Projector(wrapper, moduleId, schema, exprs.size()); } /** @@ -175,9 +177,7 @@ private void evaluate(int numRows, List buffers, List buf valueVector.setValueCount(numRows); } - JniWrapper.getInstance().evaluateProjector(this.moduleId, numRows, - bufAddrs, bufSizes, - outAddrs, outSizes); + wrapper.evaluateProjector(this.moduleId, numRows, bufAddrs, bufSizes, outAddrs, outSizes); } /** @@ -188,7 +188,7 @@ public void close() throws GandivaException { return; } - JniWrapper.getInstance().closeProjector(this.moduleId); + wrapper.closeProjector(this.moduleId); this.closed = true; } } diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/DecimalNode.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/DecimalNode.java new file mode 100644 index 0000000000000..1b908b9962fb3 --- /dev/null +++ 
b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/DecimalNode.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.gandiva.expression; + +import java.nio.charset.Charset; + +import org.apache.arrow.gandiva.exceptions.GandivaException; +import org.apache.arrow.gandiva.ipc.GandivaTypes; + +import com.google.protobuf.ByteString; + + +/** + * Used to represent expression tree nodes representing decimal constants. + * Used in the expression (x + 5.0) + */ +class DecimalNode implements TreeNode { + private final String value; + private final int precision; + private final int scale; + + DecimalNode(String value, int precision, int scale) { + this.value = value; + this.precision = precision; + this.scale = scale; + } + + @Override + public GandivaTypes.TreeNode toProtobuf() throws GandivaException { + GandivaTypes.DecimalNode.Builder decimalNode = GandivaTypes.DecimalNode.newBuilder(); + decimalNode.setValue(value); + decimalNode.setPrecision(precision); + decimalNode.setScale(scale); + + GandivaTypes.TreeNode.Builder builder = GandivaTypes.TreeNode.newBuilder(); + builder.setDecimalNode(decimalNode.build()); + return builder.build(); + } +} diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java index f5568591c2002..a220c547e44a6 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java @@ -55,6 +55,10 @@ public static TreeNode makeBinaryLiteral(byte[] binaryConstant) { return new BinaryNode(binaryConstant); } + public static TreeNode makeDecimalLiteral(String decimalConstant, int precision, int scale) { + return new DecimalNode(decimalConstant, precision, scale); + } + /** * create a null literal. 
*/ diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/BaseEvaluatorTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/BaseEvaluatorTest.java index aeb3d418a70ac..97c2883c58e5e 100644 --- a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/BaseEvaluatorTest.java +++ b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/BaseEvaluatorTest.java @@ -17,6 +17,8 @@ package org.apache.arrow.gandiva.evaluator; +import java.math.BigDecimal; +import java.math.BigInteger; import java.util.ArrayList; import java.util.List; import java.util.Random; @@ -27,6 +29,7 @@ import org.apache.arrow.gandiva.expression.ExpressionTree; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.ipc.message.ArrowFieldNode; @@ -229,6 +232,18 @@ ArrowBuf intBuf(int[] ints) { return buffer; } + DecimalVector decimalVector(String[] values, int precision, int scale) { + DecimalVector vector = new DecimalVector("decimal" + Math.random(), allocator, precision, scale); + vector.allocateNew(); + for (int i = 0; i < values.length; i++) { + BigDecimal decimal = new BigDecimal(values[i]); + vector.setSafe(i, decimal); + } + + vector.setValueCount(values.length); + return vector; + } + ArrowBuf longBuf(long[] longs) { ArrowBuf buffer = allocator.buffer(longs.length * 8); for (int i = 0; i < longs.length; i++) { diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtilTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtilTest.java new file mode 100644 index 0000000000000..4a4fb82951c16 --- /dev/null +++ b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtilTest.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.gandiva.evaluator; + +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.junit.Assert; +import org.junit.Test; + +public class DecimalTypeUtilTest { + + @Test + public void testOutputTypesForAdd() { + ArrowType.Decimal operand1 = getDecimal(30, 10); + ArrowType.Decimal operand2 = getDecimal(30, 10); + ArrowType.Decimal resultType = DecimalTypeUtil.getResultTypeForOperation(DecimalTypeUtil + .OperationType.ADD, operand1, operand2); + Assert.assertTrue(getDecimal(31, 10).equals(resultType)); + + operand1 = getDecimal(30, 6); + operand2 = getDecimal(30, 5); + resultType = DecimalTypeUtil.getResultTypeForOperation(DecimalTypeUtil + .OperationType.ADD, operand1, operand2); + Assert.assertTrue(getDecimal(32, 6).equals(resultType)); + + operand1 = getDecimal(30, 10); + operand2 = getDecimal(38, 10); + resultType = DecimalTypeUtil.getResultTypeForOperation(DecimalTypeUtil + .OperationType.ADD, operand1, operand2); + Assert.assertTrue(getDecimal(38, 9).equals(resultType)); + + operand1 = getDecimal(38, 10); + operand2 = getDecimal(38, 38); + resultType = DecimalTypeUtil.getResultTypeForOperation(DecimalTypeUtil + .OperationType.ADD, operand1, operand2); + Assert.assertTrue(getDecimal(38, 9).equals(resultType)); + + operand1 = getDecimal(38, 10); + operand2 = getDecimal(38, 2); + resultType = DecimalTypeUtil.getResultTypeForOperation(DecimalTypeUtil + .OperationType.ADD, operand1, operand2); + Assert.assertTrue(getDecimal(38, 6).equals(resultType)); + + } + + @Test + public void testOutputTypesForMultiply() { + ArrowType.Decimal operand1 = getDecimal(30, 10); + ArrowType.Decimal operand2 = getDecimal(30, 10); + ArrowType.Decimal resultType = DecimalTypeUtil.getResultTypeForOperation(DecimalTypeUtil + .OperationType.MULTIPLY, operand1, operand2); + Assert.assertTrue(getDecimal(38, 6).equals(resultType)); + + operand1 = getDecimal(38, 10); + operand2 = getDecimal(9, 2); + resultType = DecimalTypeUtil.getResultTypeForOperation(DecimalTypeUtil + .OperationType.MULTIPLY, operand1, operand2); + Assert.assertTrue(getDecimal(38, 6).equals(resultType)); + + } + + @Test + public void testOutputTypesForMod() { + ArrowType.Decimal operand1 = getDecimal(30, 10); + ArrowType.Decimal operand2 = getDecimal(28 , 7); + ArrowType.Decimal resultType = DecimalTypeUtil.getResultTypeForOperation(DecimalTypeUtil + .OperationType.MOD, operand1, operand2); + Assert.assertTrue(getDecimal(30, 10).equals(resultType)); + } + + private ArrowType.Decimal getDecimal(int precision, int scale) { + return new ArrowType.Decimal(precision, scale); + } + +} diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorDecimalTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorDecimalTest.java new file mode 100644 index 0000000000000..a3a0b4818ac22 --- /dev/null +++ b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorDecimalTest.java @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.gandiva.evaluator; + + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.List; + +import org.apache.arrow.gandiva.exceptions.GandivaException; +import org.apache.arrow.gandiva.expression.ExpressionTree; +import org.apache.arrow.gandiva.expression.TreeBuilder; +import org.apache.arrow.gandiva.expression.TreeNode; +import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.Test; + +import com.google.common.collect.Lists; + +public class ProjectorDecimalTest extends org.apache.arrow.gandiva.evaluator.BaseEvaluatorTest { + + @Test + public void test_add() throws GandivaException { + int precision = 38; + int scale = 8; + ArrowType.Decimal decimal = new ArrowType.Decimal(precision, scale); + Field a = Field.nullable("a", decimal); + Field b = Field.nullable("b", decimal); + List args = Lists.newArrayList(a, b); + + ArrowType.Decimal outputType = DecimalTypeUtil.getResultTypeForOperation(DecimalTypeUtil + .OperationType.ADD, decimal, decimal); + Field retType = Field.nullable("c", outputType); + ExpressionTree root = TreeBuilder.makeExpression("add", args, retType); + + List exprs = Lists.newArrayList(root); + + Schema schema = new Schema(args); + Projector eval = Projector.make(schema, exprs); + + int numRows = 4; + byte[] validity = new byte[]{(byte) 255}; + String[] aValues = new String[]{"1.12345678","2.12345678","3.12345678","4.12345678"}; + String[] bValues = new String[]{"2.12345678","3.12345678","4.12345678","5.12345678"}; + + DecimalVector valuesa = decimalVector(aValues, precision, scale); + DecimalVector valuesb = decimalVector(bValues, precision, scale); + ArrowRecordBatch batch = + new ArrowRecordBatch( + numRows, + Lists.newArrayList(new ArrowFieldNode(numRows, 0), new ArrowFieldNode(numRows, 0)), + Lists.newArrayList(valuesa.getValidityBuffer(), valuesa.getDataBuffer(), + valuesb.getValidityBuffer(), valuesb.getDataBuffer())); + + DecimalVector outVector = new DecimalVector("decimal_output", allocator, outputType.getPrecision(), + outputType.getScale()); + outVector.allocateNew(numRows); + + List output = new ArrayList(); + output.add(outVector); + eval.evaluate(batch, output); + + // should have scaled down. 
+ BigDecimal[] expOutput = new BigDecimal[]{BigDecimal.valueOf(3.2469136), + BigDecimal.valueOf(5.2469136), + BigDecimal.valueOf(7.2469136), + BigDecimal.valueOf(9.2469136)}; + + for (int i = 0; i < 4; i++) { + assertFalse(outVector.isNull(i)); + assertTrue("index : " + i + " failed compare", expOutput[i].compareTo(outVector.getObject(i) + ) == 0); + } + + // free buffers + releaseRecordBatch(batch); + releaseValueVectors(output); + eval.close(); + } + + @Test + public void test_add_literal() throws GandivaException { + int precision = 2; + int scale = 0; + ArrowType.Decimal decimal = new ArrowType.Decimal(precision, scale); + ArrowType.Decimal literalType = new ArrowType.Decimal(2, 1); + Field a = Field.nullable("a", decimal); + + ArrowType.Decimal outputType = DecimalTypeUtil.getResultTypeForOperation(DecimalTypeUtil + .OperationType.ADD, decimal, literalType); + Field retType = Field.nullable("c", outputType); + TreeNode field = TreeBuilder.makeField(a); + TreeNode literal = TreeBuilder.makeDecimalLiteral("6", 2, 1); + List args = Lists.newArrayList(field, literal); + TreeNode root = TreeBuilder.makeFunction("add", args, outputType); + ExpressionTree tree = TreeBuilder.makeExpression(root, retType); + + List exprs = Lists.newArrayList(tree); + + Schema schema = new Schema(Lists.newArrayList(a)); + Projector eval = Projector.make(schema, exprs); + + int numRows = 4; + String[] aValues = new String[]{"1", "2", "3", "4"}; + + DecimalVector valuesa = decimalVector(aValues, precision, scale); + ArrowRecordBatch batch = + new ArrowRecordBatch( + numRows, + Lists.newArrayList(new ArrowFieldNode(numRows, 0)), + Lists.newArrayList(valuesa.getValidityBuffer(), valuesa.getDataBuffer())); + + DecimalVector outVector = new DecimalVector("decimal_output", allocator, outputType.getPrecision(), + outputType.getScale()); + outVector.allocateNew(numRows); + + List output = new ArrayList(); + output.add(outVector); + eval.evaluate(batch, output); + + BigDecimal[] expOutput = new BigDecimal[]{BigDecimal.valueOf(1.6), BigDecimal.valueOf(2.6), + BigDecimal.valueOf(3.6), BigDecimal.valueOf(4.6)}; + + for (int i = 0; i < 4; i++) { + assertFalse(outVector.isNull(i)); + assertTrue(expOutput[i].compareTo(outVector.getObject(i)) == 0); + } + + // free buffers + releaseRecordBatch(batch); + releaseValueVectors(output); + eval.close(); + } +} diff --git a/python/pyarrow/gandiva.pyx b/python/pyarrow/gandiva.pyx index 76e55d6ba27ef..715ff9dcfb384 100644 --- a/python/pyarrow/gandiva.pyx +++ b/python/pyarrow/gandiva.pyx @@ -19,6 +19,8 @@ # distutils: language = c++ # cython: embedsignature = True +import os + from libcpp cimport bool as c_bool, nullptr from libcpp.memory cimport shared_ptr, unique_ptr, make_shared from libcpp.string cimport string as c_string @@ -73,6 +75,14 @@ from pyarrow.includes.libgandiva cimport ( CFunctionSignature, GetRegisteredFunctionSignatures) +if os.name == 'posix': + # Expose self with RTLD_GLOBAL so that symbols from gandiva.so and child + # libs (such as libstdc++) can be reached during JIT code execution. + # Another workaround is to use + # sys.setdlopenflags(os.RTLD_GLOBAL | os.RTLD_NOW) + # but it would affect all C extensions loaded in the process. 
+ import ctypes + _dll = ctypes.CDLL(__file__, ctypes.RTLD_GLOBAL) cdef class Node: cdef: From ccec63847e7709317a18036931ef3e3fbeab1f05 Mon Sep 17 00:00:00 2001 From: "Korn, Uwe" Date: Tue, 8 Jan 2019 10:14:53 -0600 Subject: [PATCH 175/328] ARROW-4191: [C++] Use same CC and AR for jemalloc as for the main sources Author: Korn, Uwe Closes #3347 from xhochy/ARROW-4191 and squashes the following commits: 44df02a23 ARROW-4191: Use same CC and AR for jemalloc as for the main sources --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index d8b34862eeaab..5a8c28feab4e8 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -772,7 +772,7 @@ if (ARROW_JEMALLOC) ExternalProject_Add(jemalloc_ep URL ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/jemalloc/${JEMALLOC_VERSION}.tar.gz PATCH_COMMAND touch doc/jemalloc.3 doc/jemalloc.html - CONFIGURE_COMMAND ./autogen.sh "--prefix=${JEMALLOC_PREFIX}" "--with-jemalloc-prefix=je_arrow_" "--with-private-namespace=je_arrow_private_" "--disable-tls" + CONFIGURE_COMMAND ./autogen.sh "AR=${CMAKE_AR}" "CC=${CMAKE_C_COMPILER}" "--prefix=${JEMALLOC_PREFIX}" "--with-jemalloc-prefix=je_arrow_" "--with-private-namespace=je_arrow_private_" "--disable-tls" ${EP_LOG_OPTIONS} BUILD_IN_SOURCE 1 BUILD_COMMAND ${MAKE} ${MAKE_BUILD_ARGS} From 326015cfc66e1f657cdd6811620137e9e277b43d Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 8 Jan 2019 10:17:54 -0600 Subject: [PATCH 176/328] ARROW-4186: [C++] BitmapWriter shouldn't clobber data when length == 0 Author: Antoine Pitrou Closes #3348 from pitrou/ARROW-4186-bitmap-writer-zero-length and squashes the following commits: 2299b0906 ARROW-4186: BitmapWriter shouldn't clobber data when length == 0 --- cpp/src/arrow/util/bit-util-test.cc | 79 ++++++++++++++++++----------- cpp/src/arrow/util/bit-util.h | 4 +- 2 files changed, 50 insertions(+), 33 deletions(-) diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc index b12e2ecf9eef9..174e6d0f05235 100644 --- a/cpp/src/arrow/util/bit-util-test.cc +++ b/cpp/src/arrow/util/bit-util-test.cc @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -167,33 +166,40 @@ TEST(BitmapReader, DoesNotReadOutOfBounds) { } TEST(BitmapWriter, NormalOperation) { - { - uint8_t bitmap[] = {0, 0, 0, 0}; - auto writer = internal::BitmapWriter(bitmap, 0, 12); - WriteVectorToWriter(writer, {0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1}); - // {0b00110110, 0b1010, 0, 0} - ASSERT_BYTES_EQ(bitmap, {0x36, 0x0a, 0, 0}); - } - { - uint8_t bitmap[] = {0xff, 0xff, 0xff, 0xff}; - auto writer = internal::BitmapWriter(bitmap, 0, 12); - WriteVectorToWriter(writer, {0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1}); - // {0b00110110, 0b11111010, 0xff, 0xff} - ASSERT_BYTES_EQ(bitmap, {0x36, 0xfa, 0xff, 0xff}); - } - { - uint8_t bitmap[] = {0, 0, 0, 0}; - auto writer = internal::BitmapWriter(bitmap, 3, 12); - WriteVectorToWriter(writer, {0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1}); - // {0b10110000, 0b01010001, 0, 0} - ASSERT_BYTES_EQ(bitmap, {0xb0, 0x51, 0, 0}); - } - { - uint8_t bitmap[] = {0, 0, 0, 0}; - auto writer = internal::BitmapWriter(bitmap, 20, 12); - WriteVectorToWriter(writer, {0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1}); - // {0, 0, 0b01100000, 0b10100011} - ASSERT_BYTES_EQ(bitmap, {0, 0, 0x60, 0xa3}); + for (const auto fill_byte_int : {0x00, 0xff}) { + const uint8_t fill_byte = 
static_cast(fill_byte_int); + { + uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte}; + auto writer = internal::BitmapWriter(bitmap, 0, 12); + WriteVectorToWriter(writer, {0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1}); + // {0b00110110, 0b....1010, ........, ........} + ASSERT_BYTES_EQ(bitmap, {0x36, static_cast(0x0a | (fill_byte & 0xf0)), + fill_byte, fill_byte}); + } + { + uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte}; + auto writer = internal::BitmapWriter(bitmap, 3, 12); + WriteVectorToWriter(writer, {0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1}); + // {0b10110..., 0b.1010001, ........, ........} + ASSERT_BYTES_EQ(bitmap, {static_cast(0xb0 | (fill_byte & 0x07)), + static_cast(0x51 | (fill_byte & 0x80)), fill_byte, + fill_byte}); + } + { + uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte}; + auto writer = internal::BitmapWriter(bitmap, 20, 12); + WriteVectorToWriter(writer, {0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1}); + // {........, ........, 0b0110...., 0b10100011} + ASSERT_BYTES_EQ(bitmap, {fill_byte, fill_byte, + static_cast(0x60 | (fill_byte & 0x0f)), 0xa3}); + } + // 0-length writes + for (int64_t pos = 0; pos < 32; ++pos) { + uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte}; + auto writer = internal::BitmapWriter(bitmap, pos, 0); + WriteVectorToWriter(writer, {}); + ASSERT_BYTES_EQ(bitmap, {fill_byte, fill_byte, fill_byte, fill_byte}); + } } } @@ -266,6 +272,10 @@ TEST(FirstTimeBitmapWriter, NormalOperation) { } { uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte}; + { + auto writer = internal::FirstTimeBitmapWriter(bitmap, 4, 0); + WriteVectorToWriter(writer, {}); + } { auto writer = internal::FirstTimeBitmapWriter(bitmap, 4, 6); WriteVectorToWriter(writer, {0, 1, 1, 0, 1, 1}); @@ -274,6 +284,10 @@ TEST(FirstTimeBitmapWriter, NormalOperation) { auto writer = internal::FirstTimeBitmapWriter(bitmap, 10, 3); WriteVectorToWriter(writer, {0, 0, 0}); } + { + auto writer = internal::FirstTimeBitmapWriter(bitmap, 13, 0); + WriteVectorToWriter(writer, {}); + } { auto writer = internal::FirstTimeBitmapWriter(bitmap, 13, 3); WriteVectorToWriter(writer, {1, 0, 1}); @@ -319,8 +333,8 @@ TYPED_TEST(TestGenerateBits, NormalOperation) { for (const int64_t start_offset : start_offsets) { for (const int64_t length : lengths) { for (const uint8_t fill_byte : fill_bytes) { - uint8_t bitmap[kSourceSize]; - memset(bitmap, fill_byte, kSourceSize); + uint8_t bitmap[kSourceSize + 1]; + memset(bitmap, fill_byte, kSourceSize + 1); // First call GenerateBits { int64_t ncalled = 0; @@ -344,7 +358,7 @@ TYPED_TEST(TestGenerateBits, NormalOperation) { result_reader.Next(); } } - // Check bits preceding and following generated contents weren't clobbered + // Check bits preceding generated contents weren't clobbered { internal::BitmapReader reader_before(bitmap, 0, start_offset); for (int64_t i = 0; i < start_offset; ++i) { @@ -352,6 +366,9 @@ TYPED_TEST(TestGenerateBits, NormalOperation) { << "mismatch at preceding bit #" << start_offset - i; } } + // Check the byte following generated contents wasn't clobbered + auto byte_after = bitmap[BitUtil::CeilDiv(start_offset + length, 8)]; + ASSERT_EQ(byte_after, fill_byte); } } } diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 93b6cb28d91b1..415684e449287 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -409,7 +409,7 @@ class BitmapWriter { void Finish() { // Store current byte if we didn't went past bitmap storage - if (bit_mask_ != 0x01 || position_ < 
length_) { + if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) { bitmap_[byte_offset_] = current_byte_; } } @@ -461,7 +461,7 @@ class FirstTimeBitmapWriter { void Finish() { // Store current byte if we didn't went past bitmap storage - if (bit_mask_ != 0x01 || position_ < length_) { + if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) { bitmap_[byte_offset_] = current_byte_; } } From ac45f3210a194049ef35f49847dbc4ff5e70d48f Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Tue, 8 Jan 2019 16:49:12 -0700 Subject: [PATCH 177/328] ARROW-3839: [Rust] Add ability to infer schema in CSV reader Resubmission of #3128 Author: Neville Dipale Closes #3349 from nevi-me/rust/infer-csv-schema and squashes the following commits: 0838199 ARROW-3839: Add ability to infer schema in CSV reader --- ci/rust-build-main.bat | 1 + ci/travis_script_rust.sh | 1 + rust/arrow/Cargo.toml | 2 + rust/arrow/examples/read_csv_infer_schema.rs | 66 ++++ rust/arrow/src/csv/mod.rs | 1 + rust/arrow/src/csv/reader.rs | 373 +++++++++++++++++- rust/arrow/src/datatypes.rs | 4 +- rust/arrow/src/error.rs | 37 ++ .../test/data/uk_cities_with_headers.csv | 38 ++ rust/arrow/test/data/various_types.csv | 6 + 10 files changed, 524 insertions(+), 5 deletions(-) create mode 100644 rust/arrow/examples/read_csv_infer_schema.rs create mode 100644 rust/arrow/test/data/uk_cities_with_headers.csv create mode 100644 rust/arrow/test/data/various_types.csv diff --git a/ci/rust-build-main.bat b/ci/rust-build-main.bat index ac5c9e7589245..b36a97acf51ac 100644 --- a/ci/rust-build-main.bat +++ b/ci/rust-build-main.bat @@ -40,5 +40,6 @@ cd arrow cargo run --example builders --target %TARGET% --release || exit /B cargo run --example dynamic_types --target %TARGET% --release || exit /B cargo run --example read_csv --target %TARGET% --release || exit /B +cargo run --example read_csv_infer_schema --target %TARGET% --release || exit /B popd diff --git a/ci/travis_script_rust.sh b/ci/travis_script_rust.sh index 8e3c8c3906b24..c25d64ec42cb6 100755 --- a/ci/travis_script_rust.sh +++ b/ci/travis_script_rust.sh @@ -39,5 +39,6 @@ cd arrow cargo run --example builders cargo run --example dynamic_types cargo run --example read_csv +cargo run --example read_csv_infer_schema popd diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml index 77e8d53fa55b5..38e7e5e0ec06e 100644 --- a/rust/arrow/Cargo.toml +++ b/rust/arrow/Cargo.toml @@ -43,6 +43,8 @@ serde_json = "1.0.13" rand = "0.5" csv = "1.0.0" num = "0.2" +regex = "1.1" +lazy_static = "1.2" [dev-dependencies] criterion = "0.2" diff --git a/rust/arrow/examples/read_csv_infer_schema.rs b/rust/arrow/examples/read_csv_infer_schema.rs new file mode 100644 index 0000000000000..9dd2d2aaf2cc2 --- /dev/null +++ b/rust/arrow/examples/read_csv_infer_schema.rs @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate arrow; + +use arrow::array::{BinaryArray, Float64Array}; +use arrow::csv; +use std::fs::File; + +fn main() { + let file = File::open("test/data/uk_cities_with_headers.csv").unwrap(); + let builder = csv::ReaderBuilder::new() + .has_headers(true) + .infer_schema(Some(100)); + let mut csv = builder.build(file).unwrap(); + let batch = csv.next().unwrap().unwrap(); + + println!( + "Loaded {} rows containing {} columns", + batch.num_rows(), + batch.num_columns() + ); + + println!("Inferred schema: {:?}", batch.schema()); + + let city = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let lat = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + let lng = batch + .column(2) + .as_any() + .downcast_ref::() + .unwrap(); + + for i in 0..batch.num_rows() { + let city_name: String = String::from_utf8(city.value(i).to_vec()).unwrap(); + + println!( + "City: {}, Latitude: {}, Longitude: {}", + city_name, + lat.value(i), + lng.value(i) + ); + } +} diff --git a/rust/arrow/src/csv/mod.rs b/rust/arrow/src/csv/mod.rs index 9f2bd1db69db2..6521b196d1e12 100644 --- a/rust/arrow/src/csv/mod.rs +++ b/rust/arrow/src/csv/mod.rs @@ -18,3 +18,4 @@ pub mod reader; pub use self::reader::Reader; +pub use self::reader::ReaderBuilder; diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs index 57c7dde1b250d..49e0302aa0672 100644 --- a/rust/arrow/src/csv/reader.rs +++ b/rust/arrow/src/csv/reader.rs @@ -40,8 +40,11 @@ //! let batch = csv.next().unwrap().unwrap(); //! ``` +use lazy_static::lazy_static; +use regex::{Regex, RegexBuilder}; +use std::collections::HashSet; use std::fs::File; -use std::io::BufReader; +use std::io::{BufReader, Seek, SeekFrom}; use std::sync::Arc; use csv as csv_crate; @@ -54,6 +57,130 @@ use crate::record_batch::RecordBatch; use self::csv_crate::{StringRecord, StringRecordsIntoIter}; +lazy_static! { + static ref DECIMAL_RE: Regex = Regex::new(r"^-?(\d+\.\d+)$").unwrap(); + static ref INTEGER_RE: Regex = Regex::new(r"^-?(\d*.)$").unwrap(); + static ref BOOLEAN_RE: Regex = RegexBuilder::new(r"^(true)$|^(false)$") + .case_insensitive(true) + .build() + .unwrap(); +} + +/// Infer the data type of a record +fn infer_field_schema(string: &str) -> DataType { + // when quoting is enabled in the reader, these quotes aren't escaped, we default to Utf8 for them + if string.starts_with("\"") { + return DataType::Utf8; + } + // match regex in a particular order + if BOOLEAN_RE.is_match(string) { + return DataType::Boolean; + } else if DECIMAL_RE.is_match(string) { + return DataType::Float64; + } else if INTEGER_RE.is_match(string) { + return DataType::Int64; + } else { + return DataType::Utf8; + } +} + +/// Infer the schema of a CSV file by reading through the first n records of the file, +/// with `max_read_records` controlling the maximum number of records to read. +/// +/// If `max_read_records` is not set, the whole file is read to infer its schema. 
+fn infer_file_schema( + mut file: File, + delimiter: u8, + max_read_records: Option, + has_headers: bool, +) -> Result { + let mut csv_reader = csv::ReaderBuilder::new() + .delimiter(delimiter) + .from_reader(BufReader::new(file.try_clone()?)); + + // get or create header names + // when has_headers is false, creates default column names with column_ prefix + let headers: Vec = if has_headers { + let headers = &csv_reader.headers()?.clone(); + headers.iter().map(|s| s.to_string()).collect() + } else { + let first_record_count = &csv_reader.headers()?.len(); + (0..*first_record_count) + .map(|i| format!("column_{}", i + 1)) + .into_iter() + .collect() + }; + + // save the csv reader position after reading headers + let position = csv_reader.position().clone(); + + let header_length = headers.len(); + // keep track of inferred field types + let mut column_types: Vec> = vec![HashSet::new(); header_length]; + // keep track of columns with nulls + let mut nulls: Vec = vec![false; header_length]; + + // return csv reader position to after headers + csv_reader.seek(position)?; + + let mut fields = vec![]; + + for result in csv_reader + .into_records() + .take(max_read_records.unwrap_or(std::usize::MAX)) + { + let record = result?; + + for i in 0..header_length { + let string: Option<&str> = record.get(i); + match string { + Some(s) => { + if s == "" { + nulls[i] = true; + } else { + column_types[i].insert(infer_field_schema(s)); + } + } + _ => {} + } + } + } + + // build schema from inference results + for i in 0..header_length { + let possibilities = &column_types[i]; + let has_nulls = nulls[i]; + let field_name = &headers[i]; + + // determine data type based on possible types + // if there are incompatible types, use DataType::Utf8 + match possibilities.len() { + 1 => { + for dtype in possibilities.iter() { + fields.push(Field::new(&field_name, dtype.clone(), has_nulls)); + } + } + 2 => { + if possibilities.contains(&DataType::Int64) + && possibilities.contains(&DataType::Float64) + { + // we have an integer and double, fall down to double + fields.push(Field::new(&field_name, DataType::Float64, has_nulls)); + } else { + // default to Utf8 for conflicting datatypes (e.g bool and int) + fields.push(Field::new(&field_name, DataType::Utf8, has_nulls)); + } + } + _ => fields.push(Field::new(&field_name, DataType::Utf8, has_nulls)), + } + } + + // return the file seek back to the start + file.seek(SeekFrom::Start(0))?; + + Ok(Schema::new(fields)) +} + /// CSV file reader pub struct Reader { /// Explicit schema for the CSV file @@ -68,6 +195,8 @@ pub struct Reader { impl Reader { /// Create a new CsvReader + /// + /// To customise the Reader, such as to enable schema inference, use `ReaderBuilder` pub fn new( file: File, schema: Arc, @@ -78,10 +207,9 @@ impl Reader { let csv_reader = csv::ReaderBuilder::new() .has_headers(has_headers) .from_reader(BufReader::new(file)); - let record_iter = csv_reader.into_records(); Reader { - schema: schema.clone(), + schema, projection, record_iter, batch_size, @@ -194,6 +322,141 @@ impl Reader { } } +/// CSV file reader builder +pub struct ReaderBuilder { + /// Optional schema for the CSV file + /// + /// If the schema is not supplied, the reader will try to infer the schema + /// based on the CSV structure. + schema: Option>, + /// Whether the file has headers or not + /// + /// If schema inference is run on a file with no headers, default column names + /// are created. + has_headers: bool, + /// An optional column delimiter. 
Defauits to `b','` + delimiter: Option, + /// Optional maximum number of records to read during schema inference + /// + /// If a number is not provided, all the records are read. + max_records: Option, + /// Batch size (number of records to load each time) + /// + /// The default batch size when using the `ReaderBuilder` is 1024 records + batch_size: usize, + /// Optional projection for which columns to load (zero-based column indices) + projection: Option>, +} + +impl Default for ReaderBuilder { + fn default() -> ReaderBuilder { + ReaderBuilder { + schema: None, + has_headers: false, + delimiter: None, + max_records: None, + batch_size: 1024, + projection: None, + } + } +} + +impl ReaderBuilder { + /// Create a new builder for configuring CSV parsing options. + /// + /// To convert a builder into a reader, call `Reader::from_builder` + /// + /// # Example + /// + /// ``` + /// extern crate arrow; + /// + /// use arrow::csv; + /// use std::fs::File; + /// + /// fn example() -> csv::Reader { + /// let file = File::open("test/data/uk_cities_with_headers.csv").unwrap(); + /// + /// // create a builder, inferring the schema with the first 100 records + /// let builder = csv::ReaderBuilder::new().infer_schema(Some(100)); + /// + /// let reader = builder.build(file).unwrap(); + /// + /// reader + /// } + /// ``` + pub fn new() -> ReaderBuilder { + ReaderBuilder::default() + } + + /// Set the CSV file's schema + pub fn with_schema(mut self, schema: Arc) -> Self { + self.schema = Some(schema); + self + } + + /// Set whether the CSV file has headers + pub fn has_headers(mut self, has_headers: bool) -> Self { + self.has_headers = has_headers; + self + } + + /// Set the CSV file's column delimiter as a byte character + pub fn with_delimiter(mut self, delimiter: u8) -> Self { + self.delimiter = Some(delimiter); + self + } + + /// Set the CSV reader to infer the schema of the file + pub fn infer_schema(mut self, max_records: Option) -> Self { + // remove any schema that is set + self.schema = None; + self.max_records = max_records; + self + } + + /// Set the batch size (number of records to load at one time) + pub fn with_batch_size(mut self, batch_size: usize) -> Self { + self.batch_size = batch_size; + self + } + + /// Set the reader's column projection + pub fn with_projection(mut self, projection: Vec) -> Self { + self.projection = Some(projection); + self + } + + /// Create a new `Reader` from the `ReaderBuilder` + pub fn build(self, file: File) -> Result { + // check if schema should be inferred + let schema = match self.schema { + Some(schema) => schema, + None => { + let inferred_schema = infer_file_schema( + file.try_clone().unwrap(), + self.delimiter.unwrap_or(b','), + self.max_records, + self.has_headers, + )?; + + Arc::new(inferred_schema) + } + }; + let csv_reader = csv::ReaderBuilder::new() + .delimiter(self.delimiter.unwrap_or(b',')) + .has_headers(self.has_headers) + .from_reader(BufReader::new(file)); + let record_iter = csv_reader.into_records(); + Ok(Reader { + schema, + projection: self.projection.clone(), + record_iter, + batch_size: self.batch_size, + }) + } +} + #[cfg(test)] mod tests { use super::*; @@ -236,6 +499,75 @@ mod tests { assert_eq!("Aberdeen, Aberdeen City, UK", city_name); } + #[test] + fn test_csv_with_schema_inference() { + let file = File::open("test/data/uk_cities_with_headers.csv").unwrap(); + + let builder = ReaderBuilder::new().has_headers(true).infer_schema(None); + + let mut csv = builder.build(file).unwrap(); + let batch = csv.next().unwrap().unwrap(); + 
assert_eq!(37, batch.num_rows()); + assert_eq!(3, batch.num_columns()); + + // access data from a primitive array + let lat = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(57.653484, lat.value(0)); + + // access data from a string array (ListArray) + let city = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + let city_name: String = String::from_utf8(city.value(13).to_vec()).unwrap(); + + assert_eq!("Aberdeen, Aberdeen City, UK", city_name); + } + + #[test] + fn test_csv_with_schema_inference_no_headers() { + let file = File::open("test/data/uk_cities.csv").unwrap(); + + let builder = ReaderBuilder::new().infer_schema(None); + + let mut csv = builder.build(file).unwrap(); + let batch = csv.next().unwrap().unwrap(); + + // csv field names should be 'column_{number}' + let schema = batch.schema(); + assert_eq!("column_1", schema.field(0).name()); + assert_eq!("column_2", schema.field(1).name()); + assert_eq!("column_3", schema.field(2).name()); + + assert_eq!(37, batch.num_rows()); + assert_eq!(3, batch.num_columns()); + + // access data from a primitive array + let lat = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(57.653484, lat.value(0)); + + // access data from a string array (ListArray) + let city = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + let city_name: String = String::from_utf8(city.value(13).to_vec()).unwrap(); + + assert_eq!("Aberdeen, Aberdeen City, UK", city_name); + } + #[test] fn test_csv_with_projection() { let schema = Schema::new(vec![ @@ -272,4 +604,39 @@ mod tests { assert_eq!(false, batch.column(1).is_null(4)); } + #[test] + fn test_nulls_with_inference() { + let file = File::open("test/data/various_types.csv").unwrap(); + + let builder = ReaderBuilder::new() + .infer_schema(None) + .has_headers(true) + .with_delimiter(b'|') + .with_batch_size(512) + .with_projection(vec![0, 1, 2, 3]); + + let mut csv = builder.build(file).unwrap(); + let batch = csv.next().unwrap().unwrap(); + + assert_eq!(5, batch.num_rows()); + assert_eq!(4, batch.num_columns()); + + let schema = batch.schema(); + + assert_eq!(&DataType::Int64, schema.field(0).data_type()); + assert_eq!(&DataType::Float64, schema.field(1).data_type()); + assert_eq!(&DataType::Float64, schema.field(2).data_type()); + assert_eq!(&DataType::Boolean, schema.field(3).data_type()); + + assert_eq!(false, schema.field(0).is_nullable()); + assert_eq!(true, schema.field(1).is_nullable()); + assert_eq!(true, schema.field(2).is_nullable()); + assert_eq!(false, schema.field(3).is_nullable()); + + assert_eq!(false, batch.column(1).is_null(0)); + assert_eq!(false, batch.column(1).is_null(1)); + assert_eq!(true, batch.column(1).is_null(2)); + assert_eq!(false, batch.column(1).is_null(3)); + assert_eq!(false, batch.column(1).is_null(4)); + } } diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index 0627b4523a1ce..05db6ce7d40b9 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -42,7 +42,7 @@ use crate::error::{ArrowError, Result}; /// Nested types can themselves be nested within other arrays. /// For more information on these types please see /// [here](https://arrow.apache.org/docs/memory_layout.html). -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash)] pub enum DataType { Boolean, Int8, @@ -64,7 +64,7 @@ pub enum DataType { /// Contains the meta-data for a single relative type. 
/// /// The `Schema` object is an ordered collection of `Field` objects. -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash)] pub struct Field { name: String, data_type: DataType, diff --git a/rust/arrow/src/error.rs b/rust/arrow/src/error.rs index 559b2d7205994..b75111fd4e883 100644 --- a/rust/arrow/src/error.rs +++ b/rust/arrow/src/error.rs @@ -15,12 +15,49 @@ // specific language governing permissions and limitations // under the License. +use std::error::Error; + +use csv as csv_crate; + #[derive(Debug, Clone, PartialEq)] pub enum ArrowError { MemoryError(String), ParseError(String), ComputeError(String), DivideByZero, + CsvError(String), + IoError(String), +} + +impl From<::std::io::Error> for ArrowError { + fn from(error: ::std::io::Error) -> Self { + ArrowError::IoError(error.description().to_string()) + } +} + +impl From for ArrowError { + fn from(error: csv_crate::Error) -> Self { + match error.kind() { + csv_crate::ErrorKind::Io(error) => { + ArrowError::CsvError(error.description().to_string()) + } + csv_crate::ErrorKind::Utf8 {pos: _, err} => { + ArrowError::CsvError(format!("Encountered UTF-8 error while reading CSV file: {:?}", err.description())) + } + csv_crate::ErrorKind::UnequalLengths {pos: _, expected_len, len} => { + ArrowError::CsvError( + format!( + "Encountered unequal lengths between records on CSV file. Expected {} records, found {} records", + len, + expected_len + ) + ) + } + _ => { + ArrowError::CsvError("Error reading CSV file".to_string()) + } + } + } } pub type Result = ::std::result::Result; diff --git a/rust/arrow/test/data/uk_cities_with_headers.csv b/rust/arrow/test/data/uk_cities_with_headers.csv new file mode 100644 index 0000000000000..92f5a17bdda38 --- /dev/null +++ b/rust/arrow/test/data/uk_cities_with_headers.csv @@ -0,0 +1,38 @@ +city,lat,lng +"Elgin, Scotland, the UK",57.653484,-3.335724 +"Stoke-on-Trent, Staffordshire, the UK",53.002666,-2.179404 +"Solihull, Birmingham, UK",52.412811,-1.778197 +"Cardiff, Cardiff county, UK",51.481583,-3.179090 +"Eastbourne, East Sussex, UK",50.768036,0.290472 +"Oxford, Oxfordshire, UK",51.752022,-1.257677 +"London, UK",51.509865,-0.118092 +"Swindon, Swindon, UK",51.568535,-1.772232 +"Gravesend, Kent, UK",51.441883,0.370759 +"Northampton, Northamptonshire, UK",52.240479,-0.902656 +"Rugby, Warwickshire, UK",52.370876,-1.265032 +"Sutton Coldfield, West Midlands, UK",52.570385,-1.824042 +"Harlow, Essex, UK",51.772938,0.102310 +"Aberdeen, Aberdeen City, UK",57.149651,-2.099075 +"Swansea, Swansea, UK",51.621441,-3.943646 +"Chesterfield, Derbyshire, UK",53.235046,-1.421629 +"Londonderry, Derry, UK",55.006763,-7.318268 +"Salisbury, Wiltshire, UK",51.068787,-1.794472 +"Weymouth, Dorset, UK",50.614429,-2.457621 +"Wolverhampton, West Midlands, UK",52.591370,-2.110748 +"Preston, Lancashire, UK",53.765762,-2.692337 +"Bournemouth, UK",50.720806,-1.904755 +"Doncaster, South Yorkshire, UK",53.522820,-1.128462 +"Ayr, South Ayrshire, UK",55.458565,-4.629179 +"Hastings, East Sussex, UK",50.854259,0.573453 +"Bedford, UK",52.136436,-0.460739 +"Basildon, Essex, UK",51.572376,0.470009 +"Chippenham, Wiltshire, UK",51.458057,-2.116074 +"Belfast, UK",54.607868,-5.926437 +"Uckfield, East Sussex, UK",50.967941,0.085831 +"Worthing, West Sussex, UK",50.825024,-0.383835 +"Leeds, West Yorkshire, UK",53.801277,-1.548567 +"Kendal, Cumbria, UK",54.328506,-2.743870 +"Plymouth, UK",50.376289,-4.143841 +"Haverhill, Suffolk, UK",52.080875,0.444517 +"Frankton, 
Warwickshire, UK",52.328415,-1.377561 +"Inverness, the UK",57.477772,-4.224721 \ No newline at end of file diff --git a/rust/arrow/test/data/various_types.csv b/rust/arrow/test/data/various_types.csv new file mode 100644 index 0000000000000..322d9c347aaa6 --- /dev/null +++ b/rust/arrow/test/data/various_types.csv @@ -0,0 +1,6 @@ +c_int|c_float|c_string|c_bool +1|1.1|"1.11"|true +2|2.2|"2.22"|true +3||"3.33"|true +4|4.4||false +5|6.6|""|false \ No newline at end of file From bcca04aabd804263c555945463f5cf4a2ab6216f Mon Sep 17 00:00:00 2001 From: Chao Sun Date: Tue, 8 Jan 2019 16:56:31 -0700 Subject: [PATCH 178/328] ARROW-4172: [Rust] more consistent naming in array builders This is to make the namings in `builder.rs` more consistent: 1. Changes `PrimitiveArrayBuilder` to `PrimitiveBuilder`, similarly for `ListArrayBuilder`, `BinaryArrayBuilder` and `StructArrayBuilder`. The `Array` seems redundant. 2. Currently we use both `push` and `append`, which is a bit confusing. This unifies them by using `append`. Author: Chao Sun Closes #3345 from sunchao/ARROW-4172 and squashes the following commits: 3472d12 ARROW-4172: more consistent naming in array builders --- rust/arrow/examples/builders.rs | 12 +- rust/arrow/src/array.rs | 4 +- rust/arrow/src/array_ops.rs | 22 +- rust/arrow/src/builder.rs | 368 ++++++++++++++++---------------- rust/arrow/src/csv/reader.rs | 10 +- rust/arrow/src/tensor.rs | 12 +- 6 files changed, 214 insertions(+), 214 deletions(-) diff --git a/rust/arrow/examples/builders.rs b/rust/arrow/examples/builders.rs index 92f45ce67d981..f9ba2974ef7c8 100644 --- a/rust/arrow/examples/builders.rs +++ b/rust/arrow/examples/builders.rs @@ -29,14 +29,14 @@ fn main() { // Create a new builder with a capacity of 100 let mut primitive_array_builder = Int32Builder::new(100); - // Push an individual primitive value - primitive_array_builder.push(55).unwrap(); + // Append an individual primitive value + primitive_array_builder.append_value(55).unwrap(); - // Push a null value - primitive_array_builder.push_null().unwrap(); + // Append a null value + primitive_array_builder.append_null().unwrap(); - // Push a slice of primitive values - primitive_array_builder.push_slice(&[39, 89, 12]).unwrap(); + // Append a slice of primitive values + primitive_array_builder.append_slice(&[39, 89, 12]).unwrap(); // Build the `PrimitiveArray` let _primitive_array = primitive_array_builder.finish(); diff --git a/rust/arrow/src/array.rs b/rust/arrow/src/array.rs index f8272eb007db6..78910d55cd687 100644 --- a/rust/arrow/src/array.rs +++ b/rust/arrow/src/array.rs @@ -201,8 +201,8 @@ impl PrimitiveArray { } // Returns a new primitive array builder - pub fn builder(capacity: usize) -> PrimitiveArrayBuilder { - PrimitiveArrayBuilder::::new(capacity) + pub fn builder(capacity: usize) -> PrimitiveBuilder { + PrimitiveBuilder::::new(capacity) } } diff --git a/rust/arrow/src/array_ops.rs b/rust/arrow/src/array_ops.rs index 69637094942cf..f41740a85e0ea 100644 --- a/rust/arrow/src/array_ops.rs +++ b/rust/arrow/src/array_ops.rs @@ -22,7 +22,7 @@ use std::ops::{Add, Div, Mul, Sub}; use num::Zero; use crate::array::{Array, BooleanArray, PrimitiveArray}; -use crate::builder::PrimitiveArrayBuilder; +use crate::builder::PrimitiveBuilder; use crate::datatypes; use crate::datatypes::ArrowNumericType; use crate::error::{ArrowError, Result}; @@ -102,13 +102,13 @@ where "Cannot perform math operation on arrays of different length".to_string(), )); } - let mut b = PrimitiveArrayBuilder::::new(left.len()); + let mut b = 
PrimitiveBuilder::::new(left.len()); for i in 0..left.len() { let index = i; if left.is_null(i) || right.is_null(i) { - b.push_null()?; + b.append_null()?; } else { - b.push(op(left.value(index), right.value(index))?)?; + b.append_value(op(left.value(index), right.value(index))?)?; } } Ok(b.finish()) @@ -276,7 +276,7 @@ where } else { Some(right.value(index)) }; - b.push(op(l, r))?; + b.append_value(op(l, r))?; } Ok(b.finish()) } @@ -291,9 +291,9 @@ pub fn and(left: &BooleanArray, right: &BooleanArray) -> Result { let mut b = BooleanArray::builder(left.len()); for i in 0..left.len() { if left.is_null(i) || right.is_null(i) { - b.push_null()?; + b.append_null()?; } else { - b.push(left.value(i) && right.value(i))?; + b.append_value(left.value(i) && right.value(i))?; } } Ok(b.finish()) @@ -309,9 +309,9 @@ pub fn or(left: &BooleanArray, right: &BooleanArray) -> Result { let mut b = BooleanArray::builder(left.len()); for i in 0..left.len() { if left.is_null(i) || right.is_null(i) { - b.push_null()?; + b.append_null()?; } else { - b.push(left.value(i) || right.value(i))?; + b.append_value(left.value(i) || right.value(i))?; } } Ok(b.finish()) @@ -322,9 +322,9 @@ pub fn not(left: &BooleanArray) -> Result { let mut b = BooleanArray::builder(left.len()); for i in 0..left.len() { if left.is_null(i) { - b.push_null()?; + b.append_null()?; } else { - b.push(!left.value(i))?; + b.append_value(!left.value(i))?; } } Ok(b.finish()) diff --git a/rust/arrow/src/builder.rs b/rust/arrow/src/builder.rs index a0bb43c7dee53..2a4b702a2738a 100644 --- a/rust/arrow/src/builder.rs +++ b/rust/arrow/src/builder.rs @@ -59,8 +59,8 @@ pub trait BufferBuilderTrait { fn capacity(&self) -> usize; fn advance(&mut self, i: usize) -> Result<()>; fn reserve(&mut self, n: usize) -> Result<()>; - fn push(&mut self, v: T::Native) -> Result<()>; - fn push_slice(&mut self, slice: &[T::Native]) -> Result<()>; + fn append(&mut self, v: T::Native) -> Result<()>; + fn append_slice(&mut self, slice: &[T::Native]) -> Result<()>; fn finish(&mut self) -> Buffer; } @@ -102,14 +102,14 @@ impl BufferBuilderTrait for BufferBuilder { Ok(()) } - /// Pushes a value into the builder, growing the internal buffer as needed. - default fn push(&mut self, v: T::Native) -> Result<()> { + /// Appends a value into the builder, growing the internal buffer as needed. + default fn append(&mut self, v: T::Native) -> Result<()> { self.reserve(1)?; self.write_bytes(v.to_byte_slice(), 1) } - /// Pushes a slice of type `T`, growing the internal buffer as needed. - default fn push_slice(&mut self, slice: &[T::Native]) -> Result<()> { + /// Appends a slice of type `T`, growing the internal buffer as needed. + default fn append_slice(&mut self, slice: &[T::Native]) -> Result<()> { let array_slots = slice.len(); self.reserve(array_slots)?; self.write_bytes(slice.to_byte_slice(), array_slots) @@ -163,11 +163,11 @@ impl BufferBuilderTrait for BufferBuilder { Ok(()) } - /// Pushes a value into the builder, growing the internal buffer as needed. - fn push(&mut self, v: bool) -> Result<()> { + /// Appends a value into the builder, growing the internal buffer as needed. + fn append(&mut self, v: bool) -> Result<()> { self.reserve(1)?; if v { - // For performance the `len` of the buffer is not updated on each push but + // For performance the `len` of the buffer is not updated on each append but // is updated in the `freeze` method instead. 
unsafe { bit_util::set_bit_raw(self.buffer.raw_data() as *mut u8, self.len); @@ -177,11 +177,11 @@ impl BufferBuilderTrait for BufferBuilder { Ok(()) } - /// Pushes a slice of type `T`, growing the internal buffer as needed. - fn push_slice(&mut self, slice: &[bool]) -> Result<()> { + /// Appends a slice of type `T`, growing the internal buffer as needed. + fn append_slice(&mut self, slice: &[bool]) -> Result<()> { let array_slots = slice.len(); for i in 0..array_slots { - self.push(slice[i])?; + self.append(slice[i])?; } Ok(()) } @@ -201,7 +201,7 @@ impl BufferBuilderTrait for BufferBuilder { /// Reset this builder and returns an immutable `Buffer`. fn finish(&mut self) -> Buffer { - // `push` does not update the buffer's `len` so do it before `freeze` is called. + // `append` does not update the buffer's `len` so do it before `freeze` is called. let new_buffer_len = bit_util::ceil(self.len, 8); debug_assert!(new_buffer_len >= self.buffer.len()); let mut buf = ::std::mem::replace(&mut self.buffer, MutableBuffer::new(0)); @@ -238,24 +238,24 @@ pub trait ArrayBuilder: Any { } /// Array builder for fixed-width primitive types -pub struct PrimitiveArrayBuilder { +pub struct PrimitiveBuilder { values_builder: BufferBuilder, bitmap_builder: BooleanBufferBuilder, } -pub type BooleanBuilder = PrimitiveArrayBuilder; -pub type Int8Builder = PrimitiveArrayBuilder; -pub type Int16Builder = PrimitiveArrayBuilder; -pub type Int32Builder = PrimitiveArrayBuilder; -pub type Int64Builder = PrimitiveArrayBuilder; -pub type UInt8Builder = PrimitiveArrayBuilder; -pub type UInt16Builder = PrimitiveArrayBuilder; -pub type UInt32Builder = PrimitiveArrayBuilder; -pub type UInt64Builder = PrimitiveArrayBuilder; -pub type Float32Builder = PrimitiveArrayBuilder; -pub type Float64Builder = PrimitiveArrayBuilder; - -impl ArrayBuilder for PrimitiveArrayBuilder { +pub type BooleanBuilder = PrimitiveBuilder; +pub type Int8Builder = PrimitiveBuilder; +pub type Int16Builder = PrimitiveBuilder; +pub type Int32Builder = PrimitiveBuilder; +pub type Int64Builder = PrimitiveBuilder; +pub type UInt8Builder = PrimitiveBuilder; +pub type UInt16Builder = PrimitiveBuilder; +pub type UInt32Builder = PrimitiveBuilder; +pub type UInt64Builder = PrimitiveBuilder; +pub type Float32Builder = PrimitiveBuilder; +pub type Float64Builder = PrimitiveBuilder; + +impl ArrayBuilder for PrimitiveBuilder { /// Returns the builder as an non-mutable `Any` reference. 
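// [Editor's sketch -- illustrative only, not part of this patch] With the rename in place,
// the typed aliases above are used exactly as before; only the method names change from
// push/push_null/push_option to append_value/append_null/append_option:
//
//     let mut builder = Int32Builder::new(8);
//     builder.append_value(1).unwrap();
//     builder.append_null().unwrap();
//     builder.append_option(Some(3)).unwrap();
//     let array = builder.finish(); // yields [1, null, 3]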
fn as_any(&self) -> &Any { self @@ -282,7 +282,7 @@ impl ArrayBuilder for PrimitiveArrayBuilder { } } -impl PrimitiveArrayBuilder { +impl PrimitiveBuilder { /// Creates a new primitive array builder pub fn new(capacity: usize) -> Self { Self { @@ -296,33 +296,33 @@ impl PrimitiveArrayBuilder { self.values_builder.capacity() } - /// Pushes a value of type `T` into the builder - pub fn push(&mut self, v: T::Native) -> Result<()> { - self.bitmap_builder.push(true)?; - self.values_builder.push(v)?; + /// Appends a value of type `T` into the builder + pub fn append_value(&mut self, v: T::Native) -> Result<()> { + self.bitmap_builder.append(true)?; + self.values_builder.append(v)?; Ok(()) } - /// Pushes a null slot into the builder - pub fn push_null(&mut self) -> Result<()> { - self.bitmap_builder.push(false)?; + /// Appends a null slot into the builder + pub fn append_null(&mut self) -> Result<()> { + self.bitmap_builder.append(false)?; self.values_builder.advance(1)?; Ok(()) } - /// Pushes an `Option` into the builder - pub fn push_option(&mut self, v: Option) -> Result<()> { + /// Appends an `Option` into the builder + pub fn append_option(&mut self, v: Option) -> Result<()> { match v { - None => self.push_null()?, - Some(v) => self.push(v)?, + None => self.append_null()?, + Some(v) => self.append_value(v)?, }; Ok(()) } - /// Pushes a slice of type `T` into the builder - pub fn push_slice(&mut self, v: &[T::Native]) -> Result<()> { - self.bitmap_builder.push_slice(&vec![true; v.len()][..])?; - self.values_builder.push_slice(v)?; + /// Appends a slice of type `T` into the builder + pub fn append_slice(&mut self, v: &[T::Native]) -> Result<()> { + self.bitmap_builder.append_slice(&vec![true; v.len()][..])?; + self.values_builder.append_slice(v)?; Ok(()) } @@ -345,18 +345,18 @@ impl PrimitiveArrayBuilder { } /// Array builder for `ListArray` -pub struct ListArrayBuilder { +pub struct ListBuilder { offsets_builder: Int32BufferBuilder, bitmap_builder: BooleanBufferBuilder, values_builder: T, len: usize, } -impl ListArrayBuilder { +impl ListBuilder { /// Creates a new `ListArrayBuilder` from a given values array builder pub fn new(values_builder: T) -> Self { let mut offsets_builder = Int32BufferBuilder::new(values_builder.len() + 1); - offsets_builder.push(0).unwrap(); + offsets_builder.append(0).unwrap(); Self { offsets_builder, bitmap_builder: BooleanBufferBuilder::new(values_builder.len()), @@ -366,7 +366,7 @@ impl ListArrayBuilder { } } -impl ArrayBuilder for ListArrayBuilder +impl ArrayBuilder for ListBuilder where T: 'static, { @@ -396,13 +396,13 @@ where } } -impl ListArrayBuilder +impl ListBuilder where T: 'static, { /// Returns the child array builder as a mutable reference. /// - /// This mutable reference can be used to push values into the child array builder, + /// This mutable reference can be used to append values into the child array builder, /// but you must call `append` to delimit each distinct list value. 
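// [Editor's sketch -- illustrative only, not part of this patch] A minimal example of the
// pattern described above: append child values through `values()`, then call `append` once
// per list slot to delimit it (assuming an Int32Builder as the child builder):
//
//     let mut builder = ListBuilder::new(Int32Builder::new(16));
//     builder.values().append_value(1).unwrap();
//     builder.values().append_value(2).unwrap();
//     builder.append(true).unwrap();  // closes the list slot [1, 2]
//     builder.append(false).unwrap(); // a null list slot
//     let list_array = builder.finish();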
pub fn values(&mut self) -> &mut T { &mut self.values_builder @@ -411,8 +411,8 @@ where /// Finish the current variable-length list array slot pub fn append(&mut self, is_valid: bool) -> Result<()> { self.offsets_builder - .push(self.values_builder.len() as i32)?; - self.bitmap_builder.push(is_valid)?; + .append(self.values_builder.len() as i32)?; + self.bitmap_builder.append(is_valid)?; self.len += 1; Ok(()) } @@ -431,7 +431,7 @@ where let offset_buffer = self.offsets_builder.finish(); let null_bit_buffer = self.bitmap_builder.finish(); - self.offsets_builder.push(0).unwrap(); + self.offsets_builder.append(0).unwrap(); let data = ArrayData::builder(DataType::List(Box::new(values_data.data_type().clone()))) .len(len) .null_count(len - bit_util::count_set_bits(null_bit_buffer.data())) @@ -445,11 +445,11 @@ where } /// Array builder for `BinaryArray` -pub struct BinaryArrayBuilder { - builder: ListArrayBuilder, +pub struct BinaryBuilder { + builder: ListBuilder, } -impl ArrayBuilder for BinaryArrayBuilder { +impl ArrayBuilder for BinaryBuilder { /// Returns the builder as an non-mutable `Any` reference. fn as_any(&self) -> &Any { self @@ -476,30 +476,30 @@ impl ArrayBuilder for BinaryArrayBuilder { } } -impl BinaryArrayBuilder { - /// Creates a new `BinaryArrayBuilder`, `capacity` is the number of bytes in the values array +impl BinaryBuilder { + /// Creates a new `BinaryBuilder`, `capacity` is the number of bytes in the values array pub fn new(capacity: usize) -> Self { let values_builder = UInt8Builder::new(capacity); Self { - builder: ListArrayBuilder::new(values_builder), + builder: ListBuilder::new(values_builder), } } - /// Pushes a single byte value into the builder's values array. + /// Appends a single byte value into the builder's values array. /// - /// Note, when pushing individual byte values you must call `append` to delimit each + /// Note, when appending individual byte values you must call `append` to delimit each /// distinct list value. - pub fn push(&mut self, value: u8) -> Result<()> { - self.builder.values().push(value)?; + pub fn append_value(&mut self, value: u8) -> Result<()> { + self.builder.values().append_value(value)?; Ok(()) } - /// Pushes a `&String` or `&str` into the builder. + /// Appends a `&String` or `&str` into the builder. /// - /// Automatically calls the `append` method to delimit the string pushed in as a distinct - /// array element. - pub fn push_string(&mut self, value: &str) -> Result<()> { - self.builder.values().push_slice(value.as_bytes())?; + /// Automatically calls the `append` method to delimit the string appended in as a + /// distinct array element. + pub fn append_string(&mut self, value: &str) -> Result<()> { + self.builder.values().append_slice(value.as_bytes())?; self.builder.append(true)?; Ok(()) } @@ -524,7 +524,7 @@ impl BinaryArrayBuilder { /// /// Note that callers should make sure that methods of all the child field builders are /// properly called to maintain the consistency of the data structure. -pub struct StructArrayBuilder { +pub struct StructBuilder { fields: Vec, field_anys: Vec>, field_builders: Vec>, @@ -532,7 +532,7 @@ pub struct StructArrayBuilder { len: usize, } -impl ArrayBuilder for StructArrayBuilder { +impl ArrayBuilder for StructBuilder { /// Returns the number of array slots in the builder. 
/// /// Note that this always return the first child field builder's length, and it is @@ -571,7 +571,7 @@ impl ArrayBuilder for StructArrayBuilder { } } -impl StructArrayBuilder { +impl StructBuilder { pub fn new(fields: Vec, builders: Vec>) -> Self { let mut field_anys = Vec::with_capacity(builders.len()); let mut field_builders = Vec::with_capacity(builders.len()); @@ -619,7 +619,7 @@ impl StructArrayBuilder { DataType::UInt64 => Box::new(UInt64Builder::new(capacity)), DataType::Float32 => Box::new(Float32Builder::new(capacity)), DataType::Float64 => Box::new(Float64Builder::new(capacity)), - DataType::Utf8 => Box::new(BinaryArrayBuilder::new(capacity)), + DataType::Utf8 => Box::new(BinaryBuilder::new(capacity)), DataType::Struct(fields) => { let schema = Schema::new(fields.clone()); Box::new(Self::from_schema(schema, capacity)) @@ -643,7 +643,7 @@ impl StructArrayBuilder { /// Appends an element (either null or non-null) to the struct. The actual elements /// should be appended for each child sub-array in a consistent way. pub fn append(&mut self, is_valid: bool) -> Result<()> { - self.bitmap_builder.push(is_valid)?; + self.bitmap_builder.append(is_valid)?; self.len += 1; Ok(()) } @@ -675,7 +675,7 @@ impl StructArrayBuilder { } } -impl Drop for StructArrayBuilder { +impl Drop for StructBuilder { fn drop(&mut self) { // To avoid double drop on the field array builders. let builders = ::std::mem::replace(&mut self.field_builders, Vec::new()); @@ -702,7 +702,7 @@ mod tests { #[test] fn test_builder_i32_alloc_zero_bytes() { let mut b = Int32BufferBuilder::new(0); - b.push(123).unwrap(); + b.append(123).unwrap(); let a = b.finish(); assert_eq!(4, a.len()); } @@ -711,7 +711,7 @@ mod tests { fn test_builder_i32() { let mut b = Int32BufferBuilder::new(5); for i in 0..5 { - b.push(i).unwrap(); + b.append(i).unwrap(); } assert_eq!(16, b.capacity()); let a = b.finish(); @@ -723,7 +723,7 @@ mod tests { let mut b = Int32BufferBuilder::new(2); assert_eq!(16, b.capacity()); for i in 0..20 { - b.push(i).unwrap(); + b.append(i).unwrap(); } assert_eq!(32, b.capacity()); let a = b.finish(); @@ -735,7 +735,7 @@ mod tests { let mut b = Int32BufferBuilder::new(5); assert_eq!(16, b.capacity()); for i in 0..10 { - b.push(i).unwrap(); + b.append(i).unwrap(); } let mut a = b.finish(); assert_eq!(40, a.len()); @@ -744,7 +744,7 @@ mod tests { // Try build another buffer after cleaning up. 
for i in 0..20 { - b.push(i).unwrap() + b.append(i).unwrap() } assert_eq!(32, b.capacity()); a = b.finish(); @@ -769,15 +769,15 @@ mod tests { } #[test] - fn test_push_slice() { + fn test_append_slice() { let mut b = UInt8BufferBuilder::new(0); - b.push_slice("Hello, ".as_bytes()).unwrap(); - b.push_slice("World!".as_bytes()).unwrap(); + b.append_slice("Hello, ".as_bytes()).unwrap(); + b.append_slice("World!".as_bytes()).unwrap(); let buffer = b.finish(); assert_eq!(13, buffer.len()); let mut b = Int32BufferBuilder::new(0); - b.push_slice(&[32, 54]).unwrap(); + b.append_slice(&[32, 54]).unwrap(); let buffer = b.finish(); assert_eq!(8, buffer.len()); } @@ -785,17 +785,17 @@ mod tests { #[test] fn test_write_bytes() { let mut b = BooleanBufferBuilder::new(4); - b.push(false).unwrap(); - b.push(true).unwrap(); - b.push(false).unwrap(); - b.push(true).unwrap(); + b.append(false).unwrap(); + b.append(true).unwrap(); + b.append(false).unwrap(); + b.append(true).unwrap(); assert_eq!(4, b.len()); assert_eq!(512, b.capacity()); let buffer = b.finish(); assert_eq!(1, buffer.len()); let mut b = BooleanBufferBuilder::new(4); - b.push_slice(&[false, true, false, true]).unwrap(); + b.append_slice(&[false, true, false, true]).unwrap(); assert_eq!(4, b.len()); assert_eq!(512, b.capacity()); let buffer = b.finish(); @@ -829,9 +829,9 @@ mod tests { for i in 0..10 { if i == 3 || i == 6 || i == 9 { - builder.push(true).unwrap(); + builder.append(true).unwrap(); } else { - builder.push(false).unwrap(); + builder.append(false).unwrap(); } } let buf2 = builder.finish(); @@ -844,7 +844,7 @@ mod tests { fn test_primitive_array_builder_i32() { let mut builder = Int32Array::builder(5); for i in 0..5 { - builder.push(i).unwrap(); + builder.append_value(i).unwrap(); } let arr = builder.finish(); assert_eq!(5, arr.len()); @@ -864,9 +864,9 @@ mod tests { let mut builder = BooleanArray::builder(10); for i in 0..10 { if i == 3 || i == 6 || i == 9 { - builder.push(true).unwrap(); + builder.append_value(true).unwrap(); } else { - builder.push(false).unwrap(); + builder.append_value(false).unwrap(); } } @@ -883,15 +883,15 @@ mod tests { } #[test] - fn test_primitive_array_builder_push_option() { + fn test_primitive_array_builder_append_option() { let arr1 = Int32Array::from(vec![Some(0), None, Some(2), None, Some(4)]); let mut builder = Int32Array::builder(5); - builder.push_option(Some(0)).unwrap(); - builder.push_option(None).unwrap(); - builder.push_option(Some(2)).unwrap(); - builder.push_option(None).unwrap(); - builder.push_option(Some(4)).unwrap(); + builder.append_option(Some(0)).unwrap(); + builder.append_option(None).unwrap(); + builder.append_option(Some(2)).unwrap(); + builder.append_option(None).unwrap(); + builder.append_option(Some(4)).unwrap(); let arr2 = builder.finish(); assert_eq!(arr1.len(), arr2.len()); @@ -907,15 +907,15 @@ mod tests { } #[test] - fn test_primitive_array_builder_push_null() { + fn test_primitive_array_builder_append_null() { let arr1 = Int32Array::from(vec![Some(0), Some(2), None, None, Some(4)]); let mut builder = Int32Array::builder(5); - builder.push(0).unwrap(); - builder.push(2).unwrap(); - builder.push_null().unwrap(); - builder.push_null().unwrap(); - builder.push(4).unwrap(); + builder.append_value(0).unwrap(); + builder.append_value(2).unwrap(); + builder.append_null().unwrap(); + builder.append_null().unwrap(); + builder.append_value(4).unwrap(); let arr2 = builder.finish(); assert_eq!(arr1.len(), arr2.len()); @@ -931,14 +931,14 @@ mod tests { } #[test] - fn 
test_primitive_array_builder_push_slice() { + fn test_primitive_array_builder_append_slice() { let arr1 = Int32Array::from(vec![Some(0), Some(2), None, None, Some(4)]); let mut builder = Int32Array::builder(5); - builder.push_slice(&[0, 2]).unwrap(); - builder.push_null().unwrap(); - builder.push_null().unwrap(); - builder.push(4).unwrap(); + builder.append_slice(&[0, 2]).unwrap(); + builder.append_null().unwrap(); + builder.append_null().unwrap(); + builder.append_value(4).unwrap(); let arr2 = builder.finish(); assert_eq!(arr1.len(), arr2.len()); @@ -956,12 +956,12 @@ mod tests { #[test] fn test_primitive_array_builder_finish() { let mut builder = Int32Builder::new(5); - builder.push_slice(&[2, 4, 6, 8]).unwrap(); + builder.append_slice(&[2, 4, 6, 8]).unwrap(); let mut arr = builder.finish(); assert_eq!(4, arr.len()); assert_eq!(0, builder.len()); - builder.push_slice(&[1, 3, 5, 7, 9]).unwrap(); + builder.append_slice(&[1, 3, 5, 7, 9]).unwrap(); arr = builder.finish(); assert_eq!(5, arr.len()); assert_eq!(0, builder.len()); @@ -970,19 +970,19 @@ mod tests { #[test] fn test_list_array_builder() { let values_builder = Int32Builder::new(10); - let mut builder = ListArrayBuilder::new(values_builder); + let mut builder = ListBuilder::new(values_builder); // [[0, 1, 2], [3, 4, 5], [6, 7]] - builder.values().push(0).unwrap(); - builder.values().push(1).unwrap(); - builder.values().push(2).unwrap(); + builder.values().append_value(0).unwrap(); + builder.values().append_value(1).unwrap(); + builder.values().append_value(2).unwrap(); builder.append(true).unwrap(); - builder.values().push(3).unwrap(); - builder.values().push(4).unwrap(); - builder.values().push(5).unwrap(); + builder.values().append_value(3).unwrap(); + builder.values().append_value(4).unwrap(); + builder.values().append_value(5).unwrap(); builder.append(true).unwrap(); - builder.values().push(6).unwrap(); - builder.values().push(7).unwrap(); + builder.values().append_value(6).unwrap(); + builder.values().append_value(7).unwrap(); builder.append(true).unwrap(); let list_array = builder.finish(); @@ -1009,20 +1009,20 @@ mod tests { #[test] fn test_list_array_builder_nulls() { let values_builder = Int32Builder::new(10); - let mut builder = ListArrayBuilder::new(values_builder); + let mut builder = ListBuilder::new(values_builder); // [[0, 1, 2], null, [3, null, 5], [6, 7]] - builder.values().push(0).unwrap(); - builder.values().push(1).unwrap(); - builder.values().push(2).unwrap(); + builder.values().append_value(0).unwrap(); + builder.values().append_value(1).unwrap(); + builder.values().append_value(2).unwrap(); builder.append(true).unwrap(); builder.append(false).unwrap(); - builder.values().push(3).unwrap(); - builder.values().push_null().unwrap(); - builder.values().push(5).unwrap(); + builder.values().append_value(3).unwrap(); + builder.values().append_null().unwrap(); + builder.values().append_value(5).unwrap(); builder.append(true).unwrap(); - builder.values().push(6).unwrap(); - builder.values().push(7).unwrap(); + builder.values().append_value(6).unwrap(); + builder.values().append_value(7).unwrap(); builder.append(true).unwrap(); let list_array = builder.finish(); @@ -1036,18 +1036,18 @@ mod tests { #[test] fn test_list_array_builder_finish() { let values_builder = Int32Array::builder(5); - let mut builder = ListArrayBuilder::new(values_builder); + let mut builder = ListBuilder::new(values_builder); - builder.values().push_slice(&[1, 2, 3]).unwrap(); + builder.values().append_slice(&[1, 2, 3]).unwrap(); 
builder.append(true).unwrap(); - builder.values().push_slice(&[4, 5, 6]).unwrap(); + builder.values().append_slice(&[4, 5, 6]).unwrap(); builder.append(true).unwrap(); let mut arr = builder.finish(); assert_eq!(2, arr.len()); assert_eq!(0, builder.len()); - builder.values().push_slice(&[7, 8, 9]).unwrap(); + builder.values().append_slice(&[7, 8, 9]).unwrap(); builder.append(true).unwrap(); arr = builder.finish(); assert_eq!(1, arr.len()); @@ -1057,31 +1057,31 @@ mod tests { #[test] fn test_list_list_array_builder() { let primitive_builder = Int32Builder::new(10); - let values_builder = ListArrayBuilder::new(primitive_builder); - let mut builder = ListArrayBuilder::new(values_builder); + let values_builder = ListBuilder::new(primitive_builder); + let mut builder = ListBuilder::new(values_builder); // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] - builder.values().values().push(1).unwrap(); - builder.values().values().push(2).unwrap(); + builder.values().values().append_value(1).unwrap(); + builder.values().values().append_value(2).unwrap(); builder.values().append(true).unwrap(); - builder.values().values().push(3).unwrap(); - builder.values().values().push(4).unwrap(); + builder.values().values().append_value(3).unwrap(); + builder.values().values().append_value(4).unwrap(); builder.values().append(true).unwrap(); builder.append(true).unwrap(); - builder.values().values().push(5).unwrap(); - builder.values().values().push(6).unwrap(); - builder.values().values().push(7).unwrap(); + builder.values().values().append_value(5).unwrap(); + builder.values().values().append_value(6).unwrap(); + builder.values().values().append_value(7).unwrap(); builder.values().append(true).unwrap(); builder.values().append(false).unwrap(); - builder.values().values().push(8).unwrap(); + builder.values().values().append_value(8).unwrap(); builder.values().append(true).unwrap(); builder.append(true).unwrap(); builder.append(false).unwrap(); - builder.values().values().push(9).unwrap(); - builder.values().values().push(10).unwrap(); + builder.values().values().append_value(9).unwrap(); + builder.values().values().append_value(10).unwrap(); builder.values().append(true).unwrap(); builder.append(true).unwrap(); @@ -1111,20 +1111,20 @@ mod tests { #[test] fn test_binary_array_builder() { - let mut builder = BinaryArrayBuilder::new(20); + let mut builder = BinaryBuilder::new(20); - builder.push(b'h').unwrap(); - builder.push(b'e').unwrap(); - builder.push(b'l').unwrap(); - builder.push(b'l').unwrap(); - builder.push(b'o').unwrap(); + builder.append_value(b'h').unwrap(); + builder.append_value(b'e').unwrap(); + builder.append_value(b'l').unwrap(); + builder.append_value(b'l').unwrap(); + builder.append_value(b'o').unwrap(); builder.append(true).unwrap(); builder.append(true).unwrap(); - builder.push(b'w').unwrap(); - builder.push(b'o').unwrap(); - builder.push(b'r').unwrap(); - builder.push(b'l').unwrap(); - builder.push(b'd').unwrap(); + builder.append_value(b'w').unwrap(); + builder.append_value(b'o').unwrap(); + builder.append_value(b'r').unwrap(); + builder.append_value(b'l').unwrap(); + builder.append_value(b'd').unwrap(); builder.append(true).unwrap(); let array = builder.finish(); @@ -1145,29 +1145,29 @@ mod tests { #[test] fn test_binary_array_builder_finish() { - let mut builder = BinaryArrayBuilder::new(10); + let mut builder = BinaryBuilder::new(10); - builder.push_string("hello").unwrap(); - builder.push_string("world").unwrap(); + builder.append_string("hello").unwrap(); + 
builder.append_string("world").unwrap(); let mut arr = builder.finish(); assert_eq!(2, arr.len()); assert_eq!(0, builder.len()); - builder.push_string("arrow").unwrap(); + builder.append_string("arrow").unwrap(); arr = builder.finish(); assert_eq!(1, arr.len()); assert_eq!(0, builder.len()); } #[test] - fn test_binary_array_builder_push_string() { - let mut builder = BinaryArrayBuilder::new(20); + fn test_binary_array_builder_append_string() { + let mut builder = BinaryBuilder::new(20); let var = "hello".to_owned(); - builder.push_string(&var).unwrap(); + builder.append_string(&var).unwrap(); builder.append(true).unwrap(); - builder.push_string("world").unwrap(); + builder.append_string("world").unwrap(); let array = builder.finish(); @@ -1187,7 +1187,7 @@ mod tests { #[test] fn test_struct_array_builder() { - let string_builder = BinaryArrayBuilder::new(4); + let string_builder = BinaryBuilder::new(4); let int_builder = Int32Builder::new(4); let mut fields = Vec::new(); @@ -1197,24 +1197,24 @@ mod tests { fields.push(Field::new("f2", DataType::Int32, false)); field_builders.push(Box::new(int_builder) as Box); - let mut builder = StructArrayBuilder::new(fields, field_builders); + let mut builder = StructBuilder::new(fields, field_builders); assert_eq!(2, builder.num_fields()); let string_builder = builder - .field_builder::(0) + .field_builder::(0) .expect("builder at field 0 should be binary builder"); - string_builder.push_string("joe").unwrap(); + string_builder.append_string("joe").unwrap(); string_builder.append_null().unwrap(); string_builder.append_null().unwrap(); - string_builder.push_string("mark").unwrap(); + string_builder.append_string("mark").unwrap(); let int_builder = builder .field_builder::(1) .expect("builder at field 1 should be int builder"); - int_builder.push(1).unwrap(); - int_builder.push(2).unwrap(); - int_builder.push_null().unwrap(); - int_builder.push(4).unwrap(); + int_builder.append_value(1).unwrap(); + int_builder.append_value(2).unwrap(); + int_builder.append_null().unwrap(); + int_builder.append_value(4).unwrap(); builder.append(true).unwrap(); builder.append(true).unwrap(); @@ -1282,16 +1282,16 @@ mod tests { fields.push(Field::new("f2", DataType::Boolean, false)); field_builders.push(Box::new(bool_builder) as Box); - let mut builder = StructArrayBuilder::new(fields, field_builders); + let mut builder = StructBuilder::new(fields, field_builders); builder .field_builder::(0) .unwrap() - .push_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + .append_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) .unwrap(); builder .field_builder::(1) .unwrap() - .push_slice(&[ + .append_slice(&[ false, true, false, true, false, true, false, true, false, true, ]) .unwrap(); @@ -1303,12 +1303,12 @@ mod tests { builder .field_builder::(0) .unwrap() - .push_slice(&[1, 3, 5, 7, 9]) + .append_slice(&[1, 3, 5, 7, 9]) .unwrap(); builder .field_builder::(1) .unwrap() - .push_slice(&[false, true, false, true, false]) + .append_slice(&[false, true, false, true, false]) .unwrap(); let arr = builder.finish(); @@ -1327,11 +1327,11 @@ mod tests { let struct_type = DataType::Struct(sub_fields); fields.push(Field::new("f3", struct_type, false)); - let mut builder = StructArrayBuilder::from_schema(Schema::new(fields), 5); + let mut builder = StructBuilder::from_schema(Schema::new(fields), 5); assert_eq!(3, builder.num_fields()); assert!(builder.field_builder::(0).is_some()); - assert!(builder.field_builder::(1).is_some()); - assert!(builder.field_builder::(2).is_some()); + 
assert!(builder.field_builder::(1).is_some()); + assert!(builder.field_builder::(2).is_some()); } #[test] @@ -1342,7 +1342,7 @@ mod tests { let list_type = DataType::List(Box::new(DataType::Int64)); fields.push(Field::new("f2", list_type, false)); - let _ = StructArrayBuilder::from_schema(Schema::new(fields), 5); + let _ = StructBuilder::from_schema(Schema::new(fields), 5); } #[test] @@ -1354,8 +1354,8 @@ mod tests { fields.push(Field::new("f1", DataType::Int32, false)); field_builders.push(Box::new(int_builder) as Box); - let mut builder = StructArrayBuilder::new(fields, field_builders); - assert!(builder.field_builder::(0).is_none()); + let mut builder = StructBuilder::new(fields, field_builders); + assert!(builder.field_builder::(0).is_none()); } } diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs index 49e0302aa0672..718e8d526c46b 100644 --- a/rust/arrow/src/csv/reader.rs +++ b/rust/arrow/src/csv/reader.rs @@ -266,10 +266,10 @@ impl Reader { &DataType::Float32 => self.build_primitive_array::(rows, i), &DataType::Float64 => self.build_primitive_array::(rows, i), &DataType::Utf8 => { - let mut builder = BinaryArrayBuilder::new(rows.len()); + let mut builder = BinaryBuilder::new(rows.len()); for row_index in 0..rows.len() { match rows[row_index].get(*i) { - Some(s) => builder.push_string(s).unwrap(), + Some(s) => builder.append_string(s).unwrap(), _ => builder.append(false).unwrap(), } } @@ -294,7 +294,7 @@ impl Reader { rows: &[StringRecord], col_idx: &usize, ) -> Result { - let mut builder = PrimitiveArrayBuilder::::new(rows.len()); + let mut builder = PrimitiveBuilder::::new(rows.len()); let is_boolean_type = *self.schema.field(*col_idx).data_type() == DataType::Boolean; for row_index in 0..rows.len() { match rows[row_index].get(*col_idx) { @@ -305,7 +305,7 @@ impl Reader { s.parse::() }; match t { - Ok(v) => builder.push(v)?, + Ok(v) => builder.append_value(v)?, Err(_) => { // TODO: we should surface the underlying error here. 
return Err(ArrowError::ParseError(format!( @@ -315,7 +315,7 @@ impl Reader { } } } - _ => builder.push_null()?, + _ => builder.append_null()?, } } Ok(Arc::new(builder.finish())) diff --git a/rust/arrow/src/tensor.rs b/rust/arrow/src/tensor.rs index 7272a2cf14631..1703c83738570 100644 --- a/rust/arrow/src/tensor.rs +++ b/rust/arrow/src/tensor.rs @@ -279,7 +279,7 @@ mod tests { fn test_tensor() { let mut builder = Int32BufferBuilder::new(16); for i in 0..16 { - builder.push(i).unwrap(); + builder.append(i).unwrap(); } let buf = builder.finish(); let tensor = Int32Tensor::new(buf, Some(vec![2, 8]), None, None); @@ -294,7 +294,7 @@ mod tests { fn test_new_row_major() { let mut builder = Int32BufferBuilder::new(16); for i in 0..16 { - builder.push(i).unwrap(); + builder.append(i).unwrap(); } let buf = builder.finish(); let tensor = Int32Tensor::new_row_major(buf, Some(vec![2, 8]), None); @@ -312,7 +312,7 @@ mod tests { fn test_new_column_major() { let mut builder = Int32BufferBuilder::new(16); for i in 0..16 { - builder.push(i).unwrap(); + builder.append(i).unwrap(); } let buf = builder.finish(); let tensor = Int32Tensor::new_column_major(buf, Some(vec![2, 8]), None); @@ -330,7 +330,7 @@ mod tests { fn test_with_names() { let mut builder = Int64BufferBuilder::new(8); for i in 0..8 { - builder.push(i).unwrap(); + builder.append(i).unwrap(); } let buf = builder.finish(); let names = vec!["Dim 1", "Dim 2"]; @@ -351,7 +351,7 @@ mod tests { fn test_inconsistent_strides() { let mut builder = Int32BufferBuilder::new(16); for i in 0..16 { - builder.push(i).unwrap(); + builder.append(i).unwrap(); } let buf = builder.finish(); Int32Tensor::new(buf, Some(vec![2, 8]), Some(vec![2, 8, 1]), None); @@ -362,7 +362,7 @@ mod tests { fn test_inconsistent_names() { let mut builder = Int32BufferBuilder::new(16); for i in 0..16 { - builder.push(i).unwrap(); + builder.append(i).unwrap(); } let buf = builder.finish(); Int32Tensor::new( From a3aed3b60bd61c55d7402c4484e480f1998b99f1 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Wed, 9 Jan 2019 09:17:46 +0900 Subject: [PATCH 179/328] ARROW-4184: [Ruby] Add Arrow::RecordBatch#to_table Author: Kouhei Sutou Closes #3339 from kou/ruby-record-batch-to-table and squashes the following commits: a6fab35f Require gobject-introspection gem 3.3.1 or later 4a1f3564 Add Arrow::RecordBatch#to_table --- ruby/red-arrow/lib/arrow/record-batch.rb | 9 +++++++++ ruby/red-arrow/red-arrow.gemspec | 2 +- ruby/red-arrow/test/test-record-batch.rb | 23 ++++++++++++++--------- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/ruby/red-arrow/lib/arrow/record-batch.rb b/ruby/red-arrow/lib/arrow/record-batch.rb index f5f8ea2e77721..6d9c35b9dc849 100644 --- a/ruby/red-arrow/lib/arrow/record-batch.rb +++ b/ruby/red-arrow/lib/arrow/record-batch.rb @@ -29,6 +29,15 @@ def columns @columns ||= columns_raw end + # Converts the record batch to {Arrow::Table}. 
+ # + # @return [Arrow::Table] + # + # @since 0.12.0 + def to_table + Table.new(schema, [self]) + end + def respond_to_missing?(name, include_private) return true if find_column(name) super diff --git a/ruby/red-arrow/red-arrow.gemspec b/ruby/red-arrow/red-arrow.gemspec index 8e79c75dcaff2..2d417f08b0087 100644 --- a/ruby/red-arrow/red-arrow.gemspec +++ b/ruby/red-arrow/red-arrow.gemspec @@ -45,7 +45,7 @@ Gem::Specification.new do |spec| spec.test_files += Dir.glob("test/**/*") spec.extensions = ["dependency-check/Rakefile"] - spec.add_runtime_dependency("gobject-introspection", ">= 3.1.1") + spec.add_runtime_dependency("gobject-introspection", ">= 3.3.1") spec.add_runtime_dependency("pkg-config") spec.add_runtime_dependency("native-package-installer") diff --git a/ruby/red-arrow/test/test-record-batch.rb b/ruby/red-arrow/test/test-record-batch.rb index 994b16de99813..4dac085bff86e 100644 --- a/ruby/red-arrow/test/test-record-batch.rb +++ b/ruby/red-arrow/test/test-record-batch.rb @@ -16,16 +16,16 @@ # under the License. class RecordBatchTest < Test::Unit::TestCase - sub_test_case(".each") do - setup do - fields = [ - Arrow::Field.new("count", :uint32), - ] - @schema = Arrow::Schema.new(fields) - @counts = Arrow::UInt32Array.new([1, 2, 4, 8]) - @record_batch = Arrow::RecordBatch.new(@schema, @counts.length, [@counts]) - end + setup do + fields = [ + Arrow::Field.new("count", :uint32), + ] + @schema = Arrow::Schema.new(fields) + @counts = Arrow::UInt32Array.new([1, 2, 4, 8]) + @record_batch = Arrow::RecordBatch.new(@schema, @counts.length, [@counts]) + end + sub_test_case(".each") do test("default") do records = [] @record_batch.each do |record| @@ -54,4 +54,9 @@ class RecordBatchTest < Test::Unit::TestCase records.collect {|record, i| [record.index, i]}) end end + + test("#to_table") do + assert_equal(Arrow::Table.new(@schema, [@counts]), + @record_batch.to_table) + end end From 420c949fd4e593fb0303954092b3d8a46a7aa864 Mon Sep 17 00:00:00 2001 From: Yosuke Shiro Date: Wed, 9 Jan 2019 09:28:03 +0900 Subject: [PATCH 180/328] ARROW-4175: [GLib] Add support for decimal compare operators Author: Yosuke Shiro Author: Kouhei Sutou Closes #3346 from shiro615/glib-add-support-for-decimal-compare-operators and squashes the following commits: 28871fd6 Fix documents e81d4146 Unify test case comparisons 0791c4f1 Use rubyish method name 54f46039 Add a test for equal 943c2364 Rename 'more than' to 'greater than' 181e0544 Add support for decimal compare operators --- c_glib/arrow-glib/decimal128.cpp | 98 +++++++++++++++++++++++++++++++- c_glib/arrow-glib/decimal128.h | 15 +++++ c_glib/test/test-decimal128.rb | 97 +++++++++++++++++++++++++++++++ 3 files changed, 209 insertions(+), 1 deletion(-) diff --git a/c_glib/arrow-glib/decimal128.cpp b/c_glib/arrow-glib/decimal128.cpp index d87a5019c1203..a49dba580ee79 100644 --- a/c_glib/arrow-glib/decimal128.cpp +++ b/c_glib/arrow-glib/decimal128.cpp @@ -141,7 +141,8 @@ garrow_decimal128_new_integer(const gint64 data) * @decimal: A #GArrowDecimal128. * @other_decimal: A #GArrowDecimal128 to be compared. * - * Returns: %TRUE if both of them is the same value, %FALSE otherwise. + * Returns: %TRUE if the decimal is equal to the other decimal, %FALSE + * otherwise. * * Since: 0.12.0 */ @@ -154,6 +155,101 @@ garrow_decimal128_equal(GArrowDecimal128 *decimal, return *arrow_decimal == *arrow_other_decimal; } +/** + * garrow_decimal128_not_equal: + * @decimal: A #GArrowDecimal128. + * @other_decimal: A #GArrowDecimal128 to be compared. 
+ * + * Returns: %TRUE if the decimal isn't equal to the other decimal, + * %FALSE otherwise. + * + * Since: 0.12.0 + */ +gboolean +garrow_decimal128_not_equal(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal) +{ + const auto arrow_decimal = garrow_decimal128_get_raw(decimal); + const auto arrow_other_decimal = garrow_decimal128_get_raw(other_decimal); + return *arrow_decimal != *arrow_other_decimal; +} + +/** + * garrow_decimal128_less_than: + * @decimal: A #GArrowDecimal128. + * @other_decimal: A #GArrowDecimal128 to be compared. + * + * Returns: %TRUE if the decimal is less than the other decimal, + * %FALSE otherwise. + * + * Since: 0.12.0 + */ +gboolean +garrow_decimal128_less_than(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal) +{ + const auto arrow_decimal = garrow_decimal128_get_raw(decimal); + const auto arrow_other_decimal = garrow_decimal128_get_raw(other_decimal); + return *arrow_decimal < *arrow_other_decimal; +} + +/** + * garrow_decimal128_less_than_or_equal: + * @decimal: A #GArrowDecimal128. + * @other_decimal: A #GArrowDecimal128 to be compared. + * + * Returns: %TRUE if the decimal is less than the other decimal + * or equal to the other decimal, %FALSE otherwise. + * + * Since: 0.12.0 + */ +gboolean +garrow_decimal128_less_than_or_equal(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal) +{ + const auto arrow_decimal = garrow_decimal128_get_raw(decimal); + const auto arrow_other_decimal = garrow_decimal128_get_raw(other_decimal); + return *arrow_decimal <= *arrow_other_decimal; +} + +/** + * garrow_decimal128_greater_than: + * @decimal: A #GArrowDecimal128. + * @other_decimal: A #GArrowDecimal128 to be compared. + * + * Returns: %TRUE if the decimal is greater than the other decimal, + * %FALSE otherwise. + * + * Since: 0.12.0 + */ +gboolean +garrow_decimal128_greater_than(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal) +{ + const auto arrow_decimal = garrow_decimal128_get_raw(decimal); + const auto arrow_other_decimal = garrow_decimal128_get_raw(other_decimal); + return *arrow_decimal > *arrow_other_decimal; +} + +/** + * garrow_decimal128_greater_than_or_equal: + * @decimal: A #GArrowDecimal128. + * @other_decimal: A #GArrowDecimal128 to be compared. + * + * Returns: %TRUE if the decimal is greater than the other decimal + * or equal to the other decimal, %FALSE otherwise. + * + * Since: 0.12.0 + */ +gboolean +garrow_decimal128_greater_than_or_equal(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal) +{ + const auto arrow_decimal = garrow_decimal128_get_raw(decimal); + const auto arrow_other_decimal = garrow_decimal128_get_raw(other_decimal); + return *arrow_decimal >= *arrow_other_decimal; +} + /** * garrow_decimal128_to_string_scale: * @decimal: A #GArrowDecimal128. 
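Editor's note: the following C snippet is an illustrative sketch of how the comparison
functions added above might be called; it is not part of the patch and assumes the usual
arrow-glib object lifecycle (objects released with g_object_unref()).

    GArrowDecimal128 *ten = garrow_decimal128_new_integer(10);
    GArrowDecimal128 *eleven = garrow_decimal128_new_integer(11);
    gboolean lt = garrow_decimal128_less_than(ten, eleven);             /* TRUE */
    gboolean ge = garrow_decimal128_greater_than_or_equal(eleven, ten); /* TRUE */
    gboolean ne = garrow_decimal128_not_equal(ten, eleven);             /* TRUE */
    g_object_unref(eleven);
    g_object_unref(ten);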
diff --git a/c_glib/arrow-glib/decimal128.h b/c_glib/arrow-glib/decimal128.h index e8fa59980cd94..e7601a457601b 100644 --- a/c_glib/arrow-glib/decimal128.h +++ b/c_glib/arrow-glib/decimal128.h @@ -41,6 +41,21 @@ GArrowDecimal128 *garrow_decimal128_new_integer(const gint64 data); GARROW_AVAILABLE_IN_0_12 gboolean garrow_decimal128_equal(GArrowDecimal128 *decimal, GArrowDecimal128 *other_decimal); +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_decimal128_not_equal(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal); +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_decimal128_less_than(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal); +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_decimal128_less_than_or_equal(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal); +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_decimal128_greater_than(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal); +GARROW_AVAILABLE_IN_0_12 +gboolean garrow_decimal128_greater_than_or_equal(GArrowDecimal128 *decimal, + GArrowDecimal128 *other_decimal); gchar *garrow_decimal128_to_string_scale(GArrowDecimal128 *decimal, gint32 scale); gchar *garrow_decimal128_to_string(GArrowDecimal128 *decimal); diff --git a/c_glib/test/test-decimal128.rb b/c_glib/test/test-decimal128.rb index 99f1912babfae..de9453cbe69cd 100644 --- a/c_glib/test/test-decimal128.rb +++ b/c_glib/test/test-decimal128.rb @@ -106,4 +106,101 @@ def test_divide_zero decimal1.divide(decimal2) end end + + def test_equal + decimal = Arrow::Decimal128.new(10) + other_decimal1 = Arrow::Decimal128.new(10) + other_decimal2 = Arrow::Decimal128.new(11) + assert_equal([ + true, + false, + ], + [ + decimal == other_decimal1, + decimal == other_decimal2, + ]) + end + + def test_not_equal + require_gi_bindings(3, 3, 1) + decimal = Arrow::Decimal128.new(10) + other_decimal1 = Arrow::Decimal128.new(10) + other_decimal2 = Arrow::Decimal128.new(11) + assert_equal([ + false, + true, + ], + [ + decimal != other_decimal1, + decimal != other_decimal2, + ]) + end + + def test_less_than + require_gi_bindings(3, 3, 1) + decimal = Arrow::Decimal128.new(10) + other_decimal1 = Arrow::Decimal128.new(11) + other_decimal2 = Arrow::Decimal128.new(9) + assert_equal([ + true, + false, + false + ], + [ + decimal < other_decimal1, + decimal < other_decimal2, + decimal < decimal, + ]) + end + + def test_less_than_or_equal + require_gi_bindings(3, 3, 1) + decimal = Arrow::Decimal128.new(10) + other_decimal1 = Arrow::Decimal128.new(11) + other_decimal2 = Arrow::Decimal128.new(9) + assert_equal([ + true, + false, + true + ], + [ + decimal <= other_decimal1, + decimal <= other_decimal2, + decimal <= decimal + ]) + end + + def test_greater_than + require_gi_bindings(3, 3, 1) + decimal = Arrow::Decimal128.new(10) + other_decimal1 = Arrow::Decimal128.new(11) + other_decimal2 = Arrow::Decimal128.new(9) + assert_equal([ + false, + true, + false + ], + [ + decimal > other_decimal1, + decimal > other_decimal2, + decimal > decimal + ]) + end + + def test_greater_than_or_equal + require_gi_bindings(3, 3, 1) + decimal = Arrow::Decimal128.new(10) + other_decimal1 = Arrow::Decimal128.new(11) + other_decimal2 = Arrow::Decimal128.new(9) + assert_equal([ + false, + true, + true + ], + [ + decimal >= other_decimal1, + decimal >= other_decimal2, + decimal >= decimal + ]) + end end From bfe6865ba8087a46bd7665679e48af3a77987cef Mon Sep 17 00:00:00 2001 From: Pindikura Ravindra Date: Wed, 9 Jan 2019 09:11:01 +0530 Subject: [PATCH 181/328] ARROW-4147: [Java] reduce heap usage for varwidth vectors 
(#3298) * ARROW-4147: reduce heap usage for varwidth vectors - some code reorg to avoid duplication - changed the default initial alloc from 4096 to 3970 * ARROW-4147: [Java] Address review comments * ARROW-4147: remove check on width to be <= 16: * ARROW-4147: allow initial valueCount to be 0. * ARROW-4147: Fix incorrect comment on initial alloc --- .../arrow/vector/BaseFixedWidthVector.java | 127 +--- .../apache/arrow/vector/BaseValueVector.java | 99 ++- .../arrow/vector/BaseVariableWidthVector.java | 165 +++-- .../org/apache/arrow/vector/BitVector.java | 5 +- .../vector/TestBufferOwnershipTransfer.java | 9 +- .../org/apache/arrow/vector/TestCopyFrom.java | 569 ++++++++++-------- .../apache/arrow/vector/TestValueVector.java | 435 +++++++------ .../arrow/vector/TestVectorReAlloc.java | 23 +- .../complex/writer/TestComplexWriter.java | 15 +- 9 files changed, 799 insertions(+), 648 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java index f69a9d1754ac7..f3c2837cfa7e8 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java @@ -22,7 +22,6 @@ import java.util.Collections; import java.util.List; -import org.apache.arrow.memory.BaseAllocator; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.OutOfMemoryException; import org.apache.arrow.vector.ipc.message.ArrowFieldNode; @@ -43,8 +42,7 @@ public abstract class BaseFixedWidthVector extends BaseValueVector implements FixedWidthVector, FieldVector, VectorDefinitionSetter { private final int typeWidth; - protected int valueAllocationSizeInBytes; - protected int validityAllocationSizeInBytes; + protected int initialValueAllocation; protected final Field field; private int allocationMonitor; @@ -61,14 +59,7 @@ public BaseFixedWidthVector(final String name, final BufferAllocator allocator, allocationMonitor = 0; validityBuffer = allocator.getEmpty(); valueBuffer = allocator.getEmpty(); - if (typeWidth > 0) { - valueAllocationSizeInBytes = INITIAL_VALUE_ALLOCATION * typeWidth; - validityAllocationSizeInBytes = getValidityBufferSizeFromCount(INITIAL_VALUE_ALLOCATION); - } else { - /* specialized handling for BitVector */ - valueAllocationSizeInBytes = getValidityBufferSizeFromCount(INITIAL_VALUE_ALLOCATION); - validityAllocationSizeInBytes = valueAllocationSizeInBytes; - } + initialValueAllocation = INITIAL_VALUE_ALLOCATION; } @@ -159,12 +150,8 @@ public ArrowBuf getOffsetBuffer() { */ @Override public void setInitialCapacity(int valueCount) { - final long size = (long) valueCount * typeWidth; - if (size > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); - } - valueAllocationSizeInBytes = (int) size; - validityAllocationSizeInBytes = getValidityBufferSizeFromCount(valueCount); + computeAndCheckBufferSize(valueCount); + initialValueAllocation = valueCount; } /** @@ -267,18 +254,13 @@ public void allocateNew() { */ @Override public boolean allocateNewSafe() { - long curAllocationSizeValue = valueAllocationSizeInBytes; - long curAllocationSizeValidity = validityAllocationSizeInBytes; - - if (align(curAllocationSizeValue) + curAllocationSizeValidity > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Requested amount of memory exceeds limit"); - } + computeAndCheckBufferSize(initialValueAllocation); /* we 
are doing a new allocation -- release the current buffers */ clear(); try { - allocateBytes(curAllocationSizeValue, curAllocationSizeValidity); + allocateBytes(initialValueAllocation); } catch (Exception e) { clear(); return false; @@ -295,22 +277,13 @@ public boolean allocateNewSafe() { * @throws org.apache.arrow.memory.OutOfMemoryException on error */ public void allocateNew(int valueCount) { - long valueBufferSize = valueCount * typeWidth; - long validityBufferSize = getValidityBufferSizeFromCount(valueCount); - if (typeWidth == 0) { - /* specialized handling for BitVector */ - valueBufferSize = validityBufferSize; - } - - if (align(valueBufferSize) + validityBufferSize > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); - } + computeAndCheckBufferSize(valueCount); /* we are doing a new allocation -- release the current buffers */ clear(); try { - allocateBytes(valueBufferSize, validityBufferSize); + allocateBytes(valueCount); } catch (Exception e) { clear(); throw e; @@ -318,10 +291,16 @@ public void allocateNew(int valueCount) { } /* - * align to a 8-byte value. + * Compute the buffer size required for 'valueCount', and check if it's within bounds. */ - private long align(long size) { - return ((size + 7) / 8) * 8; + private long computeAndCheckBufferSize(int valueCount) { + final long size = computeCombinedBufferSize(valueCount, typeWidth); + if (size > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Memory required for vector capacity " + + valueCount + + " is (" + size + "), which is more than max allowed (" + MAX_ALLOCATION_SIZE + ")"); + } + return size; } /** @@ -333,25 +312,11 @@ private long align(long size) { * within the bounds of max allocation allowed and any other error * conditions. 
*/ - private void allocateBytes(final long valueBufferSize, final long validityBufferSize) { - int valueBufferSlice = (int)align(valueBufferSize); - int validityBufferSlice = (int)validityBufferSize; - - /* allocate combined buffer */ - ArrowBuf buffer = allocator.buffer(valueBufferSlice + validityBufferSlice); - - valueAllocationSizeInBytes = valueBufferSlice; - valueBuffer = buffer.slice(0, valueBufferSlice); - valueBuffer.retain(); - valueBuffer.readerIndex(0); - - validityAllocationSizeInBytes = validityBufferSlice; - validityBuffer = buffer.slice(valueBufferSlice, validityBufferSlice); - validityBuffer.retain(); - validityBuffer.readerIndex(0); + private void allocateBytes(int valueCount) { + DataAndValidityBuffers buffers = allocFixedDataAndValidityBufs(valueCount, typeWidth); + valueBuffer = buffers.getDataBuf(); + validityBuffer = buffers.getValidityBuf(); zeroVector(); - - buffer.release(); } /** @@ -363,7 +328,6 @@ private void allocateBytes(final long valueBufferSize, final long validityBuffer private void allocateValidityBuffer(final int validityBufferSize) { validityBuffer = allocator.buffer(validityBufferSize); validityBuffer.readerIndex(0); - validityAllocationSizeInBytes = validityBufferSize; } /** @@ -439,50 +403,28 @@ public ArrowBuf[] getBuffers(boolean clear) { */ @Override public void reAlloc() { - int valueBaseSize = Integer.max(valueBuffer.capacity(), valueAllocationSizeInBytes); - long newValueBufferSlice = align(valueBaseSize * 2L); - long newValidityBufferSlice; - if (typeWidth > 0) { - long targetValueBufferSize = align(BaseAllocator.nextPowerOfTwo(newValueBufferSlice)); - long targetValueCount = targetValueBufferSize / typeWidth; - targetValueBufferSize -= getValidityBufferSizeFromCount((int) targetValueCount); - if (newValueBufferSlice < targetValueBufferSize) { - newValueBufferSlice = targetValueBufferSize; + int targetValueCount = getValueCapacity() * 2; + if (targetValueCount == 0) { + if (initialValueAllocation > 0) { + targetValueCount = initialValueAllocation * 2; + } else { + targetValueCount = INITIAL_VALUE_ALLOCATION * 2; } - - newValidityBufferSlice = getValidityBufferSizeFromCount((int)(newValueBufferSlice / typeWidth)); - } else { - newValidityBufferSlice = newValueBufferSlice; - } - - long newAllocationSize = newValueBufferSlice + newValidityBufferSlice; - assert newAllocationSize >= 1; - - if (newAllocationSize > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Unable to expand the buffer"); } + computeAndCheckBufferSize(targetValueCount); - final ArrowBuf newBuffer = allocator.buffer((int) newAllocationSize); - final ArrowBuf newValueBuffer = newBuffer.slice(0, (int)newValueBufferSlice); + DataAndValidityBuffers buffers = allocFixedDataAndValidityBufs(targetValueCount, typeWidth); + final ArrowBuf newValueBuffer = buffers.getDataBuf(); newValueBuffer.setBytes(0, valueBuffer, 0, valueBuffer.capacity()); - newValueBuffer.setZero(valueBuffer.capacity(), (int)newValueBufferSlice - valueBuffer.capacity()); - newValueBuffer.retain(); - newValueBuffer.readerIndex(0); + newValueBuffer.setZero(valueBuffer.capacity(), newValueBuffer.capacity() - valueBuffer.capacity()); valueBuffer.release(); valueBuffer = newValueBuffer; - valueAllocationSizeInBytes = (int)newValueBufferSlice; - final ArrowBuf newValidityBuffer = newBuffer.slice((int)newValueBufferSlice, - (int)newValidityBufferSlice); + final ArrowBuf newValidityBuffer = buffers.getValidityBuf(); newValidityBuffer.setBytes(0, validityBuffer, 0, validityBuffer.capacity()); - 
newValidityBuffer.setZero(validityBuffer.capacity(), (int)newValidityBufferSlice - validityBuffer.capacity()); - newValidityBuffer.retain(); - newValidityBuffer.readerIndex(0); + newValidityBuffer.setZero(validityBuffer.capacity(), newValidityBuffer.capacity() - validityBuffer.capacity()); validityBuffer.release(); validityBuffer = newValidityBuffer; - validityAllocationSizeInBytes = (int)newValidityBufferSlice; - - newBuffer.release(); } @Override @@ -535,9 +477,6 @@ public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers valueBuffer = dataBuffer.retain(allocator); valueCount = fieldNode.getLength(); - - valueAllocationSizeInBytes = valueBuffer.capacity(); - validityAllocationSizeInBytes = validityBuffer.capacity(); } /** diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java index 4cbf4be19dfeb..4e014bbd2aefe 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java @@ -20,6 +20,7 @@ import java.util.Collections; import java.util.Iterator; +import org.apache.arrow.memory.BaseAllocator; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.util.TransferPair; @@ -33,7 +34,14 @@ public abstract class BaseValueVector implements ValueVector { public static final String MAX_ALLOCATION_SIZE_PROPERTY = "arrow.vector.max_allocation_bytes"; public static final int MAX_ALLOCATION_SIZE = Integer.getInteger(MAX_ALLOCATION_SIZE_PROPERTY, Integer.MAX_VALUE); - public static final int INITIAL_VALUE_ALLOCATION = 4096; + /* + * For all fixed width vectors, the value and validity buffers are sliced from a single buffer. + * Similarly, for variable width vectors, the offsets and validity buffers are sliced from a + * single buffer. To ensure the single buffer is power-of-2 size, the initial value allocation + * should be less than power-of-2. For IntVectors, this comes to 3970*4 (15880) for the data + * buffer and 504 bytes for the validity buffer, totalling to 16384 (2^16). + */ + public static final int INITIAL_VALUE_ALLOCATION = 3970; protected final BufferAllocator allocator; protected final String name; @@ -98,5 +106,94 @@ protected ArrowBuf releaseBuffer(ArrowBuf buffer) { protected static int getValidityBufferSizeFromCount(final int valueCount) { return (int) Math.ceil(valueCount / 8.0); } + + /* round up to the next multiple of 8 */ + private static long roundUp8(long size) { + return ((size + 7) / 8) * 8; + } + + protected long computeCombinedBufferSize(int valueCount, int typeWidth) { + Preconditions.checkArgument(valueCount >= 0, "valueCount must be >= 0"); + Preconditions.checkArgument(typeWidth >= 0, "typeWidth must be >= 0"); + + // compute size of validity buffer. + long bufferSize = roundUp8(getValidityBufferSizeFromCount(valueCount)); + + // add the size of the value buffer. + if (typeWidth == 0) { + // for boolean type, value-buffer and validity-buffer are of same size. 
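// [Editor's illustration -- not part of this patch] Worked example of this method's sizing
// for the new default INITIAL_VALUE_ALLOCATION of 3970 values with typeWidth = 4 (IntVector):
//   validity buffer: roundUp8(ceil(3970 / 8.0)) = roundUp8(497) = 504 bytes
//   data buffer:     roundUp8(3970 * 4)         = 15880 bytes
//   combined:        504 + 15880 = 16384 bytes = 2^14, already a power of two,
//   so nextPowerOfTwo() returns it unchanged and the two slices fill the allocation exactly.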
+ bufferSize *= 2; + } else { + bufferSize += roundUp8(valueCount * typeWidth); + } + return BaseAllocator.nextPowerOfTwo(bufferSize); + } + + class DataAndValidityBuffers { + private ArrowBuf dataBuf; + private ArrowBuf validityBuf; + + DataAndValidityBuffers(ArrowBuf dataBuf, ArrowBuf validityBuf) { + this.dataBuf = dataBuf; + this.validityBuf = validityBuf; + } + + public ArrowBuf getDataBuf() { + return dataBuf; + } + + public ArrowBuf getValidityBuf() { + return validityBuf; + } + + } + + protected DataAndValidityBuffers allocFixedDataAndValidityBufs(int valueCount, int typeWidth) { + long bufferSize = computeCombinedBufferSize(valueCount, typeWidth); + assert bufferSize < MAX_ALLOCATION_SIZE; + + int validityBufferSize; + int dataBufferSize; + if (typeWidth == 0) { + validityBufferSize = dataBufferSize = (int) (bufferSize / 2); + } else { + // Due to roundup to power-of-2 allocation, the bufferSize could be greater than the + // requested size. Utilize the allocated buffer fully.; + int actualCount = (int) ((bufferSize * 8.0) / (8 * typeWidth + 1)); + do { + validityBufferSize = (int) roundUp8(getValidityBufferSizeFromCount(actualCount)); + dataBufferSize = (int) roundUp8(actualCount * typeWidth); + if (validityBufferSize + dataBufferSize <= bufferSize) { + break; + } + --actualCount; + } while (true); + } + + + /* allocate combined buffer */ + ArrowBuf combinedBuffer = allocator.buffer((int) bufferSize); + + /* slice into requested lengths */ + ArrowBuf dataBuf = null; + ArrowBuf validityBuf = null; + int bufferOffset = 0; + for (int numBuffers = 0; numBuffers < 2; ++numBuffers) { + int len = (numBuffers == 0 ? dataBufferSize : validityBufferSize); + ArrowBuf buf = combinedBuffer.slice(bufferOffset, len); + buf.retain(); + buf.readerIndex(0); + buf.writerIndex(0); + + bufferOffset += len; + if (numBuffers == 0) { + dataBuf = buf; + } else { + validityBuf = buf; + } + } + combinedBuffer.release(); + return new DataAndValidityBuffers(dataBuf, validityBuf); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index 390dfe955b6ce..ac148a25c7c29 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -38,10 +38,8 @@ public abstract class BaseVariableWidthVector extends BaseValueVector implements VariableWidthVector, FieldVector, VectorDefinitionSetter { private static final int DEFAULT_RECORD_BYTE_COUNT = 8; private static final int INITIAL_BYTE_COUNT = INITIAL_VALUE_ALLOCATION * DEFAULT_RECORD_BYTE_COUNT; - - private int valueAllocationSizeInBytes; - private int validityAllocationSizeInBytes; - private int offsetAllocationSizeInBytes; + private int initialValueAllocation; + private int initialValueAllocationSizeInBytes; /* protected members */ public static final int OFFSET_WIDTH = 4; /* 4 byte unsigned int to track offsets */ @@ -57,9 +55,9 @@ public abstract class BaseVariableWidthVector extends BaseValueVector public BaseVariableWidthVector(final String name, final BufferAllocator allocator, FieldType fieldType) { super(name, allocator); - valueAllocationSizeInBytes = INITIAL_BYTE_COUNT; - validityAllocationSizeInBytes = getValidityBufferSizeFromCount(INITIAL_VALUE_ALLOCATION); - offsetAllocationSizeInBytes = (INITIAL_VALUE_ALLOCATION) * OFFSET_WIDTH; + initialValueAllocationSizeInBytes = INITIAL_BYTE_COUNT; + // -1 because we require one 
extra slot for the offset array. + initialValueAllocation = INITIAL_VALUE_ALLOCATION - 1; field = new Field(name, fieldType, null); valueCount = 0; lastSet = -1; @@ -155,15 +153,10 @@ public long getDataBufferAddress() { @Override public void setInitialCapacity(int valueCount) { final long size = (long) valueCount * DEFAULT_RECORD_BYTE_COUNT; - if (size > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); - } - valueAllocationSizeInBytes = (int) size; - validityAllocationSizeInBytes = getValidityBufferSizeFromCount(valueCount); - /* to track the end offset of last data element in vector, we need - * an additional slot in offset buffer. - */ - offsetAllocationSizeInBytes = (valueCount + 1) * OFFSET_WIDTH; + checkDataBufferSize(size); + computeAndCheckOffsetsBufferSize(valueCount); + initialValueAllocationSizeInBytes = (int) size; + initialValueAllocation = valueCount; } /** @@ -175,17 +168,10 @@ public void setInitialCapacity(int valueCount) { @Override public void setInitialCapacity(int valueCount, double density) { long size = Math.max((long)(valueCount * density), 1L); - - if (size > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); - } - - valueAllocationSizeInBytes = (int) size; - validityAllocationSizeInBytes = getValidityBufferSizeFromCount(valueCount); - /* to track the end offset of last data element in vector, we need - * an additional slot in offset buffer. - */ - offsetAllocationSizeInBytes = (valueCount + 1) * OFFSET_WIDTH; + checkDataBufferSize(size); + computeAndCheckOffsetsBufferSize(valueCount); + initialValueAllocationSizeInBytes = (int) size; + initialValueAllocation = valueCount; } /** @@ -376,20 +362,14 @@ public void allocateNew() { */ @Override public boolean allocateNewSafe() { - long curAllocationSizeValue = valueAllocationSizeInBytes; - long curAllocationSizeValidity = validityAllocationSizeInBytes; - long curAllocationSizeOffset = offsetAllocationSizeInBytes; - - if (curAllocationSizeValue > MAX_ALLOCATION_SIZE || - curAllocationSizeOffset > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Requested amount of memory exceeds limit"); - } + checkDataBufferSize(initialValueAllocationSizeInBytes); + computeAndCheckOffsetsBufferSize(initialValueAllocation); /* we are doing a new allocation -- release the current buffers */ clear(); try { - allocateBytes(curAllocationSizeValue, curAllocationSizeValidity, curAllocationSizeOffset); + allocateBytes(initialValueAllocationSizeInBytes, initialValueAllocation); } catch (Exception e) { clear(); return false; @@ -409,35 +389,59 @@ public boolean allocateNewSafe() { @Override public void allocateNew(int totalBytes, int valueCount) { assert totalBytes >= 0; - final int offsetBufferSize = (valueCount + 1) * OFFSET_WIDTH; - final int validityBufferSize = getValidityBufferSizeFromCount(valueCount); - if (totalBytes > MAX_ALLOCATION_SIZE || - offsetBufferSize > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Requested amount of memory exceeds limit"); - } + checkDataBufferSize(totalBytes); + computeAndCheckOffsetsBufferSize(valueCount); /* we are doing a new allocation -- release the current buffers */ clear(); try { - allocateBytes(totalBytes, validityBufferSize, offsetBufferSize); + allocateBytes(totalBytes, valueCount); } catch (Exception e) { clear(); throw e; } } + /* Check if the data buffer size is within bounds. 
*/ + private void checkDataBufferSize(long size) { + if (size > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Memory required for vector " + + " is (" + size + "), which is more than max allowed (" + MAX_ALLOCATION_SIZE + ")"); + } + } + + /* + * Compute the buffer size required for 'valueCount' offsets and validity, and check if it's + * within bounds. + */ + private long computeAndCheckOffsetsBufferSize(int valueCount) { + /* to track the end offset of last data element in vector, we need + * an additional slot in offset buffer. + */ + final long size = computeCombinedBufferSize(valueCount + 1, OFFSET_WIDTH); + if (size > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Memory required for vector capacity " + + valueCount + + " is (" + size + "), which is more than max allowed (" + MAX_ALLOCATION_SIZE + ")"); + } + return size; + } + /* allocate the inner buffers */ - private void allocateBytes(final long valueBufferSize, final long validityBufferSize, - final long offsetBufferSize) { + private void allocateBytes(final int valueBufferSize, final int valueCount) { /* allocate data buffer */ - int curSize = (int) valueBufferSize; + int curSize = valueBufferSize; valueBuffer = allocator.buffer(curSize); valueBuffer.readerIndex(0); - valueAllocationSizeInBytes = curSize; - allocateValidityBuffer(validityBufferSize); - allocateOffsetBuffer(offsetBufferSize); + + /* allocate offset buffer and validity buffer */ + DataAndValidityBuffers buffers = allocFixedDataAndValidityBufs(valueCount + 1, OFFSET_WIDTH); + offsetBuffer = buffers.getDataBuf(); + validityBuffer = buffers.getValidityBuf(); + initOffsetBuffer(); + initValidityBuffer(); } /* allocate offset buffer */ @@ -445,7 +449,6 @@ private void allocateOffsetBuffer(final long size) { final int curSize = (int) size; offsetBuffer = allocator.buffer(curSize); offsetBuffer.readerIndex(0); - offsetAllocationSizeInBytes = curSize; initOffsetBuffer(); } @@ -454,7 +457,6 @@ private void allocateValidityBuffer(final long size) { final int curSize = (int) size; validityBuffer = allocator.buffer(curSize); validityBuffer.readerIndex(0); - validityAllocationSizeInBytes = curSize; initValidityBuffer(); } @@ -476,7 +478,7 @@ public void reAlloc() { * @throws OutOfMemoryException if the internal memory allocation fails */ public void reallocDataBuffer() { - long baseSize = valueAllocationSizeInBytes; + long baseSize = initialValueAllocationSizeInBytes; final int currentBufferCapacity = valueBuffer.capacity(); if (baseSize < (long) currentBufferCapacity) { @@ -487,15 +489,12 @@ public void reallocDataBuffer() { newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); assert newAllocationSize >= 1; - if (newAllocationSize > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Unable to expand the buffer"); - } + checkDataBufferSize(newAllocationSize); final ArrowBuf newBuf = allocator.buffer((int) newAllocationSize); newBuf.setBytes(0, valueBuffer, 0, currentBufferCapacity); valueBuffer.release(); valueBuffer = newBuf; - valueAllocationSizeInBytes = (int) newAllocationSize; } /** @@ -522,40 +521,28 @@ public void reallocDataBuffer() { * @throws OutOfMemoryException if the internal memory allocation fails */ public void reallocValidityAndOffsetBuffers() { - offsetBuffer = reallocBufferHelper(offsetBuffer, true); - validityBuffer = reallocBufferHelper(validityBuffer, false); - } - - /* helper method to realloc a particular buffer. 
returns the allocated buffer */ - private ArrowBuf reallocBufferHelper(ArrowBuf buffer, final boolean offsetBuffer) { - final int currentBufferCapacity = buffer.capacity(); - long baseSize = (offsetBuffer ? offsetAllocationSizeInBytes - : validityAllocationSizeInBytes); - - if (baseSize < (long) currentBufferCapacity) { - baseSize = (long) currentBufferCapacity; - } - - long newAllocationSize = baseSize * 2L; - newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); - assert newAllocationSize >= 1; - - if (newAllocationSize > MAX_ALLOCATION_SIZE) { - throw new OversizedAllocationException("Unable to expand the buffer"); + int targetOffsetCount = (offsetBuffer.capacity() / OFFSET_WIDTH) * 2; + if (targetOffsetCount == 0) { + if (initialValueAllocation > 0) { + targetOffsetCount = 2 * (initialValueAllocation + 1); + } else { + targetOffsetCount = 2 * (INITIAL_VALUE_ALLOCATION + 1); + } } + computeAndCheckOffsetsBufferSize(targetOffsetCount); - final ArrowBuf newBuf = allocator.buffer((int) newAllocationSize); - newBuf.setBytes(0, buffer, 0, currentBufferCapacity); - newBuf.setZero(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); - buffer.release(1); - buffer = newBuf; - if (offsetBuffer) { - offsetAllocationSizeInBytes = (int) newAllocationSize; - } else { - validityAllocationSizeInBytes = (int) newAllocationSize; - } + DataAndValidityBuffers buffers = allocFixedDataAndValidityBufs(targetOffsetCount, OFFSET_WIDTH); + final ArrowBuf newOffsetBuffer = buffers.getDataBuf(); + newOffsetBuffer.setBytes(0, offsetBuffer, 0, offsetBuffer.capacity()); + newOffsetBuffer.setZero(offsetBuffer.capacity(), newOffsetBuffer.capacity() - offsetBuffer.capacity()); + offsetBuffer.release(); + offsetBuffer = newOffsetBuffer; - return buffer; + final ArrowBuf newValidityBuffer = buffers.getValidityBuf(); + newValidityBuffer.setBytes(0, validityBuffer, 0, validityBuffer.capacity()); + newValidityBuffer.setZero(validityBuffer.capacity(), newValidityBuffer.capacity() - validityBuffer.capacity()); + validityBuffer.release(); + validityBuffer = newValidityBuffer; } /** @@ -919,7 +906,7 @@ public long getStartEnd(int index) { @Override public void setIndexDefined(int index) { while (index >= getValidityBufferValueCapacity()) { - validityBuffer = reallocBufferHelper(validityBuffer, false); + reallocValidityAndOffsetBuffers(); } BitVectorHelper.setValidityBitToOne(validityBuffer, index); } @@ -1072,7 +1059,7 @@ public void setSafe(int index, ByteBuffer value, int start, int length) { */ public void setNull(int index) { while (index >= getValidityBufferValueCapacity()) { - validityBuffer = reallocBufferHelper(validityBuffer, false); + reallocValidityAndOffsetBuffers(); } BitVectorHelper.setValidityBit(validityBuffer, index, 0); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java index 7aac28cbf1fc4..c6c964233419d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java @@ -91,11 +91,10 @@ public MinorType getMinorType() { @Override public void setInitialCapacity(int valueCount) { final int size = getValidityBufferSizeFromCount(valueCount); - if (size > MAX_ALLOCATION_SIZE) { + if (size * 2 > MAX_ALLOCATION_SIZE) { throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); } - valueAllocationSizeInBytes = size; - validityAllocationSizeInBytes = size; + 
initialValueAllocation = valueCount; } /** diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java b/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java index 9165343bfdc2b..a407166c4f6d0 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestBufferOwnershipTransfer.java @@ -51,7 +51,7 @@ public void testTransferFixedWidth() { } @Test - public void testTransferVariableidth() { + public void testTransferVariableWidth() { BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); BufferAllocator childAllocator1 = allocator.newChildAllocator("child1", 100000, 100000); BufferAllocator childAllocator2 = allocator.newChildAllocator("child2", 100000, 100000); @@ -62,15 +62,12 @@ public void testTransferVariableidth() { v1.setValueCount(4001); VarCharVector v2 = new VarCharVector("v2", childAllocator2); + long memoryBeforeTransfer = childAllocator1.getAllocatedMemory(); v1.makeTransferPair(v2).transfer(); assertEquals(0, childAllocator1.getAllocatedMemory()); - int expectedValueVector = 4096 * 8; - int expectedOffsetVector = 4096 * 4; - int expectedBitVector = 512; - int expected = expectedBitVector + expectedOffsetVector + expectedValueVector; - assertEquals(expected, childAllocator2.getAllocatedMemory()); + assertEquals(memoryBeforeTransfer, childAllocator2.getAllocatedMemory()); } private static class Pointer { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestCopyFrom.java b/java/vector/src/test/java/org/apache/arrow/vector/TestCopyFrom.java index f7d3ddb397315..b10db95b6cf48 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestCopyFrom.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestCopyFrom.java @@ -69,14 +69,16 @@ public void terminate() throws Exception { @Test /* NullableVarChar */ public void testCopyFromWithNulls() { - try (final VarCharVector vector = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator); - final VarCharVector vector2 = - newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + try (final VarCharVector vector = + newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator); + final VarCharVector vector2 = + newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { vector.allocateNew(); - int capacity = vector.getValueCapacity(); - assertEquals(4095, capacity); + assertTrue(vector.getValueCapacity() >= 1); + assertEquals(0, vector.getValueCount()); + int initialCapacity = vector.getValueCapacity(); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < initialCapacity; i++) { if (i % 3 == 0) { continue; } @@ -85,43 +87,53 @@ public void testCopyFromWithNulls() { } /* NO reAlloc() should have happened in setSafe() */ - capacity = vector.getValueCapacity(); - assertEquals(4095, capacity); + int capacity = vector.getValueCapacity(); + assertEquals(initialCapacity, capacity); - vector.setValueCount(4095); + vector.setValueCount(initialCapacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < initialCapacity; i++) { if (i % 3 == 0) { assertNull(vector.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, Integer.toString(i), vector.getObject(i).toString()); + assertEquals( + "unexpected value at index: " + i, + Integer.toString(i), + vector.getObject(i).toString()); } } + vector2.setInitialCapacity(initialCapacity); 
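[Editorial aside, not part of the patch] The sizing logic in the new computeCombinedBufferSize/allocFixedDataAndValidityBufs helpers above can be followed with plain arithmetic. The sketch below mirrors that logic under stated assumptions: roundUp8 rounds a byte count up to the next multiple of 8, getValidityBufferSizeFromCount(n) is ceil(n/8) bytes, and nextPowerOfTwo behaves like BaseAllocator.nextPowerOfTwo for positive inputs. The class and helper names are hypothetical; this is a sketch of the technique, not Arrow code.

    // Editorial sketch: mirrors the combined data+validity sizing used above.
    public class CombinedBufferSizingSketch {
      // Assumption: round a byte count up to the next multiple of 8.
      static long roundUp8(long size) {
        return (size + 7) & ~7L;
      }

      // Assumption: one validity bit per value, i.e. ceil(valueCount / 8) bytes.
      static long validityBytes(long valueCount) {
        return (valueCount + 7) / 8;
      }

      // Assumption: behaves like BaseAllocator.nextPowerOfTwo for positive inputs.
      static long nextPowerOfTwo(long v) {
        long highest = Long.highestOneBit(v);
        return (highest == v) ? v : highest << 1;
      }

      public static void main(String[] args) {
        int valueCount = 4097; // e.g. 4096 offsets plus the extra trailing slot
        int typeWidth = 4;     // OFFSET_WIDTH

        // One allocation holds both buffers, rounded up to a power of two.
        long bufferSize = nextPowerOfTwo(
            roundUp8(validityBytes(valueCount)) + roundUp8((long) valueCount * typeWidth));

        // Power-of-two rounding over-allocates, so grow the usable slot count
        // until the combined layout no longer fits (same loop shape as the patch).
        int actualCount = (int) ((bufferSize * 8.0) / (8 * typeWidth + 1));
        while (roundUp8(validityBytes(actualCount))
            + roundUp8((long) actualCount * typeWidth) > bufferSize) {
          --actualCount;
        }

        System.out.println("combined buffer: " + bufferSize + " bytes, usable slots: " + actualCount);
      }
    }

Under these assumptions, 4097 four-byte offset slots round up to a single 32 KiB combined buffer with room for 7942 usable slots, which is why the tests below check capacities with >= rather than exact counts.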
vector2.allocateNew(); capacity = vector2.getValueCapacity(); - assertEquals(4095, capacity); + assertEquals(initialCapacity, capacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector); if (i % 3 == 0) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, Integer.toString(i), vector2.getObject(i).toString()); + assertEquals( + "unexpected value at index: " + i, + Integer.toString(i), + vector2.getObject(i).toString()); } } /* NO reAlloc() should have happened in copyFrom */ capacity = vector2.getValueCapacity(); - assertEquals(4095, capacity); + assertEquals(initialCapacity, capacity); - vector2.setValueCount(4095); + vector2.setValueCount(initialCapacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < initialCapacity; i++) { if (i % 3 == 0) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, Integer.toString(i), vector2.getObject(i).toString()); + assertEquals( + "unexpected value at index: " + i, + Integer.toString(i), + vector2.getObject(i).toString()); } } } @@ -129,14 +141,16 @@ public void testCopyFromWithNulls() { @Test /* NullableVarChar */ public void testCopyFromWithNulls1() { - try (final VarCharVector vector = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator); - final VarCharVector vector2 = - newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + try (final VarCharVector vector = + newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator); + final VarCharVector vector2 = + newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { vector.allocateNew(); - int capacity = vector.getValueCapacity(); - assertEquals(4095, capacity); + assertTrue(vector.getValueCapacity() >= 1); + assertEquals(0, vector.getValueCount()); + int initialCapacity = vector.getValueCapacity(); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < initialCapacity; i++) { if (i % 3 == 0) { continue; } @@ -145,47 +159,57 @@ public void testCopyFromWithNulls1() { } /* NO reAlloc() should have happened in setSafe() */ - capacity = vector.getValueCapacity(); - assertEquals(4095, capacity); + int capacity = vector.getValueCapacity(); + assertEquals(initialCapacity, capacity); - vector.setValueCount(4095); + vector.setValueCount(initialCapacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < initialCapacity; i++) { if (i % 3 == 0) { assertNull(vector.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, Integer.toString(i), vector.getObject(i).toString()); + assertEquals( + "unexpected value at index: " + i, + Integer.toString(i), + vector.getObject(i).toString()); } } /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024 * 10, 1024); + vector2.allocateNew((initialCapacity / 4) * 10, initialCapacity / 4); capacity = vector2.getValueCapacity(); - assertEquals(1024, capacity); + assertTrue(capacity >= initialCapacity / 4); + assertTrue(capacity < initialCapacity / 2); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector); if (i % 3 == 0) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, Integer.toString(i), vector2.getObject(i).toString()); + assertEquals( + "unexpected value at index: " + i, + Integer.toString(i), + 
vector2.getObject(i).toString()); } } /* 2 reAllocs should have happened in copyFromSafe() */ capacity = vector2.getValueCapacity(); - assertEquals(4096, capacity); + assertTrue(capacity >= initialCapacity); - vector2.setValueCount(4095); + vector2.setValueCount(initialCapacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < initialCapacity; i++) { if (i % 3 == 0) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, Integer.toString(i), vector2.getObject(i).toString()); + assertEquals( + "unexpected value at index: " + i, + Integer.toString(i), + vector2.getObject(i).toString()); } } } @@ -194,28 +218,29 @@ public void testCopyFromWithNulls1() { @Test /* IntVector */ public void testCopyFromWithNulls2() { try (final IntVector vector1 = new IntVector(EMPTY_SCHEMA_PATH, allocator); - final IntVector vector2 = new IntVector(EMPTY_SCHEMA_PATH, allocator)) { + final IntVector vector2 = new IntVector(EMPTY_SCHEMA_PATH, allocator)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } vector1.setSafe(i, 1000 + i); } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { @@ -226,23 +251,24 @@ public void testCopyFromWithNulls2() { /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { assertEquals("unexpected value at index: " + i, 1000 + i, vector2.get(i)); @@ -254,60 +280,60 @@ public void testCopyFromWithNulls2() { @Test /* BigIntVector */ public void testCopyFromWithNulls3() { try (final BigIntVector vector1 = new BigIntVector(EMPTY_SCHEMA_PATH, allocator); - final BigIntVector vector2 = new BigIntVector(EMPTY_SCHEMA_PATH, 
allocator)) { + final BigIntVector vector2 = new BigIntVector(EMPTY_SCHEMA_PATH, allocator)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } - vector1.setSafe(i, 10000000000L + (long)i); + vector1.setSafe(i, 10000000000L + (long) i); } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - 10000000000L + (long)i, vector1.get(i)); + assertEquals("unexpected value at index: " + i, 10000000000L + (long) i, vector1.get(i)); } } /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - 10000000000L + (long)i, vector2.get(i)); + assertEquals("unexpected value at index: " + i, 10000000000L + (long) i, vector2.get(i)); } } } @@ -316,8 +342,9 @@ public void testCopyFromWithNulls3() { @Test /* BitVector */ public void testCopyFromWithNulls4() { try (final BitVector vector1 = new BitVector(EMPTY_SCHEMA_PATH, allocator); - final BitVector vector2 = new BitVector(EMPTY_SCHEMA_PATH, allocator)) { + final BitVector vector2 = new BitVector(EMPTY_SCHEMA_PATH, allocator)) { + vector1.setInitialCapacity(4096); vector1.allocateNew(); assertEquals(4096, vector1.getValueCapacity()); assertEquals(0, vector1.getValueCount()); @@ -394,60 +421,60 @@ public void testCopyFromWithNulls4() { @Test /* Float4Vector */ public void testCopyFromWithNulls5() { try (final Float4Vector vector1 = new Float4Vector(EMPTY_SCHEMA_PATH, allocator); - final Float4Vector vector2 = new Float4Vector(EMPTY_SCHEMA_PATH, allocator)) { + final Float4Vector vector2 = new Float4Vector(EMPTY_SCHEMA_PATH, allocator)) { 
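[Editorial aside, not part of the patch] Because a fixed-width allocation is now one power-of-two sized buffer shared by data and validity, getValueCapacity() after allocateNew() is only guaranteed to be at least the requested count. That is why these rewritten tests read the capacity back and assert with >= instead of expecting 4095/4096/1024 exactly. A minimal usage sketch of that pattern follows; it uses only APIs that appear elsewhere in this patch (RootAllocator, Float4Vector, allocateNew, getValueCapacity, setSafe), and the class name is hypothetical.

    // Editorial sketch of the capacity-as-lower-bound pattern used by these tests.
    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.Float4Vector;

    public class CapacityLowerBoundSketch {
      public static void main(String[] args) {
        try (BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE);
             Float4Vector vector = new Float4Vector("floats", allocator)) {
          vector.allocateNew(1024);                   // request room for 1024 values
          int capacity = vector.getValueCapacity();   // actual capacity is a lower bound
          assert capacity >= 1024;

          vector.setSafe(capacity, 1.5f);             // one past the end: setSafe reallocates
          assert vector.getValueCapacity() >= 2 * capacity;
        }
      }
    }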
vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } - vector1.setSafe(i, 100.25f + (float)i); + vector1.setSafe(i, 100.25f + (float) i); } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - 100.25f + (float)i, vector1.get(i), 0); + assertEquals("unexpected value at index: " + i, 100.25f + (float) i, vector1.get(i), 0); } } /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - 100.25f + i * 1.0f, vector2.get(i), 0); + assertEquals("unexpected value at index: " + i, 100.25f + i * 1.0f, vector2.get(i), 0); } } } @@ -456,60 +483,62 @@ public void testCopyFromWithNulls5() { @Test /* Float8Vector */ public void testCopyFromWithNulls6() { try (final Float8Vector vector1 = new Float8Vector(EMPTY_SCHEMA_PATH, allocator); - final Float8Vector vector2 = new Float8Vector(EMPTY_SCHEMA_PATH, allocator)) { + final Float8Vector vector2 = new Float8Vector(EMPTY_SCHEMA_PATH, allocator)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } vector1.setSafe(i, 123456.7865 + (double) i); } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - 
assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - 123456.7865 + (double) i, vector1.get(i), 0); + assertEquals( + "unexpected value at index: " + i, 123456.7865 + (double) i, vector1.get(i), 0); } } /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - 123456.7865 + (double) i, vector2.get(i), 0); + assertEquals( + "unexpected value at index: " + i, 123456.7865 + (double) i, vector2.get(i), 0); } } } @@ -518,30 +547,31 @@ public void testCopyFromWithNulls6() { @Test /* IntervalDayVector */ public void testCopyFromWithNulls7() { try (final IntervalDayVector vector1 = new IntervalDayVector(EMPTY_SCHEMA_PATH, allocator); - final IntervalDayVector vector2 = new IntervalDayVector(EMPTY_SCHEMA_PATH, allocator)) { + final IntervalDayVector vector2 = new IntervalDayVector(EMPTY_SCHEMA_PATH, allocator)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); final int days = 10; final int milliseconds = 10000; - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } vector1.setSafe(i, days + i, milliseconds + i); } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { @@ -554,23 +584,24 @@ public void testCopyFromWithNulls7() { /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, 
vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { final Period p = vector2.getObject(i); @@ -584,15 +615,16 @@ public void testCopyFromWithNulls7() { @Test /* IntervalYearVector */ public void testCopyFromWithNulls8() { try (final IntervalYearVector vector1 = new IntervalYearVector(EMPTY_SCHEMA_PATH, allocator); - final IntervalYearVector vector2 = new IntervalYearVector(EMPTY_SCHEMA_PATH, allocator)) { + final IntervalYearVector vector2 = new IntervalYearVector(EMPTY_SCHEMA_PATH, allocator)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); final int interval = 30; /* 2 years 6 months */ - final Period[] periods = new Period[4096]; - for (int i = 0; i < 4096; i++) { + final Period[] periods = new Period[4096]; + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } @@ -600,18 +632,19 @@ public void testCopyFromWithNulls8() { final Period p = new Period(); final int years = (interval + i) / org.apache.arrow.vector.util.DateUtility.yearsToMonths; final int months = (interval + i) % org.apache.arrow.vector.util.DateUtility.yearsToMonths; - periods[i] = p.plusYears(years).plusMonths(months);; + periods[i] = p.plusYears(years).plusMonths(months); + ; } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { @@ -624,23 +657,24 @@ public void testCopyFromWithNulls8() { /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, 
vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { final Period p = vector2.getObject(i); @@ -653,61 +687,61 @@ public void testCopyFromWithNulls8() { @Test /* SmallIntVector */ public void testCopyFromWithNulls9() { try (final SmallIntVector vector1 = new SmallIntVector(EMPTY_SCHEMA_PATH, allocator); - final SmallIntVector vector2 = new SmallIntVector(EMPTY_SCHEMA_PATH, allocator)) { + final SmallIntVector vector2 = new SmallIntVector(EMPTY_SCHEMA_PATH, allocator)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); final short val = 1000; - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } - vector1.setSafe(i, val + (short)i); + vector1.setSafe(i, val + (short) i); } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - val + (short)i, vector1.get(i)); + assertEquals("unexpected value at index: " + i, val + (short) i, vector1.get(i)); } } /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at 
index: " + i, - val + (short)i, vector2.get(i)); + assertEquals("unexpected value at index: " + i, val + (short) i, vector2.get(i)); } } } @@ -716,61 +750,61 @@ public void testCopyFromWithNulls9() { @Test /* TimeMicroVector */ public void testCopyFromWithNulls10() { try (final TimeMicroVector vector1 = new TimeMicroVector(EMPTY_SCHEMA_PATH, allocator); - final TimeMicroVector vector2 = new TimeMicroVector(EMPTY_SCHEMA_PATH, allocator)) { + final TimeMicroVector vector2 = new TimeMicroVector(EMPTY_SCHEMA_PATH, allocator)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); final long val = 100485765432L; - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } - vector1.setSafe(i, val + (long)i); + vector1.setSafe(i, val + (long) i); } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - val + (long)i, vector1.get(i)); + assertEquals("unexpected value at index: " + i, val + (long) i, vector1.get(i)); } } /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - val + (long) i, vector2.get(i)); + assertEquals("unexpected value at index: " + i, val + (long) i, vector2.get(i)); } } } @@ -779,61 +813,61 @@ public void testCopyFromWithNulls10() { @Test /* TimeMilliVector */ public void testCopyFromWithNulls11() { try (final TimeMilliVector vector1 = new TimeMilliVector(EMPTY_SCHEMA_PATH, allocator); - final TimeMilliVector vector2 = new TimeMilliVector(EMPTY_SCHEMA_PATH, allocator)) { + final TimeMilliVector vector2 = new TimeMilliVector(EMPTY_SCHEMA_PATH, allocator)) { vector1.allocateNew(); - assertEquals(4096, 
vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); final int val = 1000; - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } vector1.setSafe(i, val + i); } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - val + i, vector1.get(i)); + assertEquals("unexpected value at index: " + i, val + i, vector1.get(i)); } } /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - val + i, vector2.get(i)); + assertEquals("unexpected value at index: " + i, val + i, vector2.get(i)); } } } @@ -842,14 +876,15 @@ public void testCopyFromWithNulls11() { @Test /* TinyIntVector */ public void testCopyFromWithNulls12() { try (final TinyIntVector vector1 = new TinyIntVector(EMPTY_SCHEMA_PATH, allocator); - final TinyIntVector vector2 = new TinyIntVector(EMPTY_SCHEMA_PATH, allocator)) { + final TinyIntVector vector2 = new TinyIntVector(EMPTY_SCHEMA_PATH, allocator)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); byte val = -128; - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } @@ -857,16 +892,16 @@ public void testCopyFromWithNulls12() { val++; } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, 
vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); val = -128; - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { @@ -878,24 +913,24 @@ public void testCopyFromWithNulls12() { /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ val = -128; - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { assertEquals("unexpected value at index: " + i, val, vector2.get(i)); @@ -908,32 +943,33 @@ public void testCopyFromWithNulls12() { @Test /* DecimalVector */ public void testCopyFromWithNulls13() { try (final DecimalVector vector1 = new DecimalVector(EMPTY_SCHEMA_PATH, allocator, 30, 16); - final DecimalVector vector2 = new DecimalVector(EMPTY_SCHEMA_PATH, allocator, 30, 16)) { + final DecimalVector vector2 = new DecimalVector(EMPTY_SCHEMA_PATH, allocator, 30, 16)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); final double baseValue = 104567897654.876543654; final BigDecimal[] decimals = new BigDecimal[4096]; - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } - BigDecimal decimal = new BigDecimal(baseValue + (double)i); + BigDecimal decimal = new BigDecimal(baseValue + (double) i); vector1.setSafe(i, decimal); decimals[i] = decimal; } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { @@ -945,23 +981,24 @@ public void testCopyFromWithNulls13() { /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; 
i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { final BigDecimal decimal = vector2.getObject(i); @@ -974,61 +1011,61 @@ public void testCopyFromWithNulls13() { @Test /* TimeStampVector */ public void testCopyFromWithNulls14() { try (final TimeStampVector vector1 = new TimeStampMicroVector(EMPTY_SCHEMA_PATH, allocator); - final TimeStampVector vector2 = new TimeStampMicroVector(EMPTY_SCHEMA_PATH, allocator)) { + final TimeStampVector vector2 = new TimeStampMicroVector(EMPTY_SCHEMA_PATH, allocator)) { vector1.allocateNew(); - assertEquals(4096, vector1.getValueCapacity()); + assertTrue(vector1.getValueCapacity() >= vector1.initialValueAllocation); assertEquals(0, vector1.getValueCount()); + int initialCapacity = vector1.getValueCapacity(); final long val = 20145678912L; - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { continue; } - vector1.setSafe(i, val + (long)i); + vector1.setSafe(i, val + (long) i); } - vector1.setValueCount(4096); + vector1.setValueCount(initialCapacity); /* No realloc should have happened in setSafe or * setValueCount */ - assertEquals(4096, vector1.getValueCapacity()); - assertEquals(4096, vector1.getValueCount()); + assertEquals(initialCapacity, vector1.getValueCapacity()); + assertEquals(initialCapacity, vector1.getValueCount()); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { if ((i & 1) == 0) { assertNull(vector1.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - val + (long)i, vector1.get(i)); + assertEquals("unexpected value at index: " + i, val + (long) i, vector1.get(i)); } } /* set lesser initial capacity than actually needed * to trigger reallocs in copyFromSafe() */ - vector2.allocateNew(1024); - assertEquals(1024, vector2.getValueCapacity()); + vector2.allocateNew(initialCapacity / 4); + assertTrue(vector2.getValueCapacity() >= initialCapacity / 4); + assertTrue(vector2.getValueCapacity() < initialCapacity / 2); - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity; i++) { vector2.copyFromSafe(i, i, vector1); } /* 2 realloc should have happened in copyFromSafe() */ - assertEquals(4096, vector2.getValueCapacity()); - vector2.setValueCount(8192); + assertTrue(vector2.getValueCapacity() >= initialCapacity); + vector2.setValueCount(initialCapacity * 2); /* setValueCount() should have done another realloc */ - assertEquals(8192, vector2.getValueCount()); - assertEquals(8192, vector2.getValueCapacity()); + assertEquals(initialCapacity * 2, vector2.getValueCount()); + assertTrue(vector2.getValueCapacity() >= initialCapacity * 2); /* check vector data after copy and realloc */ - for (int i = 0; i < 8192; i++) { - if (((i & 
1) == 0) || (i >= 4096)) { + for (int i = 0; i < initialCapacity * 2; i++) { + if (((i & 1) == 0) || (i >= initialCapacity)) { assertNull(vector2.getObject(i)); } else { - assertEquals("unexpected value at index: " + i, - val + (long) i, vector2.get(i)); + assertEquals("unexpected value at index: " + i, val + (long) i, vector2.get(i)); } } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 4772a86356b95..30fe23cae4afd 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -32,6 +32,7 @@ import java.util.Arrays; import java.util.List; +import org.apache.arrow.memory.BaseAllocator; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; @@ -68,8 +69,8 @@ public void init() { private static final byte[] STR5 = "EEE5".getBytes(utf8Charset); private static final byte[] STR6 = "FFFFF6".getBytes(utf8Charset); private static final int MAX_VALUE_COUNT = - Integer.getInteger("arrow.vector.max_allocation_bytes", Integer.MAX_VALUE) / 4; - private static final int MAX_VALUE_COUNT_8BYTE = MAX_VALUE_COUNT / 2; + (int)(Integer.getInteger("arrow.vector.max_allocation_bytes", Integer.MAX_VALUE) / 7); + private static final int MAX_VALUE_COUNT_8BYTE = (int)(MAX_VALUE_COUNT / 2); @After public void terminate() throws Exception { @@ -108,7 +109,7 @@ public void testFixedType1() { vector.allocateNew(1024); initialCapacity = vector.getValueCapacity(); - assertEquals(1024, initialCapacity); + assertTrue(initialCapacity >= 1024); // Put and set a few values vector.setSafe(0, 100); @@ -124,7 +125,7 @@ public void testFixedType1() { assertEquals(104, vector.get(1023)); try { - vector.set(1024, 10000); + vector.set(initialCapacity, 10000); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -133,7 +134,7 @@ public void testFixedType1() { } try { - vector.get(1024); + vector.get(initialCapacity); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -142,10 +143,10 @@ public void testFixedType1() { } /* this should trigger a realloc() */ - vector.setSafe(1024, 10000); + vector.setSafe(initialCapacity, 10000); /* underlying buffer should now be able to store double the number of values */ - assertEquals(initialCapacity * 2, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 2 * initialCapacity); /* check vector data after realloc */ assertEquals(100, vector.get(0)); @@ -153,16 +154,17 @@ public void testFixedType1() { assertEquals(102, vector.get(100)); assertEquals(103, vector.get(1022)); assertEquals(104, vector.get(1023)); - assertEquals(10000, vector.get(1024)); + assertEquals(10000, vector.get(initialCapacity)); /* reset the vector */ + int capacityBeforeReset = vector.getValueCapacity(); vector.reset(); /* capacity shouldn't change after reset */ - assertEquals(initialCapacity * 2, vector.getValueCapacity()); + assertEquals(capacityBeforeReset, vector.getValueCapacity()); /* vector data should have been zeroed out */ - for (int i = 0; i < (initialCapacity * 2); i++) { + for (int i = 0; i < capacityBeforeReset; i++) { // TODO: test vector.get(i) is 0 after unsafe get added assertEquals("non-zero data not expected at index: " + i, true, vector.isNull(i)); } @@ -180,7 +182,7 @@ public void testFixedType2() { 
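[Editorial aside, not part of the patch] The TestValueVector changes above and below rely on the same boundary contract: plain set()/get() never grow the vector and throw IndexOutOfBoundsException past getValueCapacity(), while setSafe() reallocates the combined buffers and then writes, at least doubling the capacity. A hedged sketch of that contract (class name hypothetical; it assumes bounds checking is enabled, as in these tests):

    // Editorial sketch contrasting set() and setSafe() at the capacity boundary.
    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.IntVector;

    public class SetVersusSetSafeSketch {
      public static void main(String[] args) {
        try (BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE);
             IntVector vector = new IntVector("ints", allocator)) {
          vector.allocateNew(16);
          int capacity = vector.getValueCapacity();   // >= 16; exact value depends on rounding

          boolean outOfBounds = false;
          try {
            vector.set(capacity, 42);                 // set() does not reallocate
          } catch (IndexOutOfBoundsException e) {
            outOfBounds = true;
          }
          assert outOfBounds;

          vector.setSafe(capacity, 42);               // setSafe() reallocates, then writes
          assert vector.getValueCapacity() >= 2 * capacity;
          assert vector.get(capacity) == 42;
        }
      }
    }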
intVector.setInitialCapacity(MAX_VALUE_COUNT); try { - intVector.setInitialCapacity(MAX_VALUE_COUNT + 1); + intVector.setInitialCapacity(MAX_VALUE_COUNT * 2); } catch (OversizedAllocationException oe) { error = true; } finally { @@ -195,17 +197,18 @@ public void testFixedType2() { /* allocate 64 bytes (16 * 4) */ intVector.allocateNew(); /* underlying buffer should be able to store 16 values */ - assertEquals(initialCapacity, intVector.getValueCapacity()); + assertTrue(intVector.getValueCapacity() >= initialCapacity); + initialCapacity = intVector.getValueCapacity(); /* populate the vector */ int j = 1; - for (int i = 0; i < 16; i += 2) { + for (int i = 0; i < initialCapacity; i += 2) { intVector.set(i, j); j++; } try { - intVector.set(16, 9); + intVector.set(initialCapacity, j); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -215,13 +218,13 @@ public void testFixedType2() { /* check vector contents */ j = 1; - for (int i = 0; i < 16; i += 2) { + for (int i = 0; i < initialCapacity; i += 2) { assertEquals("unexpected value at index: " + i, j, intVector.get(i)); j++; } try { - intVector.get(16); + intVector.get(initialCapacity); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -230,26 +233,27 @@ public void testFixedType2() { } /* this should trigger a realloc() */ - intVector.setSafe(16, 9); + intVector.setSafe(initialCapacity, j); /* underlying buffer should now be able to store double the number of values */ - assertEquals(initialCapacity * 2, intVector.getValueCapacity()); + assertTrue(intVector.getValueCapacity() >= initialCapacity * 2); /* vector data should still be intact after realloc */ j = 1; - for (int i = 0; i <= 16; i += 2) { + for (int i = 0; i <= initialCapacity; i += 2) { assertEquals("unexpected value at index: " + i, j, intVector.get(i)); j++; } /* reset the vector */ + int capacityBeforeRealloc = intVector.getValueCapacity(); intVector.reset(); /* capacity shouldn't change after reset */ - assertEquals(initialCapacity * 2, intVector.getValueCapacity()); + assertEquals(capacityBeforeRealloc, intVector.getValueCapacity()); /* vector data should have been zeroed out */ - for (int i = 0; i < (initialCapacity * 2); i++) { + for (int i = 0; i < capacityBeforeRealloc; i++) { assertEquals("non-zero data not expected at index: " + i, true, intVector.isNull(i)); } } @@ -266,7 +270,7 @@ public void testFixedType3() { floatVector.setInitialCapacity(MAX_VALUE_COUNT); try { - floatVector.setInitialCapacity(MAX_VALUE_COUNT + 1); + floatVector.setInitialCapacity(MAX_VALUE_COUNT * 2); } catch (OversizedAllocationException oe) { error = true; } finally { @@ -281,7 +285,8 @@ public void testFixedType3() { /* allocate 64 bytes (16 * 4) */ floatVector.allocateNew(); /* underlying buffer should be able to store 16 values */ - assertEquals(initialCapacity, floatVector.getValueCapacity()); + assertTrue(floatVector.getValueCapacity() >= initialCapacity); + initialCapacity = floatVector.getValueCapacity(); floatVector.zeroVector(); @@ -296,7 +301,7 @@ public void testFixedType3() { floatVector.set(14, 8.5f); try { - floatVector.set(16, 9.5f); + floatVector.set(initialCapacity, 9.5f); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -315,7 +320,7 @@ public void testFixedType3() { assertEquals(8.5f, floatVector.get(14), 0); try { - floatVector.get(16); + floatVector.get(initialCapacity); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -324,10 +329,10 @@ public void testFixedType3() { } /* this should trigger a 
realloc() */ - floatVector.setSafe(16, 9.5f); + floatVector.setSafe(initialCapacity, 9.5f); /* underlying buffer should now be able to store double the number of values */ - assertEquals(initialCapacity * 2, floatVector.getValueCapacity()); + assertTrue(floatVector.getValueCapacity() >= initialCapacity * 2); /* vector data should still be intact after realloc */ assertEquals(1.5f, floatVector.get(0), 0); @@ -338,16 +343,17 @@ public void testFixedType3() { assertEquals(6.6f, floatVector.get(10), 0); assertEquals(7.8f, floatVector.get(12), 0); assertEquals(8.5f, floatVector.get(14), 0); - assertEquals(9.5f, floatVector.get(16), 0); + assertEquals(9.5f, floatVector.get(initialCapacity), 0); /* reset the vector */ + int capacityBeforeReset = floatVector.getValueCapacity(); floatVector.reset(); /* capacity shouldn't change after reset */ - assertEquals(initialCapacity * 2, floatVector.getValueCapacity()); + assertEquals(capacityBeforeReset, floatVector.getValueCapacity()); /* vector data should be zeroed out */ - for (int i = 0; i < (initialCapacity * 2); i++) { + for (int i = 0; i < capacityBeforeReset; i++) { assertEquals("non-zero data not expected at index: " + i, true, floatVector.isNull(i)); } } @@ -364,7 +370,7 @@ public void testFixedType4() { floatVector.setInitialCapacity(MAX_VALUE_COUNT_8BYTE); try { - floatVector.setInitialCapacity(MAX_VALUE_COUNT_8BYTE + 1); + floatVector.setInitialCapacity(MAX_VALUE_COUNT_8BYTE * 2); } catch (OversizedAllocationException oe) { error = true; } finally { @@ -379,7 +385,8 @@ public void testFixedType4() { /* allocate 128 bytes (16 * 8) */ floatVector.allocateNew(); /* underlying buffer should be able to store 16 values */ - assertEquals(initialCapacity, floatVector.getValueCapacity()); + assertTrue(floatVector.getValueCapacity() >= initialCapacity); + initialCapacity = floatVector.getValueCapacity(); /* populate the vector */ floatVector.set(0, 1.55); @@ -392,7 +399,7 @@ public void testFixedType4() { floatVector.set(14, 8.56); try { - floatVector.set(16, 9.53); + floatVector.set(initialCapacity, 9.53); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -411,7 +418,7 @@ public void testFixedType4() { assertEquals(8.56, floatVector.get(14), 0); try { - floatVector.get(16); + floatVector.get(initialCapacity); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -420,10 +427,10 @@ public void testFixedType4() { } /* this should trigger a realloc() */ - floatVector.setSafe(16, 9.53); + floatVector.setSafe(initialCapacity, 9.53); /* underlying buffer should now be able to store double the number of values */ - assertEquals(initialCapacity * 2, floatVector.getValueCapacity()); + assertTrue(floatVector.getValueCapacity() >= initialCapacity * 2); /* vector data should still be intact after realloc */ assertEquals(1.55, floatVector.get(0), 0); @@ -434,16 +441,17 @@ public void testFixedType4() { assertEquals(6.67, floatVector.get(10), 0); assertEquals(7.87, floatVector.get(12), 0); assertEquals(8.56, floatVector.get(14), 0); - assertEquals(9.53, floatVector.get(16), 0); + assertEquals(9.53, floatVector.get(initialCapacity), 0); /* reset the vector */ + int capacityBeforeReset = floatVector.getValueCapacity(); floatVector.reset(); /* capacity shouldn't change after reset */ - assertEquals(initialCapacity * 2, floatVector.getValueCapacity()); + assertEquals(capacityBeforeReset, floatVector.getValueCapacity()); /* vector data should be zeroed out */ - for (int i = 0; i < (initialCapacity * 2); i++) { + for (int i = 0; i < 
capacityBeforeReset; i++) { assertEquals("non-zero data not expected at index: " + i, true, floatVector.isNull(i)); } } @@ -463,36 +471,37 @@ public void testNullableFixedType1() { assertEquals(0, vector.getValueCapacity()); vector.allocateNew(); - assertEquals(initialCapacity, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= initialCapacity); + initialCapacity = vector.getValueCapacity(); // Put and set a few values vector.set(0, 100); vector.set(1, 101); vector.set(100, 102); - vector.set(1022, 103); - vector.set(1023, 104); + vector.set(initialCapacity - 2, 103); + vector.set(initialCapacity - 1, 104); /* check vector contents */ assertEquals(100, vector.get(0)); assertEquals(101, vector.get(1)); assertEquals(102, vector.get(100)); - assertEquals(103, vector.get(1022)); - assertEquals(104, vector.get(1023)); + assertEquals(103, vector.get(initialCapacity - 2)); + assertEquals(104, vector.get(initialCapacity - 1)); int val = 0; /* check unset bits/null values */ - for (int i = 2, j = 101; i <= 99 || j <= 1021; i++, j++) { + for (int i = 2, j = 101; i <= 99 || j <= initialCapacity - 3; i++, j++) { if (i <= 99) { assertTrue(vector.isNull(i)); } - if (j <= 1021) { + if (j <= initialCapacity - 3) { assertTrue(vector.isNull(j)); } } try { - vector.set(1024, 10000); + vector.set(initialCapacity, 10000); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -501,7 +510,7 @@ public void testNullableFixedType1() { } try { - vector.get(1024); + vector.get(initialCapacity); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -510,39 +519,40 @@ public void testNullableFixedType1() { } /* should trigger a realloc of the underlying bitvector and valuevector */ - vector.setSafe(1024, 10000); + vector.setSafe(initialCapacity, 10000); /* check new capacity */ - assertEquals(initialCapacity * 2, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= initialCapacity * 2); /* vector contents should still be intact after realloc */ assertEquals(100, vector.get(0)); assertEquals(101, vector.get(1)); assertEquals(102, vector.get(100)); - assertEquals(103, vector.get(1022)); - assertEquals(104, vector.get(1023)); - assertEquals(10000, vector.get(1024)); + assertEquals(103, vector.get(initialCapacity - 2)); + assertEquals(104, vector.get(initialCapacity - 1)); + assertEquals(10000, vector.get(initialCapacity)); val = 0; /* check unset bits/null values */ - for (int i = 2, j = 101; i < 99 || j < 1021; i++, j++) { + for (int i = 2, j = 101; i < 99 || j < initialCapacity - 3; i++, j++) { if (i <= 99) { assertTrue(vector.isNull(i)); } - if (j <= 1021) { + if (j <= initialCapacity - 3) { assertTrue(vector.isNull(j)); } } /* reset the vector */ + int capacityBeforeReset = vector.getValueCapacity(); vector.reset(); /* capacity shouldn't change after reset */ - assertEquals(initialCapacity * 2, vector.getValueCapacity()); + assertEquals(capacityBeforeReset, vector.getValueCapacity()); /* vector data should be zeroed out */ - for (int i = 0; i < (initialCapacity * 2); i++) { + for (int i = 0; i < capacityBeforeReset; i++) { assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); } } @@ -560,7 +570,8 @@ public void testNullableFixedType2() { assertEquals(0, vector.getValueCapacity()); vector.allocateNew(); - assertEquals(initialCapacity, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= initialCapacity); + initialCapacity = vector.getValueCapacity(); /* populate the vector */ vector.set(0, 100.5f); @@ -573,7 +584,7 
@@ public void testNullableFixedType2() { vector.set(14, 89.5f); try { - vector.set(16, 90.5f); + vector.set(initialCapacity, 90.5f); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -600,7 +611,7 @@ public void testNullableFixedType2() { assertTrue(vector.isNull(15)); try { - vector.get(16); + vector.get(initialCapacity); } catch (IndexOutOfBoundsException ie) { error = true; } finally { @@ -609,10 +620,10 @@ public void testNullableFixedType2() { } /* this should trigger a realloc() */ - vector.setSafe(16, 90.5f); + vector.setSafe(initialCapacity, 90.5f); /* underlying buffer should now be able to store double the number of values */ - assertEquals(initialCapacity * 2, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 2 * initialCapacity); /* vector data should still be intact after realloc */ assertEquals(100.5f, vector.get(0), 0); @@ -633,13 +644,14 @@ public void testNullableFixedType2() { assertTrue(vector.isNull(15)); /* reset the vector */ + int capacityBeforeReset = vector.getValueCapacity(); vector.reset(); /* capacity shouldn't change after reset */ - assertEquals(initialCapacity * 2, vector.getValueCapacity()); + assertEquals(capacityBeforeReset, vector.getValueCapacity()); /* vector data should be zeroed out */ - for (int i = 0; i < (initialCapacity * 2); i++) { + for (int i = 0; i < capacityBeforeReset; i++) { assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); } } @@ -656,8 +668,9 @@ public void testNullableFixedType3() { assertEquals(0, vector.getValueCapacity()); /* allocate space for 4KB data (1024 * 4) */ vector.allocateNew(initialCapacity); - /* underlying buffer should be able to store 16 values */ - assertEquals(initialCapacity, vector.getValueCapacity()); + /* underlying buffer should be able to store 1024 values */ + assertTrue(vector.getValueCapacity() >= initialCapacity); + initialCapacity = vector.getValueCapacity(); vector.set(0, 1); vector.set(1, 2); @@ -687,7 +700,7 @@ public void testNullableFixedType3() { ArrowBuf validityVectorBuf = buffers.get(0); /* bitvector tracks 1024 integers --> 1024 bits --> 128 bytes */ - assertEquals(128, validityVectorBuf.readableBytes()); + assertTrue(validityVectorBuf.readableBytes() >= 128); assertEquals(3, validityVectorBuf.getByte(0)); // 1st and second bit defined for (int i = 1; i < 12; i++) { assertEquals(0, validityVectorBuf.getByte(i)); // nothing defined until 100 @@ -699,15 +712,15 @@ public void testNullableFixedType3() { assertEquals(-64, validityVectorBuf.getByte(127)); // 1022nd and 1023rd bit defined /* this should trigger a realloc() */ - vector.setSafe(1024, 6); + vector.setSafe(initialCapacity, 6); /* underlying buffer should now be able to store double the number of values */ - assertEquals(initialCapacity * 2, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 2 * initialCapacity); /* vector data should still be intact after realloc */ j = 1; for (int i = 0; i < (initialCapacity * 2); i++) { - if ((i > 1024) || (i >= 2 && i <= 99) || (i >= 101 && i <= 1021)) { + if ((i > 1023 && i != initialCapacity) || (i >= 2 && i <= 99) || (i >= 101 && i <= 1021)) { assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); } else { assertFalse("null data not expected at index: " + i, vector.isNull(i)); @@ -717,19 +730,20 @@ public void testNullableFixedType3() { } /* reset the vector */ + int capacityBeforeReset = vector.getValueCapacity(); vector.reset(); /* capacity shouldn't change after reset */ - 
assertEquals(initialCapacity * 2, vector.getValueCapacity()); + assertEquals(capacityBeforeReset, vector.getValueCapacity()); /* vector data should have been zeroed out */ - for (int i = 0; i < (initialCapacity * 2); i++) { + for (int i = 0; i < capacityBeforeReset; i++) { assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); } - vector.allocateNew(4096); + vector.allocateNew(initialCapacity * 4); // vector has been erased - for (int i = 0; i < 4096; i++) { + for (int i = 0; i < initialCapacity * 4; i++) { assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); } } @@ -764,7 +778,7 @@ public void testNullableFixedType4() { } vector.setSafe(valueCapacity, 20000000); - assertEquals(valueCapacity * 2, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= valueCapacity * 2); for (int i = 0; i < vector.getValueCapacity(); i++) { if (i == valueCapacity) { @@ -795,14 +809,15 @@ public void testNullableFixedType4() { } } - vector.setSafe((valueCapacity * 2) + 1000, 400000000); - assertEquals(valueCapacity * 4, vector.getValueCapacity()); + int valueCapacityBeforeRealloc = vector.getValueCapacity(); + vector.setSafe(valueCapacityBeforeRealloc + 1000, 400000000); + assertTrue(vector.getValueCapacity() >= valueCapacity * 4); for (int i = 0; i < vector.getValueCapacity(); i++) { - if (i == (valueCapacity * 2 + 1000)) { + if (i == (valueCapacityBeforeRealloc + 1000)) { assertFalse("unexpected null value at index: " + i, vector.isNull(i)); assertEquals("unexpected value at index: " + i, 400000000, vector.get(i)); - } else if (i < valueCapacity * 2 && (i % 2) == 0) { + } else if (i < valueCapacityBeforeRealloc && (i % 2) == 0) { assertFalse("unexpected null value at index: " + i, vector.isNull(i)); assertEquals("unexpected value at index: " + i, baseValue + i, vector.get(i)); } else { @@ -811,13 +826,14 @@ public void testNullableFixedType4() { } /* reset the vector */ + int valueCapacityBeforeReset = vector.getValueCapacity(); vector.reset(); /* capacity shouldn't change after reset */ - assertEquals(valueCapacity * 4, vector.getValueCapacity()); + assertEquals(valueCapacityBeforeReset, vector.getValueCapacity()); /* vector data should be zeroed out */ - for (int i = 0; i < (valueCapacity * 4); i++) { + for (int i = 0; i < valueCapacityBeforeReset; i++) { assertTrue("non-null data not expected at index: " + i, vector.isNull(i)); } } @@ -936,52 +952,56 @@ public void testNullableVarType2() { @Test /* Float8Vector */ public void testReallocAfterVectorTransfer1() { try (final Float8Vector vector = new Float8Vector(EMPTY_SCHEMA_PATH, allocator)) { - final int initialDefaultCapacity = 4096; + int initialCapacity = 4096; boolean error = false; /* use the default capacity; 4096*8 => 32KB */ + vector.setInitialCapacity(initialCapacity); vector.allocateNew(); - assertEquals(initialDefaultCapacity, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= initialCapacity); + initialCapacity = vector.getValueCapacity(); double baseValue = 100.375; - for (int i = 0; i < initialDefaultCapacity; i++) { + for (int i = 0; i < initialCapacity; i++) { vector.setSafe(i, baseValue + (double)i); } /* the above setSafe calls should not have triggered a realloc as * we are within the capacity. 
check the vector contents */ - assertEquals(initialDefaultCapacity, vector.getValueCapacity()); + assertEquals(initialCapacity, vector.getValueCapacity()); - for (int i = 0; i < initialDefaultCapacity; i++) { + for (int i = 0; i < initialCapacity; i++) { double value = vector.get(i); assertEquals(baseValue + (double)i, value, 0); } /* this should trigger a realloc */ - vector.setSafe(initialDefaultCapacity, baseValue + (double)initialDefaultCapacity); - assertEquals(initialDefaultCapacity * 2, vector.getValueCapacity()); + vector.setSafe(initialCapacity, baseValue + (double)initialCapacity); + assertTrue(vector.getValueCapacity() >= initialCapacity * 2); + int capacityAfterRealloc1 = vector.getValueCapacity(); - for (int i = initialDefaultCapacity + 1; i < (initialDefaultCapacity * 2); i++) { + for (int i = initialCapacity + 1; i < capacityAfterRealloc1; i++) { vector.setSafe(i, baseValue + (double)i); } - for (int i = 0; i < (initialDefaultCapacity * 2); i++) { + for (int i = 0; i < capacityAfterRealloc1; i++) { double value = vector.get(i); assertEquals(baseValue + (double)i, value, 0); } /* this should trigger a realloc */ - vector.setSafe(initialDefaultCapacity * 2, baseValue + (double)(initialDefaultCapacity * 2)); - assertEquals(initialDefaultCapacity * 4, vector.getValueCapacity()); + vector.setSafe(capacityAfterRealloc1, baseValue + (double)(capacityAfterRealloc1)); + assertTrue(vector.getValueCapacity() >= initialCapacity * 4); + int capacityAfterRealloc2 = vector.getValueCapacity(); - for (int i = (initialDefaultCapacity * 2) + 1; i < (initialDefaultCapacity * 4); i++) { + for (int i = capacityAfterRealloc1 + 1; i < capacityAfterRealloc2; i++) { vector.setSafe(i, baseValue + (double)i); } - for (int i = 0; i < (initialDefaultCapacity * 4); i++) { + for (int i = 0; i < capacityAfterRealloc2; i++) { double value = vector.get(i); assertEquals(baseValue + (double)i, value, 0); } @@ -997,10 +1017,10 @@ public void testReallocAfterVectorTransfer1() { /* now let's realloc the toVector */ toVector.reAlloc(); - assertEquals(initialDefaultCapacity * 8, toVector.getValueCapacity()); + assertTrue(toVector.getValueCapacity() >= initialCapacity * 8); - for (int i = 0; i < (initialDefaultCapacity * 8); i++) { - if (i < (initialDefaultCapacity * 4)) { + for (int i = 0; i < toVector.getValueCapacity(); i++) { + if (i < capacityAfterRealloc2) { assertEquals(baseValue + (double)i, toVector.get(i), 0); } else { assertTrue(toVector.isNull(i)); @@ -1014,51 +1034,53 @@ public void testReallocAfterVectorTransfer1() { @Test /* Float8Vector */ public void testReallocAfterVectorTransfer2() { try (final Float8Vector vector = new Float8Vector(EMPTY_SCHEMA_PATH, allocator)) { - final int initialDefaultCapacity = 4096; + int initialCapacity = 4096; boolean error = false; - vector.allocateNew(initialDefaultCapacity); - - assertEquals(initialDefaultCapacity, vector.getValueCapacity()); + vector.allocateNew(initialCapacity); + assertTrue(vector.getValueCapacity() >= initialCapacity); + initialCapacity = vector.getValueCapacity(); double baseValue = 100.375; - for (int i = 0; i < initialDefaultCapacity; i++) { + for (int i = 0; i < initialCapacity; i++) { vector.setSafe(i, baseValue + (double)i); } /* the above setSafe calls should not have triggered a realloc as * we are within the capacity. 
check the vector contents */ - assertEquals(initialDefaultCapacity, vector.getValueCapacity()); + assertEquals(initialCapacity, vector.getValueCapacity()); - for (int i = 0; i < initialDefaultCapacity; i++) { + for (int i = 0; i < initialCapacity; i++) { double value = vector.get(i); assertEquals(baseValue + (double)i, value, 0); } /* this should trigger a realloc */ - vector.setSafe(initialDefaultCapacity, baseValue + (double)initialDefaultCapacity); - assertEquals(initialDefaultCapacity * 2, vector.getValueCapacity()); + vector.setSafe(initialCapacity, baseValue + (double)initialCapacity); + assertTrue(vector.getValueCapacity() >= initialCapacity * 2); + int capacityAfterRealloc1 = vector.getValueCapacity(); - for (int i = initialDefaultCapacity + 1; i < (initialDefaultCapacity * 2); i++) { + for (int i = initialCapacity + 1; i < capacityAfterRealloc1; i++) { vector.setSafe(i, baseValue + (double)i); } - for (int i = 0; i < (initialDefaultCapacity * 2); i++) { + for (int i = 0; i < capacityAfterRealloc1; i++) { double value = vector.get(i); assertEquals(baseValue + (double)i, value, 0); } /* this should trigger a realloc */ - vector.setSafe(initialDefaultCapacity * 2, baseValue + (double)(initialDefaultCapacity * 2)); - assertEquals(initialDefaultCapacity * 4, vector.getValueCapacity()); + vector.setSafe(capacityAfterRealloc1, baseValue + (double)(capacityAfterRealloc1)); + assertTrue(vector.getValueCapacity() >= initialCapacity * 4); + int capacityAfterRealloc2 = vector.getValueCapacity(); - for (int i = (initialDefaultCapacity * 2) + 1; i < (initialDefaultCapacity * 4); i++) { + for (int i = capacityAfterRealloc1 + 1; i < capacityAfterRealloc2; i++) { vector.setSafe(i, baseValue + (double)i); } - for (int i = 0; i < (initialDefaultCapacity * 4); i++) { + for (int i = 0; i < capacityAfterRealloc2; i++) { double value = vector.get(i); assertEquals(baseValue + (double)i, value, 0); } @@ -1073,7 +1095,7 @@ public void testReallocAfterVectorTransfer2() { Float8Vector toVector = (Float8Vector)transferPair.getTo(); /* check toVector contents before realloc */ - for (int i = 0; i < (initialDefaultCapacity * 4); i++) { + for (int i = 0; i < toVector.getValueCapacity(); i++) { assertFalse("unexpected null value at index: " + i, toVector.isNull(i)); double value = toVector.get(i); assertEquals("unexpected value at index: " + i, baseValue + (double)i, value, 0); @@ -1081,10 +1103,10 @@ public void testReallocAfterVectorTransfer2() { /* now let's realloc the toVector and check contents again */ toVector.reAlloc(); - assertEquals(initialDefaultCapacity * 8, toVector.getValueCapacity()); + assertTrue(toVector.getValueCapacity() >= initialCapacity * 8); - for (int i = 0; i < (initialDefaultCapacity * 8); i++) { - if (i < (initialDefaultCapacity * 4)) { + for (int i = 0; i < toVector.getValueCapacity(); i++) { + if (i < capacityAfterRealloc2) { assertFalse("unexpected null value at index: " + i, toVector.isNull(i)); double value = toVector.get(i); assertEquals("unexpected value at index: " + i, baseValue + (double)i, value, 0); @@ -1103,7 +1125,7 @@ public void testReallocAfterVectorTransfer3() { /* 4096 values with 10 byte per record */ vector.allocateNew(4096 * 10, 4096); int valueCapacity = vector.getValueCapacity(); - assertEquals(4096, valueCapacity); + assertTrue(valueCapacity >= 4096); /* populate the vector */ for (int i = 0; i < valueCapacity; i++) { @@ -1125,7 +1147,10 @@ public void testReallocAfterVectorTransfer3() { /* trigger first realloc */ vector.setSafe(valueCapacity, STR2, 0, 
STR2.length); - assertEquals(valueCapacity * 2, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 2 * valueCapacity); + while (vector.getByteCapacity() < 10 * vector.getValueCapacity()) { + vector.reallocDataBuffer(); + } /* populate the remaining vector */ for (int i = valueCapacity; i < vector.getValueCapacity(); i++) { @@ -1148,7 +1173,10 @@ public void testReallocAfterVectorTransfer3() { /* trigger second realloc */ vector.setSafe(valueCapacity + 10, STR2, 0, STR2.length); - assertEquals(valueCapacity * 2, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 2 * valueCapacity); + while (vector.getByteCapacity() < 10 * vector.getValueCapacity()) { + vector.reallocDataBuffer(); + } /* populate the remaining vector */ for (int i = valueCapacity; i < vector.getValueCapacity(); i++) { @@ -1197,7 +1225,7 @@ public void testReallocAfterVectorTransfer4() { /* 4096 values */ vector.allocateNew(4096); int valueCapacity = vector.getValueCapacity(); - assertEquals(4096, valueCapacity); + assertTrue(valueCapacity >= 4096); /* populate the vector */ int baseValue = 1000; @@ -1218,7 +1246,7 @@ public void testReallocAfterVectorTransfer4() { /* trigger first realloc */ vector.setSafe(valueCapacity, 10000000); - assertEquals(valueCapacity * 2, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= valueCapacity * 2); /* populate the remaining vector */ for (int i = valueCapacity; i < vector.getValueCapacity(); i++) { @@ -1239,7 +1267,7 @@ public void testReallocAfterVectorTransfer4() { /* trigger second realloc */ vector.setSafe(valueCapacity, 10000000); - assertEquals(valueCapacity * 2, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= valueCapacity * 2); /* populate the remaining vector */ for (int i = valueCapacity; i < vector.getValueCapacity(); i++) { @@ -1288,7 +1316,8 @@ public void testReAllocFixedWidthVector() { try (final Float4Vector vector = newVector(Float4Vector.class, EMPTY_SCHEMA_PATH, MinorType.FLOAT4, allocator)) { vector.allocateNew(1024); - assertEquals(1024, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 1024); + int initialCapacity = vector.getValueCapacity(); // Put values in indexes that fall within the initial allocation vector.setSafe(0, 100.1f); @@ -1299,7 +1328,7 @@ public void testReAllocFixedWidthVector() { vector.setSafe(2000, 105.5f); // Check valueCapacity is more than initial allocation - assertEquals(1024 * 2, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 2 * initialCapacity); assertEquals(100.1f, vector.get(0), 0); assertEquals(102.3f, vector.get(100), 0); @@ -1316,24 +1345,24 @@ public void testReAllocFixedWidthVector() { @Test public void testReAllocVariableWidthVector() { try (final VarCharVector vector = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + vector.setInitialCapacity(4095); vector.allocateNew(); int initialCapacity = vector.getValueCapacity(); - assertEquals(4095, initialCapacity); + assertTrue(initialCapacity >= 4095); /* Put values in indexes that fall within the initial allocation */ vector.setSafe(0, STR1, 0, STR1.length); vector.setSafe(initialCapacity - 1, STR2, 0, STR2.length); /* the above set calls should NOT have triggered a realloc */ - initialCapacity = vector.getValueCapacity(); - assertEquals(4095, initialCapacity); + assertEquals(initialCapacity, vector.getValueCapacity()); /* Now try to put values in space that falls beyond the initial allocation */ vector.setSafe(initialCapacity + 
200, STR3, 0, STR3.length); /* Check valueCapacity is more than initial allocation */ - assertEquals(((initialCapacity + 1) * 2) - 1, vector.getValueCapacity()); + assertTrue(initialCapacity * 2 <= vector.getValueCapacity()); assertArrayEquals(STR1, vector.get(0)); assertArrayEquals(STR2, vector.get(initialCapacity - 1)); @@ -1348,20 +1377,20 @@ public void testReAllocVariableWidthVector() { @Test public void testFillEmptiesNotOverfill() { try (final VarCharVector vector = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + vector.setInitialCapacity(4095); vector.allocateNew(); int initialCapacity = vector.getValueCapacity(); - assertEquals(4095, initialCapacity); + assertTrue(initialCapacity >= 4095); vector.setSafe(4094, "hello".getBytes(), 0, 5); /* the above set method should NOT have trigerred a realloc */ - initialCapacity = vector.getValueCapacity(); - assertEquals(4095, initialCapacity); + assertEquals(initialCapacity, vector.getValueCapacity()); - vector.setValueCount(4095); - assertEquals(4096 * vector.OFFSET_WIDTH, vector.getFieldBuffers().get(1).capacity()); - initialCapacity = vector.getValueCapacity(); - assertEquals(4095, initialCapacity); + int bufSizeBefore = vector.getFieldBuffers().get(1).capacity(); + vector.setValueCount(initialCapacity); + assertEquals(bufSizeBefore, vector.getFieldBuffers().get(1).capacity()); + assertEquals(initialCapacity, vector.getValueCapacity()); } } @@ -1371,11 +1400,12 @@ public void testCopyFromWithNulls() { final VarCharVector vector2 = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + vector.setInitialCapacity(4095); vector.allocateNew(); int capacity = vector.getValueCapacity(); - assertEquals(4095, capacity); + assertTrue(capacity >= 4095); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < capacity; i++) { if (i % 3 == 0) { continue; } @@ -1384,12 +1414,11 @@ public void testCopyFromWithNulls() { } /* NO reAlloc() should have happened in setSafe() */ - capacity = vector.getValueCapacity(); - assertEquals(4095, capacity); + assertEquals(capacity, vector.getValueCapacity()); - vector.setValueCount(4095); + vector.setValueCount(capacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < capacity; i++) { if (i % 3 == 0) { assertNull(vector.getObject(i)); } else { @@ -1397,11 +1426,12 @@ public void testCopyFromWithNulls() { } } + vector2.setInitialCapacity(4095); vector2.allocateNew(); - capacity = vector2.getValueCapacity(); - assertEquals(4095, capacity); + int capacity2 = vector2.getValueCapacity(); + assertEquals(capacity2, capacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < capacity; i++) { vector2.copyFromSafe(i, i, vector); if (i % 3 == 0) { assertNull(vector2.getObject(i)); @@ -1411,12 +1441,11 @@ public void testCopyFromWithNulls() { } /* NO reAlloc() should have happened in copyFrom */ - capacity = vector2.getValueCapacity(); - assertEquals(4095, capacity); + assertEquals(capacity, vector2.getValueCapacity()); - vector2.setValueCount(4095); + vector2.setValueCount(capacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < capacity; i++) { if (i % 3 == 0) { assertNull(vector2.getObject(i)); } else { @@ -1432,11 +1461,12 @@ public void testCopyFromWithNulls1() { final VarCharVector vector2 = newVector(VarCharVector.class, EMPTY_SCHEMA_PATH, MinorType.VARCHAR, allocator)) { + vector.setInitialCapacity(4095); vector.allocateNew(); int capacity = vector.getValueCapacity(); - assertEquals(4095, capacity); + assertTrue(capacity >= 
4095); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < capacity; i++) { if (i % 3 == 0) { continue; } @@ -1445,12 +1475,11 @@ public void testCopyFromWithNulls1() { } /* NO reAlloc() should have happened in setSafe() */ - capacity = vector.getValueCapacity(); - assertEquals(4095, capacity); + assertEquals(capacity, vector.getValueCapacity()); - vector.setValueCount(4095); + vector.setValueCount(capacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < capacity; i++) { if (i % 3 == 0) { assertNull(vector.getObject(i)); } else { @@ -1463,10 +1492,11 @@ public void testCopyFromWithNulls1() { */ vector2.allocateNew(1024 * 10, 1024); - capacity = vector2.getValueCapacity(); - assertEquals(1024, capacity); + int capacity2 = vector2.getValueCapacity(); + assertTrue(capacity2 >= 1024); + assertTrue(capacity2 <= capacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < capacity; i++) { vector2.copyFromSafe(i, i, vector); if (i % 3 == 0) { assertNull(vector2.getObject(i)); @@ -1476,12 +1506,11 @@ public void testCopyFromWithNulls1() { } /* 2 reAllocs should have happened in copyFromSafe() */ - capacity = vector2.getValueCapacity(); - assertEquals(4096, capacity); + assertEquals(capacity, vector2.getValueCapacity()); - vector2.setValueCount(4095); + vector2.setValueCount(capacity); - for (int i = 0; i < 4095; i++) { + for (int i = 0; i < capacity; i++) { if (i % 3 == 0) { assertNull(vector2.getObject(i)); } else { @@ -1876,30 +1905,88 @@ public void testSetInitialCapacity() { try (final VarCharVector vector = new VarCharVector(EMPTY_SCHEMA_PATH, allocator)) { /* use the default 8 data bytes on average per element */ - vector.setInitialCapacity(4096); + int defaultCapacity = BaseValueVector.INITIAL_VALUE_ALLOCATION - 1; + vector.setInitialCapacity(defaultCapacity); vector.allocateNew(); - assertEquals(4096, vector.getValueCapacity()); - assertEquals(4096 * 8, vector.getDataBuffer().capacity()); + assertEquals(defaultCapacity, vector.getValueCapacity()); + assertEquals(BaseAllocator.nextPowerOfTwo(defaultCapacity * 8), vector.getDataBuffer().capacity()); - vector.setInitialCapacity(4096, 1); + vector.setInitialCapacity(defaultCapacity, 1); vector.allocateNew(); - assertEquals(4096, vector.getValueCapacity()); - assertEquals(4096, vector.getDataBuffer().capacity()); + assertEquals(defaultCapacity, vector.getValueCapacity()); + assertEquals(BaseAllocator.nextPowerOfTwo(defaultCapacity), vector.getDataBuffer().capacity()); - vector.setInitialCapacity(4096, 0.1); + vector.setInitialCapacity(defaultCapacity, 0.1); vector.allocateNew(); - assertEquals(4096, vector.getValueCapacity()); - assertEquals(512, vector.getDataBuffer().capacity()); + assertEquals(defaultCapacity, vector.getValueCapacity()); + assertEquals(BaseAllocator.nextPowerOfTwo((int)(defaultCapacity * 0.1)), vector.getDataBuffer().capacity()); - vector.setInitialCapacity(4096, 0.01); + vector.setInitialCapacity(defaultCapacity, 0.01); vector.allocateNew(); - assertEquals(4096, vector.getValueCapacity()); - assertEquals(64, vector.getDataBuffer().capacity()); + assertEquals(defaultCapacity, vector.getValueCapacity()); + assertEquals(BaseAllocator.nextPowerOfTwo((int)(defaultCapacity * 0.01)), vector.getDataBuffer().capacity()); vector.setInitialCapacity(5, 0.01); vector.allocateNew(); - assertEquals(7, vector.getValueCapacity()); + assertEquals(5, vector.getValueCapacity()); assertEquals(2, vector.getDataBuffer().capacity()); } } + + @Test + public void testDefaultAllocNewAll() { + int defaultCapacity = 
BaseFixedWidthVector.INITIAL_VALUE_ALLOCATION; + int expectedSize; + long beforeSize; + try (BufferAllocator childAllocator = allocator.newChildAllocator("defaultAllocs", 0, Long.MAX_VALUE); + final IntVector intVector = new IntVector(EMPTY_SCHEMA_PATH, childAllocator); + final BigIntVector bigIntVector = new BigIntVector(EMPTY_SCHEMA_PATH, childAllocator); + final BitVector bitVector = new BitVector(EMPTY_SCHEMA_PATH, childAllocator); + final DecimalVector decimalVector = new DecimalVector(EMPTY_SCHEMA_PATH, childAllocator, 38, 6); + final VarCharVector varCharVector = new VarCharVector(EMPTY_SCHEMA_PATH, childAllocator)) { + + // verify that the wastage is within bounds for IntVector. + beforeSize = childAllocator.getAllocatedMemory(); + intVector.allocateNew(); + assertTrue(intVector.getValueCapacity() >= defaultCapacity); + expectedSize = (defaultCapacity * IntVector.TYPE_WIDTH) + + BaseFixedWidthVector.getValidityBufferSizeFromCount(defaultCapacity); + assertTrue(childAllocator.getAllocatedMemory() - beforeSize <= expectedSize * 1.05); + + // verify that the wastage is within bounds for BigIntVector. + beforeSize = childAllocator.getAllocatedMemory(); + bigIntVector.allocateNew(); + assertTrue(bigIntVector.getValueCapacity() >= defaultCapacity); + expectedSize = (defaultCapacity * bigIntVector.TYPE_WIDTH) + + BaseFixedWidthVector.getValidityBufferSizeFromCount(defaultCapacity); + assertTrue(childAllocator.getAllocatedMemory() - beforeSize <= expectedSize * 1.05); + + // verify that the wastage is within bounds for DecimalVector. + beforeSize = childAllocator.getAllocatedMemory(); + decimalVector.allocateNew(); + assertTrue(decimalVector.getValueCapacity() >= defaultCapacity); + expectedSize = (defaultCapacity * decimalVector.TYPE_WIDTH) + + BaseFixedWidthVector.getValidityBufferSizeFromCount(defaultCapacity); + assertTrue(childAllocator.getAllocatedMemory() - beforeSize <= expectedSize * 1.05); + + // verify that the wastage is within bounds for VarCharVector. + // var char vector have an offsets array that is 1 less than defaultCapacity + beforeSize = childAllocator.getAllocatedMemory(); + varCharVector.allocateNew(); + assertTrue(varCharVector.getValueCapacity() >= defaultCapacity - 1); + expectedSize = (defaultCapacity * VarCharVector.OFFSET_WIDTH) + + BaseFixedWidthVector.getValidityBufferSizeFromCount(defaultCapacity) + + defaultCapacity * 8; + // wastage should be less than 5%. + assertTrue(childAllocator.getAllocatedMemory() - beforeSize <= expectedSize * 1.05); + + // verify that the wastage is within bounds for BitVector. 
+ beforeSize = childAllocator.getAllocatedMemory(); + bitVector.allocateNew(); + assertTrue(bitVector.getValueCapacity() >= defaultCapacity); + expectedSize = BaseFixedWidthVector.getValidityBufferSizeFromCount(defaultCapacity) * 2; + assertTrue(childAllocator.getAllocatedMemory() - beforeSize <= expectedSize * 1.05); + + } + } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java index 5474675fbf343..60747aaad92ce 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java @@ -19,6 +19,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; import java.nio.charset.StandardCharsets; @@ -54,20 +55,21 @@ public void testFixedType() { vector.setInitialCapacity(512); vector.allocateNew(); - assertEquals(512, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 512); + int initialCapacity = vector.getValueCapacity(); try { - vector.set(512, 0); + vector.set(initialCapacity, 0); Assert.fail("Expected out of bounds exception"); } catch (Exception e) { // ok } vector.reAlloc(); - assertEquals(1024, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 2 * initialCapacity); - vector.set(512, 100); - assertEquals(100, vector.get(512)); + vector.set(initialCapacity, 100); + assertEquals(100, vector.get(initialCapacity)); } } @@ -77,20 +79,21 @@ public void testNullableType() { vector.setInitialCapacity(512); vector.allocateNew(); - assertEquals(512, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 512); + int initialCapacity = vector.getValueCapacity(); try { - vector.set(512, "foo".getBytes(StandardCharsets.UTF_8)); + vector.set(initialCapacity, "foo".getBytes(StandardCharsets.UTF_8)); Assert.fail("Expected out of bounds exception"); } catch (Exception e) { // ok } vector.reAlloc(); - assertEquals(1024, vector.getValueCapacity()); + assertTrue(vector.getValueCapacity() >= 2 * initialCapacity); - vector.set(512, "foo".getBytes(StandardCharsets.UTF_8)); - assertEquals("foo", new String(vector.get(512), StandardCharsets.UTF_8)); + vector.set(initialCapacity, "foo".getBytes(StandardCharsets.UTF_8)); + assertEquals("foo", new String(vector.get(initialCapacity), StandardCharsets.UTF_8)); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java index b7215ce4e2e68..61c1b924f664d 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java @@ -974,11 +974,16 @@ public void testSingleStructWriter1() { Float4Vector float4Vector = (Float4Vector)parent.getChild("float4Field"); Float8Vector float8Vector = (Float8Vector)parent.getChild("float8Field"); - assertEquals(initialCapacity, singleStructWriter.getValueCapacity()); - assertEquals(initialCapacity, intVector.getValueCapacity()); - assertEquals(initialCapacity, bigIntVector.getValueCapacity()); - assertEquals(initialCapacity, float4Vector.getValueCapacity()); - assertEquals(initialCapacity, float8Vector.getValueCapacity()); + int capacity = singleStructWriter.getValueCapacity(); + assertTrue(capacity >= initialCapacity && capacity < initialCapacity * 
2); + capacity = intVector.getValueCapacity(); + assertTrue(capacity >= initialCapacity && capacity < initialCapacity * 2); + capacity = bigIntVector.getValueCapacity(); + assertTrue(capacity >= initialCapacity && capacity < initialCapacity * 2); + capacity = float4Vector.getValueCapacity(); + assertTrue(capacity >= initialCapacity && capacity < initialCapacity * 2); + capacity = float8Vector.getValueCapacity(); + assertTrue(capacity >= initialCapacity && capacity < initialCapacity * 2); StructReader singleStructReader = new SingleStructReaderImpl(parent); From cec75410b78b70b30bd57908d920c006d9101b72 Mon Sep 17 00:00:00 2001 From: Yosuke Shiro Date: Wed, 9 Jan 2019 13:35:05 +0900 Subject: [PATCH 182/328] ARROW-4199: [GLib] Add garrow_seekable_input_stream_peek() Author: Yosuke Shiro Author: Kouhei Sutou Closes #3351 from shiro615/glib-support-peek and squashes the following commits: 1f445764 Improve document a5f0fdfd Add GARROW_AVAILABLE_IN_0_12 b27c0a04 Use g_bytes_new_static to avoid copying the data f9d9f237 Add support for Peek to InputStream --- c_glib/arrow-glib/input-stream.cpp | 24 ++++++++++++++++++++++++ c_glib/arrow-glib/input-stream.h | 3 +++ c_glib/test/test-buffer-input-stream.rb | 8 ++++++++ 3 files changed, 35 insertions(+) diff --git a/c_glib/arrow-glib/input-stream.cpp b/c_glib/arrow-glib/input-stream.cpp index cb36e49067ac9..cb1fb3b04a68e 100644 --- a/c_glib/arrow-glib/input-stream.cpp +++ b/c_glib/arrow-glib/input-stream.cpp @@ -325,6 +325,30 @@ garrow_seekable_input_stream_read_at(GArrowSeekableInputStream *input_stream, } +/** + * garrow_seekable_input_stream_peek: + * @input_stream: A #GArrowSeekableInputStream. + * @n_bytes: The number of bytes to be peeked. + * + * Returns: (transfer full): The data of the buffer, up to the + * indicated number. The data becomes invalid after any operation on + * the stream. If the stream is unbuffered, the data is empty. + * + * It should be freed with g_bytes_unref() when no longer needed. 
+ * + * Since: 0.12.0 + */ +GBytes * +garrow_seekable_input_stream_peek(GArrowSeekableInputStream *input_stream, + gint64 n_bytes) +{ + auto arrow_random_access_file = + garrow_seekable_input_stream_get_raw(input_stream); + auto string_view = arrow_random_access_file->Peek(n_bytes); + return g_bytes_new_static(string_view.data(), string_view.size()); +} + + typedef struct GArrowBufferInputStreamPrivate_ { GArrowBuffer *buffer; } GArrowBufferInputStreamPrivate; diff --git a/c_glib/arrow-glib/input-stream.h b/c_glib/arrow-glib/input-stream.h index 9deebd717363b..745b912749eb6 100644 --- a/c_glib/arrow-glib/input-stream.h +++ b/c_glib/arrow-glib/input-stream.h @@ -66,6 +66,9 @@ GArrowBuffer *garrow_seekable_input_stream_read_at(GArrowSeekableInputStream *in gint64 position, gint64 n_bytes, GError **error); +GARROW_AVAILABLE_IN_0_12 +GBytes *garrow_seekable_input_stream_peek(GArrowSeekableInputStream *input_stream, + gint64 n_bytes); #define GARROW_TYPE_BUFFER_INPUT_STREAM \ diff --git a/c_glib/test/test-buffer-input-stream.rb b/c_glib/test/test-buffer-input-stream.rb index f5a0132d2da98..cb6a667b3b7c0 100644 --- a/c_glib/test/test-buffer-input-stream.rb +++ b/c_glib/test/test-buffer-input-stream.rb @@ -39,4 +39,12 @@ def test_align read_buffer = buffer_input_stream.read(3) assert_equal("rld", read_buffer.data.to_s) end + + def test_peek + buffer = Arrow::Buffer.new("Hello World") + buffer_input_stream = Arrow::BufferInputStream.new(buffer) + peeked_data = buffer_input_stream.peek(5) + assert_equal(buffer_input_stream.read(5).data.to_s, + peeked_data.to_s) + end end From 090a8c020611b2f75ec0e36d765cc6d48adbe9a7 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 8 Jan 2019 22:59:00 -0600 Subject: [PATCH 183/328] ARROW-4200: [C++/Python] Enable conda_env_python.yml to work on Windows, simplify python/development.rst I also removed nomkl from conda_env_python.yml. 
It's sort of a developer decision whether or not they want to install the MKL -- we shouldn't force them to _not_ have it Author: Wes McKinney Closes #3353 from wesm/ARROW-4200 and squashes the following commits: 4849a326d Accept bkietz suggestions 576e63b27 Also add nomkl to python/Dockerfile 9b39e8300 Get conda env files working on Windows, small cleaning to Python development instructions --- ci/conda_env_python.yml | 2 -- ci/conda_env_unix.yml | 1 + ci/travis_script_python.sh | 1 + docs/source/python/development.rst | 23 +++++++---------------- python/Dockerfile | 1 + 5 files changed, 10 insertions(+), 18 deletions(-) diff --git a/ci/conda_env_python.yml b/ci/conda_env_python.yml index d3756cbcfa8c9..b51f5c32f3297 100644 --- a/ci/conda_env_python.yml +++ b/ci/conda_env_python.yml @@ -18,10 +18,8 @@ cython cloudpickle hypothesis -nomkl numpy pandas pytest -rsync setuptools setuptools_scm diff --git a/ci/conda_env_unix.yml b/ci/conda_env_unix.yml index eeb90e48dce72..9ecf549b504eb 100644 --- a/ci/conda_env_unix.yml +++ b/ci/conda_env_unix.yml @@ -18,3 +18,4 @@ # conda package dependencies specific to Unix-like environments (Linux and macOS) autoconf +rsync diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 69e115a9dcce7..e9a112275502e 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -47,6 +47,7 @@ fi conda create -y -q -p $CONDA_ENV_DIR \ --file $TRAVIS_BUILD_DIR/ci/conda_env_python.yml \ + nomkl \ cmake \ pip \ numpy=1.13.1 \ diff --git a/docs/source/python/development.rst b/docs/source/python/development.rst index 0bc1c62b4af18..d85537110e48c 100644 --- a/docs/source/python/development.rst +++ b/docs/source/python/development.rst @@ -86,18 +86,9 @@ On Linux and OSX: --file arrow/ci/conda_env_python.yml \ python=3.6 - source activate pyarrow-dev + conda activate pyarrow-dev -On Windows: - -.. code-block:: shell - - conda create -y -n pyarrow-dev -c conda-forge ^ - --file arrow\ci\conda_env_cpp.yml ^ - --file arrow\ci\conda_env_python.yml ^ - python=3.6 - - activate pyarrow-dev +For Windows, see the `Developing on Windows`_ section below. We need to set some environment variables to let Arrow's build system know about our build toolchain: @@ -310,11 +301,11 @@ First, starting from fresh clones of Apache Arrow: .. code-block:: shell - conda create -y -q -n pyarrow-dev ^ - python=3.6 numpy six setuptools cython pandas pytest ^ - cmake flatbuffers rapidjson boost-cpp thrift-cpp snappy zlib ^ - gflags brotli lz4-c zstd -c conda-forge - activate pyarrow-dev + conda create -y -n pyarrow-dev -c conda-forge ^ + --file arrow\ci\conda_env_cpp.yml ^ + --file arrow\ci\conda_env_python.yml ^ + python=3.7 + conda activate pyarrow-dev Now, we build and install Arrow C++ libraries diff --git a/python/Dockerfile b/python/Dockerfile index a99a4206290f8..ecabc94493cf0 100644 --- a/python/Dockerfile +++ b/python/Dockerfile @@ -21,6 +21,7 @@ FROM arrow:cpp ARG PYTHON_VERSION=3.6 ADD ci/conda_env_python.yml /arrow/ci/ RUN conda install -c conda-forge \ + nomkl \ --file arrow/ci/conda_env_python.yml \ python=$PYTHON_VERSION && \ conda clean --all From af925d9395bd8f5cf435f379e389633bd3acfdfd Mon Sep 17 00:00:00 2001 From: Dmitry Vukolov Date: Wed, 9 Jan 2019 13:58:48 +0100 Subject: [PATCH 184/328] ARROW-2038: [Python] Strip s3:// scheme in S3FSWrapper isdir() and isfile() This fixes an exception from ParquetDataset arising when the supplied path contains the `s3://` scheme specifier. 
The issue stemmed from the fact that while the underlying S3FileSystem does support both types of paths, with and without an explicit `s3://`, its function calls always return paths stripped of the scheme. This messed up the logic in isdir() and isfile(). An alternative solution would be to strip the scheme in parquet.py (by adding it to _URI_STRIP_SCHEMES). This however would require additional code changes along the lines of: ```python _URI_STRIP_SCHEMES = ('hdfs', 's3') def _parse_uri(path): path = _stringify_path(path) parsed_uri = urlparse(path) if parsed_uri.scheme in _URI_STRIP_SCHEMES: scheme = '{0}://'.format(parsed_uri.scheme) path = parsed_uri.geturl().replace(scheme, '', 1) return path else: # ARROW-4073: On Windows returning the path with the scheme # stripped removes the drive letter, if any return path ``` Not sure if that would have any impact on handling HDFS. Therefore this patch proposes a safer, more localised approach, already used in other parts of S3FSWrapper. Author: Dmitry Vukolov Closes #3286 from dvukolov/master and squashes the following commits: 8de916c5 Strip s3:// scheme in S3FSWrapper isdir() and isfile() --- python/pyarrow/filesystem.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/filesystem.py b/python/pyarrow/filesystem.py index 98efb1e3ec374..92a65ce69892a 100644 --- a/python/pyarrow/filesystem.py +++ b/python/pyarrow/filesystem.py @@ -319,7 +319,7 @@ class S3FSWrapper(DaskFileSystem): @implements(FileSystem.isdir) def isdir(self, path): - path = _stringify_path(path) + path = _sanitize_s3(_stringify_path(path)) try: contents = self.fs.ls(path) if len(contents) == 1 and contents[0] == path: @@ -331,7 +331,7 @@ def isdir(self, path): @implements(FileSystem.isfile) def isfile(self, path): - path = _stringify_path(path) + path = _sanitize_s3(_stringify_path(path)) try: contents = self.fs.ls(path) return len(contents) == 1 and contents[0] == path @@ -345,7 +345,7 @@ def walk(self, path, refresh=False): Generator version of what is in s3fs, which yields a flattened list of files """ - path = _stringify_path(path).replace('s3://', '') + path = _sanitize_s3(_stringify_path(path)) directories = set() files = set() @@ -371,6 +371,13 @@ def walk(self, path, refresh=False): yield tup +def _sanitize_s3(path): + if path.startswith('s3://'): + return path.replace('s3://', '') + else: + return path + + def _ensure_filesystem(fs): fs_type = type(fs) From 361285d86c345b3943eee8e63d3f9a782e7bf6da Mon Sep 17 00:00:00 2001 From: Pindikura Ravindra Date: Wed, 9 Jan 2019 10:09:48 -0600 Subject: [PATCH 185/328] ARROW-4209: [Gandiva] Avoid struct return param in IR Author: Pindikura Ravindra Closes #3356 from pravindra/struct and squashes the following commits: f437acd0 ARROW-4209: Avoid struct return param in IR --- cpp/src/gandiva/decimal_ir.cc | 30 ++++++++----------- .../gandiva/precompiled/decimal_wrapper.cc | 20 +++++-------- 2 files changed, 20 insertions(+), 30 deletions(-) diff --git a/cpp/src/gandiva/decimal_ir.cc b/cpp/src/gandiva/decimal_ir.cc index 38b35a64b293f..d10158a6f0487 100644 --- a/cpp/src/gandiva/decimal_ir.cc +++ b/cpp/src/gandiva/decimal_ir.cc @@ -218,27 +218,23 @@ DecimalIR::ValueWithOverflow DecimalIR::AddWithOverflowCheck(const ValueFull& x, // This is pretty complex, so use CPP fns.
llvm::Value* DecimalIR::AddLarge(const ValueFull& x, const ValueFull& y, const ValueFull& out) { - std::vector args; - + auto block = ir_builder()->GetInsertBlock(); + auto out_high_ptr = new llvm::AllocaInst(types()->i64_type(), 0, "out_hi", block); + auto out_low_ptr = new llvm::AllocaInst(types()->i64_type(), 0, "out_low", block); auto x_split = ValueSplit::MakeFromInt128(this, x.value()); - args.push_back(x_split.high()); - args.push_back(x_split.low()); - args.push_back(x.precision()); - args.push_back(x.scale()); - auto y_split = ValueSplit::MakeFromInt128(this, y.value()); - args.push_back(y_split.high()); - args.push_back(y_split.low()); - args.push_back(y.precision()); - args.push_back(y.scale()); - args.push_back(out.precision()); - args.push_back(out.scale()); - - auto split = ir_builder()->CreateCall( - module()->getFunction("add_large_decimal128_decimal128"), args); + std::vector args = { + x_split.high(), x_split.low(), x.precision(), x.scale(), + y_split.high(), y_split.low(), y.precision(), y.scale(), + out.precision(), out.scale(), out_high_ptr, out_low_ptr, + }; + ir_builder()->CreateCall(module()->getFunction("add_large_decimal128_decimal128"), + args); - auto sum = ValueSplit::MakeFromStruct(this, split).AsInt128(this); + auto out_high = ir_builder()->CreateLoad(out_high_ptr); + auto out_low = ir_builder()->CreateLoad(out_low_ptr); + auto sum = ValueSplit(out_high, out_low).AsInt128(this); ADD_TRACE_128("AddLarge : sum", sum); return sum; } diff --git a/cpp/src/gandiva/precompiled/decimal_wrapper.cc b/cpp/src/gandiva/precompiled/decimal_wrapper.cc index fdc751f7fe87c..0118100971220 100644 --- a/cpp/src/gandiva/precompiled/decimal_wrapper.cc +++ b/cpp/src/gandiva/precompiled/decimal_wrapper.cc @@ -20,24 +20,18 @@ extern "C" { -/// TODO : Passing around structs in IR can be fragile due to c-abi compatibility issues. -/// This seems to work for now, but will need to revisit if we hit issues. -struct DecimalSplit { - int64_t high_bits; - uint64_t low_bits; -}; - FORCE_INLINE -DecimalSplit add_large_decimal128_decimal128(int64_t x_high, uint64_t x_low, - int32_t x_precision, int32_t x_scale, - int64_t y_high, uint64_t y_low, - int32_t y_precision, int32_t y_scale, - int32_t out_precision, int32_t out_scale) { +void add_large_decimal128_decimal128(int64_t x_high, uint64_t x_low, int32_t x_precision, + int32_t x_scale, int64_t y_high, uint64_t y_low, + int32_t y_precision, int32_t y_scale, + int32_t out_precision, int32_t out_scale, + int64_t* out_high, uint64_t* out_low) { gandiva::Decimal128Full x(x_high, x_low, x_precision, x_scale); gandiva::Decimal128Full y(y_high, y_low, y_precision, y_scale); arrow::Decimal128 out = gandiva::decimalops::Add(x, y, out_precision, out_scale); - return DecimalSplit{out.high_bits(), out.low_bits()}; + *out_high = out.high_bits(); + *out_low = out.low_bits(); } } // extern "C" From bcfacaafcb181a39d43dbb3d0540c018a5afe157 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 9 Jan 2019 23:12:31 +0100 Subject: [PATCH 186/328] ARROW-3233: [Python] Add prose documentation for CUDA support It will be harder to add generated API docs without requiring CUDA support on the machine building the docs. 
Author: Antoine Pitrou Closes #3359 from pitrou/ARROW-3233-pyarrow-cuda-doc and squashes the following commits: 40b63f0f ARROW-3233: Add prose documentation for CUDA support --- docs/source/python/cuda.rst | 159 ++++++++++++++++++++++++++++++++++ docs/source/python/index.rst | 1 + docs/source/python/memory.rst | 3 + 3 files changed, 163 insertions(+) create mode 100644 docs/source/python/cuda.rst diff --git a/docs/source/python/cuda.rst b/docs/source/python/cuda.rst new file mode 100644 index 0000000000000..b0150c1c5c8a2 --- /dev/null +++ b/docs/source/python/cuda.rst @@ -0,0 +1,159 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow.cuda + +CUDA Integration +================ + +Arrow is not limited to CPU buffers (located in the computer's main memory, +also named "host memory"). It also has provisions for accessing buffers +located on a CUDA-capable GPU device (in "device memory"). + +.. note:: + This functionality is optional and must have been enabled at build time. + If this is not done by your package manager, you might have to build Arrow + yourself. + +CUDA Contexts +------------- + +A CUDA context represents access to a particular CUDA-capable device. +For example, this is creating a CUDA context accessing CUDA device number 0:: + + >>> from pyarrow import cuda + >>> ctx = cuda.Context(0) + >>> + +CUDA Buffers +------------ + +A CUDA buffer can be created by copying data from host memory to the memory +of a CUDA device, using the :meth:`Context.buffer_from_data` method. +The source data can be any Python buffer-like object, including Arrow buffers:: + + >>> import numpy as np + >>> arr = np.arange(4, dtype=np.int32) + >>> arr.nbytes + 16 + >>> cuda_buf = ctx.buffer_from_data(arr) + >>> type(cuda_buf) + pyarrow._cuda.CudaBuffer + >>> cuda_buf.size # The buffer's size in bytes + 16 + >>> cuda_buf.address # The buffer's address in device memory + 30088364544 + >>> cuda_buf.context.device_number + 0 + +Conversely, you can copy back a CUDA buffer to host memory, getting a regular +CPU buffer:: + + >>> buf = cuda_buf.copy_to_host() + >>> type(buf) + pyarrow.lib.Buffer + >>> np.frombuffer(buf, dtype=np.int32) + array([0, 1, 2, 3], dtype=int32) + +.. warning:: + Many Arrow functions expect a CPU buffer but will not check the buffer's + actual type. You will get a crash if you pass a CUDA buffer to such a + function:: + + >>> pa.py_buffer(b"x" * 16).equals(cuda_buf) + Segmentation fault + +Numba Integration +----------------- + +There is not much you can do directly with Arrow CUDA buffers from Python, +but they support interoperation with `Numba `_, +a JIT compiler which can turn Python code into optimized CUDA kernels.
+ +Arrow to Numba +~~~~~~~~~~~~~~ + +First let's define a Numba CUDA kernel operating on an ``int32`` array. Here, +we will simply increment each array element (assuming the array is writable):: + + import numba.cuda + + @numba.cuda.jit + def increment_by_one(an_array): + pos = numba.cuda.grid(1) + if pos < an_array.size: + an_array[pos] += 1 + +Then we need to wrap our CUDA buffer into a Numba "device array" with the right +array metadata (shape, strides and datatype). This is necessary so that Numba +can identify the array's characteristics and compile the kernel with the +appropriate type declarations. + +In this case the metadata can simply be got from the original Numpy array. +Note the GPU data isn't copied, just pointed to:: + + >>> from numba.cuda.cudadrv.devicearray import DeviceNDArray + >>> device_arr = DeviceNDArray(arr.shape, arr.strides, arr.dtype, gpu_data=cuda_buf.to_numba()) + +(ideally we could have defined an Arrow array in CPU memory, copied it to CUDA +memory without losing type information, and then invoked the Numba kernel on it +without constructing the DeviceNDArray by hand; this is not yet possible) + +Finally we can run the Numba CUDA kernel on the Numba device array (here +with a 16x16 grid size):: + + >>> increment_by_one[16, 16](device_arr) + +And the results can be checked by copying back the CUDA buffer to CPU memory:: + + >>> np.frombuffer(cuda_buf.copy_to_host(), dtype=np.int32) + array([1, 2, 3, 4], dtype=int32) + +Numba to Arrow +~~~~~~~~~~~~~~ + +Conversely, a Numba-created device array can be viewed as an Arrow CUDA buffer, +using the :meth:`CudaBuffer.from_numba` factory method. + +For the sake of example, let's first create a Numba device array:: + + >>> arr = np.arange(10, 14, dtype=np.int32) + >>> arr + array([10, 11, 12, 13], dtype=int32) + >>> device_arr = numba.cuda.to_device(arr) + +Then we can create a CUDA buffer pointing the device array's memory. +We don't need to pass a CUDA context explicitly this time: the appropriate +CUDA context is automatically retrieved and adapted from the Numba object. + +:: + + >>> cuda_buf = cuda.CudaBuffer.from_numba(device_arr.gpu_data) + >>> cuda_buf.size + 16 + >>> cuda_buf.address + 30088364032 + >>> cuda_buf.context.device_number + 0 + +Of course, we can copy the CUDA buffer back to host memory:: + + >>> np.frombuffer(cuda_buf.copy_to_host(), dtype=np.int32) + array([10, 11, 12, 13], dtype=int32) + +.. seealso:: + Documentation for Numba's `CUDA support `_. diff --git a/docs/source/python/index.rst b/docs/source/python/index.rst index fe04a73f32ef2..9f96771494c79 100644 --- a/docs/source/python/index.rst +++ b/docs/source/python/index.rst @@ -43,6 +43,7 @@ files into Arrow structures. pandas csv parquet + cuda extending api development diff --git a/docs/source/python/memory.rst b/docs/source/python/memory.rst index 0d30866d0aa4d..ba66807b38a8e 100644 --- a/docs/source/python/memory.rst +++ b/docs/source/python/memory.rst @@ -109,6 +109,9 @@ the buffer is garbaged-collected, all of the memory is freed: buf = None pa.total_allocated_bytes() +.. seealso:: + On-GPU buffers using Arrow's optional :doc:`CUDA integration `. + Input and Output ================ From 3330d660643a034168b472b52aebfe0fea84b8cf Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 9 Jan 2019 16:14:25 -0600 Subject: [PATCH 187/328] ARROW-4118: [Python] Fix benchmark setup for "asv run" "conda activate" unfortunately isn't available from a non-interactive shell, and running bash as interactive doesn't look like a workable solution. 
Also fix a setup slowness issue in the Parquet benchmarks, and fix a C++ ABI issue by downloading packages from Anaconda rather than conda-forge. Author: Antoine Pitrou Closes #3357 from pitrou/ARROW-4118-fix-asv-run and squashes the following commits: b07b68e61 ARROW-4118: Fix benchmark setup for "asv run" --- docs/source/python/benchmarks.rst | 24 +++++++++++++----------- python/asv-build.sh | 17 ++++++++++++----- python/asv.conf.json | 4 +++- python/benchmarks/parquet.py | 16 +++++++++------- 4 files changed, 37 insertions(+), 24 deletions(-) diff --git a/docs/source/python/benchmarks.rst b/docs/source/python/benchmarks.rst index 7672294a4eddf..12205c57355bb 100644 --- a/docs/source/python/benchmarks.rst +++ b/docs/source/python/benchmarks.rst @@ -19,35 +19,37 @@ Benchmarks ========== The ``pyarrow`` package comes with a suite of benchmarks meant to -run with `asv`_. You'll need to install the ``asv`` package first +run with `ASV`_. You'll need to install the ``asv`` package first (``pip install asv`` or ``conda install -c conda-forge asv``). -The benchmarks are run using `asv`_ which is also their only requirement. - Running the benchmarks ---------------------- -To run the benchmarks, call ``asv run --python=same``. You cannot use the -plain ``asv run`` command at the moment as asv cannot handle python packages -in subdirectories of a repository. +To run the benchmarks for a locally-built Arrow, run ``asv dev`` or +``asv run --python=same``. -Running with arbitrary revisions --------------------------------- +Running for arbitrary Git revisions +----------------------------------- ASV allows to store results and generate graphs of the benchmarks over -the project's evolution. For this you have the latest development version of ASV: +the project's evolution. You need to have the latest development version of ASV: .. code:: pip install git+https://github.com/airspeed-velocity/asv +The build scripts assume that Conda's ``activate`` script is on the PATH +(the ``conda activate`` command unfortunately isn't available from +non-interactive scripts). + Now you should be ready to run ``asv run`` or whatever other command -suits your needs. +suits your needs. Note that this can be quite long, as each Arrow needs +to be rebuilt for each Git revision you're running the benchmarks for. Compatibility ------------- We only expect the benchmarking setup to work with Python 3.6 or later, -on a Unix-like system. +on a Unix-like system with bash. .. 
_asv: https://asv.readthedocs.org/ diff --git a/python/asv-build.sh b/python/asv-build.sh index 7b55456394dcd..90c7872cc2b8d 100755 --- a/python/asv-build.sh +++ b/python/asv-build.sh @@ -21,7 +21,9 @@ set -e # ASV doesn't activate its conda environment for us if [ -z "$ASV_ENV_DIR" ]; then exit 1; fi -conda activate $ASV_ENV_DIR +# Avoid "conda activate" because it's only set up in interactive shells +# (https://github.com/conda/conda/issues/8072) +source activate $ASV_ENV_DIR echo "== Conda Prefix for benchmarks: " $CONDA_PREFIX " ==" # Build Arrow C++ libraries @@ -32,6 +34,8 @@ export ORC_HOME=$CONDA_PREFIX export PROTOBUF_HOME=$CONDA_PREFIX export BOOST_ROOT=$CONDA_PREFIX +export CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=1" + pushd ../cpp mkdir -p build pushd build @@ -40,9 +44,11 @@ cmake -GNinja \ -DCMAKE_BUILD_TYPE=release \ -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ -DARROW_CXXFLAGS=$CXXFLAGS \ - -DARROW_PYTHON=ON \ - -DARROW_PLASMA=ON \ - -DARROW_BUILD_TESTS=OFF \ + -DARROW_USE_GLOG=off \ + -DARROW_PARQUET=on \ + -DARROW_PYTHON=on \ + -DARROW_PLASMA=on \ + -DARROW_BUILD_TESTS=off \ .. cmake --build . --target install @@ -52,7 +58,8 @@ popd # Build pyarrow wrappers export SETUPTOOLS_SCM_PRETEND_VERSION=0.0.1 export PYARROW_BUILD_TYPE=release -export PYARROW_PARALLEL=4 +export PYARROW_PARALLEL=8 +export PYARROW_WITH_PARQUET=1 export PYARROW_WITH_PLASMA=1 python setup.py clean diff --git a/python/asv.conf.json b/python/asv.conf.json index 40938ee713b08..09031c833035d 100644 --- a/python/asv.conf.json +++ b/python/asv.conf.json @@ -35,6 +35,7 @@ // of the repository. "repo_subdir": "python", + // Custom build commands for Arrow. "build_command": ["/bin/bash {build_dir}/asv-build.sh"], "install_command": ["/bin/bash {build_dir}/asv-install.sh"], "uninstall_command": ["/bin/bash {build_dir}/asv-uninstall.sh"], @@ -56,7 +57,8 @@ // determined by looking for tools on the PATH environment // variable. "environment_type": "conda", - "conda_channels": ["conda-forge", "defaults"], + // Avoid conda-forge to avoid C++ ABI issues + "conda_channels": ["defaults"], // the base URL to show a commit for the project. "show_commit_url": "https://github.com/apache/arrow/commit/", diff --git a/python/benchmarks/parquet.py b/python/benchmarks/parquet.py index fd617934e8baf..4f555872a1550 100644 --- a/python/benchmarks/parquet.py +++ b/python/benchmarks/parquet.py @@ -15,11 +15,12 @@ # specific language governing permissions and limitations # under the License. 
-import pandas as pd -import random import shutil import tempfile +import numpy as np +import pandas as pd + import pyarrow as pa try: import pyarrow.parquet as pq @@ -38,18 +39,19 @@ class ParquetManifestCreation(object): def setup(self, num_partitions, num_threads): if pq is None: - raise NotImplementedError + raise NotImplementedError("Parquet support not enabled") self.tmpdir = tempfile.mkdtemp('benchmark_parquet') - num1 = [random.choice(range(0, num_partitions)) - for _ in range(self.size)] - num2 = [random.choice(range(0, 1000)) for _ in range(self.size)] + rnd = np.random.RandomState(42) + num1 = rnd.randint(0, num_partitions, size=self.size) + num2 = rnd.randint(0, 1000, size=self.size) output_df = pd.DataFrame({'num1': num1, 'num2': num2}) output_table = pa.Table.from_pandas(output_df) pq.write_to_dataset(output_table, self.tmpdir, ['num1']) def teardown(self, num_partitions, num_threads): - shutil.rmtree(self.tmpdir) + if self.tmpdir is not None: + shutil.rmtree(self.tmpdir) def time_manifest_creation(self, num_partitions, num_threads): pq.ParquetManifest(self.tmpdir, metadata_nthreads=num_threads) From 6b496f7c1929a0a371fe708ae653228a9e722150 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 9 Jan 2019 16:16:40 -0600 Subject: [PATCH 188/328] ARROW-3997: [Documentation] Clarify dictionary index type Mandate signed integers for dictionary index types, without constraining integer width. Author: Antoine Pitrou Closes #3355 from pitrou/ARROW-3997-dictionary-encoding-doc and squashes the following commits: 4e05e2642 ARROW-3997: Clarify dictionary index type --- docs/source/format/Layout.rst | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/docs/source/format/Layout.rst b/docs/source/format/Layout.rst index 69cbf0654900a..f3e5290ec1803 100644 --- a/docs/source/format/Layout.rst +++ b/docs/source/format/Layout.rst @@ -614,13 +614,13 @@ Dictionary encoding ------------------- When a field is dictionary encoded, the values are represented by an array of -Int32 representing the index of the value in the dictionary. The Dictionary is -received as one or more DictionaryBatches with the id referenced by a -dictionary attribute defined in the metadata (Message.fbs) in the Field -table. The dictionary has the same layout as the type of the field would -dictate. Each entry in the dictionary can be accessed by its index in the -DictionaryBatches. When a Schema references a Dictionary id, it must send at -least one DictionaryBatch for this id. +signed integers representing the index of the value in the dictionary. +The Dictionary is received as one or more DictionaryBatches with the id +referenced by a dictionary attribute defined in the metadata (Message.fbs) +in the Field table. The dictionary has the same layout as the type of the +field would dictate. Each entry in the dictionary can be accessed by its +index in the DictionaryBatches. When a Schema references a Dictionary id, +it must send at least one DictionaryBatch for this id. 
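As a purely illustrative aside (an editor's sketch, not part of this specification), the
Python bindings expose this encoding through ``pyarrow.DictionaryArray``; the index
array below uses ``int8``, but any signed integer width may be chosen: ::

    import pyarrow as pa

    # Minimal sketch: the indices are signed integers (int8 here, but any
    # signed width is allowed); the dictionary itself has the layout of the
    # field's logical type (a string array in this sketch).
    indices = pa.array([0, 1, 0, 2, 1], type=pa.int8())
    dictionary = pa.array(['a', 'b', 'c'])
    dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)

    print(dict_array.type)        # e.g. dictionary<values=string, indices=int8, ...>
    print(dict_array.indices)     # the signed-integer index array
    print(dict_array.dictionary)  # the dictionary values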
As an example, you could have the following data: ::

@@ -640,16 +640,17 @@ As an example, you could have the following data: ::
 In dictionary-encoded form, this could appear as: ::
 data List (dictionary-encoded, dictionary id i)
- indices: [0, 0, 0, 1, 1, 1, 0]
+ type: Int32
+ values:
+ [0, 0, 0, 1, 1, 1, 0]
 dictionary i
-
- type: List
-
- [
- ['a', 'b'],
- ['c', 'd', 'e'],
- ]
+ type: List
+ values:
+ [
+ ['a', 'b'],
+ ['c', 'd', 'e'],
+ ]
 References
----------

From b8aeb79e94a5a507aeec55d0b6c6bf5d7f0100b2 Mon Sep 17 00:00:00 2001
From: Kenta Murata
Date: Wed, 9 Jan 2019 16:18:19 -0600
Subject: [PATCH 189/328] ARROW-854: [Format] Add tentative SparseTensor format

I'm interested in making a language-agnostic sparse tensor format. I believe
Apache Arrow is one of the most suitable places to do this, so let me propose
my idea here.

First of all, my investigation found that there is no common memory layout for
sparse tensor representations. This means some kind of conversion is needed to
share sparse tensors among different systems, even when the data is logically
the same. It is the same situation as with dataframes, and this is the reason
I believe Apache Arrow is the suitable place.

There are many formats for representing a sparse tensor. Most of them are
specialized for matrices, which have two dimensions; only a few handle general
sparse tensors with more than two dimensions. I think the COO format is a
suitable starting point because COO can handle any number of dimensions, and
many systems support it. In my investigation, the systems that support COO are
SciPy, dask, pydata/sparse, TensorFlow, and PyTorch. Additionally, the CSR
format for matrices may also be worth supporting from the start. The reason is
that CSR is efficient for extracting row slices, which may be important for
extracting samples from tidy data, and it is supported by SciPy, MXNet, and
R's Matrix library.

I have added my prototype definition of the SparseTensor format in this pull
request. I designed this prototype format to be extensible so that we can
support additional sparse formats. I think we will at least need to support
additional sparse tensor formats for more than two dimensions besides COO, so
we will need this extensibility.
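For readers unfamiliar with these two layouts, here is a small editorial sketch
(using SciPy only because it is one of the libraries mentioned above); it merely
illustrates what COO and CSR store and is not the proposed Arrow metadata: ::

    import numpy as np
    from scipy import sparse

    dense = np.array([[1, 0, 2],
                      [0, 0, 3],
                      [4, 0, 0]])

    # COO: one (row, column) coordinate per non-zero value; the scheme
    # generalizes naturally to more than two dimensions.
    coo = sparse.coo_matrix(dense)
    print(coo.row)      # [0 0 1 2]
    print(coo.col)      # [0 2 2 0]
    print(coo.data)     # [1 2 3 4]

    # CSR: row pointers (indptr), column indices and values; row i occupies
    # the half-open range indptr[i]:indptr[i+1], which makes row slicing cheap.
    csr = sparse.csr_matrix(dense)
    print(csr.indptr)   # [0 2 3 4]
    print(csr.indices)  # [0 2 2 0]
    print(csr.data)     # [1 2 3 4]

The row-pointer structure is what makes CSR attractive as the second format:
extracting a row slice only needs two adjacent indptr entries.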
Author: Kenta Murata Closes #2546 from mrkn/sparse_tensor_proposal and squashes the following commits: 148bff822 make format d57e56fc6 Merge sparse_tensor_format.h into sparse_tensor.h 880bbc4eb Rename too-verbose function name c83ea6aaf Add type aliases of sparse tensor types 90e8b3166 Rename sparse tensor classes 07a651863 Use substitution instead of constructor call 37a0a14c6 Remove needless function declaration 97e85bd35 Use std::make_shared 3dd434c83 Capitalize member function name 6ef6ad065 Apply code formatter 6f291581e Mark APIs for sparse tensor as EXPERIMENTAL ff3ea71c5 Rename length to non_zero_length in SparseTensor f78230344 Return Status::IOError instead of DCHECK if message header type is not matched 7e814de36 Put EXPERIMENTAL markn in comments 357860d8c Fix typo in comments 43d8eea44 Fix coding style 99b1d1d4d Add missing ARROW_EXPORT specifiers 401ae8023 Fix SparseCSRIndex::ToString and add tests 9e457acd3 Remove needless virtual specifiers 3b1db7d32 Add SparseTensorBase::Equals d6a8c3805 Unify Tensor.fbs and SparseTensor.fbs b3a62ebfa Fix format 6bc9e296f Support IPC read and write of SparseTensor 1d9042709 Fix format 51a83bfee Add SparseTensorFormat 93c03adad Add SparseIndex::ToString() 021b46be0 Add SparseTensorBase ed3984dd4 Add SparseIndex::format_type 4251b4d08 Add SparseCSRIndex 433c9b441 Change COO index matrix to column-major in a format description 392a25b7c Implement SparseTensor and SparseCOOIndex b24f3c342 Insert additional padding in sparse tensor format c508db086 Write sparse tensor format in IPC.md 2b50040f5 Add an example of the CSR format in comment 76c56dd35 Make indptr of CSR a buffer d7e653f17 Add an example of COO format in comment 866b2c13a Add header comments in SparseTensor.fbs aa9b8a4d0 Add SparseTensor.fbs in FBS_SRC 1f16ffed8 Fix syntax error in SparseTensor.fbs c3bc6edfa Add tentative SparseTensor format --- cpp/src/arrow/CMakeLists.txt | 2 + cpp/src/arrow/compare.cc | 93 +++++ cpp/src/arrow/compare.h | 4 + cpp/src/arrow/ipc/message.cc | 2 + cpp/src/arrow/ipc/message.h | 2 +- cpp/src/arrow/ipc/metadata-internal.cc | 148 ++++++++ cpp/src/arrow/ipc/metadata-internal.h | 12 + cpp/src/arrow/ipc/read-write-test.cc | 112 ++++++ cpp/src/arrow/ipc/reader.cc | 119 +++++++ cpp/src/arrow/ipc/reader.h | 17 + cpp/src/arrow/ipc/writer.cc | 101 ++++++ cpp/src/arrow/ipc/writer.h | 15 + cpp/src/arrow/sparse_tensor-test.cc | 244 +++++++++++++ cpp/src/arrow/sparse_tensor.cc | 452 +++++++++++++++++++++++++ cpp/src/arrow/sparse_tensor.h | 211 ++++++++++++ cpp/src/arrow/tensor.h | 6 + docs/source/format/IPC.rst | 24 ++ format/Message.fbs | 4 +- format/Tensor.fbs | 96 ++++++ 19 files changed, 1661 insertions(+), 3 deletions(-) create mode 100644 cpp/src/arrow/sparse_tensor-test.cc create mode 100644 cpp/src/arrow/sparse_tensor.cc create mode 100644 cpp/src/arrow/sparse_tensor.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index f2a811247287b..91bdce294c2d1 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -83,6 +83,7 @@ set(ARROW_SRCS table.cc table_builder.cc tensor.cc + sparse_tensor.cc type.cc visitor.cc @@ -286,6 +287,7 @@ ADD_ARROW_TEST(type-test) ADD_ARROW_TEST(table-test) ADD_ARROW_TEST(table_builder-test) ADD_ARROW_TEST(tensor-test) +ADD_ARROW_TEST(sparse_tensor-test) ADD_ARROW_BENCHMARK(builder-benchmark) ADD_ARROW_BENCHMARK(column-benchmark) diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index efc8ad82faf93..114752934c9f6 100644 --- a/cpp/src/arrow/compare.cc +++ 
b/cpp/src/arrow/compare.cc @@ -30,6 +30,7 @@ #include "arrow/array.h" #include "arrow/buffer.h" +#include "arrow/sparse_tensor.h" #include "arrow/status.h" #include "arrow/tensor.h" #include "arrow/type.h" @@ -782,6 +783,98 @@ bool TensorEquals(const Tensor& left, const Tensor& right) { return are_equal; } +namespace { + +template +struct SparseTensorEqualsImpl { + static bool Compare(const SparseTensorImpl& left, + const SparseTensorImpl& right) { + // TODO(mrkn): should we support the equality among different formats? + return false; + } +}; + +template +struct SparseTensorEqualsImpl { + static bool Compare(const SparseTensorImpl& left, + const SparseTensorImpl& right) { + DCHECK(left.type()->id() == right.type()->id()); + DCHECK(left.shape() == right.shape()); + DCHECK(left.non_zero_length() == right.non_zero_length()); + + const auto& left_index = checked_cast(*left.sparse_index()); + const auto& right_index = checked_cast(*right.sparse_index()); + + if (!left_index.Equals(right_index)) { + return false; + } + + const auto& size_meta = dynamic_cast(*left.type()); + const int byte_width = size_meta.bit_width() / CHAR_BIT; + DCHECK_GT(byte_width, 0); + + const uint8_t* left_data = left.data()->data(); + const uint8_t* right_data = right.data()->data(); + + return memcmp(left_data, right_data, + static_cast(byte_width * left.non_zero_length())); + } +}; + +template +inline bool SparseTensorEqualsImplDispatch(const SparseTensorImpl& left, + const SparseTensor& right) { + switch (right.format_id()) { + case SparseTensorFormat::COO: { + const auto& right_coo = + checked_cast&>(right); + return SparseTensorEqualsImpl::Compare(left, + right_coo); + } + + case SparseTensorFormat::CSR: { + const auto& right_csr = + checked_cast&>(right); + return SparseTensorEqualsImpl::Compare(left, + right_csr); + } + + default: + return false; + } +} + +} // namespace + +bool SparseTensorEquals(const SparseTensor& left, const SparseTensor& right) { + if (&left == &right) { + return true; + } else if (left.type()->id() != right.type()->id()) { + return false; + } else if (left.size() == 0) { + return true; + } else if (left.shape() != right.shape()) { + return false; + } else if (left.non_zero_length() != right.non_zero_length()) { + return false; + } + + switch (left.format_id()) { + case SparseTensorFormat::COO: { + const auto& left_coo = checked_cast&>(left); + return SparseTensorEqualsImplDispatch(left_coo, right); + } + + case SparseTensorFormat::CSR: { + const auto& left_csr = checked_cast&>(left); + return SparseTensorEqualsImplDispatch(left_csr, right); + } + + default: + return false; + } +} + bool TypeEquals(const DataType& left, const DataType& right) { bool are_equal; // The arrays are the same object diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h index 21e2fdc24f19c..d49d7cc0fdb08 100644 --- a/cpp/src/arrow/compare.h +++ b/cpp/src/arrow/compare.h @@ -29,12 +29,16 @@ namespace arrow { class Array; class DataType; class Tensor; +class SparseTensor; /// Returns true if the arrays are exactly equal bool ARROW_EXPORT ArrayEquals(const Array& left, const Array& right); bool ARROW_EXPORT TensorEquals(const Tensor& left, const Tensor& right); +/// EXPERIMENTAL: Returns true if the given sparse tensors are exactly equal +bool ARROW_EXPORT SparseTensorEquals(const SparseTensor& left, const SparseTensor& right); + /// Returns true if the arrays are approximately equal. 
For non-floating point /// types, this is equivalent to ArrayEquals(left, right) bool ARROW_EXPORT ArrayApproxEquals(const Array& left, const Array& right); diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 8adf4a8b66038..23709a4619207 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -63,6 +63,8 @@ class Message::MessageImpl { return Message::RECORD_BATCH; case flatbuf::MessageHeader_Tensor: return Message::TENSOR; + case flatbuf::MessageHeader_SparseTensor: + return Message::SPARSE_TENSOR; default: return Message::NONE; } diff --git a/cpp/src/arrow/ipc/message.h b/cpp/src/arrow/ipc/message.h index 092a19ff9a0cf..760012d1a6878 100644 --- a/cpp/src/arrow/ipc/message.h +++ b/cpp/src/arrow/ipc/message.h @@ -70,7 +70,7 @@ constexpr int kMaxNestingDepth = 64; /// \brief An IPC message including metadata and body class ARROW_EXPORT Message { public: - enum Type { NONE, SCHEMA, DICTIONARY_BATCH, RECORD_BATCH, TENSOR }; + enum Type { NONE, SCHEMA, DICTIONARY_BATCH, RECORD_BATCH, TENSOR, SPARSE_TENSOR }; /// \brief Construct message, but do not validate /// diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 1d4c80c2946b1..da6711395f8ea 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -31,6 +31,7 @@ #include "arrow/ipc/Tensor_generated.h" // IWYU pragma: keep #include "arrow/ipc/message.h" #include "arrow/ipc/util.h" +#include "arrow/sparse_tensor.h" #include "arrow/status.h" #include "arrow/tensor.h" #include "arrow/type.h" @@ -50,6 +51,7 @@ using DictionaryOffset = flatbuffers::Offset; using FieldOffset = flatbuffers::Offset; using KeyValueOffset = flatbuffers::Offset; using RecordBatchOffset = flatbuffers::Offset; +using SparseTensorOffset = flatbuffers::Offset; using Offset = flatbuffers::Offset; using FBString = flatbuffers::Offset; @@ -781,6 +783,106 @@ Status WriteTensorMessage(const Tensor& tensor, int64_t buffer_start_offset, body_length, out); } +Status MakeSparseTensorIndexCOO(FBB& fbb, const SparseCOOIndex& sparse_index, + const std::vector& buffers, + flatbuf::SparseTensorIndex* fb_sparse_index_type, + Offset* fb_sparse_index, size_t* num_buffers) { + *fb_sparse_index_type = flatbuf::SparseTensorIndex_SparseTensorIndexCOO; + const BufferMetadata& indices_metadata = buffers[0]; + flatbuf::Buffer indices(indices_metadata.offset, indices_metadata.length); + *fb_sparse_index = flatbuf::CreateSparseTensorIndexCOO(fbb, &indices).Union(); + *num_buffers = 1; + return Status::OK(); +} + +Status MakeSparseMatrixIndexCSR(FBB& fbb, const SparseCSRIndex& sparse_index, + const std::vector& buffers, + flatbuf::SparseTensorIndex* fb_sparse_index_type, + Offset* fb_sparse_index, size_t* num_buffers) { + *fb_sparse_index_type = flatbuf::SparseTensorIndex_SparseMatrixIndexCSR; + const BufferMetadata& indptr_metadata = buffers[0]; + const BufferMetadata& indices_metadata = buffers[1]; + flatbuf::Buffer indptr(indptr_metadata.offset, indptr_metadata.length); + flatbuf::Buffer indices(indices_metadata.offset, indices_metadata.length); + *fb_sparse_index = flatbuf::CreateSparseMatrixIndexCSR(fbb, &indptr, &indices).Union(); + *num_buffers = 2; + return Status::OK(); +} + +Status MakeSparseTensorIndex(FBB& fbb, const SparseIndex& sparse_index, + const std::vector& buffers, + flatbuf::SparseTensorIndex* fb_sparse_index_type, + Offset* fb_sparse_index, size_t* num_buffers) { + switch (sparse_index.format_id()) { + case SparseTensorFormat::COO: + 
RETURN_NOT_OK(MakeSparseTensorIndexCOO( + fbb, checked_cast(sparse_index), buffers, + fb_sparse_index_type, fb_sparse_index, num_buffers)); + break; + + case SparseTensorFormat::CSR: + RETURN_NOT_OK(MakeSparseMatrixIndexCSR( + fbb, checked_cast(sparse_index), buffers, + fb_sparse_index_type, fb_sparse_index, num_buffers)); + break; + + default: + std::stringstream ss; + ss << "Unsupporoted sparse tensor format:: " << sparse_index.ToString() + << std::endl; + return Status::NotImplemented(ss.str()); + } + + return Status::OK(); +} + +Status MakeSparseTensor(FBB& fbb, const SparseTensor& sparse_tensor, int64_t body_length, + const std::vector& buffers, + SparseTensorOffset* offset) { + flatbuf::Type fb_type_type; + Offset fb_type; + RETURN_NOT_OK( + TensorTypeToFlatbuffer(fbb, *sparse_tensor.type(), &fb_type_type, &fb_type)); + + using TensorDimOffset = flatbuffers::Offset; + std::vector dims; + for (int i = 0; i < sparse_tensor.ndim(); ++i) { + FBString name = fbb.CreateString(sparse_tensor.dim_name(i)); + dims.push_back(flatbuf::CreateTensorDim(fbb, sparse_tensor.shape()[i], name)); + } + + auto fb_shape = fbb.CreateVector(dims); + + flatbuf::SparseTensorIndex fb_sparse_index_type; + Offset fb_sparse_index; + size_t num_index_buffers = 0; + RETURN_NOT_OK(MakeSparseTensorIndex(fbb, *sparse_tensor.sparse_index(), buffers, + &fb_sparse_index_type, &fb_sparse_index, + &num_index_buffers)); + + const BufferMetadata& data_metadata = buffers[num_index_buffers]; + flatbuf::Buffer data(data_metadata.offset, data_metadata.length); + + const int64_t non_zero_length = sparse_tensor.non_zero_length(); + + *offset = + flatbuf::CreateSparseTensor(fbb, fb_type_type, fb_type, fb_shape, non_zero_length, + fb_sparse_index_type, fb_sparse_index, &data); + + return Status::OK(); +} + +Status WriteSparseTensorMessage(const SparseTensor& sparse_tensor, int64_t body_length, + const std::vector& buffers, + std::shared_ptr* out) { + FBB fbb; + SparseTensorOffset fb_sparse_tensor; + RETURN_NOT_OK( + MakeSparseTensor(fbb, sparse_tensor, body_length, buffers, &fb_sparse_tensor)); + return WriteFBMessage(fbb, flatbuf::MessageHeader_SparseTensor, + fb_sparse_tensor.Union(), body_length, out); +} + Status WriteDictionaryMessage(int64_t id, int64_t length, int64_t body_length, const std::vector& nodes, const std::vector& buffers, @@ -933,6 +1035,52 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type return TypeFromFlatbuffer(tensor->type_type(), tensor->type(), {}, type); } +Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr* type, + std::vector* shape, + std::vector* dim_names, + int64_t* non_zero_length, + SparseTensorFormat::type* sparse_tensor_format_id) { + auto message = flatbuf::GetMessage(metadata.data()); + if (message->header_type() != flatbuf::MessageHeader_SparseTensor) { + return Status::IOError("Header of flatbuffer-encoded Message is not SparseTensor."); + } + if (message->header() == nullptr) { + return Status::IOError("Header-pointer of flatbuffer-encoded Message is null."); + } + + auto sparse_tensor = reinterpret_cast(message->header()); + int ndim = static_cast(sparse_tensor->shape()->size()); + + for (int i = 0; i < ndim; ++i) { + auto dim = sparse_tensor->shape()->Get(i); + + shape->push_back(dim->size()); + auto fb_name = dim->name(); + if (fb_name == 0) { + dim_names->push_back(""); + } else { + dim_names->push_back(fb_name->str()); + } + } + + *non_zero_length = sparse_tensor->non_zero_length(); + + switch (sparse_tensor->sparseIndex_type()) { + case 
flatbuf::SparseTensorIndex_SparseTensorIndexCOO: + *sparse_tensor_format_id = SparseTensorFormat::COO; + break; + + case flatbuf::SparseTensorIndex_SparseMatrixIndexCSR: + *sparse_tensor_format_id = SparseTensorFormat::CSR; + break; + + default: + return Status::Invalid("Unrecognized sparse index type"); + } + + return TypeFromFlatbuffer(sparse_tensor->type_type(), sparse_tensor->type(), {}, type); +} + // ---------------------------------------------------------------------- // Implement message writing diff --git a/cpp/src/arrow/ipc/metadata-internal.h b/cpp/src/arrow/ipc/metadata-internal.h index 152ca1367ec0e..6562382b878e6 100644 --- a/cpp/src/arrow/ipc/metadata-internal.h +++ b/cpp/src/arrow/ipc/metadata-internal.h @@ -33,6 +33,7 @@ #include "arrow/ipc/dictionary.h" // IYWU pragma: keep #include "arrow/ipc/message.h" #include "arrow/memory_pool.h" +#include "arrow/sparse_tensor.h" #include "arrow/status.h" namespace arrow { @@ -40,6 +41,7 @@ namespace arrow { class DataType; class Schema; class Tensor; +class SparseTensor; namespace flatbuf = org::apache::arrow::flatbuf; @@ -103,6 +105,12 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type std::vector* shape, std::vector* strides, std::vector* dim_names); +// EXPERIMENTAL: Extracting metadata of a sparse tensor from the message +Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr* type, + std::vector* shape, + std::vector* dim_names, int64_t* length, + SparseTensorFormat::type* sparse_tensor_format_id); + /// Write a serialized message metadata with a length-prefix and padding to an /// 8-byte offset. Does not make assumptions about whether the stream is /// aligned already @@ -137,6 +145,10 @@ Status WriteRecordBatchMessage(const int64_t length, const int64_t body_length, Status WriteTensorMessage(const Tensor& tensor, const int64_t buffer_start_offset, std::shared_ptr* out); +Status WriteSparseTensorMessage(const SparseTensor& sparse_tensor, int64_t body_length, + const std::vector& buffers, + std::shared_ptr* out); + Status WriteFileFooter(const Schema& schema, const std::vector& dictionaries, const std::vector& record_batches, DictionaryMemo* dictionary_memo, io::OutputStream* out); diff --git a/cpp/src/arrow/ipc/read-write-test.cc b/cpp/src/arrow/ipc/read-write-test.cc index 3a723badf37d7..bc27386f34f30 100644 --- a/cpp/src/arrow/ipc/read-write-test.cc +++ b/cpp/src/arrow/ipc/read-write-test.cc @@ -38,6 +38,7 @@ #include "arrow/ipc/writer.h" #include "arrow/memory_pool.h" #include "arrow/record_batch.h" +#include "arrow/sparse_tensor.h" #include "arrow/status.h" #include "arrow/tensor.h" #include "arrow/test-util.h" @@ -844,6 +845,117 @@ TEST_F(TestTensorRoundTrip, NonContiguous) { CheckTensorRoundTrip(tensor); } +class TestSparseTensorRoundTrip : public ::testing::Test, public IpcTestFixture { + public: + void SetUp() { pool_ = default_memory_pool(); } + void TearDown() { io::MemoryMapFixture::TearDown(); } + + template + void CheckSparseTensorRoundTrip(const SparseTensorImpl& tensor) { + GTEST_FAIL(); + } +}; + +template <> +void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( + const SparseTensorImpl& tensor) { + const auto& type = checked_cast(*tensor.type()); + const int elem_size = type.bit_width() / 8; + + int32_t metadata_length; + int64_t body_length; + + ASSERT_OK(mmap_->Seek(0)); + + ASSERT_OK(WriteSparseTensor(tensor, mmap_.get(), &metadata_length, &body_length, + default_memory_pool())); + + const auto& sparse_index = checked_cast(*tensor.sparse_index()); + const 
int64_t indices_length = elem_size * sparse_index.indices()->size(); + const int64_t data_length = elem_size * tensor.non_zero_length(); + const int64_t expected_body_length = indices_length + data_length; + ASSERT_EQ(expected_body_length, body_length); + + ASSERT_OK(mmap_->Seek(0)); + + std::shared_ptr result; + ASSERT_OK(ReadSparseTensor(mmap_.get(), &result)); + + const auto& resulted_sparse_index = + checked_cast(*result->sparse_index()); + ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length); + ASSERT_EQ(result->data()->size(), data_length); + ASSERT_TRUE(result->Equals(*result)); +} + +template <> +void TestSparseTensorRoundTrip::CheckSparseTensorRoundTrip( + const SparseTensorImpl& tensor) { + const auto& type = checked_cast(*tensor.type()); + const int elem_size = type.bit_width() / 8; + + int32_t metadata_length; + int64_t body_length; + + ASSERT_OK(mmap_->Seek(0)); + + ASSERT_OK(WriteSparseTensor(tensor, mmap_.get(), &metadata_length, &body_length, + default_memory_pool())); + + const auto& sparse_index = checked_cast(*tensor.sparse_index()); + const int64_t indptr_length = elem_size * sparse_index.indptr()->size(); + const int64_t indices_length = elem_size * sparse_index.indices()->size(); + const int64_t data_length = elem_size * tensor.non_zero_length(); + const int64_t expected_body_length = indptr_length + indices_length + data_length; + ASSERT_EQ(expected_body_length, body_length); + + ASSERT_OK(mmap_->Seek(0)); + + std::shared_ptr result; + ASSERT_OK(ReadSparseTensor(mmap_.get(), &result)); + + const auto& resulted_sparse_index = + checked_cast(*result->sparse_index()); + ASSERT_EQ(resulted_sparse_index.indptr()->data()->size(), indptr_length); + ASSERT_EQ(resulted_sparse_index.indices()->data()->size(), indices_length); + ASSERT_EQ(result->data()->size(), data_length); + ASSERT_TRUE(result->Equals(*result)); +} + +TEST_F(TestSparseTensorRoundTrip, WithSparseCOOIndex) { + std::string path = "test-write-sparse-coo-tensor"; + constexpr int64_t kBufferSize = 1 << 20; + ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &mmap_)); + + std::vector shape = {2, 3, 4}; + std::vector dim_names = {"foo", "bar", "baz"}; + std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + + auto data = Buffer::Wrap(values); + NumericTensor t(data, shape, {}, dim_names); + SparseTensorImpl st(t); + + CheckSparseTensorRoundTrip(st); +} + +TEST_F(TestSparseTensorRoundTrip, WithSparseCSRIndex) { + std::string path = "test-write-sparse-csr-matrix"; + constexpr int64_t kBufferSize = 1 << 20; + ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &mmap_)); + + std::vector shape = {4, 6}; + std::vector dim_names = {"foo", "bar", "baz"}; + std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + + auto data = Buffer::Wrap(values); + NumericTensor t(data, shape, {}, dim_names); + SparseTensorImpl st(t); + + CheckSparseTensorRoundTrip(st); +} + TEST(TestRecordBatchStreamReader, MalformedInput) { const std::string empty_str = ""; const std::string garbage_str = "12345678"; diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 59a322a64338a..e856acafd7138 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -38,6 +38,7 @@ #include "arrow/ipc/message.h" #include "arrow/ipc/metadata-internal.h" #include "arrow/record_batch.h" +#include "arrow/sparse_tensor.h" #include "arrow/status.h" #include "arrow/tensor.h" #include 
"arrow/type.h" @@ -726,5 +727,123 @@ Status ReadTensor(const Message& message, std::shared_ptr* out) { return Status::OK(); } +namespace { + +Status ReadSparseCOOIndex(const flatbuf::SparseTensor* sparse_tensor, int64_t ndim, + int64_t non_zero_length, io::RandomAccessFile* file, + std::shared_ptr* out) { + auto* sparse_index = sparse_tensor->sparseIndex_as_SparseTensorIndexCOO(); + auto* indices_buffer = sparse_index->indicesBuffer(); + std::shared_ptr indices_data; + RETURN_NOT_OK( + file->ReadAt(indices_buffer->offset(), indices_buffer->length(), &indices_data)); + std::vector shape({non_zero_length, ndim}); + const int64_t elsize = sizeof(int64_t); + std::vector strides({elsize, elsize * non_zero_length}); + *out = std::make_shared( + std::make_shared(indices_data, shape, strides)); + return Status::OK(); +} + +Status ReadSparseCSRIndex(const flatbuf::SparseTensor* sparse_tensor, int64_t ndim, + int64_t non_zero_length, io::RandomAccessFile* file, + std::shared_ptr* out) { + auto* sparse_index = sparse_tensor->sparseIndex_as_SparseMatrixIndexCSR(); + + auto* indptr_buffer = sparse_index->indptrBuffer(); + std::shared_ptr indptr_data; + RETURN_NOT_OK( + file->ReadAt(indptr_buffer->offset(), indptr_buffer->length(), &indptr_data)); + + auto* indices_buffer = sparse_index->indicesBuffer(); + std::shared_ptr indices_data; + RETURN_NOT_OK( + file->ReadAt(indices_buffer->offset(), indices_buffer->length(), &indices_data)); + + std::vector indptr_shape({ndim + 1}); + std::vector indices_shape({non_zero_length}); + *out = std::make_shared( + std::make_shared(indptr_data, indptr_shape), + std::make_shared(indices_data, indices_shape)); + return Status::OK(); +} + +Status MakeSparseTensorWithSparseCOOIndex( + const std::shared_ptr& type, const std::vector& shape, + const std::vector& dim_names, + const std::shared_ptr& sparse_index, int64_t non_zero_length, + const std::shared_ptr& data, std::shared_ptr* out) { + *out = std::make_shared>(sparse_index, type, data, + shape, dim_names); + return Status::OK(); +} + +Status MakeSparseTensorWithSparseCSRIndex( + const std::shared_ptr& type, const std::vector& shape, + const std::vector& dim_names, + const std::shared_ptr& sparse_index, int64_t non_zero_length, + const std::shared_ptr& data, std::shared_ptr* out) { + *out = std::make_shared>(sparse_index, type, data, + shape, dim_names); + return Status::OK(); +} + +} // namespace + +Status ReadSparseTensor(const Buffer& metadata, io::RandomAccessFile* file, + std::shared_ptr* out) { + std::shared_ptr type; + std::vector shape; + std::vector dim_names; + int64_t non_zero_length; + SparseTensorFormat::type sparse_tensor_format_id; + + RETURN_NOT_OK(internal::GetSparseTensorMetadata( + metadata, &type, &shape, &dim_names, &non_zero_length, &sparse_tensor_format_id)); + + auto message = flatbuf::GetMessage(metadata.data()); + auto sparse_tensor = reinterpret_cast(message->header()); + const flatbuf::Buffer* buffer = sparse_tensor->data(); + DCHECK(BitUtil::IsMultipleOf8(buffer->offset())) + << "Buffer of sparse index data " + << "did not start on 8-byte aligned offset: " << buffer->offset(); + + std::shared_ptr data; + RETURN_NOT_OK(file->ReadAt(buffer->offset(), buffer->length(), &data)); + + std::shared_ptr sparse_index; + switch (sparse_tensor_format_id) { + case SparseTensorFormat::COO: + RETURN_NOT_OK(ReadSparseCOOIndex(sparse_tensor, shape.size(), non_zero_length, file, + &sparse_index)); + return MakeSparseTensorWithSparseCOOIndex( + type, shape, dim_names, std::dynamic_pointer_cast(sparse_index), 
+ non_zero_length, data, out); + + case SparseTensorFormat::CSR: + RETURN_NOT_OK(ReadSparseCSRIndex(sparse_tensor, shape.size(), non_zero_length, file, + &sparse_index)); + return MakeSparseTensorWithSparseCSRIndex( + type, shape, dim_names, std::dynamic_pointer_cast(sparse_index), + non_zero_length, data, out); + + default: + return Status::Invalid("Unsupported sparse index format"); + } +} + +Status ReadSparseTensor(const Message& message, std::shared_ptr* out) { + io::BufferReader buffer_reader(message.body()); + return ReadSparseTensor(*message.metadata(), &buffer_reader, out); +} + +Status ReadSparseTensor(io::InputStream* file, std::shared_ptr* out) { + std::unique_ptr message; + RETURN_NOT_OK(ReadContiguousPayload(file, &message)); + DCHECK_EQ(message->type(), Message::SPARSE_TENSOR); + io::BufferReader buffer_reader(message->body()); + return ReadSparseTensor(*message->metadata(), &buffer_reader, out); +} + } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/reader.h b/cpp/src/arrow/ipc/reader.h index 942664d6f2269..ebecea13ffb8b 100644 --- a/cpp/src/arrow/ipc/reader.h +++ b/cpp/src/arrow/ipc/reader.h @@ -33,6 +33,7 @@ class Buffer; class Schema; class Status; class Tensor; +class SparseTensor; namespace io { @@ -235,6 +236,22 @@ Status ReadTensor(io::InputStream* file, std::shared_ptr* out); ARROW_EXPORT Status ReadTensor(const Message& message, std::shared_ptr* out); +/// \brief EXPERIMETNAL: Read arrow::SparseTensor as encapsulated IPC message in file +/// +/// \param[in] file an InputStream pointed at the start of the message +/// \param[out] out the read sparse tensor +/// \return Status +ARROW_EXPORT +Status ReadSparseTensor(io::InputStream* file, std::shared_ptr* out); + +/// \brief EXPERIMENTAL: Read arrow::SparseTensor from IPC message +/// +/// \param[in] message a Message containing the tensor metadata and body +/// \param[out] out the read sparse tensor +/// \return Status +ARROW_EXPORT +Status ReadSparseTensor(const Message& message, std::shared_ptr* out); + } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 6ce72e070e7b3..0bf68142c7776 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include "arrow/array.h" @@ -33,6 +34,7 @@ #include "arrow/ipc/util.h" #include "arrow/memory_pool.h" #include "arrow/record_batch.h" +#include "arrow/sparse_tensor.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/tensor.h" @@ -671,6 +673,105 @@ Status GetTensorMessage(const Tensor& tensor, MemoryPool* pool, return Status::OK(); } +namespace internal { + +class SparseTensorSerializer { + public: + SparseTensorSerializer(int64_t buffer_start_offset, IpcPayload* out) + : out_(out), buffer_start_offset_(buffer_start_offset) {} + + ~SparseTensorSerializer() = default; + + Status VisitSparseIndex(const SparseIndex& sparse_index) { + switch (sparse_index.format_id()) { + case SparseTensorFormat::COO: + RETURN_NOT_OK( + VisitSparseCOOIndex(checked_cast(sparse_index))); + break; + + case SparseTensorFormat::CSR: + RETURN_NOT_OK( + VisitSparseCSRIndex(checked_cast(sparse_index))); + break; + + default: + std::stringstream ss; + ss << "Unable to convert type: " << sparse_index.ToString() << std::endl; + return Status::NotImplemented(ss.str()); + } + + return Status::OK(); + } + + Status SerializeMetadata(const SparseTensor& sparse_tensor) { + return WriteSparseTensorMessage(sparse_tensor, 
out_->body_length, buffer_meta_, + &out_->metadata); + } + + Status Assemble(const SparseTensor& sparse_tensor) { + if (buffer_meta_.size() > 0) { + buffer_meta_.clear(); + out_->body_buffers.clear(); + } + + RETURN_NOT_OK(VisitSparseIndex(*sparse_tensor.sparse_index())); + out_->body_buffers.emplace_back(sparse_tensor.data()); + + int64_t offset = buffer_start_offset_; + buffer_meta_.reserve(out_->body_buffers.size()); + + for (size_t i = 0; i < out_->body_buffers.size(); ++i) { + const Buffer* buffer = out_->body_buffers[i].get(); + int64_t size = buffer->size(); + int64_t padding = BitUtil::RoundUpToMultipleOf8(size) - size; + buffer_meta_.push_back({offset, size + padding}); + offset += size + padding; + } + + out_->body_length = offset - buffer_start_offset_; + DCHECK(BitUtil::IsMultipleOf8(out_->body_length)); + + return SerializeMetadata(sparse_tensor); + } + + private: + Status VisitSparseCOOIndex(const SparseCOOIndex& sparse_index) { + out_->body_buffers.emplace_back(sparse_index.indices()->data()); + return Status::OK(); + } + + Status VisitSparseCSRIndex(const SparseCSRIndex& sparse_index) { + out_->body_buffers.emplace_back(sparse_index.indptr()->data()); + out_->body_buffers.emplace_back(sparse_index.indices()->data()); + return Status::OK(); + } + + IpcPayload* out_; + + std::vector buffer_meta_; + + int64_t buffer_start_offset_; +}; + +Status GetSparseTensorPayload(const SparseTensor& sparse_tensor, MemoryPool* pool, + IpcPayload* out) { + SparseTensorSerializer writer(0, out); + return writer.Assemble(sparse_tensor); +} + +} // namespace internal + +Status WriteSparseTensor(const SparseTensor& sparse_tensor, io::OutputStream* dst, + int32_t* metadata_length, int64_t* body_length, + MemoryPool* pool) { + internal::IpcPayload payload; + internal::SparseTensorSerializer writer(0, &payload); + RETURN_NOT_OK(writer.Assemble(sparse_tensor)); + + *body_length = payload.body_length; + return internal::WriteIpcPayload(payload, dst, metadata_length); +} + Status WriteDictionary(int64_t dictionary_id, const std::shared_ptr& dictionary, int64_t buffer_start_offset, io::OutputStream* dst, int32_t* metadata_length, int64_t* body_length, MemoryPool* pool) { diff --git a/cpp/src/arrow/ipc/writer.h b/cpp/src/arrow/ipc/writer.h index a1c711146efe8..5feb9e90cb0b0 100644 --- a/cpp/src/arrow/ipc/writer.h +++ b/cpp/src/arrow/ipc/writer.h @@ -36,6 +36,7 @@ class Schema; class Status; class Table; class Tensor; +class SparseTensor; namespace io { @@ -269,6 +270,20 @@ ARROW_EXPORT Status WriteTensor(const Tensor& tensor, io::OutputStream* dst, int32_t* metadata_length, int64_t* body_length); +// \brief EXPERIMENTAL: Write arrow::SparseTensor as a contiguous mesasge. The metadata, +// sparse index, and body are written assuming 64-byte alignment. It is the +// user's responsibility to ensure that the OutputStream has been aligned +// to a 64-byte multiple before writing the message. 
+// +// \param[in] tensor the SparseTensor to write +// \param[in] dst the OutputStream to write to +// \param[out] metadata_length the actual metadata length, including padding +// \param[out] body_length the actual message body length +ARROW_EXPORT +Status WriteSparseTensor(const SparseTensor& sparse_tensor, io::OutputStream* dst, + int32_t* metadata_length, int64_t* body_length, + MemoryPool* pool); + namespace internal { // These internal APIs may change without warning or deprecation diff --git a/cpp/src/arrow/sparse_tensor-test.cc b/cpp/src/arrow/sparse_tensor-test.cc new file mode 100644 index 0000000000000..d48f2d0229d58 --- /dev/null +++ b/cpp/src/arrow/sparse_tensor-test.cc @@ -0,0 +1,244 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Unit tests for DataType (and subclasses), Field, and Schema + +#include +#include +#include +#include + +#include + +#include + +#include "arrow/sparse_tensor.h" +#include "arrow/test-util.h" +#include "arrow/type.h" + +namespace arrow { + +static inline void CheckSparseIndexFormatType(SparseTensorFormat::type expected, + const SparseTensor& sparse_tensor) { + ASSERT_EQ(expected, sparse_tensor.format_id()); + ASSERT_EQ(expected, sparse_tensor.sparse_index()->format_id()); +} + +TEST(TestSparseCOOTensor, CreationEmptyTensor) { + std::vector shape = {2, 3, 4}; + SparseTensorImpl st1(int64(), shape); + + std::vector dim_names = {"foo", "bar", "baz"}; + SparseTensorImpl st2(int64(), shape, dim_names); + + ASSERT_EQ(0, st1.non_zero_length()); + ASSERT_EQ(0, st2.non_zero_length()); + + ASSERT_EQ(24, st1.size()); + ASSERT_EQ(24, st2.size()); + + ASSERT_EQ("foo", st2.dim_name(0)); + ASSERT_EQ("bar", st2.dim_name(1)); + ASSERT_EQ("baz", st2.dim_name(2)); + + ASSERT_EQ("", st1.dim_name(0)); + ASSERT_EQ("", st1.dim_name(1)); + ASSERT_EQ("", st1.dim_name(2)); +} + +TEST(TestSparseCOOTensor, CreationFromNumericTensor) { + std::vector shape = {2, 3, 4}; + std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + std::shared_ptr buffer = Buffer::Wrap(values); + std::vector dim_names = {"foo", "bar", "baz"}; + NumericTensor tensor1(buffer, shape); + NumericTensor tensor2(buffer, shape, {}, dim_names); + SparseTensorImpl st1(tensor1); + SparseTensorImpl st2(tensor2); + + CheckSparseIndexFormatType(SparseTensorFormat::COO, st1); + + ASSERT_EQ(12, st1.non_zero_length()); + ASSERT_TRUE(st1.is_mutable()); + + ASSERT_EQ("foo", st2.dim_name(0)); + ASSERT_EQ("bar", st2.dim_name(1)); + ASSERT_EQ("baz", st2.dim_name(2)); + + ASSERT_EQ("", st1.dim_name(0)); + ASSERT_EQ("", st1.dim_name(1)); + ASSERT_EQ("", st1.dim_name(2)); + + const int64_t* ptr = reinterpret_cast(st1.raw_data()); + for (int i = 0; i < 6; ++i) { + ASSERT_EQ(i + 1, ptr[i]); + } + for (int i = 0; i < 6; 
++i) { + ASSERT_EQ(i + 11, ptr[i + 6]); + } + + const auto& si = internal::checked_cast(*st1.sparse_index()); + ASSERT_EQ(std::string("SparseCOOIndex"), si.ToString()); + + std::shared_ptr sidx = si.indices(); + ASSERT_EQ(std::vector({12, 3}), sidx->shape()); + ASSERT_TRUE(sidx->is_column_major()); + + // (0, 0, 0) -> 1 + ASSERT_EQ(0, sidx->Value({0, 0})); + ASSERT_EQ(0, sidx->Value({0, 1})); + ASSERT_EQ(0, sidx->Value({0, 2})); + + // (0, 0, 2) -> 2 + ASSERT_EQ(0, sidx->Value({1, 0})); + ASSERT_EQ(0, sidx->Value({1, 1})); + ASSERT_EQ(2, sidx->Value({1, 2})); + + // (0, 1, 1) -> 3 + ASSERT_EQ(0, sidx->Value({2, 0})); + ASSERT_EQ(1, sidx->Value({2, 1})); + ASSERT_EQ(1, sidx->Value({2, 2})); + + // (1, 2, 1) -> 15 + ASSERT_EQ(1, sidx->Value({10, 0})); + ASSERT_EQ(2, sidx->Value({10, 1})); + ASSERT_EQ(1, sidx->Value({10, 2})); + + // (1, 2, 3) -> 16 + ASSERT_EQ(1, sidx->Value({11, 0})); + ASSERT_EQ(2, sidx->Value({11, 1})); + ASSERT_EQ(3, sidx->Value({11, 2})); +} + +TEST(TestSparseCOOTensor, CreationFromTensor) { + std::vector shape = {2, 3, 4}; + std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + std::shared_ptr buffer = Buffer::Wrap(values); + std::vector dim_names = {"foo", "bar", "baz"}; + Tensor tensor1(int64(), buffer, shape); + Tensor tensor2(int64(), buffer, shape, {}, dim_names); + SparseTensorImpl st1(tensor1); + SparseTensorImpl st2(tensor2); + + ASSERT_EQ(12, st1.non_zero_length()); + ASSERT_TRUE(st1.is_mutable()); + + ASSERT_EQ("foo", st2.dim_name(0)); + ASSERT_EQ("bar", st2.dim_name(1)); + ASSERT_EQ("baz", st2.dim_name(2)); + + ASSERT_EQ("", st1.dim_name(0)); + ASSERT_EQ("", st1.dim_name(1)); + ASSERT_EQ("", st1.dim_name(2)); + + const int64_t* ptr = reinterpret_cast(st1.raw_data()); + for (int i = 0; i < 6; ++i) { + ASSERT_EQ(i + 1, ptr[i]); + } + for (int i = 0; i < 6; ++i) { + ASSERT_EQ(i + 11, ptr[i + 6]); + } + + const auto& si = internal::checked_cast(*st1.sparse_index()); + std::shared_ptr sidx = si.indices(); + ASSERT_EQ(std::vector({12, 3}), sidx->shape()); + ASSERT_TRUE(sidx->is_column_major()); + + // (0, 0, 0) -> 1 + ASSERT_EQ(0, sidx->Value({0, 0})); + ASSERT_EQ(0, sidx->Value({0, 1})); + ASSERT_EQ(0, sidx->Value({0, 2})); + + // (0, 0, 2) -> 2 + ASSERT_EQ(0, sidx->Value({1, 0})); + ASSERT_EQ(0, sidx->Value({1, 1})); + ASSERT_EQ(2, sidx->Value({1, 2})); + + // (0, 1, 1) -> 3 + ASSERT_EQ(0, sidx->Value({2, 0})); + ASSERT_EQ(1, sidx->Value({2, 1})); + ASSERT_EQ(1, sidx->Value({2, 2})); + + // (1, 2, 1) -> 15 + ASSERT_EQ(1, sidx->Value({10, 0})); + ASSERT_EQ(2, sidx->Value({10, 1})); + ASSERT_EQ(1, sidx->Value({10, 2})); + + // (1, 2, 3) -> 16 + ASSERT_EQ(1, sidx->Value({11, 0})); + ASSERT_EQ(2, sidx->Value({11, 1})); + ASSERT_EQ(3, sidx->Value({11, 2})); +} + +TEST(TestSparseCSRMatrix, CreationFromNumericTensor2D) { + std::vector shape = {6, 4}; + std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + std::shared_ptr buffer = Buffer::Wrap(values); + std::vector dim_names = {"foo", "bar", "baz"}; + NumericTensor tensor1(buffer, shape); + NumericTensor tensor2(buffer, shape, {}, dim_names); + + SparseTensorImpl st1(tensor1); + SparseTensorImpl st2(tensor2); + + CheckSparseIndexFormatType(SparseTensorFormat::CSR, st1); + + ASSERT_EQ(12, st1.non_zero_length()); + ASSERT_TRUE(st1.is_mutable()); + + ASSERT_EQ("foo", st2.dim_name(0)); + ASSERT_EQ("bar", st2.dim_name(1)); + ASSERT_EQ("baz", st2.dim_name(2)); + + ASSERT_EQ("", st1.dim_name(0)); + ASSERT_EQ("", 
st1.dim_name(1)); + ASSERT_EQ("", st1.dim_name(2)); + + const int64_t* ptr = reinterpret_cast(st1.raw_data()); + for (int i = 0; i < 6; ++i) { + ASSERT_EQ(i + 1, ptr[i]); + } + for (int i = 0; i < 6; ++i) { + ASSERT_EQ(i + 11, ptr[i + 6]); + } + + const auto& si = internal::checked_cast(*st1.sparse_index()); + + ASSERT_EQ(std::string("SparseCSRIndex"), si.ToString()); + ASSERT_EQ(1, si.indptr()->ndim()); + ASSERT_EQ(1, si.indices()->ndim()); + + const int64_t* indptr_begin = reinterpret_cast(si.indptr()->raw_data()); + std::vector indptr_values(indptr_begin, + indptr_begin + si.indptr()->shape()[0]); + + ASSERT_EQ(7, indptr_values.size()); + ASSERT_EQ(std::vector({0, 2, 4, 6, 8, 10, 12}), indptr_values); + + const int64_t* indices_begin = + reinterpret_cast(si.indices()->raw_data()); + std::vector indices_values(indices_begin, + indices_begin + si.indices()->shape()[0]); + + ASSERT_EQ(12, indices_values.size()); + ASSERT_EQ(std::vector({0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3}), indices_values); +} + +} // namespace arrow diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc new file mode 100644 index 0000000000000..101500d36432e --- /dev/null +++ b/cpp/src/arrow/sparse_tensor.cc @@ -0,0 +1,452 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/sparse_tensor.h" + +#include +#include +#include + +#include "arrow/compare.h" +#include "arrow/util/logging.h" + +namespace arrow { + +namespace { + +// ---------------------------------------------------------------------- +// SparseTensorConverter + +template +class SparseTensorConverter { + public: + explicit SparseTensorConverter(const NumericTensor&) {} + + Status Convert() { return Status::Invalid("Unsupported sparse index"); } +}; + +// ---------------------------------------------------------------------- +// SparseTensorConverter for SparseCOOIndex + +template +struct SparseTensorConverterBase { + using NumericTensorType = NumericTensor; + using value_type = typename NumericTensorType::value_type; + + explicit SparseTensorConverterBase(const NumericTensorType& tensor) : tensor_(tensor) {} + + bool TensorIsTriviallyIterable() const { + return tensor_.ndim() <= 1 || tensor_.is_contiguous(); + } + + size_t CountNonZero() const { + if (tensor_.size() == 0) { + return 0; + } + + if (TensorIsTriviallyIterable()) { + const value_type* data = reinterpret_cast(tensor_.raw_data()); + return std::count_if(data, data + tensor_.size(), + [](value_type x) { return x != 0; }); + } + + const std::vector& shape = tensor_.shape(); + const int64_t ndim = tensor_.ndim(); + + size_t count = 0; + std::vector coord(ndim, 0); + for (int64_t n = tensor_.size(); n > 0; n--) { + if (tensor_.Value(coord) != 0) { + ++count; + } + + // increment index + ++coord[ndim - 1]; + if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) { + int64_t d = ndim - 1; + while (d > 0 && coord[d] == shape[d]) { + coord[d] = 0; + ++coord[d - 1]; + --d; + } + } + } + return count; + } + + const NumericTensorType& tensor_; +}; + +template +class SparseTensorConverter + : private SparseTensorConverterBase { + public: + using BaseClass = SparseTensorConverterBase; + using NumericTensorType = typename BaseClass::NumericTensorType; + using value_type = typename BaseClass::value_type; + + explicit SparseTensorConverter(const NumericTensorType& tensor) : BaseClass(tensor) {} + + Status Convert() { + const int64_t ndim = tensor_.ndim(); + const int64_t nonzero_count = static_cast(CountNonZero()); + + std::shared_ptr indices_buffer; + RETURN_NOT_OK( + AllocateBuffer(sizeof(int64_t) * ndim * nonzero_count, &indices_buffer)); + int64_t* indices = reinterpret_cast(indices_buffer->mutable_data()); + + std::shared_ptr values_buffer; + RETURN_NOT_OK(AllocateBuffer(sizeof(value_type) * nonzero_count, &values_buffer)); + value_type* values = reinterpret_cast(values_buffer->mutable_data()); + + if (ndim <= 1) { + const value_type* data = reinterpret_cast(tensor_.raw_data()); + const int64_t count = ndim == 0 ? 
1 : tensor_.shape()[0]; + for (int64_t i = 0; i < count; ++i, ++data) { + if (*data != 0) { + *indices++ = i; + *values++ = *data; + } + } + } else { + const std::vector& shape = tensor_.shape(); + std::vector coord(ndim, 0); + + for (int64_t n = tensor_.size(); n > 0; n--) { + const value_type x = tensor_.Value(coord); + if (tensor_.Value(coord) != 0) { + *values++ = x; + + int64_t* indp = indices; + for (int64_t i = 0; i < ndim; ++i) { + *indp = coord[i]; + indp += nonzero_count; + } + indices++; + } + + // increment index + ++coord[ndim - 1]; + if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) { + int64_t d = ndim - 1; + while (d > 0 && coord[d] == shape[d]) { + coord[d] = 0; + ++coord[d - 1]; + --d; + } + } + } + } + + // make results + const std::vector indices_shape = {nonzero_count, ndim}; + const int64_t indices_elsize = sizeof(int64_t); + const std::vector indices_strides = {indices_elsize, + indices_elsize * nonzero_count}; + sparse_index = + std::make_shared(std::make_shared( + indices_buffer, indices_shape, indices_strides)); + data = values_buffer; + + return Status::OK(); + } + + std::shared_ptr sparse_index; + std::shared_ptr data; + + private: + using SparseTensorConverterBase::tensor_; + using SparseTensorConverterBase::CountNonZero; +}; + +template +void MakeSparseTensorFromTensor(const Tensor& tensor, + std::shared_ptr* sparse_index, + std::shared_ptr* data) { + NumericTensor numeric_tensor(tensor.data(), tensor.shape(), tensor.strides()); + SparseTensorConverter converter(numeric_tensor); + DCHECK_OK(converter.Convert()); + *sparse_index = converter.sparse_index; + *data = converter.data; +} + +// ---------------------------------------------------------------------- +// SparseTensorConverter for SparseCSRIndex + +template +class SparseTensorConverter + : private SparseTensorConverterBase { + public: + using BaseClass = SparseTensorConverterBase; + using NumericTensorType = typename BaseClass::NumericTensorType; + using value_type = typename BaseClass::value_type; + + explicit SparseTensorConverter(const NumericTensorType& tensor) : BaseClass(tensor) {} + + Status Convert() { + const int64_t ndim = tensor_.ndim(); + if (ndim > 2) { + return Status::Invalid("Invalid tensor dimension"); + } + + const int64_t nr = tensor_.shape()[0]; + const int64_t nc = tensor_.shape()[1]; + const int64_t nonzero_count = static_cast(CountNonZero()); + + std::shared_ptr indptr_buffer; + std::shared_ptr indices_buffer; + + std::shared_ptr values_buffer; + RETURN_NOT_OK(AllocateBuffer(sizeof(value_type) * nonzero_count, &values_buffer)); + value_type* values = reinterpret_cast(values_buffer->mutable_data()); + + if (ndim <= 1) { + return Status::NotImplemented("TODO for ndim <= 1"); + } else { + RETURN_NOT_OK(AllocateBuffer(sizeof(int64_t) * (nr + 1), &indptr_buffer)); + int64_t* indptr = reinterpret_cast(indptr_buffer->mutable_data()); + + RETURN_NOT_OK(AllocateBuffer(sizeof(int64_t) * nonzero_count, &indices_buffer)); + int64_t* indices = reinterpret_cast(indices_buffer->mutable_data()); + + int64_t k = 0; + *indptr++ = 0; + for (int64_t i = 0; i < nr; ++i) { + for (int64_t j = 0; j < nc; ++j) { + const value_type x = tensor_.Value({i, j}); + if (x != 0) { + *values++ = x; + *indices++ = j; + k++; + } + } + *indptr++ = k; + } + } + + std::vector indptr_shape({nr + 1}); + std::shared_ptr indptr_tensor = + std::make_shared(indptr_buffer, indptr_shape); + + std::vector indices_shape({nonzero_count}); + std::shared_ptr indices_tensor = + std::make_shared(indices_buffer, indices_shape); + + 
sparse_index = std::make_shared(indptr_tensor, indices_tensor); + data = values_buffer; + + return Status::OK(); + } + + std::shared_ptr sparse_index; + std::shared_ptr data; + + private: + using BaseClass::tensor_; + using SparseTensorConverterBase::CountNonZero; +}; + +// ---------------------------------------------------------------------- +// Instantiate templates + +#define INSTANTIATE_SPARSE_TENSOR_CONVERTER(IndexType) \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter; \ + template class ARROW_TEMPLATE_EXPORT SparseTensorConverter + +INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCOOIndex); +INSTANTIATE_SPARSE_TENSOR_CONVERTER(SparseCSRIndex); + +} // namespace + +// ---------------------------------------------------------------------- +// SparseCOOIndex + +// Constructor with a column-major NumericTensor +SparseCOOIndex::SparseCOOIndex(const std::shared_ptr& coords) + : SparseIndexBase(coords->shape()[0]), coords_(coords) { + DCHECK(coords_->is_column_major()); +} + +std::string SparseCOOIndex::ToString() const { return std::string("SparseCOOIndex"); } + +// ---------------------------------------------------------------------- +// SparseCSRIndex + +// Constructor with two index vectors +SparseCSRIndex::SparseCSRIndex(const std::shared_ptr& indptr, + const std::shared_ptr& indices) + : SparseIndexBase(indices->shape()[0]), indptr_(indptr), indices_(indices) { + DCHECK_EQ(1, indptr_->ndim()); + DCHECK_EQ(1, indices_->ndim()); +} + +std::string SparseCSRIndex::ToString() const { return std::string("SparseCSRIndex"); } + +// ---------------------------------------------------------------------- +// SparseTensor + +// Constructor with all attributes +SparseTensor::SparseTensor(const std::shared_ptr& type, + const std::shared_ptr& data, + const std::vector& shape, + const std::shared_ptr& sparse_index, + const std::vector& dim_names) + : type_(type), + data_(data), + shape_(shape), + sparse_index_(sparse_index), + dim_names_(dim_names) { + DCHECK(is_tensor_supported(type->id())); +} + +const std::string& SparseTensor::dim_name(int i) const { + static const std::string kEmpty = ""; + if (dim_names_.size() == 0) { + return kEmpty; + } else { + DCHECK_LT(i, static_cast(dim_names_.size())); + return dim_names_[i]; + } +} + +int64_t SparseTensor::size() const { + return std::accumulate(shape_.begin(), shape_.end(), 1LL, std::multiplies()); +} + +bool SparseTensor::Equals(const SparseTensor& other) const { + return SparseTensorEquals(*this, other); +} + +// ---------------------------------------------------------------------- +// SparseTensorImpl + +// Constructor with a dense tensor +template +SparseTensorImpl::SparseTensorImpl( + const std::shared_ptr& type, const std::vector& shape, + const std::vector& dim_names) + : SparseTensorImpl(nullptr, type, nullptr, shape, dim_names) {} + +// Constructor with a dense tensor +template +template +SparseTensorImpl::SparseTensorImpl(const NumericTensor& tensor) + : 
SparseTensorImpl(nullptr, tensor.type(), nullptr, tensor.shape(), + tensor.dim_names_) { + SparseTensorConverter converter(tensor); + DCHECK_OK(converter.Convert()); + sparse_index_ = converter.sparse_index; + data_ = converter.data; +} + +// Constructor with a dense tensor +template +SparseTensorImpl::SparseTensorImpl(const Tensor& tensor) + : SparseTensorImpl(nullptr, tensor.type(), nullptr, tensor.shape(), + tensor.dim_names_) { + switch (tensor.type()->id()) { + case Type::UINT8: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::INT8: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::UINT16: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::INT16: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::UINT32: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::INT32: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::UINT64: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::INT64: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::HALF_FLOAT: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::FLOAT: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + case Type::DOUBLE: + MakeSparseTensorFromTensor(tensor, &sparse_index_, + &data_); + return; + default: + break; + } +} + +// ---------------------------------------------------------------------- +// Instantiate templates + +#define INSTANTIATE_SPARSE_TENSOR(IndexType) \ + template class ARROW_TEMPLATE_EXPORT SparseTensorImpl; \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&); \ + template ARROW_EXPORT SparseTensorImpl::SparseTensorImpl( \ + const NumericTensor&) + +INSTANTIATE_SPARSE_TENSOR(SparseCOOIndex); +INSTANTIATE_SPARSE_TENSOR(SparseCSRIndex); + +} // namespace arrow diff --git a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h new file mode 100644 index 0000000000000..c7693d2ec9579 --- /dev/null +++ b/cpp/src/arrow/sparse_tensor.h @@ -0,0 +1,211 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_SPARSE_TENSOR_H +#define ARROW_SPARSE_TENSOR_H + +#include +#include +#include + +#include "arrow/tensor.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// SparseIndex class + +/// \brief EXPERIMENTAL: Sparse tensor format enumeration +struct SparseTensorFormat { + enum type { COO, CSR }; +}; + +/// \brief EXPERIMENTAL: The base class for representing index of non-zero +/// values in sparse tensor +class ARROW_EXPORT SparseIndex { + public: + explicit SparseIndex(SparseTensorFormat::type format_id, int64_t non_zero_length) + : format_id_(format_id), non_zero_length_(non_zero_length) {} + + virtual ~SparseIndex() = default; + + SparseTensorFormat::type format_id() const { return format_id_; } + int64_t non_zero_length() const { return non_zero_length_; } + + virtual std::string ToString() const = 0; + + protected: + SparseTensorFormat::type format_id_; + int64_t non_zero_length_; +}; + +template +class SparseIndexBase : public SparseIndex { + public: + explicit SparseIndexBase(int64_t non_zero_length) + : SparseIndex(SparseIndexType::format_id, non_zero_length) {} +}; + +// ---------------------------------------------------------------------- +// SparseCOOIndex class + +/// \brief EXPERIMENTAL: The index data for COO sparse tensor +class ARROW_EXPORT SparseCOOIndex : public SparseIndexBase { + public: + using CoordsTensor = NumericTensor; + + static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::COO; + + // Constructor with a column-major NumericTensor + explicit SparseCOOIndex(const std::shared_ptr& coords); + + const std::shared_ptr& indices() const { return coords_; } + + std::string ToString() const override; + + bool Equals(const SparseCOOIndex& other) const { + return indices()->Equals(*other.indices()); + } + + protected: + std::shared_ptr coords_; +}; + +// ---------------------------------------------------------------------- +// SparseCSRIndex class + +/// \brief EXPERIMENTAL: The index data for CSR sparse matrix +class ARROW_EXPORT SparseCSRIndex : public SparseIndexBase { + public: + using IndexTensor = NumericTensor; + + static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSR; + + // Constructor with two index vectors + explicit SparseCSRIndex(const std::shared_ptr& indptr, + const std::shared_ptr& indices); + + const std::shared_ptr& indptr() const { return indptr_; } + const std::shared_ptr& indices() const { return indices_; } + + std::string ToString() const override; + + bool Equals(const SparseCSRIndex& other) const { + return indptr()->Equals(*other.indptr()) && indices()->Equals(*other.indices()); + } + + protected: + std::shared_ptr indptr_; + std::shared_ptr indices_; +}; + +// ---------------------------------------------------------------------- +// SparseTensor class + +/// \brief EXPERIMENTAL: The base class of sparse tensor container +class ARROW_EXPORT SparseTensor { + public: + virtual ~SparseTensor() = default; + + SparseTensorFormat::type format_id() const { return sparse_index_->format_id(); } + + std::shared_ptr type() const { 
return type_; } + std::shared_ptr data() const { return data_; } + + const uint8_t* raw_data() const { return data_->data(); } + uint8_t* raw_mutable_data() const { return data_->mutable_data(); } + + const std::vector& shape() const { return shape_; } + + const std::shared_ptr& sparse_index() const { return sparse_index_; } + + int ndim() const { return static_cast(shape_.size()); } + + const std::string& dim_name(int i) const; + + /// Total number of value cells in the sparse tensor + int64_t size() const; + + /// Return true if the underlying data buffer is mutable + bool is_mutable() const { return data_->is_mutable(); } + + /// Total number of non-zero cells in the sparse tensor + int64_t non_zero_length() const { + return sparse_index_ ? sparse_index_->non_zero_length() : 0; + } + + bool Equals(const SparseTensor& other) const; + + protected: + // Constructor with all attributes + SparseTensor(const std::shared_ptr& type, const std::shared_ptr& data, + const std::vector& shape, + const std::shared_ptr& sparse_index, + const std::vector& dim_names); + + std::shared_ptr type_; + std::shared_ptr data_; + std::vector shape_; + std::shared_ptr sparse_index_; + + /// These names are optional + std::vector dim_names_; +}; + +// ---------------------------------------------------------------------- +// SparseTensorImpl class + +/// \brief EXPERIMENTAL: Concrete sparse tensor implementation classes with sparse index +/// type +template +class ARROW_EXPORT SparseTensorImpl : public SparseTensor { + public: + virtual ~SparseTensorImpl() = default; + + // Constructor with all attributes + SparseTensorImpl(const std::shared_ptr& sparse_index, + const std::shared_ptr& type, + const std::shared_ptr& data, const std::vector& shape, + const std::vector& dim_names) + : SparseTensor(type, data, shape, sparse_index, dim_names) {} + + // Constructor for empty sparse tensor + SparseTensorImpl(const std::shared_ptr& type, + const std::vector& shape, + const std::vector& dim_names = {}); + + // Constructor with a dense numeric tensor + template + explicit SparseTensorImpl(const NumericTensor& tensor); + + // Constructor with a dense tensor + explicit SparseTensorImpl(const Tensor& tensor); + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(SparseTensorImpl); +}; + +/// \brief EXPERIMENTAL: Type alias for COO sparse tensor +using SparseTensorCOO = SparseTensorImpl; + +/// \brief EXPERIMENTAL: Type alias for CSR sparse matrix +using SparseTensorCSR = SparseTensorImpl; +using SparseMatrixCSR = SparseTensorImpl; + +} // namespace arrow + +#endif // ARROW_SPARSE_TENSOR_H diff --git a/cpp/src/arrow/tensor.h b/cpp/src/arrow/tensor.h index a9b5df81fa193..e81f0f0dff5d7 100644 --- a/cpp/src/arrow/tensor.h +++ b/cpp/src/arrow/tensor.h @@ -50,6 +50,9 @@ static inline bool is_tensor_supported(Type::type type_id) { return false; } +template +class SparseTensorImpl; + class ARROW_EXPORT Tensor { public: virtual ~Tensor() = default; @@ -110,6 +113,9 @@ class ARROW_EXPORT Tensor { /// These names are optional std::vector dim_names_; + template + friend class SparseTensorImpl; + private: ARROW_DISALLOW_COPY_AND_ASSIGN(Tensor); }; diff --git a/docs/source/format/IPC.rst b/docs/source/format/IPC.rst index 8cb74b87afcdc..62a1237436ae3 100644 --- a/docs/source/format/IPC.rst +++ b/docs/source/format/IPC.rst @@ -234,4 +234,28 @@ region) to be multiples of 64 bytes: :: +SparseTensor Message Format +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``SparseTensor`` message types provides another way to write a +multidimensional array of fixed-size 
values using Arrow's shared memory tools +in addition to ``Tensor``. ``SparseTensor`` is designed specifically for tensors +whose elements are mostly zero. Arrow implementations in general are not +required to implement this data format, just as with ``Tensor``. + +When writing a standalone encapsulated sparse tensor message, we use the format as +indicated above, but additionally align the starting offset of the metadata as +well as the starting offsets of the sparse index and the sparse tensor body +(if writing to a shared memory region) to be multiples of 64 bytes: + + + + + + + + +The contents of the sparse tensor index depend on which sparse +format is used. + .. _Flatbuffer: https://github.com/google/flatbuffers diff --git a/format/Message.fbs b/format/Message.fbs index 830718139d88c..e14fdca8f155c 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -87,7 +87,7 @@ table DictionaryBatch { /// which may include experimental metadata types. For maximum compatibility, /// it is best to send data using RecordBatch union MessageHeader { - Schema, DictionaryBatch, RecordBatch, Tensor + Schema, DictionaryBatch, RecordBatch, Tensor, SparseTensor } table Message { @@ -96,4 +96,4 @@ table Message { bodyLength: long; } -root_type Message; \ No newline at end of file +root_type Message; diff --git a/format/Tensor.fbs b/format/Tensor.fbs index 18b614c3bde62..e77b353a0f33f 100644 --- a/format/Tensor.fbs +++ b/format/Tensor.fbs @@ -23,6 +23,9 @@ include "Schema.fbs"; namespace org.apache.arrow.flatbuf; +/// ---------------------------------------------------------------------- +/// Data structures for dense tensors + /// Shape data for a single axis in a tensor table TensorDim { /// Length of dimension @@ -48,3 +51,96 @@ table Tensor { } root_type Tensor; + +/// ---------------------------------------------------------------------- +/// EXPERIMENTAL: Data structures for sparse tensors + +/// Coordinate format of sparse tensor index. +table SparseTensorIndexCOO { + /// COO's index list is represented as an NxM matrix, + /// where N is the number of non-zero values, + /// and M is the number of dimensions of a sparse tensor. + /// indicesBuffer stores the location and size of this index matrix. + /// The type of index value is long, so the stride for the index matrix is unnecessary. + /// + /// For example, let X be a 2x3x4x5 tensor, and it has the following 6 non-zero values: + /// + /// X[0, 1, 2, 0] := 1 + /// X[1, 1, 2, 3] := 2 + /// X[0, 2, 1, 0] := 3 + /// X[0, 1, 3, 0] := 4 + /// X[0, 1, 2, 1] := 5 + /// X[1, 2, 0, 4] := 6 + /// + /// In COO format, the index matrix of X is the following 6x4 matrix, + /// one coordinate per row: + /// + /// [[0, 1, 2, 0], + /// [0, 1, 2, 1], + /// [0, 1, 3, 0], + /// [0, 2, 1, 0], + /// [1, 1, 2, 3], + /// [1, 2, 0, 4]] + /// + /// Note that the indices are sorted in lexicographical order. + indicesBuffer: Buffer; +} + +/// Compressed Sparse Row format, which is matrix-specific. +table SparseMatrixIndexCSR { + /// indptrBuffer stores the location and size of the indptr array that + /// represents the range of the rows. + /// The i-th row spans from indptr[i] to indptr[i+1] in the data. + /// The length of this array is 1 + (the number of rows), and the type + /// of index value is long. + /// + /// For example, let X be the following 6x4 matrix: + /// + /// X := [[0, 1, 2, 0], + /// [0, 0, 3, 0], + /// [0, 4, 0, 5], + /// [0, 0, 0, 0], + /// [6, 0, 7, 8], + /// [0, 9, 0, 0]]. + /// + /// The array of non-zero values in X is: + /// + /// values(X) = [1, 2, 3, 4, 5, 6, 7, 8, 9].
+ /// + /// And the indptr of X is: + /// + /// indptr(X) = [0, 2, 3, 5, 5, 8, 9]. + indptrBuffer: Buffer; + + /// indicesBuffer stores the location and size of the array that + /// contains the column indices of the corresponding non-zero values. + /// The type of index value is long. + /// + /// For example, the indices of the above X are: + /// + /// indices(X) = [1, 2, 2, 1, 3, 0, 2, 3, 1]. + indicesBuffer: Buffer; +} + +union SparseTensorIndex { + SparseTensorIndexCOO, + SparseMatrixIndexCSR +} + +table SparseTensor { + /// The type of data contained in a value cell. + /// Currently only fixed-width value types are supported, + /// no strings or nested types. + type: Type; + + /// The dimensions of the tensor, optionally named. + shape: [TensorDim]; + + /// The number of non-zero values in a sparse tensor. + non_zero_length: long; + + /// Sparse tensor index + sparseIndex: SparseTensorIndex; + + /// The location and size of the tensor's data + data: Buffer; +} + +root_type SparseTensor; From 84b221dd864af8385ac626fc753875416e840ff0 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 9 Jan 2019 17:32:38 -0600 Subject: [PATCH 190/328] ARROW-4138: [Python] Fix setuptools_scm version customization on Windows Using single quotes for the regular expression doesn't work on Windows for some reason. Using double quotes fixes the issue. Author: Wes McKinney Closes #3362 from wesm/ARROW-4138 and squashes the following commits: ca3e56a9b Windows doesn't like single quotes passed to git describe --- python/setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/setup.py b/python/setup.py index 742851918c124..584c35a09ce5a 100755 --- a/python/setup.py +++ b/python/setup.py @@ -500,8 +500,8 @@ def parse_git(root, **kwargs): subprojects, e.g. apache-arrow-js-XXX tags.
""" from setuptools_scm.git import parse - kwargs['describe_command'] = \ - "git describe --dirty --tags --long --match 'apache-arrow-[0-9].*'" + kwargs['describe_command'] =\ + 'git describe --dirty --tags --long --match "apache-arrow-[0-9].*"' return parse(root, **kwargs) From a80c27e46814ded00216cc48f83e3fedbfb9cf4f Mon Sep 17 00:00:00 2001 From: Tim Paine Date: Wed, 9 Jan 2019 17:34:53 -0600 Subject: [PATCH 191/328] ARROW-4197: [C++] Better Emscripten support A few changes for better compatibility with the Emscripten compiler for WebAssembly - expose the `-ggdb` flag as an option (unsupported by emscripten) - the `-undefined dynamic_lookup` flag should be set on apple, but not when using emscripten - allow for `backtrace` to be turned off even if found (no `execinfo.h` available, from `util/logging.cc`) Author: Tim Paine Closes #3350 from timkpaine/emscripten and squashes the following commits: e3661ff52 restore default ggdb behavior, use better environment variable to detect emscripten and add a comment explaining this a0e91a77c expose backtrace at top level, re-add -g, make backtrace private scope again b8f0c8068 Merge branch 'master' into emscripten 5308f6b49 fix for emscripten --- cpp/CMakeLists.txt | 8 ++++++++ cpp/cmake_modules/BuildUtils.cmake | 5 ++++- cpp/cmake_modules/SetupCxxFlags.cmake | 16 ++++++++++++---- cpp/src/arrow/CMakeLists.txt | 2 +- 4 files changed, 25 insertions(+), 6 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3d2b698b8ff25..4232af3a12005 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -170,6 +170,10 @@ static|shared (default shared)") "If off, 'quiet' flags will be passed to linting tools" OFF) + option(ARROW_GGDB_DEBUG + "Pass -ggdb flag to debug builds" + ON) + #---------------------------------------------------------------------- # Project components to enable / disable building @@ -249,6 +253,10 @@ Note that this requires linking Boost statically" "Rely on Protocol Buffers shared libraries where relevant" OFF) + option(ARROW_WITH_BACKTRACE + "Build with backtrace support" + ON) + option(ARROW_USE_GLOG "Build libraries with glog support for pluggable logging" ON) diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 77db28e2aab28..cf2145b8a9166 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -182,11 +182,14 @@ function(ADD_ARROW_LIB LIB_NAME) ${ARG_PRIVATE_INCLUDES}) endif() - if(APPLE) + if(APPLE AND NOT DEFINED $ENV{EMSCRIPTEN}) # On OS X, you can avoid linking at library load time and instead # expecting that the symbols have been loaded separately. This happens # with libpython* where there can be conflicts between system Python and # the Python from a thirdparty distribution + # + # When running with the Emscripten Compiler, we need not worry about + # python, and the Emscripten Compiler does not support this option. set(ARG_SHARED_LINK_FLAGS "-undefined dynamic_lookup ${ARG_SHARED_LINK_FLAGS}") endif() diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 11608350c5f7a..796a68db0b878 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -340,11 +340,19 @@ endif() # Debug symbols are stripped for reduced binary size. 
Add # -DARROW_CXXFLAGS="-g" to add them if (NOT MSVC) - set(C_FLAGS_DEBUG "-ggdb -O0") - set(C_FLAGS_FASTDEBUG "-ggdb -O1") + if(ARROW_GGDB_DEBUG) + set(C_FLAGS_DEBUG "-ggdb -O0") + set(C_FLAGS_FASTDEBUG "-ggdb -O1") + set(CXX_FLAGS_DEBUG "-ggdb -O0") + set(CXX_FLAGS_FASTDEBUG "-ggdb -O1") + else() + set(C_FLAGS_DEBUG "-g -O0") + set(C_FLAGS_FASTDEBUG "-g -O1") + set(CXX_FLAGS_DEBUG "-g -O0") + set(CXX_FLAGS_FASTDEBUG "-g -O1") + endif() + set(C_FLAGS_RELEASE "-O3 -DNDEBUG") - set(CXX_FLAGS_DEBUG "-ggdb -O0") - set(CXX_FLAGS_FASTDEBUG "-ggdb -O1") set(CXX_FLAGS_RELEASE "-O3 -DNDEBUG") endif() diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 91bdce294c2d1..59f035792b80d 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -257,7 +257,7 @@ find_package(Backtrace) foreach(LIB_TARGET ${ARROW_LIBRARIES}) target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_EXPORTING) - if (Backtrace_FOUND) + if (Backtrace_FOUND AND ARROW_WITH_BACKTRACE) target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_WITH_BACKTRACE) endif() From 87ceb3ca904c9e9a839ff1cc724d3139c1958047 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 9 Jan 2019 16:49:04 -0700 Subject: [PATCH 192/328] ARROW-3959: [Rust] Add date/time data types This only adds the date/time types to the DataTypes enum as well as JSON serialization for meta data. This PR also implements `Schema::to_json` Author: Andy Grove Closes #3340 from andygrove/ARROW-3959 and squashes the following commits: 945498e merge from master and implement Hash for DateUnit, TimeUnit, etc. b05d6a0 Merge branch 'master' into ARROW-3959 312885e Timestamp now uses TimeUnit c3e092b Merge branch 'master' into ARROW-3959 d289cbb improve test 2d36927 update unit test d51bc82 fix mistake f4bbf10 Add date/time data types --- rust/arrow/src/datatypes.rs | 146 +++++++++++++++++++++++++++++++++++- 1 file changed, 145 insertions(+), 1 deletion(-) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index 05db6ce7d40b9..5008a97624a40 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -56,11 +56,36 @@ pub enum DataType { Float16, Float32, Float64, + Timestamp(TimeUnit), + Date(DateUnit), + Time32(TimeUnit), + Time64(TimeUnit), + Interval(IntervalUnit), Utf8, List(Box), Struct(Vec), } +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash)] +pub enum DateUnit { + Day, + Millisecond, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash)] +pub enum TimeUnit { + Second, + Millisecond, + Microsecond, + Nanosecond, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash)] +pub enum IntervalUnit { + YearMonth, + DayTime, +} + /// Contains the meta-data for a single relative type. /// /// The `Schema` object is an ordered collection of `Field` objects. 
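For cross-reference, the temporal types added to the Rust enum above have counterparts in the Arrow C++ library's type factories. A hedged sketch follows, assuming the arrow/type.h factory functions such as arrow::date32(), arrow::time32() and arrow::timestamp(); those names come from the C++ library, not from this Rust patch, so treat the snippet as an illustrative aside rather than part of the change.

```cpp
// Hedged sketch: C++ counterparts of the Rust DataType variants added above.
// Assumes an installed Arrow C++ library providing the arrow/type.h factories.
#include <iostream>
#include <memory>
#include <vector>

#include <arrow/type.h>

int main() {
  using arrow::TimeUnit;
  // Roughly: Date(DateUnit::Day), Date(DateUnit::Millisecond),
  // Time32(TimeUnit::Millisecond), Time64(TimeUnit::Microsecond),
  // Timestamp(TimeUnit::Nanosecond) in the Rust enum.
  std::vector<std::shared_ptr<arrow::DataType>> types = {
      arrow::date32(), arrow::date64(),
      arrow::time32(TimeUnit::MILLI), arrow::time64(TimeUnit::MICRO),
      arrow::timestamp(TimeUnit::NANO)};
  for (const auto& t : types) {
    std::cout << t->ToString() << "\n";  // e.g. "date32[day]", "time32[ms]"
  }
  return 0;
}
```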
@@ -175,6 +200,47 @@ impl DataType { "floatingpoint precision missing or invalid".to_string(), )), }, + Some(s) if s == "timestamp" => match map.get("unit") { + Some(p) if p == "SECOND" => Ok(DataType::Timestamp(TimeUnit::Second)), + Some(p) if p == "MILLISECOND" => Ok(DataType::Timestamp(TimeUnit::Millisecond)), + Some(p) if p == "MICROSECOND" => Ok(DataType::Timestamp(TimeUnit::Microsecond)), + Some(p) if p == "NANOSECOND" => Ok(DataType::Timestamp(TimeUnit::Nanosecond)), + _ => Err(ArrowError::ParseError( + "timestamp unit missing or invalid".to_string(), + )), + }, + Some(s) if s == "date" => match map.get("unit") { + Some(p) if p == "DAY" => Ok(DataType::Date(DateUnit::Day)), + Some(p) if p == "MILLISECOND" => Ok(DataType::Date(DateUnit::Millisecond)), + _ => Err(ArrowError::ParseError( + "date unit missing or invalid".to_string(), + )), + }, + Some(s) if s == "time" => { + let unit = match map.get("unit") { + Some(p) if p == "SECOND" => Ok(TimeUnit::Second), + Some(p) if p == "MILLISECOND" => Ok(TimeUnit::Millisecond), + Some(p) if p == "MICROSECOND" => Ok(TimeUnit::Microsecond), + Some(p) if p == "NANOSECOND" => Ok(TimeUnit::Nanosecond), + _ => Err(ArrowError::ParseError( + "time unit missing or invalid".to_string(), + )), + }; + match map.get("bitWidth") { + Some(p) if p == "32" => Ok(DataType::Time32(unit?)), + Some(p) if p == "64" => Ok(DataType::Time64(unit?)), + _ => Err(ArrowError::ParseError( + "time bitWidth missing or invalid".to_string(), + )), + } + } + Some(s) if s == "interval" => match map.get("unit") { + Some(p) if p == "DAY_TIME" => Ok(DataType::Interval(IntervalUnit::DayTime)), + Some(p) if p == "YEAR_MONTH" => Ok(DataType::Interval(IntervalUnit::YearMonth)), + _ => Err(ArrowError::ParseError( + "interval unit missing or invalid".to_string(), + )), + }, Some(s) if s == "int" => match map.get("isSigned") { Some(&Value::Bool(true)) => match map.get("bitWidth") { Some(&Value::Number(ref n)) => match n.as_u64() { @@ -231,7 +297,7 @@ impl DataType { /// Generate a JSON representation of the data type pub fn to_json(&self) -> Value { - match *self { + match self { DataType::Boolean => json!({"name": "bool"}), DataType::Int8 => json!({"name": "int", "bitWidth": 8, "isSigned": true}), DataType::Int16 => json!({"name": "int", "bitWidth": 16, "isSigned": true}), @@ -254,6 +320,32 @@ impl DataType { let child_json = t.to_json(); json!({ "name": "list", "children": child_json }) } + DataType::Time32(unit) => json!({"name": "time", "bitWidth": "32", "unit": match unit { + TimeUnit::Second => "SECOND", + TimeUnit::Millisecond => "MILLISECOND", + TimeUnit::Microsecond => "MICROSECOND", + TimeUnit::Nanosecond => "NANOSECOND", + }}), + DataType::Time64(unit) => json!({"name": "time", "bitWidth": "64", "unit": match unit { + TimeUnit::Second => "SECOND", + TimeUnit::Millisecond => "MILLISECOND", + TimeUnit::Microsecond => "MICROSECOND", + TimeUnit::Nanosecond => "NANOSECOND", + }}), + DataType::Date(unit) => json!({"name": "date", "unit": match unit { + DateUnit::Day => "DAY", + DateUnit::Millisecond => "MILLISECOND", + }}), + DataType::Timestamp(unit) => json!({"name": "timestamp", "unit": match unit { + TimeUnit::Second => "SECOND", + TimeUnit::Millisecond => "MILLISECOND", + TimeUnit::Microsecond => "MICROSECOND", + TimeUnit::Nanosecond => "NANOSECOND", + }}), + DataType::Interval(unit) => json!({"name": "interval", "unit": match unit { + IntervalUnit::YearMonth => "YEAR_MONTH", + IntervalUnit::DayTime => "DAY_TIME", + }}), } } } @@ -394,6 +486,13 @@ impl Schema { .enumerate()
.find(|&(_, c)| c.name == name) } + + /// Generate a JSON representation of the `Field` + pub fn to_json(&self) -> Value { + json!({ + "fields": self.fields.iter().map(|field| field.to_json()).collect::>(), + }) + } } impl fmt::Display for Schema { @@ -528,6 +627,51 @@ mod tests { assert_eq!(DataType::Int32, dt); } + #[test] + fn schema_json() { + let schema = Schema::new(vec![ + Field::new("c1", DataType::Utf8, false), + Field::new("c2", DataType::Date(DateUnit::Day), false), + Field::new("c3", DataType::Date(DateUnit::Millisecond), false), + Field::new("c7", DataType::Time32(TimeUnit::Second), false), + Field::new("c8", DataType::Time32(TimeUnit::Millisecond), false), + Field::new("c9", DataType::Time32(TimeUnit::Microsecond), false), + Field::new("c10", DataType::Time32(TimeUnit::Nanosecond), false), + Field::new("c11", DataType::Time64(TimeUnit::Second), false), + Field::new("c12", DataType::Time64(TimeUnit::Millisecond), false), + Field::new("c13", DataType::Time64(TimeUnit::Microsecond), false), + Field::new("c14", DataType::Time64(TimeUnit::Nanosecond), false), + Field::new("c15", DataType::Timestamp(TimeUnit::Second), false), + Field::new("c16", DataType::Timestamp(TimeUnit::Millisecond), false), + Field::new("c17", DataType::Timestamp(TimeUnit::Microsecond), false), + Field::new("c18", DataType::Timestamp(TimeUnit::Nanosecond), false), + Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false), + Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), + Field::new( + "c21", + DataType::Struct(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::UInt16, false), + ]), + false, + ), + ]); + + let json = schema.to_json().to_string(); + assert_eq!(json, "{\"fields\":[{\"name\":\"c1\",\"nullable\":false,\"type\":{\"name\":\"utf8\"}},{\"name\":\"c2\",\"nullable\":false,\"type\":{\"name\":\"date\",\"unit\":\"DAY\"}},{\"name\":\"c3\",\"nullable\":false,\"type\":{\"name\":\"date\",\"unit\":\"MILLISECOND\"}},{\"name\":\"c7\",\"nullable\":false,\"type\":{\"bitWidth\":\"32\",\"name\":\"time\",\"unit\":\"SECOND\"}},{\"name\":\"c8\",\"nullable\":false,\"type\":{\"bitWidth\":\"32\",\"name\":\"time\",\"unit\":\"MILLISECOND\"}},{\"name\":\"c9\",\"nullable\":false,\"type\":{\"bitWidth\":\"32\",\"name\":\"time\",\"unit\":\"MICROSECOND\"}},{\"name\":\"c10\",\"nullable\":false,\"type\":{\"bitWidth\":\"32\",\"name\":\"time\",\"unit\":\"NANOSECOND\"}},{\"name\":\"c11\",\"nullable\":false,\"type\":{\"bitWidth\":\"64\",\"name\":\"time\",\"unit\":\"SECOND\"}},{\"name\":\"c12\",\"nullable\":false,\"type\":{\"bitWidth\":\"64\",\"name\":\"time\",\"unit\":\"MILLISECOND\"}},{\"name\":\"c13\",\"nullable\":false,\"type\":{\"bitWidth\":\"64\",\"name\":\"time\",\"unit\":\"MICROSECOND\"}},{\"name\":\"c14\",\"nullable\":false,\"type\":{\"bitWidth\":\"64\",\"name\":\"time\",\"unit\":\"NANOSECOND\"}},{\"name\":\"c15\",\"nullable\":false,\"type\":{\"name\":\"timestamp\",\"unit\":\"SECOND\"}},{\"name\":\"c16\",\"nullable\":false,\"type\":{\"name\":\"timestamp\",\"unit\":\"MILLISECOND\"}},{\"name\":\"c17\",\"nullable\":false,\"type\":{\"name\":\"timestamp\",\"unit\":\"MICROSECOND\"}},{\"name\":\"c18\",\"nullable\":false,\"type\":{\"name\":\"timestamp\",\"unit\":\"NANOSECOND\"}},{\"name\":\"c19\",\"nullable\":false,\"type\":{\"name\":\"interval\",\"unit\":\"DAY_TIME\"}},{\"name\":\"c20\",\"nullable\":false,\"type\":{\"name\":\"interval\",\"unit\":\"YEAR_MONTH\"}},{\"name\":\"c21\",\"nullable\":false,\"type\":{\"fields\":[{\"name\":\"a\",\"nullable\":false,\"type\":{\"name\":\"
utf8\"}},{\"name\":\"b\",\"nullable\":false,\"type\":{\"bitWidth\":16,\"isSigned\":false,\"name\":\"int\"}}]}}]}"); + + // convert back to a schema + let value: Value = serde_json::from_str(&json).unwrap(); + let schema2 = DataType::from(&value).unwrap(); + + match schema2 { + DataType::Struct(fields) => { + assert_eq!(schema.fields().len(), fields.len()); + } + _ => panic!(), + } + } + #[test] fn create_schema_string() { let _person = Schema::new(vec![ From b29ecdce6e096618aeb110878367906b3b4b48a5 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 9 Jan 2019 19:30:29 -0600 Subject: [PATCH 193/328] ARROW-4177: [C++] Add ThreadPool and TaskGroup microbenchmarks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These benchmarks measure the number of tasks per second that can be executed depending on task cost and number of threads. It shows that for short tasks (< 10 µs), the scalability can be poor or even negative for very short tasks (< 1 µs). Also includes an optimization of ThreadedTaskGroup to avoid taking a lock on the hot path. Sample output (8-core AMD CPU, Ubuntu 18.04): ``` ----------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations ----------------------------------------------------------------------------------------------------------- BM_WorkloadCost/task_cost:1000/repeats:1 724 ns 724 ns 987295 1.31655M items/s BM_WorkloadCost/task_cost:10000/repeats:1 7331 ns 7330 ns 88982 133.23k items/s BM_WorkloadCost/task_cost:100000/repeats:1 73279 ns 73267 ns 9182 13.3288k items/s BM_ThreadPoolSpawn/threads:1/task_cost:1000/repeats:1/real_time 163842359 ns 41132762 ns 4 1.16414M items/s BM_ThreadPoolSpawn/threads:2/task_cost:1000/repeats:1/real_time 158705340 ns 103873994 ns 7 1.20182M items/s BM_ThreadPoolSpawn/threads:4/task_cost:1000/repeats:1/real_time 447998576 ns 370986805 ns 2 435.969k items/s BM_ThreadPoolSpawn/threads:8/task_cost:1000/repeats:1/real_time 674500180 ns 543967794 ns 1 289.568k items/s BM_ThreadPoolSpawn/threads:1/task_cost:10000/repeats:1/real_time 150078690 ns 4887868 ns 5 130.147k items/s BM_ThreadPoolSpawn/threads:2/task_cost:10000/repeats:1/real_time 84446492 ns 5402850 ns 8 231.297k items/s BM_ThreadPoolSpawn/threads:4/task_cost:10000/repeats:1/real_time 46164089 ns 4912818 ns 15 423.104k items/s BM_ThreadPoolSpawn/threads:8/task_cost:10000/repeats:1/real_time 22703512 ns 7074437 ns 31 860.317k items/s BM_ThreadPoolSpawn/threads:1/task_cost:100000/repeats:1/real_time 149733023 ns 515907 ns 4 13.0506k items/s BM_ThreadPoolSpawn/threads:2/task_cost:100000/repeats:1/real_time 81157195 ns 448091 ns 9 24.078k items/s BM_ThreadPoolSpawn/threads:4/task_cost:100000/repeats:1/real_time 45600571 ns 521094 ns 16 42.8526k items/s BM_ThreadPoolSpawn/threads:8/task_cost:100000/repeats:1/real_time 20867873 ns 359547 ns 32 93.6416k items/s BM_SerialTaskGroup/task_cost:1000/repeats:1/real_time 8366557 ns 8362959 ns 66 1.13998M items/s BM_SerialTaskGroup/task_cost:10000/repeats:1/real_time 8346475 ns 8345288 ns 75 117.12k items/s BM_SerialTaskGroup/task_cost:100000/repeats:1/real_time 8409974 ns 8408879 ns 80 11.7281k items/s BM_ThreadedTaskGroup/threads:1/task_cost:1000/repeats:1/real_time 12932016 ns 6283623 ns 60 755.227k items/s BM_ThreadedTaskGroup/threads:2/task_cost:1000/repeats:1/real_time 10622580 ns 8631946 ns 58 919.419k items/s BM_ThreadedTaskGroup/threads:4/task_cost:1000/repeats:1/real_time 25544253 ns 20347053 ns 25 382.34k items/s 
BM_ThreadedTaskGroup/threads:8/task_cost:1000/repeats:1/real_time 36215077 ns 29435817 ns 19 269.683k items/s BM_ThreadedTaskGroup/threads:1/task_cost:10000/repeats:1/real_time 9830469 ns 476288 ns 69 99.4397k items/s BM_ThreadedTaskGroup/threads:2/task_cost:10000/repeats:1/real_time 5446608 ns 546159 ns 116 179.477k items/s BM_ThreadedTaskGroup/threads:4/task_cost:10000/repeats:1/real_time 2858316 ns 666944 ns 247 341.998k items/s BM_ThreadedTaskGroup/threads:8/task_cost:10000/repeats:1/real_time 1544885 ns 526298 ns 452 632.759k items/s BM_ThreadedTaskGroup/threads:1/task_cost:100000/repeats:1/real_time 9506192 ns 53110 ns 69 10.3756k items/s BM_ThreadedTaskGroup/threads:2/task_cost:100000/repeats:1/real_time 5262119 ns 67967 ns 116 18.7439k items/s BM_ThreadedTaskGroup/threads:4/task_cost:100000/repeats:1/real_time 2710626 ns 82870 ns 252 36.3875k items/s BM_ThreadedTaskGroup/threads:8/task_cost:100000/repeats:1/real_time 1602394 ns 65768 ns 423 61.5534k items/s ``` Author: Antoine Pitrou Closes #3337 from pitrou/ARROW-4177-thread-pool-benchmark and squashes the following commits: 5a17ca0d8 Fix warnings 2ffce8376 Make ThreadedTaskGroup mostly lockless (apart from ThreadPool) b5260b955 ARROW-4177: Add ThreadPool and TaskGroup microbenchmarks --- cpp/src/arrow/util/CMakeLists.txt | 1 + cpp/src/arrow/util/task-group.cc | 60 ++++-- cpp/src/arrow/util/task-group.h | 2 +- cpp/src/arrow/util/thread-pool-benchmark.cc | 202 ++++++++++++++++++++ cpp/src/arrow/util/thread-pool.cc | 3 + 5 files changed, 246 insertions(+), 22 deletions(-) create mode 100644 cpp/src/arrow/util/thread-pool-benchmark.cc diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index b02dc113c5459..54ff5674fdfcc 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -73,5 +73,6 @@ ADD_ARROW_BENCHMARK(int-util-benchmark) ADD_ARROW_BENCHMARK(lazy-benchmark) ADD_ARROW_BENCHMARK(machine-benchmark) ADD_ARROW_BENCHMARK(number-parsing-benchmark) +ADD_ARROW_BENCHMARK(thread-pool-benchmark) ADD_ARROW_BENCHMARK(trie-benchmark) ADD_ARROW_BENCHMARK(utf8-util-benchmark) diff --git a/cpp/src/arrow/util/task-group.cc b/cpp/src/arrow/util/task-group.cc index 3ea63fc5ad80e..52c40bd46d1d3 100644 --- a/cpp/src/arrow/util/task-group.cc +++ b/cpp/src/arrow/util/task-group.cc @@ -17,9 +17,11 @@ #include "arrow/util/task-group.h" +#include #include #include #include +#include #include "arrow/util/logging.h" #include "arrow/util/thread-pool.h" @@ -41,6 +43,8 @@ class SerialTaskGroup : public TaskGroup { Status current_status() override { return status_; } + bool ok() override { return status_.ok(); } + Status Finish() override { if (!finished_) { finished_ = true; @@ -70,7 +74,8 @@ class SerialTaskGroup : public TaskGroup { class ThreadedTaskGroup : public TaskGroup { public: - explicit ThreadedTaskGroup(ThreadPool* thread_pool) : thread_pool_(thread_pool) {} + explicit ThreadedTaskGroup(ThreadPool* thread_pool) + : thread_pool_(thread_pool), nremaining_(0), ok_(true) {} ~ThreadedTaskGroup() override { // Make sure all pending tasks are finished, so that dangling references @@ -79,22 +84,19 @@ class ThreadedTaskGroup : public TaskGroup { } void AppendReal(std::function task) override { - std::lock_guard lock(mutex_); - DCHECK(!finished_); - - if (status_.ok()) { - ++nremaining_; - status_ = thread_pool_->Spawn([&, task]() { - std::unique_lock lock(mutex_); - if (status_.ok()) { - lock.unlock(); + // The hot path is unlocked thanks to atomics + // Only if an error occurs is the lock 
taken + if (ok_.load(std::memory_order_acquire)) { + nremaining_.fetch_add(1, std::memory_order_acquire); + Status st = thread_pool_->Spawn([this, task]() { + if (ok_.load(std::memory_order_acquire)) { // XXX what about exceptions? Status st = task(); - lock.lock(); - status_ &= st; + UpdateStatus(std::move(st)); } OneTaskDone(); }); + UpdateStatus(std::move(st)); } } @@ -103,15 +105,15 @@ class ThreadedTaskGroup : public TaskGroup { return status_; } + bool ok() override { return ok_.load(); } + Status Finish() override { std::unique_lock lock(mutex_); if (!finished_) { - cv_.wait(lock, [&]() { return nremaining_ == 0; }); + cv_.wait(lock, [&]() { return nremaining_.load() == 0; }); // Current tasks may start other tasks, so only set this when done finished_ = true; if (parent_) { - // Need to lock parent - std::lock_guard parent_lock(parent_->mutex_); parent_->OneTaskDone(); } } @@ -124,26 +126,42 @@ class ThreadedTaskGroup : public TaskGroup { std::lock_guard lock(mutex_); auto child = new ThreadedTaskGroup(thread_pool_); child->parent_ = this; - nremaining_++; + nremaining_.fetch_add(1, std::memory_order_acquire); return std::shared_ptr(child); } protected: + void UpdateStatus(Status&& st) { + // Must be called unlocked, only locks on error + if (ARROW_PREDICT_FALSE(!st.ok())) { + std::lock_guard lock(mutex_); + ok_.store(false, std::memory_order_release); + status_ &= std::move(st); + } + } + void OneTaskDone() { - // We are locked - --nremaining_; - DCHECK_GE(nremaining_, 0); - if (nremaining_ == 0) { + // Can be called unlocked thanks to atomics + auto nremaining = nremaining_.fetch_sub(1, std::memory_order_release) - 1; + DCHECK_GE(nremaining, 0); + if (nremaining == 0) { + // Take the lock so that ~ThreadedTaskGroup cannot destroy cv + // before cv.notify_one() has returned + std::unique_lock lock(mutex_); cv_.notify_one(); } } + // These members are usable unlocked ThreadPool* thread_pool_; + std::atomic nremaining_; + std::atomic ok_; + + // These members use locking std::mutex mutex_; std::condition_variable cv_; Status status_; bool finished_ = false; - int32_t nremaining_ = 0; ThreadedTaskGroup* parent_ = nullptr; }; diff --git a/cpp/src/arrow/util/task-group.h b/cpp/src/arrow/util/task-group.h index 450b6da5884fc..390d9476e59bd 100644 --- a/cpp/src/arrow/util/task-group.h +++ b/cpp/src/arrow/util/task-group.h @@ -59,7 +59,7 @@ class ARROW_EXPORT TaskGroup { virtual Status current_status() = 0; /// Whether some tasks have already failed. Non-blocking , useful for stopping early. - bool ok() { return current_status().ok(); } + virtual bool ok() = 0; /// How many tasks can typically be executed in parallel. /// This is only a hint, useful for testing or debugging. diff --git a/cpp/src/arrow/util/thread-pool-benchmark.cc b/cpp/src/arrow/util/thread-pool-benchmark.cc new file mode 100644 index 0000000000000..8d855d3acba09 --- /dev/null +++ b/cpp/src/arrow/util/thread-pool-benchmark.cc @@ -0,0 +1,202 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "benchmark/benchmark.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/status.h" +#include "arrow/test-util.h" +#include "arrow/util/task-group.h" +#include "arrow/util/thread-pool.h" + +namespace arrow { +namespace internal { + +struct Workload { + explicit Workload(int32_t size) : size_(size), data_(kDataSize) { + std::default_random_engine gen(42); + std::uniform_int_distribution dist(0, std::numeric_limits::max()); + std::generate(data_.begin(), data_.end(), [&]() { return dist(gen); }); + } + + void operator()(); + + private: + static constexpr int32_t kDataSize = 32; + + int32_t size_; + std::vector data_; +}; + +void Workload::operator()() { + uint64_t result = 0; + for (int32_t i = 0; i < size_ / kDataSize; ++i) { + for (const auto v : data_) { + result = (result << (v % 64)) - v; + } + } + benchmark::DoNotOptimize(result); +} + +struct Task { + explicit Task(int32_t size) : workload_(size) {} + + Status operator()() { + workload_(); + return Status::OK(); + } + + private: + Workload workload_; +}; + +// This benchmark simply provides a baseline indicating the raw cost of our workload +// depending on the workload size. Number of items / second in this (serial) +// benchmark can be compared to the numbers obtained in BM_ThreadPoolSpawn. +static void BM_WorkloadCost(benchmark::State& state) { + const auto workload_size = static_cast(state.range(0)); + + Workload workload(workload_size); + for (auto _ : state) { + workload(); + } + + state.SetItemsProcessed(state.iterations()); +} + +// Benchmark ThreadPool::Spawn +static void BM_ThreadPoolSpawn(benchmark::State& state) { + const auto nthreads = static_cast(state.range(0)); + const auto workload_size = static_cast(state.range(1)); + + Workload workload(workload_size); + + // Spawn enough tasks to make the pool start up overhead negligible + const int32_t nspawns = 200000000 / workload_size + 1; + + for (auto _ : state) { + state.PauseTiming(); + std::shared_ptr pool; + ABORT_NOT_OK(ThreadPool::Make(nthreads, &pool)); + state.ResumeTiming(); + + for (int32_t i = 0; i < nspawns; ++i) { + // Pass the task by reference to avoid copying it around + ABORT_NOT_OK(pool->Spawn(std::ref(workload))); + } + + // Wait for all tasks to finish + ABORT_NOT_OK(pool->Shutdown(true /* wait */)); + state.PauseTiming(); + pool.reset(); + state.ResumeTiming(); + } + state.SetItemsProcessed(state.iterations() * nspawns); +} + +// Benchmark serial TaskGroup +static void BM_SerialTaskGroup(benchmark::State& state) { + const auto workload_size = static_cast(state.range(0)); + + Task task(workload_size); + + const int32_t nspawns = 10000000 / workload_size + 1; + + for (auto _ : state) { + auto task_group = TaskGroup::MakeSerial(); + for (int32_t i = 0; i < nspawns; ++i) { + // Pass the task by reference to avoid copying it around + task_group->Append(std::ref(task)); + } + ABORT_NOT_OK(task_group->Finish()); + } + state.SetItemsProcessed(state.iterations() * nspawns); +} + +// Benchmark threaded TaskGroup +static void BM_ThreadedTaskGroup(benchmark::State& state) { + 
const auto nthreads = static_cast(state.range(0)); + const auto workload_size = static_cast(state.range(1)); + + std::shared_ptr pool; + ABORT_NOT_OK(ThreadPool::Make(nthreads, &pool)); + + Task task(workload_size); + + const int32_t nspawns = 10000000 / workload_size + 1; + + for (auto _ : state) { + auto task_group = TaskGroup::MakeThreaded(pool.get()); + for (int32_t i = 0; i < nspawns; ++i) { + // Pass the task by reference to avoid copying it around + task_group->Append(std::ref(task)); + } + ABORT_NOT_OK(task_group->Finish()); + } + ABORT_NOT_OK(pool->Shutdown(true /* wait */)); + + state.SetItemsProcessed(state.iterations() * nspawns); +} + +static const int32_t kWorkloadSizes[] = {1000, 10000, 100000}; + +static void WorkloadCost_Customize(benchmark::internal::Benchmark* b) { + for (const auto w : kWorkloadSizes) { + b->Args({w}); + } + b->ArgNames({"task_cost"}); +} + +static void ThreadPoolSpawn_Customize(benchmark::internal::Benchmark* b) { + for (const int32_t w : kWorkloadSizes) { + for (const int nthreads : {1, 2, 4, 8}) { + b->Args({nthreads, w}); + } + } + b->ArgNames({"threads", "task_cost"}); +} + +static const int kRepetitions = 1; + +BENCHMARK(BM_WorkloadCost)->Repetitions(kRepetitions)->Apply(WorkloadCost_Customize); + +BENCHMARK(BM_ThreadPoolSpawn) + ->UseRealTime() + ->Repetitions(kRepetitions) + ->Apply(ThreadPoolSpawn_Customize); + +BENCHMARK(BM_SerialTaskGroup) + ->UseRealTime() + ->Repetitions(kRepetitions) + ->Apply(WorkloadCost_Customize); + +BENCHMARK(BM_ThreadedTaskGroup) + ->UseRealTime() + ->Repetitions(kRepetitions) + ->Apply(ThreadPoolSpawn_Customize); + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/util/thread-pool.cc b/cpp/src/arrow/util/thread-pool.cc index 751b264b42f59..17ad9c4972fa2 100644 --- a/cpp/src/arrow/util/thread-pool.cc +++ b/cpp/src/arrow/util/thread-pool.cc @@ -34,6 +34,9 @@ namespace internal { struct ThreadPool::State { State() : desired_capacity_(0), please_shutdown_(false), quick_shutdown_(false) {} + // NOTE: in case locking becomes too expensive, we can investigate lock-free FIFOs + // such as https://github.com/cameron314/concurrentqueue + std::mutex mutex_; std::condition_variable cv_; std::condition_variable cv_shutdown_; From db29723f661174eefd04077666347a9bbaca5be1 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Thu, 10 Jan 2019 11:49:29 +0900 Subject: [PATCH 194/328] ARROW-4215: [GLib] Fix typos in documentation This solves the following warnings: arrow-glib/basic-data-type.cpp:1070: warning: multi-line since docs found arrow-glib/decimal128.cpp:37: warning: Section decimal is not defined in the arrow-glib-sections.txt file. Author: Kouhei Sutou Closes #3361 from kou/glib-fix-document and squashes the following commits: edd43c8a Fix typos in documentation --- c_glib/arrow-glib/basic-data-type.cpp | 2 +- c_glib/arrow-glib/decimal128.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/c_glib/arrow-glib/basic-data-type.cpp b/c_glib/arrow-glib/basic-data-type.cpp index 2a599963ee3aa..861bbaf388801 100644 --- a/c_glib/arrow-glib/basic-data-type.cpp +++ b/c_glib/arrow-glib/basic-data-type.cpp @@ -1065,7 +1065,7 @@ garrow_decimal_data_type_class_init(GArrowDecimalDataTypeClass *klass) * * Since: 0.10.0 * - * Deprecate: 0.12.0: + * Deprecated: 0.12.0: * Use garrow_decimal128_data_type_new() instead. 
*/ GArrowDecimalDataType * diff --git a/c_glib/arrow-glib/decimal128.cpp b/c_glib/arrow-glib/decimal128.cpp index a49dba580ee79..32bdf5fcae6e4 100644 --- a/c_glib/arrow-glib/decimal128.cpp +++ b/c_glib/arrow-glib/decimal128.cpp @@ -27,8 +27,8 @@ G_BEGIN_DECLS /** - * SECTION: decimal - * @title: Decimal classes + * SECTION: decimal128 + * @title: 128-bit decimal class * @include: arrow-glib/arrow-glib.h * * #GArrowDecimal128 is a 128-bit decimal class. From 3b61349b3c16d43003e493c7e2aec9348e7e7343 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Wed, 9 Jan 2019 22:00:12 -0600 Subject: [PATCH 195/328] ARROW-2968: [R] Multi-threaded conversion from Arrow table to R data.frame The `as_tibble()` methods for `arrow::RecordBatch` and `arrow::Table` gained a `use_threads` argument. When set to `TRUE` columns of a record batch or table are converted to R vectors in parallel. We cannot allocate R data structures in parallel (including scalar strings), so it goes like this: ``` for each column: - allocate the R vector host for the array - if that can be done in parallel, fill the R vector with data from the array fill serially all columns that could not be filled in parallel wait for all columns to be full ``` This is I believe better (although perhaps harder to explain) than - allocate all the vectors - fill them in parallel Because we don't have to wait for all the vectors to be allocated to start filling them. I believe the python does that, in `DataFrameBlockCreator::Convert` ``` RETURN_NOT_OK(CreateBlocks()); RETURN_NOT_OK(WriteTableToBlocks()); ``` I've had to split the implementation of `Array__as_vector` into two steps: - Allocate: this must happen on the main thread, or alternatively would need to mutex R - Ingest: For most array types, this can be done in parallel Author: Romain Francois Closes #3332 from romainfrancois/2968/threads and squashes the following commits: 8261f2907 sprinkle use_threads in functions that call as_tibble() 3205de2d8 lint 590baf5a6 using string_view cd0dd343e no need for checkBuffers 29546cd5d Some more refactoring of the Converters 5557b7974 refactor the Converter api, so that all Converters are implementations of the base class Converter. e2ed26b78 lint 2a5815e03 moving parallel_ingest() to a static method of the Converter classes 2613d4ec4 null_count already local variable 62a842054 + to_r_index lambda, with comment about why +1 52c725fc8 default_value() marked constexpr 11e82e769 lint d22b9c551 parallel version of Table__to_dataframe 2455bd057 parallel version of RecordBatch__to_dataframe 380d3a5bc simplify ArrayVector__as_vector. 
85881a3e2 simplify ArrayVector_To_Vector 7074b36e9 reinstate Converter_Timestamp so that ArrayVector__as_vector can be simplified cf7e76bae + parallel_ingest() to indicate if ingest for a givne converter can be doine in parallel baaaefe1b Re"work Converter api e650b7934 + arrow::r::inspect(SEXP) for debugging a335dfdfc Factor out Array -> R vector code in separate file 1212e28a9 .Ingest() return an Invalid status instead of throwing an exception 39bf76403 .Ingest() return a Status instead of void f68b79376 replaced DictionaryArrays_to_Vector and Converter_Dictionary_Int32Indices by Converter_Dictionary d25a0e6b5 replace Date32ArrayVector_to_Vector by Converter_Date32 85e48c0c7 lint 18b921e6f + Get/Set ThreadPoolCapacity --- r/NAMESPACE | 2 + r/R/RcppExports.R | 57 +- r/R/RecordBatch.R | 4 +- r/R/Table.R | 4 +- r/R/feather.R | 5 +- r/R/parquet.R | 5 +- r/R/read_table.R | 4 +- r/man/GetCpuThreadPoolCapacity.Rd | 18 + r/man/SetCpuThreadPoolCapacity.Rd | 17 + r/man/read_feather.Rd | 5 +- r/man/read_parquet.Rd | 4 +- r/man/read_table.Rd | 4 +- r/src/RcppExports.cpp | 120 +-- r/src/array.cpp | 496 ------------- r/src/array__to_vector.cpp | 697 ++++++++++++++++++ r/src/arrow_types.h | 12 +- r/src/recordbatch.cpp | 16 - r/src/symbols.cpp | 9 + r/src/table.cpp | 17 - r/src/threadpool.cpp | 44 ++ r/tests/testthat/test-RecordBatch.R | 1 - r/tests/testthat/test-cputhreadpoolcapacity.R | 26 + 22 files changed, 959 insertions(+), 608 deletions(-) create mode 100644 r/man/GetCpuThreadPoolCapacity.Rd create mode 100644 r/man/SetCpuThreadPoolCapacity.Rd create mode 100644 r/src/array__to_vector.cpp create mode 100644 r/src/threadpool.cpp create mode 100644 r/tests/testthat/test-cputhreadpoolcapacity.R diff --git a/r/NAMESPACE b/r/NAMESPACE index f8f6384dce1f8..7fd76c7c4fb7e 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -80,6 +80,7 @@ export(FeatherTableWriter) export(FileMode) export(FileOutputStream) export(FixedSizeBufferWriter) +export(GetCpuThreadPoolCapacity) export(MessageReader) export(MessageType) export(MockOutputStream) @@ -88,6 +89,7 @@ export(RecordBatchFileReader) export(RecordBatchFileWriter) export(RecordBatchStreamReader) export(RecordBatchStreamWriter) +export(SetCpuThreadPoolCapacity) export(StatusCode) export(TimeUnit) export(Type) diff --git a/r/R/RcppExports.R b/r/R/RcppExports.R index c6fe8719f4e89..51ed4ea6b5a2a 100644 --- a/r/R/RcppExports.R +++ b/r/R/RcppExports.R @@ -5,14 +5,6 @@ Array__from_vector <- function(x) { .Call(`_arrow_Array__from_vector`, x) } -Array__as_vector <- function(array) { - .Call(`_arrow_Array__as_vector`, array) -} - -ChunkedArray__as_vector <- function(chunked_array) { - .Call(`_arrow_ChunkedArray__as_vector`, chunked_array) -} - Array__Slice1 <- function(array, offset) { .Call(`_arrow_Array__Slice1`, array, offset) } @@ -81,6 +73,22 @@ DictionaryArray__dictionary <- function(array) { .Call(`_arrow_DictionaryArray__dictionary`, array) } +Array__as_vector <- function(array) { + .Call(`_arrow_Array__as_vector`, array) +} + +ChunkedArray__as_vector <- function(chunked_array) { + .Call(`_arrow_ChunkedArray__as_vector`, chunked_array) +} + +RecordBatch__to_dataframe <- function(batch, use_threads) { + .Call(`_arrow_RecordBatch__to_dataframe`, batch, use_threads) +} + +Table__to_dataframe <- function(table, use_threads) { + .Call(`_arrow_Table__to_dataframe`, table, use_threads) +} + ArrayData__get_type <- function(x) { .Call(`_arrow_ArrayData__get_type`, x) } @@ -661,10 +669,6 @@ RecordBatch__column <- function(batch, i) { .Call(`_arrow_RecordBatch__column`, 
batch, i) } -RecordBatch__to_dataframe <- function(batch) { - .Call(`_arrow_RecordBatch__to_dataframe`, batch) -} - RecordBatch__from_dataframe <- function(tbl) { .Call(`_arrow_RecordBatch__from_dataframe`, tbl) } @@ -781,10 +785,6 @@ Table__schema <- function(x) { .Call(`_arrow_Table__schema`, x) } -Table__to_dataframe <- function(table) { - .Call(`_arrow_Table__to_dataframe`, table) -} - Table__column <- function(table, i) { .Call(`_arrow_Table__column`, table, i) } @@ -793,3 +793,28 @@ Table__columns <- function(table) { .Call(`_arrow_Table__columns`, table) } +#' Get the capacity of the global thread pool +#' +#' @return the number of worker threads in the thread pool to which +#' Arrow dispatches various CPU-bound tasks. This is an ideal number, +#' not necessarily the exact number of threads at a given point in time. +#' +#' You can change this number using [SetCpuThreadPoolCapacity()]. +#' +#' @export +GetCpuThreadPoolCapacity <- function() { + .Call(`_arrow_GetCpuThreadPoolCapacity`) +} + +#' Set the capacity of the global thread pool +#' +#' @param threads the number of worker threads int the thread pool to which +#' Arrow dispatches various CPU-bound tasks. +#' +#' The current number is returned by [GetCpuThreadPoolCapacity()] +#' +#' @export +SetCpuThreadPoolCapacity <- function(threads) { + invisible(.Call(`_arrow_SetCpuThreadPoolCapacity`, threads)) +} + diff --git a/r/R/RecordBatch.R b/r/R/RecordBatch.R index fed10abee769c..9872117452e85 100644 --- a/r/R/RecordBatch.R +++ b/r/R/RecordBatch.R @@ -80,8 +80,8 @@ } #' @export -`as_tibble.arrow::RecordBatch` <- function(x, ...){ - RecordBatch__to_dataframe(x) +`as_tibble.arrow::RecordBatch` <- function(x, use_threads = TRUE, ...){ + RecordBatch__to_dataframe(x, use_threads = use_threads) } #' Create an [arrow::RecordBatch][arrow__RecordBatch] from a data frame diff --git a/r/R/Table.R b/r/R/Table.R index 8972634d59f1d..c39fce246af16 100644 --- a/r/R/Table.R +++ b/r/R/Table.R @@ -61,6 +61,6 @@ table <- function(.data){ } #' @export -`as_tibble.arrow::Table` <- function(x, ...){ - Table__to_dataframe(x) +`as_tibble.arrow::Table` <- function(x, use_threads = TRUE, ...){ + Table__to_dataframe(x, use_threads = use_threads) } diff --git a/r/R/feather.R b/r/R/feather.R index 064652145c8e4..eaeea4caefbaa 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -154,15 +154,16 @@ FeatherTableReader.fs_path <- function(file, mmap = TRUE, ...) { #' @param file a arrow::ipc::feather::TableReader or whatever the [FeatherTableReader()] function can handle #' @param columns names if the columns to read. The default `NULL` means all columns #' @param as_tibble should the [arrow::Table][arrow__Table] be converted to a tibble. +#' @param use_threads Use threads when converting to a tibble. #' @param ... additional parameters #' #' @return a data frame if `as_tibble` is `TRUE` (the default), or a [arrow::Table][arrow__Table] otherwise #' #' @export -read_feather <- function(file, columns = NULL, as_tibble = TRUE, ...){ +read_feather <- function(file, columns = NULL, as_tibble = TRUE, use_threads = TRUE, ...){ out <- FeatherTableReader(file, ...)$Read(columns) if (isTRUE(as_tibble)) { - out <- as_tibble(out) + out <- as_tibble(out, use_threads = use_threads) } out } diff --git a/r/R/parquet.R b/r/R/parquet.R index 141da7bd04b2c..6a393e2c880df 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -19,15 +19,16 @@ #' #' @param file a file path #' @param as_tibble should the [arrow::Table][arrow__Table] be converted to a tibble. 
+#' @param use_threads Use threads when converting to a tibble, only relevant if `as_tibble` is `TRUE` #' @param ... currently ignored #' #' @return a [arrow::Table][arrow__Table], or a data frame if `as_tibble` is `TRUE`. #' #' @export -read_parquet <- function(file, as_tibble = TRUE, ...) { +read_parquet <- function(file, as_tibble = TRUE, use_threads = TRUE, ...) { tab <- shared_ptr(`arrow::Table`, read_parquet_file(f)) if (isTRUE(as_tibble)) { - tab <- as_tibble(tab) + tab <- as_tibble(tab, use_threads = use_threads) } tab } diff --git a/r/R/read_table.R b/r/R/read_table.R index a540a42173556..260c50f12374f 100644 --- a/r/R/read_table.R +++ b/r/R/read_table.R @@ -33,6 +33,8 @@ #' #' - a raw vector: read using a [arrow::ipc::RecordBatchStreamReader][arrow__ipc__RecordBatchStreamReader] #' +#' @param use_threads Use threads when converting to a tibble +#' #' @return #' #' - `read_table` returns an [arrow::Table][arrow__Table] @@ -81,6 +83,6 @@ read_table.fs_path <- function(stream) { #' @rdname read_table #' @export -read_arrow <- function(stream){ +read_arrow <- function(stream, use_threads = TRUE){ as_tibble(read_table(stream)) } diff --git a/r/man/GetCpuThreadPoolCapacity.Rd b/r/man/GetCpuThreadPoolCapacity.Rd new file mode 100644 index 0000000000000..8bf0a6fc89424 --- /dev/null +++ b/r/man/GetCpuThreadPoolCapacity.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{GetCpuThreadPoolCapacity} +\alias{GetCpuThreadPoolCapacity} +\title{Get the capacity of the global thread pool} +\usage{ +GetCpuThreadPoolCapacity() +} +\value{ +the number of worker threads in the thread pool to which +Arrow dispatches various CPU-bound tasks. This is an ideal number, +not necessarily the exact number of threads at a given point in time. + +You can change this number using \code{\link[=SetCpuThreadPoolCapacity]{SetCpuThreadPoolCapacity()}}. +} +\description{ +Get the capacity of the global thread pool +} diff --git a/r/man/SetCpuThreadPoolCapacity.Rd b/r/man/SetCpuThreadPoolCapacity.Rd new file mode 100644 index 0000000000000..3a06dd5d6a202 --- /dev/null +++ b/r/man/SetCpuThreadPoolCapacity.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{SetCpuThreadPoolCapacity} +\alias{SetCpuThreadPoolCapacity} +\title{Set the capacity of the global thread pool} +\usage{ +SetCpuThreadPoolCapacity(threads) +} +\arguments{ +\item{threads}{the number of worker threads int the thread pool to which +Arrow dispatches various CPU-bound tasks. + +The current number is returned by \code{\link[=GetCpuThreadPoolCapacity]{GetCpuThreadPoolCapacity()}}} +} +\description{ +Set the capacity of the global thread pool +} diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd index 31fd36ab65a26..4509c7d334dbf 100644 --- a/r/man/read_feather.Rd +++ b/r/man/read_feather.Rd @@ -4,7 +4,8 @@ \alias{read_feather} \title{Read a feather file} \usage{ -read_feather(file, columns = NULL, as_tibble = TRUE, ...) +read_feather(file, columns = NULL, as_tibble = TRUE, + use_threads = TRUE, ...) } \arguments{ \item{file}{a arrow::ipc::feather::TableReader or whatever the \code{\link[=FeatherTableReader]{FeatherTableReader()}} function can handle} @@ -13,6 +14,8 @@ read_feather(file, columns = NULL, as_tibble = TRUE, ...) 
\item{as_tibble}{should the \link[=arrow__Table]{arrow::Table} be converted to a tibble.} +\item{use_threads}{Use threads when converting to a tibble.} + \item{...}{additional parameters} } \value{ diff --git a/r/man/read_parquet.Rd b/r/man/read_parquet.Rd index c29e18bca5baf..a4f294bdd67ed 100644 --- a/r/man/read_parquet.Rd +++ b/r/man/read_parquet.Rd @@ -4,13 +4,15 @@ \alias{read_parquet} \title{Read parquet file from disk} \usage{ -read_parquet(file, as_tibble = TRUE, ...) +read_parquet(file, as_tibble = TRUE, use_threads = TRUE, ...) } \arguments{ \item{file}{a file path} \item{as_tibble}{should the \link[=arrow__Table]{arrow::Table} be converted to a tibble.} +\item{use_threads}{Use threads when converting to a tibble, only relevant if \code{as_tibble} is \code{TRUE}} + \item{...}{currently ignored} } \value{ diff --git a/r/man/read_table.Rd b/r/man/read_table.Rd index 3231b26da267b..356ec5e740d01 100644 --- a/r/man/read_table.Rd +++ b/r/man/read_table.Rd @@ -7,7 +7,7 @@ \usage{ read_table(stream) -read_arrow(stream) +read_arrow(stream, use_threads = TRUE) } \arguments{ \item{stream}{stream. @@ -23,6 +23,8 @@ binary file format, and uses a \link[=arrow__ipc__RecordBatchFileReader]{arrow:: to process it. \item a raw vector: read using a \link[=arrow__ipc__RecordBatchStreamReader]{arrow::ipc::RecordBatchStreamReader} }} + +\item{use_threads}{Use threads when converting to a tibble} } \value{ \itemize{ diff --git a/r/src/RcppExports.cpp b/r/src/RcppExports.cpp index 1e8fed1867655..a31c401efa5f5 100644 --- a/r/src/RcppExports.cpp +++ b/r/src/RcppExports.cpp @@ -17,28 +17,6 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// Array__as_vector -SEXP Array__as_vector(const std::shared_ptr& array); -RcppExport SEXP _arrow_Array__as_vector(SEXP arraySEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const std::shared_ptr& >::type array(arraySEXP); - rcpp_result_gen = Rcpp::wrap(Array__as_vector(array)); - return rcpp_result_gen; -END_RCPP -} -// ChunkedArray__as_vector -SEXP ChunkedArray__as_vector(const std::shared_ptr& chunked_array); -RcppExport SEXP _arrow_ChunkedArray__as_vector(SEXP chunked_arraySEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const std::shared_ptr& >::type chunked_array(chunked_arraySEXP); - rcpp_result_gen = Rcpp::wrap(ChunkedArray__as_vector(chunked_array)); - return rcpp_result_gen; -END_RCPP -} // Array__Slice1 std::shared_ptr Array__Slice1(const std::shared_ptr& array, int offset); RcppExport SEXP _arrow_Array__Slice1(SEXP arraySEXP, SEXP offsetSEXP) { @@ -237,6 +215,52 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// Array__as_vector +SEXP Array__as_vector(const std::shared_ptr& array); +RcppExport SEXP _arrow_Array__as_vector(SEXP arraySEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::shared_ptr& >::type array(arraySEXP); + rcpp_result_gen = Rcpp::wrap(Array__as_vector(array)); + return rcpp_result_gen; +END_RCPP +} +// ChunkedArray__as_vector +SEXP ChunkedArray__as_vector(const std::shared_ptr& chunked_array); +RcppExport SEXP _arrow_ChunkedArray__as_vector(SEXP chunked_arraySEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::shared_ptr& >::type chunked_array(chunked_arraySEXP); + rcpp_result_gen = Rcpp::wrap(ChunkedArray__as_vector(chunked_array)); + 
return rcpp_result_gen; +END_RCPP +} +// RecordBatch__to_dataframe +List RecordBatch__to_dataframe(const std::shared_ptr& batch, bool use_threads); +RcppExport SEXP _arrow_RecordBatch__to_dataframe(SEXP batchSEXP, SEXP use_threadsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::shared_ptr& >::type batch(batchSEXP); + Rcpp::traits::input_parameter< bool >::type use_threads(use_threadsSEXP); + rcpp_result_gen = Rcpp::wrap(RecordBatch__to_dataframe(batch, use_threads)); + return rcpp_result_gen; +END_RCPP +} +// Table__to_dataframe +List Table__to_dataframe(const std::shared_ptr& table, bool use_threads); +RcppExport SEXP _arrow_Table__to_dataframe(SEXP tableSEXP, SEXP use_threadsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::shared_ptr& >::type table(tableSEXP); + Rcpp::traits::input_parameter< bool >::type use_threads(use_threadsSEXP); + rcpp_result_gen = Rcpp::wrap(Table__to_dataframe(table, use_threads)); + return rcpp_result_gen; +END_RCPP +} // ArrayData__get_type std::shared_ptr ArrayData__get_type(const std::shared_ptr& x); RcppExport SEXP _arrow_ArrayData__get_type(SEXP xSEXP) { @@ -1846,17 +1870,6 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// RecordBatch__to_dataframe -List RecordBatch__to_dataframe(const std::shared_ptr& batch); -RcppExport SEXP _arrow_RecordBatch__to_dataframe(SEXP batchSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const std::shared_ptr& >::type batch(batchSEXP); - rcpp_result_gen = Rcpp::wrap(RecordBatch__to_dataframe(batch)); - return rcpp_result_gen; -END_RCPP -} // RecordBatch__from_dataframe std::shared_ptr RecordBatch__from_dataframe(DataFrame tbl); RcppExport SEXP _arrow_RecordBatch__from_dataframe(SEXP tblSEXP) { @@ -2185,17 +2198,6 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// Table__to_dataframe -List Table__to_dataframe(const std::shared_ptr& table); -RcppExport SEXP _arrow_Table__to_dataframe(SEXP tableSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const std::shared_ptr& >::type table(tableSEXP); - rcpp_result_gen = Rcpp::wrap(Table__to_dataframe(table)); - return rcpp_result_gen; -END_RCPP -} // Table__column std::shared_ptr Table__column(const std::shared_ptr& table, int i); RcppExport SEXP _arrow_Table__column(SEXP tableSEXP, SEXP iSEXP) { @@ -2219,11 +2221,29 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// GetCpuThreadPoolCapacity +int GetCpuThreadPoolCapacity(); +RcppExport SEXP _arrow_GetCpuThreadPoolCapacity() { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + rcpp_result_gen = Rcpp::wrap(GetCpuThreadPoolCapacity()); + return rcpp_result_gen; +END_RCPP +} +// SetCpuThreadPoolCapacity +void SetCpuThreadPoolCapacity(int threads); +RcppExport SEXP _arrow_SetCpuThreadPoolCapacity(SEXP threadsSEXP) { +BEGIN_RCPP + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< int >::type threads(threadsSEXP); + SetCpuThreadPoolCapacity(threads); + return R_NilValue; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_arrow_Array__from_vector", (DL_FUNC) &_arrow_Array__from_vector, 1}, - {"_arrow_Array__as_vector", (DL_FUNC) &_arrow_Array__as_vector, 1}, - {"_arrow_ChunkedArray__as_vector", (DL_FUNC) &_arrow_ChunkedArray__as_vector, 1}, {"_arrow_Array__Slice1", 
(DL_FUNC) &_arrow_Array__Slice1, 2}, {"_arrow_Array__Slice2", (DL_FUNC) &_arrow_Array__Slice2, 3}, {"_arrow_Array__IsNull", (DL_FUNC) &_arrow_Array__IsNull, 2}, @@ -2241,6 +2261,10 @@ static const R_CallMethodDef CallEntries[] = { {"_arrow_Array__Mask", (DL_FUNC) &_arrow_Array__Mask, 1}, {"_arrow_DictionaryArray__indices", (DL_FUNC) &_arrow_DictionaryArray__indices, 1}, {"_arrow_DictionaryArray__dictionary", (DL_FUNC) &_arrow_DictionaryArray__dictionary, 1}, + {"_arrow_Array__as_vector", (DL_FUNC) &_arrow_Array__as_vector, 1}, + {"_arrow_ChunkedArray__as_vector", (DL_FUNC) &_arrow_ChunkedArray__as_vector, 1}, + {"_arrow_RecordBatch__to_dataframe", (DL_FUNC) &_arrow_RecordBatch__to_dataframe, 2}, + {"_arrow_Table__to_dataframe", (DL_FUNC) &_arrow_Table__to_dataframe, 2}, {"_arrow_ArrayData__get_type", (DL_FUNC) &_arrow_ArrayData__get_type, 1}, {"_arrow_ArrayData__get_length", (DL_FUNC) &_arrow_ArrayData__get_length, 1}, {"_arrow_ArrayData__get_null_count", (DL_FUNC) &_arrow_ArrayData__get_null_count, 1}, @@ -2386,7 +2410,6 @@ static const R_CallMethodDef CallEntries[] = { {"_arrow_RecordBatch__schema", (DL_FUNC) &_arrow_RecordBatch__schema, 1}, {"_arrow_RecordBatch__columns", (DL_FUNC) &_arrow_RecordBatch__columns, 1}, {"_arrow_RecordBatch__column", (DL_FUNC) &_arrow_RecordBatch__column, 2}, - {"_arrow_RecordBatch__to_dataframe", (DL_FUNC) &_arrow_RecordBatch__to_dataframe, 1}, {"_arrow_RecordBatch__from_dataframe", (DL_FUNC) &_arrow_RecordBatch__from_dataframe, 1}, {"_arrow_RecordBatch__Equals", (DL_FUNC) &_arrow_RecordBatch__Equals, 2}, {"_arrow_RecordBatch__RemoveColumn", (DL_FUNC) &_arrow_RecordBatch__RemoveColumn, 2}, @@ -2416,9 +2439,10 @@ static const R_CallMethodDef CallEntries[] = { {"_arrow_Table__num_columns", (DL_FUNC) &_arrow_Table__num_columns, 1}, {"_arrow_Table__num_rows", (DL_FUNC) &_arrow_Table__num_rows, 1}, {"_arrow_Table__schema", (DL_FUNC) &_arrow_Table__schema, 1}, - {"_arrow_Table__to_dataframe", (DL_FUNC) &_arrow_Table__to_dataframe, 1}, {"_arrow_Table__column", (DL_FUNC) &_arrow_Table__column, 2}, {"_arrow_Table__columns", (DL_FUNC) &_arrow_Table__columns, 1}, + {"_arrow_GetCpuThreadPoolCapacity", (DL_FUNC) &_arrow_GetCpuThreadPoolCapacity, 0}, + {"_arrow_SetCpuThreadPoolCapacity", (DL_FUNC) &_arrow_SetCpuThreadPoolCapacity, 1}, {NULL, NULL, 0} }; diff --git a/r/src/array.cpp b/r/src/array.cpp index 901f2b69bedb4..dd0d7e64a20bf 100644 --- a/r/src/array.cpp +++ b/r/src/array.cpp @@ -33,9 +33,6 @@ inline bool isna(double x) { return ISNA(x); } -// the integer64 sentinel -constexpr int64_t NA_INT64 = std::numeric_limits::min(); - template std::shared_ptr SimpleArray(SEXP x) { Rcpp::Vector vec(x); @@ -503,499 +500,6 @@ std::shared_ptr Array__from_vector(SEXP x) { return nullptr; } -// ---------------------------- Array -> R vector - -namespace arrow { -namespace r { - -template -SEXP ArrayVector_To_Vector(int64_t n, const ArrayVector& arrays, Args... 
args) { - Converter converter(n, std::forward(args)...); - - R_xlen_t k = 0; - for (const auto& array : arrays) { - auto n_chunk = array->length(); - converter.Ingest(array, k, n_chunk); - k += n_chunk; - } - return converter.data; -} - -template -struct Converter_SimpleArray { - using Vector = Rcpp::Vector; - - Converter_SimpleArray(R_xlen_t n) : data(no_init(n)) {} - - void Ingest(const std::shared_ptr& array, R_xlen_t start, R_xlen_t n) { - using value_type = typename Vector::stored_type; - auto null_count = array->null_count(); - - if (n == null_count) { - std::fill_n(data.begin() + start, n, default_value()); - } else { - auto p_values = array->data()->GetValues(1); - STOP_IF_NULL(p_values); - - // first copy all the data - std::copy_n(p_values, n, data.begin() + start); - - if (null_count) { - // then set the sentinel NA - arrow::internal::BitmapReader bitmap_reader(array->null_bitmap()->data(), - array->offset(), n); - - for (size_t i = 0; i < n; i++, bitmap_reader.Next()) { - if (bitmap_reader.IsNotSet()) { - data[i + start] = default_value(); - } - } - } - } - } - - Vector data; -}; - -struct Converter_String { - Converter_String(R_xlen_t n) : data(n) {} - - void Ingest(const std::shared_ptr& array, R_xlen_t start, R_xlen_t n) { - auto null_count = array->null_count(); - - if (null_count == n) { - std::fill_n(data.begin(), n, NA_STRING); - } else { - auto p_offset = array->data()->GetValues(1); - STOP_IF_NULL(p_offset); - auto p_data = array->data()->GetValues(2, *p_offset); - if (!p_data) { - // There is an offset buffer, but the data buffer is null - // There is at least one value in the array and not all the values are null - // That means all values are empty strings so there is nothing to do - return; - } - - if (null_count) { - // need to watch for nulls - arrow::internal::BitmapReader null_reader(array->null_bitmap_data(), - array->offset(), n); - for (int i = 0; i < n; i++, null_reader.Next()) { - if (null_reader.IsSet()) { - auto diff = p_offset[i + 1] - p_offset[i]; - SET_STRING_ELT(data, start + i, Rf_mkCharLenCE(p_data, diff, CE_UTF8)); - p_data += diff; - } else { - SET_STRING_ELT(data, start + i, NA_STRING); - } - } - - } else { - // no need to check for nulls - // TODO: altrep mark this as no na - for (int i = 0; i < n; i++) { - auto diff = p_offset[i + 1] - p_offset[i]; - SET_STRING_ELT(data, start + i, Rf_mkCharLenCE(p_data, diff, CE_UTF8)); - p_data += diff; - } - } - } - } - - CharacterVector data; -}; - -struct Converter_Boolean { - Converter_Boolean(R_xlen_t n) : data(n) {} - - void Ingest(const std::shared_ptr& array, R_xlen_t start, R_xlen_t n) { - auto null_count = array->null_count(); - - if (n == null_count) { - std::fill_n(data.begin() + start, n, NA_LOGICAL); - } else { - // process the data - auto p_data = array->data()->GetValues(1, 0); - STOP_IF_NULL(p_data); - - arrow::internal::BitmapReader data_reader(p_data, array->offset(), n); - for (size_t i = 0; i < n; i++, data_reader.Next()) { - data[start + i] = data_reader.IsSet(); - } - - // then the null bitmap if needed - if (null_count) { - arrow::internal::BitmapReader null_reader(array->null_bitmap()->data(), - array->offset(), n); - for (size_t i = 0; i < n; i++, null_reader.Next()) { - if (null_reader.IsNotSet()) { - data[start + i] = NA_LOGICAL; - } - } - } - } - } - - LogicalVector data; -}; - -template -struct Converter_Dictionary_Int32Indices { - Converter_Dictionary_Int32Indices(R_xlen_t n, const std::shared_ptr& dict, - bool ordered) - : data(no_init(n)) { - data.attr("levels") = 
ArrayVector_To_Vector(dict->length(), {dict}); - if (ordered) { - data.attr("class") = CharacterVector::create("ordered", "factor"); - } else { - data.attr("class") = "factor"; - } - } - - void Ingest(const std::shared_ptr& array, R_xlen_t start, R_xlen_t n) { - DictionaryArray* dict_array = static_cast(array.get()); - using value_type = typename arrow::TypeTraits::ArrayType::value_type; - auto null_count = array->null_count(); - - if (n == null_count) { - std::fill_n(data.begin() + start, n, NA_INTEGER); - } else { - std::shared_ptr indices = dict_array->indices(); - auto p_array = indices->data()->GetValues(1); - STOP_IF_NULL(p_array); - - if (array->null_count()) { - arrow::internal::BitmapReader bitmap_reader(indices->null_bitmap()->data(), - indices->offset(), n); - for (size_t i = 0; i < n; i++, bitmap_reader.Next(), ++p_array) { - data[start + i] = - bitmap_reader.IsNotSet() ? NA_INTEGER : (static_cast(*p_array) + 1); - } - } else { - std::transform( - p_array, p_array + n, data.begin() + start, - [](const value_type value) { return static_cast(value) + 1; }); - } - } - } - - IntegerVector data; -}; - -struct Converter_Date64 { - Converter_Date64(R_xlen_t n) : data(n) { - data.attr("class") = CharacterVector::create("POSIXct", "POSIXt"); - } - - void Ingest(const std::shared_ptr& array, R_xlen_t start, R_xlen_t n) { - auto null_count = array->null_count(); - if (null_count == n) { - std::fill_n(data.begin() + start, n, NA_REAL); - } else { - auto p_values = array->data()->GetValues(1); - STOP_IF_NULL(p_values); - auto p_vec = data.begin() + start; - - // convert DATE64 milliseconds to R seconds (stored as double) - auto seconds = [](int64_t ms) { return static_cast(ms / 1000); }; - - if (null_count) { - arrow::internal::BitmapReader bitmap_reader(array->null_bitmap()->data(), - array->offset(), n); - for (size_t i = 0; i < n; i++, bitmap_reader.Next(), ++p_vec, ++p_values) { - *p_vec = bitmap_reader.IsSet() ? seconds(*p_values) : NA_REAL; - } - } else { - std::transform(p_values, p_values + n, p_vec, seconds); - } - } - } - - NumericVector data; -}; - -template -struct Converter_Promotion { - using r_stored_type = typename Rcpp::Vector::stored_type; - using value_type = typename TypeTraits::ArrayType::value_type; - - Converter_Promotion(R_xlen_t n) : data(no_init(n)) {} - - void Ingest(const std::shared_ptr& array, R_xlen_t start, R_xlen_t n) { - auto null_count = array->null_count(); - if (null_count == n) { - std::fill_n(data.begin() + start, n, default_value()); - } else { - auto p_values = array->data()->GetValues(1); - STOP_IF_NULL(p_values); - - auto value_convert = [](value_type value) { - return static_cast(value); - }; - if (null_count) { - internal::BitmapReader bitmap_reader(array->null_bitmap()->data(), - array->offset(), n); - for (size_t i = 0; i < n; i++, bitmap_reader.Next()) { - data[start + i] = bitmap_reader.IsNotSet() ? 
Rcpp::Vector::get_na() - : value_convert(p_values[i]); - } - } else { - std::transform(p_values, p_values + n, data.begin(), value_convert); - } - } - } - - Rcpp::Vector data; -}; - -template -struct Converter_Time { - Converter_Time(int64_t n, int32_t multiplier, CharacterVector classes) - : data(no_init(n)), multiplier_(multiplier) { - data.attr("class") = classes; - } - - Converter_Time(int64_t n, int32_t multiplier) - : data(no_init(n)), multiplier_(multiplier) { - data.attr("class") = CharacterVector::create("hms", "difftime"); - data.attr("units") = "secs"; - } - - void Ingest(const std::shared_ptr& array, R_xlen_t start, R_xlen_t n) { - auto null_count = array->null_count(); - if (n == null_count) { - std::fill_n(data.begin() + start, n, NA_REAL); - } else { - auto p_values = array->data()->GetValues(1); - STOP_IF_NULL(p_values); - auto p_vec = data.begin() + start; - auto convert = [this](value_type value) { - return static_cast(value) / multiplier_; - }; - if (null_count) { - arrow::internal::BitmapReader bitmap_reader(array->null_bitmap()->data(), - array->offset(), n); - for (size_t i = 0; i < n; i++, bitmap_reader.Next(), ++p_vec, ++p_values) { - *p_vec = bitmap_reader.IsSet() ? convert(*p_values) : NA_REAL; - } - } else { - std::transform(p_values, p_values + n, p_vec, convert); - } - } - } - - NumericVector data; - int32_t multiplier_; -}; - -template -struct Converter_TimeStamp : Converter_Time { - Converter_TimeStamp(int64_t n, int32_t multiplier) - : Converter_Time(n, multiplier, - CharacterVector::create("POSIXct", "POSIXt")) {} -}; - -struct Converter_Int64 { - Converter_Int64(R_xlen_t n) : data(no_init(n)) { data.attr("class") = "integer64"; } - - void Ingest(const std::shared_ptr& array, R_xlen_t start, R_xlen_t n) { - auto null_count = array->null_count(); - if (null_count == n) { - std::fill_n(reinterpret_cast(data.begin()) + start, n, NA_INT64); - } else { - auto p_values = array->data()->GetValues(1); - STOP_IF_NULL(p_values); - auto p_vec = reinterpret_cast(data.begin()) + start; - - if (array->null_count()) { - internal::BitmapReader bitmap_reader(array->null_bitmap()->data(), - array->offset(), n); - for (size_t i = 0; i < n; i++, bitmap_reader.Next()) { - p_vec[i] = bitmap_reader.IsNotSet() ? 
NA_INT64 : p_values[i]; - } - } else { - std::copy_n(p_values, n, p_vec); - } - } - } - - NumericVector data; -}; - -SEXP DictionaryArrays_to_Vector(int64_t n, const ArrayVector& arrays) { - DictionaryArray* dict_array = static_cast(arrays[0].get()); - auto dict = dict_array->dictionary(); - auto indices = dict_array->indices(); - - if (dict->type_id() != Type::STRING) { - stop("Cannot convert Dictionary Array of type `%s` to R", - dict_array->type()->ToString()); - } - bool ordered = dict_array->dict_type()->ordered(); - switch (indices->type_id()) { - case Type::UINT8: - return ArrayVector_To_Vector>( - n, arrays, dict, ordered); - - case Type::INT8: - return ArrayVector_To_Vector>( - n, arrays, dict, ordered); - - case Type::UINT16: - return ArrayVector_To_Vector>( - n, arrays, dict, ordered); - - case Type::INT16: - return ArrayVector_To_Vector>( - n, arrays, dict, ordered); - - case Type::INT32: - return ArrayVector_To_Vector>( - n, arrays, dict, ordered); - - default: - stop("Cannot convert Dictionary Array of type `%s` to R", - dict_array->type()->ToString()); - } - return R_NilValue; -} - -SEXP Date32ArrayVector_to_Vector(int64_t n, const ArrayVector& arrays) { - IntegerVector out( - arrow::r::ArrayVector_To_Vector>(n, arrays)); - out.attr("class") = "Date"; - return out; -} - -struct Converter_Decimal { - Converter_Decimal(R_xlen_t n) : data(no_init(n)) {} - - void Ingest(const std::shared_ptr& array, R_xlen_t start, R_xlen_t n) { - auto null_count = array->null_count(); - if (n == null_count) { - std::fill_n(data.begin() + start, n, NA_REAL); - } else { - auto p_vec = reinterpret_cast(data.begin()) + start; - const auto& decimals_arr = - internal::checked_cast(*array); - - if (array->null_count()) { - internal::BitmapReader bitmap_reader(array->null_bitmap()->data(), - array->offset(), n); - - for (size_t i = 0; i < n; i++, bitmap_reader.Next()) { - p_vec[i] = bitmap_reader.IsNotSet() - ? 
NA_REAL - : std::stod(decimals_arr.FormatValue(i).c_str()); - } - } else { - for (size_t i = 0; i < n; i++) { - p_vec[i] = std::stod(decimals_arr.FormatValue(i).c_str()); - } - } - } - } - - NumericVector data; -}; - -} // namespace r -} // namespace arrow - -SEXP ArrayVector__as_vector(int64_t n, const ArrayVector& arrays) { - using namespace arrow::r; - - switch (arrays[0]->type_id()) { - // direct support - case Type::INT8: - return ArrayVector_To_Vector>(n, arrays); - case Type::INT32: - return ArrayVector_To_Vector>(n, arrays); - case Type::DOUBLE: - return ArrayVector_To_Vector>(n, arrays); - - // need to handle 1-bit case - case Type::BOOL: - return ArrayVector_To_Vector(n, arrays); - - // handle memory dense strings - case Type::STRING: - return ArrayVector_To_Vector(n, arrays); - case Type::DICTIONARY: - return DictionaryArrays_to_Vector(n, arrays); - - case Type::DATE32: - return Date32ArrayVector_to_Vector(n, arrays); - case Type::DATE64: - return ArrayVector_To_Vector(n, arrays); - - // promotions to integer vector - case Type::UINT8: - return ArrayVector_To_Vector>(n, - arrays); - case Type::INT16: - return ArrayVector_To_Vector>(n, - arrays); - case Type::UINT16: - return ArrayVector_To_Vector>( - n, arrays); - - // promotions to numeric vector - case Type::UINT32: - return ArrayVector_To_Vector>( - n, arrays); - case Type::HALF_FLOAT: - return ArrayVector_To_Vector>( - n, arrays); - case Type::FLOAT: - return ArrayVector_To_Vector>( - n, arrays); - - // time32 ane time64 - case Type::TIME32: - return ArrayVector_To_Vector>( - n, arrays, - static_cast(arrays[0]->type().get())->unit() == TimeUnit::SECOND - ? 1 - : 1000); - - case Type::TIME64: - return ArrayVector_To_Vector>( - n, arrays, - static_cast(arrays[0]->type().get())->unit() == TimeUnit::MICRO - ? 1000000 - : 1000000000); - - case Type::TIMESTAMP: - return ArrayVector_To_Vector>( - n, arrays, - static_cast(arrays[0]->type().get())->unit() == TimeUnit::MICRO - ? 1000000 - : 1000000000); - - case Type::INT64: - return ArrayVector_To_Vector(n, arrays); - case Type::DECIMAL: - return ArrayVector_To_Vector(n, arrays); - - default: - break; - } - - stop(tfm::format("cannot handle Array of type %s", arrays[0]->type()->name())); - return R_NilValue; -} - -// [[Rcpp::export]] -SEXP Array__as_vector(const std::shared_ptr& array) { - return ArrayVector__as_vector(array->length(), {array}); -} - -// [[Rcpp::export]] -SEXP ChunkedArray__as_vector(const std::shared_ptr& chunked_array) { - return ArrayVector__as_vector(chunked_array->length(), chunked_array->chunks()); -} - // [[Rcpp::export]] std::shared_ptr Array__Slice1(const std::shared_ptr& array, int offset) { diff --git a/r/src/array__to_vector.cpp b/r/src/array__to_vector.cpp new file mode 100644 index 0000000000000..c531933c04d52 --- /dev/null +++ b/r/src/array__to_vector.cpp @@ -0,0 +1,697 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include "arrow_types.h" + +using namespace Rcpp; +using namespace arrow; + +namespace arrow { +namespace r { + +class Converter { + public: + Converter(const ArrayVector& arrays) : arrays_(arrays) {} + + virtual ~Converter() {} + + // Allocate a vector of the right R type for this converter + virtual SEXP Allocate(R_xlen_t n) const = 0; + + // data[ start:(start + n) ] = NA + virtual Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const = 0; + + // ingest the values from the array into data[ start : (start + n)] + virtual Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, + R_xlen_t start, R_xlen_t n) const = 0; + + // ingest one array + Status IngestOne(SEXP data, const std::shared_ptr& array, R_xlen_t start, + R_xlen_t n) const { + if (array->null_count() == n) { + return Ingest_all_nulls(data, start, n); + } else { + return Ingest_some_nulls(data, array, start, n); + } + } + + // can this run in parallel ? + virtual bool Parallel() const { return true; } + + // Ingest all the arrays serially + Status IngestSerial(SEXP data) { + R_xlen_t k = 0; + for (const auto& array : arrays_) { + auto n_chunk = array->length(); + RETURN_NOT_OK(IngestOne(data, array, k, n_chunk)); + k += n_chunk; + } + return Status::OK(); + } + + // ingest the arrays in parallel + // + // for each array, add a task to the task group + // + // The task group is Finish() iun the caller + void IngestParallel(SEXP data, const std::shared_ptr& tg) { + R_xlen_t k = 0; + for (const auto& array : arrays_) { + auto n_chunk = array->length(); + tg->Append([=] { return IngestOne(data, array, k, n_chunk); }); + k += n_chunk; + } + } + + // Converter factory + static std::shared_ptr Make(const ArrayVector& arrays); + + protected: + const ArrayVector& arrays_; +}; + +// data[start:(start+n)] = NA +template +Status AllNull_Ingest(SEXP data, R_xlen_t start, R_xlen_t n) { + auto p_data = Rcpp::internal::r_vector_start(data) + start; + std::fill_n(p_data, n, default_value()); + return Status::OK(); +} + +// ingest the data from `array` into a slice of `data` +// +// each element goes through `lambda` when some conversion is needed +template +Status SomeNull_Ingest(SEXP data, R_xlen_t start, R_xlen_t n, + const array_value_type* p_values, + const std::shared_ptr& array, Lambda lambda) { + if (!p_values) { + return Status::Invalid("Invalid data buffer"); + } + auto p_data = Rcpp::internal::r_vector_start(data) + start; + + if (array->null_count()) { + arrow::internal::BitmapReader bitmap_reader(array->null_bitmap()->data(), + array->offset(), n); + for (size_t i = 0; i < n; i++, bitmap_reader.Next(), ++p_data, ++p_values) { + *p_data = bitmap_reader.IsSet() ? 
lambda(*p_values) : default_value(); + } + } else { + std::transform(p_values, p_values + n, p_data, lambda); + } + + return Status::OK(); +} + +// Allocate + Ingest +SEXP ArrayVector__as_vector(R_xlen_t n, const ArrayVector& arrays) { + auto converter = Converter::Make(arrays); + Shield data(converter->Allocate(n)); + STOP_IF_NOT_OK(converter->IngestSerial(data)); + return data; +} + +template +class Converter_SimpleArray : public Converter { + using Vector = Rcpp::Vector; + using value_type = typename Vector::stored_type; + + public: + Converter_SimpleArray(const ArrayVector& arrays) : Converter(arrays) {} + + SEXP Allocate(R_xlen_t n) const { return Vector(no_init(n)); } + + Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const { + return AllNull_Ingest(data, start, n); + } + + Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, + R_xlen_t start, R_xlen_t n) const { + auto p_values = array->data()->GetValues(1); + auto echo = [](value_type value) { return value; }; + return SomeNull_Ingest(data, start, n, p_values, array, echo); + } +}; + +class Converter_Date32 : public Converter_SimpleArray { + public: + Converter_Date32(const ArrayVector& arrays) : Converter_SimpleArray(arrays) {} + + SEXP Allocate(R_xlen_t n) const { + IntegerVector data(no_init(n)); + data.attr("class") = "Date"; + return data; + } +}; + +struct Converter_String : public Converter { + public: + Converter_String(const ArrayVector& arrays) : Converter(arrays) {} + + SEXP Allocate(R_xlen_t n) const { return StringVector_(no_init(n)); } + + Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const { + return AllNull_Ingest(data, start, n); + } + + Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, + R_xlen_t start, R_xlen_t n) const { + auto p_offset = array->data()->GetValues(1); + if (!p_offset) { + return Status::Invalid("Invalid offset buffer"); + } + auto p_strings = array->data()->GetValues(2, *p_offset); + if (!p_strings) { + // There is an offset buffer, but the data buffer is null + // There is at least one value in the array and not all the values are null + // That means all values are either empty strings or nulls so there is nothing to do + + if (array->null_count()) { + arrow::internal::BitmapReader null_reader(array->null_bitmap_data(), + array->offset(), n); + for (int i = 0; i < n; i++, null_reader.Next()) { + if (null_reader.IsNotSet()) { + SET_STRING_ELT(data, start + i, NA_STRING); + } + } + } + return Status::OK(); + } + + arrow::StringArray* string_array = static_cast(array.get()); + if (array->null_count()) { + // need to watch for nulls + arrow::internal::BitmapReader null_reader(array->null_bitmap_data(), + array->offset(), n); + for (int i = 0; i < n; i++, null_reader.Next()) { + if (null_reader.IsSet()) { + SET_STRING_ELT(data, start + i, r_string(string_array->GetString(i))); + } else { + SET_STRING_ELT(data, start + i, NA_STRING); + } + } + + } else { + for (int i = 0; i < n; i++) { + SET_STRING_ELT(data, start + i, r_string(string_array->GetString(i))); + } + } + + return Status::OK(); + } + + bool Parallel() const { return false; } + + inline SEXP r_string(const arrow::util::string_view& view) const { + return Rf_mkCharLenCE(view.data(), view.size(), CE_UTF8); + } +}; + +class Converter_Boolean : public Converter { + public: + Converter_Boolean(const ArrayVector& arrays) : Converter(arrays) {} + + SEXP Allocate(R_xlen_t n) const { return LogicalVector_(no_init(n)); } + + Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) 
const { + return AllNull_Ingest(data, start, n); + } + + Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, + R_xlen_t start, R_xlen_t n) const { + auto p_data = Rcpp::internal::r_vector_start(data) + start; + auto p_bools = array->data()->GetValues(1, 0); + if (!p_bools) { + return Status::Invalid("Invalid data buffer"); + } + + arrow::internal::BitmapReader data_reader(p_bools, array->offset(), n); + if (array->null_count()) { + arrow::internal::BitmapReader null_reader(array->null_bitmap()->data(), + array->offset(), n); + + for (size_t i = 0; i < n; i++, data_reader.Next(), null_reader.Next(), ++p_data) { + *p_data = null_reader.IsSet() ? data_reader.IsSet() : NA_LOGICAL; + } + } else { + for (size_t i = 0; i < n; i++, data_reader.Next(), ++p_data) { + *p_data = data_reader.IsSet(); + } + } + + return Status::OK(); + } +}; + +class Converter_Dictionary : public Converter { + public: + Converter_Dictionary(const ArrayVector& arrays) : Converter(arrays) {} + + SEXP Allocate(R_xlen_t n) const { + IntegerVector data(no_init(n)); + auto dict_array = static_cast(Converter::arrays_[0].get()); + auto dict = dict_array->dictionary(); + auto indices = dict_array->indices(); + switch (indices->type_id()) { + case Type::UINT8: + case Type::INT8: + case Type::UINT16: + case Type::INT16: + case Type::INT32: + break; + default: + stop("Cannot convert Dictionary Array of type `%s` to R", + dict_array->type()->ToString()); + } + + if (dict->type_id() != Type::STRING) { + stop("Cannot convert Dictionary Array of type `%s` to R", + dict_array->type()->ToString()); + } + bool ordered = dict_array->dict_type()->ordered(); + + data.attr("levels") = ArrayVector__as_vector(dict->length(), {dict}); + if (ordered) { + data.attr("class") = CharacterVector::create("ordered", "factor"); + } else { + data.attr("class") = "factor"; + } + return data; + } + + Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const { + return AllNull_Ingest(data, start, n); + } + + Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, + R_xlen_t start, R_xlen_t n) const { + DictionaryArray* dict_array = static_cast(array.get()); + auto indices = dict_array->indices(); + switch (indices->type_id()) { + case Type::UINT8: + return Ingest_some_nulls_Impl(data, array, start, n); + case Type::INT8: + return Ingest_some_nulls_Impl(data, array, start, n); + case Type::UINT16: + return Ingest_some_nulls_Impl(data, array, start, n); + case Type::INT16: + return Ingest_some_nulls_Impl(data, array, start, n); + case Type::INT32: + return Ingest_some_nulls_Impl(data, array, start, n); + default: + break; + } + return Status::OK(); + } + + private: + template + Status Ingest_some_nulls_Impl(SEXP data, const std::shared_ptr& array, + R_xlen_t start, R_xlen_t n) const { + using value_type = typename arrow::TypeTraits::ArrayType::value_type; + + std::shared_ptr indices = + static_cast(array.get())->indices(); + + // convert the 0-based indices from the arrow Array + // to 1-based indices used in R factors + auto to_r_index = [](value_type value) { return static_cast(value) + 1; }; + + return SomeNull_Ingest( + data, start, n, indices->data()->GetValues(1), indices, to_r_index); + } +}; + +double ms_to_seconds(int64_t ms) { return static_cast(ms / 1000); } + +class Converter_Date64 : public Converter { + public: + Converter_Date64(const ArrayVector& arrays) : Converter(arrays) {} + + SEXP Allocate(R_xlen_t n) const { + NumericVector data(no_init(n)); + data.attr("class") = CharacterVector::create("POSIXct", 
"POSIXt"); + return data; + } + + Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const { + return AllNull_Ingest(data, start, n); + } + + Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, + R_xlen_t start, R_xlen_t n) const { + auto convert = [](int64_t ms) { return static_cast(ms / 1000); }; + return SomeNull_Ingest( + data, start, n, array->data()->GetValues(1), array, convert); + } +}; + +template +class Converter_Promotion : public Converter { + using r_stored_type = typename Rcpp::Vector::stored_type; + using value_type = typename TypeTraits::ArrayType::value_type; + + public: + Converter_Promotion(const ArrayVector& arrays) : Converter(arrays) {} + + SEXP Allocate(R_xlen_t n) const { + return Rcpp::Vector(no_init(n)); + } + + Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const { + return AllNull_Ingest(data, start, n); + } + + Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, + R_xlen_t start, R_xlen_t n) const { + auto convert = [](value_type value) { return static_cast(value); }; + return SomeNull_Ingest( + data, start, n, array->data()->GetValues(1), array, convert); + } + + private: + static r_stored_type value_convert(value_type value) { + return static_cast(value); + } +}; + +template +class Converter_Time : public Converter { + public: + Converter_Time(const ArrayVector& arrays) : Converter(arrays) {} + + SEXP Allocate(R_xlen_t n) const { + NumericVector data(no_init(n)); + data.attr("class") = CharacterVector::create("hms", "difftime"); + data.attr("units") = CharacterVector::create("secs"); + return data; + } + + Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const { + return AllNull_Ingest(data, start, n); + } + + Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, + R_xlen_t start, R_xlen_t n) const { + int multiplier = TimeUnit_multiplier(array); + auto convert = [=](value_type value) { + return static_cast(value) / multiplier; + }; + return SomeNull_Ingest( + data, start, n, array->data()->GetValues(1), array, convert); + } + + private: + int TimeUnit_multiplier(const std::shared_ptr& array) const { + switch (static_cast(array->type().get())->unit()) { + case TimeUnit::SECOND: + return 1; + case TimeUnit::MILLI: + return 1000; + case TimeUnit::MICRO: + return 1000000; + case TimeUnit::NANO: + return 1000000000; + } + } +}; + +template +class Converter_Timestamp : public Converter_Time { + public: + Converter_Timestamp(const ArrayVector& arrays) : Converter_Time(arrays) {} + + SEXP Allocate(R_xlen_t n) const { + NumericVector data(no_init(n)); + data.attr("class") = CharacterVector::create("POSIXct", "POSIXt"); + return data; + } +}; + +class Converter_Decimal : public Converter { + public: + Converter_Decimal(const ArrayVector& arrays) : Converter(arrays) {} + + SEXP Allocate(R_xlen_t n) const { return NumericVector_(no_init(n)); } + + Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const { + return AllNull_Ingest(data, start, n); + } + + Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, + R_xlen_t start, R_xlen_t n) const { + auto p_data = Rcpp::internal::r_vector_start(data) + start; + const auto& decimals_arr = + internal::checked_cast(*array); + + internal::BitmapReader bitmap_reader(array->null_bitmap()->data(), array->offset(), + n); + + for (size_t i = 0; i < n; i++, bitmap_reader.Next(), ++p_data) { + *p_data = bitmap_reader.IsSet() ? 
std::stod(decimals_arr.FormatValue(i).c_str()) + : NA_REAL; + } + + return Status::OK(); + } +}; + +class Converter_Int64 : public Converter { + public: + Converter_Int64(const ArrayVector& arrays) : Converter(arrays) {} + + SEXP Allocate(R_xlen_t n) const { + NumericVector data(no_init(n)); + data.attr("class") = "integer64"; + return data; + } + + Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const { + auto p_data = reinterpret_cast(REAL(data)) + start; + std::fill_n(p_data, n, NA_INT64); + return Status::OK(); + } + + Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, + R_xlen_t start, R_xlen_t n) const { + auto p_values = array->data()->GetValues(1); + if (!p_values) { + return Status::Invalid("Invalid data buffer"); + } + + auto p_data = reinterpret_cast(REAL(data)) + start; + + if (array->null_count()) { + internal::BitmapReader bitmap_reader(array->null_bitmap()->data(), array->offset(), + n); + for (size_t i = 0; i < n; i++, bitmap_reader.Next(), ++p_data) { + *p_data = bitmap_reader.IsSet() ? p_values[i] : NA_INT64; + } + } else { + std::copy_n(p_values, n, p_data); + } + + return Status::OK(); + } +}; + +std::shared_ptr Converter::Make(const ArrayVector& arrays) { + using namespace arrow::r; + + switch (arrays[0]->type_id()) { + // direct support + case Type::INT8: + return std::make_shared>(arrays); + + case Type::INT32: + return std::make_shared>(arrays); + + case Type::DOUBLE: + return std::make_shared>(arrays); + + // need to handle 1-bit case + case Type::BOOL: + return std::make_shared(arrays); + + // handle memory dense strings + case Type::STRING: + return std::make_shared(arrays); + + case Type::DICTIONARY: + return std::make_shared(arrays); + + case Type::DATE32: + return std::make_shared(arrays); + + case Type::DATE64: + return std::make_shared(arrays); + + // promotions to integer vector + case Type::UINT8: + return std::make_shared>(arrays); + + case Type::INT16: + return std::make_shared>(arrays); + + case Type::UINT16: + return std::make_shared>(arrays); + + // promotions to numeric vector + case Type::UINT32: + return std::make_shared>(arrays); + + case Type::HALF_FLOAT: + return std::make_shared>(arrays); + + case Type::FLOAT: + return std::make_shared>(arrays); + + // time32 ane time64 + case Type::TIME32: + return std::make_shared>(arrays); + + case Type::TIME64: + return std::make_shared>(arrays); + + case Type::TIMESTAMP: + return std::make_shared>(arrays); + + case Type::INT64: + return std::make_shared(arrays); + + case Type::DECIMAL: + return std::make_shared(arrays); + + default: + break; + } + + stop(tfm::format("cannot handle Array of type %s", arrays[0]->type()->name())); + return nullptr; +} + +List to_dataframe_serial(int64_t nr, int64_t nc, const CharacterVector& names, + const std::vector>& converters) { + List tbl(nc); + + for (int i = 0; i < nc; i++) { + SEXP column = tbl[i] = converters[i]->Allocate(nr); + STOP_IF_NOT_OK(converters[i]->IngestSerial(column)); + } + tbl.attr("names") = names; + tbl.attr("class") = CharacterVector::create("tbl_df", "tbl", "data.frame"); + tbl.attr("row.names") = IntegerVector::create(NA_INTEGER, -nr); + return tbl; +} + +List to_dataframe_parallel(int64_t nr, int64_t nc, const CharacterVector& names, + const std::vector>& converters) { + List tbl(nc); + + // task group to ingest data in parallel + auto tg = arrow::internal::TaskGroup::MakeThreaded(arrow::internal::GetCpuThreadPool()); + + // allocate and start ingesting immediately the columns that + // can be ingested in parallel, 
i.e. when ingestion no longer + // need to happen on the main thread + for (int i = 0; i < nc; i++) { + // allocate data for column i + SEXP column = tbl[i] = converters[i]->Allocate(nr); + + // add a task to ingest data of that column if that can be done in parallel + if (converters[i]->Parallel()) { + converters[i]->IngestParallel(column, tg); + } + } + + arrow::Status status = arrow::Status::OK(); + + // ingest the columns that cannot be dealt with in parallel + for (int i = 0; i < nc; i++) { + if (!converters[i]->Parallel()) { + status &= converters[i]->IngestSerial(tbl[i]); + } + } + + // wait for the ingestion to be finished + status &= tg->Finish(); + + STOP_IF_NOT_OK(status); + + tbl.attr("names") = names; + tbl.attr("class") = CharacterVector::create("tbl_df", "tbl", "data.frame"); + tbl.attr("row.names") = IntegerVector::create(NA_INTEGER, -nr); + + return tbl; +} + +} // namespace r +} // namespace arrow + +// [[Rcpp::export]] +SEXP Array__as_vector(const std::shared_ptr& array) { + return arrow::r::ArrayVector__as_vector(array->length(), {array}); +} + +// [[Rcpp::export]] +SEXP ChunkedArray__as_vector(const std::shared_ptr& chunked_array) { + return arrow::r::ArrayVector__as_vector(chunked_array->length(), + chunked_array->chunks()); +} + +// [[Rcpp::export]] +List RecordBatch__to_dataframe(const std::shared_ptr& batch, + bool use_threads) { + int64_t nc = batch->num_columns(); + int64_t nr = batch->num_rows(); + CharacterVector names(nc); + std::vector arrays(nc); + std::vector> converters(nc); + + for (int64_t i = 0; i < nc; i++) { + names[i] = batch->column_name(i); + arrays[i] = {batch->column(i)}; + converters[i] = arrow::r::Converter::Make(arrays[i]); + } + + if (use_threads) { + return arrow::r::to_dataframe_parallel(nr, nc, names, converters); + } else { + return arrow::r::to_dataframe_serial(nr, nc, names, converters); + } +} + +// [[Rcpp::export]] +List Table__to_dataframe(const std::shared_ptr& table, bool use_threads) { + int64_t nc = table->num_columns(); + int64_t nr = table->num_rows(); + CharacterVector names(nc); + std::vector> converters(nc); + + for (int64_t i = 0; i < nc; i++) { + converters[i] = arrow::r::Converter::Make(table->column(i)->data()->chunks()); + names[i] = table->column(i)->name(); + } + + if (use_threads) { + return arrow::r::to_dataframe_parallel(nr, nc, names, converters); + } else { + return arrow::r::to_dataframe_serial(nr, nc, names, converters); + } +} diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index 6fef7997dbfa7..a657731a51ae4 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -52,6 +52,8 @@ namespace r { struct symbols { static SEXP units; static SEXP xp; + static SEXP dot_Internal; + static SEXP inspect; }; } // namespace r } // namespace arrow @@ -148,6 +150,7 @@ inline SEXP wrap_dispatch(const T& x, Rcpp::traits::wrap_type_unique_ptr_tag) { } // namespace Rcpp namespace Rcpp { +using NumericVector_ = Rcpp::Vector; using IntegerVector_ = Rcpp::Vector; using LogicalVector_ = Rcpp::Vector; using StringVector_ = Rcpp::Vector; @@ -156,11 +159,11 @@ using RawVector_ = Rcpp::Vector; using List_ = Rcpp::Vector; template -inline typename Rcpp::Vector::stored_type default_value() { +inline constexpr typename Rcpp::Vector::stored_type default_value() { return Rcpp::Vector::get_na(); } template <> -inline Rbyte default_value() { +inline constexpr Rbyte default_value() { return 0; } @@ -174,6 +177,11 @@ std::shared_ptr RecordBatch__from_dataframe(Rcpp::DataFrame namespace arrow { namespace r { +void inspect(SEXP obj); + 
+// the integer64 sentinel +constexpr int64_t NA_INT64 = std::numeric_limits::min(); + template > class RBuffer : public MutableBuffer { public: diff --git a/r/src/recordbatch.cpp b/r/src/recordbatch.cpp index b6bee7ae53927..b776d2ae5753e 100644 --- a/r/src/recordbatch.cpp +++ b/r/src/recordbatch.cpp @@ -57,22 +57,6 @@ std::shared_ptr RecordBatch__column( return batch->column(i); } -// [[Rcpp::export]] -List RecordBatch__to_dataframe(const std::shared_ptr& batch) { - int nc = batch->num_columns(); - int nr = batch->num_rows(); - List tbl(nc); - CharacterVector names(nc); - for (int i = 0; i < nc; i++) { - tbl[i] = Array__as_vector(batch->column(i)); - names[i] = batch->column_name(i); - } - tbl.attr("names") = names; - tbl.attr("class") = CharacterVector::create("tbl_df", "tbl", "data.frame"); - tbl.attr("row.names") = IntegerVector::create(NA_INTEGER, -nr); - return tbl; -} - // [[Rcpp::export]] std::shared_ptr RecordBatch__from_dataframe(DataFrame tbl) { CharacterVector names = tbl.names(); diff --git a/r/src/symbols.cpp b/r/src/symbols.cpp index e60bcce631f37..5b4e44e8bfc5f 100644 --- a/r/src/symbols.cpp +++ b/r/src/symbols.cpp @@ -21,5 +21,14 @@ namespace arrow { namespace r { SEXP symbols::units = Rf_install("units"); SEXP symbols::xp = Rf_install(".:xp:."); +SEXP symbols::dot_Internal = Rf_install(".Internal"); +SEXP symbols::inspect = Rf_install("inspect"); + +void inspect(SEXP obj) { + Rcpp::Shield call_inspect(Rf_lang2(symbols::inspect, obj)); + Rcpp::Shield call_internal(Rf_lang2(symbols::dot_Internal, call_inspect)); + Rf_eval(call_internal, R_GlobalEnv); +} + } // namespace r } // namespace arrow diff --git a/r/src/table.cpp b/r/src/table.cpp index f4ebd0466b918..fcf2a0347689b 100644 --- a/r/src/table.cpp +++ b/r/src/table.cpp @@ -45,23 +45,6 @@ std::shared_ptr Table__schema(const std::shared_ptr return x->schema(); } -// [[Rcpp::export]] -List Table__to_dataframe(const std::shared_ptr& table) { - int nc = table->num_columns(); - int nr = table->num_rows(); - List tbl(nc); - CharacterVector names(nc); - for (int i = 0; i < nc; i++) { - auto column = table->column(i); - tbl[i] = ChunkedArray__as_vector(column->data()); - names[i] = column->name(); - } - tbl.attr("names") = names; - tbl.attr("class") = CharacterVector::create("tbl_df", "tbl", "data.frame"); - tbl.attr("row.names") = IntegerVector::create(NA_INTEGER, -nr); - return tbl; -} - // [[Rcpp::export]] std::shared_ptr Table__column(const std::shared_ptr& table, int i) { diff --git a/r/src/threadpool.cpp b/r/src/threadpool.cpp new file mode 100644 index 0000000000000..1ce0451ac2b55 --- /dev/null +++ b/r/src/threadpool.cpp @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include "arrow_types.h" + +//' Get the capacity of the global thread pool +//' +//' @return the number of worker threads in the thread pool to which +//' Arrow dispatches various CPU-bound tasks. This is an ideal number, +//' not necessarily the exact number of threads at a given point in time. +//' +//' You can change this number using [SetCpuThreadPoolCapacity()]. +//' +//' @export +// [[Rcpp::export]] +int GetCpuThreadPoolCapacity() { return arrow::GetCpuThreadPoolCapacity(); } + +//' Set the capacity of the global thread pool +//' +//' @param threads the number of worker threads int the thread pool to which +//' Arrow dispatches various CPU-bound tasks. +//' +//' The current number is returned by [GetCpuThreadPoolCapacity()] +//' +//' @export +// [[Rcpp::export]] +void SetCpuThreadPoolCapacity(int threads) { + STOP_IF_NOT_OK(arrow::SetCpuThreadPoolCapacity(threads)); +} diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index f40bd8387ad74..29f90946da6e7 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -69,7 +69,6 @@ test_that("RecordBatch", { expect_equal(col_fct$as_vector(), tbl$fct) expect_equal(col_fct$type, dictionary(int32(), array(letters[1:10]))) - batch2 <- batch$RemoveColumn(0) expect_equal( batch2$schema, diff --git a/r/tests/testthat/test-cputhreadpoolcapacity.R b/r/tests/testthat/test-cputhreadpoolcapacity.R new file mode 100644 index 0000000000000..de23f151a3524 --- /dev/null +++ b/r/tests/testthat/test-cputhreadpoolcapacity.R @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +context("CpuThreadPoolCapacity") + +test_that("can set/get cpu thread pool capacity", { + old <- GetCpuThreadPoolCapacity() + SetCpuThreadPoolCapacity(19L) + expect_equal(GetCpuThreadPoolCapacity(), 19L) + SetCpuThreadPoolCapacity(old) + expect_equal(GetCpuThreadPoolCapacity(), old) +}) From 2b361fb2e5b4321a6cdcbdbf457181702fd97eaa Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 9 Jan 2019 22:07:14 -0600 Subject: [PATCH 196/328] ARROW-3428: [Python] Fix from_pandas conversion from float to bool When `from_pandas` converts data to boolean, the values are read into a `uint8_t` and then checked. When the values are floating point numbers, not all bits are checked which can cause incorrect results. 
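A minimal sketch of the affected conversion, reusing the values from the new `test_float_nulls_to_boolean` case added in this patch:

```
import pandas as pd
import pyarrow as pa

# Float input with a null, converted to an Arrow boolean array
s = pd.Series([0.0, 1.0, 2.0, None, -3.0])
arr = pa.Array.from_pandas(s, type=pa.bool_())
# With this change, any non-zero float maps to True and NaN stays null:
# [False, True, True, None, True]
```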
Author: Bryan Cutler Closes #2698 from BryanCutler/python-from_pandas-float-to-bool-ARROW-3428 and squashes the following commits: f3d472626 added test with fix that passes, but fails other tests --- cpp/src/arrow/compute/kernels/cast-test.cc | 19 ++++++ cpp/src/arrow/python/numpy_to_arrow.cc | 66 +++++++++------------ cpp/src/arrow/python/type_traits.h | 1 + python/pyarrow/tests/test_convert_pandas.py | 39 +++++++++--- 4 files changed, 81 insertions(+), 44 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/cast-test.cc b/cpp/src/arrow/compute/kernels/cast-test.cc index 781e0af87a825..c3a0df5d8a73f 100644 --- a/cpp/src/arrow/compute/kernels/cast-test.cc +++ b/cpp/src/arrow/compute/kernels/cast-test.cc @@ -138,6 +138,25 @@ TEST_F(TestCast, SameTypeZeroCopy) { AssertBufferSame(*arr, *result, 1); } +TEST_F(TestCast, FromBoolean) { + CastOptions options; + + vector is_valid(20, true); + is_valid[3] = false; + + vector v1(is_valid.size(), true); + vector e1(is_valid.size(), 1); + for (size_t i = 0; i < v1.size(); ++i) { + if (i % 3 == 1) { + v1[i] = false; + e1[i] = 0; + } + } + + CheckCase(boolean(), v1, is_valid, int32(), e1, + options); +} + TEST_F(TestCast, ToBoolean) { CastOptions options; for (auto type : kNumericTypes) { diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index aa28b6e870834..aada6bf598ca0 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -63,6 +63,7 @@ namespace arrow { using internal::checked_cast; using internal::CopyBitmap; +using internal::GenerateBitsUnrolled; namespace py { @@ -246,6 +247,11 @@ class NumPyConverter { return Status::OK(); } + // Called before ConvertData to ensure Numpy input buffer is in expected + // Arrow layout + template + Status PrepareInputData(std::shared_ptr* data); + // ---------------------------------------------------------------------- // Traditional visitor conversion for non-object arrays @@ -407,14 +413,32 @@ Status CopyStridedArray(PyArrayObject* arr, const int64_t length, MemoryPool* po } // namespace template -inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { +inline Status NumPyConverter::PrepareInputData(std::shared_ptr* data) { if (is_strided()) { RETURN_NOT_OK(CopyStridedArray(arr_, length_, pool_, data)); + } else if (dtype_->type_num == NPY_BOOL) { + int64_t nbytes = BitUtil::BytesForBits(length_); + std::shared_ptr buffer; + RETURN_NOT_OK(AllocateBuffer(pool_, nbytes, &buffer)); + + Ndarray1DIndexer values(arr_); + int64_t i = 0; + const auto generate = [&values, &i]() -> bool { return values[i++] > 0; }; + GenerateBitsUnrolled(buffer->mutable_data(), 0, length_, generate); + + *data = buffer; } else { // Can zero-copy *data = std::make_shared(reinterpret_cast(arr_)); } + return Status::OK(); +} + +template +inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { + RETURN_NOT_OK(PrepareInputData(data)); + std::shared_ptr input_type; RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); @@ -426,38 +450,12 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { return Status::OK(); } -template <> -inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { - int64_t nbytes = BitUtil::BytesForBits(length_); - std::shared_ptr buffer; - RETURN_NOT_OK(AllocateBuffer(pool_, nbytes, &buffer)); - - Ndarray1DIndexer values(arr_); - - uint8_t* bitmap = buffer->mutable_data(); - - memset(bitmap, 0, nbytes); - for (int i = 0; i < length_; ++i) { - if (values[i] > 0) { - 
BitUtil::SetBit(bitmap, i); - } - } - - *data = buffer; - return Status::OK(); -} - template <> inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { - if (is_strided()) { - RETURN_NOT_OK(CopyStridedArray(arr_, length_, pool_, data)); - } else { - // Can zero-copy - *data = std::make_shared(reinterpret_cast(arr_)); - } - std::shared_ptr input_type; + RETURN_NOT_OK(PrepareInputData(data)); + auto date_dtype = reinterpret_cast(dtype_->c_metadata); if (dtype_->type_num == NPY_DATETIME) { // If we have inbound datetime64[D] data, this needs to be downcasted @@ -489,17 +487,11 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* d template <> inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { - if (is_strided()) { - RETURN_NOT_OK(CopyStridedArray(arr_, length_, pool_, data)); - } else { - // Can zero-copy - *data = std::make_shared(reinterpret_cast(arr_)); - } - constexpr int64_t kMillisecondsInDay = 86400000; - std::shared_ptr input_type; + RETURN_NOT_OK(PrepareInputData(data)); + auto date_dtype = reinterpret_cast(dtype_->c_metadata); if (dtype_->type_num == NPY_DATETIME) { // If we have inbound datetime64[D] data, this needs to be downcasted diff --git a/cpp/src/arrow/python/type_traits.h b/cpp/src/arrow/python/type_traits.h index d90517a60a28a..bc71ec4e90bd0 100644 --- a/cpp/src/arrow/python/type_traits.h +++ b/cpp/src/arrow/python/type_traits.h @@ -149,6 +149,7 @@ template <> struct arrow_traits { static constexpr int npy_type = NPY_BOOL; static constexpr bool supports_nulls = false; + typedef typename npy_traits::value_type T; }; #define INT_DECL(TYPE) \ diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 3e89f5eb4ff70..cd7f4999ace3a 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -113,13 +113,13 @@ def _check_array_roundtrip(values, expected=None, mask=None, else: assert arr.null_count == (mask | values_nulls).sum() - if mask is None: - tm.assert_series_equal(pd.Series(result), pd.Series(values), - check_names=False) - else: - expected = pd.Series(np.ma.masked_array(values, mask=mask)) - tm.assert_series_equal(pd.Series(result), expected, - check_names=False) + if expected is None: + if mask is None: + expected = pd.Series(values) + else: + expected = pd.Series(np.ma.masked_array(values, mask=mask)) + + tm.assert_series_equal(pd.Series(result), expected, check_names=False) def _check_array_from_pandas_roundtrip(np_array, type=None): @@ -559,6 +559,11 @@ def test_float_nulls_to_ints(self): assert table[0].to_pylist() == [1, 2, None] tm.assert_frame_equal(df, table.to_pandas()) + def test_float_nulls_to_boolean(self): + s = pd.Series([0.0, 1.0, 2.0, None, -3.0]) + expected = pd.Series([False, True, True, None, True]) + _check_array_roundtrip(s, expected=expected, type=pa.bool_()) + def test_integer_no_nulls(self): data = OrderedDict() fields = [] @@ -672,6 +677,26 @@ def test_boolean_nulls(self): tm.assert_frame_equal(result, ex_frame) + def test_boolean_to_int(self): + # test from dtype=bool + s = pd.Series([True, True, False, True, True] * 2) + expected = pd.Series([1, 1, 0, 1, 1] * 2) + _check_array_roundtrip(s, expected=expected, type=pa.int64()) + + def test_boolean_objects_to_int(self): + # test from dtype=object + s = pd.Series([True, True, False, True, True] * 2, dtype=object) + expected = pd.Series([1, 1, 0, 1, 1] * 2) + expected_msg = 'Expected integer, got bool' + with pytest.raises(pa.ArrowTypeError, match=expected_msg): + 
_check_array_roundtrip(s, expected=expected, type=pa.int64()) + + def test_boolean_nulls_to_float(self): + # test from dtype=object + s = pd.Series([True, True, False, None, True] * 2) + expected = pd.Series([1.0, 1.0, 0.0, None, 1.0] * 2) + _check_array_roundtrip(s, expected=expected, type=pa.float64()) + def test_float_object_nulls(self): arr = np.array([None, 1.5, np.float64(3.5)] * 5, dtype=object) df = pd.DataFrame({'floats': arr}) From 8ab1493c810ae354ce085c2c2052676f349b168a Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Wed, 9 Jan 2019 22:30:39 -0600 Subject: [PATCH 197/328] ARROW-4065: [C++] arrowTargets.cmake is broken When we build Arrow's cpp library using CMake, arrowTargets.cmake will be generated and installed but it's broken. The following is a part of arrowTargets.cmake generated. ``` # Create imported target arrow_shared add_library(arrow_shared SHARED IMPORTED) set_target_properties(arrow_shared PROPERTIES INTERFACE_LINK_LIBRARIES "dl;pthreadshared" ) # Create imported target arrow_static add_library(arrow_static STATIC IMPORTED) set_target_properties(arrow_static PROPERTIES INTERFACE_LINK_LIBRARIES "glog_static;zstd_static;zlib_shared;snappy_static;lz4_static;brotli_dec_static;brotli_enc_static;brotli_common_static;double-conversion_static;boost_system_shared;boost_filesystem_shared;boost_regex_shared;jemalloc_static;rt;pthreadshared" ) ``` There are no INTERFACE_INCLUDE_DIRECTORIES and linker doesn't recognize pthreadshared because the true name of pthread should be libpthread.so or libpthread.a. *_static and *_shared are also wrong name. After this fix, we can build apps which links to arrow using CMake with CMakeLists.txt like as follows. ``` cmake_minimum_required(VERSION ...) project(...) ... find_package(arrow) add_executable(your_excellent_app ...) target_link_libraries(your_excellent_app arrow_shared) # or arrow_static ... 
``` `$ cmake -D CMAKE_PREFIX_PATH=/path/to/arrow /path/to/CMakeLists.txt` `$ cmake --build .` Author: Kousuke Saruta Closes #3212 from sarutak/improve-cmake-config-file-generation and squashes the following commits: 0213d2666 Fix cpp/CMakeLists.txt, src/arrow/CMakeLists.txt and BuildUtils.cmake to enable building apps which links to Arrow using arrowTargets.cmake --- cpp/CMakeLists.txt | 80 +++++++++++++++++++++--------- cpp/cmake_modules/BuildUtils.cmake | 30 +++++++++-- cpp/src/arrow/CMakeLists.txt | 4 +- 3 files changed, 84 insertions(+), 30 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4232af3a12005..0e4f3951156a6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -690,46 +690,59 @@ endif(UNIX) ############################################################ set(ARROW_LINK_LIBS) +set(ARROW_SHARED_INSTALL_INTERFACE_LIBS) +set(ARROW_STATIC_INSTALL_INTERFACE_LIBS) # Libraries to link statically with libarrow.so set(ARROW_STATIC_LINK_LIBS double-conversion_static) +set(ARROW_STATIC_INSTALL_INTERFACE_LIBS double-conversion) if (ARROW_WITH_BROTLI) - SET(ARROW_STATIC_LINK_LIBS + list(APPEND + ARROW_STATIC_LINK_LIBS brotli_dec_static brotli_enc_static - brotli_common_static - ${ARROW_STATIC_LINK_LIBS}) + brotli_common_static) + list(APPEND + ARROW_STATIC_INSTALL_INTERFACE_LIBS + brotlidec + brotlienc + brotlicommon) endif() if (ARROW_WITH_BZ2) - SET(ARROW_STATIC_LINK_LIBS bz2_static ${ARROW_STATIC_LINK_LIBS}) + list(APPEND ARROW_STATIC_LINK_LIBS bz2_static) + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS bz2) endif() if (ARROW_WITH_LZ4) - SET(ARROW_STATIC_LINK_LIBS lz4_static ${ARROW_STATIC_LINK_LIBS}) + list(APPEND ARROW_STATIC_LINK_LIBS lz4_static) + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS lz4) endif() if (ARROW_WITH_SNAPPY) - SET(ARROW_STATIC_LINK_LIBS snappy_static ${ARROW_STATIC_LINK_LIBS}) + list(APPEND ARROW_STATIC_LINK_LIBS snappy_static) + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS snappy) endif() if (ARROW_WITH_ZLIB) - SET(ARROW_STATIC_LINK_LIBS ${ZLIB_LIBRARY} ${ARROW_STATIC_LINK_LIBS}) + list(APPEND ARROW_STATIC_LINK_LIBS ${ZLIB_LIBRARY}) + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS z) endif() if (ARROW_WITH_ZSTD) - SET(ARROW_STATIC_LINK_LIBS zstd_static ${ARROW_STATIC_LINK_LIBS}) + list(APPEND ARROW_STATIC_LINK_LIBS zstd_static) + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS zstd) endif() if (ARROW_ORC) - SET(ARROW_STATIC_LINK_LIBS - ${ARROW_STATIC_LINK_LIBS} - orc_static) + list(APPEND ARROW_STATIC_LINK_LIBS orc_static) + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS orc) endif() if (ARROW_USE_GLOG) - SET(ARROW_STATIC_LINK_LIBS glog_static ${ARROW_STATIC_LINK_LIBS}) + list(APPEND ARROW_STATIC_LINK_LIBS glog_static) + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS glog) add_definitions("-DARROW_USE_GLOG") endif() @@ -746,15 +759,24 @@ set(ARROW_SHARED_PRIVATE_LINK_LIBS ${BOOST_FILESYSTEM_LIBRARY} ${BOOST_REGEX_LIBRARY}) -set(ARROW_STATIC_LINK_LIBS - ${ARROW_STATIC_LINK_LIBS} +list(APPEND + ARROW_STATIC_LINK_LIBS ${BOOST_SYSTEM_LIBRARY} ${BOOST_FILESYSTEM_LIBRARY} ${BOOST_REGEX_LIBRARY}) +list(APPEND + ARROW_STATIC_INSTALL_INTERFACE_LIBS + boost_system + boost_filesystem + boost_regex) + if (NOT MSVC) - set(ARROW_LINK_LIBS - ${ARROW_LINK_LIBS} + list(APPEND + ARROW_LINK_LIBS + ${CMAKE_DL_LIBS}) + list(APPEND + ARROW_SHARED_INSTALL_INTERFACE_LIBS ${CMAKE_DL_LIBS}) endif() @@ -822,21 +844,31 @@ if (ARROW_JEMALLOC) jemalloc_static ) endif() - set(ARROW_SHARED_PRIVATE_LINK_LIBS - ${ARROW_SHARED_PRIVATE_LINK_LIBS} + list(APPEND + 
ARROW_SHARED_PRIVATE_LINK_LIBS ${ARROW_JEMALLOC_LINK_LIBS}) - set(ARROW_STATIC_LINK_LIBS - ${ARROW_STATIC_LINK_LIBS} + list(APPEND + ARROW_STATIC_LINK_LIBS ${ARROW_JEMALLOC_LINK_LIBS}) + list(APPEND + ARROW_STATIC_INSTALL_INTERFACE_LIBS + jemalloc + rt) endif(ARROW_JEMALLOC) if (PTHREAD_LIBRARY) - set(ARROW_LINK_LIBS - ${ARROW_LINK_LIBS} + list(APPEND + ARROW_LINK_LIBS pthreadshared) - set(ARROW_STATIC_LINK_LIBS - ${ARROW_STATIC_LINK_LIBS} + list(APPEND + ARROW_SHARED_INSTALL_INTERFACE_LIBS + pthread) + list(APPEND + ARROW_STATIC_LINK_LIBS pthreadshared) + list(APPEND + ARROW_STATIC_INSTALL_INTERFACE_LIBS + pthread) endif() ############################################################ diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index cf2145b8a9166..fffd15819f85f 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -97,7 +97,9 @@ function(ADD_ARROW_LIB LIB_NAME) SHARED_PRIVATE_LINK_LIBS EXTRA_INCLUDES PRIVATE_INCLUDES - DEPENDENCIES) + DEPENDENCIES + SHARED_INSTALL_INTERFACE_LIBS + STATIC_INSTALL_INTERFACE_LIBS) cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) if(ARG_UNPARSED_ARGUMENTS) message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") @@ -204,8 +206,16 @@ function(ADD_ARROW_LIB LIB_NAME) VERSION "${ARROW_FULL_SO_VERSION}" SOVERSION "${ARROW_SO_VERSION}") + if (ARG_SHARED_INSTALL_INTERFACE_LIBS) + set(INTERFACE_LIBS ${ARG_SHARED_INSTALL_INTERFACE_LIBS}) + else() + set(INTERFACE_LIBS ${ARG_SHARED_LINK_LIBS}) + endif() + target_link_libraries(${LIB_NAME}_shared - LINK_PUBLIC ${ARG_SHARED_LINK_LIBS} + LINK_PUBLIC + "$" + "$" LINK_PRIVATE ${ARG_SHARED_PRIVATE_LINK_LIBS}) if (ARROW_RPATH_ORIGIN) @@ -235,7 +245,8 @@ function(ADD_ARROW_LIB LIB_NAME) EXPORT ${PROJECT_NAME}-targets RUNTIME DESTINATION ${RUNTIME_INSTALL_DIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() if (BUILD_STATIC) @@ -274,15 +285,24 @@ function(ADD_ARROW_LIB LIB_NAME) LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}" OUTPUT_NAME ${LIB_NAME_STATIC}) + if (ARG_STATIC_INSTALL_INTERFACE_LIBS) + set(INTERFACE_LIBS ${ARG_STATIC_INSTALL_INTERFACE_LIBS}) + else() + set(INTERFACE_LIBS ${ARG_STATIC_LINK_LIBS}) + endif() + target_link_libraries(${LIB_NAME}_static - LINK_PUBLIC ${ARG_STATIC_LINK_LIBS}) + LINK_PUBLIC + "$" + "$") install(TARGETS ${LIB_NAME}_static ${INSTALL_IS_OPTIONAL} EXPORT ${PROJECT_NAME}-targets RUNTIME DESTINATION ${RUNTIME_INSTALL_DIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() # Modify variable in calling scope diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 59f035792b80d..244d0b9342f08 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -226,7 +226,9 @@ ADD_ARROW_LIB(arrow SHARED_LINK_FLAGS ${ARROW_SHARED_LINK_FLAGS} SHARED_LINK_LIBS ${ARROW_LINK_LIBS} SHARED_PRIVATE_LINK_LIBS ${ARROW_SHARED_PRIVATE_LINK_LIBS} - STATIC_LINK_LIBS ${ARROW_STATIC_LINK_LIBS}) + STATIC_LINK_LIBS ${ARROW_STATIC_LINK_LIBS} + SHARED_INSTALL_INTERFACE_LIBS ${ARROW_SHARED_INSTALL_INTERFACE_LIBS} + STATIC_INSTALL_INTERFACE_LIBS ${ARROW_STATIC_INSTALL_INTERFACE_LIBS}) add_dependencies(arrow ${ARROW_LIBRARIES}) From 
7fcad2c29e3c3ac99b2f6c1f1fddc91c05b7f2b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 9 Jan 2019 22:38:12 -0600 Subject: [PATCH 198/328] ARROW-3126: [Python] Make Buffered* IO classes available to Python, incorporate into input_stream, output_stream factory functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We should add benchmarks too as a follow up PR. Author: Krisztián Szűcs Author: Wes McKinney Closes #3252 from kszucs/ARROW-3126 and squashes the following commits: 50118a639 Fix API in file-benchmark.cc d3917d9e5 Code review comments, buffer_size=0 means unbuffered 88bed90ef lint 5842eae0e remove test runner script fd729abdb don't typehint _detect_compression 3d1e386ce tests 5e8b38551 fix failing test e458db5a6 python support for buffered input and output streams --- cpp/CMakeLists.txt | 2 +- cpp/src/arrow/io/api.h | 1 + cpp/src/arrow/io/buffered-test.cc | 5 +- cpp/src/arrow/io/buffered.cc | 30 +++-- cpp/src/arrow/io/buffered.h | 20 +-- cpp/src/arrow/io/file-benchmark.cc | 9 +- python/pyarrow/includes/libarrow.pxd | 16 +++ python/pyarrow/io.pxi | 195 +++++++++++++-------------- python/pyarrow/tests/test_io.py | 86 ++++++++++++ 9 files changed, 234 insertions(+), 130 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0e4f3951156a6..08868af829b9e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -80,7 +80,6 @@ if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1" OR INFER_FOUND) # See http://clang.llvm.org/docs/JSONCompilationDatabase.html set(CMAKE_EXPORT_COMPILE_COMMANDS 1) endif() - # ---------------------------------------------------------------------- # cmake options @@ -358,6 +357,7 @@ endif() if (ARROW_USE_CCACHE) find_program(CCACHE_FOUND ccache) if(CCACHE_FOUND) + message(STATUS "Using ccache: ${CCACHE_FOUND}") set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_FOUND}) set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_FOUND}) endif(CCACHE_FOUND) diff --git a/cpp/src/arrow/io/api.h b/cpp/src/arrow/io/api.h index 0d5742ad65864..cf1be337fd1a9 100644 --- a/cpp/src/arrow/io/api.h +++ b/cpp/src/arrow/io/api.h @@ -18,6 +18,7 @@ #ifndef ARROW_IO_API_H #define ARROW_IO_API_H +#include "arrow/io/buffered.h" #include "arrow/io/compressed.h" #include "arrow/io/file.h" #include "arrow/io/hdfs.h" diff --git a/cpp/src/arrow/io/buffered-test.cc b/cpp/src/arrow/io/buffered-test.cc index 074833d4bf7b7..7b9ab0cd890b1 100644 --- a/cpp/src/arrow/io/buffered-test.cc +++ b/cpp/src/arrow/io/buffered-test.cc @@ -105,7 +105,8 @@ class TestBufferedOutputStream : public FileTestFixture { lseek(fd_, 0, SEEK_END); #endif } - ASSERT_OK(BufferedOutputStream::Create(file, buffer_size, &buffered_)); + ASSERT_OK(BufferedOutputStream::Create(buffer_size, default_memory_pool(), file, + &buffered_)); } void WriteChunkwise(const std::string& datastr, const std::valarray& sizes) { @@ -321,7 +322,7 @@ class TestBufferedInputStream : public FileTestFixture { std::shared_ptr file_in; ASSERT_OK(ReadableFile::Open(path_, &file_in)); raw_ = file_in; - ASSERT_OK(BufferedInputStream::Create(raw_, buffer_size, pool, &buffered_)); + ASSERT_OK(BufferedInputStream::Create(buffer_size, pool, raw_, &buffered_)); } protected: diff --git a/cpp/src/arrow/io/buffered.cc b/cpp/src/arrow/io/buffered.cc index f3eae39c8e62e..0b1431f440fa2 100644 --- a/cpp/src/arrow/io/buffered.cc +++ b/cpp/src/arrow/io/buffered.cc @@ -91,8 +91,8 @@ class BufferedBase { class BufferedOutputStream::Impl : public BufferedBase { 
public: - explicit Impl(std::shared_ptr raw) - : BufferedBase(default_memory_pool()), raw_(std::move(raw)) {} + explicit Impl(std::shared_ptr raw, MemoryPool* pool) + : BufferedBase(pool), raw_(std::move(raw)) {} Status Close() { std::lock_guard guard(lock_); @@ -173,14 +173,16 @@ class BufferedOutputStream::Impl : public BufferedBase { std::shared_ptr raw_; }; -BufferedOutputStream::BufferedOutputStream(std::shared_ptr raw) - : impl_(new BufferedOutputStream::Impl(std::move(raw))) {} +BufferedOutputStream::BufferedOutputStream(std::shared_ptr raw, + MemoryPool* pool) { + impl_.reset(new Impl(std::move(raw), pool)); +} -Status BufferedOutputStream::Create(std::shared_ptr raw, - int64_t buffer_size, +Status BufferedOutputStream::Create(int64_t buffer_size, MemoryPool* pool, + std::shared_ptr raw, std::shared_ptr* out) { - auto result = - std::shared_ptr(new BufferedOutputStream(std::move(raw))); + auto result = std::shared_ptr( + new BufferedOutputStream(std::move(raw), pool)); RETURN_NOT_OK(result->SetBufferSize(buffer_size)); *out = std::move(result); return Status::OK(); @@ -217,12 +219,12 @@ std::shared_ptr BufferedOutputStream::raw() const { return impl_-> // ---------------------------------------------------------------------- // BufferedInputStream implementation -class BufferedInputStream::BufferedInputStreamImpl : public BufferedBase { +class BufferedInputStream::Impl : public BufferedBase { public: - BufferedInputStreamImpl(std::shared_ptr raw, MemoryPool* pool) + Impl(std::shared_ptr raw, MemoryPool* pool) : BufferedBase(pool), raw_(std::move(raw)), bytes_buffered_(0) {} - ~BufferedInputStreamImpl() { DCHECK_OK(Close()); } + ~Impl() { DCHECK_OK(Close()); } Status Close() { std::lock_guard guard(lock_); @@ -350,13 +352,13 @@ class BufferedInputStream::BufferedInputStreamImpl : public BufferedBase { BufferedInputStream::BufferedInputStream(std::shared_ptr raw, MemoryPool* pool) { - impl_.reset(new BufferedInputStreamImpl(std::move(raw), pool)); + impl_.reset(new Impl(std::move(raw), pool)); } BufferedInputStream::~BufferedInputStream() { DCHECK_OK(impl_->Close()); } -Status BufferedInputStream::Create(std::shared_ptr raw, int64_t buffer_size, - MemoryPool* pool, +Status BufferedInputStream::Create(int64_t buffer_size, MemoryPool* pool, + std::shared_ptr raw, std::shared_ptr* out) { auto result = std::shared_ptr(new BufferedInputStream(std::move(raw), pool)); diff --git a/cpp/src/arrow/io/buffered.h b/cpp/src/arrow/io/buffered.h index d5079556c7cfc..945915bfe998f 100644 --- a/cpp/src/arrow/io/buffered.h +++ b/cpp/src/arrow/io/buffered.h @@ -40,12 +40,13 @@ class ARROW_EXPORT BufferedOutputStream : public OutputStream { ~BufferedOutputStream() override; /// \brief Create a buffered output stream wrapping the given output stream. + /// \param[in] buffer_size the size of the temporary write buffer + /// \param[in] pool a MemoryPool to use for allocations /// \param[in] raw another OutputStream - /// \param[in] buffer_size the size of the temporary buffer. 
Allocates from - /// the default memory pool /// \param[out] out the created BufferedOutputStream /// \return Status - static Status Create(std::shared_ptr raw, int64_t buffer_size, + static Status Create(int64_t buffer_size, MemoryPool* pool, + std::shared_ptr raw, std::shared_ptr* out); /// \brief Resize internal buffer @@ -79,7 +80,7 @@ class ARROW_EXPORT BufferedOutputStream : public OutputStream { std::shared_ptr raw() const; private: - explicit BufferedOutputStream(std::shared_ptr raw); + explicit BufferedOutputStream(std::shared_ptr raw, MemoryPool* pool); class ARROW_NO_EXPORT Impl; std::unique_ptr impl_; @@ -94,12 +95,13 @@ class ARROW_EXPORT BufferedInputStream : public InputStream { ~BufferedInputStream() override; /// \brief Create a BufferedInputStream from a raw InputStream - /// \param[in] raw a raw InputStream /// \param[in] buffer_size the size of the temporary read buffer /// \param[in] pool a MemoryPool to use for allocations + /// \param[in] raw a raw InputStream /// \param[out] out the created BufferedInputStream - static Status Create(std::shared_ptr raw, int64_t buffer_size, - MemoryPool* pool, std::shared_ptr* out); + static Status Create(int64_t buffer_size, MemoryPool* pool, + std::shared_ptr raw, + std::shared_ptr* out); /// \brief Resize internal read buffer; calls to Read(...) will read at least /// \param[in] new_buffer_size the new read buffer size @@ -138,8 +140,8 @@ class ARROW_EXPORT BufferedInputStream : public InputStream { private: explicit BufferedInputStream(std::shared_ptr raw, MemoryPool* pool); - class ARROW_NO_EXPORT BufferedInputStreamImpl; - std::unique_ptr impl_; + class ARROW_NO_EXPORT Impl; + std::unique_ptr impl_; }; } // namespace io diff --git a/cpp/src/arrow/io/file-benchmark.cc b/cpp/src/arrow/io/file-benchmark.cc index c57fa6d605d68..4439a18978232 100644 --- a/cpp/src/arrow/io/file-benchmark.cc +++ b/cpp/src/arrow/io/file-benchmark.cc @@ -163,7 +163,8 @@ static void BM_BufferedOutputStreamSmallWritesToNull( ABORT_NOT_OK(io::FileOutputStream::Open(GetNullFile(), &file)); std::shared_ptr buffered_file; - ABORT_NOT_OK(io::BufferedOutputStream::Create(file, kBufferSize, &buffered_file)); + ABORT_NOT_OK(io::BufferedOutputStream::Create(kBufferSize, default_memory_pool(), file, + &buffered_file)); BenchmarkStreamingWrites(state, small_sizes, buffered_file.get()); } @@ -196,7 +197,8 @@ static void BM_BufferedOutputStreamSmallWritesToPipe( SetupPipeWriter(&stream, &reader); std::shared_ptr buffered_stream; - ABORT_NOT_OK(io::BufferedOutputStream::Create(stream, kBufferSize, &buffered_stream)); + ABORT_NOT_OK(io::BufferedOutputStream::Create(kBufferSize, default_memory_pool(), + stream, &buffered_stream)); BenchmarkStreamingWrites(state, small_sizes, buffered_stream.get(), reader.get()); } @@ -207,7 +209,8 @@ static void BM_BufferedOutputStreamLargeWritesToPipe( SetupPipeWriter(&stream, &reader); std::shared_ptr buffered_stream; - ABORT_NOT_OK(io::BufferedOutputStream::Create(stream, kBufferSize, &buffered_stream)); + ABORT_NOT_OK(io::BufferedOutputStream::Create(kBufferSize, default_memory_pool(), + stream, &buffered_stream)); BenchmarkStreamingWrites(state, large_sizes, buffered_stream.get(), reader.get()); } diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index cc77ff432967f..97bc892ddf3fe 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -697,6 +697,22 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil: CStatus Make(CCodec* codec, 
shared_ptr[OutputStream] raw, shared_ptr[CCompressedOutputStream]* out) + cdef cppclass CBufferedInputStream \ + " arrow::io::BufferedInputStream"(InputStream): + + @staticmethod + CStatus Create(int64_t buffer_size, CMemoryPool* pool, + shared_ptr[InputStream] raw, + shared_ptr[CBufferedInputStream]* out) + + cdef cppclass CBufferedOutputStream \ + " arrow::io::BufferedOutputStream"(OutputStream): + + @staticmethod + CStatus Create(int64_t buffer_size, CMemoryPool* pool, + shared_ptr[OutputStream] raw, + shared_ptr[CBufferedOutputStream]* out) + # ---------------------------------------------------------------------- # HDFS diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 97abde8f892af..52122740b63ae 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1064,32 +1064,6 @@ cdef class BufferReader(NativeFile): self.is_readable = True -cdef shared_ptr[InputStream] _make_compressed_input_stream( - shared_ptr[InputStream] stream, - CompressionType compression_type) except *: - cdef: - shared_ptr[CCompressedInputStream] compressed_stream - unique_ptr[CCodec] codec - - check_status(CCodec.Create(compression_type, &codec)) - check_status(CCompressedInputStream.Make(codec.get(), stream, - &compressed_stream)) - return compressed_stream - - -cdef shared_ptr[OutputStream] _make_compressed_output_stream( - shared_ptr[OutputStream] stream, - CompressionType compression_type) except *: - cdef: - shared_ptr[CCompressedOutputStream] compressed_stream - unique_ptr[CCodec] codec - - check_status(CCodec.Create(compression_type, &codec)) - check_status(CCompressedOutputStream.Make(codec.get(), stream, - &compressed_stream)) - return compressed_stream - - cdef class CompressedInputStream(NativeFile): """ An input stream wrapper which decompresses data on the fly. 
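For reference, a minimal sketch of how the new `buffer_size` argument combines with compression in the `input_stream`/`output_stream` factories introduced by this patch; the file name is hypothetical and not part of the change:

```python
import pyarrow as pa

# Minimal sketch (not from the patch): combine the new buffer_size argument
# with compression; "data.bin.gz" is a hypothetical file name.
data = b"some test data\n" * 100

# The returned stream wraps the file in a BufferedOutputStream, then a
# CompressedOutputStream.
with pa.output_stream("data.bin.gz", compression="gzip", buffer_size=1024) as sink:
    sink.write(data)

# buffer_size=None (or 0) keeps the previous unbuffered behaviour;
# compression='detect' (the default) picks gzip from the ".gz" suffix.
with pa.input_stream("data.bin.gz", buffer_size=64) as source:
    assert source.read() == data
```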
@@ -1104,26 +1078,19 @@ cdef class CompressedInputStream(NativeFile): def __init__(self, NativeFile stream, compression): cdef: CompressionType compression_type + unique_ptr[CCodec] codec + shared_ptr[CCompressedInputStream] compressed_stream compression_type = _get_compression_type(compression) if compression_type == CompressionType_UNCOMPRESSED: - raise ValueError("Invalid value for compression: %r" - % (compression,)) - self._init(stream, compression_type) + raise ValueError('Invalid value for compression: {!r}' + .format(compression)) - @staticmethod - cdef create(NativeFile stream, CompressionType compression_type): - cdef: - CompressedInputStream self - - self = CompressedInputStream.__new__(CompressedInputStream) - self._init(stream, compression_type) - return self + check_status(CCodec.Create(compression_type, &codec)) + check_status(CCompressedInputStream.Make( + codec.get(), stream.get_input_stream(), &compressed_stream)) - cdef _init(self, NativeFile stream, CompressionType compression_type): - self.set_input_stream( - _make_compressed_input_stream(stream.get_input_stream(), - compression_type)) + self.set_input_stream( compressed_stream) self.is_readable = True @@ -1138,29 +1105,55 @@ cdef class CompressedOutputStream(NativeFile): The compression type ("bz2", "brotli", "gzip", "lz4", "snappy" or "zstd") """ + def __init__(self, NativeFile stream, compression): cdef: CompressionType compression_type + unique_ptr[CCodec] codec + shared_ptr[CCompressedOutputStream] compressed_stream compression_type = _get_compression_type(compression) if compression_type == CompressionType_UNCOMPRESSED: - raise ValueError("Invalid value for compression: %r" - % (compression,)) - self._init(stream, compression_type) + raise ValueError('Invalid value for compression: {!r}' + .format(compression)) - @staticmethod - cdef create(NativeFile stream, CompressionType compression_type): - cdef: - CompressedOutputStream self + check_status(CCodec.Create(compression_type, &codec)) + check_status(CCompressedOutputStream.Make( + codec.get(), stream.get_output_stream(), &compressed_stream)) - self = CompressedOutputStream.__new__(CompressedOutputStream) - self._init(stream, compression_type) - return self + self.set_output_stream( compressed_stream) + self.is_writable = True + + +cdef class BufferedInputStream(NativeFile): + + def __init__(self, NativeFile stream, int buffer_size, + MemoryPool memory_pool=None): + cdef shared_ptr[CBufferedInputStream] buffered_stream + + if buffer_size <= 0: + raise ValueError('Buffer size must be larger than zero') + check_status(CBufferedInputStream.Create( + buffer_size, maybe_unbox_memory_pool(memory_pool), + stream.get_input_stream(), &buffered_stream)) + + self.set_input_stream( buffered_stream) + self.is_readable = True + + +cdef class BufferedOutputStream(NativeFile): + + def __init__(self, NativeFile stream, int buffer_size, + MemoryPool memory_pool=None): + cdef shared_ptr[CBufferedOutputStream] buffered_stream + + if buffer_size <= 0: + raise ValueError('Buffer size must be larger than zero') + check_status(CBufferedOutputStream.Create( + buffer_size, maybe_unbox_memory_pool(memory_pool), + stream.get_output_stream(), &buffered_stream)) - cdef _init(self, NativeFile stream, CompressionType compression_type): - self.set_output_stream( - _make_compressed_output_stream(stream.get_output_stream(), - compression_type)) + self.set_output_stream( buffered_stream) self.is_writable = True @@ -1232,24 +1225,27 @@ cdef get_input_stream(object source, c_bool use_memory_map, """ 
cdef: NativeFile nf + unique_ptr[CCodec] codec shared_ptr[InputStream] input_stream shared_ptr[CCompressedInputStream] compressed_stream - CompressionType compression_type = CompressionType_UNCOMPRESSED - unique_ptr[CCodec] codec + CompressionType compression_type try: source_path = _stringify_path(source) except TypeError: - pass + compression = None else: - compression_type = _get_compression_type_by_filename(source_path) + compression = _detect_compression(source_path) + compression_type = _get_compression_type(compression) nf = _get_native_file(source, use_memory_map) input_stream = nf.get_input_stream() if compression_type != CompressionType_UNCOMPRESSED: - input_stream = _make_compressed_input_stream(input_stream, - compression_type) + check_status(CCodec.Create(compression_type, &codec)) + check_status(CCompressedInputStream.Make(codec.get(), input_stream, + &compressed_stream)) + input_stream = compressed_stream out[0] = input_stream @@ -1292,21 +1288,19 @@ cdef CompressionType _get_compression_type(object name) except *: elif name == 'zstd': return CompressionType_ZSTD else: - raise ValueError("Unrecognized compression type: {0}" - .format(str(name))) + raise ValueError('Unrecognized compression type: {}'.format(name)) -cdef CompressionType _get_compression_type_by_filename(filename) except *: - if filename.endswith('.bz2'): - return CompressionType_BZ2 - elif filename.endswith('.gz'): - return CompressionType_GZIP - elif filename.endswith('.lz4'): - return CompressionType_LZ4 - elif filename.endswith('.zst'): - return CompressionType_ZSTD - else: - return CompressionType_UNCOMPRESSED +def _detect_compression(path): + if isinstance(path, six.string_types): + if path.endswith('.bz2'): + return 'bz2' + elif path.endswith('.gz'): + return 'gzip' + elif path.endswith('.lz4'): + return 'lz4' + elif path.endswith('.zst'): + return 'zstd' def compress(object buf, codec='lz4', asbytes=False, memory_pool=None): @@ -1427,18 +1421,7 @@ def decompress(object buf, decompressed_size=None, codec='lz4', return pybuf if asbytes else out_buf -cdef CompressionType _stream_compression_argument( - compression, source_path) except *: - if compression == 'detect': - if source_path is not None: - return _get_compression_type_by_filename(source_path) - else: - return CompressionType_UNCOMPRESSED - else: - return _get_compression_type(compression) - - -def input_stream(source, compression='detect'): +def input_stream(source, compression='detect', buffer_size=None): """ Create an Arrow input stream. @@ -1452,18 +1435,17 @@ def input_stream(source, compression='detect'): chosen based on the file extension. If None, no compression will be applied. Otherwise, a well-known algorithm name must be supplied (e.g. "gzip") + buffer_size: int, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary read buffer. 
""" - cdef: - CompressionType compression_type - NativeFile stream + cdef NativeFile stream try: source_path = _stringify_path(source) except TypeError: source_path = None - compression_type = _stream_compression_argument(compression, source_path) - if isinstance(source, NativeFile): stream = source elif source_path is not None: @@ -1479,13 +1461,19 @@ def input_stream(source, compression='detect'): raise TypeError("pa.input_stream() called with instance of '{}'" .format(source.__class__)) - if compression_type != CompressionType_UNCOMPRESSED: - stream = CompressedInputStream.create(stream, compression_type) + if compression == 'detect': + compression = _detect_compression(source_path) + + if buffer_size is not None and buffer_size != 0: + stream = BufferedInputStream(stream, buffer_size) + + if compression is not None: + stream = CompressedInputStream(stream, compression) return stream -def output_stream(source, compression='detect'): +def output_stream(source, compression='detect', buffer_size=None): """ Create an Arrow output stream. @@ -1499,18 +1487,17 @@ def output_stream(source, compression='detect'): chosen based on the file extension. If None, no compression will be applied. Otherwise, a well-known algorithm name must be supplied (e.g. "gzip") + buffer_size: int, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary write buffer. """ - cdef: - CompressionType compression_type - NativeFile stream + cdef NativeFile stream try: source_path = _stringify_path(source) except TypeError: source_path = None - compression_type = _stream_compression_argument(compression, source_path) - if isinstance(source, NativeFile): stream = source elif source_path is not None: @@ -1526,7 +1513,13 @@ def output_stream(source, compression='detect'): raise TypeError("pa.output_stream() called with instance of '{}'" .format(source.__class__)) - if compression_type != CompressionType_UNCOMPRESSED: - stream = CompressedOutputStream.create(stream, compression_type) + if compression == 'detect': + compression = _detect_compression(source_path) + + if buffer_size is not None and buffer_size != 0: + stream = BufferedOutputStream(stream, buffer_size) + + if compression is not None: + stream = CompressedOutputStream(stream, compression) return stream diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index f54f03a9ff92e..77ed70c31ca77 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -1134,6 +1134,44 @@ def test_input_stream_file_path_compressed(tmpdir): assert stream.read() == gz_data +def test_input_stream_file_path_buffered(tmpdir): + data = b"some test data\n" * 10 + b"eof\n" + file_path = tmpdir / 'input_stream.buffered' + with open(str(file_path), 'wb') as f: + f.write(data) + + stream = pa.input_stream(file_path, buffer_size=32) + assert stream.read() == data + stream = pa.input_stream(str(file_path), buffer_size=64) + assert stream.read() == data + stream = pa.input_stream(pathlib.Path(str(file_path)), buffer_size=1024) + assert stream.read() == data + + unbuffered_stream = pa.input_stream(file_path, buffer_size=0) + assert isinstance(unbuffered_stream, pa.OSFile) + + msg = 'Buffer size must be larger than zero' + with pytest.raises(ValueError, match=msg): + pa.input_stream(file_path, buffer_size=-1) + with pytest.raises(TypeError): + pa.input_stream(file_path, buffer_size='million') + + +def test_input_stream_file_path_compressed_and_buffered(tmpdir): + data = b"some test data\n" * 100 + b"eof\n" + 
gz_data = gzip_compress(data) + file_path = tmpdir / 'input_stream_compressed_and_buffered.gz' + with open(str(file_path), 'wb') as f: + f.write(gz_data) + + stream = pa.input_stream(file_path, buffer_size=32, compression='gzip') + assert stream.read() == data + stream = pa.input_stream(str(file_path), buffer_size=64) + assert stream.read() == data + stream = pa.input_stream(pathlib.Path(str(file_path)), buffer_size=1024) + assert stream.read() == data + + def test_input_stream_python_file(tmpdir): data = b"some test data\n" * 10 + b"eof\n" bio = BytesIO(data) @@ -1232,6 +1270,54 @@ def check_data(file_path, data, **kwargs): check_data(file_path, data, compression='gzip')) == data assert check_data(file_path, data, compression=None) == data + with pytest.raises(ValueError, match='Unrecognized compression type'): + assert check_data(file_path, data, compression='rabbit') == data + + +def test_output_stream_file_path_buffered(tmpdir): + data = b"some test data\n" * 10 + b"eof\n" + file_path = tmpdir / 'output_stream.buffered' + + def check_data(file_path, data, **kwargs): + with pa.output_stream(file_path, **kwargs) as stream: + stream.write(data) + with open(str(file_path), 'rb') as f: + return f.read() + + unbuffered_stream = pa.output_stream(file_path, buffer_size=0) + assert isinstance(unbuffered_stream, pa.OSFile) + + msg = 'Buffer size must be larger than zero' + with pytest.raises(ValueError, match=msg): + assert check_data(file_path, data, buffer_size=-128) == data + + assert check_data(file_path, data, buffer_size=32) == data + assert check_data(file_path, data, buffer_size=1024) == data + assert check_data(str(file_path), data, buffer_size=32) == data + + result = check_data(pathlib.Path(str(file_path)), data, buffer_size=32) + assert result == data + + +def test_output_stream_file_path_compressed_and_buffered(tmpdir): + data = b"some test data\n" * 100 + b"eof\n" + file_path = tmpdir / 'output_stream_compressed_and_buffered.gz' + + def check_data(file_path, data, **kwargs): + with pa.output_stream(file_path, **kwargs) as stream: + stream.write(data) + with open(str(file_path), 'rb') as f: + return f.read() + + result = check_data(file_path, data, buffer_size=32) + assert gzip_decompress(result) == data + + result = check_data(file_path, data, buffer_size=1024) + assert gzip_decompress(result) == data + + result = check_data(file_path, data, buffer_size=1024, compression='gzip') + assert gzip_decompress(result) == data + def test_output_stream_python_file(tmpdir): data = b"some test data\n" * 10 + b"eof\n" From ea69e8fe4901329e53455c8d6fafad1c4f35d827 Mon Sep 17 00:00:00 2001 From: Yosuke Shiro Date: Thu, 10 Jan 2019 17:07:39 +0900 Subject: [PATCH 199/328] ARROW-4207: [Gandiva] [GLib] Add support for IfNode Author: Yosuke Shiro Author: Kouhei Sutou Closes #3354 from shiro615/glib-add-support-for-if-node and squashes the following commits: d543ea00 Add support for error 8058b2c7 Add support for IfNode --- c_glib/gandiva-glib/node.cpp | 207 ++++++++++++++++++++++++++++ c_glib/gandiva-glib/node.h | 19 +++ c_glib/gandiva-glib/node.hpp | 6 + c_glib/test/gandiva/test-if-node.rb | 49 +++++++ 4 files changed, 281 insertions(+) create mode 100644 c_glib/test/gandiva/test-if-node.rb diff --git a/c_glib/gandiva-glib/node.cpp b/c_glib/gandiva-glib/node.cpp index 2c68cbeabe330..a3814c190412d 100644 --- a/c_glib/gandiva-glib/node.cpp +++ b/c_glib/gandiva-glib/node.cpp @@ -95,6 +95,8 @@ G_BEGIN_DECLS * #GGandivaStringLiteralNode is a class for a node in the expression tree, * representing an 
UTF-8 encoded string literal. * + * #GGandivaIfNode is a class for a node in the expression tree, representing an if-else. + * * Since: 0.12.0 */ @@ -1180,6 +1182,194 @@ ggandiva_string_literal_node_get_value(GGandivaStringLiteralNode *node) return value.c_str(); } + +typedef struct GGandivaIfNodePrivate_ { + GGandivaNode *condition_node; + GGandivaNode *then_node; + GGandivaNode *else_node; +} GGandivaIfNodePrivate; + +enum { + PROP_CONDITION_NODE = 1, + PROP_THEN_NODE, + PROP_ELSE_NODE, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GGandivaIfNode, + ggandiva_if_node, + GGANDIVA_TYPE_NODE) + +#define GGANDIVA_IF_NODE_GET_PRIVATE(object) \ + static_cast( \ + ggandiva_if_node_get_instance_private( \ + GGANDIVA_IF_NODE(object))) + +static void +ggandiva_if_node_dispose(GObject *object) +{ + auto priv = GGANDIVA_IF_NODE_GET_PRIVATE(object); + + if (priv->condition_node) { + g_object_unref(priv->condition_node); + priv->condition_node = nullptr; + } + + if (priv->then_node) { + g_object_unref(priv->then_node); + priv->then_node = nullptr; + } + + if (priv->else_node) { + g_object_unref(priv->else_node); + priv->else_node = nullptr; + } + + G_OBJECT_CLASS(ggandiva_if_node_parent_class)->dispose(object); +} + +static void +ggandiva_if_node_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GGANDIVA_IF_NODE_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_CONDITION_NODE: + priv->condition_node = GGANDIVA_NODE(g_value_dup_object(value)); + break; + case PROP_THEN_NODE: + priv->then_node = GGANDIVA_NODE(g_value_dup_object(value)); + break; + case PROP_ELSE_NODE: + priv->else_node = GGANDIVA_NODE(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +ggandiva_if_node_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GGANDIVA_IF_NODE_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_CONDITION_NODE: + g_value_set_object(value, priv->condition_node); + break; + case PROP_THEN_NODE: + g_value_set_object(value, priv->then_node); + break; + case PROP_ELSE_NODE: + g_value_set_object(value, priv->else_node); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +ggandiva_if_node_init(GGandivaIfNode *if_node) +{ +} + +static void +ggandiva_if_node_class_init(GGandivaIfNodeClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = ggandiva_if_node_dispose; + gobject_class->set_property = ggandiva_if_node_set_property; + gobject_class->get_property = ggandiva_if_node_get_property; + + GParamSpec *spec; + spec = g_param_spec_object("condition-node", + "Condition node", + "The condition node", + GGANDIVA_TYPE_NODE, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_CONDITION_NODE, spec); + + spec = g_param_spec_object("then-node", + "Then node", + "The then node", + GGANDIVA_TYPE_NODE, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_THEN_NODE, spec); + + spec = g_param_spec_object("else-node", + "Else node", + "The else node", + GGANDIVA_TYPE_NODE, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_ELSE_NODE, spec); +} + +/** + * ggandiva_if_node_new: + * @condition_node: the node with the condition for if-else expression. 
+ * @then_node: the node in case the condition node is true. + * @else_node: the node in case the condition node is false. + * @return_type: A #GArrowDataType. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GGandivaIfNode or %NULl on error. + * + * Since: 0.12.0 + */ +GGandivaIfNode * +ggandiva_if_node_new(GGandivaNode *condition_node, + GGandivaNode *then_node, + GGandivaNode *else_node, + GArrowDataType *return_type, + GError **error) +{ + if (!condition_node || !then_node || !else_node || !return_type) { + /* TODO: Improve error message to show which arguments are invalid. */ + g_set_error(error, + GARROW_ERROR, + GARROW_ERROR_INVALID, + "[gandiva][if-literal-node][new] " + "all arguments must not NULL"); + return NULL; + } + auto gandiva_condition_node = ggandiva_node_get_raw(condition_node); + auto gandiva_then_node = ggandiva_node_get_raw(then_node); + auto gandiva_else_node = ggandiva_node_get_raw(else_node); + auto arrow_return_type = garrow_data_type_get_raw(return_type); + auto gandiva_node = gandiva::TreeExprBuilder::MakeIf(gandiva_condition_node, + gandiva_then_node, + gandiva_else_node, + arrow_return_type); + if (!gandiva_node) { + g_set_error(error, + GARROW_ERROR, + GARROW_ERROR_INVALID, + "[gandiva][if-literal-node][new] " + "failed to create: if (<%s>) {<%s>} else {<%s>} -> <%s>", + gandiva_condition_node->ToString().c_str(), + gandiva_then_node->ToString().c_str(), + gandiva_else_node->ToString().c_str(), + arrow_return_type->ToString().c_str()); + return NULL; + } + return ggandiva_if_node_new_raw(&gandiva_node, + condition_node, + then_node, + else_node, + return_type); +} + G_END_DECLS std::shared_ptr @@ -1305,3 +1495,20 @@ ggandiva_literal_node_new_raw(std::shared_ptr *gandiva_node, return literal_node; } + +GGandivaIfNode * +ggandiva_if_node_new_raw(std::shared_ptr *gandiva_node, + GGandivaNode *condition_node, + GGandivaNode *then_node, + GGandivaNode *else_node, + GArrowDataType *return_type) +{ + auto if_node = g_object_new(GGANDIVA_TYPE_IF_NODE, + "node", gandiva_node, + "condition-node", condition_node, + "then-node", then_node, + "else-node", else_node, + "return-type", return_type, + NULL); + return GGANDIVA_IF_NODE(if_node); +} diff --git a/c_glib/gandiva-glib/node.h b/c_glib/gandiva-glib/node.h index d9e67e27b7eea..ffcf41da10b21 100644 --- a/c_glib/gandiva-glib/node.h +++ b/c_glib/gandiva-glib/node.h @@ -320,4 +320,23 @@ ggandiva_string_literal_node_new(const gchar *value); const gchar * ggandiva_string_literal_node_get_value(GGandivaStringLiteralNode *node); + +#define GGANDIVA_TYPE_IF_NODE (ggandiva_if_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaIfNode, + ggandiva_if_node, + GGANDIVA, + IF_NODE, + GGandivaNode) +struct _GGandivaIfNodeClass +{ + GGandivaNodeClass parent_class; +}; + +GGandivaIfNode * +ggandiva_if_node_new(GGandivaNode *condition_node, + GGandivaNode *then_node, + GGandivaNode *else_node, + GArrowDataType *return_type, + GError **error); + G_END_DECLS diff --git a/c_glib/gandiva-glib/node.hpp b/c_glib/gandiva-glib/node.hpp index 40f9d1b465591..9a6ae98058699 100644 --- a/c_glib/gandiva-glib/node.hpp +++ b/c_glib/gandiva-glib/node.hpp @@ -38,3 +38,9 @@ ggandiva_function_node_new_raw(std::shared_ptr *gandiva_node, GGandivaLiteralNode * ggandiva_literal_node_new_raw(std::shared_ptr *gandiva_node, GArrowDataType *return_type); +GGandivaIfNode * +ggandiva_if_node_new_raw(std::shared_ptr *gandiva_node, + GGandivaNode *condition_node, + GGandivaNode *then_node, + GGandivaNode 
*else_node, + GArrowDataType *return_type); diff --git a/c_glib/test/gandiva/test-if-node.rb b/c_glib/test/gandiva/test-if-node.rb new file mode 100644 index 0000000000000..b00359590905d --- /dev/null +++ b/c_glib/test/gandiva/test-if-node.rb @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGandivaIfNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + field1 = Arrow::Field.new("field1", Arrow::Int32DataType.new) + field2 = Arrow::Field.new("field2", Arrow::Int32DataType.new) + @then_node = Gandiva::FieldNode.new(field1) + @else_node = Gandiva::FieldNode.new(field2) + @return_type = Arrow::Int32DataType.new + @condition_node = Gandiva::FunctionNode.new("greater_than", + [@then_node, @else_node], + @return_type) + @if_node = Gandiva::IfNode.new(@condition_node, + @then_node, + @else_node, + @return_type) + end + + def test_readers + assert_equal([ + @condition_node, + @then_node, + @else_node, + @return_type + ], + [ + @if_node.condition_node, + @if_node.then_node, + @if_node.else_node, + @if_node.return_type + ]) + end +end From 9c0e643442fa6d4ca3db18b1fe4adf8fcd7dd807 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Thu, 10 Jan 2019 17:18:30 +0900 Subject: [PATCH 200/328] ARROW-4211: [GLib] Add GArrowFixedSizeBinaryDataType Author: Kouhei Sutou Author: Yosuke Shiro Closes #3358 from shiro615/glib-add-fixed-size-binary-data-type and squashes the following commits: a19354e9 Add garrow_fixed_size_binary_data_type_get_byte_width() 8dd67811 Fix GArrowFixedSizeBinaryDataType's parent f2491309 Add GArrowFixedSizeBinaryDataType --- c_glib/arrow-glib/basic-data-type.cpp | 62 ++++++++++++++++++- c_glib/arrow-glib/basic-data-type.h | 24 ++++++- c_glib/arrow-glib/type.cpp | 2 + c_glib/arrow-glib/type.h | 3 + .../test/test-fixed-size-binary-data-type.rb | 39 ++++++++++++ 5 files changed, 125 insertions(+), 5 deletions(-) create mode 100644 c_glib/test/test-fixed-size-binary-data-type.rb diff --git a/c_glib/arrow-glib/basic-data-type.cpp b/c_glib/arrow-glib/basic-data-type.cpp index 861bbaf388801..b6c5705fb070b 100644 --- a/c_glib/arrow-glib/basic-data-type.cpp +++ b/c_glib/arrow-glib/basic-data-type.cpp @@ -66,6 +66,8 @@ G_BEGIN_DECLS * * #GArrowBinaryDataType is a class for binary data type. * + * #GArrowFixedSizeBinaryDataType is a class for fixed-size binary data type. + * * #GArrowStringDataType is a class for UTF-8 encoded string data * type. * @@ -239,7 +241,7 @@ garrow_fixed_width_data_type_class_init(GArrowFixedWidthDataTypeClass *klass) } /** - * garrow_fixed_width_data_type_get_id: + * garrow_fixed_width_data_type_get_bit_width: * @data_type: A #GArrowFixedWidthDataType. * * Returns: The number of bits for one data. 
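The new type added below can also be constructed and inspected from C; a minimal sketch, assuming the usual `arrow-glib.h` umbrella header (not part of the patch):

```c
#include <arrow-glib/arrow-glib.h>

/* Minimal sketch (not from the patch): create the new fixed-size binary
 * data type and read back its byte width. */
int
main(void)
{
  GArrowFixedSizeBinaryDataType *data_type =
    garrow_fixed_size_binary_data_type_new(10);
  gint32 byte_width =
    garrow_fixed_size_binary_data_type_get_byte_width(data_type);
  g_print("byte width: %d\n", byte_width);
  g_object_unref(data_type);
  return 0;
}
```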
@@ -716,6 +718,59 @@ garrow_binary_data_type_new(void) } +G_DEFINE_TYPE(GArrowFixedSizeBinaryDataType, + garrow_fixed_size_binary_data_type, + GARROW_TYPE_FIXED_WIDTH_DATA_TYPE) + +static void +garrow_fixed_size_binary_data_type_init(GArrowFixedSizeBinaryDataType *object) +{ +} + +static void +garrow_fixed_size_binary_data_type_class_init(GArrowFixedSizeBinaryDataTypeClass *klass) +{ +} + +/** + * garrow_fixed_size_binary_data_type: + * @byte_width: The byte width. + * + * Returns: The newly created fixed-size binary data type. + * + * Since: 0.12.0 + */ +GArrowFixedSizeBinaryDataType * +garrow_fixed_size_binary_data_type_new(gint32 byte_width) +{ + auto arrow_fixed_size_binary_data_type = arrow::fixed_size_binary(byte_width); + + auto fixed_size_binary_data_type = + GARROW_FIXED_SIZE_BINARY_DATA_TYPE(g_object_new(GARROW_TYPE_FIXED_SIZE_BINARY_DATA_TYPE, + "data-type", &arrow_fixed_size_binary_data_type, + NULL)); + return fixed_size_binary_data_type; +} + +/** + * garrow_fixed_size_binary_data_type_get_byte_width: + * @data_type: A #GArrowFixedSizeBinaryDataType. + * + * Returns: The number of bytes for one data. + * + * Since: 0.12.0 + */ +gint32 +garrow_fixed_size_binary_data_type_get_byte_width(GArrowFixedSizeBinaryDataType *data_type) +{ + const auto arrow_data_type = + garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + const auto arrow_fixed_size_binary_type = + std::static_pointer_cast(arrow_data_type); + return arrow_fixed_size_binary_type->byte_width(); +} + + G_DEFINE_TYPE(GArrowStringDataType, garrow_string_data_type, GARROW_TYPE_DATA_TYPE) @@ -1044,7 +1099,7 @@ garrow_time64_data_type_new(GArrowTimeUnit unit, GError **error) G_DEFINE_ABSTRACT_TYPE(GArrowDecimalDataType, garrow_decimal_data_type, - GARROW_TYPE_DATA_TYPE) + GARROW_TYPE_FIXED_SIZE_BINARY_DATA_TYPE) static void garrow_decimal_data_type_init(GArrowDecimalDataType *object) @@ -1197,6 +1252,9 @@ garrow_data_type_new_raw(std::shared_ptr *arrow_data_type) case arrow::Type::type::BINARY: type = GARROW_TYPE_BINARY_DATA_TYPE; break; + case arrow::Type::type::FIXED_SIZE_BINARY: + type = GARROW_TYPE_FIXED_SIZE_BINARY_DATA_TYPE; + break; case arrow::Type::type::STRING: type = GARROW_TYPE_STRING_DATA_TYPE; break; diff --git a/c_glib/arrow-glib/basic-data-type.h b/c_glib/arrow-glib/basic-data-type.h index ef41f1dbcfa0b..d18958265748d 100644 --- a/c_glib/arrow-glib/basic-data-type.h +++ b/c_glib/arrow-glib/basic-data-type.h @@ -338,6 +338,25 @@ GType garrow_binary_data_type_get_type (void) G_GNUC_CONST; GArrowBinaryDataType *garrow_binary_data_type_new (void); +#define GARROW_TYPE_FIXED_SIZE_BINARY_DATA_TYPE (garrow_fixed_size_binary_data_type_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowFixedSizeBinaryDataType, + garrow_fixed_size_binary_data_type, + GARROW, + FIXED_SIZE_BINARY_DATA_TYPE, + GArrowDataType) +struct _GArrowFixedSizeBinaryDataTypeClass +{ + GArrowFixedWidthDataTypeClass parent_class; +}; + +GARROW_AVAILABLE_IN_0_12 +GArrowFixedSizeBinaryDataType * +garrow_fixed_size_binary_data_type_new(gint32 byte_width); +GARROW_AVAILABLE_IN_0_12 +gint32 +garrow_fixed_size_binary_data_type_get_byte_width(GArrowFixedSizeBinaryDataType *data_type); + + #define GARROW_TYPE_STRING_DATA_TYPE \ (garrow_string_data_type_get_type()) #define GARROW_STRING_DATA_TYPE(obj) \ @@ -651,15 +670,14 @@ GArrowTime64DataType *garrow_time64_data_type_new (GArrowTimeUnit unit, #define GARROW_TYPE_DECIMAL_DATA_TYPE (garrow_decimal_data_type_get_type()) -/* TODO: Delivered from GArrowFixedSizeBinaryDataType. 
*/ G_DECLARE_DERIVABLE_TYPE(GArrowDecimalDataType, garrow_decimal_data_type, GARROW, DECIMAL_DATA_TYPE, - GArrowDataType) + GArrowFixedSizeBinaryDataType) struct _GArrowDecimalDataTypeClass { - GArrowDataTypeClass parent_class; + GArrowFixedSizeBinaryDataTypeClass parent_class; }; #ifndef GARROW_DISABLE_DEPRECATED diff --git a/c_glib/arrow-glib/type.cpp b/c_glib/arrow-glib/type.cpp index 0642004e2f07b..e227ed2c31fc8 100644 --- a/c_glib/arrow-glib/type.cpp +++ b/c_glib/arrow-glib/type.cpp @@ -66,6 +66,8 @@ garrow_type_from_raw(arrow::Type::type type) return GARROW_TYPE_STRING; case arrow::Type::type::BINARY: return GARROW_TYPE_BINARY; + case arrow::Type::type::FIXED_SIZE_BINARY: + return GARROW_TYPE_FIXED_SIZE_BINARY; case arrow::Type::type::DATE32: return GARROW_TYPE_DATE32; case arrow::Type::type::DATE64: diff --git a/c_glib/arrow-glib/type.h b/c_glib/arrow-glib/type.h index 2137c785515f8..85f55c452be55 100644 --- a/c_glib/arrow-glib/type.h +++ b/c_glib/arrow-glib/type.h @@ -40,6 +40,8 @@ G_BEGIN_DECLS * @GARROW_TYPE_DOUBLE: 8-byte floating point value. * @GARROW_TYPE_STRING: UTF-8 variable-length string. * @GARROW_TYPE_BINARY: Variable-length bytes (no guarantee of UTF-8-ness). + * @GARROW_TYPE_FIXED_SIZE_BINARY: Fixed-size binary. Each value occupies + * the same number of bytes. * @GARROW_TYPE_DATE32: int32 days since the UNIX epoch. * @GARROW_TYPE_DATE64: int64 milliseconds since the UNIX epoch. * @GARROW_TYPE_TIMESTAMP: Exact timestamp encoded with int64 since UNIX epoch. @@ -72,6 +74,7 @@ typedef enum { GARROW_TYPE_DOUBLE, GARROW_TYPE_STRING, GARROW_TYPE_BINARY, + GARROW_TYPE_FIXED_SIZE_BINARY, GARROW_TYPE_DATE32, GARROW_TYPE_DATE64, GARROW_TYPE_TIMESTAMP, diff --git a/c_glib/test/test-fixed-size-binary-data-type.rb b/c_glib/test/test-fixed-size-binary-data-type.rb new file mode 100644 index 0000000000000..584fb3deec93d --- /dev/null +++ b/c_glib/test/test-fixed-size-binary-data-type.rb @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestFixedSizeBinaryDataType < Test::Unit::TestCase + def setup + @byte_width = 10 + @data_type = Arrow::FixedSizeBinaryDataType.new(@byte_width) + end + + def test_type + assert_equal(Arrow::Type::FIXED_SIZE_BINARY, @data_type.id) + end + + def test_to_s + assert_equal("fixed_size_binary[10]", @data_type.to_s) + end + + def test_byte_width + assert_equal(@byte_width, @data_type.byte_width) + end + + def test_bit_width + assert_equal(@byte_width * 8, @data_type.bit_width) + end +end From f67a5150df7d11a0ad5bc53044c192b023ad312c Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Thu, 10 Jan 2019 17:39:11 +0900 Subject: [PATCH 201/328] ARROW-4214: [Ruby] Add support for building RecordBatch from raw Ruby objects Author: Kouhei Sutou Closes #3360 from kou/ruby-record-batch-builder-append-records and squashes the following commits: e85bbaf5 Add support for building RecordBatch from raw Ruby objects --- ruby/red-arrow/lib/arrow/array-builder.rb | 8 +- .../red-arrow/lib/arrow/list-array-builder.rb | 10 ++ ruby/red-arrow/lib/arrow/loader.rb | 8 ++ .../lib/arrow/record-batch-builder.rb | 115 +++++++++++++++++ ruby/red-arrow/lib/arrow/record-batch.rb | 16 +++ .../lib/arrow/struct-array-builder.rb | 10 ++ .../red-arrow/test/test-list-array-builder.rb | 17 +++ .../test/test-record-batch-builder.rb | 116 ++++++++++++++++++ ruby/red-arrow/test/test-record-batch.rb | 114 ++++++++++++----- .../test/test-struct-array-builder.rb | 20 +++ 10 files changed, 400 insertions(+), 34 deletions(-) create mode 100644 ruby/red-arrow/lib/arrow/record-batch-builder.rb create mode 100644 ruby/red-arrow/test/test-record-batch-builder.rb diff --git a/ruby/red-arrow/lib/arrow/array-builder.rb b/ruby/red-arrow/lib/arrow/array-builder.rb index 8edb3c4bfbbd9..7cfc4329aed6e 100644 --- a/ruby/red-arrow/lib/arrow/array-builder.rb +++ b/ruby/red-arrow/lib/arrow/array-builder.rb @@ -65,6 +65,12 @@ def build(values) end def build(values) + append(*values) + finish + end + + # @since 0.12.0 + def append(*values) value_convertable = respond_to?(:convert_to_arrow_value, true) start_index = 0 current_index = 0 @@ -111,8 +117,6 @@ def build(values) append_nulls(current_index - start_index) end end - - finish end def append_nulls(n) diff --git a/ruby/red-arrow/lib/arrow/list-array-builder.rb b/ruby/red-arrow/lib/arrow/list-array-builder.rb index aa093c2de9b5c..1fa507f69a72f 100644 --- a/ruby/red-arrow/lib/arrow/list-array-builder.rb +++ b/ruby/red-arrow/lib/arrow/list-array-builder.rb @@ -82,5 +82,15 @@ def append_values(lists, is_valids=nil) end end end + + # @since 0.12.0 + def append(*values) + if values.empty? 
+ # For backward compatibility + append_value + else + super + end + end end end diff --git a/ruby/red-arrow/lib/arrow/loader.rb b/ruby/red-arrow/lib/arrow/loader.rb index acd2573e3218f..6e0bf2929022f 100644 --- a/ruby/red-arrow/lib/arrow/loader.rb +++ b/ruby/red-arrow/lib/arrow/loader.rb @@ -54,6 +54,7 @@ def require_libraries require "arrow/path-extension" require "arrow/record" require "arrow/record-batch" + require "arrow/record-batch-builder" require "arrow/record-batch-file-reader" require "arrow/record-batch-stream-reader" require "arrow/rolling-window" @@ -89,6 +90,13 @@ def load_object_info(info) def load_method_info(info, klass, method_name) case klass.name + when /Builder\z/ + case method_name + when "append" + return + else + super + end when "Arrow::StringArray" case method_name when "get_value" diff --git a/ruby/red-arrow/lib/arrow/record-batch-builder.rb b/ruby/red-arrow/lib/arrow/record-batch-builder.rb new file mode 100644 index 0000000000000..dba16b3b8116d --- /dev/null +++ b/ruby/red-arrow/lib/arrow/record-batch-builder.rb @@ -0,0 +1,115 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Arrow + class RecordBatchBuilder + class << self + # @since 0.12.0 + def build(schema, data) + builder = new(schema) + builder.append(data) + builder.flush + end + end + + alias_method :initialize_raw, :initialize + private :initialize_raw + def initialize(schema) + unless schema.is_a?(Schema) + schema = Schema.new(schema) + end + initialize_raw(schema) + @name_to_index = {} + schema.fields.each_with_index do |field, i| + @name_to_index[field.name] = i + end + end + + # @since 0.12.0 + def [](name_or_index) + case name_or_index + when String, Symbol + name = name_or_index + self[resolve_name(name)] + else + index = name_or_index + column_builders[index] + end + end + + # @since 0.12.0 + def append(*values) + values.each do |value| + case value + when Hash + append_columns(value) + else + append_records(value) + end + end + end + + # @since 0.12.0 + def append_records(records) + n = n_fields + columns = n.times.collect do + [] + end + records.each_with_index do |record, nth_record| + case record + when nil + when Hash + record.each do |name, value| + nth_column = resolve_name(name) + next if nth_column.nil? + columns[nth_column] << value + end + else + record.each_with_index do |value, nth_column| + columns[nth_column] << value + end + end + columns.each do |column| + column << nil if column.size != (nth_record + 1) + end + end + columns.each_with_index do |column, i| + self[i].append(*column) + end + end + + # @since 0.12.0 + def append_columns(columns) + columns.each do |name, values| + self[name].append(*values) + end + end + + private + def resolve_name(name) + @name_to_index[name.to_s] + end + + # TODO: Make public with good name. 
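A minimal sketch of driving the builder added below directly; field names and values are arbitrary, not from the patch:

```ruby
# Minimal sketch (not from the patch): drive the new builder directly.
schema = Arrow::Schema.new(visible: :boolean, count: :uint32)
builder = Arrow::RecordBatchBuilder.new(schema)
builder.append([
                 {visible: true,  count: 1},
                 {visible: false, count: 2},
               ])
record_batch = builder.flush
```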
Is column_builders good enough? + # builders? sub_builders? + def column_builders + @column_builders ||= n_fields.times.collect do |i| + get_field(i) + end + end + end +end diff --git a/ruby/red-arrow/lib/arrow/record-batch.rb b/ruby/red-arrow/lib/arrow/record-batch.rb index 6d9c35b9dc849..b577d4a41a6c6 100644 --- a/ruby/red-arrow/lib/arrow/record-batch.rb +++ b/ruby/red-arrow/lib/arrow/record-batch.rb @@ -22,6 +22,22 @@ class RecordBatch include RecordContainable include Enumerable + class << self + def new(*args) + n_args = args.size + case n_args + when 2 + schema, data = args + RecordBatchBuilder.build(schema, data) + when 3 + super + else + message = "wrong number of arguments (given #{n_args}, expected 2..3)" + raise ArgumentError, message + end + end + end + alias_method :each, :each_record alias_method :columns_raw, :columns diff --git a/ruby/red-arrow/lib/arrow/struct-array-builder.rb b/ruby/red-arrow/lib/arrow/struct-array-builder.rb index 52f75aab46d35..b56056cad4471 100644 --- a/ruby/red-arrow/lib/arrow/struct-array-builder.rb +++ b/ruby/red-arrow/lib/arrow/struct-array-builder.rb @@ -119,6 +119,16 @@ def append_null end end + # @since 0.12.0 + def append(*values) + if values.empty? + # For backward compatibility + append_value_raw + else + super + end + end + private def cached_field_builders @field_builders ||= field_builders diff --git a/ruby/red-arrow/test/test-list-array-builder.rb b/ruby/red-arrow/test/test-list-array-builder.rb index e36f2c8340be4..aee31e73b1b96 100644 --- a/ruby/red-arrow/test/test-list-array-builder.rb +++ b/ruby/red-arrow/test/test-list-array-builder.rb @@ -59,4 +59,21 @@ def setup array.collect {|list| list ? list.to_a : nil}) end end + + sub_test_case("#append") do + test("backward compatibility") do + @builder.append + @builder.value_builder.append(true) + @builder.value_builder.append(false) + @builder.append + @builder.value_builder.append(true) + array = @builder.finish + + assert_equal([ + [true, false], + [true], + ], + array.collect(&:to_a)) + end + end end diff --git a/ruby/red-arrow/test/test-record-batch-builder.rb b/ruby/red-arrow/test/test-record-batch-builder.rb new file mode 100644 index 0000000000000..7cd1f8cee7a16 --- /dev/null +++ b/ruby/red-arrow/test/test-record-batch-builder.rb @@ -0,0 +1,116 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
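Taken together, these additions let a record batch be assembled directly from plain Ruby hashes and arrays. A minimal sketch of how the 0.12.0 builder API introduced above might be used (the schema and the values are illustrative only; the method names come from the patch itself):

    schema = Arrow::Schema.new(visible: :boolean, count: :uint32)

    # Row-oriented input: a Hash row is matched by field name, an Array row
    # by position, and a nil row becomes a row of nulls.
    builder = Arrow::RecordBatchBuilder.new(schema)
    builder.append_records([
      {visible: true, count: 1},
      [false, 2],
      nil,
    ])
    record_batch = builder.flush

    # Column-oriented input in one step via the convenience class method.
    record_batch = Arrow::RecordBatchBuilder.build(schema,
                                                   {visible: [true, false],
                                                    count: [1, 2]})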
+ +class RecordBatchBuilderTest < Test::Unit::TestCase + sub_test_case(".new") do + test("Schema") do + schema = Arrow::Schema.new(visible: :boolean, + count: :uint32) + builder = Arrow::RecordBatchBuilder.new(schema) + assert_equal(schema, + builder.schema) + end + + test("Hash") do + builder = Arrow::RecordBatchBuilder.new(visible: :boolean, + count: :uint32) + assert_equal(Arrow::Schema.new(visible: :boolean, + count: :uint32), + builder.schema) + end + end + + sub_test_case("instance methods") do + def setup + @schema = Arrow::Schema.new(visible: :boolean, + count: :uint32) + @builder = Arrow::RecordBatchBuilder.new(@schema) + end + + sub_test_case("#[]") do + test("String") do + assert_equal(Arrow::BooleanDataType.new, + @builder["visible"].value_data_type) + end + + test("Symbol") do + assert_equal(Arrow::BooleanDataType.new, + @builder[:visible].value_data_type) + end + + test("Integer") do + assert_equal(Arrow::UInt32DataType.new, + @builder[1].value_data_type) + end + end + + test("#append") do + records = [ + {visible: true, count: 1}, + ] + columns = { + visible: [false], + count: [2], + } + arrays = [ + Arrow::BooleanArray.new([true, false]), + Arrow::UInt32Array.new([1, 2]), + ] + @builder.append(records, columns) + assert_equal(Arrow::RecordBatch.new(@schema, + arrays[0].length, + arrays), + @builder.flush) + end + + test("#append_records") do + records = [ + {visible: true, count: 1}, + {visible: true, count: 2, garbage: "garbage"}, + {visible: true}, + [false, 4], + nil, + [true], + ] + arrays = [ + Arrow::BooleanArray.new([true, true, true, false, nil, true]), + Arrow::UInt32Array.new([1, 2, nil, 4, nil, nil]), + ] + @builder.append_records(records) + assert_equal(Arrow::RecordBatch.new(@schema, + arrays[0].length, + arrays), + @builder.flush) + end + + test("#append_columns") do + columns = { + visible: [true, true, true, false, nil, true], + count: [1, 2, nil, 4, nil, nil], + } + arrays = [ + Arrow::BooleanArray.new(columns[:visible]), + Arrow::UInt32Array.new(columns[:count]), + ] + @builder.append_columns(columns) + assert_equal(Arrow::RecordBatch.new(@schema, + arrays[0].length, + arrays), + @builder.flush) + end + end +end diff --git a/ruby/red-arrow/test/test-record-batch.rb b/ruby/red-arrow/test/test-record-batch.rb index 4dac085bff86e..d33298b4e5f7f 100644 --- a/ruby/red-arrow/test/test-record-batch.rb +++ b/ruby/red-arrow/test/test-record-batch.rb @@ -16,47 +16,97 @@ # under the License. 
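For reference, the reworked constructor dispatch below means Arrow::RecordBatch.new accepts either raw Ruby objects (two arguments, routed through RecordBatchBuilder.build) or the original explicit form (three arguments). A rough sketch with illustrative values:

    schema = Arrow::Schema.new(visible: :boolean, count: :uint32)

    # Two arguments: raw records (or a column Hash) are built via RecordBatchBuilder.
    record_batch = Arrow::RecordBatch.new(schema,
                                          [{visible: true, count: 1},
                                           [false, 2]])

    # Three arguments: explicit row count and column arrays, as before.
    columns = [
      Arrow::BooleanArray.new([true, false]),
      Arrow::UInt32Array.new([1, 2]),
    ]
    record_batch = Arrow::RecordBatch.new(schema, columns[0].length, columns)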
class RecordBatchTest < Test::Unit::TestCase - setup do - fields = [ - Arrow::Field.new("count", :uint32), - ] - @schema = Arrow::Schema.new(fields) - @counts = Arrow::UInt32Array.new([1, 2, 4, 8]) - @record_batch = Arrow::RecordBatch.new(@schema, @counts.length, [@counts]) - end + sub_test_case(".new") do + def setup + @schema = Arrow::Schema.new(visible: :boolean, + count: :uint32) + end - sub_test_case(".each") do - test("default") do - records = [] - @record_batch.each do |record| - records << [record, record.index] - end + test("[Schema, records]") do + records = [ + {visible: true, count: 1}, + nil, + [false, 3], + ] + record_batch = Arrow::RecordBatch.new(@schema, records) assert_equal([ - [0, 0], - [1, 1], - [2, 2], - [3, 3], + {"visible" => true, "count" => 1}, + {"visible" => nil, "count" => nil}, + {"visible" => false, "count" => 3}, ], - records.collect {|record, i| [record.index, i]}) + record_batch.each_record.collect(&:to_h)) end - test("reuse_record: true") do - records = [] - @record_batch.each(reuse_record: true) do |record| - records << [record, record.index] - end + test("[Schema, columns]") do + columns = { + visible: [true, nil, false], + count: [1, 2, nil], + } + record_batch = Arrow::RecordBatch.new(@schema, columns) + assert_equal([ + {"visible" => true, "count" => 1}, + {"visible" => nil, "count" => 2}, + {"visible" => false, "count" => nil}, + ], + record_batch.each_record.collect(&:to_h)) + end + + test("[Schema, n_rows, columns]") do + columns = [ + Arrow::BooleanArray.new([true, nil, false]), + Arrow::UInt32Array.new([1, 2, nil]), + ] + n_rows = columns[0].length + record_batch = Arrow::RecordBatch.new(@schema, n_rows, columns) assert_equal([ - [3, 0], - [3, 1], - [3, 2], - [3, 3], + {"visible" => true, "count" => 1}, + {"visible" => nil, "count" => 2}, + {"visible" => false, "count" => nil}, ], - records.collect {|record, i| [record.index, i]}) + record_batch.each_record.collect(&:to_h)) end end - test("#to_table") do - assert_equal(Arrow::Table.new(@schema, [@counts]), - @record_batch.to_table) + sub_test_case("instance methods") do + def setup + @schema = Arrow::Schema.new(count: :uint32) + @counts = Arrow::UInt32Array.new([1, 2, 4, 8]) + @record_batch = Arrow::RecordBatch.new(@schema, @counts.length, [@counts]) + end + + sub_test_case("#each") do + test("default") do + records = [] + @record_batch.each do |record| + records << [record, record.index] + end + assert_equal([ + [0, 0], + [1, 1], + [2, 2], + [3, 3], + ], + records.collect {|record, i| [record.index, i]}) + end + + test("reuse_record: true") do + records = [] + @record_batch.each(reuse_record: true) do |record| + records << [record, record.index] + end + assert_equal([ + [3, 0], + [3, 1], + [3, 2], + [3, 3], + ], + records.collect {|record, i| [record.index, i]}) + end + end + + test("#to_table") do + assert_equal(Arrow::Table.new(@schema, [@counts]), + @record_batch.to_table) + end end end diff --git a/ruby/red-arrow/test/test-struct-array-builder.rb b/ruby/red-arrow/test/test-struct-array-builder.rb index 42e1ded78e318..f7706ee8d190b 100644 --- a/ruby/red-arrow/test/test-struct-array-builder.rb +++ b/ruby/red-arrow/test/test-struct-array-builder.rb @@ -157,4 +157,24 @@ def setup ]) end end + + sub_test_case("#append") do + test("backward compatibility") do + @builder.append + @builder.get_field_builder(0).append(true) + @builder.get_field_builder(1).append(1) + @builder.append + @builder.get_field_builder(0).append(false) + @builder.get_field_builder(1).append(2) + array = @builder.finish + 
assert_equal([ + [true, 1], + [false, 2], + ], + [ + array.get_value(0).values, + array.get_value(1).values, + ]) + end + end end From b8d59133465c8be85603f9b0f23fdc687ec2e2ba Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Thu, 10 Jan 2019 09:47:00 +0100 Subject: [PATCH 202/328] ARROW-3916: [Python] Add support for `filesystem` kwarg in ParquetWriter Implements [ARROW 3916](https://jira.apache.org/jira/browse/ARROW-3916). Author: Sweeney, Mack Author: Wes McKinney Closes #3070 from macks22/ARROW-3916_ParquetDataset_filesystem_kwarg and squashes the following commits: b5973bc0 Fixes post rebase 124d9df8 Add support for filesystem kwarg in ParquetWriter --- python/pyarrow/filesystem.py | 18 ++++++++---- python/pyarrow/parquet.py | 25 +++++++++-------- python/pyarrow/tests/test_parquet.py | 41 +++++++++++++++++++++++++++- 3 files changed, 67 insertions(+), 17 deletions(-) diff --git a/python/pyarrow/filesystem.py b/python/pyarrow/filesystem.py index 92a65ce69892a..43280799bccce 100644 --- a/python/pyarrow/filesystem.py +++ b/python/pyarrow/filesystem.py @@ -23,7 +23,7 @@ from six.moves.urllib.parse import urlparse import pyarrow as pa -from pyarrow.util import implements, _stringify_path +from pyarrow.util import implements, _stringify_path, _is_path_like class FileSystem(object): @@ -397,14 +397,22 @@ def _ensure_filesystem(fs): return fs -def get_filesystem_from_uri(path): +def resolve_filesystem_and_path(where, filesystem=None): """ return filesystem from path which could be an HDFS URI """ + if not _is_path_like(where): + if filesystem is not None: + raise ValueError("filesystem passed but where is file-like, so" + " there is nothing to open with filesystem.") + return filesystem, where + # input can be hdfs URI such as hdfs://host:port/myfile.parquet - path = _stringify_path(path) - # if _has_pathlib and isinstance(path, pathlib.Path): - # path = str(path) + path = _stringify_path(where) + + if filesystem is not None: + return _ensure_filesystem(filesystem), path + parsed_uri = urlparse(path) if parsed_uri.scheme == 'hdfs': netloc_split = parsed_uri.netloc.split(':') diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index b8dae65a5de78..7142e2f474540 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -35,7 +35,7 @@ ParquetSchema, ColumnSchema) from pyarrow.compat import guid from pyarrow.filesystem import (LocalFileSystem, _ensure_filesystem, - get_filesystem_from_uri) + resolve_filesystem_and_path) from pyarrow.util import _is_path_like, _stringify_path _URI_STRIP_SCHEMES = ('hdfs',) @@ -54,7 +54,7 @@ def _parse_uri(path): def _get_filesystem_and_path(passed_filesystem, path): if passed_filesystem is None: - return get_filesystem_from_uri(path) + return resolve_filesystem_and_path(path, passed_filesystem) else: passed_filesystem = _ensure_filesystem(passed_filesystem) parsed_path = _parse_uri(path) @@ -320,7 +320,10 @@ def _sanitize_table(table, new_schema, flavor): Specify the compression codec, either on a general basis or per-column. 
Valid values: {'NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', 'ZSTD'} flavor : {'spark'}, default None - Sanitize schema or set other compatibility options for compatibility""" + Sanitize schema or set other compatibility options for compatibility +filesystem : FileSystem, default None + If nothing passed, will be inferred from `where` if path-like, else + `where` is already a file-like object so no filesystem is needed.""" class ParquetWriter(object): @@ -335,12 +338,12 @@ class ParquetWriter(object): {0} """.format(_parquet_writer_arg_docs) - def __init__(self, where, schema, flavor=None, + def __init__(self, where, schema, filesystem=None, + flavor=None, version='1.0', use_dictionary=True, compression='snappy', - use_deprecated_int96_timestamps=None, - filesystem=None, **options): + use_deprecated_int96_timestamps=None, **options): if use_deprecated_int96_timestamps is None: # Use int96 timestamps for Spark if flavor is not None and 'spark' in flavor: @@ -357,13 +360,13 @@ def __init__(self, where, schema, flavor=None, self.schema = schema self.where = where - # If we open a file using an implied filesystem, so it can be assured - # to be closed + # If we open a file using a filesystem, store file handle so we can be + # sure to close it when `self.close` is called. self.file_handle = None - if _is_path_like(where): - fs, path = _get_filesystem_and_path(filesystem, where) - sink = self.file_handle = fs.open(path, 'wb') + filesystem, path = resolve_filesystem_and_path(where, filesystem) + if filesystem is not None: + sink = self.file_handle = filesystem.open(path, 'wb') else: sink = where diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 3a6c84678eba2..5156300b01b95 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -31,7 +31,7 @@ import pyarrow as pa from pyarrow.compat import guid, u, BytesIO, unichar, PY2 from pyarrow.tests import util -from pyarrow.filesystem import LocalFileSystem +from pyarrow.filesystem import LocalFileSystem, FileSystem from .pandas_examples import dataframe_with_arrays, dataframe_with_lists try: @@ -2277,6 +2277,45 @@ def test_empty_row_groups(tempdir): assert reader.read_row_group(i).equals(table) +def test_parquet_writer_with_caller_provided_filesystem(): + out = pa.BufferOutputStream() + + class CustomFS(FileSystem): + def __init__(self): + self.path = None + self.mode = None + + def open(self, path, mode='rb'): + self.path = path + self.mode = mode + return out + + fs = CustomFS() + fname = 'expected_fname.parquet' + df = _test_dataframe(100) + table = pa.Table.from_pandas(df, preserve_index=False) + + with pq.ParquetWriter(fname, table.schema, filesystem=fs, version='2.0') \ + as writer: + writer.write_table(table) + + assert fs.path == fname + assert fs.mode == 'wb' + assert out.closed + + buf = out.getvalue() + table_read = _read_table(pa.BufferReader(buf)) + df_read = table_read.to_pandas() + tm.assert_frame_equal(df_read, df) + + # Should raise ValueError when filesystem is passed with file-like object + with pytest.raises(ValueError) as err_info: + pq.ParquetWriter(pa.BufferOutputStream(), table.schema, filesystem=fs) + expected_msg = ("filesystem passed but where is file-like, so" + " there is nothing to open with filesystem.") + assert str(err_info) == expected_msg + + def test_writing_empty_lists(): # ARROW-2591: [Python] Segmentation fault issue in pq.write_table arr1 = pa.array([[], []], pa.list_(pa.int32())) From bf34291b93c748f9dc63a0d89cc1cf857a28630c 
Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Thu, 10 Jan 2019 18:54:20 +0900 Subject: [PATCH 203/328] ARROW-4227: [GLib] Fix wrong data type in field of composite data type Author: Kouhei Sutou Closes #3363 from kou/glib-fix-wrong-data-type and squashes the following commits: 1f274e51 Use garrow_field_new_raw(..., nullpter) 20460a9f Fix wrong data type in field of composite data type --- c_glib/arrow-glib/composite-data-type.cpp | 14 ++++---- c_glib/arrow-glib/field.cpp | 9 +++++ c_glib/arrow-glib/schema.cpp | 16 ++------- c_glib/test/test-dense-union-data-type.rb | 33 +++++++++++++++--- c_glib/test/test-list-data-type.rb | 25 +++++++++----- c_glib/test/test-sparse-union-data-type.rb | 33 +++++++++++++++--- c_glib/test/test-struct-data-type.rb | 39 +++++++++++++++++++--- 7 files changed, 127 insertions(+), 42 deletions(-) diff --git a/c_glib/arrow-glib/composite-data-type.cpp b/c_glib/arrow-glib/composite-data-type.cpp index 8046d2e23a31a..5ddc1c3dd8914 100644 --- a/c_glib/arrow-glib/composite-data-type.cpp +++ b/c_glib/arrow-glib/composite-data-type.cpp @@ -98,7 +98,7 @@ garrow_list_data_type_get_value_field(GArrowListDataType *list_data_type) static_cast(arrow_data_type.get()); auto arrow_field = arrow_list_data_type->value_field(); - return garrow_field_new_raw(&arrow_field, data_type); + return garrow_field_new_raw(&arrow_field, nullptr); } @@ -172,8 +172,7 @@ garrow_struct_data_type_get_fields(GArrowStructDataType *struct_data_type) GList *fields = NULL; for (auto arrow_field : arrow_fields) { - fields = g_list_prepend(fields, - garrow_field_new_raw(&arrow_field, data_type)); + fields = g_list_prepend(fields, garrow_field_new_raw(&arrow_field, nullptr)); } return g_list_reverse(fields); } @@ -207,7 +206,7 @@ garrow_struct_data_type_get_field(GArrowStructDataType *struct_data_type, auto arrow_field = arrow_data_type->child(i); if (arrow_field) { - return garrow_field_new_raw(&arrow_field, data_type); + return garrow_field_new_raw(&arrow_field, nullptr); } else { return NULL; } @@ -234,7 +233,7 @@ garrow_struct_data_type_get_field_by_name(GArrowStructDataType *struct_data_type auto arrow_field = arrow_struct_data_type->GetFieldByName(name); if (arrow_field) { - return garrow_field_new_raw(&arrow_field, data_type); + return garrow_field_new_raw(&arrow_field, nullptr); } else { return NULL; } @@ -309,8 +308,7 @@ garrow_union_data_type_get_fields(GArrowUnionDataType *union_data_type) GList *fields = NULL; for (auto arrow_field : arrow_fields) { - fields = g_list_prepend(fields, - garrow_field_new_raw(&arrow_field, data_type)); + fields = g_list_prepend(fields, garrow_field_new_raw(&arrow_field, nullptr)); } return g_list_reverse(fields); } @@ -344,7 +342,7 @@ garrow_union_data_type_get_field(GArrowUnionDataType *union_data_type, auto arrow_field = arrow_data_type->child(i); if (arrow_field) { - return garrow_field_new_raw(&arrow_field, data_type); + return garrow_field_new_raw(&arrow_field, nullptr); } else { return NULL; } diff --git a/c_glib/arrow-glib/field.cpp b/c_glib/arrow-glib/field.cpp index d74053af48f05..f7250bc6ee634 100644 --- a/c_glib/arrow-glib/field.cpp +++ b/c_glib/arrow-glib/field.cpp @@ -243,10 +243,19 @@ GArrowField * garrow_field_new_raw(std::shared_ptr *arrow_field, GArrowDataType *data_type) { + bool data_type_need_unref = false; + if (!data_type) { + auto arrow_data_type = (*arrow_field)->type(); + data_type = garrow_data_type_new_raw(&arrow_data_type); + data_type_need_unref = true; + } auto field = GARROW_FIELD(g_object_new(GARROW_TYPE_FIELD, "field", 
arrow_field, "data-type", data_type, NULL)); + if (data_type_need_unref) { + g_object_unref(data_type); + } return field; } diff --git a/c_glib/arrow-glib/schema.cpp b/c_glib/arrow-glib/schema.cpp index 64332419e0972..1bbe82f9a3ca6 100644 --- a/c_glib/arrow-glib/schema.cpp +++ b/c_glib/arrow-glib/schema.cpp @@ -174,11 +174,7 @@ garrow_schema_get_field(GArrowSchema *schema, guint i) { const auto arrow_schema = garrow_schema_get_raw(schema); auto arrow_field = arrow_schema->field(i); - auto arrow_data_type = arrow_field->type(); - auto data_type = garrow_data_type_new_raw(&arrow_data_type); - auto field = garrow_field_new_raw(&arrow_field, data_type); - g_object_unref(data_type); - return field; + return garrow_field_new_raw(&arrow_field, nullptr); } /** @@ -198,10 +194,7 @@ garrow_schema_get_field_by_name(GArrowSchema *schema, return NULL; } else { auto arrow_data_type = arrow_field->type(); - auto data_type = garrow_data_type_new_raw(&arrow_data_type); - auto field = garrow_field_new_raw(&arrow_field, data_type); - g_object_unref(data_type); - return field; + return garrow_field_new_raw(&arrow_field, nullptr); } } @@ -232,10 +225,7 @@ garrow_schema_get_fields(GArrowSchema *schema) GList *fields = NULL; for (auto arrow_field : arrow_schema->fields()) { - auto arrow_data_type = arrow_field->type(); - auto data_type = garrow_data_type_new_raw(&arrow_data_type); - auto field = garrow_field_new_raw(&arrow_field, data_type); - g_object_unref(data_type); + auto field = garrow_field_new_raw(&arrow_field, nullptr); fields = g_list_prepend(fields, field); } diff --git a/c_glib/test/test-dense-union-data-type.rb b/c_glib/test/test-dense-union-data-type.rb index 0d1295423ebbb..231767f8a5441 100644 --- a/c_glib/test/test-dense-union-data-type.rb +++ b/c_glib/test/test-dense-union-data-type.rb @@ -17,11 +17,19 @@ class TestDenseUnionDataType < Test::Unit::TestCase def setup - fields = [ - Arrow::Field.new("number", Arrow::Int32DataType.new), - Arrow::Field.new("text", Arrow::StringDataType.new), + @number_field_data_type = Arrow::Int32DataType.new + @text_field_data_type = Arrow::StringDataType.new + @field_data_types = [ + @number_field_data_type, + @text_field_data_type, ] - @data_type = Arrow::DenseUnionDataType.new(fields, [2, 9]) + @number_field = Arrow::Field.new("number", @number_field_data_type) + @text_field = Arrow::Field.new("text", @text_field_data_type) + @fields = [ + @number_field, + @text_field, + ] + @data_type = Arrow::DenseUnionDataType.new(@fields, [2, 9]) end def test_type @@ -32,4 +40,21 @@ def test_to_s assert_equal("union[dense]", @data_type.to_s) end + + def test_fields + assert_equal(@fields.zip(@field_data_types), + @data_type.fields.collect {|field| [field, field.data_type]}) + end + + def test_get_field + field = @data_type.get_field(0) + assert_equal([ + @fields[0], + @field_data_types[0], + ], + [ + field, + field.data_type, + ]) + end end diff --git a/c_glib/test/test-list-data-type.rb b/c_glib/test/test-list-data-type.rb index aa6a8fa65fd8c..2d96fcb21ed3e 100644 --- a/c_glib/test/test-list-data-type.rb +++ b/c_glib/test/test-list-data-type.rb @@ -16,21 +16,28 @@ # under the License. 
class TestListDataType < Test::Unit::TestCase + def setup + @field_data_type = Arrow::BooleanDataType.new + @field = Arrow::Field.new("enabled", @field_data_type) + @data_type = Arrow::ListDataType.new(@field) + end + def test_type - field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new) - data_type = Arrow::ListDataType.new(field) - assert_equal(Arrow::Type::LIST, data_type.id) + assert_equal(Arrow::Type::LIST, @data_type.id) end def test_to_s - field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new) - data_type = Arrow::ListDataType.new(field) - assert_equal("list", data_type.to_s) + assert_equal("list", @data_type.to_s) end def test_value_field - field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new) - data_type = Arrow::ListDataType.new(field) - assert_equal(field, data_type.value_field) + assert_equal([ + @field, + @field_data_type, + ], + [ + @data_type.value_field, + @data_type.value_field.data_type, + ]) end end diff --git a/c_glib/test/test-sparse-union-data-type.rb b/c_glib/test/test-sparse-union-data-type.rb index ff4ce72c274a3..30e24f7a11c9b 100644 --- a/c_glib/test/test-sparse-union-data-type.rb +++ b/c_glib/test/test-sparse-union-data-type.rb @@ -17,11 +17,19 @@ class TestSparseUnionDataType < Test::Unit::TestCase def setup - fields = [ - Arrow::Field.new("number", Arrow::Int32DataType.new), - Arrow::Field.new("text", Arrow::StringDataType.new), + @number_field_data_type = Arrow::Int32DataType.new + @text_field_data_type = Arrow::StringDataType.new + @field_data_types = [ + @number_field_data_type, + @text_field_data_type, ] - @data_type = Arrow::SparseUnionDataType.new(fields, [2, 9]) + @number_field = Arrow::Field.new("number", @number_field_data_type) + @text_field = Arrow::Field.new("text", @text_field_data_type) + @fields = [ + @number_field, + @text_field, + ] + @data_type = Arrow::SparseUnionDataType.new(@fields, [2, 9]) end def test_type @@ -32,4 +40,21 @@ def test_to_s assert_equal("union[sparse]", @data_type.to_s) end + + def test_fields + assert_equal(@fields.zip(@field_data_types), + @data_type.fields.collect {|field| [field, field.data_type]}) + end + + def test_get_field + field = @data_type.get_field(0) + assert_equal([ + @fields[0], + @field_data_types[0], + ], + [ + field, + field.data_type, + ]) + end end diff --git a/c_glib/test/test-struct-data-type.rb b/c_glib/test/test-struct-data-type.rb index ce94e41c70148..82ce19ec6a495 100644 --- a/c_glib/test/test-struct-data-type.rb +++ b/c_glib/test/test-struct-data-type.rb @@ -17,8 +17,14 @@ class TestStructDataType < Test::Unit::TestCase def setup - @enabled_field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new) - @message_field = Arrow::Field.new("message", Arrow::StringDataType.new) + @enabled_field_data_type = Arrow::BooleanDataType.new + @message_field_data_type = Arrow::StringDataType.new + @field_data_types = [ + @enabled_field_data_type, + @message_field_data_type, + ] + @enabled_field = Arrow::Field.new("enabled", @enabled_field_data_type) + @message_field = Arrow::Field.new("message", @message_field_data_type) @fields = [@enabled_field, @message_field] @data_type = Arrow::StructDataType.new(@fields) end @@ -37,7 +43,8 @@ def test_n_fields end def test_fields - assert_equal(@fields, @data_type.fields) + assert_equal(@fields.zip(@field_data_types), + @data_type.fields.collect {|field| [field, field.data_type]}) end sub_test_case("#get_field") do @@ -52,6 +59,18 @@ def test_negative def test_over assert_equal(nil, @data_type.get_field(2)) end + + def test_data_type + 
field = @data_type.get_field(0) + assert_equal([ + @fields[0], + @field_data_types[0], + ], + [ + field, + field.data_type, + ]) + end end sub_test_case("#get_field_by_name") do @@ -64,9 +83,21 @@ def test_not_found assert_equal(nil, @data_type.get_field_by_name("nonexistent")) end + + def test_data_type + field = @data_type.get_field_by_name("enabled") + assert_equal([ + @enabled_field, + @enabled_field_data_type, + ], + [ + field, + field.data_type, + ]) + end end - sub_test_case("#get_field_by_name") do + sub_test_case("#get_field_index") do def test_found assert_equal(@fields.index(@enabled_field), @data_type.get_field_index("enabled")) From fc7b414faa5c187770ef8e28c26319f416ad7018 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 10 Jan 2019 11:46:06 +0100 Subject: [PATCH 204/328] ARROW-4210: [Python] Mention boost-cpp directly in the conda meta.yaml for pyarrow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Crossbow builds: [kszucs/crossbow/build-402](https://github.com/kszucs/crossbow/branches/all?utf8=%E2%9C%93&query=402) Author: Krisztián Szűcs Closes #3367 from kszucs/ARROW-4210 and squashes the following commits: 0647ee68 add boost-cpp to pyarrow's recipe --- dev/tasks/conda-recipes/pyarrow/meta.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dev/tasks/conda-recipes/pyarrow/meta.yaml b/dev/tasks/conda-recipes/pyarrow/meta.yaml index 7c653876765b5..9f6ae79dc64d7 100644 --- a/dev/tasks/conda-recipes/pyarrow/meta.yaml +++ b/dev/tasks/conda-recipes/pyarrow/meta.yaml @@ -33,6 +33,9 @@ requirements: - {{ compiler('c') }} - {{ compiler('cxx') }} host: + # directly pin boost-cpp as we also seem to directly include boost symbols + # in the Python modules. + - boost-cpp - python - setuptools - setuptools_scm @@ -42,6 +45,7 @@ requirements: - arrow-cpp {{ ARROW_VERSION }} run: + - boost-cpp - python - setuptools - {{ pin_compatible('numpy', lower_bound='1.14') }} From 9d342ec4ffe2441ab0b072c90a4f652aa2678dc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 10 Jan 2019 13:24:23 -0600 Subject: [PATCH 205/328] ARROW-3819: [Packaging] Update conda variant files to conform with feedstock after compiler migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Crossbow builds: - [kszucs/crossbow/build-403](https://github.com/kszucs/crossbow/branches/all?utf8=%E2%9C%93&query=build-403) - [kszucs/crossbow/build-404](https://github.com/kszucs/crossbow/branches/all?utf8=%E2%9C%93&query=build-404) - [kszucs/crossbow/build-405](https://github.com/kszucs/crossbow/branches/all?utf8=%E2%9C%93&query=build-405) - [kszucs/crossbow/build-406](https://github.com/kszucs/crossbow/branches/all?utf8=%E2%9C%93&query=build-406) - [kszucs/crossbow/build-407](https://github.com/kszucs/crossbow/branches/all?utf8=%E2%9C%93&query=build-407) Author: Krisztián Szűcs Closes #3368 from kszucs/conda_forge_migration and squashes the following commits: e0a5a6422 use --croot 3749a2ff9 git on osx; set FEEDSTOSK_ROOT ca7217d7f support channel sources from variant files 33cba7118 fix conda path on linux 2505828b7 fix task names 0c4a10bc3 conda recipes for python 3.7; compiler migration --- LICENSE.txt | 33 +++++ dev/release/rat_exclude_files.txt | 1 + dev/tasks/conda-recipes/travis.linux.yml | 23 ++-- dev/tasks/conda-recipes/travis.osx.yml | 24 ++-- ...c_compilergcccxx_compilergxxpython2.7.yaml | 29 ++++ ...c_compilergcccxx_compilergxxpython3.6.yaml | 29 ++++ 
...c_compilergcccxx_compilergxxpython3.7.yaml | 29 ++++ ...n_ccxx_compilertoolchain_cxxpython2.7.yaml | 29 ++++ ...n_ccxx_compilertoolchain_cxxpython3.6.yaml | 29 ++++ ...n_ccxx_compilertoolchain_cxxpython3.7.yaml | 29 ++++ .../variants/linux_python2.7.yaml | 47 ------- .../variants/linux_python3.5.yaml | 47 ------- .../variants/linux_python3.6.yaml | 47 ------- ...ilerclangcxx_compilerclangxxpython2.7.yaml | 32 +++++ ...ilerclangcxx_compilerclangxxpython3.6.yaml | 32 +++++ ...ilerclangcxx_compilerclangxxpython3.7.yaml | 32 +++++ ...n_ccxx_compilertoolchain_cxxpython2.7.yaml | 32 +++++ ...n_ccxx_compilertoolchain_cxxpython3.6.yaml | 32 +++++ ...n_ccxx_compilertoolchain_cxxpython3.7.yaml | 32 +++++ .../conda-recipes/variants/osx_python2.7.yaml | 53 -------- .../conda-recipes/variants/osx_python3.5.yaml | 53 -------- .../conda-recipes/variants/osx_python3.6.yaml | 47 ------- ...ilervs2015cxx_compilervs2015python3.5.yaml | 51 ------- ...ilervs2015cxx_compilervs2015python3.6.yaml | 39 +----- ...ilervs2015cxx_compilervs2015python3.7.yaml | 22 ++++ dev/tasks/tasks.yml | 124 +++++++++++++----- 26 files changed, 547 insertions(+), 430 deletions(-) create mode 100644 dev/tasks/conda-recipes/variants/linux_c_compilergcccxx_compilergxxpython2.7.yaml create mode 100644 dev/tasks/conda-recipes/variants/linux_c_compilergcccxx_compilergxxpython3.6.yaml create mode 100644 dev/tasks/conda-recipes/variants/linux_c_compilergcccxx_compilergxxpython3.7.yaml create mode 100644 dev/tasks/conda-recipes/variants/linux_c_compilertoolchain_ccxx_compilertoolchain_cxxpython2.7.yaml create mode 100644 dev/tasks/conda-recipes/variants/linux_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.6.yaml create mode 100644 dev/tasks/conda-recipes/variants/linux_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.7.yaml delete mode 100644 dev/tasks/conda-recipes/variants/linux_python2.7.yaml delete mode 100644 dev/tasks/conda-recipes/variants/linux_python3.5.yaml delete mode 100644 dev/tasks/conda-recipes/variants/linux_python3.6.yaml create mode 100644 dev/tasks/conda-recipes/variants/osx_c_compilerclangcxx_compilerclangxxpython2.7.yaml create mode 100644 dev/tasks/conda-recipes/variants/osx_c_compilerclangcxx_compilerclangxxpython3.6.yaml create mode 100644 dev/tasks/conda-recipes/variants/osx_c_compilerclangcxx_compilerclangxxpython3.7.yaml create mode 100644 dev/tasks/conda-recipes/variants/osx_c_compilertoolchain_ccxx_compilertoolchain_cxxpython2.7.yaml create mode 100644 dev/tasks/conda-recipes/variants/osx_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.6.yaml create mode 100644 dev/tasks/conda-recipes/variants/osx_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.7.yaml delete mode 100644 dev/tasks/conda-recipes/variants/osx_python2.7.yaml delete mode 100644 dev/tasks/conda-recipes/variants/osx_python3.5.yaml delete mode 100644 dev/tasks/conda-recipes/variants/osx_python3.6.yaml delete mode 100644 dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.5.yaml create mode 100644 dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.7.yaml diff --git a/LICENSE.txt b/LICENSE.txt index 572d3ef548917..ad2255d431066 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -795,3 +795,36 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
You can contact the author at : - xxHash homepage: http://www.xxhash.com - xxHash source repository : https://github.com/Cyan4973/xxHash + +-------------------------------------------------------------------------------- + +The files in dev/tasks/conda-recipes/variants have the following license + +BSD 3-clause license +Copyright (c) 2015-2018, conda-forge +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 720b19d894ace..282f57c515b7c 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -114,6 +114,7 @@ dev/tasks/linux-packages/debian/plasma-store-server.install dev/tasks/linux-packages/debian/rules dev/tasks/linux-packages/debian/source/format dev/tasks/linux-packages/debian/watch +dev/tasks/conda-recipes/variants/*.yaml docs/requirements.txt go/arrow/go.sum go/arrow/Gopkg.lock diff --git a/dev/tasks/conda-recipes/travis.linux.yml b/dev/tasks/conda-recipes/travis.linux.yml index c0fc71d230a55..a3c2929b7e6db 100644 --- a/dev/tasks/conda-recipes/travis.linux.yml +++ b/dev/tasks/conda-recipes/travis.linux.yml @@ -38,23 +38,28 @@ install: MINICONDA_FILE="Miniconda3-latest-Linux-x86_64.sh" curl -L -O "${MINICONDA_URL}/${MINICONDA_FILE}" bash $MINICONDA_FILE -b - - # Configure conda. + # Install conda build dependency - | echo "" echo "Configuring conda." source /home/travis/miniconda3/bin/activate root - conda config --remove channels defaults - conda config --add channels defaults - conda config --add channels conda-forge - conda config --set show_channel_urls true - conda install --yes --quiet conda-build + conda install -n root -c conda-forge --quiet --yes conda-forge-ci-setup=2 -script: +before_script: - git clone -b {{ arrow.branch }} {{ arrow.remote }} arrow - git -C arrow checkout {{ arrow.head }} - pushd arrow/dev/tasks/conda-recipes - - conda build --output-folder . 
-m {{ variant_config_file }} parquet-cpp arrow-cpp pyarrow + # Configure conda + - setup_conda_rc ./ ./ {{ variant_config_file }} + - source run_conda_forge_build_setup + +script: + # Don't need to run make_build_number, no build number decrementation happens, it's always 0 + - | + conda build --croot $TRAVIS_HOME/conda_build_root \ + --output-folder . \ + -m {{ variant_config_file }} \ + parquet-cpp arrow-cpp pyarrow deploy: provider: releases diff --git a/dev/tasks/conda-recipes/travis.osx.yml b/dev/tasks/conda-recipes/travis.osx.yml index 193539d8c9f37..6b3e561a3c5b0 100644 --- a/dev/tasks/conda-recipes/travis.osx.yml +++ b/dev/tasks/conda-recipes/travis.osx.yml @@ -47,24 +47,28 @@ install: MINICONDA_FILE="Miniconda3-latest-MacOSX-x86_64.sh" curl -L -O "${MINICONDA_URL}/${MINICONDA_FILE}" bash $MINICONDA_FILE -b - - # Configure conda. + # Install conda build dependency - | echo "" echo "Configuring conda." source /Users/travis/miniconda3/bin/activate root - conda config --remove channels defaults - conda config --add channels defaults - conda config --add channels conda-forge - conda config --set show_channel_urls true - conda install --yes --quiet conda-forge-ci-setup=1 - source run_conda_forge_build_setup + conda install -n root -c conda-forge --quiet --yes conda-forge-ci-setup=2 -script: +before_script: - git clone -b {{ arrow.branch }} {{ arrow.remote }} arrow - git -C arrow checkout {{ arrow.head }} - pushd arrow/dev/tasks/conda-recipes - - conda build --output-folder . -m {{ variant_config_file }} parquet-cpp arrow-cpp pyarrow + # Configure conda + - setup_conda_rc ./ ./ {{ variant_config_file }} + - source run_conda_forge_build_setup + +script: + # Don't need to run make_build_number, no build number decrementation happens, it's always 0 + - | + conda build --croot $TRAVIS_HOME/conda_build_root \ + --output-folder . 
\ + -m {{ variant_config_file }} \ + parquet-cpp arrow-cpp pyarrow deploy: provider: releases diff --git a/dev/tasks/conda-recipes/variants/linux_c_compilergcccxx_compilergxxpython2.7.yaml b/dev/tasks/conda-recipes/variants/linux_c_compilergcccxx_compilergxxpython2.7.yaml new file mode 100644 index 0000000000000..43b2902b5986a --- /dev/null +++ b/dev/tasks/conda-recipes/variants/linux_c_compilergcccxx_compilergxxpython2.7.yaml @@ -0,0 +1,29 @@ +boost_cpp: +- 1.68.0 +build_number_decrement: +- '0' +c_compiler: +- gcc +channel_sources: +- conda-forge/label/gcc7,defaults +channel_targets: +- conda-forge gcc7 +cxx_compiler: +- gxx +docker_image: +- condaforge/linux-anvil-comp7 +pin_run_as_build: + boost-cpp: + max_pin: x.x.x + python: + min_pin: x.x + max_pin: x.x +python: +- '2.7' +zip_keys: +- - c_compiler + - cxx_compiler + - channel_sources + - channel_targets + - docker_image + - build_number_decrement diff --git a/dev/tasks/conda-recipes/variants/linux_c_compilergcccxx_compilergxxpython3.6.yaml b/dev/tasks/conda-recipes/variants/linux_c_compilergcccxx_compilergxxpython3.6.yaml new file mode 100644 index 0000000000000..e5c89f2fed039 --- /dev/null +++ b/dev/tasks/conda-recipes/variants/linux_c_compilergcccxx_compilergxxpython3.6.yaml @@ -0,0 +1,29 @@ +boost_cpp: +- 1.68.0 +build_number_decrement: +- '0' +c_compiler: +- gcc +channel_sources: +- conda-forge/label/gcc7,defaults +channel_targets: +- conda-forge gcc7 +cxx_compiler: +- gxx +docker_image: +- condaforge/linux-anvil-comp7 +pin_run_as_build: + boost-cpp: + max_pin: x.x.x + python: + min_pin: x.x + max_pin: x.x +python: +- '3.6' +zip_keys: +- - c_compiler + - cxx_compiler + - channel_sources + - channel_targets + - docker_image + - build_number_decrement diff --git a/dev/tasks/conda-recipes/variants/linux_c_compilergcccxx_compilergxxpython3.7.yaml b/dev/tasks/conda-recipes/variants/linux_c_compilergcccxx_compilergxxpython3.7.yaml new file mode 100644 index 0000000000000..3892e5e8a509b --- /dev/null +++ b/dev/tasks/conda-recipes/variants/linux_c_compilergcccxx_compilergxxpython3.7.yaml @@ -0,0 +1,29 @@ +boost_cpp: +- 1.68.0 +build_number_decrement: +- '0' +c_compiler: +- gcc +channel_sources: +- conda-forge/label/gcc7,defaults +channel_targets: +- conda-forge gcc7 +cxx_compiler: +- gxx +docker_image: +- condaforge/linux-anvil-comp7 +pin_run_as_build: + boost-cpp: + max_pin: x.x.x + python: + min_pin: x.x + max_pin: x.x +python: +- '3.7' +zip_keys: +- - c_compiler + - cxx_compiler + - channel_sources + - channel_targets + - docker_image + - build_number_decrement diff --git a/dev/tasks/conda-recipes/variants/linux_c_compilertoolchain_ccxx_compilertoolchain_cxxpython2.7.yaml b/dev/tasks/conda-recipes/variants/linux_c_compilertoolchain_ccxx_compilertoolchain_cxxpython2.7.yaml new file mode 100644 index 0000000000000..9a9e0f79cecc7 --- /dev/null +++ b/dev/tasks/conda-recipes/variants/linux_c_compilertoolchain_ccxx_compilertoolchain_cxxpython2.7.yaml @@ -0,0 +1,29 @@ +boost_cpp: +- 1.68.0 +build_number_decrement: +- '1000' +c_compiler: +- toolchain_c +channel_sources: +- conda-forge,defaults +channel_targets: +- conda-forge main +cxx_compiler: +- toolchain_cxx +docker_image: +- condaforge/linux-anvil +pin_run_as_build: + boost-cpp: + max_pin: x.x.x + python: + min_pin: x.x + max_pin: x.x +python: +- '2.7' +zip_keys: +- - c_compiler + - cxx_compiler + - channel_sources + - channel_targets + - docker_image + - build_number_decrement diff --git 
a/dev/tasks/conda-recipes/variants/linux_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.6.yaml b/dev/tasks/conda-recipes/variants/linux_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.6.yaml new file mode 100644 index 0000000000000..5f01b786de4a1 --- /dev/null +++ b/dev/tasks/conda-recipes/variants/linux_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.6.yaml @@ -0,0 +1,29 @@ +boost_cpp: +- 1.68.0 +build_number_decrement: +- '1000' +c_compiler: +- toolchain_c +channel_sources: +- conda-forge,defaults +channel_targets: +- conda-forge main +cxx_compiler: +- toolchain_cxx +docker_image: +- condaforge/linux-anvil +pin_run_as_build: + boost-cpp: + max_pin: x.x.x + python: + min_pin: x.x + max_pin: x.x +python: +- '3.6' +zip_keys: +- - c_compiler + - cxx_compiler + - channel_sources + - channel_targets + - docker_image + - build_number_decrement diff --git a/dev/tasks/conda-recipes/variants/linux_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.7.yaml b/dev/tasks/conda-recipes/variants/linux_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.7.yaml new file mode 100644 index 0000000000000..0e27f2ec290d7 --- /dev/null +++ b/dev/tasks/conda-recipes/variants/linux_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.7.yaml @@ -0,0 +1,29 @@ +boost_cpp: +- 1.68.0 +build_number_decrement: +- '1000' +c_compiler: +- toolchain_c +channel_sources: +- conda-forge,defaults +channel_targets: +- conda-forge main +cxx_compiler: +- toolchain_cxx +docker_image: +- condaforge/linux-anvil +pin_run_as_build: + boost-cpp: + max_pin: x.x.x + python: + min_pin: x.x + max_pin: x.x +python: +- '3.7' +zip_keys: +- - c_compiler + - cxx_compiler + - channel_sources + - channel_targets + - docker_image + - build_number_decrement diff --git a/dev/tasks/conda-recipes/variants/linux_python2.7.yaml b/dev/tasks/conda-recipes/variants/linux_python2.7.yaml deleted file mode 100644 index 45026b07d60ab..0000000000000 --- a/dev/tasks/conda-recipes/variants/linux_python2.7.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -boost_cpp: -- 1.67.0 -c_compiler: -- toolchain_c -cxx_compiler: -- toolchain_cxx -lz4_c: -- 1.8.1 -pin_run_as_build: - boost-cpp: - max_pin: x.x.x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - snappy: - max_pin: x.x.x - zlib: - max_pin: x.x - zstd: - max_pin: x.x.x -python: -- '2.7' -snappy: -- 1.1.7 -zlib: -- '1.2' -zstd: -- 1.3.3 diff --git a/dev/tasks/conda-recipes/variants/linux_python3.5.yaml b/dev/tasks/conda-recipes/variants/linux_python3.5.yaml deleted file mode 100644 index 683022f834913..0000000000000 --- a/dev/tasks/conda-recipes/variants/linux_python3.5.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -boost_cpp: -- 1.67.0 -c_compiler: -- toolchain_c -cxx_compiler: -- toolchain_cxx -lz4_c: -- 1.8.1 -pin_run_as_build: - boost-cpp: - max_pin: x.x.x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - snappy: - max_pin: x.x.x - zlib: - max_pin: x.x - zstd: - max_pin: x.x.x -python: -- '3.5' -snappy: -- 1.1.7 -zlib: -- '1.2' -zstd: -- 1.3.3 diff --git a/dev/tasks/conda-recipes/variants/linux_python3.6.yaml b/dev/tasks/conda-recipes/variants/linux_python3.6.yaml deleted file mode 100644 index 6b7d8896ac369..0000000000000 --- a/dev/tasks/conda-recipes/variants/linux_python3.6.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -boost_cpp: -- 1.67.0 -c_compiler: -- toolchain_c -cxx_compiler: -- toolchain_cxx -lz4_c: -- 1.8.1 -pin_run_as_build: - boost-cpp: - max_pin: x.x.x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - snappy: - max_pin: x.x.x - zlib: - max_pin: x.x - zstd: - max_pin: x.x.x -python: -- '3.6' -snappy: -- 1.1.7 -zlib: -- '1.2' -zstd: -- 1.3.3 diff --git a/dev/tasks/conda-recipes/variants/osx_c_compilerclangcxx_compilerclangxxpython2.7.yaml b/dev/tasks/conda-recipes/variants/osx_c_compilerclangcxx_compilerclangxxpython2.7.yaml new file mode 100644 index 0000000000000..caf6bf7ebb41f --- /dev/null +++ b/dev/tasks/conda-recipes/variants/osx_c_compilerclangcxx_compilerclangxxpython2.7.yaml @@ -0,0 +1,32 @@ +MACOSX_DEPLOYMENT_TARGET: +- '10.9' +boost_cpp: +- 1.68.0 +build_number_decrement: +- '0' +c_compiler: +- clang +channel_sources: +- conda-forge/label/gcc7,defaults +channel_targets: +- conda-forge gcc7 +cxx_compiler: +- clangxx +macos_machine: +- x86_64-apple-darwin13.4.0 +macos_min_version: +- '10.9' +pin_run_as_build: + boost-cpp: + max_pin: x.x.x + python: + min_pin: x.x + max_pin: x.x +python: +- '2.7' +zip_keys: +- - c_compiler + - cxx_compiler + - channel_sources + - channel_targets + - build_number_decrement diff --git a/dev/tasks/conda-recipes/variants/osx_c_compilerclangcxx_compilerclangxxpython3.6.yaml b/dev/tasks/conda-recipes/variants/osx_c_compilerclangcxx_compilerclangxxpython3.6.yaml new file mode 100644 index 0000000000000..94f51c0ac1461 --- /dev/null +++ b/dev/tasks/conda-recipes/variants/osx_c_compilerclangcxx_compilerclangxxpython3.6.yaml @@ -0,0 +1,32 @@ +MACOSX_DEPLOYMENT_TARGET: +- '10.9' +boost_cpp: +- 1.68.0 +build_number_decrement: +- '0' +c_compiler: +- clang +channel_sources: +- conda-forge/label/gcc7,defaults +channel_targets: +- conda-forge gcc7 +cxx_compiler: +- clangxx +macos_machine: +- x86_64-apple-darwin13.4.0 +macos_min_version: +- '10.9' +pin_run_as_build: + boost-cpp: + max_pin: x.x.x + python: + min_pin: x.x + max_pin: x.x +python: +- '3.6' +zip_keys: +- - c_compiler + - cxx_compiler + - channel_sources + - channel_targets + - build_number_decrement diff --git a/dev/tasks/conda-recipes/variants/osx_c_compilerclangcxx_compilerclangxxpython3.7.yaml b/dev/tasks/conda-recipes/variants/osx_c_compilerclangcxx_compilerclangxxpython3.7.yaml new file mode 100644 index 0000000000000..25b5c4175ddbc --- /dev/null +++ b/dev/tasks/conda-recipes/variants/osx_c_compilerclangcxx_compilerclangxxpython3.7.yaml @@ -0,0 +1,32 @@ +MACOSX_DEPLOYMENT_TARGET: +- '10.9' +boost_cpp: +- 1.68.0 +build_number_decrement: +- '0' +c_compiler: +- clang +channel_sources: +- conda-forge/label/gcc7,defaults +channel_targets: +- conda-forge gcc7 +cxx_compiler: +- clangxx +macos_machine: +- x86_64-apple-darwin13.4.0 +macos_min_version: +- '10.9' +pin_run_as_build: + boost-cpp: + max_pin: x.x.x + python: + min_pin: x.x + max_pin: x.x +python: +- '3.7' +zip_keys: +- - c_compiler + - cxx_compiler + - channel_sources + - channel_targets + - build_number_decrement diff --git a/dev/tasks/conda-recipes/variants/osx_c_compilertoolchain_ccxx_compilertoolchain_cxxpython2.7.yaml b/dev/tasks/conda-recipes/variants/osx_c_compilertoolchain_ccxx_compilertoolchain_cxxpython2.7.yaml new file mode 100644 index 0000000000000..e11b9f8c60cb8 --- /dev/null +++ b/dev/tasks/conda-recipes/variants/osx_c_compilertoolchain_ccxx_compilertoolchain_cxxpython2.7.yaml @@ -0,0 +1,32 @@ +MACOSX_DEPLOYMENT_TARGET: +- '10.9' +boost_cpp: +- 1.68.0 +build_number_decrement: +- '1000' +c_compiler: +- toolchain_c 
+channel_sources: +- conda-forge,defaults +channel_targets: +- conda-forge main +cxx_compiler: +- toolchain_cxx +macos_machine: +- x86_64-apple-darwin13.4.0 +macos_min_version: +- '10.9' +pin_run_as_build: + boost-cpp: + max_pin: x.x.x + python: + min_pin: x.x + max_pin: x.x +python: +- '2.7' +zip_keys: +- - c_compiler + - cxx_compiler + - channel_sources + - channel_targets + - build_number_decrement diff --git a/dev/tasks/conda-recipes/variants/osx_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.6.yaml b/dev/tasks/conda-recipes/variants/osx_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.6.yaml new file mode 100644 index 0000000000000..01aa8595a1e24 --- /dev/null +++ b/dev/tasks/conda-recipes/variants/osx_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.6.yaml @@ -0,0 +1,32 @@ +MACOSX_DEPLOYMENT_TARGET: +- '10.9' +boost_cpp: +- 1.68.0 +build_number_decrement: +- '1000' +c_compiler: +- toolchain_c +channel_sources: +- conda-forge,defaults +channel_targets: +- conda-forge main +cxx_compiler: +- toolchain_cxx +macos_machine: +- x86_64-apple-darwin13.4.0 +macos_min_version: +- '10.9' +pin_run_as_build: + boost-cpp: + max_pin: x.x.x + python: + min_pin: x.x + max_pin: x.x +python: +- '3.6' +zip_keys: +- - c_compiler + - cxx_compiler + - channel_sources + - channel_targets + - build_number_decrement diff --git a/dev/tasks/conda-recipes/variants/osx_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.7.yaml b/dev/tasks/conda-recipes/variants/osx_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.7.yaml new file mode 100644 index 0000000000000..836650a03a7eb --- /dev/null +++ b/dev/tasks/conda-recipes/variants/osx_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.7.yaml @@ -0,0 +1,32 @@ +MACOSX_DEPLOYMENT_TARGET: +- '10.9' +boost_cpp: +- 1.68.0 +build_number_decrement: +- '1000' +c_compiler: +- toolchain_c +channel_sources: +- conda-forge,defaults +channel_targets: +- conda-forge main +cxx_compiler: +- toolchain_cxx +macos_machine: +- x86_64-apple-darwin13.4.0 +macos_min_version: +- '10.9' +pin_run_as_build: + boost-cpp: + max_pin: x.x.x + python: + min_pin: x.x + max_pin: x.x +python: +- '3.7' +zip_keys: +- - c_compiler + - cxx_compiler + - channel_sources + - channel_targets + - build_number_decrement diff --git a/dev/tasks/conda-recipes/variants/osx_python2.7.yaml b/dev/tasks/conda-recipes/variants/osx_python2.7.yaml deleted file mode 100644 index b8fc15f924dd5..0000000000000 --- a/dev/tasks/conda-recipes/variants/osx_python2.7.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -MACOSX_DEPLOYMENT_TARGET: -- '10.9' -boost_cpp: -- 1.67.0 -c_compiler: -- toolchain_c -cxx_compiler: -- toolchain_cxx -lz4_c: -- 1.8.1 -macos_machine: -- x86_64-apple-darwin13.4.0 -macos_min_version: -- '10.9' -pin_run_as_build: - boost-cpp: - max_pin: x.x.x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - snappy: - max_pin: x.x.x - zlib: - max_pin: x.x - zstd: - max_pin: x.x.x -python: -- '2.7' -snappy: -- 1.1.7 -zlib: -- '1.2' -zstd: -- 1.3.3 diff --git a/dev/tasks/conda-recipes/variants/osx_python3.5.yaml b/dev/tasks/conda-recipes/variants/osx_python3.5.yaml deleted file mode 100644 index 05f7a8dd4d36d..0000000000000 --- a/dev/tasks/conda-recipes/variants/osx_python3.5.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -MACOSX_DEPLOYMENT_TARGET: -- '10.9' -boost_cpp: -- 1.67.0 -c_compiler: -- toolchain_c -cxx_compiler: -- toolchain_cxx -lz4_c: -- 1.8.1 -macos_machine: -- x86_64-apple-darwin13.4.0 -macos_min_version: -- '10.9' -pin_run_as_build: - boost-cpp: - max_pin: x.x.x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - snappy: - max_pin: x.x.x - zlib: - max_pin: x.x - zstd: - max_pin: x.x.x -python: -- '3.5' -snappy: -- 1.1.7 -zlib: -- '1.2' -zstd: -- 1.3.3 diff --git a/dev/tasks/conda-recipes/variants/osx_python3.6.yaml b/dev/tasks/conda-recipes/variants/osx_python3.6.yaml deleted file mode 100644 index 6b7d8896ac369..0000000000000 --- a/dev/tasks/conda-recipes/variants/osx_python3.6.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -boost_cpp: -- 1.67.0 -c_compiler: -- toolchain_c -cxx_compiler: -- toolchain_cxx -lz4_c: -- 1.8.1 -pin_run_as_build: - boost-cpp: - max_pin: x.x.x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - snappy: - max_pin: x.x.x - zlib: - max_pin: x.x - zstd: - max_pin: x.x.x -python: -- '3.6' -snappy: -- 1.1.7 -zlib: -- '1.2' -zstd: -- 1.3.3 diff --git a/dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.5.yaml b/dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.5.yaml deleted file mode 100644 index d886b0e39ff7f..0000000000000 --- a/dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.5.yaml +++ /dev/null @@ -1,51 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -boost_cpp: -- 1.67.0 -c_compiler: -- vs2015 -cxx_compiler: -- vs2015 -lz4_c: -- 1.8.1 -pin_run_as_build: - boost-cpp: - max_pin: x.x.x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - snappy: - max_pin: x.x.x - zlib: - max_pin: x.x - zstd: - max_pin: x.x.x -python: -- '3.5' -snappy: -- 1.1.7 -zip_keys: -- - python - - c_compiler - - cxx_compiler -zlib: -- '1.2' -zstd: -- 1.3.3 diff --git a/dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.6.yaml b/dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.6.yaml index 880642f5b7d85..a56ee638f6753 100644 --- a/dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.6.yaml +++ b/dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.6.yaml @@ -1,51 +1,22 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- boost_cpp: -- 1.67.0 +- 1.68.0 c_compiler: - vs2015 +channel_sources: +- conda-forge,defaults +channel_targets: +- conda-forge main cxx_compiler: - vs2015 -lz4_c: -- 1.8.1 pin_run_as_build: boost-cpp: max_pin: x.x.x - lz4-c: - max_pin: x.x.x python: min_pin: x.x max_pin: x.x - snappy: - max_pin: x.x.x - zlib: - max_pin: x.x - zstd: - max_pin: x.x.x python: - '3.6' -snappy: -- 1.1.7 zip_keys: - - python - c_compiler - cxx_compiler -zlib: -- '1.2' -zstd: -- 1.3.3 diff --git a/dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.7.yaml b/dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.7.yaml new file mode 100644 index 0000000000000..1cce7445c73e7 --- /dev/null +++ b/dev/tasks/conda-recipes/variants/win_c_compilervs2015cxx_compilervs2015python3.7.yaml @@ -0,0 +1,22 @@ +boost_cpp: +- 1.68.0 +c_compiler: +- vs2015 +channel_sources: +- conda-forge,defaults +channel_targets: +- conda-forge main +cxx_compiler: +- vs2015 +pin_run_as_build: + boost-cpp: + max_pin: x.x.x + python: + min_pin: x.x + max_pin: x.x +python: +- '3.7' +zip_keys: +- - python + - c_compiler + - cxx_compiler diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 52bbc577e6f1b..4b10b57fd0990 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -19,14 +19,20 @@ groups: # these groups are just for convenience # makes it easier to submit related tasks conda: - - conda-linux-py27 - - conda-linux-py35 - - conda-linux-py36 - - conda-osx-py27 - - conda-osx-py35 - - conda-osx-py36 - - conda-win-py35 - - conda-win-py36 + - conda-linux-gcc-py27 + - conda-linux-gcc-py36 + - conda-linux-gcc-py37 + - conda-linux-toolchain-py27 + - conda-linux-toolchain-py36 + - conda-linux-toolchain-py37 + - conda-osx-clang-py27 + - conda-osx-clang-py36 + - conda-osx-clang-py37 + - conda-osx-toolchain-py27 + - conda-osx-toolchain-py36 + - conda-osx-toolchain-py37 + - conda-win-vs2015-py36 + - conda-win-vs2015-py37 wheel: - wheel-linux-cp27m - wheel-linux-cp27mu @@ -64,81 +70,135 @@ tasks: ############################## Conda Linux ################################## - conda-linux-py27: + conda-linux-gcc-py27: platform: linux template: conda-recipes/travis.linux.yml params: - variant_config_file: variants/linux_python2.7.yaml + variant_config_file: variants/linux_c_compilergcccxx_compilergxxpython2.7.yaml artifacts: - arrow-cpp-{version}-py27(h[a-z0-9]+)_0.tar.bz2 - pyarrow-{version}-py27(h[a-z0-9]+)_0.tar.bz2 - conda-linux-py35: + conda-linux-gcc-py36: platform: linux template: conda-recipes/travis.linux.yml params: - variant_config_file: variants/linux_python3.5.yaml + variant_config_file: variants/linux_c_compilergcccxx_compilergxxpython3.6.yaml artifacts: - - arrow-cpp-{version}-py35(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py35(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{version}-py36(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{version}-py36(h[a-z0-9]+)_0.tar.bz2 + + conda-linux-gcc-py37: + platform: linux + template: conda-recipes/travis.linux.yml + params: + variant_config_file: variants/linux_c_compilergcccxx_compilergxxpython3.7.yaml + artifacts: + - arrow-cpp-{version}-py37(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{version}-py37(h[a-z0-9]+)_0.tar.bz2 + + conda-linux-toolchain-py27: + platform: linux + template: conda-recipes/travis.linux.yml + params: + variant_config_file: variants/linux_c_compilertoolchain_ccxx_compilertoolchain_cxxpython2.7.yaml + artifacts: + - arrow-cpp-{version}-py27(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{version}-py27(h[a-z0-9]+)_0.tar.bz2 - conda-linux-py36: + 
conda-linux-toolchain-py36: platform: linux template: conda-recipes/travis.linux.yml params: - variant_config_file: variants/linux_python3.6.yaml + variant_config_file: variants/linux_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.6.yaml artifacts: - arrow-cpp-{version}-py36(h[a-z0-9]+)_0.tar.bz2 - pyarrow-{version}-py36(h[a-z0-9]+)_0.tar.bz2 + conda-linux-toolchain-py37: + platform: linux + template: conda-recipes/travis.linux.yml + params: + variant_config_file: variants/linux_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.7.yaml + artifacts: + - arrow-cpp-{version}-py37(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{version}-py37(h[a-z0-9]+)_0.tar.bz2 + ############################## Conda OSX #################################### - conda-osx-py27: + conda-osx-clang-py27: platform: osx template: conda-recipes/travis.osx.yml params: - variant_config_file: variants/osx_python2.7.yaml + variant_config_file: variants/osx_c_compilerclangcxx_compilerclangxxpython2.7.yaml artifacts: - arrow-cpp-{version}-py27(h[a-z0-9]+)_0.tar.bz2 - pyarrow-{version}-py27(h[a-z0-9]+)_0.tar.bz2 - conda-osx-py35: + conda-osx-clang-py36: + platform: osx + template: conda-recipes/travis.osx.yml + params: + variant_config_file: variants/osx_c_compilerclangcxx_compilerclangxxpython3.6.yaml + artifacts: + - arrow-cpp-{version}-py36(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{version}-py36(h[a-z0-9]+)_0.tar.bz2 + + conda-osx-clang-py37: platform: osx template: conda-recipes/travis.osx.yml params: - variant_config_file: variants/osx_python3.5.yaml + variant_config_file: variants/osx_c_compilerclangcxx_compilerclangxxpython3.7.yaml artifacts: - - arrow-cpp-{version}-py35(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py35(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{version}-py37(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{version}-py37(h[a-z0-9]+)_0.tar.bz2 + + conda-osx-toolchain-py27: + platform: osx + template: conda-recipes/travis.osx.yml + params: + variant_config_file: variants/osx_c_compilertoolchain_ccxx_compilertoolchain_cxxpython2.7.yaml + artifacts: + - arrow-cpp-{version}-py27(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{version}-py27(h[a-z0-9]+)_0.tar.bz2 - conda-osx-py36: + conda-osx-toolchain-py36: platform: osx template: conda-recipes/travis.osx.yml params: - variant_config_file: variants/osx_python3.6.yaml + variant_config_file: variants/osx_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.6.yaml artifacts: - arrow-cpp-{version}-py36(h[a-z0-9]+)_0.tar.bz2 - pyarrow-{version}-py36(h[a-z0-9]+)_0.tar.bz2 + conda-osx-toolchain-py37: + platform: osx + template: conda-recipes/travis.osx.yml + params: + variant_config_file: variants/osx_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.7.yaml + artifacts: + - arrow-cpp-{version}-py37(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{version}-py37(h[a-z0-9]+)_0.tar.bz2 + ############################## Conda Windows ################################ - conda-win-py35: + conda-win-vs2015-py36: platform: win template: conda-recipes/appveyor.yml params: - variant_config_file: variants\win_c_compilervs2015cxx_compilervs2015python3.5.yaml + variant_config_file: variants\win_c_compilervs2015cxx_compilervs2015python3.6.yaml artifacts: - - arrow-cpp-{version}-py35_vc14(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py35(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{version}-py36_vc14(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{version}-py36(h[a-z0-9]+)_0.tar.bz2 - conda-win-py36: + conda-win-vs2015-py37: platform: win template: conda-recipes/appveyor.yml params: - variant_config_file: 
variants\win_c_compilervs2015cxx_compilervs2015python3.6.yaml + variant_config_file: variants\win_c_compilervs2015cxx_compilervs2015python3.7.yaml artifacts: - - arrow-cpp-{version}-py36_vc14(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py36(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{version}-py37_vc14(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{version}-py37(h[a-z0-9]+)_0.tar.bz2 ############################## Wheel Linux ################################## From 5a502d281545402240e818d5fd97a9aaf36363f2 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 10 Jan 2019 21:05:31 +0100 Subject: [PATCH 206/328] ARROW-4216: [Python] Add CUDA API docs Also reorganize the API docs into several documents, and add/improve docstrings. To allow building the docs without CUDA enabled, I added some conditional inclusion logic. When CUDA isn't enabled, the API docs are still generated but the docstrings are empty. This seems to be the only sane setting that doesn't produce Sphinx errors, one way or the other. Author: Antoine Pitrou Closes #3372 from pitrou/ARROW-4216-cuda-py-docs and squashes the following commits: 80600da5 ARROW-4216: Add CUDA API docs --- docs/source/conf.py | 33 +++ docs/source/python/api.rst | 389 +-------------------------- docs/source/python/api/arrays.rst | 109 ++++++++ docs/source/python/api/cuda.rst | 62 +++++ docs/source/python/api/datatypes.rst | 134 +++++++++ docs/source/python/api/files.rst | 65 +++++ docs/source/python/api/formats.rst | 70 +++++ docs/source/python/api/ipc.rst | 59 ++++ docs/source/python/api/memory.rst | 68 +++++ docs/source/python/api/misc.rst | 40 +++ docs/source/python/api/plasma.rst | 33 +++ docs/source/python/api/tables.rst | 54 ++++ python/pyarrow/__init__.py | 4 +- python/pyarrow/_cuda.pyx | 70 ++--- python/pyarrow/array.pxi | 132 +++++++-- python/pyarrow/io.pxi | 86 ++++-- python/pyarrow/memory.pxi | 14 +- python/pyarrow/scalar.pxi | 106 +++++++- python/pyarrow/types.pxi | 103 ++++++- 19 files changed, 1167 insertions(+), 464 deletions(-) create mode 100644 docs/source/python/api/arrays.rst create mode 100644 docs/source/python/api/cuda.rst create mode 100644 docs/source/python/api/datatypes.rst create mode 100644 docs/source/python/api/files.rst create mode 100644 docs/source/python/api/formats.rst create mode 100644 docs/source/python/api/ipc.rst create mode 100644 docs/source/python/api/memory.rst create mode 100644 docs/source/python/api/misc.rst create mode 100644 docs/source/python/api/plasma.rst create mode 100644 docs/source/python/api/tables.rst diff --git a/docs/source/conf.py b/docs/source/conf.py index 1cadef18b64f2..d525fa943138b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -53,6 +53,7 @@ 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', 'sphinx.ext.doctest', + 'sphinx.ext.ifconfig', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', 'sphinx.ext.napoleon', @@ -69,6 +70,9 @@ 'inherited-members': None } +# Overriden conditionally below +autodoc_mock_imports = [] + # ipython directive options ipython_mplbackend = '' @@ -387,3 +391,32 @@ # If true, do not generate a @detailmenu in the "Top" node's menu. 
# # texinfo_no_detailmenu = False + + +# -- Customization -------------------------------------------------------- + +# Conditional API doc generation + +# Sphinx has two features for conditional inclusion: +# - The "only" directive +# https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html#including-content-based-on-tags +# - The "ifconfig" extension +# https://www.sphinx-doc.org/en/master/usage/extensions/ifconfig.html +# +# Both have issues, but "ifconfig" seems to work in this setting. + +try: + import pyarrow.cuda + cuda_enabled = True +except ImportError: + cuda_enabled = False + # Mock pyarrow.cuda to avoid autodoc warnings. + # XXX I can't get autodoc_mock_imports to work, so mock manually instead + # (https://github.com/sphinx-doc/sphinx/issues/2174#issuecomment-453177550) + from unittest import mock + pyarrow.cuda = sys.modules['pyarrow.cuda'] = mock.Mock() + +def setup(app): + # Use a config value to indicate whether CUDA API docs can be generated. + # This will also rebuild appropriately when the value changes. + app.add_config_value('cuda_enabled', cuda_enabled, 'env') diff --git a/docs/source/python/api.rst b/docs/source/python/api.rst index 0bad76ff0bf63..b06509f7a5b19 100644 --- a/docs/source/python/api.rst +++ b/docs/source/python/api.rst @@ -15,385 +15,22 @@ .. specific language governing permissions and limitations .. under the License. -.. currentmodule:: pyarrow .. _api: ************* API Reference ************* -.. _api.types: - -Type and Schema Factory Functions ---------------------------------- - -.. autosummary:: - :toctree: generated/ - - null - bool_ - int8 - int16 - int32 - int64 - uint8 - uint16 - uint32 - uint64 - float16 - float32 - float64 - time32 - time64 - timestamp - date32 - date64 - binary - string - utf8 - decimal128 - list_ - struct - dictionary - field - schema - from_numpy_dtype - -.. currentmodule:: pyarrow.types -.. _api.types.checking: - -Type checking functions ------------------------ - -.. autosummary:: - :toctree: generated/ - - is_boolean - is_integer - is_signed_integer - is_unsigned_integer - is_int8 - is_int16 - is_int32 - is_int64 - is_uint8 - is_uint16 - is_uint32 - is_uint64 - is_floating - is_float16 - is_float32 - is_float64 - is_decimal - is_list - is_struct - is_union - is_nested - is_temporal - is_timestamp - is_date - is_date32 - is_date64 - is_time - is_time32 - is_time64 - is_null - is_binary - is_unicode - is_string - is_fixed_size_binary - is_map - is_dictionary - -.. currentmodule:: pyarrow - -.. _api.value: - -Scalar Value Types ------------------- - -.. autosummary:: - :toctree: generated/ - - NA - Scalar - ArrayValue - BooleanValue - Int8Value - Int16Value - Int32Value - Int64Value - UInt8Value - UInt16Value - UInt32Value - UInt64Value - FloatValue - DoubleValue - ListValue - BinaryValue - StringValue - FixedSizeBinaryValue - Date32Value - Date64Value - TimestampValue - DecimalValue - -.. _api.array: - -.. currentmodule:: pyarrow - -Array Types ------------ - -.. autosummary:: - :toctree: generated/ - - array - Array - BooleanArray - DictionaryArray - FloatingPointArray - IntegerArray - Int8Array - Int16Array - Int32Array - Int64Array - NullArray - NumericArray - UInt8Array - UInt16Array - UInt32Array - UInt64Array - BinaryArray - FixedSizeBinaryArray - StringArray - Time32Array - Time64Array - Date32Array - Date64Array - TimestampArray - Decimal128Array - ListArray - -.. _api.table: - -.. currentmodule:: pyarrow - -Tables and Record Batches -------------------------- - -.. 
autosummary:: - :toctree: generated/ - - column - chunked_array - concat_tables - ChunkedArray - Column - RecordBatch - Table - -.. _api.tensor: - -Tensor type and Functions -------------------------- - -.. autosummary:: - :toctree: generated/ - - Tensor - -.. _api.io: - -In-Memory Buffers ------------------ - -.. autosummary:: - :toctree: generated/ - - allocate_buffer - compress - decompress - py_buffer - foreign_buffer - Buffer - ResizableBuffer - -Input / Output and Shared Memory --------------------------------- - -.. autosummary:: - :toctree: generated/ - - input_stream - output_stream - BufferReader - BufferOutputStream - FixedSizeBufferWriter - NativeFile - OSFile - MemoryMappedFile - CompressedInputStream - CompressedOutputStream - memory_map - create_memory_map - PythonFile - -File Systems ------------- - -.. autosummary:: - :toctree: generated/ - - hdfs.connect - LocalFileSystem - -.. class:: HadoopFileSystem - :noindex: - -.. _api.ipc: - -Serialization and IPC ---------------------- - -.. autosummary:: - :toctree: generated/ - - ipc.open_file - ipc.open_stream - Message - MessageReader - RecordBatchFileReader - RecordBatchFileWriter - RecordBatchStreamReader - RecordBatchStreamWriter - read_message - read_record_batch - get_record_batch_size - read_tensor - write_tensor - get_tensor_size - serialize - serialize_to - deserialize - deserialize_components - deserialize_from - read_serialized - SerializedPyObject - SerializationContext - -.. _api.memory_pool: - -Memory Pools ------------- - -.. currentmodule:: pyarrow - -.. autosummary:: - :toctree: generated/ - - MemoryPool - default_memory_pool - total_allocated_bytes - set_memory_pool - log_memory_allocations - -.. _api.type_classes: - -.. currentmodule:: pyarrow - -Type Classes ------------- - -.. autosummary:: - :toctree: generated/ - - DataType - Field - Schema - -.. currentmodule:: pyarrow.plasma - -.. _api.plasma: - -Plasma In-Memory Object Store ------------------------------ - -.. autosummary:: - :toctree: generated/ - - ObjectID - PlasmaClient - PlasmaBuffer - -.. currentmodule:: pyarrow.csv - -.. _api.csv: - -CSV Files ---------- - -.. autosummary:: - :toctree: generated/ - - ReadOptions - ParseOptions - ConvertOptions - read_csv - -.. _api.feather: - -Feather Files -------------- - -.. currentmodule:: pyarrow.feather - -.. autosummary:: - :toctree: generated/ - - read_feather - write_feather - -.. currentmodule:: pyarrow - -.. _api.parquet: - -Parquet Files -------------- - -.. currentmodule:: pyarrow.parquet - -.. autosummary:: - :toctree: generated/ - - ParquetDataset - ParquetFile - ParquetWriter - read_table - read_metadata - read_pandas - read_schema - write_metadata - write_table - write_to_dataset - -.. currentmodule:: pyarrow - -Multi-Threading ---------------- - -.. autosummary:: - :toctree: generated/ - - cpu_count - set_cpu_count - -Using with C extensions ------------------------ - -.. autosummary:: - :toctree: generated/ - - get_include - get_libraries - get_library_dirs +.. toctree:: + :maxdepth: 2 + + api/datatypes + api/arrays + api/memory + api/files + api/tables + api/ipc + api/formats + api/plasma + api/cuda + api/misc diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst new file mode 100644 index 0000000000000..db45eeff0ca5a --- /dev/null +++ b/docs/source/python/api/arrays.rst @@ -0,0 +1,109 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. 
distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _api.array: +.. currentmodule:: pyarrow + +Arrays and Scalars +================== + +Factory Function +---------------- + +This function is the main entry point to create an Arrow array from Python. + +.. autosummary:: + :toctree: ../generated/ + + array + +Array Types +----------- + +An array's Python class depends on its data type. Concrete array classes +may expose data type-specific methods or properties. + +.. autosummary:: + :toctree: ../generated/ + + Array + BooleanArray + FloatingPointArray + IntegerArray + Int8Array + Int16Array + Int32Array + Int64Array + NullArray + NumericArray + UInt8Array + UInt16Array + UInt32Array + UInt64Array + BinaryArray + StringArray + FixedSizeBinaryArray + Time32Array + Time64Array + Date32Array + Date64Array + TimestampArray + Decimal128Array + DictionaryArray + ListArray + StructArray + UnionArray + +.. _api.scalar: + +Array Scalars +------------- + +Indexing an array wraps the represented value in a scalar object whose +concrete type depends on the array data type. You shouldn't instantiate +any of those classes directly. + +.. autosummary:: + :toctree: ../generated/ + + NA + Scalar + ArrayValue + BooleanValue + Int8Value + Int16Value + Int32Value + Int64Value + UInt8Value + UInt16Value + UInt32Value + UInt64Value + FloatValue + DoubleValue + BinaryValue + StringValue + FixedSizeBinaryValue + Time32Value + Time64Value + Date32Value + Date64Value + TimestampValue + DecimalValue + DictionaryValue + ListValue + StructValue + UnionValue diff --git a/docs/source/python/api/cuda.rst b/docs/source/python/api/cuda.rst new file mode 100644 index 0000000000000..364f032403586 --- /dev/null +++ b/docs/source/python/api/cuda.rst @@ -0,0 +1,62 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow.cuda + +CUDA Integration +================ + +.. ifconfig:: not cuda_enabled + + .. error:: + This documentation was built without CUDA enabled. The CUDA + API docs are not available. + +.. NOTE We still generate those API docs (with empty docstrings) +.. when CUDA is disabled and `pyarrow.cuda` mocked (see conf.py). +.. 
Otherwise we'd get autodoc warnings, see https://github.com/sphinx-doc/sphinx/issues/4770 + +CUDA Contexts +------------- + +.. autosummary:: + :toctree: ../generated/ + + Context + +CUDA Buffers +------------ + +.. autosummary:: + :toctree: ../generated/ + + CudaBuffer + new_host_buffer + HostBuffer + BufferReader + BufferWriter + +Serialization and IPC +--------------------- + +.. autosummary:: + :toctree: ../generated/ + + serialize_record_batch + read_record_batch + read_message + IpcMemHandle diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst new file mode 100644 index 0000000000000..5ad0204966337 --- /dev/null +++ b/docs/source/python/api/datatypes.rst @@ -0,0 +1,134 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _api.types: +.. currentmodule:: pyarrow + +Data Types and Schemas +====================== + +Factory Functions +----------------- + +These should be used to create Arrow data types and schemas. + +.. autosummary:: + :toctree: ../generated/ + + null + bool_ + int8 + int16 + int32 + int64 + uint8 + uint16 + uint32 + uint64 + float16 + float32 + float64 + time32 + time64 + timestamp + date32 + date64 + binary + string + utf8 + decimal128 + list_ + struct + dictionary + field + schema + from_numpy_dtype + +.. _api.type_classes: +.. currentmodule:: pyarrow + +Type Classes +------------ + +Do not instantiate these classes directly. Instead, call one of the factory +functions above. + +.. autosummary:: + :toctree: ../generated/ + + DataType + DictionaryType + ListType + StructType + UnionType + TimestampType + Time32Type + Time64Type + FixedSizeBinaryType + Decimal128Type + Field + Schema + +.. _api.types.checking: +.. currentmodule:: pyarrow.types + +Type Checking +------------- + +These functions are predicates to check whether a :class:`DataType` instance +represents a given data type (such as ``int32``) or general category +(such as "is a signed integer"). + +.. autosummary:: + :toctree: ../generated/ + + is_boolean + is_integer + is_signed_integer + is_unsigned_integer + is_int8 + is_int16 + is_int32 + is_int64 + is_uint8 + is_uint16 + is_uint32 + is_uint64 + is_floating + is_float16 + is_float32 + is_float64 + is_decimal + is_list + is_struct + is_union + is_nested + is_temporal + is_timestamp + is_date + is_date32 + is_date64 + is_time + is_time32 + is_time64 + is_null + is_binary + is_unicode + is_string + is_fixed_size_binary + is_map + is_dictionary diff --git a/docs/source/python/api/files.rst b/docs/source/python/api/files.rst new file mode 100644 index 0000000000000..106dfde8abffb --- /dev/null +++ b/docs/source/python/api/files.rst @@ -0,0 +1,65 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. 
distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow + +Streams and File Access +======================= + +.. _api.io: + +Factory Functions +----------------- + +These factory functions are the recommended way to create a Arrow stream. +They accept various kinds of sources, such as in-memory buffers or on-disk files. + +.. autosummary:: + :toctree: ../generated/ + + input_stream + output_stream + memory_map + create_memory_map + +Stream Classes +-------------- + +.. autosummary:: + :toctree: ../generated/ + + NativeFile + OSFile + PythonFile + BufferReader + BufferOutputStream + FixedSizeBufferWriter + MemoryMappedFile + CompressedInputStream + CompressedOutputStream + +File Systems +------------ + +.. autosummary:: + :toctree: ../generated/ + + hdfs.connect + LocalFileSystem + +.. class:: HadoopFileSystem + :noindex: diff --git a/docs/source/python/api/formats.rst b/docs/source/python/api/formats.rst new file mode 100644 index 0000000000000..8de30ece93584 --- /dev/null +++ b/docs/source/python/api/formats.rst @@ -0,0 +1,70 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Tabular File Formats +==================== + +.. currentmodule:: pyarrow.csv + +.. _api.csv: + +CSV Files +--------- + +.. autosummary:: + :toctree: ../generated/ + + ReadOptions + ParseOptions + ConvertOptions + read_csv + +.. _api.feather: + +Feather Files +------------- + +.. currentmodule:: pyarrow.feather + +.. autosummary:: + :toctree: ../generated/ + + read_feather + write_feather + +.. currentmodule:: pyarrow + +.. _api.parquet: + +Parquet Files +------------- + +.. currentmodule:: pyarrow.parquet + +.. autosummary:: + :toctree: ../generated/ + + ParquetDataset + ParquetFile + ParquetWriter + read_table + read_metadata + read_pandas + read_schema + write_metadata + write_table + write_to_dataset diff --git a/docs/source/python/api/ipc.rst b/docs/source/python/api/ipc.rst new file mode 100644 index 0000000000000..bd14d30dcb274 --- /dev/null +++ b/docs/source/python/api/ipc.rst @@ -0,0 +1,59 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. 
distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow + +.. _api.ipc: + +Serialization and IPC +===================== + +Inter-Process Communication +--------------------------- + +.. autosummary:: + :toctree: ../generated/ + + ipc.open_file + ipc.open_stream + Message + MessageReader + RecordBatchFileReader + RecordBatchFileWriter + RecordBatchStreamReader + RecordBatchStreamWriter + read_message + read_record_batch + get_record_batch_size + read_tensor + write_tensor + get_tensor_size + +Serialization +------------- + +.. autosummary:: + :toctree: ../generated/ + + serialize + serialize_to + deserialize + deserialize_components + deserialize_from + read_serialized + SerializedPyObject + SerializationContext diff --git a/docs/source/python/api/memory.rst b/docs/source/python/api/memory.rst new file mode 100644 index 0000000000000..da9156fcad539 --- /dev/null +++ b/docs/source/python/api/memory.rst @@ -0,0 +1,68 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow + +.. _api.memory: + +Buffers and Memory +================== + +In-Memory Buffers +----------------- + +Factory Functions +~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: ../generated/ + + allocate_buffer + py_buffer + foreign_buffer + +Classes +~~~~~~~ + +.. autosummary:: + :toctree: ../generated/ + + Buffer + ResizableBuffer + +Miscellaneous +~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: ../generated/ + + compress + decompress + +.. _api.memory_pool: + +Memory Pools +------------ + +.. autosummary:: + :toctree: ../generated/ + + MemoryPool + default_memory_pool + total_allocated_bytes + set_memory_pool + log_memory_allocations diff --git a/docs/source/python/api/misc.rst b/docs/source/python/api/misc.rst new file mode 100644 index 0000000000000..c13b80620f154 --- /dev/null +++ b/docs/source/python/api/misc.rst @@ -0,0 +1,40 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. 
to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow + +Miscellaneous +============= + +Multi-Threading +--------------- + +.. autosummary:: + :toctree: ../generated/ + + cpu_count + set_cpu_count + +Using with C extensions +----------------------- + +.. autosummary:: + :toctree: ../generated/ + + get_include + get_libraries + get_library_dirs diff --git a/docs/source/python/api/plasma.rst b/docs/source/python/api/plasma.rst new file mode 100644 index 0000000000000..8df9e4e21ac8b --- /dev/null +++ b/docs/source/python/api/plasma.rst @@ -0,0 +1,33 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow.plasma + +.. _api.plasma: + +Plasma In-Memory Object Store +============================= + +Classes +------- + +.. autosummary:: + :toctree: ../generated/ + + ObjectID + PlasmaClient + PlasmaBuffer diff --git a/docs/source/python/api/tables.rst b/docs/source/python/api/tables.rst new file mode 100644 index 0000000000000..5a229d29fa60b --- /dev/null +++ b/docs/source/python/api/tables.rst @@ -0,0 +1,54 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow + +.. _api.table: + +Tables and Tensors +================== + +Factory Functions +----------------- + +.. autosummary:: + :toctree: ../generated/ + + column + chunked_array + concat_tables + +Classes +------- + +.. autosummary:: + :toctree: ../generated/ + + ChunkedArray + Column + RecordBatch + Table + +.. _api.tensor: + +Tensors +------- + +.. 
autosummary:: + :toctree: ../generated/ + + Tensor diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 0d1c1bef87a1c..dabcdf1813059 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -60,7 +60,9 @@ def parse_git(root, **kwargs): binary, string, utf8, decimal128, list_, struct, union, dictionary, field, type_for_alias, - DataType, + DataType, DictionaryType, ListType, StructType, + UnionType, TimestampType, Time32Type, Time64Type, + FixedSizeBinaryType, Decimal128Type, Field, Schema, schema, diff --git a/python/pyarrow/_cuda.pyx b/python/pyarrow/_cuda.pyx index cd5704947297b..c2d95a6f13652 100644 --- a/python/pyarrow/_cuda.pyx +++ b/python/pyarrow/_cuda.pyx @@ -23,21 +23,29 @@ cimport cpython as cp cdef class Context: - """ CUDA driver context. + """ + CUDA driver context. """ - def __cinit__(self, int device_number=0, uintptr_t handle=0): - """Construct the shared CUDA driver context for a particular device. + def __init__(self, *args, **kwargs): + """ + Create a CUDA driver context for a particular device. + + If a CUDA context handle is passed, it is wrapped, otherwise + a default CUDA context for the given device is requested. Parameters ---------- - device_number : int - Specify the gpu device for which the CUDA driver context is + device_number : int (default 0) + Specify the GPU device for which the CUDA driver context is requested. - handle : int - Specify handle for a shared context that has been created by - another library. + handle : int, optional + Specify CUDA handle for a shared context that has been created + by another library. """ + # This method exposed because autodoc doesn't pick __cinit__ + + def __cinit__(self, int device_number=0, uintptr_t handle=0): cdef CCudaDeviceManager* manager check_status(CCudaDeviceManager.GetInstance(&manager)) cdef int n = manager.num_devices() @@ -55,13 +63,14 @@ cdef class Context: @staticmethod def from_numba(context=None): - """Create Context instance from a numba CUDA context. + """ + Create a Context instance from a Numba CUDA context. Parameters ---------- context : {numba.cuda.cudadrv.driver.Context, None} - Specify numba CUDA context instance. When None, use the - current numba context. + A Numba CUDA context instance. + If None, the current Numba context is used. Returns ------- @@ -75,7 +84,8 @@ cdef class Context: handle=context.handle.value) def to_numba(self): - """Convert Context to numba CUDA context. + """ + Convert Context to a Numba CUDA context. Returns ------- @@ -238,7 +248,7 @@ cdef class Context: cdef class IpcMemHandle: - """A container for a CUDA IPC handle. + """A serializable container for a CUDA IPC handle. """ cdef void init(self, shared_ptr[CCudaIpcMemHandle]& h): self.handle = h @@ -285,14 +295,10 @@ cdef class IpcMemHandle: cdef class CudaBuffer(Buffer): """An Arrow buffer with data located in a GPU device. - To create a CudaBuffer instance, use - - .device_buffer(data=, offset=, - size=) - - The memory allocated in CudaBuffer instance is freed when the - instance is deleted. + To create a CudaBuffer instance, use Context.device_buffer(). + The memory allocated in a CudaBuffer is freed when the buffer object + is deleted. """ def __init__(self): @@ -529,7 +535,7 @@ cdef class CudaBuffer(Buffer): After calling this function, this device memory will not be freed when the CudaBuffer is destructed. 
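# Illustrative usage sketch (assumes a CUDA-enabled pyarrow build and at least
# one GPU; uses only names that appear in this diff): moving a record batch's
# IPC payload to device memory and reading it back as a device-backed batch.
import pyarrow as pa
from pyarrow import cuda

ctx = cuda.Context(0)                                 # driver context for device 0
batch = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], ['x'])
dev_buf = cuda.serialize_record_batch(batch, ctx)     # IPC payload copied to device memory
on_gpu = cuda.read_record_batch(dev_buf, batch.schema)  # record batch with device pointers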
- Results + Returns ------- ipc_handle : IpcMemHandle The exported IPC handle @@ -774,9 +780,9 @@ def serialize_record_batch(object batch, object ctx): Parameters ---------- batch : RecordBatch - Specify record batch to write + Record batch to write ctx : Context - Specify context to allocate device memory from + CUDA Context to allocate device memory from Returns ------- @@ -797,14 +803,14 @@ def read_message(object source, pool=None): Parameters ---------- source : {CudaBuffer, cuda.BufferReader} - Specify device buffer or reader of device buffer. - pool : {MemoryPool, None} - Specify pool to allocate CPU memory for the metadata + Device buffer or reader of device buffer. + pool : MemoryPool (optional) + Pool to allocate CPU memory for the metadata Returns ------- message : Message - the deserialized message, body still on device + The deserialized message, body still on device """ cdef: Message result = Message.__new__(Message) @@ -824,16 +830,16 @@ def read_record_batch(object buffer, object schema, pool=None): Parameters ---------- buffer : - Specify device buffer containing the complete IPC message + Device buffer containing the complete IPC message schema : Schema - Specify schema for the record batch - pool : {MemoryPool, None} - Specify pool to use for allocating space for the metadata + The schema for the record batch + pool : MemoryPool (optional) + Pool to allocate metadata from Returns ------- batch : RecordBatch - reconstructed record batch, with device pointers + Reconstructed record batch, with device pointers """ cdef shared_ptr[CSchema] schema_ = pyarrow_unwrap_schema(schema) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 54d0e92cd5561..41a3b970b3acf 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -392,6 +392,9 @@ cdef class _PandasConvertible: cdef class Array(_PandasConvertible): + """ + The base class for all Arrow arrays. + """ def __init__(self): raise TypeError("Do not call {}'s constructor directly, use one of " @@ -616,11 +619,18 @@ cdef class Array(_PandasConvertible): def isnull(self): raise NotImplemented - def __getitem__(self, key): - if PySlice_Check(key): - return _normalize_slice(self, key) + def __getitem__(self, index): + """ + Return the value at the given index. - return self.getitem(_normalize_index(key, self.length())) + Returns + ------- + value : Scalar + """ + if PySlice_Check(index): + return _normalize_slice(self, index) + + return self.getitem(_normalize_index(index, self.length())) cdef getitem(self, int64_t i): return box_scalar(self.type, self.sp_array, i) @@ -736,6 +746,9 @@ cdef class Array(_PandasConvertible): cdef class Tensor: + """ + A n-dimensional array a.k.a Tensor. + """ def __init__(self): raise TypeError("Do not call Tensor's constructor directly, use one " @@ -842,98 +855,147 @@ cdef wrap_array_output(PyObject* output): cdef class NullArray(Array): - pass + """ + Concrete class for Arrow arrays of null data type. + """ cdef class BooleanArray(Array): - pass + """ + Concrete class for Arrow arrays of boolean data type. + """ cdef class NumericArray(Array): - pass + """ + A base class for Arrow numeric arrays. + """ cdef class IntegerArray(NumericArray): - pass + """ + A base class for Arrow integer arrays. + """ cdef class FloatingPointArray(NumericArray): - pass + """ + A base class for Arrow floating-point arrays. + """ cdef class Int8Array(IntegerArray): - pass + """ + Concrete class for Arrow arrays of int8 data type. 
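# Illustrative usage sketch (assumes a build of this branch): pa.array() picks
# the matching concrete Array subclass, and indexing returns a scalar value,
# as the Array.__getitem__ docstring above describes.
import pyarrow as pa

arr = pa.array([1, 2, None], type=pa.int8())
print(type(arr).__name__)    # Int8Array
print(arr[1])                # an Int8Value holding 2
print(arr[2])                # the missing entry, reported as NA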
+ """ cdef class UInt8Array(IntegerArray): - pass + """ + Concrete class for Arrow arrays of uint8 data type. + """ cdef class Int16Array(IntegerArray): - pass + """ + Concrete class for Arrow arrays of int16 data type. + """ cdef class UInt16Array(IntegerArray): - pass + """ + Concrete class for Arrow arrays of uint16 data type. + """ cdef class Int32Array(IntegerArray): - pass + """ + Concrete class for Arrow arrays of int32 data type. + """ cdef class UInt32Array(IntegerArray): - pass + """ + Concrete class for Arrow arrays of uint32 data type. + """ cdef class Int64Array(IntegerArray): - pass + """ + Concrete class for Arrow arrays of int64 data type. + """ cdef class UInt64Array(IntegerArray): - pass + """ + Concrete class for Arrow arrays of uint64 data type. + """ cdef class Date32Array(NumericArray): - pass + """ + Concrete class for Arrow arrays of date32 data type. + """ cdef class Date64Array(NumericArray): - pass + """ + Concrete class for Arrow arrays of date64 data type. + """ cdef class TimestampArray(NumericArray): - pass + """ + Concrete class for Arrow arrays of timestamp data type. + """ cdef class Time32Array(NumericArray): - pass + """ + Concrete class for Arrow arrays of time32 data type. + """ cdef class Time64Array(NumericArray): - pass + """ + Concrete class for Arrow arrays of time64 data type. + """ cdef class HalfFloatArray(FloatingPointArray): - pass + """ + Concrete class for Arrow arrays of float16 data type. + """ cdef class FloatArray(FloatingPointArray): - pass + """ + Concrete class for Arrow arrays of float32 data type. + """ cdef class DoubleArray(FloatingPointArray): - pass + """ + Concrete class for Arrow arrays of float64 data type. + """ cdef class FixedSizeBinaryArray(Array): - pass + """ + Concrete class for Arrow arrays of a fixed-size binary data type. + """ cdef class Decimal128Array(FixedSizeBinaryArray): - pass + """ + Concrete class for Arrow arrays of decimal128 data type. + """ cdef class ListArray(Array): + """ + Concrete class for Arrow arrays of a list data type. + """ @staticmethod def from_arrays(offsets, values, MemoryPool pool=None): @@ -975,6 +1037,9 @@ cdef class ListArray(Array): cdef class UnionArray(Array): + """ + Concrete class for Arrow arrays of a Union data type. + """ @staticmethod def from_dense(Array types, Array value_offsets, list children): @@ -1028,6 +1093,9 @@ cdef class UnionArray(Array): cdef class StringArray(Array): + """ + Concrete class for Arrow arrays of string (or utf8) data type. + """ @staticmethod def from_buffers(int length, Buffer value_offsets, Buffer data, @@ -1066,10 +1134,15 @@ cdef class StringArray(Array): cdef class BinaryArray(Array): - pass + """ + Concrete class for Arrow arrays of variable-sized binary data type. + """ cdef class DictionaryArray(Array): + """ + Concrete class for dictionary-encoded Arrow arrays. + """ def dictionary_encode(self): return self @@ -1163,6 +1236,9 @@ cdef class DictionaryArray(Array): cdef class StructArray(Array): + """ + Concrete class for Arrow arrays of a struct data type. + """ def field(self, index): """ diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 52122740b63ae..8edffbec6dea2 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -44,6 +44,16 @@ cdef extern from "Python.h": cdef class NativeFile: + """ + The base class for all Arrow streams. + + Streams are either readable, writable, or both. + They optionally support seeking. 
+ + While this class exposes methods to read or write data from Python, the + primary intent of using a Arrow stream is to pass it to other Arrow + facilities that will make use of it, such as Arrow IPC routines. + """ def __cinit__(self): self.own_file = False @@ -559,6 +569,16 @@ BufferedIOBase.register(NativeFile) cdef class PythonFile(NativeFile): + """ + A stream backed by a Python file object. + + This class allows using Python file objects with arbitrary Arrow + functions, including functions written in another language than Python. + + As a downside, there is a non-zero redirection cost in translating + Arrow stream calls to Python method calls. Furthermore, Python's + Global Interpreter Lock may limit parallelism in some situations. + """ cdef: object handle @@ -628,7 +648,9 @@ cdef class PythonFile(NativeFile): cdef class MemoryMappedFile(NativeFile): """ - Supports 'r', 'r+w', 'w' modes + A stream that represents a memory-mapped file. + + Supports 'r', 'r+', 'w' modes. """ cdef: shared_ptr[CMemoryMappedFile] handle @@ -704,7 +726,9 @@ def memory_map(path, mode='r'): Parameters ---------- path : string - mode : {'r', 'w'}, default 'r' + mode : {'r', 'r+', 'w'}, default 'r' + Whether the file is opened for reading ('r+'), writing ('w') + or both ('r+'). Returns ------- @@ -717,13 +741,14 @@ def memory_map(path, mode='r'): def create_memory_map(path, size): """ - Create memory map at indicated path of the given size, return open - writable file object + Create a file of the given size and memory-map it. Parameters ---------- path : string + The file path to create, on the local filesystem. size : int + The file size to create. Returns ------- @@ -734,7 +759,7 @@ def create_memory_map(path, size): cdef class OSFile(NativeFile): """ - Supports 'r', 'w' modes + A stream backed by a regular file descriptor. """ cdef: object path @@ -774,6 +799,9 @@ cdef class OSFile(NativeFile): cdef class FixedSizeBufferWriter(NativeFile): + """ + A stream writing to a Arrow buffer. + """ def __cinit__(self, Buffer buffer): self.output_stream.reset(new CFixedSizeBufferWriter(buffer.buffer)) @@ -800,6 +828,12 @@ cdef class FixedSizeBufferWriter(NativeFile): cdef class Buffer: + """ + The base class for all Arrow buffers. + + A buffer represents a contiguous memory area. Many buffers will own + their memory, though not all of them do. + """ def __cinit__(self): pass @@ -818,14 +852,23 @@ cdef class Buffer: @property def size(self): + """ + The buffer size in bytes. + """ return self.buffer.get().size() @property def address(self): + """ + The buffer's address, as an integer. + """ return self.buffer.get().data() @property def is_mutable(self): + """ + Whether the buffer is mutable. + """ return self.buffer.get().is_mutable() @property @@ -848,7 +891,9 @@ cdef class Buffer: def slice(self, offset=0, length=None): """ - Compute slice of this buffer + Slice this buffer. Memory is not copied. + + You can also use the Python slice notation ``buffer[start:stop]``. Parameters ---------- @@ -861,6 +906,7 @@ cdef class Buffer: Returns ------- sliced : Buffer + A logical view over this buffer. """ cdef shared_ptr[CBuffer] result @@ -876,7 +922,7 @@ cdef class Buffer: def equals(self, Buffer other): """ - Determine if two buffers contain exactly the same data + Determine if two buffers contain exactly the same data. Parameters ---------- @@ -904,6 +950,9 @@ cdef class Buffer: return py_buffer, (self.to_pybytes(),) def to_pybytes(self): + """ + Return this buffer as a Python bytes object. Memory is copied. 
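# Illustrative sketch of the zero-copy slicing behaviour described in the
# Buffer.slice() docstring above.
import pyarrow as pa

buf = pa.py_buffer(b'abcdefgh')        # wraps the bytes object, no copy
view = buf.slice(2, 4)                 # logical view over the same memory
assert view.to_pybytes() == b'cdef'    # copying only happens here
assert buf[2:6].equals(view)           # slice notation, per the docstring above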
+ """ return cp.PyBytes_FromStringAndSize( self.buffer.get().data(), self.buffer.get().size()) @@ -950,21 +999,25 @@ cdef class Buffer: cdef class ResizableBuffer(Buffer): + """ + A base class for buffers that can be resized. + """ cdef void init_rz(self, const shared_ptr[CResizableBuffer]& buffer): self.init( buffer) def resize(self, int64_t new_size, shrink_to_fit=False): """ - Resize buffer to indicated size + Resize buffer to indicated size. Parameters ---------- - new_size : int64_t + new_size : int New size of buffer (padding may be added internally) shrink_to_fit : boolean, default False - If new_size is less than the current size, shrink internal - capacity, otherwise leave at current capacity + If this is true, the buffer is shrunk when new_size is less + than the current size. + If this is false, the buffer is never shrunk. """ cdef c_bool c_shrink_to_fit = shrink_to_fit with nogil: @@ -982,15 +1035,17 @@ cdef shared_ptr[CResizableBuffer] _allocate_buffer(CMemoryPool* pool): def allocate_buffer(int64_t size, MemoryPool memory_pool=None, resizable=False): """ - Allocate mutable fixed-size buffer + Allocate a mutable buffer. Parameters ---------- size : int Number of bytes to allocate (plus internal padding) memory_pool : MemoryPool, optional - Uses default memory pool if not provided + The pool to allocate memory from. + If not given, the default memory pool is used. resizable : boolean, default False + If true, the returned buffer is resizable. Returns ------- @@ -1305,8 +1360,7 @@ def _detect_compression(path): def compress(object buf, codec='lz4', asbytes=False, memory_pool=None): """ - Compress pyarrow.Buffer or Python object supporting the buffer (memoryview) - protocol + Compress data from buffer-like object. Parameters ---------- @@ -1367,7 +1421,7 @@ def compress(object buf, codec='lz4', asbytes=False, memory_pool=None): def decompress(object buf, decompressed_size=None, codec='lz4', asbytes=False, memory_pool=None): """ - Decompress data from buffer-like object + Decompress data from buffer-like object. Parameters ---------- diff --git a/python/pyarrow/memory.pxi b/python/pyarrow/memory.pxi index 7fa6d79a370d7..047e70d17abcc 100644 --- a/python/pyarrow/memory.pxi +++ b/python/pyarrow/memory.pxi @@ -21,6 +21,12 @@ cdef class MemoryPool: + """ + Base class for memory allocation. + + Besides tracking its number of allocated bytes, a memory pool also + takes care of the required 64-byte alignment for Arrow data. + """ def __init__(self): raise TypeError("Do not call {}'s constructor directly, " @@ -68,8 +74,9 @@ cdef class LoggingMemoryPool(MemoryPool): cdef class ProxyMemoryPool(MemoryPool): """ - Derived MemoryPool class that tracks the number of bytes and - maximum memory allocated through its direct calls. + Memory pool implementation that tracks the number of bytes and + maximum memory allocated through its direct calls, while redirecting + to another memory pool. """ cdef: unique_ptr[CProxyMemoryPool] proxy_pool @@ -81,6 +88,9 @@ cdef class ProxyMemoryPool(MemoryPool): def default_memory_pool(): + """ + Return the process-global memory pool. + """ cdef: MemoryPool pool = MemoryPool.__new__(MemoryPool) pool.init(c_get_memory_pool()) diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index fd3f58072d452..e2c1481797df6 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -19,10 +19,17 @@ _NULL = NA = None +cdef class Scalar: + """ + The base class for all array elements. 
+ """ + + cdef class NullType(Scalar): """ - Null (NA) value singleton + Singleton for null array elements. """ + # TODO rename this NullValue? def __cinit__(self): global NA if NA is not None: @@ -44,6 +51,9 @@ _NULL = NA = NullType() cdef class ArrayValue(Scalar): + """ + The base class for non-null array elements. + """ def __init__(self): raise TypeError("Do not call {}'s constructor directly, use array " @@ -85,6 +95,9 @@ cdef class ArrayValue(Scalar): cdef class BooleanValue(ArrayValue): + """ + Concrete class for boolean array elements. + """ def as_py(self): """ @@ -95,6 +108,9 @@ cdef class BooleanValue(ArrayValue): cdef class Int8Value(ArrayValue): + """ + Concrete class for int8 array elements. + """ def as_py(self): """ @@ -105,6 +121,9 @@ cdef class Int8Value(ArrayValue): cdef class UInt8Value(ArrayValue): + """ + Concrete class for uint8 array elements. + """ def as_py(self): """ @@ -115,6 +134,9 @@ cdef class UInt8Value(ArrayValue): cdef class Int16Value(ArrayValue): + """ + Concrete class for int16 array elements. + """ def as_py(self): """ @@ -125,6 +147,9 @@ cdef class Int16Value(ArrayValue): cdef class UInt16Value(ArrayValue): + """ + Concrete class for uint16 array elements. + """ def as_py(self): """ @@ -135,6 +160,9 @@ cdef class UInt16Value(ArrayValue): cdef class Int32Value(ArrayValue): + """ + Concrete class for int32 array elements. + """ def as_py(self): """ @@ -145,6 +173,9 @@ cdef class Int32Value(ArrayValue): cdef class UInt32Value(ArrayValue): + """ + Concrete class for uint32 array elements. + """ def as_py(self): """ @@ -155,6 +186,9 @@ cdef class UInt32Value(ArrayValue): cdef class Int64Value(ArrayValue): + """ + Concrete class for int64 array elements. + """ def as_py(self): """ @@ -165,6 +199,9 @@ cdef class Int64Value(ArrayValue): cdef class UInt64Value(ArrayValue): + """ + Concrete class for uint64 array elements. + """ def as_py(self): """ @@ -175,6 +212,9 @@ cdef class UInt64Value(ArrayValue): cdef class Date32Value(ArrayValue): + """ + Concrete class for date32 array elements. + """ def as_py(self): """ @@ -188,6 +228,9 @@ cdef class Date32Value(ArrayValue): cdef class Date64Value(ArrayValue): + """ + Concrete class for date64 array elements. + """ def as_py(self): """ @@ -199,6 +242,9 @@ cdef class Date64Value(ArrayValue): cdef class Time32Value(ArrayValue): + """ + Concrete class for time32 array elements. + """ def as_py(self): """ @@ -217,6 +263,9 @@ cdef class Time32Value(ArrayValue): cdef class Time64Value(ArrayValue): + """ + Concrete class for time64 array elements. + """ def as_py(self): """ @@ -269,6 +318,9 @@ else: cdef class TimestampValue(ArrayValue): + """ + Concrete class for timestamp array elements. + """ @property def value(self): @@ -301,6 +353,9 @@ cdef class TimestampValue(ArrayValue): cdef class HalfFloatValue(ArrayValue): + """ + Concrete class for float16 array elements. + """ def as_py(self): """ @@ -311,6 +366,9 @@ cdef class HalfFloatValue(ArrayValue): cdef class FloatValue(ArrayValue): + """ + Concrete class for float32 array elements. + """ def as_py(self): """ @@ -321,6 +379,9 @@ cdef class FloatValue(ArrayValue): cdef class DoubleValue(ArrayValue): + """ + Concrete class for float64 array elements. + """ def as_py(self): """ @@ -331,6 +392,9 @@ cdef class DoubleValue(ArrayValue): cdef class DecimalValue(ArrayValue): + """ + Concrete class for decimal128 array elements. 
+ """ def as_py(self): """ @@ -343,6 +407,9 @@ cdef class DecimalValue(ArrayValue): cdef class StringValue(ArrayValue): + """ + Concrete class for string (utf8) array elements. + """ def as_py(self): """ @@ -353,6 +420,9 @@ cdef class StringValue(ArrayValue): cdef class BinaryValue(ArrayValue): + """ + Concrete class for variable-sized binary array elements. + """ def as_py(self): """ @@ -380,14 +450,26 @@ cdef class BinaryValue(ArrayValue): cdef class ListValue(ArrayValue): + """ + Concrete class for list array elements. + """ def __len__(self): + """ + Return the number of values. + """ return self.length() def __getitem__(self, i): + """ + Return the value at the given index. + """ return self.getitem(_normalize_index(i, self.length())) def __iter__(self): + """ + Iterate over this element's values. + """ for i in range(len(self)): yield self.getitem(i) raise StopIteration @@ -419,6 +501,9 @@ cdef class ListValue(ArrayValue): cdef class UnionValue(ArrayValue): + """ + Concrete class for union array elements. + """ cdef void _set_array(self, const shared_ptr[CArray]& sp_array): self.sp_array = sp_array @@ -436,11 +521,16 @@ cdef class UnionValue(ArrayValue): def as_py(self): """ Return this value as a Python object. + + The exact type depends on the underlying union member. """ return self.getitem(self.index).as_py() cdef class FixedSizeBinaryValue(ArrayValue): + """ + Concrete class for fixed-size binary array elements. + """ def as_py(self): """ @@ -459,12 +549,18 @@ cdef class FixedSizeBinaryValue(ArrayValue): cdef class StructValue(ArrayValue): + """ + Concrete class for struct array elements. + """ cdef void _set_array(self, const shared_ptr[CArray]& sp_array): self.sp_array = sp_array self.ap = sp_array.get() def __getitem__(self, key): + """ + Return the child value for the given field name. + """ cdef: CStructType* type int index @@ -496,17 +592,23 @@ cdef class StructValue(ArrayValue): cdef class DictionaryValue(ArrayValue): + """ + Concrete class for dictionary-encoded array elements. + """ def as_py(self): """ Return this value as a Python object. + + The exact type depends on the dictionary value type. """ return self.dictionary_value.as_py() @property def index_value(self): """ - Return this value's underlying index as a Int32Value. + Return this value's underlying index as a ArrayValue of the right + signed integer type. """ cdef CDictionaryArray* darr = (self.sp_array.get()) indices = pyarrow_wrap_array(darr.indices()) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 29b2a1ea3c9a0..7c6aec34282fe 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -88,7 +88,9 @@ ctypedef CFixedWidthType* _CFixedWidthTypePtr cdef class DataType: """ - Base type for Apache Arrow data type instances. Wraps C++ arrow::DataType + Base class of all Arrow data types. + + Each data type is an *instance* of this class. """ def __cinit__(self): pass @@ -162,7 +164,7 @@ cdef class DataType: def to_pandas_dtype(self): """ - Return the NumPy dtype that would be used for storing this + Return the equivalent NumPy / Pandas dtype. """ cdef Type type_id = self.type.id() if type_id in _pandas_type_map: @@ -172,6 +174,9 @@ cdef class DataType: cdef class DictionaryType(DataType): + """ + Concrete class for dictionary data types. + """ cdef void init(self, const shared_ptr[CDataType]& type): DataType.init(self, type) @@ -182,18 +187,31 @@ cdef class DictionaryType(DataType): @property def ordered(self): + """ + Whether the dictionary is ordered, i.e. 
whether the ordering of values + in the dictionary is important. + """ return self.dict_type.ordered() @property def index_type(self): + """ + The data type of dictionary indices (a signed integer type). + """ return pyarrow_wrap_data_type(self.dict_type.index_type()) @property def dictionary(self): + """ + The dictionary array, mapping dictionary indices to values. + """ return pyarrow_wrap_array(self.dict_type.dictionary()) cdef class ListType(DataType): + """ + Concrete class for list data types. + """ cdef void init(self, const shared_ptr[CDataType]& type): DataType.init(self, type) @@ -204,10 +222,16 @@ cdef class ListType(DataType): @property def value_type(self): + """ + The data type of list values. + """ return pyarrow_wrap_data_type(self.list_type.value_type()) cdef class StructType(DataType): + """ + Concrete class for struct data types. + """ cdef void init(self, const shared_ptr[CDataType]& type): DataType.init(self, type) @@ -215,13 +239,13 @@ cdef class StructType(DataType): cdef Field field(self, int i): """ - Alias for child(i) + Return a child field by its index. """ return self.child(i) cdef Field field_by_name(self, name): """ - Access a child field by its name rather than the column index. + Return a child field by its name rather than its index. """ cdef shared_ptr[CField] field @@ -232,13 +256,22 @@ cdef class StructType(DataType): return pyarrow_wrap_field(field) def __len__(self): + """ + Like num_children(). + """ return self.type.num_children() def __iter__(self): + """ + Iterate over struct fields, in order. + """ for i in range(len(self)): yield self[i] def __getitem__(self, i): + """ + Return the struct field with the given index or name. + """ if isinstance(i, six.string_types): return self.field_by_name(i) elif isinstance(i, six.integer_types): @@ -251,20 +284,32 @@ cdef class StructType(DataType): @property def num_children(self): + """ + The number of struct fields. + """ return self.type.num_children() cdef class UnionType(DataType): + """ + Concrete class for struct data types. + """ cdef void init(self, const shared_ptr[CDataType]& type): DataType.init(self, type) @property def num_children(self): + """ + The number of union members. + """ return self.type.num_children() @property def mode(self): + """ + The mode of the union ("dense" or "sparse"). + """ cdef CUnionType* type = self.sp_type.get() cdef int mode = type.mode() if mode == _UnionMode_DENSE: @@ -274,13 +319,22 @@ cdef class UnionType(DataType): assert 0 def __len__(self): + """ + Like num_children() + """ return self.type.num_children() def __iter__(self): + """ + Iterate over union members, in order. + """ for i in range(len(self)): yield self[i] def __getitem__(self, i): + """ + Return a child member by its index. + """ return self.child(i) def __reduce__(self): @@ -288,6 +342,9 @@ cdef class UnionType(DataType): cdef class TimestampType(DataType): + """ + Concrete class for timestamp data types. + """ cdef void init(self, const shared_ptr[CDataType]& type): DataType.init(self, type) @@ -295,10 +352,16 @@ cdef class TimestampType(DataType): @property def unit(self): + """ + The timestamp unit ('s', 'ms', 'us' or 'ns'). + """ return timeunit_to_string(self.ts_type.unit()) @property def tz(self): + """ + The timestamp time zone, if any, or None. 
+ """ if self.ts_type.timezone().size() > 0: return frombytes(self.ts_type.timezone()) else: @@ -306,7 +369,7 @@ cdef class TimestampType(DataType): def to_pandas_dtype(self): """ - Return the NumPy dtype that would be used for storing this + Return the equivalent NumPy / Pandas dtype. """ if self.tz is None: return _pandas_type_map[_Type_TIMESTAMP] @@ -319,6 +382,9 @@ cdef class TimestampType(DataType): cdef class Time32Type(DataType): + """ + Concrete class for time32 data types. + """ cdef void init(self, const shared_ptr[CDataType]& type): DataType.init(self, type) @@ -326,10 +392,16 @@ cdef class Time32Type(DataType): @property def unit(self): + """ + The time unit ('s', 'ms', 'us' or 'ns'). + """ return timeunit_to_string(self.time_type.unit()) cdef class Time64Type(DataType): + """ + Concrete class for time64 data types. + """ cdef void init(self, const shared_ptr[CDataType]& type): DataType.init(self, type) @@ -337,10 +409,16 @@ cdef class Time64Type(DataType): @property def unit(self): + """ + The time unit ('s', 'ms', 'us' or 'ns'). + """ return timeunit_to_string(self.time_type.unit()) cdef class FixedSizeBinaryType(DataType): + """ + Concrete class for fixed-size binary data types. + """ cdef void init(self, const shared_ptr[CDataType]& type): DataType.init(self, type) @@ -352,10 +430,16 @@ cdef class FixedSizeBinaryType(DataType): @property def byte_width(self): + """ + The binary size in bytes. + """ return self.fixed_size_binary_type.byte_width() cdef class Decimal128Type(FixedSizeBinaryType): + """ + Concrete class for decimal128 data types. + """ cdef void init(self, const shared_ptr[CDataType]& type): FixedSizeBinaryType.init(self, type) @@ -366,17 +450,22 @@ cdef class Decimal128Type(FixedSizeBinaryType): @property def precision(self): + """ + The decimal precision, in number of decimal digits (an integer). + """ return self.decimal128_type.precision() @property def scale(self): + """ + The decimal scale (an integer). + """ return self.decimal128_type.scale() cdef class Field: """ - Represents a named field, with a data type, nullability, and optional - metadata + A named field, with a data type, nullability, and optional metadata. Notes ----- From 76618f66ee8ce75cbe09d1d1a8c313dad3d94127 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 10 Jan 2019 22:35:53 +0100 Subject: [PATCH 207/328] [Release/Java] Disable Flight test case --- .../src/test/java/org/apache/arrow/flight/TestBackPressure.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/java/flight/src/test/java/org/apache/arrow/flight/TestBackPressure.java b/java/flight/src/test/java/org/apache/arrow/flight/TestBackPressure.java index 6b23a40f29348..71c90d3a00d47 100644 --- a/java/flight/src/test/java/org/apache/arrow/flight/TestBackPressure.java +++ b/java/flight/src/test/java/org/apache/arrow/flight/TestBackPressure.java @@ -29,6 +29,7 @@ import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; import org.junit.Assert; +import org.junit.Ignore; import org.junit.Test; import com.google.common.collect.ImmutableList; @@ -78,6 +79,7 @@ public void ensureIndependentSteams() throws Exception { /** * Make sure that a stream doesn't go faster than the consumer is consuming. */ + @Ignore @Test public void ensureWaitUntilProceed() throws Exception { // request some values. 
From d7a68335cca4dd996ed6c9d2967f01601f15d5e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 11 Jan 2019 13:59:20 +0100 Subject: [PATCH 208/328] ARROW-4229: [Packaging] Set crossbow target explicitly to enable building arbitrary arrow repo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change eliminates the need of: ``` # checkout the tag under a new branch name and push that branch to your fork's remote # # to launch a crossbow build this branch _must_ exist on your remote git checkout -b zero-one-zero-rc0 apache-arrow-0.1.0 git push -u zero-one-zero-rc0 ``` during the [release procedure](https://cwiki.apache.org/confluence/display/ARROW/Release+Management+Guide): Usage: ```bash python dev/tasks/crossbow.py submit \ -r apache/arrow \ -t apache-arrow-0.12.0 \ -v 0.12.0 \ -g conda -g wheel -g linux ``` Testing it... Author: Krisztián Szűcs Closes #3369 from kszucs/arbitrary-crossbow-repo and squashes the following commits: c97354ed allow passing crossbow repo and branch explicitly --- dev/tasks/crossbow.py | 46 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/dev/tasks/crossbow.py b/dev/tasks/crossbow.py index 74facf4b7fa01..d700384e55988 100755 --- a/dev/tasks/crossbow.py +++ b/dev/tasks/crossbow.py @@ -398,9 +398,10 @@ def __init__(self, head, branch, remote, version, email=None): self.version = version @classmethod - def from_repo(cls, repo): + def from_repo(cls, repo, version=None): assert isinstance(repo, Repo) - version = get_version(repo.path, local_scheme=lambda v: '') + if version is None: + version = get_version(repo.path, local_scheme=lambda v: '') return cls(head=str(repo.head.target), email=repo.email, branch=repo.branch.branch_name, @@ -587,17 +588,48 @@ def load_tasks_from_config(config_path, task_names, group_names): help='Task configuration yml. Defaults to tasks.yml') @click.option('--arrow-version', '-v', default=None, help='Set target version explicitly') +@click.option('--arrow-repo', '-r', default=None, + help='Set Github repo name explicitly, e.g. apache/arrow, ' + 'kszucs/arrow, this repository is going to be cloned on ' + 'the CI services. Note, that no validation happens locally ' + 'and potentially --arrow-branch and --arrow-sha must be ' + 'defined as well') +@click.option('--arrow-branch', '-b', default='master', + help='Give the branch name explicitly, e.g. master, ARROW-1949.' + 'Only available if --arrow-repo is set.') +@click.option('--arrow-sha', '-t', default='HEAD', + help='Set commit SHA or Tag name explicitly, e.g. f67a515, ' + 'apache-arrow-0.11.1. 
Only available if both --arrow-repo ' + '--arrow-branch are set.') @click.option('--dry-run/--push', default=False, help='Just display the rendered CI configurations without ' 'submitting them') @click.pass_context -def submit(ctx, task, group, job_prefix, config_path, arrow_version, dry_run): +def submit(ctx, task, group, job_prefix, config_path, arrow_version, + arrow_repo, arrow_branch, arrow_sha, dry_run): queue, arrow = ctx.obj['queue'], ctx.obj['arrow'] - target = Target.from_repo(arrow) - # explicitly set arrow version - if arrow_version: - target.version = arrow_version + if arrow_repo is not None: + values = {'version': arrow_version, + 'branch': arrow_branch, + 'sha': arrow_sha} + for k, v in values.items(): + if not v: + raise ValueError('Must pass --arrow-{} argument'.format(k)) + + # Set repo url, branch and sha explicitly - this aims to make release + # procedure a bit simpler. + # Note, that the target resivion's crossbow templates must be + # compatible with the locally checked out version of crossbow (which is + # in case of the release procedure), because the templates still + # contain some business logic (dependency installation, deployments) + # which will be reduced to a single command in the future. + remote = 'https://github.com/{}'.format(arrow_repo) + target = Target(head=arrow_sha, branch=arrow_branch, remote=remote, + version=arrow_version) + else: + # instantiate target from the locally checked out repository and branch + target = Target.from_repo(arrow, version=arrow_version) no_rc_version = re.sub(r'-rc\d+\Z', '', target.version) params = { From 54b35b4c13a8904286eca80bb76d9f4e7b619a87 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Fri, 11 Jan 2019 14:03:38 +0100 Subject: [PATCH 209/328] ARROW-4233: [Packaging] Use Docker to build source archive Author: Kouhei Sutou Closes #3376 from kou/packaging-source-use-docker and squashes the following commits: 404efe87 Use Docker to build source archive --- dev/release/02-source.sh | 48 ++++++++++---------------------- dev/release/source/Dockerfile | 48 ++++++++++++++++++++++++++++++++ dev/release/source/build.sh | 52 +++++++++++++++++++++++++++++++++++ 3 files changed, 115 insertions(+), 33 deletions(-) create mode 100644 dev/release/source/Dockerfile create mode 100755 dev/release/source/build.sh diff --git a/dev/release/02-source.sh b/dev/release/02-source.sh index e224584223b4c..85dee3302e917 100755 --- a/dev/release/02-source.sh +++ b/dev/release/02-source.sh @@ -45,46 +45,28 @@ echo "Using commit $release_hash" tarball=${tag}.tar.gz -extract_dir=tmp-apache-arrow -rm -rf ${extract_dir} +archive_name=tmp-apache-arrow # be conservative and use the release hash, even though git produces the same # archive (identical hashes) using the scm tag -git archive ${release_hash} --prefix ${extract_dir}/ | tar xf - - -# build Apache Arrow C++ before building Apache Arrow GLib because -# Apache Arrow GLib requires Apache Arrow C++. -mkdir -p ${extract_dir}/cpp/build -cpp_install_dir=${PWD}/${extract_dir}/cpp/install -cd ${extract_dir}/cpp/build -cmake .. \ - -DCMAKE_INSTALL_PREFIX=${cpp_install_dir} \ - -DCMAKE_INSTALL_LIBDIR=${cpp_install_dir}/lib \ - -DARROW_BUILD_TESTS=no \ - -DARROW_PARQUET=yes -make -j8 -make install -cd - - -# build source archive for Apache Arrow GLib by "make dist". 
-cd ${extract_dir}/c_glib -./autogen.sh -./configure \ - PKG_CONFIG_PATH=$cpp_install_dir/lib/pkgconfig \ - --enable-gtk-doc -LD_LIBRARY_PATH=$cpp_install_dir/lib:$LD_LIBRARY_PATH make -j8 -make dist -tar xzf *.tar.gz -rm *.tar.gz -cd - -rm -rf tmp-c_glib/ -mv ${extract_dir}/c_glib/apache-arrow-glib-* tmp-c_glib/ -rm -rf ${extract_dir} +git archive ${release_hash} --prefix ${archive_name}/ > ${archive_name}.tar.gz + +dist_c_glib_tar_gz=c_glib.tar.gz +docker_image_name=apache-arrow/release-source +DEBUG=yes docker build -t ${docker_image_name} ${SOURCE_DIR}/source +docker \ + run \ + --rm \ + --interactive \ + --volume "$PWD":/host \ + ${docker_image_name} \ + /build.sh ${archive_name} ${dist_c_glib_tar_gz} # replace c_glib/ by tar.gz generated by "make dist" rm -rf ${tag} git archive $release_hash --prefix ${tag}/ | tar xf - rm -rf ${tag}/c_glib -mv tmp-c_glib ${tag}/c_glib +tar xf ${dist_c_glib_tar_gz} -C ${tag} +rm -f ${dist_c_glib_tar_gz} # Create new tarball from modified source directory tar czhf ${tarball} ${tag} diff --git a/dev/release/source/Dockerfile b/dev/release/source/Dockerfile new file mode 100644 index 0000000000000..70ed8aa866dd0 --- /dev/null +++ b/dev/release/source/Dockerfile @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +FROM ubuntu:18.04 + +ENV DEBIAN_FRONTEND noninteractive + +RUN \ + apt update && \ + apt install -y -V \ + autoconf-archive \ + bison \ + clang-6.0 \ + cmake \ + flex \ + g++ \ + gcc \ + gtk-doc-tools \ + libboost-filesystem-dev \ + libboost-regex-dev \ + libboost-system-dev \ + libgirepository1.0-dev \ + libglib2.0-doc \ + libprotobuf-dev \ + libprotoc-dev \ + libtool \ + lsb-release \ + make \ + pkg-config \ + protobuf-compiler && \ + apt clean && \ + rm -rf /var/lib/apt/lists/* + +COPY build.sh /build.sh diff --git a/dev/release/source/build.sh b/dev/release/source/build.sh new file mode 100755 index 0000000000000..039d07591f2ef --- /dev/null +++ b/dev/release/source/build.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -e + +archive_name=$1 +dist_c_glib_tar_gz=$2 + +tar xf /host/${archive_name}.tar.gz + +# build Apache Arrow C++ before building Apache Arrow GLib because +# Apache Arrow GLib requires Apache Arrow C++. +mkdir -p ${archive_name}/cpp/build +cpp_install_dir=${PWD}/${archive_name}/cpp/install +cd ${archive_name}/cpp/build +cmake .. \ + -DCMAKE_INSTALL_PREFIX=${cpp_install_dir} \ + -DCMAKE_INSTALL_LIBDIR=lib \ + -DARROW_PARQUET=yes +make -j8 +make install +cd - + +# build source archive for Apache Arrow GLib by "make dist". +cd ${archive_name}/c_glib +./autogen.sh +./configure \ + PKG_CONFIG_PATH=${cpp_install_dir}/lib/pkgconfig \ + --enable-gtk-doc +LD_LIBRARY_PATH=${cpp_install_dir}/lib make -j8 +make dist +tar xzf *.tar.gz +rm *.tar.gz +cd - +mv ${archive_name}/c_glib/apache-arrow-glib-* c_glib/ +tar czf /host/${dist_c_glib_tar_gz} c_glib From 38a628dff6fcd5f3c7e6b402f5ceb35cc8bd52c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 12 Jan 2019 06:03:45 +0900 Subject: [PATCH 210/328] ARROW-4238: [Packaging] Fix RC version conflict between crossbow and rake MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This shouldn't affect the current release candidate, the binaries are already building this way. Can be merged after RC0 is finished (either way). Author: Krisztián Szűcs Closes #3380 from kszucs/ARROW-4238 and squashes the following commits: 0e865002 use no_rc_version everywhere --- dev/release/00-prepare.sh | 4 ++ dev/tasks/conda-recipes/appveyor.yml | 2 +- dev/tasks/conda-recipes/travis.linux.yml | 2 +- dev/tasks/conda-recipes/travis.osx.yml | 2 +- dev/tasks/crossbow.py | 4 +- dev/tasks/python-wheels/appveyor.yml | 2 +- dev/tasks/python-wheels/travis.linux.yml | 2 +- dev/tasks/python-wheels/travis.osx.yml | 2 +- dev/tasks/tasks.yml | 86 ++++++++++++------------ 9 files changed, 55 insertions(+), 51 deletions(-) diff --git a/dev/release/00-prepare.sh b/dev/release/00-prepare.sh index 1c233a35c21ef..96bfd69115a05 100755 --- a/dev/release/00-prepare.sh +++ b/dev/release/00-prepare.sh @@ -107,6 +107,8 @@ update_versions() { } if [ "$#" -eq 2 ]; then + ############################## Pre-Tag Commits ############################## + version=$1 next_version=$2 next_version_snapshot=${next_version}-SNAPSHOT @@ -136,6 +138,8 @@ if [ "$#" -eq 2 ]; then mvn release:prepare -Dtag=${tag} -DreleaseVersion=${version} -DautoVersionSubmodules -DdevelopmentVersion=${next_version_snapshot} cd - + ############################## Post-Tag Commits ############################# + echo "Updating versions for ${next_version_snapshot}" update_versions "${version}" "${next_version}" "snapshot" git commit -m "[Release] Update versions for ${next_version_snapshot}" diff --git a/dev/tasks/conda-recipes/appveyor.yml b/dev/tasks/conda-recipes/appveyor.yml index cdc9d97537156..3d3ba43be7584 100644 --- a/dev/tasks/conda-recipes/appveyor.yml +++ b/dev/tasks/conda-recipes/appveyor.yml @@ -16,7 +16,7 @@ # under the License. 
environment: - ARROW_VERSION: {{ arrow.version }} + ARROW_VERSION: {{ arrow.no_rc_version }} # regardless of the python version we build against CONDA_INSTALL_LOCN: C:\Miniconda36-x64 diff --git a/dev/tasks/conda-recipes/travis.linux.yml b/dev/tasks/conda-recipes/travis.linux.yml index a3c2929b7e6db..f0c4c77adae06 100644 --- a/dev/tasks/conda-recipes/travis.linux.yml +++ b/dev/tasks/conda-recipes/travis.linux.yml @@ -25,7 +25,7 @@ if: tag IS blank env: global: - TRAVIS_TAG={{ task.tag }} - - ARROW_VERSION={{ arrow.version }} + - ARROW_VERSION={{ arrow.no_rc_version }} - PYTHONUNBUFFERED=1 install: diff --git a/dev/tasks/conda-recipes/travis.osx.yml b/dev/tasks/conda-recipes/travis.osx.yml index 6b3e561a3c5b0..23fd6e104ab4e 100644 --- a/dev/tasks/conda-recipes/travis.osx.yml +++ b/dev/tasks/conda-recipes/travis.osx.yml @@ -25,7 +25,7 @@ if: tag IS blank env: global: - TRAVIS_TAG={{ task.tag }} - - ARROW_VERSION={{ arrow.version }} + - ARROW_VERSION={{ arrow.no_rc_version }} - PYTHONUNBUFFERED=1 before_install: diff --git a/dev/tasks/crossbow.py b/dev/tasks/crossbow.py index d700384e55988..2d0c53089d056 100755 --- a/dev/tasks/crossbow.py +++ b/dev/tasks/crossbow.py @@ -396,6 +396,7 @@ def __init__(self, head, branch, remote, version, email=None): self.branch = branch self.remote = remote self.version = version + self.no_rc_version = re.sub(r'-rc\d+\Z', '', version) @classmethod def from_repo(cls, repo, version=None): @@ -631,10 +632,9 @@ def submit(ctx, task, group, job_prefix, config_path, arrow_version, # instantiate target from the locally checked out repository and branch target = Target.from_repo(arrow, version=arrow_version) - no_rc_version = re.sub(r'-rc\d+\Z', '', target.version) params = { 'version': target.version, - 'no_rc_version': no_rc_version, + 'no_rc_version': target.no_rc_version, } # task and group variables are lists, containing multiple values diff --git a/dev/tasks/python-wheels/appveyor.yml b/dev/tasks/python-wheels/appveyor.yml index c220f922bc45c..be6ad302e1a5c 100644 --- a/dev/tasks/python-wheels/appveyor.yml +++ b/dev/tasks/python-wheels/appveyor.yml @@ -24,7 +24,7 @@ environment: PYTHON: "{{ python_version }}" MSVC_DEFAULT_OPTIONS: ON ARROW_SRC: C:\apache-arrow - PYARROW_VERSION: {{ arrow.version }} + PYARROW_VERSION: {{ arrow.no_rc_version }} PYARROW_REF: {{ arrow.head }} init: diff --git a/dev/tasks/python-wheels/travis.linux.yml b/dev/tasks/python-wheels/travis.linux.yml index 17888ccc9f1bb..b5cbc65bc7e7e 100644 --- a/dev/tasks/python-wheels/travis.linux.yml +++ b/dev/tasks/python-wheels/travis.linux.yml @@ -40,7 +40,7 @@ script: # build wheel - pushd arrow/python/manylinux1 - docker run --shm-size=2g - -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.version }} + -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} -e PYTHON_VERSIONS="{{ python_version }},{{ unicode_width }}" -v $PWD:/io -v $PWD/../../:/arrow diff --git a/dev/tasks/python-wheels/travis.osx.yml b/dev/tasks/python-wheels/travis.osx.yml index c6bd010da4ebc..a98841335e728 100644 --- a/dev/tasks/python-wheels/travis.osx.yml +++ b/dev/tasks/python-wheels/travis.osx.yml @@ -26,7 +26,7 @@ env: - PLAT=x86_64 - TRAVIS_TAG={{ task.tag }} - MACOSX_DEPLOYMENT_TARGET="10.9" - - PYARROW_VERSION={{ arrow.version }} + - PYARROW_VERSION={{ arrow.no_rc_version }} - PYARROW_BUILD_VERBOSE=1 - MB_PYTHON_VERSION={{ python_version }} diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 4b10b57fd0990..ce311e546d495 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -66,7 +66,7 @@ tasks: # 
artifacts: list of regex patterns, each needs to match a single github # release asset, version variable is replaced in the pattern # e.g.: - # - pyarrow-{version}-py36(h[a-z0-9]+)_0-linux-64.tar.bz2 + # - pyarrow-{no_rc_version}-py36(h[a-z0-9]+)_0-linux-64.tar.bz2 ############################## Conda Linux ################################## @@ -76,8 +76,8 @@ tasks: params: variant_config_file: variants/linux_c_compilergcccxx_compilergxxpython2.7.yaml artifacts: - - arrow-cpp-{version}-py27(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py27(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py27(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py27(h[a-z0-9]+)_0.tar.bz2 conda-linux-gcc-py36: platform: linux @@ -85,8 +85,8 @@ tasks: params: variant_config_file: variants/linux_c_compilergcccxx_compilergxxpython3.6.yaml artifacts: - - arrow-cpp-{version}-py36(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py36(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py36(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py36(h[a-z0-9]+)_0.tar.bz2 conda-linux-gcc-py37: platform: linux @@ -94,8 +94,8 @@ tasks: params: variant_config_file: variants/linux_c_compilergcccxx_compilergxxpython3.7.yaml artifacts: - - arrow-cpp-{version}-py37(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py37(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py37(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py37(h[a-z0-9]+)_0.tar.bz2 conda-linux-toolchain-py27: platform: linux @@ -103,8 +103,8 @@ tasks: params: variant_config_file: variants/linux_c_compilertoolchain_ccxx_compilertoolchain_cxxpython2.7.yaml artifacts: - - arrow-cpp-{version}-py27(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py27(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py27(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py27(h[a-z0-9]+)_0.tar.bz2 conda-linux-toolchain-py36: platform: linux @@ -112,8 +112,8 @@ tasks: params: variant_config_file: variants/linux_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.6.yaml artifacts: - - arrow-cpp-{version}-py36(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py36(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py36(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py36(h[a-z0-9]+)_0.tar.bz2 conda-linux-toolchain-py37: platform: linux @@ -121,8 +121,8 @@ tasks: params: variant_config_file: variants/linux_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.7.yaml artifacts: - - arrow-cpp-{version}-py37(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py37(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py37(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py37(h[a-z0-9]+)_0.tar.bz2 ############################## Conda OSX #################################### @@ -132,8 +132,8 @@ tasks: params: variant_config_file: variants/osx_c_compilerclangcxx_compilerclangxxpython2.7.yaml artifacts: - - arrow-cpp-{version}-py27(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py27(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py27(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py27(h[a-z0-9]+)_0.tar.bz2 conda-osx-clang-py36: platform: osx @@ -141,8 +141,8 @@ tasks: params: variant_config_file: variants/osx_c_compilerclangcxx_compilerclangxxpython3.6.yaml artifacts: - - arrow-cpp-{version}-py36(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py36(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py36(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py36(h[a-z0-9]+)_0.tar.bz2 conda-osx-clang-py37: platform: osx @@ -150,8 +150,8 @@ tasks: params: variant_config_file: 
variants/osx_c_compilerclangcxx_compilerclangxxpython3.7.yaml artifacts: - - arrow-cpp-{version}-py37(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py37(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py37(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py37(h[a-z0-9]+)_0.tar.bz2 conda-osx-toolchain-py27: platform: osx @@ -159,8 +159,8 @@ tasks: params: variant_config_file: variants/osx_c_compilertoolchain_ccxx_compilertoolchain_cxxpython2.7.yaml artifacts: - - arrow-cpp-{version}-py27(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py27(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py27(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py27(h[a-z0-9]+)_0.tar.bz2 conda-osx-toolchain-py36: platform: osx @@ -168,8 +168,8 @@ tasks: params: variant_config_file: variants/osx_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.6.yaml artifacts: - - arrow-cpp-{version}-py36(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py36(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py36(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py36(h[a-z0-9]+)_0.tar.bz2 conda-osx-toolchain-py37: platform: osx @@ -177,8 +177,8 @@ tasks: params: variant_config_file: variants/osx_c_compilertoolchain_ccxx_compilertoolchain_cxxpython3.7.yaml artifacts: - - arrow-cpp-{version}-py37(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py37(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py37(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py37(h[a-z0-9]+)_0.tar.bz2 ############################## Conda Windows ################################ @@ -188,8 +188,8 @@ tasks: params: variant_config_file: variants\win_c_compilervs2015cxx_compilervs2015python3.6.yaml artifacts: - - arrow-cpp-{version}-py36_vc14(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py36(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py36_vc14(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py36(h[a-z0-9]+)_0.tar.bz2 conda-win-vs2015-py37: platform: win @@ -197,8 +197,8 @@ tasks: params: variant_config_file: variants\win_c_compilervs2015cxx_compilervs2015python3.7.yaml artifacts: - - arrow-cpp-{version}-py37_vc14(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{version}-py37(h[a-z0-9]+)_0.tar.bz2 + - arrow-cpp-{no_rc_version}-py37_vc14(h[a-z0-9]+)_0.tar.bz2 + - pyarrow-{no_rc_version}-py37(h[a-z0-9]+)_0.tar.bz2 ############################## Wheel Linux ################################## @@ -210,7 +210,7 @@ tasks: unicode_width: 16 test_docker_images: [] artifacts: - - pyarrow-{version}-cp27-cp27m-manylinux1_x86_64.whl + - pyarrow-{no_rc_version}-cp27-cp27m-manylinux1_x86_64.whl wheel-linux-cp27mu: platform: linux @@ -221,7 +221,7 @@ tasks: test_docker_images: - python:2.7-slim # debian ucs4 artifacts: - - pyarrow-{version}-cp27-cp27mu-manylinux1_x86_64.whl + - pyarrow-{no_rc_version}-cp27-cp27mu-manylinux1_x86_64.whl wheel-linux-cp35m: platform: linux @@ -232,7 +232,7 @@ tasks: test_docker_images: - python:3.5-slim artifacts: - - pyarrow-{version}-cp35-cp35m-manylinux1_x86_64.whl + - pyarrow-{no_rc_version}-cp35-cp35m-manylinux1_x86_64.whl wheel-linux-cp36m: platform: linux @@ -243,7 +243,7 @@ tasks: test_docker_images: - python:3.6-slim artifacts: - - pyarrow-{version}-cp36-cp36m-manylinux1_x86_64.whl + - pyarrow-{no_rc_version}-cp36-cp36m-manylinux1_x86_64.whl wheel-linux-cp37m: platform: linux @@ -254,7 +254,7 @@ tasks: test_docker_images: - python:3.7-slim artifacts: - - pyarrow-{version}-cp37-cp37m-manylinux1_x86_64.whl + - pyarrow-{no_rc_version}-cp37-cp37m-manylinux1_x86_64.whl ############################## Wheel OSX 
#################################### @@ -264,7 +264,7 @@ tasks: params: python_version: 2.7 artifacts: - - pyarrow-{version}-cp27-cp27m-macosx_10_6_intel.whl + - pyarrow-{no_rc_version}-cp27-cp27m-macosx_10_6_intel.whl wheel-osx-cp35m: platform: osx @@ -272,7 +272,7 @@ tasks: params: python_version: 3.5 artifacts: - - pyarrow-{version}-cp35-cp35m-macosx_10_6_intel.whl + - pyarrow-{no_rc_version}-cp35-cp35m-macosx_10_6_intel.whl wheel-osx-cp36m: platform: osx @@ -280,7 +280,7 @@ tasks: params: python_version: 3.6 artifacts: - - pyarrow-{version}-cp36-cp36m-macosx_10_6_intel.whl + - pyarrow-{no_rc_version}-cp36-cp36m-macosx_10_6_intel.whl wheel-osx-cp37m: platform: osx @@ -288,7 +288,7 @@ tasks: params: python_version: 3.7 artifacts: - - pyarrow-{version}-cp37-cp37m-macosx_10_6_intel.whl + - pyarrow-{no_rc_version}-cp37-cp37m-macosx_10_6_intel.whl ############################## Wheel Windows ################################ @@ -298,7 +298,7 @@ tasks: params: python_version: 3.5 artifacts: - - pyarrow-{version}-cp35-cp35m-win_amd64.whl + - pyarrow-{no_rc_version}-cp35-cp35m-win_amd64.whl wheel-win-cp36m: platform: win @@ -306,7 +306,7 @@ tasks: params: python_version: 3.6 artifacts: - - pyarrow-{version}-cp36-cp36m-win_amd64.whl + - pyarrow-{no_rc_version}-cp36-cp36m-win_amd64.whl wheel-win-cp37m: platform: win @@ -314,7 +314,7 @@ tasks: params: python_version: 3.7 artifacts: - - pyarrow-{version}-cp37-cp37m-win_amd64.whl + - pyarrow-{no_rc_version}-cp37-cp37m-win_amd64.whl ############################## Linux PKGS #################################### @@ -597,10 +597,10 @@ tasks: platform: linux template: gandiva-jars/travis.linux.yml artifacts: - - arrow-gandiva-{version}-SNAPSHOT.jar + - arrow-gandiva-{no_rc_version}-SNAPSHOT.jar gandiva-jar-osx: platform: osx template: gandiva-jars/travis.osx.yml artifacts: - - arrow-gandiva-{version}-SNAPSHOT.jar + - arrow-gandiva-{no_rc_version}-SNAPSHOT.jar From 06de47afcb7532a9646089ca23bd7d1e62eddc10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 12 Jan 2019 06:04:50 +0900 Subject: [PATCH 211/328] ARROW-4237: [Packaging] Fix CMAKE_INSTALL_LIBDIR in release verification script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Required to verify RC0 > commit msg should be: "lib instead of $ARROW_HOME/lib" :) Author: Krisztián Szűcs Closes #3381 from kszucs/ARROW-4237 and squashes the following commits: f831b0e3 lib instead of /lib --- dev/release/verify-release-candidate.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 71324ec12f7c5..0e4609735ba53 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -160,7 +160,7 @@ test_and_install_cpp() { ARROW_CMAKE_OPTIONS=" -DCMAKE_INSTALL_PREFIX=$ARROW_HOME --DCMAKE_INSTALL_LIBDIR=$ARROW_HOME/lib +-DCMAKE_INSTALL_LIBDIR=lib -DARROW_PLASMA=ON -DARROW_ORC=ON -DARROW_PYTHON=ON From 9178ad8c3c9ea371c3b7edb3fcee3073f5082bdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 12 Jan 2019 06:32:52 +0900 Subject: [PATCH 212/328] ARROW-4241: [Packaging] Disable crossbow conda OSX clang builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit They are expected to fail. 
Author: Krisztián Szűcs Closes #3383 from kszucs/disable_conda_clang and squashes the following commits: 42417bdb Disable conda OSX clang builds --- dev/tasks/tasks.yml | 58 ++++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index ce311e546d495..e6764580966f0 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -25,9 +25,9 @@ groups: - conda-linux-toolchain-py27 - conda-linux-toolchain-py36 - conda-linux-toolchain-py37 - - conda-osx-clang-py27 - - conda-osx-clang-py36 - - conda-osx-clang-py37 + # - conda-osx-clang-py27 + # - conda-osx-clang-py36 + # - conda-osx-clang-py37 - conda-osx-toolchain-py27 - conda-osx-toolchain-py36 - conda-osx-toolchain-py37 @@ -126,32 +126,32 @@ tasks: ############################## Conda OSX #################################### - conda-osx-clang-py27: - platform: osx - template: conda-recipes/travis.osx.yml - params: - variant_config_file: variants/osx_c_compilerclangcxx_compilerclangxxpython2.7.yaml - artifacts: - - arrow-cpp-{no_rc_version}-py27(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{no_rc_version}-py27(h[a-z0-9]+)_0.tar.bz2 - - conda-osx-clang-py36: - platform: osx - template: conda-recipes/travis.osx.yml - params: - variant_config_file: variants/osx_c_compilerclangcxx_compilerclangxxpython3.6.yaml - artifacts: - - arrow-cpp-{no_rc_version}-py36(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{no_rc_version}-py36(h[a-z0-9]+)_0.tar.bz2 - - conda-osx-clang-py37: - platform: osx - template: conda-recipes/travis.osx.yml - params: - variant_config_file: variants/osx_c_compilerclangcxx_compilerclangxxpython3.7.yaml - artifacts: - - arrow-cpp-{no_rc_version}-py37(h[a-z0-9]+)_0.tar.bz2 - - pyarrow-{no_rc_version}-py37(h[a-z0-9]+)_0.tar.bz2 + # conda-osx-clang-py27: + # platform: osx + # template: conda-recipes/travis.osx.yml + # params: + # variant_config_file: variants/osx_c_compilerclangcxx_compilerclangxxpython2.7.yaml + # artifacts: + # - arrow-cpp-{no_rc_version}-py27(h[a-z0-9]+)_0.tar.bz2 + # - pyarrow-{no_rc_version}-py27(h[a-z0-9]+)_0.tar.bz2 + # + # conda-osx-clang-py36: + # platform: osx + # template: conda-recipes/travis.osx.yml + # params: + # variant_config_file: variants/osx_c_compilerclangcxx_compilerclangxxpython3.6.yaml + # artifacts: + # - arrow-cpp-{no_rc_version}-py36(h[a-z0-9]+)_0.tar.bz2 + # - pyarrow-{no_rc_version}-py36(h[a-z0-9]+)_0.tar.bz2 + # + # conda-osx-clang-py37: + # platform: osx + # template: conda-recipes/travis.osx.yml + # params: + # variant_config_file: variants/osx_c_compilerclangcxx_compilerclangxxpython3.7.yaml + # artifacts: + # - arrow-cpp-{no_rc_version}-py37(h[a-z0-9]+)_0.tar.bz2 + # - pyarrow-{no_rc_version}-py37(h[a-z0-9]+)_0.tar.bz2 conda-osx-toolchain-py27: platform: osx From 0a553b7eb9dc65e53254abe31e7841b31ea132a9 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Fri, 11 Jan 2019 22:38:18 +0100 Subject: [PATCH 213/328] ARROW-4240: [Packaging] Add missing Plasma GLib and Gandiva GLib documents to souce archive Author: Kouhei Sutou Closes #3382 from kou/packaging-source-archive-include-all-built-documents and squashes the following commits: ef5bd01c Add missing Plasma GLib and Gandiva GLib documents to source archive --- dev/release/source/build.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dev/release/source/build.sh b/dev/release/source/build.sh index 039d07591f2ef..25775fdc3e813 100755 --- a/dev/release/source/build.sh +++ b/dev/release/source/build.sh @@ -32,6 +32,8 @@ cd ${archive_name}/cpp/build cmake .. 
\ -DCMAKE_INSTALL_PREFIX=${cpp_install_dir} \ -DCMAKE_INSTALL_LIBDIR=lib \ + -DARROW_PLASMA=yes \ + -DARROW_GANDIVA=yes \ -DARROW_PARQUET=yes make -j8 make install From f7eb1f79619cb4f55e3b2cd46feae0f3dd0ef05b Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Sat, 12 Jan 2019 09:18:35 +0100 Subject: [PATCH 214/328] ARROW-4239: [Packaging] Fix version update for the next version This also includes BSD sed support. Author: Kouhei Sutou Closes #3385 from kou/packaging-fix-version-update and squashes the following commits: add6fd73 Fix version update for the next version --- dev/release/00-prepare.sh | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/dev/release/00-prepare.sh b/dev/release/00-prepare.sh index 96bfd69115a05..d57c51739b100 100755 --- a/dev/release/00-prepare.sh +++ b/dev/release/00-prepare.sh @@ -28,17 +28,17 @@ update_versions() { case ${type} in release) - version=${base_version} - r_version=${base_version} + local version=${base_version} + local r_version=${base_version} ;; snapshot) - version=${next_version}-SNAPSHOT - r_version=${base_version}.9000 + local version=${next_version}-SNAPSHOT + local r_version=${base_version}.9000 ;; esac cd "${SOURCE_DIR}/../../cpp" - sed -i.bak -r -e \ + sed -i.bak -E -e \ "s/^set\(ARROW_VERSION \".+\"\)/set(ARROW_VERSION \"${version}\")/" \ CMakeLists.txt rm -f CMakeLists.txt.bak @@ -46,10 +46,10 @@ update_versions() { cd - cd "${SOURCE_DIR}/../../c_glib" - sed -i.bak -r -e \ + sed -i.bak -E -e \ "s/^m4_define\(\[arrow_glib_version\], .+\)/m4_define([arrow_glib_version], ${version})/" \ configure.ac - sed -i.bak -r -e \ + sed -i.bak -E -e \ "s/^version = '.+'/version = '${version}'/" \ meson.build rm -f configure.ac.bak meson.build.bak @@ -58,7 +58,7 @@ update_versions() { # We can enable this when Arrow JS uses the same version. 
# cd "${SOURCE_DIR}/../../js" - # sed -i.bak -r -e \ + # sed -i.bak -E -e \ # "s/^ \"version\": \".+\"/ \"version\": \"${version}\"/" \ # package.json # rm -f package.json @@ -66,7 +66,7 @@ update_versions() { # cd - cd "${SOURCE_DIR}/../../matlab" - sed -i.bak -r -e \ + sed -i.bak -E -e \ "s/^set\(MLARROW_VERSION \".+\"\)/set(MLARROW_VERSION \"${version}\")/" \ CMakeLists.txt rm -f CMakeLists.txt.bak @@ -74,7 +74,7 @@ update_versions() { cd - cd "${SOURCE_DIR}/../../python" - sed -i.bak -r -e \ + sed -i.bak -E -e \ "s/^default_version: '.+'/default_version = '${version}'/" \ setup.py rm -f setup.py.bak @@ -82,7 +82,7 @@ update_versions() { cd - cd "${SOURCE_DIR}/../../r" - sed -i.bak -r -e \ + sed -i.bak -E -e \ "s/^Version: .+/Version: ${r_version}/" \ DESCRIPTION rm -f DESCRIPTION.bak @@ -90,7 +90,7 @@ update_versions() { cd - cd "${SOURCE_DIR}/../../ruby" - sed -i.bak -r -e \ + sed -i.bak -E -e \ "s/^ VERSION = \".+\"/ VERSION = \"${version}\"/g" \ */*/*/version.rb rm -f */*/*/version.rb.bak @@ -98,7 +98,7 @@ update_versions() { cd - cd "${SOURCE_DIR}/../../rust" - sed -i.bak -r -e \ + sed -i.bak -E -e \ "s/^version = \".+\"/version = \"${version}\"/g" \ arrow/Cargo.toml parquet/Cargo.toml rm -f arrow/Cargo.toml.bak parquet/Cargo.toml.bak @@ -145,8 +145,8 @@ if [ "$#" -eq 2 ]; then git commit -m "[Release] Update versions for ${next_version_snapshot}" echo "Updating .deb package names for ${next_version}" - deb_lib_suffix=$(echo $version | sed -r -e 's/^[0-9]+\.([0-9]+)\.[0-9]+$/\1/') - next_deb_lib_suffix=$(echo $next_version | sed -r -e 's/^[0-9]+\.([0-9]+)\.[0-9]+$/\1/') + deb_lib_suffix=$(echo $version | sed -E -e 's/^[0-9]+\.([0-9]+)\.[0-9]+$/\1/') + next_deb_lib_suffix=$(echo $next_version | sed -E -e 's/^[0-9]+\.([0-9]+)\.[0-9]+$/\1/') cd $SOURCE_DIR/../tasks/linux-packages/ for target in debian*/lib*${deb_lib_suffix}.install; do git mv \ @@ -154,17 +154,17 @@ if [ "$#" -eq 2 ]; then $(echo $target | sed -e "s/${deb_lib_suffix}/${next_deb_lib_suffix}/") done deb_lib_suffix_substitute_pattern="s/(lib(arrow|gandiva|parquet|plasma)[-a-z]*)${deb_lib_suffix}/\\1${next_deb_lib_suffix}/g" - sed -i.bak -r -e "${deb_lib_suffix_substitute_pattern}" debian*/control + sed -i.bak -E -e "${deb_lib_suffix_substitute_pattern}" debian*/control rm -f debian*/control.bak git add debian*/control cd - cd $SOURCE_DIR/../tasks/ - sed -i.bak -r -e "${deb_lib_suffix_substitute_pattern}" tasks.yml + sed -i.bak -E -e "${deb_lib_suffix_substitute_pattern}" tasks.yml rm -f tasks.yml.bak git add tasks.yml cd - cd $SOURCE_DIR - sed -i.bak -r -e "${deb_lib_suffix_substitute_pattern}" rat_exclude_files.txt + sed -i.bak -E -e "${deb_lib_suffix_substitute_pattern}" rat_exclude_files.txt rm -f rat_exclude_files.txt.bak git add rat_exclude_files.txt git commit -m "[Release] Update .deb package names for $next_version" From 3e97ca1c207cacfb5340940bc86f95107849cbcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 12 Jan 2019 09:52:10 +0100 Subject: [PATCH 215/328] ARROW-4243: [Python] Fix test failures with pandas 0.24.0rc1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: Krisztián Szűcs Author: Kouhei Sutou Closes #3387 from kou/python-pandas and squashes the following commits: 8c9cb641 fix python dockerfile 27d15a6d Fix test failures with pandas 0.24.0rc1 --- python/Dockerfile | 3 +-- python/pyarrow/pandas_compat.py | 8 ++++++-- python/pyarrow/serialization.py | 21 +++++++++++++++++++++ python/pyarrow/tests/test_convert_pandas.py | 6 
++---- 4 files changed, 30 insertions(+), 8 deletions(-) diff --git a/python/Dockerfile b/python/Dockerfile index ecabc94493cf0..e20f266da216d 100644 --- a/python/Dockerfile +++ b/python/Dockerfile @@ -21,9 +21,8 @@ FROM arrow:cpp ARG PYTHON_VERSION=3.6 ADD ci/conda_env_python.yml /arrow/ci/ RUN conda install -c conda-forge \ - nomkl \ --file arrow/ci/conda_env_python.yml \ - python=$PYTHON_VERSION && \ + python=$PYTHON_VERSION nomkl && \ conda clean --all ENV ARROW_PYTHON=ON \ diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index a5d8621590f13..403f15dfc2cdb 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -33,7 +33,7 @@ def infer_dtype(column): try: - return pd.api.types.infer_dtype(column) + return pd.api.types.infer_dtype(column, skipna=False) except AttributeError: return pd.lib.infer_dtype(column) @@ -111,6 +111,9 @@ def get_logical_type_from_numpy(pandas_collection): except KeyError: if hasattr(pandas_collection.dtype, 'tz'): return 'datetimetz' + # See https://github.com/pandas-dev/pandas/issues/24739 + if str(pandas_collection.dtype) == 'datetime64[ns]': + return 'datetime64[ns]' result = infer_dtype(pandas_collection) if result == 'string': @@ -477,7 +480,8 @@ def dataframe_to_serialized_dict(frame): if isinstance(block, _int.DatetimeTZBlock): block_data['timezone'] = pa.lib.tzinfo_to_string(values.tz) - values = values.values + if hasattr(values, 'values'): + values = values.values elif isinstance(block, _int.CategoricalBlock): block_data.update(dictionary=values.categories, ordered=values.ordered) diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py index 22f7c0cb52ab8..6bbe1c7bc896c 100644 --- a/python/pyarrow/serialization.py +++ b/python/pyarrow/serialization.py @@ -174,6 +174,27 @@ def _deserialize_pandas_series(data): custom_serializer=_pickle_to_buffer, custom_deserializer=_load_pickle_from_buffer) + if hasattr(pd.core.arrays, 'interval'): + context.register_type( + pd.core.arrays.interval.IntervalArray, + 'pd.core.arrays.interval.IntervalArray', + custom_serializer=_pickle_to_buffer, + custom_deserializer=_load_pickle_from_buffer) + + if hasattr(pd.core.arrays, 'period'): + context.register_type( + pd.core.arrays.period.PeriodArray, + 'pd.core.arrays.period.PeriodArray', + custom_serializer=_pickle_to_buffer, + custom_deserializer=_load_pickle_from_buffer) + + if hasattr(pd.core.arrays, 'datetimes'): + context.register_type( + pd.core.arrays.datetimes.DatetimeArray, + 'pd.core.arrays.datetimes.DatetimeArray', + custom_serializer=_pickle_to_buffer, + custom_deserializer=_load_pickle_from_buffer) + context.register_type( pd.DataFrame, 'pd.DataFrame', custom_serializer=_serialize_pandas_dataframe, diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index cd7f4999ace3a..466d2e9562dd0 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -837,8 +837,7 @@ def test_timestamps_with_timezone(self): '2010-08-13T05:46:57.437'], dtype='datetime64[ms]') }) - df['datetime64'] = (df['datetime64'].dt.tz_localize('US/Eastern') - .to_frame()) + df['datetime64'] = df['datetime64'].dt.tz_localize('US/Eastern') _check_pandas_roundtrip(df) _check_series_roundtrip(df['datetime64']) @@ -852,8 +851,7 @@ def test_timestamps_with_timezone(self): '2010-08-13T05:46:57.437699912'], dtype='datetime64[ns]') }) - df['datetime64'] = (df['datetime64'].dt.tz_localize('US/Eastern') - .to_frame()) + 
df['datetime64'] = df['datetime64'].dt.tz_localize('US/Eastern') _check_pandas_roundtrip(df) From be663c14637b2bdfef935946b6e91b6317219332 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Sun, 13 Jan 2019 11:14:51 +0100 Subject: [PATCH 216/328] ARROW-4247: [Release] Update verify script for 0.12.0 * C++: -DARROW_GPU -> -DARROW_CUDA * C++: Enable Gandiva * C++: default: -DARROW_BUILD_TESTS=ON -> OFF * Ruby: red-plasma, red-gandiva and red-parquet are added * Rust: The top-level Cargo.toml is a virtual manifest Author: Kouhei Sutou Closes #3389 from kou/release-update-verify-script and squashes the following commits: f019a3fc Update verify script for 0.12.0 --- dev/release/verify-release-candidate.sh | 30 ++++++++++++------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 0e4609735ba53..c8b9c54c82c4c 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -51,10 +51,10 @@ HERE=$(cd `dirname "${BASH_SOURCE[0]:-$0}"` && pwd) ARROW_DIST_URL='https://dist.apache.org/repos/dist/dev/arrow' -: ${ARROW_HAVE_GPU:=} -if [ -z "$ARROW_HAVE_GPU" ]; then +: ${ARROW_HAVE_CUDA:=} +if [ -z "$ARROW_HAVE_CUDA" ]; then if nvidia-smi --list-gpus 2>&1 > /dev/null; then - ARROW_HAVE_GPU=yes + ARROW_HAVE_CUDA=yes fi fi @@ -164,13 +164,15 @@ test_and_install_cpp() { -DARROW_PLASMA=ON -DARROW_ORC=ON -DARROW_PYTHON=ON +-DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_BOOST_USE_SHARED=ON -DCMAKE_BUILD_TYPE=release +-DARROW_BUILD_TESTS=ON -DARROW_BUILD_BENCHMARKS=ON " - if [ "$ARROW_HAVE_GPU" = "yes" ]; then - ARROW_CMAKE_OPTIONS="$ARROW_CMAKE_OPTIONS -DARROW_GPU=ON" + if [ "$ARROW_HAVE_CUDA" = "yes" ]; then + ARROW_CMAKE_OPTIONS="$ARROW_CMAKE_OPTIONS -DARROW_CUDA=ON" fi cmake $ARROW_CMAKE_OPTIONS .. @@ -238,17 +240,17 @@ test_js() { test_ruby() { pushd ruby - pushd red-arrow - bundle install --path vendor/bundle - bundle exec ruby test/run-test.rb - popd + local modules="red-arrow red-plasma red-gandiva red-parquet" + if [ "${ARROW_HAVE_CUDA}" = "yes" ]; then + modules="${modules} red-arrow-cuda" + fi - if [ "$ARROW_HAVE_GPU" = "yes" ]; then - pushd red-arrow-gpu + for module in ${modules}; do + pushd ${module} bundle install --path vendor/bundle bundle exec ruby test/run-test.rb popd - fi + done popd } @@ -274,9 +276,7 @@ test_rust() { cargo fmt --all -- --check # raises on any warnings - cargo rustc -- -D warnings - - cargo build + RUSTFLAGS="-D warnings" cargo build cargo test popd From 5598d2f42573ed19e7db4aae7adb02af2cd4ccd0 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Sun, 13 Jan 2019 13:35:22 -0600 Subject: [PATCH 217/328] ARROW-2828: [JS] Refactor Data, Vectors, Visitor, Typings, build, tests, dependencies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's the big one; The Great ArrowJS Refactor of 2018. Thanks for bearing with me through yet another huge PR. [Check out this sweet gif](https://user-images.githubusercontent.com/178183/50551046-19a94d00-0c30-11e9-80ed-74b9290e8c49.gif) of all the new features in action. 
With streaming getting to a good place, we've already started working on demos/integrations with other projects like [uber/deck.gl](https://github.com/Pessimistress/deck.gl/tree/a5940e20cb1659a44cba7839082b0803a997a12f/test/apps/arrow) :tada: ### The JIRAs In addition to everything I detail below, this PR closes the following JIRAs: * [ARROW-2828](https://issues.apache.org/jira/browse/ARROW-2828): Refactor Vector Data classes * [ARROW-2839](https://issues.apache.org/jira/browse/ARROW-2839): Support whatwg/streams in IPC reader/writer * [ARROW-2235](https://issues.apache.org/jira/browse/ARROW-2235): Add tests for IPC messages split across multiple buffers * [ARROW-3337](https://issues.apache.org/jira/browse/ARROW-3337): IPC writer doesn't serialize the dictionary of nested Vectors * [ARROW-3689](https://issues.apache.org/jira/browse/ARROW-3689): Upgrade to TS 3.1 * [ARROW-3560](https://issues.apache.org/jira/browse/ARROW-3560): Remove @std/esm * [ARROW-3561](https://issues.apache.org/jira/browse/ARROW-3561): Update ts-jest * [ARROW-2778](https://issues.apache.org/jira/browse/ARROW-2778): Add Utf8Vector.from * [ARROW-2766](https://issues.apache.org/jira/browse/ARROW-2766): Add ability to construct a Table from a list of Arrays/TypedArrays ### The stats The gulp scripts have been updated to parallelize as much as possible. These are the numbers from my Intel Core i7-8700K CPU @ 3.70GHz × 12 running Ubuntu 18.04 and node v11.6.0: ```sh $ time npm run build [22:11:04] Finished 'build' after 39 s real 0m40.341s user 4m55.428s sys 0m5.559s ``` ```sh $ npm run test:coverage =============================== Coverage summary =============================== Statements : 90.45% ( 4321/4777 ) Branches : 76.7% ( 1570/2047 ) Functions : 84.62% ( 1106/1307 ) Lines : 91.5% ( 3777/4128 ) ================================================================================ Test Suites: 21 passed, 21 total Tests: 5644 passed, 5644 total Snapshots: 0 total Time: 16.023s ``` ### The fixes * `Vector#indexOf(value)` works for all DataTypes * `Vector#set(i, value)` now works for all DataTypes * Reading from node streams is now fully zero-copy * The IPC writers now serialize dictionaries of nested Vectors correctly (ARROW-3337) * DictionaryBatches marked as `isDelta` now correctly updates the dictionaries for all Vectors that point to that dictionary, even if they were created before the delta batch arrived * A few `arrow2csv` fixes: * Ignore `stdin` if it's a TTY * Now read all the Arrow formats from `stdin` * Always show the `help` text when we don't understand the input * Proper backpressure support to play nicely with other Unix utilities like `head` and `less` * [Fixes an unfiled bug](https://github.com/trxcllnt/arrow/commit/070ec9809a9f5822d62268252d0570366ec40883) we encountered last week where JS would throw an error creating RowProxies for a Table or Struct with duplicate column names ### The upgrades * New zero-copy Message/RecordBatchReaders! 
* [`RecordBatchReader.from()`](https://github.com/trxcllnt/arrow/blob/b58e29bc83675583238bbb94fba2f3ebf8f1e4aa/js/test/unit/ipc/reader/from-inference-tests.ts#L37) will peek at the underlying bytes, and return the correct implementation based on whether the data is an Arrow File, Stream, or JSON * [`RecordBatchFileReader`](https://github.com/trxcllnt/arrow/blob/b58e29bc83675583238bbb94fba2f3ebf8f1e4aa/js/test/unit/ipc/reader/file-reader-tests.ts#L74) now supports random-access seek, enabling more efficient web-worker/multi-process workflows * [`RecordBatchStreamReader`](https://github.com/trxcllnt/arrow/blob/b58e29bc83675583238bbb94fba2f3ebf8f1e4aa/js/test/unit/ipc/reader/streams-dom-tests.ts#L119) can now read multiple tables from the same underlying socket * `MessageReader` now [guarantees/enforces](https://github.com/trxcllnt/arrow/blob/b58e29bc83675583238bbb94fba2f3ebf8f1e4aa/js/src/ipc/message.ts#L126) message body byte alignment (this one even surfaced bugs in [node core](https://github.com/nodejs/node/issues/24817) and the [DOM streams polyfill](https://github.com/MattiasBuelens/web-streams-polyfill/issues/3)) * New RecordBatchWriters * Adds RecordBatchJSONWriter, RecordBatchFileWriter and RecordBatchStreamWriter * Adds static `RecordBatchWriter.writeAll()` method to easily write a Table or stream of RecordBatches * Both sync and async flushes based on the WritableSink * Full integration with platform I/O primitives * We can still synchronously read JSON, Buffers, `Iterable`, or `AsyncIterable` * In node, we can now read from any [`ReadableStream`](https://nodejs.org/docs/latest/api/stream.html#stream_class_stream_readable), [`fs.FileHandle`](https://nodejs.org/docs/latest/api/fs.html#fs_class_filehandle) * In the browser, we can read from any [`ReadableStream` or `ReadableByteStream`](https://developer.mozilla.org/en-US/docs/Web/API/ReadableStream), or the [`Response`](https://developer.mozilla.org/en-US/docs/Web/API/Response) returned from the `fetch()` API. 
### The breaking changes

* All the old IPC functions are gone, but the new APIs will live for much longer
* `Table#batches` is now `Table#chunks`, which it inherits from `Chunked` (maybe controversial, open to aliasing; see the sketch after this list)
* `Table#batchesUnion` is now just... the Table instance itself (also maybe controversial, open to aliasing)
* `DataType#TType` is now `DataType#typeId` -- it should have always been this; the old name was a typo. Easy to alias if necessary.
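To illustrate the renames above, a small hedged example (again assuming the `Table.fromVectors` and `FloatVector.from` APIs shown in the updated README):

```js
import { Table, FloatVector } from 'apache-arrow';

// Build a one-column Table from a plain TypedArray
const table = Table.fromVectors(
  [FloatVector.from(Float32Array.of(1.5, 2.5, 3.5))],
  ['precipitation']
);

console.log(table.chunks.length);              // `chunks` is the new name for `batches`
console.log(table.getColumnAt(0).type.typeId); // `typeId` is the new name for `TType`
```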
* The complicated View classes are now gone, with the logic centralized in specialized [`Visitors`](https://github.com/trxcllnt/arrow/tree/b58e29bc83675583238bbb94fba2f3ebf8f1e4aa/js/src/visitor)

### The tests

* **Tests no longer rely on any C++ or Java generated integration files**
* Integration tests have been moved into `bin/integration.js`, and they finish much quicker
* The tsconfig files have been tweaked to speed up test run time and improve the async debugging experience
* A streaming `RecordBatchJSONWriter` has been implemented so we can easily debug and validate written output
  * The JSON results are also tested against the corresponding binary representation, similar to the integration tests
* A [suite of test-data helpers](https://github.com/trxcllnt/arrow/blob/d9970bb9a6a9d80bbe07b321dc6389bccf1b0835/js/test/generate-test-data.ts) has been added to auto-generate data for validation at runtime
  * They produce the underlying Arrow VectorData buffers, as well as the expected plain-JS-value representation [for verification](https://github.com/trxcllnt/arrow/blob/d9970bb9a6a9d80bbe07b321dc6389bccf1b0835/js/test/unit/generated-data-tests.ts#L23)
  * This allows us to test all possible type configuration combinations, e.g. [all types Dictionary-encode](https://github.com/trxcllnt/arrow/blob/d9970bb9a6a9d80bbe07b321dc6389bccf1b0835/js/test/data/tables.ts#L61), all types serialize when nested, etc.
* A [suite of IO test helpers](https://github.com/trxcllnt/arrow/blob/d9970bb9a6a9d80bbe07b321dc6389bccf1b0835/js/test/unit/ipc/helpers.ts#L36) has been added
  * We use [`memfs`](https://www.npmjs.com/package/memfs) to mock the file system, which contributes to test performance improvements
  * This enables us to [easily test](https://github.com/trxcllnt/arrow/blob/d9970bb9a6a9d80bbe07b321dc6389bccf1b0835/js/test/unit/ipc/reader/file-reader-tests.ts#L38) all the flavors of io primitives across node and browser environments
* A vscode debugging launch configuration has been added to ease the process of contributing more tests (and because I've been asked for mine so often)

### The build

* Faster
* Node 11+ (needs `Symbol.asyncIterator` enabled)
* Closure-compiler upgrades and build enhancements mean we can auto-generate the externs file during compilation, rather than maintaining it by hand

### Misc

* Added `arrow2csv` to `js/bin/arrow2csv`, so anybody with the JS project dependencies installed can easily view a CSV-ish thing (`cat foo.arrow | js/bin/arrow2csv.js`)

### Todos

* Docs/Recipes/Examples
* Highlight/write more tools (like `arrow2csv`)
* Flesh out the RecordBatchWriters a bit more
* Gather feedback on the new RecordBatchReader APIs

Author: ptaylor
Author: Paul Taylor

Closes #3290 from trxcllnt/js-data-refactor and squashes the following commits:

2ef150f7e bind getByteWidth to the vector type
9acfaa367 handle the case where collapsed Uint8Arrays fully overlap
6a97ee09f perf: defer creating rowProxy on nested types, use Array instead of Object for creating Data instances
2cad76065 pipe directly to stdout to ensure backpressure is preserved
f006a2681 ensure schema and field always have a metadata map
8dc5d2cbf fix Float64 Array typings
162c7d873 fix arrow2csv left-pad measurement for new bignum/decimal output
64dc01519 teach closure about Symbol.toPrimitive
ca0db9e2a fix lint
ec12cdd18 add a small BigNum mixin to make working with Int64 and Decimal values a bit easier
62578b93e fix bug where valueToString function would return undefined (JSON.striingify(undefined) === undefined)
4b58bde06 fix visitor
method overload type signatures d16541335 don't print comma that includes system paths 708f1b4e7 move stride to data, fix chunked slicing, remove intermediate binding and getters in favor of direct property accesses 78ecc4cfd use the textencoders from the global instead of Buffer for perf testing 47f0677bf perf: use a closure instead of binding 380dbc7de add a single-chunk column type 6bcaad6ac fix lint f7d2b2ef2 add getters for the dictionary and indices of chunked dictionary vectors aaf42c8a9 Consolidated JS data handling refactor --- .travis.yml | 4 +- ci/travis_script_integration.sh | 2 +- ci/travis_script_js.sh | 5 +- integration/integration_test.py | 2 +- js/.gitignore | 8 +- js/.vscode/launch.json | 169 + js/README.md | 43 +- js/bin/arrow2csv.js | 27 + js/bin/file-to-stream.js | 27 +- js/bin/integration.js | 229 +- js/bin/json-to-arrow.js | 53 +- js/bin/print-buffer-alignment.js | 53 +- js/bin/stream-to-file.js | 27 +- js/examples/read_file.html | 2 +- js/gulp/argv.js | 27 +- js/gulp/arrow-task.js | 10 +- js/gulp/clean-task.js | 13 +- js/gulp/closure-task.js | 192 +- js/gulp/{build-task.js => compile-task.js} | 6 +- js/gulp/memoize-task.js | 10 +- js/gulp/minify-task.js | 101 +- js/gulp/package-task.js | 21 +- js/gulp/test-task.js | 47 +- js/gulp/typescript-task.js | 19 +- js/gulp/util.js | 59 +- js/gulpfile.js | 71 +- js/index.ts | 2 +- js/jest.config.js | 56 + js/jest.coverage.config.js | 30 + js/npm-release.sh | 6 +- js/package-lock.json | 8050 ++++++----------- js/package.json | 110 +- js/perf/index.js | 24 +- js/src/Arrow.dom.ts | 86 + js/src/Arrow.externs.js | 814 -- js/src/Arrow.node.ts | 29 + js/src/Arrow.ts | 374 +- js/src/bin/arrow2csv.ts | 224 +- js/src/column.ts | 100 + js/src/compute/dataframe.ts | 209 + js/src/{ => compute}/predicate.ts | 37 +- js/src/data.ts | 482 +- js/src/enum.ts | 95 + js/src/fb/Schema.ts | 2 +- js/src/interfaces.ts | 240 + js/src/io/adapters.ts | 386 + js/src/io/file.ts | 116 + js/src/io/interfaces.ts | 180 + js/src/io/stream.ts | 158 + js/src/ipc/magic.ts | 53 - js/src/ipc/message.ts | 249 + js/src/ipc/metadata.ts | 96 - js/src/ipc/metadata/file.ts | 163 + js/src/ipc/metadata/json.ts | 208 + js/src/ipc/metadata/message.ts | 593 ++ js/src/ipc/node/iterable.ts | 106 + js/src/ipc/node/reader.ts | 85 + js/src/ipc/node/writer.ts | 76 + js/src/ipc/reader.ts | 737 ++ js/src/ipc/reader/arrow.ts | 55 - js/src/ipc/reader/binary.ts | 432 - js/src/ipc/reader/json.ts | 304 - js/src/ipc/reader/node.ts | 78 - js/src/ipc/reader/vector.ts | 131 - js/src/ipc/whatwg/iterable.ts | 88 + js/src/ipc/whatwg/reader.ts | 52 + js/src/ipc/whatwg/writer.ts | 50 + js/src/ipc/writer.ts | 417 + js/src/ipc/writer/binary.ts | 725 -- js/src/recordbatch.ts | 126 +- js/src/schema.ts | 107 + js/src/table.ts | 430 +- js/src/type.ts | 671 +- js/src/util/bit.ts | 35 +- js/src/util/bn.ts | 171 + js/src/util/buffer.ts | 228 + js/src/util/compat.ts | 153 +- js/src/util/int.ts | 94 +- js/src/util/node.ts | 93 - js/src/util/pretty.ts | 33 +- js/src/util/utf8.ts | 47 + js/src/util/vector.ts | 134 + js/src/vector.ts | 481 +- js/src/vector/base.ts | 109 + .../{ipc/writer/arrow.ts => vector/binary.ts} | 26 +- js/src/vector/bool.ts | 32 + js/src/vector/chunked.ts | 314 +- js/src/vector/date.ts | 43 + js/src/vector/decimal.ts | 21 + js/src/vector/dictionary.ts | 65 +- js/src/vector/fixedsizebinary.ts | 22 + js/src/vector/fixedsizelist.ts | 22 + js/src/vector/flat.ts | 290 - js/src/vector/float.ts | 37 + js/src/vector/index.ts | 183 + js/src/vector/int.ts | 53 + js/src/vector/interval.ts | 23 + 
js/src/vector/list.ts | 131 +- js/src/vector/map.ts | 32 + js/src/vector/nested.ts | 247 - js/src/vector/null.ts | 21 + js/src/vector/row.ts | 100 + js/src/vector/struct.ts | 32 + js/src/vector/time.ts | 25 + js/src/vector/timestamp.ts | 25 + js/src/vector/union.ts | 29 + js/src/vector/utf8.ts | 37 + js/src/vector/validity.ts | 75 - js/src/vector/view.ts | 9 - js/src/visitor.ts | 326 +- js/src/visitor/bytewidth.ts | 65 + js/src/visitor/get.ts | 315 + js/src/visitor/indexof.ts | 181 + js/src/visitor/iterator.ts | 170 + js/src/visitor/jsontypeassembler.ts | 88 + js/src/visitor/jsonvectorassembler.ts | 181 + js/src/visitor/set.ts | 326 + js/src/visitor/toarray.ts | 151 + js/src/visitor/typeassembler.ts | 154 + js/src/visitor/typector.ts | 79 + js/src/visitor/vectorassembler.ts | 230 + js/src/visitor/vectorctor.ts | 96 + js/src/visitor/vectorloader.ts | 132 + js/test/Arrow.ts | 36 +- js/test/data/tables.ts | 85 + js/test/generate-test-data.ts | 657 ++ js/test/inference/column.ts | 70 + js/test/inference/nested.ts | 46 + js/test/inference/visitor/get.ts | 39 + js/test/integration/test-config.ts | 52 - js/test/integration/validate-tests.ts | 213 - js/test/jest-extensions.ts | 156 +- js/test/tsconfig.coverage.json | 6 + js/test/tsconfig.json | 11 +- js/test/unit/generated-data-tests.ts | 238 + js/test/unit/int-tests.ts | 2 +- js/test/unit/ipc/helpers.ts | 206 + js/test/unit/ipc/message-reader-tests.ts | 116 + js/test/unit/ipc/reader/file-reader-tests.ts | 123 + .../unit/ipc/reader/from-inference-tests.ts | 152 + js/test/unit/ipc/reader/json-reader-tests.ts | 43 + .../unit/ipc/reader/stream-reader-tests.ts | 65 + js/test/unit/ipc/reader/streams-dom-tests.ts | 189 + js/test/unit/ipc/reader/streams-node-tests.ts | 188 + js/test/unit/ipc/validate.ts | 74 + js/test/unit/ipc/writer/file-writer-tests.ts | 46 + js/test/unit/ipc/writer/json-writer-tests.ts | 49 + .../unit/ipc/writer/stream-writer-tests.ts | 71 + js/test/unit/ipc/writer/streams-dom-tests.ts | 283 + js/test/unit/ipc/writer/streams-node-tests.ts | 277 + js/test/unit/table-tests.ts | 176 +- js/test/unit/vector-tests.ts | 433 - js/test/unit/vector/bool-vector-tests.ts | 102 + .../unit/{ => vector}/date-vector-tests.ts | 24 +- js/test/unit/vector/float16-vector-tests.ts | 73 + js/test/unit/vector/numeric-vector-tests.ts | 190 + js/test/unit/vector/vector-tests.ts | 127 + js/test/unit/visitor-tests.ts | 168 + js/test/unit/writer-tests.ts | 62 - js/tsconfig/tsconfig.base.json | 2 +- js/tsconfig/tsconfig.bin.cjs.json | 2 +- js/tsconfig/tsconfig.es5.cls.json | 4 +- 162 files changed, 17451 insertions(+), 12440 deletions(-) create mode 100644 js/.vscode/launch.json create mode 100755 js/bin/arrow2csv.js rename js/gulp/{build-task.js => compile-task.js} (90%) create mode 100644 js/jest.config.js create mode 100644 js/jest.coverage.config.js create mode 100644 js/src/Arrow.dom.ts delete mode 100644 js/src/Arrow.externs.js create mode 100644 js/src/Arrow.node.ts create mode 100644 js/src/column.ts create mode 100644 js/src/compute/dataframe.ts rename js/src/{ => compute}/predicate.ts (94%) create mode 100644 js/src/enum.ts create mode 100644 js/src/interfaces.ts create mode 100644 js/src/io/adapters.ts create mode 100644 js/src/io/file.ts create mode 100644 js/src/io/interfaces.ts create mode 100644 js/src/io/stream.ts delete mode 100644 js/src/ipc/magic.ts create mode 100644 js/src/ipc/message.ts delete mode 100644 js/src/ipc/metadata.ts create mode 100644 js/src/ipc/metadata/file.ts create mode 100644 js/src/ipc/metadata/json.ts create mode 100644 
js/src/ipc/metadata/message.ts create mode 100644 js/src/ipc/node/iterable.ts create mode 100644 js/src/ipc/node/reader.ts create mode 100644 js/src/ipc/node/writer.ts create mode 100644 js/src/ipc/reader.ts delete mode 100644 js/src/ipc/reader/arrow.ts delete mode 100644 js/src/ipc/reader/binary.ts delete mode 100644 js/src/ipc/reader/json.ts delete mode 100644 js/src/ipc/reader/node.ts delete mode 100644 js/src/ipc/reader/vector.ts create mode 100644 js/src/ipc/whatwg/iterable.ts create mode 100644 js/src/ipc/whatwg/reader.ts create mode 100644 js/src/ipc/whatwg/writer.ts create mode 100644 js/src/ipc/writer.ts delete mode 100644 js/src/ipc/writer/binary.ts create mode 100644 js/src/schema.ts create mode 100644 js/src/util/bn.ts create mode 100644 js/src/util/buffer.ts delete mode 100644 js/src/util/node.ts create mode 100644 js/src/util/utf8.ts create mode 100644 js/src/util/vector.ts create mode 100644 js/src/vector/base.ts rename js/src/{ipc/writer/arrow.ts => vector/binary.ts} (52%) create mode 100644 js/src/vector/bool.ts create mode 100644 js/src/vector/date.ts create mode 100644 js/src/vector/decimal.ts create mode 100644 js/src/vector/fixedsizebinary.ts create mode 100644 js/src/vector/fixedsizelist.ts delete mode 100644 js/src/vector/flat.ts create mode 100644 js/src/vector/float.ts create mode 100644 js/src/vector/index.ts create mode 100644 js/src/vector/int.ts create mode 100644 js/src/vector/interval.ts create mode 100644 js/src/vector/map.ts delete mode 100644 js/src/vector/nested.ts create mode 100644 js/src/vector/null.ts create mode 100644 js/src/vector/row.ts create mode 100644 js/src/vector/struct.ts create mode 100644 js/src/vector/time.ts create mode 100644 js/src/vector/timestamp.ts create mode 100644 js/src/vector/union.ts create mode 100644 js/src/vector/utf8.ts delete mode 100644 js/src/vector/validity.ts delete mode 100644 js/src/vector/view.ts create mode 100644 js/src/visitor/bytewidth.ts create mode 100644 js/src/visitor/get.ts create mode 100644 js/src/visitor/indexof.ts create mode 100644 js/src/visitor/iterator.ts create mode 100644 js/src/visitor/jsontypeassembler.ts create mode 100644 js/src/visitor/jsonvectorassembler.ts create mode 100644 js/src/visitor/set.ts create mode 100644 js/src/visitor/toarray.ts create mode 100644 js/src/visitor/typeassembler.ts create mode 100644 js/src/visitor/typector.ts create mode 100644 js/src/visitor/vectorassembler.ts create mode 100644 js/src/visitor/vectorctor.ts create mode 100644 js/src/visitor/vectorloader.ts create mode 100644 js/test/data/tables.ts create mode 100644 js/test/generate-test-data.ts create mode 100644 js/test/inference/column.ts create mode 100644 js/test/inference/nested.ts create mode 100644 js/test/inference/visitor/get.ts delete mode 100644 js/test/integration/test-config.ts delete mode 100644 js/test/integration/validate-tests.ts create mode 100644 js/test/tsconfig.coverage.json create mode 100644 js/test/unit/generated-data-tests.ts create mode 100644 js/test/unit/ipc/helpers.ts create mode 100644 js/test/unit/ipc/message-reader-tests.ts create mode 100644 js/test/unit/ipc/reader/file-reader-tests.ts create mode 100644 js/test/unit/ipc/reader/from-inference-tests.ts create mode 100644 js/test/unit/ipc/reader/json-reader-tests.ts create mode 100644 js/test/unit/ipc/reader/stream-reader-tests.ts create mode 100644 js/test/unit/ipc/reader/streams-dom-tests.ts create mode 100644 js/test/unit/ipc/reader/streams-node-tests.ts create mode 100644 js/test/unit/ipc/validate.ts create mode 100644 
js/test/unit/ipc/writer/file-writer-tests.ts create mode 100644 js/test/unit/ipc/writer/json-writer-tests.ts create mode 100644 js/test/unit/ipc/writer/stream-writer-tests.ts create mode 100644 js/test/unit/ipc/writer/streams-dom-tests.ts create mode 100644 js/test/unit/ipc/writer/streams-node-tests.ts delete mode 100644 js/test/unit/vector-tests.ts create mode 100644 js/test/unit/vector/bool-vector-tests.ts rename js/test/unit/{ => vector}/date-vector-tests.ts (77%) create mode 100644 js/test/unit/vector/float16-vector-tests.ts create mode 100644 js/test/unit/vector/numeric-vector-tests.ts create mode 100644 js/test/unit/vector/vector-tests.ts create mode 100644 js/test/unit/visitor-tests.ts delete mode 100644 js/test/unit/writer-tests.ts diff --git a/.travis.yml b/.travis.yml index 8532cc7f3b662..c57c473c041f9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -230,7 +230,7 @@ matrix: - if [ $ARROW_CI_INTEGRATION_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh - - nvm install 10.1 + - nvm install 11.6 - $TRAVIS_BUILD_DIR/ci/travis_before_script_js.sh - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh script: @@ -240,7 +240,7 @@ matrix: language: node_js os: linux node_js: - - '10.1' + - '11.6' before_script: - if [ $ARROW_CI_JS_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh diff --git a/ci/travis_script_integration.sh b/ci/travis_script_integration.sh index 9c2786282b08b..342db58b5dfd3 100755 --- a/ci/travis_script_integration.sh +++ b/ci/travis_script_integration.sh @@ -36,7 +36,7 @@ pushd $ARROW_JS_DIR # lint and compile JS source npm run lint -npm run build +npm run build -- -t apache-arrow popd diff --git a/ci/travis_script_js.sh b/ci/travis_script_js.sh index 1871b4265cd01..34b07115e70b1 100755 --- a/ci/travis_script_js.sh +++ b/ci/travis_script_js.sh @@ -23,9 +23,10 @@ source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh pushd $ARROW_JS_DIR -npm run lint +npm run lint:ci npm run build -# run the non-snapshot unit tests npm test +npm run test:coverage +bash <(curl -s https://codecov.io/bash) || echo "Codecov did not collect coverage reports" popd diff --git a/integration/integration_test.py b/integration/integration_test.py index 7101af2516ad9..c0191c372915c 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -1122,7 +1122,7 @@ def _run(self, exe_cmd, arrow_path=None, json_path=None, if json_path is not None: cmd.extend(['-j', json_path]) - cmd.extend(['--mode', command, '-t', 'es5', '-m', 'umd']) + cmd.extend(['--mode', command]) if self.debug: print(' '.join(cmd)) diff --git a/js/.gitignore b/js/.gitignore index 3437e39da6c0a..5e412f8ee8a57 100644 --- a/js/.gitignore +++ b/js/.gitignore @@ -23,7 +23,8 @@ npm-debug.log* yarn-debug.log* yarn-error.log* -.vscode +.vscode/** +!.vscode/launch.json # Runtime data pids @@ -78,10 +79,13 @@ yarn.lock .env # compilation targets +doc dist targets # test data files -test/data/ +test/data/**/*.json +test/data/**/*.arrow + # jest snapshots (too big) test/__snapshots__/ diff --git a/js/.vscode/launch.json b/js/.vscode/launch.json new file mode 100644 index 0000000000000..ba5609e0c10e8 --- /dev/null +++ b/js/.vscode/launch.json @@ -0,0 +1,169 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. 
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "type": "node", + "request": "launch", + "name": "Debug Gulp Build", + "program": "${workspaceFolder}/node_modules/gulp/bin/gulp.js", + "args": [ + "build", + // Specify we want to debug the "src" target, which won't clean or build -- essentially a "dry-run" of the gulp build + "--target", "src" + ] + }, + { + "type": "node", + "request": "launch", + "name": "Debug Unit Tests", + "cwd": "${workspaceRoot}", + "program": "${workspaceFolder}/node_modules/.bin/jest", + "skipFiles": [ + "/**/*.js", + "${workspaceFolder}/node_modules/**/*.js" + ], + "env": { + "NODE_NO_WARNINGS": "1", + "READABLE_STREAM": "disable", + "TEST_DOM_STREAMS": "true", + "TEST_NODE_STREAMS": "true", + // Modify these environment variables to run tests on a specific compilation target + module format combo + "TEST_TS_SOURCE": "true", + // "TEST_TS_SOURCE": "false", + // "TEST_TARGET": "es5", + // "TEST_MODULE": "umd" + }, + "args": [ + // "-i", + "test/unit/", + + // Uncomment any of these to run individual test suites + // "test/unit/int-tests.ts", + // "test/unit/table-tests.ts", + // "test/unit/generated-data-tests.ts", + + // "test/unit/vector/vector-tests.ts", + // "test/unit/vector/bool-vector-tests.ts", + // "test/unit/vector/date-vector-tests.ts", + // "test/unit/vector/float16-vector-tests.ts", + // "test/unit/vector/numeric-vector-tests.ts", + + // "test/unit/visitor-tests.ts", + + // "test/unit/ipc/message-reader-tests.ts", + // "test/unit/ipc/reader/file-reader-tests.ts", + // "test/unit/ipc/reader/json-reader-tests.ts", + // "test/unit/ipc/reader/from-inference-tests.ts", + // "test/unit/ipc/reader/stream-reader-tests.ts", + // "test/unit/ipc/reader/streams-dom-tests.ts", + // "test/unit/ipc/reader/streams-node-tests.ts", + // "test/unit/ipc/writer/file-writer-tests.ts", + // "test/unit/ipc/writer/json-writer-tests.ts", + // "test/unit/ipc/writer/stream-writer-tests.ts", + // "test/unit/ipc/writer/streams-dom-tests.ts", + // "test/unit/ipc/writer/streams-node-tests.ts", + ] + }, + { + "type": "node", + "request": "launch", + "name": "Debug Integration Tests", + "cwd": "${workspaceRoot}", + "program": "${workspaceFolder}/bin/integration.js", + "skipFiles": [ + "/**/*.js", + "${workspaceFolder}/node_modules/**/*.js" + ], + "env": { + "NODE_NO_WARNINGS": "1", + "READABLE_STREAM": "disable" + }, + "args": [ + "--mode", "VALIDATE" + ] + }, + { + "type": "node", + "request": "launch", + "name": "Debug bin/arrow2csv", + "env": { "ARROW_JS_DEBUG": "src", "TS_NODE_CACHE": "false" }, + "runtimeArgs": ["-r", "ts-node/register"], + "console": "integratedTerminal", + "skipFiles": [ + "/**/*.js", + "${workspaceFolder}/node_modules/**/*.js" + ], + "args": [ + "${workspaceFolder}/src/bin/arrow2csv.ts", + "-f", "./test/data/cpp/stream/simple.arrow" + ] + }, + { + "type": "node", + "request": "launch", + "name": "Debug bin/file-to-stream", + "env": { "ARROW_JS_DEBUG": "src", "TS_NODE_CACHE": "false" }, + "runtimeArgs": ["-r", "ts-node/register"], + "skipFiles": [ + "/**/*.js", + "${workspaceFolder}/node_modules/**/*.js" + ], + "args": [ + "${workspaceFolder}/bin/file-to-stream.js", + "./test/data/cpp/file/struct_example.arrow", + "./struct_example-stream-out.arrow", + ] + }, + { + "type": "node", + "request": "launch", + "name": "Debug bin/stream-to-file", + "env": { "ARROW_JS_DEBUG": "src", "TS_NODE_CACHE": "false" }, + "runtimeArgs": ["-r", "ts-node/register"], + "skipFiles": [ + "/**/*.js", + 
"${workspaceFolder}/node_modules/**/*.js" + ], + "args": [ + "${workspaceFolder}/bin/stream-to-file.js", + "./test/data/cpp/stream/struct_example.arrow", + "./struct_example-file-out.arrow", + ] + }, + { + "type": "node", + "request": "launch", + "name": "Debug bin/json-to-arrow", + "env": { "ARROW_JS_DEBUG": "src", "TS_NODE_CACHE": "false" }, + "runtimeArgs": ["-r", "ts-node/register"], + "skipFiles": [ + "/**/*.js", + "${workspaceFolder}/node_modules/**/*.js" + ], + "args": [ + "${workspaceFolder}/bin/json-to-arrow.js", + "-j", "./test/data/json/struct_example.json", + "-a", "./struct_example-stream-out.arrow", + "-f", "stream" + ] + }, + { + "type": "node", + "request": "launch", + "name": "Debug bin/print-buffer-alignment", + "env": { "ARROW_JS_DEBUG": "src", "TS_NODE_CACHE": "false" }, + "runtimeArgs": ["-r", "ts-node/register"], + "skipFiles": [ + "/**/*.js", + "${workspaceFolder}/node_modules/**/*.js" + ], + "args": [ + "${workspaceFolder}/bin/print-buffer-alignment.js", + "./test/data/cpp/stream/struct_example.arrow" + ] + } + ] +} diff --git a/js/README.md b/js/README.md index 15d7ed03f65a4..0af4fecabccc9 100644 --- a/js/README.md +++ b/js/README.md @@ -49,7 +49,7 @@ Check out our [API documentation][7] to learn more about how to use Apache Arrow ### Get a table from an Arrow file on disk (in IPC format) -```es6 +```js import { readFileSync } from 'fs'; import { Table } from 'apache-arrow'; @@ -70,7 +70,7 @@ null, null, null ### Create a Table when the Arrow file is split across buffers -```es6 +```js import { readFileSync } from 'fs'; import { Table } from 'apache-arrow'; @@ -93,12 +93,24 @@ console.log(table.toString()); ### Create a Table from JavaScript arrays -```es6 +```js +import { + Table, + FloatVector, + DateVector +} from 'apache-arrow'; + const LENGTH = 2000; -const rainAmounts = Float32Array.from({length: LENGTH}, () => Number((Math.random() * 20).toFixed(1))); -const rainDates = Array.from({length: LENGTH}, (_, i) => new Date(Date.now() - 1000 * 60 * 60 * 24 * i)); -const rainfall = arrow.Table.fromVectors( +const rainAmounts = Float32Array.from( + { length: LENGTH }, + () => Number((Math.random() * 20).toFixed(1))); + +const rainDates = Array.from( + { length: LENGTH }, + (_, i) => new Date(Date.now() - 1000 * 60 * 60 * 24 * i)); + +const rainfall = Table.fromVectors( [FloatVector.from(rainAmounts), DateVector.from(rainDates)], ['precipitation', 'date'] ); @@ -106,20 +118,17 @@ const rainfall = arrow.Table.fromVectors( ### Load data with `fetch` -```es6 +```js import { Table } from "apache-arrow"; -fetch(require("simple.arrow")).then(response => { - response.arrayBuffer().then(buffer => { - const table = Table.from(new Uint8Array(buffer)); - console.log(table.toString()); - }); -}); +const table = await Table.from(fetch(("/simple.arrow"))); +console.log(table.toString()); + ``` ### Columns look like JS Arrays -```es6 +```js import { readFileSync } from 'fs'; import { Table } from 'apache-arrow'; @@ -131,7 +140,7 @@ const table = Table.from([ const column = table.getColumn('origin_lat'); // Copy the data into a TypedArray -const typed = column.slice(); +const typed = column.toArray(); assert(typed instanceof Float32Array); for (let i = -1, n = column.length; ++i < n;) { @@ -141,7 +150,7 @@ for (let i = -1, n = column.length; ++i < n;) { ### Usage with MapD Core -```es6 +```js import MapD from 'rxjs-mapd'; import { Table } from 'apache-arrow'; @@ -164,7 +173,7 @@ MapD.open(host, port) ) .map(([schema, records]) => // Create Arrow Table from results - 
Table.from(schema, records)) + Table.from([schema, records])) .map((table) => // Stringify the table to CSV with row numbers table.toString({ index: true })) diff --git a/js/bin/arrow2csv.js b/js/bin/arrow2csv.js new file mode 100755 index 0000000000000..afd59736bf521 --- /dev/null +++ b/js/bin/arrow2csv.js @@ -0,0 +1,27 @@ +#! /usr/bin/env node + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +const Path = require(`path`); +const here = Path.resolve(__dirname, '../'); +const tsnode = require.resolve(`ts-node/register`); +const arrow2csv = Path.join(here, `src/bin/arrow2csv.ts`); + +require('child_process').spawn(`node`, [ + `-r`, tsnode, arrow2csv, ...process.argv.slice(2) +], { cwd: here, env: process.env, stdio: `inherit` }); diff --git a/js/bin/file-to-stream.js b/js/bin/file-to-stream.js index fa4e5d17bbd3a..090cd0b0eda77 100755 --- a/js/bin/file-to-stream.js +++ b/js/bin/file-to-stream.js @@ -17,21 +17,24 @@ // specific language governing permissions and limitations // under the License. +// @ts-check + const fs = require('fs'); const path = require('path'); - -const encoding = 'binary'; -const ext = process.env.ARROW_JS_DEBUG === 'src' ? '.ts' : ''; -const { util: { PipeIterator } } = require(`../index${ext}`); -const { Table, serializeStream, fromReadableStream } = require(`../index${ext}`); +const eos = require('util').promisify(require('stream').finished); +const extension = process.env.ARROW_JS_DEBUG === 'src' ? '.ts' : ''; +const { RecordBatchReader, RecordBatchStreamWriter } = require(`../index${extension}`); (async () => { - // Todo (ptaylor): implement `serializeStreamAsync` that accepts an - // AsyncIterable, rather than aggregating into a Table first - const in_ = process.argv.length < 3 - ? process.stdin : fs.createReadStream(path.resolve(process.argv[2])); - const out = process.argv.length < 4 - ? process.stdout : fs.createWriteStream(path.resolve(process.argv[3])); - new PipeIterator(serializeStream(await Table.fromAsync(fromReadableStream(in_))), encoding).pipe(out); + + const readable = process.argv.length < 3 ? process.stdin : fs.createReadStream(path.resolve(process.argv[2])); + const writable = process.argv.length < 4 ? process.stdout : fs.createWriteStream(path.resolve(process.argv[3])); + + const fileToStream = readable + .pipe(RecordBatchReader.throughNode()) + .pipe(RecordBatchStreamWriter.throughNode()) + .pipe(writable); + + await eos(fileToStream); })().catch((e) => { console.error(e); process.exit(1); }); diff --git a/js/bin/integration.js b/js/bin/integration.js index 6c064deac258d..c6f6cd7a24ed5 100755 --- a/js/bin/integration.js +++ b/js/bin/integration.js @@ -17,61 +17,55 @@ // specific language governing permissions and limitations // under the License. 
+// @ts-nocheck + const fs = require('fs'); -const glob = require('glob'); -const path = require('path'); -const child_process = require(`child_process`); +const Path = require('path'); +const { promisify } = require('util'); +const glob = promisify(require('glob')); +const { zip } = require('ix/iterable/zip'); +const { parse: bignumJSONParse } = require('json-bignum'); const argv = require(`command-line-args`)(cliOpts(), { partial: true }); -const gulpPath = require.resolve(path.join(`..`, `node_modules/gulp/bin/gulp.js`)); - -let jsonPaths = [...(argv.json || [])]; -let arrowPaths = [...(argv.arrow || [])]; +const { + Table, + RecordBatchReader, + util: { createElementComparator } +} = require('../targets/apache-arrow/Arrow.es5.min'); -if (!argv.mode) { - return print_usage(); +const exists = async (p) => { + try { + return !!(await fs.promises.stat(p)); + } catch (e) { return false; } } -let mode = argv.mode.toUpperCase(); -if (mode === 'VALIDATE' && !jsonPaths.length) { - jsonPaths = glob.sync(path.resolve(__dirname, `../test/data/json/`, `*.json`)); - if (!arrowPaths.length) { - [jsonPaths, arrowPaths] = jsonPaths.reduce(([jsonPaths, arrowPaths], jsonPath) => { - const { name } = path.parse(jsonPath); - for (const source of ['cpp', 'java']) { - for (const format of ['file', 'stream']) { - const arrowPath = path.resolve(__dirname, `../test/data/${source}/${format}/${name}.arrow`); - if (fs.existsSync(arrowPath)) { - jsonPaths.push(jsonPath); - arrowPaths.push(arrowPath); - } - } - } - return [jsonPaths, arrowPaths]; - }, [[], []]); - console.log(`jsonPaths: [\n\t${jsonPaths.join('\n\t')}\n]`); - console.log(`arrowPaths: [\n\t${arrowPaths.join('\n\t')}\n]`); +(async () => { + + if (!argv.mode) { return print_usage(); } + + let mode = argv.mode.toUpperCase(); + let jsonPaths = [...(argv.json || [])]; + let arrowPaths = [...(argv.arrow || [])]; + + if (mode === 'VALIDATE' && !jsonPaths.length) { + [jsonPaths, arrowPaths] = await loadLocalJSONAndArrowPathsForDebugging(jsonPaths, arrowPaths); } -} else if (!jsonPaths.length) { - return print_usage(); -} -switch (mode) { - case 'VALIDATE': - const args = [`test`, `-i`].concat(argv._unknown || []); - jsonPaths.forEach((p, i) => { - args.push('-j', p, '-a', arrowPaths[i]); - }); - process.exitCode = child_process.spawnSync( - gulpPath, args, - { - cwd: path.resolve(__dirname, '..'), - stdio: ['ignore', 'inherit', 'inherit'] + if (!jsonPaths.length) { return print_usage(); } + + switch (mode) { + case 'VALIDATE': + for (let [jsonPath, arrowPath] of zip(jsonPaths, arrowPaths)) { + await validate(jsonPath, arrowPath); } - ).status || process.exitCode || 0; - break; - default: - print_usage(); -} + break; + default: + return print_usage(); + } +})() +.then((x) => +x || 0, (e) => { + e && process.stderr.write(`${e && e.stack || e}\n`); + return process.exitCode || 1; +}).then((code) => process.exit(code)); function cliOpts() { return [ @@ -118,5 +112,144 @@ function print_usage() { ] }, ])); - process.exit(1); + return 1; +} + +async function validate(jsonPath, arrowPath) { + + const files = await Promise.all([ + fs.promises.readFile(arrowPath), + fs.promises.readFile(jsonPath, 'utf8'), + ]); + + const arrowData = files[0]; + const jsonData = bignumJSONParse(files[1]); + + validateReaderIntegration(jsonData, arrowData); + validateTableFromBuffersIntegration(jsonData, arrowData); + validateTableToBuffersIntegration('json', 'file')(jsonData, arrowData); + validateTableToBuffersIntegration('json', 'file')(jsonData, arrowData); + 
validateTableToBuffersIntegration('binary', 'file')(jsonData, arrowData); + validateTableToBuffersIntegration('binary', 'file')(jsonData, arrowData); +} + +function validateReaderIntegration(jsonData, arrowBuffer) { + const msg = `json and arrow record batches report the same values`; + try { + const jsonReader = RecordBatchReader.from(jsonData); + const binaryReader = RecordBatchReader.from(arrowBuffer); + for (const [jsonRecordBatch, binaryRecordBatch] of zip(jsonReader, binaryReader)) { + compareTableIsh(jsonRecordBatch, binaryRecordBatch); + } + } catch (e) { throw new Error(`${msg}: fail \n ${e && e.stack || e}`); } + process.stdout.write(`${msg}: pass\n`); +} + +function validateTableFromBuffersIntegration(jsonData, arrowBuffer) { + const msg = `json and arrow tables report the same values`; + try { + const jsonTable = Table.from(jsonData); + const binaryTable = Table.from(arrowBuffer); + compareTableIsh(jsonTable, binaryTable); + } catch (e) { throw new Error(`${msg}: fail \n ${e && e.stack || e}`); } + process.stdout.write(`${msg}: pass\n`); +} + +function validateTableToBuffersIntegration(srcFormat, arrowFormat) { + const refFormat = srcFormat === `json` ? `binary` : `json`; + return function testTableToBuffersIntegration(jsonData, arrowBuffer) { + const msg = `serialized ${srcFormat} ${arrowFormat} reports the same values as the ${refFormat} ${arrowFormat}`; + try { + const refTable = Table.from(refFormat === `json` ? jsonData : arrowBuffer); + const srcTable = Table.from(srcFormat === `json` ? jsonData : arrowBuffer); + const dstTable = Table.from(srcTable.serialize(`binary`, arrowFormat === `stream`)); + compareTableIsh(dstTable, refTable); + } catch (e) { throw new Error(`${msg}: fail \n ${e && e.stack || e}`); } + process.stdout.write(`${msg}: pass\n`); + }; +} + +function compareTableIsh(actual, expected) { + if (actual.length !== expected.length) { + throw new Error(`length: ${actual.length} !== ${expected.length}`); + } + if (actual.numCols !== expected.numCols) { + throw new Error(`numCols: ${actual.numCols} !== ${expected.numCols}`); + } + (() => { + const getChildAtFn = expected instanceof Table ? 'getColumnAt' : 'getChildAt'; + for (let i = -1, n = actual.numCols; ++i < n;) { + const v1 = actual[getChildAtFn](i); + const v2 = expected[getChildAtFn](i); + compareVectors(v1, v2); + } + })(); +} + +function compareVectors(actual, expected) { + + if ((actual == null && expected != null) || (expected == null && actual != null)) { + throw new Error(`${actual == null ? `actual` : `expected`} is null, was expecting ${actual == null ? 
expected : actual} to be that also`); + } + + let props = ['type', 'length', 'nullCount']; + + (() => { + for (let i = -1, n = props.length; ++i < n;) { + const prop = props[i]; + if (`${actual[prop]}` !== `${expected[prop]}`) { + throw new Error(`${prop}: ${actual[prop]} !== ${expected[prop]}`); + } + } + })(); + + (() => { + for (let i = -1, n = actual.length; ++i < n;) { + let x1 = actual.get(i), x2 = expected.get(i); + if (!createElementComparator(x2)(x1)) { + throw new Error(`${i}: ${x1} !== ${x2}`); + } + } + })(); + + (() => { + let i = -1; + for (let [x1, x2] of zip(actual, expected)) { + ++i; + if (!createElementComparator(x2)(x1)) { + throw new Error(`${i}: ${x1} !== ${x2}`); + } + } + })(); +} + +async function loadLocalJSONAndArrowPathsForDebugging(jsonPaths, arrowPaths) { + + const sourceJSONPaths = await glob(Path.resolve(__dirname, `../test/data/json/`, `*.json`)); + + if (!arrowPaths.length) { + await loadJSONAndArrowPaths(sourceJSONPaths, jsonPaths, arrowPaths, 'cpp', 'file'); + await loadJSONAndArrowPaths(sourceJSONPaths, jsonPaths, arrowPaths, 'java', 'file'); + await loadJSONAndArrowPaths(sourceJSONPaths, jsonPaths, arrowPaths, 'cpp', 'stream'); + await loadJSONAndArrowPaths(sourceJSONPaths, jsonPaths, arrowPaths, 'java', 'stream'); + } + + for (let [jsonPath, arrowPath] of zip(jsonPaths, arrowPaths)) { + console.log(`jsonPath: ${jsonPath}`); + console.log(`arrowPath: ${arrowPath}`); + } + + return [jsonPaths, arrowPaths]; + + async function loadJSONAndArrowPaths(sourceJSONPaths, jsonPaths, arrowPaths, source, format) { + for (const jsonPath of sourceJSONPaths) { + const { name } = Path.parse(jsonPath); + const arrowPath = Path.resolve(__dirname, `../test/data/${source}/${format}/${name}.arrow`); + if (await exists(arrowPath)) { + jsonPaths.push(jsonPath); + arrowPaths.push(arrowPath); + } + } + return [jsonPaths, arrowPaths]; + } } diff --git a/js/bin/json-to-arrow.js b/js/bin/json-to-arrow.js index f28b4145ffaed..7a98d56d1a5e2 100755 --- a/js/bin/json-to-arrow.js +++ b/js/bin/json-to-arrow.js @@ -17,37 +17,46 @@ // specific language governing permissions and limitations // under the License. +// @ts-check + const fs = require('fs'); -const glob = require('glob'); -const path = require('path'); -const { promisify } = require('util'); +const Path = require('path'); const { parse } = require('json-bignum'); +const eos = require('util').promisify(require('stream').finished); +const extension = process.env.ARROW_JS_DEBUG === 'src' ? '.ts' : ''; const argv = require(`command-line-args`)(cliOpts(), { partial: true }); +const { RecordBatchReader, RecordBatchFileWriter, RecordBatchStreamWriter } = require(`../index${extension}`); -const ext = process.env.ARROW_JS_DEBUG === 'src' ? 
'.ts' : ''; -const { Table } = require(`../index${ext}`); - -const encoding = 'binary'; -const stream = argv.format === 'stream'; const jsonPaths = [...(argv.json || [])]; const arrowPaths = [...(argv.arrow || [])]; -if (!jsonPaths.length || !arrowPaths.length || (jsonPaths.length !== arrowPaths.length)) { - return print_usage(); -} +(async () => { -const readFile = callResolved(promisify(fs.readFile)); -const writeFile = callResolved(promisify(fs.writeFile)); + if (!jsonPaths.length || !arrowPaths.length || (jsonPaths.length !== arrowPaths.length)) { + return print_usage(); + } -(async () => await Promise.all(jsonPaths.map(async (jPath, i) => { - const aPath = arrowPaths[i]; - const arrowTable = Table.from(parse('' + (await readFile(jPath)))); - await writeFile(aPath, arrowTable.serialize(encoding, stream), encoding); -})))().catch((e) => { console.error(e); process.exit(1); }); + await Promise.all(jsonPaths.map(async (path, i) => { + + const RecordBatchWriter = argv.format !== 'stream' + ? RecordBatchFileWriter + : RecordBatchStreamWriter; -function callResolved(fn) { - return async (path_, ...xs) => await fn(path.resolve(path_), ...xs); -} + const reader = RecordBatchReader.from(parse( + await fs.promises.readFile(Path.resolve(path), 'utf8'))); + + const jsonToArrow = reader + .pipe(RecordBatchWriter.throughNode()) + .pipe(fs.createWriteStream(arrowPaths[i])); + + await eos(jsonToArrow); + + })); +})() +.then((x) => +x || 0, (e) => { + e && process.stderr.write(`${e}`); + return process.exitCode || 1; +}).then((code = 0) => process.exit(code)); function cliOpts() { return [ @@ -95,5 +104,5 @@ function print_usage() { ] }, ])); - process.exit(1); + return 1; } diff --git a/js/bin/print-buffer-alignment.js b/js/bin/print-buffer-alignment.js index a4cd9bb2351e7..8d422aad60d74 100755 --- a/js/bin/print-buffer-alignment.js +++ b/js/bin/print-buffer-alignment.js @@ -17,34 +17,41 @@ // specific language governing permissions and limitations // under the License. +// @ts-check + const fs = require('fs'); const path = require('path'); - -const ext = process.env.ARROW_JS_DEBUG === 'src' ? '.ts' : ''; -const base = process.env.ARROW_JS_DEBUG === 'src' ? '../src' : '../targets/apache-arrow'; -const { Message } = require(`${base}/ipc/metadata${ext}`); -const { readBuffersAsync } = require(`${base}/ipc/reader/binary${ext}`); -const { Table, VectorVisitor, fromReadableStream } = require(`../index${ext}`); +const extension = process.env.ARROW_JS_DEBUG === 'src' ? '.ts' : ''; +const { AsyncMessageReader } = require(`../index${extension}`); (async () => { - const in_ = process.argv.length < 3 - ? process.stdin : fs.createReadStream(path.resolve(process.argv[2])); - - let recordBatchIndex = 0; - let dictionaryBatchIndex = 0; - - for await (let { message, loader } of readBuffersAsync(fromReadableStream(in_))) { - - if (Message.isRecordBatch(message)) { - console.log(`record batch ${++recordBatchIndex}, offset ${loader.messageOffset}`); - } else if (Message.isDictionaryBatch(message)) { - message = message.data; - console.log(`dictionary batch ${++dictionaryBatchIndex}, offset ${loader.messageOffset}`); - } else { continue; } - - message.buffers.forEach(({offset, length}, i) => { - console.log(`\tbuffer ${i+1}: { offset: ${offset}, length: ${length} }`); + + const readable = process.argv.length < 3 ? 
process.stdin : fs.createReadStream(path.resolve(process.argv[2])); + const reader = new AsyncMessageReader(readable); + + let recordBatchIndex = 0, dictionaryBatchIndex = 0; + + for await (let message of reader) { + + let bufferRegions = []; + + if (message.isSchema()) { + continue; + } else if (message.isRecordBatch()) { + bufferRegions = message.header().buffers; + const body = await reader.readMessageBody(message.bodyLength); + console.log(`record batch ${++recordBatchIndex}, byteOffset ${body.byteOffset}`); + } else if (message.isDictionaryBatch()) { + bufferRegions = message.header().data.buffers; + const body = await reader.readMessageBody(message.bodyLength); + console.log(`dictionary batch ${++dictionaryBatchIndex}, byteOffset ${body.byteOffset}`); + } + + bufferRegions.forEach(({ offset, length }, i) => { + console.log(`\tbuffer ${i + 1}: { offset: ${offset}, length: ${length} }`); }); } + await reader.return(); + })().catch((e) => { console.error(e); process.exit(1); }); diff --git a/js/bin/stream-to-file.js b/js/bin/stream-to-file.js index f33646ac61a41..015a5eace74d8 100755 --- a/js/bin/stream-to-file.js +++ b/js/bin/stream-to-file.js @@ -17,21 +17,24 @@ // specific language governing permissions and limitations // under the License. +// @ts-check + const fs = require('fs'); const path = require('path'); - -const encoding = 'binary'; -const ext = process.env.ARROW_JS_DEBUG === 'src' ? '.ts' : ''; -const { util: { PipeIterator } } = require(`../index${ext}`); -const { Table, serializeFile, fromReadableStream } = require(`../index${ext}`); +const eos = require('util').promisify(require('stream').finished); +const extension = process.env.ARROW_JS_DEBUG === 'src' ? '.ts' : ''; +const { RecordBatchReader, RecordBatchFileWriter } = require(`../index${extension}`); (async () => { - // Todo (ptaylor): implement `serializeFileAsync` that accepts an - // AsyncIterable, rather than aggregating into a Table first - const in_ = process.argv.length < 3 - ? process.stdin : fs.createReadStream(path.resolve(process.argv[2])); - const out = process.argv.length < 4 - ? process.stdout : fs.createWriteStream(path.resolve(process.argv[3])); - new PipeIterator(serializeFile(await Table.fromAsync(fromReadableStream(in_))), encoding).pipe(out); + + const readable = process.argv.length < 3 ? process.stdin : fs.createReadStream(path.resolve(process.argv[2])); + const writable = process.argv.length < 4 ? process.stdout : fs.createWriteStream(path.resolve(process.argv[3])); + + const streamToFile = readable + .pipe(RecordBatchReader.throughNode()) + .pipe(RecordBatchFileWriter.throughNode()) + .pipe(writable); + + await eos(streamToFile); })().catch((e) => { console.error(e); process.exit(1); }); diff --git a/js/examples/read_file.html b/js/examples/read_file.html index 3e082d9dc412f..ec96d0e4755e2 100644 --- a/js/examples/read_file.html +++ b/js/examples/read_file.html @@ -86,6 +86,6 @@
- + diff --git a/js/gulp/argv.js b/js/gulp/argv.js index 7dceb0f74c587..3a028f813f936 100644 --- a/js/gulp/argv.js +++ b/js/gulp/argv.js @@ -21,16 +21,12 @@ const path = require('path'); const argv = require(`command-line-args`)([ { name: `all`, type: Boolean }, - { name: 'update', alias: 'u', type: Boolean }, - { name: 'verbose', alias: 'v', type: Boolean }, + { name: 'verbose', alias: `v`, type: Boolean }, { name: `target`, type: String, defaultValue: `` }, { name: `module`, type: String, defaultValue: `` }, { name: `coverage`, type: Boolean, defaultValue: false }, - { name: `integration`, alias: `i`, type: Boolean, defaultValue: false }, { name: `targets`, alias: `t`, type: String, multiple: true, defaultValue: [] }, { name: `modules`, alias: `m`, type: String, multiple: true, defaultValue: [] }, - { name: `json_files`, alias: `j`, type: String, multiple: true, defaultValue: [] }, - { name: `arrow_files`, alias: `a`, type: String, multiple: true, defaultValue: [] }, ], { partial: true }); const { targets, modules } = argv; @@ -44,25 +40,4 @@ if (argv.target === `src`) { (argv.all || !modules.length) && modules.push(`all`); } -if (argv.coverage && (!argv.json_files || !argv.json_files.length)) { - - let [jsonPaths, arrowPaths] = glob - .sync(path.resolve(__dirname, `../test/data/json/`, `*.json`)) - .reduce((paths, jsonPath) => { - const { name } = path.parse(jsonPath); - const [jsonPaths, arrowPaths] = paths; - ['cpp', 'java'].forEach((source) => ['file', 'stream'].forEach((format) => { - const arrowPath = path.resolve(__dirname, `../test/data/${source}/${format}/${name}.arrow`); - if (fs.existsSync(arrowPath)) { - jsonPaths.push(jsonPath); - arrowPaths.push(arrowPath); - } - })); - return paths; - }, [[], []]); - - argv.json_files = jsonPaths; - argv.arrow_files = arrowPaths; -} - module.exports = { argv, targets, modules }; diff --git a/js/gulp/arrow-task.js b/js/gulp/arrow-task.js index 95fc1eed0f84e..e119c540dc351 100644 --- a/js/gulp/arrow-task.js +++ b/js/gulp/arrow-task.js @@ -16,24 +16,22 @@ // under the License. 
const { - mainExport, gCCLanguageNames, targetDir, observableFromStreams } = require('./util'); const del = require('del'); const gulp = require('gulp'); -const path = require('path'); const { promisify } = require('util'); const gulpRename = require(`gulp-rename`); const { memoizeTask } = require('./memoize-task'); const exec = promisify(require('child_process').exec); const { Observable, ReplaySubject } = require('rxjs'); -const arrowTask = ((cache) => memoizeTask(cache, function copyMain(target, format) { +const arrowTask = ((cache) => memoizeTask(cache, function copyMain(target) { const out = targetDir(target); const dtsGlob = `${targetDir(`es2015`, `cjs`)}/**/*.ts`; const cjsGlob = `${targetDir(`es2015`, `cjs`)}/**/*.js`; - const esmGlob = `${targetDir(`es2015`, `esm`)}/**/*.js`; + const esmGlob = `${targetDir(`esnext`, `esm`)}/**/*.js`; const es5UmdGlob = `${targetDir(`es5`, `umd`)}/*.js`; const es5UmdMaps = `${targetDir(`es5`, `umd`)}/*.map`; const es2015UmdGlob = `${targetDir(`es2015`, `umd`)}/*.js`; @@ -46,7 +44,7 @@ const arrowTask = ((cache) => memoizeTask(cache, function copyMain(target, forma observableFromStreams(gulp.src(esmGlob), ch_ext(`.mjs`), gulp.dest(out)), // copy es2015 esm files and rename to `.mjs` observableFromStreams(gulp.src(es5UmdGlob), append(`.es5.min`), gulp.dest(out)), // copy es5 umd files and add `.min` observableFromStreams(gulp.src(es5UmdMaps), gulp.dest(out)), // copy es5 umd sourcemap files, but don't rename - observableFromStreams(gulp.src(es2015UmdGlob), append(`.es2015.min`), gulp.dest(out)), // copy es2015 umd files and add `.es6.min` + observableFromStreams(gulp.src(es2015UmdGlob), append(`.es2015.min`), gulp.dest(out)), // copy es2015 umd files and add `.es2015.min` observableFromStreams(gulp.src(es2015UmdMaps), gulp.dest(out)), // copy es2015 umd sourcemap files, but don't rename ).publish(new ReplaySubject()).refCount(); }))({}); @@ -61,4 +59,4 @@ const arrowTSTask = ((cache) => memoizeTask(cache, async function copyTS(target, module.exports = arrowTask; module.exports.arrowTask = arrowTask; -module.exports.arrowTSTask = arrowTSTask; \ No newline at end of file +module.exports.arrowTSTask = arrowTSTask; diff --git a/js/gulp/clean-task.js b/js/gulp/clean-task.js index d6c90f4637c8b..551aeb41af739 100644 --- a/js/gulp/clean-task.js +++ b/js/gulp/clean-task.js @@ -16,16 +16,15 @@ // under the License. 
const del = require('del'); +const { Observable } = require('rxjs'); const { targetDir } = require('./util'); -const { memoizeTask } = require('./memoize-task'); -const { Observable, ReplaySubject } = require('rxjs'); +const memoizeTask = require('./memoize-task'); const cleanTask = ((cache) => memoizeTask(cache, function clean(target, format) { - return Observable - .from(del(`${targetDir(target, format)}/**`)) - .catch((e) => Observable.empty()) - .multicast(new ReplaySubject()).refCount(); + const dir = targetDir(target, format); + return Observable.from(del(dir)) + .catch((e) => Observable.empty()); }))({}); module.exports = cleanTask; -module.exports.cleanTask = cleanTask; \ No newline at end of file +module.exports.cleanTask = cleanTask; diff --git a/js/gulp/closure-task.js b/js/gulp/closure-task.js index 547e760a7fa8a..ef629982ae39f 100644 --- a/js/gulp/closure-task.js +++ b/js/gulp/closure-task.js @@ -18,52 +18,83 @@ const { targetDir, mainExport, + esmRequire, gCCLanguageNames, - UMDSourceTargets, - observableFromStreams + publicModulePaths, + observableFromStreams, + shouldRunInChildProcess, + spawnGulpCommandInChildProcess, } = require('./util'); +const fs = require('fs'); const gulp = require('gulp'); const path = require('path'); const sourcemaps = require('gulp-sourcemaps'); const { memoizeTask } = require('./memoize-task'); const { compileBinFiles } = require('./typescript-task'); -const { Observable, ReplaySubject } = require('rxjs'); +const mkdirp = require('util').promisify(require('mkdirp')); const closureCompiler = require('google-closure-compiler').gulp(); -const closureTask = ((cache) => memoizeTask(cache, function closure(target, format) { +const closureTask = ((cache) => memoizeTask(cache, async function closure(target, format) { + + if (shouldRunInChildProcess(target, format)) { + return spawnGulpCommandInChildProcess('compile', target, format); + } + const src = targetDir(target, `cls`); + const srcAbsolute = path.resolve(src); const out = targetDir(target, format); - const entry = path.join(src, mainExport); - const externs = path.join(`src/Arrow.externs.js`); - return observableFromStreams( - gulp.src([ -/* external libs first --> */ `node_modules/tslib/package.json`, - `node_modules/tslib/tslib.es6.js`, - `node_modules/flatbuffers/package.json`, - `node_modules/flatbuffers/js/flatbuffers.mjs`, - `node_modules/text-encoding-utf-8/package.json`, - `node_modules/text-encoding-utf-8/src/encoding.js`, -/* then sources globs --> */ `${src}/**/*.js`, - ], { base: `./` }), - sourcemaps.init(), - closureCompiler(createClosureArgs(entry, externs)), - // rename the sourcemaps from *.js.map files to *.min.js.map - sourcemaps.write(`.`, { mapFile: (mapPath) => mapPath.replace(`.js.map`, `.${target}.min.js.map`) }), - gulp.dest(out) - ) - .merge(compileBinFiles(target, format)) - .takeLast(1) - .publish(new ReplaySubject()).refCount(); + const externs = path.join(`${out}/${mainExport}.externs.js`); + const entry_point = path.join(`${src}/${mainExport}.dom.cls.js`); + + const exportedImports = publicModulePaths(srcAbsolute).reduce((entries, publicModulePath) => [ + ...entries, { + publicModulePath, + exports_: getPublicExportedNames(esmRequire(publicModulePath, { warnings: false })) + } + ], []); + + await mkdirp(out); + + await Promise.all([ + fs.promises.writeFile(externs, generateExternsFile(exportedImports)), + fs.promises.writeFile(entry_point, generateUMDExportAssignnent(srcAbsolute, exportedImports)) + ]); + + return await Promise.all([ + 
runClosureCompileAsObservable().toPromise(), + compileBinFiles(target, format).toPromise() + ]); + + function runClosureCompileAsObservable() { + return observableFromStreams( + gulp.src([ + /* external libs first */ + `node_modules/flatbuffers/package.json`, + `node_modules/flatbuffers/js/flatbuffers.mjs`, + `node_modules/text-encoding-utf-8/package.json`, + `node_modules/text-encoding-utf-8/src/encoding.js`, + `${src}/**/*.js` /* <-- then source globs */ + ], { base: `./` }), + sourcemaps.init(), + closureCompiler(createClosureArgs(entry_point, externs)), + // rename the sourcemaps from *.js.map files to *.min.js.map + sourcemaps.write(`.`, { mapFile: (mapPath) => mapPath.replace(`.js.map`, `.${target}.min.js.map`) }), + gulp.dest(out) + ); + } }))({}); -const createClosureArgs = (entry, externs) => ({ +module.exports = closureTask; +module.exports.closureTask = closureTask; + +const createClosureArgs = (entry_point, externs) => ({ externs, + entry_point, third_party: true, warning_level: `QUIET`, dependency_mode: `STRICT`, rewrite_polyfills: false, - entry_point: `${entry}.js`, module_resolution: `NODE`, // formatting: `PRETTY_PRINT`, // debug: true, @@ -72,10 +103,99 @@ const createClosureArgs = (entry, externs) => ({ package_json_entry_names: `module,jsnext:main,main`, assume_function_wrapper: true, js_output_file: `${mainExport}.js`, - language_in: gCCLanguageNames[`es2015`], + language_in: gCCLanguageNames[`esnext`], language_out: gCCLanguageNames[`es5`], - output_wrapper: -`// Licensed to the Apache Software Foundation (ASF) under one + output_wrapper:`${apacheHeader()} +(function (global, factory) { + typeof exports === 'object' && typeof module !== 'undefined' ? factory(exports) : + typeof define === 'function' && define.amd ? define(['Arrow'], factory) : + (factory(global.Arrow = global.Arrow || {})); +}(this, (function (exports) {%output%}.bind(this))));` +}); + +function generateUMDExportAssignnent(src, exportedImports) { + return [ + ...exportedImports.map(({ publicModulePath }, i) => { + const p = publicModulePath.slice(src.length + 1); + return (`import * as exports${i} from './${p}';`); + }).filter(Boolean), + 'Object.assign(arguments[0], exports0);' + ].join('\n'); +} + +function generateExternsFile(exportedImports) { + return [ + externsHeader(), + ...exportedImports.reduce((externBodies, { exports_ }) => [ + ...externBodies, ...exports_.map(externBody) + ], []).filter(Boolean) + ].join('\n'); +} + +function externBody({ exportName, staticNames, instanceNames }) { + return [ + `var ${exportName} = function() {};`, + staticNames.map((staticName) => (isNaN(+staticName) + ? `/** @type {?} */\n${exportName}.${staticName} = function() {};` + : `/** @type {?} */\n${exportName}[${staticName}] = function() {};` + )).join('\n'), + instanceNames.map((instanceName) => (isNaN(+instanceName) + ? 
`/** @type {?} */\n${exportName}.prototype.${instanceName};` + : `/** @type {?} */\n${exportName}.prototype[${instanceName}];` + )).join('\n') + ].filter(Boolean).join('\n'); +} + +function externsHeader() { + return (`${apacheHeader()} +// @ts-nocheck +/* tslint:disable */ +/** + * @fileoverview Closure Compiler externs for Arrow + * @externs + * @suppress {duplicate,checkTypes} + */ +/** @type {symbol} */ +Symbol.iterator; +/** @type {symbol} */ +Symbol.toPrimitive; +/** @type {symbol} */ +Symbol.asyncIterator; +`); +} + +function getPublicExportedNames(entryModule) { + const fn = function() {}; + const isStaticOrProtoName = (x) => ( + !(x in fn) && + (x !== `default`) && + (x !== `undefined`) && + (x !== `__esModule`) && + (x !== `constructor`) && + !(x.startsWith('_')) + ); + return Object + .getOwnPropertyNames(entryModule) + .filter((name) => name !== 'default') + .filter((name) => ( + typeof entryModule[name] === `object` || + typeof entryModule[name] === `function` + )) + .map((name) => [name, entryModule[name]]) + .reduce((reserved, [name, value]) => { + + const staticNames = value && + typeof value === 'object' ? Object.getOwnPropertyNames(value).filter(isStaticOrProtoName) : + typeof value === 'function' ? Object.getOwnPropertyNames(value).filter(isStaticOrProtoName) : []; + + const instanceNames = (typeof value === `function` && Object.getOwnPropertyNames(value.prototype || {}) || []).filter(isStaticOrProtoName); + + return [...reserved, { exportName: name, staticNames, instanceNames }]; + }, []); +} + +function apacheHeader() { + return `// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file @@ -90,13 +210,5 @@ const createClosureArgs = (entry, externs) => ({ // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations -// under the License. -(function (global, factory) { - typeof exports === 'object' && typeof module !== 'undefined' ? factory(exports) : - typeof define === 'function' && define.amd ? define(['exports'], factory) : - (factory(global.Arrow = global.Arrow || {})); -}(this, (function (exports) {%output%}.bind(this))));` -}); - -module.exports = closureTask; -module.exports.closureTask = closureTask; +// under the License.` +} diff --git a/js/gulp/build-task.js b/js/gulp/compile-task.js similarity index 90% rename from js/gulp/build-task.js rename to js/gulp/compile-task.js index 9f3402cdd3508..60e2ebbe36a93 100644 --- a/js/gulp/build-task.js +++ b/js/gulp/compile-task.js @@ -24,7 +24,7 @@ const closureTask = require('./closure-task'); const typescriptTask = require('./typescript-task'); const { arrowTask, arrowTSTask } = require('./arrow-task'); -const buildTask = ((cache) => memoizeTask(cache, function build(target, format, ...args) { +const compileTask = ((cache) => memoizeTask(cache, function compile(target, format, ...args) { return target === `src` ? Observable.empty() : target === npmPkgName ? arrowTask(target, format, ...args)() : target === `ts` ? 
arrowTSTask(target, format, ...args)() @@ -33,5 +33,5 @@ const buildTask = ((cache) => memoizeTask(cache, function build(target, format, : typescriptTask(target, format, ...args)(); }))({}); -module.exports = buildTask; -module.exports.buildTask = buildTask; +module.exports = compileTask; +module.exports.compileTask = compileTask; diff --git a/js/gulp/memoize-task.js b/js/gulp/memoize-task.js index 0b0fc843c451a..408ee3b8839db 100644 --- a/js/gulp/memoize-task.js +++ b/js/gulp/memoize-task.js @@ -17,6 +17,13 @@ const { taskName } = require('./util'); +const createTask = ((taskFn) => ((target, format, ...args) => { + // Give the memoized fn a displayName so gulp's output is easier to follow. + const fn = () => taskFn(target, format, ...args); + fn.displayName = `${taskFn.name || ``}:${taskName(target, format, ...args)}:task`; + return fn; +})); + const memoizeTask = ((cache, taskFn) => ((target, format, ...args) => { // Give the memoized fn a displayName so gulp's output is easier to follow. const fn = () => ( @@ -27,4 +34,5 @@ const memoizeTask = ((cache, taskFn) => ((target, format, ...args) => { })); module.exports = memoizeTask; -module.exports.memoizeTask = memoizeTask; \ No newline at end of file +module.exports.createTask = createTask; +module.exports.memoizeTask = memoizeTask; diff --git a/js/gulp/minify-task.js b/js/gulp/minify-task.js index 82145aa90861a..81cb5e5f3f536 100644 --- a/js/gulp/minify-task.js +++ b/js/gulp/minify-task.js @@ -18,10 +18,10 @@ const { targetDir, mainExport, - ESKeywords, UMDSourceTargets, terserLanguageNames, - observableFromStreams + shouldRunInChildProcess, + spawnGulpCommandInChildProcess, } = require('./util'); const path = require('path'); @@ -30,41 +30,24 @@ const { memoizeTask } = require('./memoize-task'); const { compileBinFiles } = require('./typescript-task'); const { Observable, ReplaySubject } = require('rxjs'); const TerserPlugin = require(`terser-webpack-plugin`); -const esmRequire = require(`@std/esm`)(module, { - mode: `js`, - warnings: false, - cjs: { - /* A boolean for storing ES modules in require.cache. */ - cache: true, - /* A boolean for respecting require.extensions in ESM. */ - extensions: true, - /* A boolean for __esModule interoperability. */ - interop: true, - /* A boolean for importing named exports of CJS modules. */ - namedExports: true, - /* A boolean for following CJS path rules in ESM. */ - paths: true, - /* A boolean for __dirname, __filename, and require in ESM. 
*/ - vars: true, - } -}); const minifyTask = ((cache, commonConfig) => memoizeTask(cache, function minifyJS(target, format) { + if (shouldRunInChildProcess(target, format)) { + return spawnGulpCommandInChildProcess('compile', target, format); + } + const sourceTarget = UMDSourceTargets[target]; - const PublicNames = reservePublicNames(sourceTarget, `cls`); const out = targetDir(target, format), src = targetDir(sourceTarget, `cls`); const targetConfig = { ...commonConfig, output: { ...commonConfig.output, path: path.resolve(`./${out}`) } }; - const webpackConfigs = [ - [mainExport, PublicNames] - ].map(([entry, reserved]) => ({ + const webpackConfigs = [mainExport].map((entry) => ({ ...targetConfig, name: entry, - entry: { [entry]: path.resolve(`${src}/${entry}.js`) }, + entry: { [entry]: path.resolve(`${src}/${entry}.dom.js`) }, plugins: [ ...(targetConfig.plugins || []), new webpack.SourceMapDevToolPlugin({ @@ -73,20 +56,23 @@ const minifyTask = ((cache, commonConfig) => memoizeTask(cache, function minifyJ resourcePath .replace(/\s/, `_`) .replace(/\.\/node_modules\//, ``) - }), - new TerserPlugin({ - sourceMap: true, - terserOptions: { - ecma: terserLanguageNames[target], - compress: { unsafe: true }, - output: { comments: false, beautify: false }, - mangle: { eval: true, - properties: { reserved, keep_quoted: true } - }, - safari10: true // <-- works around safari10 bugs, see the "safari10" option here: https://github.com/terser-js/terser#minify-options - }, }) - ] + ], + optimization: { + minimize: true, + minimizer: [ + new TerserPlugin({ + sourceMap: true, + terserOptions: { + ecma: terserLanguageNames[target], + output: { comments: false, beautify: false }, + compress: { unsafe: true }, + mangle: true, + safari10: true // <-- works around safari10 bugs, see the "safari10" option here: https://github.com/terser-js/terser#minify-options + }, + }) + ] + } })); const compilers = webpack(webpackConfigs); @@ -102,42 +88,3 @@ const minifyTask = ((cache, commonConfig) => memoizeTask(cache, function minifyJ module.exports = minifyTask; module.exports.minifyTask = minifyTask; - -const reservePublicNames = ((ESKeywords) => function reservePublicNames(target, format) { - const src = targetDir(target, format); - const publicModulePaths = [ - `../${src}/data.js`, - `../${src}/type.js`, - `../${src}/table.js`, - `../${src}/vector.js`, - `../${src}/util/int.js`, - `../${src}/predicate.js`, - `../${src}/recordbatch.js`, - `../${src}/${mainExport}.js`, - ]; - return publicModulePaths.reduce((keywords, publicModulePath) => [ - ...keywords, ...reserveExportedNames(esmRequire(publicModulePath, { warnings: false })) - ], [...ESKeywords]); -})(ESKeywords); - -// Reflect on the Arrow modules to come up with a list of keys to save from -// Terser's -// mangler. Assume all the non-inherited static and prototype members of the Arrow -// module and its direct exports are public, and should be preserved through minification. 
-const reserveExportedNames = (entryModule) => ( - Object - .getOwnPropertyNames(entryModule) - .filter((name) => ( - typeof entryModule[name] === `object` || - typeof entryModule[name] === `function` - )) - .map((name) => [name, entryModule[name]]) - .reduce((reserved, [name, value]) => { - const fn = function() {}; - const ownKeys = value && typeof value === 'object' && Object.getOwnPropertyNames(value) || []; - const protoKeys = typeof value === `function` && Object.getOwnPropertyNames(value.prototype || {}) || []; - const publicNames = [...ownKeys, ...protoKeys].filter((x) => x !== `default` && x !== `undefined` && !(x in fn)); - return [...reserved, name, ...publicNames]; - }, [] - ) -); diff --git a/js/gulp/package-task.js b/js/gulp/package-task.js index 8c0f8fb0e4767..2a67c812206ce 100644 --- a/js/gulp/package-task.js +++ b/js/gulp/package-task.js @@ -46,17 +46,19 @@ const createMainPackageJson = (target, format) => (orig) => ({ ...createTypeScriptPackageJson(target, format)(orig), bin: orig.bin, name: npmPkgName, - main: mainExport, - types: `${mainExport}.d.ts`, - module: `${mainExport}.mjs`, + main: `${mainExport}.node`, + browser: `${mainExport}.dom`, + types: `${mainExport}.node.d.ts`, unpkg: `${mainExport}.es5.min.js`, - [`@std/esm`]: { mode: `all`, warnings: false, sourceMap: true } + [`esm`]: { mode: `all`, sourceMap: true } }); const createTypeScriptPackageJson = (target, format) => (orig) => ({ ...createScopedPackageJSON(target, format)(orig), - main: `${mainExport}.ts`, types: `${mainExport}.ts`, bin: undefined, + main: `${mainExport}.node.ts`, + types: `${mainExport}.node.ts`, + browser: `${mainExport}.dom.ts`, dependencies: { '@types/flatbuffers': '*', '@types/node': '*', @@ -70,8 +72,10 @@ const createScopedPackageJSON = (target, format) => (({ name, ...orig }) => (xs, key) => ({ ...xs, [key]: xs[key] || orig[key] }), { name: `${npmOrgName}/${packageName(target, format)}`, - version: undefined, main: `${mainExport}.js`, types: `${mainExport}.d.ts`, - unpkg: undefined, module: undefined, [`@std/esm`]: undefined + browser: format === 'umd' ? undefined : `${mainExport}.dom`, + main: format === 'umd' ? `${mainExport}` : `${mainExport}.node`, + types: format === 'umd' ? undefined : `${mainExport}.node.d.ts`, + version: undefined, unpkg: undefined, module: undefined, [`esm`]: undefined, } ) ) @@ -80,6 +84,5 @@ const createScopedPackageJSON = (target, format) => (({ name, ...orig }) => const conditionallyAddStandardESMEntry = (target, format) => (packageJSON) => ( format !== `esm` && format !== `cls` ? 
packageJSON - : { ...packageJSON, [`@std/esm`]: { mode: `js`, warnings: false, sourceMap: true } } + : { ...packageJSON, [`esm`]: { mode: `auto`, sourceMap: true } } ); - \ No newline at end of file diff --git a/js/gulp/test-task.js b/js/gulp/test-task.js index b0e34f8c94426..c7ad7d513c652 100644 --- a/js/gulp/test-task.js +++ b/js/gulp/test-task.js @@ -20,44 +20,47 @@ const path = require('path'); const { argv } = require('./argv'); const { promisify } = require('util'); const glob = promisify(require('glob')); -const stat = promisify(require('fs').stat); const mkdirp = promisify(require('mkdirp')); const rimraf = promisify(require('rimraf')); const child_process = require(`child_process`); const { memoizeTask } = require('./memoize-task'); const readFile = promisify(require('fs').readFile); +const asyncDone = promisify(require('async-done')); const exec = promisify(require('child_process').exec); const parseXML = promisify(require('xml2js').parseString); const jestArgv = []; -argv.update && jestArgv.push(`-u`); argv.verbose && jestArgv.push(`--verbose`); -argv.coverage && jestArgv.push(`--coverage`); +argv.coverage + ? jestArgv.push(`-c`, `jest.coverage.config.js`, `--coverage`) + : jestArgv.push(`-c`, `jest.config.js`, `-i`) -const debugArgv = [`--runInBand`, `--env`, `node-debug`]; -const jest = require.resolve(path.join(`..`, `node_modules`, `.bin`, `jest`)); +const jest = path.join(path.parse(require.resolve(`jest`)).dir, `../bin/jest.js`); const testOptions = { - env: { ...process.env }, stdio: [`ignore`, `inherit`, `inherit`], + env: { + ...process.env, + // hide fs.promises/stream[Symbol.asyncIterator] warnings + NODE_NO_WARNINGS: `1`, + // prevent the user-land `readable-stream` module from + // patching node's streams -- they're better now + READABLE_STREAM: `disable` + }, }; -const testTask = ((cache, execArgv, testOptions) => memoizeTask(cache, function test(target, format, debug = false) { +const testTask = ((cache, execArgv, testOptions) => memoizeTask(cache, function test(target, format) { const opts = { ...testOptions }; - const args = !debug ? [...execArgv] : [...debugArgv, ...execArgv]; - if (!argv.coverage) { - args.push(`test/${argv.integration ? `integration/*` : `unit/*`}`); - } - opts.env = { ...opts.env, + const args = [...execArgv, `test/unit/`]; + opts.env = { + ...opts.env, TEST_TARGET: target, TEST_MODULE: format, - TEST_TS_SOURCE: !!argv.coverage || (target === 'src') || (opts.env.TEST_TS_SOURCE === 'true'), - JSON_PATHS: JSON.stringify(Array.isArray(argv.json_files) ? argv.json_files : [argv.json_files]), - ARROW_PATHS: JSON.stringify(Array.isArray(argv.arrow_files) ? argv.arrow_files : [argv.arrow_files]), + TEST_DOM_STREAMS: (target ==='src' || format === 'umd').toString(), + TEST_NODE_STREAMS: (target ==='src' || format !== 'umd').toString(), + TEST_TS_SOURCE: !!argv.coverage || (target === 'src') || (opts.env.TEST_TS_SOURCE === 'true') }; - return !debug ? 
- child_process.spawn(jest, args, opts) : - child_process.exec(`node --inspect-brk ${jest} ${args.join(` `)}`, opts); -}))({}, jestArgv, testOptions); + return asyncDone(() => child_process.spawn(`node`, args, opts)); +}))({}, [jest, ...jestArgv], testOptions); module.exports = testTask; module.exports.testTask = testTask; @@ -69,9 +72,9 @@ const ARROW_HOME = process.env.ARROW_HOME || path.resolve('../'); const ARROW_JAVA_DIR = process.env.ARROW_JAVA_DIR || path.join(ARROW_HOME, 'java'); const CPP_EXE_PATH = process.env.ARROW_CPP_EXE_PATH || path.join(ARROW_HOME, 'cpp/build/debug'); const ARROW_INTEGRATION_DIR = process.env.ARROW_INTEGRATION_DIR || path.join(ARROW_HOME, 'integration'); -const CPP_JSON_TO_ARROW = path.join(CPP_EXE_PATH, 'json-integration-test'); -const CPP_STREAM_TO_FILE = path.join(CPP_EXE_PATH, 'stream-to-file'); -const CPP_FILE_TO_STREAM = path.join(CPP_EXE_PATH, 'file-to-stream'); +const CPP_JSON_TO_ARROW = path.join(CPP_EXE_PATH, 'arrow-json-integration-test'); +const CPP_STREAM_TO_FILE = path.join(CPP_EXE_PATH, 'arrow-stream-to-file'); +const CPP_FILE_TO_STREAM = path.join(CPP_EXE_PATH, 'arrow-file-to-stream'); const testFilesDir = path.join(ARROW_HOME, 'js/test/data'); const snapshotsDir = path.join(ARROW_HOME, 'js/test/__snapshots__'); diff --git a/js/gulp/typescript-task.js b/js/gulp/typescript-task.js index beffab8a08ce0..fe694cac860b3 100644 --- a/js/gulp/typescript-task.js +++ b/js/gulp/typescript-task.js @@ -16,19 +16,26 @@ // under the License. const { - targetDir, tsconfigName, observableFromStreams + targetDir, + tsconfigName, + observableFromStreams, + shouldRunInChildProcess, + spawnGulpCommandInChildProcess, } = require('./util'); -const del = require('del'); const gulp = require('gulp'); const path = require('path'); const ts = require(`gulp-typescript`); -const gulpRename = require(`gulp-rename`); const sourcemaps = require('gulp-sourcemaps'); const { memoizeTask } = require('./memoize-task'); const { Observable, ReplaySubject } = require('rxjs'); const typescriptTask = ((cache) => memoizeTask(cache, function typescript(target, format) { + + if (shouldRunInChildProcess(target, format)) { + return spawnGulpCommandInChildProcess('compile', target, format); + } + const out = targetDir(target, format); const tsconfigPath = path.join(`tsconfig`, `tsconfig.${tsconfigName(target, format)}.json`); return compileTypescript(out, tsconfigPath) @@ -39,11 +46,11 @@ const typescriptTask = ((cache) => memoizeTask(cache, function typescript(target function compileBinFiles(target, format) { const out = targetDir(target, format); const tsconfigPath = path.join(`tsconfig`, `tsconfig.${tsconfigName('bin', 'cjs')}.json`); - return compileTypescript(path.join(out, 'bin'), tsconfigPath); + return compileTypescript(path.join(out, 'bin'), tsconfigPath, { target }); } -function compileTypescript(out, tsconfigPath) { - const tsProject = ts.createProject(tsconfigPath, { typescript: require(`typescript`) }); +function compileTypescript(out, tsconfigPath, tsconfigOverrides) { + const tsProject = ts.createProject(tsconfigPath, { typescript: require(`typescript`), ...tsconfigOverrides }); const { stream: { js, dts } } = observableFromStreams( tsProject.src(), sourcemaps.init(), tsProject(ts.reporter.defaultReporter()) diff --git a/js/gulp/util.js b/js/gulp/util.js index 12d21b0e16be2..bd87684a1dc3d 100644 --- a/js/gulp/util.js +++ b/js/gulp/util.js @@ -17,8 +17,11 @@ const fs = require('fs'); const path = require(`path`); -const pump = require(`pump`); +const pump = 
require(`stream`).pipeline; +const child_process = require(`child_process`); +const { targets, modules } = require('./argv'); const { Observable, ReplaySubject } = require('rxjs'); +const asyncDone = require('util').promisify(require('async-done')); const mainExport = `Arrow`; const npmPkgName = `apache-arrow`; @@ -29,7 +32,7 @@ const knownTargets = [`es5`, `es2015`, `esnext`]; const knownModules = [`cjs`, `esm`, `cls`, `umd`]; const tasksToSkipPerTargetOrFormat = { src: { clean: true, build: true }, - cls: { test: true, integration: true } + cls: { test: true, package: true } }; const packageJSONFields = [ `version`, `license`, `description`, @@ -66,7 +69,7 @@ const UMDSourceTargets = { es2015: `es2015`, es2016: `es2015`, es2017: `es2015`, - esnext: `es2015` + esnext: `esnext` }; const terserLanguageNames = { @@ -109,12 +112,27 @@ function targetDir(target, format) { return path.join(releasesRootDir, ...(!format ? [target] : [target, format])); } -function logAndDie(e) { - if (e) { - process.exit(1); - } +function shouldRunInChildProcess(target, format) { + // If we're building more than one module/target, then yes run this task in a child process + if (targets.length > 1 || modules.length > 1) { return true; } + // If the target we're building *isn't* the target the gulp command was configured to run, then yes run that in a child process + if (targets[0] !== target || modules[0] !== format) { return true; } + // Otherwise no need -- either gulp was run for just one target, or we've been spawned as the child of a multi-target parent gulp + return false; +} + +const gulp = path.join(path.parse(require.resolve(`gulp`)).dir, `bin/gulp.js`); +function spawnGulpCommandInChildProcess(command, target, format) { + const args = [gulp, command, '-t', target, '-m', format, `--silent`]; + const opts = { + stdio: [`ignore`, `inherit`, `inherit`], + env: { ...process.env, NODE_NO_WARNINGS: `1` } + }; + return asyncDone(() => child_process.spawn(`node`, args, opts)) + .catch((e) => { throw { message: `${command}:${taskName(target, format)}` }; }); } +const logAndDie = (e) => { if (e) { process.exit(1); } }; function observableFromStreams(...streams) { if (streams.length <= 0) { return Observable.empty(); } const pumped = streams.length <= 1 ? streams[0] : pump(...streams, logAndDie); @@ -164,12 +182,37 @@ function* combinations(_targets, _modules) { } } +const publicModulePaths = (dir) => [ + `${dir}/${mainExport}.dom.js`, + `${dir}/util/int.js`, + `${dir}/compute/predicate.js`, +]; + +const esmRequire = require(`esm`)(module, { + mode: `auto`, + cjs: { + /* A boolean for storing ES modules in require.cache. */ + cache: true, + /* A boolean for respecting require.extensions in ESM. */ + extensions: true, + /* A boolean for __esModule interoperability. */ + interop: true, + /* A boolean for importing named exports of CJS modules. */ + namedExports: true, + /* A boolean for following CJS path rules in ESM. */ + paths: true, + /* A boolean for __dirname, __filename, and require in ESM. 
*/ + vars: true, + } +}); + module.exports = { mainExport, npmPkgName, npmOrgName, metadataFiles, packageJSONFields, knownTargets, knownModules, tasksToSkipPerTargetOrFormat, - ESKeywords, gCCLanguageNames, UMDSourceTargets, terserLanguageNames, + gCCLanguageNames, UMDSourceTargets, terserLanguageNames, taskName, packageName, tsconfigName, targetDir, combinations, observableFromStreams, + ESKeywords, publicModulePaths, esmRequire, shouldRunInChildProcess, spawnGulpCommandInChildProcess }; diff --git a/js/gulpfile.js b/js/gulpfile.js index 78aaa17ddb8b4..37c1d187995d2 100644 --- a/js/gulpfile.js +++ b/js/gulpfile.js @@ -17,17 +17,15 @@ const del = require('del'); const gulp = require('gulp'); -const path = require('path'); const { Observable } = require('rxjs'); -const buildTask = require('./gulp/build-task'); const cleanTask = require('./gulp/clean-task'); +const compileTask = require('./gulp/compile-task'); const packageTask = require('./gulp/package-task'); const { targets, modules } = require('./gulp/argv'); const { testTask, createTestData, cleanTestData } = require('./gulp/test-task'); const { - targetDir, taskName, combinations, - knownTargets, + targetDir, knownTargets, npmPkgName, UMDSourceTargets, tasksToSkipPerTargetOrFormat } = require('./gulp/util'); @@ -36,63 +34,60 @@ for (const [target, format] of combinations([`all`], [`all`])) { const task = taskName(target, format); gulp.task(`clean:${task}`, cleanTask(target, format)); gulp.task( `test:${task}`, testTask(target, format)); - gulp.task(`debug:${task}`, testTask(target, format, true)); - gulp.task(`build:${task}`, gulp.series(`clean:${task}`, - buildTask(target, format), - packageTask(target, format))); + gulp.task(`compile:${task}`, compileTask(target, format)); + gulp.task(`package:${task}`, packageTask(target, format)); + gulp.task(`build:${task}`, gulp.series( + `clean:${task}`, `compile:${task}`, `package:${task}` + )); } // The UMD bundles build temporary es5/6/next targets via TS, // then run the TS source through either closure-compiler or // a minifier, so we special case that here. 
-knownTargets.forEach((target) => - gulp.task(`build:${target}:umd`, - gulp.series( - gulp.parallel( - cleanTask(target, `umd`), - cleanTask(UMDSourceTargets[target], `cls`) - ), - buildTask(UMDSourceTargets[target], `cls`), - buildTask(target, `umd`), packageTask(target, `umd`) - ) - ) -); +knownTargets.forEach((target) => { + const umd = taskName(target, `umd`); + const cls = taskName(UMDSourceTargets[target], `cls`); + gulp.task(`build:${umd}`, gulp.series( + `build:${cls}`, + `clean:${umd}`, `compile:${umd}`, `package:${umd}`, + function remove_closure_tmp_files() { + return del(targetDir(target, `cls`)) + } + )); +}); // The main "apache-arrow" module builds the es5/umd, es2015/cjs, // es2015/esm, and es2015/umd targets, then copies and renames the // compiled output into the apache-arrow folder gulp.task(`build:${npmPkgName}`, gulp.series( - cleanTask(npmPkgName), gulp.parallel( `build:${taskName(`es5`, `umd`)}`, `build:${taskName(`es2015`, `cjs`)}`, `build:${taskName(`es2015`, `esm`)}`, `build:${taskName(`es2015`, `umd`)}` ), - buildTask(npmPkgName), packageTask(npmPkgName) + `clean:${npmPkgName}`, + `compile:${npmPkgName}`, + `package:${npmPkgName}` ) ); - -function gulpConcurrent(tasks) { - return () => Observable.bindCallback((tasks, cb) => gulp.parallel(tasks)(cb))(tasks); -} - -const buildConcurrent = (tasks) => () => - gulpConcurrent(tasks)() - .concat(Observable - .defer(() => Observable - .merge(...knownTargets.map((target) => - del(`${targetDir(target, `cls`)}/**`))))); - +// And finally the global composite tasks gulp.task(`clean:testdata`, cleanTestData); gulp.task(`create:testdata`, createTestData); -gulp.task(`test`, gulp.series(getTasks(`test`))); -gulp.task(`debug`, gulp.series(getTasks(`debug`))); +gulp.task(`test`, gulpConcurrent(getTasks(`test`))); gulp.task(`clean`, gulp.parallel(getTasks(`clean`))); -gulp.task(`build`, buildConcurrent(getTasks(`build`))); -gulp.task(`default`, gulp.series(`build`, `test`)); +gulp.task(`build`, gulpConcurrent(getTasks(`build`))); +gulp.task(`compile`, gulpConcurrent(getTasks(`compile`))); +gulp.task(`package`, gulpConcurrent(getTasks(`package`))); +gulp.task(`default`, gulp.series(`clean`, `build`, `test`)); + +function gulpConcurrent(tasks) { + const numCPUs = Math.max(1, require('os').cpus().length * 0.75) | 0; + return () => Observable.from(tasks.map((task) => gulp.series(task))) + .flatMap((task) => Observable.bindNodeCallback(task)(), numCPUs); +} function getTasks(name) { const tasks = []; diff --git a/js/index.ts b/js/index.ts index 51b8676abbd9d..cfd64bbbe9730 100644 --- a/js/index.ts +++ b/js/index.ts @@ -15,4 +15,4 @@ // specific language governing permissions and limitations // under the License. -export * from './src/Arrow'; \ No newline at end of file +export * from './src/Arrow.node'; \ No newline at end of file diff --git a/js/jest.config.js b/js/jest.config.js new file mode 100644 index 0000000000000..55028d09f969e --- /dev/null +++ b/js/jest.config.js @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +module.exports = { + "verbose": false, + "reporters": [ + "jest-silent-reporter" + ], + "testEnvironment": "node", + "globals": { + "ts-jest": { + "diagnostics": false, + "tsConfig": "test/tsconfig.json" + } + }, + "roots": [ + "/test/" + ], + "moduleFileExtensions": [ + "js", + "ts", + "tsx" + ], + "coverageReporters": [ + "lcov" + ], + "coveragePathIgnorePatterns": [ + "fb\\/(File|Message|Schema|Tensor)\\.(js|ts)$", + "test\\/.*\\.(ts|tsx|js)$", + "/node_modules/" + ], + "transform": { + "^.+\\.jsx?$": "ts-jest", + "^.+\\.tsx?$": "ts-jest" + }, + "transformIgnorePatterns": [ + "/node_modules/(?!web-stream-tools).+\\.js$" + ], + "testRegex": "(.*(-|\\.)(test|spec)s?)\\.(ts|tsx|js)$", + "preset": "ts-jest", + "testMatch": null +}; diff --git a/js/jest.coverage.config.js b/js/jest.coverage.config.js new file mode 100644 index 0000000000000..72ddd3c9345a0 --- /dev/null +++ b/js/jest.coverage.config.js @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +module.exports = { + ...require('./jest.config'), + "reporters": undefined, + "coverageReporters": [ + "lcov", "json" + ], + "globals": { + "ts-jest": { + "diagnostics": false, + "tsConfig": "test/tsconfig.coverage.json" + } + } +}; diff --git a/js/npm-release.sh b/js/npm-release.sh index 3ef24d3e6f828..a52e25ed7884a 100755 --- a/js/npm-release.sh +++ b/js/npm-release.sh @@ -20,11 +20,7 @@ set -e # validate the targets pass all tests before publishing npm install -# npx run-s clean:all lint create:testdata build -# npm run test -- -t ts -u --integration -# npm run test -- --integration -npx run-s clean:all lint build -npm run test +npx gulp # publish the JS target modules to npm npx lerna exec -- npm publish diff --git a/js/package-lock.json b/js/package-lock.json index ef38db9a7468d..3b31a6dff2085 100644 --- a/js/package-lock.json +++ b/js/package-lock.json @@ -64,57 +64,56 @@ } }, "@lerna/add": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/add/-/add-3.5.0.tgz", - "integrity": "sha512-hoOqtal/ChEEtt9rxR/6xmyvTN7581XF4kWHoWPV9NbfZN9e8uTR8z4mCcJq2DiZhRuY7aA5FEROEbl12soowQ==", + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/@lerna/add/-/add-3.7.2.tgz", + "integrity": "sha512-/kCuyytOEmYcqpbU8MhHc2/3bPJjEx+qq7SOdb0cCDG+QcJ/oSsDCZ3xVHxhyLRYAoRlKBch3DiBmY4BeIm0Ag==", "dev": true, "requires": { - "@lerna/bootstrap": "^3.5.0", - "@lerna/command": "^3.5.0", - "@lerna/filter-options": "^3.5.0", - "@lerna/npm-conf": "^3.4.1", - "@lerna/validation-error": "^3.0.0", + "@lerna/bootstrap": "^3.7.2", + "@lerna/command": "^3.7.2", + "@lerna/filter-options": "^3.6.0", + "@lerna/npm-conf": "^3.7.0", + "@lerna/validation-error": "^3.6.0", "dedent": "^0.7.0", - "npm-package-arg": "^6.0.0", + "libnpm": "^2.0.1", "p-map": "^1.2.0", - "pacote": "^9.1.0", "semver": "^5.5.0" } }, "@lerna/batch-packages": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/@lerna/batch-packages/-/batch-packages-3.1.2.tgz", - "integrity": "sha512-HAkpptrYeUVlBYbLScXgeCgk6BsNVXxDd53HVWgzzTWpXV4MHpbpeKrByyt7viXlNhW0w73jJbipb/QlFsHIhQ==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/batch-packages/-/batch-packages-3.6.0.tgz", + "integrity": "sha512-khG15B+EFLH3Oms6A6WsMAy54DrnKIhEAm6CCATN2BKnBkNgitYjLN2vKBzlR2LfQpTkgub67QKIJkMFQcK1Sg==", "dev": true, "requires": { - "@lerna/package-graph": "^3.1.2", - "@lerna/validation-error": "^3.0.0", - "npmlog": "^4.1.2" + "@lerna/package-graph": "^3.6.0", + "@lerna/validation-error": "^3.6.0", + "libnpm": "^2.0.1" } }, "@lerna/bootstrap": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/bootstrap/-/bootstrap-3.5.0.tgz", - "integrity": "sha512-+z4kVVJFO5EGfC2ob/4C9LetqWwDtbhZgTRllr1+zOi/2clbD+WKcVI0ku+/ckzKjz783SOc83swX7RrmiLwMQ==", + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/@lerna/bootstrap/-/bootstrap-3.7.2.tgz", + "integrity": "sha512-yVjr450UivC7gbIh3GZowJ6bzPy/xC75bduq2Zm+jdIksjM/8SA3HRXWNothaSyZWudV+WY+cy6MvwrtFe8Kbg==", "dev": true, "requires": { - "@lerna/batch-packages": "^3.1.2", - "@lerna/command": "^3.5.0", - "@lerna/filter-options": "^3.5.0", + "@lerna/batch-packages": "^3.6.0", + "@lerna/command": "^3.7.2", + "@lerna/filter-options": "^3.6.0", "@lerna/has-npm-version": "^3.3.0", - "@lerna/npm-conf": "^3.4.1", - "@lerna/npm-install": "^3.3.0", - "@lerna/rimraf-dir": "^3.3.0", - "@lerna/run-lifecycle": "^3.4.1", + "@lerna/npm-install": "^3.6.0", + "@lerna/package-graph": "^3.6.0", + "@lerna/pulse-till-done": "^3.7.1", + "@lerna/rimraf-dir": "^3.6.0", + 
"@lerna/run-lifecycle": "^3.7.1", "@lerna/run-parallel-batches": "^3.0.0", - "@lerna/symlink-binary": "^3.3.0", - "@lerna/symlink-dependencies": "^3.3.0", - "@lerna/validation-error": "^3.0.0", + "@lerna/symlink-binary": "^3.7.2", + "@lerna/symlink-dependencies": "^3.7.2", + "@lerna/validation-error": "^3.6.0", "dedent": "^0.7.0", "get-port": "^3.2.0", + "libnpm": "^2.0.1", "multimatch": "^2.1.0", - "npm-package-arg": "^6.0.0", - "npmlog": "^4.1.2", "p-finally": "^1.0.0", "p-map": "^1.2.0", "p-map-series": "^1.0.0", @@ -124,26 +123,26 @@ } }, "@lerna/changed": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/changed/-/changed-3.5.0.tgz", - "integrity": "sha512-p9o7/hXwFAoet7UPeHIzIPonYxLHZe9bcNcjxKztZYAne5/OgmZiF4X1UPL2S12wtkT77WQy4Oz8NjRTczcapg==", + "version": "3.8.0", + "resolved": "https://registry.npmjs.org/@lerna/changed/-/changed-3.8.0.tgz", + "integrity": "sha512-IeOxB+nwGFpAuEgUi9FeP19hj6Abp1aNCeMjS9/KpOxrSGt3ejKlSKY83lwqDPbb6OnthQTRBlodWZpSiSPWqg==", "dev": true, "requires": { - "@lerna/collect-updates": "^3.5.0", - "@lerna/command": "^3.5.0", - "@lerna/listable": "^3.0.0", - "@lerna/output": "^3.0.0", - "@lerna/version": "^3.5.0" + "@lerna/collect-updates": "^3.6.0", + "@lerna/command": "^3.7.2", + "@lerna/listable": "^3.6.0", + "@lerna/output": "^3.6.0", + "@lerna/version": "^3.8.0" } }, "@lerna/check-working-tree": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/check-working-tree/-/check-working-tree-3.5.0.tgz", - "integrity": "sha512-aWeIputHddeZgf7/wA1e5yuv6q9S5si2y7fzO2Ah7m3KyDyl8XHP1M0VSSDzZeiloYCryAYQAoRgcrdH65Vhow==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/check-working-tree/-/check-working-tree-3.6.0.tgz", + "integrity": "sha512-Ioy1t2aVasAwhY1Oi5kfpwbW9RDupxxVVu2t2c1EeBYYCu3jIt1A5ad34gidgsKyiG3HeBEVziI4Uaihnb96ZQ==", "dev": true, "requires": { - "@lerna/describe-ref": "^3.5.0", - "@lerna/validation-error": "^3.0.0" + "@lerna/describe-ref": "^3.6.0", + "@lerna/validation-error": "^3.6.0" } }, "@lerna/child-process": { @@ -193,33 +192,44 @@ "requires": { "pump": "^3.0.0" } + }, + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "dev": true, + "requires": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } } } }, "@lerna/clean": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/clean/-/clean-3.5.0.tgz", - "integrity": "sha512-bHUFF6Wv7ms81Tmwe56xk296oqU74Sg9NSkUCDG4kZLpYZx347Aw+89ZPTlaSmUwqCgEXKYLr65ZVVvKmflpcA==", + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/@lerna/clean/-/clean-3.7.2.tgz", + "integrity": "sha512-BhuPnAWQa2av6hSE8imbOhenUnveSp0VDO1X0jzC1EX+K6sBCubbowM13kYi+N0qUd2kdeatBNwmafzkBZ3LcQ==", "dev": true, "requires": { - "@lerna/command": "^3.5.0", - "@lerna/filter-options": "^3.5.0", - "@lerna/prompt": "^3.3.1", - "@lerna/rimraf-dir": "^3.3.0", + "@lerna/command": "^3.7.2", + "@lerna/filter-options": "^3.6.0", + "@lerna/prompt": "^3.6.0", + "@lerna/pulse-till-done": "^3.7.1", + "@lerna/rimraf-dir": "^3.6.0", "p-map": "^1.2.0", "p-map-series": "^1.0.0", "p-waterfall": "^1.0.0" } }, "@lerna/cli": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/@lerna/cli/-/cli-3.2.0.tgz", - "integrity": "sha512-JdbLyTxHqxUlrkI+Ke+ltXbtyA+MPu9zR6kg/n8Fl6uaez/2fZWtReXzYi8MgLxfUFa7+1OHWJv4eAMZlByJ+Q==", + "version": "3.6.0", + "resolved": 
"https://registry.npmjs.org/@lerna/cli/-/cli-3.6.0.tgz", + "integrity": "sha512-FGCx7XOLpqmU5eFOlo0Lt0hRZraxSUTEWM0bce0p+HNpOxBc91o6d2tenW1azPYFP9HzsMQey1NBtU0ofJJeog==", "dev": true, "requires": { "@lerna/global-options": "^3.1.3", "dedent": "^0.7.0", - "npmlog": "^4.1.2", + "libnpm": "^2.0.1", "yargs": "^12.0.1" }, "dependencies": { @@ -260,13 +270,13 @@ } }, "execa": { - "version": "0.10.0", - "resolved": "https://registry.npmjs.org/execa/-/execa-0.10.0.tgz", - "integrity": "sha512-7XOMnz8Ynx1gGo/3hyV9loYNPWM94jG3+3T3Y8tsfSstFmETmENCMU/A/zj8Lyaj1lkgEepKepvd6240tBRvlw==", + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/execa/-/execa-1.0.0.tgz", + "integrity": "sha512-adbxcyWV46qiHyvSp50TKt05tB4tK3HcmF7/nxfAdhnox83seTDbwnaqKO4sXRy7roHAIFqJP/Rw/AuEbX61LA==", "dev": true, "requires": { "cross-spawn": "^6.0.0", - "get-stream": "^3.0.0", + "get-stream": "^4.0.0", "is-stream": "^1.1.0", "npm-run-path": "^2.0.0", "p-finally": "^1.0.0", @@ -283,6 +293,15 @@ "locate-path": "^3.0.0" } }, + "get-stream": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-4.1.0.tgz", + "integrity": "sha512-GMat4EJ5161kIy2HevLlr4luNjBgvmj413KaQA7jt4V8B4RDsfpHk7WQ9GVqfYyyx8OS/L66Kox+rJRNklLK7w==", + "dev": true, + "requires": { + "pump": "^3.0.0" + } + }, "invert-kv": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/invert-kv/-/invert-kv-2.0.0.tgz", @@ -326,20 +345,20 @@ } }, "os-locale": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/os-locale/-/os-locale-3.0.1.tgz", - "integrity": "sha512-7g5e7dmXPtzcP4bgsZ8ixDVqA7oWYuEz4lOSujeWyliPai4gfVDiFIcwBg3aGCPnmSGfzOKTK3ccPn0CKv3DBw==", + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/os-locale/-/os-locale-3.1.0.tgz", + "integrity": "sha512-Z8l3R4wYWM40/52Z+S265okfFj8Kt2cC2MKY+xNi3kFs+XGI7WXu/I309QQQYbRW4ijiZ+yxs9pqEhJh0DqW3Q==", "dev": true, "requires": { - "execa": "^0.10.0", + "execa": "^1.0.0", "lcid": "^2.0.0", "mem": "^4.0.0" } }, "p-limit": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.0.0.tgz", - "integrity": "sha512-fl5s52lI5ahKCernzzIyAP0QAZbGIovtVHGwpcu1Jr/EpzLVDI2myISHwGqK7m8uQFugVWSrbxH7XnhGtvEc+A==", + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.1.0.tgz", + "integrity": "sha512-NhURkNcrVB+8hNfLuysU8enY5xn2KXphsHBaC2YmRNTZRc7RWusw6apSpdEj3jo4CMb6W9nrF6tTnsJsJeyu6g==", "dev": true, "requires": { "p-try": "^2.0.0" @@ -360,6 +379,22 @@ "integrity": "sha512-hMp0onDKIajHfIkdRk3P4CdCmErkYAxxDtP3Wx/4nZ3aGlau2VKh3mZpcuFkH27WQkL/3WBCPOktzA9ZOAnMQQ==", "dev": true }, + "path-exists": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", + "integrity": "sha1-zg6+ql94yxiSXqfYENe1mwEP1RU=", + "dev": true + }, + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "dev": true, + "requires": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } + }, "string-width": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz", @@ -379,6 +414,12 @@ "ansi-regex": "^3.0.0" } }, + "which-module": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/which-module/-/which-module-2.0.0.tgz", + "integrity": "sha1-2e8H3Od7mQK4o6j6SzHD4/fm6Ho=", + "dev": true + }, "yargs": { "version": "12.0.5", "resolved": 
"https://registry.npmjs.org/yargs/-/yargs-12.0.5.tgz", @@ -412,34 +453,34 @@ } }, "@lerna/collect-updates": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/collect-updates/-/collect-updates-3.5.0.tgz", - "integrity": "sha512-rFCng14K8vHyrDJSAacj6ABKKT/TxZdpL9uPEtZN7DsoJKlKPzqFeRvRGA2+ed/I6mEm4ltauEjEpKG5O6xqtw==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/collect-updates/-/collect-updates-3.6.0.tgz", + "integrity": "sha512-knliEz3phY51SGnwDhhYqx6SJN6y9qh/gZrZgQ7ogqz1UgA/MyJb27gszjsyyG6jUQshimBpjsG7OMwjt8+n9A==", "dev": true, "requires": { "@lerna/child-process": "^3.3.0", - "@lerna/describe-ref": "^3.5.0", + "@lerna/describe-ref": "^3.6.0", + "libnpm": "^2.0.1", "minimatch": "^3.0.4", - "npmlog": "^4.1.2", "slash": "^1.0.0" } }, "@lerna/command": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/command/-/command-3.5.0.tgz", - "integrity": "sha512-C/0e7qPbuKZ9vEqzRePksoKDJk4TOWzsU5qaPP/ikqc6vClJbKucsIehk3za6glSjlgLCJpzBTF2lFjHfb+JNw==", + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/@lerna/command/-/command-3.7.2.tgz", + "integrity": "sha512-WtBnlvQfzKmnc2i3g+GLazx7pUXwbzASiXHy4j1CoC0w90H42LUqhwJICro4VhnE8xi38BNhcH/+xFNiHX5ERA==", "dev": true, "requires": { "@lerna/child-process": "^3.3.0", - "@lerna/package-graph": "^3.1.2", - "@lerna/project": "^3.5.0", - "@lerna/validation-error": "^3.0.0", - "@lerna/write-log-file": "^3.0.0", + "@lerna/package-graph": "^3.6.0", + "@lerna/project": "^3.7.2", + "@lerna/validation-error": "^3.6.0", + "@lerna/write-log-file": "^3.6.0", "dedent": "^0.7.0", "execa": "^1.0.0", "is-ci": "^1.0.10", - "lodash": "^4.17.5", - "npmlog": "^4.1.2" + "libnpm": "^2.0.1", + "lodash": "^4.17.5" }, "dependencies": { "cross-spawn": { @@ -478,23 +519,32 @@ "requires": { "pump": "^3.0.0" } + }, + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "dev": true, + "requires": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } } } }, "@lerna/conventional-commits": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/conventional-commits/-/conventional-commits-3.5.0.tgz", - "integrity": "sha512-roKPILPYnDWiCDxOeBQ0cObJ2FbDgzJSToxr1ZwIqvJU5hGQ4RmooCf8GHcCW9maBJz7ETeestv8M2mBUgBPbg==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/conventional-commits/-/conventional-commits-3.6.0.tgz", + "integrity": "sha512-KkY3wd7w/tj76EEIhTMYZlSBk/5WkT2NA9Gr/EuSwKV70PYyVA55l1OGlikBUAnuqIjwyfw9x3y+OcbYI4aNEg==", "dev": true, "requires": { - "@lerna/validation-error": "^3.0.0", + "@lerna/validation-error": "^3.6.0", "conventional-changelog-angular": "^5.0.2", "conventional-changelog-core": "^3.1.5", "conventional-recommended-bump": "^4.0.4", "fs-extra": "^7.0.0", "get-stream": "^4.0.0", - "npm-package-arg": "^6.0.0", - "npmlog": "^4.1.2", + "libnpm": "^2.0.1", "semver": "^5.5.0" }, "dependencies": { @@ -506,25 +556,36 @@ "requires": { "pump": "^3.0.0" } + }, + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "dev": true, + "requires": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } } } }, "@lerna/create": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/create/-/create-3.5.0.tgz", - "integrity": 
"sha512-ek4flHRmpMegZp9tP3RmuDhmMb9+/Hhy9B5eaZc5X5KWqDvFKJtn56sw+M9hNjiYehiimCwhaLWgE2WSikPvcQ==", + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/@lerna/create/-/create-3.7.2.tgz", + "integrity": "sha512-eE6i4mVi5CefQ8Mw4WhkX9GcgiDllfEYfMq3LDMCtBH4pdzXO9oNG2p1J7bbwKgCFqhmKB4nr5FTFhijOIMRRw==", "dev": true, "requires": { "@lerna/child-process": "^3.3.0", - "@lerna/command": "^3.5.0", - "@lerna/npm-conf": "^3.4.1", - "@lerna/validation-error": "^3.0.0", + "@lerna/command": "^3.7.2", + "@lerna/npm-conf": "^3.7.0", + "@lerna/validation-error": "^3.6.0", "camelcase": "^4.1.0", "dedent": "^0.7.0", "fs-extra": "^7.0.0", "globby": "^8.0.1", "init-package-json": "^1.10.3", - "npm-package-arg": "^6.0.0", + "libnpm": "^2.0.1", + "p-reduce": "^1.0.0", "pify": "^3.0.0", "semver": "^5.5.0", "slash": "^1.0.0", @@ -541,7 +602,7 @@ }, "globby": { "version": "8.0.1", - "resolved": "https://registry.npmjs.org/globby/-/globby-8.0.1.tgz", + "resolved": "http://registry.npmjs.org/globby/-/globby-8.0.1.tgz", "integrity": "sha512-oMrYrJERnKBLXNLVTqhm3vPEdJ/b2ZE28xN4YARiix1NOIOBPEpOUnm844K1iu/BkphCaf2WNFwMszv8Soi1pw==", "dev": true, "requires": { @@ -568,81 +629,115 @@ } }, "@lerna/create-symlink": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/create-symlink/-/create-symlink-3.3.0.tgz", - "integrity": "sha512-0lb88Nnq1c/GG+fwybuReOnw3+ah4dB81PuWwWwuqUNPE0n50qUf/M/7FfSb5JEh/93fcdbZI0La8t3iysNW1w==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/create-symlink/-/create-symlink-3.6.0.tgz", + "integrity": "sha512-YG3lTb6zylvmGqKU+QYA3ylSnoLn+FyLH5XZmUsD0i85R884+EyJJeHx/zUk+yrL2ZwHS4RBUgJfC24fqzgPoA==", "dev": true, "requires": { "cmd-shim": "^2.0.2", "fs-extra": "^7.0.0", - "npmlog": "^4.1.2" + "libnpm": "^2.0.1" } }, "@lerna/describe-ref": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/describe-ref/-/describe-ref-3.5.0.tgz", - "integrity": "sha512-XvecK2PSwUv4z+otib5moWJMI+h3mtAg8nFlfo4KbivVtD/sI11jfKsr3S75HuAwhVAa8tAijoAxmuBJSsTE1g==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/describe-ref/-/describe-ref-3.6.0.tgz", + "integrity": "sha512-hVZJ2hYVbrrNiEG+dEg/Op4pYAbROkDZdiIUabAJffr0T/frcN+5es2HfmOC//4+78Cs1M9iTyQRoyC1RXS2BQ==", "dev": true, "requires": { "@lerna/child-process": "^3.3.0", - "npmlog": "^4.1.2" + "libnpm": "^2.0.1" } }, "@lerna/diff": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/diff/-/diff-3.5.0.tgz", - "integrity": "sha512-iyZ0ZRPqH5Y5XEhOYoKS8H/8UXC/gZ/idlToMFHhUn1oTSd8v9HVU1c2xq1ge0u36ZH/fx/YydUk0A/KSv+p3Q==", + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/@lerna/diff/-/diff-3.7.2.tgz", + "integrity": "sha512-BVcceQHxwr0hIO4hZ8Udeb1Afn2opDiMXSh3dEyV7kcbYlgc66AxsviVPr4txGP/p8uRlzBUDzgHShVMplMGcg==", "dev": true, "requires": { "@lerna/child-process": "^3.3.0", - "@lerna/command": "^3.5.0", - "@lerna/validation-error": "^3.0.0", - "npmlog": "^4.1.2" + "@lerna/command": "^3.7.2", + "@lerna/validation-error": "^3.6.0", + "libnpm": "^2.0.1" } }, "@lerna/exec": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/exec/-/exec-3.5.0.tgz", - "integrity": "sha512-H5jeIueDiuNsxeuGKaP7HqTcenvMsFfBFeWr0W6knHv9NrOF8il34dBqYgApZEDSQ7+2fA3ghwWbF+jUGTSh/A==", + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/@lerna/exec/-/exec-3.7.2.tgz", + "integrity": "sha512-oEm3EbSxXeMguqC+ekXaBlRmo/aaJc2BcWPHrd+5+9evHhHo/7oOu/xXmbhJYCgZytGkJ6BrX3F9XhWnC+14wg==", "dev": true, "requires": { - "@lerna/batch-packages": "^3.1.2", + 
"@lerna/batch-packages": "^3.6.0", "@lerna/child-process": "^3.3.0", - "@lerna/command": "^3.5.0", - "@lerna/filter-options": "^3.5.0", + "@lerna/command": "^3.7.2", + "@lerna/filter-options": "^3.6.0", "@lerna/run-parallel-batches": "^3.0.0", - "@lerna/validation-error": "^3.0.0" + "@lerna/validation-error": "^3.6.0" } }, "@lerna/filter-options": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/filter-options/-/filter-options-3.5.0.tgz", - "integrity": "sha512-7pEQy1i5ynYOYjcSeo+Qaps4+Ais55RRdnT6/SLLBgyyHAMziflFLX5TnoyEaaXoU90iKfQ5z/ioEp6dFAXSMg==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/filter-options/-/filter-options-3.6.0.tgz", + "integrity": "sha512-6iUMZuvvXPL5EAF7Zo9azaZ6FxOq6tGbiSX8fUXgCdN+jlRjorvkzR+E0HS4bEGTWmV446lnLwdQLZuySfLcbQ==", "dev": true, "requires": { - "@lerna/collect-updates": "^3.5.0", - "@lerna/filter-packages": "^3.0.0", + "@lerna/collect-updates": "^3.6.0", + "@lerna/filter-packages": "^3.6.0", "dedent": "^0.7.0" } }, "@lerna/filter-packages": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/@lerna/filter-packages/-/filter-packages-3.0.0.tgz", - "integrity": "sha512-zwbY1J4uRjWRZ/FgYbtVkq7I3Nduwsg2V2HwLKSzwV2vPglfGqgovYOVkND6/xqe2BHwDX4IyA2+e7OJmLaLSA==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/filter-packages/-/filter-packages-3.6.0.tgz", + "integrity": "sha512-O/nIENV3LOqp/TiUIw3Ir6L/wUGFDeYBdJsJTQDlTAyHZsgYA1OIn9FvlW8nqBu1bNLzoBVHXh3c5azx1kE+Hg==", "dev": true, "requires": { - "@lerna/validation-error": "^3.0.0", - "multimatch": "^2.1.0", - "npmlog": "^4.1.2" + "@lerna/validation-error": "^3.6.0", + "libnpm": "^2.0.1", + "multimatch": "^2.1.0" } }, "@lerna/get-npm-exec-opts": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/@lerna/get-npm-exec-opts/-/get-npm-exec-opts-3.0.0.tgz", - "integrity": "sha512-arcYUm+4xS8J3Palhl+5rRJXnZnFHsLFKHBxznkPIxjwGQeAEw7df38uHdVjEQ+HNeFmHnBgSqfbxl1VIw5DHg==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/get-npm-exec-opts/-/get-npm-exec-opts-3.6.0.tgz", + "integrity": "sha512-ruH6KuLlt75aCObXfUIdVJqmfVq7sgWGq5mXa05vc1MEqxTIiU23YiJdWzofQOOUOACaZkzZ4K4Nu7wXEg4Xgg==", + "dev": true, + "requires": { + "libnpm": "^2.0.1" + } + }, + "@lerna/get-packed": { + "version": "3.7.0", + "resolved": "https://registry.npmjs.org/@lerna/get-packed/-/get-packed-3.7.0.tgz", + "integrity": "sha512-yuFtjsUZIHjeIvIYQ/QuytC+FQcHwo3peB+yGBST2uWCLUCR5rx6knoQcPzbxdFDCuUb5IFccFGd3B1fHFg3RQ==", "dev": true, "requires": { - "npmlog": "^4.1.2" + "fs-extra": "^7.0.0", + "ssri": "^6.0.1", + "tar": "^4.4.8" + }, + "dependencies": { + "tar": { + "version": "4.4.8", + "resolved": "https://registry.npmjs.org/tar/-/tar-4.4.8.tgz", + "integrity": "sha512-LzHF64s5chPQQS0IYBn9IN5h3i98c12bo4NCO7e0sGM2llXQ3p2FGC5sdENN4cTW48O915Sh+x+EXx7XW96xYQ==", + "dev": true, + "requires": { + "chownr": "^1.1.1", + "fs-minipass": "^1.2.5", + "minipass": "^2.3.4", + "minizlib": "^1.1.1", + "mkdirp": "^0.5.0", + "safe-buffer": "^5.1.2", + "yallist": "^3.0.2" + } + }, + "yallist": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.0.3.tgz", + "integrity": "sha512-S+Zk8DEWE6oKpV+vI3qWkaK+jSbIK86pCwe2IF/xwIpQ8jEuxpw9NyaGjmp9+BoJv5FV2piqCDcoCtStppiq2A==", + "dev": true + } } }, "@lerna/global-options": { @@ -662,84 +757,86 @@ } }, "@lerna/import": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/import/-/import-3.5.0.tgz", - "integrity": 
"sha512-vgI6lMEzd1ODgi75cmAlfPYylaK37WY3E2fwKyO/lj6UKSGj46dVSK0KwTRHx33tu4PLvPzFi5C6nbY57o5ykQ==", + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/@lerna/import/-/import-3.7.2.tgz", + "integrity": "sha512-TGTYjhzDGLEqc9imWOi/fvIbZdmVxfV71OFB6AS98N9KQE68bbpttehQqCUIPATReVuzPUzxEiF3tMnKd7iEqg==", "dev": true, "requires": { "@lerna/child-process": "^3.3.0", - "@lerna/command": "^3.5.0", - "@lerna/prompt": "^3.3.1", - "@lerna/validation-error": "^3.0.0", + "@lerna/command": "^3.7.2", + "@lerna/prompt": "^3.6.0", + "@lerna/pulse-till-done": "^3.7.1", + "@lerna/validation-error": "^3.6.0", "dedent": "^0.7.0", "fs-extra": "^7.0.0", "p-map-series": "^1.0.0" } }, "@lerna/init": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/init/-/init-3.5.0.tgz", - "integrity": "sha512-V21/UWj34Mph+9NxIGH1kYcuJAp+uFjfG8Ku2nMy62OGL3553+YQ+Izr+R6egY8y/99UMCDpi5gkQni5eGv3MA==", + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/@lerna/init/-/init-3.7.2.tgz", + "integrity": "sha512-840Az0GtyepX7/WH3QvOQDZJCEGFf4IykjjFuCLF+23+Od8Wxn3QCsp4Yn/+HKi/w7bSpsCHJ6xQG208dygfdw==", "dev": true, "requires": { "@lerna/child-process": "^3.3.0", - "@lerna/command": "^3.5.0", + "@lerna/command": "^3.7.2", "fs-extra": "^7.0.0", "p-map": "^1.2.0", "write-json-file": "^2.3.0" } }, "@lerna/link": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/link/-/link-3.5.0.tgz", - "integrity": "sha512-KSu1mhxwNRmguqMqUTJd4c7QIk9/xmxJxbmMkA71OaJd4fwondob6DyI/B17NIWutdLbvSWQ7pRlFOPxjQVoUw==", + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/@lerna/link/-/link-3.7.2.tgz", + "integrity": "sha512-iwxftHVPknb+RXtD7257/FR4DYiCxJRxqo6z/YGlojWjehYRfbK7tJe4xzRzxepIXAE8+ooQFqQ73m0/ozk6kQ==", "dev": true, "requires": { - "@lerna/command": "^3.5.0", - "@lerna/package-graph": "^3.1.2", - "@lerna/symlink-dependencies": "^3.3.0", + "@lerna/command": "^3.7.2", + "@lerna/package-graph": "^3.6.0", + "@lerna/symlink-dependencies": "^3.7.2", "p-map": "^1.2.0", "slash": "^1.0.0" } }, "@lerna/list": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/list/-/list-3.5.0.tgz", - "integrity": "sha512-T+NZBQ/l6FmZklgrtFuN7luMs3AC/BoS52APOPrM7ZmxW4nenvov0xMwQW1783w/t365YDkDlYd5gM0nX3D1Hg==", + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/@lerna/list/-/list-3.7.2.tgz", + "integrity": "sha512-yup9KivG31APzr+C96up83m1llqs62spsLuKkinwVUhL5mobhDscT6QwIWTJPRJ8Bbmi++SdXGLfGFkYmgujzQ==", "dev": true, "requires": { - "@lerna/command": "^3.5.0", - "@lerna/filter-options": "^3.5.0", - "@lerna/listable": "^3.0.0", - "@lerna/output": "^3.0.0" + "@lerna/command": "^3.7.2", + "@lerna/filter-options": "^3.6.0", + "@lerna/listable": "^3.6.0", + "@lerna/output": "^3.6.0" } }, "@lerna/listable": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/@lerna/listable/-/listable-3.0.0.tgz", - "integrity": "sha512-HX/9hyx1HLg2kpiKXIUc1EimlkK1T58aKQ7ovO7rQdTx9ForpefoMzyLnHE1n4XrUtEszcSWJIICJ/F898M6Ag==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/listable/-/listable-3.6.0.tgz", + "integrity": "sha512-fz63+zlqrJ9KQxIiv0r7qtufM4DEinSayAuO8YJuooz+1ctIP7RvMEQNvYI/E9tDlUo9Q0de68b5HbKrpmA5rQ==", "dev": true, "requires": { + "@lerna/batch-packages": "^3.6.0", "chalk": "^2.3.1", "columnify": "^1.5.4" } }, "@lerna/log-packed": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/@lerna/log-packed/-/log-packed-3.0.4.tgz", - "integrity": 
"sha512-vVQHgMagE2wnbxhNY9nFkdu+Cx2TsyWalkJfkxbNzmo6gOCrDsxCBDj9vTEV8Q+4aWx0C0Bsc0sB2Eb8y/+ofA==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/log-packed/-/log-packed-3.6.0.tgz", + "integrity": "sha512-T/J41zMkzpWB5nbiTRS5PmYTFn74mJXe6RQA2qhkdLi0UqnTp97Pux1loz3jsJf2yJtiQUnyMM7KuKIAge0Vlw==", "dev": true, "requires": { "byte-size": "^4.0.3", "columnify": "^1.5.4", "has-unicode": "^2.0.1", - "npmlog": "^4.1.2" + "libnpm": "^2.0.1" } }, "@lerna/npm-conf": { - "version": "3.4.1", - "resolved": "https://registry.npmjs.org/@lerna/npm-conf/-/npm-conf-3.4.1.tgz", - "integrity": "sha512-i9G6DnbCqiAqxKx2rSXej/n14qxlV/XOebL6QZonxJKzNTB+Q2wglnhTXmfZXTPJfoqimLaY4NfAEtbOXRWOXQ==", + "version": "3.7.0", + "resolved": "https://registry.npmjs.org/@lerna/npm-conf/-/npm-conf-3.7.0.tgz", + "integrity": "sha512-+WSMDfPKcKzMfqq283ydz9RRpOU6p9wfx0wy4hVSUY/6YUpsyuk8SShjcRtY8zTM5AOrxvFBuuV90H4YpZ5+Ng==", "dev": true, "requires": { "config-chain": "^1.1.11", @@ -747,120 +844,175 @@ } }, "@lerna/npm-dist-tag": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/npm-dist-tag/-/npm-dist-tag-3.3.0.tgz", - "integrity": "sha512-EtZJXzh3w5tqXEev+EBBPrWKWWn0WgJfxm4FihfS9VgyaAW8udIVZHGkIQ3f+tBtupcAzA9Q8cQNUkGF2efwmA==", + "version": "3.7.1", + "resolved": "https://registry.npmjs.org/@lerna/npm-dist-tag/-/npm-dist-tag-3.7.1.tgz", + "integrity": "sha512-caUfA1L6wFl/nvIkk4q7qbFHZSnF2P8zf3Xk7vJMolRybYbj+WT1gYb5C446qPIF75p7JtFu3C/AJzwzdbljCw==", "dev": true, "requires": { - "@lerna/child-process": "^3.3.0", - "@lerna/get-npm-exec-opts": "^3.0.0", - "npmlog": "^4.1.2" + "figgy-pudding": "^3.5.1", + "libnpm": "^2.0.1" } }, "@lerna/npm-install": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/npm-install/-/npm-install-3.3.0.tgz", - "integrity": "sha512-WoVvKdS8ltROTGSNQwo6NDq0YKnjwhvTG4li1okcN/eHKOS3tL9bxbgPx7No0wOq5DKBpdeS9KhAfee6LFAZ5g==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/npm-install/-/npm-install-3.6.0.tgz", + "integrity": "sha512-RKV31VdrBZKjmKfq25JG4mIHJ8NAOsLKq/aYSaBs8zP+uwXH7RU39saVfv9ReKiAzhKE2ghOG2JeMdIHtYnPNA==", "dev": true, "requires": { "@lerna/child-process": "^3.3.0", - "@lerna/get-npm-exec-opts": "^3.0.0", + "@lerna/get-npm-exec-opts": "^3.6.0", "fs-extra": "^7.0.0", - "npm-package-arg": "^6.0.0", - "npmlog": "^4.1.2", + "libnpm": "^2.0.1", "signal-exit": "^3.0.2", "write-pkg": "^3.1.0" } }, "@lerna/npm-publish": { - "version": "3.3.1", - "resolved": "https://registry.npmjs.org/@lerna/npm-publish/-/npm-publish-3.3.1.tgz", - "integrity": "sha512-bVTlWIcBL6Zpyzqvr9C7rxXYcoPw+l7IPz5eqQDNREj1R39Wj18OWB2KTJq8l7LIX7Wf4C2A1uT5hJaEf9BuvA==", + "version": "3.7.1", + "resolved": "https://registry.npmjs.org/@lerna/npm-publish/-/npm-publish-3.7.1.tgz", + "integrity": "sha512-3Tv4UWD+1Wz1Eqc7/8eEvAHL5c2pTx+rOKYMEc6P5Z1glN1+TfIfPckPAX0H2xg44yTCh1KGJSSBpJQl68QqIQ==", "dev": true, "requires": { - "@lerna/child-process": "^3.3.0", - "@lerna/get-npm-exec-opts": "^3.0.0", - "@lerna/has-npm-version": "^3.3.0", - "@lerna/log-packed": "^3.0.4", + "@lerna/run-lifecycle": "^3.7.1", + "figgy-pudding": "^3.5.1", "fs-extra": "^7.0.0", - "npmlog": "^4.1.2", - "p-map": "^1.2.0" + "libnpm": "^2.0.1" } }, "@lerna/npm-run-script": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/npm-run-script/-/npm-run-script-3.3.0.tgz", - "integrity": "sha512-YqDguWZzp4jIomaE4aWMUP7MIAJAFvRAf6ziQLpqwoQskfWLqK5mW0CcszT1oLjhfb3cY3MMfSTFaqwbdKmICg==", + "version": "3.6.0", + "resolved": 
"https://registry.npmjs.org/@lerna/npm-run-script/-/npm-run-script-3.6.0.tgz", + "integrity": "sha512-6DRNFma30ex9r1a8mMDXziSRHf1/mo//hnvW1Zc1ctBh+7PU4I8n3A2ht/+742vtoTQH93Iqs3QSJl2KOLSsYg==", "dev": true, "requires": { "@lerna/child-process": "^3.3.0", - "@lerna/get-npm-exec-opts": "^3.0.0", - "npmlog": "^4.1.2" + "@lerna/get-npm-exec-opts": "^3.6.0", + "libnpm": "^2.0.1" } }, "@lerna/output": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/@lerna/output/-/output-3.0.0.tgz", - "integrity": "sha512-EFxnSbO0zDEVKkTKpoCUAFcZjc3gn3DwPlyTDxbeqPU7neCfxP4rA4+0a6pcOfTlRS5kLBRMx79F2TRCaMM3DA==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/output/-/output-3.6.0.tgz", + "integrity": "sha512-9sjQouf6p7VQtVCRnzoTGlZyURd48i3ha3WBHC/UBJnHZFuXMqWVPKNuvnMf2kRXDyoQD+2mNywpmEJg5jOnRg==", + "dev": true, + "requires": { + "libnpm": "^2.0.1" + } + }, + "@lerna/pack-directory": { + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/@lerna/pack-directory/-/pack-directory-3.7.2.tgz", + "integrity": "sha512-yAZNSdAsBD26as+Il1l5R0fQaI6vTJqyNeK181V2vf34+KC0NX9TVaM+/Ht28QpK+3SaD2tvVP1T7OP2w0g2qg==", "dev": true, "requires": { - "npmlog": "^4.1.2" + "@lerna/get-packed": "^3.7.0", + "@lerna/package": "^3.7.2", + "@lerna/run-lifecycle": "^3.7.1", + "figgy-pudding": "^3.5.1", + "libnpm": "^2.0.1", + "npm-packlist": "^1.1.12", + "tar": "^4.4.8", + "temp-write": "^3.4.0" + }, + "dependencies": { + "tar": { + "version": "4.4.8", + "resolved": "https://registry.npmjs.org/tar/-/tar-4.4.8.tgz", + "integrity": "sha512-LzHF64s5chPQQS0IYBn9IN5h3i98c12bo4NCO7e0sGM2llXQ3p2FGC5sdENN4cTW48O915Sh+x+EXx7XW96xYQ==", + "dev": true, + "requires": { + "chownr": "^1.1.1", + "fs-minipass": "^1.2.5", + "minipass": "^2.3.4", + "minizlib": "^1.1.1", + "mkdirp": "^0.5.0", + "safe-buffer": "^5.1.2", + "yallist": "^3.0.2" + } + }, + "yallist": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.0.3.tgz", + "integrity": "sha512-S+Zk8DEWE6oKpV+vI3qWkaK+jSbIK86pCwe2IF/xwIpQ8jEuxpw9NyaGjmp9+BoJv5FV2piqCDcoCtStppiq2A==", + "dev": true + } } }, "@lerna/package": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/@lerna/package/-/package-3.0.0.tgz", - "integrity": "sha512-djzEJxzn212wS8d9znBnlXkeRlPL7GqeAYBykAmsuq51YGvaQK67Umh5ejdO0uxexF/4r7yRwgrlRHpQs8Rfqg==", + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/@lerna/package/-/package-3.7.2.tgz", + "integrity": "sha512-8A5hN2CekM1a0Ix4VUO/g+REo+MsnXb8lnQ0bGjr1YGWzSL5NxYJ0Z9+0pwTfDpvRDYlFYO0rMVwBUW44b4dUw==", "dev": true, "requires": { - "npm-package-arg": "^6.0.0", + "libnpm": "^2.0.1", + "load-json-file": "^4.0.0", "write-pkg": "^3.1.0" + }, + "dependencies": { + "load-json-file": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/load-json-file/-/load-json-file-4.0.0.tgz", + "integrity": "sha1-L19Fq5HjMhYjT9U62rZo607AmTs=", + "dev": true, + "requires": { + "graceful-fs": "^4.1.2", + "parse-json": "^4.0.0", + "pify": "^3.0.0", + "strip-bom": "^3.0.0" + } + }, + "parse-json": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-4.0.0.tgz", + "integrity": "sha1-vjX1Qlvh9/bHRxhPmKeIy5lHfuA=", + "dev": true, + "requires": { + "error-ex": "^1.3.1", + "json-parse-better-errors": "^1.0.1" + } + }, + "strip-bom": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-3.0.0.tgz", + "integrity": "sha1-IzTBjpx1n3vdVv3vfprj1YjmjtM=", + "dev": true + } } }, "@lerna/package-graph": { - "version": "3.1.2", - 
"resolved": "https://registry.npmjs.org/@lerna/package-graph/-/package-graph-3.1.2.tgz", - "integrity": "sha512-9wIWb49I1IJmyjPdEVZQ13IAi9biGfH/OZHOC04U2zXGA0GLiY+B3CAx6FQvqkZ8xEGfqzmXnv3LvZ0bQfc1aQ==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/package-graph/-/package-graph-3.6.0.tgz", + "integrity": "sha512-Xtldh3DTiC3cPDrs6OY5URiuRXGPMIN6uFKcx59rOu3TkqYRt346jRyX+hm85996Y/pboo3+JuQlonvuEP/9QQ==", "dev": true, "requires": { - "@lerna/validation-error": "^3.0.0", - "npm-package-arg": "^6.0.0", + "@lerna/validation-error": "^3.6.0", + "libnpm": "^2.0.1", "semver": "^5.5.0" } }, "@lerna/project": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/project/-/project-3.5.0.tgz", - "integrity": "sha512-uFDzqwrD7a/tTohQoo0voTsRy2cgl9D1ZOU2pHZzHzow9S1M8E0x5q3hJI2HlwsZry9IUugmDUGO6UddTjwm3Q==", + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/@lerna/project/-/project-3.7.2.tgz", + "integrity": "sha512-YNJw61G4YrnwW0P1NAR/bd/kfDdK+WPI5YH10AHsG1TXBFV9hBusjB7MROmobYbln7zNWJJ3PQmXtWv134aaRQ==", "dev": true, "requires": { - "@lerna/package": "^3.0.0", - "@lerna/validation-error": "^3.0.0", + "@lerna/package": "^3.7.2", + "@lerna/validation-error": "^3.6.0", "cosmiconfig": "^5.0.2", "dedent": "^0.7.0", "dot-prop": "^4.2.0", "glob-parent": "^3.1.0", "globby": "^8.0.1", + "libnpm": "^2.0.1", "load-json-file": "^4.0.0", - "npmlog": "^4.1.2", "p-map": "^1.2.0", "resolve-from": "^4.0.0", "write-json-file": "^2.3.0" }, "dependencies": { - "glob-parent": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-3.1.0.tgz", - "integrity": "sha1-nmr2KZ2NO9K9QEMIMr0RPfkGxa4=", - "dev": true, - "requires": { - "is-glob": "^3.1.0", - "path-dirname": "^1.0.0" - } - }, "globby": { "version": "8.0.1", - "resolved": "https://registry.npmjs.org/globby/-/globby-8.0.1.tgz", + "resolved": "http://registry.npmjs.org/globby/-/globby-8.0.1.tgz", "integrity": "sha512-oMrYrJERnKBLXNLVTqhm3vPEdJ/b2ZE28xN4YARiix1NOIOBPEpOUnm844K1iu/BkphCaf2WNFwMszv8Soi1pw==", "dev": true, "requires": { @@ -873,21 +1025,6 @@ "slash": "^1.0.0" } }, - "is-extglob": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", - "dev": true - }, - "is-glob": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-3.1.0.tgz", - "integrity": "sha1-e6WuJCF4BKxwcHuWkiVnSGzD6Eo=", - "dev": true, - "requires": { - "is-extglob": "^2.1.0" - } - }, "load-json-file": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/load-json-file/-/load-json-file-4.0.0.tgz", @@ -925,42 +1062,43 @@ } }, "@lerna/prompt": { - "version": "3.3.1", - "resolved": "https://registry.npmjs.org/@lerna/prompt/-/prompt-3.3.1.tgz", - "integrity": "sha512-eJhofrUCUaItMIH6et8kI7YqHfhjWqGZoTsE+40NRCfAraOMWx+pDzfRfeoAl3qeRAH2HhNj1bkYn70FbUOxuQ==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/prompt/-/prompt-3.6.0.tgz", + "integrity": "sha512-nyAjPMolJ/ZRAAVcXrUH89C4n1SiWvLh4xWNvWYKLcf3PI5yges35sDFP/HYrM4+cEbkNFuJCRq6CxaET4PRsg==", "dev": true, "requires": { "inquirer": "^6.2.0", - "npmlog": "^4.1.2" + "libnpm": "^2.0.1" } }, "@lerna/publish": { - "version": "3.5.1", - "resolved": "https://registry.npmjs.org/@lerna/publish/-/publish-3.5.1.tgz", - "integrity": "sha512-ltw2YdWWzev9cZRAzons5ywZh9NJARPX67meeA95oMDVMrhD4Y9VHQNJ3T8ueec/W78/4sKlMSr3ecWyPNp5bg==", + "version": "3.8.0", + "resolved": 
"https://registry.npmjs.org/@lerna/publish/-/publish-3.8.0.tgz", + "integrity": "sha512-EJDF6oPySIHQRre9KMMqtltrPReuBT7Po72W6OQxCUmCjqDyUd6884lhqFHOgbtOl1axrVVaSOpxCU1m+SLNgA==", "dev": true, "requires": { - "@lerna/batch-packages": "^3.1.2", - "@lerna/check-working-tree": "^3.5.0", + "@lerna/batch-packages": "^3.6.0", + "@lerna/check-working-tree": "^3.6.0", "@lerna/child-process": "^3.3.0", - "@lerna/collect-updates": "^3.5.0", - "@lerna/command": "^3.5.0", - "@lerna/describe-ref": "^3.5.0", - "@lerna/get-npm-exec-opts": "^3.0.0", - "@lerna/npm-conf": "^3.4.1", - "@lerna/npm-dist-tag": "^3.3.0", - "@lerna/npm-publish": "^3.3.1", - "@lerna/output": "^3.0.0", - "@lerna/prompt": "^3.3.1", - "@lerna/run-lifecycle": "^3.4.1", + "@lerna/collect-updates": "^3.6.0", + "@lerna/command": "^3.7.2", + "@lerna/describe-ref": "^3.6.0", + "@lerna/log-packed": "^3.6.0", + "@lerna/npm-conf": "^3.7.0", + "@lerna/npm-dist-tag": "^3.7.1", + "@lerna/npm-publish": "^3.7.1", + "@lerna/output": "^3.6.0", + "@lerna/pack-directory": "^3.7.2", + "@lerna/prompt": "^3.6.0", + "@lerna/pulse-till-done": "^3.7.1", + "@lerna/run-lifecycle": "^3.7.1", "@lerna/run-parallel-batches": "^3.0.0", - "@lerna/validation-error": "^3.0.0", - "@lerna/version": "^3.5.0", + "@lerna/validation-error": "^3.6.0", + "@lerna/version": "^3.8.0", + "figgy-pudding": "^3.5.1", "fs-extra": "^7.0.0", - "libnpmaccess": "^3.0.0", - "npm-package-arg": "^6.0.0", + "libnpm": "^2.0.1", "npm-registry-fetch": "^3.8.0", - "npmlog": "^4.1.2", "p-finally": "^1.0.0", "p-map": "^1.2.0", "p-pipe": "^1.2.0", @@ -968,55 +1106,72 @@ "semver": "^5.5.0" } }, + "@lerna/pulse-till-done": { + "version": "3.7.1", + "resolved": "https://registry.npmjs.org/@lerna/pulse-till-done/-/pulse-till-done-3.7.1.tgz", + "integrity": "sha512-MzpesZeW3Mc+CiAq4zUt9qTXI9uEBBKrubYHE36voQTSkHvu/Rox6YOvfUr+U7P6k8frFPeCgGpfMDTLhiqe6w==", + "dev": true, + "requires": { + "libnpm": "^2.0.1" + } + }, "@lerna/resolve-symlink": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/resolve-symlink/-/resolve-symlink-3.3.0.tgz", - "integrity": "sha512-KmoPDcFJ2aOK2inYHbrsiO9SodedUj0L1JDvDgirVNIjMUaQe2Q6Vi4Gh+VCJcyB27JtfHioV9R2NxU72Pk2hg==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/resolve-symlink/-/resolve-symlink-3.6.0.tgz", + "integrity": "sha512-TVOAEqHJSQVhNDMFCwEUZPaOETqHDQV1TQWQfC8ZlOqyaUQ7veZUbg0yfG7RPNzlSpvF0ZaGFeR0YhYDAW03GA==", "dev": true, "requires": { "fs-extra": "^7.0.0", - "npmlog": "^4.1.2", + "libnpm": "^2.0.1", "read-cmd-shim": "^1.0.1" } }, "@lerna/rimraf-dir": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/rimraf-dir/-/rimraf-dir-3.3.0.tgz", - "integrity": "sha512-vSqOcZ4kZduiSprbt+y40qziyN3VKYh+ygiCdnbBbsaxpdKB6CfrSMUtrLhVFrqUfBHIZRzHIzgjTdtQex1KLw==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/rimraf-dir/-/rimraf-dir-3.6.0.tgz", + "integrity": "sha512-2CfyWP1lqxDET+SfwGlLUfgqGF4vz9TYDrmb7Zi//g7IFCo899uU2vWOrEcdWTgbKE3Qgwwfk9c008w5MWUhog==", "dev": true, "requires": { "@lerna/child-process": "^3.3.0", - "npmlog": "^4.1.2", + "libnpm": "^2.0.1", "path-exists": "^3.0.0", "rimraf": "^2.6.2" + }, + "dependencies": { + "path-exists": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", + "integrity": "sha1-zg6+ql94yxiSXqfYENe1mwEP1RU=", + "dev": true + } } }, "@lerna/run": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/run/-/run-3.5.0.tgz", - "integrity": 
"sha512-BnPD52tj794xG2Xsc4FvgksyFX2CLmSR28TZw/xASEuy14NuQYMZkvbaj61SEhyOEsq7pLhHE5PpfbIv2AIFJw==", + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/@lerna/run/-/run-3.7.2.tgz", + "integrity": "sha512-FwBjcrtYSFyvY2YXJ8GoI9VNv2UElUbVra5+iTF1DgQh37RmK0ZCODkfXp6PYyUszHkgCRuJqhK0+yMWRJo61w==", "dev": true, "requires": { - "@lerna/batch-packages": "^3.1.2", - "@lerna/command": "^3.5.0", - "@lerna/filter-options": "^3.5.0", - "@lerna/npm-run-script": "^3.3.0", - "@lerna/output": "^3.0.0", + "@lerna/batch-packages": "^3.6.0", + "@lerna/command": "^3.7.2", + "@lerna/filter-options": "^3.6.0", + "@lerna/npm-run-script": "^3.6.0", + "@lerna/output": "^3.6.0", "@lerna/run-parallel-batches": "^3.0.0", "@lerna/timer": "^3.5.0", - "@lerna/validation-error": "^3.0.0", + "@lerna/validation-error": "^3.6.0", "p-map": "^1.2.0" } }, "@lerna/run-lifecycle": { - "version": "3.4.1", - "resolved": "https://registry.npmjs.org/@lerna/run-lifecycle/-/run-lifecycle-3.4.1.tgz", - "integrity": "sha512-N/hi2srM9A4BWEkXccP7vCEbf4MmIuALF00DTBMvc0A/ccItwUpl3XNuM7+ADDRK0mkwE3hDw89lJ3A7f8oUQw==", + "version": "3.7.1", + "resolved": "https://registry.npmjs.org/@lerna/run-lifecycle/-/run-lifecycle-3.7.1.tgz", + "integrity": "sha512-kE6w8d8Qde+ewZaDNIz4zhwde8s/i8vbbOsGDlR/Vw/9nqlmtj2YBZaS262NtWj83N04dtdYr4FVj51thciGQw==", "dev": true, "requires": { - "@lerna/npm-conf": "^3.4.1", - "npm-lifecycle": "^2.0.0", - "npmlog": "^4.1.2" + "@lerna/npm-conf": "^3.7.0", + "figgy-pudding": "^3.5.1", + "libnpm": "^2.0.1" } }, "@lerna/run-parallel-batches": { @@ -1030,79 +1185,28 @@ } }, "@lerna/symlink-binary": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/symlink-binary/-/symlink-binary-3.3.0.tgz", - "integrity": "sha512-zRo6CimhvH/VJqCFl9T4IC6syjpWyQIxEfO2sBhrapEcfwjtwbhoGgKwucsvt4rIpFazCw63jQ/AXMT27KUIHg==", + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/@lerna/symlink-binary/-/symlink-binary-3.7.2.tgz", + "integrity": "sha512-xS7DdBXNQgfgrhBe2Jz27+S65yxBfnl+Xi+grvlqoEGVk7b8kt2VcBtui/XgL6AAaTg6f9szj4LUnwC/oX6S1Q==", + "dev": true, + "requires": { + "@lerna/create-symlink": "^3.6.0", + "@lerna/package": "^3.7.2", + "fs-extra": "^7.0.0", + "p-map": "^1.2.0" + } + }, + "@lerna/symlink-dependencies": { + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/@lerna/symlink-dependencies/-/symlink-dependencies-3.7.2.tgz", + "integrity": "sha512-53fZUGQ+QLr5P7I9/pqFmCizLo4Q/Jz5ETd1NURO2+eABGdYuTnuvtqyGku+eOr9A4gYDaVmg50KEpsOXq9TWg==", "dev": true, "requires": { - "@lerna/create-symlink": "^3.3.0", - "@lerna/package": "^3.0.0", + "@lerna/create-symlink": "^3.6.0", + "@lerna/resolve-symlink": "^3.6.0", + "@lerna/symlink-binary": "^3.7.2", "fs-extra": "^7.0.0", - "p-map": "^1.2.0", - "read-pkg": "^3.0.0" - }, - "dependencies": { - "load-json-file": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/load-json-file/-/load-json-file-4.0.0.tgz", - "integrity": "sha1-L19Fq5HjMhYjT9U62rZo607AmTs=", - "dev": true, - "requires": { - "graceful-fs": "^4.1.2", - "parse-json": "^4.0.0", - "pify": "^3.0.0", - "strip-bom": "^3.0.0" - } - }, - "parse-json": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-4.0.0.tgz", - "integrity": "sha1-vjX1Qlvh9/bHRxhPmKeIy5lHfuA=", - "dev": true, - "requires": { - "error-ex": "^1.3.1", - "json-parse-better-errors": "^1.0.1" - } - }, - "path-type": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/path-type/-/path-type-3.0.0.tgz", - "integrity": 
"sha512-T2ZUsdZFHgA3u4e5PfPbjd7HDDpxPnQb5jN0SrDsjNSuVXHJqtwTnWqG0B1jZrgmJ/7lj1EmVIByWt1gxGkWvg==", - "dev": true, - "requires": { - "pify": "^3.0.0" - } - }, - "read-pkg": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/read-pkg/-/read-pkg-3.0.0.tgz", - "integrity": "sha1-nLxoaXj+5l0WwA4rGcI3/Pbjg4k=", - "dev": true, - "requires": { - "load-json-file": "^4.0.0", - "normalize-package-data": "^2.3.2", - "path-type": "^3.0.0" - } - }, - "strip-bom": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-3.0.0.tgz", - "integrity": "sha1-IzTBjpx1n3vdVv3vfprj1YjmjtM=", - "dev": true - } - } - }, - "@lerna/symlink-dependencies": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/symlink-dependencies/-/symlink-dependencies-3.3.0.tgz", - "integrity": "sha512-IRngSNCmuD5uBKVv23tHMvr7Mplti0lKHilFKcvhbvhAfu6m/Vclxhkfs/uLyHzG+DeRpl/9o86SQET3h4XDhg==", - "dev": true, - "requires": { - "@lerna/create-symlink": "^3.3.0", - "@lerna/resolve-symlink": "^3.3.0", - "@lerna/symlink-binary": "^3.3.0", - "fs-extra": "^7.0.0", - "p-finally": "^1.0.0", + "p-finally": "^1.0.0", "p-map": "^1.2.0", "p-map-series": "^1.0.0" } @@ -1114,34 +1218,34 @@ "dev": true }, "@lerna/validation-error": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/@lerna/validation-error/-/validation-error-3.0.0.tgz", - "integrity": "sha512-5wjkd2PszV0kWvH+EOKZJWlHEqCTTKrWsvfHnHhcUaKBe/NagPZFWs+0xlsDPZ3DJt5FNfbAPAnEBQ05zLirFA==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/validation-error/-/validation-error-3.6.0.tgz", + "integrity": "sha512-MWltncGO5VgMS0QedTlZCjFUMF/evRjDMMHrtVorkIB2Cp5xy0rkKa8iDBG43qpUWeG1giwi58yUlETBcWfILw==", "dev": true, "requires": { - "npmlog": "^4.1.2" + "libnpm": "^2.0.1" } }, "@lerna/version": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/@lerna/version/-/version-3.5.0.tgz", - "integrity": "sha512-vxuGkUSfjJuvOIgPG7SDXVmk4GPwJF9F+uhDW9T/wJzTk4UaxL37GpBeJDo43eutQ7mwluP+t88Luwf8S3WXlA==", + "version": "3.8.0", + "resolved": "https://registry.npmjs.org/@lerna/version/-/version-3.8.0.tgz", + "integrity": "sha512-c+TNPzlyv0dgDpgMu87CPauk8R2jZwwftgQarHOCGbEZ0ClXqLFTEAKxvLpzprlt+kH3goIWYNQrZiJflpMOCA==", "dev": true, "requires": { - "@lerna/batch-packages": "^3.1.2", - "@lerna/check-working-tree": "^3.5.0", + "@lerna/batch-packages": "^3.6.0", + "@lerna/check-working-tree": "^3.6.0", "@lerna/child-process": "^3.3.0", - "@lerna/collect-updates": "^3.5.0", - "@lerna/command": "^3.5.0", - "@lerna/conventional-commits": "^3.5.0", - "@lerna/output": "^3.0.0", - "@lerna/prompt": "^3.3.1", - "@lerna/run-lifecycle": "^3.4.1", - "@lerna/validation-error": "^3.0.0", + "@lerna/collect-updates": "^3.6.0", + "@lerna/command": "^3.7.2", + "@lerna/conventional-commits": "^3.6.0", + "@lerna/output": "^3.6.0", + "@lerna/prompt": "^3.6.0", + "@lerna/run-lifecycle": "^3.7.1", + "@lerna/validation-error": "^3.6.0", "chalk": "^2.3.1", "dedent": "^0.7.0", + "libnpm": "^2.0.1", "minimatch": "^3.0.4", - "npmlog": "^4.1.2", "p-map": "^1.2.0", "p-pipe": "^1.2.0", "p-reduce": "^1.0.0", @@ -1152,15 +1256,24 @@ } }, "@lerna/write-log-file": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/@lerna/write-log-file/-/write-log-file-3.0.0.tgz", - "integrity": "sha512-SfbPp29lMeEVOb/M16lJwn4nnx5y+TwCdd7Uom9umd7KcZP0NOvpnX0PHehdonl7TyHZ1Xx2maklYuCLbQrd/A==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@lerna/write-log-file/-/write-log-file-3.6.0.tgz", + "integrity": 
"sha512-OkLK99V6sYXsJsYg+O9wtiFS3z6eUPaiz2e6cXJt80mfIIdI1t2dnmyua0Ib5cZWExQvx2z6Y32Wlf0MnsoNsA==", "dev": true, "requires": { - "npmlog": "^4.1.2", + "libnpm": "^2.0.1", "write-file-atomic": "^2.3.0" } }, + "@mattiasbuelens/web-streams-polyfill": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/@mattiasbuelens/web-streams-polyfill/-/web-streams-polyfill-0.2.1.tgz", + "integrity": "sha512-oKuFCQFa3W7Hj7zKn0+4ypI8JFm4ZKIoncwAC6wd5WwFW2sL7O1hpPoJdSWpynQ4DJ4lQ6MvFoVDmCLilonDFg==", + "dev": true, + "requires": { + "@types/whatwg-streams": "^0.0.7" + } + }, "@mrmlnc/readdir-enhanced": { "version": "2.2.1", "resolved": "https://registry.npmjs.org/@mrmlnc/readdir-enhanced/-/readdir-enhanced-2.2.1.tgz", @@ -1177,15 +1290,6 @@ "integrity": "sha512-shAmDyaQC4H92APFoIaVDHCx5bStIocgvbwQyxPRrbUY20V1EYTbSDchWbuwlMG3V17cprZhA6+78JfB+3DTPw==", "dev": true }, - "@samverschueren/stream-to-observable": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/@samverschueren/stream-to-observable/-/stream-to-observable-0.3.0.tgz", - "integrity": "sha512-MI4Xx6LHs4Webyvi6EbspgyAb4D2Q2VtnCQ1blOJcoLS6mVa8lNN2rkIy1CVxfTUpoyIbCTkXES1rLXztFD1lg==", - "dev": true, - "requires": { - "any-observable": "^0.3.0" - } - }, "@sindresorhus/df": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/@sindresorhus/df/-/df-2.1.0.tgz", @@ -1225,12 +1329,6 @@ } } }, - "@std/esm": { - "version": "0.26.0", - "resolved": "https://registry.npmjs.org/@std/esm/-/esm-0.26.0.tgz", - "integrity": "sha512-g3RDuosSa5fZOzENtrZdx7Gevb3zabfn8qglug2aCJIVz/4woFpKoqm1yD3mG2RD0zJEZRnkkuPHsmNglKGl7g==", - "dev": true - }, "@types/events": { "version": "1.2.0", "resolved": "http://registry.npmjs.org/@types/events/-/events-1.2.0.tgz", @@ -1263,9 +1361,9 @@ } }, "@types/handlebars": { - "version": "4.0.39", - "resolved": "https://registry.npmjs.org/@types/handlebars/-/handlebars-4.0.39.tgz", - "integrity": "sha512-vjaS7Q0dVqFp85QhyPSZqDKnTTCemcSHNHFvDdalO1s0Ifz5KuE64jQD5xoUkfdWwF4WpqdJEl7LsWH8rzhKJA==", + "version": "4.0.40", + "resolved": "https://registry.npmjs.org/@types/handlebars/-/handlebars-4.0.40.tgz", + "integrity": "sha512-sGWNtsjNrLOdKha2RV1UeF8+UbQnPSG7qbe5wwbni0mw4h2gHXyPFUMOC+xwGirIiiydM/HSqjDO4rk6NFB18w==", "dev": true }, "@types/highlight.js": { @@ -1275,15 +1373,15 @@ "dev": true }, "@types/jest": { - "version": "23.3.5", - "resolved": "https://registry.npmjs.org/@types/jest/-/jest-23.3.5.tgz", - "integrity": "sha512-3LI+vUC3Wju28vbjIjsTKakhMB8HC4l+tMz+Z8WRzVK+kmvezE5jcOvKtBpznWSI5KDLFo+FouUhpTKoekadCA==", + "version": "23.3.10", + "resolved": "https://registry.npmjs.org/@types/jest/-/jest-23.3.10.tgz", + "integrity": "sha512-DC8xTuW/6TYgvEg3HEXS7cu9OijFqprVDXXiOcdOKZCU/5PJNLZU37VVvmZHdtMiGOa8wAA/We+JzbdxFzQTRQ==", "dev": true }, "@types/lodash": { - "version": "4.14.118", - "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.14.118.tgz", - "integrity": "sha512-iiJbKLZbhSa6FYRip/9ZDX6HXhayXLDGY2Fqws9cOkEQ6XeKfaxB0sC541mowZJueYyMnVUmmG+al5/4fCDrgw==", + "version": "4.14.119", + "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.14.119.tgz", + "integrity": "sha512-Z3TNyBL8Vd/M9D9Ms2S3LmFq2sSMzahodD6rCS9V2N44HUMINb75jNkSuwAx7eo2ufqTdfOdtGQpNbieUjPQmw==", "dev": true }, "@types/marked": { @@ -1299,14 +1397,14 @@ "dev": true }, "@types/node": { - "version": "10.12.0", - "resolved": "https://registry.npmjs.org/@types/node/-/node-10.12.0.tgz", - "integrity": "sha512-3TUHC3jsBAB7qVRGxT6lWyYo2v96BMmD2PTcl47H25Lu7UXtFH/2qqmKiVrnel6Ne//0TFYf6uvNX+HW2FRkLQ==" + "version": "10.12.18", + 
"resolved": "https://registry.npmjs.org/@types/node/-/node-10.12.18.tgz", + "integrity": "sha512-fh+pAqt4xRzPfqA6eh3Z2y6fyZavRIumvjhaCL753+TVkGKGhpPeyrJG2JftD0T9q4GF00KjefsQ+PQNDdWQaQ==" }, "@types/shelljs": { - "version": "0.8.0", - "resolved": "https://registry.npmjs.org/@types/shelljs/-/shelljs-0.8.0.tgz", - "integrity": "sha512-vs1hCC8RxLHRu2bwumNyYRNrU3o8BtZhLysH5A4I98iYmA2APl6R3uNQb5ihl+WiwH0xdC9LLO+vRrXLs/Kyxg==", + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/@types/shelljs/-/shelljs-0.8.1.tgz", + "integrity": "sha512-1lQw+48BuVgp6c1+z8EMipp18IdnV2dLh6KQGwOm+kJy9nPjEkaqRKmwbDNEYf//EKBvKcwOC6V2cDrNxVoQeQ==", "dev": true, "requires": { "@types/glob": "*", @@ -1318,175 +1416,181 @@ "resolved": "https://registry.npmjs.org/@types/text-encoding-utf-8/-/text-encoding-utf-8-1.0.1.tgz", "integrity": "sha512-GpIEYaS+yNfYqpowLLziiY42pyaL+lThd/wMh6tTubaKuG4IRkXqqyxK7Nddn3BvpUg2+go3Gv/jbXvAFMRjiQ==" }, + "@types/whatwg-streams": { + "version": "0.0.7", + "resolved": "https://registry.npmjs.org/@types/whatwg-streams/-/whatwg-streams-0.0.7.tgz", + "integrity": "sha512-6sDiSEP6DWcY2ZolsJ2s39ZmsoGQ7KVwBDI3sESQsEm9P2dHTcqnDIHRZFRNtLCzWp7hCFGqYbw5GyfpQnJ01A==", + "dev": true + }, "@webassemblyjs/ast": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/ast/-/ast-1.7.10.tgz", - "integrity": "sha512-wTUeaByYN2EA6qVqhbgavtGc7fLTOx0glG2IBsFlrFG51uXIGlYBTyIZMf4SPLo3v1bgV/7lBN3l7Z0R6Hswew==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/ast/-/ast-1.7.11.tgz", + "integrity": "sha512-ZEzy4vjvTzScC+SH8RBssQUawpaInUdMTYwYYLh54/s8TuT0gBLuyUnppKsVyZEi876VmmStKsUs28UxPgdvrA==", "dev": true, "requires": { - "@webassemblyjs/helper-module-context": "1.7.10", - "@webassemblyjs/helper-wasm-bytecode": "1.7.10", - "@webassemblyjs/wast-parser": "1.7.10" + "@webassemblyjs/helper-module-context": "1.7.11", + "@webassemblyjs/helper-wasm-bytecode": "1.7.11", + "@webassemblyjs/wast-parser": "1.7.11" } }, "@webassemblyjs/floating-point-hex-parser": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/floating-point-hex-parser/-/floating-point-hex-parser-1.7.10.tgz", - "integrity": "sha512-gMsGbI6I3p/P1xL2UxqhNh1ga2HCsx5VBB2i5VvJFAaqAjd2PBTRULc3BpTydabUQEGlaZCzEUQhLoLG7TvEYQ==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/floating-point-hex-parser/-/floating-point-hex-parser-1.7.11.tgz", + "integrity": "sha512-zY8dSNyYcgzNRNT666/zOoAyImshm3ycKdoLsyDw/Bwo6+/uktb7p4xyApuef1dwEBo/U/SYQzbGBvV+nru2Xg==", "dev": true }, "@webassemblyjs/helper-api-error": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-api-error/-/helper-api-error-1.7.10.tgz", - "integrity": "sha512-DoYRlPWtuw3yd5BOr9XhtrmB6X1enYF0/54yNvQWGXZEPDF5PJVNI7zQ7gkcKfTESzp8bIBWailaFXEK/jjCsw==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-api-error/-/helper-api-error-1.7.11.tgz", + "integrity": "sha512-7r1qXLmiglC+wPNkGuXCvkmalyEstKVwcueZRP2GNC2PAvxbLYwLLPr14rcdJaE4UtHxQKfFkuDFuv91ipqvXg==", "dev": true }, "@webassemblyjs/helper-buffer": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-buffer/-/helper-buffer-1.7.10.tgz", - "integrity": "sha512-+RMU3dt/dPh4EpVX4u5jxsOlw22tp3zjqE0m3ftU2tsYxnPULb4cyHlgaNd2KoWuwasCQqn8Mhr+TTdbtj3LlA==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-buffer/-/helper-buffer-1.7.11.tgz", + "integrity": 
"sha512-MynuervdylPPh3ix+mKZloTcL06P8tenNH3sx6s0qE8SLR6DdwnfgA7Hc9NSYeob2jrW5Vql6GVlsQzKQCa13w==", "dev": true }, "@webassemblyjs/helper-code-frame": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-code-frame/-/helper-code-frame-1.7.10.tgz", - "integrity": "sha512-UiytbpKAULOEab2hUZK2ywXen4gWJVrgxtwY3Kn+eZaaSWaRM8z/7dAXRSoamhKFiBh1uaqxzE/XD9BLlug3gw==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-code-frame/-/helper-code-frame-1.7.11.tgz", + "integrity": "sha512-T8ESC9KMXFTXA5urJcyor5cn6qWeZ4/zLPyWeEXZ03hj/x9weSokGNkVCdnhSabKGYWxElSdgJ+sFa9G/RdHNw==", "dev": true, "requires": { - "@webassemblyjs/wast-printer": "1.7.10" + "@webassemblyjs/wast-printer": "1.7.11" } }, "@webassemblyjs/helper-fsm": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-fsm/-/helper-fsm-1.7.10.tgz", - "integrity": "sha512-w2vDtUK9xeSRtt5+RnnlRCI7wHEvLjF0XdnxJpgx+LJOvklTZPqWkuy/NhwHSLP19sm9H8dWxKeReMR7sCkGZA==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-fsm/-/helper-fsm-1.7.11.tgz", + "integrity": "sha512-nsAQWNP1+8Z6tkzdYlXT0kxfa2Z1tRTARd8wYnc/e3Zv3VydVVnaeePgqUzFrpkGUyhUUxOl5ML7f1NuT+gC0A==", "dev": true }, "@webassemblyjs/helper-module-context": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-module-context/-/helper-module-context-1.7.10.tgz", - "integrity": "sha512-yE5x/LzZ3XdPdREmJijxzfrf+BDRewvO0zl8kvORgSWmxpRrkqY39KZSq6TSgIWBxkK4SrzlS3BsMCv2s1FpsQ==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-module-context/-/helper-module-context-1.7.11.tgz", + "integrity": "sha512-JxfD5DX8Ygq4PvXDucq0M+sbUFA7BJAv/GGl9ITovqE+idGX+J3QSzJYz+LwQmL7fC3Rs+utvWoJxDb6pmC0qg==", "dev": true }, "@webassemblyjs/helper-wasm-bytecode": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-bytecode/-/helper-wasm-bytecode-1.7.10.tgz", - "integrity": "sha512-u5qy4SJ/OrxKxZqJ9N3qH4ZQgHaAzsopsYwLvoWJY6Q33r8PhT3VPyNMaJ7ZFoqzBnZlCcS/0f4Sp8WBxylXfg==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-bytecode/-/helper-wasm-bytecode-1.7.11.tgz", + "integrity": "sha512-cMXeVS9rhoXsI9LLL4tJxBgVD/KMOKXuFqYb5oCJ/opScWpkCMEz9EJtkonaNcnLv2R3K5jIeS4TRj/drde1JQ==", "dev": true }, "@webassemblyjs/helper-wasm-section": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-section/-/helper-wasm-section-1.7.10.tgz", - "integrity": "sha512-Ecvww6sCkcjatcyctUrn22neSJHLN/TTzolMGG/N7S9rpbsTZ8c6Bl98GpSpV77EvzNijiNRHBG0+JO99qKz6g==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-section/-/helper-wasm-section-1.7.11.tgz", + "integrity": "sha512-8ZRY5iZbZdtNFE5UFunB8mmBEAbSI3guwbrsCl4fWdfRiAcvqQpeqd5KHhSWLL5wuxo53zcaGZDBU64qgn4I4Q==", "dev": true, "requires": { - "@webassemblyjs/ast": "1.7.10", - "@webassemblyjs/helper-buffer": "1.7.10", - "@webassemblyjs/helper-wasm-bytecode": "1.7.10", - "@webassemblyjs/wasm-gen": "1.7.10" + "@webassemblyjs/ast": "1.7.11", + "@webassemblyjs/helper-buffer": "1.7.11", + "@webassemblyjs/helper-wasm-bytecode": "1.7.11", + "@webassemblyjs/wasm-gen": "1.7.11" } }, "@webassemblyjs/ieee754": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/ieee754/-/ieee754-1.7.10.tgz", - "integrity": "sha512-HRcWcY+YWt4+s/CvQn+vnSPfRaD4KkuzQFt5MNaELXXHSjelHlSEA8ZcqT69q0GTIuLWZ6JaoKar4yWHVpZHsQ==", + 
"version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/ieee754/-/ieee754-1.7.11.tgz", + "integrity": "sha512-Mmqx/cS68K1tSrvRLtaV/Lp3NZWzXtOHUW2IvDvl2sihAwJh4ACE0eL6A8FvMyDG9abes3saB6dMimLOs+HMoQ==", "dev": true, "requires": { "@xtuc/ieee754": "^1.2.0" } }, "@webassemblyjs/leb128": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/leb128/-/leb128-1.7.10.tgz", - "integrity": "sha512-og8MciYlA8hvzCLR71hCuZKPbVBfLQeHv7ImKZ4nlyxrYbG7uJHYtHiHu6OV9SqrGuD03H/HtXC4Bgdjfm9FHw==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/leb128/-/leb128-1.7.11.tgz", + "integrity": "sha512-vuGmgZjjp3zjcerQg+JA+tGOncOnJLWVkt8Aze5eWQLwTQGNgVLcyOTqgSCxWTR4J42ijHbBxnuRaL1Rv7XMdw==", "dev": true, "requires": { "@xtuc/long": "4.2.1" } }, "@webassemblyjs/utf8": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/utf8/-/utf8-1.7.10.tgz", - "integrity": "sha512-Ng6Pxv6siyZp635xCSnH3mKmIFgqWPCcGdoo0GBYgyGdxu7cUj4agV7Uu1a8REP66UYUFXJLudeGgd4RvuJAnQ==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/utf8/-/utf8-1.7.11.tgz", + "integrity": "sha512-C6GFkc7aErQIAH+BMrIdVSmW+6HSe20wg57HEC1uqJP8E/xpMjXqQUxkQw07MhNDSDcGpxI9G5JSNOQCqJk4sA==", "dev": true }, "@webassemblyjs/wasm-edit": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-edit/-/wasm-edit-1.7.10.tgz", - "integrity": "sha512-e9RZFQlb+ZuYcKRcW9yl+mqX/Ycj9+3/+ppDI8nEE/NCY6FoK8f3dKBcfubYV/HZn44b+ND4hjh+4BYBt+sDnA==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-edit/-/wasm-edit-1.7.11.tgz", + "integrity": "sha512-FUd97guNGsCZQgeTPKdgxJhBXkUbMTY6hFPf2Y4OedXd48H97J+sOY2Ltaq6WGVpIH8o/TGOVNiVz/SbpEMJGg==", "dev": true, "requires": { - "@webassemblyjs/ast": "1.7.10", - "@webassemblyjs/helper-buffer": "1.7.10", - "@webassemblyjs/helper-wasm-bytecode": "1.7.10", - "@webassemblyjs/helper-wasm-section": "1.7.10", - "@webassemblyjs/wasm-gen": "1.7.10", - "@webassemblyjs/wasm-opt": "1.7.10", - "@webassemblyjs/wasm-parser": "1.7.10", - "@webassemblyjs/wast-printer": "1.7.10" + "@webassemblyjs/ast": "1.7.11", + "@webassemblyjs/helper-buffer": "1.7.11", + "@webassemblyjs/helper-wasm-bytecode": "1.7.11", + "@webassemblyjs/helper-wasm-section": "1.7.11", + "@webassemblyjs/wasm-gen": "1.7.11", + "@webassemblyjs/wasm-opt": "1.7.11", + "@webassemblyjs/wasm-parser": "1.7.11", + "@webassemblyjs/wast-printer": "1.7.11" } }, "@webassemblyjs/wasm-gen": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-gen/-/wasm-gen-1.7.10.tgz", - "integrity": "sha512-M0lb6cO2Y0PzDye/L39PqwV+jvO+2YxEG5ax+7dgq7EwXdAlpOMx1jxyXJTScQoeTpzOPIb+fLgX/IkLF8h2yw==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-gen/-/wasm-gen-1.7.11.tgz", + "integrity": "sha512-U/KDYp7fgAZX5KPfq4NOupK/BmhDc5Kjy2GIqstMhvvdJRcER/kUsMThpWeRP8BMn4LXaKhSTggIJPOeYHwISA==", "dev": true, "requires": { - "@webassemblyjs/ast": "1.7.10", - "@webassemblyjs/helper-wasm-bytecode": "1.7.10", - "@webassemblyjs/ieee754": "1.7.10", - "@webassemblyjs/leb128": "1.7.10", - "@webassemblyjs/utf8": "1.7.10" + "@webassemblyjs/ast": "1.7.11", + "@webassemblyjs/helper-wasm-bytecode": "1.7.11", + "@webassemblyjs/ieee754": "1.7.11", + "@webassemblyjs/leb128": "1.7.11", + "@webassemblyjs/utf8": "1.7.11" } }, "@webassemblyjs/wasm-opt": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-opt/-/wasm-opt-1.7.10.tgz", - "integrity": 
"sha512-R66IHGCdicgF5ZliN10yn5HaC7vwYAqrSVJGjtJJQp5+QNPBye6heWdVH/at40uh0uoaDN/UVUfXK0gvuUqtVg==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-opt/-/wasm-opt-1.7.11.tgz", + "integrity": "sha512-XynkOwQyiRidh0GLua7SkeHvAPXQV/RxsUeERILmAInZegApOUAIJfRuPYe2F7RcjOC9tW3Cb9juPvAC/sCqvg==", "dev": true, "requires": { - "@webassemblyjs/ast": "1.7.10", - "@webassemblyjs/helper-buffer": "1.7.10", - "@webassemblyjs/wasm-gen": "1.7.10", - "@webassemblyjs/wasm-parser": "1.7.10" + "@webassemblyjs/ast": "1.7.11", + "@webassemblyjs/helper-buffer": "1.7.11", + "@webassemblyjs/wasm-gen": "1.7.11", + "@webassemblyjs/wasm-parser": "1.7.11" } }, "@webassemblyjs/wasm-parser": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-parser/-/wasm-parser-1.7.10.tgz", - "integrity": "sha512-AEv8mkXVK63n/iDR3T693EzoGPnNAwKwT3iHmKJNBrrALAhhEjuPzo/lTE4U7LquEwyvg5nneSNdTdgrBaGJcA==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-parser/-/wasm-parser-1.7.11.tgz", + "integrity": "sha512-6lmXRTrrZjYD8Ng8xRyvyXQJYUQKYSXhJqXOBLw24rdiXsHAOlvw5PhesjdcaMadU/pyPQOJ5dHreMjBxwnQKg==", "dev": true, "requires": { - "@webassemblyjs/ast": "1.7.10", - "@webassemblyjs/helper-api-error": "1.7.10", - "@webassemblyjs/helper-wasm-bytecode": "1.7.10", - "@webassemblyjs/ieee754": "1.7.10", - "@webassemblyjs/leb128": "1.7.10", - "@webassemblyjs/utf8": "1.7.10" + "@webassemblyjs/ast": "1.7.11", + "@webassemblyjs/helper-api-error": "1.7.11", + "@webassemblyjs/helper-wasm-bytecode": "1.7.11", + "@webassemblyjs/ieee754": "1.7.11", + "@webassemblyjs/leb128": "1.7.11", + "@webassemblyjs/utf8": "1.7.11" } }, "@webassemblyjs/wast-parser": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/wast-parser/-/wast-parser-1.7.10.tgz", - "integrity": "sha512-YTPEtOBljkCL0VjDp4sHe22dAYSm3ZwdJ9+2NTGdtC7ayNvuip1wAhaAS8Zt9Q6SW9E5Jf5PX7YE3XWlrzR9cw==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wast-parser/-/wast-parser-1.7.11.tgz", + "integrity": "sha512-lEyVCg2np15tS+dm7+JJTNhNWq9yTZvi3qEhAIIOaofcYlUp0UR5/tVqOwa/gXYr3gjwSZqw+/lS9dscyLelbQ==", "dev": true, "requires": { - "@webassemblyjs/ast": "1.7.10", - "@webassemblyjs/floating-point-hex-parser": "1.7.10", - "@webassemblyjs/helper-api-error": "1.7.10", - "@webassemblyjs/helper-code-frame": "1.7.10", - "@webassemblyjs/helper-fsm": "1.7.10", + "@webassemblyjs/ast": "1.7.11", + "@webassemblyjs/floating-point-hex-parser": "1.7.11", + "@webassemblyjs/helper-api-error": "1.7.11", + "@webassemblyjs/helper-code-frame": "1.7.11", + "@webassemblyjs/helper-fsm": "1.7.11", "@xtuc/long": "4.2.1" } }, "@webassemblyjs/wast-printer": { - "version": "1.7.10", - "resolved": "https://registry.npmjs.org/@webassemblyjs/wast-printer/-/wast-printer-1.7.10.tgz", - "integrity": "sha512-mJ3QKWtCchL1vhU/kZlJnLPuQZnlDOdZsyP0bbLWPGdYsQDnSBvyTLhzwBA3QAMlzEL9V4JHygEmK6/OTEyytA==", + "version": "1.7.11", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wast-printer/-/wast-printer-1.7.11.tgz", + "integrity": "sha512-m5vkAsuJ32QpkdkDOUPGSltrg8Cuk3KBx4YrmAGQwCZPRdUHXxG4phIOuuycLemHFr74sWL9Wthqss4fzdzSwg==", "dev": true, "requires": { - "@webassemblyjs/ast": "1.7.10", - "@webassemblyjs/wast-parser": "1.7.10", + "@webassemblyjs/ast": "1.7.11", + "@webassemblyjs/wast-parser": "1.7.11", "@xtuc/long": "4.2.1" } }, @@ -1550,17 +1654,17 @@ }, "dependencies": { "acorn": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-6.0.2.tgz", - 
"integrity": "sha512-GXmKIvbrN3TV7aVqAzVFaMW8F8wzVX7voEBRO3bDA64+EX37YSayggRJP5Xig6HYHBkWKpFg9W5gg6orklubhg==", + "version": "6.0.4", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-6.0.4.tgz", + "integrity": "sha512-VY4i5EKSKkofY2I+6QLTbTTN/UvEQPCo6eiwzzSaSWfpaDhOmStMCMod6wmuPciNq+XS0faCglFu2lHZpdHUtg==", "dev": true } } }, "acorn-walk": { - "version": "6.1.0", - "resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-6.1.0.tgz", - "integrity": "sha512-ugTb7Lq7u4GfWSqqpwE0bGyoBZNMTok/zDBXxfEG0QM50jNlGhIWjRC1pPN7bvV1anhF+bs+/gNcRw+o55Evbg==", + "version": "6.1.1", + "resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-6.1.1.tgz", + "integrity": "sha512-OtUw6JUTgxA2QoqqmrmQ7F2NYqiBPi/L2jqHyFtllhOUvXYQXf0Z1CYUinIfyT4bTCGmrA7gX9FvHA81uzCoVw==", "dev": true }, "agent-base": { @@ -1582,21 +1686,21 @@ } }, "ajv": { - "version": "5.5.2", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-5.5.2.tgz", - "integrity": "sha1-c7Xuyj+rZT49P5Qis0GtQiBdyWU=", + "version": "6.6.2", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.6.2.tgz", + "integrity": "sha512-FBHEW6Jf5TB9MGBgUUA9XHkTbjXYfAUjY43ACMfmdMRHniyoMHjHjzD50OK8LGDWQwp4rWEsIq5kEqq7rvIM1g==", "dev": true, "requires": { - "co": "^4.6.0", - "fast-deep-equal": "^1.0.0", + "fast-deep-equal": "^2.0.1", "fast-json-stable-stringify": "^2.0.0", - "json-schema-traverse": "^0.3.0" + "json-schema-traverse": "^0.4.1", + "uri-js": "^4.2.2" } }, "ajv-errors": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/ajv-errors/-/ajv-errors-1.0.0.tgz", - "integrity": "sha1-7PAh+hCP0X37Xms4Py3SM+Mf/Fk=", + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/ajv-errors/-/ajv-errors-1.0.1.tgz", + "integrity": "sha512-DCRfO/4nQ+89p/RK43i8Ezd41EqdGIU4ld7nGF8OQ14oc/we5rEntLCUa7+jrn3nn83BosfwZA0wb4pon2o8iQ==", "dev": true }, "ajv-keywords": { @@ -1606,14 +1710,17 @@ "dev": true }, "ansi-colors": { - "version": "2.0.5", - "resolved": "https://registry.npmjs.org/ansi-colors/-/ansi-colors-2.0.5.tgz", - "integrity": "sha512-yAdfUZ+c2wetVNIFsNRn44THW+Lty6S5TwMpUfLA/UaGhiXbBv/F8E60/1hMLd0cnF/CDoWH8vzVaI5bAcHCjw==", - "dev": true + "version": "1.1.0", + "resolved": "http://registry.npmjs.org/ansi-colors/-/ansi-colors-1.1.0.tgz", + "integrity": "sha512-SFKX67auSNoVR38N3L+nvsPjOE0bybKTYbkf5tRvushrAPQ9V75huw0ZxBkKVeRU9kqH3d6HA4xTckbwZ4ixmA==", + "dev": true, + "requires": { + "ansi-wrap": "^0.1.0" + } }, "ansi-escapes": { "version": "3.1.0", - "resolved": "https://registry.npmjs.org/ansi-escapes/-/ansi-escapes-3.1.0.tgz", + "resolved": "http://registry.npmjs.org/ansi-escapes/-/ansi-escapes-3.1.0.tgz", "integrity": "sha512-UgAb8H9D41AQnu/PbWlCofQVcnV4Gs2bBJi9eZPxfU/hgglFh3SMDMENRIqdr7H6XFnXdoknctFByVsCOotTVw==", "dev": true }, @@ -1646,20 +1753,14 @@ "integrity": "sha1-qCJQ3bABXponyoLoLqYDu/pF768=", "dev": true }, - "any-observable": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/any-observable/-/any-observable-0.3.0.tgz", - "integrity": "sha512-/FQM1EDkTsf63Ub2C6O7GuYFDsSXUwsaZDurV0np41ocwq0jthUAYCmhBX9f+KwlaCgIuWyr/4WlUQUBfKfZog==", - "dev": true - }, "anymatch": { - "version": "1.3.2", - "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-1.3.2.tgz", - "integrity": "sha512-0XNayC8lTHQ2OI8aljNCN3sSx6hsr/1+rlcDAotXJR7C1oZZHCNsfpbKwMjRA3Uqb5tF1Rae2oloTr4xpq+WjA==", + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-2.0.0.tgz", + "integrity": "sha512-5teOsQWABXHHBFP9y3skS5P3d/WfWXpv3FUpy+LorMrNYaT9pI4oLMQX7jzQ2KklNpGpWHzdCXTDT2Y3XGlZBw==", "dev": true, "requires": { - 
"micromatch": "^2.1.5", - "normalize-path": "^2.0.0" + "micromatch": "^3.1.4", + "normalize-path": "^2.1.1" } }, "append-buffer": { @@ -1721,13 +1822,10 @@ } }, "arr-diff": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-2.0.0.tgz", - "integrity": "sha1-jzuCf5Vai9ZpaX5KQlasPOrjVs8=", - "dev": true, - "requires": { - "arr-flatten": "^1.0.1" - } + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", + "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", + "dev": true }, "arr-filter": { "version": "1.1.2", @@ -1781,7 +1879,7 @@ }, "array-equal": { "version": "1.0.0", - "resolved": "https://registry.npmjs.org/array-equal/-/array-equal-1.0.0.tgz", + "resolved": "http://registry.npmjs.org/array-equal/-/array-equal-1.0.0.tgz", "integrity": "sha1-jCpe8kcv2ep0KwTHenUJO6J1fJM=", "dev": true }, @@ -1891,9 +1989,9 @@ "dev": true }, "array-unique": { - "version": "0.2.1", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.2.1.tgz", - "integrity": "sha1-odl8yvy8JiXMcPrc6zalDFiwGlM=", + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", + "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", "dev": true }, "arrify": { @@ -1991,14 +2089,6 @@ "once": "^1.3.2", "process-nextick-args": "^1.0.7", "stream-exhaust": "^1.0.1" - }, - "dependencies": { - "process-nextick-args": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-1.0.7.tgz", - "integrity": "sha1-FQ4gt1ZZCtP5EJPyWk8q2L/zC6M=", - "dev": true - } } }, "async-each": { @@ -2078,7 +2168,7 @@ }, "supports-color": { "version": "2.0.0", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-2.0.0.tgz", + "resolved": "http://registry.npmjs.org/supports-color/-/supports-color-2.0.0.tgz", "integrity": "sha1-U10EXOa2Nj+kARcIRimZXp3zJMc=", "dev": true } @@ -2166,6 +2256,17 @@ "find-up": "^2.1.0", "istanbul-lib-instrument": "^1.10.1", "test-exclude": "^4.2.1" + }, + "dependencies": { + "find-up": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-2.1.0.tgz", + "integrity": "sha1-RdG35QbHF93UgndaK3eSCjwMV6c=", + "dev": true, + "requires": { + "locate-path": "^2.0.0" + } + } } }, "babel-plugin-jest-hoist": { @@ -2180,28 +2281,6 @@ "integrity": "sha1-/WU28rzhODb/o6VFjEkDpZe7O/U=", "dev": true }, - "babel-plugin-transform-es2015-modules-commonjs": { - "version": "6.26.2", - "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-modules-commonjs/-/babel-plugin-transform-es2015-modules-commonjs-6.26.2.tgz", - "integrity": "sha512-CV9ROOHEdrjcwhIaJNBGMBCodN+1cfkwtM1SbUHmvyy35KGT7fohbpOxkE2uLz1o6odKK2Ck/tz47z+VqQfi9Q==", - "dev": true, - "requires": { - "babel-plugin-transform-strict-mode": "^6.24.1", - "babel-runtime": "^6.26.0", - "babel-template": "^6.26.0", - "babel-types": "^6.26.0" - } - }, - "babel-plugin-transform-strict-mode": { - "version": "6.24.1", - "resolved": "https://registry.npmjs.org/babel-plugin-transform-strict-mode/-/babel-plugin-transform-strict-mode-6.24.1.tgz", - "integrity": "sha1-1fr3qleKZbvlkc9e2uBKDGcCB1g=", - "dev": true, - "requires": { - "babel-runtime": "^6.22.0", - "babel-types": "^6.24.1" - } - }, "babel-preset-jest": { "version": "23.2.0", "resolved": "https://registry.npmjs.org/babel-preset-jest/-/babel-preset-jest-23.2.0.tgz", @@ -2360,18 +2439,6 @@ "is-data-descriptor": "^1.0.0", "kind-of": "^6.0.2" } - }, - "isobject": { - "version": "3.0.1", - "resolved": 
"https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true } } }, @@ -2390,12 +2457,6 @@ "tweetnacl": "^0.14.3" } }, - "beeper": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/beeper/-/beeper-1.1.1.tgz", - "integrity": "sha1-5tXqjF2tABMEpwsiY4RH9pyy+Ak=", - "dev": true - }, "benchmark": { "version": "2.1.4", "resolved": "https://registry.npmjs.org/benchmark/-/benchmark-2.1.4.tgz", @@ -2407,11 +2468,24 @@ } }, "big.js": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/big.js/-/big.js-3.2.0.tgz", - "integrity": "sha512-+hN/Zh2D08Mx65pZ/4g5bsmNiZUuChDiQfTUQ7qJr4/kuopCr88xZsAXv6mBoZEsUI4OuGHlX59qE94K2mMW8Q==", + "version": "5.2.2", + "resolved": "https://registry.npmjs.org/big.js/-/big.js-5.2.2.tgz", + "integrity": "sha512-vyL2OymJxmarO8gxMr0mhChsO9QGwhynfuu4+MHTAW6czfq9humCB7rKpUjDd9YUiDPU4mzpyupFSvOClAwbmQ==", "dev": true }, + "bin-links": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/bin-links/-/bin-links-1.1.2.tgz", + "integrity": "sha512-8eEHVgYP03nILphilltWjeIjMbKyJo3wvp9K816pHbhP301ismzw15mxAAEVQ/USUwcP++1uNrbERbp8lOA6Fg==", + "dev": true, + "requires": { + "bluebird": "^3.5.0", + "cmd-shim": "^2.0.2", + "gentle-fs": "^2.0.0", + "graceful-fs": "^4.1.11", + "write-file-atomic": "^2.3.0" + } + }, "binary-extensions": { "version": "1.12.0", "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-1.12.0.tgz", @@ -2428,9 +2502,9 @@ } }, "bluebird": { - "version": "3.5.2", - "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.5.2.tgz", - "integrity": "sha512-dhHTWMI7kMx5whMQntl7Vr9C6BvV10lFXDAasnqnrMYhXVCzzk6IO9Fo2L75jXHT07WrOngL1WDXOp+yYS91Yg==", + "version": "3.5.3", + "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.5.3.tgz", + "integrity": "sha512-/qKPUQlaW1OyR51WeCPBvRnAlnZFUJkCSG5HzGnuIqhgyJtF+T94lFnn33eiazjRm2LAHVy2guNnaq48X9SJuw==", "dev": true }, "bn.js": { @@ -2450,14 +2524,32 @@ } }, "braces": { - "version": "1.8.5", - "resolved": "https://registry.npmjs.org/braces/-/braces-1.8.5.tgz", - "integrity": "sha1-uneWLhLf+WnWt2cR6RS3N4V79qc=", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", + "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", "dev": true, "requires": { - "expand-range": "^1.8.1", - "preserve": "^0.2.0", - "repeat-element": "^1.1.2" + "arr-flatten": "^1.1.0", + "array-unique": "^0.3.2", + "extend-shallow": "^2.0.1", + "fill-range": "^4.0.0", + "isobject": "^3.0.1", + "repeat-element": "^1.1.2", + "snapdragon": "^0.8.1", + "snapdragon-node": "^2.0.1", + "split-string": "^3.0.2", + "to-regex": "^3.0.1" + }, + "dependencies": { + "extend-shallow": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", + "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", + "dev": true, + "requires": { + "is-extendable": "^0.1.0" + } + } } }, "brorand": { @@ -2483,7 +2575,7 @@ "dependencies": { "resolve": { "version": "1.1.7", - "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.1.7.tgz", + "resolved": "http://registry.npmjs.org/resolve/-/resolve-1.1.7.tgz", "integrity": "sha1-IDEU2CrSxe2ejgQRs5ModeiJ6Xs=", "dev": true } @@ 
-2560,6 +2652,15 @@ "pako": "~1.0.5" } }, + "bs-logger": { + "version": "0.2.6", + "resolved": "https://registry.npmjs.org/bs-logger/-/bs-logger-0.2.6.tgz", + "integrity": "sha512-pd8DCoxmbgc7hyPKOvxtqNcjYoOsABPQdcCUjGp3d42VR2CX1ORhk2A87oqqu5R1kk+76nsxZupkmyd+MVtCog==", + "dev": true, + "requires": { + "fast-json-stable-stringify": "2.x" + } + }, "bser": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/bser/-/bser-2.0.0.tgz", @@ -2629,32 +2730,47 @@ "dev": true }, "cacache": { - "version": "11.2.0", - "resolved": "https://registry.npmjs.org/cacache/-/cacache-11.2.0.tgz", - "integrity": "sha512-IFWl6lfK6wSeYCHUXh+N1lY72UDrpyrYQJNIVQf48paDuWbv5RbAtJYf/4gUQFObTCHZwdZ5sI8Iw7nqwP6nlQ==", + "version": "11.3.2", + "resolved": "https://registry.npmjs.org/cacache/-/cacache-11.3.2.tgz", + "integrity": "sha512-E0zP4EPGDOaT2chM08Als91eYnf8Z+eH1awwwVsngUmgppfM5jjJ8l3z5vO5p5w/I3LsiXawb1sW0VY65pQABg==", "dev": true, "requires": { - "bluebird": "^3.5.1", - "chownr": "^1.0.1", - "figgy-pudding": "^3.1.0", - "glob": "^7.1.2", - "graceful-fs": "^4.1.11", - "lru-cache": "^4.1.3", + "bluebird": "^3.5.3", + "chownr": "^1.1.1", + "figgy-pudding": "^3.5.1", + "glob": "^7.1.3", + "graceful-fs": "^4.1.15", + "lru-cache": "^5.1.1", "mississippi": "^3.0.0", "mkdirp": "^0.5.1", "move-concurrently": "^1.0.1", "promise-inflight": "^1.0.1", "rimraf": "^2.6.2", - "ssri": "^6.0.0", - "unique-filename": "^1.1.0", + "ssri": "^6.0.1", + "unique-filename": "^1.1.1", "y18n": "^4.0.0" }, "dependencies": { + "lru-cache": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", + "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==", + "dev": true, + "requires": { + "yallist": "^3.0.2" + } + }, "y18n": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/y18n/-/y18n-4.0.0.tgz", "integrity": "sha512-r9S/ZyXu/Xu9q1tYlpsLIsa3EeLXXk0VwlxqTcFRfg9EhMW+17kbt9G0NrgCmhGb5vT2hyhJZLfDGx+7+5Uj/w==", "dev": true + }, + "yallist": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.0.3.tgz", + "integrity": "sha512-S+Zk8DEWE6oKpV+vI3qWkaK+jSbIK86pCwe2IF/xwIpQ8jEuxpw9NyaGjmp9+BoJv5FV2piqCDcoCtStppiq2A==", + "dev": true } } }, @@ -2673,14 +2789,6 @@ "to-object-path": "^0.3.0", "union-value": "^1.0.0", "unset-value": "^1.0.0" - }, - "dependencies": { - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - } } }, "call-me-maybe": { @@ -2689,9 +2797,27 @@ "integrity": "sha1-JtII6onje1y95gJQoV8DHBak1ms=", "dev": true }, + "caller-callsite": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/caller-callsite/-/caller-callsite-2.0.0.tgz", + "integrity": "sha1-hH4PzgoiN1CpoCfFSzNzGtMVQTQ=", + "dev": true, + "requires": { + "callsites": "^2.0.0" + } + }, + "caller-path": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/caller-path/-/caller-path-2.0.0.tgz", + "integrity": "sha1-Ro+DBE42mrIBD6xfBs7uFbsssfQ=", + "dev": true, + "requires": { + "caller-callsite": "^2.0.0" + } + }, "callsites": { "version": "2.0.0", - "resolved": "https://registry.npmjs.org/callsites/-/callsites-2.0.0.tgz", + "resolved": "http://registry.npmjs.org/callsites/-/callsites-2.0.0.tgz", "integrity": "sha1-BuuE8A7qQT2oav/vrL/7Ngk7PFA=", "dev": true }, @@ -2752,20 +2878,24 @@ "dev": true }, "chokidar": { - "version": "1.7.0", - "resolved": 
"https://registry.npmjs.org/chokidar/-/chokidar-1.7.0.tgz", - "integrity": "sha1-eY5ol3gVHIB2tLNg5e3SjNortGg=", + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-2.0.4.tgz", + "integrity": "sha512-z9n7yt9rOvIJrMhvDtDictKrkFHeihkNl6uWMmZlmL6tJtX9Cs+87oK+teBx+JIgzvbX3yZHT3eF8vpbDxHJXQ==", "dev": true, "requires": { - "anymatch": "^1.3.0", + "anymatch": "^2.0.0", "async-each": "^1.0.0", - "fsevents": "^1.0.0", - "glob-parent": "^2.0.0", + "braces": "^2.3.0", + "fsevents": "^1.2.2", + "glob-parent": "^3.1.0", "inherits": "^2.0.1", "is-binary-path": "^1.0.0", - "is-glob": "^2.0.0", + "is-glob": "^4.0.0", + "lodash.debounce": "^4.0.8", + "normalize-path": "^2.1.1", "path-is-absolute": "^1.0.0", - "readdirp": "^2.0.0" + "readdirp": "^2.0.0", + "upath": "^1.0.5" } }, "chownr": { @@ -2819,12 +2949,6 @@ "requires": { "is-descriptor": "^0.1.0" } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true } } }, @@ -2837,16 +2961,6 @@ "restore-cursor": "^2.0.0" } }, - "cli-truncate": { - "version": "0.2.1", - "resolved": "https://registry.npmjs.org/cli-truncate/-/cli-truncate-0.2.1.tgz", - "integrity": "sha1-nxXPuwcFAFNpIWxiasfQWrkN1XQ=", - "dev": true, - "requires": { - "slice-ansi": "0.0.4", - "string-width": "^1.0.1" - } - }, "cli-width": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/cli-width/-/cli-width-2.2.0.tgz", @@ -2891,6 +3005,14 @@ "inherits": "^2.0.1", "process-nextick-args": "^2.0.0", "readable-stream": "^2.3.5" + }, + "dependencies": { + "process-nextick-args": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz", + "integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw==", + "dev": true + } } }, "cmd-shim": { @@ -2924,17 +3046,6 @@ "arr-map": "^2.0.2", "for-own": "^1.0.0", "make-iterator": "^1.0.0" - }, - "dependencies": { - "for-own": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/for-own/-/for-own-1.0.0.tgz", - "integrity": "sha1-xjMy9BXO3EsE2/5wz4NklMU8tEs=", - "dev": true, - "requires": { - "for-in": "^1.0.1" - } - } } }, "collection-visit": { @@ -3055,7 +3166,7 @@ }, "concat-stream": { "version": "1.6.2", - "resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz", + "resolved": "http://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz", "integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==", "dev": true, "requires": { @@ -3127,11 +3238,14 @@ "through2": "^2.0.0" }, "dependencies": { - "dateformat": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/dateformat/-/dateformat-3.0.3.tgz", - "integrity": "sha512-jyCETtSl3VMZMWeRo7iY1FL19ges1t55hMo5yaam4Jrsm5EPL89UQkoQRyiI+Yf4k8r2ZpdngkV8hr1lIdjb3Q==", - "dev": true + "find-up": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-2.1.0.tgz", + "integrity": "sha1-RdG35QbHF93UgndaK3eSCjwMV6c=", + "dev": true, + "requires": { + "locate-path": "^2.0.0" + } }, "load-json-file": { "version": "4.0.0", @@ -3215,14 +3329,6 @@ "semver": "^5.5.0", "split": "^1.0.0", "through2": "^2.0.0" - }, - "dependencies": { - "dateformat": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/dateformat/-/dateformat-3.0.3.tgz", - "integrity": 
"sha512-jyCETtSl3VMZMWeRo7iY1FL19ges1t55hMo5yaam4Jrsm5EPL89UQkoQRyiI+Yf4k8r2ZpdngkV8hr1lIdjb3Q==", - "dev": true - } } }, "conventional-commits-filter": { @@ -3306,9 +3412,9 @@ } }, "core-js": { - "version": "2.5.7", - "resolved": "https://registry.npmjs.org/core-js/-/core-js-2.5.7.tgz", - "integrity": "sha512-RszJCAxg/PP6uzXVXL6BsxSXx/B05oJAQ2vkJRjyjrEcNVycaqOmNb5OTxZPE3xa5gwZduqza6L9JOCenh/Ecw==", + "version": "2.6.1", + "resolved": "https://registry.npmjs.org/core-js/-/core-js-2.6.1.tgz", + "integrity": "sha512-L72mmmEayPJBejKIWe2pYtGis5r0tQ5NaJekdhyXgeMQTpJoBsH0NL4ElY2LfSoV15xeQWKQ+XTTOZdyero5Xg==", "dev": true }, "core-util-is": { @@ -3318,11 +3424,12 @@ "dev": true }, "cosmiconfig": { - "version": "5.0.6", - "resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-5.0.6.tgz", - "integrity": "sha512-6DWfizHriCrFWURP1/qyhsiFvYdlJzbCzmtFWh744+KyWsJo5+kPzUZZaMRSSItoYc0pxFX7gEO7ZC1/gN/7AQ==", + "version": "5.0.7", + "resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-5.0.7.tgz", + "integrity": "sha512-PcLqxTKiDmNT6pSpy4N6KtuPwb53W+2tzNvwOZw0WH9N6O0vLIBq0x8aj8Oj75ere4YcGi48bDFCL+3fRJdlNA==", "dev": true, "requires": { + "import-fresh": "^2.0.0", "is-directory": "^0.3.1", "js-yaml": "^3.9.0", "parse-json": "^4.0.0" @@ -3354,25 +3461,6 @@ "request": "^2.85.0" } }, - "cpx": { - "version": "1.5.0", - "resolved": "https://registry.npmjs.org/cpx/-/cpx-1.5.0.tgz", - "integrity": "sha1-GFvgGFEdhycN7czCkxceN2VauI8=", - "dev": true, - "requires": { - "babel-runtime": "^6.9.2", - "chokidar": "^1.6.0", - "duplexer": "^0.1.1", - "glob": "^7.0.5", - "glob2base": "^0.0.12", - "minimatch": "^3.0.2", - "mkdirp": "^0.5.1", - "resolve": "^1.1.7", - "safe-buffer": "^5.0.1", - "shell-quote": "^1.6.1", - "subarg": "^1.0.0" - } - }, "create-ecdh": { "version": "4.0.3", "resolved": "https://registry.npmjs.org/create-ecdh/-/create-ecdh-4.0.3.tgz", @@ -3551,12 +3639,6 @@ } } }, - "date-fns": { - "version": "1.29.0", - "resolved": "https://registry.npmjs.org/date-fns/-/date-fns-1.29.0.tgz", - "integrity": "sha512-lbTXWZ6M20cWH8N9S6afb0SBm6tMk+uUg6z3MqHPKE9atmsY3kJkTm8vKe93izJ2B2+q5MV990sM2CHgtAZaOw==", - "dev": true - }, "date-now": { "version": "0.1.4", "resolved": "https://registry.npmjs.org/date-now/-/date-now-0.1.4.tgz", @@ -3564,9 +3646,9 @@ "dev": true }, "dateformat": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/dateformat/-/dateformat-2.2.0.tgz", - "integrity": "sha1-QGXiATz5+5Ft39gu+1Bq1MZ2kGI=", + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/dateformat/-/dateformat-3.0.3.tgz", + "integrity": "sha512-jyCETtSl3VMZMWeRo7iY1FL19ges1t55hMo5yaam4Jrsm5EPL89UQkoQRyiI+Yf4k8r2ZpdngkV8hr1lIdjb3Q==", "dev": true }, "debug": { @@ -3755,18 +3837,6 @@ "is-data-descriptor": "^1.0.0", "kind-of": "^6.0.2" } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true } } }, @@ -3905,41 +3975,6 @@ "integrity": "sha1-rOb/gIwc5mtX0ev5eXessCM0z8E=", "dev": true }, - "duplexer2": { - "version": "0.0.2", - "resolved": "https://registry.npmjs.org/duplexer2/-/duplexer2-0.0.2.tgz", - "integrity": "sha1-xhTc9n4vsUmVqRcR5aYX6KYKMds=", - "dev": true, - "requires": { - "readable-stream": "~1.1.9" - }, - "dependencies": 
{ - "isarray": { - "version": "0.0.1", - "resolved": "https://registry.npmjs.org/isarray/-/isarray-0.0.1.tgz", - "integrity": "sha1-ihis/Kmo9Bd+Cav8YDiTmwXR7t8=", - "dev": true - }, - "readable-stream": { - "version": "1.1.14", - "resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-1.1.14.tgz", - "integrity": "sha1-fPTFTvZI44EwhMY23SB54WbAgdk=", - "dev": true, - "requires": { - "core-util-is": "~1.0.0", - "inherits": "~2.0.1", - "isarray": "0.0.1", - "string_decoder": "~0.10.x" - } - }, - "string_decoder": { - "version": "0.10.31", - "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz", - "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=", - "dev": true - } - } - }, "duplexify": { "version": "3.6.1", "resolved": "https://registry.npmjs.org/duplexify/-/duplexify-3.6.1.tgz", @@ -3972,12 +4007,6 @@ "safer-buffer": "^2.1.0" } }, - "elegant-spinner": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/elegant-spinner/-/elegant-spinner-1.0.1.tgz", - "integrity": "sha1-2wQ1IcldfjA/2PNFvtwzSc+wcp4=", - "dev": true - }, "elliptic": { "version": "6.4.1", "resolved": "https://registry.npmjs.org/elliptic/-/elliptic-6.4.1.tgz", @@ -4190,6 +4219,12 @@ "estraverse": "^4.1.1" } }, + "esm": { + "version": "3.0.84", + "resolved": "https://registry.npmjs.org/esm/-/esm-3.0.84.tgz", + "integrity": "sha512-SzSGoZc17S7P+12R9cg21Bdb7eybX25RnIeRZ80xZs+VZ3kdQKzqTp2k4hZJjR7p9l0186TTXSgrxzlMDBktlw==", + "dev": true + }, "esprima": { "version": "4.0.1", "resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.1.tgz", @@ -4273,28 +4308,90 @@ "integrity": "sha1-BjJjj42HfMghB9MKD/8aF8uhzQw=", "dev": true }, - "exit-hook": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/exit-hook/-/exit-hook-1.1.1.tgz", - "integrity": "sha1-8FyiM7SMBdVP/wd2XfhQfpXAL/g=", - "dev": true - }, "expand-brackets": { - "version": "0.1.5", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-0.1.5.tgz", - "integrity": "sha1-3wcoTjQqgHzXM6xa9yQR5YHRF3s=", + "version": "2.1.4", + "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", + "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", "dev": true, "requires": { - "is-posix-bracket": "^0.1.0" + "debug": "^2.3.3", + "define-property": "^0.2.5", + "extend-shallow": "^2.0.1", + "posix-character-classes": "^0.1.0", + "regex-not": "^1.0.0", + "snapdragon": "^0.8.1", + "to-regex": "^3.0.1" + }, + "dependencies": { + "define-property": { + "version": "0.2.5", + "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", + "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", + "dev": true, + "requires": { + "is-descriptor": "^0.1.0" + } + }, + "extend-shallow": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", + "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", + "dev": true, + "requires": { + "is-extendable": "^0.1.0" + } + } } }, "expand-range": { "version": "1.8.2", - "resolved": "https://registry.npmjs.org/expand-range/-/expand-range-1.8.2.tgz", + "resolved": "http://registry.npmjs.org/expand-range/-/expand-range-1.8.2.tgz", "integrity": "sha1-opnv/TNf4nIeuujiV+x5ZE/IUzc=", "dev": true, "requires": { "fill-range": "^2.1.0" + }, + "dependencies": { + "fill-range": { + "version": "2.2.4", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-2.2.4.tgz", + "integrity": "sha512-cnrcCbj01+j2gTG921VZPnHbjmdAf8oQV/iGeV2kZxGSyfYjjTyY79ErsK1WJWMpw6DaApEX72binqJE+/d+5Q==", + "dev": 
true, + "requires": { + "is-number": "^2.1.0", + "isobject": "^2.0.0", + "randomatic": "^3.0.0", + "repeat-element": "^1.1.2", + "repeat-string": "^1.5.2" + } + }, + "is-number": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-2.1.0.tgz", + "integrity": "sha1-Afy7s5NGOlSPL0ZszhbezknbkI8=", + "dev": true, + "requires": { + "kind-of": "^3.0.2" + } + }, + "isobject": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/isobject/-/isobject-2.1.0.tgz", + "integrity": "sha1-8GVWEJaj8dou9GJy+BXIQNh+DIk=", + "dev": true, + "requires": { + "isarray": "1.0.0" + } + }, + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + } } }, "expand-tilde": { @@ -4359,262 +4456,37 @@ } }, "extglob": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-0.3.2.tgz", - "integrity": "sha1-Lhj/PS9JqydlzskCPwEdqo2DSaE=", - "dev": true, - "requires": { - "is-extglob": "^1.0.0" - } - }, - "extsprintf": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/extsprintf/-/extsprintf-1.3.0.tgz", - "integrity": "sha1-lpGEQOMEGnpBT4xS48V06zw+HgU=", - "dev": true - }, - "fancy-log": { - "version": "1.3.2", - "resolved": "https://registry.npmjs.org/fancy-log/-/fancy-log-1.3.2.tgz", - "integrity": "sha1-9BEl49hPLn2JpD0G2VjI94vha+E=", - "dev": true, - "requires": { - "ansi-gray": "^0.1.1", - "color-support": "^1.1.3", - "time-stamp": "^1.0.0" - } - }, - "fast-deep-equal": { - "version": "1.1.0", - "resolved": "http://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-1.1.0.tgz", - "integrity": "sha1-wFNHeBfIa1HaqFPIHgWbcz0CNhQ=", - "dev": true - }, - "fast-glob": { - "version": "2.2.4", - "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-2.2.4.tgz", - "integrity": "sha512-FjK2nCGI/McyzgNtTESqaWP3trPvHyRyoyY70hxjc3oKPNmDe8taohLZpoVKoUjW85tbU5txaYUZCNtVzygl1g==", + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", + "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", "dev": true, "requires": { - "@mrmlnc/readdir-enhanced": "^2.2.1", - "@nodelib/fs.stat": "^1.1.2", - "glob-parent": "^3.1.0", - "is-glob": "^4.0.0", - "merge2": "^1.2.3", - "micromatch": "^3.1.10" + "array-unique": "^0.3.2", + "define-property": "^1.0.0", + "expand-brackets": "^2.1.4", + "extend-shallow": "^2.0.1", + "fragment-cache": "^0.2.1", + "regex-not": "^1.0.0", + "snapdragon": "^0.8.1", + "to-regex": "^3.0.1" }, "dependencies": { - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - }, - "array-unique": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", - "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", - "dev": true - }, - "braces": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", - "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", - "dev": true, - "requires": { - "arr-flatten": "^1.1.0", - "array-unique": "^0.3.2", - "extend-shallow": "^2.0.1", - "fill-range": "^4.0.0", - "isobject": "^3.0.1", - "repeat-element": "^1.1.2", - "snapdragon": "^0.8.1", - "snapdragon-node": "^2.0.1", - "split-string": "^3.0.2", 
- "to-regex": "^3.0.1" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "expand-brackets": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", - "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", - "dev": true, - "requires": { - "debug": "^2.3.3", - "define-property": "^0.2.5", - "extend-shallow": "^2.0.1", - "posix-character-classes": "^0.1.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - }, - "is-accessor-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", - "integrity": "sha1-qeEss66Nh2cn7u84Q/igiXtcmNY=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-data-descriptor": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", - "integrity": "sha1-C17mSDiOLIYCgueT8YVv7D8wG1Y=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz", - "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^0.1.6", - "is-data-descriptor": "^0.1.4", - "kind-of": "^5.0.0" - } - }, - "kind-of": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz", - "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==", - "dev": true - } - } - }, - "extglob": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", - "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", - "dev": true, - "requires": { - "array-unique": "^0.3.2", - "define-property": "^1.0.0", - "expand-brackets": "^2.1.4", - "extend-shallow": "^2.0.1", - "fragment-cache": "^0.2.1", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", - "integrity": 
"sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", - "dev": true, - "requires": { - "is-descriptor": "^1.0.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "fill-range": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", - "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", + "define-property": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", + "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", "dev": true, "requires": { - "extend-shallow": "^2.0.1", - "is-number": "^3.0.0", - "repeat-string": "^1.6.1", - "to-regex-range": "^2.1.0" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } + "is-descriptor": "^1.0.0" } }, - "glob-parent": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-3.1.0.tgz", - "integrity": "sha1-nmr2KZ2NO9K9QEMIMr0RPfkGxa4=", + "extend-shallow": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", + "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", "dev": true, "requires": { - "is-glob": "^3.1.0", - "path-dirname": "^1.0.0" - }, - "dependencies": { - "is-glob": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-3.1.0.tgz", - "integrity": "sha1-e6WuJCF4BKxwcHuWkiVnSGzD6Eo=", - "dev": true, - "requires": { - "is-extglob": "^2.1.0" - } - } + "is-extendable": "^0.1.0" } }, "is-accessor-descriptor": { @@ -4645,77 +4517,53 @@ "is-data-descriptor": "^1.0.0", "kind-of": "^6.0.2" } - }, - "is-extglob": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", - "dev": true - }, - "is-glob": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.0.tgz", - "integrity": "sha1-lSHHaEXMJhCoUgPd8ICpWML/q8A=", - "dev": true, - "requires": { - "is-extglob": "^2.1.1" - } - }, - "is-number": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true - }, - "micromatch": { - "version": "3.1.10", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", - "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", - "dev": 
true, - "requires": { - "arr-diff": "^4.0.0", - "array-unique": "^0.3.2", - "braces": "^2.3.1", - "define-property": "^2.0.2", - "extend-shallow": "^3.0.2", - "extglob": "^2.0.4", - "fragment-cache": "^0.2.1", - "kind-of": "^6.0.2", - "nanomatch": "^1.2.9", - "object.pick": "^1.3.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.2" - } } } }, + "extsprintf": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/extsprintf/-/extsprintf-1.3.0.tgz", + "integrity": "sha1-lpGEQOMEGnpBT4xS48V06zw+HgU=", + "dev": true + }, + "fancy-log": { + "version": "1.3.3", + "resolved": "https://registry.npmjs.org/fancy-log/-/fancy-log-1.3.3.tgz", + "integrity": "sha512-k9oEhlyc0FrVh25qYuSELjr8oxsCoc4/LEZfg2iJJrfEk/tZL9bCoJE47gqAvI2m/AUjluCS4+3I0eTx8n3AEw==", + "dev": true, + "requires": { + "ansi-gray": "^0.1.1", + "color-support": "^1.1.3", + "parse-node-version": "^1.0.0", + "time-stamp": "^1.0.0" + } + }, + "fast-deep-equal": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-2.0.1.tgz", + "integrity": "sha1-ewUhjd+WZ79/Nwv3/bLLFf3Qqkk=", + "dev": true + }, + "fast-extend": { + "version": "0.0.2", + "resolved": "https://registry.npmjs.org/fast-extend/-/fast-extend-0.0.2.tgz", + "integrity": "sha1-9exCz0C5Rg9SGmOH37Ut7u1nHb0=", + "dev": true + }, + "fast-glob": { + "version": "2.2.4", + "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-2.2.4.tgz", + "integrity": "sha512-FjK2nCGI/McyzgNtTESqaWP3trPvHyRyoyY70hxjc3oKPNmDe8taohLZpoVKoUjW85tbU5txaYUZCNtVzygl1g==", + "dev": true, + "requires": { + "@mrmlnc/readdir-enhanced": "^2.2.1", + "@nodelib/fs.stat": "^1.1.2", + "glob-parent": "^3.1.0", + "is-glob": "^4.0.0", + "merge2": "^1.2.3", + "micromatch": "^3.1.10" + } + }, "fast-json-stable-stringify": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.0.0.tgz", @@ -4769,16 +4617,26 @@ } }, "fill-range": { - "version": "2.2.4", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-2.2.4.tgz", - "integrity": "sha512-cnrcCbj01+j2gTG921VZPnHbjmdAf8oQV/iGeV2kZxGSyfYjjTyY79ErsK1WJWMpw6DaApEX72binqJE+/d+5Q==", + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", + "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", "dev": true, "requires": { - "is-number": "^2.1.0", - "isobject": "^2.0.0", - "randomatic": "^3.0.0", - "repeat-element": "^1.1.2", - "repeat-string": "^1.5.2" + "extend-shallow": "^2.0.1", + "is-number": "^3.0.0", + "repeat-string": "^1.6.1", + "to-regex-range": "^2.1.0" + }, + "dependencies": { + "extend-shallow": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", + "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", + "dev": true, + "requires": { + "is-extendable": "^0.1.0" + } + } } }, "find-cache-dir": { @@ -4812,9 +4670,9 @@ } }, "p-limit": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.0.0.tgz", - "integrity": "sha512-fl5s52lI5ahKCernzzIyAP0QAZbGIovtVHGwpcu1Jr/EpzLVDI2myISHwGqK7m8uQFugVWSrbxH7XnhGtvEc+A==", + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.1.0.tgz", + "integrity": "sha512-NhURkNcrVB+8hNfLuysU8enY5xn2KXphsHBaC2YmRNTZRc7RWusw6apSpdEj3jo4CMb6W9nrF6tTnsJsJeyu6g==", "dev": true, "requires": { "p-try": "^2.0.0" @@ -4835,6 +4693,12 @@ "integrity": "sha512-hMp0onDKIajHfIkdRk3P4CdCmErkYAxxDtP3Wx/4nZ3aGlau2VKh3mZpcuFkH27WQkL/3WBCPOktzA9ZOAnMQQ==", 
"dev": true }, + "path-exists": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", + "integrity": "sha1-zg6+ql94yxiSXqfYENe1mwEP1RU=", + "dev": true + }, "pkg-dir": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/pkg-dir/-/pkg-dir-3.0.0.tgz", @@ -4846,16 +4710,10 @@ } } }, - "find-index": { - "version": "0.1.1", - "resolved": "https://registry.npmjs.org/find-index/-/find-index-0.1.1.tgz", - "integrity": "sha1-Z101iyyjiS15Whq0cjL4tuLg3eQ=", - "dev": true - }, - "find-parent-dir": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/find-parent-dir/-/find-parent-dir-0.3.0.tgz", - "integrity": "sha1-M8RLQpqysvBkYpnF+fcY83b/jVQ=", + "find-npm-prefix": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/find-npm-prefix/-/find-npm-prefix-1.0.2.tgz", + "integrity": "sha512-KEftzJ+H90x6pcKtdXZEPsQse8/y/UnvzRKrOSQFprnrGaFuJ62fVkP34Iu2IYuMvyauCyoLTNkJZgrrGA2wkA==", "dev": true }, "find-replace": { @@ -4868,12 +4726,13 @@ } }, "find-up": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/find-up/-/find-up-2.1.0.tgz", - "integrity": "sha1-RdG35QbHF93UgndaK3eSCjwMV6c=", + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-1.1.2.tgz", + "integrity": "sha1-ay6YIrGizgpgq2TWEOzK1TyyTQ8=", "dev": true, "requires": { - "locate-path": "^2.0.0" + "path-exists": "^2.0.0", + "pinkie-promise": "^2.0.0" } }, "findup-sync": { @@ -4888,233 +4747,6 @@ "resolve-dir": "^1.0.1" }, "dependencies": { - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - }, - "array-unique": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", - "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", - "dev": true - }, - "braces": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", - "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", - "dev": true, - "requires": { - "arr-flatten": "^1.1.0", - "array-unique": "^0.3.2", - "extend-shallow": "^2.0.1", - "fill-range": "^4.0.0", - "isobject": "^3.0.1", - "repeat-element": "^1.1.2", - "snapdragon": "^0.8.1", - "snapdragon-node": "^2.0.1", - "split-string": "^3.0.2", - "to-regex": "^3.0.1" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "expand-brackets": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", - "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", - "dev": true, - "requires": { - "debug": "^2.3.3", - "define-property": "^0.2.5", - "extend-shallow": "^2.0.1", - "posix-character-classes": "^0.1.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": 
"sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - }, - "is-accessor-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", - "integrity": "sha1-qeEss66Nh2cn7u84Q/igiXtcmNY=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-data-descriptor": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", - "integrity": "sha1-C17mSDiOLIYCgueT8YVv7D8wG1Y=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz", - "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^0.1.6", - "is-data-descriptor": "^0.1.4", - "kind-of": "^5.0.0" - } - }, - "kind-of": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz", - "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==", - "dev": true - } - } - }, - "extglob": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", - "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", - "dev": true, - "requires": { - "array-unique": "^0.3.2", - "define-property": "^1.0.0", - "expand-brackets": "^2.1.4", - "extend-shallow": "^2.0.1", - "fragment-cache": "^0.2.1", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", - "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", - "dev": true, - "requires": { - "is-descriptor": "^1.0.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "fill-range": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", - "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", - "dev": true, - "requires": { - "extend-shallow": "^2.0.1", - "is-number": "^3.0.0", - "repeat-string": "^1.6.1", - "to-regex-range": "^2.1.0" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "is-accessor-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", - "integrity": 
"sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-data-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", - "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-descriptor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", - "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^1.0.0", - "is-data-descriptor": "^1.0.0", - "kind-of": "^6.0.2" - } - }, - "is-extglob": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", - "dev": true - }, "is-glob": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-3.1.0.tgz", @@ -5123,66 +4755,13 @@ "requires": { "is-extglob": "^2.1.0" } - }, - "is-number": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true - }, - "micromatch": { - "version": "3.1.10", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", - "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", - "dev": true, - "requires": { - "arr-diff": "^4.0.0", - "array-unique": "^0.3.2", - "braces": "^2.3.1", - "define-property": "^2.0.2", - "extend-shallow": "^3.0.2", - "extglob": "^2.0.4", - "fragment-cache": "^0.2.1", - "kind-of": "^6.0.2", - "nanomatch": "^1.2.9", - "object.pick": "^1.3.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.2" - } } } }, "fined": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/fined/-/fined-1.1.0.tgz", - "integrity": "sha1-s33IRLdqL15wgeiE98CuNE8VNHY=", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/fined/-/fined-1.1.1.tgz", + "integrity": "sha512-jQp949ZmEbiYHk3gkbdtpJ0G1+kgtLQBNdP5edFP7Fh+WAYceLQz6yO1SBj72Xkg8GVyTB3bBzAYrHJVh5Xd5g==", "dev": true, "requires": { "expand-tilde": "^2.0.2", @@ -5193,9 +4772,9 @@ } }, "flagged-respawn": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/flagged-respawn/-/flagged-respawn-1.0.0.tgz", - "integrity": "sha1-Tnmumy6zi/hrO7Vr8+ClaqX8q9c=", + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/flagged-respawn/-/flagged-respawn-1.0.1.tgz", + "integrity": 
"sha512-lNaHNVymajmk0OJMBn8fVUAU1BtDeKIqKoVhk4xAALB57aALg6b4W0MfJ/cUE0g9YBXy5XhSlPIpYIJ7HaY/3Q==", "dev": true }, "flatbuffers": { @@ -5220,9 +4799,9 @@ "dev": true }, "for-own": { - "version": "0.1.5", - "resolved": "https://registry.npmjs.org/for-own/-/for-own-0.1.5.tgz", - "integrity": "sha1-UmXGgaTylNq78XyVCbZ2OqhFEM4=", + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/for-own/-/for-own-1.0.0.tgz", + "integrity": "sha1-xjMy9BXO3EsE2/5wz4NklMU8tEs=", "dev": true, "requires": { "for-in": "^1.0.1" @@ -5294,6 +4873,23 @@ "through2": "^2.0.3" } }, + "fs-monkey": { + "version": "0.3.3", + "resolved": "https://registry.npmjs.org/fs-monkey/-/fs-monkey-0.3.3.tgz", + "integrity": "sha512-FNUvuTAJ3CqCQb5ELn+qCbGR/Zllhf2HtwsdAtBi59s1WeCjKMT81fHcSu7dwIskqGVK+MmOrb7VOBlq3/SItw==", + "dev": true + }, + "fs-vacuum": { + "version": "1.2.10", + "resolved": "https://registry.npmjs.org/fs-vacuum/-/fs-vacuum-1.2.10.tgz", + "integrity": "sha1-t2Kb7AekAxolSP35n17PHMizHjY=", + "dev": true, + "requires": { + "graceful-fs": "^4.1.2", + "path-is-inside": "^1.0.1", + "rimraf": "^2.5.2" + } + }, "fs-write-stream-atomic": { "version": "1.0.10", "resolved": "https://registry.npmjs.org/fs-write-stream-atomic/-/fs-write-stream-atomic-1.0.10.tgz", @@ -5881,18 +5477,28 @@ "integrity": "sha512-KGDOARWVga7+rnB3z9Sd2Letx515owfk0hSxHGuqjANb1M+x2bGZGqHLiozPsYMdM2OubeMni/Hpwmjq6qIUhA==", "dev": true }, + "gentle-fs": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/gentle-fs/-/gentle-fs-2.0.1.tgz", + "integrity": "sha512-cEng5+3fuARewXktTEGbwsktcldA+YsnUEaXZwcK/3pjSE1X9ObnTs+/8rYf8s+RnIcQm2D5x3rwpN7Zom8Bew==", + "dev": true, + "requires": { + "aproba": "^1.1.2", + "fs-vacuum": "^1.2.10", + "graceful-fs": "^4.1.11", + "iferr": "^0.1.5", + "mkdirp": "^0.5.1", + "path-is-inside": "^1.0.2", + "read-cmd-shim": "^1.0.1", + "slide": "^1.1.6" + } + }, "get-caller-file": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-1.0.3.tgz", "integrity": "sha512-3t6rVToeoZfYSGd8YoLFR2DJkiQrIiUrGcjvFX2mDw3bn6k2OtwHN0TNCLbBO+w8qTvimhDkv+LSscbJY1vE6w==", "dev": true }, - "get-own-enumerable-property-symbols": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/get-own-enumerable-property-symbols/-/get-own-enumerable-property-symbols-3.0.0.tgz", - "integrity": "sha512-CIJYJC4GGF06TakLg8z4GQKvDsx9EMspVxOYih7LerEL/WosUnFIww45CGfxfeKHqlg3twgUrYRT1O3WQqjGCg==", - "dev": true - }, "get-pkg-repo": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/get-pkg-repo/-/get-pkg-repo-1.4.0.tgz", @@ -6026,27 +5632,6 @@ "meow": "^4.0.0", "split2": "^2.0.0", "through2": "^2.0.0" - }, - "dependencies": { - "lodash.template": { - "version": "4.4.0", - "resolved": "https://registry.npmjs.org/lodash.template/-/lodash.template-4.4.0.tgz", - "integrity": "sha1-5zoDhcg1VZF0bgILmWecaQ5o+6A=", - "dev": true, - "requires": { - "lodash._reinterpolate": "~3.0.0", - "lodash.templatesettings": "^4.0.0" - } - }, - "lodash.templatesettings": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/lodash.templatesettings/-/lodash.templatesettings-4.1.0.tgz", - "integrity": "sha1-K01OlbpEDZFf8IvImeRVNmZxMxY=", - "dev": true, - "requires": { - "lodash._reinterpolate": "~3.0.0" - } - } } }, "git-remote-origin-url": { @@ -6108,15 +5693,53 @@ "requires": { "glob-parent": "^2.0.0", "is-glob": "^2.0.0" + }, + "dependencies": { + "glob-parent": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-2.0.0.tgz", + "integrity": 
"sha1-gTg9ctsFT8zPUzbaqQLxgvbtuyg=", + "dev": true, + "requires": { + "is-glob": "^2.0.0" + } + }, + "is-extglob": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", + "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", + "dev": true + }, + "is-glob": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", + "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + } } }, "glob-parent": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-2.0.0.tgz", - "integrity": "sha1-gTg9ctsFT8zPUzbaqQLxgvbtuyg=", + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-3.1.0.tgz", + "integrity": "sha1-nmr2KZ2NO9K9QEMIMr0RPfkGxa4=", "dev": true, "requires": { - "is-glob": "^2.0.0" + "is-glob": "^3.1.0", + "path-dirname": "^1.0.0" + }, + "dependencies": { + "is-glob": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-3.1.0.tgz", + "integrity": "sha1-e6WuJCF4BKxwcHuWkiVnSGzD6Eo=", + "dev": true, + "requires": { + "is-extglob": "^2.1.0" + } + } } }, "glob-stream": { @@ -6135,33 +5758,6 @@ "remove-trailing-separator": "^1.0.1", "to-absolute-glob": "^2.0.0", "unique-stream": "^2.0.2" - }, - "dependencies": { - "glob-parent": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-3.1.0.tgz", - "integrity": "sha1-nmr2KZ2NO9K9QEMIMr0RPfkGxa4=", - "dev": true, - "requires": { - "is-glob": "^3.1.0", - "path-dirname": "^1.0.0" - } - }, - "is-extglob": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", - "dev": true - }, - "is-glob": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-3.1.0.tgz", - "integrity": "sha1-e6WuJCF4BKxwcHuWkiVnSGzD6Eo=", - "dev": true, - "requires": { - "is-extglob": "^2.1.0" - } - } } }, "glob-to-regexp": { @@ -6182,358 +5778,6 @@ "is-negated-glob": "^1.0.0", "just-debounce": "^1.0.0", "object.defaults": "^1.1.0" - }, - "dependencies": { - "anymatch": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-2.0.0.tgz", - "integrity": "sha512-5teOsQWABXHHBFP9y3skS5P3d/WfWXpv3FUpy+LorMrNYaT9pI4oLMQX7jzQ2KklNpGpWHzdCXTDT2Y3XGlZBw==", - "dev": true, - "requires": { - "micromatch": "^3.1.4", - "normalize-path": "^2.1.1" - } - }, - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - }, - "array-unique": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", - "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", - "dev": true - }, - "braces": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", - "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", - "dev": true, - "requires": { - "arr-flatten": "^1.1.0", - "array-unique": "^0.3.2", - "extend-shallow": "^2.0.1", - "fill-range": "^4.0.0", - "isobject": "^3.0.1", - "repeat-element": "^1.1.2", - "snapdragon": "^0.8.1", - "snapdragon-node": "^2.0.1", - "split-string": "^3.0.2", - "to-regex": "^3.0.1" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": 
"https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "chokidar": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-2.0.4.tgz", - "integrity": "sha512-z9n7yt9rOvIJrMhvDtDictKrkFHeihkNl6uWMmZlmL6tJtX9Cs+87oK+teBx+JIgzvbX3yZHT3eF8vpbDxHJXQ==", - "dev": true, - "requires": { - "anymatch": "^2.0.0", - "async-each": "^1.0.0", - "braces": "^2.3.0", - "fsevents": "^1.2.2", - "glob-parent": "^3.1.0", - "inherits": "^2.0.1", - "is-binary-path": "^1.0.0", - "is-glob": "^4.0.0", - "lodash.debounce": "^4.0.8", - "normalize-path": "^2.1.1", - "path-is-absolute": "^1.0.0", - "readdirp": "^2.0.0", - "upath": "^1.0.5" - } - }, - "expand-brackets": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", - "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", - "dev": true, - "requires": { - "debug": "^2.3.3", - "define-property": "^0.2.5", - "extend-shallow": "^2.0.1", - "posix-character-classes": "^0.1.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - }, - "is-accessor-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", - "integrity": "sha1-qeEss66Nh2cn7u84Q/igiXtcmNY=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-data-descriptor": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", - "integrity": "sha1-C17mSDiOLIYCgueT8YVv7D8wG1Y=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz", - "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^0.1.6", - "is-data-descriptor": "^0.1.4", - "kind-of": "^5.0.0" - } - }, - "kind-of": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz", - "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==", - "dev": true - } - } - }, - "extglob": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", - "integrity": 
"sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", - "dev": true, - "requires": { - "array-unique": "^0.3.2", - "define-property": "^1.0.0", - "expand-brackets": "^2.1.4", - "extend-shallow": "^2.0.1", - "fragment-cache": "^0.2.1", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", - "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", - "dev": true, - "requires": { - "is-descriptor": "^1.0.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "fill-range": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", - "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", - "dev": true, - "requires": { - "extend-shallow": "^2.0.1", - "is-number": "^3.0.0", - "repeat-string": "^1.6.1", - "to-regex-range": "^2.1.0" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "glob-parent": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-3.1.0.tgz", - "integrity": "sha1-nmr2KZ2NO9K9QEMIMr0RPfkGxa4=", - "dev": true, - "requires": { - "is-glob": "^3.1.0", - "path-dirname": "^1.0.0" - }, - "dependencies": { - "is-glob": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-3.1.0.tgz", - "integrity": "sha1-e6WuJCF4BKxwcHuWkiVnSGzD6Eo=", - "dev": true, - "requires": { - "is-extglob": "^2.1.0" - } - } - } - }, - "is-accessor-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", - "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-data-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", - "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-descriptor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", - "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^1.0.0", - "is-data-descriptor": "^1.0.0", - "kind-of": "^6.0.2" - } - }, - "is-extglob": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", - "dev": true - }, - "is-glob": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.0.tgz", - "integrity": "sha1-lSHHaEXMJhCoUgPd8ICpWML/q8A=", - "dev": true, - "requires": { - "is-extglob": "^2.1.1" - } - }, - "is-number": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - 
"integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true - }, - "micromatch": { - "version": "3.1.10", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", - "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", - "dev": true, - "requires": { - "arr-diff": "^4.0.0", - "array-unique": "^0.3.2", - "braces": "^2.3.1", - "define-property": "^2.0.2", - "extend-shallow": "^3.0.2", - "extglob": "^2.0.4", - "fragment-cache": "^0.2.1", - "kind-of": "^6.0.2", - "nanomatch": "^1.2.9", - "object.pick": "^1.3.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.2" - } - } - } - }, - "glob2base": { - "version": "0.0.12", - "resolved": "https://registry.npmjs.org/glob2base/-/glob2base-0.0.12.tgz", - "integrity": "sha1-nUGbPijxLoOjYhZKJ3BVkiycDVY=", - "dev": true, - "requires": { - "find-index": "^0.1.1" } }, "global-modules": { @@ -6568,7 +5812,7 @@ }, "globby": { "version": "6.1.0", - "resolved": "https://registry.npmjs.org/globby/-/globby-6.1.0.tgz", + "resolved": "http://registry.npmjs.org/globby/-/globby-6.1.0.tgz", "integrity": "sha1-9abXDoOV4hyFj7BInWTfAkJNUGw=", "dev": true, "requires": { @@ -6588,23 +5832,25 @@ } }, "glogg": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/glogg/-/glogg-1.0.1.tgz", - "integrity": "sha512-ynYqXLoluBKf9XGR1gA59yEJisIL7YHEH4xr3ZziHB5/yl4qWfaK8Js9jGe6gBGCSCKVqiyO30WnRZADvemUNw==", + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/glogg/-/glogg-1.0.2.tgz", + "integrity": "sha512-5mwUoSuBk44Y4EshyiqcH95ZntbDdTQqA3QYSrxmzj28Ai0vXBGMH1ApSANH14j2sIRtqCEyg6PfsuP7ElOEDA==", "dev": true, "requires": { "sparkles": "^1.0.0" } }, "google-closure-compiler": { - "version": "20181008.0.0", - "resolved": "https://registry.npmjs.org/google-closure-compiler/-/google-closure-compiler-20181008.0.0.tgz", - "integrity": "sha512-XmJIasXHyy4kirthlsuDev2LZcXjYXWfOHwHdCLUQnfJH8T2sxWDNjFLQycaCIXwQLOyw2Kem38VgxrYfG0hzg==", + "version": "20181210.0.0", + "resolved": "https://registry.npmjs.org/google-closure-compiler/-/google-closure-compiler-20181210.0.0.tgz", + "integrity": "sha512-GCMLakdibnc+jpdNTvF3M/ET5i6I4zzxGKw67A4bQahxc0TPLXQdkVfhF3kwBSoPfK8xwgU5kA+KO0qvDZHKHw==", "dev": true, "requires": { "chalk": "^1.0.0", - "google-closure-compiler-linux": "^20181008.0.0", - "google-closure-compiler-osx": "^20181008.0.0", + "google-closure-compiler-java": "^20181210.0.0", + "google-closure-compiler-js": "^20181210.0.0", + "google-closure-compiler-linux": "^20181210.0.0", + "google-closure-compiler-osx": "^20181210.0.0", "minimist": "^1.2.0", "vinyl": "^2.0.1", "vinyl-sourcemaps-apply": "^0.2.0" @@ -6631,30 +5877,42 @@ }, "supports-color": { "version": "2.0.0", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-2.0.0.tgz", + 
"resolved": "http://registry.npmjs.org/supports-color/-/supports-color-2.0.0.tgz", "integrity": "sha1-U10EXOa2Nj+kARcIRimZXp3zJMc=", "dev": true } } }, + "google-closure-compiler-java": { + "version": "20181210.0.0", + "resolved": "https://registry.npmjs.org/google-closure-compiler-java/-/google-closure-compiler-java-20181210.0.0.tgz", + "integrity": "sha512-FMGzY+vp25DePolYNyVcXz8UI2PV/I3AYU3nuFexmHcKn5XiBVy4CqK7em6NpVbZdDXJYUF3GUv5A0x0gLvbfw==", + "dev": true + }, + "google-closure-compiler-js": { + "version": "20181210.0.0", + "resolved": "https://registry.npmjs.org/google-closure-compiler-js/-/google-closure-compiler-js-20181210.0.0.tgz", + "integrity": "sha512-gn+2hT4uQtYKD/jXJqGIXzPMln3/JD7R4caAKDPJm7adqqDvrCAw7qxAiK4Vz1rNec7hJXPXh9TeKQjzz03ZaQ==", + "dev": true + }, "google-closure-compiler-linux": { - "version": "20181008.0.0", - "resolved": "https://registry.npmjs.org/google-closure-compiler-linux/-/google-closure-compiler-linux-20181008.0.0.tgz", - "integrity": "sha512-k8njGfH2uzWJiRPPvUxM7MJB28gPrf4kI2bbuiF0gJk/1arXcWCPGjLD6pzCU0UylMy52MUXLgsIpRorqf2brw==", + "version": "20181210.0.0", + "resolved": "https://registry.npmjs.org/google-closure-compiler-linux/-/google-closure-compiler-linux-20181210.0.0.tgz", + "integrity": "sha512-Gp+yp+Vb6QWEhtYkePKxkspRlzX5dx6L46zUoHGWW7Henuk3ACYoUXuaHLQQ+tF0lmi2QAmFXEkvdnKVDIxR+Q==", "dev": true, "optional": true }, "google-closure-compiler-osx": { - "version": "20181008.0.0", - "resolved": "https://registry.npmjs.org/google-closure-compiler-osx/-/google-closure-compiler-osx-20181008.0.0.tgz", - "integrity": "sha512-xzf/yH/4MXdb6GbP84iHnpcVCOPBbH0gMVOs0JhR/KbrQh+DlJU+Y8Z/DQzTkw9HgD650R2/WZmBknURyg9OTw==", + "version": "20181210.0.0", + "resolved": "https://registry.npmjs.org/google-closure-compiler-osx/-/google-closure-compiler-osx-20181210.0.0.tgz", + "integrity": "sha512-SYUakmEpq8BorJU/O5CfrC+ABYjXR0rTvBd3Khwd1sml9B2aKEiHArdHC5SCmBRZd3ccUhp/XyrVO6PoxHKeZA==", "dev": true, "optional": true }, "graceful-fs": { - "version": "4.1.11", - "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.1.11.tgz", - "integrity": "sha1-Dovf5NHduIVNZOBOp8AOKgJuVlg=", + "version": "4.1.15", + "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.1.15.tgz", + "integrity": "sha512-6uHUhOPEBgQ24HM+r6b/QwWfZq+yiFcipKFrOFiBEnWdy5sdzYoi+pJeQaPI5qOLRFqWmAXUPQNsielzdLoecA==", "dev": true }, "growl": { @@ -6681,15 +5939,6 @@ "vinyl-fs": "^3.0.0" }, "dependencies": { - "ansi-colors": { - "version": "1.1.0", - "resolved": "http://registry.npmjs.org/ansi-colors/-/ansi-colors-1.1.0.tgz", - "integrity": "sha512-SFKX67auSNoVR38N3L+nvsPjOE0bybKTYbkf5tRvushrAPQ9V75huw0ZxBkKVeRU9kqH3d6HA4xTckbwZ4ixmA==", - "dev": true, - "requires": { - "ansi-wrap": "^0.1.0" - } - }, "gulp-cli": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/gulp-cli/-/gulp-cli-2.0.1.tgz", @@ -6715,24 +5964,21 @@ "v8flags": "^3.0.1", "yargs": "^7.1.0" } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true } } }, "gulp-json-transform": { - "version": "0.4.5", - "resolved": "https://registry.npmjs.org/gulp-json-transform/-/gulp-json-transform-0.4.5.tgz", - "integrity": "sha512-kaGUaAhgjxeLgIMNF3IPFFmYCF6AgvzBQwqmVowiIStNADZSoILtPNDisYA4mKfpwMTqSiWLogQt1q5U75+uwA==", + "version": "0.4.6", + "resolved": "https://registry.npmjs.org/gulp-json-transform/-/gulp-json-transform-0.4.6.tgz", + "integrity": 
"sha512-laPoNiJP/+lAeiyb0lgY3cynOOi7R/QbPvKBEXJY6bm836nYg90pwY4mgwR7w8nFDlXiCToUeaoQCBIc2NudjA==", "dev": true, "requires": { - "gulp-util": "^3.0.8", + "ansi-colors": "^1.0.1", + "fancy-log": "^1.3.2", + "plugin-error": "^1.0.1", "promise": "^8.0.1", - "through2": "^2.0.3" + "through2": "^2.0.3", + "vinyl": "^2.1.0" } }, "gulp-rename": { @@ -6769,204 +6015,39 @@ } }, "gulp-typescript": { - "version": "5.0.0-alpha.3", - "resolved": "https://registry.npmjs.org/gulp-typescript/-/gulp-typescript-5.0.0-alpha.3.tgz", - "integrity": "sha512-6iSBjqBXAUqRsLUh/9XtlOnSzpPMbLrr5rqGj4UPLtGpDwFHW/fVTuRgv6LAWiKesLIUDDM0ourxvcpu2trecQ==", + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/gulp-typescript/-/gulp-typescript-5.0.0.tgz", + "integrity": "sha512-lMj2U+Ni6HyFaY2nr1sSQ6D014eHil5L1i52XWBaAQUR9UAUUp9btnm4yRBT2Jb8xhrwqmhMssZf/g2B7cinCA==", "dev": true, "requires": { - "ansi-colors": "^2.0.2", + "ansi-colors": "^3.0.5", "plugin-error": "^1.0.1", "source-map": "^0.7.3", - "through2": "^2.0.3", + "through2": "^3.0.0", "vinyl": "^2.1.0", "vinyl-fs": "^3.0.3" }, "dependencies": { - "glob-parent": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-3.1.0.tgz", - "integrity": "sha1-nmr2KZ2NO9K9QEMIMr0RPfkGxa4=", - "dev": true, - "requires": { - "is-glob": "^3.1.0", - "path-dirname": "^1.0.0" - } - }, - "glob-stream": { - "version": "6.1.0", - "resolved": "https://registry.npmjs.org/glob-stream/-/glob-stream-6.1.0.tgz", - "integrity": "sha1-cEXJlBOz65SIjYOrRtC0BMx73eQ=", - "dev": true, - "requires": { - "extend": "^3.0.0", - "glob": "^7.1.1", - "glob-parent": "^3.1.0", - "is-negated-glob": "^1.0.0", - "ordered-read-streams": "^1.0.0", - "pumpify": "^1.3.5", - "readable-stream": "^2.1.5", - "remove-trailing-separator": "^1.0.1", - "to-absolute-glob": "^2.0.0", - "unique-stream": "^2.0.2" - } - }, - "is-extglob": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", - "dev": true - }, - "is-glob": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-3.1.0.tgz", - "integrity": "sha1-e6WuJCF4BKxwcHuWkiVnSGzD6Eo=", - "dev": true, - "requires": { - "is-extglob": "^2.1.0" - } - }, - "is-valid-glob": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-valid-glob/-/is-valid-glob-1.0.0.tgz", - "integrity": "sha1-Kb8+/3Ab4tTTFdusw5vDn+j2Aao=", + "ansi-colors": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/ansi-colors/-/ansi-colors-3.2.3.tgz", + "integrity": "sha512-LEHHyuhlPY3TmuUYMh2oz89lTShfvgbmzaBcxve9t/9Wuy7Dwf4yoAKcND7KFT1HAQfqZ12qtc+DUrBMeKF9nw==", "dev": true }, - "ordered-read-streams": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/ordered-read-streams/-/ordered-read-streams-1.0.1.tgz", - "integrity": "sha1-d8DLN8QVJdZBZtmQ/61+xqDhNj4=", - "dev": true, - "requires": { - "readable-stream": "^2.0.1" - } - }, "source-map": { "version": "0.7.3", "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.7.3.tgz", "integrity": "sha512-CkCj6giN3S+n9qrYiBTX5gystlENnRW5jZeNLHpe6aue+SrHcG5VYwujhW9s4dY31mEGsxBDrHR6oI69fTXsaQ==", "dev": true }, - "to-absolute-glob": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/to-absolute-glob/-/to-absolute-glob-2.0.2.tgz", - "integrity": "sha1-GGX0PZ50sIItufFFt4z/fQ98hJs=", - "dev": true, - "requires": { - "is-absolute": "^1.0.0", - "is-negated-glob": "^1.0.0" - } - }, - "vinyl-fs": { - "version": "3.0.3", - "resolved": 
"https://registry.npmjs.org/vinyl-fs/-/vinyl-fs-3.0.3.tgz", - "integrity": "sha512-vIu34EkyNyJxmP0jscNzWBSygh7VWhqun6RmqVfXePrOwi9lhvRs//dOaGOTRUQr4tx7/zd26Tk5WeSVZitgng==", - "dev": true, - "requires": { - "fs-mkdirp-stream": "^1.0.0", - "glob-stream": "^6.1.0", - "graceful-fs": "^4.0.0", - "is-valid-glob": "^1.0.0", - "lazystream": "^1.0.0", - "lead": "^1.0.0", - "object.assign": "^4.0.4", - "pumpify": "^1.3.5", - "readable-stream": "^2.3.3", - "remove-bom-buffer": "^3.0.0", - "remove-bom-stream": "^1.2.0", - "resolve-options": "^1.1.0", - "through2": "^2.0.0", - "to-through": "^2.0.0", - "value-or-function": "^3.0.0", - "vinyl": "^2.0.0", - "vinyl-sourcemap": "^1.1.0" - } - } - } - }, - "gulp-util": { - "version": "3.0.8", - "resolved": "https://registry.npmjs.org/gulp-util/-/gulp-util-3.0.8.tgz", - "integrity": "sha1-AFTh50RQLifATBh8PsxQXdVLu08=", - "dev": true, - "requires": { - "array-differ": "^1.0.0", - "array-uniq": "^1.0.2", - "beeper": "^1.0.0", - "chalk": "^1.0.0", - "dateformat": "^2.0.0", - "fancy-log": "^1.1.0", - "gulplog": "^1.0.0", - "has-gulplog": "^0.1.0", - "lodash._reescape": "^3.0.0", - "lodash._reevaluate": "^3.0.0", - "lodash._reinterpolate": "^3.0.0", - "lodash.template": "^3.0.0", - "minimist": "^1.1.0", - "multipipe": "^0.1.2", - "object-assign": "^3.0.0", - "replace-ext": "0.0.1", - "through2": "^2.0.0", - "vinyl": "^0.5.0" - }, - "dependencies": { - "ansi-styles": { - "version": "2.2.1", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-2.2.1.tgz", - "integrity": "sha1-tDLdM1i2NM914eRmQ2gkBTPB3b4=", - "dev": true - }, - "chalk": { - "version": "1.1.3", - "resolved": "http://registry.npmjs.org/chalk/-/chalk-1.1.3.tgz", - "integrity": "sha1-qBFcVeSnAv5NFQq9OHKCKn4J/Jg=", - "dev": true, - "requires": { - "ansi-styles": "^2.2.1", - "escape-string-regexp": "^1.0.2", - "has-ansi": "^2.0.0", - "strip-ansi": "^3.0.0", - "supports-color": "^2.0.0" - } - }, - "clone": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/clone/-/clone-1.0.4.tgz", - "integrity": "sha1-2jCcwmPfFZlMaIypAheco8fNfH4=", - "dev": true - }, - "clone-stats": { - "version": "0.0.1", - "resolved": "https://registry.npmjs.org/clone-stats/-/clone-stats-0.0.1.tgz", - "integrity": "sha1-uI+UqCzzi4eR1YBG6kAprYjKmdE=", - "dev": true - }, - "object-assign": { + "through2": { "version": "3.0.0", - "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-3.0.0.tgz", - "integrity": "sha1-m+3VygiXlJvKR+f/QIBi1Un1h/I=", - "dev": true - }, - "replace-ext": { - "version": "0.0.1", - "resolved": "https://registry.npmjs.org/replace-ext/-/replace-ext-0.0.1.tgz", - "integrity": "sha1-KbvZIHinOfC8zitO5B6DeVNSKSQ=", - "dev": true - }, - "supports-color": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-2.0.0.tgz", - "integrity": "sha1-U10EXOa2Nj+kARcIRimZXp3zJMc=", - "dev": true - }, - "vinyl": { - "version": "0.5.3", - "resolved": "https://registry.npmjs.org/vinyl/-/vinyl-0.5.3.tgz", - "integrity": "sha1-sEVbOPxeDPMNQyUTLkYZcMIJHN4=", + "resolved": "https://registry.npmjs.org/through2/-/through2-3.0.0.tgz", + "integrity": "sha512-8B+sevlqP4OiCjonI1Zw03Sf8PuV1eRsYQgLad5eonILOdyeRsY27A/2Ze8IlvlMvq31OH+3fz/styI7Ya62yQ==", "dev": true, "requires": { - "clone": "^1.0.0", - "clone-stats": "^0.0.1", - "replace-ext": "0.0.1" + "readable-stream": "2 || 3", + "xtend": "~4.0.1" } } } @@ -7007,12 +6088,12 @@ "dev": true }, "har-validator": { - "version": "5.1.0", - "resolved": 
"https://registry.npmjs.org/har-validator/-/har-validator-5.1.0.tgz", - "integrity": "sha512-+qnmNjI4OfH2ipQ9VQOw23bBd/ibtfbVdK2fYbY4acTDqKTW/YDp9McimZdDbG8iV9fZizUqQMD5xvriB146TA==", + "version": "5.1.3", + "resolved": "https://registry.npmjs.org/har-validator/-/har-validator-5.1.3.tgz", + "integrity": "sha512-sNvOCzEQNr/qrvJgc3UG/kD4QtlHycrzwS+6mfTrrSq97BvaYcPZZI1ZSqGSPR73Cxn4LKTD4PttRwfU7jWq5g==", "dev": true, "requires": { - "ajv": "^5.3.0", + "ajv": "^6.5.5", "har-schema": "^2.0.0" } }, @@ -7039,15 +6120,6 @@ "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-3.0.0.tgz", "integrity": "sha1-tdRU3CGZriJWmfNGfloH87lVuv0=" }, - "has-gulplog": { - "version": "0.1.0", - "resolved": "https://registry.npmjs.org/has-gulplog/-/has-gulplog-0.1.0.tgz", - "integrity": "sha1-ZBTIKRNpfaUVkDl9r7EvIpZ4Ec4=", - "dev": true, - "requires": { - "sparkles": "^1.0.0" - } - }, "has-symbols": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.0.0.tgz", @@ -7069,14 +6141,6 @@ "get-value": "^2.0.6", "has-values": "^1.0.0", "isobject": "^3.0.0" - }, - "dependencies": { - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - } } }, "has-values": { @@ -7089,26 +6153,6 @@ "kind-of": "^4.0.0" }, "dependencies": { - "is-number": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, "kind-of": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-4.0.0.tgz", @@ -7307,6 +6351,16 @@ "minimatch": "^3.0.4" } }, + "import-fresh": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-2.0.0.tgz", + "integrity": "sha1-2BNVwVYS04bGH53dOSLUMEgipUY=", + "dev": true, + "requires": { + "caller-path": "^2.0.0", + "resolve-from": "^3.0.0" + } + }, "import-local": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/import-local/-/import-local-1.0.0.tgz", @@ -7456,9 +6510,9 @@ } }, "interpret": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/interpret/-/interpret-1.1.0.tgz", - "integrity": "sha1-ftGxQQxqDg94z5XTuEQMY/eLhhQ=", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/interpret/-/interpret-1.2.0.tgz", + "integrity": "sha512-mT34yGKMNceBQUoVn7iCDKDntA7SC6gycMAWzGx1z/CMCTV7b2AAtXlo3nRyHZ1FelRkQbQjprHSYGwzLtkVbw==", "dev": true }, "invariant": { @@ -7494,11 +6548,22 @@ }, "is-accessor-descriptor": { "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", + "resolved": "http://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", "integrity": "sha1-qeEss66Nh2cn7u84Q/igiXtcmNY=", "dev": true, "requires": { "kind-of": "^3.0.2" + }, + "dependencies": { + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + } } }, "is-arrayish": { @@ -7548,11 +6613,22 @@ }, "is-data-descriptor": { "version": "0.1.4", - 
"resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", + "resolved": "http://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", "integrity": "sha1-C17mSDiOLIYCgueT8YVv7D8wG1Y=", "dev": true, "requires": { "kind-of": "^3.0.2" + }, + "dependencies": { + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + } } }, "is-date-object": { @@ -7608,9 +6684,9 @@ "dev": true }, "is-extglob": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", - "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", + "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", "dev": true }, "is-finite": { @@ -7633,17 +6709,17 @@ }, "is-generator-fn": { "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-generator-fn/-/is-generator-fn-1.0.0.tgz", + "resolved": "http://registry.npmjs.org/is-generator-fn/-/is-generator-fn-1.0.0.tgz", "integrity": "sha1-lp1J4bszKfa7fwkIm+JleLLd1Go=", "dev": true }, "is-glob": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", - "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.0.tgz", + "integrity": "sha1-lSHHaEXMJhCoUgPd8ICpWML/q8A=", "dev": true, "requires": { - "is-extglob": "^1.0.0" + "is-extglob": "^2.1.1" } }, "is-negated-glob": { @@ -7653,12 +6729,23 @@ "dev": true }, "is-number": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-2.1.0.tgz", - "integrity": "sha1-Afy7s5NGOlSPL0ZszhbezknbkI8=", + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", + "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", "dev": true, "requires": { "kind-of": "^3.0.2" + }, + "dependencies": { + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + } } }, "is-obj": { @@ -7667,15 +6754,6 @@ "integrity": "sha1-PkcprB9f3gJc19g6iW2rn09n2w8=", "dev": true }, - "is-observable": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/is-observable/-/is-observable-1.1.0.tgz", - "integrity": "sha512-NqCa4Sa2d+u7BWc6CukaObG3Fh+CU9bvixbpcXYhy2VvYS7vVGIdAgnIS5Ks3A/cqk4rebLJ9s8zBstT2aKnIA==", - "dev": true, - "requires": { - "symbol-observable": "^1.1.0" - } - }, "is-path-cwd": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/is-path-cwd/-/is-path-cwd-1.0.0.tgz", @@ -7713,14 +6791,6 @@ "dev": true, "requires": { "isobject": "^3.0.1" - }, - "dependencies": { - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - } } }, "is-posix-bracket": { @@ -7750,12 +6820,6 @@ "has": "^1.0.1" } }, - "is-regexp": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-regexp/-/is-regexp-1.0.0.tgz", - "integrity": "sha1-/S2INUXEa6xaYz57mgnof6LLUGk=", - "dev": true - }, "is-relative": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/is-relative/-/is-relative-1.0.0.tgz", @@ -7841,13 +6905,10 @@ "dev": true }, "isobject": { - 
"version": "2.1.0", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-2.1.0.tgz", - "integrity": "sha1-8GVWEJaj8dou9GJy+BXIQNh+DIk=", - "dev": true, - "requires": { - "isarray": "1.0.0" - } + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", + "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", + "dev": true }, "isstream": { "version": "0.1.2", @@ -7973,12 +7034,15 @@ } }, "ix": { - "version": "2.3.5", - "resolved": "https://registry.npmjs.org/ix/-/ix-2.3.5.tgz", - "integrity": "sha512-mdW2LtQiy+gPtggKa393EdSaI46RARsAa5zjlLgNKMlE57vC6dc6g6nehROI1Gj/HhsTvpb3WALSwg0EWhhz0Q==", + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/ix/-/ix-2.4.3.tgz", + "integrity": "sha512-LoFBSUQ8C41KQxIlm/dw+vgGngnR0jc8DMibryGfNoQs2l4dDodQdYUvmCNAaIGsEMkm+IdiF+hLp5SHl6C8GQ==", "dev": true, "requires": { - "tslib": "^1.8.0" + "@types/node": "^10.12.18", + "is-stream": "1.1.0", + "rxjs": "5.5.11", + "tslib": "^1.9.3" } }, "jest": { @@ -7997,6 +7061,38 @@ "integrity": "sha1-7QMXwyIGT3lGbAKWa922Bas32Zg=", "dev": true }, + "arr-diff": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-2.0.0.tgz", + "integrity": "sha1-jzuCf5Vai9ZpaX5KQlasPOrjVs8=", + "dev": true, + "requires": { + "arr-flatten": "^1.0.1" + } + }, + "array-unique": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.2.1.tgz", + "integrity": "sha1-odl8yvy8JiXMcPrc6zalDFiwGlM=", + "dev": true + }, + "braces": { + "version": "1.8.5", + "resolved": "https://registry.npmjs.org/braces/-/braces-1.8.5.tgz", + "integrity": "sha1-uneWLhLf+WnWt2cR6RS3N4V79qc=", + "dev": true, + "requires": { + "expand-range": "^1.8.1", + "preserve": "^0.2.0", + "repeat-element": "^1.1.2" + } + }, + "camelcase": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-4.1.0.tgz", + "integrity": "sha1-1UVjW+HjPFQmScaRc+Xeas+uNN0=", + "dev": true + }, "cliui": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/cliui/-/cliui-4.1.0.tgz", @@ -8008,12 +7104,54 @@ "wrap-ansi": "^2.0.0" } }, + "expand-brackets": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-0.1.5.tgz", + "integrity": "sha1-3wcoTjQqgHzXM6xa9yQR5YHRF3s=", + "dev": true, + "requires": { + "is-posix-bracket": "^0.1.0" + } + }, + "extglob": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/extglob/-/extglob-0.3.2.tgz", + "integrity": "sha1-Lhj/PS9JqydlzskCPwEdqo2DSaE=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, + "find-up": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-2.1.0.tgz", + "integrity": "sha1-RdG35QbHF93UgndaK3eSCjwMV6c=", + "dev": true, + "requires": { + "locate-path": "^2.0.0" + } + }, + "is-extglob": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", + "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", + "dev": true + }, "is-fullwidth-code-point": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz", "integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8=", "dev": true }, + "is-glob": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", + "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, "jest-cli": { "version": "23.6.0", "resolved": 
"https://registry.npmjs.org/jest-cli/-/jest-cli-23.6.0.tgz", @@ -8058,6 +7196,36 @@ "yargs": "^11.0.0" } }, + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + }, + "micromatch": { + "version": "2.3.11", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-2.3.11.tgz", + "integrity": "sha1-hmd8l9FyCzY0MdBNDRUpO9OMFWU=", + "dev": true, + "requires": { + "arr-diff": "^2.0.0", + "array-unique": "^0.2.1", + "braces": "^1.8.2", + "expand-brackets": "^0.1.4", + "extglob": "^0.3.1", + "filename-regex": "^2.0.0", + "is-extglob": "^1.0.0", + "is-glob": "^2.0.1", + "kind-of": "^3.0.2", + "normalize-path": "^2.0.1", + "object.omit": "^2.0.0", + "parse-glob": "^3.0.4", + "regex-cache": "^0.4.2" + } + }, "os-locale": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/os-locale/-/os-locale-2.1.0.tgz", @@ -8088,6 +7256,12 @@ "ansi-regex": "^3.0.0" } }, + "which-module": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/which-module/-/which-module-2.0.0.tgz", + "integrity": "sha1-2e8H3Od7mQK4o6j6SzHD4/fm6Ho=", + "dev": true + }, "yargs": { "version": "11.1.0", "resolved": "http://registry.npmjs.org/yargs/-/yargs-11.1.0.tgz", @@ -8107,6 +7281,15 @@ "y18n": "^3.2.1", "yargs-parser": "^9.0.2" } + }, + "yargs-parser": { + "version": "9.0.2", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-9.0.2.tgz", + "integrity": "sha1-nM9qQ0YP5O1Aqbto9I1DuKaMwHc=", + "dev": true, + "requires": { + "camelcase": "^4.1.0" + } } } }, @@ -8139,53 +7322,144 @@ "jest-validate": "^23.6.0", "micromatch": "^2.3.11", "pretty-format": "^23.6.0" - } - }, - "jest-diff": { - "version": "23.6.0", - "resolved": "https://registry.npmjs.org/jest-diff/-/jest-diff-23.6.0.tgz", - "integrity": "sha512-Gz9l5Ov+X3aL5L37IT+8hoCUsof1CVYBb2QEkOupK64XyRR3h+uRpYIm97K7sY8diFxowR8pIGEdyfMKTixo3g==", - "dev": true, - "requires": { - "chalk": "^2.0.1", - "diff": "^3.2.0", - "jest-get-type": "^22.1.0", - "pretty-format": "^23.6.0" - } - }, - "jest-docblock": { - "version": "23.2.0", - "resolved": "https://registry.npmjs.org/jest-docblock/-/jest-docblock-23.2.0.tgz", - "integrity": "sha1-8IXh8YVI2Z/dabICB+b9VdkTg6c=", - "dev": true, - "requires": { - "detect-newline": "^2.1.0" - } - }, - "jest-each": { - "version": "23.6.0", - "resolved": "https://registry.npmjs.org/jest-each/-/jest-each-23.6.0.tgz", - "integrity": "sha512-x7V6M/WGJo6/kLoissORuvLIeAoyo2YqLOoCDkohgJ4XOXSqOtyvr8FbInlAWS77ojBsZrafbozWoKVRdtxFCg==", - "dev": true, - "requires": { - "chalk": "^2.0.1", - "pretty-format": "^23.6.0" - } - }, - "jest-environment-jsdom": { - "version": "23.4.0", - "resolved": "https://registry.npmjs.org/jest-environment-jsdom/-/jest-environment-jsdom-23.4.0.tgz", - "integrity": "sha1-BWp5UrP+pROsYqFAosNox52eYCM=", - "dev": true, - "requires": { - "jest-mock": "^23.2.0", - "jest-util": "^23.4.0", - "jsdom": "^11.5.1" - } - }, - "jest-environment-node": { - "version": "23.4.0", - "resolved": "https://registry.npmjs.org/jest-environment-node/-/jest-environment-node-23.4.0.tgz", + }, + "dependencies": { + "arr-diff": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-2.0.0.tgz", + "integrity": "sha1-jzuCf5Vai9ZpaX5KQlasPOrjVs8=", + "dev": true, + "requires": { + "arr-flatten": "^1.0.1" + } + }, + "array-unique": { + "version": "0.2.1", + "resolved": 
"https://registry.npmjs.org/array-unique/-/array-unique-0.2.1.tgz", + "integrity": "sha1-odl8yvy8JiXMcPrc6zalDFiwGlM=", + "dev": true + }, + "braces": { + "version": "1.8.5", + "resolved": "https://registry.npmjs.org/braces/-/braces-1.8.5.tgz", + "integrity": "sha1-uneWLhLf+WnWt2cR6RS3N4V79qc=", + "dev": true, + "requires": { + "expand-range": "^1.8.1", + "preserve": "^0.2.0", + "repeat-element": "^1.1.2" + } + }, + "expand-brackets": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-0.1.5.tgz", + "integrity": "sha1-3wcoTjQqgHzXM6xa9yQR5YHRF3s=", + "dev": true, + "requires": { + "is-posix-bracket": "^0.1.0" + } + }, + "extglob": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/extglob/-/extglob-0.3.2.tgz", + "integrity": "sha1-Lhj/PS9JqydlzskCPwEdqo2DSaE=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, + "is-extglob": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", + "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", + "dev": true + }, + "is-glob": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", + "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + }, + "micromatch": { + "version": "2.3.11", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-2.3.11.tgz", + "integrity": "sha1-hmd8l9FyCzY0MdBNDRUpO9OMFWU=", + "dev": true, + "requires": { + "arr-diff": "^2.0.0", + "array-unique": "^0.2.1", + "braces": "^1.8.2", + "expand-brackets": "^0.1.4", + "extglob": "^0.3.1", + "filename-regex": "^2.0.0", + "is-extglob": "^1.0.0", + "is-glob": "^2.0.1", + "kind-of": "^3.0.2", + "normalize-path": "^2.0.1", + "object.omit": "^2.0.0", + "parse-glob": "^3.0.4", + "regex-cache": "^0.4.2" + } + } + } + }, + "jest-diff": { + "version": "23.6.0", + "resolved": "https://registry.npmjs.org/jest-diff/-/jest-diff-23.6.0.tgz", + "integrity": "sha512-Gz9l5Ov+X3aL5L37IT+8hoCUsof1CVYBb2QEkOupK64XyRR3h+uRpYIm97K7sY8diFxowR8pIGEdyfMKTixo3g==", + "dev": true, + "requires": { + "chalk": "^2.0.1", + "diff": "^3.2.0", + "jest-get-type": "^22.1.0", + "pretty-format": "^23.6.0" + } + }, + "jest-docblock": { + "version": "23.2.0", + "resolved": "https://registry.npmjs.org/jest-docblock/-/jest-docblock-23.2.0.tgz", + "integrity": "sha1-8IXh8YVI2Z/dabICB+b9VdkTg6c=", + "dev": true, + "requires": { + "detect-newline": "^2.1.0" + } + }, + "jest-each": { + "version": "23.6.0", + "resolved": "https://registry.npmjs.org/jest-each/-/jest-each-23.6.0.tgz", + "integrity": "sha512-x7V6M/WGJo6/kLoissORuvLIeAoyo2YqLOoCDkohgJ4XOXSqOtyvr8FbInlAWS77ojBsZrafbozWoKVRdtxFCg==", + "dev": true, + "requires": { + "chalk": "^2.0.1", + "pretty-format": "^23.6.0" + } + }, + "jest-environment-jsdom": { + "version": "23.4.0", + "resolved": "https://registry.npmjs.org/jest-environment-jsdom/-/jest-environment-jsdom-23.4.0.tgz", + "integrity": "sha1-BWp5UrP+pROsYqFAosNox52eYCM=", + "dev": true, + "requires": { + "jest-mock": "^23.2.0", + "jest-util": "^23.4.0", + "jsdom": "^11.5.1" + } + }, + "jest-environment-node": { + "version": "23.4.0", + "resolved": "https://registry.npmjs.org/jest-environment-node/-/jest-environment-node-23.4.0.tgz", "integrity": 
"sha1-V+gO0IQd6jAxZ8zozXlSHeuv3hA=", "dev": true, "requires": { @@ -8219,6 +7493,97 @@ "jest-worker": "^23.2.0", "micromatch": "^2.3.11", "sane": "^2.0.0" + }, + "dependencies": { + "arr-diff": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-2.0.0.tgz", + "integrity": "sha1-jzuCf5Vai9ZpaX5KQlasPOrjVs8=", + "dev": true, + "requires": { + "arr-flatten": "^1.0.1" + } + }, + "array-unique": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.2.1.tgz", + "integrity": "sha1-odl8yvy8JiXMcPrc6zalDFiwGlM=", + "dev": true + }, + "braces": { + "version": "1.8.5", + "resolved": "https://registry.npmjs.org/braces/-/braces-1.8.5.tgz", + "integrity": "sha1-uneWLhLf+WnWt2cR6RS3N4V79qc=", + "dev": true, + "requires": { + "expand-range": "^1.8.1", + "preserve": "^0.2.0", + "repeat-element": "^1.1.2" + } + }, + "expand-brackets": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-0.1.5.tgz", + "integrity": "sha1-3wcoTjQqgHzXM6xa9yQR5YHRF3s=", + "dev": true, + "requires": { + "is-posix-bracket": "^0.1.0" + } + }, + "extglob": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/extglob/-/extglob-0.3.2.tgz", + "integrity": "sha1-Lhj/PS9JqydlzskCPwEdqo2DSaE=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, + "is-extglob": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", + "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", + "dev": true + }, + "is-glob": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", + "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + }, + "micromatch": { + "version": "2.3.11", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-2.3.11.tgz", + "integrity": "sha1-hmd8l9FyCzY0MdBNDRUpO9OMFWU=", + "dev": true, + "requires": { + "arr-diff": "^2.0.0", + "array-unique": "^0.2.1", + "braces": "^1.8.2", + "expand-brackets": "^0.1.4", + "extglob": "^0.3.1", + "filename-regex": "^2.0.0", + "is-extglob": "^1.0.0", + "is-glob": "^2.0.1", + "kind-of": "^3.0.2", + "normalize-path": "^2.0.1", + "object.omit": "^2.0.0", + "parse-glob": "^3.0.4", + "regex-cache": "^0.4.2" + } + } } }, "jest-jasmine2": { @@ -8272,6 +7637,97 @@ "micromatch": "^2.3.11", "slash": "^1.0.0", "stack-utils": "^1.0.1" + }, + "dependencies": { + "arr-diff": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-2.0.0.tgz", + "integrity": "sha1-jzuCf5Vai9ZpaX5KQlasPOrjVs8=", + "dev": true, + "requires": { + "arr-flatten": "^1.0.1" + } + }, + "array-unique": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.2.1.tgz", + "integrity": "sha1-odl8yvy8JiXMcPrc6zalDFiwGlM=", + "dev": true + }, + "braces": { + "version": "1.8.5", + "resolved": "https://registry.npmjs.org/braces/-/braces-1.8.5.tgz", + "integrity": "sha1-uneWLhLf+WnWt2cR6RS3N4V79qc=", + "dev": true, + "requires": { + "expand-range": "^1.8.1", + "preserve": "^0.2.0", + "repeat-element": "^1.1.2" + } + }, + "expand-brackets": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-0.1.5.tgz", + "integrity": 
"sha1-3wcoTjQqgHzXM6xa9yQR5YHRF3s=", + "dev": true, + "requires": { + "is-posix-bracket": "^0.1.0" + } + }, + "extglob": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/extglob/-/extglob-0.3.2.tgz", + "integrity": "sha1-Lhj/PS9JqydlzskCPwEdqo2DSaE=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, + "is-extglob": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", + "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", + "dev": true + }, + "is-glob": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", + "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + }, + "micromatch": { + "version": "2.3.11", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-2.3.11.tgz", + "integrity": "sha1-hmd8l9FyCzY0MdBNDRUpO9OMFWU=", + "dev": true, + "requires": { + "arr-diff": "^2.0.0", + "array-unique": "^0.2.1", + "braces": "^1.8.2", + "expand-brackets": "^0.1.4", + "extglob": "^0.3.1", + "filename-regex": "^2.0.0", + "is-extglob": "^1.0.0", + "is-glob": "^2.0.1", + "kind-of": "^3.0.2", + "normalize-path": "^2.0.1", + "object.omit": "^2.0.0", + "parse-glob": "^3.0.4", + "regex-cache": "^0.4.2" + } + } } }, "jest-mock": { @@ -8381,6 +7837,38 @@ "integrity": "sha1-7QMXwyIGT3lGbAKWa922Bas32Zg=", "dev": true }, + "arr-diff": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-2.0.0.tgz", + "integrity": "sha1-jzuCf5Vai9ZpaX5KQlasPOrjVs8=", + "dev": true, + "requires": { + "arr-flatten": "^1.0.1" + } + }, + "array-unique": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.2.1.tgz", + "integrity": "sha1-odl8yvy8JiXMcPrc6zalDFiwGlM=", + "dev": true + }, + "braces": { + "version": "1.8.5", + "resolved": "https://registry.npmjs.org/braces/-/braces-1.8.5.tgz", + "integrity": "sha1-uneWLhLf+WnWt2cR6RS3N4V79qc=", + "dev": true, + "requires": { + "expand-range": "^1.8.1", + "preserve": "^0.2.0", + "repeat-element": "^1.1.2" + } + }, + "camelcase": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-4.1.0.tgz", + "integrity": "sha1-1UVjW+HjPFQmScaRc+Xeas+uNN0=", + "dev": true + }, "cliui": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/cliui/-/cliui-4.1.0.tgz", @@ -8392,12 +7880,84 @@ "wrap-ansi": "^2.0.0" } }, + "expand-brackets": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-0.1.5.tgz", + "integrity": "sha1-3wcoTjQqgHzXM6xa9yQR5YHRF3s=", + "dev": true, + "requires": { + "is-posix-bracket": "^0.1.0" + } + }, + "extglob": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/extglob/-/extglob-0.3.2.tgz", + "integrity": "sha1-Lhj/PS9JqydlzskCPwEdqo2DSaE=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, + "find-up": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-2.1.0.tgz", + "integrity": "sha1-RdG35QbHF93UgndaK3eSCjwMV6c=", + "dev": true, + "requires": { + "locate-path": "^2.0.0" + } + }, + "is-extglob": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", + "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", + 
"dev": true + }, "is-fullwidth-code-point": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz", "integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8=", "dev": true }, + "is-glob": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", + "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + }, + "micromatch": { + "version": "2.3.11", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-2.3.11.tgz", + "integrity": "sha1-hmd8l9FyCzY0MdBNDRUpO9OMFWU=", + "dev": true, + "requires": { + "arr-diff": "^2.0.0", + "array-unique": "^0.2.1", + "braces": "^1.8.2", + "expand-brackets": "^0.1.4", + "extglob": "^0.3.1", + "filename-regex": "^2.0.0", + "is-extglob": "^1.0.0", + "is-glob": "^2.0.1", + "kind-of": "^3.0.2", + "normalize-path": "^2.0.1", + "object.omit": "^2.0.0", + "parse-glob": "^3.0.4", + "regex-cache": "^0.4.2" + } + }, "os-locale": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/os-locale/-/os-locale-2.1.0.tgz", @@ -8434,6 +7994,12 @@ "integrity": "sha1-IzTBjpx1n3vdVv3vfprj1YjmjtM=", "dev": true }, + "which-module": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/which-module/-/which-module-2.0.0.tgz", + "integrity": "sha1-2e8H3Od7mQK4o6j6SzHD4/fm6Ho=", + "dev": true + }, "yargs": { "version": "11.1.0", "resolved": "http://registry.npmjs.org/yargs/-/yargs-11.1.0.tgz", @@ -8453,7 +8019,16 @@ "y18n": "^3.2.1", "yargs-parser": "^9.0.2" } - } + }, + "yargs-parser": { + "version": "9.0.2", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-9.0.2.tgz", + "integrity": "sha1-nM9qQ0YP5O1Aqbto9I1DuKaMwHc=", + "dev": true, + "requires": { + "camelcase": "^4.1.0" + } + } } }, "jest-serializer": { @@ -8462,6 +8037,16 @@ "integrity": "sha1-o3dq6zEekP6D+rnlM+hRAr0WQWU=", "dev": true }, + "jest-silent-reporter": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/jest-silent-reporter/-/jest-silent-reporter-0.1.1.tgz", + "integrity": "sha512-nrRzOV4151hG354tnVWfyZbFGJdylpadRWYWWPSD+WeOz2hQOjUGxvIFODnaY9cKQ7JWCtG+5LgSss22ccRhBg==", + "dev": true, + "requires": { + "chalk": "^2.3.1", + "jest-util": "^23.0.0" + } + }, "jest-snapshot": { "version": "23.6.0", "resolved": "https://registry.npmjs.org/jest-snapshot/-/jest-snapshot-23.6.0.tgz", @@ -8594,7 +8179,7 @@ }, "jsesc": { "version": "1.3.0", - "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-1.3.0.tgz", + "resolved": "http://registry.npmjs.org/jsesc/-/jsesc-1.3.0.tgz", "integrity": "sha1-RsP+yMGJKxKwgz25vHYiF226s0s=", "dev": true }, @@ -8622,19 +8207,16 @@ "dev": true }, "json-schema-traverse": { - "version": "0.3.1", - "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.3.1.tgz", - "integrity": "sha1-NJptRMU6Ud6JtAgFxdXlm0F9M0A=", + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", + "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", "dev": true }, - "json-stable-stringify": { + "json-stable-stringify-without-jsonify": { "version": "1.0.1", - "resolved": 
"https://registry.npmjs.org/json-stable-stringify/-/json-stable-stringify-1.0.1.tgz", - "integrity": "sha1-mnWdOcXy/1A/1TAGRu1EX4jE+a8=", - "dev": true, - "requires": { - "jsonify": "~0.0.0" - } + "resolved": "https://registry.npmjs.org/json-stable-stringify-without-jsonify/-/json-stable-stringify-without-jsonify-1.0.1.tgz", + "integrity": "sha1-nbe1lJatPzz+8wp1FC0tkwrXJlE=", + "dev": true }, "json-stringify-safe": { "version": "5.0.1", @@ -8688,13 +8270,10 @@ "dev": true }, "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", + "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", + "dev": true }, "klaw": { "version": "1.3.1", @@ -8761,28 +8340,28 @@ "dev": true }, "lerna": { - "version": "3.4.3", - "resolved": "https://registry.npmjs.org/lerna/-/lerna-3.4.3.tgz", - "integrity": "sha512-tWq1LvpHqkyB+FaJCmkEweivr88yShDMmauofPVdh0M5gU1cVucszYnIgWafulKYu2LMQ3IfUMUU5Pp3+MvADQ==", - "dev": true, - "requires": { - "@lerna/add": "^3.4.1", - "@lerna/bootstrap": "^3.4.1", - "@lerna/changed": "^3.4.1", - "@lerna/clean": "^3.3.2", - "@lerna/cli": "^3.2.0", - "@lerna/create": "^3.4.1", - "@lerna/diff": "^3.3.0", - "@lerna/exec": "^3.3.2", - "@lerna/import": "^3.3.1", - "@lerna/init": "^3.3.0", - "@lerna/link": "^3.3.0", - "@lerna/list": "^3.3.2", - "@lerna/publish": "^3.4.3", - "@lerna/run": "^3.3.2", - "@lerna/version": "^3.4.1", + "version": "3.8.0", + "resolved": "https://registry.npmjs.org/lerna/-/lerna-3.8.0.tgz", + "integrity": "sha512-OLdf7JSWjpgVecvVLyTRpeKPjTJOcQa366IvaEhorOIxFPZvR1rNIEvi4DMOAaxNINpmCB4nSm769H7H4jNQyw==", + "dev": true, + "requires": { + "@lerna/add": "^3.7.2", + "@lerna/bootstrap": "^3.7.2", + "@lerna/changed": "^3.8.0", + "@lerna/clean": "^3.7.2", + "@lerna/cli": "^3.6.0", + "@lerna/create": "^3.7.2", + "@lerna/diff": "^3.7.2", + "@lerna/exec": "^3.7.2", + "@lerna/import": "^3.7.2", + "@lerna/init": "^3.7.2", + "@lerna/link": "^3.7.2", + "@lerna/list": "^3.7.2", + "@lerna/publish": "^3.8.0", + "@lerna/run": "^3.7.2", + "@lerna/version": "^3.8.0", "import-local": "^1.0.0", - "npmlog": "^4.1.2" + "libnpm": "^2.0.1" } }, "leven": { @@ -8801,6 +8380,34 @@ "type-check": "~0.3.2" } }, + "libnpm": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/libnpm/-/libnpm-2.0.1.tgz", + "integrity": "sha512-qTKoxyJvpBxHZQB6k0AhSLajyXq9ZE/lUsZzuHAplr2Bpv9G+k4YuYlExYdUCeVRRGqcJt8hvkPh4tBwKoV98w==", + "dev": true, + "requires": { + "bin-links": "^1.1.2", + "bluebird": "^3.5.3", + "find-npm-prefix": "^1.0.2", + "libnpmaccess": "^3.0.1", + "libnpmconfig": "^1.2.1", + "libnpmhook": "^5.0.2", + "libnpmorg": "^1.0.0", + "libnpmpublish": "^1.1.0", + "libnpmsearch": "^2.0.0", + "libnpmteam": "^1.0.1", + "lock-verify": "^2.0.2", + "npm-lifecycle": "^2.1.0", + "npm-logical-tree": "^1.2.1", + "npm-package-arg": "^6.1.0", + "npm-profile": "^4.0.1", + "npm-registry-fetch": "^3.8.0", + "npmlog": "^4.1.2", + "pacote": "^9.2.3", + "read-package-json": "^2.0.13", + "stringify-package": "^1.0.0" + } + }, "libnpmaccess": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/libnpmaccess/-/libnpmaccess-3.0.1.tgz", @@ -8827,561 +8434,290 @@ "requires": { "pump": "^3.0.0" } + }, + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + 
"integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "dev": true, + "requires": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } } } }, - "liftoff": { - "version": "2.5.0", - "resolved": "https://registry.npmjs.org/liftoff/-/liftoff-2.5.0.tgz", - "integrity": "sha1-IAkpG7Mc6oYbvxCnwVooyvdcMew=", - "dev": true, - "requires": { - "extend": "^3.0.0", - "findup-sync": "^2.0.0", - "fined": "^1.0.1", - "flagged-respawn": "^1.0.0", - "is-plain-object": "^2.0.4", - "object.map": "^1.0.0", - "rechoir": "^0.6.2", - "resolve": "^1.1.7" - } - }, - "lint-staged": { - "version": "7.3.0", - "resolved": "https://registry.npmjs.org/lint-staged/-/lint-staged-7.3.0.tgz", - "integrity": "sha512-AXk40M9DAiPi7f4tdJggwuKIViUplYtVj1os1MVEteW7qOkU50EOehayCfO9TsoGK24o/EsWb41yrEgfJDDjCw==", + "libnpmconfig": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/libnpmconfig/-/libnpmconfig-1.2.1.tgz", + "integrity": "sha512-9esX8rTQAHqarx6qeZqmGQKBNZR5OIbl/Ayr0qQDy3oXja2iFVQQI81R6GZ2a02bSNZ9p3YOGX1O6HHCb1X7kA==", "dev": true, "requires": { - "chalk": "^2.3.1", - "commander": "^2.14.1", - "cosmiconfig": "^5.0.2", - "debug": "^3.1.0", - "dedent": "^0.7.0", - "execa": "^0.9.0", - "find-parent-dir": "^0.3.0", - "is-glob": "^4.0.0", - "is-windows": "^1.0.2", - "jest-validate": "^23.5.0", - "listr": "^0.14.1", - "lodash": "^4.17.5", - "log-symbols": "^2.2.0", - "micromatch": "^3.1.8", - "npm-which": "^3.0.1", - "p-map": "^1.1.1", - "path-is-inside": "^1.0.2", - "pify": "^3.0.0", - "please-upgrade-node": "^3.0.2", - "staged-git-files": "1.1.1", - "string-argv": "^0.0.2", - "stringify-object": "^3.2.2" + "figgy-pudding": "^3.5.1", + "find-up": "^3.0.0", + "ini": "^1.3.5" }, "dependencies": { - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - }, - "array-unique": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", - "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", - "dev": true - }, - "braces": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", - "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", - "dev": true, - "requires": { - "arr-flatten": "^1.1.0", - "array-unique": "^0.3.2", - "extend-shallow": "^2.0.1", - "fill-range": "^4.0.0", - "isobject": "^3.0.1", - "repeat-element": "^1.1.2", - "snapdragon": "^0.8.1", - "snapdragon-node": "^2.0.1", - "split-string": "^3.0.2", - "to-regex": "^3.0.1" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "debug": { - "version": "3.2.6", - "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz", - "integrity": "sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==", + "find-up": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-3.0.0.tgz", + "integrity": "sha512-1yD6RmLI1XBfxugvORwlck6f75tYL+iR0jqwsOrOxMZyGYqUuDhJ0l4AXdO1iX/FTs9cBAMEk1gWSEx1kSbylg==", "dev": true, "requires": { - "ms": "^2.1.1" + "locate-path": "^3.0.0" } }, - "execa": { - "version": "0.9.0", - "resolved": 
"https://registry.npmjs.org/execa/-/execa-0.9.0.tgz", - "integrity": "sha512-BbUMBiX4hqiHZUA5+JujIjNb6TyAlp2D5KLheMjMluwOuzcnylDL4AxZYLLn1n2AGB49eSWwyKvvEQoRpnAtmA==", + "locate-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-3.0.0.tgz", + "integrity": "sha512-7AO748wWnIhNqAuaty2ZWHkQHRSNfPVIsPIfwEOWO22AmaoVrWavlOcMR5nzTLNYvp36X220/maaRsrec1G65A==", "dev": true, "requires": { - "cross-spawn": "^5.0.1", - "get-stream": "^3.0.0", - "is-stream": "^1.1.0", - "npm-run-path": "^2.0.0", - "p-finally": "^1.0.0", - "signal-exit": "^3.0.0", - "strip-eof": "^1.0.0" + "p-locate": "^3.0.0", + "path-exists": "^3.0.0" } }, - "expand-brackets": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", - "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", + "p-limit": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.1.0.tgz", + "integrity": "sha512-NhURkNcrVB+8hNfLuysU8enY5xn2KXphsHBaC2YmRNTZRc7RWusw6apSpdEj3jo4CMb6W9nrF6tTnsJsJeyu6g==", "dev": true, "requires": { - "debug": "^2.3.3", - "define-property": "^0.2.5", - "extend-shallow": "^2.0.1", - "posix-character-classes": "^0.1.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "debug": { - "version": "2.6.9", - "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", - "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", - "dev": true, - "requires": { - "ms": "2.0.0" - } - }, - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - }, - "is-accessor-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", - "integrity": "sha1-qeEss66Nh2cn7u84Q/igiXtcmNY=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-data-descriptor": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", - "integrity": "sha1-C17mSDiOLIYCgueT8YVv7D8wG1Y=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz", - "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^0.1.6", - "is-data-descriptor": "^0.1.4", - "kind-of": "^5.0.0" - } - }, - "kind-of": { - "version": "5.1.0", - "resolved": 
"https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz", - "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==", - "dev": true - }, - "ms": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", - "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=", - "dev": true - } + "p-try": "^2.0.0" } }, - "extglob": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", - "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", + "p-locate": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-3.0.0.tgz", + "integrity": "sha512-x+12w/To+4GFfgJhBEpiDcLozRJGegY+Ei7/z0tSLkMmxGZNybVMSfWj9aJn8Z5Fc7dBUNJOOVgPv2H7IwulSQ==", "dev": true, "requires": { - "array-unique": "^0.3.2", - "define-property": "^1.0.0", - "expand-brackets": "^2.1.4", - "extend-shallow": "^2.0.1", - "fragment-cache": "^0.2.1", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", - "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", - "dev": true, - "requires": { - "is-descriptor": "^1.0.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } + "p-limit": "^2.0.0" } }, - "fill-range": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", - "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", - "dev": true, - "requires": { - "extend-shallow": "^2.0.1", - "is-number": "^3.0.0", - "repeat-string": "^1.6.1", - "to-regex-range": "^2.1.0" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } + "p-try": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/p-try/-/p-try-2.0.0.tgz", + "integrity": "sha512-hMp0onDKIajHfIkdRk3P4CdCmErkYAxxDtP3Wx/4nZ3aGlau2VKh3mZpcuFkH27WQkL/3WBCPOktzA9ZOAnMQQ==", + "dev": true }, - "is-accessor-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", - "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } + "path-exists": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", + "integrity": "sha1-zg6+ql94yxiSXqfYENe1mwEP1RU=", + "dev": true + } + } + }, + "libnpmhook": { + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/libnpmhook/-/libnpmhook-5.0.2.tgz", + "integrity": "sha512-vLenmdFWhRfnnZiNFPNMog6CK7Ujofy2TWiM2CrpZUjBRIhHkJeDaAbJdYCT6W4lcHtyrJR8yXW8KFyq6UAp1g==", + "dev": true, + "requires": { + "aproba": "^2.0.0", + "figgy-pudding": "^3.4.1", + "get-stream": "^4.0.0", + "npm-registry-fetch": "^3.8.0" + }, + "dependencies": { + "aproba": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/aproba/-/aproba-2.0.0.tgz", + "integrity": 
"sha512-lYe4Gx7QT+MKGbDsA+Z+he/Wtef0BiwDOlK/XkBrdfsh9J/jPPXbX0tE9x9cl27Tmu5gg3QUbUrQYa/y+KOHPQ==", + "dev": true }, - "is-data-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", - "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", + "get-stream": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-4.1.0.tgz", + "integrity": "sha512-GMat4EJ5161kIy2HevLlr4luNjBgvmj413KaQA7jt4V8B4RDsfpHk7WQ9GVqfYyyx8OS/L66Kox+rJRNklLK7w==", "dev": true, "requires": { - "kind-of": "^6.0.0" + "pump": "^3.0.0" } }, - "is-descriptor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", - "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", "dev": true, "requires": { - "is-accessor-descriptor": "^1.0.0", - "is-data-descriptor": "^1.0.0", - "kind-of": "^6.0.2" + "end-of-stream": "^1.1.0", + "once": "^1.3.1" } - }, - "is-extglob": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", + } + } + }, + "libnpmorg": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/libnpmorg/-/libnpmorg-1.0.0.tgz", + "integrity": "sha512-o+4eVJBoDGMgRwh2lJY0a8pRV2c/tQM/SxlqXezjcAg26Qe9jigYVs+Xk0vvlYDWCDhP0g74J8UwWeAgsB7gGw==", + "dev": true, + "requires": { + "aproba": "^2.0.0", + "figgy-pudding": "^3.4.1", + "get-stream": "^4.0.0", + "npm-registry-fetch": "^3.8.0" + }, + "dependencies": { + "aproba": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/aproba/-/aproba-2.0.0.tgz", + "integrity": "sha512-lYe4Gx7QT+MKGbDsA+Z+he/Wtef0BiwDOlK/XkBrdfsh9J/jPPXbX0tE9x9cl27Tmu5gg3QUbUrQYa/y+KOHPQ==", "dev": true }, - "is-glob": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.0.tgz", - "integrity": "sha1-lSHHaEXMJhCoUgPd8ICpWML/q8A=", + "get-stream": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-4.1.0.tgz", + "integrity": "sha512-GMat4EJ5161kIy2HevLlr4luNjBgvmj413KaQA7jt4V8B4RDsfpHk7WQ9GVqfYyyx8OS/L66Kox+rJRNklLK7w==", "dev": true, "requires": { - "is-extglob": "^2.1.1" + "pump": "^3.0.0" } }, - "is-number": { + "pump": { "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", "dev": true, "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - 
"integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true - }, - "micromatch": { - "version": "3.1.10", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", - "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", - "dev": true, - "requires": { - "arr-diff": "^4.0.0", - "array-unique": "^0.3.2", - "braces": "^2.3.1", - "define-property": "^2.0.2", - "extend-shallow": "^3.0.2", - "extglob": "^2.0.4", - "fragment-cache": "^0.2.1", - "kind-of": "^6.0.2", - "nanomatch": "^1.2.9", - "object.pick": "^1.3.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.2" + "end-of-stream": "^1.1.0", + "once": "^1.3.1" } - }, - "ms": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.1.tgz", - "integrity": "sha512-tgp+dl5cGk28utYktBsrFqA7HKgrhgPsg6Z/EfhWI4gl1Hwq8B/GmY/0oXZ6nF8hDVesS/FpnYaD/kOWhYQvyg==", - "dev": true } } }, - "listr": { - "version": "0.14.2", - "resolved": "https://registry.npmjs.org/listr/-/listr-0.14.2.tgz", - "integrity": "sha512-vmaNJ1KlGuGWShHI35X/F8r9xxS0VTHh9GejVXwSN20fG5xpq3Jh4bJbnumoT6q5EDM/8/YP1z3YMtQbFmhuXw==", + "libnpmpublish": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/libnpmpublish/-/libnpmpublish-1.1.0.tgz", + "integrity": "sha512-mQ3LT2EWlpJ6Q8mgHTNqarQVCgcY32l6xadPVPMcjWLtVLz7II4WlWkzlbYg1nHGAf+xyABDwS+3aNUiRLkyaA==", "dev": true, "requires": { - "@samverschueren/stream-to-observable": "^0.3.0", - "is-observable": "^1.1.0", - "is-promise": "^2.1.0", - "is-stream": "^1.1.0", - "listr-silent-renderer": "^1.1.1", - "listr-update-renderer": "^0.4.0", - "listr-verbose-renderer": "^0.4.0", - "p-map": "^1.1.1", - "rxjs": "^6.1.0" + "aproba": "^2.0.0", + "figgy-pudding": "^3.5.1", + "get-stream": "^4.0.0", + "lodash.clonedeep": "^4.5.0", + "normalize-package-data": "^2.4.0", + "npm-package-arg": "^6.1.0", + "npm-registry-fetch": "^3.8.0", + "semver": "^5.5.1", + "ssri": "^6.0.1" }, "dependencies": { - "rxjs": { - "version": "6.3.3", - "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-6.3.3.tgz", - "integrity": "sha512-JTWmoY9tWCs7zvIk/CvRjhjGaOd+OVBM987mxFo+OW66cGpdKjZcpmc74ES1sB//7Kl/PAe8+wEakuhG4pcgOw==", + "aproba": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/aproba/-/aproba-2.0.0.tgz", + "integrity": "sha512-lYe4Gx7QT+MKGbDsA+Z+he/Wtef0BiwDOlK/XkBrdfsh9J/jPPXbX0tE9x9cl27Tmu5gg3QUbUrQYa/y+KOHPQ==", + "dev": true + }, + "get-stream": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-4.1.0.tgz", + "integrity": "sha512-GMat4EJ5161kIy2HevLlr4luNjBgvmj413KaQA7jt4V8B4RDsfpHk7WQ9GVqfYyyx8OS/L66Kox+rJRNklLK7w==", "dev": true, "requires": { - "tslib": "^1.9.0" + "pump": "^3.0.0" + } + }, + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "dev": true, + "requires": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" } } } }, - "listr-silent-renderer": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/listr-silent-renderer/-/listr-silent-renderer-1.1.1.tgz", - "integrity": "sha1-kktaN1cVN3C/Go4/v3S4u/P5JC4=", - "dev": true - }, - "listr-update-renderer": { - "version": "0.4.0", - "resolved": "https://registry.npmjs.org/listr-update-renderer/-/listr-update-renderer-0.4.0.tgz", - "integrity": "sha1-NE2YDaLKLosUW6MFkI8yrj9MyKc=", + 
"libnpmsearch": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/libnpmsearch/-/libnpmsearch-2.0.0.tgz", + "integrity": "sha512-vd+JWbTGzOSfiOc+72MU6y7WqmBXn49egCCrIXp27iE/88bX8EpG64ST1blWQI1bSMUr9l1AKPMVsqa2tS5KWA==", "dev": true, "requires": { - "chalk": "^1.1.3", - "cli-truncate": "^0.2.1", - "elegant-spinner": "^1.0.1", - "figures": "^1.7.0", - "indent-string": "^3.0.0", - "log-symbols": "^1.0.2", - "log-update": "^1.0.2", - "strip-ansi": "^3.0.1" + "figgy-pudding": "^3.5.1", + "get-stream": "^4.0.0", + "npm-registry-fetch": "^3.8.0" }, "dependencies": { - "ansi-styles": { - "version": "2.2.1", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-2.2.1.tgz", - "integrity": "sha1-tDLdM1i2NM914eRmQ2gkBTPB3b4=", - "dev": true - }, - "chalk": { - "version": "1.1.3", - "resolved": "http://registry.npmjs.org/chalk/-/chalk-1.1.3.tgz", - "integrity": "sha1-qBFcVeSnAv5NFQq9OHKCKn4J/Jg=", - "dev": true, - "requires": { - "ansi-styles": "^2.2.1", - "escape-string-regexp": "^1.0.2", - "has-ansi": "^2.0.0", - "strip-ansi": "^3.0.0", - "supports-color": "^2.0.0" - } - }, - "figures": { - "version": "1.7.0", - "resolved": "https://registry.npmjs.org/figures/-/figures-1.7.0.tgz", - "integrity": "sha1-y+Hjr/zxzUS4DK3+0o3Hk6lwHS4=", + "get-stream": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-4.1.0.tgz", + "integrity": "sha512-GMat4EJ5161kIy2HevLlr4luNjBgvmj413KaQA7jt4V8B4RDsfpHk7WQ9GVqfYyyx8OS/L66Kox+rJRNklLK7w==", "dev": true, "requires": { - "escape-string-regexp": "^1.0.5", - "object-assign": "^4.1.0" + "pump": "^3.0.0" } }, - "log-symbols": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/log-symbols/-/log-symbols-1.0.2.tgz", - "integrity": "sha1-N2/3tY6jCGoPCfrMdGF+ylAeGhg=", + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", "dev": true, "requires": { - "chalk": "^1.0.0" + "end-of-stream": "^1.1.0", + "once": "^1.3.1" } - }, - "supports-color": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-2.0.0.tgz", - "integrity": "sha1-U10EXOa2Nj+kARcIRimZXp3zJMc=", - "dev": true } } }, - "listr-verbose-renderer": { - "version": "0.4.1", - "resolved": "https://registry.npmjs.org/listr-verbose-renderer/-/listr-verbose-renderer-0.4.1.tgz", - "integrity": "sha1-ggb0z21S3cWCfl/RSYng6WWTOjU=", + "libnpmteam": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/libnpmteam/-/libnpmteam-1.0.1.tgz", + "integrity": "sha512-gDdrflKFCX7TNwOMX1snWojCoDE5LoRWcfOC0C/fqF7mBq8Uz9zWAX4B2RllYETNO7pBupBaSyBDkTAC15cAMg==", "dev": true, "requires": { - "chalk": "^1.1.3", - "cli-cursor": "^1.0.2", - "date-fns": "^1.27.2", - "figures": "^1.7.0" + "aproba": "^2.0.0", + "figgy-pudding": "^3.4.1", + "get-stream": "^4.0.0", + "npm-registry-fetch": "^3.8.0" }, "dependencies": { - "ansi-styles": { - "version": "2.2.1", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-2.2.1.tgz", - "integrity": "sha1-tDLdM1i2NM914eRmQ2gkBTPB3b4=", + "aproba": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/aproba/-/aproba-2.0.0.tgz", + "integrity": "sha512-lYe4Gx7QT+MKGbDsA+Z+he/Wtef0BiwDOlK/XkBrdfsh9J/jPPXbX0tE9x9cl27Tmu5gg3QUbUrQYa/y+KOHPQ==", "dev": true }, - "chalk": { - "version": "1.1.3", - "resolved": "http://registry.npmjs.org/chalk/-/chalk-1.1.3.tgz", - "integrity": 
"sha1-qBFcVeSnAv5NFQq9OHKCKn4J/Jg=", - "dev": true, - "requires": { - "ansi-styles": "^2.2.1", - "escape-string-regexp": "^1.0.2", - "has-ansi": "^2.0.0", - "strip-ansi": "^3.0.0", - "supports-color": "^2.0.0" - } - }, - "cli-cursor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/cli-cursor/-/cli-cursor-1.0.2.tgz", - "integrity": "sha1-ZNo/fValRBLll5S9Ytw1KV6PKYc=", - "dev": true, - "requires": { - "restore-cursor": "^1.0.1" - } - }, - "figures": { - "version": "1.7.0", - "resolved": "https://registry.npmjs.org/figures/-/figures-1.7.0.tgz", - "integrity": "sha1-y+Hjr/zxzUS4DK3+0o3Hk6lwHS4=", + "get-stream": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-4.1.0.tgz", + "integrity": "sha512-GMat4EJ5161kIy2HevLlr4luNjBgvmj413KaQA7jt4V8B4RDsfpHk7WQ9GVqfYyyx8OS/L66Kox+rJRNklLK7w==", "dev": true, "requires": { - "escape-string-regexp": "^1.0.5", - "object-assign": "^4.1.0" + "pump": "^3.0.0" } }, - "onetime": { - "version": "1.1.0", - "resolved": "http://registry.npmjs.org/onetime/-/onetime-1.1.0.tgz", - "integrity": "sha1-ofeDj4MUxRbwXs78vEzP4EtO14k=", - "dev": true - }, - "restore-cursor": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/restore-cursor/-/restore-cursor-1.0.1.tgz", - "integrity": "sha1-NGYfRohjJ/7SmRR5FSJS35LapUE=", + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", "dev": true, "requires": { - "exit-hook": "^1.0.0", - "onetime": "^1.0.0" + "end-of-stream": "^1.1.0", + "once": "^1.3.1" } - }, - "supports-color": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-2.0.0.tgz", - "integrity": "sha1-U10EXOa2Nj+kARcIRimZXp3zJMc=", - "dev": true } } }, + "liftoff": { + "version": "2.5.0", + "resolved": "https://registry.npmjs.org/liftoff/-/liftoff-2.5.0.tgz", + "integrity": "sha1-IAkpG7Mc6oYbvxCnwVooyvdcMew=", + "dev": true, + "requires": { + "extend": "^3.0.0", + "findup-sync": "^2.0.0", + "fined": "^1.0.1", + "flagged-respawn": "^1.0.0", + "is-plain-object": "^2.0.4", + "object.map": "^1.0.0", + "rechoir": "^0.6.2", + "resolve": "^1.1.7" + } + }, "load-json-file": { "version": "1.1.0", "resolved": "http://registry.npmjs.org/load-json-file/-/load-json-file-1.1.0.tgz", @@ -9410,14 +8746,25 @@ "dev": true }, "loader-utils": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/loader-utils/-/loader-utils-1.1.0.tgz", - "integrity": "sha1-yYrvSIvM7aL/teLeZG1qdUQp9c0=", + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/loader-utils/-/loader-utils-1.2.3.tgz", + "integrity": "sha512-fkpz8ejdnEMG3s37wGL07iSBDg99O9D5yflE9RGNH3hRdx9SOwYfnGYdZOUIZitN8E+E2vkq3MUMYMvPYl5ZZA==", "dev": true, "requires": { - "big.js": "^3.1.3", + "big.js": "^5.2.2", "emojis-list": "^2.0.0", - "json5": "^0.5.0" + "json5": "^1.0.1" + }, + "dependencies": { + "json5": { + "version": "1.0.1", + "resolved": "http://registry.npmjs.org/json5/-/json5-1.0.1.tgz", + "integrity": "sha512-aKS4WQjPenRxiQsC93MNfjx+nbF4PAdYzmd/1JIj8HYzqfbu86beTuNgXDzPknWk0n0uARlyewZo4s++ES36Ow==", + "dev": true, + "requires": { + "minimist": "^1.2.0" + } + } } }, "locate-path": { @@ -9428,6 +8775,24 @@ "requires": { "p-locate": "^2.0.0", "path-exists": "^3.0.0" + }, + "dependencies": { + "path-exists": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", + "integrity": 
"sha1-zg6+ql94yxiSXqfYENe1mwEP1RU=", + "dev": true + } + } + }, + "lock-verify": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/lock-verify/-/lock-verify-2.0.2.tgz", + "integrity": "sha512-QNVwK0EGZBS4R3YQ7F1Ox8p41Po9VGl2QG/2GsuvTbkJZYSsPeWHKMbbH6iZMCHWSMww5nrJroZYnGzI4cePuw==", + "dev": true, + "requires": { + "npm-package-arg": "^5.1.2 || 6", + "semver": "^5.4.1" } }, "lodash": { @@ -9436,114 +8801,34 @@ "integrity": "sha512-cQKh8igo5QUhZ7lg38DYWAxMvjSAKG0A8wGSVimP07SIUEK2UO+arSRKbRZWtelMtN5V0Hkwh5ryOto/SshYIg==", "dev": true }, - "lodash._basecopy": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/lodash._basecopy/-/lodash._basecopy-3.0.1.tgz", - "integrity": "sha1-jaDmqHbPNEwK2KVIghEd08XHyjY=", - "dev": true - }, - "lodash._basetostring": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/lodash._basetostring/-/lodash._basetostring-3.0.1.tgz", - "integrity": "sha1-0YYdh3+CSlL2aYMtyvPuFVZqB9U=", - "dev": true - }, - "lodash._basevalues": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/lodash._basevalues/-/lodash._basevalues-3.0.0.tgz", - "integrity": "sha1-W3dXYoAr3j0yl1A+JjAIIP32Ybc=", - "dev": true - }, - "lodash._getnative": { - "version": "3.9.1", - "resolved": "https://registry.npmjs.org/lodash._getnative/-/lodash._getnative-3.9.1.tgz", - "integrity": "sha1-VwvH3t5G1hzc3mh9ZdPuy6o6r/U=", - "dev": true - }, - "lodash._isiterateecall": { - "version": "3.0.9", - "resolved": "https://registry.npmjs.org/lodash._isiterateecall/-/lodash._isiterateecall-3.0.9.tgz", - "integrity": "sha1-UgOte6Ql+uhCRg5pbbnPPmqsBXw=", - "dev": true - }, - "lodash._reescape": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/lodash._reescape/-/lodash._reescape-3.0.0.tgz", - "integrity": "sha1-Kx1vXf4HyKNVdT5fJ/rH8c3hYWo=", - "dev": true - }, - "lodash._reevaluate": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/lodash._reevaluate/-/lodash._reevaluate-3.0.0.tgz", - "integrity": "sha1-WLx0xAZklTrgsSTYBpltrKQx4u0=", - "dev": true - }, "lodash._reinterpolate": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/lodash._reinterpolate/-/lodash._reinterpolate-3.0.0.tgz", "integrity": "sha1-DM8tiRZq8Ds2Y8eWU4t1rG4RTZ0=", "dev": true }, - "lodash._root": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/lodash._root/-/lodash._root-3.0.1.tgz", - "integrity": "sha1-+6HEUkwZ7ppfgTa0YJ8BfPTe1pI=", - "dev": true - }, "lodash.camelcase": { "version": "4.3.0", "resolved": "https://registry.npmjs.org/lodash.camelcase/-/lodash.camelcase-4.3.0.tgz", "integrity": "sha1-soqmKIorn8ZRA1x3EfZathkDMaY=" }, + "lodash.clonedeep": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/lodash.clonedeep/-/lodash.clonedeep-4.5.0.tgz", + "integrity": "sha1-4j8/nE+Pvd6HJSnBBxhXoIblzO8=", + "dev": true + }, "lodash.debounce": { "version": "4.0.8", "resolved": "https://registry.npmjs.org/lodash.debounce/-/lodash.debounce-4.0.8.tgz", "integrity": "sha1-gteb/zCmfEAF/9XiUVMArZyk168=", "dev": true }, - "lodash.escape": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/lodash.escape/-/lodash.escape-3.2.0.tgz", - "integrity": "sha1-mV7g3BjBtIzJLv+ucaEKq1tIdpg=", - "dev": true, - "requires": { - "lodash._root": "^3.0.0" - } - }, - "lodash.isarguments": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/lodash.isarguments/-/lodash.isarguments-3.1.0.tgz", - "integrity": "sha1-L1c9hcaiQon/AGY7SRwdM4/zRYo=", - "dev": true - }, - "lodash.isarray": { - "version": "3.0.4", - "resolved": 
"https://registry.npmjs.org/lodash.isarray/-/lodash.isarray-3.0.4.tgz", - "integrity": "sha1-eeTriMNqgSKvhvhEqpvNhRtfu1U=", - "dev": true - }, - "lodash.keys": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/lodash.keys/-/lodash.keys-3.1.2.tgz", - "integrity": "sha1-TbwEcrFWvlCgsoaFXRvQsMZWCYo=", - "dev": true, - "requires": { - "lodash._getnative": "^3.0.0", - "lodash.isarguments": "^3.0.0", - "lodash.isarray": "^3.0.0" - } - }, "lodash.padend": { "version": "4.6.1", "resolved": "https://registry.npmjs.org/lodash.padend/-/lodash.padend-4.6.1.tgz", "integrity": "sha1-U8y6BH0G4VjTEfRdpiX05J5vFm4=" }, - "lodash.restparam": { - "version": "3.6.1", - "resolved": "https://registry.npmjs.org/lodash.restparam/-/lodash.restparam-3.6.1.tgz", - "integrity": "sha1-k2pOMJ7zMKdkXtQUWYbIWuWyCAU=", - "dev": true - }, "lodash.sortby": { "version": "4.7.0", "resolved": "https://registry.npmjs.org/lodash.sortby/-/lodash.sortby-4.7.0.tgz", @@ -9551,30 +8836,22 @@ "dev": true }, "lodash.template": { - "version": "3.6.2", - "resolved": "https://registry.npmjs.org/lodash.template/-/lodash.template-3.6.2.tgz", - "integrity": "sha1-+M3sxhaaJVvpCYrosMU9N4kx0U8=", + "version": "4.4.0", + "resolved": "https://registry.npmjs.org/lodash.template/-/lodash.template-4.4.0.tgz", + "integrity": "sha1-5zoDhcg1VZF0bgILmWecaQ5o+6A=", "dev": true, "requires": { - "lodash._basecopy": "^3.0.0", - "lodash._basetostring": "^3.0.0", - "lodash._basevalues": "^3.0.0", - "lodash._isiterateecall": "^3.0.0", - "lodash._reinterpolate": "^3.0.0", - "lodash.escape": "^3.0.0", - "lodash.keys": "^3.0.0", - "lodash.restparam": "^3.0.0", - "lodash.templatesettings": "^3.0.0" + "lodash._reinterpolate": "~3.0.0", + "lodash.templatesettings": "^4.0.0" } }, "lodash.templatesettings": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/lodash.templatesettings/-/lodash.templatesettings-3.1.1.tgz", - "integrity": "sha1-+zB4RHU7Zrnxr6VOJix0UwfbqOU=", + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/lodash.templatesettings/-/lodash.templatesettings-4.1.0.tgz", + "integrity": "sha1-K01OlbpEDZFf8IvImeRVNmZxMxY=", "dev": true, "requires": { - "lodash._reinterpolate": "^3.0.0", - "lodash.escape": "^3.0.0" + "lodash._reinterpolate": "~3.0.0" } }, "log-driver": { @@ -9583,71 +8860,19 @@ "integrity": "sha512-U7KCmLdqsGHBLeWqYlFA0V0Sl6P08EE1ZrmA9cxjUE0WVqT9qnyVDPz1kzpFEP0jdJuFnasWIfSd7fsaNXkpbg==", "dev": true }, - "log-symbols": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/log-symbols/-/log-symbols-2.2.0.tgz", - "integrity": "sha512-VeIAFslyIerEJLXHziedo2basKbMKtTw3vfn5IzG0XTjhAVEJyNHnL2p7vc+wBDSdQuUpNw3M2u6xb9QsAY5Eg==", + "loose-envify": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz", + "integrity": "sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==", "dev": true, "requires": { - "chalk": "^2.0.1" + "js-tokens": "^3.0.0 || ^4.0.0" } }, - "log-update": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/log-update/-/log-update-1.0.2.tgz", - "integrity": "sha1-GZKfZMQJPS0ucHWh2tivWcKWuNE=", - "dev": true, - "requires": { - "ansi-escapes": "^1.0.0", - "cli-cursor": "^1.0.2" - }, - "dependencies": { - "ansi-escapes": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/ansi-escapes/-/ansi-escapes-1.4.0.tgz", - "integrity": "sha1-06ioOzGapneTZisT52HHkRQiMG4=", - "dev": true - }, - "cli-cursor": { - "version": "1.0.2", - "resolved": 
"https://registry.npmjs.org/cli-cursor/-/cli-cursor-1.0.2.tgz", - "integrity": "sha1-ZNo/fValRBLll5S9Ytw1KV6PKYc=", - "dev": true, - "requires": { - "restore-cursor": "^1.0.1" - } - }, - "onetime": { - "version": "1.1.0", - "resolved": "http://registry.npmjs.org/onetime/-/onetime-1.1.0.tgz", - "integrity": "sha1-ofeDj4MUxRbwXs78vEzP4EtO14k=", - "dev": true - }, - "restore-cursor": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/restore-cursor/-/restore-cursor-1.0.1.tgz", - "integrity": "sha1-NGYfRohjJ/7SmRR5FSJS35LapUE=", - "dev": true, - "requires": { - "exit-hook": "^1.0.0", - "onetime": "^1.0.0" - } - } - } - }, - "loose-envify": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz", - "integrity": "sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==", - "dev": true, - "requires": { - "js-tokens": "^3.0.0 || ^4.0.0" - } - }, - "loud-rejection": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/loud-rejection/-/loud-rejection-1.6.0.tgz", - "integrity": "sha1-W0b4AUft7leIcPCG0Eghz5mOVR8=", + "loud-rejection": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/loud-rejection/-/loud-rejection-1.6.0.tgz", + "integrity": "sha1-W0b4AUft7leIcPCG0Eghz5mOVR8=", "dev": true, "requires": { "currently-unhandled": "^0.4.1", @@ -9655,9 +8880,9 @@ } }, "lru-cache": { - "version": "4.1.3", - "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-4.1.3.tgz", - "integrity": "sha512-fFEhvcgzuIoJVUF8fYr5KR0YqxD238zgObTps31YdADwPPAp82a4M8TrckkWyx7ekNlf9aBcVn81cFwwXngrJA==", + "version": "4.1.5", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-4.1.5.tgz", + "integrity": "sha512-sWZlbEP2OsHNkXrMl5GYk/jKk70MBng6UU4YI/qGDYbgf6YbP4EvmqISbXCoJiRKs+1bSpFHVgQxvJ17F2li5g==", "dev": true, "requires": { "pseudomap": "^1.0.2", @@ -9714,14 +8939,6 @@ "dev": true, "requires": { "kind-of": "^6.0.2" - }, - "dependencies": { - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true - } } }, "makeerror": { @@ -9779,282 +8996,6 @@ "micromatch": "^3.0.4", "resolve": "^1.4.0", "stack-trace": "0.0.10" - }, - "dependencies": { - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - }, - "array-unique": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", - "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", - "dev": true - }, - "braces": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", - "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", - "dev": true, - "requires": { - "arr-flatten": "^1.1.0", - "array-unique": "^0.3.2", - "extend-shallow": "^2.0.1", - "fill-range": "^4.0.0", - "isobject": "^3.0.1", - "repeat-element": "^1.1.2", - "snapdragon": "^0.8.1", - "snapdragon-node": "^2.0.1", - "split-string": "^3.0.2", - "to-regex": "^3.0.1" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - 
"expand-brackets": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", - "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", - "dev": true, - "requires": { - "debug": "^2.3.3", - "define-property": "^0.2.5", - "extend-shallow": "^2.0.1", - "posix-character-classes": "^0.1.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - }, - "is-accessor-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", - "integrity": "sha1-qeEss66Nh2cn7u84Q/igiXtcmNY=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-data-descriptor": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", - "integrity": "sha1-C17mSDiOLIYCgueT8YVv7D8wG1Y=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz", - "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^0.1.6", - "is-data-descriptor": "^0.1.4", - "kind-of": "^5.0.0" - } - }, - "kind-of": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz", - "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==", - "dev": true - } - } - }, - "extglob": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", - "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", - "dev": true, - "requires": { - "array-unique": "^0.3.2", - "define-property": "^1.0.0", - "expand-brackets": "^2.1.4", - "extend-shallow": "^2.0.1", - "fragment-cache": "^0.2.1", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", - "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", - "dev": true, - "requires": { - "is-descriptor": "^1.0.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - 
"is-extendable": "^0.1.0" - } - } - } - }, - "fill-range": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", - "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", - "dev": true, - "requires": { - "extend-shallow": "^2.0.1", - "is-number": "^3.0.0", - "repeat-string": "^1.6.1", - "to-regex-range": "^2.1.0" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "is-accessor-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", - "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-data-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", - "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-descriptor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", - "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^1.0.0", - "is-data-descriptor": "^1.0.0", - "kind-of": "^6.0.2" - } - }, - "is-number": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true - }, - "micromatch": { - "version": "3.1.10", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", - "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", - "dev": true, - "requires": { - "arr-diff": "^4.0.0", - "array-unique": "^0.3.2", - "braces": "^2.3.1", - "define-property": "^2.0.2", - "extend-shallow": "^3.0.2", - "extglob": "^2.0.4", - "fragment-cache": "^0.2.1", - "kind-of": "^6.0.2", - "nanomatch": "^1.2.9", - "object.pick": "^1.3.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.2" - } - } } }, "math-random": { @@ -10083,6 +9024,16 @@ "mimic-fn": "^1.0.0" } }, + "memfs": { + "version": "2.14.2", + "resolved": "https://registry.npmjs.org/memfs/-/memfs-2.14.2.tgz", + "integrity": "sha512-y19j9L+b8nuDKwuwrrIOiDhDD2bi7pfL1/Z8kfCyPaoZzHxX2aRcI2Q5T6qdUzqVHWd3plAfxeDT3Crb2eCwUw==", + "dev": true, + "requires": { + "fast-extend": "0.0.2", + "fs-monkey": "^0.3.3" + } + 
}, "memoizee": { "version": "0.4.14", "resolved": "https://registry.npmjs.org/memoizee/-/memoizee-0.4.14.tgz", @@ -10132,6 +9083,15 @@ "trim-newlines": "^2.0.0" }, "dependencies": { + "find-up": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-2.1.0.tgz", + "integrity": "sha1-RdG35QbHF93UgndaK3eSCjwMV6c=", + "dev": true, + "requires": { + "locate-path": "^2.0.0" + } + }, "load-json-file": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/load-json-file/-/load-json-file-4.0.0.tgz", @@ -10214,24 +9174,24 @@ "dev": true }, "micromatch": { - "version": "2.3.11", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-2.3.11.tgz", - "integrity": "sha1-hmd8l9FyCzY0MdBNDRUpO9OMFWU=", + "version": "3.1.10", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", + "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", "dev": true, "requires": { - "arr-diff": "^2.0.0", - "array-unique": "^0.2.1", - "braces": "^1.8.2", - "expand-brackets": "^0.1.4", - "extglob": "^0.3.1", - "filename-regex": "^2.0.0", - "is-extglob": "^1.0.0", - "is-glob": "^2.0.1", - "kind-of": "^3.0.2", - "normalize-path": "^2.0.1", - "object.omit": "^2.0.0", - "parse-glob": "^3.0.4", - "regex-cache": "^0.4.2" + "arr-diff": "^4.0.0", + "array-unique": "^0.3.2", + "braces": "^2.3.1", + "define-property": "^2.0.2", + "extend-shallow": "^3.0.2", + "extglob": "^2.0.4", + "fragment-cache": "^0.2.1", + "kind-of": "^6.0.2", + "nanomatch": "^1.2.9", + "object.pick": "^1.3.0", + "regex-not": "^1.0.0", + "snapdragon": "^0.8.1", + "to-regex": "^3.0.2" } }, "miller-rabin": { @@ -10321,9 +9281,9 @@ } }, "minizlib": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/minizlib/-/minizlib-1.1.1.tgz", - "integrity": "sha512-TrfjCjk4jLhcJyGMYymBH6oTXcWjYbUAXTHDbtnWHjZC25h0cdajHuPE1zxb4DVmu8crfh+HwH/WMuyLG0nHBg==", + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/minizlib/-/minizlib-1.2.1.tgz", + "integrity": "sha512-7+4oTUOWKg7AuL3vloEWekXY2/D20cevzsrNT2kGWm+39J9hGTCBv8VI5Pm5lXZ/o3/mdR4f8rflAPhnQb8mPA==", "dev": true, "requires": { "minipass": "^2.2.1" @@ -10345,6 +9305,18 @@ "pumpify": "^1.3.3", "stream-each": "^1.1.0", "through2": "^2.0.0" + }, + "dependencies": { + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "dev": true, + "requires": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } + } } }, "mixin-deep": { @@ -10438,7 +9410,7 @@ }, "multimatch": { "version": "2.1.0", - "resolved": "https://registry.npmjs.org/multimatch/-/multimatch-2.1.0.tgz", + "resolved": "http://registry.npmjs.org/multimatch/-/multimatch-2.1.0.tgz", "integrity": "sha1-nHkGoi+0wCkZ4vX3UWG0zb1LKis=", "dev": true, "requires": { @@ -10448,13 +9420,14 @@ "minimatch": "^3.0.0" } }, - "multipipe": { - "version": "0.1.2", - "resolved": "https://registry.npmjs.org/multipipe/-/multipipe-0.1.2.tgz", - "integrity": "sha1-Ko8t33Du1WTf8tV/HhoTfZ8FB4s=", + "multistream": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/multistream/-/multistream-2.1.1.tgz", + "integrity": "sha512-xasv76hl6nr1dEy3lPvy7Ej7K/Lx3O/FCvwge8PeVJpciPPoNCbaANcNiBug3IpdvTveZUcAV0DJzdnUDMesNQ==", "dev": true, "requires": { - "duplexer2": "0.0.2" + "inherits": "^2.0.1", + "readable-stream": "^2.0.5" } }, "mute-stdout": { @@ -10465,14 +9438,14 @@ }, "mute-stream": { 
"version": "0.0.7", - "resolved": "https://registry.npmjs.org/mute-stream/-/mute-stream-0.0.7.tgz", + "resolved": "http://registry.npmjs.org/mute-stream/-/mute-stream-0.0.7.tgz", "integrity": "sha1-MHXOk7whuPq0PhvE2n6BFe0ee6s=", "dev": true }, "nan": { - "version": "2.11.1", - "resolved": "https://registry.npmjs.org/nan/-/nan-2.11.1.tgz", - "integrity": "sha512-iji6k87OSXa0CcrLl9z+ZiYSuR2o+c0bGuNmXdrhTQTakxytAFsC56SArGYoiHlJlFoHSnvmhpceZJaXkVuOtA==", + "version": "2.12.1", + "resolved": "https://registry.npmjs.org/nan/-/nan-2.12.1.tgz", + "integrity": "sha512-JY7V6lRkStKcKTvHO5NVSQRv+RV+FIL5pvDoLiAtSL9pKlC5x9PKQcZDsq7m4FO4d57mkhC6Z+QhAh3Jdk5JFw==", "dev": true, "optional": true }, @@ -10493,26 +9466,6 @@ "regex-not": "^1.0.0", "snapdragon": "^0.8.1", "to-regex": "^3.0.1" - }, - "dependencies": { - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - }, - "array-unique": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", - "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true - } } }, "natural-compare": { @@ -10529,7 +9482,7 @@ }, "next-tick": { "version": "1.0.0", - "resolved": "https://registry.npmjs.org/next-tick/-/next-tick-1.0.0.tgz", + "resolved": "http://registry.npmjs.org/next-tick/-/next-tick-1.0.0.tgz", "integrity": "sha1-yobR/ogoFpsBICCOPchCS524NCw=", "dev": true }, @@ -10613,6 +9566,14 @@ "url": "^0.11.0", "util": "^0.10.3", "vm-browserify": "0.0.4" + }, + "dependencies": { + "punycode": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-1.4.1.tgz", + "integrity": "sha1-wNWmOycYgArY4esPpSachN1BhF4=", + "dev": true + } } }, "node-notifier": { @@ -10696,6 +9657,12 @@ } } }, + "npm-logical-tree": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/npm-logical-tree/-/npm-logical-tree-1.2.1.tgz", + "integrity": "sha512-AJI/qxDB2PWI4LG1CYN579AY1vCiNyWfkiquCsJWqntRu/WwimVrC8yXeILBFHDwxfOejxewlmnvW9XXjMlYIg==", + "dev": true + }, "npm-package-arg": { "version": "6.1.0", "resolved": "https://registry.npmjs.org/npm-package-arg/-/npm-package-arg-6.1.0.tgz", @@ -10718,15 +9685,6 @@ "npm-bundled": "^1.0.1" } }, - "npm-path": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/npm-path/-/npm-path-2.0.4.tgz", - "integrity": "sha512-IFsj0R9C7ZdR5cP+ET342q77uSRdtWOlWpih5eC+lu29tIDbNEgDbzgVJ5UFvYHWhxDZ5TFkJafFioO0pPQjCw==", - "dev": true, - "requires": { - "which": "^1.2.10" - } - }, "npm-pick-manifest": { "version": "2.2.3", "resolved": "https://registry.npmjs.org/npm-pick-manifest/-/npm-pick-manifest-2.2.3.tgz", @@ -10738,6 +9696,17 @@ "semver": "^5.4.1" } }, + "npm-profile": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/npm-profile/-/npm-profile-4.0.1.tgz", + "integrity": "sha512-NQ1I/1Q7YRtHZXkcuU1/IyHeLy6pd+ScKg4+DQHdfsm769TGq6HPrkbuNJVJS4zwE+0mvvmeULzQdWn2L2EsVA==", + "dev": true, + "requires": { + "aproba": "^1.1.2 || 2", + "figgy-pudding": "^3.4.1", + "npm-registry-fetch": "^3.8.0" + } + }, "npm-registry-fetch": { "version": "3.8.0", "resolved": "https://registry.npmjs.org/npm-registry-fetch/-/npm-registry-fetch-3.8.0.tgz", @@ -10841,17 +9810,6 @@ "path-key": "^2.0.0" } }, - 
"npm-which": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/npm-which/-/npm-which-3.0.1.tgz", - "integrity": "sha1-kiXybsOihcIJyuZ8OxGmtKtxQKo=", - "dev": true, - "requires": { - "commander": "^2.9.0", - "npm-path": "^2.0.2", - "which": "^1.2.10" - } - }, "npmlog": { "version": "4.1.2", "resolved": "https://registry.npmjs.org/npmlog/-/npmlog-4.1.2.tgz", @@ -10907,6 +9865,15 @@ "requires": { "is-descriptor": "^0.1.0" } + }, + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } } } }, @@ -10923,14 +9890,6 @@ "dev": true, "requires": { "isobject": "^3.0.0" - }, - "dependencies": { - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - } } }, "object.assign": { @@ -10955,23 +9914,6 @@ "array-slice": "^1.0.0", "for-own": "^1.0.0", "isobject": "^3.0.0" - }, - "dependencies": { - "for-own": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/for-own/-/for-own-1.0.0.tgz", - "integrity": "sha1-xjMy9BXO3EsE2/5wz4NklMU8tEs=", - "dev": true, - "requires": { - "for-in": "^1.0.1" - } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - } } }, "object.getownpropertydescriptors": { @@ -10992,17 +9934,6 @@ "requires": { "for-own": "^1.0.0", "make-iterator": "^1.0.0" - }, - "dependencies": { - "for-own": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/for-own/-/for-own-1.0.0.tgz", - "integrity": "sha1-xjMy9BXO3EsE2/5wz4NklMU8tEs=", - "dev": true, - "requires": { - "for-in": "^1.0.1" - } - } } }, "object.omit": { @@ -11013,6 +9944,17 @@ "requires": { "for-own": "^0.1.4", "is-extendable": "^0.1.1" + }, + "dependencies": { + "for-own": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/for-own/-/for-own-0.1.5.tgz", + "integrity": "sha1-UmXGgaTylNq78XyVCbZ2OqhFEM4=", + "dev": true, + "requires": { + "for-in": "^1.0.1" + } + } } }, "object.pick": { @@ -11022,14 +9964,6 @@ "dev": true, "requires": { "isobject": "^3.0.1" - }, - "dependencies": { - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - } } }, "object.reduce": { @@ -11040,17 +9974,6 @@ "requires": { "for-own": "^1.0.0", "make-iterator": "^1.0.0" - }, - "dependencies": { - "for-own": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/for-own/-/for-own-1.0.0.tgz", - "integrity": "sha1-xjMy9BXO3EsE2/5wz4NklMU8tEs=", - "dev": true, - "requires": { - "for-in": "^1.0.1" - } - } } }, "once": { @@ -11128,7 +10051,7 @@ }, "os-homedir": { "version": "1.0.2", - "resolved": "https://registry.npmjs.org/os-homedir/-/os-homedir-1.0.2.tgz", + "resolved": "http://registry.npmjs.org/os-homedir/-/os-homedir-1.0.2.tgz", "integrity": "sha1-/7xJiDNuDoM94MFox+8VISGqf7M=", "dev": true }, @@ -11143,7 +10066,7 @@ }, "os-tmpdir": { "version": "1.0.2", - "resolved": "https://registry.npmjs.org/os-tmpdir/-/os-tmpdir-1.0.2.tgz", + "resolved": "http://registry.npmjs.org/os-tmpdir/-/os-tmpdir-1.0.2.tgz", "integrity": "sha1-u+Z0BseaqFxc/sdm/lc0VV36EnQ=", "dev": true }, @@ -11236,17 +10159,17 @@ } }, "pacote": { - "version": "9.2.3", - "resolved": 
"https://registry.npmjs.org/pacote/-/pacote-9.2.3.tgz", - "integrity": "sha512-Y3+yY3nBRAxMlZWvr62XLJxOwCmG9UmkGZkFurWHoCjqF0cZL72cTOCRJTvWw8T4OhJS2RTg13x4oYYriauvEw==", + "version": "9.3.0", + "resolved": "https://registry.npmjs.org/pacote/-/pacote-9.3.0.tgz", + "integrity": "sha512-uy5xghB5wUtmFS+uNhQGhlsIF9rfsfxw6Zsu2VpmSz4/f+8D2+5V1HwjHdSn7W6aQTrxNNmmoUF5qNE10/EVdA==", "dev": true, "requires": { - "bluebird": "^3.5.2", - "cacache": "^11.2.0", + "bluebird": "^3.5.3", + "cacache": "^11.3.2", "figgy-pudding": "^3.5.1", "get-stream": "^4.1.0", "glob": "^7.1.3", - "lru-cache": "^4.1.3", + "lru-cache": "^5.1.1", "make-fetch-happen": "^4.0.1", "minimatch": "^3.0.4", "minipass": "^2.3.5", @@ -11265,7 +10188,7 @@ "safe-buffer": "^5.1.2", "semver": "^5.6.0", "ssri": "^6.0.1", - "tar": "^4.4.6", + "tar": "^4.4.8", "unique-filename": "^1.1.1", "which": "^1.3.1" }, @@ -11279,6 +10202,25 @@ "pump": "^3.0.0" } }, + "lru-cache": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", + "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==", + "dev": true, + "requires": { + "yallist": "^3.0.2" + } + }, + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "dev": true, + "requires": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } + }, "tar": { "version": "4.4.8", "resolved": "https://registry.npmjs.org/tar/-/tar-4.4.8.tgz", @@ -11302,6 +10244,14 @@ } } }, + "pad-left": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/pad-left/-/pad-left-2.1.0.tgz", + "integrity": "sha1-FuajstRKjhOMsIOMx8tAOk/J6ZQ=", + "requires": { + "repeat-string": "^1.5.4" + } + }, "pako": { "version": "1.0.7", "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.7.tgz", @@ -11359,6 +10309,23 @@ "is-dotfile": "^1.0.0", "is-extglob": "^1.0.0", "is-glob": "^2.0.0" + }, + "dependencies": { + "is-extglob": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", + "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", + "dev": true + }, + "is-glob": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", + "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + } } }, "parse-json": { @@ -11370,6 +10337,12 @@ "error-ex": "^1.2.0" } }, + "parse-node-version": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/parse-node-version/-/parse-node-version-1.0.0.tgz", + "integrity": "sha512-02GTVHD1u0nWc20n2G7WX/PgdhNFG04j5fi1OkaJzPWLTcf6vh6229Lta1wTmXG/7Dg42tCssgkccVt7qvd8Kg==", + "dev": true + }, "parse-passwd": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/parse-passwd/-/parse-passwd-1.0.0.tgz", @@ -11390,7 +10363,7 @@ }, "path-browserify": { "version": "0.0.0", - "resolved": "https://registry.npmjs.org/path-browserify/-/path-browserify-0.0.0.tgz", + "resolved": "http://registry.npmjs.org/path-browserify/-/path-browserify-0.0.0.tgz", "integrity": "sha1-oLhwcpquIUAFt9UDLsLLuw+0RRo=", "dev": true }, @@ -11401,14 +10374,17 @@ "dev": true }, "path-exists": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", - "integrity": "sha1-zg6+ql94yxiSXqfYENe1mwEP1RU=", - "dev": true + "version": "2.1.0", + "resolved": 
"https://registry.npmjs.org/path-exists/-/path-exists-2.1.0.tgz", + "integrity": "sha1-D+tsZPD8UY2adU3V77YscCJ2H0s=", + "dev": true, + "requires": { + "pinkie-promise": "^2.0.0" + } }, "path-is-absolute": { "version": "1.0.1", - "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", + "resolved": "http://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", "integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18=", "dev": true }, @@ -11517,6 +10493,17 @@ "dev": true, "requires": { "find-up": "^2.1.0" + }, + "dependencies": { + "find-up": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-2.1.0.tgz", + "integrity": "sha1-RdG35QbHF93UgndaK3eSCjwMV6c=", + "dev": true, + "requires": { + "locate-path": "^2.0.0" + } + } } }, "platform": { @@ -11525,15 +10512,6 @@ "integrity": "sha512-TuvHS8AOIZNAlE77WUDiR4rySV/VMptyMfcfeoMgs4P8apaZM3JrnbzBiixKUv+XR6i+BXrQh8WAnjaSPFO65Q==", "dev": true }, - "please-upgrade-node": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/please-upgrade-node/-/please-upgrade-node-3.1.1.tgz", - "integrity": "sha512-KY1uHnQ2NlQHqIJQpnh/i54rKkuxCEBx+voJIS/Mvb+L2iYd2NMotwduhKTMjfC1uKoX3VXOxLjIYG66dfJTVQ==", - "dev": true, - "requires": { - "semver-compare": "^1.0.0" - } - }, "plugin-error": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/plugin-error/-/plugin-error-1.0.1.tgz", @@ -11544,23 +10522,6 @@ "arr-diff": "^4.0.0", "arr-union": "^3.1.0", "extend-shallow": "^3.0.2" - }, - "dependencies": { - "ansi-colors": { - "version": "1.1.0", - "resolved": "http://registry.npmjs.org/ansi-colors/-/ansi-colors-1.1.0.tgz", - "integrity": "sha512-SFKX67auSNoVR38N3L+nvsPjOE0bybKTYbkf5tRvushrAPQ9V75huw0ZxBkKVeRU9kqH3d6HA4xTckbwZ4ixmA==", - "dev": true, - "requires": { - "ansi-wrap": "^0.1.0" - } - }, - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - } } }, "pn": { @@ -11624,15 +10585,15 @@ "dev": true }, "process-nextick-args": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz", - "integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw==", + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-1.0.7.tgz", + "integrity": "sha1-FQ4gt1ZZCtP5EJPyWk8q2L/zC6M=", "dev": true }, "progress": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.2.tgz", - "integrity": "sha512-/OLz5F9beZUWwSHZDreXgap1XShX6W+DCHQCqwCF7uZ88s6uTlD2cR3JBE77SegCmNtb1Idst+NfmwcdU6KVhw==", + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz", + "integrity": "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==", "dev": true }, "promise": { @@ -11707,9 +10668,9 @@ "dev": true }, "psl": { - "version": "1.1.29", - "resolved": "https://registry.npmjs.org/psl/-/psl-1.1.29.tgz", - "integrity": "sha512-AeUmQ0oLN02flVHXWh9sSJF7mcdFq0ppid/JkErufc3hGIV/AMa8Fo9VgDo/cT2jFdOWoFvHp90qqBH54W+gjQ==", + "version": "1.1.31", + "resolved": "https://registry.npmjs.org/psl/-/psl-1.1.31.tgz", + "integrity": "sha512-/6pt4+C+T+wZUieKR620OpzN/LlnNKuWjy1iFLQ/UG35JqHlR/89MP1d96dUfkf6Dne3TuLQzOYEYshJ+Hx8mw==", "dev": true }, "public-encrypt": { @@ -11727,9 +10688,9 @@ } }, "pump": { - "version": "3.0.0", - "resolved": 
"https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", - "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/pump/-/pump-2.0.1.tgz", + "integrity": "sha512-ruPMNRkN3MHP1cWJc9OWr+T/xDP0jhXYCLfJcBuX54hhfIBnaQmAUMfDcG4DM5UMWByBbJY69QSphm3jtDKIkA==", "dev": true, "requires": { "end-of-stream": "^1.1.0", @@ -11745,24 +10706,12 @@ "duplexify": "^3.6.0", "inherits": "^2.0.3", "pump": "^2.0.0" - }, - "dependencies": { - "pump": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/pump/-/pump-2.0.1.tgz", - "integrity": "sha512-ruPMNRkN3MHP1cWJc9OWr+T/xDP0jhXYCLfJcBuX54hhfIBnaQmAUMfDcG4DM5UMWByBbJY69QSphm3jtDKIkA==", - "dev": true, - "requires": { - "end-of-stream": "^1.1.0", - "once": "^1.3.1" - } - } } }, "punycode": { - "version": "1.4.1", - "resolved": "https://registry.npmjs.org/punycode/-/punycode-1.4.1.tgz", - "integrity": "sha1-wNWmOycYgArY4esPpSachN1BhF4=", + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.1.1.tgz", + "integrity": "sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A==", "dev": true }, "q": { @@ -11811,12 +10760,6 @@ "resolved": "https://registry.npmjs.org/is-number/-/is-number-4.0.0.tgz", "integrity": "sha512-rSklcAIlf1OmFdyAqbnWTLVelsQ58uvZ66S/ZyawjWqIviTWCjg2PzVGw8WUA+nNuPTqb4wgA+NszrJ+08LlgQ==", "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true } } }, @@ -11902,27 +10845,6 @@ "requires": { "find-up": "^1.0.0", "read-pkg": "^1.0.0" - }, - "dependencies": { - "find-up": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/find-up/-/find-up-1.1.2.tgz", - "integrity": "sha1-ay6YIrGizgpgq2TWEOzK1TyyTQ8=", - "dev": true, - "requires": { - "path-exists": "^2.0.0", - "pinkie-promise": "^2.0.0" - } - }, - "path-exists": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-2.1.0.tgz", - "integrity": "sha1-D+tsZPD8UY2adU3V77YscCJ2H0s=", - "dev": true, - "requires": { - "pinkie-promise": "^2.0.0" - } - } } }, "readable-stream": { @@ -11938,6 +10860,14 @@ "safe-buffer": "~5.1.1", "string_decoder": "~1.1.1", "util-deprecate": "~1.0.1" + }, + "dependencies": { + "process-nextick-args": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz", + "integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw==", + "dev": true + } } }, "readdir-scoped-modules": { @@ -11961,282 +10891,6 @@ "graceful-fs": "^4.1.11", "micromatch": "^3.1.10", "readable-stream": "^2.0.2" - }, - "dependencies": { - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - }, - "array-unique": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", - "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", - "dev": true - }, - "braces": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", - "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", - "dev": true, - "requires": { - 
"arr-flatten": "^1.1.0", - "array-unique": "^0.3.2", - "extend-shallow": "^2.0.1", - "fill-range": "^4.0.0", - "isobject": "^3.0.1", - "repeat-element": "^1.1.2", - "snapdragon": "^0.8.1", - "snapdragon-node": "^2.0.1", - "split-string": "^3.0.2", - "to-regex": "^3.0.1" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "expand-brackets": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", - "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", - "dev": true, - "requires": { - "debug": "^2.3.3", - "define-property": "^0.2.5", - "extend-shallow": "^2.0.1", - "posix-character-classes": "^0.1.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - }, - "is-accessor-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", - "integrity": "sha1-qeEss66Nh2cn7u84Q/igiXtcmNY=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-data-descriptor": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", - "integrity": "sha1-C17mSDiOLIYCgueT8YVv7D8wG1Y=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz", - "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^0.1.6", - "is-data-descriptor": "^0.1.4", - "kind-of": "^5.0.0" - } - }, - "kind-of": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz", - "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==", - "dev": true - } - } - }, - "extglob": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", - "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", - "dev": true, - "requires": { - "array-unique": "^0.3.2", - "define-property": "^1.0.0", - "expand-brackets": "^2.1.4", - "extend-shallow": "^2.0.1", - "fragment-cache": "^0.2.1", - 
"regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", - "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", - "dev": true, - "requires": { - "is-descriptor": "^1.0.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "fill-range": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", - "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", - "dev": true, - "requires": { - "extend-shallow": "^2.0.1", - "is-number": "^3.0.0", - "repeat-string": "^1.6.1", - "to-regex-range": "^2.1.0" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "is-accessor-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", - "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-data-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", - "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-descriptor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", - "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^1.0.0", - "is-data-descriptor": "^1.0.0", - "kind-of": "^6.0.2" - } - }, - "is-number": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true - }, - "micromatch": { - "version": "3.1.10", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", - "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", - "dev": true, - "requires": { - "arr-diff": "^4.0.0", - "array-unique": "^0.3.2", - "braces": "^2.3.1", - "define-property": "^2.0.2", - "extend-shallow": "^3.0.2", - "extglob": "^2.0.4", 
- "fragment-cache": "^0.2.1", - "kind-of": "^6.0.2", - "nanomatch": "^1.2.9", - "object.pick": "^1.3.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.2" - } - } } }, "realpath-native": { @@ -12333,8 +10987,7 @@ "repeat-string": { "version": "1.6.1", "resolved": "https://registry.npmjs.org/repeat-string/-/repeat-string-1.6.1.tgz", - "integrity": "sha1-jcrkcOHIirwtYA//Sndihtp15jc=", - "dev": true + "integrity": "sha1-jcrkcOHIirwtYA//Sndihtp15jc=" }, "repeating": { "version": "2.0.1", @@ -12423,12 +11076,12 @@ "dev": true }, "resolve": { - "version": "1.8.1", - "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.8.1.tgz", - "integrity": "sha512-AicPrAC7Qu1JxPCZ9ZgCZlY35QgFnNqc+0LtbRNxnVw4TXvjQ72wnuL9JQcEBgXkI9JM8MsT9kaQoHcpCRJOYA==", + "version": "1.9.0", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.9.0.tgz", + "integrity": "sha512-TZNye00tI67lwYvzxCxHGjwTNlUV70io54/Ed4j6PscB8xVfuBJpRenI/o6dVk0cY0PYTY27AgCoGGxRnYuItQ==", "dev": true, "requires": { - "path-parse": "^1.0.5" + "path-parse": "^1.0.6" } }, "resolve-cwd": { @@ -12582,14 +11235,6 @@ "dev": true, "requires": { "symbol-observable": "1.0.1" - }, - "dependencies": { - "symbol-observable": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/symbol-observable/-/symbol-observable-1.0.1.tgz", - "integrity": "sha1-g0D8RwLDEi310iKI+IKD9RPT/dQ=", - "dev": true - } } }, "safe-buffer": { @@ -12628,292 +11273,6 @@ "minimist": "^1.1.1", "walker": "~1.0.5", "watch": "~0.18.0" - }, - "dependencies": { - "anymatch": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-2.0.0.tgz", - "integrity": "sha512-5teOsQWABXHHBFP9y3skS5P3d/WfWXpv3FUpy+LorMrNYaT9pI4oLMQX7jzQ2KklNpGpWHzdCXTDT2Y3XGlZBw==", - "dev": true, - "requires": { - "micromatch": "^3.1.4", - "normalize-path": "^2.1.1" - } - }, - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - }, - "array-unique": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", - "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", - "dev": true - }, - "braces": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", - "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", - "dev": true, - "requires": { - "arr-flatten": "^1.1.0", - "array-unique": "^0.3.2", - "extend-shallow": "^2.0.1", - "fill-range": "^4.0.0", - "isobject": "^3.0.1", - "repeat-element": "^1.1.2", - "snapdragon": "^0.8.1", - "snapdragon-node": "^2.0.1", - "split-string": "^3.0.2", - "to-regex": "^3.0.1" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "expand-brackets": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", - "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", - "dev": true, - "requires": { - "debug": "^2.3.3", - "define-property": "^0.2.5", - "extend-shallow": "^2.0.1", - "posix-character-classes": "^0.1.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "0.2.5", - "resolved": 
"https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - }, - "is-accessor-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", - "integrity": "sha1-qeEss66Nh2cn7u84Q/igiXtcmNY=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-data-descriptor": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", - "integrity": "sha1-C17mSDiOLIYCgueT8YVv7D8wG1Y=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz", - "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^0.1.6", - "is-data-descriptor": "^0.1.4", - "kind-of": "^5.0.0" - } - }, - "kind-of": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz", - "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==", - "dev": true - } - } - }, - "extglob": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", - "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", - "dev": true, - "requires": { - "array-unique": "^0.3.2", - "define-property": "^1.0.0", - "expand-brackets": "^2.1.4", - "extend-shallow": "^2.0.1", - "fragment-cache": "^0.2.1", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", - "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", - "dev": true, - "requires": { - "is-descriptor": "^1.0.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "fill-range": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", - "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", - "dev": true, - "requires": { - "extend-shallow": "^2.0.1", - "is-number": "^3.0.0", - "repeat-string": "^1.6.1", - "to-regex-range": "^2.1.0" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": 
"https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "is-accessor-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", - "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-data-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", - "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-descriptor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", - "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^1.0.0", - "is-data-descriptor": "^1.0.0", - "kind-of": "^6.0.2" - } - }, - "is-number": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true - }, - "micromatch": { - "version": "3.1.10", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", - "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", - "dev": true, - "requires": { - "arr-diff": "^4.0.0", - "array-unique": "^0.3.2", - "braces": "^2.3.1", - "define-property": "^2.0.2", - "extend-shallow": "^3.0.2", - "extglob": "^2.0.4", - "fragment-cache": "^0.2.1", - "kind-of": "^6.0.2", - "nanomatch": "^1.2.9", - "object.pick": "^1.3.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.2" - } - } } }, "sax": { @@ -12931,32 +11290,6 @@ "ajv": "^6.1.0", "ajv-errors": "^1.0.0", "ajv-keywords": "^3.1.0" - }, - "dependencies": { - "ajv": { - "version": "6.5.4", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.5.4.tgz", - "integrity": "sha512-4Wyjt8+t6YszqaXnLDfMmG/8AlO5Zbcsy3ATHncCzjW/NoPzAId8AK6749Ybjmdt+kUY1gP60fCu46oDxPv/mg==", - "dev": true, - "requires": { - "fast-deep-equal": "^2.0.1", - "fast-json-stable-stringify": "^2.0.0", - "json-schema-traverse": "^0.4.1", - "uri-js": "^4.2.2" - } - }, - "fast-deep-equal": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-2.0.1.tgz", - "integrity": "sha1-ewUhjd+WZ79/Nwv3/bLLFf3Qqkk=", - "dev": true - }, - "json-schema-traverse": { - "version": "0.4.1", - "resolved": 
"https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", - "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", - "dev": true - } } }, "semver": { @@ -12965,12 +11298,6 @@ "integrity": "sha512-RS9R6R35NYgQn++fkDWaOmqGoj4Ek9gGs+DPxNUZKuwE183xjJroKvyo1IzVFeXvUrvmALy6FWD5xrdJT25gMg==", "dev": true }, - "semver-compare": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/semver-compare/-/semver-compare-1.0.0.tgz", - "integrity": "sha1-De4hahyUGrN+nvsXiPavxf9VN/w=", - "dev": true - }, "semver-greatest-satisfied-range": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/semver-greatest-satisfied-range/-/semver-greatest-satisfied-range-1.1.0.tgz", @@ -12981,9 +11308,9 @@ } }, "serialize-javascript": { - "version": "1.5.0", - "resolved": "https://registry.npmjs.org/serialize-javascript/-/serialize-javascript-1.5.0.tgz", - "integrity": "sha512-Ga8c8NjAAp46Br4+0oZ2WxJCwIzwP60Gq1YPgU+39PiTVxyed/iKE/zyZI6+UlVYH5Q4PaQdHhcegIFPZTUfoQ==", + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/serialize-javascript/-/serialize-javascript-1.6.1.tgz", + "integrity": "sha512-A5MOagrPFga4YaKQSWHryl7AXvbQkEqpw4NNYMTNYUNV51bA8ABHgYFpqKx+YFFrw59xMV1qGH1R4AgoNIVgCw==", "dev": true }, "set-blocking": { @@ -13059,9 +11386,9 @@ } }, "shelljs": { - "version": "0.8.2", - "resolved": "https://registry.npmjs.org/shelljs/-/shelljs-0.8.2.tgz", - "integrity": "sha512-pRXeNrCA2Wd9itwhvLp5LZQvPJ0wU6bcjaTMywHHGX5XWhVN2nzSu7WV0q+oUY7mGK3mgSkDDzP3MgjqdyIgbQ==", + "version": "0.8.3", + "resolved": "https://registry.npmjs.org/shelljs/-/shelljs-0.8.3.tgz", + "integrity": "sha512-fc0BKlAWiLpwZljmOvAOTE/gXawtCoNrP5oaY7KIaQbbyHeQVg01pSEuEGvGh3HEdBU4baCD7wQBwADmM/7f7A==", "dev": true, "requires": { "glob": "^7.0.0", @@ -13104,12 +11431,6 @@ "integrity": "sha1-xB8vbDn8FtHNF61LXYlhFK5HDVU=", "dev": true }, - "slice-ansi": { - "version": "0.0.4", - "resolved": "http://registry.npmjs.org/slice-ansi/-/slice-ansi-0.0.4.tgz", - "integrity": "sha1-7b+JA/ZvfOL46v1s7tZeJkyDGzU=", - "dev": true - }, "slide": { "version": "1.1.6", "resolved": "https://registry.npmjs.org/slide/-/slide-1.1.6.tgz", @@ -13206,18 +11527,6 @@ "is-data-descriptor": "^1.0.0", "kind-of": "^6.0.2" } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true } } }, @@ -13228,6 +11537,17 @@ "dev": true, "requires": { "kind-of": "^3.2.0" + }, + "dependencies": { + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + } } }, "socks": { @@ -13316,9 +11636,9 @@ "dev": true }, "spdx-correct": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/spdx-correct/-/spdx-correct-3.0.2.tgz", - "integrity": "sha512-q9hedtzyXHr5S0A1vEPoK/7l8NpfkFYTq6iCY+Pno2ZbdZR6WexZFtqeVGkGxW3TEJMN914Z55EnAGMmenlIQQ==", + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/spdx-correct/-/spdx-correct-3.1.0.tgz", + "integrity": "sha512-lr2EZCctC2BNR7j7WzJ2FpDznxky1sjfxvvYEyzxNyb6lZXHODmEoJeFu4JupYlkfha1KZpJyoqiJ7pgA1qq8Q==", "dev": true, 
"requires": { "spdx-expression-parse": "^3.0.0", @@ -13342,9 +11662,9 @@ } }, "spdx-license-ids": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/spdx-license-ids/-/spdx-license-ids-3.0.2.tgz", - "integrity": "sha512-qky9CVt0lVIECkEsYbNILVnPvycuEBkXoMFLRWsREkomQLevYhtRKC+R91a5TOAQ3bCMjikRwhyaRqj1VYatYg==", + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/spdx-license-ids/-/spdx-license-ids-3.0.3.tgz", + "integrity": "sha512-uBIcIl3Ih6Phe3XHK1NqboJLdGfwr1UN3k6wSD1dZpmPsIkb8AGNbZYJ1fOBk834+Gxy8rpfDxrS6XLEMZMY2g==", "dev": true }, "split": { @@ -13376,14 +11696,14 @@ }, "sprintf-js": { "version": "1.0.3", - "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz", + "resolved": "http://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz", "integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw=", "dev": true }, "sshpk": { - "version": "1.15.2", - "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.15.2.tgz", - "integrity": "sha512-Ra/OXQtuh0/enyl4ETZAfTaeksa6BXks5ZcjpSUNrjBr0DvrJKX+1fsKDPpT9TBXgHAFsa4510aNVgI8g/+SzA==", + "version": "1.16.0", + "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.16.0.tgz", + "integrity": "sha512-Zhev35/y7hRMcID/upReIvRse+I9SVhyVre/KTJSJQWMz3C3+G+HpO7m1wK/yckEtujKZ7dS4hkVxAnmHaIGVQ==", "dev": true, "requires": { "asn1": "~0.2.3", @@ -13413,15 +11733,9 @@ "dev": true }, "stack-utils": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/stack-utils/-/stack-utils-1.0.1.tgz", - "integrity": "sha1-1PM6tU6OOHeLDKXP07OvsS22hiA=", - "dev": true - }, - "staged-git-files": { - "version": "1.1.1", - "resolved": "http://registry.npmjs.org/staged-git-files/-/staged-git-files-1.1.1.tgz", - "integrity": "sha512-H89UNKr1rQJvI1c/PIR3kiAMBV23yvR7LItZiV74HWZwzt7f3YHuujJ9nJZlt58WlFox7XQsOahexwk7nTe69A==", + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/stack-utils/-/stack-utils-1.0.2.tgz", + "integrity": "sha512-MTX+MeG5U994cazkjd/9KNAapsHnibjMLnfXodlkXw76JEea0UiNzrqidzo1emMwk7w5Qhc9jd4Bn9TBb1MFwA==", "dev": true }, "static-extend": { @@ -13453,7 +11767,7 @@ }, "stream-browserify": { "version": "2.0.1", - "resolved": "https://registry.npmjs.org/stream-browserify/-/stream-browserify-2.0.1.tgz", + "resolved": "http://registry.npmjs.org/stream-browserify/-/stream-browserify-2.0.1.tgz", "integrity": "sha1-ZiZu5fm9uZQKTkUUyvtDu3Hlyds=", "dev": true, "requires": { @@ -13496,12 +11810,6 @@ "integrity": "sha1-1cdSgl5TZ+eG944Y5EXqIjoVWVI=", "dev": true }, - "string-argv": { - "version": "0.0.2", - "resolved": "https://registry.npmjs.org/string-argv/-/string-argv-0.0.2.tgz", - "integrity": "sha1-2sMECGkMIfPDYwo/86BYd73L1zY=", - "dev": true - }, "string-length": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/string-length/-/string-length-2.0.0.tgz", @@ -13531,7 +11839,7 @@ }, "string-width": { "version": "1.0.2", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-1.0.2.tgz", + "resolved": "http://registry.npmjs.org/string-width/-/string-width-1.0.2.tgz", "integrity": "sha1-EYvfW4zcUaKn5w0hHgfisLmxB9M=", "dev": true, "requires": { @@ -13553,23 +11861,18 @@ }, "string_decoder": { "version": "1.1.1", - "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "resolved": "http://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", "dev": true, "requires": { "safe-buffer": "~5.1.0" } }, - "stringify-object": { - "version": "3.3.0", - 
"resolved": "https://registry.npmjs.org/stringify-object/-/stringify-object-3.3.0.tgz", - "integrity": "sha512-rHqiFh1elqCQ9WPLIC8I0Q/g/wj5J1eMkyoiD6eoQApWHP0FtlK7rqnhmabL5VUY9JQCcqwwvlOaSuutekgyrw==", - "dev": true, - "requires": { - "get-own-enumerable-property-symbols": "^3.0.0", - "is-obj": "^1.0.1", - "is-regexp": "^1.0.0" - } + "stringify-package": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/stringify-package/-/stringify-package-1.0.0.tgz", + "integrity": "sha512-JIQqiWmLiEozOC0b0BtxZ/AOUtdUZHCBPgqIZ2kSJJqGwgb9neo44XdTHUC4HZSGqi03hOeB7W/E8rAlKnGe9g==", + "dev": true }, "strip-ansi": { "version": "3.0.1", @@ -13608,26 +11911,16 @@ "dev": true }, "strong-log-transformer": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/strong-log-transformer/-/strong-log-transformer-2.0.0.tgz", - "integrity": "sha512-FQmNqAXJgOX8ygOcvPLlGWBNT41mvNJ9ALoYf0GTwVt9t30mGTqpmp/oJx5gLcu52DXK10kS7dVWhx8aPXDTlg==", + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/strong-log-transformer/-/strong-log-transformer-2.1.0.tgz", + "integrity": "sha512-B3Hgul+z0L9a236FAUC9iZsL+nVHgoCJnqCbN588DjYxvGXaXaaFbfmQ/JhvKjZwsOukuR72XbHv71Qkug0HxA==", "dev": true, "requires": { - "byline": "^5.0.0", "duplexer": "^0.1.1", "minimist": "^1.2.0", "through": "^2.3.4" } }, - "subarg": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/subarg/-/subarg-1.0.0.tgz", - "integrity": "sha1-9izxdYHplrSPyWVpn1TAauJouNI=", - "dev": true, - "requires": { - "minimist": "^1.1.0" - } - }, "supports-color": { "version": "5.5.0", "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz", @@ -13647,9 +11940,9 @@ } }, "symbol-observable": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/symbol-observable/-/symbol-observable-1.2.0.tgz", - "integrity": "sha512-e900nM8RRtGhlV36KGEU9k65K3mPb1WV70OdjfxlG2EAuM1noi/E/BaW/uMhL7bPEssK8QV57vN3esixjUvcXQ==", + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/symbol-observable/-/symbol-observable-1.0.1.tgz", + "integrity": "sha1-g0D8RwLDEi310iKI+IKD9RPT/dQ=", "dev": true }, "symbol-tree": { @@ -13678,7 +11971,7 @@ }, "tar": { "version": "2.2.1", - "resolved": "https://registry.npmjs.org/tar/-/tar-2.2.1.tgz", + "resolved": "http://registry.npmjs.org/tar/-/tar-2.2.1.tgz", "integrity": "sha1-jk0qJWwOIYXGsYrWlK7JaLg8sdE=", "dev": true, "requires": { @@ -13708,9 +12001,9 @@ } }, "terser": { - "version": "3.10.8", - "resolved": "https://registry.npmjs.org/terser/-/terser-3.10.8.tgz", - "integrity": "sha512-GQJHWJ/vbx0EgRk+lBMONMmKaT+ifeo/XgT/hi3KpzEEFOERVyFuJSVXH8grcmJjiqKY35ds8rBCxvABUeyyuQ==", + "version": "3.13.1", + "resolved": "https://registry.npmjs.org/terser/-/terser-3.13.1.tgz", + "integrity": "sha512-ogyZye4DFqOtMzT92Y3Nxxw8OvXmL39HOALro4fc+EUYFFF9G/kk0znkvwMz6PPYgBtdKAodh3FPR70eugdaQA==", "dev": true, "requires": { "commander": "~2.17.1", @@ -13737,9 +12030,9 @@ } }, "terser-webpack-plugin": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/terser-webpack-plugin/-/terser-webpack-plugin-1.1.0.tgz", - "integrity": "sha512-61lV0DSxMAZ8AyZG7/A4a3UPlrbOBo8NIQ4tJzLPAdGOQ+yoNC7l5ijEow27lBAL2humer01KLS6bGIMYQxKoA==", + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/terser-webpack-plugin/-/terser-webpack-plugin-1.2.1.tgz", + "integrity": "sha512-GGSt+gbT0oKcMDmPx4SRSfJPE1XaN3kQRWG4ghxKQw9cn5G9x6aCKSsgYdvyM0na9NJ4Drv0RG6jbBByZ5CMjw==", "dev": true, "requires": { "cacache": "^11.0.2", @@ -13771,54 +12064,145 @@ "object-assign": "^4.1.0", "read-pkg-up": "^1.0.1", 
"require-main-filename": "^1.0.1" - } - }, - "test-value": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/test-value/-/test-value-3.0.0.tgz", - "integrity": "sha512-sVACdAWcZkSU9x7AOmJo5TqE+GyNJknHaHsMrR6ZnhjVlVN9Yx6FjHrsKZ3BjIpPCT68zYesPWkakrNupwfOTQ==", - "requires": { - "array-back": "^2.0.0", - "typical": "^2.6.1" - } - }, - "text-encoding-utf-8": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/text-encoding-utf-8/-/text-encoding-utf-8-1.0.2.tgz", - "integrity": "sha512-8bw4MY9WjdsD2aMtO0OzOCY3pXGYNx2d2FfHRVUKkiCPDWjKuOlhLVASS+pD7VkLTVjW268LYJHwsnPFlBpbAg==" - }, - "text-extensions": { - "version": "1.9.0", - "resolved": "https://registry.npmjs.org/text-extensions/-/text-extensions-1.9.0.tgz", - "integrity": "sha512-wiBrwC1EhBelW12Zy26JeOUkQ5mRu+5o8rpsJk5+2t+Y5vE7e842qtZDQ2g1NpX/29HdyFeJ4nSIhI47ENSxlQ==", - "dev": true - }, - "throat": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/throat/-/throat-4.1.0.tgz", - "integrity": "sha1-iQN8vJLFarGJJua6TLsgDhVnKmo=", - "dev": true - }, - "through": { - "version": "2.3.8", - "resolved": "http://registry.npmjs.org/through/-/through-2.3.8.tgz", + }, + "dependencies": { + "arr-diff": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-2.0.0.tgz", + "integrity": "sha1-jzuCf5Vai9ZpaX5KQlasPOrjVs8=", + "dev": true, + "requires": { + "arr-flatten": "^1.0.1" + } + }, + "array-unique": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.2.1.tgz", + "integrity": "sha1-odl8yvy8JiXMcPrc6zalDFiwGlM=", + "dev": true + }, + "braces": { + "version": "1.8.5", + "resolved": "https://registry.npmjs.org/braces/-/braces-1.8.5.tgz", + "integrity": "sha1-uneWLhLf+WnWt2cR6RS3N4V79qc=", + "dev": true, + "requires": { + "expand-range": "^1.8.1", + "preserve": "^0.2.0", + "repeat-element": "^1.1.2" + } + }, + "expand-brackets": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-0.1.5.tgz", + "integrity": "sha1-3wcoTjQqgHzXM6xa9yQR5YHRF3s=", + "dev": true, + "requires": { + "is-posix-bracket": "^0.1.0" + } + }, + "extglob": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/extglob/-/extglob-0.3.2.tgz", + "integrity": "sha1-Lhj/PS9JqydlzskCPwEdqo2DSaE=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, + "is-extglob": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", + "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", + "dev": true + }, + "is-glob": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", + "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", + "dev": true, + "requires": { + "is-extglob": "^1.0.0" + } + }, + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + }, + "micromatch": { + "version": "2.3.11", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-2.3.11.tgz", + "integrity": "sha1-hmd8l9FyCzY0MdBNDRUpO9OMFWU=", + "dev": true, + "requires": { + "arr-diff": "^2.0.0", + "array-unique": "^0.2.1", + "braces": "^1.8.2", + "expand-brackets": "^0.1.4", + "extglob": "^0.3.1", + "filename-regex": "^2.0.0", + "is-extglob": "^1.0.0", + "is-glob": "^2.0.1", + "kind-of": "^3.0.2", + "normalize-path": "^2.0.1", + "object.omit": "^2.0.0", + "parse-glob": "^3.0.4", + "regex-cache": "^0.4.2" 
+ } + } + } + }, + "test-value": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/test-value/-/test-value-3.0.0.tgz", + "integrity": "sha512-sVACdAWcZkSU9x7AOmJo5TqE+GyNJknHaHsMrR6ZnhjVlVN9Yx6FjHrsKZ3BjIpPCT68zYesPWkakrNupwfOTQ==", + "requires": { + "array-back": "^2.0.0", + "typical": "^2.6.1" + } + }, + "text-encoding-utf-8": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/text-encoding-utf-8/-/text-encoding-utf-8-1.0.2.tgz", + "integrity": "sha512-8bw4MY9WjdsD2aMtO0OzOCY3pXGYNx2d2FfHRVUKkiCPDWjKuOlhLVASS+pD7VkLTVjW268LYJHwsnPFlBpbAg==" + }, + "text-extensions": { + "version": "1.9.0", + "resolved": "https://registry.npmjs.org/text-extensions/-/text-extensions-1.9.0.tgz", + "integrity": "sha512-wiBrwC1EhBelW12Zy26JeOUkQ5mRu+5o8rpsJk5+2t+Y5vE7e842qtZDQ2g1NpX/29HdyFeJ4nSIhI47ENSxlQ==", + "dev": true + }, + "throat": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/throat/-/throat-4.1.0.tgz", + "integrity": "sha1-iQN8vJLFarGJJua6TLsgDhVnKmo=", + "dev": true + }, + "through": { + "version": "2.3.8", + "resolved": "http://registry.npmjs.org/through/-/through-2.3.8.tgz", "integrity": "sha1-DdTJ/6q8NXlgsbckEV1+Doai4fU=", "dev": true }, "through2": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/through2/-/through2-2.0.3.tgz", - "integrity": "sha1-AARWmzfHx0ujnEPzzteNGtlBQL4=", + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/through2/-/through2-2.0.5.tgz", + "integrity": "sha512-/mrRod8xqpA+IHSLyGCQ2s8SPHiCDEeQJSep1jqLYeEUClOFG2Qsh+4FU6G9VeqpZnGW/Su8LQGc4YKni5rYSQ==", "dev": true, "requires": { - "readable-stream": "^2.1.5", + "readable-stream": "~2.3.6", "xtend": "~4.0.1" } }, "through2-filter": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/through2-filter/-/through2-filter-2.0.0.tgz", - "integrity": "sha1-YLxVoNrLdghdsfna6Zq0P4PWIuw=", + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/through2-filter/-/through2-filter-3.0.0.tgz", + "integrity": "sha512-jaRjI2WxN3W1V8/FMZ9HKIBXixtiqs3SQSX4/YGIiP3gL6djW48VoZq9tDqeCWs3MT8YY5wb/zli8VW8snY1CA==", "dev": true, "requires": { "through2": "~2.0.0", @@ -13894,6 +12278,17 @@ "dev": true, "requires": { "kind-of": "^3.0.2" + }, + "dependencies": { + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + } } }, "to-regex": { @@ -13916,17 +12311,6 @@ "requires": { "is-number": "^3.0.0", "repeat-string": "^1.6.1" - }, - "dependencies": { - "is-number": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - } - } } }, "to-through": { @@ -13946,6 +12330,14 @@ "requires": { "psl": "^1.1.24", "punycode": "^1.4.1" + }, + "dependencies": { + "punycode": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-1.4.1.tgz", + "integrity": "sha1-wNWmOycYgArY4esPpSachN1BhF4=", + "dev": true + } } }, "tr46": { @@ -13955,14 +12347,6 @@ "dev": true, "requires": { "punycode": "^2.1.0" - }, - "dependencies": { - "punycode": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.1.1.tgz", - "integrity": "sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A==", - "dev": true - } } }, "trash": { @@ -13984,7 +12368,7 @@ "dependencies": { "fs-extra": 
{ "version": "0.30.0", - "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-0.30.0.tgz", + "resolved": "http://registry.npmjs.org/fs-extra/-/fs-extra-0.30.0.tgz", "integrity": "sha1-8jP/zAjU2n1DLapEl3aYnbHfk/A=", "dev": true, "requires": { @@ -14039,341 +12423,64 @@ "dev": true }, "ts-jest": { - "version": "22.4.6", - "resolved": "https://registry.npmjs.org/ts-jest/-/ts-jest-22.4.6.tgz", - "integrity": "sha512-kYQ6g1G1AU+bOO9rv+SSQXg4WTcni6Wx3AM48iHni0nP1vIuhdNRjKTE9Cxx36Ix/IOV7L85iKu07dgXJzH2pQ==", + "version": "23.10.5", + "resolved": "https://registry.npmjs.org/ts-jest/-/ts-jest-23.10.5.tgz", + "integrity": "sha512-MRCs9qnGoyKgFc8adDEntAOP64fWK1vZKnOYU1o2HxaqjdJvGqmkLCPCnVq1/If4zkUmEjKPnCiUisTrlX2p2A==", "dev": true, "requires": { - "babel-core": "^6.26.3", - "babel-plugin-istanbul": "^4.1.6", - "babel-plugin-transform-es2015-modules-commonjs": "^6.26.2", - "babel-preset-jest": "^22.4.3", - "cpx": "^1.5.0", - "fs-extra": "6.0.0", - "jest-config": "^22.4.3", - "lodash": "^4.17.10", - "pkg-dir": "^2.0.0", - "source-map-support": "^0.5.5", - "yargs": "^11.0.0" + "bs-logger": "0.x", + "buffer-from": "1.x", + "fast-json-stable-stringify": "2.x", + "json5": "2.x", + "make-error": "1.x", + "mkdirp": "0.x", + "resolve": "1.x", + "semver": "^5.5", + "yargs-parser": "10.x" }, "dependencies": { - "ansi-regex": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-3.0.0.tgz", - "integrity": "sha1-7QMXwyIGT3lGbAKWa922Bas32Zg=", - "dev": true - }, - "babel-plugin-jest-hoist": { - "version": "22.4.4", - "resolved": "https://registry.npmjs.org/babel-plugin-jest-hoist/-/babel-plugin-jest-hoist-22.4.4.tgz", - "integrity": "sha512-DUvGfYaAIlkdnygVIEl0O4Av69NtuQWcrjMOv6DODPuhuGLDnbsARz3AwiiI/EkIMMlxQDUcrZ9yoyJvTNjcVQ==", - "dev": true - }, - "babel-preset-jest": { - "version": "22.4.4", - "resolved": "https://registry.npmjs.org/babel-preset-jest/-/babel-preset-jest-22.4.4.tgz", - "integrity": "sha512-+dxMtOFwnSYWfum0NaEc0O03oSdwBsjx4tMSChRDPGwu/4wSY6Q6ANW3wkjKpJzzguaovRs/DODcT4hbSN8yiA==", - "dev": true, - "requires": { - "babel-plugin-jest-hoist": "^22.4.4", - "babel-plugin-syntax-object-rest-spread": "^6.13.0" - } - }, - "cliui": { + "camelcase": { "version": "4.1.0", - "resolved": "https://registry.npmjs.org/cliui/-/cliui-4.1.0.tgz", - "integrity": "sha512-4FG+RSG9DL7uEwRUZXZn3SS34DiDPfzP0VOiEwtUWlE+AR2EIg+hSyvrIgUUfhdgR/UkAeW2QHgeP+hWrXs7jQ==", - "dev": true, - "requires": { - "string-width": "^2.1.1", - "strip-ansi": "^4.0.0", - "wrap-ansi": "^2.0.0" - } - }, - "expect": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/expect/-/expect-22.4.3.tgz", - "integrity": "sha512-XcNXEPehqn8b/jm8FYotdX0YrXn36qp4HWlrVT4ktwQas1l1LPxiVWncYnnL2eyMtKAmVIaG0XAp0QlrqJaxaA==", - "dev": true, - "requires": { - "ansi-styles": "^3.2.0", - "jest-diff": "^22.4.3", - "jest-get-type": "^22.4.3", - "jest-matcher-utils": "^22.4.3", - "jest-message-util": "^22.4.3", - "jest-regex-util": "^22.4.3" - } - }, - "fs-extra": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-6.0.0.tgz", - "integrity": "sha512-lk2cUCo8QzbiEWEbt7Cw3m27WMiRG321xsssbcIpfMhpRjrlC08WBOVQqj1/nQYYNnPtyIhP1oqLO3QwT2tPCw==", - "dev": true, - "requires": { - "graceful-fs": "^4.1.2", - "jsonfile": "^4.0.0", - "universalify": "^0.1.0" - } - }, - "is-fullwidth-code-point": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz", - "integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8=", - "dev": true - }, - 
"jest-config": { - "version": "22.4.4", - "resolved": "https://registry.npmjs.org/jest-config/-/jest-config-22.4.4.tgz", - "integrity": "sha512-9CKfo1GC4zrXSoMLcNeDvQBfgtqGTB1uP8iDIZ97oB26RCUb886KkKWhVcpyxVDOUxbhN+uzcBCeFe7w+Iem4A==", - "dev": true, - "requires": { - "chalk": "^2.0.1", - "glob": "^7.1.1", - "jest-environment-jsdom": "^22.4.1", - "jest-environment-node": "^22.4.1", - "jest-get-type": "^22.1.0", - "jest-jasmine2": "^22.4.4", - "jest-regex-util": "^22.1.0", - "jest-resolve": "^22.4.2", - "jest-util": "^22.4.1", - "jest-validate": "^22.4.4", - "pretty-format": "^22.4.0" - } - }, - "jest-diff": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/jest-diff/-/jest-diff-22.4.3.tgz", - "integrity": "sha512-/QqGvCDP5oZOF6PebDuLwrB2BMD8ffJv6TAGAdEVuDx1+uEgrHpSFrfrOiMRx2eJ1hgNjlQrOQEHetVwij90KA==", - "dev": true, - "requires": { - "chalk": "^2.0.1", - "diff": "^3.2.0", - "jest-get-type": "^22.4.3", - "pretty-format": "^22.4.3" - } - }, - "jest-environment-jsdom": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/jest-environment-jsdom/-/jest-environment-jsdom-22.4.3.tgz", - "integrity": "sha512-FviwfR+VyT3Datf13+ULjIMO5CSeajlayhhYQwpzgunswoaLIPutdbrnfUHEMyJCwvqQFaVtTmn9+Y8WCt6n1w==", - "dev": true, - "requires": { - "jest-mock": "^22.4.3", - "jest-util": "^22.4.3", - "jsdom": "^11.5.1" - } - }, - "jest-environment-node": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/jest-environment-node/-/jest-environment-node-22.4.3.tgz", - "integrity": "sha512-reZl8XF6t/lMEuPWwo9OLfttyC26A5AMgDyEQ6DBgZuyfyeNUzYT8BFo6uxCCP/Av/b7eb9fTi3sIHFPBzmlRA==", - "dev": true, - "requires": { - "jest-mock": "^22.4.3", - "jest-util": "^22.4.3" - } - }, - "jest-jasmine2": { - "version": "22.4.4", - "resolved": "https://registry.npmjs.org/jest-jasmine2/-/jest-jasmine2-22.4.4.tgz", - "integrity": "sha512-nK3vdUl50MuH7vj/8at7EQVjPGWCi3d5+6aCi7Gxy/XMWdOdbH1qtO/LjKbqD8+8dUAEH+BVVh7HkjpCWC1CSw==", - "dev": true, - "requires": { - "chalk": "^2.0.1", - "co": "^4.6.0", - "expect": "^22.4.0", - "graceful-fs": "^4.1.11", - "is-generator-fn": "^1.0.0", - "jest-diff": "^22.4.0", - "jest-matcher-utils": "^22.4.0", - "jest-message-util": "^22.4.0", - "jest-snapshot": "^22.4.0", - "jest-util": "^22.4.1", - "source-map-support": "^0.5.0" - } - }, - "jest-matcher-utils": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/jest-matcher-utils/-/jest-matcher-utils-22.4.3.tgz", - "integrity": "sha512-lsEHVaTnKzdAPR5t4B6OcxXo9Vy4K+kRRbG5gtddY8lBEC+Mlpvm1CJcsMESRjzUhzkz568exMV1hTB76nAKbA==", - "dev": true, - "requires": { - "chalk": "^2.0.1", - "jest-get-type": "^22.4.3", - "pretty-format": "^22.4.3" - } - }, - "jest-message-util": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/jest-message-util/-/jest-message-util-22.4.3.tgz", - "integrity": "sha512-iAMeKxhB3Se5xkSjU0NndLLCHtP4n+GtCqV0bISKA5dmOXQfEbdEmYiu2qpnWBDCQdEafNDDU6Q+l6oBMd/+BA==", - "dev": true, - "requires": { - "@babel/code-frame": "^7.0.0-beta.35", - "chalk": "^2.0.1", - "micromatch": "^2.3.11", - "slash": "^1.0.0", - "stack-utils": "^1.0.1" - } - }, - "jest-mock": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/jest-mock/-/jest-mock-22.4.3.tgz", - "integrity": "sha512-+4R6mH5M1G4NK16CKg9N1DtCaFmuxhcIqF4lQK/Q1CIotqMs/XBemfpDPeVZBFow6iyUNu6EBT9ugdNOTT5o5Q==", - "dev": true - }, - "jest-regex-util": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/jest-regex-util/-/jest-regex-util-22.4.3.tgz", - "integrity": 
"sha512-LFg1gWr3QinIjb8j833bq7jtQopiwdAs67OGfkPrvy7uNUbVMfTXXcOKXJaeY5GgjobELkKvKENqq1xrUectWg==", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-4.1.0.tgz", + "integrity": "sha1-1UVjW+HjPFQmScaRc+Xeas+uNN0=", "dev": true }, - "jest-resolve": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/jest-resolve/-/jest-resolve-22.4.3.tgz", - "integrity": "sha512-u3BkD/MQBmwrOJDzDIaxpyqTxYH+XqAXzVJP51gt29H8jpj3QgKof5GGO2uPGKGeA1yTMlpbMs1gIQ6U4vcRhw==", - "dev": true, - "requires": { - "browser-resolve": "^1.11.2", - "chalk": "^2.0.1" - } - }, - "jest-snapshot": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/jest-snapshot/-/jest-snapshot-22.4.3.tgz", - "integrity": "sha512-JXA0gVs5YL0HtLDCGa9YxcmmV2LZbwJ+0MfyXBBc5qpgkEYITQFJP7XNhcHFbUvRiniRpRbGVfJrOoYhhGE0RQ==", - "dev": true, - "requires": { - "chalk": "^2.0.1", - "jest-diff": "^22.4.3", - "jest-matcher-utils": "^22.4.3", - "mkdirp": "^0.5.1", - "natural-compare": "^1.4.0", - "pretty-format": "^22.4.3" - } - }, - "jest-util": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/jest-util/-/jest-util-22.4.3.tgz", - "integrity": "sha512-rfDfG8wyC5pDPNdcnAlZgwKnzHvZDu8Td2NJI/jAGKEGxJPYiE4F0ss/gSAkG4778Y23Hvbz+0GMrDJTeo7RjQ==", - "dev": true, - "requires": { - "callsites": "^2.0.0", - "chalk": "^2.0.1", - "graceful-fs": "^4.1.11", - "is-ci": "^1.0.10", - "jest-message-util": "^22.4.3", - "mkdirp": "^0.5.1", - "source-map": "^0.6.0" - } - }, - "jest-validate": { - "version": "22.4.4", - "resolved": "https://registry.npmjs.org/jest-validate/-/jest-validate-22.4.4.tgz", - "integrity": "sha512-dmlf4CIZRGvkaVg3fa0uetepcua44DHtktHm6rcoNVtYlpwe6fEJRkMFsaUVcFHLzbuBJ2cPw9Gl9TKfnzMVwg==", - "dev": true, - "requires": { - "chalk": "^2.0.1", - "jest-config": "^22.4.4", - "jest-get-type": "^22.1.0", - "leven": "^2.1.0", - "pretty-format": "^22.4.0" - } - }, - "os-locale": { + "json5": { "version": "2.1.0", - "resolved": "https://registry.npmjs.org/os-locale/-/os-locale-2.1.0.tgz", - "integrity": "sha512-3sslG3zJbEYcaC4YVAvDorjGxc7tv6KVATnLPZONiljsUncvihe9BQoVCEs0RZ1kmf4Hk9OBqlZfJZWI4GanKA==", + "resolved": "https://registry.npmjs.org/json5/-/json5-2.1.0.tgz", + "integrity": "sha512-8Mh9h6xViijj36g7Dxi+Y4S6hNGV96vcJZr/SrlHh1LR/pEn/8j/+qIBbs44YKl69Lrfctp4QD+AdWLTMqEZAQ==", "dev": true, "requires": { - "execa": "^0.7.0", - "lcid": "^1.0.0", - "mem": "^1.1.0" + "minimist": "^1.2.0" } }, - "pretty-format": { - "version": "22.4.3", - "resolved": "http://registry.npmjs.org/pretty-format/-/pretty-format-22.4.3.tgz", - "integrity": "sha512-S4oT9/sT6MN7/3COoOy+ZJeA92VmOnveLHgrwBE3Z1W5N9S2A1QGNYiE1z75DAENbJrXXUb+OWXhpJcg05QKQQ==", + "yargs-parser": { + "version": "10.1.0", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-10.1.0.tgz", + "integrity": "sha512-VCIyR1wJoEBZUqk5PA+oOBF6ypbwh5aNB3I50guxAL/quggdfs4TtNHQrSazFA3fYZ+tEqfs0zIGlv0c/rgjbQ==", "dev": true, "requires": { - "ansi-regex": "^3.0.0", - "ansi-styles": "^3.2.0" + "camelcase": "^4.1.0" } - }, - "source-map": { - "version": "0.6.1", - "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", - "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", - "dev": true - }, - "source-map-support": { - "version": "0.5.9", - "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.9.tgz", - "integrity": "sha512-gR6Rw4MvUlYy83vP0vxoVNzM6t8MUXqNuRsuBmBHQDu1Fh6X015FrLdgoDKcNdkwGubozq0P4N0Q37UyFVr1EA==", - "dev": true, - "requires": { - 
"buffer-from": "^1.0.0", - "source-map": "^0.6.0" - } - }, - "string-width": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz", - "integrity": "sha512-nOqH59deCq9SRHlxq1Aw85Jnt4w6KvLKqWVik6oA9ZklXLNIOlqg4F2yrT1MVaTjAqvVwdfeZ7w7aCvJD7ugkw==", - "dev": true, - "requires": { - "is-fullwidth-code-point": "^2.0.0", - "strip-ansi": "^4.0.0" - } - }, - "strip-ansi": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-4.0.0.tgz", - "integrity": "sha1-qEeQIusaw2iocTibY1JixQXuNo8=", - "dev": true, - "requires": { - "ansi-regex": "^3.0.0" - } - }, - "yargs": { - "version": "11.1.0", - "resolved": "http://registry.npmjs.org/yargs/-/yargs-11.1.0.tgz", - "integrity": "sha512-NwW69J42EsCSanF8kyn5upxvjp5ds+t3+udGBeTbFnERA+lF541DDpMawzo4z6W/QrzNM18D+BPMiOBibnFV5A==", - "dev": true, - "requires": { - "cliui": "^4.0.0", - "decamelize": "^1.1.1", - "find-up": "^2.1.0", - "get-caller-file": "^1.0.1", - "os-locale": "^2.0.0", - "require-directory": "^2.1.1", - "require-main-filename": "^1.0.1", - "set-blocking": "^2.0.0", - "string-width": "^2.0.0", - "which-module": "^2.0.0", - "y18n": "^3.2.1", - "yargs-parser": "^9.0.2" - } - } - } - }, - "ts-node": { - "version": "7.0.1", - "resolved": "https://registry.npmjs.org/ts-node/-/ts-node-7.0.1.tgz", - "integrity": "sha512-BVwVbPJRspzNh2yfslyT1PSbl5uIk03EZlb493RKHN4qej/D06n1cEhjlOJG69oFsE7OT8XjpTUcYf6pKTLMhw==", - "dev": true, - "requires": { - "arrify": "^1.0.0", - "buffer-from": "^1.1.0", - "diff": "^3.1.0", - "make-error": "^1.1.1", - "minimist": "^1.2.0", - "mkdirp": "^0.5.1", - "source-map-support": "^0.5.6", - "yn": "^2.0.0" - }, - "dependencies": { + } + } + }, + "ts-node": { + "version": "7.0.1", + "resolved": "https://registry.npmjs.org/ts-node/-/ts-node-7.0.1.tgz", + "integrity": "sha512-BVwVbPJRspzNh2yfslyT1PSbl5uIk03EZlb493RKHN4qej/D06n1cEhjlOJG69oFsE7OT8XjpTUcYf6pKTLMhw==", + "dev": true, + "requires": { + "arrify": "^1.0.0", + "buffer-from": "^1.1.0", + "diff": "^3.1.0", + "make-error": "^1.1.1", + "minimist": "^1.2.0", + "mkdirp": "^0.5.1", + "source-map-support": "^0.5.6", + "yn": "^2.0.0" + }, + "dependencies": { "source-map": { "version": "0.6.1", "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", @@ -14398,9 +12505,9 @@ "integrity": "sha512-4krF8scpejhaOgqzBEcGM7yDIEfi0/8+8zDRZhNZZ2kjmHJ4hv3zCbQWxoJGz1iw5U0Jl0nma13xzHXcncMavQ==" }, "tslint": { - "version": "5.11.0", - "resolved": "https://registry.npmjs.org/tslint/-/tslint-5.11.0.tgz", - "integrity": "sha1-mPMMAurjzecAYgHkwzywi0hYHu0=", + "version": "5.12.0", + "resolved": "https://registry.npmjs.org/tslint/-/tslint-5.12.0.tgz", + "integrity": "sha512-CKEcH1MHUBhoV43SA/Jmy1l24HJJgI0eyLbBNSRyFlsQvb9v6Zdq+Nz2vEOH00nC5SUx4SneJ59PZUS/ARcokQ==", "dev": true, "requires": { "babel-code-frame": "^6.22.0", @@ -14428,7 +12535,7 @@ }, "tty-browserify": { "version": "0.0.0", - "resolved": "https://registry.npmjs.org/tty-browserify/-/tty-browserify-0.0.0.tgz", + "resolved": "http://registry.npmjs.org/tty-browserify/-/tty-browserify-0.0.0.tgz", "integrity": "sha1-oVe6QC2iTpv5V/mqadUk7tQpAaY=", "dev": true }, @@ -14463,9 +12570,9 @@ "dev": true }, "typedoc": { - "version": "0.12.0", - "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.12.0.tgz", - "integrity": "sha512-dsdlaYZ7Je8JC+jQ3j2Iroe4uyD0GhqzADNUVyBRgLuytQDP/g0dPkAw5PdM/4drnmmJjRzSWW97FkKo+ITqQg==", + "version": "0.13.0", + "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.13.0.tgz", + "integrity": 
"sha512-jQWtvPcV+0fiLZAXFEe70v5gqjDO6pJYJz4mlTtmGJeW2KRoIU/BEfktma6Uj8Xii7UakuZjbxFewl3UYOkU/w==", "dev": true, "requires": { "@types/fs-extra": "^5.0.3", @@ -14484,7 +12591,15 @@ "progress": "^2.0.0", "shelljs": "^0.8.2", "typedoc-default-themes": "^0.5.0", - "typescript": "3.0.x" + "typescript": "3.1.x" + }, + "dependencies": { + "typescript": { + "version": "3.1.6", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-3.1.6.tgz", + "integrity": "sha512-tDMYfVtvpb96msS1lDX9MEdHrW4yOuZ4Kdc4Him9oU796XldPYF/t2+uKoX0BBa0hXXwDlqYQbXY5Rzjzc5hBA==", + "dev": true + } } }, "typedoc-default-themes": { @@ -14494,9 +12609,9 @@ "dev": true }, "typescript": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-3.0.3.tgz", - "integrity": "sha512-kk80vLW9iGtjMnIv11qyxLqZm20UklzuR2tL0QAnDIygIUIemcZMxlMWudl9OOt76H3ntVzcTiddQ1/pAAJMYg==", + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-3.2.2.tgz", + "integrity": "sha512-VCj5UiSyHBjwfYacmDuc/NOk4QQixbE+Wn7MFJuS0nRuPQbof132Pw4u53dm264O8LPc2MVsc7RJNml5szurkg==", "dev": true }, "typical": { @@ -14524,155 +12639,6 @@ } } }, - "uglifyjs-webpack-plugin": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/uglifyjs-webpack-plugin/-/uglifyjs-webpack-plugin-1.3.0.tgz", - "integrity": "sha512-ovHIch0AMlxjD/97j9AYovZxG5wnHOPkL7T1GKochBADp/Zwc44pEWNqpKl1Loupp1WhFg7SlYmHZRUfdAacgw==", - "dev": true, - "requires": { - "cacache": "^10.0.4", - "find-cache-dir": "^1.0.0", - "schema-utils": "^0.4.5", - "serialize-javascript": "^1.4.0", - "source-map": "^0.6.1", - "uglify-es": "^3.3.4", - "webpack-sources": "^1.1.0", - "worker-farm": "^1.5.2" - }, - "dependencies": { - "ajv": { - "version": "6.6.1", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.6.1.tgz", - "integrity": "sha512-ZoJjft5B+EJBjUyu9C9Hc0OZyPZSSlOF+plzouTrg6UlA8f+e/n8NIgBFG/9tppJtpPWfthHakK7juJdNDODww==", - "dev": true, - "requires": { - "fast-deep-equal": "^2.0.1", - "fast-json-stable-stringify": "^2.0.0", - "json-schema-traverse": "^0.4.1", - "uri-js": "^4.2.2" - } - }, - "cacache": { - "version": "10.0.4", - "resolved": "https://registry.npmjs.org/cacache/-/cacache-10.0.4.tgz", - "integrity": "sha512-Dph0MzuH+rTQzGPNT9fAnrPmMmjKfST6trxJeK7NQuHRaVw24VzPRWTmg9MpcwOVQZO0E1FBICUlFeNaKPIfHA==", - "dev": true, - "requires": { - "bluebird": "^3.5.1", - "chownr": "^1.0.1", - "glob": "^7.1.2", - "graceful-fs": "^4.1.11", - "lru-cache": "^4.1.1", - "mississippi": "^2.0.0", - "mkdirp": "^0.5.1", - "move-concurrently": "^1.0.1", - "promise-inflight": "^1.0.1", - "rimraf": "^2.6.2", - "ssri": "^5.2.4", - "unique-filename": "^1.1.0", - "y18n": "^4.0.0" - } - }, - "commander": { - "version": "2.13.0", - "resolved": "https://registry.npmjs.org/commander/-/commander-2.13.0.tgz", - "integrity": "sha512-MVuS359B+YzaWqjCL/c+22gfryv+mCBPHAv3zyVI2GN8EY6IRP8VwtasXn8jyyhvvq84R4ImN1OKRtcbIasjYA==", - "dev": true - }, - "fast-deep-equal": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-2.0.1.tgz", - "integrity": "sha1-ewUhjd+WZ79/Nwv3/bLLFf3Qqkk=", - "dev": true - }, - "find-cache-dir": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/find-cache-dir/-/find-cache-dir-1.0.0.tgz", - "integrity": "sha1-kojj6ePMN0hxfTnq3hfPcfww7m8=", - "dev": true, - "requires": { - "commondir": "^1.0.1", - "make-dir": "^1.0.0", - "pkg-dir": "^2.0.0" - } - }, - "json-schema-traverse": { - "version": "0.4.1", - "resolved": 
"https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", - "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", - "dev": true - }, - "mississippi": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/mississippi/-/mississippi-2.0.0.tgz", - "integrity": "sha512-zHo8v+otD1J10j/tC+VNoGK9keCuByhKovAvdn74dmxJl9+mWHnx6EMsDN4lgRoMI/eYo2nchAxniIbUPb5onw==", - "dev": true, - "requires": { - "concat-stream": "^1.5.0", - "duplexify": "^3.4.2", - "end-of-stream": "^1.1.0", - "flush-write-stream": "^1.0.0", - "from2": "^2.1.0", - "parallel-transform": "^1.1.0", - "pump": "^2.0.1", - "pumpify": "^1.3.3", - "stream-each": "^1.1.0", - "through2": "^2.0.0" - } - }, - "pump": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/pump/-/pump-2.0.1.tgz", - "integrity": "sha512-ruPMNRkN3MHP1cWJc9OWr+T/xDP0jhXYCLfJcBuX54hhfIBnaQmAUMfDcG4DM5UMWByBbJY69QSphm3jtDKIkA==", - "dev": true, - "requires": { - "end-of-stream": "^1.1.0", - "once": "^1.3.1" - } - }, - "schema-utils": { - "version": "0.4.7", - "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-0.4.7.tgz", - "integrity": "sha512-v/iwU6wvwGK8HbU9yi3/nhGzP0yGSuhQMzL6ySiec1FSrZZDkhm4noOSWzrNFo/jEc+SJY6jRTwuwbSXJPDUnQ==", - "dev": true, - "requires": { - "ajv": "^6.1.0", - "ajv-keywords": "^3.1.0" - } - }, - "source-map": { - "version": "0.6.1", - "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", - "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", - "dev": true - }, - "ssri": { - "version": "5.3.0", - "resolved": "https://registry.npmjs.org/ssri/-/ssri-5.3.0.tgz", - "integrity": "sha512-XRSIPqLij52MtgoQavH/x/dU1qVKtWUAAZeOHsR9c2Ddi4XerFy3mc1alf+dLJKl9EUIm/Ht+EowFkTUOA6GAQ==", - "dev": true, - "requires": { - "safe-buffer": "^5.1.1" - } - }, - "uglify-es": { - "version": "3.3.9", - "resolved": "https://registry.npmjs.org/uglify-es/-/uglify-es-3.3.9.tgz", - "integrity": "sha512-r+MU0rfv4L/0eeW3xZrd16t4NZfK8Ld4SWVglYBb7ez5uXFWHuVRs6xCTrf1yirs9a4j4Y27nn7SRfO6v67XsQ==", - "dev": true, - "requires": { - "commander": "~2.13.0", - "source-map": "~0.6.1" - } - }, - "y18n": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/y18n/-/y18n-4.0.0.tgz", - "integrity": "sha512-r9S/ZyXu/Xu9q1tYlpsLIsa3EeLXXk0VwlxqTcFRfg9EhMW+17kbt9G0NrgCmhGb5vT2hyhJZLfDGx+7+5Uj/w==", - "dev": true - } - } - }, "uid-number": { "version": "0.0.6", "resolved": "https://registry.npmjs.org/uid-number/-/uid-number-0.0.6.tgz", @@ -14768,13 +12734,13 @@ } }, "unique-stream": { - "version": "2.2.1", - "resolved": "https://registry.npmjs.org/unique-stream/-/unique-stream-2.2.1.tgz", - "integrity": "sha1-WqADz76Uxf+GbE59ZouxxNuts2k=", + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/unique-stream/-/unique-stream-2.3.1.tgz", + "integrity": "sha512-2nY4TnBE70yoxHkDli7DMazpWiP7xMdCYqU2nBRO0UB+ZpEkGsSija7MvmvnZFUeC+mrgiUfcHSr3LmRFIg4+A==", "dev": true, "requires": { - "json-stable-stringify": "^1.0.0", - "through2-filter": "^2.0.0" + "json-stable-stringify-without-jsonify": "^1.0.1", + "through2-filter": "^3.0.0" } }, "universalify": { @@ -14820,12 +12786,6 @@ "resolved": "https://registry.npmjs.org/has-values/-/has-values-0.1.4.tgz", "integrity": "sha1-bWHeldkd/Km5oCCJrThL/49it3E=", "dev": true - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - 
"dev": true } } }, @@ -14842,14 +12802,6 @@ "dev": true, "requires": { "punycode": "^2.1.0" - }, - "dependencies": { - "punycode": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.1.1.tgz", - "integrity": "sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A==", - "dev": true - } } }, "urix": { @@ -14882,6 +12834,15 @@ "integrity": "sha512-cwESVXlO3url9YWlFW/TA9cshCEhtu7IKJ/p5soJ/gGpj7vbvFrAY/eIioQ6Dw23KjZhYgiIo8HOs1nQ2vr/oQ==", "dev": true }, + "user-home": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/user-home/-/user-home-2.0.0.tgz", + "integrity": "sha1-nHC/2Babwdy/SGBODwS4tJzenp8=", + "dev": true, + "requires": { + "os-homedir": "^1.0.0" + } + }, "util": { "version": "0.10.4", "resolved": "https://registry.npmjs.org/util/-/util-0.10.4.tgz", @@ -14914,9 +12875,9 @@ "dev": true }, "v8flags": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/v8flags/-/v8flags-3.1.1.tgz", - "integrity": "sha512-iw/1ViSEaff8NJ3HLyEjawk/8hjJib3E7pvG4pddVXfUg1983s3VGsiClDjhK64MQVDGqc1Q8r18S4VKQZS9EQ==", + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/v8flags/-/v8flags-3.1.2.tgz", + "integrity": "sha512-MtivA7GF24yMPte9Rp/BWGCYQNaUj86zeYxV/x2RRJMKagImbbv3u8iJC57lNhWLPcGLJmHcHmFWkNsplbbLWw==", "dev": true, "requires": { "homedir-polyfill": "^1.0.1" @@ -14979,437 +12940,94 @@ "dev": true, "requires": { "fs-mkdirp-stream": "^1.0.0", - "glob-stream": "^6.1.0", - "graceful-fs": "^4.0.0", - "is-valid-glob": "^1.0.0", - "lazystream": "^1.0.0", - "lead": "^1.0.0", - "object.assign": "^4.0.4", - "pumpify": "^1.3.5", - "readable-stream": "^2.3.3", - "remove-bom-buffer": "^3.0.0", - "remove-bom-stream": "^1.2.0", - "resolve-options": "^1.1.0", - "through2": "^2.0.0", - "to-through": "^2.0.0", - "value-or-function": "^3.0.0", - "vinyl": "^2.0.0", - "vinyl-sourcemap": "^1.1.0" - } - }, - "vinyl-sourcemap": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/vinyl-sourcemap/-/vinyl-sourcemap-1.1.0.tgz", - "integrity": "sha1-kqgAWTo4cDqM2xHYswCtS+Y7PhY=", - "dev": true, - "requires": { - "append-buffer": "^1.0.2", - "convert-source-map": "^1.5.0", - "graceful-fs": "^4.1.6", - "normalize-path": "^2.1.1", - "now-and-later": "^2.0.0", - "remove-bom-buffer": "^3.0.0", - "vinyl": "^2.0.0" - } - }, - "vinyl-sourcemaps-apply": { - "version": "0.2.1", - "resolved": "https://registry.npmjs.org/vinyl-sourcemaps-apply/-/vinyl-sourcemaps-apply-0.2.1.tgz", - "integrity": "sha1-q2VJ1h0XLCsbh75cUI0jnI74dwU=", - "dev": true, - "requires": { - "source-map": "^0.5.1" - } - }, - "vm-browserify": { - "version": "0.0.4", - "resolved": "https://registry.npmjs.org/vm-browserify/-/vm-browserify-0.0.4.tgz", - "integrity": "sha1-XX6kW7755Kb/ZflUOOCofDV9WnM=", - "dev": true, - "requires": { - "indexof": "0.0.1" - } - }, - "w3c-hr-time": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/w3c-hr-time/-/w3c-hr-time-1.0.1.tgz", - "integrity": "sha1-gqwr/2PZUOqeMYmlimViX+3xkEU=", - "dev": true, - "requires": { - "browser-process-hrtime": "^0.1.2" - } - }, - "walker": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/walker/-/walker-1.0.7.tgz", - "integrity": "sha1-L3+bj9ENZ3JisYqITijRlhjgKPs=", - "dev": true, - "requires": { - "makeerror": "1.0.x" - } - }, - "watch": { - "version": "0.18.0", - "resolved": "https://registry.npmjs.org/watch/-/watch-0.18.0.tgz", - "integrity": "sha1-KAlUdsbffJDJYxOJkMClQj60uYY=", - "dev": true, - "requires": { - "exec-sh": "^0.2.0", - "minimist": 
"^1.2.0" - } - }, - "watchpack": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/watchpack/-/watchpack-1.6.0.tgz", - "integrity": "sha512-i6dHe3EyLjMmDlU1/bGQpEw25XSjkJULPuAVKCbNRefQVq48yXKUpwg538F7AZTf9kyr57zj++pQFltUa5H7yA==", - "dev": true, - "requires": { - "chokidar": "^2.0.2", - "graceful-fs": "^4.1.2", - "neo-async": "^2.5.0" - }, - "dependencies": { - "anymatch": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-2.0.0.tgz", - "integrity": "sha512-5teOsQWABXHHBFP9y3skS5P3d/WfWXpv3FUpy+LorMrNYaT9pI4oLMQX7jzQ2KklNpGpWHzdCXTDT2Y3XGlZBw==", - "dev": true, - "requires": { - "micromatch": "^3.1.4", - "normalize-path": "^2.1.1" - } - }, - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - }, - "array-unique": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", - "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", - "dev": true - }, - "braces": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", - "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", - "dev": true, - "requires": { - "arr-flatten": "^1.1.0", - "array-unique": "^0.3.2", - "extend-shallow": "^2.0.1", - "fill-range": "^4.0.0", - "isobject": "^3.0.1", - "repeat-element": "^1.1.2", - "snapdragon": "^0.8.1", - "snapdragon-node": "^2.0.1", - "split-string": "^3.0.2", - "to-regex": "^3.0.1" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "chokidar": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-2.0.4.tgz", - "integrity": "sha512-z9n7yt9rOvIJrMhvDtDictKrkFHeihkNl6uWMmZlmL6tJtX9Cs+87oK+teBx+JIgzvbX3yZHT3eF8vpbDxHJXQ==", - "dev": true, - "requires": { - "anymatch": "^2.0.0", - "async-each": "^1.0.0", - "braces": "^2.3.0", - "fsevents": "^1.2.2", - "glob-parent": "^3.1.0", - "inherits": "^2.0.1", - "is-binary-path": "^1.0.0", - "is-glob": "^4.0.0", - "lodash.debounce": "^4.0.8", - "normalize-path": "^2.1.1", - "path-is-absolute": "^1.0.0", - "readdirp": "^2.0.0", - "upath": "^1.0.5" - } - }, - "expand-brackets": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", - "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", - "dev": true, - "requires": { - "debug": "^2.3.3", - "define-property": "^0.2.5", - "extend-shallow": "^2.0.1", - "posix-character-classes": "^0.1.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - }, - "is-accessor-descriptor": { - "version": "0.1.6", - "resolved": 
"https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", - "integrity": "sha1-qeEss66Nh2cn7u84Q/igiXtcmNY=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-data-descriptor": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", - "integrity": "sha1-C17mSDiOLIYCgueT8YVv7D8wG1Y=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz", - "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^0.1.6", - "is-data-descriptor": "^0.1.4", - "kind-of": "^5.0.0" - } - }, - "kind-of": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz", - "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==", - "dev": true - } - } - }, - "extglob": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", - "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", - "dev": true, - "requires": { - "array-unique": "^0.3.2", - "define-property": "^1.0.0", - "expand-brackets": "^2.1.4", - "extend-shallow": "^2.0.1", - "fragment-cache": "^0.2.1", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", - "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", - "dev": true, - "requires": { - "is-descriptor": "^1.0.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "fill-range": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", - "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", - "dev": true, - "requires": { - "extend-shallow": "^2.0.1", - "is-number": "^3.0.0", - "repeat-string": "^1.6.1", - "to-regex-range": "^2.1.0" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "glob-parent": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-3.1.0.tgz", - "integrity": "sha1-nmr2KZ2NO9K9QEMIMr0RPfkGxa4=", - "dev": true, - "requires": { - "is-glob": "^3.1.0", - "path-dirname": "^1.0.0" - }, - "dependencies": { - "is-glob": { - "version": "3.1.0", - "resolved": 
"https://registry.npmjs.org/is-glob/-/is-glob-3.1.0.tgz", - "integrity": "sha1-e6WuJCF4BKxwcHuWkiVnSGzD6Eo=", - "dev": true, - "requires": { - "is-extglob": "^2.1.0" - } - } - } - }, - "is-accessor-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", - "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-data-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", - "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-descriptor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", - "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^1.0.0", - "is-data-descriptor": "^1.0.0", - "kind-of": "^6.0.2" - } - }, - "is-extglob": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", - "dev": true - }, - "is-glob": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.0.tgz", - "integrity": "sha1-lSHHaEXMJhCoUgPd8ICpWML/q8A=", - "dev": true, - "requires": { - "is-extglob": "^2.1.1" - } - }, - "is-number": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true - }, - "micromatch": { - "version": "3.1.10", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", - "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", - "dev": true, - "requires": { - "arr-diff": "^4.0.0", - "array-unique": "^0.3.2", - "braces": "^2.3.1", - "define-property": "^2.0.2", - "extend-shallow": "^3.0.2", - "extglob": "^2.0.4", - "fragment-cache": "^0.2.1", - "kind-of": "^6.0.2", - "nanomatch": "^1.2.9", - "object.pick": "^1.3.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.2" - } - } + "glob-stream": "^6.1.0", + "graceful-fs": "^4.0.0", + "is-valid-glob": "^1.0.0", + "lazystream": "^1.0.0", + "lead": "^1.0.0", + "object.assign": "^4.0.4", + "pumpify": "^1.3.5", + "readable-stream": "^2.3.3", + "remove-bom-buffer": "^3.0.0", + "remove-bom-stream": "^1.2.0", + "resolve-options": "^1.1.0", + "through2": "^2.0.0", + "to-through": "^2.0.0", + "value-or-function": "^3.0.0", + "vinyl": "^2.0.0", + 
"vinyl-sourcemap": "^1.1.0" + } + }, + "vinyl-sourcemap": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/vinyl-sourcemap/-/vinyl-sourcemap-1.1.0.tgz", + "integrity": "sha1-kqgAWTo4cDqM2xHYswCtS+Y7PhY=", + "dev": true, + "requires": { + "append-buffer": "^1.0.2", + "convert-source-map": "^1.5.0", + "graceful-fs": "^4.1.6", + "normalize-path": "^2.1.1", + "now-and-later": "^2.0.0", + "remove-bom-buffer": "^3.0.0", + "vinyl": "^2.0.0" + } + }, + "vinyl-sourcemaps-apply": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/vinyl-sourcemaps-apply/-/vinyl-sourcemaps-apply-0.2.1.tgz", + "integrity": "sha1-q2VJ1h0XLCsbh75cUI0jnI74dwU=", + "dev": true, + "requires": { + "source-map": "^0.5.1" + } + }, + "vm-browserify": { + "version": "0.0.4", + "resolved": "http://registry.npmjs.org/vm-browserify/-/vm-browserify-0.0.4.tgz", + "integrity": "sha1-XX6kW7755Kb/ZflUOOCofDV9WnM=", + "dev": true, + "requires": { + "indexof": "0.0.1" + } + }, + "w3c-hr-time": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/w3c-hr-time/-/w3c-hr-time-1.0.1.tgz", + "integrity": "sha1-gqwr/2PZUOqeMYmlimViX+3xkEU=", + "dev": true, + "requires": { + "browser-process-hrtime": "^0.1.2" + } + }, + "walker": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/walker/-/walker-1.0.7.tgz", + "integrity": "sha1-L3+bj9ENZ3JisYqITijRlhjgKPs=", + "dev": true, + "requires": { + "makeerror": "1.0.x" + } + }, + "watch": { + "version": "0.18.0", + "resolved": "https://registry.npmjs.org/watch/-/watch-0.18.0.tgz", + "integrity": "sha1-KAlUdsbffJDJYxOJkMClQj60uYY=", + "dev": true, + "requires": { + "exec-sh": "^0.2.0", + "minimist": "^1.2.0" + } + }, + "watchpack": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/watchpack/-/watchpack-1.6.0.tgz", + "integrity": "sha512-i6dHe3EyLjMmDlU1/bGQpEw25XSjkJULPuAVKCbNRefQVq48yXKUpwg538F7AZTf9kyr57zj++pQFltUa5H7yA==", + "dev": true, + "requires": { + "chokidar": "^2.0.2", + "graceful-fs": "^4.1.2", + "neo-async": "^2.5.0" } }, "wcwidth": { @@ -15421,6 +13039,12 @@ "defaults": "^1.0.3" } }, + "web-stream-tools": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/web-stream-tools/-/web-stream-tools-0.0.1.tgz", + "integrity": "sha512-MZUYhvTAMMy1u07OJL2pyp/tdrIu15fRJlGgnfvCQVXBS4cBNbIV1+6veYfVhTfnq0ZLispgx4nv17QxpuX+6w==", + "dev": true + }, "webidl-conversions": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-4.0.2.tgz", @@ -15428,15 +13052,15 @@ "dev": true }, "webpack": { - "version": "4.23.1", - "resolved": "https://registry.npmjs.org/webpack/-/webpack-4.23.1.tgz", - "integrity": "sha512-iE5Cu4rGEDk7ONRjisTOjVHv3dDtcFfwitSxT7evtYj/rANJpt1OuC/Kozh1pBa99AUBr1L/LsaNB+D9Xz3CEg==", + "version": "4.28.3", + "resolved": "https://registry.npmjs.org/webpack/-/webpack-4.28.3.tgz", + "integrity": "sha512-vLZN9k5I7Nr/XB1IDG9GbZB4yQd1sPuvufMFgJkx0b31fi2LD97KQIjwjxE7xytdruAYfu5S0FLBLjdxmwGJCg==", "dev": true, "requires": { - "@webassemblyjs/ast": "1.7.10", - "@webassemblyjs/helper-module-context": "1.7.10", - "@webassemblyjs/wasm-edit": "1.7.10", - "@webassemblyjs/wasm-parser": "1.7.10", + "@webassemblyjs/ast": "1.7.11", + "@webassemblyjs/helper-module-context": "1.7.11", + "@webassemblyjs/wasm-edit": "1.7.11", + "@webassemblyjs/wasm-parser": "1.7.11", "acorn": "^5.6.2", "acorn-dynamic-import": "^3.0.0", "ajv": "^6.1.0", @@ -15454,309 +13078,11 @@ "node-libs-browser": "^2.0.0", "schema-utils": "^0.4.4", "tapable": "^1.1.0", - "uglifyjs-webpack-plugin": "^1.2.4", + 
"terser-webpack-plugin": "^1.1.0", "watchpack": "^1.5.0", "webpack-sources": "^1.3.0" }, "dependencies": { - "ajv": { - "version": "6.6.1", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.6.1.tgz", - "integrity": "sha512-ZoJjft5B+EJBjUyu9C9Hc0OZyPZSSlOF+plzouTrg6UlA8f+e/n8NIgBFG/9tppJtpPWfthHakK7juJdNDODww==", - "dev": true, - "requires": { - "fast-deep-equal": "^2.0.1", - "fast-json-stable-stringify": "^2.0.0", - "json-schema-traverse": "^0.4.1", - "uri-js": "^4.2.2" - } - }, - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - }, - "array-unique": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", - "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", - "dev": true - }, - "braces": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", - "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", - "dev": true, - "requires": { - "arr-flatten": "^1.1.0", - "array-unique": "^0.3.2", - "extend-shallow": "^2.0.1", - "fill-range": "^4.0.0", - "isobject": "^3.0.1", - "repeat-element": "^1.1.2", - "snapdragon": "^0.8.1", - "snapdragon-node": "^2.0.1", - "split-string": "^3.0.2", - "to-regex": "^3.0.1" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "expand-brackets": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", - "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", - "dev": true, - "requires": { - "debug": "^2.3.3", - "define-property": "^0.2.5", - "extend-shallow": "^2.0.1", - "posix-character-classes": "^0.1.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - }, - "is-accessor-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", - "integrity": "sha1-qeEss66Nh2cn7u84Q/igiXtcmNY=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-data-descriptor": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", - "integrity": "sha1-C17mSDiOLIYCgueT8YVv7D8wG1Y=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": 
"sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz", - "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^0.1.6", - "is-data-descriptor": "^0.1.4", - "kind-of": "^5.0.0" - } - }, - "kind-of": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz", - "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==", - "dev": true - } - } - }, - "extglob": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", - "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", - "dev": true, - "requires": { - "array-unique": "^0.3.2", - "define-property": "^1.0.0", - "expand-brackets": "^2.1.4", - "extend-shallow": "^2.0.1", - "fragment-cache": "^0.2.1", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", - "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", - "dev": true, - "requires": { - "is-descriptor": "^1.0.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "fast-deep-equal": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-2.0.1.tgz", - "integrity": "sha1-ewUhjd+WZ79/Nwv3/bLLFf3Qqkk=", - "dev": true - }, - "fill-range": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", - "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", - "dev": true, - "requires": { - "extend-shallow": "^2.0.1", - "is-number": "^3.0.0", - "repeat-string": "^1.6.1", - "to-regex-range": "^2.1.0" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "is-accessor-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", - "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-data-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", - "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-descriptor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", - "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^1.0.0", - "is-data-descriptor": "^1.0.0", - "kind-of": 
"^6.0.2" - } - }, - "is-number": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "json-schema-traverse": { - "version": "0.4.1", - "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", - "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true - }, - "micromatch": { - "version": "3.1.10", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", - "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", - "dev": true, - "requires": { - "arr-diff": "^4.0.0", - "array-unique": "^0.3.2", - "braces": "^2.3.1", - "define-property": "^2.0.2", - "extend-shallow": "^3.0.2", - "extglob": "^2.0.4", - "fragment-cache": "^0.2.1", - "kind-of": "^6.0.2", - "nanomatch": "^1.2.9", - "object.pick": "^1.3.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.2" - } - }, "schema-utils": { "version": "0.4.7", "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-0.4.7.tgz", @@ -15797,9 +13123,9 @@ } }, "whatwg-mimetype": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-2.2.0.tgz", - "integrity": "sha512-5YSO1nMd5D1hY3WzAQV3PzZL83W3YeyR1yW9PcH26Weh1t+Vzh9B6XkDh7aXm83HBZ4nSMvkjvN2H2ySWIvBgw==", + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-2.3.0.tgz", + "integrity": "sha512-M4yMwr6mAnQz76TbJm914+gPpB/nCwvZbJU28cUD6dR004SAxDLOOSUaB1JDRqLtaOV/vi0IC5lEAGFgrjGv/g==", "dev": true }, "whatwg-url": { @@ -15823,9 +13149,9 @@ } }, "which-module": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/which-module/-/which-module-2.0.0.tgz", - "integrity": "sha1-2e8H3Od7mQK4o6j6SzHD4/fm6Ho=", + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/which-module/-/which-module-1.0.0.tgz", + "integrity": "sha1-u6Y8qGGUiZT/MHc2CJ47lgJsKk8=", "dev": true }, "wide-align": { @@ -15956,15 +13282,6 @@ "resolved": "http://registry.npmjs.org/pify/-/pify-2.3.0.tgz", "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=", "dev": true - }, - "user-home": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/user-home/-/user-home-2.0.0.tgz", - "integrity": "sha1-nHC/2Babwdy/SGBODwS4tJzenp8=", - "dev": true, - "requires": { - "os-homedir": "^1.0.0" - } } } }, @@ -16027,40 +13344,15 @@ "which-module": "^1.0.0", "y18n": "^3.2.1", "yargs-parser": "^5.0.0" - }, - "dependencies": { - "which-module": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/which-module/-/which-module-1.0.0.tgz", - "integrity": "sha1-u6Y8qGGUiZT/MHc2CJ47lgJsKk8=", - "dev": true - }, 
- "yargs-parser": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-5.0.0.tgz", - "integrity": "sha1-J17PDX/+Bcd+ZOfIbkzZS/DhIoo=", - "dev": true, - "requires": { - "camelcase": "^3.0.0" - } - } } }, "yargs-parser": { - "version": "9.0.2", - "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-9.0.2.tgz", - "integrity": "sha1-nM9qQ0YP5O1Aqbto9I1DuKaMwHc=", + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-5.0.0.tgz", + "integrity": "sha1-J17PDX/+Bcd+ZOfIbkzZS/DhIoo=", "dev": true, "requires": { - "camelcase": "^4.1.0" - }, - "dependencies": { - "camelcase": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-4.1.0.tgz", - "integrity": "sha1-1UVjW+HjPFQmScaRc+Xeas+uNN0=", - "dev": true - } + "camelcase": "^3.0.0" } }, "yn": { diff --git a/js/package.json b/js/package.json index cf49e41dbe2f4..a80886d82733d 100644 --- a/js/package.json +++ b/js/package.json @@ -8,10 +8,10 @@ }, "scripts": { "lerna": "lerna", - "test": "gulp test", - "build": "gulp build", - "clean": "gulp clean", - "debug": "gulp debug", + "test": "NODE_NO_WARNINGS=1 gulp test", + "build": "NODE_NO_WARNINGS=1 gulp build", + "clean": "NODE_NO_WARNINGS=1 gulp clean", + "debug": "NODE_NO_WARNINGS=1 gulp debug", "perf": "node ./perf/index.js", "test:integration": "node ./bin/integration.js --mode validate", "create:perfdata": "python ./test/data/tables/generate.py ./test/data/tables/tracks.arrow", @@ -19,11 +19,14 @@ "clean:all": "run-p clean clean:testdata", "clean:testdata": "gulp clean:testdata", "create:testdata": "gulp create:testdata", - "test:coverage": "gulp test -t ts --coverage", - "doc": "shx rm -rf ./doc && typedoc --mode file --out doc src/Arrow.ts", - "lint": "run-p lint:*", + "test:coverage": "gulp test -t src --coverage", + "doc": "shx rm -rf ./doc && typedoc --tsconfig tsconfig.json --target ES5 --module commonjs --mode modules --ignoreCompilerErrors --out doc src", + "lint": "run-p lint:src lint:test", + "lint:ci": "run-p lint:src:ci lint:test:ci", "lint:src": "tslint --fix --project -p tsconfig.json -c tslint.json \"src/**/*.ts\"", "lint:test": "tslint --fix --project -p test/tsconfig.json -c tslint.json \"test/**/*.ts\"", + "lint:src:ci": "tslint --project -p tsconfig.json -c tslint.json \"src/**/*.ts\"", + "lint:test:ci": "tslint --project -p test/tsconfig.json -c tslint.json \"test/**/*.ts\"", "prepublishOnly": "echo \"Error: do 'npm run release' instead of 'npm publish'\" && exit 1", "version": "npm install && npm run clean:all" }, @@ -53,99 +56,60 @@ "npm-release.sh" ], "dependencies": { - "@types/flatbuffers": "1.9.0", - "@types/node": "10.12.0", - "@types/text-encoding-utf-8": "1.0.1", + "@types/flatbuffers": "^1.9.0", + "@types/node": "^10.12.18", + "@types/text-encoding-utf-8": "^1.0.1", "command-line-args": "5.0.2", "command-line-usage": "5.0.5", - "flatbuffers": "1.10.2", + "flatbuffers": "^1.10.2", "json-bignum": "0.0.3", + "pad-left": "2.1.0", "text-encoding-utf-8": "1.0.2", - "tslib": "1.9.3" + "tslib": "^1.9.3" }, "devDependencies": { - "@std/esm": "0.26.0", + "@mattiasbuelens/web-streams-polyfill": "0.2.1", "@types/glob": "7.1.1", - "@types/jest": "23.3.5", + "@types/jest": "23.3.10", + "async-done": "1.3.1", "benchmark": "2.1.4", "coveralls": "3.0.2", "del": "3.0.0", + "esm": "3.0.84", "glob": "7.1.3", - "google-closure-compiler": "20181008.0.0", + "google-closure-compiler": "20181210.0.0", "gulp": "4.0.0", - "gulp-json-transform": "0.4.5", + 
"gulp-json-transform": "0.4.6", "gulp-rename": "1.4.0", "gulp-sourcemaps": "2.6.4", - "gulp-typescript": "5.0.0-alpha.3", - "ix": "2.3.5", + "gulp-typescript": "5.0.0", + "ix": "2.4.3", "jest": "23.6.0", "jest-environment-node-debug": "2.0.0", + "jest-silent-reporter": "0.1.1", "json": "9.0.6", - "lerna": "3.4.3", - "lint-staged": "7.3.0", - "merge2": "1.2.3", + "lerna": "3.8.0", + "memfs": "2.14.2", "mkdirp": "0.5.1", + "multistream": "2.1.1", "npm-run-all": "4.1.5", - "pump": "3.0.0", + "randomatic": "3.1.1", "rimraf": "2.6.2", "rxjs": "5.5.11", "shx": "0.3.2", "source-map-loader": "0.2.4", - "terser-webpack-plugin": "1.1.0", + "terser-webpack-plugin": "1.2.1", "trash": "4.3.0", - "ts-jest": "22.4.6", + "ts-jest": "23.10.5", "ts-node": "7.0.1", - "tslint": "5.11.0", - "typedoc": "0.12", - "typescript": "3.0.3", - "webpack": "4.23.1", + "tslint": "5.12.0", + "typedoc": "0.13.0", + "typescript": "3.2.2", + "web-stream-tools": "0.0.1", + "webpack": "4.28.3", "xml2js": "0.4.19" }, "engines": { - "node": ">=10.0" - }, - "@std/esm": { - "warnings": false - }, - "lint-staged": { - "*.@(ts)": [ - "tslint --fix", - "git add" - ] - }, - "jest": { - "verbose": false, - "testEnvironment": "node", - "globals": { - "ts-jest": { - "skipBabel": true, - "tsConfigFile": "test/tsconfig.json" - } - }, - "roots": [ - "/test/" - ], - "moduleFileExtensions": [ - "js", - "ts", - "tsx" - ], - "coverageReporters": [ - "lcov" - ], - "coveragePathIgnorePatterns": [ - "fb\\/(File|Message|Schema|Tensor)_generated\\.(js|ts)$", - "test\\/.*\\.(ts|tsx|js)$", - "/node_modules/" - ], - "transform": { - ".(ts|tsx)": "./node_modules/ts-jest/preprocessor.js", - ".(js|jsx)": "./node_modules/babel-jest/build/index.js" - }, - "transformIgnorePatterns": [ - "/node_modules/", - "/(es2015|esnext)/umd/" - ], - "testRegex": "(.*(-|\\.)(test|spec)s?)\\.(ts|tsx|js)$" + "node": ">=11.0" } } diff --git a/js/perf/index.js b/js/perf/index.js index 2c07591925328..0e9c2bd689aae 100644 --- a/js/perf/index.js +++ b/js/perf/index.js @@ -16,10 +16,10 @@ // under the License. 
// Use the ES5 UMD target as perf baseline -// const { predicate, Table, read: readBatches } = require('../targets/es5/umd'); -// const { predicate, Table, read: readBatches } = require('../targets/es5/cjs'); -// const { predicate, Table, read: readBatches } = require('../targets/es2015/umd'); -const { predicate, Table, read: readBatches } = require('../targets/es2015/cjs'); +// const { predicate, Table, RecordBatchReader } = require('../targets/es5/umd'); +// const { predicate, Table, RecordBatchReader } = require('../targets/es5/cjs'); +// const { predicate, Table, RecordBatchReader } = require('../targets/es2015/umd'); +const { predicate, Table, RecordBatchReader } = require('../targets/es2015/cjs'); const { col } = predicate; const Benchmark = require('benchmark'); @@ -91,7 +91,7 @@ function createReadBatchesTest(name, buffers) { return { async: true, name: `readBatches\n`, - fn() { for (recordBatch of readBatches(buffers)) {} } + fn() { for (recordBatch of RecordBatchReader.from(buffers)) {} } }; } @@ -139,34 +139,36 @@ function createDataFrameDirectCountTest(table, column, test, value) { let sum, colidx = table.schema.fields.findIndex((c)=>c.name === column); if (test == 'gt') { - op = function () { + op = () => { sum = 0; - let batches = table.batches; + let batches = table.chunks; let numBatches = batches.length; for (let batchIndex = -1; ++batchIndex < numBatches;) { // load batches const batch = batches[batchIndex]; const vector = batch.getChildAt(colidx); // yield all indices - for (let index = -1; ++index < batch.length;) { + for (let index = -1, length = batch.length; ++index < length;) { sum += (vector.get(index) >= value); } } + return sum; } } else if (test == 'eq') { - op = function() { + op = () => { sum = 0; - let batches = table.batches; + let batches = table.chunks; let numBatches = batches.length; for (let batchIndex = -1; ++batchIndex < numBatches;) { // load batches const batch = batches[batchIndex]; const vector = batch.getChildAt(colidx); // yield all indices - for (let index = -1; ++index < batch.length;) { + for (let index = -1, length = batch.length; ++index < length;) { sum += (vector.get(index) === value); } } + return sum; } } else { throw new Error(`Unrecognized test "${test}"`); diff --git a/js/src/Arrow.dom.ts b/js/src/Arrow.dom.ts new file mode 100644 index 0000000000000..f9178df91e782 --- /dev/null +++ b/js/src/Arrow.dom.ts @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import streamAdapters from './io/adapters'; +import { RecordBatchReader } from './ipc/reader'; +import { RecordBatchWriter } from './ipc/writer'; +import { toDOMStream } from './ipc/whatwg/iterable'; +import { recordBatchReaderThroughDOMStream } from './ipc/whatwg/reader'; +import { recordBatchWriterThroughDOMStream } from './ipc/whatwg/writer'; + +streamAdapters.toDOMStream = toDOMStream; +RecordBatchReader['throughDOM'] = recordBatchReaderThroughDOMStream; +RecordBatchWriter['throughDOM'] = recordBatchWriterThroughDOMStream; + +export { + ArrowType, DateUnit, IntervalUnit, MessageHeader, MetadataVersion, Precision, TimeUnit, Type, UnionMode, VectorType, + Data, + DataType, + Null, + Bool, + Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, + Float, Float16, Float32, Float64, + Utf8, + Binary, + FixedSizeBinary, + Date_, DateDay, DateMillisecond, + Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, + Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, + Decimal, + List, + Struct, + Union, DenseUnion, SparseUnion, + Dictionary, + Interval, IntervalDayTime, IntervalYearMonth, + FixedSizeList, + Map_, + Table, + Column, + Schema, Field, + Visitor, + Vector, + BaseVector, + BinaryVector, + BoolVector, + Chunked, + DateVector, DateDayVector, DateMillisecondVector, + DecimalVector, + DictionaryVector, + FixedSizeBinaryVector, + FixedSizeListVector, + FloatVector, Float16Vector, Float32Vector, Float64Vector, + IntervalVector, IntervalDayTimeVector, IntervalYearMonthVector, + IntVector, Int8Vector, Int16Vector, Int32Vector, Int64Vector, Uint8Vector, Uint16Vector, Uint32Vector, Uint64Vector, + ListVector, + MapVector, + NullVector, + StructVector, + TimestampVector, TimestampSecondVector, TimestampMillisecondVector, TimestampMicrosecondVector, TimestampNanosecondVector, + TimeVector, TimeSecondVector, TimeMillisecondVector, TimeMicrosecondVector, TimeNanosecondVector, + UnionVector, DenseUnionVector, SparseUnionVector, + Utf8Vector, + ByteStream, AsyncByteStream, AsyncByteQueue, ReadableSource, WritableSink, + RecordBatchReader, RecordBatchFileReader, RecordBatchStreamReader, AsyncRecordBatchFileReader, AsyncRecordBatchStreamReader, + RecordBatchWriter, RecordBatchFileWriter, RecordBatchStreamWriter, RecordBatchJSONWriter, + MessageReader, AsyncMessageReader, JSONMessageReader, + Message, + RecordBatch, + ArrowJSONLike, FileHandle, Readable, Writable, ReadableWritable, ReadableDOMStreamOptions, + DataFrame, FilteredDataFrame, CountByResult, BindFunc, NextFunc, + predicate, + util +} from './Arrow'; diff --git a/js/src/Arrow.externs.js b/js/src/Arrow.externs.js deleted file mode 100644 index 7ad066585712e..0000000000000 --- a/js/src/Arrow.externs.js +++ /dev/null @@ -1,814 +0,0 @@ -// @ts-nocheck -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -/* tslint:disable */ - -/** - * @fileoverview Closure Compiler externs for Arrow - * @externs - * @suppress {duplicate,checkTypes} - */ -/** @type {symbol} */ -Symbol.iterator; -/** @type {symbol} */ -Symbol.asyncIterator; - -var Table = function() {}; -/** @type {?} */ -Table.from = function() {}; -/** @type {?} */ -Table.fromVectors = function() {}; -/** @type {?} */ -Table.fromAsync = function() {}; -/** @type {?} */ -Table.fromStruct = function() {}; -/** @type {?} */ -Table.empty = function() {}; -/** @type {?} */ -Table.prototype.schema; -/** @type {?} */ -Table.prototype.length; -/** @type {?} */ -Table.prototype.numCols; -/** @type {?} */ -Table.prototype.get; -/** @type {?} */ -Table.prototype.getColumn; -/** @type {?} */ -Table.prototype.getColumnAt; -/** @type {?} */ -Table.prototype.getColumnIndex; -/** @type {?} */ -Table.prototype.toArray; -/** @type {?} */ -Table.prototype.select; -/** @type {?} */ -Table.prototype.rowsToString; -/** @type {?} */ -Table.prototype.batchesUnion; -/** @type {?} */ -Table.prototype.batches; -/** @type {?} */ -Table.prototype.countBy; -/** @type {?} */ -Table.prototype.scan; -/** @type {?} */ -Table.prototype.serialize; - -var CountByResult = function() {}; -/** @type {?} */ -CountByResult.prototype.asJSON; - -var col = function () {}; -var lit = function () {}; -var and = function () {}; -var or = function () {}; -var custom = function () {}; - -var Value = function() {}; -/** @type {?} */ -Value.prototype.ge; -/** @type {?} */ -Value.prototype.le; -/** @type {?} */ -Value.prototype.eq; -/** @type {?} */ -Value.prototype.lt; -/** @type {?} */ -Value.prototype.gt; -/** @type {?} */ -Value.prototype.ne; - -var Col = function() {}; -/** @type {?} */ -Col.prototype.bind; -var CombinationPredicate = function () {}; -/** @type {?} */ -CombinationPredicate.prototype.children; -var Or = function() {}; -var And = function() {}; -var Not = function() {}; -var GTeq = function () {}; -/** @type {?} */ -GTeq.prototype.and; -/** @type {?} */ -GTeq.prototype.or; -var LTeq = function () {}; -/** @type {?} */ -LTeq.prototype.and; -/** @type {?} */ -LTeq.prototype.or; -var Equals = function () {}; -/** @type {?} */ -Equals.prototype.and; -/** @type {?} */ -Equals.prototype.or; -var Predicate = function() {}; -/** @type {?} */ -Predicate.prototype.bind; -/** @type {?} */ -Predicate.prototype.and; -/** @type {?} */ -Predicate.prototype.or; -/** @type {?} */ -Predicate.prototype.not; -/** @type {?} */ -Predicate.prototype.ands; -var Literal = function() {}; - -var PipeIterator = function() {}; -/** @type {?} */ -PipeIterator.prototype.pipe; - -var AsyncPipeIterator = function() {}; -/** @type {?} */ -AsyncPipeIterator.prototype.pipe; - -var RecordBatch = function() {}; -/** @type {?} */ -RecordBatch.from = function() {}; -/** @type {?} */ -RecordBatch.prototype.numCols; -/** @type {?} */ -RecordBatch.prototype.length; -/** @type {?} */ -RecordBatch.prototype.schema; -/** @type {?} */ -RecordBatch.prototype.columns; -/** @type {?} */ -RecordBatch.prototype.select; - -var Vector = function() {}; -/** @type {?} */ -Vector.create = function() {}; -/** @type {?} */ -Vector.prototype.data; -/** @type {?} */ -Vector.prototype.type; -/** @type {?} */ -Vector.prototype.length; -/** @type {?} */ -Vector.prototype.nullCount; -/** @type {?} */ -Vector.prototype.nullBitmap; -/** @type {?} */ -Vector.prototype.isValid; -/** @type {?} */ -Vector.prototype.get; -/** @type 
{?} */ -Vector.prototype.set; -/** @type {?} */ -Vector.prototype.toArray; -/** @type {?} */ -Vector.prototype.concat; -/** @type {?} */ -Vector.prototype.slice; -/** @type {?} */ -Vector.prototype.acceptTypeVisitor; - -var BaseInt64 = function() {}; -/** @type {?} */ -BaseInt64.prototype.lessThan; -/** @type {?} */ -BaseInt64.prototype.equals; -/** @type {?} */ -BaseInt64.prototype.greaterThan; -/** @type {?} */ -BaseInt64.prototype.hex; - -var Uint64 = function() {}; -/** @type {?} */ -Uint64.add = function() {}; -/** @type {?} */ -Uint64.multiply = function() {}; -/** @type {?} */ -Uint64.from = function() {}; -/** @type {?} */ -Uint64.fromNumber = function() {}; -/** @type {?} */ -Uint64.fromString = function() {}; -/** @type {?} */ -Uint64.prototype.times; -/** @type {?} */ -Uint64.prototype.plus - -var Int64 = function() {}; -/** @type {?} */ -Int64.add = function() {}; -/** @type {?} */ -Int64.multiply = function() {}; -/** @type {?} */ -Int64.from = function() {}; -/** @type {?} */ -Int64.fromNumber = function() {}; -/** @type {?} */ -Int64.fromString = function() {}; -/** @type {?} */ -Int64.prototype.negate -/** @type {?} */ -Int64.prototype.times -/** @type {?} */ -Int64.prototype.plus -/** @type {?} */ -Int64.prototype.lessThan - -var Int128 = function() {}; -/** @type {?} */ -Int128.add = function() {}; -/** @type {?} */ -Int128.multiply = function() {}; -/** @type {?} */ -Int128.from = function() {}; -/** @type {?} */ -Int128.fromNumber = function() {}; -/** @type {?} */ -Int128.fromString = function() {}; -/** @type {?} */ -Int128.prototype.negate -/** @type {?} */ -Int128.prototype.times -/** @type {?} */ -Int128.prototype.plus -/** @type {?} */ -Int128.prototype.hex - -var packBools = function() {}; - -var Type = function() {}; -/** @type {?} */ -Type.NONE = function() {}; -/** @type {?} */ -Type.Null = function() {}; -/** @type {?} */ -Type.Int = function() {}; -/** @type {?} */ -Type.Float = function() {}; -/** @type {?} */ -Type.FloatingPoint = function() {}; -/** @type {?} */ -Type.Binary = function() {}; -/** @type {?} */ -Type.Utf8 = function() {}; -/** @type {?} */ -Type.Bool = function() {}; -/** @type {?} */ -Type.Decimal = function() {}; -/** @type {?} */ -Type.Date = function() {}; -/** @type {?} */ -Type.Time = function() {}; -/** @type {?} */ -Type.Timestamp = function() {}; -/** @type {?} */ -Type.Interval = function() {}; -/** @type {?} */ -Type.List = function() {}; -/** @type {?} */ -Type.Struct = function() {}; -/** @type {?} */ -Type.Struct_ = function() {}; -/** @type {?} */ -Type.Union = function() {}; -/** @type {?} */ -Type.FixedSizeBinary = function() {}; -/** @type {?} */ -Type.FixedSizeList = function() {}; -/** @type {?} */ -Type.Map = function() {}; -/** @type {?} */ -Type.Dictionary = function() {}; -/** @type {?} */ -Type.DenseUnion = function() {}; -/** @type {?} */ -Type.SparseUnion = function() {}; - -var DateUnit = function() {}; -/** @type {?} */ -DateUnit.DAY = function() {}; -/** @type {?} */ -DateUnit.MILLISECOND = function() {}; -var TimeUnit = function() {}; -/** @type {?} */ -TimeUnit.SECOND = function() {}; -/** @type {?} */ -TimeUnit.MILLISECOND = function() {}; -/** @type {?} */ -TimeUnit.MICROSECOND = function() {}; -/** @type {?} */ -TimeUnit.NANOSECOND = function() {}; -var Precision = function() {}; -/** @type {?} */ -Precision.HALF = function() {}; -/** @type {?} */ -Precision.SINGLE = function() {}; -/** @type {?} */ -Precision.DOUBLE = function() {}; -var UnionMode = function() {}; -/** @type {?} */ -UnionMode.Sparse = 
function() {}; -/** @type {?} */ -UnionMode.Dense = function() {}; -var VectorType = function() {}; -/** @type {?} */ -VectorType.OFFSET = function() {}; -/** @type {?} */ -VectorType.DATA = function() {}; -/** @type {?} */ -VectorType.VALIDITY = function() {}; -/** @type {?} */ -VectorType.TYPE = function() {}; -var IntervalUnit = function() {}; -/** @type {?} */ -IntervalUnit.YEAR_MONTH = function() {}; -/** @type {?} */ -IntervalUnit.DAY_TIME = function() {}; -var MessageHeader = function() {}; -/** @type {?} */ -MessageHeader.NONE = function() {}; -/** @type {?} */ -MessageHeader.Schema = function() {}; -/** @type {?} */ -MessageHeader.DictionaryBatch = function() {}; -/** @type {?} */ -MessageHeader.RecordBatch = function() {}; -/** @type {?} */ -MessageHeader.Tensor = function() {}; -var MetadataVersion = function() {}; -/** @type {?} */ -MetadataVersion.V1 = function() {}; -/** @type {?} */ -MetadataVersion.V2 = function() {}; -/** @type {?} */ -MetadataVersion.V3 = function() {}; -/** @type {?} */ -MetadataVersion.V4 = function() {}; - -var DataType = function() {}; -/** @type {?} */ -DataType.isNull = function() {}; -/** @type {?} */ -DataType.isInt = function() {}; -/** @type {?} */ -DataType.isFloat = function() {}; -/** @type {?} */ -DataType.isBinary = function() {}; -/** @type {?} */ -DataType.isUtf8 = function() {}; -/** @type {?} */ -DataType.isBool = function() {}; -/** @type {?} */ -DataType.isDecimal = function() {}; -/** @type {?} */ -DataType.isDate = function() {}; -/** @type {?} */ -DataType.isTime = function() {}; -/** @type {?} */ -DataType.isTimestamp = function() {}; -/** @type {?} */ -DataType.isInterval = function() {}; -/** @type {?} */ -DataType.isList = function() {}; -/** @type {?} */ -DataType.isStruct = function() {}; -/** @type {?} */ -DataType.isUnion = function() {}; -/** @type {?} */ -DataType.isDenseUnion = function() {}; -/** @type {?} */ -DataType.isSparseUnion = function() {}; -/** @type {?} */ -DataType.isFixedSizeBinary = function() {}; -/** @type {?} */ -DataType.isFixedSizeList = function() {}; -/** @type {?} */ -DataType.isMap = function() {}; -/** @type {?} */ -DataType.isDictionary = function() {}; -/** @type {?} */ -DataType.prototype.ArrayType; - -var Schema = function() {}; -/** @type {?} */ -Schema.from = function() {}; -/** @type {?} */ -Schema.prototype.fields; -/** @type {?} */ -Schema.prototype.version; -/** @type {?} */ -Schema.prototype.metadata; -/** @type {?} */ -Schema.prototype.dictionaries; -/** @type {?} */ -Schema.prototype.select; -var Field = function() {}; -/** @type {?} */ -Field.prototype.name; -/** @type {?} */ -Field.prototype.type; -/** @type {?} */ -Field.prototype.nullable; -/** @type {?} */ -Field.prototype.metadata; -var Null = function() {}; -var Int8 = function() {}; -var Int16 = function() {}; -var Int32 = function() {}; -var Int64 = function() {}; -var Uint8 = function() {}; -var Uint16 = function() {}; -var Uint32 = function() {}; -var Uint64 = function() {}; -var Float16 = function() {}; -var Float32 = function() {}; -var Float64 = function() {}; -var Binary = function() {}; -var Utf8 = function() {}; -var Bool = function() {}; -var Decimal = function() {}; -var Date_ = function() {}; -var Time = function() {}; -var Timestamp = function() {}; -var Interval = function() {}; -var List = function() {}; -var Struct = function() {}; -var Union = function() {}; -var DenseUnion = function() {}; -var SparseUnion = function() {}; -var FixedSizeBinary = function() {}; -var FixedSizeList = function() {}; -var Map_ = 
function() {}; -var Dictionary = function() {}; - -var BaseData = function() {}; -/** @type {?} */ -BaseData.prototype.type; -/** @type {?} */ -BaseData.prototype.clone; -/** @type {?} */ -BaseData.prototype.slice; -/** @type {?} */ -BaseData.prototype.length; -/** @type {?} */ -BaseData.prototype.offset; -/** @type {?} */ -BaseData.prototype.typeId; -/** @type {?} */ -BaseData.prototype.childData; -/** @type {?} */ -BaseData.prototype.nullBitmap; -/** @type {?} */ -BaseData.prototype.nullCount; - -var BoolData = function() {}; -var NestedData = function() {}; -var SparseUnionData = function() {}; -var ChunkedData = function() {}; - -var FlatData = function() {}; -/** @type {?} */ -FlatData.prototype.values; - -var FlatListData = function() {}; -/** @type {?} */ -FlatListData.prototype.values; -/** @type {?} */ -FlatListData.prototype.valueOffsets; - -var DictionaryData = function() {}; -/** @type {?} */ -DictionaryData.prototype.indices; -/** @type {?} */ -DictionaryData.prototype.dictionary; - -var ListData = function() {}; -/** @type {?} */ -ListData.prototype.values; -/** @type {?} */ -ListData.prototype.valueOffsets; - -var UnionData = function() {}; -/** @type {?} */ -UnionData.prototype.typeIds; - -var DenseUnionData = function() {}; -/** @type {?} */ -DenseUnionData.prototype.valueOffsets; - -var ChunkedData = function() {}; -/** @type {?} */ -ChunkedData.computeOffsets = function() {}; - -var FlatVector = function() {}; -/** @type {?} */ -FlatVector.prototype.values; -/** @type {?} */ -FlatVector.prototype.lows; -/** @type {?} */ -FlatVector.prototype.highs; -/** @type {?} */ -FlatVector.prototype.asInt32; - -var ListVectorBase = function() {}; -/** @type {?} */ -ListVectorBase.prototype.values; -/** @type {?} */ -ListVectorBase.prototype.valueOffsets; -/** @type {?} */ -ListVectorBase.prototype.getValueOffset; -/** @type {?} */ -ListVectorBase.prototype.getValueLength; - -var NestedVector = function() {}; -/** @type {?} */ -NestedVector.prototype.childData; -/** @type {?} */ -NestedVector.prototype.getChildAt; - -var NullVector = function() {}; -var BoolVector = function() {}; -/** @type {?} */ -BoolVector.from = function() {}; -/** @type {?} */ -BoolVector.prototype.values; -var IntVector = function() {}; -/** @type {?} */ -IntVector.from = function() {}; - -var FloatVector = function() {}; -/** @type {?} */ -FloatVector.from = function() {}; - -var DateVector = function() {}; -/** @type {?} */ -DateVector.from = function() {}; -/** @type {?} */ -DateVector.prototype.asEpochMilliseconds; -var DecimalVector = function() {}; -var TimeVector = function() {}; -var TimestampVector = function() {}; -/** @type {?} */ -TimestampVector.prototype.asEpochMilliseconds; -var IntervalVector = function() {}; -var BinaryVector = function() {}; -/** @type {?} */ -BinaryVector.prototype.asUtf8; -var FixedSizeBinaryVector = function() {}; -var Utf8Vector = function() {}; -/** @type {?} */ -Utf8Vector.prototype.asBinary; -var ListVector = function() {}; -/** @type {?} */ -ListVector.prototype.getChildAt; -var FixedSizeListVector = function() {}; -/** @type {?} */ -FixedSizeListVector.prototype.getChildAt; -var MapVector = function() {}; -/** @type {?} */ -MapVector.prototype.asStruct; -var StructVector = function() {}; -/** @type {?} */ -StructVector.prototype.asMap; -var UnionVector = function() {}; - -var DictionaryVector = function() {}; -/** @type {?} */ -DictionaryVector.prototype.indices; -/** @type {?} */ -DictionaryVector.prototype.dictionary; -/** @type {?} */ 
-DictionaryVector.prototype.getKey; -/** @type {?} */ -DictionaryVector.prototype.getValue; -/** @type {?} */ -DictionaryVector.prototype.reverseLookup; - -var FlatView = function() {}; -/** @type {?} */ -FlatView.prototype.get; -/** @type {?} */ -FlatView.prototype.clone; -/** @type {?} */ -FlatView.prototype.isValid; -/** @type {?} */ -FlatView.prototype.toArray; -/** @type {?} */ -FlatView.prototype.set; - -var PrimitiveView = function() {}; -/** @type {?} */ -PrimitiveView.prototype.size; -/** @type {?} */ -PrimitiveView.prototype.clone; - -var NullView = function() {}; -/** @type {?} */ -NullView.prototype.get; -/** @type {?} */ -NullView.prototype.clone; -/** @type {?} */ -NullView.prototype.isValid; -/** @type {?} */ -NullView.prototype.toArray; -/** @type {?} */ -NullView.prototype.set; - -var BoolView = function() {}; -/** @type {?} */ -BoolView.prototype.get; -/** @type {?} */ -BoolView.prototype.clone; -/** @type {?} */ -BoolView.prototype.isValid; -/** @type {?} */ -BoolView.prototype.toArray; -/** @type {?} */ -BoolView.prototype.set; - -var ValidityView = function() {}; -/** @type {?} */ -ValidityView.prototype.get; -/** @type {?} */ -ValidityView.prototype.clone; -/** @type {?} */ -ValidityView.prototype.isValid; -/** @type {?} */ -ValidityView.prototype.toArray; -/** @type {?} */ -ValidityView.prototype.set; -/** @type {?} */ -ValidityView.prototype.size; -/** @type {?} */ -ValidityView.prototype.getChildAt; - -var DictionaryView = function() {}; -/** @type {?} */ -DictionaryView.prototype.get; -/** @type {?} */ -DictionaryView.prototype.clone; -/** @type {?} */ -DictionaryView.prototype.isValid; -/** @type {?} */ -DictionaryView.prototype.toArray; -/** @type {?} */ -DictionaryView.prototype.set; - -var ListViewBase = function() {}; -/** @type {?} */ -ListViewBase.prototype.get; -/** @type {?} */ -ListViewBase.prototype.clone; -/** @type {?} */ -ListViewBase.prototype.isValid; -/** @type {?} */ -ListViewBase.prototype.toArray; -/** @type {?} */ -ListViewBase.prototype.set; - -var NestedView = function() {}; -/** @type {?} */ -NestedView.prototype.get; -/** @type {?} */ -NestedView.prototype.clone; -/** @type {?} */ -NestedView.prototype.isValid; -/** @type {?} */ -NestedView.prototype.toArray; -/** @type {?} */ -NestedView.prototype.set; - -var ChunkedView = function() {}; -/** @type {?} */ -ChunkedView.prototype.get; -/** @type {?} */ -ChunkedView.prototype.clone; -/** @type {?} */ -ChunkedView.prototype.isValid; -/** @type {?} */ -ChunkedView.prototype.toArray; -/** @type {?} */ -ChunkedView.prototype.set; - -var ListView = function() {}; -var FixedSizeListView = function() {}; -var BinaryView = function() {}; -var Utf8View = function() {}; -var UnionView = function() {}; -var DenseUnionView = function() {}; -var StructView = function() {}; -var MapView = function() {}; -var NullView = function() {}; -var FixedSizeView = function() {}; -var Float16View = function() {}; -var DateDayView = function() {}; -var DateMillisecondView = function() {}; -var TimestampDayView = function() {}; -var TimestampSecondView = function() {}; -var TimestampMillisecondView = function() {}; -var TimestampMicrosecondView = function() {}; -var TimestampNanosecondView = function() {}; -var IntervalYearMonthView = function() {}; -var IntervalYearView = function() {}; -var IntervalMonthView = function() {}; - -var TypeVisitor = function() {}; -/** @type {?} */ -TypeVisitor.visitTypeInline = function() {}; -/** @type {?} */ -TypeVisitor.prototype.visit; -/** @type {?} */ 
-TypeVisitor.prototype.visitMany; -/** @type {?} */ -TypeVisitor.prototype.visitNull; -/** @type {?} */ -TypeVisitor.prototype.visitBool; -/** @type {?} */ -TypeVisitor.prototype.visitInt; -/** @type {?} */ -TypeVisitor.prototype.visitFloat; -/** @type {?} */ -TypeVisitor.prototype.visitUtf8; -/** @type {?} */ -TypeVisitor.prototype.visitBinary; -/** @type {?} */ -TypeVisitor.prototype.visitFixedSizeBinary; -/** @type {?} */ -TypeVisitor.prototype.visitDate; -/** @type {?} */ -TypeVisitor.prototype.visitTimestamp; -/** @type {?} */ -TypeVisitor.prototype.visitTime; -/** @type {?} */ -TypeVisitor.prototype.visitDecimal; -/** @type {?} */ -TypeVisitor.prototype.visitList; -/** @type {?} */ -TypeVisitor.prototype.visitStruct; -/** @type {?} */ -TypeVisitor.prototype.visitUnion; -/** @type {?} */ -TypeVisitor.prototype.visitDictionary; -/** @type {?} */ -TypeVisitor.prototype.visitInterval; -/** @type {?} */ -TypeVisitor.prototype.visitFixedSizeList; -/** @type {?} */ -TypeVisitor.prototype.visitMap; - -var VectorVisitor = function() {}; -/** @type {?} */ -VectorVisitor.visitTypeInline = function() {}; -/** @type {?} */ -VectorVisitor.prototype.visit; -/** @type {?} */ -VectorVisitor.prototype.visitMany; -/** @type {?} */ -VectorVisitor.prototype.visitNull; -/** @type {?} */ -VectorVisitor.prototype.visitBool; -/** @type {?} */ -VectorVisitor.prototype.visitInt; -/** @type {?} */ -VectorVisitor.prototype.visitFloat; -/** @type {?} */ -VectorVisitor.prototype.visitUtf8; -/** @type {?} */ -VectorVisitor.prototype.visitBinary; -/** @type {?} */ -VectorVisitor.prototype.visitFixedSizeBinary; -/** @type {?} */ -VectorVisitor.prototype.visitDate; -/** @type {?} */ -VectorVisitor.prototype.visitTimestamp; -/** @type {?} */ -VectorVisitor.prototype.visitTime; -/** @type {?} */ -VectorVisitor.prototype.visitDecimal; -/** @type {?} */ -VectorVisitor.prototype.visitList; -/** @type {?} */ -VectorVisitor.prototype.visitStruct; -/** @type {?} */ -VectorVisitor.prototype.visitUnion; -/** @type {?} */ -VectorVisitor.prototype.visitDictionary; -/** @type {?} */ -VectorVisitor.prototype.visitInterval; -/** @type {?} */ -VectorVisitor.prototype.visitFixedSizeList; -/** @type {?} */ -VectorVisitor.prototype.visitMap; diff --git a/js/src/Arrow.node.ts b/js/src/Arrow.node.ts new file mode 100644 index 0000000000000..da6e3df6d9b08 --- /dev/null +++ b/js/src/Arrow.node.ts @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import streamAdapters from './io/adapters'; +import { RecordBatchReader } from './ipc/reader'; +import { RecordBatchWriter } from './ipc/writer'; +import { toNodeStream } from './ipc/node/iterable'; +import { recordBatchReaderThroughNodeStream } from './ipc/node/reader'; +import { recordBatchWriterThroughNodeStream } from './ipc/node/writer'; + +streamAdapters.toNodeStream = toNodeStream; +RecordBatchReader['throughNode'] = recordBatchReaderThroughNodeStream; +RecordBatchWriter['throughNode'] = recordBatchWriterThroughNodeStream; + +export * from './Arrow.dom'; diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts index c76578b62996d..0e5a5fe3bc280 100644 --- a/js/src/Arrow.ts +++ b/js/src/Arrow.ts @@ -15,306 +15,78 @@ // specific language governing permissions and limitations // under the License. -import * as type_ from './type'; -import * as data_ from './data'; -import * as vector_ from './vector'; +export { ArrowType, DateUnit, IntervalUnit, MessageHeader, MetadataVersion, Precision, TimeUnit, Type, UnionMode, VectorType } from './enum'; +export { Data } from './data'; +export { + DataType, + Null, + Bool, + Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, + Float, Float16, Float32, Float64, + Utf8, + Binary, + FixedSizeBinary, + Date_, DateDay, DateMillisecond, + Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, + Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, + Decimal, + List, + Struct, + Union, DenseUnion, SparseUnion, + Dictionary, + Interval, IntervalDayTime, IntervalYearMonth, + FixedSizeList, + Map_, +} from './type'; + +export { Table } from './table'; +export { Column } from './column'; +export { Schema, Field } from './schema'; +export { Visitor } from './visitor'; +export { + Row, + Vector, + BaseVector, + BinaryVector, + BoolVector, + Chunked, + DateVector, DateDayVector, DateMillisecondVector, + DecimalVector, + DictionaryVector, + FixedSizeBinaryVector, + FixedSizeListVector, + FloatVector, Float16Vector, Float32Vector, Float64Vector, + IntervalVector, IntervalDayTimeVector, IntervalYearMonthVector, + IntVector, Int8Vector, Int16Vector, Int32Vector, Int64Vector, Uint8Vector, Uint16Vector, Uint32Vector, Uint64Vector, + ListVector, + MapVector, + NullVector, + StructVector, + TimestampVector, TimestampSecondVector, TimestampMillisecondVector, TimestampMicrosecondVector, TimestampNanosecondVector, + TimeVector, TimeSecondVector, TimeMillisecondVector, TimeMicrosecondVector, TimeNanosecondVector, + UnionVector, DenseUnionVector, SparseUnionVector, + Utf8Vector, +} from './vector/index'; + +export { ByteStream, AsyncByteStream, AsyncByteQueue, ReadableSource, WritableSink } from './io/stream'; +export { RecordBatchReader, RecordBatchFileReader, RecordBatchStreamReader, AsyncRecordBatchFileReader, AsyncRecordBatchStreamReader } from './ipc/reader'; +export { RecordBatchWriter, RecordBatchFileWriter, RecordBatchStreamWriter, RecordBatchJSONWriter } from './ipc/writer'; +export { MessageReader, AsyncMessageReader, JSONMessageReader } from './ipc/message'; +export { Message } from './ipc/metadata/message'; +export { RecordBatch } from './recordbatch'; +export { ArrowJSONLike, FileHandle, Readable, Writable, ReadableWritable, ReadableDOMStreamOptions } from './io/interfaces'; +export { DataFrame, FilteredDataFrame, CountByResult, BindFunc, NextFunc } from './compute/dataframe'; + import * as util_int_ from './util/int'; import * as util_bit_ from './util/bit'; -import * as util_node from './util/node'; 
-import * as visitor_ from './visitor'; -import * as view_ from './vector/view'; -import * as predicate_ from './predicate'; -import { Vector } from './vector'; -import { RecordBatch } from './recordbatch'; -import { Schema, Field, Type } from './type'; -import { Table, DataFrame, NextFunc, BindFunc, CountByResult } from './table'; -import { fromReadableStream } from './ipc/reader/node'; -import { read, readAsync, readStream } from './ipc/reader/arrow'; -import { readBuffersAsync, readRecordBatchesAsync } from './ipc/reader/arrow'; -import { serializeFile, serializeStream } from './ipc/writer/binary'; - -export import View = vector_.View; -export import VectorLike = vector_.VectorLike; -export import TypedArray = type_.TypedArray; -export import IntBitWidth = type_.IntBitWidth; -export import TimeBitWidth = type_.TimeBitWidth; -export import TypedArrayConstructor = type_.TypedArrayConstructor; - -export { fromReadableStream }; -export { read, readAsync, readStream }; -export { readBuffersAsync, readRecordBatchesAsync }; -export { serializeFile, serializeStream }; -export { Table, DataFrame, NextFunc, BindFunc, CountByResult }; -export { Field, Schema, RecordBatch, Vector, Type }; - -export namespace util { - export import Uint64 = util_int_.Uint64; - export import Int64 = util_int_.Int64; - export import Int128 = util_int_.Int128; - export import packBools = util_bit_.packBools; - export import PipeIterator = util_node.PipeIterator; - export import AsyncPipeIterator = util_node.AsyncPipeIterator; -} - -export namespace data { - export import BaseData = data_.BaseData; - export import FlatData = data_.FlatData; - export import BoolData = data_.BoolData; - export import FlatListData = data_.FlatListData; - export import DictionaryData = data_.DictionaryData; - export import NestedData = data_.NestedData; - export import ListData = data_.ListData; - export import UnionData = data_.UnionData; - export import SparseUnionData = data_.SparseUnionData; - export import DenseUnionData = data_.DenseUnionData; - export import ChunkedData = data_.ChunkedData; -} - -export namespace enum_ { - export import Type = type_.ArrowType; - export import DateUnit = type_.DateUnit; - export import TimeUnit = type_.TimeUnit; - export import Precision = type_.Precision; - export import UnionMode = type_.UnionMode; - export import VectorType = type_.VectorType; - export import IntervalUnit = type_.IntervalUnit; - export import MessageHeader = type_.MessageHeader; - export import MetadataVersion = type_.MetadataVersion; -} - -export namespace type { - export import Schema = type_.Schema; - export import Field = type_.Field; - export import Null = type_.Null; - export import Int = type_.Int; - export import Int8 = type_.Int8; - export import Int16 = type_.Int16; - export import Int32 = type_.Int32; - export import Int64 = type_.Int64; - export import Uint8 = type_.Uint8; - export import Uint16 = type_.Uint16; - export import Uint32 = type_.Uint32; - export import Uint64 = type_.Uint64; - export import Float = type_.Float; - export import Float16 = type_.Float16; - export import Float32 = type_.Float32; - export import Float64 = type_.Float64; - export import Binary = type_.Binary; - export import Utf8 = type_.Utf8; - export import Bool = type_.Bool; - export import Decimal = type_.Decimal; - export import Date_ = type_.Date_; - export import Time = type_.Time; - export import Timestamp = type_.Timestamp; - export import Interval = type_.Interval; - export import List = type_.List; - export import Struct = 
type_.Struct; - export import Union = type_.Union; - export import DenseUnion = type_.DenseUnion; - export import SparseUnion = type_.SparseUnion; - export import FixedSizeBinary = type_.FixedSizeBinary; - export import FixedSizeList = type_.FixedSizeList; - export import Map_ = type_.Map_; - export import Dictionary = type_.Dictionary; -} - -export namespace vector { - export import Vector = vector_.Vector; - export import NullVector = vector_.NullVector; - export import BoolVector = vector_.BoolVector; - export import IntVector = vector_.IntVector; - export import FloatVector = vector_.FloatVector; - export import DateVector = vector_.DateVector; - export import DecimalVector = vector_.DecimalVector; - export import TimeVector = vector_.TimeVector; - export import TimestampVector = vector_.TimestampVector; - export import IntervalVector = vector_.IntervalVector; - export import BinaryVector = vector_.BinaryVector; - export import FixedSizeBinaryVector = vector_.FixedSizeBinaryVector; - export import Utf8Vector = vector_.Utf8Vector; - export import ListVector = vector_.ListVector; - export import FixedSizeListVector = vector_.FixedSizeListVector; - export import MapVector = vector_.MapVector; - export import StructVector = vector_.StructVector; - export import UnionVector = vector_.UnionVector; - export import DictionaryVector = vector_.DictionaryVector; -} - -export namespace visitor { - export import TypeVisitor = visitor_.TypeVisitor; - export import VectorVisitor = visitor_.VectorVisitor; -} - -export namespace view { - export import ChunkedView = view_.ChunkedView; - export import DictionaryView = view_.DictionaryView; - export import ListView = view_.ListView; - export import FixedSizeListView = view_.FixedSizeListView; - export import BinaryView = view_.BinaryView; - export import Utf8View = view_.Utf8View; - export import UnionView = view_.UnionView; - export import DenseUnionView = view_.DenseUnionView; - export import NestedView = view_.NestedView; - export import StructView = view_.StructView; - export import MapView = view_.MapView; - export import FlatView = view_.FlatView; - export import NullView = view_.NullView; - export import BoolView = view_.BoolView; - export import ValidityView = view_.ValidityView; - export import PrimitiveView = view_.PrimitiveView; - export import FixedSizeView = view_.FixedSizeView; - export import Float16View = view_.Float16View; - export import DateDayView = view_.DateDayView; - export import DateMillisecondView = view_.DateMillisecondView; - export import TimestampDayView = view_.TimestampDayView; - export import TimestampSecondView = view_.TimestampSecondView; - export import TimestampMillisecondView = view_.TimestampMillisecondView; - export import TimestampMicrosecondView = view_.TimestampMicrosecondView; - export import TimestampNanosecondView = view_.TimestampNanosecondView; - export import IntervalYearMonthView = view_.IntervalYearMonthView; - export import IntervalYearView = view_.IntervalYearView; - export import IntervalMonthView = view_.IntervalMonthView; -} - -export namespace predicate { - export import col = predicate_.col; - export import lit = predicate_.lit; - export import and = predicate_.and; - export import or = predicate_.or; - export import custom = predicate_.custom; - - export import Or = predicate_.Or; - export import Col = predicate_.Col; - export import And = predicate_.And; - export import Not = predicate_.Not; - export import GTeq = predicate_.GTeq; - export import LTeq = predicate_.LTeq; - export import Value = 
predicate_.Value; - export import Equals = predicate_.Equals; - export import Literal = predicate_.Literal; - export import Predicate = predicate_.Predicate; - - export import PredicateFunc = predicate_.PredicateFunc; -} - -/* These exports are needed for the closure and uglify umd targets */ -try { - let Arrow: any = eval('exports'); - if (Arrow && typeof Arrow === 'object') { - // string indexers tell closure and uglify not to rename these properties - Arrow['data'] = data; - Arrow['type'] = type; - Arrow['util'] = util; - Arrow['view'] = view; - Arrow['enum_'] = enum_; - Arrow['vector'] = vector; - Arrow['visitor'] = visitor; - Arrow['predicate'] = predicate; - - Arrow['read'] = read; - Arrow['readAsync'] = readAsync; - Arrow['readStream'] = readStream; - Arrow['fromReadableStream'] = fromReadableStream; - Arrow['readBuffersAsync'] = readBuffersAsync; - Arrow['readRecordBatchesAsync'] = readRecordBatchesAsync; - - Arrow['serializeFile'] = serializeFile; - Arrow['serializeStream'] = serializeStream; - - Arrow['Type'] = Type; - Arrow['Field'] = Field; - Arrow['Schema'] = Schema; - Arrow['Vector'] = Vector; - Arrow['RecordBatch'] = RecordBatch; - - Arrow['Table'] = Table; - Arrow['CountByResult'] = CountByResult; - } -} catch (e) { /* not the UMD bundle */ } -/* end umd exports */ - -// closure compiler erases static properties/methods: -// https://github.com/google/closure-compiler/issues/1776 -// set them via string indexers to save them from the mangler -Schema['from'] = Schema.from; -Table['from'] = Table.from; -Table['fromVectors'] = Table.fromVectors; -Table['fromAsync'] = Table.fromAsync; -Table['fromStruct'] = Table.fromStruct; -Table['empty'] = Table.empty; -Vector['create'] = Vector.create; -RecordBatch['from'] = RecordBatch.from; - -util_int_.Uint64['add'] = util_int_.Uint64.add; -util_int_.Uint64['multiply'] = util_int_.Uint64.multiply; -util_int_.Uint64['from'] = util_int_.Uint64.from; -util_int_.Uint64['fromNumber'] = util_int_.Uint64.fromNumber; -util_int_.Uint64['fromString'] = util_int_.Uint64.fromString; -util_int_.Uint64['convertArray'] = util_int_.Uint64.convertArray; - -util_int_.Int64['add'] = util_int_.Int64.add; -util_int_.Int64['multiply'] = util_int_.Int64.multiply; -util_int_.Int64['from'] = util_int_.Int64.from; -util_int_.Int64['fromNumber'] = util_int_.Int64.fromNumber; -util_int_.Int64['fromString'] = util_int_.Int64.fromString; -util_int_.Int64['convertArray'] = util_int_.Int64.convertArray; - -util_int_.Int128['add'] = util_int_.Int128.add; -util_int_.Int128['multiply'] = util_int_.Int128.multiply; -util_int_.Int128['from'] = util_int_.Int128.from; -util_int_.Int128['fromNumber'] = util_int_.Int128.fromNumber; -util_int_.Int128['fromString'] = util_int_.Int128.fromString; -util_int_.Int128['convertArray'] = util_int_.Int128.convertArray; - -data_.ChunkedData['computeOffsets'] = data_.ChunkedData.computeOffsets; - -(type_.Type as any)['NONE'] = type_.Type.NONE; -(type_.Type as any)['Null'] = type_.Type.Null; -(type_.Type as any)['Int'] = type_.Type.Int; -(type_.Type as any)['Float'] = type_.Type.Float; -(type_.Type as any)['Binary'] = type_.Type.Binary; -(type_.Type as any)['Utf8'] = type_.Type.Utf8; -(type_.Type as any)['Bool'] = type_.Type.Bool; -(type_.Type as any)['Decimal'] = type_.Type.Decimal; -(type_.Type as any)['Date'] = type_.Type.Date; -(type_.Type as any)['Time'] = type_.Type.Time; -(type_.Type as any)['Timestamp'] = type_.Type.Timestamp; -(type_.Type as any)['Interval'] = type_.Type.Interval; -(type_.Type as any)['List'] = type_.Type.List; 
-(type_.Type as any)['Struct'] = type_.Type.Struct; -(type_.Type as any)['Union'] = type_.Type.Union; -(type_.Type as any)['FixedSizeBinary'] = type_.Type.FixedSizeBinary; -(type_.Type as any)['FixedSizeList'] = type_.Type.FixedSizeList; -(type_.Type as any)['Map'] = type_.Type.Map; -(type_.Type as any)['Dictionary'] = type_.Type.Dictionary; -(type_.Type as any)['DenseUnion'] = type_.Type.DenseUnion; -(type_.Type as any)['SparseUnion'] = type_.Type.SparseUnion; - -type_.DataType['isNull'] = type_.DataType.isNull; -type_.DataType['isInt'] = type_.DataType.isInt; -type_.DataType['isFloat'] = type_.DataType.isFloat; -type_.DataType['isBinary'] = type_.DataType.isBinary; -type_.DataType['isUtf8'] = type_.DataType.isUtf8; -type_.DataType['isBool'] = type_.DataType.isBool; -type_.DataType['isDecimal'] = type_.DataType.isDecimal; -type_.DataType['isDate'] = type_.DataType.isDate; -type_.DataType['isTime'] = type_.DataType.isTime; -type_.DataType['isTimestamp'] = type_.DataType.isTimestamp; -type_.DataType['isInterval'] = type_.DataType.isInterval; -type_.DataType['isList'] = type_.DataType.isList; -type_.DataType['isStruct'] = type_.DataType.isStruct; -type_.DataType['isUnion'] = type_.DataType.isUnion; -type_.DataType['isDenseUnion'] = type_.DataType.isDenseUnion; -type_.DataType['isSparseUnion'] = type_.DataType.isSparseUnion; -type_.DataType['isFixedSizeBinary'] = type_.DataType.isFixedSizeBinary; -type_.DataType['isFixedSizeList'] = type_.DataType.isFixedSizeList; -type_.DataType['isMap'] = type_.DataType.isMap; -type_.DataType['isDictionary'] = type_.DataType.isDictionary; - -vector_.BoolVector['from'] = vector_.BoolVector.from; -vector_.DateVector['from'] = vector_.DateVector.from; -vector_.IntVector['from'] = vector_.IntVector.from; -vector_.FloatVector['from'] = vector_.FloatVector.from; - -visitor_.TypeVisitor['visitTypeInline'] = visitor_.TypeVisitor.visitTypeInline; -visitor_.VectorVisitor['visitTypeInline'] = visitor_.VectorVisitor.visitTypeInline; \ No newline at end of file +import * as util_buffer_ from './util/buffer'; +import * as util_vector_ from './util/vector'; +import * as predicate from './compute/predicate'; + +export { predicate }; +export const util = { + ...util_int_, + ...util_bit_, + ...util_buffer_, + ...util_vector_ +}; diff --git a/js/src/bin/arrow2csv.ts b/js/src/bin/arrow2csv.ts index 510f00740fed0..4ae9c0089a009 100644 --- a/js/src/bin/arrow2csv.ts +++ b/js/src/bin/arrow2csv.ts @@ -20,60 +20,189 @@ /* tslint:disable */ import * as fs from 'fs'; -import { promisify } from 'util'; -import { Table, readStream } from '../Arrow'; +import * as stream from 'stream'; +import { valueToString } from '../util/pretty'; +import { RecordBatch, RecordBatchReader, AsyncByteQueue } from '../Arrow.node'; -const readFile = promisify(fs.readFile); -const { parse } = require('json-bignum'); +const padLeft = require('pad-left'); +const bignumJSONParse = require('json-bignum').parse; +const pipeline = require('util').promisify(stream.pipeline); const argv = require(`command-line-args`)(cliOpts(), { partial: true }); -const files = [...(argv.file || []), ...(argv._unknown || [])].filter(Boolean); +const files = argv.help ? 
[] : [...(argv.file || []), ...(argv._unknown || [])].filter(Boolean); + +const state = { ...argv, closed: false, hasRecords: false }; (async () => { - let hasRecords = false; - if (files.length > 0) { - hasRecords = true; - for (let input of files) { - printTable(await readFile(input)); - } - } else { - let rowOffset = 0; - let maxColumnWidths: number[] = []; - for await (const recordBatch of readStream(process.stdin)) { - hasRecords = true; - recordBatch.rowsToString(' | ', rowOffset, maxColumnWidths).pipe(process.stdout); - rowOffset += recordBatch.length; + + const sources = argv.help ? [] : [ + ...files.map((file) => () => fs.createReadStream(file)), + ...(process.stdin.isTTY ? [] : [() => process.stdin]) + ].filter(Boolean) as (() => NodeJS.ReadableStream)[]; + + let reader: RecordBatchReader | null; + + for (const source of sources) { + if (state.closed) { break; } + if (reader = await createRecordBatchReader(source)) { + await pipeline( + reader.toNodeStream(), + recordBatchRowsToString(state), + process.stdout + ).catch(() => state.closed = true); } + if (state.closed) { break; } } - return hasRecords ? null : print_usage(); -})().catch((e) => { console.error(e); process.exit(1); }); -function printTable(input: any) { - let table: Table; + return state.hasRecords ? 0 : print_usage(); +})() +.then((x) => +x || 0, (err) => { + if (err) { + console.error(`${err && err.stack || err}`); + } + return process.exitCode || 1; +}).then((code) => process.exit(code)); + +async function createRecordBatchReader(createSourceStream: () => NodeJS.ReadableStream) { + + let json = new AsyncByteQueue(); + let stream = new AsyncByteQueue(); + let source = createSourceStream(); + let reader: RecordBatchReader | null = null; + // tee the input source, just in case it's JSON + source.on('end', () => [stream, json].forEach((y) => y.close())) + .on('data', (x) => [stream, json].forEach((y) => y.write(x))) + .on('error', (e) => [stream, json].forEach((y) => y.abort(e))); + try { - table = Table.from(input); - } catch (e) { - table = Table.from(parse(input + '')); + reader = await (await RecordBatchReader.from(stream)).open(); + } catch (e) { reader = null; } + + if (!reader || reader.closed) { + reader = null; + await json.closed; + if (source instanceof fs.ReadStream) { source.close(); } + // If the data in the `json` ByteQueue parses to JSON, then assume it's Arrow JSON from a file or stdin + try { + reader = await (await RecordBatchReader.from(bignumJSONParse(await json.toString()))).open(); + } catch (e) { reader = null; } + } + + return (reader && !reader.closed) ? reader : null; +} + +function recordBatchRowsToString(state: { closed: boolean, schema: any, separator: string, hasRecords: boolean }) { + + let rowId = 0, maxColWidths = [15], separator = `${state.separator || ' |'} `; + + return new stream.Transform({ transform, encoding: 'utf8', writableObjectMode: true, readableObjectMode: false }); + + function transform(this: stream.Transform, batch: RecordBatch, _enc: string, cb: (error?: Error, data?: any) => void) { + batch = !(state.schema && state.schema.length) ? 
batch : batch.select(...state.schema); + if (batch.length <= 0 || batch.numCols <= 0 || state.closed) { + state.hasRecords || (state.hasRecords = false); + return cb(undefined, null); + } + + state.hasRecords = true; + const header = ['row_id', ...batch.schema.fields.map((f) => `${f}`)].map(valueToString); + + // Pass one to convert to strings and count max column widths + const newMaxWidths = measureColumnWidths(rowId, batch, header.map((x, i) => Math.max(maxColWidths[i] || 0, x.length))); + + // If any of the column widths changed, print the header again + if ((rowId % 350) && JSON.stringify(newMaxWidths) !== JSON.stringify(maxColWidths)) { + this.push(`\n${formatRow(header, newMaxWidths, separator)}`); + } + + maxColWidths = newMaxWidths; + + for (const row of batch) { + if (state.closed) { break; } + else if (!row) { continue; } + if (!(rowId % 350)) { this.push(`\n${formatRow(header, maxColWidths, separator)}`); } + this.push(formatRow([rowId++, ...row].map(valueToString), maxColWidths, separator)); + } + cb(); } - if (argv.schema && argv.schema.length) { - table = table.select(...argv.schema); +} + +function formatRow(row: string[] = [], maxColWidths: number[] = [], separator: string = ' |') { + return row.map((x, j) => padLeft(x, maxColWidths[j])).join(separator) + '\n'; +} + +function measureColumnWidths(rowId: number, batch: RecordBatch, maxColWidths: number[] = []) { + for (const row of batch) { + if (!row) { continue; } + maxColWidths[0] = Math.max(maxColWidths[0] || 0, (`${rowId++}`).length); + for (let val: any, j = -1, k = row.length; ++j < k;) { + if (ArrayBuffer.isView(val = row[j]) && (typeof val[Symbol.toPrimitive] !== 'function')) { + // If we're printing a column of TypedArrays, ensure the column is wide enough to accommodate + // the widest possible element for a given byte size, since JS omits leading zeroes. 
For example: + // 1 | [1137743649,2170567488,244696391,2122556476] + // 2 | null + // 3 | [637174007,2142281880,961736230,2912449282] + // 4 | [1035112265,21832886,412842672,2207710517] + // 5 | null + // 6 | null + // 7 | [2755142991,4192423256,2994359,467878370] + const elementWidth = typedArrayElementWidths.get(val.constructor)!; + + maxColWidths[j + 1] = Math.max(maxColWidths[j + 1] || 0, + 2 + // brackets on each end + (val.length - 1) + // commas between elements + (val.length * elementWidth) // width of stringified 2^N-1 + ); + } else { + maxColWidths[j + 1] = Math.max(maxColWidths[j + 1] || 0, valueToString(val).length); + } + } } - table.rowsToString().pipe(process.stdout); + return maxColWidths; } +// Measure the stringified representation of 2^N-1 for each TypedArray variant +const typedArrayElementWidths = (() => { + const maxElementWidth = (ArrayType: any) => { + const octets = Array.from({ length: ArrayType.BYTES_PER_ELEMENT - 1 }, _ => 255); + return `${new ArrayType(new Uint8Array([...octets, 254]).buffer)[0]}`.length; + }; + return new Map([ + [Int8Array, maxElementWidth(Int8Array)], + [Int16Array, maxElementWidth(Int16Array)], + [Int32Array, maxElementWidth(Int32Array)], + [Uint8Array, maxElementWidth(Uint8Array)], + [Uint16Array, maxElementWidth(Uint16Array)], + [Uint32Array, maxElementWidth(Uint32Array)], + [Float32Array, maxElementWidth(Float32Array)], + [Float64Array, maxElementWidth(Float64Array)], + [Uint8ClampedArray, maxElementWidth(Uint8ClampedArray)] + ]) +})(); + function cliOpts() { return [ { type: String, name: 'schema', alias: 's', optional: true, multiple: true, - typeLabel: '[underline]{columns}', + typeLabel: '{underline columns}', description: 'A space-delimited list of column names' }, { type: String, name: 'file', alias: 'f', - optional: false, multiple: true, + optional: true, multiple: true, description: 'The Arrow file to read' + }, + { + type: String, + name: 'sep', optional: true, default: '|', + description: 'The column separator character' + }, + { + type: Boolean, + name: 'help', optional: true, default: false, + description: 'Print this usage guide.' } ]; } @@ -87,34 +216,29 @@ function print_usage() { { header: 'Synopsis', content: [ - '$ arrow2csv [underline]{file.arrow} [[bold]{--schema} column_name ...]', - '$ arrow2csv [[bold]{--schema} column_name ...] [[bold]{--file} [underline]{file.arrow}]', - '$ arrow2csv [bold]{-s} column_1 [bold]{-s} column_2 [[bold]{-f} [underline]{file.arrow}]', - '$ arrow2csv [[bold]{--help}]' + '$ arrow2csv {underline file.arrow} [{bold --schema} column_name ...]', + '$ arrow2csv [{bold --schema} column_name ...] [{bold --file} {underline file.arrow}]', + '$ arrow2csv {bold -s} column_1 {bold -s} column_2 [{bold -f} {underline file.arrow}]', + '$ arrow2csv [{bold --help}]' ] }, { header: 'Options', - optionList: [ - ...cliOpts(), - { - name: 'help', - description: 'Print this usage guide.' 
- } - ] + optionList: cliOpts() }, { header: 'Example', content: [ - '$ arrow2csv --schema foo baz -f simple.arrow', - '> foo, baz', - '> 1, aa', - '> null, null', - '> 3, null', - '> 4, bbb', - '> 5, cccc', + '$ arrow2csv --schema foo baz -f simple.arrow --sep ","', + ' ', + '> "row_id", "foo: Int32", "bar: Float64", "baz: Utf8"', + '> 0, 1, 1, "aa"', + '> 1, null, null, null', + '> 2, 3, null, null', + '> 3, 4, 4, "bbb"', + '> 4, 5, 5, "cccc"', ] } ])); - process.exit(1); -} \ No newline at end of file + return 1; +} diff --git a/js/src/column.ts b/js/src/column.ts new file mode 100644 index 0000000000000..0a5bc36797bf9 --- /dev/null +++ b/js/src/column.ts @@ -0,0 +1,100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Field } from './schema'; +import { Vector } from './vector'; +import { DataType } from './type'; +import { Clonable, Sliceable, Applicative } from './vector'; +import { Chunked, SearchContinuation } from './vector/chunked'; + +export interface Column { + typeId: T['TType']; + concat(...others: Vector[]): Column; + slice(begin?: number, end?: number): Column; + clone(chunks?: Vector[], offsets?: Uint32Array): Column; +} + +export class Column + extends Chunked + implements Clonable>, + Sliceable>, + Applicative> { + + constructor(field: Field, vectors: Vector[] = [], offsets?: Uint32Array) { + vectors = Chunked.flatten(...vectors); + super(field.type, vectors, offsets); + this._field = field; + if (vectors.length === 1 && !(this instanceof SingleChunkColumn)) { + return new SingleChunkColumn(field, vectors[0], this._chunkOffsets); + } + } + + protected _field: Field; + protected _children?: Column[]; + + public get field() { return this._field; } + public get name() { return this._field.name; } + + public clone(chunks = this._chunks) { + return new Column(this._field, chunks); + } + + public getChildAt(index: number): Column | null { + + if (index < 0 || index >= this.numChildren) { return null; } + + let columns = this._children || (this._children = []); + let column: Column, field: Field, chunks: Vector[]; + + if (column = columns[index]) { return column; } + if (field = ((this.type.children || [])[index] as Field)) { + chunks = this._chunks + .map((vector) => vector.getChildAt(index)) + .filter((vec): vec is Vector => vec != null); + if (chunks.length > 0) { + return (columns[index] = new Column(field, chunks)); + } + } + + return null; + } +} + +class SingleChunkColumn extends Column { + protected _chunk: Vector; + constructor(field: Field, vector: Vector, offsets?: Uint32Array) { + super(field, [vector], offsets); + this._chunk = vector; + } + public search(index: number): [number, number] | null; + public search>>(index: number, then?: N): ReturnType; + public search>>(index: number, then?: N) { + return 
then ? then(this, 0, index) : [0, index]; + } + public isValid(index: number): boolean { + return this._chunk.isValid(index); + } + public get(index: number): T['TValue'] | null { + return this._chunk.get(index); + } + public set(index: number, value: T['TValue'] | null): void { + this._chunk.set(index, value); + } + public indexOf(element: T['TValue'], offset?: number): number { + return this._chunk.indexOf(element, offset); + } +} diff --git a/js/src/compute/dataframe.ts b/js/src/compute/dataframe.ts new file mode 100644 index 0000000000000..01026d882f0c0 --- /dev/null +++ b/js/src/compute/dataframe.ts @@ -0,0 +1,209 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Table } from '../table'; +import { Vector } from '../vector'; +import { IntVector } from '../vector/int'; +import { Field, Schema } from '../schema'; +import { Vector as V } from '../interfaces'; +import { Predicate, Col } from './predicate'; +import { RecordBatch } from '../recordbatch'; +import { DataType, Int, Struct, Dictionary } from '../type'; + +/** @ignore */ +export type BindFunc = (batch: RecordBatch) => void; +/** @ignore */ +export type NextFunc = (idx: number, batch: RecordBatch) => void; + +Table.prototype.countBy = function(this: Table, name: Col | string) { return new DataFrame(this.chunks).countBy(name); }; +Table.prototype.scan = function(this: Table, next: NextFunc, bind?: BindFunc) { return new DataFrame(this.chunks).scan(next, bind); }; +Table.prototype.filter = function(this: Table, predicate: Predicate): FilteredDataFrame { return new DataFrame(this.chunks).filter(predicate); }; + +export class DataFrame extends Table { + public filter(predicate: Predicate): FilteredDataFrame { + return new FilteredDataFrame(this.chunks, predicate); + } + public scan(next: NextFunc, bind?: BindFunc) { + const batches = this.chunks, numBatches = batches.length; + for (let batchIndex = -1; ++batchIndex < numBatches;) { + // load batches + const batch = batches[batchIndex]; + if (bind) { bind(batch); } + // yield all indices + for (let index = -1, numRows = batch.length; ++index < numRows;) { + next(index, batch); + } + } + } + public countBy(name: Col | string) { + const batches = this.chunks, numBatches = batches.length; + const count_by = typeof name === 'string' ? 
new Col(name) : name as Col; + // Assume that all dictionary batches are deltas, which means that the + // last record batch has the most complete dictionary + count_by.bind(batches[numBatches - 1]); + const vector = count_by.vector as V; + if (!DataType.isDictionary(vector.type)) { + throw new Error('countBy currently only supports dictionary-encoded columns'); + } + + const countByteLength = Math.ceil(Math.log(vector.dictionary.length) / Math.log(256)); + const CountsArrayType = countByteLength == 4 ? Uint32Array : + countByteLength >= 2 ? Uint16Array : Uint8Array; + + const counts = new CountsArrayType(vector.dictionary.length); + for (let batchIndex = -1; ++batchIndex < numBatches;) { + // load batches + const batch = batches[batchIndex]; + // rebind the countBy Col + count_by.bind(batch); + const keys = (count_by.vector as V).indices; + // yield all indices + for (let index = -1, numRows = batch.length; ++index < numRows;) { + let key = keys.get(index); + if (key !== null) { counts[key]++; } + } + } + return new CountByResult(vector.dictionary, IntVector.from(counts)); + } +} + +export class CountByResult extends Table<{ values: T, counts: TCount }> { + constructor(values: Vector, counts: V) { + const schema = new Schema<{ values: T, counts: TCount }>([ + new Field('values', values.type), + new Field('counts', counts.type) + ]); + super(new RecordBatch(schema, counts.length, [values, counts])); + } + public toJSON(): Object { + const values = this.getColumnAt(0)!; + const counts = this.getColumnAt(1)!; + const result = {} as { [k: string]: number | null }; + for (let i = -1; ++i < this.length;) { + result[values.get(i)] = counts.get(i); + } + return result; + } +} + +export class FilteredDataFrame extends DataFrame { + private _predicate: Predicate; + constructor (batches: RecordBatch[], predicate: Predicate) { + super(batches); + this._predicate = predicate; + } + public scan(next: NextFunc, bind?: BindFunc) { + // inlined version of this: + // this.parent.scan((idx, columns) => { + // if (this.predicate(idx, columns)) next(idx, columns); + // }); + const batches = this._chunks; + const numBatches = batches.length; + for (let batchIndex = -1; ++batchIndex < numBatches;) { + // load batches + const batch = batches[batchIndex]; + // TODO: bind batches lazily + // If predicate doesn't match anything in the batch we don't need + // to bind the callback + if (bind) { bind(batch); } + const predicate = this._predicate.bind(batch); + // yield all indices + for (let index = -1, numRows = batch.length; ++index < numRows;) { + if (predicate(index, batch)) { next(index, batch); } + } + } + } + public count(): number { + // inlined version of this: + // let sum = 0; + // this.parent.scan((idx, columns) => { + // if (this.predicate(idx, columns)) ++sum; + // }); + // return sum; + let sum = 0; + const batches = this._chunks; + const numBatches = batches.length; + for (let batchIndex = -1; ++batchIndex < numBatches;) { + // load batches + const batch = batches[batchIndex]; + const predicate = this._predicate.bind(batch); + // yield all indices + for (let index = -1, numRows = batch.length; ++index < numRows;) { + if (predicate(index, batch)) { ++sum; } + } + } + return sum; + } + public *[Symbol.iterator](): IterableIterator['TValue']> { + // inlined version of this: + // this.parent.scan((idx, columns) => { + // if (this.predicate(idx, columns)) next(idx, columns); + // }); + const batches = this._chunks; + const numBatches = batches.length; + for (let batchIndex = -1; ++batchIndex < numBatches;) 
{ + // load batches + const batch = batches[batchIndex]; + // TODO: bind batches lazily + // If predicate doesn't match anything in the batch we don't need + // to bind the callback + const predicate = this._predicate.bind(batch); + // yield all indices + for (let index = -1, numRows = batch.length; ++index < numRows;) { + if (predicate(index, batch)) { yield batch.get(index) as any; } + } + } + } + public filter(predicate: Predicate): FilteredDataFrame { + return new FilteredDataFrame( + this._chunks, + this._predicate.and(predicate) + ); + } + public countBy(name: Col | string) { + const batches = this._chunks, numBatches = batches.length; + const count_by = typeof name === 'string' ? new Col(name) : name as Col; + // Assume that all dictionary batches are deltas, which means that the + // last record batch has the most complete dictionary + count_by.bind(batches[numBatches - 1]); + const vector = count_by.vector as V; + if (!DataType.isDictionary(vector.type)) { + throw new Error('countBy currently only supports dictionary-encoded columns'); + } + + const countByteLength = Math.ceil(Math.log(vector.dictionary.length) / Math.log(256)); + const CountsArrayType = countByteLength == 4 ? Uint32Array : + countByteLength >= 2 ? Uint16Array : Uint8Array; + + const counts = new CountsArrayType(vector.dictionary.length); + + for (let batchIndex = -1; ++batchIndex < numBatches;) { + // load batches + const batch = batches[batchIndex]; + const predicate = this._predicate.bind(batch); + // rebind the countBy Col + count_by.bind(batch); + const keys = (count_by.vector as V).indices; + // yield all indices + for (let index = -1, numRows = batch.length; ++index < numRows;) { + let key = keys.get(index); + if (key !== null && predicate(index, batch)) { counts[key]++; } + } + } + return new CountByResult(vector.dictionary, IntVector.from(counts)); + } +} diff --git a/js/src/predicate.ts b/js/src/compute/predicate.ts similarity index 94% rename from js/src/predicate.ts rename to js/src/compute/predicate.ts index cfae73ae0af73..ec947d2670c81 100644 --- a/js/src/predicate.ts +++ b/js/src/compute/predicate.ts @@ -15,12 +15,16 @@ // specific language governing permissions and limitations // under the License. 
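For orientation, the predicate classes below are what the FilteredDataFrame above is built on. A minimal sketch of how the two compose (column names, values, and the dataframe import path are hypothetical; only Col, Value.eq, and the FilteredDataFrame members shown in this patch are assumed):

import { RecordBatch } from './recordbatch';              // path as in this patch
import { Col } from './compute/predicate';                 // path as in this patch
import { FilteredDataFrame } from './compute/dataframe';   // hypothetical path

declare const batches: RecordBatch[];  // record batches of one table (assumed to exist)

const filtered = new FilteredDataFrame(batches, new Col('origin').eq('SEA'));
filtered.count();                           // number of rows matching the predicate
filtered
    .filter(new Col('delay').eq(0))         // predicates compose via Predicate.and()
    .countBy('destination')                 // must name a dictionary-encoded column
    .toJSON();                              // -> { [dictionary value]: count }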
-import { RecordBatch } from './recordbatch'; -import { Vector, DictionaryVector } from './vector'; +import { Vector } from '../vector'; +import { RecordBatch } from '../recordbatch'; +import { DictionaryVector } from '../vector/dictionary'; +/** @ignore */ export type ValueFunc = (idx: number, cols: RecordBatch) => T | null; +/** @ignore */ export type PredicateFunc = (idx: number, cols: RecordBatch) => boolean; +/** @ignore */ export abstract class Value { eq(other: Value | T): Predicate { if (!(other instanceof Value)) { other = new Literal(other); } @@ -45,10 +49,12 @@ export abstract class Value { } } +/** @ignore */ export class Literal extends Value { constructor(public v: T) { super(); } } +/** @ignore */ export class Col extends Value { // @ts-ignore public vector: Vector; @@ -56,7 +62,7 @@ export class Col extends Value { public colidx: number; constructor(public name: string) { super(); } - bind(batch: RecordBatch) { + bind(batch: RecordBatch): (idx: number, batch?: RecordBatch) => any { if (!this.colidx) { // Assume column index doesn't change between calls to bind //this.colidx = cols.findIndex(v => v.name.indexOf(this.name) != -1); @@ -70,11 +76,13 @@ export class Col extends Value { } if (this.colidx < 0) { throw new Error(`Failed to bind Col "${this.name}"`); } } - this.vector = batch.getChildAt(this.colidx)!; - return this.vector.get.bind(this.vector); + + const vec = this.vector = batch.getChildAt(this.colidx)!; + return (idx: number) => vec.get(idx); } } +/** @ignore */ export abstract class Predicate { abstract bind(batch: RecordBatch): PredicateFunc; and(...expr: Predicate[]): And { return new And(this, ...expr); } @@ -82,6 +90,7 @@ export abstract class Predicate { not(): Predicate { return new Not(this); } } +/** @ignore */ export abstract class ComparisonPredicate extends Predicate { constructor(public readonly left: Value, public readonly right: Value) { super(); @@ -110,8 +119,9 @@ export abstract class ComparisonPredicate extends Predicate { protected abstract _bindLitCol(batch: RecordBatch, lit: Literal, col: Col): PredicateFunc; } +/** @ignore */ export abstract class CombinationPredicate extends Predicate { - readonly children: Predicate[] + readonly children: Predicate[]; constructor(...children: Predicate[]) { super(); this.children = children; @@ -120,12 +130,13 @@ export abstract class CombinationPredicate extends Predicate { // add children to protoype so it doesn't get mangled in es2015/umd ( CombinationPredicate.prototype).children = Object.freeze([]); // freeze for safety +/** @ignore */ export class And extends CombinationPredicate { constructor(...children: Predicate[]) { // Flatten any Ands children = children.reduce((accum: Predicate[], p: Predicate): Predicate[] => { - return accum.concat(p instanceof And ? p.children : p) - }, []) + return accum.concat(p instanceof And ? p.children : p); + }, []); super(...children); } bind(batch: RecordBatch) { @@ -134,12 +145,13 @@ export class And extends CombinationPredicate { } } +/** @ignore */ export class Or extends CombinationPredicate { constructor(...children: Predicate[]) { // Flatten any Ors children = children.reduce((accum: Predicate[], p: Predicate): Predicate[] => { - return accum.concat(p instanceof Or ? p.children : p) - }, []) + return accum.concat(p instanceof Or ? 
p.children : p); + }, []); super(...children); } bind(batch: RecordBatch) { @@ -148,6 +160,7 @@ export class Or extends CombinationPredicate { } } +/** @ignore */ export class Equals extends ComparisonPredicate { // Helpers used to cache dictionary reverse lookups between calls to bind private lastDictionary: Vector|undefined; @@ -200,6 +213,7 @@ export class Equals extends ComparisonPredicate { } } +/** @ignore */ export class LTeq extends ComparisonPredicate { protected _bindLitLit(_batch: RecordBatch, left: Literal, right: Literal): PredicateFunc { const rtrn: boolean = left.v <= right.v; @@ -223,6 +237,7 @@ export class LTeq extends ComparisonPredicate { } } +/** @ignore */ export class GTeq extends ComparisonPredicate { protected _bindLitLit(_batch: RecordBatch, left: Literal, right: Literal): PredicateFunc { const rtrn: boolean = left.v >= right.v; @@ -246,6 +261,7 @@ export class GTeq extends ComparisonPredicate { } } +/** @ignore */ export class Not extends Predicate { constructor(public readonly child: Predicate) { super(); @@ -257,6 +273,7 @@ export class Not extends Predicate { } } +/** @ignore */ export class CustomPredicate extends Predicate { constructor(private next: PredicateFunc, private bind_: (batch: RecordBatch) => void) { super(); diff --git a/js/src/data.ts b/js/src/data.ts index 5a117594bc89e..b55321bf98ec2 100644 --- a/js/src/data.ts +++ b/js/src/data.ts @@ -15,317 +15,231 @@ // specific language governing permissions and limitations // under the License. +import { Vector } from './vector'; import { popcnt_bit_range } from './util/bit'; -import { VectorLike, Vector } from './vector'; -import { Int, Bool, FlatListType, List, Struct, Map_ } from './type'; -import { VectorType, TypedArray, TypedArrayConstructor, Dictionary } from './type'; -import { DataType, FlatType, ListType, NestedType, SingleNestedType, DenseUnion, SparseUnion } from './type'; +import { toArrayBufferView } from './util/buffer'; +import { DataType, SparseUnion, DenseUnion } from './type'; +import { VectorType as BufferType, UnionMode, Type } from './enum'; +import { + Dictionary, + Null, Int, Float, + Binary, Bool, Utf8, Decimal, + Date_, Time, Timestamp, Interval, + List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, +} from './type'; -export function toTypedArray(ArrayType: TypedArrayConstructor, values?: T | ArrayLike | Iterable | null): T { - if (!ArrayType && ArrayBuffer.isView(values)) { return values; } - return values instanceof ArrayType ? values - : !values || !ArrayBuffer.isView(values) ? 
ArrayType.from(values || []) - : new ArrayType(values.buffer, values.byteOffset, values.byteLength / ArrayType.BYTES_PER_ELEMENT); -} - -export type Data = DataTypes[T['TType']] & BaseData; -export interface DataTypes { -/* [Type.NONE]*/ 0: BaseData; -/* [Type.Null]*/ 1: FlatData; -/* [Type.Int]*/ 2: FlatData; -/* [Type.Float]*/ 3: FlatData; -/* [Type.Binary]*/ 4: FlatListData; -/* [Type.Utf8]*/ 5: FlatListData; -/* [Type.Bool]*/ 6: BoolData; -/* [Type.Decimal]*/ 7: FlatData; -/* [Type.Date]*/ 8: FlatData; -/* [Type.Time]*/ 9: FlatData; -/* [Type.Timestamp]*/ 10: FlatData; -/* [Type.Interval]*/ 11: FlatData; -/* [Type.List]*/ 12: ListData>; -/* [Type.Struct]*/ 13: NestedData; -/* [Type.Union]*/ 14: UnionData; -/* [Type.FixedSizeBinary]*/ 15: FlatData; -/* [Type.FixedSizeList]*/ 16: SingleNestedData; -/* [Type.Map]*/ 17: NestedData; -/* [Type.DenseUnion]*/ DenseUnion: DenseUnionData; -/*[Type.SparseUnion]*/ SparseUnion: SparseUnionData; -/*[ Type.Dictionary]*/ Dictionary: DictionaryData; -} // When slicing, we do not know the null count of the sliced range without // doing some computation. To avoid doing this eagerly, we set the null count -// to -1 (any negative number will do). When Array::null_count is called the +// to -1 (any negative number will do). When Vector.nullCount is called the // first time, the null count will be computed. See ARROW-33 -export type kUnknownNullCount = -1; -export const kUnknownNullCount = -1; +/** @ignore */ export type kUnknownNullCount = -1; +/** @ignore */ export const kUnknownNullCount = -1; -export class BaseData implements VectorLike { - public type: T; - public length: number; - public offset: number; - // @ts-ignore - public childData: Data[]; - protected _nullCount: number | kUnknownNullCount; - protected /* [VectorType.OFFSET]:*/ 0?: Int32Array; - protected /* [VectorType.DATA]:*/ 1?: T['TArray']; - protected /*[VectorType.VALIDITY]:*/ 2?: Uint8Array; - protected /* [VectorType.TYPE]:*/ 3?: Int8Array; - constructor(type: T, length: number, offset?: number, nullCount?: number) { - this.type = type; - this.length = Math.floor(Math.max(length || 0, 0)); - this.offset = Math.floor(Math.max(offset || 0, 0)); - this._nullCount = Math.floor(Math.max(nullCount || 0, -1)); - } - public get typeId() { return this.type.TType; } - public get nullBitmap() { return this[VectorType.VALIDITY]; } - public get nullCount() { - let nullCount = this._nullCount; - let nullBitmap: Uint8Array | undefined; - if (nullCount === -1 && (nullBitmap = this[VectorType.VALIDITY])) { - this._nullCount = nullCount = this.length - popcnt_bit_range(nullBitmap, this.offset, this.offset + this.length); - } - return nullCount; - } - public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount): Data { - return new BaseData(type, length, offset, nullCount) as any; - } - public slice(offset: number, length: number) { - return length <= 0 ? 
this : this.sliceInternal(this.clone( - this.type, length, this.offset + offset, +(this._nullCount === 0) - 1 - ) as any, offset, length); - } - protected sliceInternal(clone: this, offset: number, length: number) { - let arr: any; - // If typeIds exist, slice the typeIds buffer - (arr = this[VectorType.TYPE]) && (clone[VectorType.TYPE] = this.sliceData(arr, offset, length)); - // If offsets exist, only slice the offsets buffer - (arr = this[VectorType.OFFSET]) && (clone[VectorType.OFFSET] = this.sliceOffsets(arr, offset, length)) || - // Otherwise if no offsets, slice the data buffer - (arr = this[VectorType.DATA]) && (clone[VectorType.DATA] = this.sliceData(arr, offset, length)); - return clone; - } - protected sliceData(data: T['TArray'] & TypedArray, offset: number, length: number) { - return data.subarray(offset, offset + length); - } - protected sliceOffsets(valueOffsets: Int32Array, offset: number, length: number) { - return valueOffsets.subarray(offset, offset + length + 1); - } -} +/** @ignore */ export type NullBuffer = Uint8Array | null | undefined; +/** @ignore */ export type TypeIdsBuffer = Int8Array | ArrayLike | Iterable; +/** @ignore */ export type ValueOffsetsBuffer = Int32Array | ArrayLike | Iterable; +/** @ignore */ export type DataBuffer = T['TArray'] | ArrayLike | Iterable; -export class FlatData extends BaseData { - public /* [VectorType.DATA]:*/ 1: T['TArray']; - public /*[VectorType.VALIDITY]:*/ 2: Uint8Array; - public get values() { return this[VectorType.DATA]; } - constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, data: Iterable, offset?: number, nullCount?: number) { - super(type, length, offset, nullCount); - this[VectorType.DATA] = toTypedArray(this.ArrayType, data); - this[VectorType.VALIDITY] = toTypedArray(Uint8Array, nullBitmap); - } - public get ArrayType(): T['ArrayType'] { return this.type.ArrayType; } - public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { - return new (this.constructor as any)(type, length, this[VectorType.VALIDITY], this[VectorType.DATA], offset, nullCount) as FlatData; - } +/** @ignore */ +export interface Buffers { + [BufferType.OFFSET]: Int32Array; + [BufferType.DATA]: T['TArray']; + [BufferType.VALIDITY]: Uint8Array; + [BufferType.TYPE]: T['TArray']; } -export class BoolData extends FlatData { - protected sliceData(data: Uint8Array) { return data; } +/** @ignore */ +export interface Data { + readonly TType: T['TType']; + readonly TArray: T['TArray']; + readonly TValue: T['TValue']; } -export class FlatListData extends FlatData { - public /* [VectorType.OFFSET]:*/ 0: Int32Array; - public /* [VectorType.DATA]:*/ 1: T['TArray']; - public /*[VectorType.VALIDITY]:*/ 2: Uint8Array; - public get values() { return this[VectorType.DATA]; } - public get valueOffsets() { return this[VectorType.OFFSET]; } - constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, valueOffsets: Iterable, data: T['TArray'], offset?: number, nullCount?: number) { - super(type, length, nullBitmap, data, offset, nullCount); - this[VectorType.OFFSET] = toTypedArray(Int32Array, valueOffsets); - } - public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { - return new FlatListData(type, length, this[VectorType.VALIDITY], this[VectorType.OFFSET], this[VectorType.DATA], offset, nullCount) as FlatListData; - } -} +/** @ignore */ +export class Data { -export class DictionaryData extends BaseData> { - protected _dictionary: Vector; - 
protected _indices: Data>; - public get indices() { return this._indices; } - public get dictionary() { return this._dictionary; } - constructor(type: Dictionary, dictionary: Vector, indices: Data>) { - super(type, indices.length, indices.offset, (indices as any)._nullCount); - this._indices = indices; - this._dictionary = dictionary; - } - public get nullCount() { return this._indices.nullCount; } - public get nullBitmap() { return this._indices.nullBitmap; } - public clone>(type: R, length = this.length, offset = this.offset) { - const data = this._dictionary.data.clone(type.dictionary as any); - return new DictionaryData( - this.type as any, - this._dictionary.clone(data) as any, - this._indices.slice(offset - this.offset, length) - ) as any; - } - protected sliceInternal(clone: this, _offset: number, _length: number) { - clone.length = clone._indices.length; - clone._nullCount = (clone._indices as any)._nullCount; - return clone; - } -} + public readonly type: T; + public readonly length: number; + public readonly offset: number; + public readonly stride: number; + public readonly childData: Data[]; + public readonly values: Buffers[BufferType.DATA]; + public readonly typeIds: Buffers[BufferType.TYPE]; + // @ts-ignore + public readonly nullBitmap: Buffers[BufferType.VALIDITY]; + // @ts-ignore + public readonly valueOffsets: Buffers[BufferType.OFFSET]; -export class NestedData extends BaseData { - public /*[VectorType.VALIDITY]:*/ 2: Uint8Array; - constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, childData: Data[], offset?: number, nullCount?: number) { - super(type, length, offset, nullCount); - this.childData = childData; - this[VectorType.VALIDITY] = toTypedArray(Uint8Array, nullBitmap); - } - public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount): Data { - return new NestedData(type, length, this[VectorType.VALIDITY], this.childData, offset, nullCount) as any; + public get ArrayType() { return this.type.ArrayType; } + public get typeId(): T['TType'] { return this.type.typeId; } + public get buffers() { + return [this.valueOffsets, this.values, this.nullBitmap, this.typeIds] as Buffers; } - protected sliceInternal(clone: this, offset: number, length: number) { - if (!this[VectorType.OFFSET]) { - clone.childData = this.childData.map((child) => child.slice(offset, length)); + + protected _nullCount: number | kUnknownNullCount; + + public get nullCount() { + let nullCount = this._nullCount; + let nullBitmap: Uint8Array | undefined; + if (nullCount <= kUnknownNullCount && (nullBitmap = this.nullBitmap)) { + this._nullCount = nullCount = this.length - popcnt_bit_range(nullBitmap, this.offset, this.offset + this.length); } - return super.sliceInternal(clone, offset, length); + return nullCount; } -} -export class SingleNestedData extends NestedData { - protected _valuesData: Data; - public get values() { return this._valuesData; } - constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, valueChildData: Data, offset?: number, nullCount?: number) { - super(type, length, nullBitmap, [valueChildData], offset, nullCount); - this._valuesData = valueChildData; + constructor(type: T, offset: number, length: number, nullCount?: number, buffers?: Partial> | Data, childData?: (Data | Vector)[]) { + this.type = type; + this.offset = Math.floor(Math.max(offset || 0, 0)); + this.length = Math.floor(Math.max(length || 0, 0)); + this._nullCount = Math.floor(Math.max(nullCount || 0, -1)); + this.childData = 
(childData || []).map((x) => x instanceof Data ? x : x.data) as Data[]; + let buffer: Buffers[keyof Buffers]; + if (buffers instanceof Data) { + this.stride = buffers.stride; + this.values = buffers.values; + this.typeIds = buffers.typeIds; + this.nullBitmap = buffers.nullBitmap; + this.valueOffsets = buffers.valueOffsets; + } else { + if (buffers) { + (buffer = (buffers as Buffers)[0]) && (this.valueOffsets = buffer); + (buffer = (buffers as Buffers)[1]) && (this.values = buffer); + (buffer = (buffers as Buffers)[2]) && (this.nullBitmap = buffer); + (buffer = (buffers as Buffers)[3]) && (this.typeIds = buffer); + } + const t: any = type; + switch (type.typeId) { + case Type.Decimal: this.stride = 4; break; + case Type.Timestamp: this.stride = 2; break; + case Type.Date: this.stride = 1 + (t as Date_).unit; break; + case Type.Interval: this.stride = 1 + (t as Interval).unit; break; + case Type.Int: this.stride = 1 + +((t as Int).bitWidth > 32); break; + case Type.Time: this.stride = 1 + +((t as Time).bitWidth > 32); break; + case Type.FixedSizeList: this.stride = (t as FixedSizeList).listSize; break; + case Type.FixedSizeBinary: this.stride = (t as FixedSizeBinary).byteWidth; break; + default: this.stride = 1; + } + } } -} -export class ListData extends SingleNestedData { - public /* [VectorType.OFFSET]:*/ 0: Int32Array; - public /*[VectorType.VALIDITY]:*/ 2: Uint8Array; - public get valueOffsets() { return this[VectorType.OFFSET]; } - constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, valueOffsets: Iterable, valueChildData: Data, offset?: number, nullCount?: number) { - super(type, length, nullBitmap, valueChildData, offset, nullCount); - this[VectorType.OFFSET] = toTypedArray(Int32Array, valueOffsets); + public clone(type: R, offset = this.offset, length = this.length, nullCount = this._nullCount, buffers: Buffers = this, childData: (Data | Vector)[] = this.childData) { + return new Data(type, offset, length, nullCount, buffers, childData); } - public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount): Data { - return new ListData(type, length, this[VectorType.VALIDITY], this[VectorType.OFFSET], this._valuesData as any, offset, nullCount) as any; - } -} -export class UnionData extends NestedData { - public /* [VectorType.TYPE]:*/ 3: T['TArray']; - public get typeIds() { return this[VectorType.TYPE]; } - public readonly typeIdToChildIndex: { [key: number]: number }; - constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, typeIds: Iterable, childData: Data[], offset?: number, nullCount?: number) { - super(type, length, nullBitmap, childData, offset, nullCount); - this[VectorType.TYPE] = toTypedArray(Int8Array, typeIds); - this.typeIdToChildIndex = type.typeIds.reduce((typeIdToChildIndex, typeId, idx) => { - return (typeIdToChildIndex[typeId] = idx) && typeIdToChildIndex || typeIdToChildIndex; - }, Object.create(null) as { [key: number]: number }); + public slice(offset: number, length: number): Data { + // +true === 1, +false === 0, so this means + // we keep nullCount at 0 if it's already 0, + // otherwise set to the invalidated flag -1 + const { stride, typeId, childData } = this; + const nullCount = +(this._nullCount === 0) - 1; + const childStride = typeId === 16 /* FixedSizeList */ ? 
stride : 1; + const buffers = this._sliceBuffers(offset, length, stride, typeId); + return this.clone(this.type, this.offset + offset, length, nullCount, buffers, + // Don't slice children if we have value offsets (the variable-width types) + (!childData.length || this.valueOffsets) ? childData : this._sliceChildren(childData, childStride * offset, childStride * length)); } - public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount): Data { - return new UnionData(type, length, this[VectorType.VALIDITY], this[VectorType.TYPE], this.childData, offset, nullCount) as any; - } -} -export class SparseUnionData extends UnionData { - constructor(type: SparseUnion, length: number, nullBitmap: Uint8Array | null | undefined, typeIds: Iterable, childData: Data[], offset?: number, nullCount?: number) { - super(type, length, nullBitmap, typeIds, childData, offset, nullCount); - } - public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount): Data { - return new SparseUnionData( - type, - length, - this[VectorType.VALIDITY], - this[VectorType.TYPE], - this.childData, - offset, nullCount - ) as any; + protected _sliceBuffers(offset: number, length: number, stride: number, typeId: T['TType']): Buffers { + let arr: any, { buffers } = this; + // If typeIds exist, slice the typeIds buffer + (arr = buffers[BufferType.TYPE]) && (buffers[BufferType.TYPE] = arr.subarray(offset, offset + length)); + // If offsets exist, only slice the offsets buffer + (arr = buffers[BufferType.OFFSET]) && (buffers[BufferType.OFFSET] = arr.subarray(offset, offset + length + 1)) || + // Otherwise if no offsets, slice the data buffer. Don't slice the data vector for Booleans, since the offset goes by bits not bytes + (arr = buffers[BufferType.DATA]) && (buffers[BufferType.DATA] = typeId === 6 ? 
arr : arr.subarray(stride * offset, stride * (offset + length))); + return buffers; } -} -export class DenseUnionData extends UnionData { - public /* [VectorType.OFFSET]:*/ 0: Int32Array; - public get valueOffsets() { return this[VectorType.OFFSET]; } - constructor(type: DenseUnion, length: number, nullBitmap: Uint8Array | null | undefined, typeIds: Iterable, valueOffsets: Iterable, childData: Data[], offset?: number, nullCount?: number) { - super(type, length, nullBitmap, typeIds, childData, offset, nullCount); - this[VectorType.OFFSET] = toTypedArray(Int32Array, valueOffsets); - } - public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount): Data { - return new DenseUnionData( - type, - length, - this[VectorType.VALIDITY], - this[VectorType.TYPE], - this[VectorType.OFFSET], - this.childData, - offset, nullCount - ) as any; + protected _sliceChildren(childData: Data[], offset: number, length: number): Data[] { + return childData.map((child) => child.slice(offset, length)); } -} -export class ChunkedData extends BaseData { - // @ts-ignore - protected _chunkData: Data[]; - protected _chunkVectors: Vector[]; - protected _chunkOffsets: Uint32Array; - public get chunkVectors() { return this._chunkVectors; } - public get chunkOffsets() { return this._chunkOffsets; } - public get chunkData() { - return this._chunkData || ( - this._chunkData = this._chunkVectors.map(({ data }) => data)); - } - constructor(type: T, length: number, chunkVectors: Vector[], offset?: number, nullCount?: number, chunkOffsets?: Uint32Array) { - super(type, length, offset, nullCount); - this._chunkVectors = chunkVectors; - this._chunkOffsets = chunkOffsets || ChunkedData.computeOffsets(chunkVectors); - } - public get nullCount() { - let nullCount = this._nullCount; - if (nullCount === -1) { - this._nullCount = nullCount = this._chunkVectors.reduce((x, c) => x + c.nullCount, 0); + // + // Convenience methods for creating Data instances for each of the Arrow Vector types + // + /** @nocollapse */ + public static Null(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer) { + return new Data(type, offset, length, nullCount, [undefined, undefined, toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Int(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Dictionary(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.indices.ArrayType, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Float(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Bool(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Decimal(type: T, offset: number, length: 
number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Date(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Time(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Timestamp(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Interval(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static FixedSizeBinary(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, data: DataBuffer) { + return new Data(type, offset, length, nullCount, [undefined, toArrayBufferView(type.ArrayType, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Binary(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, valueOffsets: ValueOffsetsBuffer, data: Uint8Array) { + return new Data(type, offset, length, nullCount, [toArrayBufferView(Int32Array, valueOffsets), toArrayBufferView(Uint8Array, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static Utf8(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, valueOffsets: ValueOffsetsBuffer, data: Uint8Array) { + return new Data(type, offset, length, nullCount, [toArrayBufferView(Int32Array, valueOffsets), toArrayBufferView(Uint8Array, data), toArrayBufferView(Uint8Array, nullBitmap)]); + } + /** @nocollapse */ + public static List(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, valueOffsets: ValueOffsetsBuffer, child: Data | Vector) { + return new Data(type, offset, length, nullCount, [toArrayBufferView(Int32Array, valueOffsets), undefined, toArrayBufferView(Uint8Array, nullBitmap)], [child]); + } + /** @nocollapse */ + public static FixedSizeList(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, child: Data | Vector) { + return new Data(type, offset, length, nullCount, [undefined, undefined, toArrayBufferView(Uint8Array, nullBitmap)], [child]); + } + /** @nocollapse */ + public static Struct(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, children: (Data | Vector)[]) { + return new Data(type, offset, length, nullCount, [undefined, undefined, toArrayBufferView(Uint8Array, nullBitmap)], children); + } + /** @nocollapse */ + public static Map(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, children: (Data | Vector)[]) 
{ + return new Data(type, offset, length, nullCount, [undefined, undefined, toArrayBufferView(Uint8Array, nullBitmap)], children); + } + public static Union(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, typeIds: TypeIdsBuffer, children: (Data | Vector)[]): Data; + public static Union(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, typeIds: TypeIdsBuffer, valueOffsets: ValueOffsetsBuffer, children: (Data | Vector)[]): Data; + /** @nocollapse */ + public static Union(type: T, offset: number, length: number, nullCount: number, nullBitmap: NullBuffer, typeIds: TypeIdsBuffer, valueOffsetsOrChildren: ValueOffsetsBuffer | (Data | Vector)[], children?: (Data | Vector)[]) { + const buffers = [ + undefined, undefined, + toArrayBufferView(Uint8Array, nullBitmap), + toArrayBufferView(type.ArrayType, typeIds) + ] as Partial>; + if (type.mode === UnionMode.Sparse) { + return new Data(type, offset, length, nullCount, buffers, valueOffsetsOrChildren as (Data | Vector)[]); } - return nullCount; - } - public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount): Data { - return new ChunkedData( - type, length, - this._chunkVectors.map((vec) => vec.clone(vec.data.clone(type))) as any, - offset, nullCount, this._chunkOffsets - ) as any; - } - protected sliceInternal(clone: this, offset: number, length: number) { - const chunks = this._chunkVectors; - const offsets = this._chunkOffsets; - const chunkSlices: Vector[] = []; - for (let childIndex = -1, numChildren = chunks.length; ++childIndex < numChildren;) { - const child = chunks[childIndex]; - const childLength = child.length; - const childOffset = offsets[childIndex]; - // If the child is to the right of the slice boundary, exclude - if (childOffset >= offset + length) { continue; } - // If the child is to the left of of the slice boundary, exclude - if (offset >= childOffset + childLength) { continue; } - // If the child is between both left and right boundaries, include w/o slicing - if (childOffset >= offset && (childOffset + childLength) <= offset + length) { - chunkSlices.push(child); - continue; - } - // If the child overlaps one of the slice boundaries, include that slice - const begin = Math.max(0, offset - childOffset); - const end = begin + Math.min(childLength - begin, (offset + length) - childOffset); - chunkSlices.push(child.slice(begin, end)); - } - clone._chunkVectors = chunkSlices; - clone._chunkOffsets = ChunkedData.computeOffsets(chunkSlices); - return clone; - } - static computeOffsets(childVectors: Vector[]) { - const childOffsets = new Uint32Array(childVectors.length + 1); - for (let index = 0, length = childOffsets.length, childOffset = childOffsets[0] = 0; ++index < length;) { - childOffsets[index] = (childOffset += childVectors[index - 1].length); - } - return childOffsets; + buffers[BufferType.OFFSET] = toArrayBufferView(Int32Array, valueOffsetsOrChildren); + return new Data(type, offset, length, nullCount, buffers, children); } } + +((Data.prototype as any).childData = Object.freeze([])); diff --git a/js/src/enum.ts b/js/src/enum.ts new file mode 100644 index 0000000000000..0be6a4ed2938e --- /dev/null +++ b/js/src/enum.ts @@ -0,0 +1,95 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import * as Schema_ from './fb/Schema'; +import * as Message_ from './fb/Message'; + +export import ArrowType = Schema_.org.apache.arrow.flatbuf.Type; +export import DateUnit = Schema_.org.apache.arrow.flatbuf.DateUnit; +export import TimeUnit = Schema_.org.apache.arrow.flatbuf.TimeUnit; +export import Precision = Schema_.org.apache.arrow.flatbuf.Precision; +export import UnionMode = Schema_.org.apache.arrow.flatbuf.UnionMode; +export import VectorType = Schema_.org.apache.arrow.flatbuf.VectorType; +export import IntervalUnit = Schema_.org.apache.arrow.flatbuf.IntervalUnit; +export import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader; +export import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; + +/** + * * + * Main data type enumeration: + * * + * Data types in this library are all *logical*. They can be expressed as + * either a primitive physical type (bytes or bits of some fixed size), a + * nested type consisting of other data types, or another data type (e.g. a + * timestamp encoded as an int64) + */ +export enum Type { + NONE = 0, // The default placeholder type + Null = 1, // A NULL type having no physical storage + Int = 2, // Signed or unsigned 8, 16, 32, or 64-bit little-endian integer + Float = 3, // 2, 4, or 8-byte floating point value + Binary = 4, // Variable-length bytes (no guarantee of UTF8-ness) + Utf8 = 5, // UTF8 variable-length string as List + Bool = 6, // Boolean as 1 bit, LSB bit-packed ordering + Decimal = 7, // Precision-and-scale-based decimal type. Storage type depends on the parameters. + Date = 8, // int32_t days or int64_t milliseconds since the UNIX epoch + Time = 9, // Time as signed 32 or 64-bit integer, representing either seconds, milliseconds, microseconds, or nanoseconds since midnight since midnight + Timestamp = 10, // Exact timestamp encoded with int64 since UNIX epoch (Default unit millisecond) + Interval = 11, // YEAR_MONTH or DAY_TIME interval in SQL style + List = 12, // A list of some logical data type + Struct = 13, // Struct of logical types + Union = 14, // Union of logical types + FixedSizeBinary = 15, // Fixed-size binary. Each value occupies the same number of bytes + FixedSizeList = 16, // Fixed-size list. Each value occupies the same number of bytes + Map = 17, // Map of named logical types + + // These enum values are here so that TypeScript can narrow the type signatures further + // beyond the base Arrow types. The base Arrow types include metadata like bitWidths that + // impact the type signatures of the values we return. For example, the Int8Vector reads + // 1-byte numbers from an Int8Array, an Int32Vector reads a 4-byte number from an Int32Array, + // and an Int64Vector reads a pair of 4-byte lo, hi int32s, and returns them as a zero-copy + // slice from an underlying Int32Array. 
Library consumers benefit by doing this type narrowing, + // since we can ensure the types across all public methods are propagated and never bail to `any`. + // These values are _never_ actually used at runtime, and they will _never_ be written into the + // flatbuffers metadata of serialized Arrow IPC payloads. + Dictionary = -1, // Dictionary aka Category type + Int8 = -2, + Int16 = -3, + Int32 = -4, + Int64 = -5, + Uint8 = -6, + Uint16 = -7, + Uint32 = -8, + Uint64 = -9, + Float16 = -10, + Float32 = -11, + Float64 = -12, + DateDay = -13, + DateMillisecond = -14, + TimestampSecond = -15, + TimestampMillisecond = -16, + TimestampMicrosecond = -17, + TimestampNanosecond = -18, + TimeSecond = -19, + TimeMillisecond = -20, + TimeMicrosecond = -21, + TimeNanosecond = -22, + DenseUnion = -23, + SparseUnion = -24, + IntervalDayTime = -25, + IntervalYearMonth = -26, +} diff --git a/js/src/fb/Schema.ts b/js/src/fb/Schema.ts index 4a4aeb65599be..e9829d9d8348a 100644 --- a/js/src/fb/Schema.ts +++ b/js/src/fb/Schema.ts @@ -588,7 +588,7 @@ export namespace org.apache.arrow.flatbuf { * @param {Array.} data * @returns {flatbuffers.Offset} */ - static createTypeIdsVector(builder: flatbuffers.Builder, data: number[] | Uint8Array): flatbuffers.Offset { + static createTypeIdsVector(builder: flatbuffers.Builder, data: number[] | Int32Array): flatbuffers.Offset { builder.startVector(4, data.length, 4); for (let i = data.length - 1; i >= 0; i--) { builder.addInt32(data[i]); diff --git a/js/src/interfaces.ts b/js/src/interfaces.ts new file mode 100644 index 0000000000000..ae38d4e5be333 --- /dev/null +++ b/js/src/interfaces.ts @@ -0,0 +1,240 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Data } from './data'; +import { Type } from './enum'; +import * as type from './type'; +import { DataType } from './type'; +import * as vecs from './vector/index'; + +/** @ignore */ +export interface ArrayBufferViewConstructor { + readonly prototype: T; + new(length: number): T; + new(arrayOrArrayBuffer: ArrayLike | ArrayBufferLike): T; + new(buffer: ArrayBufferLike, byteOffset: number, length?: number): T; + /** + * The size in bytes of each element in the array. + */ + readonly BYTES_PER_ELEMENT: number; + /** + * Returns a new array from a set of elements. + * @param items A set of elements to include in the new array object. + */ + of(...items: number[]): T; + /** + * Creates an array from an array-like or iterable object. + * @param arrayLike An array-like or iterable object to convert to an array. + * @param mapfn A mapping function to call on every element of the array. + * @param thisArg Value of 'this' used to invoke the mapfn. 
+ */ + from(arrayLike: ArrayLike, mapfn?: (v: number, k: number) => number, thisArg?: any): T; +} + +/** @ignore */ +export type VectorCtorArgs< + T extends Vector, + R extends DataType = any, + TArgs extends any[] = any[], + TCtor extends new (data: Data, ...args: TArgs) => T = + new (data: Data, ...args: TArgs) => T +> = TCtor extends new (data: Data, ...args: infer TArgs) => T ? TArgs : never; + +/** + * Obtain the constructor function of an instance type + * @ignore + */ +export type ConstructorType< + T, + TCtor extends new (...args: any[]) => T = + new (...args: any[]) => T +> = TCtor extends new (...args: any[]) => T ? TCtor : never; + +/** @ignore */ +export type VectorCtorType< + T extends Vector, + R extends DataType = any, + TCtor extends new (data: Data, ...args: VectorCtorArgs) => T = + new (data: Data, ...args: VectorCtorArgs) => T +> = TCtor extends new (data: Data, ...args: VectorCtorArgs) => T ? TCtor : never; + +/** @ignore */ +export type Vector = + T extends Type ? TypeToVector : + T extends DataType ? DataTypeToVector : + never + ; + +/** @ignore */ +export type VectorCtor = + T extends Vector ? VectorCtorType : + T extends Type ? VectorCtorType> : + T extends DataType ? VectorCtorType> : + VectorCtorType + ; + +/** @ignore */ +export type DataTypeCtor = + T extends DataType ? ConstructorType : + T extends Vector ? ConstructorType : + T extends Type ? ConstructorType> : + never + ; + +/** @ignore */ +type TypeToVector = + T extends Type.Null ? vecs.NullVector : + T extends Type.Bool ? vecs.BoolVector : + T extends Type.Int8 ? vecs.Int8Vector : + T extends Type.Int16 ? vecs.Int16Vector : + T extends Type.Int32 ? vecs.Int32Vector : + T extends Type.Int64 ? vecs.Int64Vector : + T extends Type.Uint8 ? vecs.Uint8Vector : + T extends Type.Uint16 ? vecs.Uint16Vector : + T extends Type.Uint32 ? vecs.Uint32Vector : + T extends Type.Uint64 ? vecs.Uint64Vector : + T extends Type.Int ? vecs.IntVector : + T extends Type.Float16 ? vecs.Float16Vector : + T extends Type.Float32 ? vecs.Float32Vector : + T extends Type.Float64 ? vecs.Float64Vector : + T extends Type.Float ? vecs.FloatVector : + T extends Type.Utf8 ? vecs.Utf8Vector : + T extends Type.Binary ? vecs.BinaryVector : + T extends Type.FixedSizeBinary ? vecs.FixedSizeBinaryVector : + T extends Type.Date ? vecs.DateVector : + T extends Type.DateDay ? vecs.DateDayVector : + T extends Type.DateMillisecond ? vecs.DateMillisecondVector : + T extends Type.Timestamp ? vecs.TimestampVector : + T extends Type.TimestampSecond ? vecs.TimestampSecondVector : + T extends Type.TimestampMillisecond ? vecs.TimestampMillisecondVector : + T extends Type.TimestampMicrosecond ? vecs.TimestampMicrosecondVector : + T extends Type.TimestampNanosecond ? vecs.TimestampNanosecondVector : + T extends Type.Time ? vecs.TimeVector : + T extends Type.TimeSecond ? vecs.TimeSecondVector : + T extends Type.TimeMillisecond ? vecs.TimeMillisecondVector : + T extends Type.TimeMicrosecond ? vecs.TimeMicrosecondVector : + T extends Type.TimeNanosecond ? vecs.TimeNanosecondVector : + T extends Type.Decimal ? vecs.DecimalVector : + T extends Type.Union ? vecs.UnionVector : + T extends Type.DenseUnion ? vecs.DenseUnionVector : + T extends Type.SparseUnion ? vecs.SparseUnionVector : + T extends Type.Interval ? vecs.IntervalVector : + T extends Type.IntervalDayTime ? vecs.IntervalDayTimeVector : + T extends Type.IntervalYearMonth ? vecs.IntervalYearMonthVector : + T extends Type.Map ? vecs.MapVector : + T extends Type.List ? vecs.ListVector : + T extends Type.Struct ? 
vecs.StructVector : + T extends Type.Dictionary ? vecs.DictionaryVector : + T extends Type.FixedSizeList ? vecs.FixedSizeListVector : + vecs.BaseVector + ; + +/** @ignore */ +type DataTypeToVector = + T extends type.Null ? vecs.NullVector : + T extends type.Bool ? vecs.BoolVector : + T extends type.Int8 ? vecs.Int8Vector : + T extends type.Int16 ? vecs.Int16Vector : + T extends type.Int32 ? vecs.Int32Vector : + T extends type.Int64 ? vecs.Int64Vector : + T extends type.Uint8 ? vecs.Uint8Vector : + T extends type.Uint16 ? vecs.Uint16Vector : + T extends type.Uint32 ? vecs.Uint32Vector : + T extends type.Uint64 ? vecs.Uint64Vector : + T extends type.Int ? vecs.IntVector : + T extends type.Float16 ? vecs.Float16Vector : + T extends type.Float32 ? vecs.Float32Vector : + T extends type.Float64 ? vecs.Float64Vector : + T extends type.Float ? vecs.FloatVector : + T extends type.Utf8 ? vecs.Utf8Vector : + T extends type.Binary ? vecs.BinaryVector : + T extends type.FixedSizeBinary ? vecs.FixedSizeBinaryVector : + T extends type.Date_ ? vecs.DateVector : + T extends type.DateDay ? vecs.DateDayVector : + T extends type.DateMillisecond ? vecs.DateMillisecondVector : + T extends type.Timestamp ? vecs.TimestampVector : + T extends type.TimestampSecond ? vecs.TimestampSecondVector : + T extends type.TimestampMillisecond ? vecs.TimestampMillisecondVector : + T extends type.TimestampMicrosecond ? vecs.TimestampMicrosecondVector : + T extends type.TimestampNanosecond ? vecs.TimestampNanosecondVector : + T extends type.Time ? vecs.TimeVector : + T extends type.TimeSecond ? vecs.TimeSecondVector : + T extends type.TimeMillisecond ? vecs.TimeMillisecondVector : + T extends type.TimeMicrosecond ? vecs.TimeMicrosecondVector : + T extends type.TimeNanosecond ? vecs.TimeNanosecondVector : + T extends type.Decimal ? vecs.DecimalVector : + T extends type.Union ? vecs.UnionVector : + T extends type.DenseUnion ? vecs.DenseUnionVector : + T extends type.SparseUnion ? vecs.SparseUnionVector : + T extends type.Interval ? vecs.IntervalVector : + T extends type.IntervalDayTime ? vecs.IntervalDayTimeVector : + T extends type.IntervalYearMonth ? vecs.IntervalYearMonthVector : + T extends type.Map_ ? vecs.MapVector : + T extends type.List ? vecs.ListVector : + T extends type.Struct ? vecs.StructVector : + T extends type.Dictionary ? vecs.DictionaryVector : + T extends type.FixedSizeList ? vecs.FixedSizeListVector : + vecs.BaseVector + ; + +/** @ignore */ +type TypeToDataType = + T extends Type.Null ? type.Null + : T extends Type.Bool ? type.Bool + : T extends Type.Int ? type.Int + : T extends Type.Int16 ? type.Int16 + : T extends Type.Int32 ? type.Int32 + : T extends Type.Int64 ? type.Int64 + : T extends Type.Uint8 ? type.Uint8 + : T extends Type.Uint16 ? type.Uint16 + : T extends Type.Uint32 ? type.Uint32 + : T extends Type.Uint64 ? type.Uint64 + : T extends Type.Int8 ? type.Int8 + : T extends Type.Float16 ? type.Float16 + : T extends Type.Float32 ? type.Float32 + : T extends Type.Float64 ? type.Float64 + : T extends Type.Float ? type.Float + : T extends Type.Utf8 ? type.Utf8 + : T extends Type.Binary ? type.Binary + : T extends Type.FixedSizeBinary ? type.FixedSizeBinary + : T extends Type.Date ? type.Date_ + : T extends Type.DateDay ? type.DateDay + : T extends Type.DateMillisecond ? type.DateMillisecond + : T extends Type.Timestamp ? type.Timestamp + : T extends Type.TimestampSecond ? type.TimestampSecond + : T extends Type.TimestampMillisecond ? type.TimestampMillisecond + : T extends Type.TimestampMicrosecond ? 
type.TimestampMicrosecond + : T extends Type.TimestampNanosecond ? type.TimestampNanosecond + : T extends Type.Time ? type.Time + : T extends Type.TimeSecond ? type.TimeSecond + : T extends Type.TimeMillisecond ? type.TimeMillisecond + : T extends Type.TimeMicrosecond ? type.TimeMicrosecond + : T extends Type.TimeNanosecond ? type.TimeNanosecond + : T extends Type.Decimal ? type.Decimal + : T extends Type.Union ? type.Union + : T extends Type.DenseUnion ? type.DenseUnion + : T extends Type.SparseUnion ? type.SparseUnion + : T extends Type.Interval ? type.Interval + : T extends Type.IntervalDayTime ? type.IntervalDayTime + : T extends Type.IntervalYearMonth ? type.IntervalYearMonth + : T extends Type.Map ? type.Map_ + : T extends Type.List ? type.List + : T extends Type.Struct ? type.Struct + : T extends Type.Dictionary ? type.Dictionary + : T extends Type.FixedSizeList ? type.FixedSizeList + : DataType + ; diff --git a/js/src/io/adapters.ts b/js/src/io/adapters.ts new file mode 100644 index 0000000000000..427fc29ab2228 --- /dev/null +++ b/js/src/io/adapters.ts @@ -0,0 +1,386 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
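The Vector, VectorCtor and DataTypeCtor aliases above are compile-time lookups only; nothing here exists at runtime. A minimal sketch of what they resolve to (generic parameters are elided in the listing above; import paths follow this patch):

import { Type } from './enum';
import * as type from './type';
import { Vector, VectorCtor } from './interfaces';

// Both aliases resolve to Utf8Vector at compile time; there is no runtime cost.
type A = Vector<Type.Utf8>;            // -> Utf8Vector (Type enum key)
type B = Vector<type.Utf8>;            // -> Utf8Vector (DataType key)
// VectorCtor maps the same keys to the matching constructor type.
type C = VectorCtor<Type.Dictionary>;  // -> constructor type of DictionaryVector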
+ +import { + toUint8Array, + joinUint8Arrays, + ArrayBufferViewInput, + toUint8ArrayIterator, + toUint8ArrayAsyncIterator +} from '../util/buffer'; + +import { ReadableDOMStreamOptions } from './interfaces'; + +/** @ignore */ +export default { + fromIterable(source: Iterable | T): IterableIterator { + return pump(fromIterable(source)); + }, + fromAsyncIterable(source: AsyncIterable | PromiseLike): AsyncIterableIterator { + return pump(fromAsyncIterable(source)); + }, + fromDOMStream(source: ReadableStream): AsyncIterableIterator { + return pump(fromDOMStream(source)); + }, + fromNodeStream(stream: NodeJS.ReadableStream): AsyncIterableIterator { + return pump(fromNodeStream(stream)); + }, + // @ts-ignore + toDOMStream(source: Iterable | AsyncIterable, options?: ReadableDOMStreamOptions): ReadableStream { + throw new Error(`"toDOMStream" not available in this environment`); + }, + // @ts-ignore + toNodeStream(source: Iterable | AsyncIterable, options?: import('stream').ReadableOptions): import('stream').Readable { + throw new Error(`"toNodeStream" not available in this environment`); + }, +}; + +/** @ignore */ +const pump = | AsyncIterator>(iterator: T) => { iterator.next(); return iterator; }; + +/** @ignore */ +function* fromIterable(source: Iterable | T): IterableIterator { + + let done: boolean, threw = false; + let buffers: Uint8Array[] = [], buffer: Uint8Array; + let cmd: 'peek' | 'read', size: number, bufferLength = 0; + + function byteRange() { + if (cmd === 'peek') { + return joinUint8Arrays(buffers, size)[0]; + } + [buffer, buffers, bufferLength] = joinUint8Arrays(buffers, size); + return buffer; + } + + // Yield so the caller can inject the read command before creating the source Iterator + ({ cmd, size } = yield null); + + // initialize the iterator + let it = toUint8ArrayIterator(source)[Symbol.iterator](); + + try { + do { + // read the next value + ({ done, value: buffer } = isNaN(size - bufferLength) ? + it.next(undefined) : it.next(size - bufferLength)); + // if chunk is not null or empty, push it onto the queue + if (!done && buffer.byteLength > 0) { + buffers.push(buffer); + bufferLength += buffer.byteLength; + } + // If we have enough bytes in our buffer, yield chunks until we don't + if (done || size <= bufferLength) { + do { + ({ cmd, size } = yield byteRange()); + } while (size < bufferLength); + } + } while (!done); + } catch (e) { + (threw = true) && (typeof it.throw === 'function') && (it.throw(e)); + } finally { + (threw === false) && (typeof it.return === 'function') && (it.return()); + } +} + +/** @ignore */ +async function* fromAsyncIterable(source: AsyncIterable | PromiseLike): AsyncIterableIterator { + + let done: boolean, threw = false; + let buffers: Uint8Array[] = [], buffer: Uint8Array; + let cmd: 'peek' | 'read', size: number, bufferLength = 0; + + function byteRange() { + if (cmd === 'peek') { + return joinUint8Arrays(buffers, size)[0]; + } + [buffer, buffers, bufferLength] = joinUint8Arrays(buffers, size); + return buffer; + } + + // Yield so the caller can inject the read command before creating the source AsyncIterator + ({ cmd, size } = yield null); + + // initialize the iterator + let it = toUint8ArrayAsyncIterator(source)[Symbol.asyncIterator](); + + try { + do { + // read the next value + ({ done, value: buffer } = isNaN(size - bufferLength) + ? 
await it.next(undefined) + : await it.next(size - bufferLength)); + // if chunk is not null or empty, push it onto the queue + if (!done && buffer.byteLength > 0) { + buffers.push(buffer); + bufferLength += buffer.byteLength; + } + // If we have enough bytes in our buffer, yield chunks until we don't + if (done || size <= bufferLength) { + do { + ({ cmd, size } = yield byteRange()); + } while (size < bufferLength); + } + } while (!done); + } catch (e) { + (threw = true) && (typeof it.throw === 'function') && (await it.throw(e)); + } finally { + (threw === false) && (typeof it.return === 'function') && (await it.return()); + } +} + +// All this manual Uint8Array chunk management can be avoided if/when engines +// add support for ArrayBuffer.transfer() or ArrayBuffer.prototype.realloc(): +// https://github.com/domenic/proposal-arraybuffer-transfer +/** @ignore */ +async function* fromDOMStream(source: ReadableStream): AsyncIterableIterator { + + let done = false, threw = false; + let buffers: Uint8Array[] = [], buffer: Uint8Array; + let cmd: 'peek' | 'read', size: number, bufferLength = 0; + + function byteRange() { + if (cmd === 'peek') { + return joinUint8Arrays(buffers, size)[0]; + } + [buffer, buffers, bufferLength] = joinUint8Arrays(buffers, size); + return buffer; + } + + // Yield so the caller can inject the read command before we establish the ReadableStream lock + ({ cmd, size } = yield null); + + // initialize the reader and lock the stream + let it = new AdaptiveByteReader(source); + + try { + do { + // read the next value + ({ done, value: buffer } = isNaN(size - bufferLength) + ? await it['read'](undefined) + : await it['read'](size - bufferLength)); + // if chunk is not null or empty, push it onto the queue + if (!done && buffer.byteLength > 0) { + buffers.push(toUint8Array(buffer)); + bufferLength += buffer.byteLength; + } + // If we have enough bytes in our buffer, yield chunks until we don't + if (done || size <= bufferLength) { + do { + ({ cmd, size } = yield byteRange()); + } while (size < bufferLength); + } + } while (!done); + } catch (e) { + (threw = true) && (await it['cancel'](e)); + } finally { + (threw === false) ? (await it['cancel']()) + : source['locked'] && it.releaseLock(); + } +} + +/** @ignore */ +class AdaptiveByteReader { + + private supportsBYOB: boolean; + private byobReader: ReadableStreamBYOBReader | null = null; + private defaultReader: ReadableStreamDefaultReader | null = null; + private reader: ReadableStreamBYOBReader | ReadableStreamDefaultReader | null; + + constructor(private source: ReadableStream) { + try { + this.supportsBYOB = !!(this.reader = this.getBYOBReader()); + } catch (e) { + this.supportsBYOB = !!!(this.reader = this.getDefaultReader()); + } + } + + get closed(): Promise { + return this.reader ? this.reader['closed'].catch(() => {}) : Promise.resolve(); + } + + releaseLock(): void { + if (this.reader) { + this.reader.releaseLock(); + } + this.reader = this.byobReader = this.defaultReader = null; + } + + async cancel(reason?: any): Promise { + const { reader, source } = this; + reader && (await reader['cancel'](reason)); + source && (source['locked'] && this.releaseLock()); + } + + async read(size?: number): Promise> { + if (size === 0) { + return { done: this.reader == null, value: new Uint8Array(0) }; + } + const result = !this.supportsBYOB || typeof size !== 'number' + ? 
await this.getDefaultReader().read() + : await this.readFromBYOBReader(size); + !result.done && (result.value = toUint8Array(result as ReadableStreamReadResult)); + return result as ReadableStreamReadResult; + } + + private getDefaultReader() { + if (this.byobReader) { this.releaseLock(); } + if (!this.defaultReader) { + this.defaultReader = this.source['getReader'](); + // We have to catch and swallow errors here to avoid uncaught promise rejection exceptions + // that seem to be raised when we call `releaseLock()` on this reader. I'm still mystified + // about why these errors are raised, but I'm sure there's some important spec reason that + // I haven't considered. I hate to employ such an anti-pattern here, but it seems like the + // only solution in this case :/ + this.defaultReader['closed'].catch(() => {}); + } + return (this.reader = this.defaultReader); + } + + private getBYOBReader() { + if (this.defaultReader) { this.releaseLock(); } + if (!this.byobReader) { + this.byobReader = this.source['getReader']({ mode: 'byob' }); + // We have to catch and swallow errors here to avoid uncaught promise rejection exceptions + // that seem to be raised when we call `releaseLock()` on this reader. I'm still mystified + // about why these errors are raised, but I'm sure there's some important spec reason that + // I haven't considered. I hate to employ such an anti-pattern here, but it seems like the + // only solution in this case :/ + this.byobReader['closed'].catch(() => {}); + } + return (this.reader = this.byobReader); + } + + // This strategy plucked from the example in the streams spec: + // https://streams.spec.whatwg.org/#example-manual-read-bytes + private async readFromBYOBReader(size: number) { + return await readInto(this.getBYOBReader(), new ArrayBuffer(size), 0, size); + } +} + +/** @ignore */ +async function readInto(reader: ReadableStreamBYOBReader, buffer: ArrayBufferLike, offset: number, size: number): Promise> { + if (offset >= size) { + return { done: false, value: new Uint8Array(buffer, 0, size) }; + } + const { done, value } = await reader.read(new Uint8Array(buffer, offset, size - offset)); + if (((offset += value.byteLength) < size) && !done) { + return await readInto(reader, value.buffer, offset, size); + } + return { done, value: new Uint8Array(value.buffer, 0, offset) }; +} + +/** @ignore */ +type EventName = 'end' | 'error' | 'readable'; +/** @ignore */ +type Event = [EventName, (_: any) => void, Promise<[EventName, Error | null]>]; +/** @ignore */ +const onEvent = (stream: NodeJS.ReadableStream, event: T) => { + let handler = (_: any) => resolve([event, _]); + let resolve: (value?: [T, any] | PromiseLike<[T, any]>) => void; + return [event, handler, new Promise<[T, any]>( + (r) => (resolve = r) && stream['once'](event, handler) + )] as Event; +}; + +/** @ignore */ +async function* fromNodeStream(stream: NodeJS.ReadableStream): AsyncIterableIterator { + + let events: Event[] = []; + let event: EventName = 'error'; + let done = false, err: Error | null = null; + let cmd: 'peek' | 'read', size: number, bufferLength = 0; + let buffers: Uint8Array[] = [], buffer: Uint8Array | Buffer | string; + + function byteRange() { + if (cmd === 'peek') { + return joinUint8Arrays(buffers, size)[0]; + } + [buffer, buffers, bufferLength] = joinUint8Arrays(buffers, size); + return buffer; + } + + // Yield so the caller can inject the read command before we + // add the listener for the source stream's 'readable' event. 
+ ({ cmd, size } = yield null); + + // ignore stdin if it's a TTY + if ((stream as any)['isTTY']) { return yield new Uint8Array(0); } + + try { + // initialize the stream event handlers + events[0] = onEvent(stream, 'end'); + events[1] = onEvent(stream, 'error'); + + do { + events[2] = onEvent(stream, 'readable'); + + // wait on the first message event from the stream + [event, err] = await Promise.race(events.map((x) => x[2])); + + // if the stream emitted an Error, rethrow it + if (event === 'error') { break; } + if (!(done = event === 'end')) { + // If the size is NaN, request to read everything in the stream's internal buffer + if (!isFinite(size - bufferLength)) { + buffer = toUint8Array(stream['read'](undefined)); + } else { + buffer = toUint8Array(stream['read'](size - bufferLength)); + // If the byteLength is 0, then the requested amount is more than the stream has + // in its internal buffer. In this case the stream needs a "kick" to tell it to + // continue emitting readable events, so request to read everything the stream + // has in its internal buffer right now. + if (buffer.byteLength < (size - bufferLength)) { + buffer = toUint8Array(stream['read'](undefined)); + } + } + // if chunk is not null or empty, push it onto the queue + if (buffer.byteLength > 0) { + buffers.push(buffer); + bufferLength += buffer.byteLength; + } + } + // If we have enough bytes in our buffer, yield chunks until we don't + if (done || size <= bufferLength) { + do { + ({ cmd, size } = yield byteRange()); + } while (size < bufferLength); + } + } while (!done); + } finally { + await cleanup(events, event === 'error' ? err : null); + } + + function cleanup(events: Event[], err?: T) { + buffer = buffers = null; + return new Promise(async (resolve, reject) => { + for (const [evt, fn] of events) { + stream['off'](evt, fn); + } + try { + // Some stream implementations don't call the destroy callback, + // because it's really a node-internal API. Just calling `destroy` + // here should be enough to conform to the ReadableStream contract + const destroy = (stream as any)['destroy']; + destroy && destroy.call(stream, err); + err = undefined; + } catch (e) { err = e || err; } finally { + err != null ? reject(err) : resolve(); + } + }); + } +} diff --git a/js/src/io/file.ts b/js/src/io/file.ts new file mode 100644 index 0000000000000..d88bc5f6f4e56 --- /dev/null +++ b/js/src/io/file.ts @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
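+ +// Random-access byte sources used by the IPC readers: `RandomAccessFile` wraps an in-memory Uint8Array with seek/read/readAt semantics, and `AsyncRandomAccessFile` exposes the same interface over a node `fs.promises` FileHandle. A minimal usage sketch (assuming `bytes` is a Uint8Array): +// const file = new RandomAccessFile(bytes); file.seek(0); const header = file.read(8);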
+ +import { FileHandle } from './interfaces'; +import { ByteStream, AsyncByteStream } from './stream'; +import { ArrayBufferViewInput, toUint8Array } from '../util/buffer'; + +/** @ignore */ +export class RandomAccessFile extends ByteStream { + public size: number; + public position: number = 0; + protected buffer: Uint8Array | null; + constructor(buffer: ArrayBufferViewInput, byteLength?: number) { + super(); + this.buffer = toUint8Array(buffer); + this.size = typeof byteLength === 'undefined' ? this.buffer.byteLength : byteLength; + } + public readInt32(position: number) { + const { buffer, byteOffset } = this.readAt(position, 4); + return new DataView(buffer, byteOffset).getInt32(0, true); + } + public seek(position: number) { + this.position = Math.min(position, this.size); + return position < this.size; + } + public read(nBytes?: number | null) { + const { buffer, size, position } = this; + if (buffer && position < size) { + if (typeof nBytes !== 'number') { nBytes = Infinity; } + this.position = Math.min(size, + position + Math.min(size - position, nBytes)); + return buffer.subarray(position, this.position); + } + return null; + } + public readAt(position: number, nBytes: number) { + const buf = this.buffer; + const end = Math.min(this.size, position + nBytes); + return buf ? buf.subarray(position, end) : new Uint8Array(nBytes); + } + public close() { this.buffer && (this.buffer = null); } + public throw(value?: any) { this.close(); return { done: true, value }; } + public return(value?: any) { this.close(); return { done: true, value }; } +} + +/** @ignore */ +export class AsyncRandomAccessFile extends AsyncByteStream { + // @ts-ignore + public size: number; + public position: number = 0; + public _pending?: Promise; + protected _handle: FileHandle | null; + constructor(file: FileHandle, byteLength?: number) { + super(); + this._handle = file; + if (typeof byteLength === 'number') { + this.size = byteLength; + } else { + this._pending = (async () => { + delete this._pending; + this.size = (await file.stat()).size; + })(); + } + } + public async readInt32(position: number) { + const { buffer, byteOffset } = await this.readAt(position, 4); + return new DataView(buffer, byteOffset).getInt32(0, true); + } + public async seek(position: number) { + this._pending && await this._pending; + this.position = Math.min(position, this.size); + return position < this.size; + } + public async read(nBytes?: number | null) { + this._pending && await this._pending; + const { _handle: file, size, position } = this; + if (file && position < size) { + if (typeof nBytes !== 'number') { nBytes = Infinity; } + let pos = position, offset = 0, bytesRead = 0; + let end = Math.min(size, pos + Math.min(size - pos, nBytes)); + let buffer = new Uint8Array(Math.max(0, (this.position = end) - pos)); + while ((pos += bytesRead) < end && (offset += bytesRead) < buffer.byteLength) { + ({ bytesRead } = await file.read(buffer, offset, buffer.byteLength - offset, pos)); + } + return buffer; + } + return null; + } + public async readAt(position: number, nBytes: number) { + this._pending && await this._pending; + const { _handle: file, size } = this; + if (file && (position + nBytes) < size) { + const end = Math.min(size, position + nBytes); + const buffer = new Uint8Array(end - position); + return (await file.read(buffer, 0, nBytes, position)).buffer; + } + return new Uint8Array(nBytes); + } + public async close() { const f = this._handle; this._handle = null; f && await f.close(); } + public async throw(value?: any) { 
await this.close(); return { done: true, value }; } + public async return(value?: any) { await this.close(); return { done: true, value }; } +} diff --git a/js/src/io/interfaces.ts b/js/src/io/interfaces.ts new file mode 100644 index 0000000000000..9892562e0c0ec --- /dev/null +++ b/js/src/io/interfaces.ts @@ -0,0 +1,180 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import streamAdapters from './adapters'; + +/** @ignore */ +export const ITERATOR_DONE: any = Object.freeze({ done: true, value: void (0) }); + +/** @ignore */ +export type FileHandle = import('fs').promises.FileHandle; +/** @ignore */ +export type ArrowJSONLike = { schema: any; batches?: any[]; dictionaries?: any[]; }; +/** @ignore */ +export type ReadableDOMStreamOptions = { type: 'bytes' | undefined, autoAllocateChunkSize?: number, highWaterMark?: number }; + +/** @ignore */ +export class ArrowJSON { + // @ts-ignore + constructor(private _json: ArrowJSONLike) {} + public get schema(): any { return this._json['schema']; } + public get batches(): any[] { return (this._json['batches'] || []) as any[]; } + public get dictionaries(): any[] { return (this._json['dictionaries'] || []) as any[]; } +} + +/** @ignore */ +export interface Readable { + + readonly closed: Promise; + cancel(reason?: any): Promise; + + read(size?: number | null): Promise; + peek(size?: number | null): Promise; + throw(value?: any): Promise>; + return(value?: any): Promise>; + next(size?: number | null): Promise>; +} + +/** @ignore */ +export interface Writable { + readonly closed: Promise; + close(): void; + write(chunk: T): void; + abort(reason?: any): void; +} + +/** @ignore */ +export interface ReadableWritable extends Readable, Writable { + [Symbol.asyncIterator](): AsyncIterableIterator; + toDOMStream(options?: ReadableDOMStreamOptions): ReadableStream; + toNodeStream(options?: import('stream').ReadableOptions): import('stream').Readable; +} + +/** @ignore */ +export abstract class ReadableInterop { + + public abstract toDOMStream(options?: ReadableDOMStreamOptions): ReadableStream; + public abstract toNodeStream(options?: import('stream').ReadableOptions): import('stream').Readable; + + public tee(): [ReadableStream, ReadableStream] { + return this._getDOMStream().tee(); + } + public pipe(writable: R, options?: { end?: boolean; }) { + return this._getNodeStream().pipe(writable, options); + } + public pipeTo(writable: WritableStream, options?: PipeOptions) { return this._getDOMStream().pipeTo(writable, options); } + public pipeThrough>(duplex: { writable: WritableStream, readable: R }, options?: PipeOptions) { + return this._getDOMStream().pipeThrough(duplex, options); + } + + private _DOMStream?: ReadableStream; + private _getDOMStream() { + return this._DOMStream || (this._DOMStream = 
this.toDOMStream()); + } + + private _nodeStream?: import('stream').Readable; + private _getNodeStream() { + return this._nodeStream || (this._nodeStream = this.toNodeStream()); + } +} + +/** @ignore */ +type Resolution = { resolve: (value?: T | PromiseLike) => void; reject: (reason?: any) => void; }; + +/** @ignore */ +export class AsyncQueue extends ReadableInterop + implements AsyncIterableIterator, ReadableWritable { + + protected _values: TWritable[] = []; + protected _error?: { error: any; }; + protected _closedPromise: Promise; + protected _closedPromiseResolve?: (value?: any) => void; + protected resolvers: Resolution>[] = []; + + constructor() { + super(); + this._closedPromise = new Promise((r) => this._closedPromiseResolve = r); + } + + public get closed(): Promise { return this._closedPromise; } + public async cancel(reason?: any) { await this.return(reason); } + public write(value: TWritable) { + if (this._ensureOpen()) { + this.resolvers.length <= 0 + ? (this._values.push(value)) + : (this.resolvers.shift()!.resolve({ done: false, value } as any)); + } + } + public abort(value?: any) { + if (this._closedPromiseResolve) { + this.resolvers.length <= 0 + ? (this._error = { error: value }) + : (this.resolvers.shift()!.reject({ done: true, value })); + } + } + public close() { + if (this._closedPromiseResolve) { + const { resolvers } = this; + while (resolvers.length > 0) { + resolvers.shift()!.resolve(ITERATOR_DONE); + } + this._closedPromiseResolve(); + this._closedPromiseResolve = undefined; + } + } + + public [Symbol.asyncIterator]() { return this; } + public toDOMStream(options?: ReadableDOMStreamOptions) { + return streamAdapters.toDOMStream( + (this._closedPromiseResolve || this._error) + ? (this as AsyncIterable) + : (this._values as any) as Iterable, + options); + } + public toNodeStream(options?: import('stream').ReadableOptions) { + return streamAdapters.toNodeStream( + (this._closedPromiseResolve || this._error) + ? (this as AsyncIterable) + : (this._values as any) as Iterable, + options); + } + public async throw(_?: any) { await this.abort(_); return ITERATOR_DONE; } + public async return(_?: any) { await this.close(); return ITERATOR_DONE; } + + public async read(size?: number | null): Promise { return (await this.next(size, 'read')).value; } + public async peek(size?: number | null): Promise { return (await this.next(size, 'peek')).value; } + public next(..._args: any[]): Promise> { + if (this._values.length > 0) { + return Promise.resolve({ done: false, value: this._values.shift()! } as any); + } else if (this._error) { + return Promise.reject({ done: true, value: this._error.error }); + } else if (!this._closedPromiseResolve) { + return Promise.resolve(ITERATOR_DONE); + } else { + return new Promise>((resolve, reject) => { + this.resolvers.push({ resolve, reject }); + }); + } + } + + protected _ensureOpen() { + if (this._closedPromiseResolve) { + return true; + } + throw new Error(`${this} is closed`); + } +} diff --git a/js/src/io/stream.ts b/js/src/io/stream.ts new file mode 100644 index 0000000000000..2fe686532a5e5 --- /dev/null +++ b/js/src/io/stream.ts @@ -0,0 +1,158 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import streamAdapters from './adapters'; +import { decodeUtf8 } from '../util/utf8'; +import { ITERATOR_DONE, Readable, Writable, AsyncQueue } from './interfaces'; +import { toUint8Array, joinUint8Arrays, ArrayBufferViewInput } from '../util/buffer'; + +import { + isPromise, isFetchResponse, + isIterable, isAsyncIterable, + isReadableDOMStream, isReadableNodeStream +} from '../util/compat'; + +/** @ignore */ +export type WritableSink = Writable | WritableStream | NodeJS.WritableStream | null; +/** @ignore */ +export type ReadableSource = Readable | PromiseLike | AsyncIterable | ReadableStream | NodeJS.ReadableStream | null; + +/** @ignore */ +export class AsyncByteQueue extends AsyncQueue { + public write(value: ArrayBufferViewInput | Uint8Array) { + if ((value = toUint8Array(value)).byteLength > 0) { + return super.write(value as T); + } + } + public toString(sync: true): string; + public toString(sync?: false): Promise; + public toString(sync = false) { + return sync + ? decodeUtf8(this.toUint8Array(true)) + : this.toUint8Array(false).then(decodeUtf8); + } + public toUint8Array(sync: true): Uint8Array; + public toUint8Array(sync?: false): Promise; + public toUint8Array(sync = false) { + return sync ? joinUint8Arrays(this._values as any[])[0] : (async () => { + let buffers = [], byteLength = 0; + for await (const chunk of this) { + buffers.push(chunk); + byteLength += chunk.byteLength; + } + return joinUint8Arrays(buffers, byteLength)[0]; + })(); + } +} + +/** @ignore */ +export class ByteStream implements IterableIterator { + // @ts-ignore + private source: ByteStreamSource; + constructor(source?: Iterable | ArrayBufferViewInput) { + if (source) { + this.source = new ByteStreamSource(streamAdapters.fromIterable(source)); + } + } + [Symbol.iterator]() { return this; } + public next(value?: any) { return this.source.next(value); } + public throw(value?: any) { return this.source.throw(value); } + public return(value?: any) { return this.source.return(value); } + public peek(size?: number | null) { return this.source.peek(size); } + public read(size?: number | null) { return this.source.read(size); } +} + +/** @ignore */ +export class AsyncByteStream implements Readable, AsyncIterableIterator { + // @ts-ignore + private source: AsyncByteStreamSource; + constructor(source?: PromiseLike | Response | ReadableStream | NodeJS.ReadableStream | AsyncIterable | Iterable) { + if (source instanceof AsyncByteStream) { + this.source = (source as AsyncByteStream).source; + } else if (source instanceof AsyncByteQueue) { + this.source = new AsyncByteStreamSource(streamAdapters.fromAsyncIterable(source)); + } else if (isReadableNodeStream(source)) { + this.source = new AsyncByteStreamSource(streamAdapters.fromNodeStream(source)); + } else if (isFetchResponse(source)) { + this.source = new AsyncByteStreamSource(streamAdapters.fromDOMStream(source.body!)); + } else if (isIterable(source)) { + this.source = new AsyncByteStreamSource(streamAdapters.fromIterable(source)); + } else if (isPromise(source)) { + this.source = new AsyncByteStreamSource(streamAdapters.fromAsyncIterable(source)); + 
} else if (isAsyncIterable(source)) { + this.source = new AsyncByteStreamSource(streamAdapters.fromAsyncIterable(source)); + } else if (isReadableDOMStream(source)) { + this.source = new AsyncByteStreamSource(streamAdapters.fromDOMStream(source)); + } + } + [Symbol.asyncIterator]() { return this; } + public next(value?: any) { return this.source.next(value); } + public throw(value?: any) { return this.source.throw(value); } + public return(value?: any) { return this.source.return(value); } + public get closed(): Promise { return this.source.closed; } + public cancel(reason?: any) { return this.source.cancel(reason); } + public peek(size?: number | null) { return this.source.peek(size); } + public read(size?: number | null) { return this.source.read(size); } +} + +/** @ignore */ +interface ByteStreamSourceIterator extends IterableIterator { + next(value?: { cmd: 'peek' | 'read', size?: number | null }): IteratorResult; +} + +/** @ignore */ +interface AsyncByteStreamSourceIterator extends AsyncIterableIterator { + next(value?: { cmd: 'peek' | 'read', size?: number | null }): Promise>; +} + +/** @ignore */ +class ByteStreamSource { + constructor(protected source: ByteStreamSourceIterator) {} + public cancel(reason?: any) { this.return(reason); } + public peek(size?: number | null): T | null { return this.next(size, 'peek').value; } + public read(size?: number | null): T | null { return this.next(size, 'read').value; } + public next(size?: number | null, cmd: 'peek' | 'read' = 'read') { return this.source.next({ cmd, size }); } + public throw(value?: any) { return Object.create((this.source.throw && this.source.throw(value)) || ITERATOR_DONE); } + public return(value?: any) { return Object.create((this.source.return && this.source.return(value)) || ITERATOR_DONE); } +} + +/** @ignore */ +class AsyncByteStreamSource implements Readable { + + private _closedPromise: Promise; + private _closedPromiseResolve?: (value?: any) => void; + constructor (protected source: ByteStreamSourceIterator | AsyncByteStreamSourceIterator) { + this._closedPromise = new Promise((r) => this._closedPromiseResolve = r); + } + public async cancel(reason?: any) { await this.return(reason); } + public get closed(): Promise { return this._closedPromise; } + public async read(size?: number | null): Promise { return (await this.next(size, 'read')).value; } + public async peek(size?: number | null): Promise { return (await this.next(size, 'peek')).value; } + public async next(size?: number | null, cmd: 'peek' | 'read' = 'read') { return (await this.source.next({ cmd, size })); } + public async throw(value?: any) { + const result = (this.source.throw && await this.source.throw(value)) || ITERATOR_DONE; + this._closedPromiseResolve && this._closedPromiseResolve(); + this._closedPromiseResolve = undefined; + return Object.create(result); + } + public async return(value?: any) { + const result = (this.source.return && await this.source.return(value)) || ITERATOR_DONE; + this._closedPromiseResolve && this._closedPromiseResolve(); + this._closedPromiseResolve = undefined; + return Object.create(result); + } +} diff --git a/js/src/ipc/magic.ts b/js/src/ipc/magic.ts deleted file mode 100644 index 0688d1a2d1e19..0000000000000 --- a/js/src/ipc/magic.ts +++ /dev/null @@ -1,53 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -import { flatbuffers } from 'flatbuffers'; -import ByteBuffer = flatbuffers.ByteBuffer; - -export const PADDING = 4; -export const MAGIC_STR = 'ARROW1'; -export const MAGIC = new Uint8Array(MAGIC_STR.length); - -for (let i = 0; i < MAGIC_STR.length; i += 1 | 0) { - MAGIC[i] = MAGIC_STR.charCodeAt(i); -} - -export function checkForMagicArrowString(buffer: Uint8Array, index = 0) { - for (let i = -1, n = MAGIC.length; ++i < n;) { - if (MAGIC[i] !== buffer[index + i]) { - return false; - } - } - return true; -} - -export function isValidArrowFile(bb: ByteBuffer) { - let fileLength = bb.capacity(), footerLength: number, lengthOffset: number; - if ((fileLength < magicX2AndPadding /* Arrow buffer too small */) || - (!checkForMagicArrowString(bb.bytes(), 0) /* Missing magic start */) || - (!checkForMagicArrowString(bb.bytes(), fileLength - magicLength) /* Missing magic end */) || - (/* Invalid footer length */ - (footerLength = bb.readInt32(lengthOffset = fileLength - magicAndPadding)) < 1 && - (footerLength + lengthOffset > fileLength))) { - return false; - } - return true; -} - -export const magicLength = MAGIC.length; -export const magicAndPadding = magicLength + PADDING; -export const magicX2AndPadding = magicLength * 2 + PADDING; diff --git a/js/src/ipc/message.ts b/js/src/ipc/message.ts new file mode 100644 index 0000000000000..194e4ac7f679d --- /dev/null +++ b/js/src/ipc/message.ts @@ -0,0 +1,249 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
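+ +// Readers for the Arrow IPC message framing: `MessageReader` and `AsyncMessageReader` iterate the flatbuffer-encoded Messages (Schema, RecordBatch, DictionaryBatch) in a byte stream, and `JSONMessageReader` yields the same Messages from an `ArrowJSONLike` source. A minimal usage sketch (assuming `bytes` is a Uint8Array holding a valid Arrow IPC stream): +// for (const message of new MessageReader(bytes)) { console.log(MessageHeader[message.headerType]); }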
+ +import { MessageHeader } from '../enum'; +import { flatbuffers } from 'flatbuffers'; +import ByteBuffer = flatbuffers.ByteBuffer; +import { Message } from './metadata/message'; +import { isFileHandle } from '../util/compat'; +import { AsyncRandomAccessFile } from '../io/file'; +import { toUint8Array, ArrayBufferViewInput } from '../util/buffer'; +import { ByteStream, ReadableSource, AsyncByteStream } from '../io/stream'; +import { ArrowJSON, ArrowJSONLike, ITERATOR_DONE, FileHandle } from '../io/interfaces'; + +/** @ignore */ const invalidMessageType = (type: MessageHeader) => `Expected ${MessageHeader[type]} Message in stream, but was null or length 0.`; +/** @ignore */ const nullMessage = (type: MessageHeader) => `Header pointer of flatbuffer-encoded ${MessageHeader[type]} Message is null or length 0.`; +/** @ignore */ const invalidMessageMetadata = (expected: number, actual: number) => `Expected to read ${expected} metadata bytes, but only read ${actual}.`; +/** @ignore */ const invalidMessageBodyLength = (expected: number, actual: number) => `Expected to read ${expected} bytes for message body, but only read ${actual}.`; + +/** @ignore */ +export class MessageReader implements IterableIterator<Message> { + protected source: ByteStream; + constructor(source: ByteStream | ArrayBufferViewInput | Iterable<ArrayBufferViewInput>) { + this.source = source instanceof ByteStream ? source : new ByteStream(source); + } + public [Symbol.iterator](): IterableIterator<Message> { return this as IterableIterator<Message>; } + public next(): IteratorResult<Message> { + let r; + if ((r = this.readMetadataLength()).done) { return ITERATOR_DONE; } + if ((r = this.readMetadata(r.value)).done) { return ITERATOR_DONE; } + return (<any> r) as IteratorResult<Message>; + } + public throw(value?: any) { return this.source.throw(value); } + public return(value?: any) { return this.source.return(value); } + public readMessage<T extends MessageHeader>(type?: T | null) { + let r: IteratorResult<Message<T>>; + if ((r = this.next()).done) { return null; } + if ((type != null) && r.value.headerType !== type) { + throw new Error(invalidMessageType(type)); + } + return r.value; + } + public readMessageBody(bodyLength: number): Uint8Array { + if (bodyLength <= 0) { return new Uint8Array(0); } + const buf = toUint8Array(this.source.read(bodyLength)); + if (buf.byteLength < bodyLength) { + throw new Error(invalidMessageBodyLength(bodyLength, buf.byteLength)); + } + // 1. Work around bugs in fs.ReadStream's internal Buffer pooling, see: https://github.com/nodejs/node/issues/24817 + // 2. Work around https://github.com/whatwg/streams/blob/0ebe4b042e467d9876d80ae045de3843092ad797/reference-implementation/lib/helpers.js#L126 + return /* 1. */ (buf.byteOffset % 8 === 0) && + /* 2. */ (buf.byteOffset + buf.byteLength) <= buf.buffer.byteLength ?
buf : buf.slice(); + } + public readSchema(throwIfNull = false) { + const type = MessageHeader.Schema; + const message = this.readMessage(type); + const schema = message && message.header(); + if (throwIfNull && !schema) { + throw new Error(nullMessage(type)); + } + return schema; + } + protected readMetadataLength(): IteratorResult<number> { + const buf = this.source.read(PADDING); + const bb = buf && new ByteBuffer(buf); + const len = +(bb && bb.readInt32(0))!; + return { done: len <= 0, value: len }; + } + protected readMetadata(metadataLength: number): IteratorResult<Message> { + const buf = this.source.read(metadataLength); + if (!buf) { return ITERATOR_DONE; } + if (buf.byteLength < metadataLength) { + throw new Error(invalidMessageMetadata(metadataLength, buf.byteLength)); + } + return { done: false, value: Message.decode(buf) }; + } +} + +/** @ignore */ +export class AsyncMessageReader implements AsyncIterableIterator<Message> { + protected source: AsyncByteStream; + constructor(source: ReadableSource); + constructor(source: FileHandle, byteLength?: number); + constructor(source: any, byteLength?: number) { + this.source = source instanceof AsyncByteStream ? source + : isFileHandle(source) + ? new AsyncRandomAccessFile(source, byteLength!) + : new AsyncByteStream(source); + } + public [Symbol.asyncIterator](): AsyncIterableIterator<Message> { return this as AsyncIterableIterator<Message>; } + public async next(): Promise<IteratorResult<Message>> { + let r; + if ((r = await this.readMetadataLength()).done) { return ITERATOR_DONE; } + if ((r = await this.readMetadata(r.value)).done) { return ITERATOR_DONE; } + return (<any> r) as IteratorResult<Message>; + } + public async throw(value?: any) { return await this.source.throw(value); } + public async return(value?: any) { return await this.source.return(value); } + public async readMessage<T extends MessageHeader>(type?: T | null) { + let r: IteratorResult<Message<T>>; + if ((r = await this.next()).done) { return null; } + if ((type != null) && r.value.headerType !== type) { + throw new Error(invalidMessageType(type)); + } + return r.value; + } + public async readMessageBody(bodyLength: number): Promise<Uint8Array> { + if (bodyLength <= 0) { return new Uint8Array(0); } + const buf = toUint8Array(await this.source.read(bodyLength)); + if (buf.byteLength < bodyLength) { + throw new Error(invalidMessageBodyLength(bodyLength, buf.byteLength)); + } + // 1. Work around bugs in fs.ReadStream's internal Buffer pooling, see: https://github.com/nodejs/node/issues/24817 + // 2. Work around https://github.com/whatwg/streams/blob/0ebe4b042e467d9876d80ae045de3843092ad797/reference-implementation/lib/helpers.js#L126 + return /* 1. */ (buf.byteOffset % 8 === 0) && + /* 2. */ (buf.byteOffset + buf.byteLength) <= buf.buffer.byteLength ?
buf : buf.slice(); + } + public async readSchema(throwIfNull = false) { + const type = MessageHeader.Schema; + const message = await this.readMessage(type); + const schema = message && message.header(); + if (throwIfNull && !schema) { + throw new Error(nullMessage(type)); + } + return schema; + } + protected async readMetadataLength(): Promise> { + const buf = await this.source.read(PADDING); + const bb = buf && new ByteBuffer(buf); + const len = +(bb && bb.readInt32(0))!; + return { done: len <= 0, value: len }; + } + protected async readMetadata(metadataLength: number): Promise> { + const buf = await this.source.read(metadataLength); + if (!buf) { return ITERATOR_DONE; } + if (buf.byteLength < metadataLength) { + throw new Error(invalidMessageMetadata(metadataLength, buf.byteLength)); + } + return { done: false, value: Message.decode(buf) }; + } +} + +/** @ignore */ +export class JSONMessageReader extends MessageReader { + private _schema = false; + private _json: ArrowJSON; + private _body: any[] = []; + private _batchIndex = 0; + private _dictionaryIndex = 0; + constructor(source: ArrowJSON | ArrowJSONLike) { + super(new Uint8Array(0)); + this._json = source instanceof ArrowJSON ? source : new ArrowJSON(source); + } + public next() { + const { _json, _batchIndex, _dictionaryIndex } = this; + const numBatches = _json.batches.length; + const numDictionaries = _json.dictionaries.length; + if (!this._schema) { + this._schema = true; + const message = Message.fromJSON(_json.schema, MessageHeader.Schema); + return { value: message, done: _batchIndex >= numBatches && _dictionaryIndex >= numDictionaries }; + } + if (_dictionaryIndex < numDictionaries) { + const batch = _json.dictionaries[this._dictionaryIndex++]; + this._body = batch['data']['columns']; + const message = Message.fromJSON(batch, MessageHeader.DictionaryBatch); + return { done: false, value: message }; + } + if (_batchIndex < numBatches) { + const batch = _json.batches[this._batchIndex++]; + this._body = batch['columns']; + const message = Message.fromJSON(batch, MessageHeader.RecordBatch); + return { done: false, value: message }; + } + this._body = []; + return ITERATOR_DONE; + } + public readMessageBody(_bodyLength?: number) { + return flattenDataSources(this._body) as any; + function flattenDataSources(xs: any[]): any[][] { + return (xs || []).reduce((buffers, column: any) => [ + ...buffers, + ...(column['VALIDITY'] && [column['VALIDITY']] || []), + ...(column['TYPE'] && [column['TYPE']] || []), + ...(column['OFFSET'] && [column['OFFSET']] || []), + ...(column['DATA'] && [column['DATA']] || []), + ...flattenDataSources(column['children']) + ], [] as any[][]); + } + } + public readMessage(type?: T | null) { + let r: IteratorResult>; + if ((r = this.next()).done) { return null; } + if ((type != null) && r.value.headerType !== type) { + throw new Error(invalidMessageType(type)); + } + return r.value; + } + public readSchema() { + const type = MessageHeader.Schema; + const message = this.readMessage(type); + const schema = message && message.header(); + if (!message || !schema) { + throw new Error(nullMessage(type)); + } + return schema; + } +} + +/** @ignore */ +export const PADDING = 4; +/** @ignore */ +export const MAGIC_STR = 'ARROW1'; +/** @ignore */ +export const MAGIC = new Uint8Array(MAGIC_STR.length); + +for (let i = 0; i < MAGIC_STR.length; i += 1 | 0) { + MAGIC[i] = MAGIC_STR.charCodeAt(i); +} + +/** @ignore */ +export function checkForMagicArrowString(buffer: Uint8Array, index = 0) { + for (let i = -1, n = 
MAGIC.length; ++i < n;) { + if (MAGIC[i] !== buffer[index + i]) { + return false; + } + } + return true; +} + +/** @ignore */ +export const magicLength = MAGIC.length; +/** @ignore */ +export const magicAndPadding = magicLength + PADDING; +/** @ignore */ +export const magicX2AndPadding = magicLength * 2 + PADDING; diff --git a/js/src/ipc/metadata.ts b/js/src/ipc/metadata.ts deleted file mode 100644 index 025b051734295..0000000000000 --- a/js/src/ipc/metadata.ts +++ /dev/null @@ -1,96 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -/* tslint:disable:class-name */ - -import { Schema, Long, MessageHeader, MetadataVersion } from '../type'; - -export class Footer { - constructor(public dictionaryBatches: FileBlock[], public recordBatches: FileBlock[], public schema: Schema) {} -} - -export class FileBlock { - public offset: number; - public bodyLength: number; - constructor(public metaDataLength: number, bodyLength: Long | number, offset: Long | number) { - this.offset = typeof offset === 'number' ? offset : offset.low; - this.bodyLength = typeof bodyLength === 'number' ? bodyLength : bodyLength.low; - } -} - -export class Message { - public bodyLength: number; - public version: MetadataVersion; - public headerType: MessageHeader; - constructor(version: MetadataVersion, bodyLength: Long | number, headerType: MessageHeader) { - this.version = version; - this.headerType = headerType; - this.bodyLength = typeof bodyLength === 'number' ? bodyLength : bodyLength.low; - } - static isSchema(m: Message): m is Schema { return m.headerType === MessageHeader.Schema; } - static isRecordBatch(m: Message): m is RecordBatchMetadata { return m.headerType === MessageHeader.RecordBatch; } - static isDictionaryBatch(m: Message): m is DictionaryBatch { return m.headerType === MessageHeader.DictionaryBatch; } -} - -export class RecordBatchMetadata extends Message { - public length: number; - public nodes: FieldMetadata[]; - public buffers: BufferMetadata[]; - constructor(version: MetadataVersion, length: Long | number, nodes: FieldMetadata[], buffers: BufferMetadata[], bodyLength?: Long | number) { - if (bodyLength === void(0)) { - bodyLength = buffers.reduce((bodyLength, buffer) => bodyLength + buffer.length, 0); - } - super(version, bodyLength, MessageHeader.RecordBatch); - this.nodes = nodes; - this.buffers = buffers; - this.length = typeof length === 'number' ? 
length : length.low; - } -} - -export class DictionaryBatch extends Message { - public id: number; - public isDelta: boolean; - public data: RecordBatchMetadata; - constructor(version: MetadataVersion, data: RecordBatchMetadata, id: Long | number, isDelta: boolean = false) { - super(version, data.bodyLength, MessageHeader.DictionaryBatch); - this.isDelta = isDelta; - this.data = data; - this.id = typeof id === 'number' ? id : id.low; - } - private static atomicDictionaryId = 0; - public static getId() { return DictionaryBatch.atomicDictionaryId++; } - public get nodes(): FieldMetadata[] { return this.data.nodes; } - public get buffers(): BufferMetadata[] { return this.data.buffers; } -} - -export class BufferMetadata { - public offset: number; - public length: number; - constructor(offset: Long | number, length: Long | number) { - this.offset = typeof offset === 'number' ? offset : offset.low; - this.length = typeof length === 'number' ? length : length.low; - } -} - -export class FieldMetadata { - public length: number; - public nullCount: number; - constructor(length: Long | number, nullCount: Long | number) { - this.length = typeof length === 'number' ? length : length.low; - this.nullCount = typeof nullCount === 'number' ? nullCount : nullCount.low; - } -} diff --git a/js/src/ipc/metadata/file.ts b/js/src/ipc/metadata/file.ts new file mode 100644 index 0000000000000..d7786fbbf9324 --- /dev/null +++ b/js/src/ipc/metadata/file.ts @@ -0,0 +1,163 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
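+ +// Flatbuffer encode/decode helpers for the Arrow file footer: `Footer` carries the Schema plus the `FileBlock` locations of each record batch and dictionary batch written to the file.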
+ +/* tslint:disable:class-name */ + +import * as File_ from '../../fb/File'; +import { flatbuffers } from 'flatbuffers'; + +import Long = flatbuffers.Long; +import Builder = flatbuffers.Builder; +import ByteBuffer = flatbuffers.ByteBuffer; +import _Block = File_.org.apache.arrow.flatbuf.Block; +import _Footer = File_.org.apache.arrow.flatbuf.Footer; + +import { Schema } from '../../schema'; +import { MetadataVersion } from '../../enum'; +import { toUint8Array } from '../../util/buffer'; +import { ArrayBufferViewInput } from '../../util/buffer'; + +/** @ignore */ +class Footer_ { + + /** @nocollapse */ + public static decode(buf: ArrayBufferViewInput) { + buf = new ByteBuffer(toUint8Array(buf)); + const footer = _Footer.getRootAsFooter(buf); + const schema = Schema.decode(footer.schema()!); + return new OffHeapFooter(schema, footer) as Footer_; + } + + /** @nocollapse */ + public static encode(footer: Footer_) { + + const b: Builder = new Builder(); + const schemaOffset = Schema.encode(b, footer.schema); + + _Footer.startRecordBatchesVector(b, footer.numRecordBatches); + [...footer.recordBatches()].slice().reverse().forEach((rb) => FileBlock.encode(b, rb)); + const recordBatchesOffset = b.endVector(); + + _Footer.startDictionariesVector(b, footer.numDictionaries); + [...footer.dictionaryBatches()].slice().reverse().forEach((db) => FileBlock.encode(b, db)); + + const dictionaryBatchesOffset = b.endVector(); + + _Footer.startFooter(b); + _Footer.addSchema(b, schemaOffset); + _Footer.addVersion(b, MetadataVersion.V4); + _Footer.addRecordBatches(b, recordBatchesOffset); + _Footer.addDictionaries(b, dictionaryBatchesOffset); + _Footer.finishFooterBuffer(b, _Footer.endFooter(b)); + + return b.asUint8Array(); + } + + // @ts-ignore + protected _recordBatches: FileBlock[]; + // @ts-ignore + protected _dictionaryBatches: FileBlock[]; + public get numRecordBatches() { return this._recordBatches.length; } + public get numDictionaries() { return this._dictionaryBatches.length; } + + constructor(public schema: Schema, + public version: MetadataVersion = MetadataVersion.V4, + recordBatches?: FileBlock[], dictionaryBatches?: FileBlock[]) { + recordBatches && (this._recordBatches = recordBatches); + dictionaryBatches && (this._dictionaryBatches = dictionaryBatches); + } + + public *recordBatches(): Iterable { + for (let block, i = -1, n = this.numRecordBatches; ++i < n;) { + if (block = this.getRecordBatch(i)) { yield block; } + } + } + + public *dictionaryBatches(): Iterable { + for (let block, i = -1, n = this.numDictionaries; ++i < n;) { + if (block = this.getDictionaryBatch(i)) { yield block; } + } + } + + public getRecordBatch(index: number) { + return index >= 0 + && index < this.numRecordBatches + && this._recordBatches[index] || null; + } + + public getDictionaryBatch(index: number) { + return index >= 0 + && index < this.numDictionaries + && this._dictionaryBatches[index] || null; + } +} + +export { Footer_ as Footer }; + +/** @ignore */ +class OffHeapFooter extends Footer_ { + + public get numRecordBatches() { return this._footer.recordBatchesLength(); } + public get numDictionaries() { return this._footer.dictionariesLength(); } + + constructor(schema: Schema, protected _footer: _Footer) { + super(schema, _footer.version()); + } + + public getRecordBatch(index: number) { + if (index >= 0 && index < this.numRecordBatches) { + const fileBlock = this._footer.recordBatches(index); + if (fileBlock) { return FileBlock.decode(fileBlock); } + } + return null; + } + + public getDictionaryBatch(index: 
number) { + if (index >= 0 && index < this.numDictionaries) { + const fileBlock = this._footer.dictionaries(index); + if (fileBlock) { return FileBlock.decode(fileBlock); } + } + return null; + } +} + +/** @ignore */ +export class FileBlock { + + /** @nocollapse */ + public static decode(block: _Block) { + return new FileBlock(block.metaDataLength(), block.bodyLength(), block.offset()); + } + + /** @nocollapse */ + public static encode(b: Builder, fileBlock: FileBlock) { + const { metaDataLength } = fileBlock; + const offset = new Long(fileBlock.offset, 0); + const bodyLength = new Long(fileBlock.bodyLength, 0); + return _Block.createBlock(b, offset, metaDataLength, bodyLength); + } + + public offset: number; + public bodyLength: number; + public metaDataLength: number; + + constructor(metaDataLength: number, bodyLength: Long | number, offset: Long | number) { + this.metaDataLength = metaDataLength; + this.offset = typeof offset === 'number' ? offset : offset.low; + this.bodyLength = typeof bodyLength === 'number' ? bodyLength : bodyLength.low; + } +} diff --git a/js/src/ipc/metadata/json.ts b/js/src/ipc/metadata/json.ts new file mode 100644 index 0000000000000..fa219b3e7853b --- /dev/null +++ b/js/src/ipc/metadata/json.ts @@ -0,0 +1,208 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
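+ +// Converters from the Arrow JSON format into `Schema`, `Field`, `RecordBatch` and `DictionaryBatch` metadata, used by `JSONMessageReader` via the `fromJSON` hooks installed in './message'.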
+ +import { Schema, Field } from '../../schema'; +import { + DataType, Dictionary, TimeBitWidth, + Utf8, Binary, Decimal, FixedSizeBinary, + List, FixedSizeList, Map_, Struct, Union, + Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, +} from '../../type'; + +import { DictionaryBatch, RecordBatch, FieldNode, BufferRegion } from './message'; +import { TimeUnit, Precision, IntervalUnit, UnionMode, DateUnit } from '../../enum'; + +/** @ignore */ +export function schemaFromJSON(_schema: any, dictionaries: Map = new Map(), dictionaryFields: Map[]> = new Map()) { + return new Schema( + schemaFieldsFromJSON(_schema, dictionaries, dictionaryFields), + customMetadataFromJSON(_schema['customMetadata']), + dictionaries, dictionaryFields + ); +} + +/** @ignore */ +export function recordBatchFromJSON(b: any) { + return new RecordBatch( + b['count'], + fieldNodesFromJSON(b['columns']), + buffersFromJSON(b['columns']) + ); +} + +/** @ignore */ +export function dictionaryBatchFromJSON(b: any) { + return new DictionaryBatch( + recordBatchFromJSON(b['data']), + b['id'], b['isDelta'] + ); +} + +/** @ignore */ +function schemaFieldsFromJSON(_schema: any, dictionaries?: Map, dictionaryFields?: Map[]>) { + return (_schema['fields'] || []).filter(Boolean).map((f: any) => Field.fromJSON(f, dictionaries, dictionaryFields)); +} + +/** @ignore */ +function fieldChildrenFromJSON(_field: any, dictionaries?: Map, dictionaryFields?: Map[]>): Field[] { + return (_field['children'] || []).filter(Boolean).map((f: any) => Field.fromJSON(f, dictionaries, dictionaryFields)); +} + +/** @ignore */ +function fieldNodesFromJSON(xs: any[]): FieldNode[] { + return (xs || []).reduce((fieldNodes, column: any) => [ + ...fieldNodes, + new FieldNode( + column['count'], + nullCountFromJSON(column['VALIDITY']) + ), + ...fieldNodesFromJSON(column['children']) + ], [] as FieldNode[]); +} + +/** @ignore */ +function buffersFromJSON(xs: any[], buffers: BufferRegion[] = []): BufferRegion[] { + for (let i = -1, n = (xs || []).length; ++i < n;) { + const column = xs[i]; + column['VALIDITY'] && buffers.push(new BufferRegion(buffers.length, column['VALIDITY'].length)); + column['TYPE'] && buffers.push(new BufferRegion(buffers.length, column['TYPE'].length)); + column['OFFSET'] && buffers.push(new BufferRegion(buffers.length, column['OFFSET'].length)); + column['DATA'] && buffers.push(new BufferRegion(buffers.length, column['DATA'].length)); + buffers = buffersFromJSON(column['children'], buffers); + } + return buffers; +} + +/** @ignore */ +function nullCountFromJSON(validity: number[]) { + return (validity || []).reduce((sum, val) => sum + +(val === 0), 0); +} + +/** @ignore */ +export function fieldFromJSON(_field: any, dictionaries?: Map, dictionaryFields?: Map[]>) { + + let id: number; + let keys: TKeys | null; + let field: Field | void; + let dictMeta: any; + let type: DataType; + let dictType: Dictionary; + let dictField: Field; + + // If no dictionary encoding, or in the process of decoding the children of a dictionary-encoded field + if (!dictionaries || !dictionaryFields || !(dictMeta = _field['dictionary'])) { + type = typeFromJSON(_field, fieldChildrenFromJSON(_field, dictionaries, dictionaryFields)); + field = new Field(_field['name'], type, _field['nullable'], customMetadataFromJSON(_field['customMetadata'])); + } + // tslint:disable + // If dictionary encoded and the first time we've seen this dictionary id, decode + // the data type and child fields, then wrap in a Dictionary type and insert the + 
// data type into the dictionary types map. + else if (!dictionaries.has(id = dictMeta['id'])) { + // a dictionary index defaults to signed 32 bit int if unspecified + keys = (keys = dictMeta['indexType']) ? indexTypeFromJSON(keys) as TKeys : new Int32(); + dictionaries.set(id, type = typeFromJSON(_field, fieldChildrenFromJSON(_field))); + dictType = new Dictionary(type, keys, id, dictMeta['isOrdered']); + dictField = new Field(_field['name'], dictType, _field['nullable'], customMetadataFromJSON(_field['customMetadata'])); + dictionaryFields.set(id, [field = dictField]); + } + // If dictionary encoded, and have already seen this dictionary Id in the schema, then reuse the + // data type and wrap in a new Dictionary type and field. + else { + // a dictionary index defaults to signed 32 bit int if unspecified + keys = (keys = dictMeta['indexType']) ? indexTypeFromJSON(keys) as TKeys : new Int32(); + dictType = new Dictionary(dictionaries.get(id)!, keys, id, dictMeta['isOrdered']); + dictField = new Field(_field['name'], dictType, _field['nullable'], customMetadataFromJSON(_field['customMetadata'])); + dictionaryFields.get(id)!.push(field = dictField); + } + return field || null; +} + +/** @ignore */ +function customMetadataFromJSON(_metadata?: object) { + return new Map(Object.entries(_metadata || {})); +} + +/** @ignore */ +function indexTypeFromJSON(_type: any) { + return new Int(_type['isSigned'], _type['bitWidth']); +} + +/** @ignore */ +function typeFromJSON(f: any, children?: Field[]): DataType { + + const typeId = f['type']['name']; + + switch (typeId) { + case 'NONE': return new DataType(); + case 'null': return new Null(); + case 'binary': return new Binary(); + case 'utf8': return new Utf8(); + case 'bool': return new Bool(); + case 'list': return new List((children || [])[0]); + case 'struct': return new Struct(children || []); + case 'struct_': return new Struct(children || []); + } + + switch (typeId) { + case 'int': { + const t = f['type']; + return new Int(t['isSigned'], t['bitWidth'] as IntBitWidth); + } + case 'floatingpoint': { + const t = f['type']; + return new Float(Precision[t['precision']] as any); + } + case 'decimal': { + const t = f['type']; + return new Decimal(t['scale'], t['precision']); + } + case 'date': { + const t = f['type']; + return new Date_(DateUnit[t['unit']] as any); + } + case 'time': { + const t = f['type']; + return new Time(TimeUnit[t['unit']] as any, t['bitWidth'] as TimeBitWidth); + } + case 'timestamp': { + const t = f['type']; + return new Timestamp(TimeUnit[t['unit']] as any, t['timezone']); + } + case 'interval': { + const t = f['type']; + return new Interval(IntervalUnit[t['unit']] as any); + } + case 'union': { + const t = f['type']; + return new Union(UnionMode[t['mode']] as any, (t['typeIds'] || []), children || []); + } + case 'fixedsizebinary': { + const t = f['type']; + return new FixedSizeBinary(t['byteWidth']); + } + case 'fixedsizelist': { + const t = f['type']; + return new FixedSizeList(t['listSize'], (children || [])[0]); + } + case 'map': { + const t = f['type']; + return new Map_(children || [], t['keysSorted']); + } + } + throw new Error(`Unrecognized type: "${typeId}"`); +} diff --git a/js/src/ipc/metadata/message.ts b/js/src/ipc/metadata/message.ts new file mode 100644 index 0000000000000..794ece9101e52 --- /dev/null +++ b/js/src/ipc/metadata/message.ts @@ -0,0 +1,593 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { flatbuffers } from 'flatbuffers'; +import * as Schema_ from '../../fb/Schema'; +import * as Message_ from '../../fb/Message'; + +import { Schema, Field } from '../../schema'; +import { toUint8Array } from '../../util/buffer'; +import { ArrayBufferViewInput } from '../../util/buffer'; +import { MessageHeader, MetadataVersion } from '../../enum'; +import { instance as typeAssembler } from '../../visitor/typeassembler'; +import { fieldFromJSON, schemaFromJSON, recordBatchFromJSON, dictionaryBatchFromJSON } from './json'; + +import Long = flatbuffers.Long; +import Builder = flatbuffers.Builder; +import ByteBuffer = flatbuffers.ByteBuffer; +import _Int = Schema_.org.apache.arrow.flatbuf.Int; +import Type = Schema_.org.apache.arrow.flatbuf.Type; +import _Field = Schema_.org.apache.arrow.flatbuf.Field; +import _Schema = Schema_.org.apache.arrow.flatbuf.Schema; +import _Buffer = Schema_.org.apache.arrow.flatbuf.Buffer; +import _Message = Message_.org.apache.arrow.flatbuf.Message; +import _KeyValue = Schema_.org.apache.arrow.flatbuf.KeyValue; +import _FieldNode = Message_.org.apache.arrow.flatbuf.FieldNode; +import _Endianness = Schema_.org.apache.arrow.flatbuf.Endianness; +import _RecordBatch = Message_.org.apache.arrow.flatbuf.RecordBatch; +import _DictionaryBatch = Message_.org.apache.arrow.flatbuf.DictionaryBatch; +import _DictionaryEncoding = Schema_.org.apache.arrow.flatbuf.DictionaryEncoding; + +import { + DataType, Dictionary, TimeBitWidth, + Utf8, Binary, Decimal, FixedSizeBinary, + List, FixedSizeList, Map_, Struct, Union, + Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, +} from '../../type'; + +/** @ignore */ +export class Message { + + /** @nocollapse */ + public static fromJSON(msg: any, headerType: T): Message { + const message = new Message(0, MetadataVersion.V4, headerType); + message._createHeader = messageHeaderFromJSON(msg, headerType); + return message; + } + + /** @nocollapse */ + public static decode(buf: ArrayBufferViewInput) { + buf = new ByteBuffer(toUint8Array(buf)); + const _message = _Message.getRootAsMessage(buf); + const bodyLength: Long = _message.bodyLength()!; + const version: MetadataVersion = _message.version(); + const headerType: MessageHeader = _message.headerType(); + const message = new Message(bodyLength, version, headerType); + message._createHeader = decodeMessageHeader(_message, headerType); + return message; + } + + /** @nocollapse */ + public static encode(message: Message) { + let b = new Builder(), headerOffset = -1; + if (message.isSchema()) { + headerOffset = Schema.encode(b, message.header() as Schema); + } else if (message.isRecordBatch()) { + headerOffset = RecordBatch.encode(b, message.header() as RecordBatch); + } else if (message.isDictionaryBatch()) { + headerOffset = DictionaryBatch.encode(b, 
message.header() as DictionaryBatch); + } + _Message.startMessage(b); + _Message.addVersion(b, MetadataVersion.V4); + _Message.addHeader(b, headerOffset); + _Message.addHeaderType(b, message.headerType); + _Message.addBodyLength(b, new Long(message.bodyLength, 0)); + _Message.finishMessageBuffer(b, _Message.endMessage(b)); + return b.asUint8Array(); + } + + /** @nocollapse */ + public static from(header: Schema | RecordBatch | DictionaryBatch, bodyLength = 0) { + if (header instanceof Schema) { + return new Message(0, MetadataVersion.V4, MessageHeader.Schema, header); + } + if (header instanceof RecordBatch) { + return new Message(bodyLength, MetadataVersion.V4, MessageHeader.RecordBatch, header); + } + if (header instanceof DictionaryBatch) { + return new Message(bodyLength, MetadataVersion.V4, MessageHeader.DictionaryBatch, header); + } + throw new Error(`Unrecognized Message header: ${header}`); + } + + // @ts-ignore + public body: Uint8Array; + protected _headerType: T; + protected _bodyLength: number; + protected _version: MetadataVersion; + public get type() { return this.headerType; } + public get version() { return this._version; } + public get headerType() { return this._headerType; } + public get bodyLength() { return this._bodyLength; } + // @ts-ignore + protected _createHeader: MessageHeaderDecoder; + public header() { return this._createHeader(); } + public isSchema(): this is Message { return this.headerType === MessageHeader.Schema; } + public isRecordBatch(): this is Message { return this.headerType === MessageHeader.RecordBatch; } + public isDictionaryBatch(): this is Message { return this.headerType === MessageHeader.DictionaryBatch; } + + constructor(bodyLength: Long | number, version: MetadataVersion, headerType: T, header?: any) { + this._version = version; + this._headerType = headerType; + this.body = new Uint8Array(0); + header && (this._createHeader = () => header); + this._bodyLength = typeof bodyLength === 'number' ? bodyLength : bodyLength.low; + } +} + +/** @ignore */ +export class RecordBatch { + protected _length: number; + protected _nodes: FieldNode[]; + protected _buffers: BufferRegion[]; + public get nodes() { return this._nodes; } + public get length() { return this._length; } + public get buffers() { return this._buffers; } + constructor(length: Long | number, nodes: FieldNode[], buffers: BufferRegion[]) { + this._nodes = nodes; + this._buffers = buffers; + this._length = typeof length === 'number' ? length : length.low; + } +} + +/** @ignore */ +export class DictionaryBatch { + + protected _id: number; + protected _isDelta: boolean; + protected _data: RecordBatch; + public get id() { return this._id; } + public get data() { return this._data; } + public get isDelta() { return this._isDelta; } + public get length(): number { return this.data.length; } + public get nodes(): FieldNode[] { return this.data.nodes; } + public get buffers(): BufferRegion[] { return this.data.buffers; } + + constructor(data: RecordBatch, id: Long | number, isDelta: boolean = false) { + this._data = data; + this._isDelta = isDelta; + this._id = typeof id === 'number' ? id : id.low; + } +} + +/** @ignore */ +export class BufferRegion { + public offset: number; + public length: number; + constructor(offset: Long | number, length: Long | number) { + this.offset = typeof offset === 'number' ? offset : offset.low; + this.length = typeof length === 'number' ? 
length : length.low; + } +} + +/** @ignore */ +export class FieldNode { + public length: number; + public nullCount: number; + constructor(length: Long | number, nullCount: Long | number) { + this.length = typeof length === 'number' ? length : length.low; + this.nullCount = typeof nullCount === 'number' ? nullCount : nullCount.low; + } +} + +function messageHeaderFromJSON(message: any, type: MessageHeader) { + return (() => { + switch (type) { + case MessageHeader.Schema: return Schema.fromJSON(message); + case MessageHeader.RecordBatch: return RecordBatch.fromJSON(message); + case MessageHeader.DictionaryBatch: return DictionaryBatch.fromJSON(message); + } + throw new Error(`Unrecognized Message type: { name: ${MessageHeader[type]}, type: ${type} }`); + }) as MessageHeaderDecoder; +} + +function decodeMessageHeader(message: _Message, type: MessageHeader) { + return (() => { + switch (type) { + case MessageHeader.Schema: return Schema.decode(message.header(new _Schema())!); + case MessageHeader.RecordBatch: return RecordBatch.decode(message.header(new _RecordBatch())!, message.version()); + case MessageHeader.DictionaryBatch: return DictionaryBatch.decode(message.header(new _DictionaryBatch())!, message.version()); + } + throw new Error(`Unrecognized Message type: { name: ${MessageHeader[type]}, type: ${type} }`); + }) as MessageHeaderDecoder; +} + +Field['encode'] = encodeField; +Field['decode'] = decodeField; +Field['fromJSON'] = fieldFromJSON; + +Schema['encode'] = encodeSchema; +Schema['decode'] = decodeSchema; +Schema['fromJSON'] = schemaFromJSON; + +RecordBatch['encode'] = encodeRecordBatch; +RecordBatch['decode'] = decodeRecordBatch; +RecordBatch['fromJSON'] = recordBatchFromJSON; + +DictionaryBatch['encode'] = encodeDictionaryBatch; +DictionaryBatch['decode'] = decodeDictionaryBatch; +DictionaryBatch['fromJSON'] = dictionaryBatchFromJSON; + +FieldNode['encode'] = encodeFieldNode; +FieldNode['decode'] = decodeFieldNode; + +BufferRegion['encode'] = encodeBufferRegion; +BufferRegion['decode'] = decodeBufferRegion; + +declare module '../../schema' { + namespace Field { + export { encodeField as encode }; + export { decodeField as decode }; + export { fieldFromJSON as fromJSON }; + } + namespace Schema { + export { encodeSchema as encode }; + export { decodeSchema as decode }; + export { schemaFromJSON as fromJSON }; + } +} + +declare module './message' { + namespace RecordBatch { + export { encodeRecordBatch as encode }; + export { decodeRecordBatch as decode }; + export { recordBatchFromJSON as fromJSON }; + } + namespace DictionaryBatch { + export { encodeDictionaryBatch as encode }; + export { decodeDictionaryBatch as decode }; + export { dictionaryBatchFromJSON as fromJSON }; + } + namespace FieldNode { + export { encodeFieldNode as encode }; + export { decodeFieldNode as decode }; + } + namespace BufferRegion { + export { encodeBufferRegion as encode }; + export { decodeBufferRegion as decode }; + } +} + +/** @ignore */ +function decodeSchema(_schema: _Schema, dictionaries: Map = new Map(), dictionaryFields: Map[]> = new Map()) { + const fields = decodeSchemaFields(_schema, dictionaries, dictionaryFields); + return new Schema(fields, decodeCustomMetadata(_schema), dictionaries, dictionaryFields); +} + +/** @ignore */ +function decodeRecordBatch(batch: _RecordBatch, version = MetadataVersion.V4) { + return new RecordBatch(batch.length(), decodeFieldNodes(batch), decodeBuffers(batch, version)); +} + +/** @ignore */ +function decodeDictionaryBatch(batch: _DictionaryBatch, version = 
MetadataVersion.V4) { + return new DictionaryBatch(RecordBatch.decode(batch.data()!, version), batch.id(), batch.isDelta()); +} + +/** @ignore */ +function decodeBufferRegion(b: _Buffer) { + return new BufferRegion(b.offset(), b.length()); +} + +/** @ignore */ +function decodeFieldNode(f: _FieldNode) { + return new FieldNode(f.length(), f.nullCount()); +} + +/** @ignore */ +function decodeFieldNodes(batch: _RecordBatch) { + const nodes = [] as FieldNode[]; + for (let f, i = -1, j = -1, n = batch.nodesLength(); ++i < n;) { + if (f = batch.nodes(i)) { + nodes[++j] = FieldNode.decode(f); + } + } + return nodes; +} + +/** @ignore */ +function decodeBuffers(batch: _RecordBatch, version: MetadataVersion) { + const bufferRegions = [] as BufferRegion[]; + for (let b, i = -1, j = -1, n = batch.buffersLength(); ++i < n;) { + if (b = batch.buffers(i)) { + // If this Arrow buffer was written before version 4, + // advance the buffer's bb_pos 8 bytes to skip past + // the now-removed page_id field + if (version < MetadataVersion.V4) { + b.bb_pos += (8 * (i + 1)); + } + bufferRegions[++j] = BufferRegion.decode(b); + } + } + return bufferRegions; +} + +/** @ignore */ +function decodeSchemaFields(schema: _Schema, dictionaries?: Map, dictionaryFields?: Map[]>) { + const fields = [] as Field[]; + for (let f, i = -1, j = -1, n = schema.fieldsLength(); ++i < n;) { + if (f = schema.fields(i)) { + fields[++j] = Field.decode(f, dictionaries, dictionaryFields); + } + } + return fields; +} + +/** @ignore */ +function decodeFieldChildren(field: _Field, dictionaries?: Map, dictionaryFields?: Map[]>): Field[] { + const children = [] as Field[]; + for (let f, i = -1, j = -1, n = field.childrenLength(); ++i < n;) { + if (f = field.children(i)) { + children[++j] = Field.decode(f, dictionaries, dictionaryFields); + } + } + return children; +} + +/** @ignore */ +function decodeField(f: _Field, dictionaries?: Map, dictionaryFields?: Map[]>) { + + let id: number; + let field: Field | void; + let type: DataType; + let keys: _Int | TKeys | null; + let dictType: Dictionary; + let dictMeta: _DictionaryEncoding | null; + let dictField: Field; + + // If no dictionary encoding, or in the process of decoding the children of a dictionary-encoded field + if (!dictionaries || !dictionaryFields || !(dictMeta = f.dictionary())) { + type = decodeFieldType(f, decodeFieldChildren(f, dictionaries, dictionaryFields)); + field = new Field(f.name()!, type, f.nullable(), decodeCustomMetadata(f)); + } + // tslint:disable + // If dictionary encoded and the first time we've seen this dictionary id, decode + // the data type and child fields, then wrap in a Dictionary type and insert the + // data type into the dictionary types map. + else if (!dictionaries.has(id = dictMeta.id().low)) { + // a dictionary index defaults to signed 32 bit int if unspecified + keys = (keys = dictMeta.indexType()) ? decodeIndexType(keys) as TKeys : new Int32(); + dictionaries.set(id, type = decodeFieldType(f, decodeFieldChildren(f))); + dictType = new Dictionary(type, keys, id, dictMeta.isOrdered()); + dictField = new Field(f.name()!, dictType, f.nullable(), decodeCustomMetadata(f)); + dictionaryFields.set(id, [field = dictField]); + } + // If dictionary encoded, and have already seen this dictionary Id in the schema, then reuse the + // data type and wrap in a new Dictionary type and field. + else { + // a dictionary index defaults to signed 32 bit int if unspecified + keys = (keys = dictMeta.indexType()) ? 
decodeIndexType(keys) as TKeys : new Int32(); + dictType = new Dictionary(dictionaries.get(id)!, keys, id, dictMeta.isOrdered()); + dictField = new Field(f.name()!, dictType, f.nullable(), decodeCustomMetadata(f)); + dictionaryFields.get(id)!.push(field = dictField); + } + return field || null; +} + +/** @ignore */ +function decodeCustomMetadata(parent?: _Schema | _Field | null) { + const data = new Map(); + if (parent) { + for (let entry, key, i = -1, n = parent.customMetadataLength() | 0; ++i < n;) { + if ((entry = parent.customMetadata(i)) && (key = entry.key()) != null) { + data.set(key, entry.value()!); + } + } + } + return data; +} + +/** @ignore */ +function decodeIndexType(_type: _Int) { + return new Int(_type.isSigned(), _type.bitWidth() as IntBitWidth); +} + +/** @ignore */ +function decodeFieldType(f: _Field, children?: Field[]): DataType { + + const typeId = f.typeType(); + + switch (typeId) { + case Type.NONE: return new DataType(); + case Type.Null: return new Null(); + case Type.Binary: return new Binary(); + case Type.Utf8: return new Utf8(); + case Type.Bool: return new Bool(); + case Type.List: return new List((children || [])[0]); + case Type.Struct_: return new Struct(children || []); + } + + switch (typeId) { + case Type.Int: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.Int())!; + return new Int(t.isSigned(), t.bitWidth()); + } + case Type.FloatingPoint: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.FloatingPoint())!; + return new Float(t.precision()); + } + case Type.Decimal: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.Decimal())!; + return new Decimal(t.scale(), t.precision()); + } + case Type.Date: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.Date())!; + return new Date_(t.unit()); + } + case Type.Time: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.Time())!; + return new Time(t.unit(), t.bitWidth() as TimeBitWidth); + } + case Type.Timestamp: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.Timestamp())!; + return new Timestamp(t.unit(), t.timezone()); + } + case Type.Interval: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.Interval())!; + return new Interval(t.unit()); + } + case Type.Union: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.Union())!; + return new Union(t.mode(), t.typeIdsArray() || [], children || []); + } + case Type.FixedSizeBinary: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.FixedSizeBinary())!; + return new FixedSizeBinary(t.byteWidth()); + } + case Type.FixedSizeList: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.FixedSizeList())!; + return new FixedSizeList(t.listSize(), (children || [])[0]); + } + case Type.Map: { + const t = f.type(new Schema_.org.apache.arrow.flatbuf.Map())!; + return new Map_(children || [], t.keysSorted()); + } + } + throw new Error(`Unrecognized type: "${Type[typeId]}" (${typeId})`); +} + +/** @ignore */ +function encodeSchema(b: Builder, schema: Schema) { + + const fieldOffsets = schema.fields.map((f) => Field.encode(b, f)); + + _Schema.startFieldsVector(b, fieldOffsets.length); + + const fieldsVectorOffset = _Schema.createFieldsVector(b, fieldOffsets); + + const metadataOffset = !(schema.metadata && schema.metadata.size > 0) ? 
-1 : + _Schema.createCustomMetadataVector(b, [...schema.metadata].map(([k, v]) => { + const key = b.createString(`${k}`); + const val = b.createString(`${v}`); + _KeyValue.startKeyValue(b); + _KeyValue.addKey(b, key); + _KeyValue.addValue(b, val); + return _KeyValue.endKeyValue(b); + })); + + _Schema.startSchema(b); + _Schema.addFields(b, fieldsVectorOffset); + _Schema.addEndianness(b, platformIsLittleEndian ? _Endianness.Little : _Endianness.Big); + + if (metadataOffset !== -1) { _Schema.addCustomMetadata(b, metadataOffset); } + + return _Schema.endSchema(b); +} + +/** @ignore */ +function encodeField(b: Builder, field: Field) { + + let nameOffset = -1; + let typeOffset = -1; + let dictionaryOffset = -1; + + let type = field.type; + let typeId: Type = field.typeId; + + if (!DataType.isDictionary(type)) { + typeOffset = typeAssembler.visit(type, b)!; + } else { + typeId = type.dictionary.typeId; + dictionaryOffset = typeAssembler.visit(type, b)!; + typeOffset = typeAssembler.visit(type.dictionary, b)!; + } + + const childOffsets = (type.children || []).map((f: Field) => Field.encode(b, f)); + const childrenVectorOffset = _Field.createChildrenVector(b, childOffsets); + + const metadataOffset = !(field.metadata && field.metadata.size > 0) ? -1 : + _Field.createCustomMetadataVector(b, [...field.metadata].map(([k, v]) => { + const key = b.createString(`${k}`); + const val = b.createString(`${v}`); + _KeyValue.startKeyValue(b); + _KeyValue.addKey(b, key); + _KeyValue.addValue(b, val); + return _KeyValue.endKeyValue(b); + })); + + if (field.name) { + nameOffset = b.createString(field.name); + } + + _Field.startField(b); + _Field.addType(b, typeOffset); + _Field.addTypeType(b, typeId); + _Field.addChildren(b, childrenVectorOffset); + _Field.addNullable(b, !!field.nullable); + + if (nameOffset !== -1) { _Field.addName(b, nameOffset); } + if (dictionaryOffset !== -1) { _Field.addDictionary(b, dictionaryOffset); } + if (metadataOffset !== -1) { _Field.addCustomMetadata(b, metadataOffset); } + + return _Field.endField(b); +} + +/** @ignore */ +function encodeRecordBatch(b: Builder, recordBatch: RecordBatch) { + + const nodes = recordBatch.nodes || []; + const buffers = recordBatch.buffers || []; + + _RecordBatch.startNodesVector(b, nodes.length); + nodes.slice().reverse().forEach((n) => FieldNode.encode(b, n)); + + const nodesVectorOffset = b.endVector(); + + _RecordBatch.startBuffersVector(b, buffers.length); + buffers.slice().reverse().forEach((b_) => BufferRegion.encode(b, b_)); + + const buffersVectorOffset = b.endVector(); + + _RecordBatch.startRecordBatch(b); + _RecordBatch.addLength(b, new Long(recordBatch.length, 0)); + _RecordBatch.addNodes(b, nodesVectorOffset); + _RecordBatch.addBuffers(b, buffersVectorOffset); + return _RecordBatch.endRecordBatch(b); +} + +/** @ignore */ +function encodeDictionaryBatch(b: Builder, dictionaryBatch: DictionaryBatch) { + const dataOffset = RecordBatch.encode(b, dictionaryBatch.data); + _DictionaryBatch.startDictionaryBatch(b); + _DictionaryBatch.addId(b, new Long(dictionaryBatch.id, 0)); + _DictionaryBatch.addIsDelta(b, dictionaryBatch.isDelta); + _DictionaryBatch.addData(b, dataOffset); + return _DictionaryBatch.endDictionaryBatch(b); +} + +/** @ignore */ +function encodeFieldNode(b: Builder, node: FieldNode) { + return _FieldNode.createFieldNode(b, new Long(node.length, 0), new Long(node.nullCount, 0)); +} + +/** @ignore */ +function encodeBufferRegion(b: Builder, node: BufferRegion) { + return _Buffer.createBuffer(b, new Long(node.offset, 0), new 
Long(node.length, 0)); +} + +/** @ignore */ +const platformIsLittleEndian = (function() { + const buffer = new ArrayBuffer(2); + new DataView(buffer).setInt16(0, 256, true /* littleEndian */); + // Int16Array uses the platform's endianness. + return new Int16Array(buffer)[0] === 256; +})(); + +/** @ignore */ +type MessageHeaderDecoder = () => T extends MessageHeader.Schema ? Schema + : T extends MessageHeader.RecordBatch ? RecordBatch + : T extends MessageHeader.DictionaryBatch ? DictionaryBatch : never; diff --git a/js/src/ipc/node/iterable.ts b/js/src/ipc/node/iterable.ts new file mode 100644 index 0000000000000..a5e558e01662e --- /dev/null +++ b/js/src/ipc/node/iterable.ts @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Readable } from 'stream'; +import { isIterable, isAsyncIterable } from '../../util/compat'; + +type ReadableOptions = import('stream').ReadableOptions; + +/** @ignore */ +export function toNodeStream(source: Iterable | AsyncIterable, options?: ReadableOptions): Readable { + if (isAsyncIterable(source)) { return new AsyncIterableReadable(source[Symbol.asyncIterator](), options); } + if (isIterable(source)) { return new IterableReadable(source[Symbol.iterator](), options); } + /* istanbul ignore next */ + throw new Error(`toNodeStream() must be called with an Iterable or AsyncIterable`); +} + +/** @ignore */ +class IterableReadable extends Readable { + private _pulling: boolean; + private _bytesMode: boolean; + private _iterator: Iterator; + constructor(it: Iterator, options?: ReadableOptions) { + super(options); + this._iterator = it; + this._pulling = false; + this._bytesMode = !options || !options.objectMode; + } + _read(size: number) { + const it = this._iterator; + if (it && !this._pulling && (this._pulling = true)) { + this._pulling = this._pull(size, it); + } + } + _destroy(e: Error | null, cb: (e: Error | null) => void) { + let it = this._iterator, fn: any; + it && (fn = e != null && it.throw || it.return); + fn && fn.call(it, e); + cb && cb(null); + } + private _pull(size: number, it: Iterator) { + const bm = this._bytesMode; + let r: IteratorResult | null = null; + while (this.readable && !(r = it.next(bm ? size : null)).done) { + if (size != null) { + size -= (bm && ArrayBuffer.isView(r.value) ? 
r.value.byteLength : 1); + } + if (!this.push(r.value) || size <= 0) { break; } + } + if ((r && r.done || !this.readable) && (this.push(null) || true)) { + it.return && it.return(); + } + return !this.readable; + } +} + +/** @ignore */ +class AsyncIterableReadable extends Readable { + private _pulling: boolean; + private _bytesMode: boolean; + private _iterator: AsyncIterator; + constructor(it: AsyncIterator, options?: ReadableOptions) { + super(options); + this._iterator = it; + this._pulling = false; + this._bytesMode = !options || !options.objectMode; + } + _read(size: number) { + const it = this._iterator; + if (it && !this._pulling && (this._pulling = true)) { + (async () => this._pulling = await this._pull(size, it))(); + } + } + _destroy(e: Error | null, cb: (e: Error | null) => void) { + let it = this._iterator, fn: any; + it && (fn = e != null && it.throw || it.return); + fn && fn.call(it, e).then(() => cb && cb(null)) || (cb && cb(null)); + } + private async _pull(size: number, it: AsyncIterator) { + const bm = this._bytesMode; + let r: IteratorResult | null = null; + while (this.readable && !(r = await it.next(bm ? size : null)).done) { + if (size != null) { + size -= (bm && ArrayBuffer.isView(r.value) ? r.value.byteLength : 1); + } + if (!this.push(r.value) || size <= 0) { break; } + } + if ((r && r.done || !this.readable) && (this.push(null) || true)) { + it.return && it.return(); + } + return !this.readable; + } +} diff --git a/js/src/ipc/node/reader.ts b/js/src/ipc/node/reader.ts new file mode 100644 index 0000000000000..aeb8688d211c9 --- /dev/null +++ b/js/src/ipc/node/reader.ts @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Duplex, DuplexOptions } from 'stream'; +import { DataType } from '../../type'; +import { RecordBatch } from '../../recordbatch'; +import { AsyncByteQueue } from '../../io/stream'; +import { RecordBatchReader } from '../../ipc/reader'; + +/** @ignore */ +export function recordBatchReaderThroughNodeStream(options?: DuplexOptions & { autoDestroy: boolean }) { + return new RecordBatchReaderDuplex(options); +} + +type CB = (error?: Error | null | undefined) => void; + +/** @ignore */ +class RecordBatchReaderDuplex extends Duplex { + private _pulling: boolean = false; + private _autoDestroy: boolean = true; + private _reader: RecordBatchReader | null; + private _asyncQueue: AsyncByteQueue | null; + constructor(options?: DuplexOptions & { autoDestroy: boolean }) { + super({ allowHalfOpen: false, ...options, readableObjectMode: true, writableObjectMode: false }); + this._reader = null; + this._pulling = false; + this._asyncQueue = new AsyncByteQueue(); + this._autoDestroy = options && (typeof options.autoDestroy === 'boolean') ? 
options.autoDestroy : true; + } + _final(cb?: CB) { + const aq = this._asyncQueue; + aq && aq.close(); + cb && cb(); + } + _write(x: any, _: string, cb: CB) { + const aq = this._asyncQueue; + aq && aq.write(x); + cb && cb(); + return true; + } + _read(size: number) { + const aq = this._asyncQueue; + if (aq && !this._pulling && (this._pulling = true)) { + (async () => { + if (!this._reader) { + this._reader = await this._open(aq); + } + this._pulling = await this._pull(size, this._reader); + })(); + } + } + _destroy(err: Error | null, cb: (error: Error | null) => void) { + const aq = this._asyncQueue; + if (aq) { err ? aq.abort(err) : aq.close(); } + cb(this._asyncQueue = this._reader = null); + } + async _open(source: AsyncByteQueue) { + return await (await RecordBatchReader.from(source)).open({ autoDestroy: this._autoDestroy }); + } + async _pull(size: number, reader: RecordBatchReader) { + let r: IteratorResult> | null = null; + while (this.readable && !(r = await reader.next()).done) { + if (!this.push(r.value) || (size != null && --size <= 0)) { break; } + } + if ((r && r.done || !this.readable)) { + this.push(null); + await reader.cancel(); + } + return !this.readable; + } +} diff --git a/js/src/ipc/node/writer.ts b/js/src/ipc/node/writer.ts new file mode 100644 index 0000000000000..673050791c67c --- /dev/null +++ b/js/src/ipc/node/writer.ts @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
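For context, the reader adapter above and the writer adapter below expose the new IPC readers and writers as ordinary Node Duplex streams, so Arrow data can participate in pipe() chains. A minimal piping sketch, not part of the patch, assuming the Node entry point wires these helpers up as RecordBatchReader.throughNode() and RecordBatchStreamWriter.throughNode(), and with 'in.arrow'/'out.arrow' as placeholder paths:

import * as fs from 'fs';
import { RecordBatchReader, RecordBatchStreamWriter } from 'apache-arrow';

// Bytes in -> RecordBatch objects -> bytes out: the reader Duplex is
// writable in byte mode and readable in object mode, and the writer
// Duplex is the mirror image, so the two compose directly.
fs.createReadStream('in.arrow')                    // placeholder input path
    .pipe(RecordBatchReader.throughNode())         // assumed Node wiring of the reader Duplex
    .pipe(RecordBatchStreamWriter.throughNode())   // assumed Node wiring of the writer Duplex
    .pipe(fs.createWriteStream('out.arrow'));      // placeholder output path
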
+ +import { Duplex, DuplexOptions } from 'stream'; +import { DataType } from '../../type'; +import { AsyncByteStream } from '../../io/stream'; +import { RecordBatchWriter } from '../../ipc/writer'; + +/** @ignore */ +export function recordBatchWriterThroughNodeStream(this: typeof RecordBatchWriter, options?: DuplexOptions & { autoDestroy: boolean }) { + return new RecordBatchWriterDuplex(new this(options)); +} + +type CB = (error?: Error | null | undefined) => void; + +/** @ignore */ +class RecordBatchWriterDuplex extends Duplex { + private _pulling: boolean = false; + private _reader: AsyncByteStream | null; + private _writer: RecordBatchWriter | null; + constructor(writer: RecordBatchWriter, options?: DuplexOptions) { + super({ allowHalfOpen: false, ...options, writableObjectMode: true, readableObjectMode: false }); + this._writer = writer; + this._reader = new AsyncByteStream(writer); + } + _final(cb?: CB) { + const writer = this._writer; + writer && writer.close(); + cb && cb(); + } + _write(x: any, _: string, cb: CB) { + const writer = this._writer; + writer && writer.write(x); + cb && cb(); + return true; + } + _read(size: number) { + const it = this._reader; + if (it && !this._pulling && (this._pulling = true)) { + (async () => this._pulling = await this._pull(size, it))(); + } + } + _destroy(err: Error | null, cb: (error: Error | null) => void) { + const writer = this._writer; + if (writer) { err ? writer.abort(err) : writer.close(); } + cb(this._reader = this._writer = null); + } + async _pull(size: number, reader: AsyncByteStream) { + let r: IteratorResult | null = null; + while (this.readable && !(r = await reader.next(size || null)).done) { + if (size != null && r.value) { + size -= r.value.byteLength; + } + if (!this.push(r.value) || size <= 0) { break; } + } + if ((r && r.done || !this.readable)) { + this.push(null); + await reader.cancel(); + } + return !this.readable; + } +} diff --git a/js/src/ipc/reader.ts b/js/src/ipc/reader.ts new file mode 100644 index 0000000000000..91990afb35b17 --- /dev/null +++ b/js/src/ipc/reader.ts @@ -0,0 +1,737 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
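The file below defines RecordBatchReader and its synchronous and asynchronous file and stream variants. A minimal consumer sketch, not part of the patch, assuming the published apache-arrow entry point re-exports RecordBatchReader:

import { RecordBatchReader } from 'apache-arrow';

// An in-memory Arrow buffer resolves to a synchronous reader; the file vs.
// stream variant is picked by sniffing the leading magic bytes.
function logBatches(bytes: Uint8Array) {
    const reader = RecordBatchReader.from(bytes);
    for (const batch of reader) {
        console.log(batch.length, reader.schema.fields.map((f) => f.name));
    }
}

// Streams, fetch Responses, and file handles resolve to an async reader
// that is consumed with for-await.
async function logBatchesAsync(stream: NodeJS.ReadableStream) {
    const reader = await RecordBatchReader.from(stream);
    for await (const batch of reader) {
        console.log(batch.length);
    }
}
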
+ +import { DataType } from '../type'; +import { Vector } from '../vector'; +import { MessageHeader } from '../enum'; +import { Footer } from './metadata/file'; +import { Schema, Field } from '../schema'; +import streamAdapters from '../io/adapters'; +import { Message } from './metadata/message'; +import { RecordBatch } from '../recordbatch'; +import * as metadata from './metadata/message'; +import { ArrayBufferViewInput } from '../util/buffer'; +import { ByteStream, AsyncByteStream } from '../io/stream'; +import { RandomAccessFile, AsyncRandomAccessFile } from '../io/file'; +import { VectorLoader, JSONVectorLoader } from '../visitor/vectorloader'; +import { + FileHandle, + ArrowJSONLike, + ITERATOR_DONE, + ReadableInterop, +} from '../io/interfaces'; +import { + MessageReader, AsyncMessageReader, JSONMessageReader, + checkForMagicArrowString, magicLength, magicAndPadding, magicX2AndPadding +} from './message'; +import { + isPromise, + isIterable, isAsyncIterable, + isIteratorResult, isArrowJSON, + isFileHandle, isFetchResponse, + isReadableDOMStream, isReadableNodeStream +} from '../util/compat'; + +/** @ignore */ export type FromArg0 = ArrowJSONLike; +/** @ignore */ export type FromArg1 = PromiseLike; +/** @ignore */ export type FromArg2 = Iterable | ArrayBufferViewInput; +/** @ignore */ export type FromArg3 = PromiseLike | ArrayBufferViewInput>; +/** @ignore */ export type FromArg4 = Response | NodeJS.ReadableStream | ReadableStream | AsyncIterable; +/** @ignore */ export type FromArg5 = FileHandle | PromiseLike | PromiseLike; +/** @ignore */ export type FromArgs = FromArg0 | FromArg1 | FromArg2 | FromArg3 | FromArg4 | FromArg5; + +/** @ignore */ type OpenOptions = { autoDestroy?: boolean; }; +/** @ignore */ type RecordBatchReaders = RecordBatchFileReader | RecordBatchStreamReader; +/** @ignore */ type AsyncRecordBatchReaders = AsyncRecordBatchFileReader | AsyncRecordBatchStreamReader; +/** @ignore */ type RecordBatchFileReaders = RecordBatchFileReader | AsyncRecordBatchFileReader; +/** @ignore */ type RecordBatchStreamReaders = RecordBatchStreamReader | AsyncRecordBatchStreamReader; + +export class RecordBatchReader extends ReadableInterop> { + + protected _impl: RecordBatchReaderImpls; + protected constructor(impl: RecordBatchReaderImpls) { + super(); + this._impl = impl; + } + + public get closed() { return this._impl.closed; } + public get schema() { return this._impl.schema; } + public get autoDestroy() { return this._impl.autoDestroy; } + public get dictionaries() { return this._impl.dictionaries; } + public get numDictionaries() { return this._impl.numDictionaries; } + public get numRecordBatches() { return this._impl.numRecordBatches; } + public get footer() { return this._impl.isFile() ? 
this._impl.footer : null; } + + public isSync(): this is RecordBatchReaders { return this._impl.isSync(); } + public isAsync(): this is AsyncRecordBatchReaders { return this._impl.isAsync(); } + public isFile(): this is RecordBatchFileReaders { return this._impl.isFile(); } + public isStream(): this is RecordBatchStreamReaders { return this._impl.isStream(); } + + public next() { + return this._impl.next(); + } + public throw(value?: any) { + return this._impl.throw(value); + } + public return(value?: any) { + return this._impl.return(value); + } + public cancel() { + return this._impl.cancel(); + } + public reset(schema?: Schema | null): this { + this._impl.reset(schema); + return this; + } + public open(options?: OpenOptions) { + const opening = this._impl.open(options); + return isPromise(opening) ? opening.then(() => this) : this; + } + public readRecordBatch(index: number): RecordBatch | null | Promise | null> { + return this._impl.isFile() ? this._impl.readRecordBatch(index) : null; + } + public [Symbol.iterator](): IterableIterator> { + return (>> this._impl)[Symbol.iterator](); + } + public [Symbol.asyncIterator](): AsyncIterableIterator> { + return (>> this._impl)[Symbol.asyncIterator](); + } + public toDOMStream() { + return streamAdapters.toDOMStream>( + (this.isSync() + ? { [Symbol.iterator]: () => this } as Iterable> + : { [Symbol.asyncIterator]: () => this } as AsyncIterable>)); + } + public toNodeStream() { + return streamAdapters.toNodeStream>( + (this.isSync() + ? { [Symbol.iterator]: () => this } as Iterable> + : { [Symbol.asyncIterator]: () => this } as AsyncIterable>), + { objectMode: true }); + } + + /** @nocollapse */ + // @ts-ignore + public static throughNode(options?: import('stream').DuplexOptions & { autoDestroy: boolean }): import('stream').Duplex { + throw new Error(`"throughNode" not available in this environment`); + } + /** @nocollapse */ + public static throughDOM( + // @ts-ignore + writableStrategy?: ByteLengthQueuingStrategy, + // @ts-ignore + readableStrategy?: { autoDestroy: boolean } + ): { writable: WritableStream, readable: ReadableStream> } { + throw new Error(`"throughDOM" not available in this environment`); + } + + public static from(source: T): T; + public static from(source: FromArg0): RecordBatchStreamReader; + public static from(source: FromArg1): Promise>; + public static from(source: FromArg2): RecordBatchFileReader | RecordBatchStreamReader; + public static from(source: FromArg3): Promise | RecordBatchStreamReader>; + public static from(source: FromArg4): Promise | AsyncRecordBatchReaders>; + public static from(source: FromArg5): Promise | AsyncRecordBatchStreamReader>; + /** @nocollapse */ + public static from(source: any) { + if (source instanceof RecordBatchReader) { + return source; + } else if (isArrowJSON(source)) { + return fromArrowJSON(source); + } else if (isFileHandle(source)) { + return fromFileHandle(source); + } else if (isPromise(source)) { + return (async () => await RecordBatchReader.from(await source))(); + } else if (isFetchResponse(source) || isReadableDOMStream(source) || isReadableNodeStream(source) || isAsyncIterable(source)) { + return fromAsyncByteStream(new AsyncByteStream(source)); + } + return fromByteStream(new ByteStream(source)); + } + + public static readAll(source: T): T extends RecordBatchReaders ? 
IterableIterator : AsyncIterableIterator; + public static readAll(source: FromArg0): IterableIterator>; + public static readAll(source: FromArg1): AsyncIterableIterator>; + public static readAll(source: FromArg2): IterableIterator | RecordBatchStreamReader>; + public static readAll(source: FromArg3): AsyncIterableIterator | RecordBatchStreamReader>; + public static readAll(source: FromArg4): AsyncIterableIterator | AsyncRecordBatchReaders>; + public static readAll(source: FromArg5): AsyncIterableIterator | AsyncRecordBatchStreamReader>; + /** @nocollapse */ + public static readAll(source: any) { + if (source instanceof RecordBatchReader) { + return source.isSync() ? readAllSync(source) : readAllAsync(source as AsyncRecordBatchReaders); + } else if (isArrowJSON(source) || ArrayBuffer.isView(source) || isIterable(source) || isIteratorResult(source)) { + return readAllSync(source) as IterableIterator>; + } + return readAllAsync(source) as AsyncIterableIterator | AsyncRecordBatchReaders>; + } +} + +// +// Since TS is a structural type system, we define the following subclass stubs +// so that concrete types exist to associate with with the interfaces below. +// +// The implementation for each RecordBatchReader is hidden away in the set of +// `RecordBatchReaderImpl` classes in the second half of this file. This allows +// us to export a single RecordBatchReader class, and swap out the impl based +// on the io primitives or underlying arrow (JSON, file, or stream) at runtime. +// +// Async/await makes our job a bit harder, since it forces everything to be +// either fully sync or fully async. This is why the logic for the reader impls +// has been duplicated into both sync and async variants. Since the RBR +// delegates to its impl, an RBR with an AsyncRecordBatchFileReaderImpl for +// example will return async/await-friendly Promises, but one with a (sync) +// RecordBatchStreamReaderImpl will always return values. Nothing should be +// different about their logic, aside from the async handling. This is also why +// this code looks highly structured, as it should be nearly identical and easy +// to follow. 
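In practice, one consequence of this design is that a single byte source can carry several Arrow IPC streams back to back: readAll() hands out one reader per logical stream while reusing the same underlying source (readAllSync/readAllAsync below reset and reopen the reader until the source is exhausted). A sketch, not part of the patch, using only the readAll, iteration, and schema APIs shown in this file:

import { RecordBatchReader } from 'apache-arrow';

// Summarize each logical IPC stream contained in one byte source.
async function describeStreams(source: NodeJS.ReadableStream) {
    for await (const reader of RecordBatchReader.readAll(source)) {
        let rows = 0, batches = 0;
        for await (const batch of reader) { batches += 1; rows += batch.length; }
        console.log(`${batches} batches, ${rows} rows, schema: ${reader.schema.fields.map((f) => f.name)}`);
    }
}
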
+// + +/** @ignore */ +export class RecordBatchStreamReader extends RecordBatchReader { + constructor(protected _impl: RecordBatchStreamReaderImpl) { super (_impl); } + public [Symbol.iterator]() { return (this._impl as IterableIterator>)[Symbol.iterator](); } + public async *[Symbol.asyncIterator](): AsyncIterableIterator> { yield* this[Symbol.iterator](); } +} +/** @ignore */ +export class AsyncRecordBatchStreamReader extends RecordBatchReader { + constructor(protected _impl: AsyncRecordBatchStreamReaderImpl) { super (_impl); } + public [Symbol.iterator](): IterableIterator> { throw new Error(`AsyncRecordBatchStreamReader is not Iterable`); } + public [Symbol.asyncIterator]() { return (this._impl as AsyncIterableIterator>)[Symbol.asyncIterator](); } +} +/** @ignore */ +export class RecordBatchFileReader extends RecordBatchStreamReader { + constructor(protected _impl: RecordBatchFileReaderImpl) { super (_impl); } +} +/** @ignore */ +export class AsyncRecordBatchFileReader extends AsyncRecordBatchStreamReader { + constructor(protected _impl: AsyncRecordBatchFileReaderImpl) { super (_impl); } +} + +// +// Now override the return types for each sync/async RecordBatchReader variant +// + +/** @ignore */ +export interface RecordBatchStreamReader extends RecordBatchReader { + open(options?: OpenOptions | undefined): this; + cancel(): void; + throw(value?: any): IteratorResult; + return(value?: any): IteratorResult; + next(value?: any): IteratorResult>; +} + +/** @ignore */ +export interface AsyncRecordBatchStreamReader extends RecordBatchReader { + open(options?: OpenOptions | undefined): Promise; + cancel(): Promise; + throw(value?: any): Promise>; + return(value?: any): Promise>; + next(value?: any): Promise>>; +} + +/** @ignore */ +export interface RecordBatchFileReader extends RecordBatchStreamReader { + footer: Footer; + readRecordBatch(index: number): RecordBatch | null; +} + +/** @ignore */ +export interface AsyncRecordBatchFileReader extends AsyncRecordBatchStreamReader { + footer: Footer; + readRecordBatch(index: number): Promise | null>; +} + +/** @ignore */ +type RecordBatchReaderImpls = + RecordBatchJSONReaderImpl | + RecordBatchFileReaderImpl | + RecordBatchStreamReaderImpl | + AsyncRecordBatchFileReaderImpl | + AsyncRecordBatchStreamReaderImpl; + +/** @ignore */ +interface RecordBatchReaderImpl { + + closed: boolean; + schema: Schema; + autoDestroy: boolean; + dictionaries: Map; + + isFile(): this is RecordBatchFileReaders; + isStream(): this is RecordBatchStreamReaders; + isSync(): this is RecordBatchReaders; + isAsync(): this is AsyncRecordBatchReaders; + + reset(schema?: Schema | null): this; +} + +/** @ignore */ +interface RecordBatchStreamReaderImpl extends RecordBatchReaderImpl { + + open(options?: OpenOptions): this; + cancel(): void; + + throw(value?: any): IteratorResult; + return(value?: any): IteratorResult; + next(value?: any): IteratorResult>; + + [Symbol.iterator](): IterableIterator>; +} + +/** @ignore */ +interface AsyncRecordBatchStreamReaderImpl extends RecordBatchReaderImpl { + + open(options?: OpenOptions): Promise; + cancel(): Promise; + + throw(value?: any): Promise>; + return(value?: any): Promise>; + next(value?: any): Promise>>; + + [Symbol.asyncIterator](): AsyncIterableIterator>; +} + +/** @ignore */ +interface RecordBatchFileReaderImpl extends RecordBatchStreamReaderImpl { + readRecordBatch(index: number): RecordBatch | null; +} + +/** @ignore */ +interface AsyncRecordBatchFileReaderImpl extends AsyncRecordBatchStreamReaderImpl { + 
readRecordBatch(index: number): Promise | null>; +} + +/** @ignore */ +abstract class RecordBatchReaderImpl implements RecordBatchReaderImpl { + + // @ts-ignore + public schema: Schema; + public closed = false; + public autoDestroy = true; + public dictionaries: Map; + + protected _dictionaryIndex = 0; + protected _recordBatchIndex = 0; + public get numDictionaries() { return this._dictionaryIndex; } + public get numRecordBatches() { return this._recordBatchIndex; } + + constructor(dictionaries = new Map()) { + this.dictionaries = dictionaries; + } + + public isSync(): this is RecordBatchReaders { return false; } + public isAsync(): this is AsyncRecordBatchReaders { return false; } + public isFile(): this is RecordBatchFileReaders { return false; } + public isStream(): this is RecordBatchStreamReaders { return false; } + + public reset(schema?: Schema | null) { + this._dictionaryIndex = 0; + this._recordBatchIndex = 0; + this.schema = schema; + this.dictionaries = new Map(); + return this; + } + + protected _loadRecordBatch(header: metadata.RecordBatch, body: any) { + return new RecordBatch(this.schema, header.length, this._loadVectors(header, body, this.schema.fields)); + } + protected _loadDictionaryBatch(header: metadata.DictionaryBatch, body: any) { + const { id, isDelta, data } = header; + const { dictionaries, schema } = this; + if (isDelta || !dictionaries.get(id)) { + + const type = schema.dictionaries.get(id)!; + const vector = (isDelta ? dictionaries.get(id)!.concat( + Vector.new(this._loadVectors(data, body, [type])[0])) : + Vector.new(this._loadVectors(data, body, [type])[0])) as Vector; + + (schema.dictionaryFields.get(id) || []).forEach(({ type }) => type.dictionaryVector = vector); + + return vector; + } + return dictionaries.get(id)!; + } + protected _loadVectors(header: metadata.RecordBatch, body: any, types: (Field | DataType)[]) { + return new VectorLoader(body, header.nodes, header.buffers).visitMany(types); + } +} + +/** @ignore */ +class RecordBatchStreamReaderImpl extends RecordBatchReaderImpl implements IterableIterator> { + + protected _reader: MessageReader; + protected _handle: ByteStream | ArrowJSONLike; + + constructor(source: ByteStream | ArrowJSONLike, dictionaries?: Map) { + super(dictionaries); + this._reader = !isArrowJSON(source) + ? 
new MessageReader(this._handle = source) + : new JSONMessageReader(this._handle = source); + } + + public isSync(): this is RecordBatchReaders { return true; } + public isStream(): this is RecordBatchStreamReaders { return true; } + public [Symbol.iterator](): IterableIterator> { + return this as IterableIterator>; + } + public cancel() { + if (!this.closed && (this.closed = true)) { + this.reset()._reader.return(); + this._reader = null; + this.dictionaries = null; + } + } + public open(options?: OpenOptions) { + if (!this.closed) { + this.autoDestroy = shouldAutoDestroy(this, options); + if (!(this.schema || (this.schema = this._reader.readSchema()!))) { + this.cancel(); + } + } + return this; + } + public throw(value?: any): IteratorResult { + if (!this.closed && this.autoDestroy && (this.closed = true)) { + return this.reset()._reader.throw(value); + } + return ITERATOR_DONE; + } + public return(value?: any): IteratorResult { + if (!this.closed && this.autoDestroy && (this.closed = true)) { + return this.reset()._reader.return(value); + } + return ITERATOR_DONE; + } + public next(): IteratorResult> { + if (this.closed) { return ITERATOR_DONE; } + let message: Message | null, { _reader: reader } = this; + while (message = this._readNextMessageAndValidate()) { + if (message.isSchema()) { + this.reset(message.header()); + } else if (message.isRecordBatch()) { + this._recordBatchIndex++; + const header = message.header(); + const buffer = reader.readMessageBody(message.bodyLength); + const recordBatch = this._loadRecordBatch(header, buffer); + return { done: false, value: recordBatch }; + } else if (message.isDictionaryBatch()) { + this._dictionaryIndex++; + const header = message.header(); + const buffer = reader.readMessageBody(message.bodyLength); + const vector = this._loadDictionaryBatch(header, buffer); + this.dictionaries.set(header.id, vector); + } + } + return this.return(); + } + protected _readNextMessageAndValidate(type?: T | null) { + return this._reader.readMessage(type); + } +} + +/** @ignore */ +class AsyncRecordBatchStreamReaderImpl extends RecordBatchReaderImpl implements AsyncIterableIterator> { + + protected _handle: AsyncByteStream; + protected _reader: AsyncMessageReader; + + constructor(source: AsyncByteStream, dictionaries?: Map) { + super(dictionaries); + this._reader = new AsyncMessageReader(this._handle = source); + } + public isAsync(): this is AsyncRecordBatchReaders { return true; } + public isStream(): this is RecordBatchStreamReaders { return true; } + public [Symbol.asyncIterator](): AsyncIterableIterator> { + return this as AsyncIterableIterator>; + } + public async cancel() { + if (!this.closed && (this.closed = true)) { + await this.reset()._reader.return(); + this._reader = null; + this.dictionaries = null; + } + } + public async open(options?: OpenOptions) { + if (!this.closed) { + this.autoDestroy = shouldAutoDestroy(this, options); + if (!(this.schema || (this.schema = (await this._reader.readSchema())!))) { + await this.cancel(); + } + } + return this; + } + public async throw(value?: any): Promise> { + if (!this.closed && this.autoDestroy && (this.closed = true)) { + return await this.reset()._reader.throw(value); + } + return ITERATOR_DONE; + } + public async return(value?: any): Promise> { + if (!this.closed && this.autoDestroy && (this.closed = true)) { + return await this.reset()._reader.return(value); + } + return ITERATOR_DONE; + } + public async next() { + if (this.closed) { return ITERATOR_DONE; } + let message: Message | null, { _reader: 
reader } = this; + while (message = await this._readNextMessageAndValidate()) { + if (message.isSchema()) { + await this.reset(message.header()); + } else if (message.isRecordBatch()) { + this._recordBatchIndex++; + const header = message.header(); + const buffer = await reader.readMessageBody(message.bodyLength); + const recordBatch = this._loadRecordBatch(header, buffer); + return { done: false, value: recordBatch }; + } else if (message.isDictionaryBatch()) { + this._dictionaryIndex++; + const header = message.header(); + const buffer = await reader.readMessageBody(message.bodyLength); + const vector = this._loadDictionaryBatch(header, buffer); + this.dictionaries.set(header.id, vector); + } + } + return await this.return(); + } + protected async _readNextMessageAndValidate(type?: T | null) { + return await this._reader.readMessage(type); + } +} + +/** @ignore */ +class RecordBatchFileReaderImpl extends RecordBatchStreamReaderImpl { + + // @ts-ignore + protected _footer?: Footer; + // @ts-ignore + protected _handle: RandomAccessFile; + public get footer() { return this._footer!; } + public get numDictionaries() { return this._footer ? this._footer.numDictionaries : 0; } + public get numRecordBatches() { return this._footer ? this._footer.numRecordBatches : 0; } + + constructor(source: RandomAccessFile | ArrayBufferViewInput, dictionaries?: Map) { + super(source instanceof RandomAccessFile ? source : new RandomAccessFile(source), dictionaries); + } + public isSync(): this is RecordBatchReaders { return true; } + public isFile(): this is RecordBatchFileReaders { return true; } + public open(options?: OpenOptions) { + if (!this.closed && !this._footer) { + this.schema = (this._footer = this._readFooter()).schema; + for (const block of this._footer.dictionaryBatches()) { + block && this._readDictionaryBatch(this._dictionaryIndex++); + } + } + return super.open(options); + } + public readRecordBatch(index: number) { + if (this.closed) { return null; } + if (!this._footer) { this.open(); } + const block = this._footer && this._footer.getRecordBatch(index); + if (block && this._handle.seek(block.offset)) { + const message = this._reader.readMessage(MessageHeader.RecordBatch); + if (message && message.isRecordBatch()) { + const header = message.header(); + const buffer = this._reader.readMessageBody(message.bodyLength); + const recordBatch = this._loadRecordBatch(header, buffer); + return recordBatch; + } + } + return null; + } + protected _readDictionaryBatch(index: number) { + const block = this._footer && this._footer.getDictionaryBatch(index); + if (block && this._handle.seek(block.offset)) { + const message = this._reader.readMessage(MessageHeader.DictionaryBatch); + if (message && message.isDictionaryBatch()) { + const header = message.header(); + const buffer = this._reader.readMessageBody(message.bodyLength); + const vector = this._loadDictionaryBatch(header, buffer); + this.dictionaries.set(header.id, vector); + } + } + } + protected _readFooter() { + const { _handle } = this; + const offset = _handle.size - magicAndPadding; + const length = _handle.readInt32(offset); + const buffer = _handle.readAt(offset - length, length); + return Footer.decode(buffer); + } + protected _readNextMessageAndValidate(type?: T | null): Message | null { + if (!this._footer) { this.open(); } + if (this._footer && this._recordBatchIndex < this.numRecordBatches) { + const block = this._footer && this._footer.getRecordBatch(this._recordBatchIndex); + if (block && this._handle.seek(block.offset)) { + return 
this._reader.readMessage(type); + } + } + return null; + } +} + +/** @ignore */ +class AsyncRecordBatchFileReaderImpl extends AsyncRecordBatchStreamReaderImpl + implements AsyncRecordBatchFileReaderImpl { + + protected _footer?: Footer; + // @ts-ignore + protected _handle: AsyncRandomAccessFile; + public get footer() { return this._footer!; } + public get numDictionaries() { return this._footer ? this._footer.numDictionaries : 0; } + public get numRecordBatches() { return this._footer ? this._footer.numRecordBatches : 0; } + + constructor(source: FileHandle, byteLength?: number, dictionaries?: Map); + constructor(source: FileHandle | AsyncRandomAccessFile, dictionaries?: Map); + constructor(source: FileHandle | AsyncRandomAccessFile, ...rest: any[]) { + const byteLength = typeof rest[0] !== 'number' ? rest.shift() : undefined; + const dictionaries = rest[0] instanceof Map ? > rest.shift() : undefined; + super(source instanceof AsyncRandomAccessFile ? source : new AsyncRandomAccessFile(source, byteLength), dictionaries); + } + public isFile(): this is RecordBatchFileReaders { return true; } + public isAsync(): this is AsyncRecordBatchReaders { return true; } + public async open(options?: OpenOptions) { + if (!this.closed && !this._footer) { + this.schema = (this._footer = await this._readFooter()).schema; + for (const block of this._footer.dictionaryBatches()) { + block && await this._readDictionaryBatch(this._dictionaryIndex++); + } + } + return await super.open(options); + } + public async readRecordBatch(index: number) { + if (this.closed) { return null; } + if (!this._footer) { await this.open(); } + const block = this._footer && this._footer.getRecordBatch(index); + if (block && (await this._handle.seek(block.offset))) { + const message = await this._reader.readMessage(MessageHeader.RecordBatch); + if (message && message.isRecordBatch()) { + const header = message.header(); + const buffer = await this._reader.readMessageBody(message.bodyLength); + const recordBatch = this._loadRecordBatch(header, buffer); + return recordBatch; + } + } + return null; + } + protected async _readDictionaryBatch(index: number) { + const block = this._footer && this._footer.getDictionaryBatch(index); + if (block && (await this._handle.seek(block.offset))) { + const message = await this._reader.readMessage(MessageHeader.DictionaryBatch); + if (message && message.isDictionaryBatch()) { + const header = message.header(); + const buffer = await this._reader.readMessageBody(message.bodyLength); + const vector = this._loadDictionaryBatch(header, buffer); + this.dictionaries.set(header.id, vector); + } + } + } + protected async _readFooter() { + const { _handle } = this; + _handle._pending && await _handle._pending; + const offset = _handle.size - magicAndPadding; + const length = await _handle.readInt32(offset); + const buffer = await _handle.readAt(offset - length, length); + return Footer.decode(buffer); + } + protected async _readNextMessageAndValidate(type?: T | null): Promise | null> { + if (!this._footer) { await this.open(); } + if (this._footer && this._recordBatchIndex < this.numRecordBatches) { + const block = this._footer.getRecordBatch(this._recordBatchIndex); + if (block && await this._handle.seek(block.offset)) { + return await this._reader.readMessage(type); + } + } + return null; + } +} + +/** @ignore */ +class RecordBatchJSONReaderImpl extends RecordBatchStreamReaderImpl { + constructor(source: ArrowJSONLike, dictionaries?: Map) { + super(source, dictionaries); + } + protected 
_loadVectors(header: metadata.RecordBatch, body: any, types: (Field | DataType)[]) { + return new JSONVectorLoader(body, header.nodes, header.buffers).visitMany(types); + } +} + +// +// Define some helper functions and static implementations down here. There's +// a bit of branching in the static methods that can lead to the same routines +// being executed, so we've broken those out here for readability. +// + +/** @ignore */ +function shouldAutoDestroy(self: { autoDestroy: boolean }, options?: OpenOptions) { + return options && (typeof options['autoDestroy'] === 'boolean') ? options['autoDestroy'] : self['autoDestroy']; +} + +/** @ignore */ +function* readAllSync(source: RecordBatchReaders | FromArg0 | FromArg2) { + const reader = RecordBatchReader.from( source) as RecordBatchReaders; + try { + if (!reader.open({ autoDestroy: false }).closed) { + do { yield reader; } while (!(reader.reset().open()).closed); + } + } finally { reader.cancel(); } +} + +/** @ignore */ +async function* readAllAsync(source: AsyncRecordBatchReaders | FromArg1 | FromArg3 | FromArg4 | FromArg5) { + const reader = await RecordBatchReader.from( source) as RecordBatchReader; + try { + if (!(await reader.open({ autoDestroy: false })).closed) { + do { yield reader; } while (!(await reader.reset().open()).closed); + } + } finally { await reader.cancel(); } +} + +/** @ignore */ +function fromArrowJSON(source: ArrowJSONLike) { + return new RecordBatchStreamReader(new RecordBatchJSONReaderImpl(source)); +} + +/** @ignore */ +function fromByteStream(source: ByteStream) { + const bytes = source.peek((magicLength + 7) & ~7); + return bytes && bytes.byteLength >= 4 ? !checkForMagicArrowString(bytes) + ? new RecordBatchStreamReader(new RecordBatchStreamReaderImpl(source)) + : new RecordBatchFileReader(new RecordBatchFileReaderImpl(source.read())) + : new RecordBatchStreamReader(new RecordBatchStreamReaderImpl(function*(): any {}())); +} + +/** @ignore */ +async function fromAsyncByteStream(source: AsyncByteStream) { + const bytes = await source.peek((magicLength + 7) & ~7); + return bytes && bytes.byteLength >= 4 ? !checkForMagicArrowString(bytes) + ? new AsyncRecordBatchStreamReader(new AsyncRecordBatchStreamReaderImpl(source)) + : new RecordBatchFileReader(new RecordBatchFileReaderImpl(await source.read())) + : new AsyncRecordBatchStreamReader(new AsyncRecordBatchStreamReaderImpl(async function*(): any {}())); +} + +/** @ignore */ +async function fromFileHandle(source: FileHandle) { + const { size } = await source.stat(); + const file = new AsyncRandomAccessFile(source, size); + if (size >= magicX2AndPadding) { + if (checkForMagicArrowString(await file.readAt(0, (magicLength + 7) & ~7))) { + return new AsyncRecordBatchFileReader(new AsyncRecordBatchFileReaderImpl(file)); + } + } + return new AsyncRecordBatchStreamReader(new AsyncRecordBatchStreamReaderImpl(file)); +} diff --git a/js/src/ipc/reader/arrow.ts b/js/src/ipc/reader/arrow.ts deleted file mode 100644 index 1847c9c2eb628..0000000000000 --- a/js/src/ipc/reader/arrow.ts +++ /dev/null @@ -1,55 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -import { readJSON } from './json'; -import { fromReadableStream } from './node'; -import { RecordBatch } from '../../recordbatch'; -import { readBuffers, readBuffersAsync } from './binary'; -import { readRecordBatches, readRecordBatchesAsync, TypeDataLoader } from './vector'; -import { Schema } from '../../type'; -import { Message } from '../metadata'; - -export { readJSON, RecordBatch }; -export { readBuffers, readBuffersAsync }; -export { readRecordBatches, readRecordBatchesAsync }; - -export function* read(sources: Iterable | object | string) { - let input: any = sources; - let messages: Iterable<{ schema: Schema, message: Message, loader: TypeDataLoader }>; - if (typeof input === 'string') { - try { input = JSON.parse(input); } - catch (e) { input = sources; } - } - if (!input || typeof input !== 'object') { - messages = (typeof input === 'string') ? readBuffers([input]) : []; - } else { - messages = (typeof input[Symbol.iterator] === 'function') ? readBuffers(input) : readJSON(input); - } - yield* readRecordBatches(messages); -} - -export async function* readAsync(sources: AsyncIterable) { - for await (let recordBatch of readRecordBatchesAsync(readBuffersAsync(sources))) { - yield recordBatch; - } -} - -export async function* readStream(stream: NodeJS.ReadableStream) { - for await (const recordBatch of readAsync(fromReadableStream(stream))) { - yield recordBatch as RecordBatch; - } -} diff --git a/js/src/ipc/reader/binary.ts b/js/src/ipc/reader/binary.ts deleted file mode 100644 index 988ce606b2614..0000000000000 --- a/js/src/ipc/reader/binary.ts +++ /dev/null @@ -1,432 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -import { Vector } from '../../vector'; -import { flatbuffers } from 'flatbuffers'; -import { TypeDataLoader } from './vector'; -import { checkForMagicArrowString, PADDING, magicAndPadding, isValidArrowFile } from '../magic'; -import { Message, Footer, FileBlock, RecordBatchMetadata, DictionaryBatch, BufferMetadata, FieldMetadata, } from '../metadata'; -import { - Schema, Field, - DataType, Dictionary, - Null, TimeBitWidth, - Binary, Bool, Utf8, Decimal, - Date_, Time, Timestamp, Interval, - List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, -} from '../../type'; - -import { - Int8, Uint8, - Int16, Uint16, - Int32, Uint32, - Int64, Uint64, - Float16, Float64, Float32, -} from '../../type'; - -import ByteBuffer = flatbuffers.ByteBuffer; - -type MessageReader = (bb: ByteBuffer) => IterableIterator; - -export function* readBuffers(sources: Iterable | Uint8Array | Buffer | string) { - let schema: Schema | null = null; - let dictionaries = new Map(); - let readMessages: MessageReader | null = null; - if (ArrayBuffer.isView(sources) || typeof sources === 'string') { - sources = [sources as T]; - } - for (const source of sources) { - const bb = toByteBuffer(source); - if ((!schema && ({ schema, readMessages } = readSchema(bb)) || true) && schema && readMessages) { - for (const message of readMessages(bb)) { - yield { - schema, message, - loader: new BinaryDataLoader( - bb, - arrayIterator((message as any).nodes || []), - arrayIterator((message as any).buffers || []), - dictionaries - ) - }; - } - } - } -} - -export async function* readBuffersAsync(sources: AsyncIterable) { - let schema: Schema | null = null; - let dictionaries = new Map(); - let readMessages: MessageReader | null = null; - for await (const source of sources) { - const bb = toByteBuffer(source); - if ((!schema && ({ schema, readMessages } = readSchema(bb)) || true) && schema && readMessages) { - for (const message of readMessages(bb)) { - yield { - schema, message, - loader: new BinaryDataLoader( - bb, - arrayIterator((message as any).nodes || []), - arrayIterator((message as any).buffers || []), - dictionaries - ) - }; - } - } - } -} - -export class BinaryDataLoader extends TypeDataLoader { - private bytes: Uint8Array; - private messageOffset: number; - constructor(bb: ByteBuffer, nodes: Iterator, buffers: Iterator, dictionaries: Map) { - super(nodes, buffers, dictionaries); - this.bytes = bb.bytes(); - this.messageOffset = bb.position(); - } - protected readOffsets(type: T, buffer?: BufferMetadata) { return this.readData(type, buffer); } - protected readTypeIds(type: T, buffer?: BufferMetadata) { return this.readData(type, buffer); } - protected readData(_type: T, { length, offset }: BufferMetadata = this.getBufferMetadata()) { - return new Uint8Array(this.bytes.buffer, this.bytes.byteOffset + this.messageOffset + offset, length); - } -} - -function* arrayIterator(arr: Array) { yield* arr; } - -function toByteBuffer(bytes?: Uint8Array | Buffer | string) { - let arr: Uint8Array = bytes as any || new Uint8Array(0); - if (typeof bytes === 'string') { - arr = new Uint8Array(bytes.length); - for (let i = -1, n = bytes.length; ++i < n;) { - arr[i] = bytes.charCodeAt(i); - } - return new ByteBuffer(arr); - } - return new ByteBuffer(arr); -} - -function readSchema(bb: ByteBuffer) { - let schema: Schema, readMessages, footer: Footer | null; - if (footer = readFileSchema(bb)) { - schema = footer.schema; - readMessages = readFileMessages(footer); - } else if (schema = readStreamSchema(bb)!) 
{ - readMessages = readStreamMessages; - } else { - throw new Error('Invalid Arrow buffer'); - } - return { schema, readMessages }; -} - -function readStreamSchema(bb: ByteBuffer) { - if (!checkForMagicArrowString(bb.bytes(), 0)) { - for (const message of readMessages(bb)) { - if (Message.isSchema(message)) { - return message as Schema; - } - } - } - return null; -} - -function* readStreamMessages(bb: ByteBuffer) { - for (const message of readMessages(bb)) { - if (Message.isRecordBatch(message)) { - yield message; - } else if (Message.isDictionaryBatch(message)) { - yield message; - } else { - yield message; - } - // position the buffer after the body to read the next message - bb.setPosition(bb.position() + message.bodyLength); - } -} - -function readFileSchema(bb: ByteBuffer) { - if (!isValidArrowFile(bb)) { - return null; - } - let fileLength = bb.capacity(); - let lengthOffset = fileLength - magicAndPadding; - let footerLength = bb.readInt32(lengthOffset); - bb.setPosition(lengthOffset - footerLength); - return footerFromByteBuffer(bb); -} - -function readFileMessages(footer: Footer) { - return function* (bb: ByteBuffer) { - let message: RecordBatchMetadata | DictionaryBatch; - for (let i = -1, batches = footer.dictionaryBatches, n = batches.length; ++i < n;) { - bb.setPosition(batches[i].offset); - if (message = readMessage(bb, bb.readInt32(bb.position())) as DictionaryBatch) { - yield message; - } - } - for (let i = -1, batches = footer.recordBatches, n = batches.length; ++i < n;) { - bb.setPosition(batches[i].offset); - if (message = readMessage(bb, bb.readInt32(bb.position())) as RecordBatchMetadata) { - yield message; - } - } - }; -} - -function* readMessages(bb: ByteBuffer) { - let length: number, message: Schema | RecordBatchMetadata | DictionaryBatch; - while (bb.position() < bb.capacity() && - (length = bb.readInt32(bb.position())) > 0) { - if (message = readMessage(bb, length)!) 
{ - yield message; - } - } -} - -function readMessage(bb: ByteBuffer, length: number) { - bb.setPosition(bb.position() + PADDING); - const message = messageFromByteBuffer(bb); - bb.setPosition(bb.position() + length); - return message; -} - -import * as File_ from '../../fb/File'; -import * as Schema_ from '../../fb/Schema'; -import * as Message_ from '../../fb/Message'; - -import Type = Schema_.org.apache.arrow.flatbuf.Type; -import Precision = Schema_.org.apache.arrow.flatbuf.Precision; -import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader; -import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; -import _Footer = File_.org.apache.arrow.flatbuf.Footer; -import _Block = File_.org.apache.arrow.flatbuf.Block; -import _Message = Message_.org.apache.arrow.flatbuf.Message; -import _Schema = Schema_.org.apache.arrow.flatbuf.Schema; -import _Field = Schema_.org.apache.arrow.flatbuf.Field; -import _RecordBatch = Message_.org.apache.arrow.flatbuf.RecordBatch; -import _DictionaryBatch = Message_.org.apache.arrow.flatbuf.DictionaryBatch; -import _FieldNode = Message_.org.apache.arrow.flatbuf.FieldNode; -import _Buffer = Schema_.org.apache.arrow.flatbuf.Buffer; -import _DictionaryEncoding = Schema_.org.apache.arrow.flatbuf.DictionaryEncoding; -import _Null = Schema_.org.apache.arrow.flatbuf.Null; -import _Int = Schema_.org.apache.arrow.flatbuf.Int; -import _FloatingPoint = Schema_.org.apache.arrow.flatbuf.FloatingPoint; -import _Binary = Schema_.org.apache.arrow.flatbuf.Binary; -import _Bool = Schema_.org.apache.arrow.flatbuf.Bool; -import _Utf8 = Schema_.org.apache.arrow.flatbuf.Utf8; -import _Decimal = Schema_.org.apache.arrow.flatbuf.Decimal; -import _Date = Schema_.org.apache.arrow.flatbuf.Date; -import _Time = Schema_.org.apache.arrow.flatbuf.Time; -import _Timestamp = Schema_.org.apache.arrow.flatbuf.Timestamp; -import _Interval = Schema_.org.apache.arrow.flatbuf.Interval; -import _List = Schema_.org.apache.arrow.flatbuf.List; -import _Struct = Schema_.org.apache.arrow.flatbuf.Struct_; -import _Union = Schema_.org.apache.arrow.flatbuf.Union; -import _FixedSizeBinary = Schema_.org.apache.arrow.flatbuf.FixedSizeBinary; -import _FixedSizeList = Schema_.org.apache.arrow.flatbuf.FixedSizeList; -import _Map = Schema_.org.apache.arrow.flatbuf.Map; - -function footerFromByteBuffer(bb: ByteBuffer) { - const dictionaryFields = new Map>(); - const f = _Footer.getRootAsFooter(bb), s = f.schema()!; - return new Footer( - dictionaryBatchesFromFooter(f), recordBatchesFromFooter(f), - new Schema(fieldsFromSchema(s, dictionaryFields), customMetadata(s), f.version(), dictionaryFields) - ); -} - -function messageFromByteBuffer(bb: ByteBuffer) { - const m = _Message.getRootAsMessage(bb)!, type = m.headerType(), version = m.version(); - switch (type) { - case MessageHeader.Schema: return schemaFromMessage(version, m.header(new _Schema())!, new Map()); - case MessageHeader.RecordBatch: return recordBatchFromMessage(version, m, m.header(new _RecordBatch())!); - case MessageHeader.DictionaryBatch: return dictionaryBatchFromMessage(version, m, m.header(new _DictionaryBatch())!); - } - return null; - // throw new Error(`Unrecognized Message type '${type}'`); -} - -function schemaFromMessage(version: MetadataVersion, s: _Schema, dictionaryFields: Map>) { - return new Schema(fieldsFromSchema(s, dictionaryFields), customMetadata(s), version, dictionaryFields); -} - -function recordBatchFromMessage(version: MetadataVersion, m: _Message, b: _RecordBatch) { - return new 
RecordBatchMetadata(version, b.length(), fieldNodesFromRecordBatch(b), buffersFromRecordBatch(b, version), m.bodyLength()); -} - -function dictionaryBatchFromMessage(version: MetadataVersion, m: _Message, d: _DictionaryBatch) { - return new DictionaryBatch(version, recordBatchFromMessage(version, m, d.data()!), d.id(), d.isDelta()); -} - -function dictionaryBatchesFromFooter(f: _Footer) { - const blocks = [] as FileBlock[]; - for (let b: _Block, i = -1, n = f && f.dictionariesLength(); ++i < n;) { - if (b = f.dictionaries(i)!) { - blocks.push(new FileBlock(b.metaDataLength(), b.bodyLength(), b.offset())); - } - } - return blocks; -} - -function recordBatchesFromFooter(f: _Footer) { - const blocks = [] as FileBlock[]; - for (let b: _Block, i = -1, n = f && f.recordBatchesLength(); ++i < n;) { - if (b = f.recordBatches(i)!) { - blocks.push(new FileBlock(b.metaDataLength(), b.bodyLength(), b.offset())); - } - } - return blocks; -} - -function fieldsFromSchema(s: _Schema, dictionaryFields: Map> | null) { - const fields = [] as Field[]; - for (let i = -1, c: Field | null, n = s && s.fieldsLength(); ++i < n;) { - if (c = field(s.fields(i)!, dictionaryFields)) { - fields.push(c); - } - } - return fields; -} - -function fieldsFromField(f: _Field, dictionaryFields: Map> | null) { - const fields = [] as Field[]; - for (let i = -1, c: Field | null, n = f && f.childrenLength(); ++i < n;) { - if (c = field(f.children(i)!, dictionaryFields)) { - fields.push(c); - } - } - return fields; -} - -function fieldNodesFromRecordBatch(b: _RecordBatch) { - const fieldNodes = [] as FieldMetadata[]; - for (let i = -1, n = b.nodesLength(); ++i < n;) { - fieldNodes.push(fieldNodeFromRecordBatch(b.nodes(i)!)); - } - return fieldNodes; -} - -function buffersFromRecordBatch(b: _RecordBatch, version: MetadataVersion) { - const buffers = [] as BufferMetadata[]; - for (let i = -1, n = b.buffersLength(); ++i < n;) { - let buffer = b.buffers(i)!; - // If this Arrow buffer was written before version 4, - // advance the buffer's bb_pos 8 bytes to skip past - // the now-removed page id field. - if (version < MetadataVersion.V4) { - buffer.bb_pos += (8 * (i + 1)); - } - buffers.push(bufferFromRecordBatch(buffer)); - } - return buffers; -} - -function field(f: _Field, dictionaryFields: Map> | null) { - let name = f.name()!; - let field: Field | void; - let nullable = f.nullable(); - let metadata = customMetadata(f); - let dataType: DataType | null; - let keysMeta: _Int | null, id: number; - let dictMeta: _DictionaryEncoding | null; - if (!dictionaryFields || !(dictMeta = f.dictionary())) { - if (dataType = typeFromField(f, fieldsFromField(f, dictionaryFields))) { - field = new Field(name, dataType, nullable, metadata); - } - } else if (dataType = dictionaryFields.has(id = dictMeta.id().low) - ? dictionaryFields.get(id)!.type.dictionary - : typeFromField(f, fieldsFromField(f, null))) { - dataType = new Dictionary(dataType, - // a dictionary index defaults to signed 32 bit int if unspecified - (keysMeta = dictMeta.indexType()) ? intFromField(keysMeta)! 
: new Int32(), - id, dictMeta.isOrdered() - ); - field = new Field(name, dataType, nullable, metadata); - dictionaryFields.has(id) || dictionaryFields.set(id, field as Field); - } - return field || null; -} - -function customMetadata(parent?: _Schema | _Field | null) { - const data = new Map(); - if (parent) { - for (let entry, key, i = -1, n = parent.customMetadataLength() | 0; ++i < n;) { - if ((entry = parent.customMetadata(i)) && (key = entry.key()) != null) { - data.set(key, entry.value()!); - } - } - } - return data; -} - -function fieldNodeFromRecordBatch(f: _FieldNode) { - return new FieldMetadata(f.length(), f.nullCount()); -} - -function bufferFromRecordBatch(b: _Buffer) { - return new BufferMetadata(b.offset(), b.length()); -} - -function typeFromField(f: _Field, children?: Field[]): DataType | null { - switch (f.typeType()) { - case Type.NONE: return null; - case Type.Null: return nullFromField(f.type(new _Null())!); - case Type.Int: return intFromField(f.type(new _Int())!); - case Type.FloatingPoint: return floatFromField(f.type(new _FloatingPoint())!); - case Type.Binary: return binaryFromField(f.type(new _Binary())!); - case Type.Utf8: return utf8FromField(f.type(new _Utf8())!); - case Type.Bool: return boolFromField(f.type(new _Bool())!); - case Type.Decimal: return decimalFromField(f.type(new _Decimal())!); - case Type.Date: return dateFromField(f.type(new _Date())!); - case Type.Time: return timeFromField(f.type(new _Time())!); - case Type.Timestamp: return timestampFromField(f.type(new _Timestamp())!); - case Type.Interval: return intervalFromField(f.type(new _Interval())!); - case Type.List: return listFromField(f.type(new _List())!, children || []); - case Type.Struct_: return structFromField(f.type(new _Struct())!, children || []); - case Type.Union: return unionFromField(f.type(new _Union())!, children || []); - case Type.FixedSizeBinary: return fixedSizeBinaryFromField(f.type(new _FixedSizeBinary())!); - case Type.FixedSizeList: return fixedSizeListFromField(f.type(new _FixedSizeList())!, children || []); - case Type.Map: return mapFromField(f.type(new _Map())!, children || []); - } - throw new Error(`Unrecognized type ${f.typeType()}`); -} - -function nullFromField (_type: _Null) { return new Null(); } -function intFromField (_type: _Int) { switch (_type.bitWidth()) { - case 8: return _type.isSigned() ? new Int8() : new Uint8(); - case 16: return _type.isSigned() ? new Int16() : new Uint16(); - case 32: return _type.isSigned() ? new Int32() : new Uint32(); - case 64: return _type.isSigned() ? 
new Int64() : new Uint64(); - } - return null; } -function floatFromField (_type: _FloatingPoint) { switch (_type.precision()) { - case Precision.HALF: return new Float16(); - case Precision.SINGLE: return new Float32(); - case Precision.DOUBLE: return new Float64(); - } - return null; } -function binaryFromField (_type: _Binary) { return new Binary(); } -function utf8FromField (_type: _Utf8) { return new Utf8(); } -function boolFromField (_type: _Bool) { return new Bool(); } -function decimalFromField (_type: _Decimal) { return new Decimal(_type.scale(), _type.precision()); } -function dateFromField (_type: _Date) { return new Date_(_type.unit()); } -function timeFromField (_type: _Time) { return new Time(_type.unit(), _type.bitWidth() as TimeBitWidth); } -function timestampFromField (_type: _Timestamp) { return new Timestamp(_type.unit(), _type.timezone()); } -function intervalFromField (_type: _Interval) { return new Interval(_type.unit()); } -function listFromField (_type: _List, children: Field[]) { return new List(children); } -function structFromField (_type: _Struct, children: Field[]) { return new Struct(children); } -function unionFromField (_type: _Union, children: Field[]) { return new Union(_type.mode(), (_type.typeIdsArray() || []) as Type[], children); } -function fixedSizeBinaryFromField(_type: _FixedSizeBinary) { return new FixedSizeBinary(_type.byteWidth()); } -function fixedSizeListFromField (_type: _FixedSizeList, children: Field[]) { return new FixedSizeList(_type.listSize(), children); } -function mapFromField (_type: _Map, children: Field[]) { return new Map_(_type.keysSorted(), children); } diff --git a/js/src/ipc/reader/json.ts b/js/src/ipc/reader/json.ts deleted file mode 100644 index 0f0c018d66bb9..0000000000000 --- a/js/src/ipc/reader/json.ts +++ /dev/null @@ -1,304 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
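// Hedged sketch of the integration-test JSON layout that readJSON() below consumes.
// Only the key names are taken from the parsing code in this file; the field,
// counts and values are made up for illustration.
const exampleJSON = {
  schema: { fields: [{ name: 'i32', nullable: true, type: { name: 'int', bitWidth: 32, isSigned: true }, children: [] }] },
  dictionaries: [],
  batches: [{ count: 3, columns: [{ name: 'i32', count: 3, VALIDITY: [1, 1, 0], DATA: [1, 2, 0], children: [] }] }]
};
for (const { schema, message, loader } of readJSON(exampleJSON)) {
  console.log(schema.fields.length, message.nodes.length);   // one entry per record/dictionary batch
}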
- -import { Vector } from '../../vector'; -import { flatbuffers } from 'flatbuffers'; -import { TypeDataLoader } from './vector'; -import { packBools } from '../../util/bit'; -import * as IntUtil from '../../util/int'; -import { TextEncoder } from 'text-encoding-utf-8'; -import { RecordBatchMetadata, DictionaryBatch, BufferMetadata, FieldMetadata } from '../metadata'; -import { - Schema, Field, - DataType, Dictionary, - Null, TimeBitWidth, - Binary, Bool, Utf8, Decimal, - Date_, Time, Timestamp, Interval, - List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, -} from '../../type'; - -import { - Int8, Uint8, - Int16, Uint16, - Int32, Uint32, - Int64, Uint64, - Float16, Float64, Float32, -} from '../../type'; - -import Long = flatbuffers.Long; - -export function* readJSON(json: any) { - const schema = schemaFromJSON(json['schema']); - const dictionaries = new Map(); - for (const batch of (json['dictionaries'] || [])) { - const message = dictionaryBatchFromJSON(batch); - yield { - schema, message, - loader: new JSONDataLoader( - flattenDataSources(batch['data']['columns']), - arrayIterator(message.nodes), - arrayIterator(message.buffers), - dictionaries - ) - }; - } - for (const batch of (json['batches'] || [])) { - const message = recordBatchFromJSON(batch); - yield { - schema, message, - loader: new JSONDataLoader( - flattenDataSources(batch['columns']), - arrayIterator(message.nodes), - arrayIterator(message.buffers), - dictionaries - ) - }; - } -} - -function* arrayIterator(arr: Array) { yield* arr; } -function flattenDataSources(xs: any[]): any[][] { - return (xs || []).reduce((buffers, column: any) => [ - ...buffers, - ...(column['VALIDITY'] && [column['VALIDITY']] || []), - ...(column['OFFSET'] && [column['OFFSET']] || []), - ...(column['TYPE'] && [column['TYPE']] || []), - ...(column['DATA'] && [column['DATA']] || []), - ...flattenDataSources(column['children']) - ], [] as any[][]); -} - -const utf8Encoder = new TextEncoder('utf-8'); - -export class JSONDataLoader extends TypeDataLoader { - constructor(private sources: any[][], nodes: Iterator, buffers: Iterator, dictionaries: Map) { - super(nodes, buffers, dictionaries); - } - protected readNullBitmap(_type: T, nullCount: number, { offset } = this.getBufferMetadata()) { - return nullCount <= 0 ? 
new Uint8Array(0) : packBools(this.sources[offset]); - } - protected readOffsets(_type: T, { offset }: BufferMetadata = this.getBufferMetadata()) { - return new Int32Array(this.sources[offset]); - } - protected readTypeIds(_type: T, { offset }: BufferMetadata = this.getBufferMetadata()) { - return new Int8Array(this.sources[offset]); - } - protected readData(type: T, { offset }: BufferMetadata = this.getBufferMetadata()) { - const { sources } = this; - if (DataType.isTimestamp(type) === true) { - return new Uint8Array(IntUtil.Int64.convertArray(sources[offset] as string[]).buffer); - } else if ((DataType.isInt(type) || DataType.isTime(type)) && type.bitWidth === 64) { - return new Uint8Array(IntUtil.Int64.convertArray(sources[offset] as string[]).buffer); - } else if (DataType.isDate(type) && type.unit === DateUnit.MILLISECOND) { - return new Uint8Array(IntUtil.Int64.convertArray(sources[offset] as string[]).buffer); - } else if (DataType.isDecimal(type) === true) { - return new Uint8Array(IntUtil.Int128.convertArray(sources[offset] as string[]).buffer); - } else if (DataType.isBinary(type) === true || DataType.isFixedSizeBinary(type) === true) { - return new Uint8Array(binaryDataFromJSON(sources[offset] as string[])); - } else if (DataType.isBool(type) === true) { - return new Uint8Array(packBools(sources[offset] as number[]).buffer); - } else if (DataType.isUtf8(type) === true) { - return utf8Encoder.encode((sources[offset] as string[]).join('')); - } else { - return toTypedArray(type.ArrayType, sources[offset].map((x) => +x)) as any; - } - } -} - -function binaryDataFromJSON(values: string[]) { - // "DATA": ["49BC7D5B6C47D2","3F5FB6D9322026"] - // There are definitely more efficient ways to do this... but it gets the - // job done. - const joined = values.join(''); - const data = new Uint8Array(joined.length / 2); - for (let i = 0; i < joined.length; i += 2) { - data[i >> 1] = parseInt(joined.substr(i, 2), 16); - } - return data.buffer; -} - -import * as Schema_ from '../../fb/Schema'; -import Type = Schema_.org.apache.arrow.flatbuf.Type; -import DateUnit = Schema_.org.apache.arrow.flatbuf.DateUnit; -import TimeUnit = Schema_.org.apache.arrow.flatbuf.TimeUnit; -import UnionMode = Schema_.org.apache.arrow.flatbuf.UnionMode; -import Precision = Schema_.org.apache.arrow.flatbuf.Precision; -import IntervalUnit = Schema_.org.apache.arrow.flatbuf.IntervalUnit; -import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; -import { toTypedArray } from '../../data'; - -function schemaFromJSON(s: any): Schema { - const dictionaryFields = new Map>(); - return new Schema( - fieldsFromJSON(s['fields'], dictionaryFields), - customMetadata(s['customMetadata']), - MetadataVersion.V4, dictionaryFields - ); -} - -function recordBatchFromJSON(b: any): RecordBatchMetadata { - return new RecordBatchMetadata( - MetadataVersion.V4, - b['count'], - fieldNodesFromJSON(b['columns']), - buffersFromJSON(b['columns']) - ); -} - -function dictionaryBatchFromJSON(b: any): DictionaryBatch { - return new DictionaryBatch( - MetadataVersion.V4, - recordBatchFromJSON(b['data']), - b['id'], b['isDelta'] - ); -} - -function fieldsFromJSON(fs: any[], dictionaryFields: Map> | null): Field[] { - return (fs || []) - .map((f) => fieldFromJSON(f, dictionaryFields)) - .filter((f) => f != null) as Field[]; -} - -function fieldNodesFromJSON(xs: any[]): FieldMetadata[] { - return (xs || []).reduce((fieldNodes, column: any) => [ - ...fieldNodes, - new FieldMetadata( - new Long(column['count'], 0), - new 
Long(nullCountFromJSON(column['VALIDITY']), 0) - ), - ...fieldNodesFromJSON(column['children']) - ], [] as FieldMetadata[]); -} - -function buffersFromJSON(xs: any[], buffers: BufferMetadata[] = []): BufferMetadata[] { - for (let i = -1, n = (xs || []).length; ++i < n;) { - const column = xs[i]; - column['VALIDITY'] && buffers.push(new BufferMetadata(new Long(buffers.length, 0), new Long(column['VALIDITY'].length, 0))); - column['OFFSET'] && buffers.push(new BufferMetadata(new Long(buffers.length, 0), new Long(column['OFFSET'].length, 0))); - column['TYPE'] && buffers.push(new BufferMetadata(new Long(buffers.length, 0), new Long(column['TYPE'].length, 0))); - column['DATA'] && buffers.push(new BufferMetadata(new Long(buffers.length, 0), new Long(column['DATA'].length, 0))); - buffers = buffersFromJSON(column['children'], buffers); - } - return buffers; -} - -function nullCountFromJSON(validity: number[]) { - return (validity || []).reduce((sum, val) => sum + +(val === 0), 0); -} - -function fieldFromJSON(f: any, dictionaryFields: Map> | null) { - let name = f['name']; - let field: Field | void; - let nullable = f['nullable']; - let dataType: DataType | null; - let id: number, keysMeta: any, dictMeta: any; - let metadata = customMetadata(f['customMetadata']); - if (!dictionaryFields || !(dictMeta = f['dictionary'])) { - if (dataType = typeFromJSON(f['type'], fieldsFromJSON(f['children'], dictionaryFields))) { - field = new Field(name, dataType, nullable, metadata); - } - } else if (dataType = dictionaryFields.has(id = dictMeta['id']) - ? dictionaryFields.get(id)!.type.dictionary - : typeFromJSON(f['type'], fieldsFromJSON(f['children'], null))) { - dataType = new Dictionary(dataType, - // a dictionary index defaults to signed 32 bit int if unspecified - (keysMeta = dictMeta['indexType']) ? intFromJSON(keysMeta)! 
: new Int32(), - id, dictMeta['isOrdered'] - ); - field = new Field(name, dataType, nullable, metadata); - dictionaryFields.has(id) || dictionaryFields.set(id, field as Field); - } - return field || null; -} - -function customMetadata(metadata?: any) { - return new Map(Object.entries(metadata || {})); -} - -const namesToTypeMap: { [n: string]: Type } = { - 'NONE': Type.NONE, - 'null': Type.Null, - 'int': Type.Int, - 'floatingpoint': Type.FloatingPoint, - 'binary': Type.Binary, - 'bool': Type.Bool, - 'utf8': Type.Utf8, - 'decimal': Type.Decimal, - 'date': Type.Date, - 'time': Type.Time, - 'timestamp': Type.Timestamp, - 'interval': Type.Interval, - 'list': Type.List, - 'struct': Type.Struct_, - 'union': Type.Union, - 'fixedsizebinary': Type.FixedSizeBinary, - 'fixedsizelist': Type.FixedSizeList, - 'map': Type.Map, -}; - -function typeFromJSON(t: any, children?: Field[]) { - switch (namesToTypeMap[t['name']]) { - case Type.NONE: return null; - case Type.Null: return nullFromJSON(t); - case Type.Int: return intFromJSON(t); - case Type.FloatingPoint: return floatingPointFromJSON(t); - case Type.Binary: return binaryFromJSON(t); - case Type.Utf8: return utf8FromJSON(t); - case Type.Bool: return boolFromJSON(t); - case Type.Decimal: return decimalFromJSON(t); - case Type.Date: return dateFromJSON(t); - case Type.Time: return timeFromJSON(t); - case Type.Timestamp: return timestampFromJSON(t); - case Type.Interval: return intervalFromJSON(t); - case Type.List: return listFromJSON(t, children || []); - case Type.Struct_: return structFromJSON(t, children || []); - case Type.Union: return unionFromJSON(t, children || []); - case Type.FixedSizeBinary: return fixedSizeBinaryFromJSON(t); - case Type.FixedSizeList: return fixedSizeListFromJSON(t, children || []); - case Type.Map: return mapFromJSON(t, children || []); - } - throw new Error(`Unrecognized type ${t['name']}`); -} - -function nullFromJSON (_type: any) { return new Null(); } -function intFromJSON (_type: any) { switch (_type['bitWidth']) { - case 8: return _type['isSigned'] ? new Int8() : new Uint8(); - case 16: return _type['isSigned'] ? new Int16() : new Uint16(); - case 32: return _type['isSigned'] ? new Int32() : new Uint32(); - case 64: return _type['isSigned'] ? 
new Int64() : new Uint64(); - } - return null; } -function floatingPointFromJSON (_type: any) { switch (Precision[_type['precision']] as any) { - case Precision.HALF: return new Float16(); - case Precision.SINGLE: return new Float32(); - case Precision.DOUBLE: return new Float64(); - } - return null; } -function binaryFromJSON (_type: any) { return new Binary(); } -function utf8FromJSON (_type: any) { return new Utf8(); } -function boolFromJSON (_type: any) { return new Bool(); } -function decimalFromJSON (_type: any) { return new Decimal(_type['scale'], _type['precision']); } -function dateFromJSON (_type: any) { return new Date_(DateUnit[_type['unit']] as any); } -function timeFromJSON (_type: any) { return new Time(TimeUnit[_type['unit']] as any, _type['bitWidth'] as TimeBitWidth); } -function timestampFromJSON (_type: any) { return new Timestamp(TimeUnit[_type['unit']] as any, _type['timezone']); } -function intervalFromJSON (_type: any) { return new Interval(IntervalUnit[_type['unit']] as any); } -function listFromJSON (_type: any, children: Field[]) { return new List(children); } -function structFromJSON (_type: any, children: Field[]) { return new Struct(children); } -function unionFromJSON (_type: any, children: Field[]) { return new Union(UnionMode[_type['mode']] as any, (_type['typeIds'] || []) as Type[], children); } -function fixedSizeBinaryFromJSON(_type: any) { return new FixedSizeBinary(_type['byteWidth']); } -function fixedSizeListFromJSON (_type: any, children: Field[]) { return new FixedSizeList(_type['listSize'], children); } -function mapFromJSON (_type: any, children: Field[]) { return new Map_(_type['keysSorted'], children); } diff --git a/js/src/ipc/reader/node.ts b/js/src/ipc/reader/node.ts deleted file mode 100644 index 24295c81cbd52..0000000000000 --- a/js/src/ipc/reader/node.ts +++ /dev/null @@ -1,78 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
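// Hedged usage sketch: fromReadableStream() below adapts a Node readable stream
// into an async iterable of raw message frames (Uint8Arrays) that
// readBuffersAsync() can then decode. The fs import and file path are assumed.
import * as fs from 'fs';
async function logFrames(path: string) {
  for await (const frame of fromReadableStream(fs.createReadStream(path))) {
    console.log(frame.byteLength);              // metadata + body bytes of one message
  }
}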
- -import { flatbuffers } from 'flatbuffers'; -import * as Message_ from '../../fb/Message'; -import ByteBuffer = flatbuffers.ByteBuffer; -import _Message = Message_.org.apache.arrow.flatbuf.Message; -import { PADDING, isValidArrowFile, checkForMagicArrowString } from '../magic'; - -export async function* fromReadableStream(stream: NodeJS.ReadableStream) { - - let bb: ByteBuffer; - let bytesRead = 0, bytes = new Uint8Array(0); - let messageLength = 0, message: _Message | null = null; - - for await (let chunk of (stream as any as AsyncIterable)) { - - if (chunk == null) { - continue; - } - - const grown = new Uint8Array(bytes.byteLength + chunk.length); - - if (typeof chunk !== 'string') { - grown.set(bytes, 0) || grown.set(chunk, bytes.byteLength); - } else { - for (let i = -1, j = bytes.byteLength, n = chunk.length; ++i < n;) { - grown[i + j] = chunk.charCodeAt(i); - } - } - - bytes = grown; - - // If we're reading in an Arrow File, just concatenate the bytes until - // the file is fully read in - if (checkForMagicArrowString(bytes)) { - if (!isValidArrowFile(new ByteBuffer(bytes))) { - continue; - } - return yield bytes; - } - - if (bytes.byteLength > 0 && messageLength <= 0) { - messageLength = new DataView(bytes.buffer).getInt32(0, true); - } - - while (messageLength > 0 && messageLength <= bytes.byteLength) { - if (!message) { - (bb = new ByteBuffer(bytes)).setPosition(4); - if (message = _Message.getRootAsMessage(bb)) { - messageLength += message.bodyLength().low; - continue; - } - throw new Error(`Invalid message at position ${bytesRead}`); - } - bytesRead += messageLength + PADDING; - yield bytes.subarray(0, messageLength + PADDING); - bytes = bytes.subarray(messageLength + PADDING); - messageLength = bytes.byteLength < 4 ? 0 : - new DataView(bytes.buffer).getInt32(bytes.byteOffset, true); - message = null; - } - } -} diff --git a/js/src/ipc/reader/vector.ts b/js/src/ipc/reader/vector.ts deleted file mode 100644 index c4688f5e2b851..0000000000000 --- a/js/src/ipc/reader/vector.ts +++ /dev/null @@ -1,131 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -import { Vector } from '../../vector'; -import { RecordBatch } from '../../recordbatch'; -import { TypeVisitor } from '../../visitor'; -import { FlatType, NestedType, ListType } from '../../type'; -import { Message, FieldMetadata, BufferMetadata } from '../metadata'; -import { FlatData, ListData, NestedData, SingleNestedData, DenseUnionData, SparseUnionData, BoolData, FlatListData, DictionaryData } from '../../data'; -import { - Schema, Field, - Dictionary, - Null, Int, Float, - Binary, Bool, Utf8, Decimal, - Date_, Time, Timestamp, Interval, - List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, - UnionMode, SparseUnion, DenseUnion, FlatListType, DataType, -} from '../../type'; - -export function* readRecordBatches(messages: Iterable<{ schema: Schema, message: Message, loader: TypeDataLoader }>) { - for (const { schema, message, loader } of messages) { - yield* readRecordBatch(schema, message, loader); - } -} - -export async function* readRecordBatchesAsync(messages: AsyncIterable<{ schema: Schema, message: Message, loader: TypeDataLoader }>) { - for await (const { schema, message, loader } of messages) { - yield* readRecordBatch(schema, message, loader); - } -} - -export function* readRecordBatch(schema: Schema, message: Message, loader: TypeDataLoader) { - if (Message.isRecordBatch(message)) { - yield new RecordBatch(schema, message.length, loader.visitFields(schema.fields)); - } else if (Message.isDictionaryBatch(message)) { - const dictionaryId = message.id; - const dictionaries = loader.dictionaries; - const dictionaryField = schema.dictionaries.get(dictionaryId)!; - const dictionaryDataType = (dictionaryField.type as Dictionary).dictionary; - let dictionaryVector = Vector.create(loader.visit(dictionaryDataType)); - if (message.isDelta && dictionaries.has(dictionaryId)) { - dictionaryVector = dictionaries.get(dictionaryId)!.concat(dictionaryVector); - } - dictionaries.set(dictionaryId, dictionaryVector); - } -} - -export abstract class TypeDataLoader extends TypeVisitor { - - public dictionaries: Map; - protected nodes: Iterator; - protected buffers: Iterator; - - constructor(nodes: Iterator, buffers: Iterator, dictionaries: Map) { - super(); - this.nodes = nodes; - this.buffers = buffers; - this.dictionaries = dictionaries; - } - - public visitFields(fields: Field[]) { return fields.map((field) => this.visit(field.type)); } - - public visitNull (type: Null) { return this.visitNullType(type); } - public visitInt (type: Int) { return this.visitFlatType(type); } - public visitFloat (type: Float) { return this.visitFlatType(type); } - public visitBinary (type: Binary) { return this.visitFlatList(type); } - public visitUtf8 (type: Utf8) { return this.visitFlatList(type); } - public visitBool (type: Bool) { return this.visitBoolType(type); } - public visitDecimal (type: Decimal) { return this.visitFlatType(type); } - public visitDate (type: Date_) { return this.visitFlatType(type); } - public visitTime (type: Time) { return this.visitFlatType(type); } - public visitTimestamp (type: Timestamp) { return this.visitFlatType(type); } - public visitInterval (type: Interval) { return this.visitFlatType(type); } - public visitList (type: List) { return this.visitListType(type); } - public visitStruct (type: Struct) { return this.visitNestedType(type); } - public visitUnion (type: Union) { return this.visitUnionType(type); } - public visitFixedSizeBinary(type: FixedSizeBinary) { return this.visitFlatType(type); } - public visitFixedSizeList (type: FixedSizeList) { return 
this.visitFixedSizeListType(type); } - public visitMap (type: Map_) { return this.visitNestedType(type); } - public visitDictionary (type: Dictionary) { - return new DictionaryData(type, this.dictionaries.get(type.id)!, this.visit(type.indices)); - } - protected getFieldMetadata() { return this.nodes.next().value; } - protected getBufferMetadata() { return this.buffers.next().value; } - protected readNullBitmap(type: T, nullCount: number, buffer = this.getBufferMetadata()) { - return nullCount > 0 && this.readData(type, buffer) || new Uint8Array(0); - } - protected abstract readData(type: T, buffer?: BufferMetadata): any; - protected abstract readOffsets(type: T, buffer?: BufferMetadata): any; - protected abstract readTypeIds(type: T, buffer?: BufferMetadata): any; - protected visitNullType(type: Null, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { - return new FlatData(type, length, this.readNullBitmap(type, nullCount), new Uint8Array(0), 0, nullCount); - } - protected visitFlatType(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { - return new FlatData(type, length, this.readNullBitmap(type, nullCount), this.readData(type), 0, nullCount); - } - protected visitBoolType(type: Bool, { length, nullCount }: FieldMetadata = this.getFieldMetadata(), data?: Uint8Array) { - return new BoolData(type, length, this.readNullBitmap(type, nullCount), data || this.readData(type), 0, nullCount); - } - protected visitFlatList(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { - return new FlatListData(type, length, this.readNullBitmap(type, nullCount), this.readOffsets(type), this.readData(type), 0, nullCount); - } - protected visitListType(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { - return new ListData(type, length, this.readNullBitmap(type, nullCount), this.readOffsets(type), this.visit(type.children![0].type), 0, nullCount); - } - protected visitFixedSizeListType(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { - return new SingleNestedData(type, length, this.readNullBitmap(type, nullCount), this.visit(type.children![0].type), 0, nullCount); - } - protected visitNestedType(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { - return new NestedData(type, length, this.readNullBitmap(type, nullCount), this.visitFields(type.children), 0, nullCount); - } - protected visitUnionType(type: DenseUnion | SparseUnion, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { - return type.mode === UnionMode.Sparse ? - new SparseUnionData(type as SparseUnion, length, this.readNullBitmap(type, nullCount), this.readTypeIds(type), this.visitFields(type.children), 0, nullCount) : - new DenseUnionData(type as DenseUnion, length, this.readNullBitmap(type, nullCount), this.readTypeIds(type), this.readOffsets(type), this.visitFields(type.children), 0, nullCount); - } -} diff --git a/js/src/ipc/whatwg/iterable.ts b/js/src/ipc/whatwg/iterable.ts new file mode 100644 index 0000000000000..31916f2a3bdac --- /dev/null +++ b/js/src/ipc/whatwg/iterable.ts @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { toUint8Array } from '../../util/buffer'; +import { ReadableDOMStreamOptions } from '../../io/interfaces'; +import { isIterable, isAsyncIterable } from '../../util/compat'; + +/** @ignore */ +export function toDOMStream(source: Iterable | AsyncIterable, options?: ReadableDOMStreamOptions): ReadableStream { + if (isAsyncIterable(source)) { return asyncIterableAsReadableDOMStream(source, options); } + if (isIterable(source)) { return iterableAsReadableDOMStream(source, options); } + /* istanbul ignore next */ + throw new Error(`toDOMStream() must be called with an Iterable or AsyncIterable`); +} + +/** @ignore */ +function iterableAsReadableDOMStream(source: Iterable, options?: ReadableDOMStreamOptions) { + + let it: Iterator | null = null; + const bm = (options && options.type === 'bytes') || false; + const hwm = options && options.highWaterMark || (2 ** 24); + + return new ReadableStream({ + ...options as any, + start(controller) { next(controller, it || (it = source[Symbol.iterator]())); }, + pull(controller) { it ? (next(controller, it)) : controller.close(); }, + cancel() { (it && (it.return && it.return()) || true) && (it = null); } + }, { highWaterMark: bm ? hwm : undefined, ...options }); + + function next(controller: ReadableStreamDefaultController, it: Iterator) { + let buf: Uint8Array; + let r: IteratorResult | null = null; + let size = controller.desiredSize || null; + while (!(r = it.next(bm ? size : null)).done) { + if (ArrayBuffer.isView(r.value) && (buf = toUint8Array(r.value))) { + size != null && bm && (size = size - buf.byteLength + 1); + r.value = buf; + } + controller.enqueue(r.value); + if (size != null && --size <= 0) { return; } + } + controller.close(); + } +} + +/** @ignore */ +function asyncIterableAsReadableDOMStream(source: AsyncIterable, options?: ReadableDOMStreamOptions) { + + let it: AsyncIterator | null = null; + const bm = (options && options.type === 'bytes') || false; + const hwm = options && options.highWaterMark || (2 ** 24); + + return new ReadableStream({ + ...options as any, + async start(controller) { await next(controller, it || (it = source[Symbol.asyncIterator]())); }, + async pull(controller) { it ? (await next(controller, it)) : controller.close(); }, + async cancel() { (it && (it.return && await it.return()) || true) && (it = null); }, + }, { highWaterMark: bm ? hwm : undefined, ...options }); + + async function next(controller: ReadableStreamDefaultController, it: AsyncIterator) { + let buf: Uint8Array; + let r: IteratorResult | null = null; + let size = controller.desiredSize || null; + while (!(r = await it.next(bm ? 
size : null)).done) { + if (ArrayBuffer.isView(r.value) && (buf = toUint8Array(r.value))) { + size != null && bm && (size = size - buf.byteLength + 1); + r.value = buf; + } + controller.enqueue(r.value); + if (size != null && --size <= 0) { return; } + } + controller.close(); + } +} diff --git a/js/src/ipc/whatwg/reader.ts b/js/src/ipc/whatwg/reader.ts new file mode 100644 index 0000000000000..3e39900fe27e5 --- /dev/null +++ b/js/src/ipc/whatwg/reader.ts @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { DataType } from '../../type'; +import { RecordBatch } from '../../recordbatch'; +import { AsyncByteQueue } from '../../io/stream'; +import { RecordBatchReader } from '../../ipc/reader'; + +/** @ignore */ +export function recordBatchReaderThroughDOMStream(writableStrategy?: ByteLengthQueuingStrategy, readableStrategy?: { autoDestroy: boolean }) { + + const queue = new AsyncByteQueue(); + let reader: RecordBatchReader | null = null; + + const readable = new ReadableStream>({ + async cancel() { await queue.close(); }, + async start(controller) { await next(controller, reader || (reader = await open())); }, + async pull(controller) { reader ? await next(controller, reader) : controller.close(); } + }); + + return { writable: new WritableStream(queue, { 'highWaterMark': 2 ** 14, ...writableStrategy }), readable }; + + async function open() { + return await (await RecordBatchReader.from(queue)).open(readableStrategy); + } + + async function next(controller: ReadableStreamDefaultController>, reader: RecordBatchReader) { + let size = controller.desiredSize; + let r: IteratorResult> | null = null; + while (!(r = await reader.next()).done) { + controller.enqueue(r.value); + if (size != null && --size <= 0) { + return; + } + } + controller.close(); + } +} diff --git a/js/src/ipc/whatwg/writer.ts b/js/src/ipc/whatwg/writer.ts new file mode 100644 index 0000000000000..de3b3f1d2474a --- /dev/null +++ b/js/src/ipc/whatwg/writer.ts @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +import { DataType } from '../../type'; +import { RecordBatch } from '../../recordbatch'; +import { AsyncByteStream } from '../../io/stream'; +import { RecordBatchWriter } from '../../ipc/writer'; + +/** @ignore */ +export function recordBatchWriterThroughDOMStream( + this: typeof RecordBatchWriter, + writableStrategy?: QueuingStrategy> & { autoDestroy: boolean }, + readableStrategy?: { highWaterMark?: number, size?: any } +) { + + const writer = new this(writableStrategy); + const reader = new AsyncByteStream(writer); + const readable = new ReadableStream({ + type: 'bytes', + async cancel() { await reader.cancel(); }, + async pull(controller) { await next(controller); }, + async start(controller) { await next(controller); }, + }, { 'highWaterMark': 2 ** 14, ...readableStrategy }); + + return { writable: new WritableStream(writer, writableStrategy), readable }; + + async function next(controller: ReadableStreamDefaultController) { + let buf: Uint8Array | null = null; + let size = controller.desiredSize; + while (buf = await reader.read(size || null)) { + controller.enqueue(buf); + if (size != null && (size -= buf.byteLength) <= 0) { return; } + } + controller.close(); + } +} diff --git a/js/src/ipc/writer.ts b/js/src/ipc/writer.ts new file mode 100644 index 0000000000000..746e5ef58e369 --- /dev/null +++ b/js/src/ipc/writer.ts @@ -0,0 +1,417 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
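// Hedged usage sketch for the writers defined below; constructing the Table is
// assumed, only writeAll(), toUint8Array() and toString() come from this file.
import { Table } from '../table';
declare const table: Table;
const fileBytes = RecordBatchFileWriter.writeAll(table).toUint8Array(true);    // sync: Uint8Array in file format
const streamJSON = RecordBatchJSONWriter.writeAll(table).toString(true);       // sync: integration-test JSON string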
+ +import { Table } from '../table'; +import { MAGIC } from './message'; +import { Vector } from '../vector'; +import { Column } from '../column'; +import { Schema, Field } from '../schema'; +import { Chunked } from '../vector/chunked'; +import { Message } from './metadata/message'; +import { RecordBatch } from '../recordbatch'; +import * as metadata from './metadata/message'; +import { DataType, Dictionary } from '../type'; +import { FileBlock, Footer } from './metadata/file'; +import { MessageHeader, MetadataVersion } from '../enum'; +import { WritableSink, AsyncByteQueue } from '../io/stream'; +import { VectorAssembler } from '../visitor/vectorassembler'; +import { JSONTypeAssembler } from '../visitor/jsontypeassembler'; +import { JSONVectorAssembler } from '../visitor/jsonvectorassembler'; +import { ArrayBufferViewInput, toUint8Array } from '../util/buffer'; +import { Writable, ReadableInterop, ReadableDOMStreamOptions } from '../io/interfaces'; +import { isPromise, isAsyncIterable, isWritableDOMStream, isWritableNodeStream } from '../util/compat'; + +export class RecordBatchWriter extends ReadableInterop implements Writable> { + + /** @nocollapse */ + // @ts-ignore + public static throughNode(options?: import('stream').DuplexOptions & { autoDestroy: boolean }): import('stream').Duplex { + throw new Error(`"throughNode" not available in this environment`); + } + /** @nocollapse */ + public static throughDOM( + // @ts-ignore + writableStrategy?: QueuingStrategy> & { autoDestroy: boolean }, + // @ts-ignore + readableStrategy?: { highWaterMark?: number, size?: any } + ): { writable: WritableStream | RecordBatch>, readable: ReadableStream } { + throw new Error(`"throughDOM" not available in this environment`); + } + + constructor(options?: { autoDestroy: boolean }) { + super(); + this._autoDestroy = options && (typeof options.autoDestroy === 'boolean') ? 
options.autoDestroy : true; + } + + protected _position = 0; + protected _started = false; + protected _autoDestroy: boolean; + // @ts-ignore + protected _sink = new AsyncByteQueue(); + protected _schema: Schema | null = null; + protected _dictionaryBlocks: FileBlock[] = []; + protected _recordBatchBlocks: FileBlock[] = []; + + public toString(sync: true): string; + public toString(sync?: false): Promise; + public toString(sync: any = false) { + return this._sink.toString(sync) as Promise | string; + } + public toUint8Array(sync: true): Uint8Array; + public toUint8Array(sync?: false): Promise; + public toUint8Array(sync: any = false) { + return this._sink.toUint8Array(sync) as Promise | Uint8Array; + } + + public writeAll(input: Table | Iterable>): this; + public writeAll(input: AsyncIterable>): Promise; + public writeAll(input: PromiseLike>>): Promise; + public writeAll(input: PromiseLike | Iterable>>): Promise; + public writeAll(input: PromiseLike | Table | Iterable> | AsyncIterable>) { + if (isPromise(input)) { + return input.then((x) => this.writeAll(x)); + } else if (isAsyncIterable>(input)) { + return writeAllAsync(this, input); + } + return writeAll(this, input); + } + + public get closed() { return this._sink.closed; } + public [Symbol.asyncIterator]() { return this._sink[Symbol.asyncIterator](); } + public toDOMStream(options?: ReadableDOMStreamOptions) { return this._sink.toDOMStream(options); } + public toNodeStream(options?: import('stream').ReadableOptions) { return this._sink.toNodeStream(options); } + + public close() { + return this.reset()._sink.close(); + } + public abort(reason?: any) { + return this.reset()._sink.abort(reason); + } + public finish() { + this._autoDestroy ? this.close() : this.reset(this._sink, this._schema); + return this; + } + public reset(sink: WritableSink = this._sink, schema: Schema | null = null) { + + if ((sink === this._sink) || (sink instanceof AsyncByteQueue)) { + this._sink = sink as AsyncByteQueue; + } else { + this._sink = new AsyncByteQueue(); + if (sink && isWritableDOMStream(sink)) { + this.toDOMStream({ type: 'bytes' }).pipeTo(sink); + } else if (sink && isWritableNodeStream(sink)) { + this.toNodeStream({ objectMode: false }).pipe(sink); + } + } + + if (this._started && this._schema) { + this._writeFooter(); + } + + this._started = false; + this._dictionaryBlocks = []; + this._recordBatchBlocks = []; + + if (!schema || (schema !== this._schema)) { + if (schema === null) { + this._position = 0; + this._schema = null; + } else { + this._started = true; + this._schema = schema; + this._writeSchema(schema); + } + } + + return this; + } + + public write(chunk?: Table | RecordBatch | null) { + let schema: Schema | null; + if (!this._sink) { + throw new Error(`RecordBatchWriter is closed`); + } else if (!chunk || !(schema = chunk.schema)) { + return this.finish() && undefined; + } else if (schema !== this._schema) { + if (this._started && this._autoDestroy) { + return this.close(); + } + this.reset(this._sink, schema); + } + (chunk instanceof Table) + ? 
this.writeAll(chunk.chunks) + : this._writeRecordBatch(chunk); + } + + protected _writeMessage(message: Message, alignment = 8) { + + const a = alignment - 1; + const buffer = Message.encode(message); + const flatbufferSize = buffer.byteLength; + const alignedSize = (flatbufferSize + 4 + a) & ~a; + const nPaddingBytes = alignedSize - flatbufferSize - 4; + + if (message.headerType === MessageHeader.RecordBatch) { + this._recordBatchBlocks.push(new FileBlock(alignedSize, message.bodyLength, this._position)); + } else if (message.headerType === MessageHeader.DictionaryBatch) { + this._dictionaryBlocks.push(new FileBlock(alignedSize, message.bodyLength, this._position)); + } + + // Write the flatbuffer size prefix including padding + this._write(Int32Array.of(alignedSize - 4)); + // Write the flatbuffer + if (flatbufferSize > 0) { this._write(buffer); } + // Write any padding + return this._writePadding(nPaddingBytes); + } + + protected _write(chunk: ArrayBufferViewInput) { + if (this._started) { + const buffer = toUint8Array(chunk); + if (buffer && buffer.byteLength > 0) { + this._sink.write(buffer); + this._position += buffer.byteLength; + } + } + return this; + } + + protected _writeSchema(schema: Schema) { + return this + ._writeMessage(Message.from(schema)) + ._writeDictionaries(schema.dictionaryFields); + } + + protected _writeFooter() { + return this._writePadding(4); // eos bytes + } + + protected _writeMagic() { + return this._write(MAGIC); + } + + protected _writePadding(nBytes: number) { + return nBytes > 0 ? this._write(new Uint8Array(nBytes)) : this; + } + + protected _writeRecordBatch(records: RecordBatch) { + const { byteLength, nodes, bufferRegions, buffers } = VectorAssembler.assemble(records); + const recordBatch = new metadata.RecordBatch(records.length, nodes, bufferRegions); + const message = Message.from(recordBatch, byteLength); + return this + ._writeMessage(message) + ._writeBodyBuffers(buffers); + } + + protected _writeDictionaryBatch(dictionary: Vector, id: number, isDelta = false) { + const { byteLength, nodes, bufferRegions, buffers } = VectorAssembler.assemble(dictionary); + const recordBatch = new metadata.RecordBatch(dictionary.length, nodes, bufferRegions); + const dictionaryBatch = new metadata.DictionaryBatch(recordBatch, id, isDelta); + const message = Message.from(dictionaryBatch, byteLength); + return this + ._writeMessage(message) + ._writeBodyBuffers(buffers); + } + + protected _writeBodyBuffers(buffers: ArrayBufferView[]) { + let buffer: ArrayBufferView; + let size: number, padding: number; + for (let i = -1, n = buffers.length; ++i < n;) { + if ((buffer = buffers[i]) && (size = buffer.byteLength) > 0) { + this._write(buffer); + if ((padding = ((size + 7) & ~7) - size) > 0) { + this._writePadding(padding); + } + } + } + return this; + } + + protected _writeDictionaries(dictionaryFields: Map>[]>) { + for (const [id, fields] of dictionaryFields) { + const vector = fields[0].type.dictionaryVector; + if (!(vector instanceof Chunked)) { + this._writeDictionaryBatch(vector, id, false); + } else { + const chunks = vector.chunks; + for (let i = -1, n = chunks.length; ++i < n;) { + this._writeDictionaryBatch(chunks[i], id, i > 0); + } + } + } + return this; + } +} + +/** @ignore */ +export class RecordBatchStreamWriter extends RecordBatchWriter { + + public static writeAll(this: typeof RecordBatchWriter, input: Table | Iterable>, options?: { autoDestroy: true }): RecordBatchStreamWriter; + // @ts-ignore + public static writeAll(this: typeof RecordBatchWriter, 
input: AsyncIterable>, options?: { autoDestroy: true }): Promise>; + public static writeAll(this: typeof RecordBatchWriter, input: PromiseLike>>, options?: { autoDestroy: true }): Promise>; + public static writeAll(this: typeof RecordBatchWriter, input: PromiseLike | Iterable>>, options?: { autoDestroy: true }): Promise>; + /** @nocollapse */ + public static writeAll(this: typeof RecordBatchWriter, input: any, options?: { autoDestroy: true }) { + return new RecordBatchStreamWriter(options).writeAll(input); + } +} + +/** @ignore */ +export class RecordBatchFileWriter extends RecordBatchWriter { + + public static writeAll(this: typeof RecordBatchWriter, input: Table | Iterable>): RecordBatchFileWriter; + // @ts-ignore + public static writeAll(this: typeof RecordBatchWriter, input: AsyncIterable>): Promise>; + public static writeAll(this: typeof RecordBatchWriter, input: PromiseLike>>): Promise>; + public static writeAll(this: typeof RecordBatchWriter, input: PromiseLike | Iterable>>): Promise>; + /** @nocollapse */ + public static writeAll(this: typeof RecordBatchWriter, input: any) { + return new RecordBatchFileWriter().writeAll(input); + } + + constructor() { + super(); + this._autoDestroy = true; + } + + protected _writeSchema(schema: Schema) { + return this + ._writeMagic()._writePadding(2) + ._writeDictionaries(schema.dictionaryFields); + } + + protected _writeFooter() { + const buffer = Footer.encode(new Footer( + this._schema!, MetadataVersion.V4, + this._recordBatchBlocks, this._dictionaryBlocks + )); + return this + ._write(buffer) // Write the flatbuffer + ._write(Int32Array.of(buffer.byteLength)) // then the footer size suffix + ._writeMagic(); // then the magic suffix + } +} + +/** @ignore */ +export class RecordBatchJSONWriter extends RecordBatchWriter { + + public static writeAll(this: typeof RecordBatchWriter, input: Table | Iterable>): RecordBatchJSONWriter; + // @ts-ignore + public static writeAll(this: typeof RecordBatchWriter, input: AsyncIterable>): Promise>; + public static writeAll(this: typeof RecordBatchWriter, input: PromiseLike>>): Promise>; + public static writeAll(this: typeof RecordBatchWriter, input: PromiseLike | Iterable>>): Promise>; + /** @nocollapse */ + public static writeAll(this: typeof RecordBatchWriter, input: any) { + return new RecordBatchJSONWriter().writeAll(input as any); + } + + constructor() { + super(); + this._autoDestroy = true; + } + + protected _writeMessage() { return this; } + protected _writeSchema(schema: Schema) { + return this._write(`{\n "schema": ${ + JSON.stringify({ fields: schema.fields.map(fieldToJSON) }, null, 2) + }`)._writeDictionaries(schema.dictionaryFields); + } + protected _writeDictionaries(dictionaryFields: Map>[]>) { + this._write(`,\n "dictionaries": [\n`); + super._writeDictionaries(dictionaryFields); + return this._write(`\n ]`); + } + protected _writeDictionaryBatch(dictionary: Vector, id: number, isDelta = false) { + this._write(this._dictionaryBlocks.length === 0 ? ` ` : `,\n `); + this._write(`${dictionaryBatchToJSON(this._schema!, dictionary, id, isDelta)}`); + this._dictionaryBlocks.push(new FileBlock(0, 0, 0)); + return this; + } + protected _writeRecordBatch(records: RecordBatch) { + this._write(this._recordBatchBlocks.length === 0 + ? 
`,\n "batches": [\n ` + : `,\n `); + this._write(`${recordBatchToJSON(records)}`); + this._recordBatchBlocks.push(new FileBlock(0, 0, 0)); + return this; + } + public close() { + if (this._recordBatchBlocks.length > 0) { + this._write(`\n ]`); + } + if (this._schema) { + this._write(`\n}`); + } + return super.close(); + } +} + +/** @ignore */ +function writeAll(writer: RecordBatchWriter, input: Table | Iterable>) { + const chunks = (input instanceof Table) ? input.chunks : input; + for (const batch of chunks) { + writer.write(batch); + } + return writer.finish(); +} + +/** @ignore */ +async function writeAllAsync(writer: RecordBatchWriter, batches: AsyncIterable>) { + for await (const batch of batches) { + writer.write(batch); + } + return writer.finish(); +} + +/** @ignore */ +function fieldToJSON({ name, type, nullable }: Field): object { + const assembler = new JSONTypeAssembler(); + return { + 'name': name, 'nullable': nullable, + 'type': assembler.visit(type), + 'children': (type.children || []).map(fieldToJSON), + 'dictionary': !DataType.isDictionary(type) ? undefined : { + 'id': type.id, + 'isOrdered': type.isOrdered, + 'indexType': assembler.visit(type.indices) + } + }; +} + +/** @ignore */ +function dictionaryBatchToJSON(schema: Schema, dictionary: Vector, id: number, isDelta = false) { + const f = schema.dictionaryFields.get(id)![0]; + const field = new Field(f.name, f.type.dictionary, f.nullable, f.metadata); + const columns = JSONVectorAssembler.assemble(new Column(field, [dictionary])); + return JSON.stringify({ + 'id': id, + 'isDelta': isDelta, + 'data': { + 'count': dictionary.length, + 'columns': columns + } + }, null, 2); +} + +/** @ignore */ +function recordBatchToJSON(records: RecordBatch) { + return JSON.stringify({ + 'count': records.length, + 'columns': JSONVectorAssembler.assemble(records) + }, null, 2); +} diff --git a/js/src/ipc/writer/binary.ts b/js/src/ipc/writer/binary.ts deleted file mode 100644 index df7c586d94ab5..0000000000000 --- a/js/src/ipc/writer/binary.ts +++ /dev/null @@ -1,725 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
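// Hedged framing note for serializeMessage() below, restated from the code in
// this file: every encoded message is laid out as
//   [ int32 metadataLength ][ flatbuffer metadata (+ padding) ][ optional body ]
// where metadataLength includes its own 4-byte prefix and the whole message is
// padded to an 8-byte boundary. The helper below only illustrates that rounding;
// it is not used by the file itself.
const roundUpTo8 = (nBytes: number) => (nBytes + 7) & ~7;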
- -import { Table } from '../../table'; -import { DenseUnionData } from '../../data'; -import { RecordBatch } from '../../recordbatch'; -import { VectorVisitor, TypeVisitor } from '../../visitor'; -import { MAGIC, magicLength, magicAndPadding, PADDING } from '../magic'; -import { align, getBool, packBools, iterateBits } from '../../util/bit'; -import { Vector, UnionVector, DictionaryVector, NestedVector, ListVector } from '../../vector'; -import { BufferMetadata, FieldMetadata, Footer, FileBlock, Message, RecordBatchMetadata, DictionaryBatch } from '../metadata'; -import { - Schema, Field, TypedArray, MetadataVersion, - DataType, - Dictionary, - Null, Int, Float, - Binary, Bool, Utf8, Decimal, - Date_, Time, Timestamp, Interval, - List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, - FlatType, FlatListType, NestedType, UnionMode, SparseUnion, DenseUnion, SingleNestedType, -} from '../../type'; - -export function* serializeStream(table: Table) { - yield serializeMessage(table.schema).buffer; - for (const [id, field] of table.schema.dictionaries) { - const vec = table.getColumn(field.name) as any as DictionaryVector; - if (vec && vec.dictionary) { - yield serializeDictionaryBatch(vec.dictionary, id).buffer; - } - } - for (const recordBatch of table.batches) { - yield serializeRecordBatch(recordBatch).buffer; - } -} - -export function* serializeFile(table: Table) { - - const recordBatches = []; - const dictionaryBatches = []; - - // First yield the magic string (aligned) - let buffer = new Uint8Array(align(magicLength, 8)); - let metadataLength, bodyLength, byteLength = buffer.byteLength; - buffer.set(MAGIC, 0); - yield buffer; - - // Then yield the schema - ({ metadataLength, buffer } = serializeMessage(table.schema)); - byteLength += buffer.byteLength; - yield buffer; - - for (const [id, field] of table.schema.dictionaries) { - const vec = table.getColumn(field.name) as any as DictionaryVector; - if (vec && vec.dictionary) { - ({ metadataLength, bodyLength, buffer } = serializeDictionaryBatch(vec.dictionary, id)); - dictionaryBatches.push(new FileBlock(metadataLength, bodyLength, byteLength)); - byteLength += buffer.byteLength; - yield buffer; - } - } - for (const recordBatch of table.batches) { - ({ metadataLength, bodyLength, buffer } = serializeRecordBatch(recordBatch)); - recordBatches.push(new FileBlock(metadataLength, bodyLength, byteLength)); - byteLength += buffer.byteLength; - yield buffer; - } - - // Then yield the footer metadata (not aligned) - ({ metadataLength, buffer } = serializeFooter(new Footer(dictionaryBatches, recordBatches, table.schema))); - yield buffer; - - // Last, yield the footer length + terminating magic arrow string (aligned) - buffer = new Uint8Array(magicAndPadding); - new DataView(buffer.buffer).setInt32(0, metadataLength, platformIsLittleEndian); - buffer.set(MAGIC, buffer.byteLength - magicLength); - yield buffer; -} - -export function serializeRecordBatch(recordBatch: RecordBatch) { - const { byteLength, fieldNodes, buffers, buffersMeta } = new RecordBatchSerializer().visitRecordBatch(recordBatch); - const rbMeta = new RecordBatchMetadata(MetadataVersion.V4, recordBatch.length, fieldNodes, buffersMeta); - const rbData = concatBuffersWithMetadata(byteLength, buffers, buffersMeta); - return serializeMessage(rbMeta, rbData); -} - -export function serializeDictionaryBatch(dictionary: Vector, id: Long | number, isDelta: boolean = false) { - const { byteLength, fieldNodes, buffers, buffersMeta } = new 
RecordBatchSerializer().visitRecordBatch(RecordBatch.from([dictionary])); - const rbMeta = new RecordBatchMetadata(MetadataVersion.V4, dictionary.length, fieldNodes, buffersMeta); - const dbMeta = new DictionaryBatch(MetadataVersion.V4, rbMeta, id, isDelta); - const rbData = concatBuffersWithMetadata(byteLength, buffers, buffersMeta); - return serializeMessage(dbMeta, rbData); -} - -export function serializeMessage(message: Message, data?: Uint8Array) { - const b = new Builder(); - _Message.finishMessageBuffer(b, writeMessage(b, message)); - // Slice out the buffer that contains the message metadata - const metadataBytes = b.asUint8Array(); - // Reserve 4 bytes for writing the message size at the front. - // Metadata length includes the metadata byteLength + the 4 - // bytes for the length, and rounded up to the nearest 8 bytes. - const metadataLength = align(PADDING + metadataBytes.byteLength, 8); - // + the length of the optional data buffer at the end, padded - const dataByteLength = data ? data.byteLength : 0; - // ensure the entire message is aligned to an 8-byte boundary - const messageBytes = new Uint8Array(align(metadataLength + dataByteLength, 8)); - // Write the metadata length into the first 4 bytes, but subtract the - // bytes we use to hold the length itself. - new DataView(messageBytes.buffer).setInt32(0, metadataLength - PADDING, platformIsLittleEndian); - // Copy the metadata bytes into the message buffer - messageBytes.set(metadataBytes, PADDING); - // Copy the optional data buffer after the metadata bytes - (data && dataByteLength > 0) && messageBytes.set(data, metadataLength); - // if (messageBytes.byteLength % 8 !== 0) { debugger; } - // Return the metadata length because we need to write it into each FileBlock also - return { metadataLength, bodyLength: message.bodyLength, buffer: messageBytes }; -} - -export function serializeFooter(footer: Footer) { - const b = new Builder(); - _Footer.finishFooterBuffer(b, writeFooter(b, footer)); - // Slice out the buffer that contains the footer metadata - const footerBytes = b.asUint8Array(); - const metadataLength = footerBytes.byteLength; - return { metadataLength, buffer: footerBytes }; -} - -export class RecordBatchSerializer extends VectorVisitor { - public byteLength = 0; - public buffers: TypedArray[] = []; - public fieldNodes: FieldMetadata[] = []; - public buffersMeta: BufferMetadata[] = []; - public visitRecordBatch(recordBatch: RecordBatch) { - this.buffers = []; - this.byteLength = 0; - this.fieldNodes = []; - this.buffersMeta = []; - for (let vector: Vector, index = -1, numCols = recordBatch.numCols; ++index < numCols;) { - if (vector = recordBatch.getChildAt(index)!) { - this.visit(vector); - } - } - return this; - } - public visit(vector: Vector) { - if (!DataType.isDictionary(vector.type)) { - const { data, length, nullCount } = vector; - if (length > 2147483647) { - throw new RangeError('Cannot write arrays larger than 2^31 - 1 in length'); - } - this.fieldNodes.push(new FieldMetadata(length, nullCount)); - this.addBuffer(nullCount <= 0 - ? new Uint8Array(0) // placeholder validity buffer - : this.getTruncatedBitmap(data.offset, length, data.nullBitmap!) 
- ); - } - return super.visit(vector); - } - public visitNull (_nullz: Vector) { return this; } - public visitBool (vector: Vector) { return this.visitBoolVector(vector); } - public visitInt (vector: Vector) { return this.visitFlatVector(vector); } - public visitFloat (vector: Vector) { return this.visitFlatVector(vector); } - public visitUtf8 (vector: Vector) { return this.visitFlatListVector(vector); } - public visitBinary (vector: Vector) { return this.visitFlatListVector(vector); } - public visitDate (vector: Vector) { return this.visitFlatVector(vector); } - public visitTimestamp (vector: Vector) { return this.visitFlatVector(vector); } - public visitTime (vector: Vector
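The deleted `serializeFile()` generator above spells out the file layout that `RecordBatchFileWriter` now reproduces: padded magic string, schema message, dictionary batches, record batches, footer flatbuffer, an Int32 footer length, and the trailing magic. The per-message framing rules live in the deleted `serializeMessage()`: a 4-byte length prefix, the metadata flatbuffer, then the optional body, with both the metadata span and the whole message padded to an 8-byte boundary. A worked sketch of that arithmetic with made-up byte counts (the local `align()` and the sizes below are illustrative stand-ins, not the deleted helpers):

```ts
// Round `value` up to the next multiple of `alignment` (same behavior the deleted
// code gets from util/bit's align()).
const align = (value: number, alignment: number) =>
    Math.ceil(value / alignment) * alignment;

const PADDING = 4;            // the 4-byte length prefix written before the flatbuffer
const flatbufferLength = 122; // hypothetical size of the Message metadata flatbuffer
const bodyLength = 256;       // hypothetical size of the record batch body

// Metadata length counts the prefix and is padded to 8 bytes: align(4 + 122, 8) === 128.
const metadataLength = align(PADDING + flatbufferLength, 8);
// The whole message (metadata + body) is padded to 8 bytes too: align(128 + 256, 8) === 384.
const messageLength = align(metadataLength + bodyLength, 8);

// Layout of the resulting message buffer:
//   bytes [0, 4)                    -> Int32 holding metadataLength - PADDING (124)
//   bytes [4, 4 + flatbufferLength) -> Message metadata flatbuffer
//   bytes [metadataLength, ...)     -> record batch body, zero-padded out to messageLength
console.log(metadataLength, messageLength); // 128 384
```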