From 30158ad5a068208237f3c30e6c9eb60454bae402 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 30 Oct 2017 08:50:28 +0100 Subject: [PATCH 001/177] ARROW-1718: [C++/Python] Implement casts from timestamp to date32/64, properly handle NumPy datetime64[D] -> date32 This was sort of a can of worms. cc @xhochy @cpcloud @BryanCutler Author: Wes McKinney Closes #1258 from wesm/ARROW-1718 and squashes the following commits: c693889 [Wes McKinney] Use explicit static_cast to convert int64->int32 586face [Wes McKinney] Use correct syntax for int64 literals in MSVC. remove incorrect comments d61b7a4 [Wes McKinney] Downcast datetime64[D] as int64_t to int32_t e6f8b62 [Wes McKinney] Build with clang in Travis CI b187a09 [Wes McKinney] Remove now unneeded template specialization for Date32Type, but another test fails 284b9ba [Wes McKinney] Complete unit tests for timestamp->date32/64 6ca361f [Wes McKinney] tweak test case 68f3a32 [Wes McKinney] Test case, c++ unittest placeholder 383f730 [Wes McKinney] Implement builtin converter for date32, test datetime.date, ints, overflows d52bd77 [Wes McKinney] One failing test case --- .travis.yml | 4 +- cpp/src/arrow/compute/cast.cc | 99 +++++++++--- cpp/src/arrow/compute/compute-test.cc | 64 ++++++++ cpp/src/arrow/python/builtin_convert.cc | 27 +++- cpp/src/arrow/python/numpy_to_arrow.cc | 153 +++++++++---------- cpp/src/arrow/python/util/datetime.h | 5 + python/pyarrow/tests/test_convert_builtin.py | 19 +++ python/pyarrow/tests/test_convert_pandas.py | 11 ++ 8 files changed, 278 insertions(+), 104 deletions(-) diff --git a/.travis.yml b/.travis.yml index 039ae95208b74..6419548a622f3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -51,12 +51,12 @@ matrix: os: linux group: deprecated before_script: - - export CC="gcc-4.9" - - export CXX="g++-4.9" - export ARROW_TRAVIS_USE_TOOLCHAIN=1 - export ARROW_TRAVIS_VALGRIND=1 - export ARROW_TRAVIS_PLASMA=1 - export ARROW_TRAVIS_CLANG_FORMAT=1 + - export CC="clang-4.0" + - export CXX="clang++-4.0" - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh - $TRAVIS_BUILD_DIR/ci/travis_lint.sh - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh diff --git a/cpp/src/arrow/compute/cast.cc b/cpp/src/arrow/compute/cast.cc index 68a2b12379e34..114ab9af0d0d0 100644 --- a/cpp/src/arrow/compute/cast.cc +++ b/cpp/src/arrow/compute/cast.cc @@ -69,11 +69,18 @@ namespace arrow { namespace compute { +constexpr int64_t kMillisecondsInDay = 86400000; + template -inline const T* GetValuesAs(const ArrayData& data, int i) { +inline const T* GetValues(const ArrayData& data, int i) { return reinterpret_cast(data.buffers[i]->data()) + data.offset; } +template +inline T* GetMutableValues(const ArrayData* data, int i) { + return reinterpret_cast(data->buffers[i]->mutable_data()) + data->offset; +} + namespace { void CopyData(const Array& input, ArrayData* output) { @@ -164,7 +171,7 @@ struct CastFunctorbuffers[1]->data(), in_data->offset, in_data->length); - auto out = reinterpret_cast(output->buffers[1]->mutable_data()); + auto out = GetMutableValues(output, 1); for (int64_t i = 0; i < input.length(); ++i) { *out++ = bit_reader.IsSet() ? 
kOne : kZero; bit_reader.Next(); @@ -214,8 +221,8 @@ struct CastFunctor::v using in_type = typename I::c_type; DCHECK_EQ(output->offset, 0); - const in_type* in_data = GetValuesAs(*input.data(), 1); - uint8_t* out_data = reinterpret_cast(output->buffers[1]->mutable_data()); + const in_type* in_data = GetValues(*input.data(), 1); + uint8_t* out_data = GetMutableValues(output, 1); for (int64_t i = 0; i < input.length(); ++i) { BitUtil::SetBitTo(out_data, i, (*in_data++) != 0); } @@ -233,8 +240,8 @@ struct CastFunctor(*input.data(), 1); - auto out_data = reinterpret_cast(output->buffers[1]->mutable_data()); + const in_type* in_data = GetValues(*input.data(), 1); + auto out_data = GetMutableValues(output, 1); if (!options.allow_int_overflow) { constexpr in_type kMax = static_cast(std::numeric_limits::max()); @@ -276,8 +283,8 @@ struct CastFunctor(*input.data(), 1); - auto out_data = reinterpret_cast(output->buffers[1]->mutable_data()); + const in_type* in_data = GetValues(*input.data(), 1); + auto out_data = GetMutableValues(output, 1); for (int64_t i = 0; i < input.length(); ++i) { *out_data++ = static_cast(*in_data++); } @@ -288,13 +295,16 @@ struct CastFunctor -inline void ShiftTime(FunctionContext* ctx, const CastOptions& options, - const bool is_multiply, const int64_t factor, const Array& input, - ArrayData* output) { - const in_type* in_data = GetValuesAs(*input.data(), 1); - auto out_data = reinterpret_cast(output->buffers[1]->mutable_data()); +void ShiftTime(FunctionContext* ctx, const CastOptions& options, const bool is_multiply, + const int64_t factor, const Array& input, ArrayData* output) { + const in_type* in_data = GetValues(*input.data(), 1); + auto out_data = GetMutableValues(output, 1); - if (is_multiply) { + if (factor == 1) { + for (int64_t i = 0; i < input.length(); i++) { + out_data[i] = static_cast(in_data[i]); + } + } else if (is_multiply) { for (int64_t i = 0; i < input.length(); i++) { out_data[i] = static_cast(in_data[i] * factor); } @@ -352,6 +362,52 @@ struct CastFunctor { } }; +template <> +struct CastFunctor { + void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input, + ArrayData* output) { + const auto& in_type = static_cast(*input.type()); + + static const int64_t kTimestampToDateFactors[4] = { + 86400LL, // SECOND + 86400LL * 1000LL, // MILLI + 86400LL * 1000LL * 1000LL, // MICRO + 86400LL * 1000LL * 1000LL * 1000LL, // NANO + }; + + const int64_t factor = kTimestampToDateFactors[static_cast(in_type.unit())]; + ShiftTime(ctx, options, false, factor, input, output); + } +}; + +template <> +struct CastFunctor { + void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input, + ArrayData* output) { + const auto& in_type = static_cast(*input.type()); + + std::pair conversion = + kTimeConversionTable[static_cast(in_type.unit())] + [static_cast(TimeUnit::MILLI)]; + + ShiftTime(ctx, options, conversion.first, conversion.second, input, + output); + + // Ensure that intraday milliseconds have been zeroed out + auto out_data = GetMutableValues(output, 1); + for (int64_t i = 0; i < input.length(); ++i) { + const int64_t remainder = out_data[i] % kMillisecondsInDay; + if (ARROW_PREDICT_FALSE(!options.allow_time_truncate && input.IsValid(i) && + remainder > 0)) { + ctx->SetStatus( + Status::Invalid("Timestamp value had non-zero intraday milliseconds")); + break; + } + out_data[i] -= remainder; + } + } +}; + // ---------------------------------------------------------------------- // From one time32 or time64 to another @@ 
-385,8 +441,6 @@ struct CastFunctor struct CastFunctor { void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input, @@ -415,7 +469,7 @@ void UnpackFixedSizeBinaryDictionary(FunctionContext* ctx, const Array& indices, internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(), indices.length()); - const index_c_type* in = GetValuesAs(*indices.data(), 1); + const index_c_type* in = GetValues(*indices.data(), 1); uint8_t* out = output->buffers[1]->mutable_data(); int32_t byte_width = @@ -479,7 +533,7 @@ Status UnpackBinaryDictionary(FunctionContext* ctx, const Array& indices, internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(), indices.length()); - const index_c_type* in = GetValuesAs(*indices.data(), 1); + const index_c_type* in = GetValues(*indices.data(), 1); for (int64_t i = 0; i < indices.length(); ++i) { if (valid_bits_reader.IsSet()) { int32_t length; @@ -550,7 +604,7 @@ void UnpackPrimitiveDictionary(const Array& indices, const c_type* dictionary, internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(), indices.length()); - const index_c_type* in = GetValuesAs(*indices.data(), 1); + const index_c_type* in = GetValues(*indices.data(), 1); for (int64_t i = 0; i < indices.length(); ++i) { if (valid_bits_reader.IsSet()) { out[i] = dictionary[in[i]]; @@ -575,7 +629,7 @@ struct CastFunctortype)) << "Dictionary type: " << values_type << " target type: " << (*output->type); - const c_type* dictionary = GetValuesAs(*type.dictionary()->data(), 1); + const c_type* dictionary = GetValues(*type.dictionary()->data(), 1); auto out = reinterpret_cast(output->buffers[1]->mutable_data()); const Array& indices = *dict_array.indices(); @@ -755,7 +809,10 @@ class CastKernel : public UnaryKernel { FN(Time64Type, Time32Type); \ FN(Time64Type, Time64Type); -#define TIMESTAMP_CASES(FN, IN_TYPE) FN(TimestampType, TimestampType); +#define TIMESTAMP_CASES(FN, IN_TYPE) \ + FN(TimestampType, TimestampType); \ + FN(TimestampType, Date32Type); \ + FN(TimestampType, Date64Type); #define DICTIONARY_CASES(FN, IN_TYPE) \ FN(IN_TYPE, NullType); \ diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc index 8a7ef923b4719..61d53c4d50b33 100644 --- a/cpp/src/arrow/compute/compute-test.cc +++ b/cpp/src/arrow/compute/compute-test.cc @@ -355,6 +355,70 @@ TEST_F(TestCast, TimestampToTimestamp) { timestamp(TimeUnit::SECOND), options); } +TEST_F(TestCast, TimestampToDate32_Date64) { + CastOptions options; + + vector is_valid = {true, true, false}; + + // 2000-01-01, 2000-01-02, null + vector v_nano = {946684800000000000, 946771200000000000, 0}; + vector v_micro = {946684800000000, 946771200000000, 0}; + vector v_milli = {946684800000, 946771200000, 0}; + vector v_second = {946684800, 946771200, 0}; + vector v_day = {10957, 10958, 0}; + + // Simple conversions + CheckCase( + timestamp(TimeUnit::NANO), v_nano, is_valid, date64(), v_milli, options); + CheckCase( + timestamp(TimeUnit::MICRO), v_micro, is_valid, date64(), v_milli, options); + CheckCase( + timestamp(TimeUnit::MILLI), v_milli, is_valid, date64(), v_milli, options); + CheckCase( + timestamp(TimeUnit::SECOND), v_second, is_valid, date64(), v_milli, options); + + CheckCase( + timestamp(TimeUnit::NANO), v_nano, is_valid, date32(), v_day, options); + CheckCase( + timestamp(TimeUnit::MICRO), v_micro, is_valid, date32(), v_day, options); + CheckCase( + timestamp(TimeUnit::MILLI), v_milli, is_valid, date32(), v_day, options); 
+ CheckCase( + timestamp(TimeUnit::SECOND), v_second, is_valid, date32(), v_day, options); + + // Disallow truncate, failures + vector v_nano_fail = {946684800000000001, 946771200000000001, 0}; + vector v_micro_fail = {946684800000001, 946771200000001, 0}; + vector v_milli_fail = {946684800001, 946771200001, 0}; + vector v_second_fail = {946684801, 946771201, 0}; + + options.allow_time_truncate = false; + CheckFails(timestamp(TimeUnit::NANO), v_nano_fail, is_valid, date64(), + options); + CheckFails(timestamp(TimeUnit::MICRO), v_micro_fail, is_valid, date64(), + options); + CheckFails(timestamp(TimeUnit::MILLI), v_milli_fail, is_valid, date64(), + options); + CheckFails(timestamp(TimeUnit::SECOND), v_second_fail, is_valid, + date64(), options); + + CheckFails(timestamp(TimeUnit::NANO), v_nano_fail, is_valid, date32(), + options); + CheckFails(timestamp(TimeUnit::MICRO), v_micro_fail, is_valid, date32(), + options); + CheckFails(timestamp(TimeUnit::MILLI), v_milli_fail, is_valid, date32(), + options); + CheckFails(timestamp(TimeUnit::SECOND), v_second_fail, is_valid, + date32(), options); + + // Make sure that nulls are excluded from the truncation checks + vector v_second_nofail = {946684800, 946771200, 1}; + CheckCase( + timestamp(TimeUnit::SECOND), v_second_nofail, is_valid, date64(), v_milli, options); + CheckCase( + timestamp(TimeUnit::SECOND), v_second_nofail, is_valid, date32(), v_day, options); +} + TEST_F(TestCast, TimeToTime) { CastOptions options; diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index d52627ebfee12..0e775a0fb0e99 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -519,7 +519,26 @@ class UInt64Converter : public TypedConverterVisitor { +class Date32Converter : public TypedConverterVisitor { + public: + inline Status AppendItem(const OwnedRef& item) { + int32_t t; + if (PyDate_Check(item.obj())) { + auto pydate = reinterpret_cast(item.obj()); + t = static_cast(PyDate_to_s(pydate)); + } else { + int64_t casted_val = static_cast(PyLong_AsLongLong(item.obj())); + RETURN_IF_PYERROR(); + if (casted_val > std::numeric_limits::max()) { + return Status::Invalid("Integer as date32 larger than INT32_MAX"); + } + t = static_cast(casted_val); + } + return typed_builder_->Append(t); + } +}; + +class Date64Converter : public TypedConverterVisitor { public: inline Status AppendItem(const OwnedRef& item) { int64_t t; @@ -535,7 +554,7 @@ class DateConverter : public TypedConverterVisitor }; class TimestampConverter - : public TypedConverterVisitor { + : public TypedConverterVisitor { public: explicit TimestampConverter(TimeUnit::type unit) : unit_(unit) {} @@ -717,8 +736,10 @@ std::shared_ptr GetConverter(const std::shared_ptr& type return std::make_shared(); case Type::UINT64: return std::make_shared(); + case Type::DATE32: + return std::make_shared(); case Type::DATE64: - return std::make_shared(); + return std::make_shared(); case Type::TIMESTAMP: return std::make_shared( static_cast(*type).unit()); diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index ead3a04810121..c5aff2e4f2e3a 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -260,6 +260,7 @@ class NumPyConverter { : pool_(pool), type_(type), arr_(reinterpret_cast(ao)), + dtype_(PyArray_DESCR(arr_)), mask_(nullptr), use_pandas_null_sentinels_(use_pandas_null_sentinels) { if (mo != nullptr && mo != Py_None) { @@ -431,6 +432,7 @@ class 
NumPyConverter { MemoryPool* pool_; std::shared_ptr type_; PyArrayObject* arr_; + PyArray_Descr* dtype_; PyArrayObject* mask_; int64_t length_; int64_t stride_; @@ -450,7 +452,7 @@ Status NumPyConverter::Convert() { return Status::Invalid("only handle 1-dimensional arrays"); } - if (PyArray_DESCR(arr_)->type_num == NPY_OBJECT) { + if (dtype_->type_num == NPY_OBJECT) { return ConvertObjects(); } @@ -462,33 +464,12 @@ Status NumPyConverter::Convert() { return VisitTypeInline(*type_, this); } -template -void CopyStrided(T* input_data, int64_t length, int64_t stride, T2* output_data) { - // Passing input_data as non-const is a concession to PyObject* - int64_t j = 0; - for (int64_t i = 0; i < length; ++i) { - output_data[i] = static_cast(input_data[j]); - j += stride; - } -} - -template <> -void CopyStrided(PyObject** input_data, int64_t length, - int64_t stride, PyObject** output_data) { - int64_t j = 0; - for (int64_t i = 0; i < length; ++i) { - output_data[i] = input_data[j]; - if (output_data[i] != nullptr) { - Py_INCREF(output_data[i]); - } - j += stride; - } -} +namespace { -static Status CastBuffer(const std::shared_ptr& input, const int64_t length, - const std::shared_ptr& in_type, - const std::shared_ptr& out_type, MemoryPool* pool, - std::shared_ptr* out) { +Status CastBuffer(const std::shared_ptr& input, const int64_t length, + const std::shared_ptr& in_type, + const std::shared_ptr& out_type, MemoryPool* pool, + std::shared_ptr* out) { // Must cast std::vector> buffers = {nullptr, input}; auto tmp_data = std::make_shared(in_type, length, buffers, 0); @@ -499,6 +480,7 @@ static Status CastBuffer(const std::shared_ptr& input, const int64_t len compute::FunctionContext context(pool); compute::CastOptions cast_options; cast_options.allow_int_overflow = false; + cast_options.allow_time_truncate = false; RETURN_NOT_OK( compute::Cast(&context, *tmp_array, out_type, cast_options, &casted_array)); @@ -506,29 +488,47 @@ static Status CastBuffer(const std::shared_ptr& input, const int64_t len return Status::OK(); } +template +void CopyStrided(T* input_data, int64_t length, int64_t stride, T2* output_data) { + // Passing input_data as non-const is a concession to PyObject* + int64_t j = 0; + for (int64_t i = 0; i < length; ++i) { + output_data[i] = static_cast(input_data[j]); + j += stride; + } +} + template -inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { +Status CopyStridedArray(PyArrayObject* arr, const int64_t length, MemoryPool* pool, + std::shared_ptr* out) { using traits = internal::arrow_traits; using T = typename traits::T; + // Strided, must copy into new contiguous memory + const int64_t stride = PyArray_STRIDES(arr)[0]; + const int64_t stride_elements = stride / sizeof(T); + + auto new_buffer = std::make_shared(pool); + RETURN_NOT_OK(new_buffer->Resize(sizeof(T) * length)); + CopyStrided(reinterpret_cast(PyArray_DATA(arr)), length, stride_elements, + reinterpret_cast(new_buffer->mutable_data())); + *out = new_buffer; + return Status::OK(); +} + +} // namespace + +template +inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { if (is_strided()) { - // Strided, must copy into new contiguous memory - const int64_t stride = PyArray_STRIDES(arr_)[0]; - const int64_t stride_elements = stride / sizeof(T); - - auto new_buffer = std::make_shared(pool_); - RETURN_NOT_OK(new_buffer->Resize(sizeof(T) * length_)); - CopyStrided(reinterpret_cast(PyArray_DATA(arr_)), length_, stride_elements, - reinterpret_cast(new_buffer->mutable_data())); - *data = new_buffer; + 
RETURN_NOT_OK(CopyStridedArray(arr_, length_, pool_, data)); } else { // Can zero-copy *data = std::make_shared(reinterpret_cast(arr_)); } std::shared_ptr input_type; - RETURN_NOT_OK( - NumPyDtypeToArrow(reinterpret_cast(PyArray_DESCR(arr_)), &input_type)); + RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); if (!input_type->Equals(*type_)) { RETURN_NOT_OK(CastBuffer(*data, length_, input_type, type_, pool_, data)); @@ -537,45 +537,6 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { return Status::OK(); } -template <> -inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { - // Handle LONGLONG->INT64 and other fun things - int type_num_compat = cast_npy_type_compat(PyArray_DESCR(arr_)->type_num); - int type_size = NumPyTypeSize(type_num_compat); - - if (type_size == 4) { - // Source and target are INT32, so can refer to the main implementation. - return ConvertData(data); - } else if (type_size == 8) { - // We need to scale down from int64 to int32 - auto new_buffer = std::make_shared(pool_); - RETURN_NOT_OK(new_buffer->Resize(sizeof(int32_t) * length_)); - - auto input = reinterpret_cast(PyArray_DATA(arr_)); - auto output = reinterpret_cast(new_buffer->mutable_data()); - - if (is_strided()) { - // Strided, must copy into new contiguous memory - const int64_t stride = PyArray_STRIDES(arr_)[0]; - const int64_t stride_elements = stride / sizeof(int64_t); - CopyStrided(input, length_, stride_elements, output); - } else { - // TODO(wesm): int32 overflow checks - for (int64_t i = 0; i < length_; ++i) { - *output++ = static_cast(*input++); - } - } - *data = new_buffer; - } else { - std::stringstream ss; - ss << "Cannot convert NumPy array of element size "; - ss << type_size << " to a Date32 array"; - return Status::NotImplemented(ss.str()); - } - - return Status::OK(); -} - template <> inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { int64_t nbytes = BitUtil::BytesForBits(length_); @@ -597,6 +558,42 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* return Status::OK(); } +template <> +inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { + if (is_strided()) { + RETURN_NOT_OK(CopyStridedArray(arr_, length_, pool_, data)); + } else { + // Can zero-copy + *data = std::make_shared(reinterpret_cast(arr_)); + } + + // If we have inbound datetime64[D] data, this needs to be downcasted + // separately here from int64_t to int32_t, because this data is not + // supported in compute::Cast + auto date_dtype = reinterpret_cast(dtype_->c_metadata); + if (dtype_->type_num == NPY_DATETIME && date_dtype->meta.base == NPY_FR_D) { + auto date32_buffer = std::make_shared(pool_); + RETURN_NOT_OK(date32_buffer->Resize(sizeof(int32_t) * length_)); + + auto datetime64_values = reinterpret_cast((*data)->data()); + auto date32_values = reinterpret_cast(date32_buffer->mutable_data()); + for (int64_t i = 0; i < length_; ++i) { + // TODO(wesm): How pedantic do we really want to be about checking for int32 + // overflow here? 
+ *date32_values++ = static_cast<int32_t>(*datetime64_values++); + } + *data = date32_buffer; + } else { + std::shared_ptr<DataType> input_type; + RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type)); + if (!input_type->Equals(*type_)) { + RETURN_NOT_OK(CastBuffer(*data, length_, input_type, type_, pool_, data)); + } + } + + return Status::OK(); +} + template struct UnboxDate {}; diff --git a/cpp/src/arrow/python/util/datetime.h b/cpp/src/arrow/python/util/datetime.h index c110bc64a2a2f..e76c2e0db4aea 100644 --- a/cpp/src/arrow/python/util/datetime.h +++ b/cpp/src/arrow/python/util/datetime.h @@ -235,6 +235,11 @@ static inline Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, return Status::OK(); } +static inline int64_t PyDate_to_s(PyDateTime_Date* pydate) { + return get_days_from_date(PyDateTime_GET_YEAR(pydate), PyDateTime_GET_MONTH(pydate), + PyDateTime_GET_DAY(pydate)); +} + static inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) { int64_t total_seconds = 0; total_seconds += PyDateTime_DATE_GET_SECOND(pydate); diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 414266ddb14ed..c7a0d49b40db1 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -178,6 +178,25 @@ def test_date(self): assert arr[2].as_py() == datetime.date(1970, 1, 1) assert arr[3].as_py() == datetime.date(2040, 2, 26) + def test_date32(self): + data = [datetime.date(2000, 1, 1), None] + arr = pa.array(data, type=pa.date32()) + + data2 = [10957, None] + arr2 = pa.array(data2, type=pa.date32()) + + for x in [arr, arr2]: + assert len(x) == 2 + assert x.type == pa.date32() + assert x.null_count == 1 + assert x[0].as_py() == datetime.date(2000, 1, 1) + assert x[1] is pa.NA + + # Overflow + data3 = [2**32, None] + with pytest.raises(pa.ArrowException): + pa.array(data3, type=pa.date32()) + def test_timestamp(self): data = [ datetime.datetime(2007, 7, 13, 1, 23, 34, 123456), diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index d00bf1b28eddc..07ecf3010a32e 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -525,6 +525,16 @@ def test_timestamps_with_timezone(self): self._check_pandas_roundtrip(df) + def test_datetime64_to_date32(self): + # ARROW-1718 + arr = pa.array([date(2017, 10, 23), None]) + c = pa.Column.from_array("d", arr) + s = c.to_pandas() + + arr2 = pa.Array.from_pandas(s, type=pa.date32()) + + assert arr2.equals(arr.cast('date32')) + def test_date_infer(self): df = pd.DataFrame({ 'date': [date(2000, 1, 1), @@ -984,6 +994,7 @@ def test_numpy_datetime64_columns(self): dtype='datetime64[s]') self._check_array_from_pandas_roundtrip(datetime64_s) + def test_numpy_datetime64_day_unit(self): datetime64_d = np.array([ '2007-07-13', None, From 39243ffaf5eb1d1f2a748ea1ec2b36658ba7f3d7 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 30 Oct 2017 14:59:47 -0400 Subject: [PATCH 002/177] ARROW-1409: [Format] Remove page id from Buffer metadata, increment metadata version number This is a breaking metadata change per discussion on the mailing list. I expect truly breaking changes of this kind to be exceedingly rare going forward, and when we make a 1.0.0 release we should document expectations around metadata / memory format stability. This could be made backwards compatible with some effort (we would have to add `RecordBatchV3` and `BufferV3` types). 
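For illustration, a consumer of the new format can gate on the bumped version the same way the Java MessageSerializer change below does. A minimal C++ sketch follows; the CheckMessageVersion helper is hypothetical and not part of this patch, but it only uses the Message::metadata_version() accessor exercised in the C++ tests and the MetadataVersion::V4 value introduced here:

#include "arrow/ipc/message.h"
#include "arrow/status.h"

// Hypothetical helper (not in this patch): reject messages written with an
// older, incompatible metadata version, mirroring the Java-side check.
arrow::Status CheckMessageVersion(const arrow::ipc::Message& message) {
  if (message.metadata_version() != arrow::ipc::MetadataVersion::V4) {
    return arrow::Status::Invalid("Expected IPC metadata version V4");
  }
  return arrow::Status::OK();
}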
Author: Wes McKinney Closes #1225 from wesm/ARROW-1409 and squashes the following commits: 582fad90 [Wes McKinney] Disable JS in Travis CI for now 845f290f [Wes McKinney] Bump metadata version in Java, add check for V4 e2150c19 [Wes McKinney] Remove page id from Buffer metadata, increment metadata version number --- .travis.yml | 15 +++++------ cpp/src/arrow/ipc/ipc-read-write-test.cc | 2 +- cpp/src/arrow/ipc/message.cc | 15 +---------- cpp/src/arrow/ipc/message.h | 14 ++++++++++- cpp/src/arrow/ipc/metadata-internal.cc | 25 +++++++++++++++++-- cpp/src/arrow/ipc/metadata-internal.h | 10 ++++---- cpp/src/arrow/ipc/reader.cc | 15 +---------- cpp/src/arrow/ipc/writer.cc | 12 +-------- format/Schema.fbs | 14 +++++++---- .../arrow/vector/schema/ArrowBuffer.java | 16 +++--------- .../arrow/vector/schema/ArrowRecordBatch.java | 2 +- .../vector/stream/MessageSerializer.java | 6 ++++- js/src/format/Schema_generated.ts | 20 +++------------ 13 files changed, 75 insertions(+), 91 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6419548a622f3..52d7a5f800505 100644 --- a/.travis.yml +++ b/.travis.yml @@ -112,13 +112,14 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh script: - $TRAVIS_BUILD_DIR/ci/travis_script_integration.sh - - language: node_js - os: linux - node_js: node - before_script: - - $TRAVIS_BUILD_DIR/ci/travis_before_script_js.sh - script: - - $TRAVIS_BUILD_DIR/ci/travis_script_js.sh + # TODO(wesm): Re-enable after issues in ARROW-1409 resolved + # - language: node_js + # os: linux + # node_js: node + # before_script: + # - $TRAVIS_BUILD_DIR/ci/travis_before_script_js.sh + # script: + # - $TRAVIS_BUILD_DIR/ci/travis_script_js.sh - compiler: gcc language: cpp os: linux diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/ipc-read-write-test.cc index adf34a9eb5422..6f2f5cf856055 100644 --- a/cpp/src/arrow/ipc/ipc-read-write-test.cc +++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc @@ -243,7 +243,7 @@ TEST_F(TestIpcRoundTrip, MetadataVersion) { std::unique_ptr message; ASSERT_OK(ReadMessage(0, metadata_length, mmap_.get(), &message)); - ASSERT_EQ(MetadataVersion::V3, message->metadata_version()); + ASSERT_EQ(MetadataVersion::V4, message->metadata_version()); } TEST_P(TestIpcRoundTrip, SliceRoundTrip) { diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 0dd5c72e51980..21d6a69a28603 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -67,20 +67,7 @@ class Message::MessageImpl { } MetadataVersion version() const { - switch (message_->version()) { - case flatbuf::MetadataVersion_V1: - // Arrow 0.1 - return MetadataVersion::V1; - case flatbuf::MetadataVersion_V2: - // Arrow 0.2 - return MetadataVersion::V2; - case flatbuf::MetadataVersion_V3: - // Arrow >= 0.3 - return MetadataVersion::V3; - // Add cases as other versions become available - default: - return MetadataVersion::V3; - } + return internal::GetMetadataVersion(message_->version()); } const void* header() const { return message_->header(); } diff --git a/cpp/src/arrow/ipc/message.h b/cpp/src/arrow/ipc/message.h index a1b6c07a43d0e..495474e505157 100644 --- a/cpp/src/arrow/ipc/message.h +++ b/cpp/src/arrow/ipc/message.h @@ -42,7 +42,19 @@ class RandomAccessFile; namespace ipc { -enum class MetadataVersion : char { V1, V2, V3 }; +enum class MetadataVersion : char { + /// 0.1.0 + V1, + + /// 0.2.0 + V2, + + /// 0.3.0 to 0.7.1 + V3, + + /// >= 0.8.0 + V4 +}; // ARROW-109: We set this number arbitrarily to help catch user mistakes. 
For // deeply nested schemas, it is expected the user will indicate explicitly the diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index ad00cfb6c09be..f04e9b05a01b8 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -33,6 +33,7 @@ #include "arrow/ipc/Message_generated.h" #include "arrow/ipc/Tensor_generated.h" #include "arrow/ipc/dictionary.h" +#include "arrow/ipc/message.h" #include "arrow/ipc/util.h" #include "arrow/status.h" #include "arrow/tensor.h" @@ -57,6 +58,26 @@ using VectorLayoutOffset = flatbuffers::Offset; using Offset = flatbuffers::Offset; using FBString = flatbuffers::Offset; +MetadataVersion GetMetadataVersion(flatbuf::MetadataVersion version) { + switch (version) { + case flatbuf::MetadataVersion_V1: + // Arrow 0.1 + return MetadataVersion::V1; + case flatbuf::MetadataVersion_V2: + // Arrow 0.2 + return MetadataVersion::V2; + case flatbuf::MetadataVersion_V3: + // Arrow 0.3 to 0.7.1 + return MetadataVersion::V3; + case flatbuf::MetadataVersion_V4: + // Arrow >= 0.8 + return MetadataVersion::V4; + // Add cases as other versions become available + default: + return MetadataVersion::V4; + } +} + static Status IntFromFlatbuffer(const flatbuf::Int* int_data, std::shared_ptr<DataType>* out) { if (int_data->bitWidth() > 64) { @@ -700,7 +721,7 @@ static Status WriteBuffers(FBB& fbb, const std::vector<BufferMetadata>& buffers, for (size_t i = 0; i < buffers.size(); ++i) { const BufferMetadata& buffer = buffers[i]; - fb_buffers.emplace_back(buffer.page, buffer.offset, buffer.length); + fb_buffers.emplace_back(buffer.offset, buffer.length); } *out = fbb.CreateVectorOfStructs(fb_buffers); return Status::OK(); } @@ -751,7 +772,7 @@ Status WriteTensorMessage(const Tensor& tensor, int64_t buffer_start_offset, auto fb_shape = fbb.CreateVector(dims); auto fb_strides = fbb.CreateVector(tensor.strides()); int64_t body_length = tensor.data()->size(); - flatbuf::Buffer buffer(-1, buffer_start_offset, body_length); + flatbuf::Buffer buffer(buffer_start_offset, body_length); TensorOffset fb_tensor = flatbuf::CreateTensor(fbb, fb_type_type, fb_type, fb_shape, fb_strides, &buffer); diff --git a/cpp/src/arrow/ipc/metadata-internal.h b/cpp/src/arrow/ipc/metadata-internal.h index 309e7587a754c..380f3c9eb1013 100644 --- a/cpp/src/arrow/ipc/metadata-internal.h +++ b/cpp/src/arrow/ipc/metadata-internal.h @@ -27,6 +27,7 @@ #include "arrow/ipc/Schema_generated.h" #include "arrow/ipc/dictionary.h" +#include "arrow/ipc/message.h" namespace arrow { @@ -48,10 +49,12 @@ namespace ipc { namespace internal { static constexpr flatbuf::MetadataVersion kCurrentMetadataVersion = - flatbuf::MetadataVersion_V3; + flatbuf::MetadataVersion_V4; static constexpr flatbuf::MetadataVersion kMinMetadataVersion = - flatbuf::MetadataVersion_V3; + flatbuf::MetadataVersion_V4; + +MetadataVersion GetMetadataVersion(flatbuf::MetadataVersion version); static constexpr const char* kArrowMagicBytes = "ARROW1"; @@ -62,9 +65,6 @@ struct FieldMetadata { }; struct BufferMetadata { - /// The shared memory page id where to find this. 
Set to -1 if unused - int32_t page; - /// The relative offset into the memory page to the starting byte of the buffer int64_t offset; diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 50eb9039c6ab6..8e10d7d66f907 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -550,20 +550,7 @@ class RecordBatchFileReader::RecordBatchFileReaderImpl { int num_record_batches() const { return footer_->recordBatches()->size(); } MetadataVersion version() const { - switch (footer_->version()) { - case flatbuf::MetadataVersion_V1: - // Arrow 0.1 - return MetadataVersion::V1; - case flatbuf::MetadataVersion_V2: - // Arrow 0.2 - return MetadataVersion::V2; - case flatbuf::MetadataVersion_V3: - // Arrow 0.3 - return MetadataVersion::V3; - // Add cases as other versions become available - default: - return MetadataVersion::V3; - } + return internal::GetMetadataVersion(footer_->version()); } FileBlock record_batch(int i) const { diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 279a69544faf2..5598cc68296f7 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -149,8 +149,6 @@ class RecordBatchSerializer : public ArrayVisitor { buffer_meta_.reserve(buffers_.size()); - const int32_t kNoPageId = -1; - // Construct the buffer metadata for the record batch header for (size_t i = 0; i < buffers_.size(); ++i) { const Buffer* buffer = buffers_[i].get(); @@ -163,15 +161,7 @@ class RecordBatchSerializer : public ArrayVisitor { padding = BitUtil::RoundUpToMultipleOf8(size) - size; } - // TODO(wesm): We currently have no notion of shared memory page id's, - // but we've included it in the metadata IDL for when we have it in the - // future. Use page = -1 for now - // - // Note that page ids are a bespoke notion for Arrow and not a feature we - // are using from any OS-level shared memory. The thought is that systems - // may (in the future) associate integer page id's with physical memory - // pages (according to whatever is the desired shared memory mechanism) - buffer_meta_.push_back({kNoPageId, offset, size + padding}); + buffer_meta_.push_back({offset, size + padding}); offset += size + padding; } diff --git a/format/Schema.fbs b/format/Schema.fbs index 186f8e362bde2..6021e92b847e7 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -20,9 +20,17 @@ namespace org.apache.arrow.flatbuf; enum MetadataVersion:short { + /// 0.1.0 V1, + + /// 0.2.0 V2, - V3 + + /// 0.3.0 -> 0.7.1 + V3, + + /// >= 0.8.0 + V4 } /// These are stored in the flatbuffer in the Type union below @@ -293,10 +301,6 @@ enum Endianness:short { Little, Big } /// ---------------------------------------------------------------------- /// A Buffer represents a single contiguous memory segment struct Buffer { - /// The shared memory page id where this buffer is located. 
Currently this is - /// not used - page: int; - /// The relative offset into the shared memory page where the bytes for this /// buffer starts offset: long; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowBuffer.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowBuffer.java index d8c9e3001d0a5..4e0187e791b5a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowBuffer.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowBuffer.java @@ -24,21 +24,15 @@ public class ArrowBuffer implements FBSerializable { - private int page; private long offset; private long size; - public ArrowBuffer(int page, long offset, long size) { + public ArrowBuffer(long offset, long size) { super(); - this.page = page; this.offset = offset; this.size = size; } - public int getPage() { - return page; - } - public long getOffset() { return offset; } @@ -52,7 +46,6 @@ public int hashCode() { final int prime = 31; int result = 1; result = prime * result + (int) (offset ^ (offset >>> 32)); - result = prime * result + page; result = prime * result + (int) (size ^ (size >>> 32)); return result; } @@ -72,9 +65,6 @@ public boolean equals(Object obj) { if (offset != other.offset) { return false; } - if (page != other.page) { - return false; - } if (size != other.size) { return false; } @@ -83,12 +73,12 @@ public boolean equals(Object obj) { @Override public int writeTo(FlatBufferBuilder builder) { - return Buffer.createBuffer(builder, page, offset, size); + return Buffer.createBuffer(builder, offset, size); } @Override public String toString() { - return "ArrowBuffer [page=" + page + ", offset=" + offset + ", size=" + size + "]"; + return "ArrowBuffer [offset=" + offset + ", size=" + size + "]"; } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java index c842d4c3f9a74..bf0967a2797fe 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java @@ -72,7 +72,7 @@ public ArrowRecordBatch(int length, List nodes, List b for (ArrowBuf arrowBuf : buffers) { arrowBuf.retain(); long size = arrowBuf.readableBytes(); - arrowBuffers.add(new ArrowBuffer(0, offset, size)); + arrowBuffers.add(new ArrowBuffer(offset, size)); LOGGER.debug(String.format("Buffer in RecordBatch at %d, length: %d", offset, size)); offset += size; if (alignBuffers && offset % 8 != 0) { // align on 8 byte boundaries diff --git a/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java b/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java index f69aa41e7f6bd..c397cec72f0ed 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java @@ -385,6 +385,10 @@ public static ArrowMessage deserializeMessageBatch(ReadChannel in, BufferAllocat throw new IOException("Cannot currently deserialize record batches over 2GB"); } + if (message.version() != MetadataVersion.V4) { + throw new IOException("Received metadata with an incompatible version number"); + } + switch (message.headerType()) { case MessageHeader.RecordBatch: return deserializeRecordBatch(in, message, alloc); @@ -409,7 +413,7 @@ public static ByteBuffer serializeMessage(FlatBufferBuilder builder, byte header 
Message.startMessage(builder); Message.addHeaderType(builder, headerType); Message.addHeader(builder, headerOffset); - Message.addVersion(builder, MetadataVersion.V3); + Message.addVersion(builder, MetadataVersion.V4); Message.addBodyLength(builder, bodyLength); builder.finish(Message.endMessage(builder)); return builder.dataBuffer(); diff --git a/js/src/format/Schema_generated.ts b/js/src/format/Schema_generated.ts index 65493b7f685ec..c5b3e5011d790 100644 --- a/js/src/format/Schema_generated.ts +++ b/js/src/format/Schema_generated.ts @@ -2027,16 +2027,6 @@ export namespace org.apache.arrow.flatbuf { return this; } - /** - * The shared memory page id where this buffer is located. Currently this is - * not used - * - * @returns {number} - */ - page(): number { - return this.bb.readInt32(this.bb_pos); - } - /** * The relative offset into the shared memory page where the bytes for this * buffer starts @@ -2044,7 +2034,7 @@ export namespace org.apache.arrow.flatbuf { * @returns {flatbuffers.Long} */ offset(): flatbuffers.Long { - return this.bb.readInt64(this.bb_pos + 8); + return this.bb.readInt64(this.bb_pos); } /** @@ -2054,7 +2044,7 @@ export namespace org.apache.arrow.flatbuf { * @returns {flatbuffers.Long} */ length(): flatbuffers.Long { - return this.bb.readInt64(this.bb_pos + 16); + return this.bb.readInt64(this.bb_pos + 8); } /** @@ -2064,12 +2054,10 @@ export namespace org.apache.arrow.flatbuf { * @param {flatbuffers.Long} length * @returns {flatbuffers.Offset} */ - static createBuffer(builder: flatbuffers.Builder, page: number, offset: flatbuffers.Long, length: flatbuffers.Long): flatbuffers.Offset { - builder.prep(8, 24); + static createBuffer(builder: flatbuffers.Builder, offset: flatbuffers.Long, length: flatbuffers.Long): flatbuffers.Offset { + builder.prep(8, 16); builder.writeInt64(length); builder.writeInt64(offset); - builder.pad(4); - builder.writeInt32(page); return builder.offset(); } From 72b50bc597693f098b67489bd8da40862c9770a2 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Oct 2017 11:36:28 -0400 Subject: [PATCH 003/177] [C++] Fix clang-format failure from ARROW-1409 Change-Id: Ia45874945e050de0aa6294cbbe01ec63e9b14235 --- cpp/src/arrow/ipc/metadata-internal.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index f04e9b05a01b8..f0f0f675853b1 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -72,7 +72,7 @@ MetadataVersion GetMetadataVersion(flatbuf::MetadataVersion version) { case flatbuf::MetadataVersion_V4: // Arrow >= 0.8 return MetadataVersion::V4; - // Add cases as other versions become available + // Add cases as other versions become available default: return MetadataVersion::V4; } From 088055019837f3a8215425959fadb037e01d0b02 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Tue, 31 Oct 2017 12:51:06 -0400 Subject: [PATCH 004/177] ARROW-1754: [Python] Fix buggy Parquet roundtrip when an index name is the same as a column name Author: Phillip Cloud Closes #1271 from cpcloud/ARROW-1754 and squashes the following commits: 3ffbe541 [Phillip Cloud] ARROW-1754: [Python] Fix buggy Parquet roundtrip when an index name is the same as a column name --- python/pyarrow/pandas_compat.py | 52 ++++++++-------------------- python/pyarrow/tests/test_parquet.py | 43 +++++++++++++++++------ 2 files changed, 48 insertions(+), 47 deletions(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py 
index d6c844c8490f5..1984598ff3533 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -18,7 +18,6 @@ import ast import collections import json -import re import numpy as np import pandas as pd @@ -29,13 +28,6 @@ from pyarrow.compat import PY2, zip_longest # noqa -INDEX_LEVEL_NAME_REGEX = re.compile(r'^__index_level_\d+__$') - - -def is_unnamed_index_level(name): - return INDEX_LEVEL_NAME_REGEX.match(name) is not None - - def infer_dtype(column): try: return pd.api.types.infer_dtype(column) @@ -143,7 +135,7 @@ def get_column_metadata(column, name, arrow_type): Parameters ---------- - column : pandas.Series + column : pandas.Series or pandas.Index name : str arrow_type : pyarrow.DataType @@ -161,7 +153,7 @@ def get_column_metadata(column, name, arrow_type): } string_dtype = 'object' - if not isinstance(name, six.string_types): + if name is not None and not isinstance(name, six.string_types): raise TypeError( 'Column name must be a string. Got column {} of type {}'.format( name, type(name).__name__ @@ -176,23 +168,7 @@ def get_column_metadata(column, name, arrow_type): } -def index_level_name(index, i): - """Return the name of an index level or a default name if `index.name` is - None. - - Parameters - ---------- - index : pandas.Index - i : int - - Returns - ------- - name : str - """ - if index.name is not None: - return index.name - else: - return '__index_level_{:d}__'.format(i) +index_level_name = '__index_level_{:d}__'.format def construct_metadata(df, column_names, index_levels, preserve_index, types): @@ -222,11 +198,11 @@ def construct_metadata(df, column_names, index_levels, preserve_index, types): ] if preserve_index: - index_column_names = [index_level_name(level, i) - for i, level in enumerate(index_levels)] + index_column_names = list(map( + index_level_name, range(len(index_levels)) + )) index_column_metadata = [ - get_column_metadata(level, name=index_level_name(level, i), - arrow_type=arrow_type) + get_column_metadata(level, name=level.name, arrow_type=arrow_type) for i, (level, arrow_type) in enumerate( zip(index_levels, index_types) ) @@ -317,7 +293,7 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1): for i, column in enumerate(index_columns): columns_to_convert.append(column) convert_types.append(None) - names.append(index_level_name(column, i)) + names.append(index_level_name(i)) # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether # using a thread pool is worth it. 
Currently the heuristic is whether the @@ -378,6 +354,7 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1): import pyarrow.lib as lib index_columns = [] + columns = [] column_indexes = [] index_arrays = [] index_names = [] @@ -390,6 +367,7 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1): if has_pandas_metadata: pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8')) index_columns = pandas_metadata['index_columns'] + columns = pandas_metadata['columns'] column_indexes = pandas_metadata.get('column_indexes', []) table = _add_any_metadata(table, pandas_metadata) @@ -397,11 +375,11 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1): # Build up a list of index columns and names while removing those columns # from the original table - for name in index_columns: - i = schema.get_field_index(name) + logical_index_names = [c['name'] for c in columns[-len(index_columns):]] + for raw_name, logical_name in zip(index_columns, logical_index_names): + i = schema.get_field_index(raw_name) if i != -1: col = table.column(i) - index_name = None if is_unnamed_index_level(name) else name col_pandas = col.to_pandas() values = col_pandas.values if not values.flags.writeable: @@ -410,9 +388,9 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1): values = values.copy() index_arrays.append(pd.Series(values, dtype=col_pandas.dtype)) - index_names.append(index_name) + index_names.append(logical_name) block_table = block_table.remove_column( - block_table.schema.get_field_index(name) + block_table.schema.get_field_index(raw_name) ) # Convert an arrow table to Block from the internal pandas API diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index a7fe98ce71cd1..95dd6a471b6b3 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -1171,7 +1171,8 @@ def test_dataset_read_pandas(tmpdir): @parquet -def test_dataset_read_pandas_common_metadata(tmpdir): +@pytest.mark.parametrize('preserve_index', [True, False]) +def test_dataset_read_pandas_common_metadata(tmpdir, preserve_index): # ARROW-1103 import pyarrow.parquet as pq @@ -1186,15 +1187,11 @@ def test_dataset_read_pandas_common_metadata(tmpdir): paths = [] for i in range(nfiles): df = _test_dataframe(size, seed=i) - df.index = pd.Index(np.arange(i * size, (i + 1) * size)) - df.index.name = 'index' + df.index = pd.Index(np.arange(i * size, (i + 1) * size), name='index') - path = pjoin(dirpath, '{0}.parquet'.format(i)) + path = pjoin(dirpath, '{:d}.parquet'.format(i)) - df_ex_index = df.reset_index(drop=True) - df_ex_index['index'] = df.index - table = pa.Table.from_pandas(df_ex_index, - preserve_index=False) + table = pa.Table.from_pandas(df, preserve_index=preserve_index) # Obliterate metadata table = table.replace_schema_metadata(None) @@ -1206,7 +1203,9 @@ def test_dataset_read_pandas_common_metadata(tmpdir): paths.append(path) # Write _metadata common file - table_for_metadata = pa.Table.from_pandas(df) + table_for_metadata = pa.Table.from_pandas( + df, preserve_index=preserve_index + ) pq.write_metadata(table_for_metadata.schema, pjoin(dirpath, '_metadata')) @@ -1214,7 +1213,7 @@ def test_dataset_read_pandas_common_metadata(tmpdir): columns = ['uint8', 'strings'] result = dataset.read_pandas(columns=columns).to_pandas() expected = pd.concat([x[columns] for x in frames]) - + expected.index.name = df.index.name if preserve_index else None tm.assert_frame_equal(result, expected) @@ -1387,3 +1386,27 @@ def 
test_large_table_int32_overflow(): table = pa.Table.from_arrays([parr], names=['one']) f = io.BytesIO() _write_table(table, f) + + +def test_index_column_name_duplicate(tmpdir): + data = { + 'close': { + pd.Timestamp('2017-06-30 01:31:00'): 154.99958999999998, + pd.Timestamp('2017-06-30 01:32:00'): 154.99958999999998, + }, + 'time': { + pd.Timestamp('2017-06-30 01:31:00'): pd.Timestamp( + '2017-06-30 01:31:00' + ), + pd.Timestamp('2017-06-30 01:32:00'): pd.Timestamp( + '2017-06-30 01:32:00' + ), + } + } + path = str(tmpdir / 'data.parquet') + dfx = pd.DataFrame(data).set_index('time', drop=False) + tdfx = pa.Table.from_pandas(dfx) + _write_table(tdfx, path) + arrow_table = _read_table(path) + result_df = arrow_table.to_pandas() + tm.assert_frame_equal(result_df, dfx) From eca992471b1b2230259509f74d7bc4af97922788 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 31 Oct 2017 12:54:06 -0400 Subject: [PATCH 005/177] ARROW-1658: [Python] Add boundschecking of dictionary indices when creating CategoricalBlock We should probably do this bounds-checking earlier and in the main Arrow C++ library when ingesting "untrusted" arrays. I will create a JIRA, but this is a stopgap in the meantime Author: Wes McKinney Closes #1270 from wesm/ARROW-1658 and squashes the following commits: 234a5685 [Wes McKinney] Add boundschecking of dictionary indices when creating CategoricalBlock as workaround for segfaults from invalid codes making their way into pandas --- cpp/src/arrow/python/arrow_to_pandas.cc | 38 ++++++++++++++------- python/pyarrow/array.pxi | 37 +++++++++++--------- python/pyarrow/tests/test_convert_pandas.py | 15 ++++++++ 3 files changed, 61 insertions(+), 29 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 7f1591213cec6..c92faede1347b 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -966,9 +966,10 @@ class CategoricalBlock : public PandasBlock { "CategoricalBlock allocation happens when calling Write"); } - template + template Status WriteIndices(const std::shared_ptr& col) { - using TRAITS = internal::arrow_traits; + using ArrayType = typename TypeTraits::ArrayType; + using TRAITS = internal::arrow_traits; using T = typename TRAITS::T; constexpr int npy_type = TRAITS::npy_type; @@ -977,10 +978,22 @@ class CategoricalBlock : public PandasBlock { // Sniff the first chunk const std::shared_ptr arr_first = data.chunk(0); const auto& dict_arr_first = static_cast(*arr_first); - const auto& indices_first = - static_cast(*dict_arr_first.indices()); + const auto& indices_first = static_cast(*dict_arr_first.indices()); + + auto CheckIndices = [](const ArrayType& arr, int64_t dict_length) { + const T* values = arr.raw_values(); + for (int64_t i = 0; i < arr.length(); ++i) { + if (arr.IsValid(i) && (values[i] < 0 || values[i] >= dict_length)) { + std::stringstream ss; + ss << "Out of bounds dictionary index: " << static_cast(values[i]); + return Status::Invalid(ss.str()); + } + } + return Status::OK(); + }; if (data.num_chunks() == 1 && indices_first.null_count() == 0) { + RETURN_NOT_OK(CheckIndices(indices_first, dict_arr_first.dictionary()->length())); RETURN_NOT_OK(AllocateNDArrayFromIndices(npy_type, indices_first)); } else { if (options_.zero_copy_only) { @@ -998,9 +1011,10 @@ class CategoricalBlock : public PandasBlock { const std::shared_ptr arr = data.chunk(c); const auto& dict_arr = static_cast(*arr); - const auto& indices = static_cast(*dict_arr.indices()); + const auto& 
indices = static_cast(*dict_arr.indices()); auto in_values = reinterpret_cast(indices.raw_values()); + RETURN_NOT_OK(CheckIndices(indices, dict_arr.dictionary()->length())); // Null is -1 in CategoricalBlock for (int i = 0; i < arr->length(); ++i) { *out_values++ = indices.IsNull(i) ? -1 : in_values[i]; @@ -1026,16 +1040,16 @@ class CategoricalBlock : public PandasBlock { switch (dict_type.index_type()->id()) { case Type::INT8: - RETURN_NOT_OK(WriteIndices(converted_col)); + RETURN_NOT_OK(WriteIndices(converted_col)); break; case Type::INT16: - RETURN_NOT_OK(WriteIndices(converted_col)); + RETURN_NOT_OK(WriteIndices(converted_col)); break; case Type::INT32: - RETURN_NOT_OK(WriteIndices(converted_col)); + RETURN_NOT_OK(WriteIndices(converted_col)); break; case Type::INT64: - RETURN_NOT_OK(WriteIndices(converted_col)); + RETURN_NOT_OK(WriteIndices(converted_col)); break; default: { std::stringstream ss; @@ -1091,13 +1105,11 @@ class CategoricalBlock : public PandasBlock { PyObject* block_arr = PyArray_NewFromDescr(&PyArray_Type, descr, 1, block_dims, nullptr, data, NPY_ARRAY_CARRAY, nullptr); + RETURN_IF_PYERROR(); npy_intp placement_dims[1] = {num_columns_}; PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64); - if (placement_arr == NULL) { - // TODO(wesm): propagating Python exception - return Status::OK(); - } + RETURN_IF_PYERROR(); block_arr_.reset(block_arr); placement_arr_.reset(placement_arr); diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 7da5c3caffdc2..7752d062a774c 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -162,6 +162,7 @@ def array(object obj, type=None, mask=None, return DictionaryArray.from_arrays( values.codes, values.categories.values, mask=mask, ordered=values.ordered, + from_pandas=from_pandas, memory_pool=memory_pool) else: values, type = pdcompat.get_datetimetz_type(values, obj.dtype, @@ -671,7 +672,7 @@ cdef class DictionaryArray(Array): @staticmethod def from_arrays(indices, dictionary, mask=None, ordered=False, - MemoryPool memory_pool=None): + from_pandas=False, MemoryPool memory_pool=None): """ Construct Arrow DictionaryArray from array of indices (must be non-negative integers) and corresponding array of dictionary values @@ -682,15 +683,20 @@ cdef class DictionaryArray(Array): dictionary : ndarray or pandas.Series mask : ndarray or pandas.Series, boolean type True values indicate that indices are actually null + from_pandas : boolean, default False + If True, the indices should be treated as though they originated in + a pandas.Categorical (null encoded as -1) ordered : boolean, default False Set to True if the category values are ordered + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise uses default pool Returns ------- dict_array : DictionaryArray """ cdef: - Array arrow_indices, arrow_dictionary + Array _indices, _dictionary DictionaryArray result shared_ptr[CDataType] c_type shared_ptr[CArray] c_result @@ -699,29 +705,28 @@ cdef class DictionaryArray(Array): if mask is not None: raise NotImplementedError( "mask not implemented with Arrow array inputs yet") - arrow_indices = indices + _indices = indices else: - if mask is None: - mask = indices == -1 - else: - mask = mask | (indices == -1) - arrow_indices = Array.from_pandas(indices, mask=mask, - memory_pool=memory_pool) + if from_pandas: + if mask is None: + mask = indices == -1 + else: + mask = mask | (indices == -1) + _indices = array(indices, mask=mask, memory_pool=memory_pool) if 
isinstance(dictionary, Array): - arrow_dictionary = dictionary + _dictionary = dictionary else: - arrow_dictionary = Array.from_pandas(dictionary, - memory_pool=memory_pool) + _dictionary = array(dictionary, memory_pool=memory_pool) - if not isinstance(arrow_indices, IntegerArray): + if not isinstance(_indices, IntegerArray): raise ValueError('Indices must be integer type') cdef c_bool c_ordered = ordered - c_type.reset(new CDictionaryType(arrow_indices.type.sp_type, - arrow_dictionary.sp_array, c_ordered)) - c_result.reset(new CDictionaryArray(c_type, arrow_indices.sp_array)) + c_type.reset(new CDictionaryType(_indices.type.sp_type, + _dictionary.sp_array, c_ordered)) + c_result.reset(new CDictionaryArray(c_type, _indices.sp_array)) result = DictionaryArray() result.init(c_result) diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 07ecf3010a32e..dabccac37c3d8 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -217,6 +217,21 @@ def test_zero_copy_success(self): result = pa.array([0, 1, 2]).to_pandas(zero_copy_only=True) npt.assert_array_equal(result, [0, 1, 2]) + def test_dictionary_indices_boundscheck(self): + # ARROW-1658. No validation of indices leads to segfaults in pandas + indices = [[0, 1], [0, -1]] + + for inds in indices: + arr = pa.DictionaryArray.from_arrays(inds, ['a']) + batch = pa.RecordBatch.from_arrays([arr], ['foo']) + table = pa.Table.from_batches([batch, batch, batch]) + + with pytest.raises(pa.ArrowException): + arr.to_pandas() + + with pytest.raises(pa.ArrowException): + table.to_pandas() + def test_zero_copy_dictionaries(self): arr = pa.DictionaryArray.from_arrays( np.array([0, 0]), From 9dc4c58d57159edf24bcbe86b6220a6a3bcb09ef Mon Sep 17 00:00:00 2001 From: dhirschf Date: Tue, 31 Oct 2017 14:11:19 -0400 Subject: [PATCH 006/177] ARROW-1753: [Python] Provide for matching subclasses with register_type in serialization context https://issues.apache.org/jira/browse/ARROW-1753 Author: dhirschf Author: Philipp Moritz Closes #1272 from dhirschfeld/ARROW-1753 and squashes the following commits: bb7f041a [Philipp Moritz] fix subclass serialization tests cbc5e09c [dhirschf] Added a test that register_type will work for subclasses of the registered type 46b60f44 [dhirschf] Allow register_type to match subclasses in serialization context --- python/pyarrow/serialization.pxi | 17 ++++-- python/pyarrow/serialization.py | 2 + python/pyarrow/tests/test_serialization.py | 66 ++++++++++++++++++++++ python/requirements.txt | 2 +- 4 files changed, 82 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/serialization.pxi b/python/pyarrow/serialization.pxi index 4e9ab8eb3b374..6b7227797a836 100644 --- a/python/pyarrow/serialization.pxi +++ b/python/pyarrow/serialization.pxi @@ -88,17 +88,26 @@ cdef class SerializationContext: self.custom_deserializers[type_id] = custom_deserializer def _serialize_callback(self, obj): - if type(obj) not in self.type_to_type_id: + found = False + for type_ in type(obj).__mro__: + if type_ in self.type_to_type_id: + found = True + break + + if not found: raise SerializationCallbackError( "pyarrow does not know how to " - "serialize objects of type {}.".format(type(obj)), obj) - type_id = self.type_to_type_id[type(obj)] + "serialize objects of type {}.".format(type(obj)), obj + ) + + # use the closest match to type(obj) + type_id = self.type_to_type_id[type_] if type_id in self.types_to_pickle: serialized_obj = {"data": pickle.dumps(obj), 
"pickle": True} elif type_id in self.custom_serializers: serialized_obj = {"data": self.custom_serializers[type_id](obj)} else: - if is_named_tuple(type(obj)): + if is_named_tuple(type_): serialized_obj = {} serialized_obj["_pa_getnewargs_"] = obj.__getnewargs__() elif hasattr(obj, "__dict__"): diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py index 9dc8ee6dee9ad..2b47513fd1c85 100644 --- a/python/pyarrow/serialization.py +++ b/python/pyarrow/serialization.py @@ -69,6 +69,8 @@ def _deserialize_default_dict(data): type(lambda: 0), "function", pickle=True) + serialization_context.register_type(type, "type", pickle=True) + # ---------------------------------------------------------------------- # Set up serialization for numpy with dtype object (primitive types are # handled efficiently with Arrow's Tensor facilities, see diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py index 7878a09228d06..b0c5bc49e6a58 100644 --- a/python/pyarrow/tests/test_serialization.py +++ b/python/pyarrow/tests/test_serialization.py @@ -416,3 +416,69 @@ class TempClass(object): with pytest.raises(pa.DeserializationCallbackError) as err: serialized_object.deserialize(deserialization_context) assert err.value.type_id == 20*b"\x00" + + +def test_fallback_to_subclasses(): + + class SubFoo(Foo): + def __init__(self): + Foo.__init__(self) + + # should be able to serialize/deserialize an instance + # if a base class has been registered + serialization_context = pa.SerializationContext() + serialization_context.register_type(Foo, "Foo") + + subfoo = SubFoo() + # should fallbact to Foo serializer + serialized_object = pa.serialize(subfoo, serialization_context) + + reconstructed_object = serialized_object.deserialize( + serialization_context + ) + assert type(reconstructed_object) == Foo + + +class Serializable(object): + pass + + +def serialize_serializable(obj): + return {"type": type(obj), "data": obj.__dict__} + + +def deserialize_serializable(obj): + val = obj["type"].__new__(obj["type"]) + val.__dict__.update(obj["data"]) + return val + + +class SerializableClass(Serializable): + def __init__(self): + self.value = 3 + + +def test_serialize_subclasses(): + + # This test shows how subclasses can be handled in an idiomatic way + # by having only a serializer for the base class + + # This technique should however be used with care, since pickling + # type(obj) with couldpickle will include the full class definition + # in the serialized representation. + # This means the class definition is part of every instance of the + # object, which in general is not desirable; registering all subclasses + # with register_type will result in faster and more memory + # efficient serialization. 
+
+    serialization_context.register_type(
+        Serializable, "Serializable",
+        custom_serializer=serialize_serializable,
+        custom_deserializer=deserialize_serializable)
+
+    a = SerializableClass()
+    serialized = pa.serialize(a)
+
+    deserialized = serialized.deserialize()
+    assert type(deserialized).__name__ == SerializableClass.__name__
+    assert deserialized.value == 3
diff --git a/python/requirements.txt b/python/requirements.txt
index d2e28a7747ba8..8d0c33afa69a6 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -1,4 +1,4 @@
 pytest
-cloudpickle
+cloudpickle>=0.4.0
 numpy>=1.10.0
 six

From 142e6ee69bd6a4dc316d00d9efd6d86d119df075 Mon Sep 17 00:00:00 2001
From: Heimir Sverrisson
Date: Tue, 31 Oct 2017 22:54:09 -0400
Subject: [PATCH 007/177] ARROW-1455 [Python] Add Dockerfile for validating Dask integration

A Docker container is created with all the dependencies needed to pull down the Dask code from GitHub, install it locally together with Arrow, and run an integration test.

Author: Heimir Sverrisson

Closes #1249 from heimir-sverrisson/hs/dockerize_dask and squashes the following commits:

d146185b [Heimir Sverrisson] ARROW-1455 [Python] Add Dockerfile for validating Dask integration
---
 dev/dask_integration.sh                  | 21 +++++
 dev/dask_integration/Dockerfile          | 88 +++++++++++++++++++
 dev/dask_integration/dask_integration.sh | 49 +++++++++++
 dev/docker-compose.yml                   |  5 ++
 dev/run_docker_compose.sh                |  2 +-
 python/testing/README.md                 | 24 ++++-
 .../dask_tests/test_dask_integration.py  | 51 +++++++++++
 7 files changed, 238 insertions(+), 2 deletions(-)
 create mode 100755 dev/dask_integration.sh
 create mode 100644 dev/dask_integration/Dockerfile
 create mode 100755 dev/dask_integration/dask_integration.sh
 create mode 100644 python/testing/dask_tests/test_dask_integration.py

diff --git a/dev/dask_integration.sh b/dev/dask_integration.sh
new file mode 100755
index 0000000000000..d344328b6af1e
--- /dev/null
+++ b/dev/dask_integration.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Pass the service name to run_docker_compose.sh,
+# which validates the environment and runs the service
+exec "$(dirname ${BASH_SOURCE})"/run_docker_compose.sh dask_integration
diff --git a/dev/dask_integration/Dockerfile b/dev/dask_integration/Dockerfile
new file mode 100644
index 0000000000000..f72ef8ca0daab
--- /dev/null
+++ b/dev/dask_integration/Dockerfile
@@ -0,0 +1,88 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +FROM ubuntu:14.04 +ADD . /apache-arrow +WORKDIR /apache-arrow +# Basic OS utilities +RUN apt-get update && apt-get install -y \ + wget \ + git \ + gcc \ + g++ +# This will install conda in /home/ubuntu/miniconda +RUN wget -O /tmp/miniconda.sh \ + https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \ + rm /tmp/miniconda.sh +# Create Conda environment +ENV PATH="/home/ubuntu/miniconda/bin:${PATH}" +RUN conda create -y -q -n test-environment \ + python=3.6 +# Install dependencies +RUN conda install -c conda-forge \ + numpy \ + pandas \ + bcolz \ + blosc \ + bokeh \ + boto3 \ + chest \ + cloudpickle \ + coverage \ + cytoolz \ + distributed \ + graphviz \ + h5py \ + ipython \ + partd \ + psutil \ + "pytest<=3.1.1" \ + scikit-image \ + scikit-learn \ + scipy \ + sqlalchemy \ + toolz +# install pytables from defaults for now +RUN conda install pytables + +RUN pip install -q git+https://github.com/dask/partd --upgrade --no-deps +RUN pip install -q git+https://github.com/dask/zict --upgrade --no-deps +RUN pip install -q git+https://github.com/dask/distributed --upgrade --no-deps +RUN pip install -q git+https://github.com/mrocklin/sparse --upgrade --no-deps +RUN pip install -q git+https://github.com/dask/s3fs --upgrade --no-deps + +RUN conda install -q -c conda-forge numba cython +RUN pip install -q git+https://github.com/dask/fastparquet + +RUN pip install -q \ + cachey \ + graphviz \ + moto \ + pyarrow \ + --upgrade --no-deps + +RUN pip install -q \ + cityhash \ + flake8 \ + mmh3 \ + pandas_datareader \ + pytest-xdist \ + xxhash \ + pycodestyle + +CMD arrow/dev/dask_integration/dask_integration.sh + diff --git a/dev/dask_integration/dask_integration.sh b/dev/dask_integration/dask_integration.sh new file mode 100755 index 0000000000000..f5a24e462b742 --- /dev/null +++ b/dev/dask_integration/dask_integration.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
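+#
+
+# This script runs inside the container built from
+# dev/dask_integration/Dockerfile: it installs the Dask master branch
+# with a --user install, runs the Dask dataframe test suite against
+# that install, then runs the Arrow/Dask integration and Parquet tests.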
+
+# Set up environment and working directory
+cd /apache-arrow
+
+export ARROW_BUILD_TYPE=release
+export ARROW_HOME=$(pwd)/dist
+export PARQUET_HOME=$(pwd)/dist
+CONDA_BASE=/home/ubuntu/miniconda
+export LD_LIBRARY_PATH=$(pwd)/dist/lib:${CONDA_BASE}/lib:${LD_LIBRARY_PATH}
+
+# Allow for --user Python installation inside Docker
+export HOME=$(pwd)
+
+# Clean up and get the dask master branch from GitHub
+rm -rf dask .local
+export GIT_COMMITTER_NAME="Nobody"
+export GIT_COMMITTER_EMAIL="nobody@nowhere.com"
+git clone https://github.com/dask/dask.git
+pushd dask
+pip install --user -e .[complete]
+# Verify integrity of the installed dask dataframe code
+py.test dask/dataframe/tests/test_dataframe.py
+popd
+
+# Run the integration test
+pushd arrow/python/testing
+py.test dask_tests
+popd
+
+pushd dask/dask/dataframe/io
+py.test tests/test_parquet.py
+popd
diff --git a/dev/docker-compose.yml b/dev/docker-compose.yml
index 7bd2cd4412cec..4b9014894003b 100644
--- a/dev/docker-compose.yml
+++ b/dev/docker-compose.yml
@@ -28,3 +28,8 @@ services:
       - "4000:4000"
     volumes:
       - ../..:/apache-arrow
+  dask_integration:
+    build:
+      context: dask_integration
+    volumes:
+      - ../..:/apache-arrow
diff --git a/dev/run_docker_compose.sh b/dev/run_docker_compose.sh
index f46879ed1e436..681a3a75ffe20 100755
--- a/dev/run_docker_compose.sh
+++ b/dev/run_docker_compose.sh
@@ -37,4 +37,4 @@ fi
 GID=$(id -g ${USERNAME})
 docker-compose -f arrow/dev/docker-compose.yml run \
-               -u "${UID}:${GID}" "${1}"
+               --rm -u "${UID}:${GID}" "${1}"
diff --git a/python/testing/README.md b/python/testing/README.md
index 07970a231b54b..0ebeec4a1c3e7 100644
--- a/python/testing/README.md
+++ b/python/testing/README.md
@@ -23,4 +23,26 @@

 ```shell
 ./test_hdfs.sh
-```
\ No newline at end of file
+```
+
+## Testing Dask integration
+
+Initial integration testing with Dask has been Dockerized.
+To invoke the test, run the following command in the `arrow`
+root directory:
+
+```shell
+bash dev/dask_integration.sh
+```
+
+This script will create a `dask` directory on the same level as
+`arrow`. It will clone the Dask project from GitHub into `dask`
+and do a Python `--user` install. The Docker container uses the parent
+directory of `arrow` as `$HOME`, which is where Python will
+install `dask` into a `.local` directory.
+
+The output of the Docker session will contain the results of the
+Dask dataframe tests followed by the single integration test that
+now exists for Arrow. That test creates a set of CSV files and then
+reads them in parallel into a Dask dataframe. The code
+for this test resides in the `dask_tests` directory.
diff --git a/python/testing/dask_tests/test_dask_integration.py b/python/testing/dask_tests/test_dask_integration.py
new file mode 100644
index 0000000000000..e678348780cd8
--- /dev/null
+++ b/python/testing/dask_tests/test_dask_integration.py
@@ -0,0 +1,51 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from datetime import date, timedelta
+import csv
+from random import randint
+import dask.dataframe as dd
+import pyarrow as pa
+
+def make_datafiles(tmpdir, prefix='data', num_files=20):
+    rowcount = 5000
+    fieldnames = ['date', 'temperature', 'dewpoint']
+    start_date = date(1900, 1, 1)
+    for i in range(num_files):
+        filename = '{0}/{1}-{2}.csv'.format(tmpdir, prefix, i)
+        with open(filename, 'w') as outcsv:
+            writer = csv.DictWriter(outcsv, fieldnames)
+            writer.writeheader()
+            the_date = start_date
+            for _ in range(rowcount):
+                temperature = randint(-10, 35)
+                dewpoint = temperature - randint(0, 10)
+                writer.writerow({'date': the_date, 'temperature': temperature,
+                                 'dewpoint': dewpoint})
+                the_date += timedelta(days=1)
+
+def test_dask_file_read(tmpdir):
+    prefix = 'data'
+    make_datafiles(tmpdir, prefix)
+    # Read all datafiles in parallel
+    datafiles = '{0}/{1}-*.csv'.format(tmpdir, prefix)
+    dask_df = dd.read_csv(datafiles)
+    # Convert Dask dataframe to Arrow table
+    table = pa.Table.from_pandas(dask_df.compute())
+    # Second column (1) is temperature
+    dask_temp = int(1000 * dask_df['temperature'].mean().compute())
+    arrow_temp = int(1000 * table[1].to_pandas().mean())
+    assert dask_temp == arrow_temp

From 0373541e2e9cd3510d6e4c8ac5b580f11eb675ec Mon Sep 17 00:00:00 2001
From: Phillip Cloud
Date: Fri, 3 Nov 2017 17:10:30 +0100
Subject: [PATCH 008/177] ARROW-1766: [GLib] Fix failing builds on OSX

Author: Phillip Cloud

Closes #1279 from cpcloud/ARROW-1766 and squashes the following commits:

ed3b22b [Phillip Cloud] Only set ARCHFLAGS on osx
d9c1604 [Phillip Cloud] [GLib] Fix failing builds on OSX
---
 ci/travis_before_script_c_glib.sh | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/ci/travis_before_script_c_glib.sh b/ci/travis_before_script_c_glib.sh
index a63945e1745d0..a22ecd3753fd8 100755
--- a/ci/travis_before_script_c_glib.sh
+++ b/ci/travis_before_script_c_glib.sh
@@ -100,7 +100,12 @@ if [ $BUILD_SYSTEM = "autotools" ]; then

   ./configure $CONFIGURE_OPTIONS

-  make -j4
+  if [ "$TRAVIS_OS_NAME" = "osx" ]; then
+    ARCHFLAGS="-arch x86_64" make -j4
+  else
+    make -j4
+  fi
+
   make install
 else
   MESON_OPTIONS="--prefix=$ARROW_C_GLIB_INSTALL"

From 527af63cde4ac1df53e82d6f8ced064738f5c1a3 Mon Sep 17 00:00:00 2001
From: Paul Taylor
Date: Fri, 3 Nov 2017 17:44:52 -0400
Subject: [PATCH 009/177] ARROW-1652: [JS] housekeeping, vector cleanup

This PR addresses the first few issues in the [JS roadmap doc](https://docs.google.com/document/d/142dek89oM2TVI2Yql106Zo8IB1Ff_9zDg_EG6jPWS0M) I sent out a week or so ago. Sorry for the big PR; the housekeeping and vector cleanup work were pretty co-dependent.

JIRA issues addressed by this PR:
[ARROW-1032](https://issues.apache.org/jira/browse/ARROW-1032) - Support custom_metadata
[ARROW-1651](https://issues.apache.org/jira/browse/ARROW-1651) - Lazy row accessor in Table
[ARROW-1652](https://issues.apache.org/jira/browse/ARROW-1652) - Separate Vector into BatchVector and CompositeVector

Tasks from the roadmap (some not in JIRA):

##### Housekeeping
1. Enable the strict-mode tsc compiler settings in the build
2.
Compile mjs files for node 8.x ESModules 3. Compile ES6 UMD target with native iterators/generators ##### Vector 1. Refactor Vector types to primitive forms representing the portion of a column in a single RecordBatch 2. Add Column Vector that represents primitive Vectors across RecordBatches as an entire column 3. Refactor linear column-to-batch-index lookup in `Vector.get(i)` 4. Simplify inheritance hierarchy/generic types with Traits (e.g. Nullable, Iterable, and Typed numeric variants) ##### Table 1. Implement lazy row accessor 2. Share API/row logic with StructVector cc: @wesm @TheNeuralBit Author: Paul Taylor Closes #1273 from trxcllnt/vector-cleanup and squashes the following commits: c53d6de3 [Paul Taylor] refactor: rename vector mixins 2c83c823 [Paul Taylor] update to typescript@2.6.1 48c6ca48 [Paul Taylor] refactor: StructVector/Table#get always take numeric index, Table extends StructVector 18671edc [Paul Taylor] fix lint 04e9941d [Paul Taylor] refactor: use new compilation targets in perf tests bd7a8373 [Paul Taylor] refactor: update test's Arrow imports for new types 37b7f615 [Paul Taylor] refactor: update vector tests for new types 15ab8d4c [Paul Taylor] refactor: update table tests for new types db04a0b0 [Paul Taylor] refactor: export new Arrow types 84233dec [Paul Taylor] refactor reader to use new arrow types, fix strict TS compilation errors af4845d6 [Paul Taylor] refactor: add Arrow vector mixins 54fa2fd3 [Paul Taylor] refactor: break out virtual vector, move to types folder 2121bf1a [Paul Taylor] refactor: break out table, add Row type, move to types folder abc93310 [Paul Taylor] refactor: move struct to types folder 2a4127ce [Paul Taylor] refactor: move dictionary to types folder 607be424 [Paul Taylor] refactor: break out list/fixedsizelist/utf8, move to types folder b8a68665 [Paul Taylor] refactor: break out Typed vectors, move into types folder 0f8de75a [Paul Taylor] refactor: rename vector folder to types, move vector base class d2def198 [Paul Taylor] clean up build scripts, add ES2015 UMD and mjs targets 84b2c505 [Paul Taylor] use strict typescript compiler settings --- js/.gitignore | 9 +- js/closure-compiler-scripts/text-encoding.js | 2 +- js/gulp/argv.js | 36 ++ js/gulp/arrow-task.js | 57 +++ js/gulp/build-task.js | 35 ++ js/gulp/clean-task.js | 31 ++ js/gulp/closure-task.js | 91 +++++ js/gulp/memoize-task.js | 30 ++ js/gulp/package-task.js | 75 ++++ js/gulp/test-task.js | 44 +++ js/gulp/typescript-task.js | 43 +++ js/gulp/uglify-task.js | 113 ++++++ js/gulp/util.js | 152 ++++++++ js/gulpfile.js | 350 ++++-------------- js/package.json | 39 +- js/perf/index.js | 10 +- js/src/Arrow.externs.ts | 45 ++- js/src/Arrow.ts | 73 ++-- js/src/reader/arrow.ts | 28 +- js/src/reader/dictionary.ts | 16 +- js/src/reader/file.ts | 9 +- js/src/reader/message.ts | 8 +- js/src/reader/stream.ts | 4 +- js/src/reader/vector.ts | 312 +++++++++------- js/src/table.ts | 143 ------- js/src/text-encoding-utf-8.d.ts | 4 + js/src/types/arrow.ts | 88 +++++ js/src/types/dictionary.ts | 58 +++ .../struct.ts => types/fixedsizelist.ts} | 26 +- js/src/types/list.ts | 35 ++ js/src/types/table/from.ts | 34 ++ js/src/types/table/row.ts | 61 +++ js/src/types/table/struct.ts | 63 ++++ js/src/types/table/table.ts | 30 ++ js/src/types/table/toString.ts | 40 ++ js/src/types/types.ts | 98 +++++ .../{vector/dictionary.ts => types/utf8.ts} | 41 +- js/src/types/vector/bool.ts | 55 +++ js/src/types/vector/date.ts | 29 ++ js/src/types/vector/long.ts | 35 ++ js/src/types/vector/traits.ts | 69 ++++ 
js/src/types/vector/typed.ts | 57 +++ js/src/types/vector/virtual.ts | 129 +++++++ js/src/vector/list.ts | 108 ------ js/src/vector/typed.ts | 326 ---------------- js/src/vector/vector.ts | 91 ----- js/test/Arrow.ts | 30 +- js/test/__snapshots__/table-tests.ts.snap | 200 +++++----- js/test/table-tests.ts | 17 +- js/test/test-config.ts | 2 +- js/test/vector-tests.ts | 75 ++-- js/tsconfig/tsconfig.base.json | 30 +- js/tsconfig/tsconfig.es2015.cls.json | 3 +- js/tsconfig/tsconfig.es5.cls.json | 1 + js/tsconfig/tsconfig.esnext.cls.json | 3 +- 55 files changed, 2195 insertions(+), 1398 deletions(-) create mode 100644 js/gulp/argv.js create mode 100644 js/gulp/arrow-task.js create mode 100644 js/gulp/build-task.js create mode 100644 js/gulp/clean-task.js create mode 100644 js/gulp/closure-task.js create mode 100644 js/gulp/memoize-task.js create mode 100644 js/gulp/package-task.js create mode 100644 js/gulp/test-task.js create mode 100644 js/gulp/typescript-task.js create mode 100644 js/gulp/uglify-task.js create mode 100644 js/gulp/util.js delete mode 100644 js/src/table.ts create mode 100644 js/src/text-encoding-utf-8.d.ts create mode 100644 js/src/types/arrow.ts create mode 100644 js/src/types/dictionary.ts rename js/src/{vector/struct.ts => types/fixedsizelist.ts} (53%) create mode 100644 js/src/types/list.ts create mode 100644 js/src/types/table/from.ts create mode 100644 js/src/types/table/row.ts create mode 100644 js/src/types/table/struct.ts create mode 100644 js/src/types/table/table.ts create mode 100644 js/src/types/table/toString.ts create mode 100644 js/src/types/types.ts rename js/src/{vector/dictionary.ts => types/utf8.ts} (50%) create mode 100644 js/src/types/vector/bool.ts create mode 100644 js/src/types/vector/date.ts create mode 100644 js/src/types/vector/long.ts create mode 100644 js/src/types/vector/traits.ts create mode 100644 js/src/types/vector/typed.ts create mode 100644 js/src/types/vector/virtual.ts delete mode 100644 js/src/vector/list.ts delete mode 100644 js/src/vector/typed.ts delete mode 100644 js/src/vector/vector.ts diff --git a/js/.gitignore b/js/.gitignore index 6d0f88d191cb0..88c612d8faf37 100644 --- a/js/.gitignore +++ b/js/.gitignore @@ -18,6 +18,7 @@ # Logs logs *.log +.esm-cache npm-debug.log* yarn-debug.log* yarn-error.log* @@ -57,10 +58,6 @@ build/Release node_modules/ jspm_packages/ -# Typescript declaration files -types/ -typings/ - # Optional npm cache directory .npm @@ -85,6 +82,4 @@ package-lock.json # compilation targets dist -targets/es5 -targets/es2015 -targets/esnext +targets diff --git a/js/closure-compiler-scripts/text-encoding.js b/js/closure-compiler-scripts/text-encoding.js index ca9154f88ecba..398883ab9b4be 100644 --- a/js/closure-compiler-scripts/text-encoding.js +++ b/js/closure-compiler-scripts/text-encoding.js @@ -11,7 +11,7 @@ // Utilities // -goog.module("module$text_encoding"); +goog.module("module$text_encoding_utf_8"); goog.module.declareLegacyNamespace(); /** * @param {number} a The number to test. diff --git a/js/gulp/argv.js b/js/gulp/argv.js new file mode 100644 index 0000000000000..33553704eec25 --- /dev/null +++ b/js/gulp/argv.js @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +const argv = require(`command-line-args`)([ + { name: `all`, alias: `a`, type: Boolean }, + { name: 'update', alias: 'u', type: Boolean }, + { name: 'verbose', alias: 'v', type: Boolean }, + { name: `target`, type: String, defaultValue: `` }, + { name: `module`, type: String, defaultValue: `` }, + { name: `coverage`, type: Boolean, defaultValue: false }, + { name: `targets`, alias: `t`, type: String, multiple: true, defaultValue: [] }, + { name: `modules`, alias: `m`, type: String, multiple: true, defaultValue: [] } +]); + +const { targets, modules } = argv; + +argv.target && !targets.length && targets.push(argv.target); +argv.module && !modules.length && modules.push(argv.module); +(argv.all || !targets.length) && targets.push(`all`); +(argv.all || !modules.length) && modules.push(`all`); + +module.exports = { argv, targets, modules }; \ No newline at end of file diff --git a/js/gulp/arrow-task.js b/js/gulp/arrow-task.js new file mode 100644 index 0000000000000..d160ecb0e5de4 --- /dev/null +++ b/js/gulp/arrow-task.js @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
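+
+// Assembles the main `apache-arrow` npm package: copies the TypeScript
+// sources plus the compiled es5/cjs output, the es2015/esm output renamed
+// to `.mjs`, and the minified UMD bundles (with their unrenamed
+// sourcemaps) from the other build targets into one directory.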
+
+const {
+    mainExport, gCCLanguageNames,
+    targetDir, observableFromStreams
+} = require('./util');
+
+const gulp = require('gulp');
+const path = require('path');
+const gulpRename = require(`gulp-rename`);
+const { memoizeTask } = require('./memoize-task');
+const { Observable, ReplaySubject } = require('rxjs');
+
+const arrowTask = ((cache) => memoizeTask(cache, function copyMain(target, format) {
+    const out = targetDir(target), srcGlob = `src/**/*`;
+    const es5Glob = `${targetDir(`es5`, `cjs`)}/**/*.js`;
+    const esmGlob = `${targetDir(`es2015`, `esm`)}/**/*.js`;
+    const es5UmdGlob = `${targetDir(`es5`, `umd`)}/**/*.js`;
+    const es5UmdMaps = `${targetDir(`es5`, `umd`)}/**/*.map`;
+    const es2015UmdGlob = `${targetDir(`es2015`, `umd`)}/**/*.js`;
+    const es2015UmdMaps = `${targetDir(`es2015`, `umd`)}/**/*.map`;
+    const ch_ext = (ext) => gulpRename((p) => { p.extname = ext; });
+    const append = (ap) => gulpRename((p) => { p.basename += ap; });
+    return Observable.forkJoin(
+        observableFromStreams(gulp.src(srcGlob), gulp.dest(out)), // copy src ts files
+        observableFromStreams(gulp.src(es5Glob), gulp.dest(out)), // copy es5 cjs files
+        observableFromStreams(gulp.src(esmGlob), ch_ext(`.mjs`), gulp.dest(out)), // copy es2015 esm files and rename to `.mjs`
+        observableFromStreams(gulp.src(es5UmdGlob), append(`.es5.min`), gulp.dest(out)), // copy es5 umd files and add `.es5.min`
+        observableFromStreams(gulp.src(es5UmdMaps), gulp.dest(out)), // copy es5 umd sourcemap files, but don't rename
+        observableFromStreams(gulp.src(es2015UmdGlob), append(`.es2015.min`), gulp.dest(out)), // copy es2015 umd files and add `.es2015.min`
+        observableFromStreams(gulp.src(es2015UmdMaps), gulp.dest(out)), // copy es2015 umd sourcemap files, but don't rename
+    ).publish(new ReplaySubject()).refCount();
+}))({});
+
+const arrowTSTask = ((cache) => memoizeTask(cache, function copyTS(target, format) {
+    return observableFromStreams(gulp.src(`src/**/*`), gulp.dest(targetDir(target, format)));
+}))({});
+
+
+module.exports = arrowTask;
+module.exports.arrowTask = arrowTask;
+module.exports.arrowTSTask = arrowTSTask;
\ No newline at end of file
diff --git a/js/gulp/build-task.js b/js/gulp/build-task.js
new file mode 100644
index 0000000000000..01152e662fcec
--- /dev/null
+++ b/js/gulp/build-task.js
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
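+
+// Picks the build step for a target/format combination: the `apache-arrow`
+// and `ts` targets are copy tasks, `umd` bundles go through
+// closure-compiler (es5) or uglify (everything else), and all remaining
+// combinations are plain TypeScript compilations.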
+ +const { npmPkgName } = require('./util'); +const { memoizeTask } = require('./memoize-task'); + +const uglifyTask = require('./uglify-task'); +const closureTask = require('./closure-task'); +const typescriptTask = require('./typescript-task'); +const { arrowTask, arrowTSTask } = require('./arrow-task'); + +const buildTask = ((cache) => memoizeTask(cache, function build(target, format, ...args) { + return target === npmPkgName ? arrowTask(target, format, ...args)() + : target === `ts` ? arrowTSTask(target, format, ...args)() + : format === `umd` ? target === `es5` ? closureTask(target, format, ...args)() + : uglifyTask(target, format, ...args)() + : typescriptTask(target, format, ...args)(); +}))({}); + +module.exports = buildTask; +module.exports.buildTask = buildTask; \ No newline at end of file diff --git a/js/gulp/clean-task.js b/js/gulp/clean-task.js new file mode 100644 index 0000000000000..d6c90f4637c8b --- /dev/null +++ b/js/gulp/clean-task.js @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +const del = require('del'); +const { targetDir } = require('./util'); +const { memoizeTask } = require('./memoize-task'); +const { Observable, ReplaySubject } = require('rxjs'); + +const cleanTask = ((cache) => memoizeTask(cache, function clean(target, format) { + return Observable + .from(del(`${targetDir(target, format)}/**`)) + .catch((e) => Observable.empty()) + .multicast(new ReplaySubject()).refCount(); +}))({}); + +module.exports = cleanTask; +module.exports.cleanTask = cleanTask; \ No newline at end of file diff --git a/js/gulp/closure-task.js b/js/gulp/closure-task.js new file mode 100644 index 0000000000000..950bf40e22a39 --- /dev/null +++ b/js/gulp/closure-task.js @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
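+
+// Produces the es5 UMD bundle: runs the temporary `cls` build through
+// google-closure-compiler in ADVANCED mode and renames the emitted
+// sourcemaps from `*.js.map` to `*.<target>.min.js.map`.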
+ +const { + targetDir, + mainExport, + gCCLanguageNames, + UMDSourceTargets, + observableFromStreams +} = require('./util'); + +const gulp = require('gulp'); +const path = require('path'); +const sourcemaps = require('gulp-sourcemaps'); +const { memoizeTask } = require('./memoize-task'); +const { Observable, ReplaySubject } = require('rxjs'); +const closureCompiler = require('google-closure-compiler').gulp(); + +const closureTask = ((cache) => memoizeTask(cache, function closure(target, format) { + const src = targetDir(target, `cls`); + const out = targetDir(target, format); + const entry = path.join(src, mainExport); + const externs = path.join(src, `${mainExport}.externs`); + return observableFromStreams( + gulp.src([ +/* external libs first --> */ `closure-compiler-scripts/*.js`, +/* then sources glob --> */ `${src}/**/*.js`, +/* and exclusions last --> */ `!${src}/format/*.js`, + `!${src}/Arrow.externs.js`, + ], { base: `./` }), + sourcemaps.init(), + closureCompiler(createClosureArgs(entry, externs)), + // rename the sourcemaps from *.js.map files to *.min.js.map + sourcemaps.write(`.`, { mapFile: (mapPath) => mapPath.replace(`.js.map`, `.${target}.min.js.map`) }), + gulp.dest(out) + ).publish(new ReplaySubject()).refCount(); +}))({}); + +const createClosureArgs = (entry, externs) => ({ + third_party: true, + warning_level: `QUIET`, + dependency_mode: `LOOSE`, + rewrite_polyfills: false, + externs: `${externs}.js`, + entry_point: `${entry}.js`, + // formatting: `PRETTY_PRINT`, + compilation_level: `ADVANCED`, + assume_function_wrapper: true, + js_output_file: `${mainExport}.js`, + language_in: gCCLanguageNames[`es2015`], + language_out: gCCLanguageNames[`es5`], + output_wrapper: +`// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +(function (global, factory) { + typeof exports === 'object' && typeof module !== 'undefined' ? factory(exports) : + typeof define === 'function' && define.amd ? define(['exports'], factory) : + (factory(global.Arrow = global.Arrow || {})); +}(this, (function (exports) {%output%}.bind(this))));` +}); + +module.exports = closureTask; +module.exports.closureTask = closureTask; diff --git a/js/gulp/memoize-task.js b/js/gulp/memoize-task.js new file mode 100644 index 0000000000000..0b0fc843c451a --- /dev/null +++ b/js/gulp/memoize-task.js @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +const { taskName } = require('./util'); + +const memoizeTask = ((cache, taskFn) => ((target, format, ...args) => { + // Give the memoized fn a displayName so gulp's output is easier to follow. + const fn = () => ( + cache[taskName(target, format)] || ( + cache[taskName(target, format)] = taskFn(target, format, ...args))); + fn.displayName = `${taskFn.name || ``}:${taskName(target, format, ...args)}:task`; + return fn; +})); + +module.exports = memoizeTask; +module.exports.memoizeTask = memoizeTask; \ No newline at end of file diff --git a/js/gulp/package-task.js b/js/gulp/package-task.js new file mode 100644 index 0000000000000..7b4b15a33e6ef --- /dev/null +++ b/js/gulp/package-task.js @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +const { + metadataFiles, packageJSONFields, + mainExport, npmPkgName, npmOrgName, + targetDir, packageName, observableFromStreams +} = require('./util'); + +const gulp = require('gulp'); +const { memoizeTask } = require('./memoize-task'); +const { Observable, ReplaySubject } = require('rxjs'); +const gulpJsonTransform = require('gulp-json-transform'); + +const packageTask = ((cache) => memoizeTask(cache, function bundle(target, format) { + const out = targetDir(target, format); + const jsonTransform = gulpJsonTransform(target === npmPkgName ? createMainPackageJson(target, format) : + target === `ts` ? 
createTypeScriptPackageJson(target, format) + : createScopedPackageJSON(target, format), + 2); + return Observable.forkJoin( + observableFromStreams(gulp.src(metadataFiles), gulp.dest(out)), // copy metadata files + observableFromStreams(gulp.src(`package.json`), jsonTransform, gulp.dest(out)) // write packageJSONs + ).publish(new ReplaySubject()).refCount(); +}))({}); + +module.exports = packageTask; +module.exports.packageTask = packageTask; + +const createMainPackageJson = (target, format) => (orig) => ({ + ...createTypeScriptPackageJson(target, format)(orig), + name: npmPkgName, + main: mainExport, + module: `${mainExport}.mjs`, + browser: `${mainExport}.es5.min.js`, + [`browser:es2015`]: `${mainExport}.es2015.min.js`, + [`@std/esm`]: { esm: `mjs` }, +}); + +const createTypeScriptPackageJson = (target, format) => (orig) => ({ + ...createScopedPackageJSON(target, format)(orig), + main: `${mainExport}.ts`, types: `${mainExport}.ts` +}); + +const createScopedPackageJSON = (target, format) => (({ name, ...orig }) => + conditionallyAddStandardESMEntry(target, format)( + packageJSONFields.reduce( + (xs, key) => ({ ...xs, [key]: xs[key] || orig[key] }), + { name: `${npmOrgName}/${packageName(target, format)}`, + version: undefined, main: `${mainExport}.js`, types: `${mainExport}.d.ts`, + browser: undefined, [`browser:es2015`]: undefined, module: undefined, [`@std/esm`]: undefined } + ) + ) +); + +const conditionallyAddStandardESMEntry = (target, format) => (packageJSON) => ( + format !== `esm` + ? packageJSON + : { ...packageJSON, [`@std/esm`]: { esm: `js` } } +); + \ No newline at end of file diff --git a/js/gulp/test-task.js b/js/gulp/test-task.js new file mode 100644 index 0000000000000..b46b2bb14edcb --- /dev/null +++ b/js/gulp/test-task.js @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +const path = require('path'); +const child_process = require(`child_process`); +const { argv } = require('./argv'); +const { memoizeTask } = require('./memoize-task'); + +const jestArgv = []; +argv.update && jestArgv.push(`-u`); +argv.verbose && jestArgv.push(`--verbose`); +argv.coverage && jestArgv.push(`--coverage`); + +const debugArgv = [`--runInBand`, `--env`, `jest-environment-node-debug`]; +const jest = require.resolve(path.join(`..`, `node_modules`, `.bin`, `jest`)); + +const testTask = ((cache, execArgv, testOptions) => memoizeTask(cache, function test(target, format, debug = false) { + const opts = Object.assign({}, testOptions); + const args = !debug ? [...execArgv] : [...debugArgv, ...execArgv]; + opts.env = Object.assign({}, opts.env, { TEST_TARGET: target, TEST_MODULE: format }); + return !debug ? 
+ child_process.spawn(jest, args, opts) : + child_process.exec(`node --inspect-brk ${jest} ${args.join(` `)}`, opts); +}))({}, jestArgv, { + env: Object.assign({}, process.env), + stdio: [`ignore`, `inherit`, `inherit`], +}); + +module.exports = testTask; +module.exports.testTask = testTask; diff --git a/js/gulp/typescript-task.js b/js/gulp/typescript-task.js new file mode 100644 index 0000000000000..2c66846667cd3 --- /dev/null +++ b/js/gulp/typescript-task.js @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +const { + targetDir, tsconfigName, observableFromStreams +} = require('./util'); + +const gulp = require('gulp'); +const path = require('path'); +const ts = require(`gulp-typescript`); +const sourcemaps = require('gulp-sourcemaps'); +const { memoizeTask } = require('./memoize-task'); +const { Observable, ReplaySubject } = require('rxjs'); + +const typescriptTask = ((cache) => memoizeTask(cache, function typescript(target, format) { + const out = targetDir(target, format); + const tsconfigFile = `tsconfig.${tsconfigName(target, format)}.json`; + const tsProject = ts.createProject(path.join(`tsconfig`, tsconfigFile), { typescript: require(`typescript`) }); + const { stream: { js, dts } } = observableFromStreams( + tsProject.src(), sourcemaps.init(), + tsProject(ts.reporter.fullReporter(true)) + ); + const writeDTypes = observableFromStreams(dts, gulp.dest(out)); + const writeJS = observableFromStreams(js, sourcemaps.write(), gulp.dest(out)); + return Observable.forkJoin(writeDTypes, writeJS).publish(new ReplaySubject()).refCount(); +}))({}); + +module.exports = typescriptTask; +module.exports.typescriptTask = typescriptTask; \ No newline at end of file diff --git a/js/gulp/uglify-task.js b/js/gulp/uglify-task.js new file mode 100644 index 0000000000000..804d450453644 --- /dev/null +++ b/js/gulp/uglify-task.js @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
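+
+// Produces the non-es5 UMD bundles: webpack compiles the temporary `cls`
+// build and UglifyJS minifies it, with Arrow's public property names
+// (reflected off the compiled module below) reserved so the property
+// mangler doesn't rename them away.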
+ +const { + targetDir, + mainExport, + ESKeywords, + UMDSourceTargets, + uglifyLanguageNames, + observableFromStreams +} = require('./util'); + +const path = require('path'); +const webpack = require(`webpack`); +const { memoizeTask } = require('./memoize-task'); +const { Observable, ReplaySubject } = require('rxjs'); +const UglifyJSPlugin = require(`uglifyjs-webpack-plugin`); +const esmRequire = require(`@std/esm`)(module, { cjs: true, esm: `js` }); + +const uglifyTask = ((cache, commonConfig) => memoizeTask(cache, function uglifyJS(target, format) { + + const sourceTarget = UMDSourceTargets[target]; + const PublicNames = reservePublicNames(sourceTarget, `cls`); + const out = targetDir(target, format), src = targetDir(sourceTarget, `cls`); + + const targetConfig = { ...commonConfig, + output: { ...commonConfig.output, + path: path.resolve(`./${out}`) } }; + + const webpackConfigs = [ + [mainExport, PublicNames] + ].map(([entry, reserved]) => ({ + ...targetConfig, + name: entry, + entry: { [entry]: path.resolve(`${src}/${entry}.js`) }, + plugins: [ + ...(targetConfig.plugins || []), + new webpack.SourceMapDevToolPlugin({ + filename: `[name].${target}.min.js.map`, + moduleFilenameTemplate: ({ resourcePath }) => + resourcePath + .replace(/\s/, `_`) + .replace(/\.\/node_modules\//, ``) + }), + new UglifyJSPlugin({ + sourceMap: true, + uglifyOptions: { + ecma: uglifyLanguageNames[target], + compress: { unsafe: true }, + output: { comments: false, beautify: false }, + mangle: { eval: true, safari10: true, // <-- Works around a Safari 10 bug: // https://github.com/mishoo/UglifyJS2/issues/1753 + properties: { reserved, keep_quoted: true } + } + }, + }) + ] + })); + + const compilers = webpack(webpackConfigs); + return Observable + .bindNodeCallback(compilers.run.bind(compilers))() + .multicast(new ReplaySubject()).refCount(); +}))({}, { + resolve: { mainFields: [`module`, `main`] }, + module: { rules: [{ test: /\.js$/, enforce: `pre`, use: [`source-map-loader`] }] }, + output: { filename: '[name].js', library: mainExport, libraryTarget: `umd`, umdNamedDefine: true }, +}); + +module.exports = uglifyTask; +module.exports.uglifyTask = uglifyTask; + +const reservePublicNames = ((ESKeywords) => function reservePublicNames(target, format) { + const publicModulePath = `../${targetDir(target, format)}/${mainExport}.js`; + return [ + ...ESKeywords, + ...reserveExportedNames(esmRequire(publicModulePath)) + ]; +})(ESKeywords); + +// Reflect on the Arrow modules to come up with a list of keys to save from Uglify's +// mangler. Assume all the non-inherited static and prototype members of the Arrow +// module and its direct exports are public, and should be preserved through minification. 
+const reserveExportedNames = (entryModule) => ( + Object + .getOwnPropertyNames(entryModule) + .filter((name) => ( + typeof entryModule[name] === `object` || + typeof entryModule[name] === `function` + )) + .map((name) => [name, entryModule[name]]) + .reduce((reserved, [name, value]) => { + const fn = function() {}; + const ownKeys = value && Object.getOwnPropertyNames(value) || []; + const protoKeys = typeof value === `function` && Object.getOwnPropertyNames(value.prototype) || []; + const publicNames = [...ownKeys, ...protoKeys].filter((x) => x !== `default` && x !== `undefined` && !(x in fn)); + return [...reserved, name, ...publicNames]; + }, [] + ) +); diff --git a/js/gulp/util.js b/js/gulp/util.js new file mode 100644 index 0000000000000..21ffc3127339c --- /dev/null +++ b/js/gulp/util.js @@ -0,0 +1,152 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +const path = require(`path`); +const pump = require(`pump`); +const { Observable, ReplaySubject } = require('rxjs'); + +const mainExport = `Arrow`; +const npmPkgName = `apache-arrow`; +const npmOrgName = `@${npmPkgName}`; + +const releasesRootDir = `targets`; +const knownTargets = [`es5`, `es2015`, `esnext`]; +const knownModules = [`cjs`, `esm`, `cls`, `umd`]; +const moduleFormatsToSkipCombosOf = { cls: true }; +const metadataFiles = [`LICENSE`, `README.md`]; +const packageJSONFields = [ + `version`, `license`, `description`, + `author`, `homepage`, `repository`, + `bugs`, `keywords`, `dependencies` +]; + +// see: https://github.com/google/closure-compiler/blob/c1372b799d94582eaf4b507a4a22558ff26c403c/src/com/google/javascript/jscomp/CompilerOptions.java#L2988 +const gCCLanguageNames = { + es5: `ECMASCRIPT5`, + es2015: `ECMASCRIPT_2015`, + es2016: `ECMASCRIPT_2016`, + es2017: `ECMASCRIPT_2017`, + esnext: `ECMASCRIPT_NEXT` +}; + +const UMDSourceTargets = { + es5: `es5`, + es2015: `es2015`, + es2016: `es2015`, + es2017: `es2015`, + esnext: `es2015` +}; + +const uglifyLanguageNames = { + es5: 5, es2015: 6, + es2016: 7, es2017: 8, + esnext: 8 // <--- ? +}; + +// ES7+ keywords Uglify shouldn't mangle +// Hardcoded here since some are from ES7+, others are +// only defined in interfaces, so difficult to get by reflection. 
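+// Without these reservations the property mangler could rename names
+// like `done` or `columns`, breaking consumers of the minified bundles.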
+const ESKeywords = [ + // PropertyDescriptors + `configurable`, `enumerable`, + // IteratorResult, Symbol.asyncIterator + `done`, `value`, `Symbol.asyncIterator`, `asyncIterator`, + // AsyncObserver + `values`, `hasError`, `hasCompleted`,`errorValue`, `closed`, + // Observable/Subscription/Scheduler + `next`, `error`, `complete`, `subscribe`, `unsubscribe`, `isUnsubscribed`, + // EventTarget + `addListener`, `removeListener`, `addEventListener`, `removeEventListener`, + // Arrow properties + `low`, `high`, `data`, `index`, `field`, `validity`, `columns`, `fieldNode`, `subarray`, +]; + +function taskName(target, format) { + return !format ? target : `${target}:${format}`; +} + +function packageName(target, format) { + return !format ? target : `${target}-${format}`; +} + +function tsconfigName(target, format) { + return !format ? target : `${target}.${format}`; +} + +function targetDir(target, format) { + return path.join(releasesRootDir, ...(!format ? [target] : [target, format])); +} + +function logAndDie(e) { + if (e) { + console.error(e); + process.exit(1); + } +} + +function observableFromStreams(...streams) { + const pumped = streams.length <= 1 ? streams[0] + : pump(...streams, logAndDie); + const fromEvent = Observable.fromEvent.bind(null, pumped); + const streamObs = fromEvent(`data`) + .merge(fromEvent(`error`).flatMap((e) => Observable.throw(e))) + .takeUntil(fromEvent(`end`).merge(fromEvent(`close`))) + .defaultIfEmpty(`empty stream`) + .multicast(new ReplaySubject()).refCount(); + streamObs.stream = pumped; + streamObs.observable = streamObs; + return streamObs; +} + +function* combinations(_targets, _modules) { + + const targets = known(knownTargets, _targets || [`all`]); + const modules = known(knownModules, _modules || [`all`]); + + if (_targets[0] === `all` && _modules[0] === `all`) { + yield [`ts`, ``]; + yield [npmPkgName, ``]; + } + + for (const format of modules) { + for (const target of targets) { + yield [target, format]; + } + } + + function known(known, values) { + return ~values.indexOf(`all`) + ? known + : Object.keys( + values.reduce((map, arg) => (( + (known.indexOf(arg) !== -1) && + (map[arg.toLowerCase()] = true) + || true) && map + ), {}) + ).sort((a, b) => known.indexOf(a) - known.indexOf(b)); + } +} + +module.exports = { + + mainExport, npmPkgName, npmOrgName, metadataFiles, packageJSONFields, + + knownTargets, knownModules, moduleFormatsToSkipCombosOf, + ESKeywords, gCCLanguageNames, UMDSourceTargets, uglifyLanguageNames, + + taskName, packageName, tsconfigName, targetDir, combinations, observableFromStreams, +}; \ No newline at end of file diff --git a/js/gulpfile.js b/js/gulpfile.js index 9f8e564bd9e3a..4cf0342c3be78 100644 --- a/js/gulpfile.js +++ b/js/gulpfile.js @@ -15,278 +15,92 @@ // specific language governing permissions and limitations // under the License. 
-const del = require(`del`); -const gulp = require(`gulp`); -const path = require(`path`); -const pump = require(`pump`); -const ts = require(`gulp-typescript`); -const streamMerge = require(`merge2`); -const sourcemaps = require(`gulp-sourcemaps`); -const child_process = require(`child_process`); -const gulpJsonTransform = require(`gulp-json-transform`); -const closureCompiler = require(`google-closure-compiler`).gulp(); - -const knownTargets = [`es5`, `es2015`, `esnext`]; -const knownModules = [`cjs`, `esm`, `cls`, `umd`]; - -// see: https://github.com/google/closure-compiler/blob/c1372b799d94582eaf4b507a4a22558ff26c403c/src/com/google/javascript/jscomp/CompilerOptions.java#L2988 -const gCCTargets = { - es5: `ECMASCRIPT5`, - es2015: `ECMASCRIPT_2015`, - es2016: `ECMASCRIPT_2016`, - es2017: `ECMASCRIPT_2017`, - esnext: `ECMASCRIPT_NEXT` -}; - -const tsProjects = []; -const argv = require(`command-line-args`)([ - { name: `all`, alias: `a`, type: Boolean }, - { name: 'update', alias: 'u', type: Boolean }, - { name: 'verbose', alias: 'v', type: Boolean }, - { name: `target`, type: String, defaultValue: `` }, - { name: `module`, type: String, defaultValue: `` }, - { name: `coverage`, type: Boolean, defaultValue: false }, - { name: `targets`, alias: `t`, type: String, multiple: true, defaultValue: [] }, - { name: `modules`, alias: `m`, type: String, multiple: true, defaultValue: [] } -]); - -const { targets, modules } = argv; - -argv.target && !targets.length && targets.push(argv.target); -argv.module && !modules.length && modules.push(argv.module); -(argv.all || !targets.length) && targets.push(`all`); -(argv.all || !modules.length) && modules.push(`all`); - -for (const [target, format] of combinations([`all`, `all`])) { - const combo = `${target}:${format}`; - gulp.task(`test:${combo}`, gulp.series(testTask(target, format, combo, `targets/${target}/${format}`))); - gulp.task(`clean:${combo}`, gulp.series(cleanTask(target, format, combo, `targets/${target}/${format}`))); - gulp.task(`build:${combo}`, gulp.series(buildTask(target, format, combo, `targets/${target}/${format}`))); - gulp.task(`bundle:${combo}`, gulp.series(bundleTask(target, format, combo, `targets/${target}/${format}`))); - gulp.task(`package:${combo}`, gulp.series(packageTask(target, format, combo, `targets/${target}/${format}`))); - gulp.task(`test:debug:${combo}`, gulp.series(testTask(target, format, combo, `targets/${target}/${format}`, true))); +const del = require('del'); +const gulp = require('gulp'); +const path = require('path'); +const { Observable } = require('rxjs'); +const testsTask = require('./gulp/test-task'); +const buildTask = require('./gulp/build-task'); +const cleanTask = require('./gulp/clean-task'); +const packageTask = require('./gulp/package-task'); +const { targets, modules } = require('./gulp/argv'); +const { + targetDir, + taskName, combinations, + knownTargets, knownModules, + npmPkgName, UMDSourceTargets, + moduleFormatsToSkipCombosOf +} = require('./gulp/util'); + +for (const [target, format] of combinations([`all`], [`all`])) { + const task = taskName(target, format); + gulp.task(`clean:${task}`, cleanTask(target, format)); + gulp.task( `test:${task}`, testsTask(target, format)); + gulp.task(`debug:${task}`, testsTask(target, format, true)); + gulp.task(`build:${task}`, gulp.series(`clean:${task}`, + buildTask(target, format), + packageTask(target, format))); } -gulp.task(`test`, gulp.series(runTaskCombos(`test`))); -gulp.task(`clean`, gulp.parallel(runTaskCombos(`clean`))); -gulp.task(`build`, 
gulp.parallel(runTaskCombos(`build`))); -gulp.task(`bundle`, gulp.parallel(runTaskCombos(`bundle`))); -gulp.task(`package`, gulp.parallel(runTaskCombos(`package`))); -gulp.task(`test:debug`, gulp.series(runTaskCombos(`test:debug`))); -gulp.task(`default`, gulp.task(`package`)); - -function runTaskCombos(name) { - const combos = []; +// The UMD bundles build temporary es5/6/next targets via TS, +// then run the TS source through either closure-compiler or +// uglify, so we special case that here. +knownTargets.forEach((target) => + gulp.task(`build:${target}:umd`, + gulp.series( + gulp.parallel( + cleanTask(target, `umd`), + cleanTask(UMDSourceTargets[target], `cls`), + ), + buildTask(UMDSourceTargets[target], `cls`), + buildTask(target, `umd`), packageTask(target, `umd`) + ) + ) +); + +// The main "apache-arrow" module builds the es5/cjs, es5/umd, +// es2015/esm, es2015/umd, and ts targets, then copies and +// renames the compiled output into the apache-arrow folder +gulp.task(`build:${npmPkgName}`, + gulp.series( + cleanTask(npmPkgName), + gulp.parallel( + `build:${taskName(`es5`, `cjs`)}`, + `build:${taskName(`es5`, `umd`)}`, + `build:${taskName(`es2015`, `esm`)}`, + `build:${taskName(`es2015`, `umd`)}` + ), + buildTask(npmPkgName), packageTask(npmPkgName) + ) +); + + +function gulpConcurrent(tasks) { + return () => Observable.bindCallback((tasks, cb) => gulp.parallel(tasks)(cb))(tasks); +} + +const buildConcurrent = (tasks) => () => + gulpConcurrent(tasks)() + .concat(Observable + .defer(() => Observable + .merge(...knownTargets.map((target) => + del(`${targetDir(target, `cls`)}/**`))))); + +gulp.task( `test`, gulp.series(getTasks(`test`))); +gulp.task(`debug`, gulp.series(getTasks(`debug`))); +gulp.task(`clean`, gulp.parallel(getTasks(`clean`))); +gulp.task(`build`, buildConcurrent(getTasks(`build`))); +gulp.task(`default`, gulp.series(`build`, `test`)); + +function getTasks(name) { + const tasks = []; + if (targets.indexOf(`ts`) !== -1) tasks.push(`${name}:ts`); + if (targets.indexOf(npmPkgName) !== -1) tasks.push(`${name}:${npmPkgName}`); for (const [target, format] of combinations(targets, modules)) { - if (format === `cls`) { + if (moduleFormatsToSkipCombosOf[format] && name === `test`) { continue; } - combos.push(`${name}:${target}:${format}`); - } - return combos; -} - -function cleanTask(target, format, taskName, outDir) { - return function cleanTask() { - const globs = [`${outDir}/**`]; - if (target === `es5` && format === `cjs`) { - globs.push(`types`, `typings`); - } - return del(globs); - }; -} - -function buildTask(target, format, taskName, outDir) { - return format === `umd` - ? 
closureTask(target, format, taskName, outDir) - : typescriptTask(target, format, taskName, outDir); -} - -function bundleTask(target, format, taskName, outDir) { - return function bundleTask() { - return streamMerge([ - pump(gulp.src([`LICENSE`, `README.md`]), gulp.dest(outDir), onError), - pump( - gulp.src(`package.json`), - gulpJsonTransform((orig) => [ - `version`, `description`, `keywords`, - `repository`, `author`, `homepage`, `bugs`, `license`, - `dependencies`, `peerDependencies` - ].reduce((copy, key) => ( - (copy[key] = orig[key]) && copy || copy - ), { - main: `Arrow.js`, - types: `Arrow.d.ts`, - typings: `Arrow.d.ts`, - name: `@apache-arrow/${target}-${format}` - }), 2), - gulp.dest(outDir), - onError - ) - ]); - } -} - -function packageTask(target, format, taskName, outDir) { - return [`build:${taskName}`, `bundle:${taskName}`]; -} - -function testTask(target, format, taskName, outDir, debug) { - const jestOptions = !debug ? [] : [ - `--runInBand`, `--env`, `jest-environment-node-debug`]; - argv.update && jestOptions.unshift(`-u`); - argv.verbose && jestOptions.unshift(`--verbose`); - argv.coverage && jestOptions.unshift(`--coverage`); - const jestPath = `./node_modules/.bin/jest`; - const debugOpts = jestOptions.join(' '); - const spawnOptions = { - stdio: [`ignore`, `inherit`, `inherit`], - env: Object.assign({}, process.env, { - TEST_TARGET: target, TEST_MODULE: format - }) - }; - return function testTask() { - return !debug ? - child_process.spawn(jestPath, jestOptions, spawnOptions) : - child_process.exec(`node --inspect-brk ${jestPath} ${debugOpts}`, spawnOptions); - } -} - -function closureTask(target, format, taskName, outDir) { - const clsTarget = `es5`; - const googleRoot = `targets/${clsTarget}/cls`; - const languageIn = clsTarget === `es5` ? `es2015` : clsTarget; - return [ - [`clean:${taskName}`, `build:${clsTarget}:cls`], - function closureTask() { - return closureStream( - closureSrcs(), - closureCompiler(closureArgs()) - ).on('end', () => del([`targets/${target}/cls/**`])); - } - ]; - function closureSrcs() { - return gulp.src([ - `closure-compiler-scripts/*.js`, - `${googleRoot}/**/*.js`, - `!${googleRoot}/format/*.js`, - `!${googleRoot}/Arrow.externs.js`, - ], { base: `./` }); - } - function closureStream(sources, compiler) { - const streams = [ - sources, - sourcemaps.init(), - compiler, - sourcemaps.write('.'), - gulp.dest(outDir) - ]; - // copy the ES5 UMD bundle to dist - if (target === `es5`) { - streams.push(gulp.dest(`dist`)); - } - return pump(...streams, onError); - } - function closureArgs() { - return { - third_party: true, - externs: `${googleRoot}/Arrow.externs.js`, - warning_level: `QUIET`, - dependency_mode: `LOOSE`, - rewrite_polyfills: false, - // formatting: `PRETTY_PRINT`, - compilation_level: `ADVANCED`, - assume_function_wrapper: true, - js_output_file: `Arrow.js`, - language_in: gCCTargets[languageIn], - language_out: gCCTargets[clsTarget], - entry_point: `${googleRoot}/Arrow.js`, - output_wrapper: -`// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -(function (global, factory) { - typeof exports === 'object' && typeof module !== 'undefined' ? factory(exports) : - typeof define === 'function' && define.amd ? define(['exports'], factory) : - (factory(global.Arrow = global.Arrow || {})); -}(this, (function (exports) {%output%}.bind(this))));` - }; - } -} - -function typescriptTask(target, format, taskName, outDir) { - return [ - [`clean:${taskName}`], - function typescriptTask() { - const tsconfigPath = `tsconfig/tsconfig.${target}.${format}.json`; - let { js, dts } = tsProjects.find((p) => p.target === target && p.format === format) || {}; - if (!js || !dts) { - let tsProject = ts.createProject(tsconfigPath); - ({ js, dts } = pump( - tsProject.src(), - sourcemaps.init(), - tsProject(ts.reporter.fullReporter(true)), - onError - )); - dts = [dts, gulp.dest(outDir)]; - js = [js, sourcemaps.write(), gulp.dest(outDir)]; - // copy types to the root - if (target === `es5` && format === `cjs`) { - dts.push(gulp.dest(`types`)); - } - tsProjects.push({ - target, format, - js: js = pump(...js, onError), - dts: dts = pump(...dts, onError) - }); - } - return streamMerge([ dts, js ]); - } - ]; -} - -function* combinations(_targets, _modules) { - - const targets = known(knownTargets, _targets || [`all`]); - const modules = known(knownModules, _modules || [`all`]); - - for (const format of modules) { - for (const target of targets) { - yield [target, format]; - } - } - - function known(known, values) { - return ~values.indexOf(`all`) - ? 
known - : Object.keys( - values.reduce((map, arg) => (( - (known.indexOf(arg) !== -1) && - (map[arg.toLowerCase()] = true) - || true) && map - ), {}) - ).sort((a, b) => known.indexOf(a) - known.indexOf(b)); + tasks.push(`${name}:${taskName(target, format)}`); } + return tasks.length && tasks || [(done) => done()]; } - -function onError(err) { - if (typeof err === 'number') { - process.exit(err); - } else if (err) { - console.error(err.stack || err.toString()); - process.exit(1); - } -} \ No newline at end of file diff --git a/js/package.json b/js/package.json index 03687a8b25ca2..ba93a34686288 100644 --- a/js/package.json +++ b/js/package.json @@ -14,18 +14,18 @@ "test": "gulp test", "build": "gulp build", "clean": "gulp clean", + "debug": "gulp debug", "bundle": "gulp bundle", "package": "gulp package", "perf": "node ./perf/index.js", - "test:debug": "gulp test:debug", "test:coverage": "gulp test -t esnext -m esm --coverage", "validate": "npm-run-all clean lint build test bundle", "lerna:publish": "lerna exec --bail=false npm publish", "prepublishOnly": "sh ./prepublish.sh", "doc": "shx rm -rf ./doc && esdoc", "lint": "npm-run-all -p lint:*", - "lint:src": "tslint --fix --type-check -p tsconfig.json -c tslint.json \"src/**/*.ts\"", - "lint:test": "tslint --fix --type-check -p test/tsconfig.json -c tslint.json \"test/**/*.ts\"" + "lint:src": "tslint --fix --project -p tsconfig.json -c tslint.json \"src/**/*.ts\"", + "lint:test": "tslint --fix --project -p test/tsconfig.json -c tslint.json \"test/**/*.ts\"" }, "repository": { "type": "git", @@ -50,7 +50,6 @@ "README.md" ], "peerDependencies": { - "tslib": "~1.7.1", "command-line-usage": "4.0.1" }, "dependencies": { @@ -58,38 +57,44 @@ "text-encoding": "0.6.4" }, "devDependencies": { - "@types/flatbuffers": "1.6.4", - "@types/jest": "20.0.8", - "@types/node": "^8.0.24", + "@std/esm": "0.12.5", + "@types/flatbuffers": "1.6.5", + "@types/jest": "21.1.5", + "@types/node": "8.0.47", "@types/text-encoding": "0.0.32", "benchmark": "2.1.4", - "coveralls": "2.13.1", "command-line-args": "4.0.7", + "coveralls": "3.0.0", "del": "3.0.0", "esdoc": "1.0.3", "esdoc-standard-plugin": "1.0.0", "google-closure-compiler": "20170910.0.0", "gulp": "github:gulpjs/gulp#4.0", - "gulp-json-transform": "0.4.2", + "gulp-json-transform": "0.4.5", + "gulp-rename": "1.2.2", "gulp-sourcemaps": "2.6.1", - "gulp-typescript": "3.2.2", - "jest": "21.1.0", + "gulp-typescript": "3.2.3", + "jest": "21.2.1", "jest-environment-node-debug": "2.0.0", "json": "9.0.6", - "lerna": "2.2.0", - "lint-staged": "4.2.1", + "lerna": "2.5.0", + "lint-staged": "4.3.0", "merge2": "1.2.0", "mkdirp": "0.5.1", "npm-run-all": "4.1.1", "pump": "1.0.2", "rimraf": "2.6.2", + "rxjs": "5.5.2", "shx": "0.2.2", + "source-map-loader": "0.2.3", "text-encoding-utf-8": "1.0.1", "trash": "4.1.0", - "ts-jest": "21.0.1", - "tslib": "1.7.1", - "tslint": "5.7.0", - "typescript": "2.5.2" + "ts-jest": "21.1.4", + "tslib": "1.8.0", + "tslint": "5.8.0", + "typescript": "2.6.1", + "uglifyjs-webpack-plugin": "1.0.1", + "webpack": "3.8.1" }, "lint-staged": { "*.@(ts)": [ diff --git a/js/perf/index.js b/js/perf/index.js index 669f690122d10..3a2ed96772330 100644 --- a/js/perf/index.js +++ b/js/perf/index.js @@ -16,12 +16,10 @@ // under the License. // Use the ES5 UMD target as perf baseline -// ES6/7 iterators are faster in turbofan, but something about the -// ES5 transpilation (rewriting let and const to var?) 
JITs better -const { Table, readBuffers } = require('../dist/Arrow'); +// const { Table, readBuffers } = require('../targets/es5/umd'); // const { Table, readBuffers } = require('../targets/es5/cjs'); +const { Table, readBuffers } = require('../targets/es2015/umd'); // const { Table, readBuffers } = require('../targets/es2015/cjs'); -// const { Table, readBuffers } = require('../targets/esnext/cjs'); const Benchmark = require('benchmark'); const arrowTestConfigurations = require('./config'); @@ -35,12 +33,12 @@ for (let [name, ...buffers] of arrowTestConfigurations) { const getByIndexSuite = new Benchmark.Suite(`Get ${name} values by index`, { async: true }); parseSuite.add(createFromTableTest(name, buffers)); parseSuite.add(createReadBuffersTest(name, buffers)); - for (const vector of Table.from(...buffers).cols()) { + for (const vector of Table.from(...buffers).columns) { sliceSuite.add(createSliceTest(vector)); iterateSuite.add(createIterateTest(vector)); getByIndexSuite.add(createGetByIndexTest(vector)); } - suites.push(parseSuite, sliceSuite, getByIndexSuite, iterateSuite); + suites.push(getByIndexSuite, iterateSuite, sliceSuite, parseSuite); } console.log('Running apache-arrow performance tests...\n'); diff --git a/js/src/Arrow.externs.ts b/js/src/Arrow.externs.ts index 7289d6d2732b6..c23930271183d 100644 --- a/js/src/Arrow.externs.ts +++ b/js/src/Arrow.externs.ts @@ -24,23 +24,30 @@ Symbol.iterator; /** @type {symbol} */ Symbol.asyncIterator; -let Table = function() {}; + +let RowVector = function() {}; /** @type {?} */ -Table.prototype.length; +RowVector.prototype.toJSON; /** @type {?} */ -Table.prototype.rows; +RowVector.prototype.toArray; /** @type {?} */ -Table.prototype.cols; +RowVector.prototype.toObject; /** @type {?} */ -Table.prototype.getRow; +RowVector.prototype.toString; + +let Table = function() {}; +/** @type {?} */ +( Table).from; /** @type {?} */ -Table.prototype.getCell; +Table.prototype.columns; /** @type {?} */ -Table.prototype.getCellAt; +Table.prototype.length; /** @type {?} */ -Table.prototype.getColumn; +Table.prototype.col; /** @type {?} */ -Table.prototype.getColumnAt; +Table.prototype.key; +/** @type {?} */ +Table.prototype.select; /** @type {?} */ Table.prototype.toString; @@ -52,24 +59,26 @@ Vector.prototype.name; /** @type {?} */ Vector.prototype.type; /** @type {?} */ -Vector.prototype.props; -/** @type {?} */ Vector.prototype.get; /** @type {?} */ Vector.prototype.concat; /** @type {?} */ Vector.prototype.slice; - -let TypedVector = function() {}; /** @type {?} */ -TypedVector.prototype.arrayType; +Vector.prototype.metadata; +/** @type {?} */ +Vector.prototype.nullable; +/** @type {?} */ +Vector.prototype.nullCount; -let ValidityVector = function() {}; +let BoolVector = function() {}; +/** @type {?} */ +( BoolVector).pack; /** @type {?} */ -( ValidityVector).pack; +BoolVector.prototype.set; let DictionaryVector = function() {}; /** @type {?} */ -DictionaryVector.prototype.index; +DictionaryVector.prototype.getKey; /** @type {?} */ -DictionaryVector.prototype.value; +DictionaryVector.prototype.getValue; diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts index ea8a5c3e1d9bb..3196550884dbf 100644 --- a/js/src/Arrow.ts +++ b/js/src/Arrow.ts @@ -15,44 +15,45 @@ // specific language governing permissions and limitations // under the License. 
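// For illustration only (not part of this patch): per the renamed externs
// above, Table's accessor methods become properties and short lookups.
// A hypothetical consumer (the column name 'lat' is made up):
//
//   const table = Table.from(...buffers);
//   for (const vector of table.columns) { /* was table.cols() */ }
//   table.col('lat');    // was table.getColumn('lat')
//   table.select('lat'); // project onto a subset of columns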
-import { Table } from './table'; import { readBuffers } from './reader/arrow'; -import { Vector } from './vector/vector'; -import { StructVector } from './vector/struct'; -import { DictionaryVector } from './vector/dictionary'; -import { ListVector, Utf8Vector, FixedSizeListVector } from './vector/list'; -import { - TypedVector, BitVector, - DateVector, IndexVector, - Int8Vector, Int16Vector, - Int32Vector, Int64Vector, - Uint8Vector, Uint16Vector, - Uint32Vector, Uint64Vector, - Float32Vector, Float64Vector, -} from './vector/typed'; -export { - Table, readBuffers, - Vector, - BitVector, - ListVector, - Utf8Vector, - DateVector, - IndexVector, +import { Vector } from './types/types'; +import { ListVector } from './types/list'; +import { Utf8Vector } from './types/utf8'; +import { BoolVector } from './types/vector/bool'; +import { DateVector } from './types/vector/date'; +import { RowVector } from './types/table/row'; +import { TableVector } from './types/table/table'; +import { StructVector } from './types/table/struct'; +import { DictionaryVector } from './types/dictionary'; +import { FixedSizeListVector } from './types/fixedsizelist'; +import { LongVector, Int64Vector, Uint64Vector, } from './types/vector/long'; +import { TypedVector, Int8Vector, Int16Vector, Int32Vector, - Int64Vector, Uint8Vector, Uint16Vector, Uint32Vector, - Uint64Vector, Float32Vector, - Float64Vector, - StructVector, + Float64Vector +} from './types/vector/typed'; + +import './types/table/from'; + +export { + Vector, + readBuffers, DictionaryVector, - FixedSizeListVector, + RowVector as Row, + TableVector as Table, + StructVector, Utf8Vector, + ListVector, FixedSizeListVector, + BoolVector, TypedVector, LongVector, + DateVector, Float32Vector, Float64Vector, + Int8Vector, Int16Vector, Int32Vector, Int64Vector, + Uint8Vector, Uint16Vector, Uint32Vector, Uint64Vector, }; /* These exports are needed for the closure umd targets */ @@ -60,15 +61,18 @@ try { const Arrow = eval('exports'); if (typeof Arrow === 'object') { // string indexers tell closure compiler not to rename these properties - Arrow['Table'] = Table; - Arrow['readBuffers'] = readBuffers; Arrow['Vector'] = Vector; - Arrow['BitVector'] = BitVector; - Arrow['ListVector'] = ListVector; + Arrow['Table'] = TableVector; + Arrow['readBuffers'] = readBuffers; + Arrow['BoolVector'] = BoolVector; Arrow['Utf8Vector'] = Utf8Vector; - Arrow['DateVector'] = DateVector; - Arrow['IndexVector'] = IndexVector; + Arrow['ListVector'] = ListVector; + Arrow['StructVector'] = StructVector; + Arrow['DictionaryVector'] = DictionaryVector; + Arrow['FixedSizeListVector'] = FixedSizeListVector; + Arrow['LongVector'] = LongVector; Arrow['TypedVector'] = TypedVector; + Arrow['DateVector'] = DateVector; Arrow['Int8Vector'] = Int8Vector; Arrow['Int16Vector'] = Int16Vector; Arrow['Int32Vector'] = Int32Vector; @@ -79,9 +83,6 @@ try { Arrow['Uint64Vector'] = Uint64Vector; Arrow['Float32Vector'] = Float32Vector; Arrow['Float64Vector'] = Float64Vector; - Arrow['StructVector'] = StructVector; - Arrow['DictionaryVector'] = DictionaryVector; - Arrow['FixedSizeListVector'] = FixedSizeListVector; } } catch (e) { /* not the UMD bundle */ } /* end closure exports */ diff --git a/js/src/reader/arrow.ts b/js/src/reader/arrow.ts index dbb6acd0e79e8..033bfecae61dd 100644 --- a/js/src/reader/arrow.ts +++ b/js/src/reader/arrow.ts @@ -18,17 +18,18 @@ import { flatbuffers } from 'flatbuffers'; import * as Schema_ from '../format/Schema_generated'; import * as Message_ from '../format/Message_generated'; 
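// For illustration only (not part of this patch): with the re-exports
// above, `RowVector` and `TableVector` surface publicly as `Row` and
// `Table`, and `BoolVector` replaces the old `BitVector`, e.g.
//
//   import { Table, BoolVector } from './Arrow';
//   const validity = BoolVector.pack([true, false, true]);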
+export import Schema = Schema_.org.apache.arrow.flatbuf.Schema;
+export import RecordBatch = Message_.org.apache.arrow.flatbuf.RecordBatch;
 import { readFile } from './file';
 import { readStream } from './stream';
 import { readVector } from './vector';
-import { Vector } from '../vector/vector';
 import { readDictionary } from './dictionary';
+import { Vector, Column } from '../types/types';
 import ByteBuffer = flatbuffers.ByteBuffer;
-export import Schema = Schema_.org.apache.arrow.flatbuf.Schema;
-export import RecordBatch = Message_.org.apache.arrow.flatbuf.RecordBatch;
-export type Dictionaries = { [k: string]: Vector };
+import Field = Schema_.org.apache.arrow.flatbuf.Field;
+export type Dictionaries = { [k: string]: Vector } | null;
 export type IteratorState = { nodeIndex: number; bufferIndex: number };

 export function* readRecords(...bytes: ByteBuffer[]) {
@@ -47,22 +48,27 @@ export function* readBuffers(...bytes: Array) {
   const dictionaries: Dictionaries = {};
   const byteBuffers = bytes.map(toByteBuffer);
   for (let { schema, batch } of readRecords(...byteBuffers)) {
-    let vectors: Vector[] = [];
+    let vectors: Column[] = [];
     let state = { nodeIndex: 0, bufferIndex: 0 };
-    let index = -1, fieldsLength = schema.fieldsLength();
+    let fieldsLength = schema.fieldsLength();
+    let index = -1, field: Field, vector: Vector;
     if (batch.id) {
       // A dictionary batch only contains a single vector. Traverse each
       // field and its children until we find one that uses this dictionary
       while (++index < fieldsLength) {
-        let vector = readDictionary(schema.fields(index), batch, state, dictionaries);
-        if (vector) {
-          dictionaries[batch.id] = dictionaries[batch.id] && dictionaries[batch.id].concat(vector) || vector;
-          break;
+        if (field = schema.fields(index)!) {
+          if (vector = readDictionary(field, batch, state, dictionaries)!) {
+            dictionaries[batch.id] = dictionaries[batch.id] && dictionaries[batch.id].concat(vector) || vector;
+            break;
+          }
         }
       }
     } else {
       while (++index < fieldsLength) {
-        vectors[index] = readVector(schema.fields(index), batch, state, dictionaries);
+        if ((field = schema.fields(index)!) 
&& + (vector = readVector(field, batch, state, dictionaries)!)) { + vectors[index] = vector as Column; + } } yield vectors; } diff --git a/js/src/reader/dictionary.ts b/js/src/reader/dictionary.ts index 93a9ba76bba3a..0c58ace3b0dca 100644 --- a/js/src/reader/dictionary.ts +++ b/js/src/reader/dictionary.ts @@ -17,22 +17,20 @@ import { readVector } from './vector'; import { MessageBatch } from './message'; +import { DictionaryVector } from '../types/dictionary'; import * as Schema_ from '../format/Schema_generated'; import { IteratorState, Dictionaries } from './arrow'; import Field = Schema_.org.apache.arrow.flatbuf.Field; -export function readDictionary(field: Field | null, - batch: MessageBatch, - iterator: IteratorState, - dictionaries: Dictionaries) { - let id: string, encoding = field && field.dictionary(); +export function readDictionary(field: Field, batch: MessageBatch, iterator: IteratorState, dictionaries: Dictionaries): DictionaryVector | null { + let vector: DictionaryVector | null, id, encoding = field.dictionary(); if (encoding && batch.id === (id = encoding.id().toFloat64().toString())) { - return readVector(field, batch, iterator, null); + return readVector(field, batch, iterator, null) as DictionaryVector; } - for (let i = -1, n = field && field.childrenLength() || 0; ++i < n;) { - let vector = readDictionary(field.children(i), batch, iterator, dictionaries); - if (vector) { + for (let i = -1, n = field.childrenLength() | 0; ++i < n;) { + if (vector = readDictionary(field.children(i)!, batch, iterator, dictionaries)) { return vector; } } + return null; } diff --git a/js/src/reader/file.ts b/js/src/reader/file.ts index b05b99a5e6dcf..bd60b476312a0 100644 --- a/js/src/reader/file.ts +++ b/js/src/reader/file.ts @@ -58,9 +58,12 @@ export function* readFile(...bbs: ByteBuffer[]) { throw new Error('Invalid file'); } bb.setPosition(footerOffset - footerLength); - let footer = Footer.getRootAsFooter(bb), schema = footer.schema(); + let schema, footer = Footer.getRootAsFooter(bb); + if (!(schema = footer.schema()!)) { + return; + } for (let i = -1, n = footer.dictionariesLength(); ++i < n;) { - let block = footer.dictionaries(i); + let block = footer.dictionaries(i)!; bb.setPosition(block.offset().low); for (let batch of readMessageBatches(bb)) { yield { schema, batch }; @@ -68,7 +71,7 @@ export function* readFile(...bbs: ByteBuffer[]) { } } for (let i = -1, n = footer.recordBatchesLength(); ++i < n;) { - const block = footer.recordBatches(i); + const block = footer.recordBatches(i)!; bb.setPosition(block.offset().low); for (let batch of readMessageBatches(bb)) { yield { schema, batch }; diff --git a/js/src/reader/message.ts b/js/src/reader/message.ts index 5472f10833878..6c8a969021f90 100644 --- a/js/src/reader/message.ts +++ b/js/src/reader/message.ts @@ -46,13 +46,13 @@ export function* readMessageBatches(bb: ByteBuffer) { let bytes = bb.bytes(); for (let message of readMessages(bb)) { let type = message.headerType(); - let id: string, data: RecordBatch; + let id: string | void, data: RecordBatch; if (type === MessageHeader.RecordBatch) { - data = message.header(new RecordBatch()); + data = message.header(new RecordBatch())!; } else if (type === MessageHeader.DictionaryBatch) { - let header = message.header(new DictionaryBatch()); + let header = message.header(new DictionaryBatch())!; id = header.id().toFloat64().toString(); - data = header.data(); + data = header.data()!; } else { continue; } diff --git a/js/src/reader/stream.ts b/js/src/reader/stream.ts index 
9869f633d08f4..2062b1a8c4c10 100644 --- a/js/src/reader/stream.ts +++ b/js/src/reader/stream.ts @@ -30,8 +30,8 @@ export function* readStream(...bbs: ByteBuffer[]) { throw new Error('Invalid Arrow Stream'); } for (const message of readMessages(bbs[0])) { - if (message.headerType() === MessageHeader.Schema) { - const schema = message.header(new Schema()); + let schema: Schema; + if (message.headerType() === MessageHeader.Schema && (schema = message.header(new Schema())!)) { for (const bb of bbs) { for (const batch of readMessageBatches(bb)) { yield { schema, batch }; diff --git a/js/src/reader/vector.ts b/js/src/reader/vector.ts index 3b6663be89bdc..4d3321833209d 100644 --- a/js/src/reader/vector.ts +++ b/js/src/reader/vector.ts @@ -17,22 +17,28 @@ import { flatbuffers } from 'flatbuffers'; import { MessageBatch } from './message'; -import { Vector } from '../vector/vector'; import * as Schema_ from '../format/Schema_generated'; -import { StructVector } from '../vector/struct'; +import * as Message_ from '../format/Message_generated'; import { IteratorState, Dictionaries } from './arrow'; -import { DictionaryVector } from '../vector/dictionary'; -import { Utf8Vector, ListVector, FixedSizeListVector } from '../vector/list'; import { - TypedArray, TypedArrayCtor, IntArray, FloatArray, + Vector, Column, + IntArray, FloatArray, + TypedArray, TypedArrayConstructor, +} from '../types/types'; + +import { + DictionaryVector, + Utf8Vector, StructVector, + ListVector, FixedSizeListVector, + DateVector, Float32Vector, Float64Vector, Int8Vector, Int16Vector, Int32Vector, Int64Vector, Uint8Vector, Uint16Vector, Uint32Vector, Uint64Vector, - Float32Vector, Float64Vector, IndexVector, DateVector, -} from '../vector/typed'; +} from '../types/arrow'; import Int = Schema_.org.apache.arrow.flatbuf.Int; import Type = Schema_.org.apache.arrow.flatbuf.Type; import Field = Schema_.org.apache.arrow.flatbuf.Field; +import FieldNode = Message_.org.apache.arrow.flatbuf.FieldNode; import Precision = Schema_.org.apache.arrow.flatbuf.Precision; import VectorType = Schema_.org.apache.arrow.flatbuf.VectorType; import VectorLayout = Schema_.org.apache.arrow.flatbuf.VectorLayout; @@ -40,31 +46,33 @@ import FixedSizeList = Schema_.org.apache.arrow.flatbuf.FixedSizeList; import FloatingPoint = Schema_.org.apache.arrow.flatbuf.FloatingPoint; import DictionaryEncoding = Schema_.org.apache.arrow.flatbuf.DictionaryEncoding; -export function readVector(field: Field, batch: MessageBatch, state: IteratorState, dictionaries: Dictionaries) { - return readDictionaryVector(field, batch, state, dictionaries) || - readTypedVector(field, batch, state, dictionaries); +export function readVector(field: Field, batch: MessageBatch, state: IteratorState, dictionaries: Dictionaries): Column | DictionaryVector | null { + return readDictionaryVector(field, batch, state, dictionaries) || + readTypedVector(field, batch, state, dictionaries); } -function readTypedVector(field: Field, batch: MessageBatch, iterator: IteratorState, dictionaries: Dictionaries) { +function readTypedVector(field: Field, batch: MessageBatch, iterator: IteratorState, dictionaries: Dictionaries): Column | DictionaryVector | null { let typeType = field.typeType(), readTyped = typedVectorReaders[typeType]; if (!readTyped) { throw new Error('Unrecognized vector name "' + Type[typeType] + '" type "' + typeType + '"'); } - return readTyped(field, batch, iterator, dictionaries); + return readTyped(field, batch, iterator, dictionaries) as Column; } -function 
readDictionaryVector(field: Field, batch: MessageBatch, iterator: IteratorState, dictionaries: Dictionaries) { - let encoding: DictionaryEncoding | null; - if (dictionaries && (encoding = field.dictionary())) { - let id = encoding.id().toFloat64().toString(); - let fieldType = encoding.indexType() || +function readDictionaryVector(field: Field, batch: MessageBatch, iterator: IteratorState, dictionaries: Dictionaries): DictionaryVector | null { + let data: Vector, encoding: DictionaryEncoding; + if (dictionaries && + (encoding = field.dictionary()!) && + (data = dictionaries[encoding.id().toFloat64().toString()])) { + let indexType = encoding.indexType() || /* a dictionary index defaults to signed 32 bit int if unspecified */ { bitWidth: () => 32, isSigned: () => true }; // workaround for https://issues.apache.org/jira/browse/ARROW-1363 - let indexField = createSyntheticDictionaryIndexField(field, fieldType); - let index = readIntVector(indexField, batch, iterator, null, fieldType); - return DictionaryVector.create(field, index.length, index, dictionaries[id]); + let indexField = createSyntheticDictionaryIndexField(field, indexType); + let keys = readIntVector(indexField, batch, iterator, null, indexType)!; + return new DictionaryVector({ data, keys: keys! }); } + return null; } const IntViews = [Int8Array, Int16Array, Int32Array, Int32Array ]; @@ -74,39 +82,54 @@ const Uint8Views = [Uint8Array, Uint8Array, Uint8Array, Uint8Array ]; const Uint32Views = [Uint32Array, Uint32Array, Uint32Array, Uint32Array ]; const FloatViews = [Int8Array, Int16Array, Float32Array, Float64Array]; -const createIntDataViews = createDataView.bind(null, IntViews, null); -const createUintDataViews = createDataView.bind(null, UintViews, null); -const createDateDataViews = createDataView.bind(null, Uint32Views, null); -const createFloatDataViews = createDataView.bind(null, FloatViews, null); -const createNestedDataViews = createDataView.bind(null, Uint32Views, null); -const createValidityDataViews = createDataView.bind(null, Uint8Views, null); -const createUtf8DataViews = createDataView.bind(null, Uint8Views, Int32Views); +const createIntDataViews = createTypedArray.bind(null, IntViews, null); +const createUintDataViews = createTypedArray.bind(null, UintViews, null); +const createDateDataViews = createTypedArray.bind(null, Uint32Views, null); +const createFloatDataViews = createTypedArray.bind(null, FloatViews, null); +const createNestedDataViews = createTypedArray.bind(null, Uint32Views, null); +const createValidityDataViews = createTypedArray.bind(null, Uint8Views, null); +const createUtf8DataViews = createTypedArray.bind(null, Uint8Views, Int32Views); +// Define as computed properties for closure-compiler const floatVectors = { + [Precision.HALF]: Float32Vector, [Precision.SINGLE]: Float32Vector, - [Precision.DOUBLE]: Float64Vector -}; + [Precision.DOUBLE]: Float64Vector, +} as { [k: number]: any }; + +// and again as string-indexed keys for Uglify... 
+floatVectors[Precision['HALF']] = Float32Vector;
+floatVectors[Precision['SINGLE']] = Float32Vector;
+floatVectors[Precision['DOUBLE']] = Float64Vector;
+
 const intVectors = [
   [/* unsigned */ Uint8Vector,  /* signed */ Int8Vector ],
   [/* unsigned */ Uint16Vector, /* signed */ Int16Vector],
   [/* unsigned */ Uint32Vector, /* signed */ Int32Vector],
   [/* unsigned */ Uint64Vector, /* signed */ Int64Vector]
-];
+] as any[][];

-function readIntVector(field: Field, batch: MessageBatch, iterator: IteratorState, dictionaries: Dictionaries, fieldType?: FieldType) {
-  let type = (fieldType || field.type(new Int()));
+function readIntVector(field: Field, batch: MessageBatch, iterator: IteratorState, dictionaries: Dictionaries, primitiveType?: PrimitiveType) {
+  let type = (primitiveType || field.type(new Int())!);
   return type.isSigned() ?
     read_IntVector(field, batch, iterator, dictionaries, type) :
     readUintVector(field, batch, iterator, dictionaries, type);
 }

-const read_IntVector = readVectorLayout(createIntDataViews, createIntVector);
-const readUintVector = readVectorLayout(createUintDataViews, createIntVector);
-function createIntVector(field, length, data, validity, offsets, fieldType, batch, iterator, dictionaries) {
-  let type = fieldType || field.type(new Int()), bitWidth = type.bitWidth();
-  let Vector = valueForBitWidth(bitWidth, intVectors)[+type.isSigned()];
-  return Vector.create(field, length, validity, data || offsets);
-  // ----------------------------------------------- 👆:
+function read_IntVector(field: Field, batch: MessageBatch, iterator: IteratorState, dictionaries: Dictionaries, primitiveType?: PrimitiveType) {
+  return readVectorLayout(createIntDataViews, createIntVector, field, batch, iterator, dictionaries, primitiveType);
+}
+
+function readUintVector(field: Field, batch: MessageBatch, iterator: IteratorState, dictionaries: Dictionaries, primitiveType?: PrimitiveType) {
+  return readVectorLayout(createUintDataViews, createIntVector, field, batch, iterator, dictionaries, primitiveType);
+}
+
+function createIntVector(argv: VectorFactoryArgv) {
+  let { field, fieldNode, data, validity, offsets, primitiveType } = argv;
+  let type = primitiveType || field.type(new Int())!, bitWidth = type.bitWidth();
+  let IntVector = valueForBitWidth(bitWidth, intVectors)[+type.isSigned()];
+  return new IntVector({ fieldNode, field, validity, data: data! || offsets! });
+  // ---------------------------------------------------- 👆:
   // Workaround for https://issues.apache.org/jira/browse/ARROW-1363
   // This bug causes dictionary encoded vector indices' IntVector data
   // buffers to be tagged as VectorType.OFFSET (0) in the field metadata
@@ -118,68 +141,58 @@ function createIntVector(field, length, data, validity, offsets, fieldType, batc
   // the offset buffer is the data, because IntVectors don't have offsets.
} -const readFloatVector = readVectorLayout( - createFloatDataViews, - (field, length, data, validity, offsets, fieldType, batch, iterator, dictionaries) => { - let type = field.type(new FloatingPoint()); - let Vector = floatVectors[type.precision()]; - return Vector.create(field, length, validity, data); - } -); +function bindVectorReader(createBufferView: BufferViewFactory, createVector: VectorFactory) { + return function readVector(field: Field, batch: MessageBatch, iterator: IteratorState, dictionaries: Dictionaries, primitiveType?: PrimitiveType) { + return readVectorLayout(createBufferView, createVector, field, batch, iterator, dictionaries, primitiveType); + }; +} -const readDateVector = readVectorLayout( - createDateDataViews, - (field, length, data, validity, offsets, fieldType, batch, iterator, dictionaries) => { - return DateVector.create(field, length, validity, data); - } -); +const readFloatVector = bindVectorReader(createFloatDataViews, ({ field, fieldNode, data, validity }: VectorFactoryArgv) => { + const type = field.type(new FloatingPoint())!; + const FloatVector = floatVectors[type.precision()]; + return new FloatVector({ field, fieldNode, validity, data: data! }); +}); -const readUtf8Vector = readVectorLayout( - createUtf8DataViews, - (field, length, data, validity, offsets, fieldType, batch, iterator, dictionaries) => { - let offsetsAdjusted = new Int32Array(offsets.buffer, offsets.byteOffset, length + 1); - return Utf8Vector.create( - field, length, validity, - Uint8Vector.create(field, data.length, null, data), - IndexVector.create(field, length + 1, null, offsetsAdjusted) - ); - } -); +const readDateVector = bindVectorReader(createDateDataViews, ({ field, fieldNode, data, validity }: VectorFactoryArgv) => { + return new DateVector({ field, fieldNode, validity, data: data! }); +}); -const readListVector = readVectorLayout( - createNestedDataViews, - (field, length, data, validity, offsets, fieldType, batch, iterator, dictionaries) => { - let offsetsAdjusted = new Int32Array(offsets.buffer, offsets.byteOffset, length + 1); - return ListVector.create( - field, length, validity, - readVector(field.children(0), batch, iterator, dictionaries), - IndexVector.create(field, length + 1, null, offsetsAdjusted) - ); - } -); +const readUtf8Vector = bindVectorReader(createUtf8DataViews, ({ field, fieldNode, data, offsets, validity }: VectorFactoryArgv) => { + return new Utf8Vector({ + field, fieldNode, + values: new ListVector({ + validity, + offsets: offsets as Int32Array, + values: new Uint8Vector({ data: data! }) + }) as any as Vector + }); +}); -const readFixedSizeListVector = readVectorLayout( - createNestedDataViews, - (field, length, data, validity, offsets, fieldType, batch, iterator, dictionaries) => { - let size = field.type(new FixedSizeList()).listSize(); - return FixedSizeListVector.create( - field, length, size, validity, - readVector(field.children(0), batch, iterator, dictionaries) - ); - } -); +const readListVector = bindVectorReader(createNestedDataViews, ({ field, fieldNode, offsets, validity, iterator, messageBatch, dictionaries }: VectorFactoryArgv) => { + return new ListVector({ + field, fieldNode, validity, + offsets: offsets! as Int32Array, + values: readVector(field.children(0)!, messageBatch, iterator, dictionaries)! 
+ }); +}); -const readStructVector = readVectorLayout>( - createNestedDataViews, - (field, length, data, validity, offsets, fieldType, batch, iterator, dictionaries) => { - let vectors: Vector[] = []; - for (let i = -1, n = field.childrenLength(); ++i < n;) { - vectors[i] = readVector(field.children(i), batch, iterator, dictionaries); - } - return StructVector.create(field, length, validity, ...vectors); +const readFixedSizeListVector = bindVectorReader(createNestedDataViews, ({ field, fieldNode, validity, iterator, messageBatch, dictionaries }: VectorFactoryArgv) => { + return new FixedSizeListVector({ + field, fieldNode, validity, + listSize: field.type(new FixedSizeList())!.listSize(), + values: readVector(field.children(0)!, messageBatch, iterator, dictionaries)! + }); +}); + +const readStructVector = bindVectorReader(createNestedDataViews, ({ field, fieldNode, validity, iterator, messageBatch, dictionaries }: VectorFactoryArgv>) => { + let columns: Column[] = []; + for (let i = -1, n = field.childrenLength(); ++i < n;) { + columns[i] = readVector(field.children(i)!, messageBatch, iterator, dictionaries) as Column; } -); + return new StructVector({ field, fieldNode, validity, columns }); +}); +// Define as computed properties for closure-compiler const typedVectorReaders = { [Type.Int]: readIntVector, [Type.Date]: readDateVector, @@ -188,60 +201,73 @@ const typedVectorReaders = { [Type.Struct_]: readStructVector, [Type.FloatingPoint]: readFloatVector, [Type.FixedSizeList]: readFixedSizeListVector, -}; +} as { [k: number]: (...args: any[]) => Vector | null }; + +// and again as string-indexed keys for Uglify... +typedVectorReaders[Type['Int']] = readIntVector; +typedVectorReaders[Type['Date']] = readDateVector; +typedVectorReaders[Type['List']] = readListVector; +typedVectorReaders[Type['Utf8']] = readUtf8Vector; +typedVectorReaders[Type['Struct_']] = readStructVector; +typedVectorReaders[Type['FloatingPoint']] = readFloatVector; +typedVectorReaders[Type['FixedSizeList']] = readFixedSizeListVector; + +type VectorFactory = (argv: VectorFactoryArgv) => V; +type PrimitiveType = { bitWidth(): number; isSigned(): boolean }; +type BufferViewFactory = (batch: MessageBatch, type: VectorType, bitWidth: number, offset: number, length: number) => T; -type FieldType = { bitWidth(): number; isSigned(): boolean }; -type dataViewFactory = (batch: MessageBatch, type: VectorType, bitWidth: number, offset: number, length: number) => V; -type vectorFactory> = (field: Field, - length: number, - data: TList, - nulls: Uint8Array, - offsets: TypedArray, - fieldType: FieldType, - chunk: MessageBatch, - iterable: IteratorState, - dictionaries: Dictionaries) => V; +interface VectorFactoryArgv { + field: Field; + fieldNode: FieldNode; + iterator: IteratorState; + dictionaries: Dictionaries; + messageBatch: MessageBatch; + data?: T; + offsets?: TypedArray; + validity?: Uint8Array; + primitiveType?: PrimitiveType; +} -function readVectorLayout(createDataView: dataViewFactory, createVector: vectorFactory>) { - return function readLayout( - field: Field, - chunk: MessageBatch, - iterator: IteratorState, - dictionaries: Dictionaries, - integerFieldType?: FieldType - ) { - let batch = chunk.data; - let layoutLength = field.layoutLength(); - let node = batch.nodes(iterator.nodeIndex++); - let data: TList, offsets: any, validity: Uint8Array; - let type, bitWidth, bufferLength, nodeLength = node.length().low; - for (let i = -1; ++i < layoutLength;) { - let layout = field.layout(i); - let buffer = 
batch.buffers(iterator.bufferIndex++); - if ((type = layout.type()) === VectorType.TYPE || - (bufferLength = buffer.length().low) <= 0 || - (bitWidth = layout.bitWidth()) <= 0) { - continue; - } else if (type === VectorType.DATA) { - data = createDataView(chunk, type, bitWidth, buffer.offset().low, bufferLength); - } else if (type === VectorType.OFFSET) { - offsets = createDataView(chunk, type, bitWidth, buffer.offset().low, bufferLength); - } else if (node.nullCount().low > 0) { - validity = createValidityDataViews(chunk, type, bitWidth, buffer.offset().low, nodeLength); - } +function readVectorLayout( + createBufferView: BufferViewFactory, createVector: VectorFactory, + field: Field, messageBatch: MessageBatch, iterator: IteratorState, dictionaries: Dictionaries, primitiveType?: PrimitiveType +) { + let fieldNode: FieldNode, recordBatch = messageBatch.data; + if (!(fieldNode = recordBatch.nodes(iterator.nodeIndex)!)) { + return null; + } + iterator.nodeIndex += 1; + let type, bitWidth, layout, buffer, bufferLength; + let data: T | undefined, offsets: TypedArray | undefined, validity: Uint8Array | undefined; + for (let i = -1, n = field.layoutLength(); ++i < n;) { + if (!(layout = field.layout(i)!) || + !(buffer = recordBatch.buffers(iterator.bufferIndex)!)) { + continue; } - return createVector(field, nodeLength, data, validity, offsets, integerFieldType, chunk, iterator, dictionaries); - }; + iterator.bufferIndex += 1; + if ((type = layout.type()) === VectorType.TYPE || + (bufferLength = buffer.length().low) <= 0 || + (bitWidth = layout.bitWidth()) <= 0) { + continue; + } else if (type === VectorType.DATA) { + data = createBufferView(messageBatch, type, bitWidth, buffer.offset().low, bufferLength); + } else if (type === VectorType.OFFSET) { + offsets = createBufferView(messageBatch, type, bitWidth, buffer.offset().low, bufferLength); + } else if (fieldNode.nullCount().low > 0) { + validity = createValidityDataViews(messageBatch, type, bitWidth, buffer.offset().low, fieldNode.length().low); + } + } + return createVector({ data, offsets, validity, field, fieldNode, iterator, messageBatch, dictionaries, primitiveType }); } -function createDataView( - dataViews: TypedArrayCtor[], offsetViews: TypedArrayCtor[] | null, +function createTypedArray( + bufferViews: TypedArrayConstructor[], offsetViews: TypedArrayConstructor[] | null, batch: MessageBatch, type: VectorType, bitWidth: number, offset: number, length: number ) { const buffer = batch.bytes.buffer; const byteLength = buffer.byteLength; const byteOffset = batch.offset + offset; - const DataViewType = valueForBitWidth(bitWidth, type === VectorType.OFFSET && offsetViews || dataViews); + const DataViewType = valueForBitWidth(bitWidth, type === VectorType.OFFSET && offsetViews || bufferViews); const dataViewLength = ((byteOffset + length) <= byteLength ? 
length : byteLength - byteOffset @@ -249,12 +275,12 @@ function createDataView( return new DataViewType(buffer, byteOffset, dataViewLength); } -function valueForBitWidth(bitWidth: number, values: any[]) { +function valueForBitWidth(bitWidth: number, values: T[]) { return values[bitWidth >> 4] || values[3]; } -function createSyntheticDictionaryIndexField(field: Field, type: FieldType) { - let layouts = []; +function createSyntheticDictionaryIndexField(field: Field, type: PrimitiveType) { + let layouts = [] as VectorLayout[]; let builder = new flatbuffers.Builder(); if (field.nullable()) { VectorLayout.startVectorLayout(builder); @@ -270,7 +296,7 @@ function createSyntheticDictionaryIndexField(field: Field, type: FieldType) { builder.finish(VectorLayout.endVectorLayout(builder)); layouts.push(VectorLayout.getRootAsVectorLayout(builder.dataBuffer())); return Object.create(field, { - layout: { value(i) { return layouts[i]; } }, + layout: { value(i: number) { return layouts[i]; } }, layoutLength: { value() { return layouts.length; } } }); -} \ No newline at end of file +} diff --git a/js/src/table.ts b/js/src/table.ts deleted file mode 100644 index 5e781054daf31..0000000000000 --- a/js/src/table.ts +++ /dev/null @@ -1,143 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -import { readBuffers } from './reader/arrow'; -import { StructVector } from './vector/struct'; -import { Vector, sliceToRangeArgs } from './vector/vector'; - -export type RowObject = { [k: string]: any }; - -export class Table implements Iterable> { - public length: number; - protected _columns: Vector[]; - protected _columnsMap: { [k: string]: Vector }; - static from(...bytes: Array) { - let columns: Vector[]; - for (let vectors of readBuffers(...bytes)) { - columns = !columns ? 
vectors : columns.map((v, i) => v.concat(vectors[i])); - } - return new Table(columns); - } - static fromStruct(vector: StructVector) { - return new Table(( vector).vectors); - } - constructor(columns: Vector[]) { - this._columns = columns || []; - this.length = Math.max(...this._columns.map((v) => v.length)); - this._columnsMap = this._columns.reduce((map, vec) => { - return (map[vec.name] = vec) && map || map; - }, {}); - } - *[Symbol.iterator]() { - for (let cols = this._columns, i = -1, n = this.length; ++i < n;) { - yield rowAsMap(i, cols); - } - } - *rows(startRow?: number | boolean, endRow?: number | boolean, compact?: boolean) { - let start = startRow as number, end = endRow as number; - if (typeof startRow === 'boolean') { - compact = startRow; - start = end; - end = undefined; - } else if (typeof endRow === 'boolean') { - compact = endRow; - end = undefined; - } - let rowIndex = -1, { length } = this; - const [rowOffset, rowsTotal] = sliceToRangeArgs(length, start, end); - while (++rowIndex < rowsTotal) { - yield this.getRow((rowIndex + rowOffset) % length, compact); - } - } - *cols(startCol?: number, endCol?: number) { - for (const column of this._columns.slice(startCol, endCol)) { - yield column; - } - } - getRow(rowIndex: number): RowObject; - getRow(rowIndex: number, compact: boolean): Array; - getRow(rowIndex: number, compact?: boolean) { - return (compact && rowAsArray || rowAsObject)(rowIndex, this._columns); - } - getCell(columnName: string, rowIndex: number) { - return this.getColumn>(columnName).get(rowIndex); - } - getCellAt(columnIndex: number, rowIndex: number) { - return this.getColumnAt>(columnIndex).get(rowIndex); - } - getColumn>(columnName: string) { - return this._columnsMap[columnName] as T; - } - getColumnAt>(columnIndex: number) { - return this._columns[columnIndex] as T; - } - toString(): string; - toString(index: boolean): string; - toString(options: { index: boolean }): string; - toString(options?: any) { - const index = typeof options === 'object' ? options && !!options.index - : typeof options === 'boolean' ? !!options - : false; - const { length } = this; - if (length <= 0) { return ''; } - const maxColumnWidths = []; - const rows = new Array(length + 1); - rows[0] = this._columns.map((c) => c.name); - index && rows[0].unshift('Index'); - for (let i = -1, n = rows.length - 1; ++i < n;) { - rows[i + 1] = this.getRow(i, true); - index && rows[i + 1].unshift(i); - } - // Pass one to convert to strings and count max column widths - for (let i = -1, n = rows.length; ++i < n;) { - const row = rows[i]; - for (let j = -1, k = row.length; ++j < k;) { - const val = row[j] = `${row[j]}`; - maxColumnWidths[j] = !maxColumnWidths[j] - ? 
val.length - : Math.max(maxColumnWidths[j], val.length); - } - } - // Pass two to pad each one to max column width - for (let i = -1, n = rows.length; ++i < n;) { - const row = rows[i]; - for (let j = -1, k = row.length; ++j < k;) { - row[j] = leftPad(row[j], ' ', maxColumnWidths[j]); - } - rows[i] = row.join(', '); - } - return rows.join('\n'); - } -} - -Table.prototype.length = 0; - -function leftPad(str, fill, n) { - return (new Array(n + 1).join(fill) + str).slice(-1 * n); -} - -function rowAsMap(row: number, columns: Vector[]) { - return columns.reduce((map, vector) => map.set(vector.name, vector.get(row)), new Map()); -} - -function rowAsObject(rowIndex: number, columns: Vector[]) { - return columns.reduce((row, vector) => (row[vector.name] = vector.get(rowIndex)) && row || row, Object.create(null)); -} - -function rowAsArray(rowIndex: number, columns: Vector[]) { - return columns.reduce((row, vector, columnIndex) => (row[columnIndex] = vector.get(rowIndex)) && row || row, new Array(columns.length)); -} diff --git a/js/src/text-encoding-utf-8.d.ts b/js/src/text-encoding-utf-8.d.ts new file mode 100644 index 0000000000000..68ba4dfd9a346 --- /dev/null +++ b/js/src/text-encoding-utf-8.d.ts @@ -0,0 +1,4 @@ +declare module 'text-encoding-utf-8' { + import * as TextEncoding from 'text-encoding'; + export = TextEncoding; +} diff --git a/js/src/types/arrow.ts b/js/src/types/arrow.ts new file mode 100644 index 0000000000000..e18f5da4f1fd6 --- /dev/null +++ b/js/src/types/arrow.ts @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import * as Schema_ from '../format/Schema_generated'; +import * as Message_ from '../format/Message_generated'; +import Field = Schema_.org.apache.arrow.flatbuf.Field; +import FieldNode = Message_.org.apache.arrow.flatbuf.FieldNode; + +import { BoolVector } from './vector/bool'; +import { DictionaryVector } from './dictionary'; +import { nullableMixin, fieldMixin } from './vector/traits'; +import { ListVector as ListVectorBase } from './list'; +import { Utf8Vector as Utf8VectorBase } from './utf8'; +import { Vector, Column, TypedArray } from './types'; +import { DateVector as DateVectorBase } from './vector/date'; +import { TableVector as TableVectorBase } from './table/table'; +import { StructVector as StructVectorBase } from './table/struct'; +import { FixedSizeListVector as FixedSizeListVectorBase } from './fixedsizelist'; +import { + LongVector as LongVectorBase, + Int64Vector as Int64VectorBase, + Uint64Vector as Uint64VectorBase, +} from './vector/long'; +import { + TypedVector, + Int8Vector as Int8VectorBase, + Int16Vector as Int16VectorBase, + Int32Vector as Int32VectorBase, + Uint8Vector as Uint8VectorBase, + Uint16Vector as Uint16VectorBase, + Uint32Vector as Uint32VectorBase, + Float32Vector as Float32VectorBase, + Float64Vector as Float64VectorBase, +} from './vector/typed'; + +export { TypedArray, TypedVector }; +export { Column, BoolVector, DictionaryVector }; +export class ListVector extends MixinArrowTraits(ListVectorBase) {} +export class Utf8Vector extends MixinArrowTraits(Utf8VectorBase) {} +export class TableVector extends MixinArrowTraits(TableVectorBase) {} +export class StructVector extends MixinArrowTraits(StructVectorBase) {} +export class FixedSizeListVector extends MixinArrowTraits(FixedSizeListVectorBase) {} +export class DateVector extends MixinArrowTraits(DateVectorBase) {} +export class LongVector extends MixinArrowTraits(LongVectorBase) {} +export class Int8Vector extends MixinArrowTraits(Int8VectorBase) {} +export class Int16Vector extends MixinArrowTraits(Int16VectorBase) {} +export class Int32Vector extends MixinArrowTraits(Int32VectorBase) {} +export class Int64Vector extends MixinArrowTraits(Int64VectorBase) {} +export class Uint8Vector extends MixinArrowTraits(Uint8VectorBase) {} +export class Uint16Vector extends MixinArrowTraits(Uint16VectorBase) {} +export class Uint32Vector extends MixinArrowTraits(Uint32VectorBase) {} +export class Uint64Vector extends MixinArrowTraits(Uint64VectorBase) {} +export class Float32Vector extends MixinArrowTraits(Float32VectorBase) {} +export class Float64Vector extends MixinArrowTraits(Float64VectorBase) {} + +export function MixinArrowTraits, TArgv>(BaseVector: new (argv: TArgv) => T) { + const FieldVector = fieldMixin(BaseVector); + const NullableVector = nullableMixin(BaseVector); + const NullableFieldVector = nullableMixin(FieldVector); + return function(this: any, argv: TArgv & (object | { validity: Uint8Array } | { field: Field, fieldNode: FieldNode })) { + return new ((!isFieldArgv(argv) ? !isNullableArgv(argv) ? + BaseVector : NullableVector : !isNullableArgv(argv) ? 
+ FieldVector : NullableFieldVector + ) as any)(argv); + } as any as { new (argv: TArgv & (object | { validity: Uint8Array } | { field: Field, fieldNode: FieldNode })): T }; +} + +function isFieldArgv(x: any): x is { field: Field, fieldNode: FieldNode } { + return x && x.field instanceof Field && x.fieldNode instanceof FieldNode; +} + +function isNullableArgv(x: any): x is { validity: Uint8Array } { + return x && x.validity && ArrayBuffer.isView(x.validity) && x.validity instanceof Uint8Array; +} diff --git a/js/src/types/dictionary.ts b/js/src/types/dictionary.ts new file mode 100644 index 0000000000000..cafa753311d3b --- /dev/null +++ b/js/src/types/dictionary.ts @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Vector, Column } from './types'; +import { VirtualVector } from './vector/virtual'; + +export interface DictionaryVector extends Vector { + getValue(key: number): T; + getKey(index: number): number; +} + +export class DictionaryVector extends Vector implements Column, DictionaryVector { + readonly data: Vector; + readonly keys: Column; + constructor(argv: { data: Vector, keys: Vector }) { + super(); + this.data = argv.data; + this.keys = argv.keys as Column; + } + get name () { return this.keys.name; } + get type () { return this.keys.type; } + get length () { return this.keys.length; } + get metadata () { return this.keys.metadata; } + get nullable () { return this.keys.nullable; } + get nullCount () { return this.keys.nullCount; } + get(index: number) { + return this.getValue(this.getKey(index)!); + } + getKey(index: number) { + return this.keys.get(index); + } + getValue(key: number) { + return this.data.get(key); + } + concat(...vectors: Vector[]): Vector { + return new VirtualVector(Array, this, ...vectors); + } + *[Symbol.iterator]() { + const { data, keys } = this; + for (let i = -1, n = keys.length; ++i < n;) { + yield data.get(keys.get(i)!); + } + } +} diff --git a/js/src/vector/struct.ts b/js/src/types/fixedsizelist.ts similarity index 53% rename from js/src/vector/struct.ts rename to js/src/types/fixedsizelist.ts index e59ac91e9cd08..6311d891d5058 100644 --- a/js/src/vector/struct.ts +++ b/js/src/types/fixedsizelist.ts @@ -15,25 +15,21 @@ // specific language governing permissions and limitations // under the License. 
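// For illustration only (not part of this patch): in the DictionaryVector
// above, unique values live once in `data` and each row stores an index
// into them via `keys`, so `get(i)` resolves to `data.get(keys.get(i))`.
// A sketch (the two inner vectors are hypothetical stand-ins):
//
//   const dict = new DictionaryVector({
//     data: colorValues, // e.g. a Utf8Vector of ['red', 'green']
//     keys: colorKeys    // e.g. an Int32Vector of [0, 1, 0, 0]
//   });
//   dict.get(3);      // 'red'
//   dict.getKey(1);   // 1
//   dict.getValue(1); // 'green'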
-import { Vector } from './vector'; -import { BitVector, ValidityArgs } from './typed'; +import { List, Vector } from './types'; +import { VirtualVector } from './vector/virtual'; -export class StructVector extends Vector { - protected vectors: Vector[]; - constructor(validity: ValidityArgs, ...vectors: Vector[]) { +export class FixedSizeListVector> extends Vector { + readonly listSize: number; + readonly values: Vector; + constructor(argv: { listSize: number, values: Vector }) { super(); - this.vectors = vectors; - this.length = Math.max(0, ...vectors.map((v) => v.length)); - validity && (this.validity = BitVector.from(validity)); + this.values = argv.values; + this.listSize = argv.listSize; } get(index: number) { - return this.validity.get(index) ? this.vectors.map((v) => v.get(index)) : null; + return this.values.slice(this.listSize * index, this.listSize * (index + 1)); } - concat(vector: StructVector) { - return StructVector.from(this, - this.length + vector.length, - this.validity.concat(vector.validity), - ...this.vectors.map((v, i) => v.concat(vector.vectors[i])) - ); + concat(...vectors: Vector[]): Vector { + return new VirtualVector(Array, this, ...vectors); } } diff --git a/js/src/types/list.ts b/js/src/types/list.ts new file mode 100644 index 0000000000000..ca9170b5908d5 --- /dev/null +++ b/js/src/types/list.ts @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { List, Vector } from './types'; +import { VirtualVector } from './vector/virtual'; + +export class ListVector> extends Vector { + readonly offsets: Int32Array; + readonly values: Vector; + constructor(argv: { offsets: Int32Array, values: Vector }) { + super(); + this.values = argv.values; + this.offsets = argv.offsets; + } + get(index: number) { + return this.values.slice(this.offsets[index], this.offsets[index + 1]); + } + concat(...vectors: Vector[]): Vector { + return new VirtualVector(Array, this, ...vectors); + } +} \ No newline at end of file diff --git a/js/src/types/table/from.ts b/js/src/types/table/from.ts new file mode 100644 index 0000000000000..ae0755961eb7d --- /dev/null +++ b/js/src/types/table/from.ts @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import { Column } from '../types';
+import { TableVector } from './table';
+import { readBuffers } from '../../reader/arrow';
+
+export function fromBuffers(...bytes: Array<Uint8Array | Buffer | string>) {
+    let columns: Column[] = null as any;
+    for (let vectors of readBuffers(...bytes)) {
+        columns = !columns ? vectors : columns.map((v, i) => v.concat(vectors[i]) as Column);
+    }
+    return new TableVector({ columns });
+}
+
+TableVector.from = fromBuffers;
+
+declare module './table' {
+    namespace TableVector { export let from: typeof fromBuffers; }
+}
\ No newline at end of file
diff --git a/js/src/types/table/row.ts b/js/src/types/table/row.ts
new file mode 100644
index 0000000000000..432cfd7364ba3
--- /dev/null
+++ b/js/src/types/table/row.ts
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import { Row, Vector, Struct } from '../types';
+import { VirtualVector } from '../vector/virtual';
+
+export class RowVector<T = any> extends Vector<T> implements Row<T> {
+    readonly row: number;
+    readonly length: number;
+    readonly table: Struct;
+    [Symbol.toStringTag]() { return 'Row'; }
+    constructor(table: Struct, row: number) {
+        super();
+        this.row = row;
+        this.table = table;
+        this.length = table.columns.length;
+    }
+    get(index: number) {
+        const col = this.table.columns[index];
+        return col ? col.get(this.row) as T : null;
+    }
+    col(key: string) {
+        const col = this.table.col(key);
+        return col ? col.get(this.row) as T : null;
+    }
+    *[Symbol.iterator]() {
+        const { row } = this;
+        for (const col of this.table.columns) {
+            yield col ? col.get(row) : null;
+        }
+    }
+    concat(...rows: Vector[]): Vector {
+        return new VirtualVector(Array, this, ...rows as any[]);
+    }
+    toArray() { return [...this]; }
+    toJSON() { return this.toArray(); }
+    toString() { return `Row [${this.length}]`; }
+    toObject(): Record<string, any> {
+        const { row } = this, map = Object.create(null);
+        for (const col of this.table.columns) {
+            if (col && col.name) {
+                map[col.name] = col.get(row);
+            }
+        }
+        return map;
+    }
+}
diff --git a/js/src/types/table/struct.ts b/js/src/types/table/struct.ts
new file mode 100644
index 0000000000000..de6a3a05692aa
--- /dev/null
+++ b/js/src/types/table/struct.ts
@@ -0,0 +1,63 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import { RowVector } from './row';
+import { toString } from './toString';
+import { VirtualVector } from '../vector/virtual';
+import { Row, Vector, Column, Struct } from '../types';
+
+export interface StructVector<T = any> {
+    toString(): string;
+    toString(index: boolean): string;
+    toString(options: { index: boolean }): string;
+}
+
+export class StructVector<T = any> extends Vector<Row<T>> implements Struct<T> {
+    readonly length: number;
+    readonly columns: Column<T>[];
+    constructor(argv: { columns: Column<T>[] }) {
+        super();
+        this.columns = argv.columns || [];
+        if (!this.length) {
+            this.length = Math.max(...this.columns.map((col) => col.length)) | 0;
+        }
+    }
+    get(index: number): StructRow<T> {
+        return new StructRow(this, index);
+    }
+    col(name: string) {
+        return this.columns.find((col) => col.name === name) || null;
+    }
+    key(index: number) {
+        return this.columns[index] ? this.columns[index].name : null;
+    }
+    select(...columns: string[]) {
+        return new StructVector({ columns: columns.map((name) => this.col(name)!) });
+    }
+    concat(...structs: Vector<Row<T>>[]): Vector<Row<T>> {
+        return new VirtualVector(Array, this, ...structs as any[]);
+    }
+    toString(x?: any) {
+        return toString(this, x);
+    }
+}
+
+export class StructRow<T = any> extends RowVector<T> {
+    toString() {
+        return JSON.stringify(this);
+    }
+}
\ No newline at end of file
diff --git a/js/src/types/table/table.ts b/js/src/types/table/table.ts
new file mode 100644
index 0000000000000..d9074dec2d382
--- /dev/null
+++ b/js/src/types/table/table.ts
@@ -0,0 +1,30 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
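A hedged sketch of how the StructVector API above reads rows and columns; `nameCol` and `ageCol` are assumed, prebuilt Column instances whose `name` properties are 'name' and 'age':

import { Column } from '../types';
import { StructVector } from './struct';

declare const nameCol: Column; // assumed: holds strings, nameCol.name === 'name'
declare const ageCol: Column;  // assumed: holds numbers, ageCol.name === 'age'

const struct = new StructVector({ columns: [nameCol, ageCol] });
struct.get(0).col('age');    // the 'age' cell of row 0
struct.col('name')!.get(2);  // row 2 of the 'name' column
struct.select('name');       // a narrower StructVector over one column
[...struct.get(0)];          // a row spreads into its cell values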
+
+import { StructVector, StructRow } from './struct';
+
+export class TableVector extends StructVector {
+    get(index: number): TableRow {
+        return new TableRow(this, index);
+    }
+}
+
+export class TableRow extends StructRow {
+    toString() {
+        return this.toArray().map((x) => JSON.stringify(x)).join(', ');
+    }
+}
\ No newline at end of file
diff --git a/js/src/types/table/toString.ts b/js/src/types/table/toString.ts
new file mode 100644
index 0000000000000..85b23ca246058
--- /dev/null
+++ b/js/src/types/table/toString.ts
@@ -0,0 +1,40 @@
+import { Struct } from '../types';
+
+export function toString(source: Struct, options?: any) {
+    const index = typeof options === 'object' ? options && !!options.index
+                : typeof options === 'boolean' ? !!options
+                : false;
+    const { length } = source;
+    if (length <= 0) { return ''; }
+    const rows = new Array(length + 1);
+    const maxColumnWidths = [] as number[];
+    rows[0] = source.columns.map((_, i) => source.key(i));
+    index && rows[0].unshift('Index');
+    for (let i = -1, n = rows.length - 1; ++i < n;) {
+        rows[i + 1] = [...source.get(i)!];
+        index && rows[i + 1].unshift(i);
+    }
+    // Pass one to convert to strings and count max column widths
+    for (let i = -1, n = rows.length; ++i < n;) {
+        const row = rows[i];
+        for (let j = -1, k = row.length; ++j < k;) {
+            const val = row[j] = `${row[j]}`;
+            maxColumnWidths[j] = !maxColumnWidths[j]
+                ? val.length
+                : Math.max(maxColumnWidths[j], val.length);
+        }
+    }
+    // Pass two to pad each one to max column width
+    for (let i = -1, n = rows.length; ++i < n;) {
+        const row = rows[i];
+        for (let j = -1, k = row.length; ++j < k;) {
+            row[j] = leftPad(row[j], ' ', maxColumnWidths[j]);
+        }
+        rows[i] = row.join(', ');
+    }
+    return rows.join('\n');
+}
+
+function leftPad(str: string, fill: string, n: number) {
+    return (new Array(n + 1).join(fill) + str).slice(-1 * n);
+}
diff --git a/js/src/types/types.ts b/js/src/types/types.ts
new file mode 100644
index 0000000000000..f732bc0971f35
--- /dev/null
+++ b/js/src/types/types.ts
@@ -0,0 +1,98 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
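For intuition, here is roughly what the two-pass pretty-printer in toString.ts above produces: pass one stringifies every cell and records each column's max width, pass two right-aligns each cell to that width via leftPad. The column names, values, and the `pad` helper below are invented for illustration:

// a minimal restatement of leftPad above
const pad = (str: string, n: number) => (new Array(n + 1).join(' ') + str).slice(-n);
pad('9', 3);   // '  9'
pad('Foo', 4); // ' Foo'
// so a two-column struct prints, with { index: true }:
// Index, name, age
//     0,  Foo,  10
//     1, Barb,   9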
+
+import * as Schema_ from '../format/Schema_generated';
+import Type = Schema_.org.apache.arrow.flatbuf.Type;
+
+export interface TypedArrayConstructor<T extends TypedArray> {
+    readonly prototype: T;
+    readonly BYTES_PER_ELEMENT: number;
+    new (length: number): T;
+    new (elements: Iterable<number>): T;
+    new (arrayOrArrayBuffer: ArrayLike<number> | ArrayBufferLike): T;
+    new (buffer: ArrayBufferLike, byteOffset: number, length?: number): T;
+}
+
+export interface TypedArray extends Iterable<number> {
+    [index: number]: number;
+    readonly length: number;
+    readonly byteLength: number;
+    readonly byteOffset: number;
+    readonly buffer: ArrayBufferLike;
+    readonly BYTES_PER_ELEMENT: number;
+    [Symbol.iterator](): IterableIterator<number>;
+    slice(start?: number, end?: number): TypedArray;
+    subarray(begin: number, end?: number): TypedArray;
+    set(array: ArrayLike<number>, offset?: number): void;
+}
+
+export type FloatArray = Float32Array | Float64Array;
+export type IntArray = Int8Array | Int16Array | Int32Array;
+export type UintArray = Uint8ClampedArray | Uint8Array | Uint16Array | Uint32Array;
+
+export type List<T> = T[] | TypedArray;
+
+export interface Vector<T = any> extends Iterable<T | null> {
+    readonly length: number;
+    get(index: number): T | null;
+    concat(...vectors: Vector<T>[]): Vector<T>;
+    slice<R = T[]>(start?: number, end?: number): R;
+}
+
+export interface Row<T = any> extends Vector<T> {
+    col(key: string): T | null;
+}
+
+export interface Column<T = any> extends Vector<T> {
+    readonly name: string;
+    readonly type: string;
+    readonly nullable: boolean;
+    readonly nullCount: number;
+    readonly metadata: Map<string, string>;
+}
+
+export interface Struct<T = any> extends Vector<Row<T>> {
+    readonly columns: Column<T>[];
+    key(key: number): string | null;
+    col(key: string): Column<T> | null;
+    select(...columns: string[]): Struct<T>;
+    concat(...structs: Vector<Row<T>>[]): Vector<Row<T>>;
+}
+
+export class Vector<T = any> implements Vector<T> {
+    slice<R = T[]>(start?: number, end?: number): R {
+        let { length } = this, from = start! | 0;
+        let to = end === undefined ? length : Math.max(end | 0, from);
+        let result = new Array(to - Math.min(from, to));
+        for (let i = -1, n = result.length; ++i < n;) {
+            result[i] = this.get(i + from);
+        }
+        return result as any;
+    }
+    *[Symbol.iterator]() {
+        for (let i = -1, n = this.length; ++i < n;) {
+            yield this.get(i);
+        }
+    }
+}
+
+(Vector.prototype as any).name = '';
+(Vector.prototype as any).type = Type[0];
+(Vector.prototype as any).stride = 1;
+(Vector.prototype as any).nullable = !1;
+(Vector.prototype as any).nullCount = 0;
+(Vector.prototype as any).metadata = new Map();
diff --git a/js/src/vector/dictionary.ts b/js/src/types/utf8.ts
similarity index 50%
rename from js/src/vector/dictionary.ts
rename to js/src/types/utf8.ts
index de811eaf5b050..178704f6161f5 100644
--- a/js/src/vector/dictionary.ts
+++ b/js/src/types/utf8.ts
@@ -15,37 +15,26 @@
 // specific language governing permissions and limitations
 // under the License.
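Because the concrete Vector class above implements slice() and iteration purely in terms of get() and length, a subclass only has to supply those two members. A minimal sketch under that assumption (EveryTen is illustrative, not part of this patch):

import { Vector } from './types';

class EveryTen extends Vector<number> {
    readonly length = 5;
    get(i: number) { return i >= 0 && i < this.length ? i * 10 : null; }
}

new EveryTen().slice(1, 4); // [10, 20, 30] -- materialized through get()
[...new EveryTen()];        // [0, 10, 20, 30, 40]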
-import { Vector } from './vector'; +import { Vector } from './types'; +import { TextDecoder } from 'text-encoding-utf-8'; +import { VirtualVector } from './vector/virtual'; -export class DictionaryVector extends Vector { - protected data: Vector; - protected keys: Vector; - constructor(index: Vector, dictionary: Vector) { +const decoder = new TextDecoder('utf-8'); + +export class Utf8Vector extends Vector { + readonly values: Vector; + constructor(argv: { values: Vector }) { super(); - this.keys = index; - this.data = dictionary; - this.length = index && index.length || 0; - } - index(index: number) { - return this.keys.get(index); - } - value(index: number) { - return this.data.get(index); + this.values = argv.values; } get(index: number) { - return this.value(this.index(index)); + const chars = this.getCodePoints(index); + return chars ? decoder.decode(chars) : null; } - concat(vector: DictionaryVector) { - return DictionaryVector.from(this, - this.length + vector.length, - this.keys.concat(vector.keys), - this.data - ); + getCodePoints(index: number) { + return this.values.get(index); } - *[Symbol.iterator]() { - let { data } = this; - for (const loc of this.keys) { - yield data.get(loc); - } + concat(...vectors: Vector[]): Vector { + return new VirtualVector(Array, this, ...vectors); } } diff --git a/js/src/types/vector/bool.ts b/js/src/types/vector/bool.ts new file mode 100644 index 0000000000000..b2eea81f87f05 --- /dev/null +++ b/js/src/types/vector/bool.ts @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { TypedVector } from './typed'; + +export interface BoolVector extends TypedVector { + set(index: number, value: boolean): void; +} + +export class BoolVector extends TypedVector { + static pack = pack; + get(index: number) { + return (this.data[index >> 3] & 1 << index % 8) !== 0; + } + set(index: number, value: boolean) { + if (index > -1 === false) { + return; + } else if (value) { + this.data[index >> 3] |= (1 << (index % 8)); + } else { + this.data[index >> 3] &= ~(1 << (index % 8)); + } + } +} + +export function pack(values: Iterable) { + let xs = [], n, i = 0; + let bit = 0, byte = 0; + for (const value of values) { + value && (byte |= 1 << bit); + if (++bit === 8) { + xs[i++] = byte; + byte = bit = 0; + } + } + if (i === 0 || bit > 0) { xs[i++] = byte; } + if (i % 8 && (n = i + 8 - i % 8)) { + do { xs[i] = 0; } while (++i < n); + } + return new Uint8Array(xs); +} diff --git a/js/src/types/vector/date.ts b/js/src/types/vector/date.ts new file mode 100644 index 0000000000000..82dc82e64021f --- /dev/null +++ b/js/src/types/vector/date.ts @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import { TypedVector } from './typed';
+
+export class DateVector extends TypedVector<Date, Uint32Array> {
+    get(index: number): Date {
+        return new Date(4294967296 * /* 2^32 */
+            (super.get(index * 2 + 1) as any) + /* high */
+            (super.get(index * 2) as any)       /* low */
+        );
+    }
+}
+
+(DateVector.prototype as any).stride = 2;
diff --git a/js/src/types/vector/long.ts b/js/src/types/vector/long.ts
new file mode 100644
index 0000000000000..de8eb0c13710b
--- /dev/null
+++ b/js/src/types/vector/long.ts
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import { TypedVector } from './typed';
+import { TypedArray } from '../types';
+import { flatbuffers } from 'flatbuffers';
+import Long = flatbuffers.Long;
+
+export class LongVector<TArray extends TypedArray> extends TypedVector<Long, TArray> {
+    get(index: number) {
+        return new Long(
+            super.get(index * 2) as any, /* low */
+            super.get(index * 2 + 1) as any /* high */
+        );
+    }
+}
+
+(LongVector.prototype as any).stride = 2;
+
+export class Int64Vector extends LongVector<Int32Array> {}
+export class Uint64Vector extends LongVector<Uint32Array> {}
\ No newline at end of file
diff --git a/js/src/types/vector/traits.ts b/js/src/types/vector/traits.ts
new file mode 100644
index 0000000000000..872c40b64d559
--- /dev/null
+++ b/js/src/types/vector/traits.ts
@@ -0,0 +1,69 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
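The two mixins defined below compose nullability and flatbuffers field metadata onto any vector class. A hedged sketch of the nullable side, built only from pieces this patch introduces (the composed class name is illustrative):

import { nullableMixin } from './traits';
import { Int32Vector } from './typed';
import { BoolVector } from './bool';

const NullableInt32Vector = nullableMixin(Int32Vector);
const v = new NullableInt32Vector({
    data: new Int32Array([1, 2, 3]),
    validity: BoolVector.pack([true, false, true]) // bitmap 0b101
});
v.get(0); // 1
v.get(1); // null -- validity bit 1 is unset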
+
+import { BoolVector } from './bool';
+import { Vector, Column } from '../types';
+import * as Schema_ from '../../format/Schema_generated';
+import * as Message_ from '../../format/Message_generated';
+import Type = Schema_.org.apache.arrow.flatbuf.Type;
+import Field = Schema_.org.apache.arrow.flatbuf.Field;
+import FieldNode = Message_.org.apache.arrow.flatbuf.FieldNode;
+
+type Ctor<TArgv> = new (argv: TArgv) => Vector;
+
+export const nullableMixin = <T extends Vector, TArgv extends object>(superclass: new (argv: TArgv) => T) =>
+    class extends (superclass as Ctor<TArgv>) {
+        readonly validity: Vector;
+        constructor(argv: TArgv & { validity: Uint8Array }) {
+            super(argv);
+            this.validity = new BoolVector({ data: argv.validity });
+        }
+        get(index: number) {
+            return this.validity.get(index) ? super.get(index) : null;
+        }
+    };
+
+export const fieldMixin = <T extends Vector, TArgv extends object>(superclass: new (argv: TArgv) => T) =>
+    class extends (superclass as Ctor<TArgv>) implements Column {
+        readonly field: Field;
+        readonly type: string;
+        readonly length: number;
+        readonly stride: number;
+        readonly nullable: boolean;
+        readonly nullCount: number;
+        readonly fieldNode: FieldNode;
+        constructor(argv: TArgv & { field: Field, fieldNode: FieldNode }) {
+            super(argv);
+            const { field, fieldNode } = argv;
+            this.field = field;
+            this.fieldNode = fieldNode;
+            this.nullable = field.nullable();
+            this.type = Type[field.typeType()];
+            this.nullCount = fieldNode.nullCount().low;
+            this.length = (fieldNode.length().low / this.stride) | 0;
+        }
+        get name() { return this.field.name()!; }
+        get metadata() {
+            const { field } = this, data = new Map<string, string>();
+            for (let entry, key, i = -1, n = field && field.customMetadataLength() | 0; ++i < n;) {
+                if ((entry = field.customMetadata(i)) && (key = entry.key()) != null) {
+                    data.set(key, entry.value()!);
+                }
+            }
+            return data;
+        }
+    };
diff --git a/js/src/types/vector/typed.ts b/js/src/types/vector/typed.ts
new file mode 100644
index 0000000000000..fc093f2cb5f2a
--- /dev/null
+++ b/js/src/types/vector/typed.ts
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import { Vector } from '../types';
+import { VirtualVector } from './virtual';
+import { TypedArray, TypedArrayConstructor } from '../types';
+
+export interface TypedVector<T = any, TArray extends TypedArray = TypedArray> {
+    slice(start?: number, end?: number): TArray;
+}
+
+export class TypedVector<T = any, TArray extends TypedArray = TypedArray> extends Vector<T> {
+    readonly data: TArray;
+    readonly stride: number;
+    readonly length: number;
+    constructor(argv: { data: TArray } | TArray) {
+        super();
+        const data = ArrayBuffer.isView(argv) ?
argv : argv.data; + this.length = ((this.data = data).length / this.stride) | 0; + } + get(index: number): T | null { + return this.data[index] as any; + } + concat(...vectors: Vector[]): Vector { + return new VirtualVector(this.data.constructor as TypedArrayConstructor, this, ...vectors); + } + slice(start?: number, end?: number) { + const { data, stride } = this, from = start! | 0; + const to = end === undefined ? data.length : Math.max(end | 0, from); + return data.subarray(Math.min(from, to) * stride | 0, to * stride | 0); + } +} + +(TypedVector.prototype as any).stride = 1; + +export class Int8Vector extends TypedVector {} +export class Int16Vector extends TypedVector {} +export class Int32Vector extends TypedVector {} +export class Uint8Vector extends TypedVector {} +export class Uint16Vector extends TypedVector {} +export class Uint32Vector extends TypedVector {} +export class Float32Vector extends TypedVector {} +export class Float64Vector extends TypedVector {} diff --git a/js/src/types/vector/virtual.ts b/js/src/types/vector/virtual.ts new file mode 100644 index 0000000000000..7f56012dc7d4b --- /dev/null +++ b/js/src/types/vector/virtual.ts @@ -0,0 +1,129 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { TypedVector } from './typed'; +import { Vector, Column, TypedArray, TypedArrayConstructor } from '../types'; + +export class VirtualVector implements Column { + readonly name: string; + readonly type: string; + readonly length: number; + readonly vectors: Vector[]; + readonly offsets: Uint32Array; + readonly ArrayType: ArrayConstructor | TypedArrayConstructor; + constructor(ArrayType: ArrayConstructor | TypedArrayConstructor, ...vectors: Vector[]) { + this.vectors = vectors; + this.ArrayType = ArrayType; + this.name = (vectors[0] as any).name; + this.type = (vectors[0] as any).type; + this.length = vectors.reduce((sum, vec) => sum + vec.length, 0); + this.offsets = Uint32Array.from(vectors.reduce((sums, vector, index) => [...sums, vector.length + sums[index]], [0])); + } + *[Symbol.iterator]() { + for (const vector of this.vectors) { + yield* vector; + } + } + get nullable() { + return (this.vectors as Column[]).some((vec) => vec.nullable); + } + get nullCount() { + return (this.vectors as Column[]).reduce((sum, v) => sum + v.nullCount | 0, 0); + } + get metadata() { + return new Map( + (this.vectors as Column[]).reduce((entries, v) => [ + ...entries, ...v.metadata.entries() + ], [] as [string, string][]) + ); + } + get(index: number) { + return findIndex(this.offsets, index) ? 
this.vectors[_vector].get(_offset) : null; + } + concat(...vectors: Vector[]) { + return new VirtualVector(this.ArrayType, ...this.vectors, ...vectors); + } + slice(begin?: number, end?: number) { + const ArrayType = this.ArrayType as any; + // clamp begin and end values between the virtual length + clampRange(this.length, begin!, end); + const from = _from, total = _total; + // find the start vector index and adjusted value index offset + if (!findIndex(this.offsets, from)) { return new ArrayType(0); } + const set = ArrayType === Array ? arraySet : typedArraySet as any; + let index = _vector, vectors = this.vectors as TypedVector[]; + let vector = vectors[index], source = vector.slice(_offset, _offset + total), target = source; + // Perf optimization: if the first slice contains all the values we're looking for, + // we don't have to copy values to a target Array. If we're slicing a TypedArray, + // this is a significant improvement as we avoid the memcpy 🎉 + if ((source.length / vector.stride | 0) < total) { + let vectorsLength = vectors.length; + let count = 0, length = 0, sources = []; + do { + sources.push(source); + length += source.length; + count += (source.length / vector.stride | 0); + } while ( + (count < total) && + (vector = vectors[index = (++index % vectorsLength)]) && + (source = vector.slice(0, Math.min(vector.length, total - count))) + ); + target = new ArrayType(length); + for (let i = -1, j = 0, n = sources.length; ++i < n;) { + j = set(sources[i], target, j); + } + } + return target; + } +} + +let _from = -1, _total = -1; +function clampRange(length: number, start: number, end?: number) { + let total = length, from = start || 0; + let to = end === end && typeof end == 'number' ? end : total; + if (to < 0) { to = total + to; } + if (from < 0) { from = total - (from * -1) % total; } + if (to < from) { from = to; to = start; } + _from = from; + _total = !isFinite(total = (to - from)) || total < 0 ? 0 : total; +} + +let _offset = -1, _vector = -1; +function findIndex(offsets: Uint32Array, index: number) { + let offset = 0, left = 0, middle = 0, right = offsets.length - 1; + while (index < offsets[right] && index >= (offset = offsets[left])) { + if (left + 1 === right) { + _vector = left; + _offset = index - offset; + return true; + } + middle = left + ((right - left) / 2) | 0; + index >= offsets[middle] ? (left = middle) : (right = middle); + } + return false; +} + +function arraySet(source: T[], target: T[], index: number) { + for (let i = 0, n = source.length; i < n;) { + target[index++] = source[i++]; + } + return index; +} + +function typedArraySet(source: TypedArray, target: TypedArray, index: number) { + return target.set(source, index) || index + source.length; +} diff --git a/js/src/vector/list.ts b/js/src/vector/list.ts deleted file mode 100644 index 7360d968b0250..0000000000000 --- a/js/src/vector/list.ts +++ /dev/null @@ -1,108 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -import { Vector } from './vector'; -import { TextDecoder } from 'text-encoding'; -import { IndexVector, BitVector, ValidityArgs } from './typed'; - -export class ListVectorBase extends Vector { - protected values: Vector; - protected offsets: IndexVector; - constructor(validity: ValidityArgs, values: Vector, offsets: IndexVector) { - super(); - this.values = values; - this.offsets = offsets; - validity && (this.validity = BitVector.from(validity)); - } - get(index: number) { - let batch, from, to, { offsets } = this; - if (!this.validity.get(index) || - /* return null if `to` is null */ - ((to = offsets.get(index + 1)) === null) || !( - /* - return null if `batch` is less than than 0. this check is placed - second to avoid creating the [from, batch] tuple if `to` is null - */ - ([from, batch] = offsets.get(index, true) as number[]) && batch > -1)) { - return null; - } - return this.values.slice(from, to, batch) as any; - } - concat(vector: ListVectorBase) { - return (this.constructor as typeof ListVectorBase).from(this, - this.length + vector.length, - this.validity.concat(vector.validity), - this.values.concat(vector.values), - this.offsets.concat(vector.offsets) - ); - } - *[Symbol.iterator]() { - let v, r1, r2, { values } = this; - let it = this.offsets[Symbol.iterator](); - let iv = this.validity[Symbol.iterator](); - while (!(v = iv.next()).done && !(r1 = it.next()).done && !(r2 = it.next()).done) { - yield !v.value ? null : values.slice(r1.value[0], r2.value, r1.value[1]) as any; - } - } -} - -export class ListVector extends ListVectorBase {} -export class Utf8Vector extends ListVectorBase { - protected static decoder = new TextDecoder(`utf-8`); - get(index: number) { - let chars = super.get(index) as any; - return chars ? Utf8Vector.decoder.decode(chars) : null; - } - *[Symbol.iterator]() { - let decoder = Utf8Vector.decoder; - for (const chars of super[Symbol.iterator]()) { - yield !chars ? null : decoder.decode(chars); - } - } -} - -export class FixedSizeListVector extends Vector { - protected size: number; - protected values: Vector; - constructor(size: number, validity: ValidityArgs, values: Vector) { - super(); - this.values = values; - this.size = Math.abs(size | 0) || 1; - validity && (this.validity = BitVector.from(validity)); - } - get(index: number) { - return !this.validity.get(index) ? null : this.values.slice( - this.size * index, this.size * (index + 1) - ) as T[]; - } - concat(vector: FixedSizeListVector) { - return FixedSizeListVector.from(this, - this.length + vector.length, - this.size, - this.validity.concat(vector.validity), - this.values.concat(vector.values) - ); - } - *[Symbol.iterator]() { - let v, i = -1; - let { size, length, values } = this; - let iv = this.validity[Symbol.iterator](); - while (!(v = iv.next()).done && ++i < length) { - yield !v.value ? 
null : values.slice(size * i, size * (i + 1)) as T[]; - } - } -} diff --git a/js/src/vector/typed.ts b/js/src/vector/typed.ts deleted file mode 100644 index b38812e07d065..0000000000000 --- a/js/src/vector/typed.ts +++ /dev/null @@ -1,326 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -import { Vector } from './vector'; -import { flatbuffers } from 'flatbuffers'; - -import Long = flatbuffers.Long; - -export type VArray = { - [k: number]: T; length: number; - constructor: VArrayCtor>; -}; - -export type VArrayCtor = { - readonly prototype: VArray; - BYTES_PER_ELEMENT?: number; - new(...args: any[]): VArray; -}; - -export class VirtualVector> extends Vector { - protected lists: TArrayType[]; - protected _arrayType: VArrayCtor; - public get arrayType() { return this._arrayType; } - constructor(...lists: TArrayType[]) { - super(); - this.lists = lists.filter(Boolean); - } - get(index: number): T { - /* inlined `findVirtual` impl */ - let rows, length, lists = this.lists; - for (let batch = -1; - (rows = lists[++batch]) && - (length = rows.length) <= index && - 0 <= (index -= length);) {} - return rows && -1 < index ? rows[index] : null; - } - protected range(from: number, total: number, batch?: number) { - /* inlined `findVirtual` impl */ - let rows, local = from, length; - let { lists, _arrayType } = this; - for (batch = (batch || 0) - 1; - (rows = lists[++batch]) && - (length = rows.length) <= local && - 0 <= (local -= length);) {} - if (rows && local > -1) { - let index = 0, listsLength = lists.length; - let set: any = Array.isArray(rows) ? arraySet : typedArraySet; - let slice = _arrayType['prototype']['subarray'] || _arrayType['prototype']['slice']; - let source = slice.call(rows, local, local + total), target = source; - // Perf optimization: if the first slice contains all the values we're looking for, - // we don't have to copy values to a target Array. 
If we're slicing a TypedArray, - // this is a significant improvement as we avoid the memcpy 🎉 - if (source.length < total) { - target = new _arrayType(total); - while ((index = set(source, target, index)) < total) { - rows = lists[batch = ((batch + 1) % listsLength)]; - source = slice.call(rows, 0, Math.min(rows.length, total - index)); - } - } - return target as any; - } - return new _arrayType(0); - } - *[Symbol.iterator]() { - let index = -1, { lists, length } = this; - for (let outer = -1, n = lists.length; ++outer < n;) { - let list = lists[outer] as any; - for (let inner = -1, k = list.length; ++index < length && ++inner < k;) { - yield list[inner]; - } - } - } -} - -export type ValidityArgs = Vector | Uint8Array; -export class BitVector extends VirtualVector { - static constant: Vector = new (class ValidVector extends Vector { - get() { return true; } - *[Symbol.iterator]() { - do { yield true; } while (true); - } - })(); - static from(src: any) { - return src instanceof BitVector ? src - : src === BitVector.constant ? src - : src instanceof Uint8Array ? new BitVector(src) - : src instanceof Array ? new BitVector(BitVector.pack(src)) - : src instanceof Vector ? new BitVector(BitVector.pack(src)) - : BitVector.constant as Vector; - } - static pack(values: Iterable) { - let xs = [], n, i = 0; - let bit = 0, byte = 0; - for (const value of values) { - value && (byte |= 1 << bit); - if (++bit === 8) { - xs[i++] = byte; - byte = bit = 0; - } - } - if (i === 0 || bit > 0) { xs[i++] = byte; } - if (i % 8 && (n = n = i + 8 - i % 8)) { - do { xs[i] = 0; } while (++i < n); - } - return new Uint8Array(xs); - } - constructor(...lists: Uint8Array[]) { - super(...lists); - this.length = this.lists.reduce((l, xs) => l + xs['length'], 0); - } - get(index: number) { - /* inlined `findVirtual` impl */ - let rows, length, lists = this.lists; - for (let batch = -1; - (rows = lists[++batch]) && - (length = rows.length) <= index && - 0 <= (index -= length);) {} - return !(!rows || index < 0 || (rows[index >> 3 | 0] & 1 << index % 8) === 0); - } - set(index: number, value: boolean) { - /* inlined `findVirtual` impl */ - let rows, length, lists = this.lists; - for (let batch = -1; - (rows = lists[++batch]) && - (length = rows.length) <= index && - 0 <= (index -= length);) {} - if (rows && index > -1) { - value - ? (rows[index >> 3 | 0] |= (1 << (index % 8))) - : (rows[index >> 3 | 0] &= ~(1 << (index % 8))); - } - } - concat(vector: BitVector) { - return new BitVector(...this.lists, ...vector.lists); - } - *[Symbol.iterator]() { - for (const byte of super[Symbol.iterator]()) { - for (let i = -1; ++i < 8;) { - yield (byte & 1 << i) !== 0; - } - } - } -} - -export class TypedVector extends VirtualVector { - constructor(validity: ValidityArgs, ...lists: TArrayType[]) { - super(...lists); - validity && (this.validity = BitVector.from(validity)); - } - concat(vector: TypedVector) { - return (this.constructor as typeof TypedVector).from(this, - this.length + vector.length, - this.validity.concat(vector.validity), - ...this.lists, ...vector.lists - ); - } -} - -export class DateVector extends TypedVector { - get(index: number) { - return !this.validity.get(index) ? null : new Date( - Math.pow(2, 32) * - super.get(2 * index + 1) + - super.get(2 * index) - ); - } - *[Symbol.iterator]() { - let v, low, high; - let it = super[Symbol.iterator](); - let iv = this.validity[Symbol.iterator](); - while (!(v = iv.next()).done && !(low = it.next()).done && !(high = it.next()).done) { - yield !v.value ? 
null : new Date(Math.pow(2, 32) * high.value + low.value); - } - } -} - -export class IndexVector extends TypedVector { - get(index: number, returnWithBatchIndex = false) { - /* inlined `findVirtual` impl */ - let rows, length, batch = -1, lists = this.lists; - for (; - (rows = lists[++batch]) && - (length = rows.length) <= index && - 0 <= (index -= length);) {} - return !returnWithBatchIndex - ? (rows && -1 < index ? rows[index + batch] : null) as number - : (rows && -1 < index ? [rows[index + batch], batch] : [0, -1]) as number[]; - } - *[Symbol.iterator]() { - // Alternate between iterating a tuple of [from, batch], and to. The from - // and to values are relative to the record batch they're defined in, so - // `ListVectorBase` needs to know the right batch to read. - let xs = new Int32Array(2), { lists } = this; - for (let i = -1, n = lists.length; ++i < n;) { - let list = lists[i] as any; - for (let j = -1, k = list.length - 1; ++j < k;) { - xs[1] = i; - xs[0] = list[j]; - yield xs; - yield list[j + 1]; - } - } - } -} - -export class ByteVector extends TypedVector { - get(index: number) { - return this.validity.get(index) ? super.get(index) : null; - } - *[Symbol.iterator]() { - let v, r, { validity } = this; - let it = super[Symbol.iterator](); - // fast path the case of no nulls - if (validity === BitVector.constant) { - yield* it; - } else { - let iv = validity[Symbol.iterator](); - while (!(v = iv.next()).done && !(r = it.next()).done) { - yield !v.value ? null : r.value; - } - } - } -} - -export class LongVector extends TypedVector { - get(index: number) { - return !this.validity.get(index) ? null : new Long( - super.get(index * 2), /* low */ - super.get(index * 2 + 1) /* high */ - ); - } - *[Symbol.iterator]() { - let v, low, high; - let it = super[Symbol.iterator](); - let iv = this.validity[Symbol.iterator](); - while (!(v = iv.next()).done && !(low = it.next()).done && !(high = it.next()).done) { - yield !v.value ? 
null : new Long(low.value, high.value); - } - } -} - -export class Int8Vector extends ByteVector {} -export class Int16Vector extends ByteVector {} -export class Int32Vector extends ByteVector {} -export class Int64Vector extends LongVector {} -export class Uint8Vector extends ByteVector {} -export class Uint16Vector extends ByteVector {} -export class Uint32Vector extends ByteVector {} -export class Uint64Vector extends LongVector {} -export class Float32Vector extends ByteVector {} -export class Float64Vector extends ByteVector {} - -LongVector.prototype.stride = 2; -(Vector.prototype as any).lists = []; -(Vector.prototype as any).validity = BitVector.constant; -(VirtualVector.prototype as any)._arrayType = Array; -(BitVector.prototype as any)._arrayType = Uint8Array; -(Int8Vector.prototype as any)._arrayType = Int8Array; -(Int16Vector.prototype as any)._arrayType = Int16Array; -(Int32Vector.prototype as any)._arrayType = Int32Array; -(Int64Vector.prototype as any)._arrayType = Int32Array; -(Uint8Vector.prototype as any)._arrayType = Uint8Array; -(Uint16Vector.prototype as any)._arrayType = Uint16Array; -(Uint32Vector.prototype as any)._arrayType = Uint32Array; -(Uint64Vector.prototype as any)._arrayType = Uint32Array; -(DateVector.prototype as any)._arrayType = Uint32Array; -(IndexVector.prototype as any)._arrayType = Int32Array; -(Float32Vector.prototype as any)._arrayType = Float32Array; -(Float64Vector.prototype as any)._arrayType = Float64Array; - -function arraySet(source: Array, target: Array, index: number) { - for (let i = 0, n = source.length; i < n;) { - target[index++] = source[i++]; - } - return index; -} - -function typedArraySet(source: TypedArray, target: TypedArray, index: number) { - return target.set(source, index) || index + source.length; -} - -// Rather than eat the iterator cost, we've inlined this function into the relevant functions -// function* findVirtual(index: number, lists: TList[], batch?: number) { -// let rows, length; -// for (batch = (batch || 0) - 1; -// (rows = lists[++batch]) && -// (length = rows.length) <= index && -// 0 <= (index -= length);) {} -// return rows && -1 < index ? yield [rows, index, batch] : null; -// } - -export type TypedArrayCtor = { - readonly prototype: T; - readonly BYTES_PER_ELEMENT: number; - new(length: number): T; - new(array: ArrayLike): T; - new(buffer: ArrayBufferLike, byteOffset?: number, length?: number): T; -}; - -export type FloatArray = Float32Array | Float64Array; -export type IntArray = Int8Array | Int16Array | Int32Array | Uint8ClampedArray | Uint8Array | Uint16Array | Uint32Array; - -export type TypedArray = ( - Int8Array | - Uint8Array | - Int16Array | - Int32Array | - Uint16Array | - Uint32Array | - Float32Array | - Float64Array | - Uint8ClampedArray); diff --git a/js/src/vector/vector.ts b/js/src/vector/vector.ts deleted file mode 100644 index 1f39f87cbefc8..0000000000000 --- a/js/src/vector/vector.ts +++ /dev/null @@ -1,91 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -import * as Schema_ from '../format/Schema_generated'; -export import Type = Schema_.org.apache.arrow.flatbuf.Type; -export import Field = Schema_.org.apache.arrow.flatbuf.Field; - -export function sliceToRangeArgs(length: number, start: number, end?: number) { - let total = length, from = start || 0; - let to = end === end && typeof end == 'number' ? end : total; - if (to < 0) { to = total + to; } - if (from < 0) { from = total - (from * -1) % total; } - if (to < from) { from = to; to = start; } - total = !isFinite(total = (to - from)) || total < 0 ? 0 : total; - return [from, total]; -} - -export class Vector implements Iterable { - static defaultName = ''; - static defaultProps = new Map(); - static defaultType = Type[Type.NONE]; - static create(field: Field, length: number, ...args: any[]) { - let vector = new this(...args), m; - vector.length = length; - vector.name = field.name(); - vector.type = Type[field.typeType()]; - if ((m = field.customMetadataLength()) > 0) { - let entry, i = 0, data = vector.props = new Map(); - do { - entry = field.customMetadata(i); - data[entry.key()] = entry.value(); - } while (++i < m); - } - return vector; - } - static from(source: Vector, length: number, ...args: any[]) { - let vector = new this(...args); - vector.length = length; - source.name !== Vector.defaultName && (vector.name = source.name); - source.type !== Vector.defaultType && (vector.type = source.type); - source.props !== Vector.defaultProps && (vector.props = source.props); - return vector; - } - public name: string; - public type: string; - public length: number; - public stride: number; - public props: Map; - protected validity: Vector; - get(index: number): T { return null; } - concat(vector: Vector) { return vector; } - slice(start?: number, end?: number, batch?: number) { - const { stride } = this; - const [offset, length] = sliceToRangeArgs( - stride * this.length, stride * (start || 0), stride * end - ); - return this.range(offset, length, batch); - } - protected range(index: number, length: number, batch?: number) { - const result = new Array(length); - for (let i = -1, n = this.length; ++i < length;) { - result[i] = this.get((i + index) % n) as any; - } - return result as Iterable; - } - *[Symbol.iterator]() { - for (let i = -1, n = this.length; ++i < n;) { - yield this.get(i); - } - } -} - -Vector.prototype.length = 0; -Vector.prototype.stride = 1; -Vector.prototype.name = Vector.defaultName; -Vector.prototype.type = Vector.defaultType; -Vector.prototype.props = Vector.defaultProps; diff --git a/js/test/Arrow.ts b/js/test/Arrow.ts index 3f29c5409ab26..722781db658dc 100644 --- a/js/test/Arrow.ts +++ b/js/test/Arrow.ts @@ -18,9 +18,9 @@ /* tslint:disable */ // Dynamically load an Ix target build based on command line arguments -const target = process.env.TEST_TARGET; -const format = process.env.TEST_MODULE; -const resolve = require('path').resolve; +const path = require('path'); +const target = process.env.TEST_TARGET!; +const format = process.env.TEST_MODULE!; // these are duplicated in the gulpfile :< const targets = [`es5`, `es2015`, `esnext`]; @@ -30,21 
+30,24 @@ function throwInvalidImportError(name: string, value: string, values: string[]) throw new Error('Unrecognized ' + name + ' \'' + value + '\'. Please run tests with \'--' + name + ' \''); } -if (!~targets.indexOf(target)) throwInvalidImportError('target', target, targets); -if (!~formats.indexOf(format)) throwInvalidImportError('module', format, formats); +let modulePath = ``; -let Arrow: any = require(resolve(`./targets/${target}/${format}/Arrow.js`)); +if (target === `ts` || target === `apache-arrow`) modulePath = target; +else if (!~targets.indexOf(target)) throwInvalidImportError('target', target, targets); +else if (!~formats.indexOf(format)) throwInvalidImportError('module', format, formats); +else modulePath = path.join(target, format); + +let Arrow: any = require(path.resolve(`./targets`, modulePath, `Arrow`)); import { Table as Table_, - readBuffers as readBuffers_, Vector as Vector_, - BitVector as BitVector_, + readBuffers as readBuffers_, + BoolVector as BoolVector_, + TypedVector as TypedVector_, ListVector as ListVector_, Utf8Vector as Utf8Vector_, DateVector as DateVector_, - IndexVector as IndexVector_, - TypedVector as TypedVector_, Int8Vector as Int8Vector_, Int16Vector as Int16Vector_, Int32Vector as Int32Vector_, @@ -61,14 +64,13 @@ import { } from '../src/Arrow'; export let Table = Arrow.Table as typeof Table_; -export let readBuffers = Arrow.readBuffers as typeof readBuffers_; export let Vector = Arrow.Vector as typeof Vector_; -export let BitVector = Arrow.BitVector as typeof BitVector_; +export let readBuffers = Arrow.readBuffers as typeof readBuffers_; +export let BoolVector = Arrow.BoolVector as typeof BoolVector_; +export let TypedVector = Arrow.TypedVector as typeof TypedVector_; export let ListVector = Arrow.ListVector as typeof ListVector_; export let Utf8Vector = Arrow.Utf8Vector as typeof Utf8Vector_; export let DateVector = Arrow.DateVector as typeof DateVector_; -export let IndexVector = Arrow.IndexVector as typeof IndexVector_; -export let TypedVector = Arrow.TypedVector as typeof TypedVector_; export let Int8Vector = Arrow.Int8Vector as typeof Int8Vector_; export let Int16Vector = Arrow.Int16Vector as typeof Int16Vector_; export let Int32Vector = Arrow.Int32Vector as typeof Int32Vector_; diff --git a/js/test/__snapshots__/table-tests.ts.snap b/js/test/__snapshots__/table-tests.ts.snap index a7fb9c5a64c24..401b992d9d7b5 100644 --- a/js/test/__snapshots__/table-tests.ts.snap +++ b/js/test/__snapshots__/table-tests.ts.snap @@ -118,19 +118,19 @@ Array [ `; exports[`dictionary file Arrow Table toString() prints a pretty Table 1`] = ` -" example-csv -Hermione,25,-53.235599517822266,40.231998443603516 - Severus,30,-62.22999954223633,3 - Harry,20,23,-100.23652648925781" +" example-csv +[\\"Hermione\\",25,{\\"0\\":-53.235599517822266,\\"1\\":40.231998443603516}] + [\\"Severus\\",30,{\\"0\\":-62.22999954223633,\\"1\\":3}] + [\\"Harry\\",20,{\\"0\\":23,\\"1\\":-100.23652648925781}]" `; exports[`dictionary file Arrow Table toString() prints an empty Table 1`] = `""`; exports[`dictionary file Arrow Table toString({ index: true }) prints a pretty Table with an Index column 1`] = ` -"Index, example-csv - 0, Hermione,25,-53.235599517822266,40.231998443603516 - 1, Severus,30,-62.22999954223633,3 - 2, Harry,20,23,-100.23652648925781" +"Index, example-csv + 0, [\\"Hermione\\",25,{\\"0\\":-53.235599517822266,\\"1\\":40.231998443603516}] + 1, [\\"Severus\\",30,{\\"0\\":-62.22999954223633,\\"1\\":3}] + 2, [\\"Harry\\",20,{\\"0\\":23,\\"1\\":-100.23652648925781}]" `; 
exports[`dictionary stream Arrow Table creates a Table from Arrow buffers 1`] = `"example-csv"`; @@ -251,19 +251,19 @@ Array [ `; exports[`dictionary stream Arrow Table toString() prints a pretty Table 1`] = ` -" example-csv -Hermione,25,-53.235599517822266,40.231998443603516 - Severus,30,-62.22999954223633,3 - Harry,20,23,-100.23652648925781" +" example-csv +[\\"Hermione\\",25,{\\"0\\":-53.235599517822266,\\"1\\":40.231998443603516}] + [\\"Severus\\",30,{\\"0\\":-62.22999954223633,\\"1\\":3}] + [\\"Harry\\",20,{\\"0\\":23,\\"1\\":-100.23652648925781}]" `; exports[`dictionary stream Arrow Table toString() prints an empty Table 1`] = `""`; exports[`dictionary stream Arrow Table toString({ index: true }) prints a pretty Table with an Index column 1`] = ` -"Index, example-csv - 0, Hermione,25,-53.235599517822266,40.231998443603516 - 1, Severus,30,-62.22999954223633,3 - 2, Harry,20,23,-100.23652648925781" +"Index, example-csv + 0, [\\"Hermione\\",25,{\\"0\\":-53.235599517822266,\\"1\\":40.231998443603516}] + 1, [\\"Severus\\",30,{\\"0\\":-62.22999954223633,\\"1\\":3}] + 2, [\\"Harry\\",20,{\\"0\\":23,\\"1\\":-100.23652648925781}]" `; exports[`dictionary2 file Arrow Table creates a Table from Arrow buffers 1`] = `"struct"`; @@ -353,17 +353,17 @@ Array [ `; exports[`dictionary2 file Arrow Table toString() prints a pretty Table 1`] = ` -" struct - a0fb47f9-f8fb-4403-a64a-786d7611f8ef,Airbus,1502880750,32.45663833618164,1.8712350130081177 -50fb46f4-fefa-42c1-919c-0121974cdd00,Boeing,1502880750,38.766666412353516,-4.181231498718262" +" struct + [\\"a0fb47f9-f8fb-4403-a64a-786d7611f8ef\\",\\"Airbus\\",1502880750,{\\"0\\":32.45663833618164,\\"1\\":1.8712350130081177}] +[\\"50fb46f4-fefa-42c1-919c-0121974cdd00\\",\\"Boeing\\",1502880750,{\\"0\\":38.766666412353516,\\"1\\":-4.181231498718262}]" `; exports[`dictionary2 file Arrow Table toString() prints an empty Table 1`] = `""`; exports[`dictionary2 file Arrow Table toString({ index: true }) prints a pretty Table with an Index column 1`] = ` -"Index, struct - 0, a0fb47f9-f8fb-4403-a64a-786d7611f8ef,Airbus,1502880750,32.45663833618164,1.8712350130081177 - 1, 50fb46f4-fefa-42c1-919c-0121974cdd00,Boeing,1502880750,38.766666412353516,-4.181231498718262" +"Index, struct + 0, [\\"a0fb47f9-f8fb-4403-a64a-786d7611f8ef\\",\\"Airbus\\",1502880750,{\\"0\\":32.45663833618164,\\"1\\":1.8712350130081177}] + 1, [\\"50fb46f4-fefa-42c1-919c-0121974cdd00\\",\\"Boeing\\",1502880750,{\\"0\\":38.766666412353516,\\"1\\":-4.181231498718262}]" `; exports[`multi_dictionary file Arrow Table creates a Table from Arrow buffers 1`] = `"struct"`; @@ -459,17 +459,17 @@ Array [ `; exports[`multi_dictionary file Arrow Table toString() prints a pretty Table 1`] = ` -" struct - a0fb47f9-f8fb-4403-a64a-786d7611f8ef,12345,Airbus,1502880750,32.45663833618164,1.8712350130081177 -50fb46f4-fefa-42c1-919c-0121974cdd00,67890,Boeing,1502880750,38.766666412353516,-4.181231498718262" +" struct + [\\"a0fb47f9-f8fb-4403-a64a-786d7611f8ef\\",\\"12345\\",\\"Airbus\\",1502880750,{\\"0\\":32.45663833618164,\\"1\\":1.8712350130081177}] +[\\"50fb46f4-fefa-42c1-919c-0121974cdd00\\",\\"67890\\",\\"Boeing\\",1502880750,{\\"0\\":38.766666412353516,\\"1\\":-4.181231498718262}]" `; exports[`multi_dictionary file Arrow Table toString() prints an empty Table 1`] = `""`; exports[`multi_dictionary file Arrow Table toString({ index: true }) prints a pretty Table with an Index column 1`] = ` -"Index, struct - 0, a0fb47f9-f8fb-4403-a64a-786d7611f8ef,12345,Airbus,1502880750,32.45663833618164,1.8712350130081177 - 1, 
50fb46f4-fefa-42c1-919c-0121974cdd00,67890,Boeing,1502880750,38.766666412353516,-4.181231498718262" +"Index, struct + 0, [\\"a0fb47f9-f8fb-4403-a64a-786d7611f8ef\\",\\"12345\\",\\"Airbus\\",1502880750,{\\"0\\":32.45663833618164,\\"1\\":1.8712350130081177}] + 1, [\\"50fb46f4-fefa-42c1-919c-0121974cdd00\\",\\"67890\\",\\"Boeing\\",1502880750,{\\"0\\":38.766666412353516,\\"1\\":-4.181231498718262}]" `; exports[`multipart count Arrow Table creates a Table from Arrow buffers 1`] = `"row_count"`; @@ -1373,47 +1373,47 @@ Array [ `; exports[`struct file Arrow Table toString() prints a pretty Table 1`] = ` -" struct_nullable - null - ,MhRNxD4 -137773603,3F9HBxK -410361374,aVd88fp - null - ,3loZrRf - null - null - , - , - null - ,78SLiRw - null - null - ,0ilsf82 - ,LjS9MbU - ," +" struct_nullable + null + [null,\\"MhRNxD4\\"] +[137773603,\\"3F9HBxK\\"] +[410361374,\\"aVd88fp\\"] + null + [null,\\"3loZrRf\\"] + null + null + [null,null] + [null,null] + null + [null,\\"78SLiRw\\"] + null + null + [null,\\"0ilsf82\\"] + [null,\\"LjS9MbU\\"] + [null,null]" `; exports[`struct file Arrow Table toString() prints an empty Table 1`] = `""`; exports[`struct file Arrow Table toString({ index: true }) prints a pretty Table with an Index column 1`] = ` -"Index, struct_nullable - 0, null - 1, ,MhRNxD4 - 2, 137773603,3F9HBxK - 3, 410361374,aVd88fp - 4, null - 5, ,3loZrRf - 6, null - 7, null - 8, , - 9, , - 10, null - 11, ,78SLiRw - 12, null - 13, null - 14, ,0ilsf82 - 15, ,LjS9MbU - 16, ," +"Index, struct_nullable + 0, null + 1, [null,\\"MhRNxD4\\"] + 2, [137773603,\\"3F9HBxK\\"] + 3, [410361374,\\"aVd88fp\\"] + 4, null + 5, [null,\\"3loZrRf\\"] + 6, null + 7, null + 8, [null,null] + 9, [null,null] + 10, null + 11, [null,\\"78SLiRw\\"] + 12, null + 13, null + 14, [null,\\"0ilsf82\\"] + 15, [null,\\"LjS9MbU\\"] + 16, [null,null]" `; exports[`struct stream Arrow Table creates a Table from Arrow buffers 1`] = `"struct_nullable"`; @@ -1771,45 +1771,45 @@ Array [ `; exports[`struct stream Arrow Table toString() prints a pretty Table 1`] = ` -" struct_nullable - null - ,MhRNxD4 -137773603,3F9HBxK -410361374,aVd88fp - null - ,3loZrRf - null - null - , - , - null - ,78SLiRw - null - null - ,0ilsf82 - ,LjS9MbU - ," +" struct_nullable + null + [null,\\"MhRNxD4\\"] +[137773603,\\"3F9HBxK\\"] +[410361374,\\"aVd88fp\\"] + null + [null,\\"3loZrRf\\"] + null + null + [null,null] + [null,null] + null + [null,\\"78SLiRw\\"] + null + null + [null,\\"0ilsf82\\"] + [null,\\"LjS9MbU\\"] + [null,null]" `; exports[`struct stream Arrow Table toString() prints an empty Table 1`] = `""`; exports[`struct stream Arrow Table toString({ index: true }) prints a pretty Table with an Index column 1`] = ` -"Index, struct_nullable - 0, null - 1, ,MhRNxD4 - 2, 137773603,3F9HBxK - 3, 410361374,aVd88fp - 4, null - 5, ,3loZrRf - 6, null - 7, null - 8, , - 9, , - 10, null - 11, ,78SLiRw - 12, null - 13, null - 14, ,0ilsf82 - 15, ,LjS9MbU - 16, ," +"Index, struct_nullable + 0, null + 1, [null,\\"MhRNxD4\\"] + 2, [137773603,\\"3F9HBxK\\"] + 3, [410361374,\\"aVd88fp\\"] + 4, null + 5, [null,\\"3loZrRf\\"] + 6, null + 7, null + 8, [null,null] + 9, [null,null] + 10, null + 11, [null,\\"78SLiRw\\"] + 12, null + 13, null + 14, [null,\\"0ilsf82\\"] + 15, [null,\\"LjS9MbU\\"] + 16, [null,null]" `; diff --git a/js/test/table-tests.ts b/js/test/table-tests.ts index c840299155af4..d0d70059e1561 100644 --- a/js/test/table-tests.ts +++ b/js/test/table-tests.ts @@ -23,7 +23,7 @@ for (let [name, ...buffers] of arrowTestConfigurations) { test(`creates a Table from Arrow 
buffers`, () => { expect.hasAssertions(); const table = Table.from(...buffers); - for (const vector of table.cols()) { + for (const vector of table.columns) { expect(vector.name).toMatchSnapshot(); expect(vector.type).toMatchSnapshot(); expect(vector.length).toMatchSnapshot(); @@ -35,7 +35,7 @@ for (let [name, ...buffers] of arrowTestConfigurations) { test(`vector iterators report the same values as get`, () => { expect.hasAssertions(); const table = Table.from(...buffers); - for (const vector of table.cols()) { + for (const vector of table.columns) { let i = -1, n = vector.length; for (let v of vector) { expect(++i).toBeLessThan(n); @@ -46,12 +46,13 @@ for (let [name, ...buffers] of arrowTestConfigurations) { }); test(`batch and Table Vectors report the same values`, () => { expect.hasAssertions(); - let rowsTotal = 0, table = Table.from(...buffers); + let rowsTotal = 0; + let table = Table.from(...buffers); for (let vectors of readBuffers(...buffers)) { let rowsNow = Math.max(...vectors.map((v) => v.length)); for (let vi = -1, vn = vectors.length; ++vi < vn;) { let v1 = vectors[vi]; - let v2 = table.getColumnAt(vi); + let v2 = table.columns[vi]; expect(v1.name).toEqual(v2.name); expect(v1.type).toEqual(v2.type); for (let i = -1, n = v1.length; ++i < n;) { @@ -64,15 +65,15 @@ for (let [name, ...buffers] of arrowTestConfigurations) { test(`enumerates Table rows`, () => { expect.hasAssertions(); const table = Table.from(...buffers); - for (const row of table.rows()) { - expect(row).toMatchSnapshot(); + for (const row of table) { + expect(row!.toObject()).toMatchSnapshot(); } }); test(`enumerates Table rows compact`, () => { expect.hasAssertions(); const table = Table.from(...buffers); - for (const row of table.rows(true)) { - expect(row).toMatchSnapshot(); + for (const row of table) { + expect(row!.toArray()).toMatchSnapshot(); } }); test(`toString() prints an empty Table`, () => { diff --git a/js/test/test-config.ts b/js/test/test-config.ts index b31ff11ad4173..89de1cc6c70c9 100644 --- a/js/test/test-config.ts +++ b/js/test/test-config.ts @@ -20,7 +20,7 @@ import * as path from 'path'; const arrowFormats = ['file', 'stream']; const arrowFileNames = ['simple', 'struct', 'dictionary', 'dictionary2', 'multi_dictionary']; const multipartArrows = ['count', 'latlong', 'origins']; -export let arrowTestConfigurations = []; +export let arrowTestConfigurations = [] as (string | Buffer)[][]; arrowTestConfigurations = arrowFormats.reduce((configs, format) => { return arrowFileNames.reduce((configs, name) => { diff --git a/js/test/vector-tests.ts b/js/test/vector-tests.ts index 0c9ef4404ed6a..0eca2327bff43 100644 --- a/js/test/vector-tests.ts +++ b/js/test/vector-tests.ts @@ -18,7 +18,7 @@ import { flatbuffers } from 'flatbuffers'; import Long = flatbuffers.Long; import { - BitVector, + BoolVector, TypedVector, Int64Vector, Uint64Vector, @@ -32,22 +32,34 @@ import { Float64Vector, } from './Arrow'; -const LongVectors = { Int64Vector, Uint64Vector }; -const ByteVectors = { Int8Vector, Int16Vector, Int32Vector, Uint8Vector, Uint16Vector, Uint32Vector, Float32Vector, Float64Vector }; +const LongVectors = { + Int64Vector: [Int64Vector, Int32Array], + Uint64Vector: [Uint64Vector, Uint32Array] +}; -const longVectors = toMap(LongVectors, Object.keys(LongVectors)); -const byteVectors = toMap(ByteVectors, Object.keys(ByteVectors)); +const TypedVectors = { + Int8Vector: [Int8Vector, Int8Array], + Int16Vector: [Int16Vector, Int16Array], + Int32Vector: [Int32Vector, Int32Array], + Uint8Vector: [Uint8Vector, 
Uint8Array], + Uint16Vector: [Uint16Vector, Uint16Array], + Uint32Vector: [Uint32Vector, Uint32Array], + Float32Vector: [Float32Vector, Float32Array], + Float64Vector: [Float64Vector, Float64Array] +}; + +const longVectors = toMap<[typeof TypedVector, any]>(LongVectors, Object.keys(LongVectors)); +const byteVectors = toMap<[typeof TypedVector, any]>(TypedVectors, Object.keys(TypedVectors)); const bytes = Array.from( { length: 5 }, () => Uint8Array.from( { length: 64 }, () => Math.random() * 255 | 0)); -describe(`BitVector`, () => { - const vector = new BitVector(new Uint8Array([27, 0, 0, 0, 0, 0, 0, 0])); +describe(`BoolVector`, () => { + const vector = new BoolVector({ data: new Uint8Array([27, 0, 0, 0, 0, 0, 0, 0]) }); const values = [true, true, false, true, true, false, false, false]; const n = values.length; - vector.length = 1; test(`gets expected values`, () => { let i = -1; while (++i < n) { @@ -62,11 +74,11 @@ describe(`BitVector`, () => { } }); test(`can set values to true and false`, () => { - const v = new BitVector(new Uint8Array([27, 0, 0, 0, 0, 0, 0, 0])); + const v = new BoolVector({ data: new Uint8Array([27, 0, 0, 0, 0, 0, 0, 0]) }); const expected1 = [true, true, false, true, true, false, false, false]; const expected2 = [true, true, true, true, true, false, false, false]; const expected3 = [true, true, false, false, false, false, true, true]; - function validate(expected) { + function validate(expected: boolean[]) { for (let i = -1; ++i < n;) { expect(v.get(i)).toEqual(expected[i]); } @@ -88,40 +100,42 @@ describe(`BitVector`, () => { validate(expected1); }); test(`packs 0 values`, () => { - expect(BitVector.pack([])).toEqual( + expect(BoolVector.pack([])).toEqual( new Uint8Array([0, 0, 0, 0, 0, 0, 0, 0])); }); test(`packs 3 values`, () => { - expect(BitVector.pack([ + expect(BoolVector.pack([ true, false, true ])).toEqual(new Uint8Array([5, 0, 0, 0, 0, 0, 0, 0])); }); test(`packs 8 values`, () => { - expect(BitVector.pack([ + expect(BoolVector.pack([ true, true, false, true, true, false, false, false ])).toEqual(new Uint8Array([27, 0, 0, 0, 0, 0, 0, 0])); }); test(`packs 25 values`, () => { - expect(BitVector.pack([ + expect(BoolVector.pack([ true, true, false, true, true, false, false, false, false, false, false, true, true, false, true, true, false ])).toEqual(new Uint8Array([27, 216, 0, 0, 0, 0, 0, 0])); }); test(`from with boolean Array packs values`, () => { - expect(BitVector.from([ - true, false, true - ]).slice()).toEqual(new Uint8Array([5, 0, 0, 0, 0, 0, 0, 0])); + expect(new BoolVector({ + data: BoolVector.pack([true, false, true]) + }).slice()).toEqual(new Uint8Array([5, 0, 0, 0, 0, 0, 0, 0])); }); }); -for (const [VectorName, VectorType] of longVectors) { - const ArrayType = VectorType.prototype.arrayType; +for (const [VectorName, [VectorType, ArrayType]] of longVectors) { describe(`${VectorName}`, () => { const values = concatTyped(ArrayType, ...bytes); - const bLists = bytes.map((b) => new ArrayType(b.buffer)); - const vector = new VectorType(null, ...bLists); - const n = vector.length = values.length * 0.5; + const vector = bytes + .map((b) => new VectorType({ + data: new ArrayType(b.buffer) + })) + .reduce((v: any, v2) => v.concat(v2)); + const n = values.length * 0.5; test(`gets expected values`, () => { let i = -1; while (++i < n) { @@ -161,13 +175,16 @@ for (const [VectorName, VectorType] of longVectors) { }); } -for (const [VectorName, VectorType] of byteVectors) { - const ArrayType = VectorType.prototype.arrayType; +for (const [VectorName, 
[VectorType, ArrayType]] of byteVectors) { describe(`${VectorName}`, () => { const values = concatTyped(ArrayType, ...bytes); - const bLists = bytes.map((b) => new ArrayType(b.buffer)); - const vector = new VectorType(null, ...bLists); - const n = vector.length = values.length; + const vector = bytes + .map((b) => new VectorType({ + data: new ArrayType(b.buffer) + })) + .reduce((v: any, v2) => v.concat(v2)); + + const n = values.length; test(`gets expected values`, () => { let i = -1; while (++i < n) { @@ -212,9 +229,9 @@ function toMap(entries: any, keys: string[]) { } function concatTyped(ArrayType: any, ...bytes: any[]) { - const BPM = ArrayType.BYTES_PER_ELEMENT; + const BPE = ArrayType.BYTES_PER_ELEMENT; return bytes.reduce((v, bytes) => { - const l = bytes.byteLength / BPM; + const l = bytes.byteLength / BPE; const a = new ArrayType(v.length + l); const b = new ArrayType(bytes.buffer); a.set(v); diff --git a/js/tsconfig/tsconfig.base.json b/js/tsconfig/tsconfig.base.json index 4a46ed1f0af9b..8b8210198960a 100644 --- a/js/tsconfig/tsconfig.base.json +++ b/js/tsconfig/tsconfig.base.json @@ -3,23 +3,33 @@ "include": ["../src/**/*.ts"], "compileOnSave": false, "compilerOptions": { - "lib": ["dom", "esnext", "esnext.asynciterable"], + + /* Basic stuff */ "moduleResolution": "node", + "lib": ["dom", "esnext", "esnext.asynciterable"], + + /* Control what is emitted */ + "declaration": true, + "noEmitOnError": true, + "removeComments": false, + "downlevelIteration": true, + + /* Create inline sourcemaps with sources */ "sourceMap": false, "inlineSources": true, "inlineSourceMap": true, - "declaration": true, - "skipLibCheck": true, + + /* The most restrictive settings possible */ + "strict": true, + "skipLibCheck": false, "importHelpers": true, "noEmitHelpers": true, - "noImplicitAny": false, - "noEmitOnError": false, - "noImplicitThis": true, + "noImplicitAny": true, "noUnusedLocals": true, - "removeComments": false, - "downlevelIteration": true, - "noImplicitUseStrict": true, - "preserveConstEnums": false, + "noImplicitReturns": true, + "allowUnusedLabels": false, + "noUnusedParameters": true, + "allowUnreachableCode": false, "noFallthroughCasesInSwitch": true, "forceConsistentCasingInFileNames": true } diff --git a/js/tsconfig/tsconfig.es2015.cls.json b/js/tsconfig/tsconfig.es2015.cls.json index 11ccc04d58375..fccacb349d023 100644 --- a/js/tsconfig/tsconfig.es2015.cls.json +++ b/js/tsconfig/tsconfig.es2015.cls.json @@ -4,7 +4,8 @@ "compilerOptions": { "target": "ES2015", "module": "es2015", + "declaration": false, "noEmitHelpers": true, - "importHelpers": false + "importHelpers": true } } diff --git a/js/tsconfig/tsconfig.es5.cls.json b/js/tsconfig/tsconfig.es5.cls.json index 55f7ea52cf362..6e6f213b4d6e5 100644 --- a/js/tsconfig/tsconfig.es5.cls.json +++ b/js/tsconfig/tsconfig.es5.cls.json @@ -4,6 +4,7 @@ "compilerOptions": { "target": "ES5", "module": "es2015", + "declaration": false, "noEmitHelpers": true, "importHelpers": false } diff --git a/js/tsconfig/tsconfig.esnext.cls.json b/js/tsconfig/tsconfig.esnext.cls.json index 009a5ac10d644..03206c9d77d38 100644 --- a/js/tsconfig/tsconfig.esnext.cls.json +++ b/js/tsconfig/tsconfig.esnext.cls.json @@ -4,7 +4,8 @@ "compilerOptions": { "target": "ESNEXT", "module": "es2015", + "declaration": false, "noEmitHelpers": true, - "importHelpers": false + "importHelpers": true } } From 82cd6e5158169eb3c782dba8991473e88a113181 Mon Sep 17 00:00:00 2001 From: dhirschf Date: Fri, 3 Nov 2017 17:51:52 -0400 Subject: [PATCH 010/177] ARROW-1764: [Python] Add 
-c conda-forge for Windows dev installation instructions

Author: dhirschf
Author: Korn, Uwe
Author: Uwe L. Korn

Closes #1277 from xhochy/ARROW-1764 and squashes the following commits:

63143bba [Uwe L. Korn] Merge pull request #4 from dhirschfeld/gflags
bdb6b3f1 [dhirschf] Clarification of gflags channel
27f08961 [Korn, Uwe] ARROW-1764: [Python] Add -c conda-forge for Windows dev installation instructions
---
 cpp/apidoc/Windows.md             | 6 +++++-
 python/doc/source/development.rst | 3 ++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/cpp/apidoc/Windows.md b/cpp/apidoc/Windows.md
index 774482ea1c4f3..e7e83f1946b56 100644
--- a/cpp/apidoc/Windows.md
+++ b/cpp/apidoc/Windows.md
@@ -41,9 +41,13 @@ conda config --add channels conda-forge
 Now, you can bootstrap a build environment
 
 ```shell
-conda create -n arrow-dev cmake git boost-cpp flatbuffers rapidjson cmake thrift-cpp snappy zlib brotli gflags lz4-c zstd
+conda create -n arrow-dev cmake git boost-cpp flatbuffers rapidjson cmake thrift-cpp snappy zlib brotli gflags lz4-c zstd -c conda-forge
 ```
 
+***Note:***
+> *Make sure to get the `conda-forge` build of `gflags`, as the
+  naming of the library differs from that in the `defaults` channel.*
+
 Activate the just-created conda environment with pre-installed packages from the
 previous step:
 
diff --git a/python/doc/source/development.rst b/python/doc/source/development.rst
index 3ca460d463a06..7ef6a722be1dc 100644
--- a/python/doc/source/development.rst
+++ b/python/doc/source/development.rst
@@ -257,7 +257,8 @@ First, starting from fresh clones of Apache Arrow and parquet-cpp:
 .. code-block:: shell
 
    conda create -n arrow-dev cmake git boost-cpp ^
-         flatbuffers snappy zlib brotli thrift-cpp rapidjson
+         flatbuffers snappy zlib brotli thrift-cpp rapidjson ^
+         -c conda-forge
 
 activate arrow-dev
 As one git housekeeping item, we must run this command in our Arrow clone:

From 5d665762cd8c6ebbe94ce39b435a63ca4cf15967 Mon Sep 17 00:00:00 2001
From: Brian Hulette
Date: Fri, 3 Nov 2017 20:55:27 -0400
Subject: [PATCH 011/177] ARROW-1727: [Format] Expand Arrow streaming format to
 permit deltas / additions to existing dictionaries

Add an `isDelta` flag to the `DictionaryBatch` to allow for dictionary
modifications mid-stream, update documentation.

Author: Brian Hulette

Closes #1257 from TheNeuralBit/ARROW-1727 and squashes the following commits:

c69a5539 [Brian Hulette] Documentation tweaks
3dff0a9c [Brian Hulette] Add isDelta flag to DictionaryBatch, update documentation
---
 format/IPC.md      | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 format/Layout.md   |  6 +++---
 format/Message.fbs | 10 +++++++---
 3 files changed, 54 insertions(+), 7 deletions(-)

diff --git a/format/IPC.md b/format/IPC.md
index 2f79031443b17..f3b48854c2072 100644
--- a/format/IPC.md
+++ b/format/IPC.md
@@ -67,7 +67,9 @@ We provide a streaming format for record batches. It is presented as a
 sequence of encapsulated messages, each of which follows the format above. The
 schema comes first in the stream, and it is the same for all of the record
 batches that follow. If any fields in the schema are dictionary-encoded, one or more
-`DictionaryBatch` messages will follow the schema.
+`DictionaryBatch` messages will be included. `DictionaryBatch` and
+`RecordBatch` messages may be interleaved, but before any dictionary key is used
+in a `RecordBatch` it should be defined in a `DictionaryBatch`.
 
 ```
 <SCHEMA>
@@ -76,6 +78,10 @@ that follow. If any fields in the schema are dictionary-encoded, one or more
 <DICTIONARY k - 1>
 <RECORD BATCH 0>
 ...
+<DICTIONARY x DELTA>
+...
+<DICTIONARY y DELTA>
+...
 <RECORD BATCH n - 1>
 <EOS [optional]>
 ```
@@ -109,6 +115,10 @@ Schematically we have:
 <magic number "ARROW1">
 ```
 
+In the file format, there is no requirement that dictionary keys should be
+defined in a `DictionaryBatch` before they are used in a `RecordBatch`, as long
+as the keys are defined somewhere in the file.
+
 ### RecordBatch body structure
 
 The `RecordBatch` metadata contains a depth-first (pre-order) flattened set of
@@ -181,6 +191,7 @@ the dictionaries can be properly interpreted.
 table DictionaryBatch {
   id: long;
   data: RecordBatch;
+  isDelta: boolean = false;
 }
 ```
 
@@ -189,6 +200,38 @@ in the schema, so that dictionaries can even be used for multiple fields.
 See the [Physical Layout][4] document for more about the semantics of
 dictionary-encoded data.
 
+The dictionary `isDelta` flag allows dictionary batches to be modified
+mid-stream. A dictionary batch with `isDelta` set indicates that its vector
+should be concatenated with those of any previous batches with the same `id`. A
+stream which encodes one column, the list of strings
+`["A", "B", "C", "B", "D", "C", "E", "A"]`, with a delta dictionary batch could
+take the form:
+
+```
+<SCHEMA>
+<DICTIONARY 0>
+(0) "A"
+(1) "B"
+(2) "C"
+
+<RECORD BATCH 0>
+0
+1
+2
+1
+
+<DICTIONARY 0 DELTA>
+(3) "D"
+(4) "E"
+
+<RECORD BATCH 1>
+3
+2
+4
+0
+EOS
+```
+
 ### Tensor (Multi-dimensional Array) Message Format
 
 The `Tensor` message type provides a way to write a multidimensional array of
diff --git a/format/Layout.md b/format/Layout.md
index ebf93821aab24..963202f9fb77a 100644
--- a/format/Layout.md
+++ b/format/Layout.md
@@ -615,9 +615,9 @@ the types array indicates that a slot contains a different type at the index
 ## Dictionary encoding
 
 When a field is dictionary encoded, the values are represented by an array of Int32 representing the index of the value in the dictionary.
-The Dictionary is received as a DictionaryBatch whose id is referenced by a dictionary attribute defined in the metadata ([Message.fbs][7]) in the Field table.
-The dictionary has the same layout as the type of the field would dictate. Each entry in the dictionary can be accessed by its index in the DictionaryBatch.
-When a Schema references a Dictionary id, it must send a DictionaryBatch for this id before any RecordBatch.
+The Dictionary is received as one or more DictionaryBatches with the id referenced by a dictionary attribute defined in the metadata ([Message.fbs][7]) in the Field table.
+The dictionary has the same layout as the type of the field would dictate. Each entry in the dictionary can be accessed by its index in the DictionaryBatches.
+When a Schema references a Dictionary id, it must send at least one DictionaryBatch for this id.
 
 As an example, you could have the following data:
 ```
diff --git a/format/Message.fbs b/format/Message.fbs
index f4a95713cea93..830718139d88c 100644
--- a/format/Message.fbs
+++ b/format/Message.fbs
@@ -61,16 +61,20 @@ table RecordBatch {
   buffers: [Buffer];
 }
 
-/// ----------------------------------------------------------------------
 /// For sending dictionary encoding information. Any Field can be
 /// dictionary-encoded, but in this case none of its children may be
 /// dictionary-encoded.
-/// There is one vector / column per dictionary
-///
+/// There is one vector / column per dictionary, but that vector / column
+/// may be spread across multiple dictionary batches by using the isDelta
+/// flag.
 table DictionaryBatch {
   id: long;
   data: RecordBatch;
+
+  /// If isDelta is true, the values in the dictionary are to be appended to a
+  /// dictionary with the indicated id.
+  isDelta: bool = false;
 }
 
 /// ----------------------------------------------------------------------

From b9a2ce9b277c7938775d51b919b4a6464be7a66f Mon Sep 17 00:00:00 2001
From: "Korn, Uwe"
Date: Sat, 4 Nov 2017 14:24:12 +0100
Subject: [PATCH 012/177] ARROW-1765: [Doc] Use dependencies from conda in C++
 docker build

Author: Korn, Uwe

Closes #1278 from xhochy/ARROW-1765 and squashes the following commits:

f29ffaf [Korn, Uwe] ARROW-1765: [Doc] Use dependencies from conda in C++ docker build
---
 dev/gen_apidocs/Dockerfile          | 15 +++++----------
 dev/gen_apidocs/create_documents.sh |  8 ++++++--
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/dev/gen_apidocs/Dockerfile b/dev/gen_apidocs/Dockerfile
index 0b2844cc8454b..eaeb548184d61 100644
--- a/dev/gen_apidocs/Dockerfile
+++ b/dev/gen_apidocs/Dockerfile
@@ -15,8 +15,6 @@
 # limitations under the License.
 #
 FROM ubuntu:14.04
-ADD . /apache-arrow
-WORKDIR /apache-arrow
 # Prerequisites for apt-add-repository
 RUN apt-get update && apt-get install -y \
     software-properties-common python-software-properties
@@ -34,14 +32,6 @@ RUN wget -O /tmp/miniconda.sh \
     https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
     bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \
     rm /tmp/miniconda.sh
-# C++ dependencies
-RUN /home/ubuntu/miniconda/bin/conda install -c conda-forge \
-    boost-cpp \
-    doxygen \
-    maven \
-    cmake \
-    zlib \
-    thrift-cpp
 # C_Glib dependencies
 RUN apt-get install -y \
     libgtk2.0-dev \
@@ -69,6 +59,7 @@ RUN /home/ubuntu/miniconda/bin/conda create -y -q -n pyarrow-dev \
     six \
     setuptools \
     # C++
+    boost-cpp \
     cmake \
     flatbuffers \
     rapidjson \
@@ -79,5 +70,9 @@ RUN /home/ubuntu/miniconda/bin/conda create -y -q -n pyarrow-dev \
     jemalloc \
     lz4-c \
     zstd \
+    doxygen \
+    maven \
     -c conda-forge
+ADD . /apache-arrow
+WORKDIR /apache-arrow
 CMD arrow/dev/gen_apidocs/create_documents.sh
diff --git a/dev/gen_apidocs/create_documents.sh b/dev/gen_apidocs/create_documents.sh
index afbe041506d85..762b85222bd4d 100755
--- a/dev/gen_apidocs/create_documents.sh
+++ b/dev/gen_apidocs/create_documents.sh
@@ -16,6 +16,8 @@
 # limitations under the License.
 #
 
+set -ex
+
 # Set up environment and output directory for C++ libraries
 cd /apache-arrow
 rm -rf dist
@@ -25,8 +27,6 @@ export ARROW_HOME=$(pwd)/dist
 export PARQUET_HOME=$(pwd)/dist
 CONDA_BASE=/home/ubuntu/miniconda
 export LD_LIBRARY_PATH=$(pwd)/dist/lib:${CONDA_BASE}/lib:${LD_LIBRARY_PATH}
-export THRIFT_HOME=${CONDA_BASE}
-export BOOST_ROOT=${CONDA_BASE}
 export PATH=${CONDA_BASE}/bin:${PATH}
 
 # Prepare the asf-site before copying api docs
@@ -41,6 +41,10 @@ popd
 # Make Python documentation (Depends on C++ )
 # Build Arrow C++
 source activate pyarrow-dev
+
+export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX
+export PARQUET_BUILD_TOOLCHAIN=$CONDA_PREFIX
+
 rm -rf arrow/cpp/build
 mkdir arrow/cpp/build
 pushd arrow/cpp/build

From fc7104fc4f42d77525fe15bb221bf80c181fc1d4 Mon Sep 17 00:00:00 2001
From: "Korn, Uwe"
Date: Sat, 4 Nov 2017 15:14:20 -0400
Subject: [PATCH 013/177] ARROW-1742: C++: clang-format is not detected
 correctly on OSX anymore

Author: Korn, Uwe

Closes #1281 from xhochy/ARROW-1742 and squashes the following commits:

89e9a767 [Korn, Uwe] ARROW-1742: C++: clang-format is not detected correctly on OSX anymore
---
 cpp/CMakeLists.txt                     |  7 ++--
 cpp/build-support/run_clang_format.py  |  2 +-
 cpp/cmake_modules/FindClangTools.cmake | 51 +++++++++++++++++++++-----
 3 files changed, 45 insertions(+), 15 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index d8dc5df88b4a4..5f0c431d54aca 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -48,6 +48,7 @@ set(ARROW_ABI_VERSION "${ARROW_SO_VERSION}.0.0")
 
 set(BUILD_SUPPORT_DIR "${CMAKE_SOURCE_DIR}/build-support")
 
+set(CLANG_FORMAT_VERSION "4.0")
 find_package(ClangTools)
 if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1" OR CLANG_TIDY_FOUND)
   # Generate a Clang compile_commands.json "compilation database" file for use
@@ -435,11 +436,9 @@ endif (UNIX)
 # "make format" and "make check-format" targets
 ############################################################
 
-set(CLANG_FORMAT_VERSION 4.0)
-
 # runs clang format and updates files in place.
add_custom_target(format ${BUILD_SUPPORT_DIR}/run_clang_format.py - ${CLANG_FORMAT_VERSION} + ${CLANG_FORMAT_BIN} ${BUILD_SUPPORT_DIR}/clang_format_exclusions.txt ${CMAKE_CURRENT_SOURCE_DIR}/src) @@ -447,7 +446,7 @@ add_custom_target(format ${BUILD_SUPPORT_DIR}/run_clang_format.py # TODO(wesm): Make this work in run_clang_format.py add_custom_target(check-format ${BUILD_SUPPORT_DIR}/run_clang_format.py - ${CLANG_FORMAT_VERSION} + ${CLANG_FORMAT_BIN} ${BUILD_SUPPORT_DIR}/clang_format_exclusions.txt ${CMAKE_CURRENT_SOURCE_DIR}/src 1) diff --git a/cpp/build-support/run_clang_format.py b/cpp/build-support/run_clang_format.py index fcf39ecc6a5f9..6dec34bd09afe 100755 --- a/cpp/build-support/run_clang_format.py +++ b/cpp/build-support/run_clang_format.py @@ -27,7 +27,7 @@ sys.argv[0]) sys.exit(1) -CLANG_FORMAT = 'clang-format-{0}'.format(sys.argv[1]) +CLANG_FORMAT = sys.argv[1] EXCLUDE_GLOBS_FILENAME = sys.argv[2] SOURCE_DIR = sys.argv[3] diff --git a/cpp/cmake_modules/FindClangTools.cmake b/cpp/cmake_modules/FindClangTools.cmake index 0e9430ba29195..e9221ff22dc1f 100644 --- a/cpp/cmake_modules/FindClangTools.cmake +++ b/cpp/cmake_modules/FindClangTools.cmake @@ -49,16 +49,47 @@ else() message("clang-tidy found at ${CLANG_TIDY_BIN}") endif() -find_program(CLANG_FORMAT_BIN - NAMES clang-format-4.0 - clang-format-3.9 - clang-format-3.8 - clang-format-3.7 - clang-format-3.6 - clang-format - PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin - NO_DEFAULT_PATH -) +if (CLANG_FORMAT_VERSION) + find_program(CLANG_FORMAT_BIN + NAMES clang-format-${CLANG_FORMAT_VERSION} + PATHS + ${ClangTools_PATH} + $ENV{CLANG_TOOLS_PATH} + /usr/local/bin /usr/bin + NO_DEFAULT_PATH + ) + + # If not found yet, search alternative locations + if (("${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND") AND APPLE) + # Homebrew ships older LLVM versions in /usr/local/opt/llvm@version/ + STRING(REGEX REPLACE "^([0-9]+)\\.[0-9]+" "\\1" CLANG_FORMAT_MAJOR_VERSION "${CLANG_FORMAT_VERSION}") + STRING(REGEX REPLACE "^[0-9]+\\.([0-9]+)" "\\1" CLANG_FORMAT_MINOR_VERSION "${CLANG_FORMAT_VERSION}") + if ("${CLANG_FORMAT_MINOR_VERSION}" STREQUAL "0") + find_program(CLANG_FORMAT_BIN + NAMES clang-format + PATHS /usr/local/opt/llvm@${CLANG_FORMAT_MAJOR_VERSION}/bin + NO_DEFAULT_PATH + ) + else() + find_program(CLANG_FORMAT_BIN + NAMES clang-format + PATHS /usr/local/opt/llvm@${CLANG_FORMAT_VERSION}/bin + NO_DEFAULT_PATH + ) + endif() + endif() +else() + find_program(CLANG_FORMAT_BIN + NAMES clang-format-4.0 + clang-format-3.9 + clang-format-3.8 + clang-format-3.7 + clang-format-3.6 + clang-format + PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin + NO_DEFAULT_PATH + ) +endif() if ( "${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND" ) set(CLANG_FORMAT_FOUND 0) From 62190d7a5201cad6ae0b26d790942ffc8861eee9 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Sat, 4 Nov 2017 16:56:27 -0400 Subject: [PATCH 014/177] ARROW-1756: [Python] Fix large file read/write error This is the part of [ARROW-1756](https://issues.apache.org/jira/browse/ARROW-1756). 
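For illustration only, a minimal way to exercise this fix is the slow roundtrip
test added below in `test_feather.py` (run with `--runslow`); the output path
here is just an example:

```python
# Roughly mirrors the added test_large_dataframe: ~400 million int64
# values is more than 2 GB of data, which previously failed because
# read()/write() were issued with the full byte count instead of
# being chunked to ARROW_MAX_IO_CHUNKSIZE.
import numpy as np
import pandas as pd
from pyarrow.feather import write_feather, read_feather

df = pd.DataFrame({'A': np.arange(400000000)})
write_feather(df, '/tmp/big.feather')   # example path
assert read_feather('/tmp/big.feather').equals(df)
```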
Author: Licht-T Author: Wes McKinney Closes #1276 from Licht-T/fix-large-file-read-write-error and squashes the following commits: e21964a3 [Wes McKinney] Break in read IO loop when reaching EOF 8a68756b [Wes McKinney] Minor code tweaks, fix clang documentation warnings 81c19721 [Licht-T] TST: Add test for the large file read/write 9b71afee [Licht-T] ENH: Convert errno to string error message fbb7eea6 [Licht-T] BUG: Fix large file read/write error --- cpp/src/arrow/buffer.h | 2 +- cpp/src/arrow/compare.h | 10 ++--- cpp/src/arrow/io/file.cc | 66 +++++++++++++++++++++++----- python/pyarrow/tests/conftest.py | 14 +++++- python/pyarrow/tests/test_feather.py | 9 +++- 5 files changed, 81 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 8e989064be4e1..7c5f6174fa6e9 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -340,7 +340,7 @@ Status AllocateResizableBuffer(MemoryPool* pool, const int64_t size, #ifndef ARROW_NO_DEPRECATED_API /// \brief Create Buffer referencing std::string memory -/// \deprecated Since 0.8.0 +/// \note Deprecated since 0.8.0 /// /// Warning: string instance must stay alive /// diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h index 27176ed864cdd..df3386e4bfc19 100644 --- a/cpp/src/arrow/compare.h +++ b/cpp/src/arrow/compare.h @@ -33,27 +33,27 @@ class Tensor; #ifndef ARROW_NO_DEPRECATED_API /// Returns true if the arrays are exactly equal -/// \deprecated Since 0.8.0 +/// \note Deprecated since 0.8.0 Status ARROW_EXPORT ArrayEquals(const Array& left, const Array& right, bool* are_equal); -/// \deprecated Since 0.8.0 +/// \note Deprecated since 0.8.0 Status ARROW_EXPORT TensorEquals(const Tensor& left, const Tensor& right, bool* are_equal); /// Returns true if the arrays are approximately equal. 
For non-floating point
 /// types, this is equivalent to ArrayEquals(left, right)
-/// \deprecated Since 0.8.0
+/// \note Deprecated since 0.8.0
 Status ARROW_EXPORT ArrayApproxEquals(const Array& left, const Array& right,
                                       bool* are_equal);
 
 /// Returns true if indicated equal-length segment of arrays is exactly equal
-/// \deprecated Since 0.8.0
+/// \note Deprecated since 0.8.0
 Status ARROW_EXPORT ArrayRangeEquals(const Array& left, const Array& right,
                                      int64_t start_idx, int64_t end_idx,
                                      int64_t other_start_idx, bool* are_equal);
 
 /// Returns true if the type metadata are exactly equal
-/// \deprecated Since 0.8.0
+/// \note Deprecated since 0.8.0
 Status ARROW_EXPORT TypeEquals(const DataType& left, const DataType& right,
                                bool* are_equal);
 #endif
diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc
index 74c6c09e6219b..057cad1111685 100644
--- a/cpp/src/arrow/io/file.cc
+++ b/cpp/src/arrow/io/file.cc
@@ -22,6 +22,21 @@
 
 #define _FILE_OFFSET_BITS 64
 
+// define max read/write count
+#if defined(_MSC_VER)
+#define ARROW_MAX_IO_CHUNKSIZE INT32_MAX
+#else
+
+#ifdef __APPLE__
+// due to a macOS bug, we need to cap the read/write size
+#define ARROW_MAX_IO_CHUNKSIZE INT32_MAX
+#else
+// see notes on Linux read/write manpage
+#define ARROW_MAX_IO_CHUNKSIZE 0x7ffff000
+#endif
+
+#endif
+
 #include "arrow/io/file.h"
 
 #if _WIN32 || _WIN64
@@ -238,39 +253,68 @@ static inline Status FileSeek(int fd, int64_t pos) {
   return Status::OK();
 }
 
-static inline Status FileRead(int fd, uint8_t* buffer, int64_t nbytes,
+static inline Status FileRead(const int fd, uint8_t* buffer, const int64_t nbytes,
                               int64_t* bytes_read) {
 #if defined(_MSC_VER)
-  if (nbytes > INT32_MAX) {
+  if (nbytes > ARROW_MAX_IO_CHUNKSIZE) {
     return Status::IOError("Unable to read > 2GB blocks yet");
   }
   *bytes_read = static_cast<int64_t>(_read(fd, buffer, static_cast<unsigned int>(nbytes)));
 #else
-  *bytes_read = static_cast<int64_t>(read(fd, buffer, static_cast<size_t>(nbytes)));
+  *bytes_read = 0;
+
+  while (*bytes_read != -1 && *bytes_read < nbytes) {
+    int64_t chunksize =
+        std::min(static_cast<int64_t>(ARROW_MAX_IO_CHUNKSIZE), nbytes - *bytes_read);
+    int64_t ret = static_cast<int64_t>(
+        read(fd, buffer + *bytes_read, static_cast<size_t>(chunksize)));
+
+    if (ret != -1) {
+      *bytes_read += ret;
+      if (ret < chunksize) {
+        // EOF
+        break;
+      }
+    } else {
+      *bytes_read = ret;
+    }
+  }
 #endif
 
   if (*bytes_read == -1) {
-    // TODO(wesm): errno to string
-    return Status::IOError("Error reading bytes from file");
+    return Status::IOError(std::string("Error reading bytes from file: ") +
+                           std::string(strerror(errno)));
  }
   return Status::OK();
 }
 
-static inline Status FileWrite(int fd, const uint8_t* buffer, int64_t nbytes) {
-  int ret;
+static inline Status FileWrite(const int fd, const uint8_t* buffer,
+                               const int64_t nbytes) {
+  int ret = 0;
 #if defined(_MSC_VER)
-  if (nbytes > INT32_MAX) {
+  if (nbytes > ARROW_MAX_IO_CHUNKSIZE) {
     return Status::IOError("Unable to write > 2GB blocks to file yet");
   }
   ret = static_cast<int>(_write(fd, buffer, static_cast<unsigned int>(nbytes)));
 #else
-  ret = static_cast<int>(write(fd, buffer, static_cast<size_t>(nbytes)));
+  int64_t bytes_written = 0;
+
+  while (ret != -1 && bytes_written < nbytes) {
+    int64_t chunksize =
+        std::min(static_cast<int64_t>(ARROW_MAX_IO_CHUNKSIZE), nbytes - bytes_written);
+    ret = static_cast<int>(
+        write(fd, buffer + bytes_written, static_cast<size_t>(chunksize)));
+
+    if (ret != -1) {
+      bytes_written += ret;
+    }
+  }
 #endif
 
   if (ret == -1) {
-    // TODO(wesm): errno to string
-    return Status::IOError("Error writing bytes to file");
+    return Status::IOError(std::string("Error writing bytes to file: ") +
+
std::string(strerror(errno))); } return Status::OK(); } diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index c6bd6c9b3a2d7..e27682232a22d 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -from pytest import skip +from pytest import skip, mark groups = [ @@ -70,6 +70,18 @@ def pytest_addoption(parser): default=False, help=('Run only the {0} test group'.format(group))) + parser.addoption('--runslow', action='store_true', + default=False, help='run slow tests') + + +def pytest_collection_modifyitems(config, items): + if not config.getoption('--runslow'): + skip_slow = mark.skip(reason='need --runslow option to run') + + for item in items: + if 'slow' in item.keywords: + item.add_marker(skip_slow) + def pytest_runtest_setup(item): only_set = False diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 9e7fc8863e759..b0764fdec1768 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -50,7 +50,7 @@ def tearDown(self): pass def test_file_not_exist(self): - with self.assertRaises(pa.ArrowIOError): + with pytest.raises(pa.ArrowIOError): FeatherReader('test_invalid_file') def _get_null_counts(self, path, columns=None): @@ -98,7 +98,7 @@ def _assert_error_on_write(self, df, exc, path=None): def f(): write_feather(df, path) - self.assertRaises(exc, f) + pytest.raises(exc, f) def test_num_rows_attr(self): df = pd.DataFrame({'foo': [1, 2, 3, 4, 5]}) @@ -466,3 +466,8 @@ def test_unsupported(self): # non-strings df = pd.DataFrame({'a': ['a', 1, 2.0]}) self._assert_error_on_write(df, ValueError) + + @pytest.mark.slow + def test_large_dataframe(self): + df = pd.DataFrame({'A': np.arange(400000000)}) + self._check_pandas_roundtrip(df) From b513c8d2047c6ba7dea0530f436ca80f74809530 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 5 Nov 2017 12:52:38 +0100 Subject: [PATCH 015/177] ARROW-1762: [C++] Add note to readme about need to set LC_ALL on some Linux systems Author: Wes McKinney Closes #1284 from wesm/ARROW-1762 and squashes the following commits: b71cf40 [Wes McKinney] Add note to readme about need to set LC_ALL on some Linux systems --- cpp/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpp/README.md b/cpp/README.md index 60383535b1596..2034d29fae324 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -69,6 +69,14 @@ Simple release build: Detailed unit test logs will be placed in the build directory under `build/test-logs`. +On some Linux distributions, running the test suite might require setting an +explicit locale. If you see any locale-related errors, try setting the +environment variable (which requires the `locales` package or equivalent): + +``` +export LC_ALL="en_US.UTF-8" +``` + ### Statically linking to Arrow on Windows The Arrow headers on Windows static library builds (enabled by the CMake From ea4a8f5a01c0c028bbec1e199ca70efaffbf068b Mon Sep 17 00:00:00 2001 From: Licht-T Date: Sun, 5 Nov 2017 13:32:13 +0100 Subject: [PATCH 016/177] ARROW-1714: [Python] Fix invalid serialization/deserialization None name Series This closes [ARROW-1714](https://issues.apache.org/jira/projects/ARROW/issues/ARROW-1714). 
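For illustration only, a minimal sketch of the behavior being fixed, mirroring
the test added to `test_ipc.py` in this patch:

```python
# A pandas Series with no name (s.name is None) should round-trip
# through pa.serialize / pa.deserialize with its name still None,
# rather than the conversion mishandling the missing name.
import pandas as pd
import pyarrow as pa

s = pd.Series([1, 2, 3, 4])
buf = pa.serialize({'s_series': s}).to_buffer()
restored = pa.deserialize(buf)
assert restored['s_series'].name is None
```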
Author: Licht-T Author: Wes McKinney Closes #1263 from Licht-T/fix-invaid-conversion-none-column-name and squashes the following commits: 3afa60d [Wes McKinney] Be robust to pandas 0.21 conventions for null column labels b353260 [Wes McKinney] Don't use locals() fe6a075 [Licht-T] TST: Add test for None name Series serialization 3535dc4 [Licht-T] BUG: Fix invalid deserialization of None column name --- python/pyarrow/pandas_compat.py | 52 ++++++++++++++++++++++++-------- python/pyarrow/table.pxi | 5 ++- python/pyarrow/tests/test_ipc.py | 9 +++++- 3 files changed, 52 insertions(+), 14 deletions(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 1984598ff3533..87b47b8a6bc13 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -263,6 +263,8 @@ def _column_name_to_strings(name): return tuple(map(_column_name_to_strings, name)) elif isinstance(name, collections.Sequence): raise TypeError("Unsupported type for MultiIndex level") + elif name is None: + return None return str(name) @@ -280,7 +282,9 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1): for name in df.columns: col = df[name] if not isinstance(name, six.string_types): - name = str(_column_name_to_strings(name)) + name = _column_name_to_strings(name) + if name is not None: + name = str(name) if schema is not None: field = schema.field_by_name(name) @@ -361,6 +365,7 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1): schema = table.schema row_count = table.num_rows metadata = schema.metadata + columns_metadata = None has_pandas_metadata = metadata is not None and b'pandas' in metadata @@ -370,6 +375,7 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1): columns = pandas_metadata['columns'] column_indexes = pandas_metadata.get('column_indexes', []) table = _add_any_metadata(table, pandas_metadata) + columns_metadata = pandas_metadata.get('columns', None) block_table = table @@ -428,6 +434,18 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1): index = pd.RangeIndex(row_count) column_strings = [x.name for x in block_table.itercolumns()] + if columns_metadata is not None: + columns_name_dict = dict( + (str(x['name']), x['name']) + for x in columns_metadata + ) + columns_values = [ + columns_name_dict[y] + if y in columns_name_dict.keys() else y + for y in column_strings + ] + else: + columns_values = column_strings # If we're passed multiple column indexes then evaluate with # ast.literal_eval, since the column index values show up as a list of @@ -437,11 +455,11 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1): # Create the column index # Construct the base index - if not column_strings: - columns = pd.Index(column_strings) + if not columns_values: + columns = pd.Index(columns_values) else: columns = pd.MultiIndex.from_tuples( - list(map(to_pair, column_strings)), + list(map(to_pair, columns_values)), names=[col_index['name'] for col_index in column_indexes] or None, ) @@ -466,25 +484,35 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1): _level if _level.dtype == _dtype else _level.astype(_dtype) for _level, _dtype in levels_dtypes ] + columns = pd.MultiIndex( levels=new_levels, labels=labels, names=columns.names ) - # flatten a single level column MultiIndex for pandas 0.21.0 :( - if isinstance(columns, pd.MultiIndex) and columns.nlevels == 1: - levels, = columns.levels - labels, = columns.labels - - # Cheaply check that we do not somehow have duplicate column 
names - assert len(levels) == len(labels), 'Found non-unique column index' - columns = levels[labels] + # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0 + columns = _flatten_single_level_multiindex(columns) axes = [columns, index] return _int.BlockManager(blocks, axes) +def _flatten_single_level_multiindex(index): + if isinstance(index, pd.MultiIndex) and index.nlevels == 1: + levels, = index.levels + labels, = index.labels + + # Cheaply check that we do not somehow have duplicate column names + if not index.is_unique: + raise ValueError('Found non-unique column index') + + return pd.Index([levels[_label] if _label != -1 else None + for _label in labels], + name=index.names[0]) + return index + + def _add_any_metadata(table, pandas_metadata): modified_columns = {} diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 6165a6622b836..5ba5f83d22f90 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -345,7 +345,10 @@ cdef _schema_from_arrays(arrays, names, dict metadata, else: raise TypeError(type(val)) - c_name = tobytes(names[i]) + if names[i] is None: + c_name = tobytes(u'None') + else: + c_name = tobytes(names[i]) fields[i].reset(new CField(c_name, type_, True)) schema.reset(new CSchema(fields, unbox_metadata(metadata))) diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index 68c0c80aa6187..5033ea95783ab 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -432,16 +432,23 @@ def test_serialize_pandas_no_preserve_index(): def test_serialize_with_pandas_objects(): df = pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 3]) + s = pd.Series([1, 2, 3, 4]) data = { 'a_series': df['a'], - 'a_frame': df + 'a_frame': df, + 's_series': s } serialized = pa.serialize(data).to_buffer() deserialized = pa.deserialize(serialized) assert_frame_equal(deserialized['a_frame'], df) + assert_series_equal(deserialized['a_series'], df['a']) + assert deserialized['a_series'].name == 'a' + + assert_series_equal(deserialized['s_series'], s) + assert deserialized['s_series'].name is None def test_schema_batch_serialize_methods(): From 1ee73ef4bedc5f63b909421b1ad247e086689a74 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sun, 5 Nov 2017 18:26:06 -0500 Subject: [PATCH 017/177] ARROW-1770: [GLib] Fix GLib compiler warning Author: Phillip Cloud Closes #1287 from cpcloud/ARROW-1770 and squashes the following commits: bb406961 [Phillip Cloud] ARROW-1770: [GLib] Fix GLib compiler warning --- c_glib/arrow-glib/input-stream.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c_glib/arrow-glib/input-stream.cpp b/c_glib/arrow-glib/input-stream.cpp index d628baeeeae5b..a7a894b9d4f3e 100644 --- a/c_glib/arrow-glib/input-stream.cpp +++ b/c_glib/arrow-glib/input-stream.cpp @@ -437,13 +437,13 @@ namespace garrow { } arrow::Status ReadAt(int64_t position, int64_t n_bytes, - int64_t *n_read_bytes, uint8_t* out) { + int64_t *n_read_bytes, uint8_t* out) override { return arrow::io::RandomAccessFile::ReadAt( position, n_bytes, n_read_bytes, out); } arrow::Status ReadAt(int64_t position, int64_t n_bytes, - std::shared_ptr* out) { + std::shared_ptr* out) override { return arrow::io::RandomAccessFile::ReadAt(position, n_bytes, out); } From d7f1398a050b07dde3949e03792e5c477e4abfe4 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sun, 5 Nov 2017 18:50:50 -0500 Subject: [PATCH 018/177] ARROW-1749: [C++] Handle range of Decimal128 values that require 39 digits to be displayed Author: Phillip Cloud 
Closes #1282 from cpcloud/ARROW-1749 and squashes the following commits: 858a6f1d [Phillip Cloud] Move _BitScanReverse to the correct file 71b0d88a [Phillip Cloud] Move bit utility to bitutil 439fd2e9 [Phillip Cloud] Fold the constant 2084065d [Phillip Cloud] Remove unused include 0c3df97c [Phillip Cloud] Try things f577612c [Phillip Cloud] Variable name 0eef9579 [Phillip Cloud] ARROW-1749: [C++] Handle range of Decimal128 values that require 39 digits to be displayed --- cpp/src/arrow/array.cc | 4 +- cpp/src/arrow/python/arrow_to_pandas.cc | 8 +- cpp/src/arrow/util/bit-util.h | 22 +++ cpp/src/arrow/util/decimal-test.cc | 39 ++++- cpp/src/arrow/util/decimal.cc | 213 ++++++++++++++---------- cpp/src/arrow/util/decimal.h | 24 ++- 6 files changed, 204 insertions(+), 106 deletions(-) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index fc4b96e1b2bec..b523876bf0e4e 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -321,8 +321,8 @@ DecimalArray::DecimalArray(const std::shared_ptr& data) std::string DecimalArray::FormatValue(int64_t i) const { const auto& type_ = static_cast(*type()); - Decimal128 value(GetValue(i)); - return value.ToString(type_.precision(), type_.scale()); + const Decimal128 value(GetValue(i)); + return value.ToString(type_.scale()); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index c92faede1347b..3894772daa467 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -616,11 +616,10 @@ static Status ConvertTimes(PandasOptions options, const ChunkedArray& data, return Status::OK(); } -static Status RawDecimalToString(const uint8_t* bytes, int precision, int scale, - std::string* result) { +static Status RawDecimalToString(const uint8_t* bytes, int scale, std::string* result) { DCHECK_NE(result, nullptr); Decimal128 decimal(bytes); - *result = decimal.ToString(precision, scale); + *result = decimal.ToString(scale); return Status::OK(); } @@ -636,7 +635,6 @@ static Status ConvertDecimals(PandasOptions options, const ChunkedArray& data, for (int c = 0; c < data.num_chunks(); c++) { auto* arr(static_cast(data.chunk(c).get())); auto type(std::dynamic_pointer_cast(arr->type())); - const int precision = type->precision(); const int scale = type->scale(); for (int64_t i = 0; i < arr->length(); ++i) { @@ -646,7 +644,7 @@ static Status ConvertDecimals(PandasOptions options, const ChunkedArray& data, } else { const uint8_t* raw_value = arr->GetValue(i); std::string decimal_string; - RETURN_NOT_OK(RawDecimalToString(raw_value, precision, scale, &decimal_string)); + RETURN_NOT_OK(RawDecimalToString(raw_value, scale, &decimal_string)); *out_values++ = internal::DecimalFromString(Decimal, decimal_string); RETURN_IF_PYERROR(); } diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 8043f90ccdf6a..d6415f3c75b7b 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -43,6 +43,8 @@ #endif #if defined(_MSC_VER) +#include +#pragma intrinsic(_BitScanReverse) #define ARROW_BYTE_SWAP64 _byteswap_uint64 #define ARROW_BYTE_SWAP32 _byteswap_ulong #else @@ -55,6 +57,7 @@ #include #include +#include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/type_traits.h" #include "arrow/util/visibility.h" @@ -296,6 +299,25 @@ static inline int Log2(uint64_t x) { return result; } +/// \brief Count the number of leading zeros in a 32 bit 
integer. +static inline int64_t CountLeadingZeros(uint32_t value) { + DCHECK_NE(value, 0); +#if defined(__clang__) || defined(__GNUC__) + return static_cast(__builtin_clz(value)); +#elif defined(_MSC_VER) + unsigned long index; // NOLINT + _BitScanReverse(&index, static_cast(value)); // NOLINT + return 31LL - static_cast(index); +#else + int64_t bitpos = 0; + while (value != 0) { + value >>= 1; + ++bitpos; + } + return 32LL - bitpos; +#endif +} + /// Swaps the byte order (i.e. endianess) static inline int64_t ByteSwap(int64_t value) { return ARROW_BYTE_SWAP64(value); } static inline uint64_t ByteSwap(uint64_t value) { diff --git a/cpp/src/arrow/util/decimal-test.cc b/cpp/src/arrow/util/decimal-test.cc index 565a1bbb939b6..b0271fff15ccf 100644 --- a/cpp/src/arrow/util/decimal-test.cc +++ b/cpp/src/arrow/util/decimal-test.cc @@ -36,9 +36,8 @@ class DecimalTestFixture : public ::testing::Test { TEST_F(DecimalTestFixture, TestToString) { Decimal128 decimal(this->integer_value_); - int precision = 8; int scale = 5; - std::string result = decimal.ToString(precision, scale); + std::string result = decimal.ToString(scale); ASSERT_EQ(result, this->string_value_); } @@ -256,4 +255,40 @@ TEST(Decimal128TestFalse, ConstructibleFromBool) { ASSERT_EQ(0, value.low_bits()); } +TEST(Decimal128Test, Division) { + const std::string expected_string_value("-23923094039234029"); + const Decimal128 value(expected_string_value); + const Decimal128 result(value / 3); + const Decimal128 expected_value("-7974364679744676"); + ASSERT_EQ(expected_value, result); +} + +TEST(Decimal128Test, PrintLargePositiveValue) { + const std::string string_value("99999999999999999999999999999999999999"); + const Decimal128 value(string_value); + const std::string printed_value = value.ToIntegerString(); + ASSERT_EQ(string_value, printed_value); +} + +TEST(Decimal128Test, PrintLargeNegativeValue) { + const std::string string_value("-99999999999999999999999999999999999999"); + const Decimal128 value(string_value); + const std::string printed_value = value.ToIntegerString(); + ASSERT_EQ(string_value, printed_value); +} + +TEST(Decimal128Test, PrintMaxValue) { + const std::string string_value("170141183460469231731687303715884105727"); + const Decimal128 value(string_value); + const std::string printed_value = value.ToIntegerString(); + ASSERT_EQ(string_value, printed_value); +} + +TEST(Decimal128Test, PrintMinValue) { + const std::string string_value("-170141183460469231731687303715884105728"); + const Decimal128 value(string_value); + const std::string printed_value = value.ToIntegerString(); + ASSERT_EQ(string_value, printed_value); +} + } // namespace arrow diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index 7196b252c5b60..9d94bef847fa1 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -19,14 +19,10 @@ #include #include #include +#include #include #include -#ifdef _MSC_VER -#include -#pragma intrinsic(_BitScanReverse) -#endif - #include "arrow/util/bit-util.h" #include "arrow/util/decimal.h" #include "arrow/util/logging.h" @@ -55,62 +51,116 @@ std::array Decimal128::ToBytes() const { return out; } -std::string Decimal128::ToString(int precision, int scale) const { - using std::size_t; +static constexpr Decimal128 kTenTo36(static_cast(0xC097CE7BC90715), + 0xB34B9F1000000000); +static constexpr Decimal128 kTenTo18(0xDE0B6B3A7640000); - const bool is_negative = *this < 0; - - // Decimal values are sent to clients as strings so in the interest of - // speed the string will be 
created without the using stringstream with the - // whole/fractional_part(). - size_t last_char_idx = precision + (scale > 0) // Add a space for decimal place - + (scale == precision) // Add a space for leading 0 - + is_negative; // Add a space for negative sign - - std::string str(last_char_idx, '0'); - - // Start filling in the values in reverse order by taking the last digit - // of the value. Use a positive value and worry about the sign later. At this - // point the last_char_idx points to the string terminator. - Decimal128 remaining_value(*this); - - const auto first_digit_idx = static_cast(is_negative); - if (is_negative) { - remaining_value.Negate(); - } - - if (scale > 0) { - int remaining_scale = scale; - do { - str[--last_char_idx] = - static_cast(remaining_value % 10 + '0'); // Ascii offset - remaining_value /= 10; - } while (--remaining_scale > 0); - str[--last_char_idx] = '.'; - DCHECK_GT(last_char_idx, first_digit_idx) << "Not enough space remaining"; - } - - do { - str[--last_char_idx] = static_cast(remaining_value % 10 + '0'); // Ascii offset - remaining_value /= 10; - if (remaining_value == 0) { - // Trim any extra leading 0's. - if (last_char_idx > first_digit_idx) { - str.erase(0, last_char_idx - first_digit_idx); - } +std::string Decimal128::ToIntegerString() const { + Decimal128 remainder; + std::stringstream buf; + bool need_fill = false; - break; + // get anything above 10 ** 36 and print it + Decimal128 top; + Status s = Divide(kTenTo36, &top, &remainder); + DCHECK(s.ok()) << s.message(); + + if (top != 0) { + buf << static_cast(top); + remainder.Abs(); + need_fill = true; + } + + // now get anything above 10 ** 18 and print it + Decimal128 tail; + s = remainder.Divide(kTenTo18, &top, &tail); + + if (need_fill || top != 0) { + if (need_fill) { + buf << std::setw(18) << std::setfill('0'); + } else { + need_fill = true; + tail.Abs(); + } + + buf << static_cast(top); + } + + // finally print the tail, which is less than 10**18 + if (need_fill) { + buf << std::setw(18) << std::setfill('0'); + } + buf << static_cast(tail); + return buf.str(); +} + +Decimal128::operator int64_t() const { + DCHECK(high_bits_ == 0 || high_bits_ == -1) + << "Trying to cast an Decimal128 greater than the value range of a " + "int64_t. high_bits_ must be equal to 0 or -1, got: " + << high_bits_; + return static_cast(low_bits_); +} + +std::string Decimal128::ToString(int32_t scale) const { + const std::string str(ToIntegerString()); + + if (scale == 0) { + return str; + } + + if (*this < 0) { + const auto len = static_cast(str.size()); + + if (len - 1 > scale) { + const auto n = static_cast(len - scale); + return str.substr(0, n) + "." + str.substr(n, static_cast(scale)); + } + + if (len - 1 == scale) { + return "-0." + str.substr(1, std::string::npos); } - // For safety, enforce string length independent of remaining_value. - } while (last_char_idx > first_digit_idx); - if (is_negative) { - str[0] = '-'; + std::string result("-0." + std::string(static_cast(scale - len + 1), '0')); + return result + str.substr(1, std::string::npos); + } + + const auto len = static_cast(str.size()); + + if (len > scale) { + const auto n = static_cast(len - scale); + return str.substr(0, n) + "." + str.substr(n, static_cast(scale)); + } + + if (len == scale) { + return "0." + str; } - return str; + return "0." 
+ std::string(static_cast(scale - len), '0') + str; } +static constexpr auto kInt64DecimalDigits = + static_cast(std::numeric_limits::digits10); +static constexpr int64_t kPowersOfTen[kInt64DecimalDigits + 1] = {1LL, + 10LL, + 100LL, + 1000LL, + 10000LL, + 100000LL, + 1000000LL, + 10000000LL, + 100000000LL, + 1000000000LL, + 10000000000LL, + 100000000000LL, + 1000000000000LL, + 10000000000000LL, + 100000000000000LL, + 1000000000000000LL, + 10000000000000000LL, + 100000000000000000LL, + 1000000000000000000LL}; + static void StringToInteger(const std::string& str, Decimal128* out) { using std::size_t; @@ -122,13 +172,10 @@ static void StringToInteger(const std::string& str, Decimal128* out) { DCHECK_GT(length, 0) << "length of parsed decimal string should be greater than 0"; - size_t posn = 0; - - while (posn < length) { - const size_t group = std::min(static_cast(18), length - posn); - const auto chunk = static_cast(std::stoll(str.substr(posn, group))); - const auto multiple = - static_cast(std::pow(10.0, static_cast(group))); + for (size_t posn = 0; posn < length;) { + const size_t group = std::min(kInt64DecimalDigits, length - posn); + const int64_t chunk = std::stoll(str.substr(posn, group)); + const int64_t multiple = kPowersOfTen[group]; *out *= multiple; *out += chunk; @@ -266,6 +313,8 @@ Decimal128& Decimal128::Negate() { return *this; } +Decimal128& Decimal128::Abs() { return *this < 0 ? Negate() : *this; } + Decimal128& Decimal128::operator+=(const Decimal128& right) { const uint64_t sum = low_bits_ + right.low_bits_; high_bits_ += right.high_bits_; @@ -288,20 +337,11 @@ Decimal128& Decimal128::operator-=(const Decimal128& right) { Decimal128& Decimal128::operator/=(const Decimal128& right) { Decimal128 remainder; - DCHECK(Divide(right, this, &remainder).ok()); + Status s = Divide(right, this, &remainder); + DCHECK(s.ok()); return *this; } -Decimal128::operator char() const { - DCHECK(high_bits_ == 0 || high_bits_ == -1) - << "Trying to cast an Decimal128 greater than the value range of a " - "char. high_bits_ must be equal to 0 or -1, got: " - << high_bits_; - DCHECK_LE(low_bits_, std::numeric_limits::max()) - << "low_bits_ too large for C type char, got: " << low_bits_; - return static_cast(low_bits_); -} - Decimal128& Decimal128::operator|=(const Decimal128& right) { low_bits_ |= right.low_bits_; high_bits_ |= right.high_bits_; @@ -440,18 +480,6 @@ static int64_t FillInArray(const Decimal128& value, uint32_t* array, bool& was_n return 1; } -/// \brief Find last set bit in a 32 bit integer. Bit 1 is the LSB and bit 32 is the MSB. -static int64_t FindLastSetBit(uint32_t value) { -#if defined(__clang__) || defined(__GNUC__) - // Count leading zeros - return __builtin_clz(value) + 1; -#elif defined(_MSC_VER) - unsigned long index; // NOLINT - _BitScanReverse(&index, static_cast(value)); // NOLINT - return static_cast(index + 1UL); -#endif -} - /// Shift the number in the array left by bits positions. /// \param array the number to shift, must have length elements /// \param length the number of entries in the array @@ -581,7 +609,7 @@ Status Decimal128::Divide(const Decimal128& divisor, Decimal128* result, // Normalize by shifting both by a multiple of 2 so that // the digit guessing is better. The requirement is that // divisor_array[0] is greater than 2**31. 
- int64_t normalize_bits = 32 - FindLastSetBit(divisor_array[0]); + int64_t normalize_bits = BitUtil::CountLeadingZeros(divisor_array[0]); ShiftArrayLeft(divisor_array, divisor_length, normalize_bits); ShiftArrayLeft(dividend_array, dividend_length, normalize_bits); @@ -589,7 +617,7 @@ Status Decimal128::Divide(const Decimal128& divisor, Decimal128* result, for (int64_t j = 0; j < result_length; ++j) { // Guess the next digit. At worst it is two too large uint32_t guess = std::numeric_limits::max(); - auto high_dividend = + const auto high_dividend = static_cast(dividend_array[j]) << 32 | dividend_array[j + 1]; if (dividend_array[j] != divisor_array[0]) { guess = static_cast(high_dividend / divisor_array[0]); @@ -625,10 +653,9 @@ Status Decimal128::Divide(const Decimal128& divisor, Decimal128* result, // if guess was too big, we add back divisor if (dividend_array[j] > prev) { --guess; - uint32_t carry = 0; for (int64_t i = divisor_length - 1; i >= 0; --i) { - uint64_t sum = + const auto sum = static_cast(divisor_array[i]) + dividend_array[j + i + 1] + carry; dividend_array[j + i + 1] = static_cast(sum); carry = static_cast(sum >> 32); @@ -645,6 +672,7 @@ Status Decimal128::Divide(const Decimal128& divisor, Decimal128* result, // return result and remainder RETURN_NOT_OK(BuildFromArray(result, result_array, result_length)); RETURN_NOT_OK(BuildFromArray(remainder, dividend_array, dividend_length)); + FixDivisionSigns(result, remainder, dividend_was_negative, divisor_was_negative); return Status::OK(); } @@ -679,6 +707,11 @@ Decimal128 operator-(const Decimal128& operand) { return result.Negate(); } +Decimal128 operator~(const Decimal128& operand) { + Decimal128 result(~operand.high_bits(), ~operand.low_bits()); + return result; +} + Decimal128 operator+(const Decimal128& left, const Decimal128& right) { Decimal128 result(left.high_bits(), left.low_bits()); result += right; @@ -700,14 +733,16 @@ Decimal128 operator*(const Decimal128& left, const Decimal128& right) { Decimal128 operator/(const Decimal128& left, const Decimal128& right) { Decimal128 remainder; Decimal128 result; - DCHECK(left.Divide(right, &result, &remainder).ok()); + Status s = left.Divide(right, &result, &remainder); + DCHECK(s.ok()); return result; } Decimal128 operator%(const Decimal128& left, const Decimal128& right) { Decimal128 remainder; Decimal128 result; - DCHECK(left.Divide(right, &result, &remainder).ok()); + Status s = left.Divide(right, &result, &remainder); + DCHECK(s.ok()); return remainder; } diff --git a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h index 72da5547907db..487f222580201 100644 --- a/cpp/src/arrow/util/decimal.h +++ b/cpp/src/arrow/util/decimal.h @@ -39,15 +39,16 @@ namespace arrow { class ARROW_EXPORT Decimal128 { public: /// \brief Create an Decimal128 from the two's complement representation. - constexpr Decimal128(int64_t high, uint64_t low) : high_bits_(high), low_bits_(low) {} + constexpr Decimal128(int64_t high, uint64_t low) noexcept + : high_bits_(high), low_bits_(low) {} /// \brief Empty constructor creates an Decimal128 with a value of 0. - constexpr Decimal128() : Decimal128(0, 0) {} + constexpr Decimal128() noexcept : Decimal128(0, 0) {} /// \brief Convert any integer value into an Decimal128. template ::value, T>::type> - constexpr Decimal128(T value) + constexpr Decimal128(T value) noexcept : Decimal128(static_cast(value) >= 0 ? 
       : Decimal128(static_cast<int64_t>(value) >= 0 ? 0 : -1,
                    static_cast<uint64_t>(value)) {}
 
@@ -61,6 +62,9 @@ class ARROW_EXPORT Decimal128 {
   /// \brief Negate the current value
   Decimal128& Negate();
 
+  /// \brief Absolute value
+  Decimal128& Abs();
+
   /// \brief Add a number to this one. The result is truncated to 128 bits.
   Decimal128& operator+=(const Decimal128& right);
 
@@ -85,9 +89,6 @@ class ARROW_EXPORT Decimal128 {
   /// \brief In-place division.
   Decimal128& operator/=(const Decimal128& right);
 
-  /// \brief Cast the value to char. This is used when converting the value a string.
-  explicit operator char() const;
-
   /// \brief Bitwise or between two Decimal128.
   Decimal128& operator|=(const Decimal128& right);
 
@@ -110,8 +111,14 @@ class ARROW_EXPORT Decimal128 {
   std::array<uint8_t, 16> ToBytes() const;
 
   /// \brief Convert the Decimal128 value to a base 10 decimal string with the given
-  /// precision and scale.
-  std::string ToString(int precision, int scale) const;
+  /// scale.
+  std::string ToString(int32_t scale) const;
+
+  /// \brief Convert the value to an integer string
+  std::string ToIntegerString() const;
+
+  /// \brief Cast this value to an int64_t.
+  explicit operator int64_t() const;
 
   /// \brief Convert a decimal string to an Decimal128 value, optionally including
   /// precision and scale if they're passed in and not null.
@@ -131,6 +138,7 @@ ARROW_EXPORT bool operator>(const Decimal128& left, const Decimal128& right);
 ARROW_EXPORT bool operator>=(const Decimal128& left, const Decimal128& right);
 
 ARROW_EXPORT Decimal128 operator-(const Decimal128& operand);
+ARROW_EXPORT Decimal128 operator~(const Decimal128& operand);
 ARROW_EXPORT Decimal128 operator+(const Decimal128& left, const Decimal128& right);
 ARROW_EXPORT Decimal128 operator-(const Decimal128& left, const Decimal128& right);
 ARROW_EXPORT Decimal128 operator*(const Decimal128& left, const Decimal128& right);

From b25b2433d67d797e9cd461377eb3798e60de2727 Mon Sep 17 00:00:00 2001
From: Yuliya Feldman
Date: Sun, 5 Nov 2017 18:57:05 -0500
Subject: [PATCH 019/177] =?UTF-8?q?ARROW-1663:=20[Java]=20use=20consistent?=
 =?UTF-8?q?=20name=20for=20null=20and=20not-null=20in=20FixedSizeLis?=
 =?UTF-8?q?=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…t, add backward compatibility while deserializing schema that was
generated before this JIRA checkin

Author: Yuliya Feldman

Closes #1193 from yufeldman/ARROW-1663 and squashes the following commits:

7f9bd34f [Yuliya Feldman] ARROW-1663: Addressing code review comments
600d379b [Yuliya Feldman] ARROW-1663: Addressing code review comments
93f527b0 [Yuliya Feldman] ARROW-1663: Addressing code review comments
7cfa22b0 [Yuliya Feldman] ARROW-1663: use consistent name for null and not-null in FixedSizeListVector and ListVector, add backward compatibility while deserializing schema that was generated before this JIRA checkin
58d6e9c9 [Yuliya Feldman] ARROW-1663: use consistent name for null and not-null in FixedSizeList, add backward compatibility while deserializing schema that was generated before this JIRA checkin
---
 .../org/apache/arrow/vector/ZeroVector.java   |  6 +--
 .../arrow/vector/complex/ListVector.java      |  5 ---
 .../apache/arrow/vector/types/pojo/Field.java | 27 ++++++++++++-
 .../arrow/vector/TestFixedSizeListVector.java | 14 +++++++
 .../apache/arrow/vector/TestListVector.java   |  7 +---
 .../apache/arrow/vector/pojo/TestConvert.java | 40 +++++++++++++++++++
 6 files changed, 85 insertions(+), 14 deletions(-)

diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java
b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java index b267b2087d05c..5ac00375f8317 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java @@ -18,6 +18,8 @@ package org.apache.arrow.vector; +import static org.apache.arrow.vector.complex.BaseRepeatedValueVector.DATA_VECTOR_NAME; + import java.util.Collections; import java.util.Iterator; import java.util.List; @@ -39,8 +41,6 @@ public class ZeroVector implements FieldVector { public final static ZeroVector INSTANCE = new ZeroVector(); - private final String name = "[DEFAULT]"; - private final TransferPair defaultPair = new TransferPair() { @Override public void transfer() { @@ -109,7 +109,7 @@ public void clear() { @Override public Field getField() { - return new Field(name, FieldType.nullable(new Null()), null); + return new Field(DATA_VECTOR_NAME, FieldType.nullable(new Null()), null); } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 6511efcb7d513..ea28a60619209 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -39,7 +39,6 @@ import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.UInt4Vector; import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.ZeroVector; import org.apache.arrow.vector.complex.impl.ComplexCopier; import org.apache.arrow.vector.complex.impl.UnionListReader; @@ -49,7 +48,6 @@ import org.apache.arrow.vector.schema.ArrowFieldNode; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType; -import org.apache.arrow.vector.types.pojo.ArrowType.Null; import org.apache.arrow.vector.types.pojo.DictionaryEncoding; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; @@ -324,9 +322,6 @@ public int getBufferSize() { @Override public Field getField() { - if (getDataVector() instanceof ZeroVector) { - return new Field(name, fieldType, ImmutableList.of(new Field(DATA_VECTOR_NAME, FieldType.nullable(Null.INSTANCE), null))); - } return new Field(name, fieldType, ImmutableList.of(getDataVector().getField())); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java index 48e71a976c0e8..eba149bf79f65 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java @@ -20,6 +20,7 @@ import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.arrow.vector.complex.BaseRepeatedValueVector.DATA_VECTOR_NAME; import static org.apache.arrow.vector.types.pojo.ArrowType.getTypeForField; import java.util.Iterator; @@ -39,6 +40,7 @@ import com.google.flatbuffers.FlatBufferBuilder; import org.apache.arrow.flatbuf.KeyValue; +import org.apache.arrow.flatbuf.Type; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.schema.TypeLayout; @@ -121,7 +123,9 @@ public static Field convertField(org.apache.arrow.flatbuf.Field field) { } ImmutableList.Builder childrenBuilder = ImmutableList.builder(); for (int i = 0; i < 
field.childrenLength(); i++) { - childrenBuilder.add(convertField(field.children(i))); + Field childField = convertField(field.children(i)); + childField = mutateOriginalNameIfNeeded(field, childField); + childrenBuilder.add(childField); } List children = childrenBuilder.build(); ImmutableMap.Builder metadataBuilder = ImmutableMap.builder(); @@ -134,6 +138,27 @@ public static Field convertField(org.apache.arrow.flatbuf.Field field) { return new Field(name, nullable, type, dictionary, children, new TypeLayout(layout.build()), metadata); } + /** + * Helper method to ensure backward compatibility with schemas generated prior to ARROW-1347, ARROW-1663 + * @param field + * @param originalChildField original field which name might be mutated + * @return original or mutated field + */ + private static Field mutateOriginalNameIfNeeded(org.apache.arrow.flatbuf.Field field, Field originalChildField) { + if ((field.typeType() == Type.List || field.typeType() == Type.FixedSizeList) + && originalChildField.getName().equals("[DEFAULT]")) { + return + new Field(DATA_VECTOR_NAME, + originalChildField.isNullable(), + originalChildField.getType(), + originalChildField.getDictionary(), + originalChildField.getChildren(), + originalChildField.getTypeLayout(), + originalChildField.getMetadata()); + } + return originalChildField; + } + public void validate() { TypeLayout expectedLayout = TypeLayout.getTypeLayout(getType()); if (!expectedLayout.equals(typeLayout)) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java index 5677f2566797a..43d9387b106a4 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java @@ -26,6 +26,7 @@ import org.apache.arrow.vector.complex.impl.UnionFixedSizeListReader; import org.apache.arrow.vector.complex.impl.UnionListReader; import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.FieldType; @@ -220,4 +221,17 @@ public void testTransferPair() { } } } + + @Test + public void testConsistentChildName() throws Exception { + try (FixedSizeListVector listVector = FixedSizeListVector.empty("sourceVector", 2, allocator)) { + String emptyListStr = listVector.getField().toString(); + Assert.assertTrue(emptyListStr.contains(ListVector.DATA_VECTOR_NAME)); + + listVector.addOrGetVector(FieldType.nullable(Types.MinorType.INT.getType())); + String emptyVectorStr = listVector.getField().toString(); + Assert.assertTrue(emptyVectorStr.contains(ListVector.DATA_VECTOR_NAME)); + } + } + } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java index 1c9b574998018..59e1646e86e56 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java @@ -26,12 +26,9 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.impl.UnionListWriter; -import org.apache.arrow.vector.complex.impl.UnionListReader; import org.apache.arrow.vector.complex.reader.FieldReader; -import 
org.apache.arrow.vector.complex.writer.FieldWriter; -import org.apache.arrow.vector.holders.NullableBigIntHolder; import org.apache.arrow.vector.types.Types; -import org.apache.arrow.vector.types.Types.*; +import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.TransferPair; import org.junit.After; @@ -635,7 +632,7 @@ public void testConsistentChildName() throws Exception { String emptyListStr = listVector.getField().toString(); assertTrue(emptyListStr.contains(ListVector.DATA_VECTOR_NAME)); - listVector.addOrGetVector(FieldType.nullable(MinorType.INT.getType())); + listVector.addOrGetVector(FieldType.nullable(Types.MinorType.INT.getType())); String emptyVectorStr = listVector.getField().toString(); assertTrue(emptyVectorStr.contains(ListVector.DATA_VECTOR_NAME)); } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java index f98aeac8c8196..f6f1ad221f3d1 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java @@ -21,13 +21,19 @@ import static org.apache.arrow.vector.types.FloatingPointPrecision.DOUBLE; import static org.apache.arrow.vector.types.FloatingPointPrecision.SINGLE; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import java.nio.ByteBuffer; import java.util.HashMap; import java.util.Map; import com.google.common.collect.ImmutableList; import com.google.flatbuffers.FlatBufferBuilder; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.types.TimeUnit; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.UnionMode; @@ -64,6 +70,40 @@ public void complex() { run(initialField); } + @Test + public void list() throws Exception { + ImmutableList.Builder childrenBuilder = ImmutableList.builder(); + try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); + ListVector writeVector = ListVector.empty("list", allocator); + FixedSizeListVector writeFixedVector = FixedSizeListVector.empty("fixedlist", 5, allocator)) { + Field listVectorField = writeVector.getField(); + childrenBuilder.add(listVectorField); + Field listFixedVectorField = writeFixedVector.getField(); + childrenBuilder.add(listFixedVectorField); + } + + Field initialField = new Field("a", FieldType.nullable(Struct.INSTANCE), childrenBuilder.build()); + ImmutableList.Builder parentBuilder = ImmutableList.builder(); + parentBuilder.add(initialField); + FlatBufferBuilder builder = new FlatBufferBuilder(); + builder.finish(initialField.getField(builder)); + org.apache.arrow.flatbuf.Field flatBufField = org.apache.arrow.flatbuf.Field.getRootAsField(builder.dataBuffer()); + Field finalField = Field.convertField(flatBufField); + assertEquals(initialField, finalField); + assertFalse(finalField.toString().contains("[DEFAULT]")); + + Schema initialSchema = new Schema(parentBuilder.build()); + String jsonSchema = initialSchema.toJson(); + String modifiedSchema = jsonSchema.replace("$data$", "[DEFAULT]"); + + Schema tempSchema = Schema.fromJSON(modifiedSchema); + FlatBufferBuilder schemaBuilder = new FlatBufferBuilder(); + org.apache.arrow.vector.types.pojo.Schema schema = 
new org.apache.arrow.vector.types.pojo.Schema(tempSchema.getFields()); + schemaBuilder.finish(schema.getSchema(schemaBuilder)); + Schema finalSchema = Schema.deserialize(ByteBuffer.wrap(schemaBuilder.sizedByteArray())); + assertFalse(finalSchema.toString().contains("[DEFAULT]")); + } + @Test public void schema() { ImmutableList.Builder childrenBuilder = ImmutableList.builder(); From 972193035802c47fa9ee8cb0962cda14c3f77847 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Mon, 6 Nov 2017 00:02:31 -0500 Subject: [PATCH 020/177] ARROW-480: [Python] Implement RowGroupMetaData.ColumnChunk This is the patch for [ARROW-480](https://issues.apache.org/jira/projects/ARROW/issues/ARROW-480). Author: Licht-T Closes #1215 from Licht-T/feature-column-metadata and squashes the following commits: b89c98c1 [Licht-T] Fix string format to get Python 3 compatible 2de9ca8a [Licht-T] Merge remote-tracking branch 'arrow/master' into feature-column-metadata 367761b7 [Licht-T] TST: Check every row-groups and columns d4dc4d1a [Licht-T] TST: Add test for column statistics 06397f4b [Licht-T] Refactoring c5dbef39 [Licht-T] Fix RowGroupStatistics.__repr__ e3b0c6a4 [Licht-T] Convert std::string to Python str by frombytes 5ab032ae [Licht-T] Change EncodeMin/EncodeMax to decoded min/max c29fb4e4 [Licht-T] Rename ColumnChunk to column 6a71795b [Licht-T] Rename HasMinMax to snake case 8f605cfb [Licht-T] Use bint as C++ boolean type for existing checks 9cc55cb9 [Licht-T] Remove unnecessary class and methods 33c087a4 [Licht-T] TST: Fix Parquet MetaData test bdc0bdca [Licht-T] ENH: Implement RowGroupMetaData.ColumnChunk --- python/pyarrow/_parquet.pxd | 49 ++++++ python/pyarrow/_parquet.pyx | 224 +++++++++++++++++++++++++++ python/pyarrow/compat.py | 6 + python/pyarrow/tests/test_parquet.py | 54 ++++++- 4 files changed, 330 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 5094232bdc7b4..04a5b1368ce45 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -105,6 +105,11 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: ParquetVersion_V1" parquet::ParquetVersion::PARQUET_1_0" ParquetVersion_V2" parquet::ParquetVersion::PARQUET_2_0" + enum ParquetSortOrder" parquet::SortOrder::type": + ParquetSortOrder_SIGNED" parquet::SortOrder::SIGNED" + ParquetSortOrder_UNSIGNED" parquet::SortOrder::UNSIGNED" + ParquetSortOrder_UNKNOWN" parquet::SortOrder::UNKNOWN" + cdef cppclass ColumnDescriptor: c_bool Equals(const ColumnDescriptor& other) @@ -126,6 +131,8 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: c_bool Equals(const SchemaDescriptor& other) int num_columns() + cdef c_string FormatStatValue(ParquetType parquet_type, const char* val) + cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: cdef cppclass ColumnReader: @@ -155,10 +162,52 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: cdef cppclass RowGroupReader: pass + cdef cppclass CEncodedStatistics" parquet::EncodedStatistics": + const c_string& max() const + const c_string& min() const + int64_t null_count + int64_t distinct_count + bint has_min + bint has_max + bint has_null_count + bint has_distinct_count + + cdef cppclass CRowGroupStatistics" parquet::RowGroupStatistics": + int64_t null_count() const + int64_t distinct_count() const + int64_t num_values() const + bint HasMinMax() + void Reset() + c_string EncodeMin() + c_string EncodeMax() + CEncodedStatistics Encode() + void SetComparator() + ParquetType physical_type() const + 
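+    # NOTE: RowGroupMetaData::ColumnChunk(i), declared further down, returns
+    # a fresh unique_ptr on every call, so the Python-level ColumnChunkMetaData
+    # wrapper in _parquet.pyx stores that pointer to keep the underlying C++
+    # metadata object alive for the lifetime of the wrapper.
+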
+ cdef cppclass CColumnChunkMetaData" parquet::ColumnChunkMetaData": + int64_t file_offset() const + const c_string& file_path() const + + ParquetType type() const + int64_t num_values() const + shared_ptr[ColumnPath] path_in_schema() const + bint is_stats_set() const + shared_ptr[CRowGroupStatistics] statistics() const; + ParquetCompression compression() const + const vector[ParquetEncoding]& encodings() const + + bint has_dictionary_page() const + int64_t dictionary_page_offset() const + int64_t data_page_offset() const + int64_t index_page_offset() const + int64_t total_compressed_size() const + int64_t total_uncompressed_size() const + cdef cppclass CRowGroupMetaData" parquet::RowGroupMetaData": int num_columns() int64_t num_rows() int64_t total_byte_size() + unique_ptr[CColumnChunkMetaData] ColumnChunk(int i) const cdef cppclass CFileMetaData" parquet::FileMetaData": uint32_t size() diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index b096fa1b4d337..eca6b201b3ad0 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -35,6 +35,212 @@ from pyarrow.lib import ArrowException, NativeFile import six +try: + from textwrap import indent +except ImportError: + def indent(text, prefix): + lines = [prefix + line for line in text.splitlines(True)] + return ''.join(lines) + + +cdef class RowGroupStatistics: + cdef: + shared_ptr[CRowGroupStatistics] statistics + + def __cinit__(self): + pass + + cdef init(self, const shared_ptr[CRowGroupStatistics]& statistics): + self.statistics = statistics + + def __repr__(self): + return """{0} + has_min_max: {1} + min: {2} + max: {3} + null_count: {4} + distinct_count: {5} + num_values: {6} + physical_type: {7}""".format(object.__repr__(self), + self.has_min_max, + self.min, + self.max, + self.null_count, + self.distinct_count, + self.num_values, + self.physical_type) + + property has_min_max: + + def __get__(self): + return self.statistics.get().HasMinMax() + + property min: + + def __get__(self): + raw_physical_type = self.statistics.get().physical_type() + encode_min = self.statistics.get().EncodeMin() + + min_value = FormatStatValue(raw_physical_type, encode_min.c_str()) + return frombytes(min_value) + + property max: + + def __get__(self): + raw_physical_type = self.statistics.get().physical_type() + encode_max = self.statistics.get().EncodeMax() + + max_value = FormatStatValue(raw_physical_type, encode_max.c_str()) + return frombytes(max_value) + + property null_count: + + def __get__(self): + return self.statistics.get().null_count() + + property distinct_count: + + def __get__(self): + return self.statistics.get().distinct_count() + + property num_values: + + def __get__(self): + return self.statistics.get().num_values() + + property physical_type: + + def __get__(self): + physical_type = self.statistics.get().physical_type() + return physical_type_name_from_enum(physical_type) + + +cdef class ColumnChunkMetaData: + cdef: + unique_ptr[CColumnChunkMetaData] up_metadata + CColumnChunkMetaData* metadata + + def __cinit__(self): + pass + + cdef init(self, const CRowGroupMetaData& row_group_metadata, int i): + self.up_metadata = row_group_metadata.ColumnChunk(i) + self.metadata = self.up_metadata.get() + + def __repr__(self): + statistics = indent(repr(self.statistics), 4 * ' ') + return """{0} + file_offset: {1} + file_path: {2} + type: {3} + num_values: {4} + path_in_schema: {5} + is_stats_set: {6} + statistics: +{7} + compression: {8} + encodings: {9} + has_dictionary_page: {10} + dictionary_page_offset: 
{11} + data_page_offset: {12} + index_page_offset: {13} + total_compressed_size: {14} + total_uncompressed_size: {15}""".format(object.__repr__(self), + self.file_offset, + self.file_path, + self.type, + self.num_values, + self.path_in_schema, + self.is_stats_set, + statistics, + self.compression, + self.encodings, + self.has_dictionary_page, + self.dictionary_page_offset, + self.data_page_offset, + self.index_page_offset, + self.total_compressed_size, + self.total_uncompressed_size) + + property file_offset: + + def __get__(self): + return self.metadata.file_offset() + + property file_path: + + def __get__(self): + return frombytes(self.metadata.file_path()) + + property type: + + def __get__(self): + return physical_type_name_from_enum(self.metadata.type()) + + property num_values: + + def __get__(self): + return self.metadata.num_values() + + property path_in_schema: + + def __get__(self): + path = self.metadata.path_in_schema().get().ToDotString() + return frombytes(path) + + property is_stats_set: + + def __get__(self): + return self.metadata.is_stats_set() + + property statistics: + + def __get__(self): + statistics = RowGroupStatistics() + statistics.init(self.metadata.statistics()) + return statistics + + property compression: + + def __get__(self): + return self.metadata.compression() + + property encodings: + + def __get__(self): + return map(encoding_name_from_enum, + self.metadata.encodings()) + + property has_dictionary_page: + + def __get__(self): + return self.metadata.has_dictionary_page() + + property dictionary_page_offset: + + def __get__(self): + return self.metadata.dictionary_page_offset() + + property data_page_offset: + + def __get__(self): + return self.metadata.data_page_offset() + + property index_page_offset: + + def __get__(self): + return self.metadata.index_page_offset() + + property total_compressed_size: + + def __get__(self): + return self.metadata.total_compressed_size() + + property total_uncompressed_size: + + def __get__(self): + return self.metadata.total_uncompressed_size() + cdef class RowGroupMetaData: cdef: @@ -52,6 +258,11 @@ cdef class RowGroupMetaData: self.metadata = self.up_metadata.get() self.parent = parent + def column(self, int i): + chunk = ColumnChunkMetaData() + chunk.init(deref(self.metadata), i) + return chunk + def __repr__(self): return """{0} num_columns: {1} @@ -371,6 +582,19 @@ cdef logical_type_name_from_enum(ParquetLogicalType type_): }.get(type_, 'UNKNOWN') +cdef encoding_name_from_enum (ParquetEncoding encoding_): + return { + ParquetEncoding_PLAIN: "PLAIN", + ParquetEncoding_PLAIN_DICTIONARY: "PLAIN_DICTIONARY", + ParquetEncoding_RLE: "RLE", + ParquetEncoding_BIT_PACKED: "BIT_PACKED", + ParquetEncoding_DELTA_BINARY_PACKED: "DELTA_BINARY_PACKED", + ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY: "DELTA_LENGTH_BYTE_ARRAY", + ParquetEncoding_DELTA_BYTE_ARRAY: "DELTA_BYTE_ARRAY", + ParquetEncoding_RLE_DICTIONARY: "RLE_DICTIONARY", + }.get(encoding_, 'UNKNOWN') + + cdef class ParquetReader: cdef: object source diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py index f9c148b14e368..866cbdd96d063 100644 --- a/python/pyarrow/compat.py +++ b/python/pyarrow/compat.py @@ -103,6 +103,9 @@ def tobytes(o): def frombytes(o): return o + + def unichar(s): + return unichr(s) else: unicode_type = str def lzip(*x): @@ -131,6 +134,9 @@ def tobytes(o): def frombytes(o): return o.decode('utf8') + def unichar(s): + return chr(s) + try: import cloudpickle as pickle except ImportError: diff --git a/python/pyarrow/tests/test_parquet.py 
b/python/pyarrow/tests/test_parquet.py index 95dd6a471b6b3..e2e6863c4748f 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -22,7 +22,7 @@ import json import pytest -from pyarrow.compat import guid, u, BytesIO +from pyarrow.compat import guid, u, BytesIO, unichar, frombytes from pyarrow.filesystem import LocalFileSystem import pyarrow as pa from .pandas_examples import dataframe_with_arrays, dataframe_with_lists @@ -469,13 +469,61 @@ def test_parquet_metadata_api(): schema[-1] # Row group - rg_meta = meta.row_group(0) - repr(rg_meta) + for rg in range(meta.num_row_groups): + rg_meta = meta.row_group(rg) + repr(rg_meta) + + for col in range(rg_meta.num_columns): + col_meta = rg_meta.column(col) + repr(col_meta) assert rg_meta.num_rows == len(df) assert rg_meta.num_columns == ncols + 1 # +1 for index +@parquet +@pytest.mark.parametrize( + 'data, dtype, min_value, max_value, null_count, num_values', + [ + ([1, 2, 2, None, 4], np.uint8, u'1', u'4', 1, 4), + ([1, 2, 2, None, 4], np.uint16, u'1', u'4', 1, 4), + ([1, 2, 2, None, 4], np.uint32, u'1', u'4', 1, 4), + ([1, 2, 2, None, 4], np.uint64, u'1', u'4', 1, 4), + ([-1, 2, 2, None, 4], np.int16, u'-1', u'4', 1, 4), + ([-1, 2, 2, None, 4], np.int32, u'-1', u'4', 1, 4), + ([-1, 2, 2, None, 4], np.int64, u'-1', u'4', 1, 4), + ([-1.1, 2.2, 2.3, None, 4.4], np.float32, u'-1.1', u'4.4', 1, 4), + ([-1.1, 2.2, 2.3, None, 4.4], np.float64, u'-1.1', u'4.4', 1, 4), + ( + [u'', u'b', unichar(1000), None, u'aaa'], + str, u' ', frombytes((unichar(1000) + u' ').encode('utf-8')), 1, 4 + ), + ([True, False, False, True, True], np.bool, u'0', u'1', 0, 5), + ] +) +def test_parquet_column_statistics_api( + data, + dtype, + min_value, + max_value, + null_count, + num_values): + df = pd.DataFrame({'data': data}, dtype=dtype) + + fileh = make_sample_file(df) + + meta = fileh.metadata + + rg_meta = meta.row_group(0) + col_meta = rg_meta.column(0) + + stat = col_meta.statistics + assert stat.min == min_value + assert stat.max == max_value + assert stat.null_count == null_count + assert stat.num_values == num_values + + @parquet def test_compare_schemas(): df = alltypes_sample(size=10000) From 0106f531c04477b1c8bd088d097624ff43b44658 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 6 Nov 2017 12:55:49 -0500 Subject: [PATCH 021/177] ARROW-1750: [C++] Remove the need for arrow/util/random.h Author: Phillip Cloud Closes #1283 from cpcloud/ARROW-1750 and squashes the following commits: 3f6af737 [Phillip Cloud] ARROW-1750: [C++] Remove the need for arrow/util/random.h --- cpp/src/arrow/array-test.cc | 93 ++++++++----- cpp/src/arrow/ipc/ipc-json-test.cc | 4 +- cpp/src/arrow/test-util.h | 214 ++++++++++++++++++++++------- cpp/src/arrow/util/CMakeLists.txt | 1 - cpp/src/arrow/util/decimal.cc | 11 +- cpp/src/arrow/util/decimal.h | 5 +- cpp/src/arrow/util/random.h | 126 ----------------- 7 files changed, 236 insertions(+), 218 deletions(-) delete mode 100644 cpp/src/arrow/util/random.h diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 168ef10573e77..9f248cdbbb754 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -263,6 +263,8 @@ class TestPrimitiveBuilder : public TestBuilder { ASSERT_TRUE(result->Equals(*expected)); } + int64_t FlipValue(int64_t value) const { return ~value; } + protected: std::shared_ptr type_; std::unique_ptr builder_; @@ -272,44 +274,64 @@ class TestPrimitiveBuilder : public TestBuilder { vector valid_bytes_; }; -#define PTYPE_DECL(CapType, c_type) \ - 
typedef CapType##Array ArrayType;               \
-  typedef CapType##Builder BuilderType;           \
-  typedef CapType##Type Type;                     \
-  typedef c_type T;                               \
-                                                  \
-  static std::shared_ptr<DataType> type() {       \
-    return std::shared_ptr<DataType>(new Type()); \
-  }
+/// \brief uint8_t isn't a valid template parameter to uniform_int_distribution, so
+/// we use SampleType to determine which kind of integer to use to sample.
+template <typename T,
+          typename = typename std::enable_if<std::is_integral<T>::value, T>::type>
+struct UniformIntSampleType {
+  using type = T;
+};
+
+template <>
+struct UniformIntSampleType<uint8_t> {
+  using type = uint16_t;
+};
+
+template <>
+struct UniformIntSampleType<int8_t> {
+  using type = int16_t;
+};
 
-#define PINT_DECL(CapType, c_type, LOWER, UPPER) \
+#define PTYPE_DECL(CapType, c_type)                                                      \
+  typedef CapType##Array ArrayType;                                                      \
+  typedef CapType##Builder BuilderType;                                                  \
+  typedef CapType##Type Type;                                                            \
+  typedef c_type T;                                                                      \
+                                                                                         \
+  static std::shared_ptr<DataType> type() { return std::make_shared<Type>(); }
+
+#define PINT_DECL(CapType, c_type)                                                       \
+  struct P##CapType {                                                                    \
+    PTYPE_DECL(CapType, c_type)                                                          \
+    static void draw(int64_t N, vector<T>* draws) {                                      \
+      using sample_type = typename UniformIntSampleType<T>::type;                        \
+      const T lower = std::numeric_limits<T>::min();                                     \
+      const T upper = std::numeric_limits<T>::max();                                     \
+      test::randint(N, static_cast<sample_type>(lower), static_cast<sample_type>(upper), \
+                    draws);                                                              \
+    }                                                                                    \
+  }
+
+#define PFLOAT_DECL(CapType, c_type, LOWER, UPPER)   \
   struct P##CapType {                                \
     PTYPE_DECL(CapType, c_type)                      \
     static void draw(int64_t N, vector<T>* draws) {  \
-      test::randint(N, LOWER, UPPER, draws);         \
+      test::random_real(N, 0, LOWER, UPPER, draws);  \
     }                                                \
   }
 
-#define PFLOAT_DECL(CapType, c_type, LOWER, UPPER)   \
-  struct P##CapType {                                \
-    PTYPE_DECL(CapType, c_type)                      \
-    static void draw(int64_t N, vector<T>* draws) {  \
-      test::random_real(N, 0, LOWER, UPPER, draws);  \
-    }                                                \
-  }
-
-PINT_DECL(UInt8, uint8_t, 0, UINT8_MAX);
-PINT_DECL(UInt16, uint16_t, 0, UINT16_MAX);
-PINT_DECL(UInt32, uint32_t, 0, UINT32_MAX);
-PINT_DECL(UInt64, uint64_t, 0, UINT64_MAX);
+PINT_DECL(UInt8, uint8_t);
+PINT_DECL(UInt16, uint16_t);
+PINT_DECL(UInt32, uint32_t);
+PINT_DECL(UInt64, uint64_t);
 
-PINT_DECL(Int8, int8_t, INT8_MIN, INT8_MAX);
-PINT_DECL(Int16, int16_t, INT16_MIN, INT16_MAX);
-PINT_DECL(Int32, int32_t, INT32_MIN, INT32_MAX);
-PINT_DECL(Int64, int64_t, INT64_MIN, INT64_MAX);
+PINT_DECL(Int8, int8_t);
+PINT_DECL(Int16, int16_t);
+PINT_DECL(Int32, int32_t);
+PINT_DECL(Int64, int64_t);
 
-PFLOAT_DECL(Float, float, -1000, 1000);
-PFLOAT_DECL(Double, double, -1000, 1000);
+PFLOAT_DECL(Float, float, -1000.0f, 1000.0f);
+PFLOAT_DECL(Double, double, -1000.0, 1000.0);
 
 struct PBoolean {
   PTYPE_DECL(Boolean, uint8_t)
@@ -324,6 +346,11 @@ void TestPrimitiveBuilder<Attrs>::RandomData(int64_t N, double pct_null) {
   test::random_null_bytes(N, pct_null, valid_bytes_.data());
 }
 
+template <>
+int64_t TestPrimitiveBuilder<PBoolean>::FlipValue(int64_t value) const {
+  return !value;
+}
+
 template <>
 void TestPrimitiveBuilder<PBoolean>::Check(const std::unique_ptr<BooleanBuilder>& builder,
                                            bool nullable) {
@@ -454,8 +481,8 @@ TYPED_TEST(TestPrimitiveBuilder, Equality) {
   const int64_t first_valid_idx = std::distance(valid_bytes.begin(), first_valid);
   // This should be true with a very high probability, but might introduce flakiness
   ASSERT_LT(first_valid_idx, size - 1);
-  draws[first_valid_idx] =
-      static_cast<T>(~*reinterpret_cast<int64_t*>(&draws[first_valid_idx]));
+  draws[first_valid_idx] = static_cast<T>(
+      this->FlipValue(*reinterpret_cast<int64_t*>(&draws[first_valid_idx])));
   ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &unequal_array));
 
   // test normal equality
@@ -724,8 +751,8 @@ void CheckSliceApproxEquals() {
   vector<double> draws2;
const uint32_t kSeed = 0; - test::random_real(kSize, kSeed, 0, 100, &draws1); - test::random_real(kSize, kSeed + 1, 0, 100, &draws2); + test::random_real(kSize, kSeed, 0.0, 100.0, &draws1); + test::random_real(kSize, kSeed + 1, 0.0, 100.0, &draws2); // Make the draws equal in the sliced segment, but unequal elsewhere (to // catch not using the slice offset) diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc index f2dd9e74e335d..a560f09d6fdb1 100644 --- a/cpp/src/arrow/ipc/ipc-json-test.cc +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -222,8 +222,8 @@ void MakeBatchArrays(const std::shared_ptr& schema, const int num_rows, std::vector v1_values; std::vector v2_values; - test::randint(num_rows, 0, 100, &v1_values); - test::randint(num_rows, 0, 100, &v2_values); + test::randint(num_rows, 0, 100, &v1_values); + test::randint(num_rows, 0, 100, &v2_values); std::shared_ptr v1; ArrayFromVector(is_valid, v1_values, &v1); diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 044fb9476ca73..7306f577a36e0 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -18,6 +18,7 @@ #ifndef ARROW_TEST_UTIL_H_ #define ARROW_TEST_UTIL_H_ +#include #include #include #include @@ -38,8 +39,8 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" +#include "arrow/util/decimal.h" #include "arrow/util/logging.h" -#include "arrow/util/random.h" #define ASSERT_RAISES(ENUM, expr) \ do { \ @@ -47,7 +48,7 @@ if (!s.Is##ENUM()) { \ FAIL() << s.ToString(); \ } \ - } while (0) + } while (false) #define ASSERT_OK(expr) \ do { \ @@ -55,7 +56,7 @@ if (!s.ok()) { \ FAIL() << s.ToString(); \ } \ - } while (0) + } while (false) #define ASSERT_OK_NO_THROW(expr) ASSERT_NO_THROW(ASSERT_OK(expr)) @@ -63,15 +64,15 @@ do { \ ::arrow::Status s = (expr); \ EXPECT_TRUE(s.ok()); \ - } while (0) + } while (false) #define ABORT_NOT_OK(s) \ do { \ ::arrow::Status _s = (s); \ if (ARROW_PREDICT_FALSE(!_s.ok())) { \ - exit(-1); \ + exit(EXIT_FAILURE); \ } \ - } while (0); + } while (false); namespace arrow { @@ -79,27 +80,22 @@ using ArrayVector = std::vector>; namespace test { -template -void randint(int64_t N, T lower, T upper, std::vector* out) { - Random rng(random_seed()); - uint64_t draw; - uint64_t span = upper - lower; - T val; - for (int64_t i = 0; i < N; ++i) { - draw = rng.Uniform64(span); - val = static_cast(draw + lower); - out->push_back(val); - } +template +void randint(int64_t N, T lower, T upper, std::vector* out) { + const int random_seed = 0; + std::mt19937 gen(random_seed); + std::uniform_int_distribution d(lower, upper); + out->resize(N, static_cast(0)); + std::generate(out->begin(), out->end(), [&d, &gen] { return static_cast(d(gen)); }); } -template +template void random_real(int64_t n, uint32_t seed, T min_value, T max_value, - std::vector* out) { + std::vector* out) { std::mt19937 gen(seed); std::uniform_real_distribution d(min_value, max_value); - for (int64_t i = 0; i < n; ++i) { - out->push_back(d(gen)); - } + out->resize(n, static_cast(0)); + std::generate(out->begin(), out->end(), [&d, &gen] { return static_cast(d(gen)); }); } template @@ -115,7 +111,8 @@ inline Status CopyBufferFromVector(const std::vector& values, MemoryPool* poo auto buffer = std::make_shared(pool); RETURN_NOT_OK(buffer->Resize(nbytes)); - memcpy(buffer->mutable_data(), values.data(), nbytes); + auto immutable_data = reinterpret_cast(values.data()); + std::copy(immutable_data, immutable_data + nbytes, buffer->mutable_data()); *result = buffer; 
return Status::OK(); @@ -143,56 +140,173 @@ static inline Status GetBitmapFromVector(const std::vector& is_valid, // Sets approximately pct_null of the first n bytes in null_bytes to zero // and the rest to non-zero (true) values. static inline void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes) { - Random rng(random_seed()); - for (int64_t i = 0; i < n; ++i) { - null_bytes[i] = rng.NextDoubleFraction() > pct_null; - } + const int random_seed = 0; + std::mt19937 gen(random_seed); + std::uniform_real_distribution d(0.0, 1.0); + std::generate(null_bytes, null_bytes + n, + [&d, &gen, &pct_null] { return d(gen) > pct_null; }); } static inline void random_is_valid(int64_t n, double pct_null, std::vector* is_valid) { - Random rng(random_seed()); - for (int64_t i = 0; i < n; ++i) { - is_valid->push_back(rng.NextDoubleFraction() > pct_null); - } + const int random_seed = 0; + std::mt19937 gen(random_seed); + std::uniform_real_distribution d(0.0, 1.0); + is_valid->resize(n, false); + std::generate(is_valid->begin(), is_valid->end(), + [&d, &gen, &pct_null] { return d(gen) > pct_null; }); } static inline void random_bytes(int64_t n, uint32_t seed, uint8_t* out) { std::mt19937 gen(seed); - std::uniform_int_distribution d(0, 255); + std::uniform_int_distribution d(0, std::numeric_limits::max()); + std::generate(out, out + n, [&d, &gen] { return static_cast(d(gen) & 0xFF); }); +} - for (int64_t i = 0; i < n; ++i) { - out[i] = static_cast(d(gen) & 0xFF); +static void DecimalRange(int32_t precision, Decimal128* min_decimal, + Decimal128* max_decimal) { + DCHECK_GE(precision, 1) << "decimal precision must be greater than or equal to 1, got " + << precision; + DCHECK_LE(precision, 38) << "decimal precision must be less than or equal to 38, got " + << precision; + + switch (precision) { + case 1: + case 2: + *max_decimal = std::numeric_limits::max(); + break; + case 3: + case 4: + *max_decimal = std::numeric_limits::max(); + break; + case 5: + case 6: + *max_decimal = 8388607; + break; + case 7: + case 8: + case 9: + *max_decimal = std::numeric_limits::max(); + break; + case 10: + case 11: + *max_decimal = 549755813887; + break; + case 12: + case 13: + case 14: + *max_decimal = 140737488355327; + break; + case 15: + case 16: + *max_decimal = 36028797018963967; + break; + case 17: + case 18: + *max_decimal = std::numeric_limits::max(); + break; + case 19: + case 20: + case 21: + *max_decimal = Decimal128("2361183241434822606847"); + break; + case 22: + case 23: + *max_decimal = Decimal128("604462909807314587353087"); + break; + case 24: + case 25: + case 26: + *max_decimal = Decimal128("154742504910672534362390527"); + break; + case 27: + case 28: + *max_decimal = Decimal128("39614081257132168796771975167"); + break; + case 29: + case 30: + case 31: + *max_decimal = Decimal128("10141204801825835211973625643007"); + break; + case 32: + case 33: + *max_decimal = Decimal128("2596148429267413814265248164610047"); + break; + case 34: + case 35: + *max_decimal = Decimal128("664613997892457936451903530140172287"); + break; + case 36: + case 37: + case 38: + *max_decimal = Decimal128("170141183460469231731687303715884105727"); + break; + default: + DCHECK(false); + break; } + + *min_decimal = ~(*max_decimal); } -static inline void random_ascii(int64_t n, uint32_t seed, uint8_t* out) { +class UniformDecimalDistribution { + public: + explicit UniformDecimalDistribution(int32_t precision) { + Decimal128 max_decimal; + Decimal128 min_decimal; + DecimalRange(precision, &min_decimal, &max_decimal); + + 
const auto min_low = static_cast(min_decimal.low_bits()); + const auto max_low = static_cast(max_decimal.low_bits()); + + const int64_t min_high = min_decimal.high_bits(); + const int64_t max_high = max_decimal.high_bits(); + + using param_type = std::uniform_int_distribution::param_type; + + lower_dist_.param(param_type(min_low, max_low)); + upper_dist_.param(param_type(min_high, max_high)); + } + + template + Decimal128 operator()(Generator& gen) { + return Decimal128(upper_dist_(gen), static_cast(lower_dist_(gen))); + } + + private: + // The lower bits distribution is intentionally int64_t. + // If it were uint64_t then the size of the interval [min_high, max_high] would be 0 + // because min_high > max_high due to 2's complement. + // So, we generate the same range of bits using int64_t and then cast to uint64_t. + std::uniform_int_distribution lower_dist_; + std::uniform_int_distribution upper_dist_; +}; + +static inline void random_decimals(int64_t n, uint32_t seed, int32_t precision, + uint8_t* out) { std::mt19937 gen(seed); - std::uniform_int_distribution d(65, 122); + UniformDecimalDistribution dist(precision); - for (int64_t i = 0; i < n; ++i) { - out[i] = static_cast(d(gen) & 0xFF); + for (int64_t i = 0; i < n; ++i, out += 16) { + const Decimal128 value(dist(gen)); + value.ToBytes(out); } } -template -void rand_uniform_int(int64_t n, uint32_t seed, T min_value, T max_value, T* out) { +template +void rand_uniform_int(int64_t n, uint32_t seed, T min_value, T max_value, U* out) { DCHECK(out || (n == 0)); std::mt19937 gen(seed); std::uniform_int_distribution d(min_value, max_value); - for (int64_t i = 0; i < n; ++i) { - out[i] = static_cast(d(gen)); - } + std::generate(out, out + n, [&d, &gen] { return static_cast(d(gen)); }); +} + +static inline void random_ascii(int64_t n, uint32_t seed, uint8_t* out) { + rand_uniform_int(n, seed, static_cast('A'), static_cast('z'), out); } static inline int64_t null_count(const std::vector& valid_bytes) { - int64_t result = 0; - for (size_t i = 0; i < valid_bytes.size(); ++i) { - if (valid_bytes[i] == 0) { - ++result; - } - } - return result; + return static_cast(std::count(valid_bytes.cbegin(), valid_bytes.cend(), '\0')); } Status MakeRandomInt32PoolBuffer(int64_t length, MemoryPool* pool, diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 5df5e748f39e5..7810a3be46da5 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -38,7 +38,6 @@ install(FILES logging.h macros.h parallel.h - random.h rle-encoding.h sse-util.h stl.h diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index 9d94bef847fa1..cc180258aa4df 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -43,14 +43,17 @@ Decimal128::Decimal128(const uint8_t* bytes) } std::array Decimal128::ToBytes() const { - const uint64_t raw[] = {BitUtil::ToLittleEndian(low_bits_), - BitUtil::ToLittleEndian(static_cast(high_bits_))}; - const auto* raw_data = reinterpret_cast(raw); std::array out{{0}}; - std::copy(raw_data, raw_data + out.size(), out.begin()); + ToBytes(out.data()); return out; } +void Decimal128::ToBytes(uint8_t* out) const { + DCHECK_NE(out, NULLPTR); + reinterpret_cast(out)[0] = BitUtil::ToLittleEndian(low_bits_); + reinterpret_cast(out)[1] = BitUtil::ToLittleEndian(high_bits_); +} + static constexpr Decimal128 kTenTo36(static_cast(0xC097CE7BC90715), 0xB34B9F1000000000); static constexpr Decimal128 kTenTo18(0xDE0B6B3A7640000); diff --git 
a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h index 487f222580201..a0423e9fce49c 100644 --- a/cpp/src/arrow/util/decimal.h +++ b/cpp/src/arrow/util/decimal.h @@ -102,13 +102,14 @@ class ARROW_EXPORT Decimal128 { Decimal128& operator>>=(uint32_t bits); /// \brief Get the high bits of the two's complement representation of the number. - int64_t high_bits() const { return high_bits_; } + inline int64_t high_bits() const { return high_bits_; } /// \brief Get the low bits of the two's complement representation of the number. - uint64_t low_bits() const { return low_bits_; } + inline uint64_t low_bits() const { return low_bits_; } /// \brief Return the raw bytes of the value in little-endian byte order. std::array ToBytes() const; + void ToBytes(uint8_t* out) const; /// \brief Convert the Decimal128 value to a base 10 decimal string with the given /// scale. diff --git a/cpp/src/arrow/util/random.h b/cpp/src/arrow/util/random.h deleted file mode 100644 index 2e05a73033d0f..0000000000000 --- a/cpp/src/arrow/util/random.h +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -// Moved from Kudu http://github.com/cloudera/kudu - -#ifndef ARROW_UTIL_RANDOM_H_ -#define ARROW_UTIL_RANDOM_H_ - -#include - -#include - -namespace arrow { -namespace internal { -namespace random { - -static const uint32_t M = 2147483647L; // 2^31-1 -const double kTwoPi = 6.283185307179586476925286; - -} // namespace random -} // namespace internal - -// A very simple random number generator. Not especially good at -// generating truly random bits, but good enough for our needs in this -// package. This implementation is not thread-safe. -class Random { - public: - explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { - // Avoid bad seeds. - if (seed_ == 0 || seed_ == internal::random::M) { - seed_ = 1; - } - } - - // Next pseudo-random 32-bit unsigned integer. - // FIXME: This currently only generates 31 bits of randomness. - // The MSB will always be zero. - uint32_t Next() { - static const uint64_t A = 16807; // bits 14, 8, 7, 5, 2, 1, 0 - // We are computing - // seed_ = (seed_ * A) % M, where M = 2^31-1 - // - // seed_ must not be zero or M, or else all subsequent computed values - // will be zero or M respectively. For all other values, seed_ will end - // up cycling through every number in [1,M-1] - uint64_t product = seed_ * A; - - // Compute (product % M) using the fact that ((x << 31) % M) == x. - seed_ = static_cast((product >> 31) + (product & internal::random::M)); - // The first reduction may overflow by 1 bit, so we may need to - // repeat. mod == M is not possible; using > allows the faster - // sign-bit-based test. - if (seed_ > internal::random::M) { - seed_ -= internal::random::M; - } - return seed_; - } - - // Alias for consistency with Next64 - uint32_t Next32() { return Next(); } - - // Next pseudo-random 64-bit unsigned integer. - // FIXME: This currently only generates 62 bits of randomness due to Next() - // only giving 31 bits of randomness. The 2 most significant bits will always - // be zero. - uint64_t Next64() { - uint64_t large = Next(); - // Only shift by 31 bits so we end up with zeros in MSB and not scattered - // throughout the 64-bit word. This is due to the weakness in Next() noted - // above. 
- large <<= 31; - large |= Next(); - return large; - } - - // Returns a uniformly distributed value in the range [0..n-1] - // REQUIRES: n > 0 - uint32_t Uniform(uint32_t n) { return Next() % n; } - - // Alias for consistency with Uniform64 - uint32_t Uniform32(uint32_t n) { return Uniform(n); } - - // Returns a uniformly distributed 64-bit value in the range [0..n-1] - // REQUIRES: n > 0 - uint64_t Uniform64(uint64_t n) { return Next64() % n; } - - // Randomly returns true ~"1/n" of the time, and false otherwise. - // REQUIRES: n > 0 - bool OneIn(int n) { return (Next() % n) == 0; } - - // Skewed: pick "base" uniformly from range [0,max_log] and then - // return "base" random bits. The effect is to pick a number in the - // range [0,2^max_log-1] with exponential bias towards smaller numbers. - uint32_t Skewed(int max_log) { return Uniform(1 << Uniform(max_log + 1)); } - - // Creates a normal distribution variable using the - // Box-Muller transform. See: - // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform - // Adapted from WebRTC source code at: - // webrtc/trunk/modules/video_coding/main/test/test_util.cc - double Normal(double mean, double std_dev) { - double uniform1 = (Next() + 1.0) / (internal::random::M + 1.0); - double uniform2 = (Next() + 1.0) / (internal::random::M + 1.0); - return (mean + - std_dev * sqrt(-2 * ::log(uniform1)) * - cos(internal::random::kTwoPi * uniform2)); - } - - // Return a random number between 0.0 and 1.0 inclusive. - double NextDoubleFraction() { - return Next() / static_cast(internal::random::M + 1.0); - } - - private: - uint32_t seed_; -}; - -uint32_t random_seed() { - // TODO(wesm): use system time to get a reasonably random seed - return 0; -} - -} // namespace arrow - -#endif // ARROW_UTIL_RANDOM_H_ From 99ea353dbaf15b8db1ad7d8d4419643abe99189d Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 6 Nov 2017 22:24:19 -0500 Subject: [PATCH 022/177] ARROW-1771: [C++] ARROW-1749 Breaks Public API test in parquet-cpp Author: Phillip Cloud Closes #1288 from cpcloud/ARROW-1771 and squashes the following commits: dc002568 [Phillip Cloud] Formatting again ac2367b4 [Phillip Cloud] Fix formatting 35e67e50 [Phillip Cloud] Add public API test 659e5165 [Phillip Cloud] Fix formatting de4632ac [Phillip Cloud] ARROW-1771: [C++] ARROW-1749 Breaks Public API test in parquet-cpp --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/public-api-test.cc | 26 ++++++++++++++++++++++++++ cpp/src/arrow/util/bit-util.h | 3 +-- 3 files changed, 28 insertions(+), 2 deletions(-) create mode 100644 cpp/src/arrow/public-api-test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 22b475146da7c..69d5052330c3a 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -171,6 +171,7 @@ ADD_ARROW_TEST(array-test) ADD_ARROW_TEST(buffer-test) ADD_ARROW_TEST(memory_pool-test) ADD_ARROW_TEST(pretty_print-test) +ADD_ARROW_TEST(public-api-test) ADD_ARROW_TEST(status-test) ADD_ARROW_TEST(type-test) ADD_ARROW_TEST(table-test) diff --git a/cpp/src/arrow/public-api-test.cc b/cpp/src/arrow/public-api-test.cc new file mode 100644 index 0000000000000..8298d748fe8fd --- /dev/null +++ b/cpp/src/arrow/public-api-test.cc @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/api.h" + +#ifdef DCHECK +#error "DCHECK should not be visible from Arrow public headers." +#endif + +#include + +TEST(_, _) {} diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index d6415f3c75b7b..cab3c9ee703c9 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -57,7 +57,6 @@ #include #include -#include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/type_traits.h" #include "arrow/util/visibility.h" @@ -301,7 +300,7 @@ static inline int Log2(uint64_t x) { /// \brief Count the number of leading zeros in a 32 bit integer. static inline int64_t CountLeadingZeros(uint32_t value) { - DCHECK_NE(value, 0); +// DCHECK_NE(value, 0); #if defined(__clang__) || defined(__GNUC__) return static_cast(__builtin_clz(value)); #elif defined(_MSC_VER) From 3995eb3c1d94ab8fd151c32a5d1994a6b085deec Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 6 Nov 2017 22:24:51 -0500 Subject: [PATCH 023/177] ARROW-1768: [Python] Fix suppressed exception in ParquetWriter.__del__ This closes [ARROW-1768](https://issues.apache.org/jira/projects/ARROW/issues/ARROW-1768). Author: Wes McKinney Author: Licht-T Closes #1286 from Licht-T/fix-suppressed-exception-in-parquetwriter-del and squashes the following commits: ccd7344d [Wes McKinney] Use getattr to be robust to is_open attribute not existing 4a42683d [Licht-T] Fix suppressed exception in ParquetWriter.__del__ --- python/pyarrow/parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 9dcc30c8af479..9e0749bb35c9e 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -260,7 +260,7 @@ def __init__(self, where, schema, flavor=None, self.is_open = True def __del__(self): - if self.is_open: + if getattr(self, 'is_open', False): self.close() def write_table(self, table, row_group_size=None): From e631119d853a3182fa4e2d8cd980440f59c7a679 Mon Sep 17 00:00:00 2001 From: Victor Uriarte Date: Tue, 7 Nov 2017 11:57:43 -0700 Subject: [PATCH 024/177] [Format] Fix link to Flatbuffers project in IPC.md Close #1291 Change-Id: Ibae55c472b08cedcf22bd783c7a7fce1449051fe --- format/IPC.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/format/IPC.md b/format/IPC.md index f3b48854c2072..5a5d3aef62be7 100644 --- a/format/IPC.md +++ b/format/IPC.md @@ -252,5 +252,5 @@ shared memory region) to be a multiple of 8: [1]: https://github.com/apache/arrow/blob/master/format/File.fbs [2]: https://github.com/apache/arrow/blob/master/format/Message.fbs -[3]: https://github.com/google]/flatbuffers +[3]: https://github.com/google/flatbuffers [4]: https://github.com/apache/arrow/blob/master/format/Layout.md From 3188d70202795d8e0a8092ec5685d859b02e366d Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Tue, 7 Nov 2017 16:22:20 -0500 Subject: [PATCH 025/177] ARROW-1716: [Format/JSON] Use string integer value 
for Decimals in JSON Author: Phillip Cloud Closes #1267 from cpcloud/ARROW-1716 and squashes the following commits: b4f3aed2 [Phillip Cloud] Add cases for every valid precision f8d4391f [Phillip Cloud] Use the full range of decimal values in integration tests 6fef5f71 [Phillip Cloud] ARROW-1716: [Format/JSON] Use string integer value for Decimals in JSON --- cpp/src/arrow/ipc/ipc-read-write-test.cc | 4 +- cpp/src/arrow/ipc/json-internal.cc | 69 +++++++++--- cpp/src/arrow/ipc/test-common.h | 13 ++- integration/integration_test.py | 105 ++++++++++-------- .../vector/file/json/JsonFileReader.java | 9 +- .../vector/file/json/JsonFileWriter.java | 8 +- .../arrow/vector/util/DecimalUtility.java | 32 +++++- 7 files changed, 165 insertions(+), 75 deletions(-) diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/ipc-read-write-test.cc index 6f2f5cf856055..40cd3f0eef0b8 100644 --- a/cpp/src/arrow/ipc/ipc-read-write-test.cc +++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc @@ -727,7 +727,7 @@ TEST_F(TestTensorRoundTrip, BasicRoundtrip) { int64_t size = 24; std::vector values; - test::randint(size, 0, 100, &values); + test::randint(size, 0, 100, &values); auto data = test::GetBufferFromVector(values); @@ -748,7 +748,7 @@ TEST_F(TestTensorRoundTrip, NonContiguous) { ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &mmap_)); std::vector values; - test::randint(24, 0, 100, &values); + test::randint(24, 0, 100, &values); auto data = test::GetBufferFromVector(values); Tensor tensor(int64(), data, {4, 3}, {48, 16}); diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 025f6c276541e..c1c0661d6ad35 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -33,6 +33,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" +#include "arrow/util/decimal.h" #include "arrow/util/logging.h" #include "arrow/util/string.h" #include "arrow/visitor_inline.h" @@ -448,7 +449,8 @@ class ArrayWriter { } void WriteDataValues(const FixedSizeBinaryArray& arr) { - int32_t width = arr.byte_width(); + const int32_t width = arr.byte_width(); + for (int64_t i = 0; i < arr.length(); ++i) { const uint8_t* buf = arr.GetValue(i); std::string encoded = HexEncode(buf, width); @@ -456,6 +458,13 @@ class ArrayWriter { } } + void WriteDataValues(const DecimalArray& arr) { + for (int64_t i = 0; i < arr.length(); ++i) { + const Decimal128 value(arr.GetValue(i)); + writer_->String(value.ToIntegerString()); + } + } + void WriteDataValues(const BooleanArray& arr) { for (int i = 0; i < arr.length(); ++i) { writer_->Bool(arr.Value(i)); @@ -1053,7 +1062,9 @@ class ArrayReader { } template - typename std::enable_if::value, Status>::type + typename std::enable_if::value && + !std::is_base_of::value, + Status>::type Visit(const T& type) { typename TypeTraits::BuilderType builder(type_, pool_); @@ -1073,22 +1084,52 @@ class ArrayReader { for (int i = 0; i < length_; ++i) { if (!is_valid_[i]) { RETURN_NOT_OK(builder.AppendNull()); - continue; - } + } else { + const rj::Value& val = json_data_arr[i]; + DCHECK(val.IsString()) + << "Found non-string JSON value when parsing FixedSizeBinary value"; + std::string hex_string = val.GetString(); + if (static_cast(hex_string.size()) != byte_width * 2) { + DCHECK(false) << "Expected size: " << byte_width * 2 + << " got: " << hex_string.size(); + } + const char* hex_data = hex_string.c_str(); - const rj::Value& val = json_data_arr[i]; - DCHECK(val.IsString()); - std::string 
hex_string = val.GetString(); - if (static_cast(hex_string.size()) != byte_width * 2) { - DCHECK(false) << "Expected size: " << byte_width * 2 - << " got: " << hex_string.size(); + for (int32_t j = 0; j < byte_width; ++j) { + RETURN_NOT_OK(ParseHexValue(hex_data + j * 2, &byte_buffer_data[j])); + } + RETURN_NOT_OK(builder.Append(byte_buffer_data)); } - const char* hex_data = hex_string.c_str(); + } + return builder.Finish(&result_); + } + + template + typename std::enable_if::value, Status>::type Visit( + const T& type) { + typename TypeTraits::BuilderType builder(type_, pool_); + + const auto& json_data = obj_->FindMember("DATA"); + RETURN_NOT_ARRAY("DATA", json_data, *obj_); - for (int32_t j = 0; j < byte_width; ++j) { - RETURN_NOT_OK(ParseHexValue(hex_data + j * 2, &byte_buffer_data[j])); + const auto& json_data_arr = json_data->value.GetArray(); + + DCHECK_EQ(static_cast(json_data_arr.Size()), length_); + + for (int i = 0; i < length_; ++i) { + if (!is_valid_[i]) { + RETURN_NOT_OK(builder.AppendNull()); + } else { + const rj::Value& val = json_data_arr[i]; + DCHECK(val.IsString()) + << "Found non-string JSON value when parsing Decimal128 value"; + DCHECK_GT(val.GetStringLength(), 0) + << "Empty string found when parsing Decimal128 value"; + + Decimal128 value; + RETURN_NOT_OK(Decimal128::FromString(val.GetString(), &value)); + RETURN_NOT_OK(builder.Append(value)); } - RETURN_NOT_OK(builder.Append(byte_buffer_data)); } return builder.Finish(&result_); } diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index b2137b7dbef6a..91023db489852 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -671,8 +671,11 @@ Status MakeFWBinary(std::shared_ptr* out) { } Status MakeDecimal(std::shared_ptr* out) { - auto f0 = field("f0", decimal(19, 4)); - auto schema = ::arrow::schema({f0, f0}); + constexpr int kDecimalPrecision = 38; + auto type = decimal(kDecimalPrecision, 4); + auto f0 = field("f0", type); + auto f1 = field("f1", type); + auto schema = ::arrow::schema({f0, f1}); constexpr int kDecimalSize = 16; constexpr int length = 10; @@ -682,7 +685,7 @@ Status MakeDecimal(std::shared_ptr* out) { RETURN_NOT_OK(AllocateBuffer(default_memory_pool(), kDecimalSize * length, &data)); - test::random_bytes(kDecimalSize * length, 0, data->mutable_data()); + test::random_decimals(length, 1, kDecimalPrecision, data->mutable_data()); test::random_null_bytes(length, 0.1, is_valid_bytes.data()); RETURN_NOT_OK(BitUtil::BytesToBits(is_valid_bytes, default_memory_pool(), &is_valid)); @@ -690,10 +693,10 @@ Status MakeDecimal(std::shared_ptr* out) { auto a1 = std::make_shared(f0->type(), length, data, is_valid, kUnknownNullCount); - auto a2 = std::make_shared(f0->type(), length, data); + auto a2 = std::make_shared(f1->type(), length, data); ArrayVector arrays = {a1, a2}; - *out = std::make_shared(schema, a1->length(), arrays); + *out = std::make_shared(schema, length, arrays); return Status::OK(); } diff --git a/integration/integration_test.py b/integration/integration_test.py index 59a1de5a4639d..205176eccc11a 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -65,24 +65,16 @@ def rands(nchars): return ''.join(np.random.choice(RANDS_CHARS, nchars)) -if six.PY2: - def frombytes(o): - return o +def tobytes(o): + if isinstance(o, six.text_type): + return o.encode('utf8') + return o - def tobytes(o): - if isinstance(o, unicode): - return o.encode('utf8') - else: - return o -else: - def tobytes(o): - if isinstance(o, str): - 
return o.encode('utf8') - else: - return o - def frombytes(o): +def frombytes(o): + if isinstance(o, six.binary_type): return o.decode('utf8') + return o # from the merge_arrow_pr.py script @@ -177,7 +169,7 @@ def _get_type_layout(self): class PrimitiveColumn(Column): def __init__(self, name, count, is_valid, values): - Column.__init__(self, name, count) + super(PrimitiveColumn, self).__init__(name, count) self.is_valid = is_valid self.values = values @@ -191,15 +183,16 @@ def _get_buffers(self): ] -TEST_INT_MIN = - 2**31 + 1 -TEST_INT_MAX = 2**31 - 1 +TEST_INT_MAX = 2 ** 31 - 1 +TEST_INT_MIN = ~TEST_INT_MAX + class IntegerType(PrimitiveType): def __init__(self, name, is_signed, bit_width, nullable=True, min_value=TEST_INT_MIN, max_value=TEST_INT_MAX): - PrimitiveType.__init__(self, name, nullable=nullable) + super(IntegerType, self).__init__(name, nullable=nullable) self.is_signed = is_signed self.bit_width = bit_width self.min_value = min_value @@ -239,9 +232,11 @@ class DateType(IntegerType): MILLISECOND = 1 def __init__(self, name, unit, nullable=True): - self.unit = unit bit_width = 32 if unit == self.DAY else 64 - IntegerType.__init__(self, name, True, bit_width, nullable=nullable) + super(DateType, self).__init__( + name, True, bit_width, nullable=nullable + ) + self.unit = unit def _get_type(self): return OrderedDict([ @@ -268,9 +263,10 @@ class TimeType(IntegerType): } def __init__(self, name, unit='s', nullable=True): + super(TimeType, self).__init__( + name, True, self.BIT_WIDTHS[unit], nullable=nullable + ) self.unit = unit - IntegerType.__init__(self, name, True, self.BIT_WIDTHS[unit], - nullable=nullable) def _get_type(self): return OrderedDict([ @@ -283,9 +279,9 @@ def _get_type(self): class TimestampType(IntegerType): def __init__(self, name, unit='s', tz=None, nullable=True): + super(TimestampType, self).__init__(name, True, 64, nullable=nullable) self.unit = unit self.tz = tz - IntegerType.__init__(self, name, True, 64, nullable=nullable) def _get_type(self): fields = [ @@ -302,7 +298,7 @@ def _get_type(self): class FloatingPointType(PrimitiveType): def __init__(self, name, bit_width, nullable=True): - PrimitiveType.__init__(self, name, nullable=nullable) + super(FloatingPointType, self).__init__(name, nullable=nullable) self.bit_width = bit_width self.precision = { @@ -331,13 +327,30 @@ def generate_column(self, size, name=None): return PrimitiveColumn(name, size, is_valid, values) -class DecimalType(PrimitiveType): - def __init__(self, name, bit_width, precision, scale, nullable=True): - PrimitiveType.__init__(self, name, nullable=True) +DECIMAL_PRECISION_TO_VALUE = { + key: (1 << (8 * i - 1)) - 1 for i, key in enumerate( + [1, 3, 5, 7, 10, 12, 15, 17, 19, 22, 24, 27, 29, 32, 34, 36], + start=1, + ) +} - self.bit_width = bit_width + +def decimal_range_from_precision(precision): + assert 1 <= precision <= 38 + try: + max_value = DECIMAL_PRECISION_TO_VALUE[precision] + except KeyError: + return decimal_range_from_precision(precision - 1) + else: + return ~max_value, max_value + + +class DecimalType(PrimitiveType): + def __init__(self, name, precision, scale, bit_width=128, nullable=True): + super(DecimalType, self).__init__(name, nullable=nullable) self.precision = precision self.scale = scale + self.bit_width = bit_width @property def numpy_type(self): @@ -359,7 +372,8 @@ def _get_type_layout(self): ('typeBitWidth', self.bit_width)])])]) def generate_column(self, size, name=None): - values = [random.randint(0, 2**self.bit_width - 1) for x in range(size)] + min_value, 
max_value = decimal_range_from_precision(self.precision) + values = [random.randint(min_value, max_value) for _ in range(size)] is_valid = self._make_is_valid(size) if name is None: @@ -369,14 +383,12 @@ def generate_column(self, size, name=None): class DecimalColumn(PrimitiveColumn): - def __init__(self, name, count, is_valid, values, bit_width): - PrimitiveColumn.__init__(self, name, count, is_valid, values) + def __init__(self, name, count, is_valid, values, bit_width=128): + super(DecimalColumn, self).__init__(name, count, is_valid, values) self.bit_width = bit_width - self.hex_width = bit_width / 4 def _encode_value(self, x): - hex_format_str = '%%0%dx' % self.hex_width - return (hex_format_str % x).upper() + return str(x) class BooleanType(PrimitiveType): @@ -510,7 +522,7 @@ def _encode_value(self, x): class ListType(DataType): def __init__(self, name, value_type, nullable=True): - DataType.__init__(self, name, nullable=nullable) + super(ListType, self).__init__(name, nullable=nullable) self.value_type = value_type def _get_type(self): @@ -553,7 +565,7 @@ def generate_column(self, size, name=None): class ListColumn(Column): def __init__(self, name, count, is_valid, offsets, values): - Column.__init__(self, name, count) + super(ListColumn, self).__init__(name, count) self.is_valid = is_valid self.offsets = offsets self.values = values @@ -571,7 +583,7 @@ def _get_children(self): class StructType(DataType): def __init__(self, name, field_types, nullable=True): - DataType.__init__(self, name, nullable=nullable) + super(StructType, self).__init__(name, nullable=nullable) self.field_types = field_types def _get_type(self): @@ -620,7 +632,7 @@ def get_json(self): class DictionaryType(DataType): def __init__(self, name, index_type, dictionary, nullable=True): - DataType.__init__(self, name, nullable=nullable) + super(DictionaryType, self).__init__(name, nullable=nullable) assert isinstance(index_type, IntegerType) assert isinstance(dictionary, Dictionary) @@ -655,7 +667,7 @@ def generate_column(self, size, name=None): class StructColumn(Column): def __init__(self, name, count, is_valid, field_values): - Column.__init__(self, name, count) + super(StructColumn, self).__init__(name, count) self.is_valid = is_valid self.field_values = field_values @@ -758,11 +770,12 @@ def generate_primitive_case(batch_sizes): def generate_decimal_case(): fields = [ - DecimalType('f1', 128, 24, 10, True), - DecimalType('f2', 128, 32, -10, True) + DecimalType(name='f{}'.format(i), precision=precision, scale=2) + for i, precision in enumerate(range(3, 39)) ] - batch_sizes = [7, 10] + possible_batch_sizes = 7, 10 + batch_sizes = [possible_batch_sizes[i % 2] for i in range(len(fields))] return _generate_file('decimal', fields, batch_sizes) @@ -867,8 +880,9 @@ def run(self): def _compare_implementations(self, producer, consumer): print('##########################################################') - print('{0} producing, {1} consuming'.format(producer.name, - consumer.name)) + print( + '{0} producing, {1} consuming'.format(producer.name, consumer.name) + ) print('##########################################################') for json_path in self.json_files: @@ -1033,6 +1047,7 @@ def run_all_tests(debug=False): runner.run() print('-- All tests passed!') + if __name__ == '__main__': parser = argparse.ArgumentParser(description='Arrow integration test CLI') parser.add_argument('--debug', dest='debug', action='store_true', diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java 
b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java index c6ebd61aa07b9..e1c7c909f10be 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java @@ -27,6 +27,8 @@ import java.io.File; import java.io.IOException; +import java.math.BigDecimal; +import java.math.BigInteger; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -332,9 +334,10 @@ private void setValueFromParser(ValueVector valueVector, int i) throws IOExcepti ((Float8Vector) valueVector).getMutator().set(i, parser.readValueAs(Double.class)); break; case DECIMAL: { - DecimalVector decimalVector = ((DecimalVector) valueVector); - byte[] value = decodeHexSafe(parser.readValueAs(String.class)); - DecimalUtility.writeByteArrayToArrowBuf(value, decimalVector.getBuffer(), i); + DecimalVector decimalVector = (DecimalVector) valueVector; + // Here we assume the decimal value is the unscaled integer value as a string + BigDecimal decimalValue = new BigDecimal(parser.readValueAs(String.class)); + DecimalUtility.writeBigDecimalToArrowBuf(decimalValue, decimalVector.getBuffer(), i); } break; case VARBINARY: diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java index 04e44379e5dfa..05341bec44ea3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java @@ -20,6 +20,7 @@ import java.io.File; import java.io.IOException; +import java.math.BigDecimal; import java.util.ArrayList; import java.util.HashSet; import java.util.List; @@ -48,6 +49,7 @@ import org.apache.arrow.vector.dictionary.Dictionary; import org.apache.arrow.vector.dictionary.DictionaryProvider; import org.apache.arrow.vector.schema.ArrowVectorType; +import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; @@ -242,9 +244,9 @@ private void writeValueToGenerator(ValueVector valueVector, int i) throws IOExce } break; case DECIMAL: { - ArrowBuf bytebuf = valueVector.getDataBuffer(); - String hexString = Hex.encodeHexString(DecimalUtility.getByteArrayFromArrowBuf(bytebuf, i)); - generator.writeString(hexString); + BigDecimal decimalValue = ((DecimalVector) valueVector).getAccessor().getObject(i); + // We write the unscaled value, because the scale is stored in the type metadata. 
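+        // (The JsonFileReader change above does the inverse: it parses this string
+        //  back with new BigDecimal(...) and reapplies the vector's scale from the
+        //  type metadata when the stored bytes are read.)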
+ generator.writeString(decimalValue.unscaledValue().toString()); } break; default: diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java index 033ae6c09914d..acf7c58a1337f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java @@ -142,8 +142,18 @@ public static StringBuilder toStringWithZeroes(long number, int desiredLength) { */ public static BigDecimal getBigDecimalFromArrowBuf(ArrowBuf bytebuf, int index, int scale) { byte[] value = new byte[DECIMAL_BYTE_LENGTH]; + byte temp; final int startIndex = index * DECIMAL_BYTE_LENGTH; + + // Decimal stored as little endian, need to swap bytes to make BigDecimal bytebuf.getBytes(startIndex, value, 0, DECIMAL_BYTE_LENGTH); + int stop = DECIMAL_BYTE_LENGTH / 2; + for (int i = 0, j; i < stop; i++) { + temp = value[i]; + j = (DECIMAL_BYTE_LENGTH - 1) - i; + value[i] = value[j]; + value[j] = temp; + } BigInteger unscaledValue = new BigInteger(value); return new BigDecimal(unscaledValue, scale); } @@ -212,10 +222,26 @@ private static void writeByteArrayToArrowBuf(byte[] bytes, ArrowBuf bytebuf, int if (bytes.length > DECIMAL_BYTE_LENGTH) { throw new UnsupportedOperationException("Decimal size greater than 16 bytes"); } - final int padLength = DECIMAL_BYTE_LENGTH - bytes.length; - for (int i = 0; i < padLength; i++) { + + // Decimal stored as little endian, need to swap data bytes before writing to ArrowBuf + byte[] bytesLE = new byte[bytes.length]; + int stop = bytes.length / 2; + for (int i = 0, j; i < stop; i++) { + j = (bytes.length - 1) - i; + bytesLE[i] = bytes[j]; + bytesLE[j] = bytes[i]; + } + if (bytes.length % 2 != 0) { + int i = (bytes.length / 2); + bytesLE[i] = bytes[i]; + } + + // Write LE data + bytebuf.setBytes(startIndex, bytesLE, 0, bytes.length); + + // Write padding after data + for (int i = bytes.length; i < DECIMAL_BYTE_LENGTH; i++) { bytebuf.setByte(startIndex + i, padValue); } - bytebuf.setBytes(startIndex + padLength, bytes, 0, bytes.length); } } From bfc0f24fcc58f3885c2175c64864cc12af95f938 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Wed, 8 Nov 2017 14:44:26 +0100 Subject: [PATCH 026/177] ARROW-1776: [C++] Define arrow::gpu::CudaContext::bytes_allocated() Author: Kouhei Sutou Closes #1293 from kou/cpp-gpu-cuda-context-bytes-allocated and squashes the following commits: 5221887 [Kouhei Sutou] [C++] Define arrow::gpu::CudaContext::bytes_allocated() --- cpp/src/arrow/gpu/cuda-test.cc | 1 + cpp/src/arrow/gpu/cuda_context.cc | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/cpp/src/arrow/gpu/cuda-test.cc b/cpp/src/arrow/gpu/cuda-test.cc index afdc3020e8581..7595f8bec7912 100644 --- a/cpp/src/arrow/gpu/cuda-test.cc +++ b/cpp/src/arrow/gpu/cuda-test.cc @@ -55,6 +55,7 @@ TEST_F(TestCudaBuffer, Allocate) { std::shared_ptr buffer; ASSERT_OK(context_->Allocate(kSize, &buffer)); ASSERT_EQ(kSize, buffer->size()); + ASSERT_EQ(kSize, context_->bytes_allocated()); } void AssertCudaBufferEquals(const CudaBuffer& buffer, const uint8_t* host_data, diff --git a/cpp/src/arrow/gpu/cuda_context.cc b/cpp/src/arrow/gpu/cuda_context.cc index 42315cd509c3c..fff8ece6c1cff 100644 --- a/cpp/src/arrow/gpu/cuda_context.cc +++ b/cpp/src/arrow/gpu/cuda_context.cc @@ -64,6 +64,7 @@ class CudaContext::CudaContextImpl { CUdeviceptr data; CU_RETURN_NOT_OK(cuMemAlloc(&data, static_cast(nbytes))); + bytes_allocated_ += nbytes; *out = 
reinterpret_cast(data); return Status::OK(); } @@ -85,6 +86,7 @@ class CudaContext::CudaContextImpl { Status Free(uint8_t* device_ptr, int64_t nbytes) { CU_RETURN_NOT_OK(cuMemFree(reinterpret_cast(device_ptr))); + bytes_allocated_ -= nbytes; return Status::OK(); } @@ -273,5 +275,7 @@ Status CudaContext::OpenIpcBuffer(const CudaIpcMemHandle& ipc_handle, return Status::OK(); } +int64_t CudaContext::bytes_allocated() const { return impl_->bytes_allocated(); } + } // namespace gpu } // namespace arrow From 252a2a55fc8b80a6412922987b42d64fe41119ef Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Thu, 9 Nov 2017 00:24:59 +0900 Subject: [PATCH 027/177] [GLib] Fix a typo in document --- c_glib/arrow-glib/array-builder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_glib/arrow-glib/array-builder.cpp b/c_glib/arrow-glib/array-builder.cpp index 86e7f985be4a4..a5df681421c25 100644 --- a/c_glib/arrow-glib/array-builder.cpp +++ b/c_glib/arrow-glib/array-builder.cpp @@ -316,7 +316,7 @@ garrow_array_builder_new(const std::shared_ptr &type, * * Release ownership of `arrow::ArrayBuilder` in `builder`. * - * Since: 0.8.8 + * Since: 0.8.0 */ void garrow_array_builder_release_ownership(GArrowArrayBuilder *builder) From 78872a1be263e61d7901eb36663a184c2b04effb Mon Sep 17 00:00:00 2001 From: Stephanie Date: Wed, 8 Nov 2017 14:13:28 -0800 Subject: [PATCH 028/177] ARROW-1775: Ability to abort created but unsealed Plasma objects Author: Stephanie Author: Philipp Moritz Closes #1289 from stephanie-wang/abort-objects and squashes the following commits: 38c42b9 [Stephanie] TODO for PascalCase 08d4040 [Stephanie] Move documentation dd5b29e [Stephanie] Fix memory error e6934ac [Philipp Moritz] fix linting 2b8e385 [Stephanie] Return status code when unmapping object fe20b3b [Stephanie] Add test case for PlasmaClient::Abort 646190c [Stephanie] Abort objects that were not sealed when client disconnects 5fc44c5 [Stephanie] Implement PlasmaClient::Abort --- cpp/src/plasma/client.cc | 97 +++++++++++++++++++++++------ cpp/src/plasma/client.h | 19 ++++++ cpp/src/plasma/format/plasma.fbs | 12 ++++ cpp/src/plasma/protocol.cc | 28 +++++++++ cpp/src/plasma/protocol.h | 8 +++ cpp/src/plasma/store.cc | 23 ++++++- cpp/src/plasma/store.h | 3 + cpp/src/plasma/test/client_tests.cc | 44 +++++++++++++ 8 files changed, 213 insertions(+), 21 deletions(-) diff --git a/cpp/src/plasma/client.cc b/cpp/src/plasma/client.cc index e57a2a6f3008c..dd32bdc8149a3 100644 --- a/cpp/src/plasma/client.cc +++ b/cpp/src/plasma/client.cc @@ -278,6 +278,39 @@ Status PlasmaClient::Get(const ObjectID* object_ids, int64_t num_objects, return Status::OK(); } +Status PlasmaClient::UnmapObject(const ObjectID& object_id) { + auto object_entry = objects_in_use_.find(object_id); + ARROW_CHECK(object_entry != objects_in_use_.end()); + ARROW_CHECK(object_entry->second->count == 0); + + // Decrement the count of the number of objects in this memory-mapped file + // that the client is using. The corresponding increment should have + // happened in plasma_get. + int fd = object_entry->second->object.handle.store_fd; + auto entry = mmap_table_.find(fd); + ARROW_CHECK(entry != mmap_table_.end()); + ARROW_CHECK(entry->second.count >= 1); + if (entry->second.count == 1) { + // If no other objects are being used, then unmap the file. + int err = munmap(entry->second.pointer, entry->second.length); + if (err == -1) { + return Status::IOError("Error during munmap"); + } + // Remove the corresponding entry from the hash table. 
+ mmap_table_.erase(fd); + } else { + // If there are other objects being used, decrement the reference count. + entry->second.count -= 1; + } + // Update the in_use_object_bytes_. + in_use_object_bytes_ -= (object_entry->second->object.data_size + + object_entry->second->object.metadata_size); + DCHECK_GE(in_use_object_bytes_, 0); + // Remove the entry from the hash table of objects currently in use. + objects_in_use_.erase(object_id); + return Status::OK(); +} + /// This is a helper method for implementing plasma_release. We maintain a /// buffer /// of release calls and only perform them once the buffer becomes full (as @@ -297,28 +330,9 @@ Status PlasmaClient::PerformRelease(const ObjectID& object_id) { ARROW_CHECK(object_entry->second->count >= 0); // Check if the client is no longer using this object. if (object_entry->second->count == 0) { - // Decrement the count of the number of objects in this memory-mapped file - // that the client is using. The corresponding increment should have - // happened in plasma_get. - int fd = object_entry->second->object.handle.store_fd; - auto entry = mmap_table_.find(fd); - ARROW_CHECK(entry != mmap_table_.end()); - entry->second.count -= 1; - ARROW_CHECK(entry->second.count >= 0); - // If none are being used then unmap the file. - if (entry->second.count == 0) { - munmap(entry->second.pointer, entry->second.length); - // Remove the corresponding entry from the hash table. - mmap_table_.erase(fd); - } // Tell the store that the client no longer needs the object. + RETURN_NOT_OK(UnmapObject(object_id)); RETURN_NOT_OK(SendReleaseRequest(store_conn_, object_id)); - // Update the in_use_object_bytes_. - in_use_object_bytes_ -= (object_entry->second->object.data_size + - object_entry->second->object.metadata_size); - DCHECK_GE(in_use_object_bytes_, 0); - // Remove the entry from the hash table of objects currently in use. - objects_in_use_.erase(object_id); } return Status::OK(); } @@ -344,6 +358,20 @@ Status PlasmaClient::Release(const ObjectID& object_id) { return Status::OK(); } +Status PlasmaClient::FlushReleaseHistory() { + // If the client is already disconnected, ignore the flush. + if (store_conn_ < 0) { + return Status::OK(); + } + while (release_history_.size() > 0) { + // Perform a release for the object ID for the first pending release. + RETURN_NOT_OK(PerformRelease(release_history_.back())); + // Remove the last entry from the release history. + release_history_.pop_back(); + } + return Status::OK(); +} + // This method is used to query whether the plasma store contains an object. Status PlasmaClient::Contains(const ObjectID& object_id, bool* has_object) { // Check if we already have a reference to the object. @@ -443,6 +471,35 @@ Status PlasmaClient::Seal(const ObjectID& object_id) { return Release(object_id); } +Status PlasmaClient::Abort(const ObjectID& object_id) { + auto object_entry = objects_in_use_.find(object_id); + ARROW_CHECK(object_entry != objects_in_use_.end()) + << "Plasma client called abort on an object without a reference to it"; + ARROW_CHECK(!object_entry->second->is_sealed) + << "Plasma client called abort on a sealed object"; + + // Flush the release history. + RETURN_NOT_OK(FlushReleaseHistory()); + // Make sure that the Plasma client only has one reference to the object. If + // it has more, then the client needs to release the buffer before calling + // abort. + if (object_entry->second->count > 1) { + return Status::Invalid("Plasma client must not have more than one reference to the object when aborting it."); + } + + // Send the abort request. 
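+  // (The reply is read back synchronously below, so once Abort() returns the
+  //  store has already processed the abort.)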
+ RETURN_NOT_OK(SendAbortRequest(store_conn_, object_id)); + // Decrease the reference count to zero, then remove the object. + object_entry->second->count--; + RETURN_NOT_OK(UnmapObject(object_id)); + + std::vector buffer; + ObjectID id; + int64_t type; + RETURN_NOT_OK(ReadMessage(store_conn_, &type, &buffer)); + return ReadAbortReply(buffer.data(), buffer.size(), &id); +} + Status PlasmaClient::Delete(const ObjectID& object_id) { // TODO(rkn): In the future, we can use this method to give hints to the // eviction policy about when an object will no longer be needed. diff --git a/cpp/src/plasma/client.h b/cpp/src/plasma/client.h index 145942441c9f1..89df2b0b0039f 100644 --- a/cpp/src/plasma/client.h +++ b/cpp/src/plasma/client.h @@ -152,6 +152,15 @@ class ARROW_EXPORT PlasmaClient { /// \return The return status. Status Contains(const ObjectID& object_id, bool* has_object); + /// Abort an unsealed object in the object store. If the abort succeeds, then + /// it will be as if the object was never created at all. The unsealed object + /// must have only a single reference (the one that would have been removed by + /// calling Seal). + /// + /// \param object_id The ID of the object to abort. + /// \return The return status. + Status Abort(const ObjectID& object_id); + /// Seal an object in the object store. The object will be immutable after /// this /// call. @@ -307,6 +316,16 @@ class ARROW_EXPORT PlasmaClient { int get_manager_fd(); private: + /// This is a helper method for unmapping objects for which all references have + /// gone out of scope, either by calling Release or Abort. + /// + /// @param object_id The object ID whose data we should unmap. + Status UnmapObject(const ObjectID& object_id); + + /// This is a helper method that flushes all pending release calls to the + /// store. + Status FlushReleaseHistory(); + Status PerformRelease(const ObjectID& object_id); uint8_t* lookup_or_mmap(int fd, int store_fd_val, int64_t map_size); diff --git a/cpp/src/plasma/format/plasma.fbs b/cpp/src/plasma/format/plasma.fbs index 23782ade539d4..b6d03b8a3c10d 100644 --- a/cpp/src/plasma/format/plasma.fbs +++ b/cpp/src/plasma/format/plasma.fbs @@ -21,6 +21,8 @@ enum MessageType:int { // Create a new object. PlasmaCreateRequest = 1, PlasmaCreateReply, + PlasmaAbortRequest, + PlasmaAbortReply, // Seal an object. PlasmaSealRequest, PlasmaSealReply, @@ -113,6 +115,16 @@ table PlasmaCreateReply { error: PlasmaError; } +table PlasmaAbortRequest { + // ID of the object to be aborted. + object_id: string; +} + +table PlasmaAbortReply { + // ID of the object that was aborted. + object_id: string; +} + table PlasmaSealRequest { // ID of the object to be sealed. 
object_id: string; diff --git a/cpp/src/plasma/protocol.cc b/cpp/src/plasma/protocol.cc index 2261b6a624a8c..c0ebb88fe5019 100644 --- a/cpp/src/plasma/protocol.cc +++ b/cpp/src/plasma/protocol.cc @@ -100,6 +100,34 @@ Status ReadCreateReply(uint8_t* data, size_t size, ObjectID* object_id, return plasma_error_status(message->error()); } +Status SendAbortRequest(int sock, ObjectID object_id) { + flatbuffers::FlatBufferBuilder fbb; + auto message = CreatePlasmaAbortRequest(fbb, fbb.CreateString(object_id.binary())); + return PlasmaSend(sock, MessageType_PlasmaAbortRequest, &fbb, message); +} + +Status ReadAbortRequest(uint8_t* data, size_t size, ObjectID* object_id) { + DCHECK(data); + auto message = flatbuffers::GetRoot<PlasmaAbortRequest>(data); + DCHECK(verify_flatbuffer(message, data, size)); + *object_id = ObjectID::from_binary(message->object_id()->str()); + return Status::OK(); +} + +Status SendAbortReply(int sock, ObjectID object_id) { + flatbuffers::FlatBufferBuilder fbb; + auto message = CreatePlasmaAbortReply(fbb, fbb.CreateString(object_id.binary())); + return PlasmaSend(sock, MessageType_PlasmaAbortReply, &fbb, message); +} + +Status ReadAbortReply(uint8_t* data, size_t size, ObjectID* object_id) { + DCHECK(data); + auto message = flatbuffers::GetRoot<PlasmaAbortReply>(data); + DCHECK(verify_flatbuffer(message, data, size)); + *object_id = ObjectID::from_binary(message->object_id()->str()); + return Status::OK(); +} + // Seal messages. Status SendSealRequest(int sock, ObjectID object_id, unsigned char* digest) { diff --git a/cpp/src/plasma/protocol.h b/cpp/src/plasma/protocol.h index af4b13978c697..e8c334f9181fc 100644 --- a/cpp/src/plasma/protocol.h +++ b/cpp/src/plasma/protocol.h @@ -51,6 +51,14 @@ Status SendCreateReply(int sock, ObjectID object_id, PlasmaObject* object, int e Status ReadCreateReply(uint8_t* data, size_t size, ObjectID* object_id, PlasmaObject* object); +Status SendAbortRequest(int sock, ObjectID object_id); + +Status ReadAbortRequest(uint8_t* data, size_t size, ObjectID* object_id); + +Status SendAbortReply(int sock, ObjectID object_id); + +Status ReadAbortReply(uint8_t* data, size_t size, ObjectID* object_id); + /* Plasma Seal message functions. */ Status SendSealRequest(int sock, ObjectID object_id, unsigned char* digest); diff --git a/cpp/src/plasma/store.cc b/cpp/src/plasma/store.cc index 210cce16238f8..5dbdebc237ce6 100644 --- a/cpp/src/plasma/store.cc +++ b/cpp/src/plasma/store.cc @@ -393,6 +393,18 @@ void PlasmaStore::seal_object(const ObjectID& object_id, unsigned char digest[]) update_object_get_requests(object_id); } +void PlasmaStore::abort_object(const ObjectID& object_id) { + auto entry = get_object_table_entry(&store_info_, object_id); + ARROW_CHECK(entry != NULL) << "To abort an object, it must be in the object table."; + ARROW_CHECK(entry->state != PLASMA_SEALED) + << "To abort an object, it must not have been sealed."; + ARROW_CHECK(entry->clients.size() == 1) + << "To abort an object, the creator must be the only client currently using it."; + + dlfree(entry->pointer); + store_info_.objects.erase(object_id); +} + void PlasmaStore::delete_objects(const std::vector<ObjectID>& object_ids) { for (const auto& object_id : object_ids) { ARROW_LOG(DEBUG) << "deleting object " << object_id.hex(); @@ -443,7 +455,11 @@ void PlasmaStore::disconnect_client(int client_fd) { // If this client was using any objects, remove it from the appropriate // lists. 
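  // An unsealed object is aborted outright rather than released, since no other
  // client can have obtained a reference to an object that was never sealed.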
for (const auto& entry : store_info_.objects) { - remove_client_from_object_clients(entry.second.get(), it->second.get()); + if (entry.second->state == PLASMA_SEALED) { + remove_client_from_object_clients(entry.second.get(), it->second.get()); + } else { + abort_object(entry.first); + } } // Note, the store may still attempt to send a message to the disconnected @@ -582,6 +598,11 @@ Status PlasmaStore::process_message(Client* client) { warn_if_sigpipe(send_fd(client->fd, object.handle.store_fd), client->fd); } } break; + case MessageType_PlasmaAbortRequest: { + RETURN_NOT_OK(ReadAbortRequest(input, input_size, &object_id)); + abort_object(object_id); + HANDLE_SIGPIPE(SendAbortReply(client->fd, object_id), client->fd); + } break; case MessageType_PlasmaGetRequest: { std::vector object_ids_to_get; int64_t timeout_ms; diff --git a/cpp/src/plasma/store.h b/cpp/src/plasma/store.h index d03d11f4ef0c4..0d08d8a67ffaa 100644 --- a/cpp/src/plasma/store.h +++ b/cpp/src/plasma/store.h @@ -48,6 +48,7 @@ struct Client { class PlasmaStore { public: + // TODO: PascalCase PlasmaStore methods. PlasmaStore(EventLoop* loop, int64_t system_memory, std::string directory, bool hugetlbfs_enabled); @@ -73,6 +74,8 @@ class PlasmaStore { int create_object(const ObjectID& object_id, int64_t data_size, int64_t metadata_size, Client* client, PlasmaObject* result); + void abort_object(const ObjectID& object_id); + /// Delete objects that have been created in the hash table. This should only /// be called on objects that are returned by the eviction policy to evict. /// diff --git a/cpp/src/plasma/test/client_tests.cc b/cpp/src/plasma/test/client_tests.cc index 0f19da5f72342..5c0cee4c071ad 100644 --- a/cpp/src/plasma/test/client_tests.cc +++ b/cpp/src/plasma/test/client_tests.cc @@ -127,6 +127,50 @@ TEST_F(TestPlasmaStore, MultipleGetTest) { ASSERT_EQ(object_buffer[1].data[0], 2); } +TEST_F(TestPlasmaStore, AbortTest) { + ObjectID object_id = ObjectID::from_random(); + ObjectBuffer object_buffer; + + // Test for object non-existence. + ARROW_CHECK_OK(client_.Get(&object_id, 1, 0, &object_buffer)); + ASSERT_EQ(object_buffer.data_size, -1); + + // Test object abort. + // First create object. + int64_t data_size = 4; + uint8_t metadata[] = {5}; + int64_t metadata_size = sizeof(metadata); + uint8_t* data; + ARROW_CHECK_OK(client_.Create(object_id, data_size, metadata, metadata_size, &data)); + // Write some data. + for (int64_t i = 0; i < data_size / 2; i++) { + data[i] = static_cast(i % 4); + } + // Attempt to abort. Test that this fails before the first release. + Status status = client_.Abort(object_id); + ASSERT_TRUE(status.IsInvalid()); + // Release, then abort. + ARROW_CHECK_OK(client_.Release(object_id)); + ARROW_CHECK_OK(client_.Abort(object_id)); + + // Test for object non-existence after the abort. + ARROW_CHECK_OK(client_.Get(&object_id, 1, 0, &object_buffer)); + ASSERT_EQ(object_buffer.data_size, -1); + + // Create the object successfully this time. + ARROW_CHECK_OK(client_.Create(object_id, data_size, metadata, metadata_size, &data)); + for (int64_t i = 0; i < data_size; i++) { + data[i] = static_cast(i % 4); + } + ARROW_CHECK_OK(client_.Seal(object_id)); + + // Test that we can get the object. 
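+  // (A timeout of -1 means block until the object is available.)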
+ ARROW_CHECK_OK(client_.Get(&object_id, 1, -1, &object_buffer)); + for (int64_t i = 0; i < data_size; i++) { + ASSERT_EQ(data[i], object_buffer.data[i]); + } +} + } // namespace plasma int main(int argc, char** argv) { From dffa486c86d1d09c16e4c52ad0ff78bbee22c4e1 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Wed, 8 Nov 2017 20:50:51 -0500 Subject: [PATCH 029/177] ARROW-1709: [C++] Decimal.ToString is incorrect for negative scale This is on top of ARROW-1716. Will rebase when that's merged. Author: Phillip Cloud Closes #1292 from cpcloud/ARROW-1709 and squashes the following commits: 13ef9a32 [Phillip Cloud] Fix formatting e87e88f0 [Phillip Cloud] Cleanup 3431b667 [Phillip Cloud] Skip leading negative sign in dcheck 9874d84d [Phillip Cloud] Use lambda 3a5e3f6f [Phillip Cloud] DCHECK and format 2dde6d2d [Phillip Cloud] ARROW-1709: [C++] Decimal.ToString is incorrect for negative scale --- cpp/src/arrow/util/decimal-test.cc | 75 +++++++++++++++++++++ cpp/src/arrow/util/decimal.cc | 104 +++++++++++++++++++++++------ 2 files changed, 159 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/util/decimal-test.cc b/cpp/src/arrow/util/decimal-test.cc index b0271fff15ccf..0d0c08cc4551f 100644 --- a/cpp/src/arrow/util/decimal-test.cc +++ b/cpp/src/arrow/util/decimal-test.cc @@ -18,6 +18,7 @@ #include #include +#include #include @@ -291,4 +292,78 @@ TEST(Decimal128Test, PrintMinValue) { ASSERT_EQ(string_value, printed_value); } +class Decimal128PrintingTest + : public ::testing::TestWithParam> {}; + +TEST_P(Decimal128PrintingTest, Print) { + int32_t test_value; + int32_t scale; + std::string expected_string; + std::tie(test_value, scale, expected_string) = GetParam(); + const Decimal128 value(test_value); + const std::string printed_value = value.ToString(scale); + ASSERT_EQ(expected_string, printed_value); +} + +INSTANTIATE_TEST_CASE_P(Decimal128PrintingTest, Decimal128PrintingTest, + ::testing::Values(std::make_tuple(123, 1, "12.3"), + std::make_tuple(123, 5, "0.00123"), + std::make_tuple(123, 10, "1.23E-8"), + std::make_tuple(123, -1, "1.23E+3"), + std::make_tuple(-123, -1, "-1.23E+3"), + std::make_tuple(123, -3, "1.23E+5"), + std::make_tuple(-123, -3, "-1.23E+5"), + std::make_tuple(12345, -3, "1.2345E+7"))); + +class Decimal128ParsingTest + : public ::testing::TestWithParam> {}; + +TEST_P(Decimal128ParsingTest, Parse) { + std::string test_string; + uint64_t expected_low_bits; + int32_t expected_scale; + std::tie(test_string, expected_low_bits, expected_scale) = GetParam(); + Decimal128 value; + int32_t scale; + ASSERT_OK(Decimal128::FromString(test_string, &value, NULLPTR, &scale)); + ASSERT_EQ(value.low_bits(), expected_low_bits); + ASSERT_EQ(expected_scale, scale); +} + +INSTANTIATE_TEST_CASE_P(Decimal128ParsingTest, Decimal128ParsingTest, + ::testing::Values(std::make_tuple("12.3", 123ULL, 1), + std::make_tuple("0.00123", 123ULL, 5), + std::make_tuple("1.23E-8", 123ULL, 10), + std::make_tuple("-1.23E-8", -123LL, 10), + std::make_tuple("1.23E+3", 123ULL, -1), + std::make_tuple("-1.23E+3", -123LL, -1), + std::make_tuple("1.23E+5", 123ULL, -3), + std::make_tuple("1.2345E+7", 12345ULL, -3), + std::make_tuple("1.23e-8", 123ULL, 10), + std::make_tuple("-1.23e-8", -123LL, 10), + std::make_tuple("1.23e+3", 123ULL, -1), + std::make_tuple("-1.23e+3", -123LL, -1), + std::make_tuple("1.23e+5", 123ULL, -3), + std::make_tuple("1.2345e+7", 12345ULL, -3))); + +class Decimal128ParsingTestInvalid : public ::testing::TestWithParam {}; + +TEST_P(Decimal128ParsingTestInvalid, Parse) { + std::string 
test_string = GetParam(); + Decimal128 value; + ASSERT_RAISES(Invalid, Decimal128::FromString(test_string, &value)); +} + +INSTANTIATE_TEST_CASE_P(Decimal128ParsingTestInvalid, Decimal128ParsingTestInvalid, + ::testing::Values("0.00123D/3", "1.23eA8", "1.23E+3A", + "-1.23E--5", "1.2345E+++07")); + +TEST(Decimal128ParseTest, WithExponentAndNullptrScale) { + Decimal128 value; + ASSERT_OK(Decimal128::FromString("1.23E-8", &value)); + + const Decimal128 expected_value(123); + ASSERT_EQ(expected_value, value); +} + } // namespace arrow diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index cc180258aa4df..447cae5c54654 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -105,6 +105,22 @@ Decimal128::operator int64_t() const { return static_cast(low_bits_); } +static std::string ToStringNegativeScale(const std::string& str, + int32_t adjusted_exponent, bool is_negative) { + std::stringstream buf; + + size_t offset = 0; + buf << str[offset++]; + + if (is_negative) { + buf << str[offset++]; + } + + buf << '.' << str.substr(offset, std::string::npos) << 'E' << std::showpos + << adjusted_exponent; + return buf.str(); +} + std::string Decimal128::ToString(int32_t scale) const { const std::string str(ToIntegerString()); @@ -112,9 +128,18 @@ std::string Decimal128::ToString(int32_t scale) const { return str; } - if (*this < 0) { - const auto len = static_cast(str.size()); + const bool is_negative = *this < 0; + const auto len = static_cast(str.size()); + const auto is_negative_offset = static_cast(is_negative); + const int32_t adjusted_exponent = -scale + (len - 1 - is_negative_offset); + + /// Note that the -6 is taken from the Java BigDecimal documentation. + if (scale < 0 || adjusted_exponent < -6) { + return ToStringNegativeScale(str, adjusted_exponent, is_negative); + } + + if (is_negative) { if (len - 1 > scale) { const auto n = static_cast(len - scale); return str.substr(0, n) + "." + str.substr(n, static_cast(scale)); @@ -128,8 +153,6 @@ std::string Decimal128::ToString(int32_t scale) const { return result + str.substr(1, std::string::npos); } - const auto len = static_cast(str.size()); - if (len > scale) { const auto n = static_cast(len - scale); return str.substr(0, n) + "." 
+ str.substr(n, static_cast(scale)); @@ -164,10 +187,12 @@ static constexpr int64_t kPowersOfTen[kInt64DecimalDigits + 1] = {1LL, 100000000000000000LL, 1000000000000000000LL}; +static inline bool isdigit(char value) { return std::isdigit(value) != 0; } + static void StringToInteger(const std::string& str, Decimal128* out) { using std::size_t; - DCHECK_NE(out, nullptr) << "Decimal128 output variable cannot be nullptr"; + DCHECK_NE(out, NULLPTR) << "Decimal128 output variable cannot be NULLPTR"; DCHECK_EQ(*out, 0) << "When converting a string to Decimal128 the initial output must be 0"; @@ -189,7 +214,7 @@ static void StringToInteger(const std::string& str, Decimal128* out) { Status Decimal128::FromString(const std::string& s, Decimal128* out, int* precision, int* scale) { - // Implements this regex: "(\\+?|-?)((0*)(\\d*))(\\.(\\d+))?"; + // Implements this regex: "(\\+?|-?)((0*)(\\d*))(\\.(\\d+))?((E|e)(\\+|-)?\\d+)?"; if (s.empty()) { return Status::Invalid("Empty string cannot be converted to decimal"); } @@ -215,21 +240,21 @@ Status Decimal128::FromString(const std::string& s, Decimal128* out, int* precis DCHECK_LT(charp, end); // skip leading zeros - charp = std::find_if_not(charp, end, [](char c) { return c == '0'; }); + charp = std::find_if_not(charp, end, [](char value) { return value == '0'; }); // all zeros and no decimal point if (charp == end) { - if (out != nullptr) { + if (out != NULLPTR) { *out = 0; } // Not sure what other libraries assign precision to for this case (this case of // a string consisting only of one or more zeros) - if (precision != nullptr) { + if (precision != NULLPTR) { *precision = static_cast(charp - numeric_string_start); } - if (scale != nullptr) { + if (scale != NULLPTR) { *scale = 0; } @@ -238,7 +263,7 @@ Status Decimal128::FromString(const std::string& s, Decimal128* out, int* precis std::string::const_iterator whole_part_start = charp; - charp = std::find_if_not(charp, end, [](char c) { return std::isdigit(c) != 0; }); + charp = std::find_if_not(charp, end, isdigit); std::string::const_iterator whole_part_end = charp; std::string whole_part(whole_part_start, whole_part_end); @@ -269,14 +294,13 @@ Status Decimal128::FromString(const std::string& s, Decimal128* out, int* precis std::string::const_iterator fractional_part_start = charp; - // The rest must be digits, because if we have a decimal point it must be followed by - // digits + // The rest must be digits or an exponent if (charp != end) { - charp = std::find_if_not(charp, end, [](char c) { return std::isdigit(c) != 0; }); + charp = std::find_if_not(charp, end, isdigit); // The while loop has ended before the end of the string which means we've hit a - // character that isn't a base ten digit - if (charp != end) { + // character that isn't a base ten digit or "E" for exponent + if (charp != end && *charp != 'E' && *charp != 'e') { std::stringstream ss; ss << "Found non base ten digit character '" << *charp << "' before the end of the string"; @@ -287,15 +311,55 @@ Status Decimal128::FromString(const std::string& s, Decimal128* out, int* precis std::string::const_iterator fractional_part_end = charp; std::string fractional_part(fractional_part_start, fractional_part_end); - if (precision != nullptr) { + if (precision != NULLPTR) { *precision = static_cast(whole_part.size() + fractional_part.size()); } - if (scale != nullptr) { - *scale = static_cast(fractional_part.size()); + if (charp != end) { + // we must have an exponent, if this aborts then we have somehow not caught this and + // raised a 
proper error + DCHECK(*charp == 'E' || *charp == 'e'); + + ++charp; + + const char value = *charp; + const bool starts_with_plus_or_minus = value == '+' || value == '-'; + + // we use this to construct the adjusted exponent integer later + std::string::const_iterator digit_start = charp; + + // skip plus or minus + charp += starts_with_plus_or_minus; + + // confirm that the rest of the characters are digits + charp = std::find_if_not(charp, end, isdigit); + + if (charp != end) { + // we have something other than digits here + std::stringstream ss; + ss << "Found non decimal digit exponent value '" << *charp << "'"; + return Status::Invalid(ss.str()); + } + + if (scale != NULLPTR) { + // compute the scale from the adjusted exponent + std::string adjusted_exponent_string(digit_start, end); + DCHECK(std::all_of(adjusted_exponent_string.cbegin() + starts_with_plus_or_minus, + adjusted_exponent_string.cend(), isdigit)) + << "Non decimal digit character found in " << adjusted_exponent_string; + const auto adjusted_exponent = + static_cast<int32_t>(std::stol(adjusted_exponent_string)); + const auto len = static_cast<int32_t>(whole_part.size() + fractional_part.size()); + + *scale = -adjusted_exponent + len - 1; + } + } else { + if (scale != NULLPTR) { + *scale = static_cast<int32_t>(fractional_part.size()); + } } - if (out != nullptr) { + if (out != NULLPTR) { // zero out in case we've passed in a previously used value *out = 0; StringToInteger(whole_part + fractional_part, out); From 65a9055c705e5f09c949c12365d12839ace063f5 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 9 Nov 2017 11:31:36 -0500 Subject: [PATCH 030/177] ARROW-972: UnionArray in pyarrow This is taking a stab at exposing UnionArray to pyarrow. Tasks to be done: - [x] Support UnionType::SPARSE - [x] Add doc strings Author: Philipp Moritz Author: Wes McKinney Closes #1216 from pcmoritz/pyarrow-union-array and squashes the following commits: 7f3ca313 [Wes McKinney] Fix flakes 9f33076b [Wes McKinney] Change UnionMode to scoped enumeration 9e602a8d [Philipp Moritz] wrap UnionType in pyarrow eeef7226 [Philipp Moritz] linting 502c335a [Philipp Moritz] fixes c6c85491 [Philipp Moritz] add doc strings 9068bbb5 [Philipp Moritz] linting d8da0170 [Philipp Moritz] implement dense and sparse UnionArrays cbdedc7a [Philipp Moritz] make fields in UnionArray unique to be compatible with Java b796ce64 [Philipp Moritz] Implement UnionArray in pyarrow --- cpp/src/arrow/array.cc | 56 ++++++++++++++++++++++++++ cpp/src/arrow/array.h | 33 ++++++++++++++- cpp/src/arrow/compare.cc | 2 +- cpp/src/arrow/ipc/json-internal.cc | 2 +- cpp/src/arrow/ipc/metadata-internal.cc | 5 ++- cpp/src/arrow/type.cc | 18 ++++++++- cpp/src/arrow/type.h | 18 ++++++--- python/pyarrow/__init__.py | 4 +- python/pyarrow/_parquet.pxd | 2 +- python/pyarrow/array.pxi | 53 ++++++++++++++++++++++++ python/pyarrow/includes/libarrow.pxd | 25 ++++++++++++ python/pyarrow/lib.pxd | 16 ++++++++ python/pyarrow/lib.pyx | 2 + python/pyarrow/public-api.pxi | 2 +- python/pyarrow/scalar.pxi | 19 +++++++++ python/pyarrow/tests/test_array.py | 22 ++++++++++ python/pyarrow/tests/test_schema.py | 8 ++++ python/pyarrow/tests/test_types.py | 13 +++--- python/pyarrow/types.pxi | 49 ++++++++++++++++++++++ 19 files changed, 327 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index b523876bf0e4e..9c91d619cc7e8 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -393,6 +393,62 @@ UnionArray::UnionArray(const std::shared_ptr<DataType>& type, int64_t length, 
SetData(internal_data); } +Status UnionArray::MakeDense(const Array& type_ids, const Array& value_offsets, + const std::vector<std::shared_ptr<Array>>& children, + std::shared_ptr<Array>* out) { + if (value_offsets.length() == 0) { + return Status::Invalid("UnionArray offsets must have non-zero length"); + } + + if (value_offsets.type_id() != Type::INT32) { + return Status::Invalid("UnionArray offsets must be signed int32"); + } + + if (type_ids.type_id() != Type::INT8) { + return Status::Invalid("UnionArray type_ids must be signed int8"); + } + + if (value_offsets.null_count() != 0) { + return Status::Invalid("MakeDense does not allow NAs in value_offsets"); + } + + BufferVector buffers = {type_ids.null_bitmap(), + static_cast<const Int8Array&>(type_ids).values(), + static_cast<const Int32Array&>(value_offsets).values()}; + auto union_type = union_(children, UnionMode::DENSE); + auto internal_data = + std::make_shared<ArrayData>(union_type, type_ids.length(), std::move(buffers), + type_ids.null_count(), type_ids.offset()); + for (const auto& child : children) { + internal_data->child_data.push_back(child->data()); + } + *out = std::make_shared<UnionArray>(internal_data); + return Status::OK(); +} + +Status UnionArray::MakeSparse(const Array& type_ids, + const std::vector<std::shared_ptr<Array>>& children, + std::shared_ptr<Array>* out) { + if (type_ids.type_id() != Type::INT8) { + return Status::Invalid("UnionArray type_ids must be signed int8"); + } + BufferVector buffers = {type_ids.null_bitmap(), + static_cast<const Int8Array&>(type_ids).values(), nullptr}; + auto union_type = union_(children, UnionMode::SPARSE); + auto internal_data = + std::make_shared<ArrayData>(union_type, type_ids.length(), std::move(buffers), + type_ids.null_count(), type_ids.offset()); + for (const auto& child : children) { + internal_data->child_data.push_back(child->data()); + if (child->length() != type_ids.length()) { + return Status::Invalid( + "Sparse UnionArray must have len(child) == len(type_ids) for all children"); + } + } + *out = std::make_shared<UnionArray>(internal_data); + return Status::OK(); +} + std::shared_ptr<Array> UnionArray::child(int i) const { if (!boxed_fields_[i]) { boxed_fields_[i] = MakeArray(data_->child_data[i]); diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index afbd780dd3ad5..f7762ce104398 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -612,16 +612,47 @@ class ARROW_EXPORT UnionArray : public Array { const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, int64_t null_count = 0, int64_t offset = 0); + /// \brief Construct Dense UnionArray from type_ids, value_offsets and children + /// + /// This function does the bare minimum of validation of the offsets and + /// input types. The value_offsets are assumed to be well-formed. + /// + /// \param[in] type_ids An array of 8-bit signed integers, enumerated from + /// 0 corresponding to each type. + /// \param[in] value_offsets An array of signed int32 values indicating the + /// relative offset into the respective child array for the type in a given slot. + /// The respective offsets for each child value array must be in increasing order. + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[out] out Will have length equal to value_offsets.length() + static Status MakeDense(const Array& type_ids, const Array& value_offsets, + const std::vector<std::shared_ptr<Array>>& children, + std::shared_ptr<Array>* out); + + /// \brief Construct Sparse UnionArray from type_ids and children + /// + /// This function does the bare minimum of validation of the input types. 
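+  /// Unlike MakeDense, no value_offsets are needed; every child must instead
+  /// have the same length as type_ids.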
+ /// + /// \param[in] type_ids An array of 8-bit signed integers, enumerated from + /// 0 corresponding to each type. + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[out] out Will have length equal to type_ids.length() + static Status MakeSparse(const Array& type_ids, + const std::vector>& children, + std::shared_ptr* out); + /// Note that this buffer does not account for any slice offset std::shared_ptr type_ids() const { return data_->buffers[1]; } /// Note that this buffer does not account for any slice offset std::shared_ptr value_offsets() const { return data_->buffers[2]; } + int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } + const type_id_t* raw_type_ids() const { return raw_type_ids_ + data_->offset; } const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } - UnionMode mode() const { return static_cast(*type()).mode(); } + UnionMode::type mode() const { return static_cast(*type()).mode(); } std::shared_ptr child(int pos) const; diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 2ec86c3695aa5..a2d4de7b73afb 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -152,7 +152,7 @@ class RangeEqualsVisitor { bool CompareUnions(const UnionArray& left) const { const auto& right = static_cast(right_); - const UnionMode union_mode = left.mode(); + const UnionMode::type union_mode = left.mode(); if (union_mode != right.mode()) { return false; } diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index c1c0661d6ad35..1b9baee7dafef 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -774,7 +774,7 @@ static Status GetUnion(const RjObject& json_type, RETURN_NOT_STRING("mode", it_mode, json_type); std::string mode_str = it_mode->value.GetString(); - UnionMode mode; + UnionMode::type mode; if (mode_str == "SPARSE") { mode = UnionMode::SPARSE; diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index f0f0f675853b1..63ef8a549f236 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -163,8 +163,9 @@ static Status StructToFlatbuffer(FBB& fbb, const DataType& type, static Status UnionFromFlatbuffer(const flatbuf::Union* union_data, const std::vector>& children, std::shared_ptr* out) { - UnionMode mode = union_data->mode() == flatbuf::UnionMode_Sparse ? UnionMode::SPARSE - : UnionMode::DENSE; + UnionMode::type mode = + (union_data->mode() == flatbuf::UnionMode_Sparse ? 
UnionMode::SPARSE + : UnionMode::DENSE); std::vector type_codes; diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index a9bf591918558..0d1985fb2d914 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -190,7 +190,7 @@ std::string TimestampType::ToString() const { // Union type UnionType::UnionType(const std::vector>& fields, - const std::vector& type_codes, UnionMode mode) + const std::vector& type_codes, UnionMode::type mode) : NestedType(Type::UNION), mode_(mode), type_codes_(type_codes) { children_ = fields; } @@ -440,10 +440,24 @@ std::shared_ptr struct_(const std::vector>& fie } std::shared_ptr union_(const std::vector>& child_fields, - const std::vector& type_codes, UnionMode mode) { + const std::vector& type_codes, + UnionMode::type mode) { return std::make_shared(child_fields, type_codes, mode); } +std::shared_ptr union_(const std::vector>& children, + UnionMode::type mode) { + std::vector> types; + std::vector type_codes; + uint8_t counter = 0; + for (const auto& child : children) { + types.push_back(field(std::to_string(counter), child->type())); + type_codes.push_back(counter); + counter++; + } + return union_(types, type_codes, mode); +} + std::shared_ptr dictionary(const std::shared_ptr& index_type, const std::shared_ptr& dict_values, bool ordered) { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 446f4d3a0b33f..9e11a034420e5 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -517,14 +517,17 @@ class ARROW_EXPORT DecimalType : public FixedSizeBinaryType { int32_t scale_; }; -enum class UnionMode : char { SPARSE, DENSE }; +struct UnionMode { + enum type { SPARSE, DENSE }; +}; class ARROW_EXPORT UnionType : public NestedType { public: static constexpr Type::type type_id = Type::UNION; UnionType(const std::vector>& fields, - const std::vector& type_codes, UnionMode mode = UnionMode::SPARSE); + const std::vector& type_codes, + UnionMode::type mode = UnionMode::SPARSE); std::string ToString() const override; std::string name() const override { return "union"; } @@ -534,10 +537,10 @@ class ARROW_EXPORT UnionType : public NestedType { const std::vector& type_codes() const { return type_codes_; } - UnionMode mode() const { return mode_; } + UnionMode::type mode() const { return mode_; } private: - UnionMode mode_; + UnionMode::type mode_; // The type id used in the data to indicate each data type in the union. 
For // example, the first type in the union might be denoted by the id 5 (instead @@ -842,7 +845,12 @@ struct_(const std::vector<std::shared_ptr<Field>>& fields); /// \brief Create an instance of Union type std::shared_ptr<DataType> ARROW_EXPORT union_(const std::vector<std::shared_ptr<Field>>& child_fields, - const std::vector<uint8_t>& type_codes, UnionMode mode = UnionMode::SPARSE); + const std::vector<uint8_t>& type_codes, UnionMode::type mode = UnionMode::SPARSE); + +/// \brief Create an instance of Union type +std::shared_ptr<DataType> ARROW_EXPORT +union_(const std::vector<std::shared_ptr<Array>>& children, + UnionMode::type mode = UnionMode::SPARSE); /// \brief Create an instance of Dictionary type std::shared_ptr<DataType> ARROW_EXPORT diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 1215c822d2e47..2d7d7288b3835 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -36,7 +36,7 @@ time32, time64, timestamp, date32, date64, float16, float32, float64, binary, string, decimal, - list_, struct, dictionary, field, + list_, struct, union, dictionary, field, type_for_alias, DataType, NAType, Field, @@ -52,7 +52,7 @@ Int16Array, UInt16Array, Int32Array, UInt32Array, Int64Array, UInt64Array, - ListArray, + ListArray, UnionArray, BinaryArray, StringArray, FixedSizeBinaryArray, DictionaryArray, diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 04a5b1368ce45..7e5e575096ddd 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -192,7 +192,7 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: int64_t num_values() const shared_ptr[ColumnPath] path_in_schema() const bint is_stats_set() const - shared_ptr[CRowGroupStatistics] statistics() const; + shared_ptr[CRowGroupStatistics] statistics() const ParquetCompression compression() const const vector[ParquetEncoding]& encodings() const diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 7752d062a774c..9991411e55dfe 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -631,6 +631,58 @@ cdef class ListArray(Array): return pyarrow_wrap_array(out) +cdef class UnionArray(Array): + + @staticmethod + def from_dense(Array types, Array value_offsets, list children): + """ + Construct dense UnionArray from arrays of int8 types, int32 offsets and + children arrays + + Parameters + ---------- + types : Array (int8 type) + value_offsets : Array (int32 type) + children : list + + Returns + ------- + union_array : UnionArray + """ + cdef shared_ptr[CArray] out + cdef vector[shared_ptr[CArray]] c + cdef Array child + for child in children: + c.push_back(child.sp_array) + with nogil: + check_status(CUnionArray.MakeDense( + deref(types.ap), deref(value_offsets.ap), c, &out)) + return pyarrow_wrap_array(out) + + @staticmethod + def from_sparse(Array types, list children): + """ + Construct sparse UnionArray from arrays of int8 types and children + arrays + + Parameters + ---------- + types : Array (int8 type) + children : list + + Returns + ------- + union_array : UnionArray + """ + cdef shared_ptr[CArray] out + cdef vector[shared_ptr[CArray]] c + cdef Array child + for child in children: + c.push_back(child.sp_array) + with nogil: + check_status(CUnionArray.MakeSparse(deref(types.ap), c, &out)) + return pyarrow_wrap_array(out) + cdef class StringArray(Array): pass @@ -789,6 +841,7 @@ cdef dict _array_classes = { _Type_FLOAT: FloatArray, _Type_DOUBLE: DoubleArray, _Type_LIST: ListArray, + _Type_UNION: UnionArray, _Type_BINARY: BinaryArray, _Type_STRING: StringArray, _Type_DICTIONARY: DictionaryArray, diff --git 
a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 731ef94971da0..dfafd371b2857 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -67,6 +67,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: _Type_DICTIONARY" arrow::Type::DICTIONARY" _Type_MAP" arrow::Type::MAP" + enum UnionMode" arrow::UnionMode::type": + _UnionMode_SPARSE" arrow::UnionMode::SPARSE" + _UnionMode_DENSE" arrow::UnionMode::DENSE" + enum TimeUnit" arrow::TimeUnit::type": TimeUnit_SECOND" arrow::TimeUnit::SECOND" TimeUnit_MILLI" arrow::TimeUnit::MILLI" @@ -222,6 +226,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CStructType" arrow::StructType"(CDataType): CStructType(const vector[shared_ptr[CField]]& fields) + cdef cppclass CUnionType" arrow::UnionType"(CDataType): + CUnionType(const vector[shared_ptr[CField]]& fields, + const vector[uint8_t]& type_codes, UnionMode mode) + UnionMode mode() + cdef cppclass CSchema" arrow::Schema": CSchema(const vector[shared_ptr[CField]]& fields) CSchema(const vector[shared_ptr[CField]]& fields, @@ -317,6 +326,22 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CArray] values() shared_ptr[CDataType] value_type() + cdef cppclass CUnionArray" arrow::UnionArray"(CArray): + @staticmethod + CStatus MakeSparse(const CArray& type_ids, + const vector[shared_ptr[CArray]]& children, + shared_ptr[CArray]* out) + + @staticmethod + CStatus MakeDense(const CArray& type_ids, const CArray& value_offsets, + const vector[shared_ptr[CArray]]& children, + shared_ptr[CArray]* out) + uint8_t* raw_type_ids() + int32_t value_offset(int i) + shared_ptr[CArray] child(int pos) + const CArray* UnsafeChild(int pos) + UnionMode mode() + cdef cppclass CBinaryArray" arrow::BinaryArray"(CListArray): const uint8_t* GetValue(int i, int32_t* length) diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 8fdcf553c13fc..531489490754e 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -56,6 +56,11 @@ cdef class DictionaryType(DataType): const CDictionaryType* dict_type +cdef class UnionType(DataType): + cdef: + list child_types + + cdef class TimestampType(DataType): cdef: const CTimestampType* ts_type @@ -139,6 +144,13 @@ cdef class ListValue(ArrayValue): cdef getitem(self, int64_t i) +cdef class UnionValue(ArrayValue): + cdef: + CUnionArray* ap + list value_types + + cdef getitem(self, int64_t i) + cdef class StringValue(ArrayValue): pass @@ -242,6 +254,10 @@ cdef class ListArray(Array): pass +cdef class UnionArray(Array): + pass + + cdef class StringArray(Array): pass diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 6f4451e3f5a41..b4ca49cafe160 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -92,6 +92,8 @@ Type_UNION = _Type_UNION Type_DICTIONARY = _Type_DICTIONARY Type_MAP = _Type_MAP +UnionMode_SPARSE = _UnionMode_SPARSE +UnionMode_DENSE = _UnionMode_DENSE # Exception types include "error.pxi" diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 9f1051228047a..90aff9e936d95 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -72,7 +72,7 @@ cdef public api object pyarrow_wrap_data_type( elif type.get().id() == _Type_STRUCT: out = StructType() elif type.get().id() == _Type_UNION: - out = StructType() + out = UnionType() elif type.get().id() == _Type_TIMESTAMP: out = TimestampType() elif type.get().id() == _Type_FIXED_SIZE_BINARY: diff --git a/python/pyarrow/scalar.pxi 
b/python/pyarrow/scalar.pxi index c37ed3b200ea3..a396fa763c8c8 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -315,6 +315,24 @@ cdef class ListValue(ArrayValue): return result +cdef class UnionValue(ArrayValue): + + cdef void _set_array(self, const shared_ptr[CArray]& sp_array): + self.sp_array = sp_array + self.ap = sp_array.get() + + cdef getitem(self, int64_t i): + cdef int8_t type_id = self.ap.raw_type_ids()[i] + cdef shared_ptr[CArray] child = self.ap.child(type_id) + if self.ap.mode() == _UnionMode_SPARSE: + return box_scalar(self.type[type_id], child, i) + else: + return box_scalar(self.type[type_id], child, + self.ap.value_offset(i)) + + def as_py(self): + return self.getitem(self.index).as_py() + cdef class FixedSizeBinaryValue(ArrayValue): def as_py(self): @@ -364,6 +382,7 @@ cdef dict _scalar_classes = { _Type_FLOAT: FloatValue, _Type_DOUBLE: DoubleValue, _Type_LIST: ListValue, + _Type_UNION: UnionValue, _Type_BINARY: BinaryValue, _Type_STRING: StringValue, _Type_FIXED_SIZE_BINARY: FixedSizeBinaryValue, diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index e3a4c97567ee6..7dc93c28ea7a4 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -235,6 +235,28 @@ def test_list_from_arrays(): assert result.equals(expected) +def test_union_from_dense(): + binary = pa.array([b'a', b'b', b'c', b'd'], type='binary') + int64 = pa.array([1, 2, 3], type='int64') + types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8') + value_offsets = pa.array([0, 0, 2, 1, 1, 2, 3], type='int32') + + result = pa.UnionArray.from_dense(types, value_offsets, [binary, int64]) + + assert result.to_pylist() == [b'a', 1, b'c', b'b', 2, 3, b'd'] + + +def test_union_from_sparse(): + binary = pa.array([b'a', b' ', b'b', b'c', b' ', b' ', b'd'], + type='binary') + int64 = pa.array([0, 1, 0, 0, 2, 3, 0], type='int64') + types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8') + + result = pa.UnionArray.from_sparse(types, [binary, int64]) + + assert result.to_pylist() == [b'a', 1, b'b', b'c', 2, 3, b'd'] + + def _check_cast_case(case, safe=True): in_data, in_type, out_data, out_type = case diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index d6b2655b7c6a0..116f3978333a8 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -319,6 +319,14 @@ def test_type_schema_pickling(): pa.field('a', 'int8'), pa.field('b', 'string') ]), + pa.union([ + pa.field('a', pa.int8()), + pa.field('b', pa.int16()) + ], pa.lib.UnionMode_SPARSE), + pa.union([ + pa.field('a', pa.int8()), + pa.field('b', pa.int16()) + ], pa.lib.UnionMode_DENSE), pa.time32('s'), pa.time64('us'), pa.date32(), diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index e6ff5b1560c1d..0e3ea1fd40bf5 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -85,16 +85,17 @@ def test_is_nested_or_struct(): assert not types.is_nested(pa.int32()) -# TODO(wesm): Union types not yet implemented in pyarrow +def test_is_union(): + assert types.is_union(pa.union([pa.field('a', pa.int32()), + pa.field('b', pa.int8()), + pa.field('c', pa.string())], + pa.lib.UnionMode_SPARSE)) + assert not types.is_union(pa.list_(pa.int32())) -# def test_is_union(): -# assert types.is_union(pa.union([pa.field('a', pa.int32()), -# pa.field('b', pa.int8()), -# pa.field('c', pa.string())])) -# assert not types.is_union(pa.list_(pa.int32())) # TODO(wesm): is_map, 
once implemented + def test_is_binary_string(): assert types.is_binary(pa.binary()) assert not types.is_binary(pa.string()) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index c9a490960ec38..d2e68ff79a524 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -186,7 +186,32 @@ cdef class UnionType(DataType): cdef void init(self, const shared_ptr[CDataType]& type): DataType.init(self, type) + self.child_types = [ + pyarrow_wrap_data_type(type.get().child(i).get().type()) + for i in range(self.num_children)] + property num_children: + + def __get__(self): + return self.type.num_children() + + property mode: + + def __get__(self): + cdef CUnionType* type = self.sp_type.get() + return type.mode() + + def __getitem__(self, i): + return self.child_types[i] + + def __getstate__(self): + children = [pyarrow_wrap_field(self.type.child(i)) + for i in range(self.num_children)] + return children, self.mode + + def __setstate__(self, state): + cdef DataType reconstituted = union(*state) + self.init(reconstituted.sp_type) cdef class TimestampType(DataType): @@ -1056,6 +1081,30 @@ def struct(fields): return pyarrow_wrap_data_type(struct_type) +def union(children_fields, mode): + """ + Create UnionType from children fields. + """ + cdef: + Field child_field + vector[shared_ptr[CField]] c_fields + vector[uint8_t] type_codes + shared_ptr[CDataType] union_type + int i + + for i, child_field in enumerate(children_fields): + type_codes.push_back(i) + c_fields.push_back(child_field.sp_field) + + if mode == UnionMode_SPARSE: + union_type.reset(new CUnionType(c_fields, type_codes, + _UnionMode_SPARSE)) + else: + union_type.reset(new CUnionType(c_fields, type_codes, + _UnionMode_DENSE)) + + return pyarrow_wrap_data_type(union_type) + cdef dict _type_aliases = { 'null': null, 'i1': int8, From ed8aef2c66169bca214f3a77cf6b6d797e172791 Mon Sep 17 00:00:00 2001 From: Lu Qi Date: Fri, 10 Nov 2017 14:23:42 +0100 Subject: [PATCH 031/177] ARROW-1793: fix a typo for README.md fix a typo for integration test README.md Author: Lu Qi Closes #1301 from luchy0120/master and squashes the following commits: 2cc86fe [Lu Qi] fix a typo --- integration/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integration/README.md b/integration/README.md index 6e5a6c15641ec..8d6df8e5772de 100644 --- a/integration/README.md +++ b/integration/README.md @@ -77,7 +77,7 @@ export ARROW_CPP_EXE_PATH=$CPP_BUILD_DIR/debug Here `$ARROW_HOME` is the location of your Arrow git clone. The `$CPP_BUILD_DIR` may be different depending on how you built with CMake -(in-source of out-of-source). +(in-source or out-of-source). 
+(in-source or out-of-source).
Once this is done, run the integration tests with (optionally adding `--debug` for additional output) @@ -88,4 +88,4 @@ python integration_test.py python integration_test.py --debug # additional output ``` -[1]: https://conda.io/miniconda.html \ No newline at end of file +[1]: https://conda.io/miniconda.html From 2d34f34dc81966f3e186055dc0b962699c98b236 Mon Sep 17 00:00:00 2001 From: Stephanie Date: Fri, 10 Nov 2017 09:36:12 -0500 Subject: [PATCH 032/177] ARROW-1788 Fix Plasma store abort bug on client disconnection Author: Stephanie Closes #1299 from stephanie-wang/plasma-client-disconnect-bug and squashes the following commits: 295144bd [Stephanie] Revert disconnect client check 8e24affd [Stephanie] Refactor abort_object to match remove_client_from_object_clients b41591d5 [Stephanie] When disconnecting a plasma client, only abort the objects that the client created 50932e53 [Stephanie] Add Plasma test for multiple clients --- cpp/src/plasma/store.cc | 27 +++++++++++++------- cpp/src/plasma/store.h | 9 ++++++- cpp/src/plasma/test/client_tests.cc | 38 +++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 10 deletions(-) diff --git a/cpp/src/plasma/store.cc b/cpp/src/plasma/store.cc index 5dbdebc237ce6..31033ccbb3202 100644 --- a/cpp/src/plasma/store.cc +++ b/cpp/src/plasma/store.cc @@ -393,16 +393,22 @@ void PlasmaStore::seal_object(const ObjectID& object_id, unsigned char digest[]) update_object_get_requests(object_id); } -void PlasmaStore::abort_object(const ObjectID& object_id) { +int PlasmaStore::abort_object(const ObjectID& object_id, Client* client) { auto entry = get_object_table_entry(&store_info_, object_id); ARROW_CHECK(entry != NULL) << "To abort an object it must be in the object table."; ARROW_CHECK(entry->state != PLASMA_SEALED) << "To abort an object it must not have been sealed."; - ARROW_CHECK(entry->clients.size() == 1) - << "To abort an object, the only client currently using it is the creator."; - - dlfree(entry->pointer); - store_info_.objects.erase(object_id); + auto it = entry->clients.find(client); + if (it == entry->clients.end()) { + // If the client requesting the abort is not the creator, do not + // perform the abort. + return 0; + } else { + // The client requesting the abort is the creator. Free the object. + dlfree(entry->pointer); + store_info_.objects.erase(object_id); + return 1; + } } void PlasmaStore::delete_objects(const std::vector& object_ids) { @@ -454,11 +460,12 @@ void PlasmaStore::disconnect_client(int client_fd) { ARROW_LOG(INFO) << "Disconnecting client on fd " << client_fd; // If this client was using any objects, remove it from the appropriate // lists. 
+ auto client = it->second.get(); for (const auto& entry : store_info_.objects) { if (entry.second->state == PLASMA_SEALED) { - remove_client_from_object_clients(entry.second.get(), it->second.get()); + remove_client_from_object_clients(entry.second.get(), client); } else { - abort_object(entry.first); + abort_object(entry.first, client); } } @@ -600,7 +607,9 @@ Status PlasmaStore::process_message(Client* client) { } break; case MessageType_PlasmaAbortRequest: { RETURN_NOT_OK(ReadAbortRequest(input, input_size, &object_id)); - abort_object(object_id); + ARROW_CHECK(abort_object(object_id, client) == 1) << "To abort an object, the only " + "client currently using it " + "must be the creator."; HANDLE_SIGPIPE(SendAbortReply(client->fd, object_id), client->fd); } break; case MessageType_PlasmaGetRequest: { diff --git a/cpp/src/plasma/store.h b/cpp/src/plasma/store.h index 0d08d8a67ffaa..a72c6259a9cea 100644 --- a/cpp/src/plasma/store.h +++ b/cpp/src/plasma/store.h @@ -74,7 +74,14 @@ class PlasmaStore { int create_object(const ObjectID& object_id, int64_t data_size, int64_t metadata_size, Client* client, PlasmaObject* result); - void abort_object(const ObjectID& object_id); + /// Abort a created but unsealed object. If the client is not the + /// creator, then the abort will fail. + /// + /// @param object_id Object ID of the object to be aborted. + /// @param client The client who created the object. If this does not + /// match the creator of the object, then the abort will fail. + /// @return 1 if the abort succeeds, else 0. + int abort_object(const ObjectID& object_id, Client* client); /// Delete objects that have been created in the hash table. This should only /// be called on objects that are returned by the eviction policy to evict. diff --git a/cpp/src/plasma/test/client_tests.cc b/cpp/src/plasma/test/client_tests.cc index 5c0cee4c071ad..d4285f8988404 100644 --- a/cpp/src/plasma/test/client_tests.cc +++ b/cpp/src/plasma/test/client_tests.cc @@ -45,14 +45,17 @@ class TestPlasmaStore : public ::testing::Test { "/plasma_store -m 1000000000 -s /tmp/store 1> /dev/null 2> /dev/null &"; system(plasma_command.c_str()); ARROW_CHECK_OK(client_.Connect("/tmp/store", "", PLASMA_DEFAULT_RELEASE_DELAY)); + ARROW_CHECK_OK(client2_.Connect("/tmp/store", "", PLASMA_DEFAULT_RELEASE_DELAY)); } virtual void Finish() { ARROW_CHECK_OK(client_.Disconnect()); + ARROW_CHECK_OK(client2_.Disconnect()); system("killall plasma_store &"); } protected: PlasmaClient client_; + PlasmaClient client2_; }; TEST_F(TestPlasmaStore, ContainsTest) { @@ -171,6 +174,41 @@ TEST_F(TestPlasmaStore, AbortTest) { } } +TEST_F(TestPlasmaStore, MultipleClientTest) { + ObjectID object_id = ObjectID::from_random(); + + // Test for object non-existence on the first client. + bool has_object; + ARROW_CHECK_OK(client_.Contains(object_id, &has_object)); + ASSERT_EQ(has_object, false); + + // Test for the object being in local Plasma store. + // First create and seal object on the second client. + int64_t data_size = 100; + uint8_t metadata[] = {5}; + int64_t metadata_size = sizeof(metadata); + uint8_t* data; + ARROW_CHECK_OK(client2_.Create(object_id, data_size, metadata, metadata_size, &data)); + ARROW_CHECK_OK(client2_.Seal(object_id)); + // Test that the first client can get the object. 
+ ObjectBuffer object_buffer; + ARROW_CHECK_OK(client_.Get(&object_id, 1, -1, &object_buffer)); + ARROW_CHECK_OK(client_.Contains(object_id, &has_object)); + ASSERT_EQ(has_object, true); + + // Test that one client disconnecting does not interfere with the other. + // First create object on the second client. + object_id = ObjectID::from_random(); + ARROW_CHECK_OK(client2_.Create(object_id, data_size, metadata, metadata_size, &data)); + // Disconnect the first client. + ARROW_CHECK_OK(client_.Disconnect()); + // Test that the second client can seal and get the created object. + ARROW_CHECK_OK(client2_.Seal(object_id)); + ARROW_CHECK_OK(client2_.Get(&object_id, 1, -1, &object_buffer)); + ARROW_CHECK_OK(client2_.Contains(object_id, &has_object)); + ASSERT_EQ(has_object, true); +} + } // namespace plasma int main(int argc, char** argv) {
From 7c205b0337cd0364a7f9e1e19a9a6d5423abfe30 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 11 Nov 2017 17:09:50 -0500 Subject: [PATCH 033/177] ARROW-1787: [Python] Support reading parquet files into DataFrames in a backward compatible way Author: Phillip Cloud Closes #1298 from cpcloud/ARROW-1787 and squashes the following commits: 6f5fbd55 [Phillip Cloud] Add more index naming tests 8ba06256 [Phillip Cloud] Add test data 56e7fe58 [Phillip Cloud] Use BytesIO 5f50da38 [Phillip Cloud] Implement d61c43e6 [Phillip Cloud] Add test 9abad95f [Phillip Cloud] Add test data --- python/pyarrow/pandas_compat.py | 13 +++- .../tests/data/v0.7.1.all-named-index.parquet | Bin 0 -> 3948 bytes python/pyarrow/tests/data/v0.7.1.parquet | Bin 0 -> 4372 bytes .../data/v0.7.1.some-named-index.parquet | Bin 0 -> 4008 bytes python/pyarrow/tests/test_parquet.py | 73 ++++++++++++++++++ 5 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 python/pyarrow/tests/data/v0.7.1.all-named-index.parquet create mode 100644 python/pyarrow/tests/data/v0.7.1.parquet create mode 100644 python/pyarrow/tests/data/v0.7.1.some-named-index.parquet
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 87b47b8a6bc13..db28ee09e1e73 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -18,6 +18,7 @@ import ast import collections import json +import re import numpy as np import pandas as pd @@ -353,6 +354,14 @@ def make_datetimetz(tz): return DatetimeTZDtype('ns', tz=tz) +def backwards_compatible_index_name(raw_name, logical_name): + pattern = r'^__index_level_\d+__$' + if raw_name == logical_name and re.match(pattern, raw_name) is not None: + return None + else: + return logical_name + + def table_to_blockmanager(options, table, memory_pool, nthreads=1): import pandas.core.internals as _int import pyarrow.lib as lib @@ -394,7 +403,9 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1): values = values.copy() index_arrays.append(pd.Series(values, dtype=col_pandas.dtype)) - index_names.append(logical_name) + index_names.append( + backwards_compatible_index_name(raw_name, logical_name) + ) block_table = block_table.remove_column( block_table.schema.get_field_index(raw_name) )
diff --git a/python/pyarrow/tests/data/v0.7.1.all-named-index.parquet b/python/pyarrow/tests/data/v0.7.1.all-named-index.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e9efd9b390ed4ceaa53c20ca1ac182101593a6bb GIT binary patch literal 3948 [base85-encoded binary patch data omitted] literal 0 HcmV?d00001
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index e2e6863c4748f..6ba4fd2fad8ea 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -1458,3 +1458,76 @@ def test_index_column_name_duplicate(tmpdir): arrow_table = _read_table(path) result_df = arrow_table.to_pandas() tm.assert_frame_equal(result_df, dfx) + + +def test_backwards_compatible_index_naming(): + expected_string = b"""\ +carat cut color clarity depth table price x y z + 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 + 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 + 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 + 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 + 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 + 0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48 + 0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47 + 0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53 + 0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49 + 0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39""" + expected = pd.read_csv( + io.BytesIO(expected_string), sep=r'\s{2,}', index_col=None, header=0 + ) + path = os.path.join(os.path.dirname(__file__), 'data', 'v0.7.1.parquet') + t = _read_table(path) + result = t.to_pandas() + tm.assert_frame_equal(result, expected) + + +def test_backwards_compatible_index_multi_level_named(): + expected_string = b"""\ +carat cut color clarity depth table price x y z + 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 + 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 + 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 + 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 + 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 + 0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48 + 0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47 + 0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53 + 0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49 + 0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39""" + expected = pd.read_csv( + io.BytesIO(expected_string), + sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'], header=0 + ).sort_index() + path = os.path.join( + os.path.dirname(__file__), 'data', 'v0.7.1.all-named-index.parquet' + ) + t = _read_table(path) + result = t.to_pandas() + tm.assert_frame_equal(result, expected) + + +def test_backwards_compatible_index_multi_level_some_named(): +
expected_string = b"""\ +carat cut color clarity depth table price x y z + 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 + 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 + 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 + 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 + 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 + 0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48 + 0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47 + 0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53 + 0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49 + 0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39""" + expected = pd.read_csv( + io.BytesIO(expected_string), + sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'], header=0 + ).sort_index() + expected.index = expected.index.set_names(['cut', None, 'clarity']) + path = os.path.join( + os.path.dirname(__file__), 'data', 'v0.7.1.some-named-index.parquet' + ) + t = _read_table(path) + result = t.to_pandas() + tm.assert_frame_equal(result, expected)
From 21112f85faa6e1328b3d59a54aa24becc50df4c1 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 11 Nov 2017 18:58:01 -0500 Subject: [PATCH 034/177] ARROW-1800: [C++] Fix and simplify random_decimals Author: Phillip Cloud Closes #1306 from cpcloud/ARROW-1800 and squashes the following commits: d5b08ff0 [Phillip Cloud] ARROW-1800: [C++] Fix and simplify random_decimals --- cpp/src/arrow/test-util.h | 110 ++++++++++++-------------------- 1 file changed, 34 insertions(+), 76 deletions(-) diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 7306f577a36e0..9b875ce116a29 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -159,12 +159,11 @@ static inline void random_is_valid(int64_t n, double pct_null, static inline void random_bytes(int64_t n, uint32_t seed, uint8_t* out) { std::mt19937 gen(seed); - std::uniform_int_distribution d(0, std::numeric_limits::max()); - std::generate(out, out + n, [&d, &gen] { return static_cast(d(gen) & 0xFF); }); + std::uniform_int_distribution d(0, std::numeric_limits::max()); + std::generate(out, out + n, [&d, &gen] { return static_cast(d(gen)); }); } -static void DecimalRange(int32_t precision, Decimal128* min_decimal, - Decimal128* max_decimal) { +static int32_t DecimalSize(int32_t precision) { DCHECK_GE(precision, 1) << "decimal precision must be greater than or equal to 1, got " << precision; DCHECK_LE(precision, 38) << "decimal precision must be less than or equal to 38, got " @@ -173,123 +172,82 @@ static void DecimalRange(int32_t precision, Decimal128* min_decimal, switch (precision) { case 1: case 2: - *max_decimal = std::numeric_limits::max(); - break; + return 1; // 127 case 3: case 4: - *max_decimal = std::numeric_limits::max(); - break; + return 2; // 32,767 case 5: case 6: - *max_decimal = 8388607; - break; + return 3; // 8,388,607 case 7: case 8: case 9: - *max_decimal = std::numeric_limits::max(); - break; + return 4; // 2,147,483,647 case 10: case 11: - *max_decimal = 549755813887; - break; + return 5; // 549,755,813,887 case 12: case 13: case 14: - *max_decimal = 140737488355327; - break; + return 6; // 140,737,488,355,327 case 15: case 16: - *max_decimal = 36028797018963967; - break; + return 7; // 36,028,797,018,963,967 case 17: case 18: - *max_decimal = std::numeric_limits::max(); - break; + return 8; // 9,223,372,036,854,775,807 case 19: case 20: case 21: - *max_decimal = Decimal128("2361183241434822606847"); - break; + return 9; // 2,361,183,241,434,822,606,847 case 22: case 23: - *max_decimal = Decimal128("604462909807314587353087");
- break; + return 10; // 604,462,909,807,314,587,353,087 case 24: case 25: case 26: - *max_decimal = Decimal128("154742504910672534362390527"); - break; + return 11; // 154,742,504,910,672,534,362,390,527 case 27: case 28: - *max_decimal = Decimal128("39614081257132168796771975167"); - break; + return 12; // 39,614,081,257,132,168,796,771,975,167 case 29: case 30: case 31: - *max_decimal = Decimal128("10141204801825835211973625643007"); - break; + return 13; // 10,141,204,801,825,835,211,973,625,643,007 case 32: case 33: - *max_decimal = Decimal128("2596148429267413814265248164610047"); - break; + return 14; // 2,596,148,429,267,413,814,265,248,164,610,047 case 34: case 35: - *max_decimal = Decimal128("664613997892457936451903530140172287"); - break; + return 15; // 664,613,997,892,457,936,451,903,530,140,172,287 case 36: case 37: case 38: - *max_decimal = Decimal128("170141183460469231731687303715884105727"); - break; + return 16; // 170,141,183,460,469,231,731,687,303,715,884,105,727 default: DCHECK(false); break; } - - *min_decimal = ~(*max_decimal); + return -1; } -class UniformDecimalDistribution { - public: - explicit UniformDecimalDistribution(int32_t precision) { - Decimal128 max_decimal; - Decimal128 min_decimal; - DecimalRange(precision, &min_decimal, &max_decimal); - - const auto min_low = static_cast(min_decimal.low_bits()); - const auto max_low = static_cast(max_decimal.low_bits()); - - const int64_t min_high = min_decimal.high_bits(); - const int64_t max_high = max_decimal.high_bits(); - - using param_type = std::uniform_int_distribution::param_type; - - lower_dist_.param(param_type(min_low, max_low)); - upper_dist_.param(param_type(min_high, max_high)); - } - - template - Decimal128 operator()(Generator& gen) { - return Decimal128(upper_dist_(gen), static_cast(lower_dist_(gen))); - } - - private: - // The lower bits distribution is intentionally int64_t. - // If it were uint64_t then the size of the interval [min_high, max_high] would be 0 - // because min_high > max_high due to 2's complement. - // So, we generate the same range of bits using int64_t and then cast to uint64_t. - std::uniform_int_distribution lower_dist_; - std::uniform_int_distribution upper_dist_; -}; - static inline void random_decimals(int64_t n, uint32_t seed, int32_t precision, uint8_t* out) { std::mt19937 gen(seed); - UniformDecimalDistribution dist(precision); - - for (int64_t i = 0; i < n; ++i, out += 16) { - const Decimal128 value(dist(gen)); - value.ToBytes(out); + std::uniform_int_distribution d(0, std::numeric_limits::max()); + const int32_t required_bytes = DecimalSize(precision); + constexpr int32_t byte_width = 16; + std::fill(out, out + byte_width * n, '\0'); + + for (int64_t i = 0; i < n; ++i, out += byte_width) { + std::generate(out, out + required_bytes, + [&d, &gen] { return static_cast(d(gen)); }); + + // sign extend if the sign bit is set for the last byte generated + // 0b10000000 == 0x80 == 128 + if ((out[required_bytes - 1] & '\x80') != 0) { + std::fill(out + required_bytes, out + byte_width, '\xFF'); + } } } From 357eedcbf4b06d4ab1a78005fe34244319d6b9b0 Mon Sep 17 00:00:00 2001 From: "Korn, Uwe" Date: Sat, 11 Nov 2017 19:02:53 -0500 Subject: [PATCH 035/177] ARROW-1781: Don't use brew when using the toolchain Author: Korn, Uwe Author: Uwe L. Korn Closes #1295 from xhochy/ARROW-1781 and squashes the following commits: 7de96d8a [Uwe L. 
Korn] Use a single Brewfile for c_glib 9fde4e91 [Korn, Uwe] Use gtest from conda-forge 8cc6898f [Korn, Uwe] ARROW-1781: Don't use brew when using the toolchain --- .travis.yml | 1 + c_glib/Brewfile | 11 +++++++---- ci/travis_before_script_c_glib.sh | 2 -- ci/travis_before_script_cpp.sh | 5 ++--- cpp/Brewfile | 21 --------------------- 5 files changed, 10 insertions(+), 30 deletions(-) delete mode 100644 cpp/Brewfile diff --git a/.travis.yml b/.travis.yml index 52d7a5f800505..9917a261451c2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -152,6 +152,7 @@ matrix: rvm: 2.2 env: BUILD_SYSTEM=autotools before_script: + - brew update && brew bundle --file=$TRAVIS_BUILD_DIR/c_glib/Brewfile - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh --only-library - $TRAVIS_BUILD_DIR/ci/travis_before_script_c_glib.sh script: diff --git a/c_glib/Brewfile b/c_glib/Brewfile index 80d3c81dd6f82..9fe5c3b616317 100644 --- a/c_glib/Brewfile +++ b/c_glib/Brewfile @@ -15,11 +15,14 @@ # specific language governing permissions and limitations # under the License. -brew "gtk-doc" brew "autoconf-archive" -brew "gobject-introspection" -brew "git" +brew "boost" +brew "ccache" brew "cmake" -brew "wget" +brew "git" +brew "gobject-introspection" +brew "gtk-doc" +brew "jemalloc" brew "libtool" brew "lua" +brew "wget" diff --git a/ci/travis_before_script_c_glib.sh b/ci/travis_before_script_c_glib.sh index a22ecd3753fd8..99d05397a9f23 100755 --- a/ci/travis_before_script_c_glib.sh +++ b/ci/travis_before_script_c_glib.sh @@ -22,8 +22,6 @@ set -ex source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh if [ $TRAVIS_OS_NAME = "osx" ]; then - brew update && brew bundle --file=c_glib/Brewfile - export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:/usr/local/opt/libffi/lib/pkgconfig export XML_CATALOG_FILES=/usr/local/etc/xml/catalog fi diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index dbdcd33ed0d5b..4998f190f9891 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -38,8 +38,10 @@ if [ "$ARROW_TRAVIS_USE_TOOLCHAIN" == "1" ]; then rapidjson \ flatbuffers \ gflags \ + gtest \ lz4-c \ snappy \ + ccache \ zstd \ brotli \ zlib \ @@ -53,9 +55,6 @@ if [ "$ARROW_TRAVIS_USE_TOOLCHAIN" == "1" ]; then conda update -y -p $CPP_TOOLCHAIN ca-certificates -c defaults fi -if [ $TRAVIS_OS_NAME == "osx" ]; then - brew update && brew bundle --file=cpp/Brewfile -fi mkdir $ARROW_CPP_BUILD_DIR pushd $ARROW_CPP_BUILD_DIR diff --git a/cpp/Brewfile b/cpp/Brewfile deleted file mode 100644 index 5f82cacc55991..0000000000000 --- a/cpp/Brewfile +++ /dev/null @@ -1,21 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -brew "jemalloc" -brew "ccache" -brew "boost" -brew "cmake" From 550a39ff23054ee8cce99380b1ee1f29035f9f67 Mon Sep 17 00:00:00 2001 From: rvernica Date: Sun, 12 Nov 2017 22:16:29 +0900 Subject: [PATCH 036/177] ARROW-1801: [Docs] Update install instructions to use red-data-tools repos * Update package install instructions to use red-data-tools.org repositories. * Instructions taken from https://github.com/red-data-tools/packages.red-data-tools.org/blob/master/README.md * Dropped support for Debian jessie and Ubuntu 16.10 * Add support for Debian stretch, Ubuntu 14.04, and CentOS 6 Author: rvernica Closes #1307 from rvernica/patch-3 and squashes the following commits: f15dac08 [rvernica] ARROW-1801 [Docs] Update install instructions to use red-data-tools repos --- site/install.md | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/site/install.md b/site/install.md index 1513fe819789c..0ef2008db9061 100644 --- a/site/install.md +++ b/site/install.md @@ -75,23 +75,25 @@ with the wheel. We have provided APT and Yum repositories for Apache Arrow C++ and Apache Arrow GLib (C). Here are supported platforms: -* Debian GNU/Linux Jessie +* Debian GNU/Linux stretch +* Ubuntu 14.04 LTS * Ubuntu 16.04 LTS -* Ubuntu 16.10 * Ubuntu 17.04 +* Ubuntu 17.10 +* CentOS 6 * CentOS 7 -Debian GNU/Linux Jessie: +Debian GNU/Linux: ```shell -sudo apt update sudo apt install -y -V apt-transport-https -cat < Date: Sun, 12 Nov 2017 15:16:35 +0100 Subject: [PATCH 037/177] ARROW-1763: [Python] Implement __hash__ for DataType Author: Wes McKinney Closes #1308 from wesm/ARROW-1763 and squashes the following commits: 4e379b9 [Wes McKinney] Implement __hash__ for DataType --- python/pyarrow/tests/test_types.py | 24 ++++++++++++++++++++++++ python/pyarrow/types.pxi | 3 +++ 2 files changed, 27 insertions(+) diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 0e3ea1fd40bf5..9eefa33b66187 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -137,3 +137,27 @@ def test_is_temporal_date_time_timestamp(): def test_timestamp_type(): # See ARROW-1683 assert isinstance(pa.timestamp('ns'), pa.TimestampType) + + +def test_types_hashable(): + types = [ + pa.null(), + pa.int32(), + pa.time32('s'), + pa.time64('us'), + pa.date32(), + pa.timestamp('us'), + pa.string(), + pa.binary(), + pa.binary(10), + pa.list_(pa.int32()), + pa.struct([pa.field('a', pa.int32()), + pa.field('b', pa.int8()), + pa.field('c', pa.string())]) + ] + + in_dict = {} + for i, type_ in enumerate(types): + assert hash(type_) == hash(type_) + in_dict[type_] = i + assert in_dict[type_] == i diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index d2e68ff79a524..edf0d8a305a02 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -69,6 +69,9 @@ cdef class DataType: ) return frombytes(self.type.ToString()) + def __hash__(self): + return hash(str(self)) + def __reduce__(self): return self.__class__, (), self.__getstate__() From e8331f46f8b324271e694557789ea53b082fdc05 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sun, 12 Nov 2017 23:47:47 -0500 Subject: [PATCH 038/177] ARROW-1794: [C++/Python] Rename DecimalArray to Decimal128Array Author: Phillip Cloud Closes #1312 from cpcloud/ARROW-1794 and squashes the following commits: 0b8ba5e0 [Phillip Cloud] Backward compat 4eb2a3ba [Phillip Cloud] ARROW-1794: [C++/Python] Rename DecimalArray to Decimal128Array --- cpp/src/arrow/array-test.cc | 2 +- cpp/src/arrow/array.cc 
| 6 +++--- cpp/src/arrow/array.h | 11 +++++++---- cpp/src/arrow/compare.cc | 2 +- cpp/src/arrow/ipc/json-internal.cc | 2 +- cpp/src/arrow/ipc/test-common.h | 6 +++--- cpp/src/arrow/ipc/writer.cc | 2 +- cpp/src/arrow/pretty_print.cc | 2 +- cpp/src/arrow/python/arrow_to_pandas.cc | 2 +- cpp/src/arrow/type_fwd.h | 2 +- cpp/src/arrow/type_traits.h | 2 +- cpp/src/arrow/visitor.cc | 2 +- cpp/src/arrow/visitor.h | 2 +- python/doc/source/api.rst | 2 +- python/pyarrow/__init__.py | 2 +- python/pyarrow/array.pxi | 4 ++-- python/pyarrow/includes/libarrow.pxd | 4 +++- python/pyarrow/lib.pxd | 2 +- python/pyarrow/scalar.pxi | 2 +- 19 files changed, 32 insertions(+), 27 deletions(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 9f248cdbbb754..15c75534e53ef 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -2799,7 +2799,7 @@ class DecimalTest : public ::testing::TestWithParam { BitUtil::BytesToBits(valid_bytes, default_memory_pool(), &expected_null_bitmap)); int64_t expected_null_count = test::null_count(valid_bytes); - auto expected = std::make_shared( + auto expected = std::make_shared( type, size, expected_data, expected_null_bitmap, expected_null_count); std::shared_ptr lhs = out->Slice(offset); diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 9c91d619cc7e8..651fa26ba9026 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -314,12 +314,12 @@ const uint8_t* FixedSizeBinaryArray::GetValue(int64_t i) const { // ---------------------------------------------------------------------- // Decimal -DecimalArray::DecimalArray(const std::shared_ptr& data) +Decimal128Array::Decimal128Array(const std::shared_ptr& data) : FixedSizeBinaryArray(data) { DCHECK_EQ(data->type->id(), Type::DECIMAL); } -std::string DecimalArray::FormatValue(int64_t i) const { +std::string Decimal128Array::FormatValue(int64_t i) const { const auto& type_ = static_cast(*type()); const Decimal128 value(GetValue(i)); return value.ToString(type_.scale()); @@ -515,7 +515,7 @@ struct ValidateVisitor { Status Visit(const PrimitiveArray&) { return Status::OK(); } - Status Visit(const DecimalArray&) { return Status::OK(); } + Status Visit(const Decimal128Array&) { return Status::OK(); } Status Visit(const BinaryArray&) { // TODO(wesm): what to do here? 
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index f7762ce104398..3337e4b158267 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -559,19 +559,22 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { }; // ---------------------------------------------------------------------- -// DecimalArray -class ARROW_EXPORT DecimalArray : public FixedSizeBinaryArray { +// Decimal128Array +class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray { public: using TypeClass = DecimalType; using FixedSizeBinaryArray::FixedSizeBinaryArray; - /// \brief Construct DecimalArray from ArrayData instance - explicit DecimalArray(const std::shared_ptr& data); + /// \brief Construct Decimal128Array from ArrayData instance + explicit Decimal128Array(const std::shared_ptr& data); std::string FormatValue(int64_t i) const; }; +// Backward compatibility +using DecimalArray = Decimal128Array; + // ---------------------------------------------------------------------- // Struct diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index a2d4de7b73afb..253c2e1feb801 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -255,7 +255,7 @@ class RangeEqualsVisitor { return Status::OK(); } - Status Visit(const DecimalArray& left) { + Status Visit(const Decimal128Array& left) { return Visit(static_cast(left)); } diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 1b9baee7dafef..976f9660a1b78 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -458,7 +458,7 @@ class ArrayWriter { } } - void WriteDataValues(const DecimalArray& arr) { + void WriteDataValues(const Decimal128Array& arr) { for (int64_t i = 0; i < arr.length(); ++i) { const Decimal128 value(arr.GetValue(i)); writer_->String(value.ToIntegerString()); diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index 91023db489852..7fc139381052c 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -690,10 +690,10 @@ Status MakeDecimal(std::shared_ptr* out) { RETURN_NOT_OK(BitUtil::BytesToBits(is_valid_bytes, default_memory_pool(), &is_valid)); - auto a1 = std::make_shared(f0->type(), length, data, is_valid, - kUnknownNullCount); + auto a1 = std::make_shared(f0->type(), length, data, is_valid, + kUnknownNullCount); - auto a2 = std::make_shared(f1->type(), length, data); + auto a2 = std::make_shared(f1->type(), length, data); ArrayVector arrays = {a1, a2}; *out = std::make_shared(schema, length, arrays); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 5598cc68296f7..323116f589b73 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -339,7 +339,7 @@ class RecordBatchSerializer : public ArrayVisitor { VISIT_FIXED_WIDTH(Time32Array) VISIT_FIXED_WIDTH(Time64Array) VISIT_FIXED_WIDTH(FixedSizeBinaryArray) - VISIT_FIXED_WIDTH(DecimalArray) + VISIT_FIXED_WIDTH(Decimal128Array) #undef VISIT_FIXED_WIDTH diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index aaea34a51388c..cfbc30315fcd7 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -170,7 +170,7 @@ class ArrayPrinter : public PrettyPrinter { } template - inline typename std::enable_if::value, void>::type + inline typename std::enable_if::value, void>::type WriteDataValues(const T& array) { for (int i = 0; i < array.length(); ++i) { if (i > 0) { diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc 
b/cpp/src/arrow/python/arrow_to_pandas.cc index 3894772daa467..f966b2987b93f 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -633,7 +633,7 @@ static Status ConvertDecimals(PandasOptions options, const ChunkedArray& data, PyObject* Decimal = Decimal_ref.obj(); for (int c = 0; c < data.num_chunks(); c++) { - auto* arr(static_cast(data.chunk(c).get())); + auto* arr(static_cast(data.chunk(c).get())); auto type(std::dynamic_pointer_cast(arr->type())); const int scale = type->scale(); diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 0d06b6f6cb86e..343487055c6fc 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -69,7 +69,7 @@ class StructArray; class StructBuilder; class DecimalType; -class DecimalArray; +class Decimal128Array; class DecimalBuilder; class UnionType; diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index fbd78398f4579..bc600372eb01f 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -231,7 +231,7 @@ struct TypeTraits { template <> struct TypeTraits { - using ArrayType = DecimalArray; + using ArrayType = Decimal128Array; using BuilderType = DecimalBuilder; constexpr static bool is_parameter_free = false; }; diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc index a7b01b0f6315a..3739e89f3d019 100644 --- a/cpp/src/arrow/visitor.cc +++ b/cpp/src/arrow/visitor.cc @@ -56,7 +56,7 @@ ARRAY_VISITOR_DEFAULT(ListArray); ARRAY_VISITOR_DEFAULT(StructArray); ARRAY_VISITOR_DEFAULT(UnionArray); ARRAY_VISITOR_DEFAULT(DictionaryArray); -ARRAY_VISITOR_DEFAULT(DecimalArray); +ARRAY_VISITOR_DEFAULT(Decimal128Array); #undef ARRAY_VISITOR_DEFAULT diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h index 6c36e465ec436..34679eb950d4b 100644 --- a/cpp/src/arrow/visitor.h +++ b/cpp/src/arrow/visitor.h @@ -50,7 +50,7 @@ class ARROW_EXPORT ArrayVisitor { virtual Status Visit(const Time64Array& array); virtual Status Visit(const TimestampArray& array); virtual Status Visit(const IntervalArray& array); - virtual Status Visit(const DecimalArray& array); + virtual Status Visit(const Decimal128Array& array); virtual Status Visit(const ListArray& array); virtual Status Visit(const StructArray& array); virtual Status Visit(const UnionArray& array); diff --git a/python/doc/source/api.rst b/python/doc/source/api.rst index 6bceba3c650b6..c52024044a62d 100644 --- a/python/doc/source/api.rst +++ b/python/doc/source/api.rst @@ -155,7 +155,7 @@ Array Types Date32Array Date64Array TimestampArray - DecimalArray + Decimal128Array ListArray .. 
_api.table: diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 2d7d7288b3835..09bf6b35f396a 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -58,7 +58,7 @@ DictionaryArray, Date32Array, Date64Array, TimestampArray, Time32Array, Time64Array, - DecimalArray, StructArray, + Decimal128Array, StructArray, ArrayValue, Scalar, NA, BooleanValue, Int8Value, Int16Value, Int32Value, Int64Value, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 9991411e55dfe..2ef592ff758fc 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -597,7 +597,7 @@ cdef class FixedSizeBinaryArray(Array): pass -cdef class DecimalArray(FixedSizeBinaryArray): +cdef class Decimal128Array(FixedSizeBinaryArray): pass @@ -846,7 +846,7 @@ cdef dict _array_classes = { _Type_STRING: StringArray, _Type_DICTIONARY: DictionaryArray, _Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray, - _Type_DECIMAL: DecimalArray, + _Type_DECIMAL: Decimal128Array, _Type_STRUCT: StructArray, } diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index dfafd371b2857..11cc6b3ff2664 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -312,7 +312,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CFixedSizeBinaryArray" arrow::FixedSizeBinaryArray"(CArray): const uint8_t* GetValue(int i) - cdef cppclass CDecimalArray" arrow::DecimalArray"(CFixedSizeBinaryArray): + cdef cppclass CDecimal128Array" arrow::Decimal128Array"( + CFixedSizeBinaryArray + ): c_string FormatValue(int i) cdef cppclass CListArray" arrow::ListArray"(CArray): diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 531489490754e..6413b838f0595 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -246,7 +246,7 @@ cdef class FixedSizeBinaryArray(Array): pass -cdef class DecimalArray(FixedSizeBinaryArray): +cdef class Decimal128Array(FixedSizeBinaryArray): pass diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index a396fa763c8c8..1bc5ed7a372a8 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -258,7 +258,7 @@ cdef class DecimalValue(ArrayValue): def as_py(self): cdef: - CDecimalArray* ap = self.sp_array.get() + CDecimal128Array* ap = self.sp_array.get() c_string s = ap.FormatValue(self.index) return _pydecimal.Decimal(s.decode('utf8')) From 4a33bad5f28830812fe4d47dcfdfb184d5ee43c0 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Sun, 12 Nov 2017 23:49:12 -0500 Subject: [PATCH 039/177] ARROW-1767: [C++] Support file reads and writes over 2GB on Windows This closes [ARROW-1767](https://issues.apache.org/jira/browse/ARROW-1767). 
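As a rough sketch of the user-visible effect (hypothetical usage, not part of the patch: the file name and the ~3 GiB payload are made up, and running this for real needs that much memory and disk), a single read or write larger than 2 GB previously failed on Windows with "Unable to write > 2GB blocks to file yet", while after this change the native file implementation splits the operation into `ARROW_MAX_IO_CHUNKSIZE`-sized chunks on all platforms:

```python
import pyarrow as pa

payload = b'\x00' * (3 * 1024 ** 3)  # one buffer larger than 2 GB

with pa.OSFile('big.bin', mode='wb') as f:  # OSFile wraps the C++ file APIs
    f.write(payload)                        # previously raised IOError on Windows

with pa.OSFile('big.bin', mode='rb') as f:
    data = f.read(len(payload))             # reads are chunked the same way
```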
Author: Licht-T Closes #1311 from Licht-T/feature-large-file-io-windows and squashes the following commits: 690d2801 [Licht-T] ENH: Support large file io on Windows --- cpp/src/arrow/io/file.cc | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc index 057cad1111685..1ec5e23e587d5 100644 --- a/cpp/src/arrow/io/file.cc +++ b/cpp/src/arrow/io/file.cc @@ -255,19 +255,18 @@ static inline Status FileSeek(int fd, int64_t pos) { static inline Status FileRead(const int fd, uint8_t* buffer, const int64_t nbytes, int64_t* bytes_read) { -#if defined(_MSC_VER) - if (nbytes > ARROW_MAX_IO_CHUNKSIZE) { - return Status::IOError("Unable to read > 2GB blocks yet"); - } - *bytes_read = static_cast(_read(fd, buffer, static_cast(nbytes))); -#else *bytes_read = 0; while (*bytes_read != -1 && *bytes_read < nbytes) { int64_t chunksize = std::min(static_cast(ARROW_MAX_IO_CHUNKSIZE), nbytes - *bytes_read); +#if defined(_MSC_VER) + int64_t ret = static_cast( + _read(fd, buffer + *bytes_read, static_cast(chunksize))); +#else int64_t ret = static_cast( read(fd, buffer + *bytes_read, static_cast(chunksize))); +#endif if (ret != -1) { *bytes_read += ret; @@ -279,7 +278,6 @@ static inline Status FileRead(const int fd, uint8_t* buffer, const int64_t nbyte *bytes_read = ret; } } -#endif if (*bytes_read == -1) { return Status::IOError(std::string("Error reading bytes from file: ") + @@ -292,25 +290,23 @@ static inline Status FileRead(const int fd, uint8_t* buffer, const int64_t nbyte static inline Status FileWrite(const int fd, const uint8_t* buffer, const int64_t nbytes) { int ret = 0; -#if defined(_MSC_VER) - if (nbytes > ARROW_MAX_IO_CHUNKSIZE) { - return Status::IOError("Unable to write > 2GB blocks to file yet"); - } - ret = static_cast(_write(fd, buffer, static_cast(nbytes))); -#else int64_t bytes_written = 0; while (ret != -1 && bytes_written < nbytes) { int64_t chunksize = std::min(static_cast(ARROW_MAX_IO_CHUNKSIZE), nbytes - bytes_written); +#if defined(_MSC_VER) + ret = static_cast( + _write(fd, buffer + bytes_written, static_cast(chunksize))); +#else ret = static_cast( write(fd, buffer + bytes_written, static_cast(chunksize))); +#endif if (ret != -1) { bytes_written += ret; } } -#endif if (ret == -1) { return Status::IOError(std::string("Error writing bytes from file: ") + From 6f8e2873c3931ff279b05a5e517a87637eb13d05 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Mon, 13 Nov 2017 22:47:22 -0500 Subject: [PATCH 040/177] ARROW-1743: [Python] Avoid non-array writeable-flag check This closes [ARROW-1743](https://issues.apache.org/jira/projects/ARROW/issues/ARROW-1743). 
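The failure mode this guards against, as a minimal sketch (illustrative, assuming a pandas version where a categorical Series exposes its values as a `pandas.Categorical` rather than a NumPy ndarray):

```python
import pandas as pd

ndarray_values = pd.Series([1, 2, 3]).values                              # numpy.ndarray
categorical_values = pd.Series(['a', 'b', 'a'], dtype='category').values  # Categorical

hasattr(ndarray_values, 'flags')      # True: ndarrays carry a writeable flag
hasattr(categorical_values, 'flags')  # False: the unconditional flags check
                                      # raised AttributeError for such indexes
```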
Author: Licht-T Closes #1260 from Licht-T/fix-non-array-writable-check and squashes the following commits: 8a104f7c [Licht-T] TST: Add test_categorical_row_index 555acdeb [Licht-T] BUG: Avoid the writeable-flag check for non-array index --- python/pyarrow/pandas_compat.py | 2 +- python/pyarrow/tests/test_convert_pandas.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index db28ee09e1e73..41eaf0bac4525 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -397,7 +397,7 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1): col = table.column(i) col_pandas = col.to_pandas() values = col_pandas.values - if not values.flags.writeable: + if hasattr(values, 'flags') and not values.flags.writeable: # ARROW-1054: in pandas 0.19.2, factorize will reject # non-writeable arrays when calling MultiIndex.from_arrays values = values.copy() diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index dabccac37c3d8..b9c3a12213243 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -198,6 +198,13 @@ def test_datetimetz_column_index(self): md = column_indexes['metadata'] assert md['timezone'] == 'America/New_York' + def test_categorical_row_index(self): + df = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3]}) + df['a'] = df.a.astype('category') + df = df.set_index('a') + + self._check_pandas_roundtrip(df, preserve_index=True) + def test_float_no_nulls(self): data = {} fields = [] From 8f2d15256d6b3a9da569797f363518abf50d23f5 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Mon, 13 Nov 2017 22:53:58 -0500 Subject: [PATCH 041/177] ARROW-1802: [GLib] Support arrow-gpu arrow-gpu isn't required. If `arrow-gpu.pc` isn't installed, GPU support is just ignored. 
Author: Kouhei Sutou Closes #1313 from kou/glib-gpu and squashes the following commits: b36d491a [Kouhei Sutou] [GLib] Support arrow-gpu --- c_glib/.gitignore | 1 + c_glib/Makefile.am | 1 + c_glib/arrow-glib/Makefile.am | 8 +- c_glib/arrow-glib/buffer.h | 82 +- c_glib/arrow-glib/input-stream.h | 136 +-- c_glib/arrow-glib/meson.build | 33 +- c_glib/arrow-glib/output-stream.h | 47 +- c_glib/arrow-glib/readable.cpp | 4 +- c_glib/arrow-glib/readable.hpp | 1 + c_glib/arrow-gpu-glib/Makefile.am | 109 +++ c_glib/arrow-gpu-glib/arrow-gpu-glib.h | 24 + c_glib/arrow-gpu-glib/arrow-gpu-glib.hpp | 24 + c_glib/arrow-gpu-glib/arrow-gpu-glib.pc.in | 28 + c_glib/arrow-gpu-glib/cuda.cpp | 941 +++++++++++++++++++++ c_glib/arrow-gpu-glib/cuda.h | 181 ++++ c_glib/arrow-gpu-glib/cuda.hpp | 54 ++ c_glib/arrow-gpu-glib/meson.build | 80 ++ c_glib/configure.ac | 26 +- c_glib/doc/Makefile.am | 2 +- c_glib/doc/reference/Makefile.am | 11 + c_glib/doc/reference/arrow-glib-docs.sgml | 10 + c_glib/doc/reference/meson.build | 23 +- c_glib/meson.build | 9 +- c_glib/test/run-test.rb | 6 + c_glib/test/run-test.sh | 37 +- c_glib/test/test-gpu-cuda.rb | 144 ++++ 26 files changed, 1744 insertions(+), 278 deletions(-) create mode 100644 c_glib/arrow-gpu-glib/Makefile.am create mode 100644 c_glib/arrow-gpu-glib/arrow-gpu-glib.h create mode 100644 c_glib/arrow-gpu-glib/arrow-gpu-glib.hpp create mode 100644 c_glib/arrow-gpu-glib/arrow-gpu-glib.pc.in create mode 100644 c_glib/arrow-gpu-glib/cuda.cpp create mode 100644 c_glib/arrow-gpu-glib/cuda.h create mode 100644 c_glib/arrow-gpu-glib/cuda.hpp create mode 100644 c_glib/arrow-gpu-glib/meson.build create mode 100644 c_glib/test/test-gpu-cuda.rb diff --git a/c_glib/.gitignore b/c_glib/.gitignore index 03bb0fe61892e..2719147405f71 100644 --- a/c_glib/.gitignore +++ b/c_glib/.gitignore @@ -41,6 +41,7 @@ Makefile.in /arrow-glib/enums.h /arrow-glib/stamp-* /arrow-glib/*.pc +/arrow-gpu-glib/*.pc /example/build /example/read-batch /example/read-stream diff --git a/c_glib/Makefile.am b/c_glib/Makefile.am index 577b749fb38bc..4cc70e5a08870 100644 --- a/c_glib/Makefile.am +++ b/c_glib/Makefile.am @@ -19,6 +19,7 @@ ACLOCAL_AMFLAGS = -I m4 ${ACLOCAL_FLAGS} SUBDIRS = \ arrow-glib \ + arrow-gpu-glib \ doc \ example \ tool diff --git a/c_glib/arrow-glib/Makefile.am b/c_glib/arrow-glib/Makefile.am index bf68ec4910e77..5ecb1a6614268 100644 --- a/c_glib/arrow-glib/Makefile.am +++ b/c_glib/arrow-glib/Makefile.am @@ -203,20 +203,18 @@ pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = \ arrow-glib.pc -# GObject Introspection +if HAVE_INTROSPECTION -include $(INTROSPECTION_MAKEFILE) INTROSPECTION_GIRS = INTROSPECTION_SCANNER_ARGS = INTROSPECTION_COMPILER_ARGS = -if HAVE_INTROSPECTION Arrow-1.0.gir: libarrow-glib.la Arrow_1_0_gir_PACKAGES = \ - gobject-2.0 \ gio-2.0 -Arrow_1_0_gir_EXPORT_PACKAGES = arrow +Arrow_1_0_gir_EXPORT_PACKAGES = \ + arrow-glib Arrow_1_0_gir_INCLUDES = \ - GObject-2.0 \ Gio-2.0 Arrow_1_0_gir_CFLAGS = \ $(AM_CPPFLAGS) diff --git a/c_glib/arrow-glib/buffer.h b/c_glib/arrow-glib/buffer.h index b3f3a2cdc5e9b..300bb4f4ea3ca 100644 --- a/c_glib/arrow-glib/buffer.h +++ b/c_glib/arrow-glib/buffer.h @@ -19,44 +19,21 @@ #pragma once -#include +#include G_BEGIN_DECLS -#define GARROW_TYPE_BUFFER \ - (garrow_buffer_get_type()) -#define GARROW_BUFFER(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), GARROW_TYPE_BUFFER, GArrowBuffer)) -#define GARROW_BUFFER_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), GARROW_TYPE_BUFFER, GArrowBufferClass)) -#define GARROW_IS_BUFFER(obj) \ - 
(G_TYPE_CHECK_INSTANCE_TYPE((obj), GARROW_TYPE_BUFFER)) -#define GARROW_IS_BUFFER_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), GARROW_TYPE_BUFFER)) -#define GARROW_BUFFER_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), GARROW_TYPE_BUFFER, GArrowBufferClass)) - -typedef struct _GArrowBuffer GArrowBuffer; -typedef struct _GArrowBufferClass GArrowBufferClass; - -/** - * GArrowBuffer: - * - * It wraps `arrow::Buffer`. - */ -struct _GArrowBuffer -{ - /*< private >*/ - GObject parent_instance; -}; - +#define GARROW_TYPE_BUFFER (garrow_buffer_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowBuffer, + garrow_buffer, + GARROW, + BUFFER, + GObject) struct _GArrowBufferClass { GObjectClass parent_class; }; -GType garrow_buffer_get_type (void) G_GNUC_CONST; - GArrowBuffer *garrow_buffer_new (const guint8 *data, gint64 size); gboolean garrow_buffer_equal (GArrowBuffer *buffer, @@ -80,49 +57,16 @@ GArrowBuffer *garrow_buffer_slice (GArrowBuffer *buffer, gint64 size); -#define GARROW_TYPE_MUTABLE_BUFFER \ - (garrow_mutable_buffer_get_type()) -#define GARROW_MUTABLE_BUFFER(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_MUTABLE_BUFFER, \ - GArrowMutableBuffer)) -#define GARROW_MUTABLE_BUFFER_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_MUTABLE_BUFFER, \ - GArrowMutableBufferClass)) -#define GARROW_IS_MUTABLE_BUFFER(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), GARROW_TYPE_MUTABLE_BUFFER)) -#define GARROW_IS_MUTABLE_BUFFER_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), GARROW_TYPE_MUTABLE_BUFFER)) -#define GARROW_MUTABLE_BUFFER_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_MUTABLE_BUFFER, \ - GArrowMutableBufferClass)) - -typedef struct _GArrowMutableBuffer GArrowMutableBuffer; -#ifndef __GTK_DOC_IGNORE__ -typedef struct _GArrowMutableBufferClass GArrowMutableBufferClass; -#endif - -/** - * GArrowMutableBuffer: - * - * It wraps `arrow::MutableBuffer`. 
- */ -struct _GArrowMutableBuffer -{ - /*< private >*/ - GArrowBuffer parent_instance; -}; - -#ifndef __GTK_DOC_IGNORE__ +#define GARROW_TYPE_MUTABLE_BUFFER (garrow_mutable_buffer_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowMutableBuffer, + garrow_mutable_buffer, + GARROW, + MUTABLE_BUFFER, + GArrowBuffer) struct _GArrowMutableBufferClass { GArrowBufferClass parent_class; }; -#endif - -GType garrow_mutable_buffer_get_type(void) G_GNUC_CONST; GArrowMutableBuffer *garrow_mutable_buffer_new (guint8 *data, gint64 size); diff --git a/c_glib/arrow-glib/input-stream.h b/c_glib/arrow-glib/input-stream.h index 12c7ae700f79d..c2068d6ac0e41 100644 --- a/c_glib/arrow-glib/input-stream.h +++ b/c_glib/arrow-glib/input-stream.h @@ -26,98 +26,28 @@ G_BEGIN_DECLS -#define GARROW_TYPE_INPUT_STREAM \ - (garrow_input_stream_get_type()) -#define GARROW_INPUT_STREAM(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_INPUT_STREAM, \ - GArrowInputStream)) -#define GARROW_INPUT_STREAM_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_INPUT_STREAM, \ - GArrowInputStreamClass)) -#define GARROW_IS_INPUT_STREAM(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_INPUT_STREAM)) -#define GARROW_IS_INPUT_STREAM_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_INPUT_STREAM)) -#define GARROW_INPUT_STREAM_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_INPUT_STREAM, \ - GArrowInputStreamClass)) - -typedef struct _GArrowInputStream GArrowInputStream; -#ifndef __GTK_DOC_IGNORE__ -typedef struct _GArrowInputStreamClass GArrowInputStreamClass; -#endif - -/** - * GArrowInputStream: - * - * It wraps `arrow::io::InputStream`. - */ -struct _GArrowInputStream -{ - /*< private >*/ - GObject parent_instance; -}; - -#ifndef __GTK_DOC_IGNORE__ +#define GARROW_TYPE_INPUT_STREAM (garrow_input_stream_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowInputStream, + garrow_input_stream, + GARROW, + INPUT_STREAM, + GObject) struct _GArrowInputStreamClass { GObjectClass parent_class; }; -#endif - -GType garrow_input_stream_get_type(void) G_GNUC_CONST; - #define GARROW_TYPE_SEEKABLE_INPUT_STREAM \ (garrow_seekable_input_stream_get_type()) -#define GARROW_SEEKABLE_INPUT_STREAM(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_SEEKABLE_INPUT_STREAM, \ - GArrowSeekableInputStream)) -#define GARROW_SEEKABLE_INPUT_STREAM_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_SEEKABLE_INPUT_STREAM, \ - GArrowSeekableInputStreamClass)) -#define GARROW_IS_SEEKABLE_INPUT_STREAM(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_SEEKABLE_INPUT_STREAM)) -#define GARROW_IS_SEEKABLE_INPUT_STREAM_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_SEEKABLE_INPUT_STREAM)) -#define GARROW_SEEKABLE_INPUT_STREAM_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_SEEKABLE_INPUT_STREAM, \ - GArrowSeekableInputStreamClass)) - -typedef struct _GArrowSeekableInputStream GArrowSeekableInputStream; -#ifndef __GTK_DOC_IGNORE__ -typedef struct _GArrowSeekableInputStreamClass GArrowSeekableInputStreamClass; -#endif - -/** - * GArrowSeekableInputStream: - * - * It wraps `arrow::io::RandomAccessFile`. 
- */ -struct _GArrowSeekableInputStream -{ - /*< private >*/ - GArrowInputStream parent_instance; -}; - -#ifndef __GTK_DOC_IGNORE__ +G_DECLARE_DERIVABLE_TYPE(GArrowSeekableInputStream, + garrow_seekable_input_stream, + GARROW, + SEEKABLE_INPUT_STREAM, + GArrowInputStream) struct _GArrowSeekableInputStreamClass { GArrowInputStreamClass parent_class; }; -#endif - -GType garrow_seekable_input_stream_get_type(void) G_GNUC_CONST; guint64 garrow_seekable_input_stream_get_size(GArrowSeekableInputStream *input_stream, GError **error); @@ -133,49 +63,15 @@ GArrowTensor *garrow_seekable_input_stream_read_tensor(GArrowSeekableInputStream #define GARROW_TYPE_BUFFER_INPUT_STREAM \ (garrow_buffer_input_stream_get_type()) -#define GARROW_BUFFER_INPUT_STREAM(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_BUFFER_INPUT_STREAM, \ - GArrowBufferInputStream)) -#define GARROW_BUFFER_INPUT_STREAM_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_BUFFER_INPUT_STREAM, \ - GArrowBufferInputStreamClass)) -#define GARROW_IS_BUFFER_INPUT_STREAM(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_BUFFER_INPUT_STREAM)) -#define GARROW_IS_BUFFER_INPUT_STREAM_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_BUFFER_INPUT_STREAM)) -#define GARROW_BUFFER_INPUT_STREAM_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_BUFFER_INPUT_STREAM, \ - GArrowBufferInputStreamClass)) - -typedef struct _GArrowBufferInputStream GArrowBufferInputStream; -#ifndef __GTK_DOC_IGNORE__ -typedef struct _GArrowBufferInputStreamClass GArrowBufferInputStreamClass; -#endif - -/** - * GArrowBufferInputStream: - * - * It wraps `arrow::io::BufferReader`. - */ -struct _GArrowBufferInputStream -{ - /*< private >*/ - GArrowSeekableInputStream parent_instance; -}; - -#ifndef __GTK_DOC_IGNORE__ +G_DECLARE_DERIVABLE_TYPE(GArrowBufferInputStream, + garrow_buffer_input_stream, + GARROW, + BUFFER_INPUT_STREAM, + GArrowSeekableInputStream) struct _GArrowBufferInputStreamClass { GArrowSeekableInputStreamClass parent_class; }; -#endif - -GType garrow_buffer_input_stream_get_type(void) G_GNUC_CONST; GArrowBufferInputStream *garrow_buffer_input_stream_new(GArrowBuffer *buffer); diff --git a/c_glib/arrow-glib/meson.build b/c_glib/arrow-glib/meson.build index 464a002e78b0c..aeec4172d6284 100644 --- a/c_glib/arrow-glib/meson.build +++ b/c_glib/arrow-glib/meson.build @@ -179,22 +179,23 @@ pkgconfig.generate(filebase: meson.project_name(), name: 'Apache Arrow GLib', description: 'C API for Apache Arrow based on GLib', version: version, - requires: ['gobject-2.0', 'arrow'], + requires: ['gio-2.0', 'arrow'], libraries: [libarrow_glib], subdirs: ['arrow-glib']) -gnome.generate_gir(libarrow_glib, - sources: sources + c_headers + enums, - namespace: 'Arrow', - nsversion: api_version, - identifier_prefix: 'GArrow', - symbol_prefix: 'garrow', - export_packages: 'arrow-glib', - includes: [ - 'GObject-2.0', - 'Gio-2.0', - ], - install: true, - extra_args: [ - '--warn-all', - ]) +arrow_glib_gir = gnome.generate_gir(libarrow_glib, + sources: sources + c_headers + enums, + namespace: 'Arrow', + nsversion: api_version, + identifier_prefix: 'GArrow', + symbol_prefix: 'garrow', + export_packages: 'arrow-glib', + includes: [ + 'GObject-2.0', + 'Gio-2.0', + ], + install: true, + extra_args: [ + '--warn-all', + ]) +arrow_glib_gir_dependency = declare_dependency(sources: arrow_glib_gir) diff --git a/c_glib/arrow-glib/output-stream.h b/c_glib/arrow-glib/output-stream.h index e42ebcde47d6b..195a97ac9f053 100644 --- 
a/c_glib/arrow-glib/output-stream.h +++ b/c_glib/arrow-glib/output-stream.h @@ -26,51 +26,16 @@ G_BEGIN_DECLS -#define GARROW_TYPE_OUTPUT_STREAM \ - (garrow_output_stream_get_type()) -#define GARROW_OUTPUT_STREAM(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_OUTPUT_STREAM, \ - GArrowOutputStream)) -#define GARROW_OUTPUT_STREAM_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_OUTPUT_STREAM, \ - GArrowOutputStreamClass)) -#define GARROW_IS_OUTPUT_STREAM(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_OUTPUT_STREAM)) -#define GARROW_IS_OUTPUT_STREAM_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_OUTPUT_STREAM)) -#define GARROW_OUTPUT_STREAM_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_OUTPUT_STREAM, \ - GArrowOutputStreamClass)) - -typedef struct _GArrowOutputStream GArrowOutputStream; -#ifndef __GTK_DOC_IGNORE__ -typedef struct _GArrowOutputStreamClass GArrowOutputStreamClass; -#endif - -/** - * GArrowOutputStream: - * - * It wraps `arrow::io::OutputStream`. - */ -struct _GArrowOutputStream -{ - /*< private >*/ - GObject parent_instance; -}; - -#ifndef __GTK_DOC_IGNORE__ +#define GARROW_TYPE_OUTPUT_STREAM (garrow_output_stream_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowOutputStream, + garrow_output_stream, + GARROW, + OUTPUT_STREAM, + GObject) struct _GArrowOutputStreamClass { GObjectClass parent_class; }; -#endif - -GType garrow_output_stream_get_type(void) G_GNUC_CONST; gint64 garrow_output_stream_write_tensor(GArrowOutputStream *stream, GArrowTensor *tensor, diff --git a/c_glib/arrow-glib/readable.cpp b/c_glib/arrow-glib/readable.cpp index 6a9023e6cddf0..33f98d98c88a4 100644 --- a/c_glib/arrow-glib/readable.cpp +++ b/c_glib/arrow-glib/readable.cpp @@ -45,6 +45,7 @@ G_DEFINE_INTERFACE(GArrowReadable, static void garrow_readable_default_init (GArrowReadableInterface *iface) { + iface->new_raw = garrow_buffer_new_raw; } /** @@ -66,7 +67,8 @@ garrow_readable_read(GArrowReadable *readable, std::shared_ptr<arrow::Buffer> arrow_buffer; auto status = arrow_readable->Read(n_bytes, &arrow_buffer); if (garrow_error_check(error, status, "[io][readable][read]")) { - return garrow_buffer_new_raw(&arrow_buffer); + auto *iface = GARROW_READABLE_GET_IFACE(readable); + return iface->new_raw(&arrow_buffer); } else { return NULL; } diff --git a/c_glib/arrow-glib/readable.hpp b/c_glib/arrow-glib/readable.hpp index c241c77aa0329..ce7770103aa1a 100644 --- a/c_glib/arrow-glib/readable.hpp +++ b/c_glib/arrow-glib/readable.hpp @@ -32,6 +32,7 @@ struct _GArrowReadableInterface { GTypeInterface parent_iface; + GArrowBuffer *(*new_raw)(std::shared_ptr<arrow::Buffer> *arrow_buffer); std::shared_ptr<arrow::io::Readable> (*get_raw)(GArrowReadable *file); }; diff --git a/c_glib/arrow-gpu-glib/Makefile.am b/c_glib/arrow-gpu-glib/Makefile.am new file mode 100644 index 0000000000000..ec9615987ee5e --- /dev/null +++ b/c_glib/arrow-gpu-glib/Makefile.am @@ -0,0 +1,109 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +CLEANFILES = + +EXTRA_DIST = \ + meson.build + +AM_CPPFLAGS = \ + -I$(top_builddir) \ + -I$(top_srcdir) + +if HAVE_ARROW_GPU +lib_LTLIBRARIES = \ + libarrow-gpu-glib.la + +libarrow_gpu_glib_la_CXXFLAGS = \ + $(GLIB_CFLAGS) \ + $(ARROW_CFLAGS) \ + $(ARROW_GPU_CFLAGS) \ + $(GARROW_CXXFLAGS) + +libarrow_gpu_glib_la_LIBADD = \ + $(GLIB_LIBS) \ + $(ARROW_LIBS) \ + $(ARROW_GPU_LIBS) \ + ../arrow-glib/libarrow-glib.la + +libarrow_gpu_glib_la_headers = \ + arrow-gpu-glib.h \ + cuda.h + +libarrow_gpu_glib_la_sources = \ + cuda.cpp \ + $(libarrow_gpu_glib_la_headers) + +libarrow_gpu_glib_la_cpp_headers = \ + arrow-gpu-glib.hpp \ + cuda.hpp + +libarrow_gpu_glib_la_SOURCES = \ + $(libarrow_gpu_glib_la_sources) \ + $(libarrow_gpu_glib_la_cpp_headers) + +arrow_gpu_glib_includedir = \ + $(includedir)/arrow-gpu-glib +arrow_gpu_glib_include_HEADERS = \ + $(libarrow_gpu_glib_la_headers) \ + $(libarrow_gpu_glib_la_cpp_headers) + +pkgconfigdir = $(libdir)/pkgconfig +pkgconfig_DATA = \ + arrow-gpu-glib.pc + +if HAVE_INTROSPECTION +-include $(INTROSPECTION_MAKEFILE) +INTROSPECTION_GIRS = +INTROSPECTION_SCANNER_ARGS = +INTROSPECTION_SCANNER_ENV = \ + PKG_CONFIG_PATH=${abs_builddir}/../arrow-glib:$${PKG_CONFIG_PATH} +INTROSPECTION_COMPILER_ARGS = \ + --includedir=$(abs_builddir)/../arrow-glib + +ArrowGPU-1.0.gir: libarrow-gpu-glib.la +ArrowGPU_1_0_gir_PACKAGES = \ + arrow-glib +ArrowGPU_1_0_gir_EXPORT_PACKAGES = \ + arrow-gpu-glib +ArrowGPU_1_0_gir_INCLUDES = \ + Arrow-1.0 +ArrowGPU_1_0_gir_CFLAGS = \ + $(AM_CPPFLAGS) +ArrowGPU_1_0_gir_LIBS = \ + $(abs_builddir)/../arrow-glib/libarrow-glib.la \ + libarrow-gpu-glib.la +ArrowGPU_1_0_gir_FILES = \ + $(libarrow_gpu_glib_la_sources) +ArrowGPU_1_0_gir_SCANNERFLAGS = \ + --warn-all \ + --add-include-path=$(abs_builddir)/../arrow-glib \ + --identifier-prefix=GArrowGPU \ + --symbol-prefix=garrow_gpu +INTROSPECTION_GIRS += ArrowGPU-1.0.gir + +girdir = $(datadir)/gir-1.0 +gir_DATA = $(INTROSPECTION_GIRS) + +typelibdir = $(libdir)/girepository-1.0 +typelib_DATA = $(INTROSPECTION_GIRS:.gir=.typelib) + +CLEANFILES += \ + $(gir_DATA) \ + $(typelib_DATA) +endif +endif diff --git a/c_glib/arrow-gpu-glib/arrow-gpu-glib.h b/c_glib/arrow-gpu-glib/arrow-gpu-glib.h new file mode 100644 index 0000000000000..1538c9a1865ac --- /dev/null +++ b/c_glib/arrow-gpu-glib/arrow-gpu-glib.h @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <arrow-glib/arrow-glib.h> + +#include <arrow-gpu-glib/cuda.h> diff --git a/c_glib/arrow-gpu-glib/arrow-gpu-glib.hpp b/c_glib/arrow-gpu-glib/arrow-gpu-glib.hpp new file mode 100644 index 0000000000000..92017d8b67aab --- /dev/null +++ b/c_glib/arrow-gpu-glib/arrow-gpu-glib.hpp @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <arrow-glib/arrow-glib.hpp> + +#include <arrow-gpu-glib/cuda.hpp> diff --git a/c_glib/arrow-gpu-glib/arrow-gpu-glib.pc.in b/c_glib/arrow-gpu-glib/arrow-gpu-glib.pc.in new file mode 100644 index 0000000000000..38a6bae1a1298 --- /dev/null +++ b/c_glib/arrow-gpu-glib/arrow-gpu-glib.pc.in @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ + +Name: Apache Arrow GPU GLib +Description: C API for Apache Arrow GPU based on GLib +Version: @VERSION@ +Libs: -L${libdir} -larrow-gpu-glib +Cflags: -I${includedir} +Requires: arrow-glib diff --git a/c_glib/arrow-gpu-glib/cuda.cpp b/c_glib/arrow-gpu-glib/cuda.cpp new file mode 100644 index 0000000000000..c2a9af54dda94 --- /dev/null +++ b/c_glib/arrow-gpu-glib/cuda.cpp @@ -0,0 +1,941 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied.
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +#include <arrow-glib/buffer.hpp> +#include <arrow-glib/error.hpp> +#include <arrow-glib/input-stream.hpp> +#include <arrow-glib/output-stream.hpp> +#include <arrow-glib/readable.hpp> +#include <arrow-glib/record-batch.hpp> +#include <arrow-glib/schema.hpp> + +#include <arrow-gpu-glib/cuda.hpp> + +G_BEGIN_DECLS + +/** + * SECTION: cuda + * @section_id: cuda-classes + * @title: CUDA related classes + * @include: arrow-gpu-glib/arrow-gpu-glib.h + * + * The following classes provide CUDA support for Apache Arrow data. + * + * #GArrowGPUCUDADeviceManager is the starting point. You need at + * least one #GArrowGPUCUDAContext to process Apache Arrow data on + * an NVIDIA GPU. + * + * #GArrowGPUCUDAContext is a class to keep context for one GPU. You + * need to create #GArrowGPUCUDAContext for each GPU that you want to + * use. You can create #GArrowGPUCUDAContext by + * garrow_gpu_cuda_device_manager_get_context(). + * + * #GArrowGPUCUDABuffer is a class for data on GPU. You can copy data + * on GPU to/from CPU by garrow_gpu_cuda_buffer_copy_to_host() and + * garrow_gpu_cuda_buffer_copy_from_host(). You can share data on GPU + * with other processes by garrow_gpu_cuda_buffer_export() and + * garrow_gpu_cuda_buffer_new_ipc(). + * + * #GArrowGPUCUDAHostBuffer is a class for data on CPU that is + * directly accessible from GPU. + * + * #GArrowGPUCUDAIPCMemoryHandle is a class to share data on GPU with + * other processes. You can export your data on GPU to other processes + * by garrow_gpu_cuda_buffer_export() and + * garrow_gpu_cuda_ipc_memory_handle_new(). You can import data on GPU + * from other processes by garrow_gpu_cuda_ipc_memory_handle_new() and + * garrow_gpu_cuda_buffer_new_ipc(). + * + * #GArrowGPUCUDABufferInputStream is a class to read data in + * #GArrowGPUCUDABuffer. + * + * #GArrowGPUCUDABufferOutputStream is a class to write data into + * #GArrowGPUCUDABuffer. + */ + +G_DEFINE_TYPE(GArrowGPUCUDADeviceManager, + garrow_gpu_cuda_device_manager, + G_TYPE_OBJECT) + +static void +garrow_gpu_cuda_device_manager_init(GArrowGPUCUDADeviceManager *object) +{ +} + +static void +garrow_gpu_cuda_device_manager_class_init(GArrowGPUCUDADeviceManagerClass *klass) +{ +} + +/** + * garrow_gpu_cuda_device_manager_new: + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: A newly created #GArrowGPUCUDADeviceManager on success, + * %NULL on error. + * + * Since: 0.8.0 + */ +GArrowGPUCUDADeviceManager * +garrow_gpu_cuda_device_manager_new(GError **error) +{ + arrow::gpu::CudaDeviceManager *manager; + auto status = arrow::gpu::CudaDeviceManager::GetInstance(&manager); + if (garrow_error_check(error, status, "[gpu][cuda][device-manager][new]")) { + auto manager = g_object_new(GARROW_GPU_TYPE_CUDA_DEVICE_MANAGER, + NULL); + return GARROW_GPU_CUDA_DEVICE_MANAGER(manager); + } else { + return NULL; + } +} + +/** + * garrow_gpu_cuda_device_manager_get_context: + * @manager: A #GArrowGPUCUDADeviceManager. + * @gpu_number: A GPU device number for the target context. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): A newly created #GArrowGPUCUDAContext on + * success, %NULL on error. Contexts for the same GPU device number + * share the same data internally.
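 + *
 + * A minimal usage sketch (it assumes at least one CUDA device, so
 + * that device number 0 is valid, and omits error handling):
 + * |[<!-- language="C" -->
 + * GError *error = NULL;
 + * GArrowGPUCUDADeviceManager *manager =
 + *   garrow_gpu_cuda_device_manager_new(&error);
 + * GArrowGPUCUDAContext *context =
 + *   garrow_gpu_cuda_device_manager_get_context(manager, 0, &error);
 + * ]|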
+ * + * Since: 0.8.0 + */ +GArrowGPUCUDAContext * +garrow_gpu_cuda_device_manager_get_context(GArrowGPUCUDADeviceManager *manager, + gint gpu_number, + GError **error) +{ + arrow::gpu::CudaDeviceManager *arrow_manager; + arrow::gpu::CudaDeviceManager::GetInstance(&arrow_manager); + std::shared_ptr<arrow::gpu::CudaContext> context; + auto status = arrow_manager->GetContext(gpu_number, &context); + if (garrow_error_check(error, status, + "[gpu][cuda][device-manager][get-context]")) { + return garrow_gpu_cuda_context_new_raw(&context); + } else { + return NULL; + } +} + +/** + * garrow_gpu_cuda_device_manager_get_n_devices: + * @manager: A #GArrowGPUCUDADeviceManager. + * + * Returns: The number of GPU devices. + * + * Since: 0.8.0 + */ +gsize +garrow_gpu_cuda_device_manager_get_n_devices(GArrowGPUCUDADeviceManager *manager) +{ + arrow::gpu::CudaDeviceManager *arrow_manager; + arrow::gpu::CudaDeviceManager::GetInstance(&arrow_manager); + return arrow_manager->num_devices(); +} + + +typedef struct GArrowGPUCUDAContextPrivate_ { + std::shared_ptr<arrow::gpu::CudaContext> context; +} GArrowGPUCUDAContextPrivate; + +enum { + PROP_CONTEXT = 1 +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowGPUCUDAContext, + garrow_gpu_cuda_context, + G_TYPE_OBJECT) + +#define GARROW_GPU_CUDA_CONTEXT_GET_PRIVATE(object) \ + static_cast<GArrowGPUCUDAContextPrivate *>( \ + garrow_gpu_cuda_context_get_instance_private( \ + GARROW_GPU_CUDA_CONTEXT(object))) + +static void +garrow_gpu_cuda_context_finalize(GObject *object) +{ + auto priv = GARROW_GPU_CUDA_CONTEXT_GET_PRIVATE(object); + + priv->context = nullptr; + + G_OBJECT_CLASS(garrow_gpu_cuda_context_parent_class)->finalize(object); +} + +static void +garrow_gpu_cuda_context_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_GPU_CUDA_CONTEXT_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_CONTEXT: + priv->context = + *static_cast<std::shared_ptr<arrow::gpu::CudaContext> *>(g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_gpu_cuda_context_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + switch (prop_id) { + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_gpu_cuda_context_init(GArrowGPUCUDAContext *object) +{ +} + +static void +garrow_gpu_cuda_context_class_init(GArrowGPUCUDAContextClass *klass) +{ + GParamSpec *spec; + + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = garrow_gpu_cuda_context_finalize; + gobject_class->set_property = garrow_gpu_cuda_context_set_property; + gobject_class->get_property = garrow_gpu_cuda_context_get_property; + + /** + * GArrowGPUCUDAContext:context: + * + * Since: 0.8.0 + */ + spec = g_param_spec_pointer("context", + "Context", + "The raw std::shared_ptr<arrow::gpu::CudaContext> *", + static_cast<GParamFlags>(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_CONTEXT, spec); +} + +/** + * garrow_gpu_cuda_context_get_allocated_size: + * @context: A #GArrowGPUCUDAContext. + * + * Returns: The memory allocated by this context in bytes.
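 + *
 + * A sketch of how the value tracks allocations (it assumes a valid
 + * @context and omits error handling):
 + * |[<!-- language="C" -->
 + * gint64 before = garrow_gpu_cuda_context_get_allocated_size(context);
 + * GArrowGPUCUDABuffer *buffer =
 + *   garrow_gpu_cuda_buffer_new(context, 128, &error);
 + * gint64 after = garrow_gpu_cuda_context_get_allocated_size(context);
 + * ]|
 + * Here `after - before` is the size of the new buffer, 128.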
+ * + * Since: 0.8.0 + */ +gint64 +garrow_gpu_cuda_context_get_allocated_size(GArrowGPUCUDAContext *context) +{ + auto arrow_context = garrow_gpu_cuda_context_get_raw(context); + return arrow_context->bytes_allocated(); +} + + +G_DEFINE_TYPE(GArrowGPUCUDABuffer, + garrow_gpu_cuda_buffer, + GARROW_TYPE_BUFFER) + +static void +garrow_gpu_cuda_buffer_init(GArrowGPUCUDABuffer *object) +{ +} + +static void +garrow_gpu_cuda_buffer_class_init(GArrowGPUCUDABufferClass *klass) +{ +} + +/** + * garrow_gpu_cuda_buffer_new: + * @context: A #GArrowGPUCUDAContext. + * @size: The number of bytes to be allocated on GPU device for this context. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): A newly created #GArrowGPUCUDABuffer on + * success, %NULL on error. + * + * Since: 0.8.0 + */ +GArrowGPUCUDABuffer * +garrow_gpu_cuda_buffer_new(GArrowGPUCUDAContext *context, + gint64 size, + GError **error) +{ + auto arrow_context = garrow_gpu_cuda_context_get_raw(context); + std::shared_ptr<arrow::gpu::CudaBuffer> arrow_buffer; + auto status = arrow_context->Allocate(size, &arrow_buffer); + if (garrow_error_check(error, status, "[gpu][cuda][buffer][new]")) { + return garrow_gpu_cuda_buffer_new_raw(&arrow_buffer); + } else { + return NULL; + } +} + +/** + * garrow_gpu_cuda_buffer_new_ipc: + * @context: A #GArrowGPUCUDAContext. + * @handle: A #GArrowGPUCUDAIPCMemoryHandle to be communicated. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): A newly created #GArrowGPUCUDABuffer on + * success, %NULL on error. The buffer has data from the IPC target. + * + * Since: 0.8.0 + */ +GArrowGPUCUDABuffer * +garrow_gpu_cuda_buffer_new_ipc(GArrowGPUCUDAContext *context, + GArrowGPUCUDAIPCMemoryHandle *handle, + GError **error) +{ + auto arrow_context = garrow_gpu_cuda_context_get_raw(context); + auto arrow_handle = garrow_gpu_cuda_ipc_memory_handle_get_raw(handle); + std::shared_ptr<arrow::gpu::CudaBuffer> arrow_buffer; + auto status = arrow_context->OpenIpcBuffer(*arrow_handle, &arrow_buffer); + if (garrow_error_check(error, status, + "[gpu][cuda][buffer][new-ipc]")) { + return garrow_gpu_cuda_buffer_new_raw(&arrow_buffer); + } else { + return NULL; + } +} + +/** + * garrow_gpu_cuda_buffer_new_record_batch: + * @context: A #GArrowGPUCUDAContext. + * @record_batch: A #GArrowRecordBatch to be serialized. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): A newly created #GArrowGPUCUDABuffer on + * success, %NULL on error. The buffer has serialized record batch + * data. + * + * Since: 0.8.0 + */ +GArrowGPUCUDABuffer * +garrow_gpu_cuda_buffer_new_record_batch(GArrowGPUCUDAContext *context, + GArrowRecordBatch *record_batch, + GError **error) +{ + auto arrow_context = garrow_gpu_cuda_context_get_raw(context); + auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); + std::shared_ptr<arrow::gpu::CudaBuffer> arrow_buffer; + auto status = arrow::gpu::SerializeRecordBatch(*arrow_record_batch, + arrow_context.get(), + &arrow_buffer); + if (garrow_error_check(error, status, + "[gpu][cuda][buffer][new-record-batch]")) { + return garrow_gpu_cuda_buffer_new_raw(&arrow_buffer); + } else { + return NULL; + } +} + +/** + * garrow_gpu_cuda_buffer_copy_to_host: + * @buffer: A #GArrowGPUCUDABuffer. + * @position: The offset of memory on GPU device to be copied. + * @size: The size of memory on GPU device to be copied in bytes. + * @error: (nullable): Return location for a #GError or %NULL.
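 + *
 + * A round-trip sketch (it assumes @buffer was filled with
 + * "Hello World" via garrow_gpu_cuda_buffer_copy_from_host() and
 + * omits error handling):
 + * |[<!-- language="C" -->
 + * GBytes *data = garrow_gpu_cuda_buffer_copy_to_host(buffer, 2, 5, &error);
 + * ]|
 + * With that input, the returned #GBytes holds the five bytes "llo W".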
+ * + * Returns: (transfer full): A #GBytes that has the copied memory on the + * CPU host on success, %NULL on error. + * + * Since: 0.8.0 + */ +GBytes * +garrow_gpu_cuda_buffer_copy_to_host(GArrowGPUCUDABuffer *buffer, + gint64 position, + gint64 size, + GError **error) +{ + auto arrow_buffer = garrow_gpu_cuda_buffer_get_raw(buffer); + auto data = static_cast<guint8 *>(g_malloc(size)); + auto status = arrow_buffer->CopyToHost(position, size, data); + if (garrow_error_check(error, status, "[gpu][cuda][buffer][copy-to-host]")) { + return g_bytes_new_take(data, size); + } else { + g_free(data); + return NULL; + } +} + +/** + * garrow_gpu_cuda_buffer_copy_from_host: + * @buffer: A #GArrowGPUCUDABuffer. + * @data: (array length=size): Data on CPU host to be copied. + * @size: The size of data on CPU host to be copied in bytes. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.8.0 + */ +gboolean +garrow_gpu_cuda_buffer_copy_from_host(GArrowGPUCUDABuffer *buffer, + const guint8 *data, + gint64 size, + GError **error) +{ + auto arrow_buffer = garrow_gpu_cuda_buffer_get_raw(buffer); + auto status = arrow_buffer->CopyFromHost(0, data, size); + return garrow_error_check(error, + status, + "[gpu][cuda][buffer][copy-from-host]"); +} + +/** + * garrow_gpu_cuda_buffer_export: + * @buffer: A #GArrowGPUCUDABuffer. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): A newly created + * #GArrowGPUCUDAIPCMemoryHandle to handle the exported buffer on + * success, %NULL on error. + * + * Since: 0.8.0 + */ +GArrowGPUCUDAIPCMemoryHandle * +garrow_gpu_cuda_buffer_export(GArrowGPUCUDABuffer *buffer, GError **error) +{ + auto arrow_buffer = garrow_gpu_cuda_buffer_get_raw(buffer); + std::unique_ptr<arrow::gpu::CudaIpcMemHandle> arrow_handle; + auto status = arrow_buffer->ExportForIpc(&arrow_handle); + if (garrow_error_check(error, status, "[gpu][cuda][buffer][export-for-ipc]")) { + return garrow_gpu_cuda_ipc_memory_handle_new_raw(arrow_handle.release()); + } else { + return NULL; + } +} + +/** + * garrow_gpu_cuda_buffer_get_context: + * @buffer: A #GArrowGPUCUDABuffer. + * + * Returns: (transfer full): A newly created #GArrowGPUCUDAContext for the + * buffer. Contexts for the same buffer share the same data internally. + * + * Since: 0.8.0 + */ +GArrowGPUCUDAContext * +garrow_gpu_cuda_buffer_get_context(GArrowGPUCUDABuffer *buffer) +{ + auto arrow_buffer = garrow_gpu_cuda_buffer_get_raw(buffer); + auto arrow_context = arrow_buffer->context(); + return garrow_gpu_cuda_context_new_raw(&arrow_context); +} + +/** + * garrow_gpu_cuda_buffer_read_record_batch: + * @buffer: A #GArrowGPUCUDABuffer. + * @schema: A #GArrowSchema for record batch. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): A newly created #GArrowRecordBatch on + * success, %NULL on error. The record batch data is located on GPU.
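 + *
 + * A sketch of the serialize/deserialize round trip (it assumes
 + * @record_batch and @schema describe the same data and omits error
 + * handling):
 + * |[<!-- language="C" -->
 + * GArrowGPUCUDABuffer *buffer =
 + *   garrow_gpu_cuda_buffer_new_record_batch(context, record_batch, &error);
 + * GArrowRecordBatch *gpu_record_batch =
 + *   garrow_gpu_cuda_buffer_read_record_batch(buffer, schema, &error);
 + * ]|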
+ * + * Since: 0.8.0 + */ +GArrowRecordBatch * +garrow_gpu_cuda_buffer_read_record_batch(GArrowGPUCUDABuffer *buffer, + GArrowSchema *schema, + GError **error) +{ + auto arrow_buffer = garrow_gpu_cuda_buffer_get_raw(buffer); + auto arrow_schema = garrow_schema_get_raw(schema); + auto pool = arrow::default_memory_pool(); + std::shared_ptr<arrow::RecordBatch> arrow_record_batch; + auto status = arrow::gpu::ReadRecordBatch(arrow_schema, + arrow_buffer, + pool, + &arrow_record_batch); + if (garrow_error_check(error, status, + "[gpu][cuda][buffer][read-record-batch]")) { + return garrow_record_batch_new_raw(&arrow_record_batch); + } else { + return NULL; + } +} + + +G_DEFINE_TYPE(GArrowGPUCUDAHostBuffer, + garrow_gpu_cuda_host_buffer, + GARROW_TYPE_MUTABLE_BUFFER) + +static void +garrow_gpu_cuda_host_buffer_init(GArrowGPUCUDAHostBuffer *object) +{ +} + +static void +garrow_gpu_cuda_host_buffer_class_init(GArrowGPUCUDAHostBufferClass *klass) +{ +} + +/** + * garrow_gpu_cuda_host_buffer_new: + * @size: The number of bytes to be allocated on CPU host. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: A newly created #GArrowGPUCUDAHostBuffer on success, + * %NULL on error. The allocated memory is directly accessible from + * GPU devices. + * + * Since: 0.8.0 + */ +GArrowGPUCUDAHostBuffer * +garrow_gpu_cuda_host_buffer_new(gint64 size, GError **error) +{ + arrow::gpu::CudaDeviceManager *manager; + auto status = arrow::gpu::CudaDeviceManager::GetInstance(&manager); + std::shared_ptr<arrow::gpu::CudaHostBuffer> arrow_buffer; + status = manager->AllocateHost(size, &arrow_buffer); + if (garrow_error_check(error, status, "[gpu][cuda][host-buffer][new]")) { + return garrow_gpu_cuda_host_buffer_new_raw(&arrow_buffer); + } else { + return NULL; + } +} + + +typedef struct GArrowGPUCUDAIPCMemoryHandlePrivate_ { + arrow::gpu::CudaIpcMemHandle *ipc_memory_handle; +} GArrowGPUCUDAIPCMemoryHandlePrivate; + +enum { + PROP_IPC_MEMORY_HANDLE = 1 +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowGPUCUDAIPCMemoryHandle, + garrow_gpu_cuda_ipc_memory_handle, + G_TYPE_OBJECT) + +#define GARROW_GPU_CUDA_IPC_MEMORY_HANDLE_GET_PRIVATE(object) \ + static_cast<GArrowGPUCUDAIPCMemoryHandlePrivate *>( \ + garrow_gpu_cuda_ipc_memory_handle_get_instance_private( \ + GARROW_GPU_CUDA_IPC_MEMORY_HANDLE(object))) + +static void +garrow_gpu_cuda_ipc_memory_handle_finalize(GObject *object) +{ + auto priv = GARROW_GPU_CUDA_IPC_MEMORY_HANDLE_GET_PRIVATE(object); + + delete priv->ipc_memory_handle; + + G_OBJECT_CLASS(garrow_gpu_cuda_ipc_memory_handle_parent_class)->finalize(object); +} + +static void +garrow_gpu_cuda_ipc_memory_handle_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_GPU_CUDA_IPC_MEMORY_HANDLE_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_IPC_MEMORY_HANDLE: + priv->ipc_memory_handle = + static_cast<arrow::gpu::CudaIpcMemHandle *>(g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_gpu_cuda_ipc_memory_handle_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + switch (prop_id) { + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_gpu_cuda_ipc_memory_handle_init(GArrowGPUCUDAIPCMemoryHandle *object) +{ +} + +static void +garrow_gpu_cuda_ipc_memory_handle_class_init(GArrowGPUCUDAIPCMemoryHandleClass *klass) +{ + GParamSpec *spec; + + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = garrow_gpu_cuda_ipc_memory_handle_finalize; +
gobject_class->set_property = garrow_gpu_cuda_ipc_memory_handle_set_property; + gobject_class->get_property = garrow_gpu_cuda_ipc_memory_handle_get_property; + + /** + * GArrowGPUCUDAIPCMemoryHandle:ipc-memory-handle: + * + * Since: 0.8.0 + */ + spec = g_param_spec_pointer("ipc-memory-handle", + "IPC Memory Handle", + "The raw arrow::gpu::CudaIpcMemHandle *", + static_cast<GParamFlags>(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_IPC_MEMORY_HANDLE, spec); +} + +/** + * garrow_gpu_cuda_ipc_memory_handle_new: + * @data: (array length=size): A serialized #GArrowGPUCUDAIPCMemoryHandle. + * @size: The size of data. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): A newly created #GArrowGPUCUDAIPCMemoryHandle + * on success, %NULL on error. + * + * Since: 0.8.0 + */ +GArrowGPUCUDAIPCMemoryHandle * +garrow_gpu_cuda_ipc_memory_handle_new(const guint8 *data, + gsize size, + GError **error) +{ + std::unique_ptr<arrow::gpu::CudaIpcMemHandle> arrow_handle; + auto status = arrow::gpu::CudaIpcMemHandle::FromBuffer(data, &arrow_handle); + if (garrow_error_check(error, status, + "[gpu][cuda][ipc-memory-handle][new]")) { + return garrow_gpu_cuda_ipc_memory_handle_new_raw(arrow_handle.release()); + } else { + return NULL; + } +} + +/** + * garrow_gpu_cuda_ipc_memory_handle_serialize: + * @handle: A #GArrowGPUCUDAIPCMemoryHandle. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): A newly created #GArrowBuffer on success, + * %NULL on error. The buffer has serialized @handle. The serialized + * @handle can be deserialized by garrow_gpu_cuda_ipc_memory_handle_new() + * in another process. + * + * Since: 0.8.0 + */ +GArrowBuffer * +garrow_gpu_cuda_ipc_memory_handle_serialize(GArrowGPUCUDAIPCMemoryHandle *handle, + GError **error) +{ + auto arrow_handle = garrow_gpu_cuda_ipc_memory_handle_get_raw(handle); + std::shared_ptr<arrow::Buffer> arrow_buffer; + auto status = arrow_handle->Serialize(arrow::default_memory_pool(), + &arrow_buffer); + if (garrow_error_check(error, status, + "[gpu][cuda][ipc-memory-handle][serialize]")) { + return garrow_buffer_new_raw(&arrow_buffer); + } else { + return NULL; + } +} + +GArrowBuffer * +garrow_gpu_cuda_buffer_input_stream_new_raw_readable_interface(std::shared_ptr<arrow::Buffer> *arrow_buffer) +{ + auto buffer = GARROW_BUFFER(g_object_new(GARROW_GPU_TYPE_CUDA_BUFFER, + "buffer", arrow_buffer, + NULL)); + return buffer; +} + +static std::shared_ptr<arrow::io::Readable> +garrow_gpu_cuda_buffer_input_stream_get_raw_readable_interface(GArrowReadable *readable) +{ + auto input_stream = GARROW_INPUT_STREAM(readable); + auto arrow_input_stream = garrow_input_stream_get_raw(input_stream); + return arrow_input_stream; +} + +static void +garrow_gpu_cuda_buffer_input_stream_readable_interface_init(GArrowReadableInterface *iface) +{ + iface->new_raw = + garrow_gpu_cuda_buffer_input_stream_new_raw_readable_interface; + iface->get_raw = + garrow_gpu_cuda_buffer_input_stream_get_raw_readable_interface; +} + +G_DEFINE_TYPE_WITH_CODE( + GArrowGPUCUDABufferInputStream, + garrow_gpu_cuda_buffer_input_stream, + GARROW_TYPE_BUFFER_INPUT_STREAM, + G_IMPLEMENT_INTERFACE( + GARROW_TYPE_READABLE, + garrow_gpu_cuda_buffer_input_stream_readable_interface_init)) + +static void +garrow_gpu_cuda_buffer_input_stream_init(GArrowGPUCUDABufferInputStream *object) +{ +} + +static void +garrow_gpu_cuda_buffer_input_stream_class_init(GArrowGPUCUDABufferInputStreamClass *klass) +{ +} + +/** + * garrow_gpu_cuda_buffer_input_stream_new: + * @buffer: A
#GArrowGPUCUDABuffer. + * + * Returns: (transfer full): A newly created + * #GArrowGPUCUDABufferInputStream. + * + * Since: 0.8.0 + */ +GArrowGPUCUDABufferInputStream * +garrow_gpu_cuda_buffer_input_stream_new(GArrowGPUCUDABuffer *buffer) +{ + auto arrow_buffer = garrow_gpu_cuda_buffer_get_raw(buffer); + auto arrow_reader = + std::make_shared<arrow::gpu::CudaBufferReader>(arrow_buffer); + return garrow_gpu_cuda_buffer_input_stream_new_raw(&arrow_reader); +} + + +G_DEFINE_TYPE(GArrowGPUCUDABufferOutputStream, + garrow_gpu_cuda_buffer_output_stream, + GARROW_TYPE_OUTPUT_STREAM) + +static void +garrow_gpu_cuda_buffer_output_stream_init(GArrowGPUCUDABufferOutputStream *object) +{ +} + +static void +garrow_gpu_cuda_buffer_output_stream_class_init(GArrowGPUCUDABufferOutputStreamClass *klass) +{ +} + +/** + * garrow_gpu_cuda_buffer_output_stream_new: + * @buffer: A #GArrowGPUCUDABuffer. + * + * Returns: (transfer full): A newly created + * #GArrowGPUCUDABufferOutputStream. + * + * Since: 0.8.0 + */ +GArrowGPUCUDABufferOutputStream * +garrow_gpu_cuda_buffer_output_stream_new(GArrowGPUCUDABuffer *buffer) +{ + auto arrow_buffer = garrow_gpu_cuda_buffer_get_raw(buffer); + auto arrow_writer = + std::make_shared<arrow::gpu::CudaBufferWriter>(arrow_buffer); + return garrow_gpu_cuda_buffer_output_stream_new_raw(&arrow_writer); +} + +/** + * garrow_gpu_cuda_buffer_output_stream_set_buffer_size: + * @stream: A #GArrowGPUCUDABufferOutputStream. + * @size: The size of the CPU buffer in bytes. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Sets the CPU buffer size to limit `cudaMemcpy()` calls. If the CPU + * buffer size is `0`, buffering is disabled. + * + * The default is `0`. + * + * Since: 0.8.0 + */ +gboolean +garrow_gpu_cuda_buffer_output_stream_set_buffer_size(GArrowGPUCUDABufferOutputStream *stream, + gint64 size, + GError **error) +{ + auto arrow_stream = garrow_gpu_cuda_buffer_output_stream_get_raw(stream); + auto status = arrow_stream->SetBufferSize(size); + return garrow_error_check(error, + status, + "[gpu][cuda][buffer-output-stream][set-buffer-size]"); +} + +/** + * garrow_gpu_cuda_buffer_output_stream_get_buffer_size: + * @stream: A #GArrowGPUCUDABufferOutputStream. + * + * Returns: The CPU buffer size in bytes. + * + * See garrow_gpu_cuda_buffer_output_stream_set_buffer_size() for CPU + * buffer size details. + * + * Since: 0.8.0 + */ +gint64 +garrow_gpu_cuda_buffer_output_stream_get_buffer_size(GArrowGPUCUDABufferOutputStream *stream) +{ + auto arrow_stream = garrow_gpu_cuda_buffer_output_stream_get_raw(stream); + return arrow_stream->buffer_size(); +} + +/** + * garrow_gpu_cuda_buffer_output_stream_get_buffered_size: + * @stream: A #GArrowGPUCUDABufferOutputStream. + * + * Returns: The size of buffered data in bytes.
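 + *
 + * A sketch of the buffering behaviour (it assumes a CPU buffer size
 + * of 5 set via garrow_gpu_cuda_buffer_output_stream_set_buffer_size()
 + * and that @stream is written through the #GArrowWriteable interface;
 + * error handling is omitted):
 + * |[<!-- language="C" -->
 + * garrow_writeable_write(GARROW_WRITEABLE(stream),
 + *                        (const guint8 *)"Hell", 4, &error);
 + * gint64 buffered =
 + *   garrow_gpu_cuda_buffer_output_stream_get_buffered_size(stream);
 + * ]|
 + * Here `buffered` is 4: the bytes stay in the CPU buffer and are only
 + * sent to the GPU with `cudaMemcpy()` once the buffer fills up.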
+ * + * Since: 0.8.0 + */ +gint64 +garrow_gpu_cuda_buffer_output_stream_get_buffered_size(GArrowGPUCUDABufferOutputStream *stream) +{ + auto arrow_stream = garrow_gpu_cuda_buffer_output_stream_get_raw(stream); + return arrow_stream->num_bytes_buffered(); +} + + +G_END_DECLS + +GArrowGPUCUDAContext * +garrow_gpu_cuda_context_new_raw(std::shared_ptr<arrow::gpu::CudaContext> *arrow_context) +{ + return GARROW_GPU_CUDA_CONTEXT(g_object_new(GARROW_GPU_TYPE_CUDA_CONTEXT, + "context", arrow_context, + NULL)); +} + +std::shared_ptr<arrow::gpu::CudaContext> +garrow_gpu_cuda_context_get_raw(GArrowGPUCUDAContext *context) +{ + if (!context) + return nullptr; + + auto priv = GARROW_GPU_CUDA_CONTEXT_GET_PRIVATE(context); + return priv->context; +} + +GArrowGPUCUDAIPCMemoryHandle * +garrow_gpu_cuda_ipc_memory_handle_new_raw(arrow::gpu::CudaIpcMemHandle *arrow_handle) +{ + auto handle = g_object_new(GARROW_GPU_TYPE_CUDA_IPC_MEMORY_HANDLE, + "ipc-memory-handle", arrow_handle, + NULL); + return GARROW_GPU_CUDA_IPC_MEMORY_HANDLE(handle); +} + +arrow::gpu::CudaIpcMemHandle * +garrow_gpu_cuda_ipc_memory_handle_get_raw(GArrowGPUCUDAIPCMemoryHandle *handle) +{ + if (!handle) + return nullptr; + + auto priv = GARROW_GPU_CUDA_IPC_MEMORY_HANDLE_GET_PRIVATE(handle); + return priv->ipc_memory_handle; +} + +GArrowGPUCUDABuffer * +garrow_gpu_cuda_buffer_new_raw(std::shared_ptr<arrow::gpu::CudaBuffer> *arrow_buffer) +{ + return GARROW_GPU_CUDA_BUFFER(g_object_new(GARROW_GPU_TYPE_CUDA_BUFFER, + "buffer", arrow_buffer, + NULL)); +} + +std::shared_ptr<arrow::gpu::CudaBuffer> +garrow_gpu_cuda_buffer_get_raw(GArrowGPUCUDABuffer *buffer) +{ + if (!buffer) + return nullptr; + + auto arrow_buffer = garrow_buffer_get_raw(GARROW_BUFFER(buffer)); + return std::static_pointer_cast<arrow::gpu::CudaBuffer>(arrow_buffer); +} + +GArrowGPUCUDAHostBuffer * +garrow_gpu_cuda_host_buffer_new_raw(std::shared_ptr<arrow::gpu::CudaHostBuffer> *arrow_buffer) +{ + auto buffer = g_object_new(GARROW_GPU_TYPE_CUDA_HOST_BUFFER, + "buffer", arrow_buffer, + NULL); + return GARROW_GPU_CUDA_HOST_BUFFER(buffer); +} + +std::shared_ptr<arrow::gpu::CudaHostBuffer> +garrow_gpu_cuda_host_buffer_get_raw(GArrowGPUCUDAHostBuffer *buffer) +{ + if (!buffer) + return nullptr; + + auto arrow_buffer = garrow_buffer_get_raw(GARROW_BUFFER(buffer)); + return std::static_pointer_cast<arrow::gpu::CudaHostBuffer>(arrow_buffer); +} + +GArrowGPUCUDABufferInputStream * +garrow_gpu_cuda_buffer_input_stream_new_raw(std::shared_ptr<arrow::gpu::CudaBufferReader> *arrow_reader) +{ + auto input_stream = g_object_new(GARROW_GPU_TYPE_CUDA_BUFFER_INPUT_STREAM, + "input-stream", arrow_reader, + NULL); + return GARROW_GPU_CUDA_BUFFER_INPUT_STREAM(input_stream); +} + +std::shared_ptr<arrow::gpu::CudaBufferReader> +garrow_gpu_cuda_buffer_input_stream_get_raw(GArrowGPUCUDABufferInputStream *input_stream) +{ + if (!input_stream) + return nullptr; + + auto arrow_reader = + garrow_input_stream_get_raw(GARROW_INPUT_STREAM(input_stream)); + return std::static_pointer_cast<arrow::gpu::CudaBufferReader>(arrow_reader); +} + +GArrowGPUCUDABufferOutputStream * +garrow_gpu_cuda_buffer_output_stream_new_raw(std::shared_ptr<arrow::gpu::CudaBufferWriter> *arrow_writer) +{ + auto output_stream = g_object_new(GARROW_GPU_TYPE_CUDA_BUFFER_OUTPUT_STREAM, + "output-stream", arrow_writer, + NULL); + return GARROW_GPU_CUDA_BUFFER_OUTPUT_STREAM(output_stream); +} + +std::shared_ptr<arrow::gpu::CudaBufferWriter> +garrow_gpu_cuda_buffer_output_stream_get_raw(GArrowGPUCUDABufferOutputStream *output_stream) +{ + if (!output_stream) + return nullptr; + + auto arrow_writer = + garrow_output_stream_get_raw(GARROW_OUTPUT_STREAM(output_stream)); + return std::static_pointer_cast<arrow::gpu::CudaBufferWriter>(arrow_writer); +} diff --git a/c_glib/arrow-gpu-glib/cuda.h b/c_glib/arrow-gpu-glib/cuda.h new file mode 100644 index 0000000000000..7c615a144e739 --- /dev/null +++
b/c_glib/arrow-gpu-glib/cuda.h @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <arrow-glib/arrow-glib.h> + +G_BEGIN_DECLS + +#define GARROW_GPU_TYPE_CUDA_DEVICE_MANAGER \ + (garrow_gpu_cuda_device_manager_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowGPUCUDADeviceManager, + garrow_gpu_cuda_device_manager, + GARROW_GPU, + CUDA_DEVICE_MANAGER, + GObject) +struct _GArrowGPUCUDADeviceManagerClass +{ + GObjectClass parent_class; +}; + +#define GARROW_GPU_TYPE_CUDA_CONTEXT (garrow_gpu_cuda_context_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowGPUCUDAContext, + garrow_gpu_cuda_context, + GARROW_GPU, + CUDA_CONTEXT, + GObject) +struct _GArrowGPUCUDAContextClass +{ + GObjectClass parent_class; +}; + +#define GARROW_GPU_TYPE_CUDA_BUFFER (garrow_gpu_cuda_buffer_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowGPUCUDABuffer, + garrow_gpu_cuda_buffer, + GARROW_GPU, + CUDA_BUFFER, + GArrowBuffer) +struct _GArrowGPUCUDABufferClass +{ + GArrowBufferClass parent_class; +}; + +#define GARROW_GPU_TYPE_CUDA_HOST_BUFFER (garrow_gpu_cuda_host_buffer_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowGPUCUDAHostBuffer, + garrow_gpu_cuda_host_buffer, + GARROW_GPU, + CUDA_HOST_BUFFER, + GArrowMutableBuffer) +struct _GArrowGPUCUDAHostBufferClass +{ + GArrowMutableBufferClass parent_class; +}; + +#define GARROW_GPU_TYPE_CUDA_IPC_MEMORY_HANDLE \ + (garrow_gpu_cuda_ipc_memory_handle_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowGPUCUDAIPCMemoryHandle, + garrow_gpu_cuda_ipc_memory_handle, + GARROW_GPU, + CUDA_IPC_MEMORY_HANDLE, + GObject) +struct _GArrowGPUCUDAIPCMemoryHandleClass +{ + GObjectClass parent_class; +}; + +#define GARROW_GPU_TYPE_CUDA_BUFFER_INPUT_STREAM \ + (garrow_gpu_cuda_buffer_input_stream_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowGPUCUDABufferInputStream, + garrow_gpu_cuda_buffer_input_stream, + GARROW_GPU, + CUDA_BUFFER_INPUT_STREAM, + GArrowBufferInputStream) +struct _GArrowGPUCUDABufferInputStreamClass +{ + GArrowBufferInputStreamClass parent_class; +}; + +#define GARROW_GPU_TYPE_CUDA_BUFFER_OUTPUT_STREAM \ + (garrow_gpu_cuda_buffer_output_stream_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowGPUCUDABufferOutputStream, + garrow_gpu_cuda_buffer_output_stream, + GARROW_GPU, + CUDA_BUFFER_OUTPUT_STREAM, + GArrowOutputStream) +struct _GArrowGPUCUDABufferOutputStreamClass +{ + GArrowOutputStreamClass parent_class; +}; + +GArrowGPUCUDADeviceManager * +garrow_gpu_cuda_device_manager_new(GError **error); + +GArrowGPUCUDAContext * +garrow_gpu_cuda_device_manager_get_context(GArrowGPUCUDADeviceManager *manager, + gint gpu_number, + GError **error); +gsize +garrow_gpu_cuda_device_manager_get_n_devices(GArrowGPUCUDADeviceManager *manager); + +gint64 +garrow_gpu_cuda_context_get_allocated_size(GArrowGPUCUDAContext *context); + + +GArrowGPUCUDABuffer *
+garrow_gpu_cuda_buffer_new(GArrowGPUCUDAContext *context, + gint64 size, + GError **error); +GArrowGPUCUDABuffer * +garrow_gpu_cuda_buffer_new_ipc(GArrowGPUCUDAContext *context, + GArrowGPUCUDAIPCMemoryHandle *handle, + GError **error); +GArrowGPUCUDABuffer * +garrow_gpu_cuda_buffer_new_record_batch(GArrowGPUCUDAContext *context, + GArrowRecordBatch *record_batch, + GError **error); +GBytes * +garrow_gpu_cuda_buffer_copy_to_host(GArrowGPUCUDABuffer *buffer, + gint64 position, + gint64 size, + GError **error); +gboolean +garrow_gpu_cuda_buffer_copy_from_host(GArrowGPUCUDABuffer *buffer, + const guint8 *data, + gint64 size, + GError **error); +GArrowGPUCUDAIPCMemoryHandle * +garrow_gpu_cuda_buffer_export(GArrowGPUCUDABuffer *buffer, + GError **error); +GArrowGPUCUDAContext * +garrow_gpu_cuda_buffer_get_context(GArrowGPUCUDABuffer *buffer); +GArrowRecordBatch * +garrow_gpu_cuda_buffer_read_record_batch(GArrowGPUCUDABuffer *buffer, + GArrowSchema *schema, + GError **error); + + +GArrowGPUCUDAHostBuffer * +garrow_gpu_cuda_host_buffer_new(gint64 size, GError **error); + +GArrowGPUCUDAIPCMemoryHandle * +garrow_gpu_cuda_ipc_memory_handle_new(const guint8 *data, + gsize size, + GError **error); + +GArrowBuffer * +garrow_gpu_cuda_ipc_memory_handle_serialize(GArrowGPUCUDAIPCMemoryHandle *handle, + GError **error); + +GArrowGPUCUDABufferInputStream * +garrow_gpu_cuda_buffer_input_stream_new(GArrowGPUCUDABuffer *buffer); + +GArrowGPUCUDABufferOutputStream * +garrow_gpu_cuda_buffer_output_stream_new(GArrowGPUCUDABuffer *buffer); + +gboolean +garrow_gpu_cuda_buffer_output_stream_set_buffer_size(GArrowGPUCUDABufferOutputStream *stream, + gint64 size, + GError **error); +gint64 +garrow_gpu_cuda_buffer_output_stream_get_buffer_size(GArrowGPUCUDABufferOutputStream *stream); +gint64 +garrow_gpu_cuda_buffer_output_stream_get_buffered_size(GArrowGPUCUDABufferOutputStream *stream); + +G_END_DECLS diff --git a/c_glib/arrow-gpu-glib/cuda.hpp b/c_glib/arrow-gpu-glib/cuda.hpp new file mode 100644 index 0000000000000..3eeff8b6f18ed --- /dev/null +++ b/c_glib/arrow-gpu-glib/cuda.hpp @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include <arrow/gpu/cuda_api.h> + +#include <arrow-gpu-glib/cuda.h> + +GArrowGPUCUDAContext * +garrow_gpu_cuda_context_new_raw(std::shared_ptr<arrow::gpu::CudaContext> *arrow_context); +std::shared_ptr<arrow::gpu::CudaContext> +garrow_gpu_cuda_context_get_raw(GArrowGPUCUDAContext *context); + +GArrowGPUCUDAIPCMemoryHandle * +garrow_gpu_cuda_ipc_memory_handle_new_raw(arrow::gpu::CudaIpcMemHandle *arrow_handle); +arrow::gpu::CudaIpcMemHandle * +garrow_gpu_cuda_ipc_memory_handle_get_raw(GArrowGPUCUDAIPCMemoryHandle *handle); + +GArrowGPUCUDABuffer * +garrow_gpu_cuda_buffer_new_raw(std::shared_ptr<arrow::gpu::CudaBuffer> *arrow_buffer); +std::shared_ptr<arrow::gpu::CudaBuffer> +garrow_gpu_cuda_buffer_get_raw(GArrowGPUCUDABuffer *buffer); + +GArrowGPUCUDAHostBuffer * +garrow_gpu_cuda_host_buffer_new_raw(std::shared_ptr<arrow::gpu::CudaHostBuffer> *arrow_buffer); +std::shared_ptr<arrow::gpu::CudaHostBuffer> +garrow_gpu_cuda_host_buffer_get_raw(GArrowGPUCUDAHostBuffer *buffer); + +GArrowGPUCUDABufferInputStream * +garrow_gpu_cuda_buffer_input_stream_new_raw(std::shared_ptr<arrow::gpu::CudaBufferReader> *arrow_reader); +std::shared_ptr<arrow::gpu::CudaBufferReader> +garrow_gpu_cuda_buffer_input_stream_get_raw(GArrowGPUCUDABufferInputStream *input_stream); + +GArrowGPUCUDABufferOutputStream * +garrow_gpu_cuda_buffer_output_stream_new_raw(std::shared_ptr<arrow::gpu::CudaBufferWriter> *arrow_writer); +std::shared_ptr<arrow::gpu::CudaBufferWriter> +garrow_gpu_cuda_buffer_output_stream_get_raw(GArrowGPUCUDABufferOutputStream *output_stream); diff --git a/c_glib/arrow-gpu-glib/meson.build b/c_glib/arrow-gpu-glib/meson.build new file mode 100644 index 0000000000000..00c7f079d6485 --- /dev/null +++ b/c_glib/arrow-gpu-glib/meson.build @@ -0,0 +1,80 @@ +# -*- indent-tabs-mode: nil -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+ +sources = files( + 'cuda.cpp', +) + +c_headers = files( + 'arrow-gpu-glib.h', + 'cuda.h', +) + +cpp_headers = files( + 'arrow-gpu-glib.hpp', + 'cuda.hpp', +) + +headers = c_headers + cpp_headers +install_headers(headers, subdir: 'arrow-gpu-glib') + + +dependencies = [ + arrow_gpu_dependency, + libarrow_glib_dependency, +] +libarrow_gpu_glib = library('arrow-gpu-glib', + sources: sources, + install: true, + dependencies: dependencies, + include_directories: [ + root_inc, + ], + soversion: so_version, + version: library_version) +libarrow_gpu_glib_dependency = declare_dependency(link_with: libarrow_gpu_glib, + include_directories: [ + root_inc, + ], + dependencies: dependencies) + +pkgconfig.generate(filebase: 'arrow-gpu-glib', + name: 'Apache Arrow GPU GLib', + description: 'C API for Apache Arrow GPU based on GLib', + version: version, + requires: ['arrow-glib', 'arrow-gpu'], + libraries: [libarrow_gpu_glib], + subdirs: ['arrow-gpu-glib']) + +gnome.generate_gir(libarrow_gpu_glib, + dependencies: arrow_glib_gir_dependency, + sources: sources + c_headers, + namespace: 'ArrowGPU', + nsversion: api_version, + identifier_prefix: 'GArrowGPU', + symbol_prefix: 'garrow_gpu', + export_packages: 'arrow-gpu-glib', + includes: [ + 'Arrow-1.0', + ], + install: true, + extra_args: [ + '--warn-all', + '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', + ]) diff --git a/c_glib/configure.ac b/c_glib/configure.ac index 5db435275a300..c6fa0192c944d 100644 --- a/c_glib/configure.ac +++ b/c_glib/configure.ac @@ -77,18 +77,34 @@ AC_ARG_WITH(arrow-cpp-build-dir, [GARROW_ARROW_CPP_BUILD_DIR=""]) if test "x$GARROW_ARROW_CPP_BUILD_DIR" = "x"; then PKG_CHECK_MODULES([ARROW], [arrow arrow-compute]) + PKG_CHECK_MODULES([ARROW_GPU], + [arrow-gpu], + [HAVE_ARROW_GPU=yes], + [HAVE_ARROW_GPU=no]) else ARROW_INCLUDE_DIR="\$(abs_top_srcdir)/../cpp/src" ARROW_LIB_DIR="${GARROW_ARROW_CPP_BUILD_DIR}/${GARROW_ARROW_CPP_BUILD_TYPE}" ARROW_CFLAGS="-I${ARROW_INCLUDE_DIR}" - ARROW_LIBS="-L${ARROW_LIB_DIR} -larrow" - - AC_SUBST(ARROW_LIB_DIR) - AC_SUBST(ARROW_CFLAGS) AC_SUBST(ARROW_LIBS) + + ARROW_GPU_CFLAGS="" + if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/arrow/gpu/arrow-gpu.pc"; then + HAVE_ARROW_GPU=yes + ARROW_GPU_LIBS="-larrow_gpu" + else + HAVE_ARROW_GPU=no + ARROW_GPU_LIBS="" + fi + AC_SUBST(ARROW_GPU_CFLAGS) + AC_SUBST(ARROW_GPU_LIBS) +fi + +AM_CONDITIONAL([HAVE_ARROW_GPU], [test "$HAVE_ARROW_GPU" = "yes"]) +if test "$HAVE_ARROW_GPU" = "yes"; then + AC_DEFINE(HAVE_ARROW_GPU, [1], [Define to 1 if Apache Arrow supports GPU.]) fi exampledir="\$(datadir)/arrow-glib/example" @@ -98,6 +114,8 @@ AC_CONFIG_FILES([ Makefile arrow-glib/Makefile arrow-glib/arrow-glib.pc + arrow-gpu-glib/Makefile + arrow-gpu-glib/arrow-gpu-glib.pc doc/Makefile doc/reference/Makefile doc/reference/xml/Makefile diff --git a/c_glib/doc/Makefile.am b/c_glib/doc/Makefile.am index 85c1d5126097c..1d491ab09110e 100644 --- a/c_glib/doc/Makefile.am +++ b/c_glib/doc/Makefile.am @@ -16,4 +16,4 @@ # under the License. 
SUBDIRS = \ - reference + reference diff --git a/c_glib/doc/reference/Makefile.am b/c_glib/doc/reference/Makefile.am index 45b11f035183e..896aff544d454 100644 --- a/c_glib/doc/reference/Makefile.am +++ b/c_glib/doc/reference/Makefile.am @@ -51,6 +51,17 @@ AM_CFLAGS = \ GTKDOC_LIBS = \ $(top_builddir)/arrow-glib/libarrow-glib.la +if HAVE_ARROW_GPU +DOC_SOURCE_DIR += \ + $(top_srcdir)/arrow-gpu-glib +HFILE_GLOB += \ + $(top_srcdir)/arrow-gpu-glib/*.h +CFILE_GLOB += \ + $(top_srcdir)/arrow-gpu-glib/*.cpp +GTKDOC_LIBS += \ + $(top_builddir)/arrow-gpu-glib/libarrow-gpu-glib.la +endif + include $(srcdir)/gtk-doc.make CLEANFILES += \ diff --git a/c_glib/doc/reference/arrow-glib-docs.sgml b/c_glib/doc/reference/arrow-glib-docs.sgml index a504ef1148383..e267ea2f9d356 100644 --- a/c_glib/doc/reference/arrow-glib-docs.sgml +++ b/c_glib/doc/reference/arrow-glib-docs.sgml @@ -125,6 +125,16 @@ + + Object Hierarchy diff --git a/c_glib/doc/reference/meson.build b/c_glib/doc/reference/meson.build index 08936daf87288..4c9552e83c303 100644 --- a/c_glib/doc/reference/meson.build +++ b/c_glib/doc/reference/meson.build @@ -32,13 +32,26 @@ glib_prefix = dependency('glib-2.0').get_pkgconfig_variable('prefix') glib_doc_path = join_paths(glib_prefix, 'share', 'gtk-doc', 'html') doc_path = join_paths(data_dir, meson.project_name(), 'gtk-doc', 'html') +source_directories = [ + join_paths(meson.source_root(), 'arrow-glib'), + join_paths(meson.build_root(), 'arrow-glib'), +] +dependencies = [ + libarrow_glib_dependency, +] +if arrow_gpu_dependency.found() + source_directories += [ + join_paths(meson.source_root(), 'arrow-gpu-glib'), + join_paths(meson.build_root(), 'arrow-gpu-glib'), + ] + dependencies += [ + libarrow_gpu_glib_dependency, + ] +endif gnome.gtkdoc(meson.project_name(), main_xml: meson.project_name() + '-docs.sgml', - src_dir: [ - join_paths(meson.source_root(), 'arrow-glib'), - join_paths(meson.build_root(), 'arrow-glib'), - ], - dependencies: libarrow_glib_dependency, + src_dir: source_directories, + dependencies: dependencies, gobject_typesfile: meson.project_name() + '.types', scan_args: [ '--rebuild-types', diff --git a/c_glib/meson.build b/c_glib/meson.build index 1fa64ba19c406..9fe1b8cbd7179 100644 --- a/c_glib/meson.build +++ b/c_glib/meson.build @@ -49,6 +49,10 @@ pkgconfig = import('pkgconfig') root_inc = include_directories('.') subdir('arrow-glib') +arrow_gpu_dependency = dependency('arrow-gpu', required: false) +if arrow_gpu_dependency.found() + subdir('arrow-gpu-glib') +endif subdir('example') if get_option('enable_gtk_doc') @@ -58,4 +62,7 @@ endif run_test = find_program('test/run-test.sh') test('unit test', run_test, - env: ['ARROW_GLIB_TYPELIB_DIR=@0@/arrow-glib'.format(meson.build_root())]) + env: [ + 'ARROW_GLIB_TYPELIB_DIR=@0@/arrow-glib'.format(meson.build_root()), + 'ARROW_GPU_GLIB_TYPELIB_DIR=@0@/arrow-gpu-glib'.format(meson.build_root()), + ]) diff --git a/c_glib/test/run-test.rb b/c_glib/test/run-test.rb index 3451bd29fde1b..392c56f33ae51 100755 --- a/c_glib/test/run-test.rb +++ b/c_glib/test/run-test.rb @@ -37,6 +37,12 @@ def initialize(data) end end +begin + ArrowGPU = GI.load("ArrowGPU") +rescue GObjectIntrospection::RepositoryError::TypelibNotFound +end + +require "rbconfig" require "tempfile" require_relative "helper/buildable" require_relative "helper/omittable" diff --git a/c_glib/test/run-test.sh b/c_glib/test/run-test.sh index 19ccf077833a5..d563e8586ce59 100755 --- a/c_glib/test/run-test.sh +++ b/c_glib/test/run-test.sh @@ -20,27 +20,34 @@ test_dir="$(cd $(dirname $0); 
pwd)" build_dir="$(cd .; pwd)" -arrow_glib_build_dir="${build_dir}/arrow-glib/" -libtool_dir="${arrow_glib_build_dir}/.libs" -if [ -d "${libtool_dir}" ]; then - LD_LIBRARY_PATH="${libtool_dir}:${LD_LIBRARY_PATH}" -else - if [ -d "${arrow_glib_build_dir}" ]; then - LD_LIBRARY_PATH="${arrow_glib_build_dir}:${LD_LIBRARY_PATH}" +modules="arrow-glib arrow-gpu-glib" + +for module in ${modules}; do + module_build_dir="${build_dir}/${module}" + libtool_dir="${module_build_dir}/.libs" + if [ -d "${libtool_dir}" ]; then + LD_LIBRARY_PATH="${libtool_dir}:${LD_LIBRARY_PATH}" + else + if [ -d "${module_build_dir}" ]; then + LD_LIBRARY_PATH="${module_build_dir}:${LD_LIBRARY_PATH}" + fi fi -fi +done if [ -f "Makefile" -a "${NO_MAKE}" != "yes" ]; then make -j8 > /dev/null || exit $? fi -arrow_glib_typelib_dir="${ARROW_GLIB_TYPELIB_DIR}" -if [ -z "${arrow_glib_typelib_dir}" ]; then - arrow_glib_typelib_dir="${build_dir}/arrow-glib" -fi +for module in ${modules}; do + MODULE_TYPELIB_DIR_VAR_NAME="$(echo ${module} | tr a-z- A-Z_)_TYPELIB_DIR" + module_typelib_dir=$(eval "echo \${${MODULE_TYPELIB_DIR_VAR_NAME}}") + if [ -z "${module_typelib_dir}" ]; then + module_typelib_dir="${build_dir}/${module}" + fi -if [ -d "${arrow_glib_typelib_dir}" ]; then - GI_TYPELIB_PATH="${arrow_glib_typelib_dir}:${GI_TYPELIB_PATH}" -fi + if [ -d "${module_typelib_dir}" ]; then + GI_TYPELIB_PATH="${module_typelib_dir}:${GI_TYPELIB_PATH}" + fi +done ${GDB} ruby ${test_dir}/run-test.rb "$@" diff --git a/c_glib/test/test-gpu-cuda.rb b/c_glib/test/test-gpu-cuda.rb new file mode 100644 index 0000000000000..c710ef2264976 --- /dev/null +++ b/c_glib/test/test-gpu-cuda.rb @@ -0,0 +1,144 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestGPUCUDA < Test::Unit::TestCase + include Helper::Buildable + + def setup + omit("Arrow GPU is required") unless defined?(::ArrowGPU) + @manager = ArrowGPU::CUDADeviceManager.new + omit("At least one GPU is required") if @manager.n_devices.zero? 
diff --git a/c_glib/test/test-gpu-cuda.rb b/c_glib/test/test-gpu-cuda.rb
new file mode 100644
index 0000000000000..c710ef2264976
--- /dev/null
+++ b/c_glib/test/test-gpu-cuda.rb
@@ -0,0 +1,144 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestGPUCUDA < Test::Unit::TestCase
+  include Helper::Buildable
+
+  def setup
+    omit("Arrow GPU is required") unless defined?(::ArrowGPU)
+    @manager = ArrowGPU::CUDADeviceManager.new
+    omit("At least one GPU is required") if @manager.n_devices.zero?
+    @context = @manager.get_context(0)
+  end
+
+  sub_test_case("Context") do
+    def test_allocated_size
+      allocated_size_before = @context.allocated_size
+      size = 128
+      buffer = ArrowGPU::CUDABuffer.new(@context, size)
+      assert_equal(size,
+                   @context.allocated_size - allocated_size_before)
+    end
+  end
+
+  sub_test_case("Buffer") do
+    def setup
+      super
+      @buffer = ArrowGPU::CUDABuffer.new(@context, 128)
+    end
+
+    def test_copy
+      @buffer.copy_from_host("Hello World")
+      assert_equal("llo W", @buffer.copy_to_host(2, 5).to_s)
+    end
+
+    def test_export
+      @buffer.copy_from_host("Hello World")
+      handle = @buffer.export
+      serialized_handle = handle.serialize.data
+      Tempfile.open("arrow-gpu-cuda-export") do |output|
+        pid = spawn(RbConfig.ruby, "-e", <<-SCRIPT)
+require "gi"
+
+Gio = GI.load("Gio")
+Arrow = GI.load("Arrow")
+ArrowGPU = GI.load("ArrowGPU")
+
+manager = ArrowGPU::CUDADeviceManager.new
+context = manager.get_context(0)
+serialized_handle = #{serialized_handle.to_s.dump}
+handle = ArrowGPU::CUDAIPCMemoryHandle.new(serialized_handle)
+buffer = ArrowGPU::CUDABuffer.new(context, handle)
+File.open(#{output.path.dump}, "w") do |output|
+  output.print(buffer.copy_to_host(0, 6).to_s)
+end
+        SCRIPT
+        Process.waitpid(pid)
+        assert_equal("Hello ", output.read)
+      end
+    end
+
+    def test_context
+      assert_equal(@context.allocated_size,
+                   @buffer.context.allocated_size)
+    end
+
+    def test_record_batch
+      field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new)
+      schema = Arrow::Schema.new([field])
+      columns = [
+        build_boolean_array([true]),
+      ]
+      cpu_record_batch = Arrow::RecordBatch.new(schema, 1, columns)
+
+      buffer = ArrowGPU::CUDABuffer.new(@context, cpu_record_batch)
+      gpu_record_batch = buffer.read_record_batch(schema)
+      assert_equal(cpu_record_batch.n_rows,
+                   gpu_record_batch.n_rows)
+    end
+  end
+
+  sub_test_case("HostBuffer") do
+    def test_new
+      buffer = ArrowGPU::CUDAHostBuffer.new(128)
+      assert_equal(128, buffer.size)
+    end
+  end
+
+  sub_test_case("BufferInputStream") do
+    def test_new
+      buffer = ArrowGPU::CUDABuffer.new(@context, 128)
+      buffer.copy_from_host("Hello World")
+      stream = ArrowGPU::CUDABufferInputStream.new(buffer)
+      begin
+        assert_equal("Hello Worl",
+                     stream.read(10).copy_to_host(0, 10).to_s)
+      ensure
+        stream.close
+      end
+    end
+  end
+
+  sub_test_case("BufferOutputStream") do
+    def setup
+      super
+      @buffer = ArrowGPU::CUDABuffer.new(@context, 128)
+      @buffer.copy_from_host("\x00" * @buffer.size)
+      @stream = ArrowGPU::CUDABufferOutputStream.new(@buffer)
+    end
+
+    def cleanup
+      super
+      @stream.close
+    end
+
+    def test_new
+      @stream.write("Hello World")
+      assert_equal("Hello World", @buffer.copy_to_host(0, 11).to_s)
+    end
+
+    def test_buffer
+      assert_equal(0, @stream.buffer_size)
+      @stream.buffer_size = 5
+      assert_equal(5, @stream.buffer_size)
+      @stream.write("Hell")
+      assert_equal(4, @stream.buffered_size)
+      assert_equal("\x00" * 5, @buffer.copy_to_host(0, 5).to_s)
+      @stream.write("o")
+      assert_equal("Hello", @buffer.copy_to_host(0, 5).to_s)
+    end
+  end
+end
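Stripped of the test harness, the host-to-device round trip these tests exercise comes down to a handful of calls. A minimal sketch, assuming the ArrowGPU typelib is installed and at least one CUDA device is present; every call below appears verbatim in the test file above:

    # Sketch only: copy bytes onto the GPU and read a slice back.
    require "gi"
    ArrowGPU = GI.load("ArrowGPU")

    manager = ArrowGPU::CUDADeviceManager.new
    context = manager.get_context(0)      # context for the first CUDA device
    buffer = ArrowGPU::CUDABuffer.new(context, 128)
    buffer.copy_from_host("Hello World")
    puts buffer.copy_to_host(0, 11).to_s  # => "Hello World"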
From b18bbeba0ed8b76f6575bd0501de3beb1e6e60f5 Mon Sep 17 00:00:00 2001
From: "Korn, Uwe"
Date: Mon, 13 Nov 2017 22:56:13 -0500
Subject: [PATCH 042/177] ARROW-1371: [Website] Add "Powered By" page to the website

Author: Korn, Uwe
Author: Wes McKinney
Author: Uwe L. Korn

Closes #1280 from xhochy/ARROW-1371 and squashes the following commits:

84a76e94 [Uwe L. Korn] Add Red Data Tools
57338daa [Wes McKinney] Formatting
522a3592 [Wes McKinney] Add a few more projects
ad511ed2 [Korn, Uwe] Add GeoMesa and Dremio
0e4f755c [Korn, Uwe] Add Ray and Spark
ecabb167 [Korn, Uwe] ARROW-1371: [Website] Add "Powered By" page to the website
---
 site/_config.yml           |   1 +
 site/_includes/header.html |   1 +
 site/powered_by.md         | 126 +++++++++++++++++++++++++++++++++++++
 3 files changed, 128 insertions(+)
 create mode 100644 site/powered_by.md

diff --git a/site/_config.yml b/site/_config.yml
index a6c5575d0680f..cbcf97dd3b0d1 100644
--- a/site/_config.yml
+++ b/site/_config.yml
@@ -30,6 +30,7 @@ exclude:
   - Gemfile
   - Gemfile.lock
   - _docs/format/*
+  - ruby
   - asf-site
   - scripts
   - README.md

diff --git a/site/_includes/header.html b/site/_includes/header.html
index 6c0ec30f39ca7..03b3c8750cdb8 100644
--- a/site/_includes/header.html
+++ b/site/_includes/header.html
@@ -27,6 +27,7 @@
 <li><a href="...">Mailing List</a></li>
 <li><a href="...">Slack Channel</a></li>
 <li><a href="...">Committers</a></li>
+<li><a href="{{ site.baseurl }}/powered_by.html">Powered By</a></li>