From f66fa805e89aef948581876ac802b1ffc6430f5c Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 19 Dec 2018 12:53:09 -0600 Subject: [PATCH] ARROW-554: [C++] Add functions to unify dictionary types and arrays Author: Antoine Pitrou Closes #3165 from pitrou/ARROW-554-conform-dicts and squashes the following commits: 7d2579b30 ARROW-554: Add functions to conform dictionaries --- cpp/src/arrow/array-dict-test.cc | 62 ++++++++++ cpp/src/arrow/array.cc | 61 ++++++++++ cpp/src/arrow/array.h | 24 +++- cpp/src/arrow/array/builder_dict.cc | 172 +++++++++++++++++++++------- cpp/src/arrow/type-test.cc | 128 +++++++++++++++++++++ cpp/src/arrow/type.cc | 7 +- cpp/src/arrow/type.h | 18 +++ cpp/src/arrow/util/hashing.h | 38 ------ cpp/src/arrow/util/int-util-test.cc | 9 ++ cpp/src/arrow/util/int-util.cc | 40 +++++++ cpp/src/arrow/util/int-util.h | 4 + cpp/src/arrow/visitor_inline.h | 2 +- python/pyarrow/tests/test_types.py | 4 +- 13 files changed, 484 insertions(+), 85 deletions(-) diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc index 87cb2290a7bf9..730b891cf57f4 100644 --- a/cpp/src/arrow/array-dict-test.cc +++ b/cpp/src/arrow/array-dict-test.cc @@ -31,6 +31,7 @@ #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" namespace arrow { @@ -38,6 +39,8 @@ namespace arrow { using std::string; using std::vector; +using internal::checked_cast; + // ---------------------------------------------------------------------- // Dictionary tests @@ -740,4 +743,63 @@ TEST(TestDictionary, FromArray) { ASSERT_RAISES(Invalid, DictionaryArray::FromArrays(dict_type, indices4, &arr4)); } +TEST(TestDictionary, TransposeBasic) { + std::shared_ptr arr, out, expected; + + auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]"); + auto dict_type = dictionary(int16(), dict); + auto indices = ArrayFromJSON(int16(), "[1, 2, 0, 0]"); + // ["B", "C", "A", "A"] + ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, &arr)); + + // Transpose to same index type + { + auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]"); + auto out_dict_type = dictionary(int16(), out_dict); + + const std::vector transpose_map{1, 3, 2}; + ASSERT_OK(internal::checked_cast(*arr).Transpose( + default_memory_pool(), out_dict_type, transpose_map, &out)); + + auto expected_indices = ArrayFromJSON(int16(), "[3, 2, 1, 1]"); + ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected)); + AssertArraysEqual(*out, *expected); + } + + // Transpose to other type + { + auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]"); + auto out_dict_type = dictionary(int8(), out_dict); + + const std::vector transpose_map{1, 3, 2}; + ASSERT_OK(internal::checked_cast(*arr).Transpose( + default_memory_pool(), out_dict_type, transpose_map, &out)); + + auto expected_indices = ArrayFromJSON(int8(), "[3, 2, 1, 1]"); + ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected)); + AssertArraysEqual(*expected, *out); + } +} + +TEST(TestDictionary, TransposeNulls) { + std::shared_ptr arr, out, expected; + + auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]"); + auto dict_type = dictionary(int16(), dict); + auto indices = ArrayFromJSON(int16(), "[1, 2, null, 0]"); + // ["B", "C", null, "A"] + ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, &arr)); + + auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]"); + auto out_dict_type = dictionary(int16(), out_dict); + + const std::vector transpose_map{1, 3, 2}; + ASSERT_OK(internal::checked_cast(*arr).Transpose( + default_memory_pool(), out_dict_type, transpose_map, &out)); + + auto expected_indices = ArrayFromJSON(int16(), "[3, 2, null, 1]"); + ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected)); + AssertArraysEqual(*expected, *out); +} + } // namespace arrow diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index ff94aa2a1e6fe..7e45e90d9c8f7 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -33,6 +33,7 @@ #include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" +#include "arrow/util/int-util.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/visitor.h" @@ -663,6 +664,66 @@ std::shared_ptr DictionaryArray::dictionary() const { return dict_type_->dictionary(); } +template +static Status TransposeDictIndices(MemoryPool* pool, const ArrayData& in_data, + const std::shared_ptr& type, + const std::vector& transpose_map, + std::shared_ptr* out) { + using in_c_type = typename InType::c_type; + using out_c_type = typename OutType::c_type; + + std::shared_ptr out_buffer; + RETURN_NOT_OK(AllocateBuffer(pool, in_data.length * sizeof(out_c_type), &out_buffer)); + // Null bitmap is unchanged + auto out_data = ArrayData::Make(type, in_data.length, {in_data.buffers[0], out_buffer}, + in_data.null_count); + internal::TransposeInts(in_data.GetValues(1), + out_data->GetMutableValues(1), in_data.length, + transpose_map.data()); + *out = MakeArray(out_data); + return Status::OK(); +} + +Status DictionaryArray::Transpose(MemoryPool* pool, const std::shared_ptr& type, + const std::vector& transpose_map, + std::shared_ptr* out) const { + DCHECK_EQ(type->id(), Type::DICTIONARY); + const auto& out_dict_type = checked_cast(*type); + + // XXX We'll probably want to make this operation a kernel when we + // implement dictionary-to-dictionary casting. + auto in_type_id = dict_type_->index_type()->id(); + auto out_type_id = out_dict_type.index_type()->id(); + +#define TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, OUT_INDEX_TYPE) \ + case OUT_INDEX_TYPE::type_id: \ + return TransposeDictIndices(pool, *data(), type, \ + transpose_map, out); + +#define TRANSPOSE_IN_CASE(IN_INDEX_TYPE) \ + case IN_INDEX_TYPE::type_id: \ + switch (out_type_id) { \ + TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int8Type) \ + TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int16Type) \ + TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int32Type) \ + TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int64Type) \ + default: \ + return Status::NotImplemented("unexpected index type"); \ + } + + switch (in_type_id) { + TRANSPOSE_IN_CASE(Int8Type) + TRANSPOSE_IN_CASE(Int16Type) + TRANSPOSE_IN_CASE(Int32Type) + TRANSPOSE_IN_CASE(Int64Type) + default: + return Status::NotImplemented("unexpected index type"); + } + +#undef TRANSPOSE_IN_OUT_CASE +#undef TRANSPOSE_IN_CASE +} + // ---------------------------------------------------------------------- // Implement Array::Accept as inline visitor diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 52c5207d8dddc..aead17f133d74 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -422,6 +422,9 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray { value_type Value(int64_t i) const { return raw_values()[i]; } + // For API compatibility with BinaryArray etc. + value_type GetView(int64_t i) const { return Value(i); } + protected: using PrimitiveArray::PrimitiveArray; }; @@ -442,6 +445,8 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { i + data_->offset); } + bool GetView(int64_t i) const { return Value(i); } + protected: using PrimitiveArray::PrimitiveArray; }; @@ -802,7 +807,7 @@ class ARROW_EXPORT DictionaryArray : public Array { /// This function does the validation of the indices and input type. It checks if /// all indices are non-negative and smaller than the size of the dictionary /// - /// \param[in] type a data type containing a dictionary + /// \param[in] type a dictionary type /// \param[in] indices an array of non-negative signed /// integers smaller than the size of the dictionary /// \param[out] out the resulting DictionaryArray instance @@ -810,6 +815,23 @@ class ARROW_EXPORT DictionaryArray : public Array { const std::shared_ptr& indices, std::shared_ptr* out); + /// \brief Transpose this DictionaryArray + /// + /// This method constructs a new dictionary array with the given dictionary type, + /// transposing indices using the transpose map. + /// The type and the transpose map are typically computed using + /// DictionaryType::Unify. + /// + /// \param[in] pool a pool to allocate the array data from + /// \param[in] type a dictionary type + /// \param[in] transpose_map a vector transposing this array's indices + /// into the target array's indices + /// \param[out] out the resulting DictionaryArray instance + Status Transpose(MemoryPool* pool, const std::shared_ptr& type, + const std::vector& transpose_map, + std::shared_ptr* out) const; + // XXX Do we also want an unsafe in-place Transpose? + std::shared_ptr indices() const; std::shared_ptr dictionary() const; diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index 0891e4c0829f4..e534c3cadb14b 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -19,6 +19,9 @@ #include #include +#include +#include +#include #include #include @@ -30,11 +33,117 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/hashing.h" #include "arrow/util/logging.h" +#include "arrow/visitor_inline.h" namespace arrow { using internal::checked_cast; +// ---------------------------------------------------------------------- +// DictionaryType unification + +struct UnifyDictionaryValues { + MemoryPool* pool_; + std::shared_ptr value_type_; + const std::vector& types_; + std::shared_ptr* out_values_; + std::vector>* out_transpose_maps_; + + Status Visit(const DataType&, void* = nullptr) { + // Default implementation for non-dictionary-supported datatypes + std::stringstream ss; + ss << "Unification of " << value_type_->ToString() + << " dictionaries is not implemented"; + return Status::NotImplemented(ss.str()); + } + + template + Status Visit(const T&, + typename internal::DictionaryTraits::MemoTableType* = nullptr) { + using ArrayType = typename TypeTraits::ArrayType; + using DictTraits = typename internal::DictionaryTraits; + using MemoTableType = typename DictTraits::MemoTableType; + + MemoTableType memo_table; + if (out_transpose_maps_ != nullptr) { + out_transpose_maps_->clear(); + out_transpose_maps_->reserve(types_.size()); + } + // Build up the unified dictionary values and the transpose maps + for (const auto& type : types_) { + const ArrayType& values = checked_cast(*type->dictionary()); + if (out_transpose_maps_ != nullptr) { + std::vector transpose_map; + transpose_map.reserve(values.length()); + for (int64_t i = 0; i < values.length(); ++i) { + int32_t dict_index = memo_table.GetOrInsert(values.GetView(i)); + transpose_map.push_back(dict_index); + } + out_transpose_maps_->push_back(std::move(transpose_map)); + } else { + for (int64_t i = 0; i < values.length(); ++i) { + memo_table.GetOrInsert(values.GetView(i)); + } + } + } + // Build unified dictionary array + std::shared_ptr data; + RETURN_NOT_OK(DictTraits::GetDictionaryArrayData(pool_, value_type_, memo_table, + 0 /* start_offset */, &data)); + *out_values_ = MakeArray(data); + return Status::OK(); + } +}; + +Status DictionaryType::Unify(MemoryPool* pool, const std::vector& types, + std::shared_ptr* out_type, + std::vector>* out_transpose_maps) { + if (types.size() == 0) { + return Status::Invalid("need at least one input type"); + } + std::vector dict_types; + dict_types.reserve(types.size()); + for (const auto& type : types) { + if (type->id() != Type::DICTIONARY) { + return Status::TypeError("input types must be dictionary types"); + } + dict_types.push_back(checked_cast(type)); + } + + // XXX Should we check the ordered flag? + auto value_type = dict_types[0]->dictionary()->type(); + for (const auto& type : dict_types) { + auto values = type->dictionary(); + if (!values->type()->Equals(value_type)) { + return Status::TypeError("input types have different value types"); + } + if (values->null_count() != 0) { + return Status::TypeError("input types have null values"); + } + } + + std::shared_ptr values; + { + UnifyDictionaryValues visitor{pool, value_type, dict_types, &values, + out_transpose_maps}; + RETURN_NOT_OK(VisitTypeInline(*value_type, &visitor)); + } + + // Build unified dictionary type with the right index type + std::shared_ptr index_type; + if (values->length() <= std::numeric_limits::max()) { + index_type = int8(); + } else if (values->length() <= std::numeric_limits::max()) { + index_type = int16(); + } else if (values->length() <= std::numeric_limits::max()) { + index_type = int32(); + } else { + index_type = int64(); + } + *out_type = arrow::dictionary(index_type, values); + return Status::OK(); +} + // ---------------------------------------------------------------------- // DictionaryBuilder @@ -118,12 +227,31 @@ Status DictionaryBuilder::AppendNull() { return values_builder_.Append template Status DictionaryBuilder::AppendArray(const Array& array) { - const auto& numeric_array = checked_cast&>(array); + using ArrayType = typename TypeTraits::ArrayType; + + const auto& concrete_array = checked_cast(array); for (int64_t i = 0; i < array.length(); i++) { if (array.IsNull(i)) { RETURN_NOT_OK(AppendNull()); } else { - RETURN_NOT_OK(Append(numeric_array.Value(i))); + RETURN_NOT_OK(Append(concrete_array.GetView(i))); + } + } + return Status::OK(); +} + +template <> +Status DictionaryBuilder::AppendArray(const Array& array) { + if (!type_->Equals(*array.type())) { + return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type"); + } + + const auto& typed_array = checked_cast(array); + for (int64_t i = 0; i < array.length(); i++) { + if (array.IsNull(i)) { + RETURN_NOT_OK(AppendNull()); + } else { + RETURN_NOT_OK(Append(typed_array.GetValue(i))); } } return Status::OK(); @@ -168,46 +296,6 @@ Status DictionaryBuilder::FinishInternal(std::shared_ptr* o return Status::OK(); } -// -// StringType and BinaryType specializations -// - -#define BINARY_DICTIONARY_SPECIALIZATIONS(Type) \ - \ - template <> \ - Status DictionaryBuilder::AppendArray(const Array& array) { \ - using ArrayType = typename TypeTraits::ArrayType; \ - const ArrayType& binary_array = checked_cast(array); \ - for (int64_t i = 0; i < array.length(); i++) { \ - if (array.IsNull(i)) { \ - RETURN_NOT_OK(AppendNull()); \ - } else { \ - RETURN_NOT_OK(Append(binary_array.GetView(i))); \ - } \ - } \ - return Status::OK(); \ - } - -BINARY_DICTIONARY_SPECIALIZATIONS(StringType); -BINARY_DICTIONARY_SPECIALIZATIONS(BinaryType); - -template <> -Status DictionaryBuilder::AppendArray(const Array& array) { - if (!type_->Equals(*array.type())) { - return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type"); - } - - const auto& typed_array = checked_cast(array); - for (int64_t i = 0; i < array.length(); i++) { - if (array.IsNull(i)) { - RETURN_NOT_OK(AppendNull()); - } else { - RETURN_NOT_OK(Append(typed_array.GetValue(i))); - } - } - return Status::OK(); -} - template class DictionaryBuilder; template class DictionaryBuilder; template class DictionaryBuilder; diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc index e0a10690c2c77..20b7aff884b7f 100644 --- a/cpp/src/arrow/type-test.cc +++ b/cpp/src/arrow/type-test.cc @@ -24,6 +24,8 @@ #include +#include "arrow/memory_pool.h" +#include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" @@ -480,6 +482,132 @@ TEST(TestStructType, GetChildIndex) { ASSERT_EQ(-1, struct_type.GetChildIndex("not-found")); } +TEST(TestDictionaryType, Equals) { + auto t1 = dictionary(int8(), ArrayFromJSON(int32(), "[3, 4, 5, 6]")); + auto t2 = dictionary(int8(), ArrayFromJSON(int32(), "[3, 4, 5, 6]")); + auto t3 = dictionary(int16(), ArrayFromJSON(int32(), "[3, 4, 5, 6]")); + auto t4 = dictionary(int8(), ArrayFromJSON(int16(), "[3, 4, 5, 6]")); + auto t5 = dictionary(int8(), ArrayFromJSON(int32(), "[3, 4, 7, 6]")); + + ASSERT_TRUE(t1->Equals(t2)); + // Different index type + ASSERT_FALSE(t1->Equals(t3)); + // Different value type + ASSERT_FALSE(t1->Equals(t4)); + // Different values + ASSERT_FALSE(t1->Equals(t5)); +} + +TEST(TestDictionaryType, UnifyNumeric) { + auto t1 = dictionary(int8(), ArrayFromJSON(int64(), "[3, 4, 7]")); + auto t2 = dictionary(int8(), ArrayFromJSON(int64(), "[1, 7, 4, 8]")); + auto t3 = dictionary(int8(), ArrayFromJSON(int64(), "[1, -200]")); + + auto expected = dictionary(int8(), ArrayFromJSON(int64(), "[3, 4, 7, 1, 8, -200]")); + + std::shared_ptr dict_type; + ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get(), t3.get()}, + &dict_type)); + ASSERT_TRUE(dict_type->Equals(expected)); + + std::vector> transpose_maps; + ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get(), t3.get()}, + &dict_type, &transpose_maps)); + ASSERT_TRUE(dict_type->Equals(expected)); + ASSERT_EQ(transpose_maps.size(), 3); + ASSERT_EQ(transpose_maps[0], std::vector({0, 1, 2})); + ASSERT_EQ(transpose_maps[1], std::vector({3, 2, 1, 4})); + ASSERT_EQ(transpose_maps[2], std::vector({3, 5})); +} + +TEST(TestDictionaryType, UnifyString) { + auto t1 = dictionary(int16(), ArrayFromJSON(utf8(), "[\"foo\", \"bar\"]")); + auto t2 = dictionary(int32(), ArrayFromJSON(utf8(), "[\"quux\", \"foo\"]")); + + auto expected = + dictionary(int8(), ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"quux\"]")); + + std::shared_ptr dict_type; + ASSERT_OK( + DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type)); + ASSERT_TRUE(dict_type->Equals(expected)); + + std::vector> transpose_maps; + ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type, + &transpose_maps)); + ASSERT_TRUE(dict_type->Equals(expected)); + + ASSERT_EQ(transpose_maps.size(), 2); + ASSERT_EQ(transpose_maps[0], std::vector({0, 1})); + ASSERT_EQ(transpose_maps[1], std::vector({2, 0})); +} + +TEST(TestDictionaryType, UnifyFixedSizeBinary) { + auto type = fixed_size_binary(3); + + std::string data = "foobarbazqux"; + auto buf = std::make_shared(data); + // ["foo", "bar"] + auto dict1 = std::make_shared(type, 2, SliceBuffer(buf, 0, 6)); + auto t1 = dictionary(int16(), dict1); + // ["bar", "baz", "qux"] + auto dict2 = std::make_shared(type, 3, SliceBuffer(buf, 3, 9)); + auto t2 = dictionary(int16(), dict2); + + // ["foo", "bar", "baz", "qux"] + auto expected_dict = std::make_shared(type, 4, buf); + auto expected = dictionary(int8(), expected_dict); + + std::shared_ptr dict_type; + ASSERT_OK( + DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type)); + ASSERT_TRUE(dict_type->Equals(expected)); + + std::vector> transpose_maps; + ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type, + &transpose_maps)); + ASSERT_TRUE(dict_type->Equals(expected)); + ASSERT_EQ(transpose_maps.size(), 2); + ASSERT_EQ(transpose_maps[0], std::vector({0, 1})); + ASSERT_EQ(transpose_maps[1], std::vector({1, 2, 3})); +} + +TEST(TestDictionaryType, UnifyLarge) { + // Unifying "large" dictionary types should choose the right index type + std::shared_ptr dict1, dict2, expected_dict; + + Int32Builder builder; + ASSERT_OK(builder.Reserve(120)); + for (int32_t i = 0; i < 120; ++i) { + builder.UnsafeAppend(i); + } + ASSERT_OK(builder.Finish(&dict1)); + ASSERT_EQ(dict1->length(), 120); + auto t1 = dictionary(int8(), dict1); + + ASSERT_OK(builder.Reserve(30)); + for (int32_t i = 110; i < 140; ++i) { + builder.UnsafeAppend(i); + } + ASSERT_OK(builder.Finish(&dict2)); + ASSERT_EQ(dict2->length(), 30); + auto t2 = dictionary(int8(), dict2); + + ASSERT_OK(builder.Reserve(140)); + for (int32_t i = 0; i < 140; ++i) { + builder.UnsafeAppend(i); + } + ASSERT_OK(builder.Finish(&expected_dict)); + ASSERT_EQ(expected_dict->length(), 140); + // int8 would be too narrow to hold all possible index values + auto expected = dictionary(int16(), expected_dict); + + std::shared_ptr dict_type; + ASSERT_OK( + DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type)); + ASSERT_TRUE(dict_type->Equals(expected)); +} + TEST(TypesTest, TestDecimal128Small) { Decimal128Type t1(8, 4); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 5f1ca8d7b0f09..753cb65ff26da 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -260,7 +260,12 @@ DictionaryType::DictionaryType(const std::shared_ptr& index_type, : FixedWidthType(Type::DICTIONARY), index_type_(index_type), dictionary_(dictionary), - ordered_(ordered) {} + ordered_(ordered) { +#ifndef NDEBUG + const auto& int_type = checked_cast(*index_type); + DCHECK_EQ(int_type.is_signed(), true) << "dictionary index type should be signed"; +#endif +} int DictionaryType::bit_width() const { return checked_cast(*index_type_).bit_width(); diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 9694202b9705c..8f6cfd6ced4ff 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -39,6 +39,7 @@ namespace arrow { class Array; class Field; +class MemoryPool; struct Type { /// \brief Main data type enumeration @@ -768,6 +769,23 @@ class ARROW_EXPORT DictionaryType : public FixedWidthType { bool ordered() const { return ordered_; } + /// \brief Unify several dictionary types + /// + /// Compute a resulting dictionary that will allow the union of values + /// of all input dictionary types. The input types must all have the + /// same value type. + /// \param[in] pool Memory pool to allocate dictionary values from + /// \param[in] types A sequence of input dictionary types + /// \param[out] out_type The unified dictionary type + /// \param[out] out_transpose_maps (optionally) A sequence of integer vectors, + /// one per input type. Each integer vector represents the transposition + /// of input type indices into unified type indices. + // XXX Should we return something special (an empty transpose map?) when + // the transposition is the identity function? + static Status Unify(MemoryPool* pool, const std::vector& types, + std::shared_ptr* out_type, + std::vector>* out_transpose_maps = NULLPTR); + private: // Must be an integer type (not currently checked) std::shared_ptr index_type_; diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h index ee368fb4e314c..76724b2a30035 100644 --- a/cpp/src/arrow/util/hashing.h +++ b/cpp/src/arrow/util/hashing.h @@ -651,25 +651,6 @@ template struct HashTraits> { using c_type = typename T::c_type; using MemoTableType = SmallScalarMemoTable; - - static Status GetDictionaryArrayData(MemoryPool* pool, - const std::shared_ptr& type, - const MemoTableType& memo_table, - int64_t start_offset, - std::shared_ptr* out) { - std::shared_ptr dict_buffer; - auto dict_length = static_cast(memo_table.size()) - start_offset; - // This makes a copy, but we assume a dictionary array is usually small - // compared to the size of the dictionary-using array. - // (also, copying the dictionary values is cheap compared to the cost - // of building the memo table) - RETURN_NOT_OK( - AllocateBuffer(pool, TypeTraits::bytes_required(dict_length), &dict_buffer)); - memo_table.CopyValues(static_cast(start_offset), - reinterpret_cast(dict_buffer->mutable_data())); - *out = ArrayData::Make(type, dict_length, {nullptr, dict_buffer}, 0 /* null_count */); - return Status::OK(); - } }; template @@ -677,25 +658,6 @@ struct HashTraits< T, typename std::enable_if::value && !is_8bit_int::value>::type> { using c_type = typename T::c_type; using MemoTableType = ScalarMemoTable; - - static Status GetDictionaryArrayData(MemoryPool* pool, - const std::shared_ptr& type, - const MemoTableType& memo_table, - int64_t start_offset, - std::shared_ptr* out) { - std::shared_ptr dict_buffer; - auto dict_length = static_cast(memo_table.size()) - start_offset; - // This makes a copy, but we assume a dictionary array is usually small - // compared to the size of the dictionary-using array. - // (also, copying the dictionary values is cheap compared to the cost - // of building the memo table) - RETURN_NOT_OK( - AllocateBuffer(pool, TypeTraits::bytes_required(dict_length), &dict_buffer)); - memo_table.CopyValues(static_cast(start_offset), - reinterpret_cast(dict_buffer->mutable_data())); - *out = ArrayData::Make(type, dict_length, {nullptr, dict_buffer}, 0 /* null_count */); - return Status::OK(); - } }; template diff --git a/cpp/src/arrow/util/int-util-test.cc b/cpp/src/arrow/util/int-util-test.cc index 018eeda7248a3..5eba531d874e0 100644 --- a/cpp/src/arrow/util/int-util-test.cc +++ b/cpp/src/arrow/util/int-util-test.cc @@ -373,5 +373,14 @@ TEST(IntWidth, NullsMany) { } } +TEST(TransposeInts, Int8ToInt64) { + std::vector src = {1, 3, 5, 0, 3, 2}; + std::vector transpose_map = {1111, 2222, 3333, 4444, 5555, 6666, 7777}; + std::vector dest(src.size()); + + TransposeInts(src.data(), dest.data(), 6, transpose_map.data()); + ASSERT_EQ(dest, std::vector({2222, 4444, 6666, 1111, 4444, 3333})); +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/int-util.cc b/cpp/src/arrow/util/int-util.cc index ced1cd1c20da2..d81044b3cafdc 100644 --- a/cpp/src/arrow/util/int-util.cc +++ b/cpp/src/arrow/util/int-util.cc @@ -402,5 +402,45 @@ void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length) { memcpy(dest, source, length * sizeof(int64_t)); } +template +void TransposeInts(const InputInt* src, OutputInt* dest, int64_t length, + const int32_t* transpose_map) { + while (length >= 4) { + dest[0] = static_cast(transpose_map[src[0]]); + dest[1] = static_cast(transpose_map[src[1]]); + dest[2] = static_cast(transpose_map[src[2]]); + dest[3] = static_cast(transpose_map[src[3]]); + length -= 4; + src += 4; + dest += 4; + } + while (length > 0) { + *dest++ = static_cast(transpose_map[*src++]); + --length; + } +} + +#define INSTANTIATE(SRC, DEST) \ + template ARROW_EXPORT void TransposeInts( \ + const SRC* source, DEST* dest, int64_t length, const int32_t* transpose_map); + +#define INSTANTIATE_ALL_DEST(DEST) \ + INSTANTIATE(int8_t, DEST) \ + INSTANTIATE(int16_t, DEST) \ + INSTANTIATE(int32_t, DEST) \ + INSTANTIATE(int64_t, DEST) + +#define INSTANTIATE_ALL() \ + INSTANTIATE_ALL_DEST(int8_t) \ + INSTANTIATE_ALL_DEST(int16_t) \ + INSTANTIATE_ALL_DEST(int32_t) \ + INSTANTIATE_ALL_DEST(int64_t) + +INSTANTIATE_ALL() + +#undef INSTANTIATE +#undef INSTANTIATE_ALL +#undef INSTANTIATE_ALL_DEST + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/int-util.h b/cpp/src/arrow/util/int-util.h index 68355d34549ac..66d389e5f40cf 100644 --- a/cpp/src/arrow/util/int-util.h +++ b/cpp/src/arrow/util/int-util.h @@ -63,6 +63,10 @@ void DowncastUInts(const uint64_t* source, uint32_t* dest, int64_t length); ARROW_EXPORT void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length); +template +ARROW_EXPORT void TransposeInts(const InputInt* source, OutputInt* dest, int64_t length, + const int32_t* transpose_map); + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h index b6fc1f1ff2bfb..a5deaa7a1d22c 100644 --- a/cpp/src/arrow/visitor_inline.h +++ b/cpp/src/arrow/visitor_inline.h @@ -121,7 +121,7 @@ inline Status VisitArrayInline(const Array& array, VISITOR* visitor) { // The scalar value's type depends on the array data type: // - the type's `c_type`, if any // - for boolean arrays, a `bool` -// - for binary, string and fixed-size binary arrars, a `util::string_view` +// - for binary, string and fixed-size binary arrays, a `util::string_view` template struct ArrayDataVisitor {}; diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 310656d86fd47..af2d1139c43fe 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -303,8 +303,8 @@ def test_dictionary_type(): assert ty0.dictionary.to_pylist() == ['a', 'b', 'c'] assert ty0.ordered is False - ty1 = pa.dictionary(pa.float32(), pa.array([1.0, 2.0]), ordered=True) - assert ty1.index_type == pa.float32() + ty1 = pa.dictionary(pa.int8(), pa.array([1.0, 2.0]), ordered=True) + assert ty1.index_type == pa.int8() assert isinstance(ty0.dictionary, pa.Array) assert ty1.dictionary.to_pylist() == [1.0, 2.0] assert ty1.ordered is True