Skip to content

Commit

Permalink
ARROW-554: [C++] Add functions to unify dictionary types and arrays
Browse files Browse the repository at this point in the history
Author: Antoine Pitrou <[email protected]>

Closes apache#3165 from pitrou/ARROW-554-conform-dicts and squashes the following commits:

7d2579b <Antoine Pitrou> ARROW-554:  Add functions to conform dictionaries
  • Loading branch information
pitrou authored and wesm committed Dec 19, 2018
1 parent 320621d commit f66fa80
Show file tree
Hide file tree
Showing 13 changed files with 484 additions and 85 deletions.
62 changes: 62 additions & 0 deletions cpp/src/arrow/array-dict-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,16 @@
#include "arrow/test-common.h"
#include "arrow/test-util.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"

namespace arrow {

using std::string;
using std::vector;

using internal::checked_cast;

// ----------------------------------------------------------------------
// Dictionary tests

Expand Down Expand Up @@ -740,4 +743,63 @@ TEST(TestDictionary, FromArray) {
ASSERT_RAISES(Invalid, DictionaryArray::FromArrays(dict_type, indices4, &arr4));
}

TEST(TestDictionary, TransposeBasic) {
std::shared_ptr<Array> arr, out, expected;

auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]");
auto dict_type = dictionary(int16(), dict);
auto indices = ArrayFromJSON(int16(), "[1, 2, 0, 0]");
// ["B", "C", "A", "A"]
ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, &arr));

// Transpose to same index type
{
auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");
auto out_dict_type = dictionary(int16(), out_dict);

const std::vector<int32_t> transpose_map{1, 3, 2};
ASSERT_OK(internal::checked_cast<const DictionaryArray&>(*arr).Transpose(
default_memory_pool(), out_dict_type, transpose_map, &out));

auto expected_indices = ArrayFromJSON(int16(), "[3, 2, 1, 1]");
ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected));
AssertArraysEqual(*out, *expected);
}

// Transpose to other type
{
auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");
auto out_dict_type = dictionary(int8(), out_dict);

const std::vector<int32_t> transpose_map{1, 3, 2};
ASSERT_OK(internal::checked_cast<const DictionaryArray&>(*arr).Transpose(
default_memory_pool(), out_dict_type, transpose_map, &out));

auto expected_indices = ArrayFromJSON(int8(), "[3, 2, 1, 1]");
ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected));
AssertArraysEqual(*expected, *out);
}
}

TEST(TestDictionary, TransposeNulls) {
std::shared_ptr<Array> arr, out, expected;

auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]");
auto dict_type = dictionary(int16(), dict);
auto indices = ArrayFromJSON(int16(), "[1, 2, null, 0]");
// ["B", "C", null, "A"]
ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, &arr));

auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");
auto out_dict_type = dictionary(int16(), out_dict);

const std::vector<int32_t> transpose_map{1, 3, 2};
ASSERT_OK(internal::checked_cast<const DictionaryArray&>(*arr).Transpose(
default_memory_pool(), out_dict_type, transpose_map, &out));

auto expected_indices = ArrayFromJSON(int16(), "[3, 2, null, 1]");
ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected));
AssertArraysEqual(*expected, *out);
}

} // namespace arrow
61 changes: 61 additions & 0 deletions cpp/src/arrow/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include "arrow/util/bit-util.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
#include "arrow/util/int-util.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/visitor.h"
Expand Down Expand Up @@ -663,6 +664,66 @@ std::shared_ptr<Array> DictionaryArray::dictionary() const {
return dict_type_->dictionary();
}

template <typename InType, typename OutType>
static Status TransposeDictIndices(MemoryPool* pool, const ArrayData& in_data,
const std::shared_ptr<DataType>& type,
const std::vector<int32_t>& transpose_map,
std::shared_ptr<Array>* out) {
using in_c_type = typename InType::c_type;
using out_c_type = typename OutType::c_type;

std::shared_ptr<Buffer> out_buffer;
RETURN_NOT_OK(AllocateBuffer(pool, in_data.length * sizeof(out_c_type), &out_buffer));
// Null bitmap is unchanged
auto out_data = ArrayData::Make(type, in_data.length, {in_data.buffers[0], out_buffer},
in_data.null_count);
internal::TransposeInts(in_data.GetValues<in_c_type>(1),
out_data->GetMutableValues<out_c_type>(1), in_data.length,
transpose_map.data());
*out = MakeArray(out_data);
return Status::OK();
}

Status DictionaryArray::Transpose(MemoryPool* pool, const std::shared_ptr<DataType>& type,
const std::vector<int32_t>& transpose_map,
std::shared_ptr<Array>* out) const {
DCHECK_EQ(type->id(), Type::DICTIONARY);
const auto& out_dict_type = checked_cast<const DictionaryType&>(*type);

// XXX We'll probably want to make this operation a kernel when we
// implement dictionary-to-dictionary casting.
auto in_type_id = dict_type_->index_type()->id();
auto out_type_id = out_dict_type.index_type()->id();

#define TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, OUT_INDEX_TYPE) \
case OUT_INDEX_TYPE::type_id: \
return TransposeDictIndices<IN_INDEX_TYPE, OUT_INDEX_TYPE>(pool, *data(), type, \
transpose_map, out);

#define TRANSPOSE_IN_CASE(IN_INDEX_TYPE) \
case IN_INDEX_TYPE::type_id: \
switch (out_type_id) { \
TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int8Type) \
TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int16Type) \
TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int32Type) \
TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int64Type) \
default: \
return Status::NotImplemented("unexpected index type"); \
}

switch (in_type_id) {
TRANSPOSE_IN_CASE(Int8Type)
TRANSPOSE_IN_CASE(Int16Type)
TRANSPOSE_IN_CASE(Int32Type)
TRANSPOSE_IN_CASE(Int64Type)
default:
return Status::NotImplemented("unexpected index type");
}

#undef TRANSPOSE_IN_OUT_CASE
#undef TRANSPOSE_IN_CASE
}

// ----------------------------------------------------------------------
// Implement Array::Accept as inline visitor

Expand Down
24 changes: 23 additions & 1 deletion cpp/src/arrow/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,9 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray {

value_type Value(int64_t i) const { return raw_values()[i]; }

// For API compatibility with BinaryArray etc.
value_type GetView(int64_t i) const { return Value(i); }

protected:
using PrimitiveArray::PrimitiveArray;
};
Expand All @@ -442,6 +445,8 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray {
i + data_->offset);
}

bool GetView(int64_t i) const { return Value(i); }

protected:
using PrimitiveArray::PrimitiveArray;
};
Expand Down Expand Up @@ -802,14 +807,31 @@ class ARROW_EXPORT DictionaryArray : public Array {
/// This function does the validation of the indices and input type. It checks if
/// all indices are non-negative and smaller than the size of the dictionary
///
/// \param[in] type a data type containing a dictionary
/// \param[in] type a dictionary type
/// \param[in] indices an array of non-negative signed
/// integers smaller than the size of the dictionary
/// \param[out] out the resulting DictionaryArray instance
static Status FromArrays(const std::shared_ptr<DataType>& type,
const std::shared_ptr<Array>& indices,
std::shared_ptr<Array>* out);

/// \brief Transpose this DictionaryArray
///
/// This method constructs a new dictionary array with the given dictionary type,
/// transposing indices using the transpose map.
/// The type and the transpose map are typically computed using
/// DictionaryType::Unify.
///
/// \param[in] pool a pool to allocate the array data from
/// \param[in] type a dictionary type
/// \param[in] transpose_map a vector transposing this array's indices
/// into the target array's indices
/// \param[out] out the resulting DictionaryArray instance
Status Transpose(MemoryPool* pool, const std::shared_ptr<DataType>& type,
const std::vector<int32_t>& transpose_map,
std::shared_ptr<Array>* out) const;
// XXX Do we also want an unsafe in-place Transpose?

std::shared_ptr<Array> indices() const;
std::shared_ptr<Array> dictionary() const;

Expand Down
Loading

0 comments on commit f66fa80

Please sign in to comment.