Skip to content

Commit

Permalink
PARQUET-1463: [C++] Utilize common hashing machinery for dictionary encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
pitrou committed Nov 26, 2018
1 parent 54b0af8 commit f909f0a
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 223 deletions.
8 changes: 8 additions & 0 deletions cpp/src/arrow/util/hashing-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,14 @@ TEST(BinaryMemoTable, Basics) {
table.CopyValues(4 /* start offset */, reinterpret_cast<uint8_t*>(&values[0]));
ASSERT_EQ(values, expected_values);
}
// Exercise VisitValues(): starting at offset 1, the visitor is expected to
// receive the values B through F, in that order.
{
std::vector<std::string> expected({B, C, D, E, F});
std::vector<std::string> actual;
table.VisitValues(1 /* start offset */, [&](const util::string_view& v) {
// Copy each visited view into an owning std::string so it can be compared.
actual.emplace_back(v.data(), v.length());
});
ASSERT_EQ(actual, expected);
}
}

TEST(BinaryMemoTable, Stress) {
Expand Down
12 changes: 12 additions & 0 deletions cpp/src/arrow/util/hashing.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@

#include "arrow/array.h"
#include "arrow/buffer.h"
#include "arrow/builder.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit-util.h"
Expand Down Expand Up @@ -605,6 +606,17 @@ class BinaryMemoTable {
CopyValues(0, out_size, out_data);
}

// Call `visit` once per stored value, in insertion order, beginning with the
// value at index `start`.
//
// Each value is handed to the visitor as a `util::string_view` built from
// `values_` and two consecutive entries of `offsets_`. The visitor should be
// callable as `void(util::string_view)` or `void(const util::string_view&)`.
template <typename VisitFunc>
void VisitValues(int32_t start, VisitFunc&& visit) const {
  // The index is unsigned, as in the loop bound comparison against size().
  uint32_t index = static_cast<uint32_t>(start);
  while (index < offsets_.size() - 1) {
    const auto value_begin = offsets_[index];
    const auto value_end = offsets_[index + 1];
    // Consecutive offsets delimit one value inside the values_ storage.
    visit(util::string_view(values_.data() + value_begin, value_end - value_begin));
    ++index;
  }
}

protected:
struct Payload {
int32_t memo_index;
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/parquet/encoding-benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,7 @@ static void DecodeDict(std::vector<typename Type::c_type>& values,

DictEncoder<Type> encoder(descr.get(), &pool, allocator);
for (int i = 0; i < num_values; ++i) {
// No SSE
encoder.template Put<false>(values[i]);
encoder.Put(values[i]);
}

std::shared_ptr<ResizableBuffer> dict_buffer =
Expand Down
Loading

0 comments on commit f909f0a

Please sign in to comment.