Skip to content

Commit

Permalink
Simplify and improve benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
pitrou committed Sep 26, 2023
1 parent e8e8766 commit 557d0ca
Showing 1 changed file with 74 additions and 67 deletions.
141 changes: 74 additions & 67 deletions cpp/src/parquet/encoding_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -737,102 +737,109 @@ static void BM_DeltaLengthDecodingSpacedByteArray(benchmark::State& state) {
// Register the two spaced byte-array decoding benchmarks; their argument sets are
// supplied by ByteArrayCustomArguments (defined outside this view).
BENCHMARK(BM_PlainDecodingSpacedByteArray)->Apply(ByteArrayCustomArguments);
BENCHMARK(BM_DeltaLengthDecodingSpacedByteArray)->Apply(ByteArrayCustomArguments);

// NOTE(review): this span appears to be a rendered diff whose +/- markers were lost.
// The removed free function prefixed_random_byte_array() and the added struct
// DeltaByteArrayState are interleaved, so the text is not compilable as shown.
//
// prefixed_random_byte_array (apparently the removed version): fills out[0..n) with
// random byte strings written back-to-back into `buf`. Each length is drawn uniformly
// from [min_size, max_size]. Except for the first element, a value starts, with
// probability `prefixed_probability`, with a copy of the leading bytes of the previous
// value; the remaining bytes are uniform in [0, 255]. The caller must provide a `buf`
// large enough for all generated bytes.
void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out,
int min_size, int max_size, double prefixed_probability) {
std::default_random_engine gen(seed);
std::uniform_int_distribution<int> dist_size(min_size, max_size);
std::uniform_int_distribution<int> dist_byte(0, 255);
std::bernoulli_distribution dist_has_prefix(prefixed_probability);
std::uniform_real_distribution<double> dist_prefix_length(0, 1);

for (int i = 0; i < n; ++i) {
int len = dist_size(gen);
out[i].len = len;
out[i].ptr = buf;

// The Bernoulli draw comes first here, so the generator is advanced even for i == 0.
bool do_prefix = dist_has_prefix(gen) && i > 0;
int prefix_len = 0;
if (do_prefix) {
// The shared prefix cannot be longer than either this value or the previous one.
int max_prefix_len = std::min(len, static_cast<int>(out[i - 1].len));
prefix_len = static_cast<int>(std::ceil(max_prefix_len * dist_prefix_length(gen)));
}
for (int j = 0; j < prefix_len; ++j) {
buf[j] = out[i - 1].ptr[j];
}
for (int j = prefix_len; j < len; ++j) {
buf[j] = static_cast<uint8_t>(dist_byte(gen));
// DeltaByteArrayState (apparently the added version): bundles the benchmark parameters
// read from benchmark::State with the backing buffer for the generated values.
struct DeltaByteArrayState {
// Inclusive lower bound for generated value lengths.
int32_t min_size = 0;
// Inclusive upper bound for generated value lengths, taken from state.range(0).
int32_t max_size;
// Number of values to generate, taken from state.range(1).
int32_t array_length;
// Sum of the lengths of the generated values; recomputed by MakeRandomByteArray().
int32_t total_data_size = 0;
// Probability that a value shares a prefix with the previous one (state.range(2) / 100.0).
double prefixed_probability;
// Backing storage that the ByteArrays returned by MakeRandomByteArray() point into.
std::vector<uint8_t> buf;

// Reads the parameters from the benchmark arguments; range(2) is a percentage.
explicit DeltaByteArrayState(const benchmark::State& state)
: max_size(static_cast<int32_t>(state.range(0))),
array_length(static_cast<int32_t>(state.range(1))),
prefixed_probability(state.range(2) / 100.0) {}

// Generates `array_length` random values and returns them. The returned ByteArrays
// point into `buf`, so they are only usable while this object is alive and `buf` has
// not been resized (for example by calling this method again).
std::vector<ByteArray> MakeRandomByteArray(uint32_t seed) {
std::default_random_engine gen(seed);
std::uniform_int_distribution<int> dist_size(min_size, max_size);
std::uniform_int_distribution<int> dist_byte(0, 255);
std::bernoulli_distribution dist_has_prefix(prefixed_probability);
std::uniform_real_distribution<double> dist_prefix_length(0, 1);

std::vector<ByteArray> out(array_length);
// max_size bytes per value is the upper bound on the total amount of data written.
buf.resize(max_size * array_length);
auto buf_ptr = buf.data();
total_data_size = 0;

for (int32_t i = 0; i < array_length; ++i) {
int len = dist_size(gen);
out[i].len = len;
out[i].ptr = buf_ptr;

// The i > 0 test comes first, so no Bernoulli draw is consumed for the first value
// (unlike the free-function version above).
bool do_prefix = i > 0 && dist_has_prefix(gen);
int prefix_len = 0;
if (do_prefix) {
// The shared prefix cannot be longer than either this value or the previous one.
int max_prefix_len = std::min(len, static_cast<int>(out[i - 1].len));
prefix_len =
static_cast<int>(std::ceil(max_prefix_len * dist_prefix_length(gen)));
}
// Copy the shared prefix from the previous value, then fill the rest with random bytes.
for (int j = 0; j < prefix_len; ++j) {
buf_ptr[j] = out[i - 1].ptr[j];
}
for (int j = prefix_len; j < len; ++j) {
buf_ptr[j] = static_cast<uint8_t>(dist_byte(gen));
}
buf_ptr += len;
total_data_size += len;
}
buf += len;
return out;
}
}
};

// Benchmarks DELTA_BYTE_ARRAY encoding of randomly generated, partially prefixed
// byte-array values.
// NOTE(review): this span appears to be a rendered diff whose +/- markers were lost.
// Two versions of the body are interleaved (e.g. `values` is declared twice), so the
// text is not compilable as shown.
static void BM_DeltaEncodingByteArray(benchmark::State& state) {
int32_t min_length = static_cast<int32_t>(state.range(0));
int32_t max_length = static_cast<int32_t>(state.range(1));
int32_t array_size = static_cast<int32_t>(state.range(2));
// NOTE(review): if state.range() returns an integer type, this is an integer division
// and yields 0 for any percentage below 100; DeltaByteArrayState divides by 100.0.
double prefixed_probability = state.range(3) / 100;
DeltaByteArrayState delta_state(state);
std::vector<ByteArray> values = delta_state.MakeRandomByteArray(/*seed=*/42);

auto encoder = MakeTypedEncoder<ByteArrayType>(Encoding::DELTA_BYTE_ARRAY);
std::vector<ByteArray> values;
std::vector<uint8_t> buf(max_length * array_size);
values.resize(array_size);
prefixed_random_byte_array(array_size, /*seed=*/0, buf.data(), values.data(),
min_length, max_length,
/*prefixed_probability=*/prefixed_probability);
// Total number of payload bytes across all generated values.
int64_t actual_length = 0;
for (auto v : values) {
actual_length += v.len;
}
// Baseline size used for the compression ratio: the raw data plus 4 bytes per value
// (presumably the per-value length prefix of PLAIN encoding -- TODO confirm).
const int64_t plain_encoded_size =
delta_state.total_data_size + 4 * delta_state.array_length;
int64_t encoded_size = 0;

for (auto _ : state) {
encoder->Put(values.data(), static_cast<int>(values.size()));
encoder->FlushValues();
// Keep the size of the flushed buffer so the ratio can be reported after the loop.
encoded_size = encoder->FlushValues()->size();
}
state.SetItemsProcessed(state.iterations() * array_size);
state.SetBytesProcessed(state.iterations() * actual_length);
state.SetItemsProcessed(state.iterations() * delta_state.array_length);
state.SetBytesProcessed(state.iterations() * delta_state.total_data_size);
// Report how much smaller the delta-encoded output is than the plain baseline.
state.counters["compression_ratio"] =
static_cast<double>(plain_encoded_size) / encoded_size;
}

// Benchmarks DELTA_BYTE_ARRAY decoding: the input values are generated and encoded once
// up front, and only SetData() + Decode() run inside the timed loop.
// NOTE(review): this span appears to be a rendered diff whose +/- markers were lost.
// Two versions of the body are interleaved (e.g. `values` is declared twice), so the
// text is not compilable as shown.
static void BM_DeltaDecodingByteArray(benchmark::State& state) {
int32_t min_length = static_cast<int32_t>(state.range(0));
int32_t max_length = static_cast<int32_t>(state.range(1));
int32_t array_size = static_cast<int32_t>(state.range(2));
// NOTE(review): if state.range() returns an integer type, this is an integer division
// and yields 0 for any percentage below 100; DeltaByteArrayState divides by 100.0.
double prefixed_probability = state.range(3) / 100;
DeltaByteArrayState delta_state(state);
std::vector<ByteArray> values = delta_state.MakeRandomByteArray(/*seed=*/42);

auto encoder = MakeTypedEncoder<ByteArrayType>(Encoding::DELTA_BYTE_ARRAY);
std::vector<ByteArray> values;
std::vector<uint8_t> input_buf(max_length * array_size);
values.resize(array_size);
prefixed_random_byte_array(array_size, /*seed=*/0, input_buf.data(), values.data(),
min_length, max_length,
/*prefixed_probability=*/prefixed_probability);
// Total number of payload bytes across all generated values.
int64_t actual_length = 0;
for (auto v : values) {
actual_length += v.len;
}
// Encode once outside the timed loop; `buf` holds the encoded input for the decoder.
encoder->Put(values.data(), static_cast<int>(values.size()));
std::shared_ptr<Buffer> buf = encoder->FlushValues();

// Baseline size used for the compression ratio: the raw data plus 4 bytes per value
// (presumably the per-value length prefix of PLAIN encoding -- TODO confirm).
const int64_t plain_encoded_size =
delta_state.total_data_size + 4 * delta_state.array_length;
const int64_t encoded_size = buf->size();

auto decoder = MakeTypedDecoder<ByteArrayType>(Encoding::DELTA_BYTE_ARRAY);
for (auto _ : state) {
decoder->SetData(array_size, buf->data(), static_cast<int>(buf->size()));
decoder->SetData(delta_state.array_length, buf->data(),
static_cast<int>(buf->size()));
// Decoded values are written over `values`, reusing it as the output array.
decoder->Decode(values.data(), static_cast<int>(values.size()));
// Prevent the compiler from optimizing the decoded output away.
::benchmark::DoNotOptimize(values);
}
state.SetItemsProcessed(state.iterations() * array_size);
state.SetBytesProcessed(state.iterations() * actual_length);
state.SetItemsProcessed(state.iterations() * delta_state.array_length);
state.SetBytesProcessed(state.iterations() * delta_state.total_data_size);
// Report how much smaller the delta-encoded input is than the plain baseline.
state.counters["compression_ratio"] =
static_cast<double>(plain_encoded_size) / encoded_size;
}

// Registers the argument combinations for the DELTA_BYTE_ARRAY benchmarks.
// NOTE(review): this span appears to be a rendered diff whose +/- markers were lost.
// A four-argument version (min-prefix-string-length, max-string-length, batch-size,
// prefixed-probability) and a three-argument version (max-string-length, batch-size,
// prefixed-percent) are interleaved, so the braces do not balance as shown.
static void ByteArrayDeltaCustomArguments(benchmark::internal::Benchmark* b) {
for (int max_string_length : {8, 64, 1024}) {
for (int batch_size : {512, 2048}) {
// Four-argument version: each prefix percentage is paired with a minimum string length.
std::vector<std::pair<int, int>> prefix_gen_params = {
{10, 0}, {90, max_string_length / 2}, {99, max_string_length}};
for (auto& [prefixed_probability, min_prefix_string_length] : prefix_gen_params) {
b->Args({min_prefix_string_length, max_string_length, batch_size,
prefixed_probability});
// Three-argument version: the percentage is converted to a probability by the
// DeltaByteArrayState constructor (state.range(2) / 100.0).
for (int prefixed_percent : {10, 90, 99}) {
b->Args({max_string_length, batch_size, prefixed_percent});
}
}
}
b->ArgNames({"min-prefix-string-length", "max-string-length", "batch-size",
"prefixed-probability"});
b->ArgNames({"max-string-length", "batch-size", "prefixed-percent"});
}

// Register the DELTA_BYTE_ARRAY encoding benchmark over the argument combinations
// produced by ByteArrayDeltaCustomArguments.
BENCHMARK(BM_DeltaEncodingByteArray)->Apply(ByteArrayDeltaCustomArguments);
Expand Down

0 comments on commit 557d0ca

Please sign in to comment.