diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index fab955164885d..717c716330563 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -737,102 +737,109 @@ static void BM_DeltaLengthDecodingSpacedByteArray(benchmark::State& state) { BENCHMARK(BM_PlainDecodingSpacedByteArray)->Apply(ByteArrayCustomArguments); BENCHMARK(BM_DeltaLengthDecodingSpacedByteArray)->Apply(ByteArrayCustomArguments); -void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, - int min_size, int max_size, double prefixed_probability) { - std::default_random_engine gen(seed); - std::uniform_int_distribution dist_size(min_size, max_size); - std::uniform_int_distribution dist_byte(0, 255); - std::bernoulli_distribution dist_has_prefix(prefixed_probability); - std::uniform_real_distribution dist_prefix_length(0, 1); - - for (int i = 0; i < n; ++i) { - int len = dist_size(gen); - out[i].len = len; - out[i].ptr = buf; - - bool do_prefix = dist_has_prefix(gen) && i > 0; - int prefix_len = 0; - if (do_prefix) { - int max_prefix_len = std::min(len, static_cast(out[i - 1].len)); - prefix_len = static_cast(std::ceil(max_prefix_len * dist_prefix_length(gen))); - } - for (int j = 0; j < prefix_len; ++j) { - buf[j] = out[i - 1].ptr[j]; - } - for (int j = prefix_len; j < len; ++j) { - buf[j] = static_cast(dist_byte(gen)); +struct DeltaByteArrayState { + int32_t min_size = 0; + int32_t max_size; + int32_t array_length; + int32_t total_data_size = 0; + double prefixed_probability; + std::vector buf; + + explicit DeltaByteArrayState(const benchmark::State& state) + : max_size(static_cast(state.range(0))), + array_length(static_cast(state.range(1))), + prefixed_probability(state.range(2) / 100.0) {} + + std::vector MakeRandomByteArray(uint32_t seed) { + std::default_random_engine gen(seed); + std::uniform_int_distribution dist_size(min_size, max_size); + std::uniform_int_distribution dist_byte(0, 255); + std::bernoulli_distribution dist_has_prefix(prefixed_probability); + std::uniform_real_distribution dist_prefix_length(0, 1); + + std::vector out(array_length); + buf.resize(max_size * array_length); + auto buf_ptr = buf.data(); + total_data_size = 0; + + for (int32_t i = 0; i < array_length; ++i) { + int len = dist_size(gen); + out[i].len = len; + out[i].ptr = buf_ptr; + + bool do_prefix = i > 0 && dist_has_prefix(gen); + int prefix_len = 0; + if (do_prefix) { + int max_prefix_len = std::min(len, static_cast(out[i - 1].len)); + prefix_len = + static_cast(std::ceil(max_prefix_len * dist_prefix_length(gen))); + } + for (int j = 0; j < prefix_len; ++j) { + buf_ptr[j] = out[i - 1].ptr[j]; + } + for (int j = prefix_len; j < len; ++j) { + buf_ptr[j] = static_cast(dist_byte(gen)); + } + buf_ptr += len; + total_data_size += len; } - buf += len; + return out; } -} +}; static void BM_DeltaEncodingByteArray(benchmark::State& state) { - int32_t min_length = static_cast(state.range(0)); - int32_t max_length = static_cast(state.range(1)); - int32_t array_size = static_cast(state.range(2)); - double prefixed_probability = state.range(3) / 100; + DeltaByteArrayState delta_state(state); + std::vector values = delta_state.MakeRandomByteArray(/*seed=*/42); + auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY); - std::vector values; - std::vector buf(max_length * array_size); - values.resize(array_size); - prefixed_random_byte_array(array_size, /*seed=*/0, buf.data(), values.data(), - min_length, max_length, - /*prefixed_probability=*/prefixed_probability); - int64_t actual_length = 0; - for (auto v : values) { - actual_length += v.len; - } + const int64_t plain_encoded_size = + delta_state.total_data_size + 4 * delta_state.array_length; + int64_t encoded_size = 0; for (auto _ : state) { encoder->Put(values.data(), static_cast(values.size())); - encoder->FlushValues(); + encoded_size = encoder->FlushValues()->size(); } - state.SetItemsProcessed(state.iterations() * array_size); - state.SetBytesProcessed(state.iterations() * actual_length); + state.SetItemsProcessed(state.iterations() * delta_state.array_length); + state.SetBytesProcessed(state.iterations() * delta_state.total_data_size); + state.counters["compression_ratio"] = + static_cast(plain_encoded_size) / encoded_size; } static void BM_DeltaDecodingByteArray(benchmark::State& state) { - int32_t min_length = static_cast(state.range(0)); - int32_t max_length = static_cast(state.range(1)); - int32_t array_size = static_cast(state.range(2)); - double prefixed_probability = state.range(3) / 100; + DeltaByteArrayState delta_state(state); + std::vector values = delta_state.MakeRandomByteArray(/*seed=*/42); + auto encoder = MakeTypedEncoder(Encoding::DELTA_BYTE_ARRAY); - std::vector values; - std::vector input_buf(max_length * array_size); - values.resize(array_size); - prefixed_random_byte_array(array_size, /*seed=*/0, input_buf.data(), values.data(), - min_length, max_length, - /*prefixed_probability=*/prefixed_probability); - int64_t actual_length = 0; - for (auto v : values) { - actual_length += v.len; - } encoder->Put(values.data(), static_cast(values.size())); std::shared_ptr buf = encoder->FlushValues(); + const int64_t plain_encoded_size = + delta_state.total_data_size + 4 * delta_state.array_length; + const int64_t encoded_size = buf->size(); + auto decoder = MakeTypedDecoder(Encoding::DELTA_BYTE_ARRAY); for (auto _ : state) { - decoder->SetData(array_size, buf->data(), static_cast(buf->size())); + decoder->SetData(delta_state.array_length, buf->data(), + static_cast(buf->size())); decoder->Decode(values.data(), static_cast(values.size())); ::benchmark::DoNotOptimize(values); } - state.SetItemsProcessed(state.iterations() * array_size); - state.SetBytesProcessed(state.iterations() * actual_length); + state.SetItemsProcessed(state.iterations() * delta_state.array_length); + state.SetBytesProcessed(state.iterations() * delta_state.total_data_size); + state.counters["compression_ratio"] = + static_cast(plain_encoded_size) / encoded_size; } static void ByteArrayDeltaCustomArguments(benchmark::internal::Benchmark* b) { for (int max_string_length : {8, 64, 1024}) { for (int batch_size : {512, 2048}) { - std::vector> prefix_gen_params = { - {10, 0}, {90, max_string_length / 2}, {99, max_string_length}}; - for (auto& [prefixed_probability, min_prefix_string_length] : prefix_gen_params) { - b->Args({min_prefix_string_length, max_string_length, batch_size, - prefixed_probability}); + for (int prefixed_percent : {10, 90, 99}) { + b->Args({max_string_length, batch_size, prefixed_percent}); } } } - b->ArgNames({"min-prefix-string-length", "max-string-length", "batch-size", - "prefixed-probability"}); + b->ArgNames({"max-string-length", "batch-size", "prefixed-percent"}); } BENCHMARK(BM_DeltaEncodingByteArray)->Apply(ByteArrayDeltaCustomArguments);