forked from apache/arrow
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
apacheGH-38542: [C++][Parquet] Faster scalar BYTE_STREAM_SPLIT (apach…
…e#38529) ### Rationale for this change BYTE_STREAM_SPLIT encoding and decoding benefit from SIMD accelerations on x86, but scalar implementations are used otherwise. ### What changes are included in this PR? Improve the speed of scalar implementations of BYTE_STREAM_SPLIT by using a blocked algorithm. Benchmark numbers on the author's machine (AMD Ryzen 9 3900X): * before: ``` ------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... ------------------------------------------------------------------------------------------------------- BM_ByteStreamSplitDecode_Float_Scalar/1024 1374 ns 1374 ns 510396 bytes_per_second=2.77574G/s BM_ByteStreamSplitDecode_Float_Scalar/4096 5483 ns 5483 ns 127498 bytes_per_second=2.78303G/s BM_ByteStreamSplitDecode_Float_Scalar/32768 44042 ns 44035 ns 15905 bytes_per_second=2.77212G/s BM_ByteStreamSplitDecode_Float_Scalar/65536 87966 ns 87952 ns 7962 bytes_per_second=2.77583G/s BM_ByteStreamSplitDecode_Double_Scalar/1024 2583 ns 2583 ns 271436 bytes_per_second=2.95408G/s BM_ByteStreamSplitDecode_Double_Scalar/4096 10533 ns 10532 ns 65695 bytes_per_second=2.89761G/s BM_ByteStreamSplitDecode_Double_Scalar/32768 84067 ns 84053 ns 8275 bytes_per_second=2.90459G/s BM_ByteStreamSplitDecode_Double_Scalar/65536 168332 ns 168309 ns 4155 bytes_per_second=2.9011G/s BM_ByteStreamSplitEncode_Float_Scalar/1024 1435 ns 1435 ns 484278 bytes_per_second=2.65802G/s BM_ByteStreamSplitEncode_Float_Scalar/4096 5725 ns 5725 ns 121877 bytes_per_second=2.66545G/s BM_ByteStreamSplitEncode_Float_Scalar/32768 46291 ns 46283 ns 15134 bytes_per_second=2.63745G/s BM_ByteStreamSplitEncode_Float_Scalar/65536 91139 ns 91128 ns 7707 bytes_per_second=2.6791G/s BM_ByteStreamSplitEncode_Double_Scalar/1024 3093 ns 3093 ns 226198 bytes_per_second=2.46663G/s BM_ByteStreamSplitEncode_Double_Scalar/4096 12724 ns 12722 ns 54522 bytes_per_second=2.39873G/s 
BM_ByteStreamSplitEncode_Double_Scalar/32768 100488 ns 100475 ns 6957 bytes_per_second=2.42987G/s BM_ByteStreamSplitEncode_Double_Scalar/65536 200885 ns 200852 ns 3486 bytes_per_second=2.43105G/s ``` * after: ``` ------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... ------------------------------------------------------------------------------------------------------- BM_ByteStreamSplitDecode_Float_Scalar/1024 932 ns 932 ns 753352 bytes_per_second=4.09273G/s BM_ByteStreamSplitDecode_Float_Scalar/4096 3715 ns 3715 ns 188394 bytes_per_second=4.10783G/s BM_ByteStreamSplitDecode_Float_Scalar/32768 30167 ns 30162 ns 23441 bytes_per_second=4.04716G/s BM_ByteStreamSplitDecode_Float_Scalar/65536 59483 ns 59475 ns 11744 bytes_per_second=4.10496G/s BM_ByteStreamSplitDecode_Double_Scalar/1024 1862 ns 1862 ns 374715 bytes_per_second=4.09823G/s BM_ByteStreamSplitDecode_Double_Scalar/4096 7554 ns 7553 ns 91975 bytes_per_second=4.04038G/s BM_ByteStreamSplitDecode_Double_Scalar/32768 60429 ns 60421 ns 11499 bytes_per_second=4.04067G/s BM_ByteStreamSplitDecode_Double_Scalar/65536 120992 ns 120972 ns 5756 bytes_per_second=4.03631G/s BM_ByteStreamSplitEncode_Float_Scalar/1024 737 ns 737 ns 947423 bytes_per_second=5.17843G/s BM_ByteStreamSplitEncode_Float_Scalar/4096 2934 ns 2933 ns 239459 bytes_per_second=5.20175G/s BM_ByteStreamSplitEncode_Float_Scalar/32768 23730 ns 23727 ns 29243 bytes_per_second=5.14485G/s BM_ByteStreamSplitEncode_Float_Scalar/65536 47671 ns 47664 ns 14682 bytes_per_second=5.12209G/s BM_ByteStreamSplitEncode_Double_Scalar/1024 1517 ns 1517 ns 458928 bytes_per_second=5.02827G/s BM_ByteStreamSplitEncode_Double_Scalar/4096 6224 ns 6223 ns 111361 bytes_per_second=4.90407G/s BM_ByteStreamSplitEncode_Double_Scalar/32768 49719 ns 49713 ns 14059 bytes_per_second=4.91099G/s BM_ByteStreamSplitEncode_Double_Scalar/65536 99445 ns 99432 ns 7027 bytes_per_second=4.91072G/s ``` ### 
Are these changes tested? Yes, though the scalar implementations are unfortunately only exercised on non-x86 by default (see added comment in the PR). ### Are there any user-facing changes? No. * Closes: apache#38542 Authored-by: Antoine Pitrou <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
- Loading branch information
Showing
5 changed files
with
298 additions
and
31 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#include <algorithm> | ||
#include <cmath> | ||
#include <cstddef> | ||
#include <functional> | ||
#include <iostream> | ||
#include <memory> | ||
#include <random> | ||
#include <string> | ||
#include <type_traits> | ||
#include <utility> | ||
#include <vector> | ||
|
||
#include <gtest/gtest.h> | ||
|
||
#include "arrow/testing/gtest_util.h" | ||
#include "arrow/testing/util.h" | ||
#include "arrow/util/byte_stream_split_internal.h" | ||
|
||
namespace arrow::util::internal { | ||
|
||
// Value types for which the typed test suite below is instantiated.
using ByteStreamSplitTypes = ::testing::Types<float, double>;
|
||
// Pairs a callable with a human-readable label.
//
// Streaming a NamedFunc prints only the label, which lets it be passed
// directly to tracing/diagnostic helpers that format their arguments
// with operator<<.
template <typename Func>
struct NamedFunc {
  // Label written by operator<<
  std::string name;
  // The wrapped callable
  Func func;

  friend std::ostream& operator<<(std::ostream& os, const NamedFunc& func) {
    os << func.name;
    return os;
  }
};
|
||
// A simplistic reference implementation for validation.
//
// Reads `num_values` consecutive values of `width` bytes each from `src`
// and scatters them into `width` byte streams stored back to back in
// `dest`: byte number `stream` of value number `i` is written to
// `dest[stream * num_values + i]`.
//
// Both `src` and `dest` must be at least `width * num_values` bytes long.
// Nothing is written when `num_values` is 0.
//
// NOTE: the spelling of the function name is kept as is, since the test
// fixture in this file calls it under this name.
void RefererenceByteStreamSplitEncode(const uint8_t* src, int width,
                                      const int64_t num_values, uint8_t* dest) {
  for (int64_t i = 0; i < num_values; ++i) {
    for (int stream = 0; stream < width; ++stream) {
      // `src` is consumed sequentially: value by value, byte by byte
      dest[stream * num_values + i] = *src++;
    }
  }
}
|
||
template <typename T> | ||
class TestByteStreamSplitSpecialized : public ::testing::Test { | ||
public: | ||
using EncodeFunc = NamedFunc<std::function<decltype(ByteStreamSplitEncode<T>)>>; | ||
using DecodeFunc = NamedFunc<std::function<decltype(ByteStreamSplitDecode<T>)>>; | ||
|
||
static constexpr int kWidth = static_cast<int>(sizeof(T)); | ||
|
||
void SetUp() override { | ||
encode_funcs_.push_back({"reference", &ReferenceEncode}); | ||
encode_funcs_.push_back({"scalar", &ByteStreamSplitEncodeScalar<T>}); | ||
decode_funcs_.push_back({"scalar", &ByteStreamSplitDecodeScalar<T>}); | ||
#if defined(ARROW_HAVE_SIMD_SPLIT) | ||
encode_funcs_.push_back({"simd", &ByteStreamSplitEncodeSimd<T>}); | ||
decode_funcs_.push_back({"simd", &ByteStreamSplitDecodeSimd<T>}); | ||
#endif | ||
} | ||
|
||
void TestRoundtrip(int64_t num_values) { | ||
// Test one-shot roundtrip among all encode/decode function combinations | ||
ARROW_SCOPED_TRACE("num_values = ", num_values); | ||
const auto input = MakeRandomInput(num_values); | ||
std::vector<uint8_t> encoded(num_values * kWidth); | ||
std::vector<T> decoded(num_values); | ||
|
||
for (const auto& encode_func : encode_funcs_) { | ||
ARROW_SCOPED_TRACE("encode_func = ", encode_func); | ||
encoded.assign(encoded.size(), 0); | ||
encode_func.func(reinterpret_cast<const uint8_t*>(input.data()), num_values, | ||
encoded.data()); | ||
for (const auto& decode_func : decode_funcs_) { | ||
ARROW_SCOPED_TRACE("decode_func = ", decode_func); | ||
decoded.assign(decoded.size(), T{}); | ||
decode_func.func(encoded.data(), num_values, /*stride=*/num_values, | ||
decoded.data()); | ||
ASSERT_EQ(decoded, input); | ||
} | ||
} | ||
} | ||
|
||
void TestPiecewiseDecode(int64_t num_values) { | ||
// Test chunked decoding against the reference encode function | ||
ARROW_SCOPED_TRACE("num_values = ", num_values); | ||
const auto input = MakeRandomInput(num_values); | ||
std::vector<uint8_t> encoded(num_values * kWidth); | ||
ReferenceEncode(reinterpret_cast<const uint8_t*>(input.data()), num_values, | ||
encoded.data()); | ||
std::vector<T> decoded(num_values); | ||
|
||
std::default_random_engine gen(seed_++); | ||
std::uniform_int_distribution<int64_t> chunk_size_dist(1, 123); | ||
|
||
for (const auto& decode_func : decode_funcs_) { | ||
ARROW_SCOPED_TRACE("decode_func = ", decode_func); | ||
decoded.assign(decoded.size(), T{}); | ||
|
||
int64_t offset = 0; | ||
while (offset < num_values) { | ||
auto chunk_size = std::min<int64_t>(num_values - offset, chunk_size_dist(gen)); | ||
decode_func.func(encoded.data() + offset, chunk_size, /*stride=*/num_values, | ||
decoded.data() + offset); | ||
offset += chunk_size; | ||
} | ||
ASSERT_EQ(offset, num_values); | ||
ASSERT_EQ(decoded, input); | ||
} | ||
} | ||
|
||
protected: | ||
static void ReferenceEncode(const uint8_t* raw_values, const int64_t num_values, | ||
uint8_t* output_buffer_raw) { | ||
RefererenceByteStreamSplitEncode(raw_values, kWidth, num_values, output_buffer_raw); | ||
} | ||
|
||
static std::vector<T> MakeRandomInput(int64_t num_values) { | ||
std::vector<T> input(num_values); | ||
random_bytes(kWidth * num_values, seed_++, reinterpret_cast<uint8_t*>(input.data())); | ||
// Avoid NaNs to ease comparison | ||
for (auto& value : input) { | ||
if (std::isnan(value)) { | ||
value = nan_replacement_++; | ||
} | ||
} | ||
return input; | ||
} | ||
|
||
std::vector<EncodeFunc> encode_funcs_; | ||
std::vector<DecodeFunc> decode_funcs_; | ||
|
||
static inline uint32_t seed_ = 42; | ||
static inline T nan_replacement_ = 0; | ||
}; | ||
|
||
// Instantiate the fixture once per type listed in ByteStreamSplitTypes.
TYPED_TEST_SUITE(TestByteStreamSplitSpecialized, ByteStreamSplitTypes);
|
||
// Roundtrip a handful of small inputs, from a single value up to 32 values.
TYPED_TEST(TestByteStreamSplitSpecialized, RoundtripSmall) {
  for (int64_t num_values : {1, 5, 7, 12, 19, 31, 32}) {
    this->TestRoundtrip(num_values);
  }
}
|
||
// Roundtrip mid-sized inputs clustered around 128 values.
// NOTE(review): presumably chosen to straddle the internal block sizes of the
// implementations under test -- confirm against byte_stream_split_internal.h.
TYPED_TEST(TestByteStreamSplitSpecialized, RoundtripMidsized) {
  for (int64_t num_values : {126, 127, 128, 129, 133, 200}) {
    this->TestRoundtrip(num_values);
  }
}
|
||
// Decode a 500-value input in randomly sized chunks instead of in one shot.
TYPED_TEST(TestByteStreamSplitSpecialized, PiecewiseDecode) {
  this->TestPiecewiseDecode(/*num_values=*/500);
}
|
||
} // namespace arrow::util::internal |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.