Skip to content

Commit

Permalink
Optimize simd::leadingMask 10 times faster (#7736)
Browse files Browse the repository at this point in the history
Summary: Pull Request resolved: #7736

Reviewed By: oerling

Differential Revision: D51555527

fbshipit-source-id: a4fc6d60f23cbe31c7fb434ad9b56266e9663265
  • Loading branch information
Yuhta authored and facebook-github-bot committed Nov 28, 2023
1 parent faa9a3f commit 9f5c96b
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 8 deletions.
69 changes: 61 additions & 8 deletions velox/common/base/SimdUtil-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -242,19 +242,72 @@ int32_t indicesOfSetBits(
return result - originalResult;
}

namespace detail {

// NOTE(review): this span is a diff hunk whose +/- markers were lost. It
// appears to interleave the removed function-local implementation of
// leadingMask() (the lines using `memo` / `kMemo`) with the added LeadingMask
// struct (the lines using `memo_`); confirm against the commit before treating
// it as compilable code.
//
// LeadingMask is a table of N + 1 batch_bool values built once by its
// constructor: entry i has the first i lanes set and the remaining lanes
// clear, and entry N has every lane set.
template <typename T, typename A>
xsimd::batch_bool<T, A> leadingMask(int n, const A&) {
constexpr int N = xsimd::batch_bool<T, A>::size;
static const auto kMemo = ({
std::array<xsimd::batch_bool<T, A>, N> memo;
struct LeadingMask {
LeadingMask() {
// All lanes start false; one more leading lane is set after each store.
bool tmp[N]{};
for (int i = 0; i < N; ++i) {
memo[i] = xsimd::batch_bool<T, A>::load_unaligned(tmp);
memo_[i] = xsimd::batch_bool<T, A>::load_unaligned(tmp);
tmp[i] = true;
}
memo;
});
return LIKELY(n >= N) ? xsimd::batch_bool<T, A>(true) : kMemo[n];
// After the loop every element of tmp is true, so entry N is the full mask.
memo_[N] = xsimd::batch_bool<T, A>::load_unaligned(tmp);
}

// Returns table entry `i` by value. There is no bounds check; the table holds
// entries 0 through N inclusive.
xsimd::batch_bool<T, A> operator[](size_t i) const {
return memo_[i];
}

private:
// Number of lanes in one batch_bool<T, A>.
static constexpr int N = xsimd::batch_bool<T, A>::size;
// One entry per possible count of leading set lanes, 0 through N.
xsimd::batch_bool<T, A> memo_[N + 1];
};

// Shared leading-mask tables for 32-bit and 64-bit lanes on the default xsimd
// architecture. Only declared here; the definitions live out of line (in
// SimdUtil.cpp in this commit).
extern const LeadingMask<int32_t, xsimd::default_arch> leadingMask32;
extern const LeadingMask<int64_t, xsimd::default_arch> leadingMask64;

// Table lookup for element type T at index `i`. This is a declaration only:
// the explicit specializations that follow provide the definitions, so any
// other T has no definition in this header. Note the return type is always for
// xsimd::default_arch, regardless of A.
template <typename T, typename A>
xsimd::batch_bool<T, xsimd::default_arch> leadingMask(int i, const A&);

// int32_t lanes: returns entry `i` of the shared 32-bit table. `i` is used as
// the table index without any range check.
template <>
inline xsimd::batch_bool<int32_t, xsimd::default_arch> leadingMask(
    int i,
    const xsimd::default_arch& /*arch*/) {
  const auto& masks = leadingMask32;
  return masks[i];
}

// float lanes: reuses the int32_t table and reinterprets the stored mask
// register as the float batch_bool register type. `i` is used as the table
// index without any range check.
template <>
inline xsimd::batch_bool<float, xsimd::default_arch> leadingMask(
    int i,
    const xsimd::default_arch& /*arch*/) {
  using FloatMask = xsimd::batch_bool<float, xsimd::default_arch>;
  return reinterpret_cast<FloatMask::register_type>(leadingMask32[i].data);
}

// int64_t lanes: returns entry `i` of the shared 64-bit table. `i` is used as
// the table index without any range check.
template <>
inline xsimd::batch_bool<int64_t, xsimd::default_arch> leadingMask(
    int i,
    const xsimd::default_arch& /*arch*/) {
  const auto& masks = leadingMask64;
  return masks[i];
}

// double lanes: reuses the int64_t table and reinterprets the stored mask
// register as the double batch_bool register type. `i` is used as the table
// index without any range check.
template <>
inline xsimd::batch_bool<double, xsimd::default_arch> leadingMask(
    int i,
    const xsimd::default_arch& /*arch*/) {
  using DoubleMask = xsimd::batch_bool<double, xsimd::default_arch>;
  return reinterpret_cast<DoubleMask::register_type>(leadingMask64[i].data);
}

} // namespace detail

// Returns the mask looked up for `n` leading lanes. Values of `n` above the
// lane count are capped to the lane count before the lookup. Negative `n` is
// not handled here and is forwarded to the lookup unchanged.
template <typename T, typename A>
xsimd::batch_bool<T, A> leadingMask(int n, const A& arch) {
  constexpr int kLanes = xsimd::batch_bool<T, A>::size;
  const int index = kLanes < n ? kLanes : n;
  return detail::leadingMask<T, A>(index, arch);
}

namespace detail {
Expand Down
3 changes: 3 additions & 0 deletions velox/common/base/SimdUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ namespace detail {
// Namespace-scope tables aligned to kPadding. They have no initializer here,
// so they start out zero-initialized; whatever fills them is outside this hunk.
alignas(kPadding) int32_t byteSetBits[256][8];
alignas(kPadding) int32_t permute4x64Indices[16][8];

// Definitions of the leading-mask tables declared `extern` in
// SimdUtil-inl.h. Each one is built by the LeadingMask default constructor
// during dynamic initialization of this translation unit.
// NOTE(review): code in another translation unit that runs during its own
// static initialization could read these before they are constructed; confirm
// no such caller exists.
const LeadingMask<int32_t, xsimd::default_arch> leadingMask32;
const LeadingMask<int64_t, xsimd::default_arch> leadingMask64;

} // namespace detail

namespace {
Expand Down
66 changes: 66 additions & 0 deletions velox/common/base/benchmarks/SimdUtilBenchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "velox/common/base/SimdUtil.h"

#include <folly/Benchmark.h>
#include <folly/init/Init.h>

#include <random>

namespace facebook::velox {
namespace {

// Declares an object named `_name` of type `_type`. Its first constructor
// argument is FOLLY_PP_STRINGIZE(_name) (presumably the identifier as a string
// literal), followed by the remaining macro arguments. At least one extra
// argument must be supplied because __VA_ARGS__ follows a comma.
#define VELOX_BENCHMARK(_type, _name, ...) \
_type _name(FOLLY_PP_STRINGIZE(_name), __VA_ARGS__)

/// Benchmark fixture for simd::leadingMask<T>.
///
/// Construction fills a fixed buffer with random lane counts and registers a
/// folly benchmark that calls simd::leadingMask<T> once per buffered input.
/// The registered callback captures `this`, so the object must stay alive (and
/// must not be moved) until the benchmarks have run.
template <typename T>
class LeadingMask {
 public:
  /// @param name Name under which the benchmark is registered with folly.
  /// @param gen Random engine used to draw the input lane counts.
  LeadingMask(const char* name, std::default_random_engine& gen) {
    // Use the two-argument constructor. The one-argument form sets only the
    // lower bound (the upper bound defaults to the maximum int), and narrowing
    // such draws into int8_t produces arbitrary values, including negative
    // ones that index the mask table out of bounds. The range [0, size + 1]
    // also exercises the path where n exceeds the lane count.
    std::uniform_int_distribution<> dist(
        0, static_cast<int>(xsimd::batch<T>::size + 1));
    for (auto& input : inputs_) {
      input = static_cast<int8_t>(dist(gen));
    }
    folly::addBenchmark(__FILE__, name, [this] { return run(); });
  }

 private:
  /// Calls simd::leadingMask<T> for every buffered input and XORs the results
  /// together so the calls cannot be optimized away.
  ///
  /// @return The number of leadingMask calls performed (kSize), handed back to
  ///     folly by the registered callback.
  unsigned run() {
    xsimd::batch_bool<T> ans = {};
    for (const auto n : inputs_) {
      ans = ans ^ simd::leadingMask<T>(n);
    }
    folly::doNotOptimizeAway(ans);
    return kSize;
  }

  // Number of inputs processed per benchmark invocation.
  static constexpr int kSize = 4 << 10;
  // Lane counts in [0, batch size + 1]; int8_t keeps the buffer small.
  int8_t inputs_[kSize];
};

} // namespace
} // namespace facebook::velox

// Entry point: initializes folly, seeds a random engine from
// std::random_device, registers the int32_t and int64_t leadingMask
// benchmarks, and runs them.
int main(int argc, char* argv[]) {
  using namespace facebook::velox;
  folly::Init follyInit(&argc, &argv);

  std::random_device seedSource;
  std::default_random_engine gen(seedSource());

  // The benchmark objects must outlive runBenchmarks(): each one registers a
  // callback that captures its own address.
  VELOX_BENCHMARK(LeadingMask<int32_t>, leadingMaskInt32, gen);
  VELOX_BENCHMARK(LeadingMask<int64_t>, leadingMaskInt64, gen);

  folly::runBenchmarks();
  return 0;
}

0 comments on commit 9f5c96b

Please sign in to comment.