Skip to content

Commit

Permalink
Optimize simd::leadingMask 10 times faster (#7736)
Browse files Browse the repository at this point in the history
Summary: Pull Request resolved: #7736

Differential Revision: D51555527
  • Loading branch information
Yuhta authored and facebook-github-bot committed Nov 27, 2023
1 parent 9dbce56 commit 3d2dfa2
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 8 deletions.
69 changes: 61 additions & 8 deletions velox/common/base/SimdUtil-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -242,19 +242,72 @@ int32_t indicesOfSetBits(
return result - originalResult;
}

namespace detail {

template <typename T, typename A>
xsimd::batch_bool<T, A> leadingMask(int n, const A&) {
constexpr int N = xsimd::batch_bool<T, A>::size;
static const auto kMemo = ({
std::array<xsimd::batch_bool<T, A>, N> memo;
struct LeadingMask {
LeadingMask() {
bool tmp[N]{};
for (int i = 0; i < N; ++i) {
memo[i] = xsimd::batch_bool<T, A>::load_unaligned(tmp);
memo_[i] = xsimd::batch_bool<T, A>::load_unaligned(tmp);
tmp[i] = true;
}
memo;
});
return LIKELY(n >= N) ? xsimd::batch_bool<T, A>(true) : kMemo[n];
memo_[N] = xsimd::batch_bool<T, A>::load_unaligned(tmp);
}

xsimd::batch_bool<T, A> operator[](size_t i) const {
return memo_[i];
}

private:
static constexpr int N = xsimd::batch_bool<T, A>::size;
xsimd::batch_bool<T, A> memo_[N + 1];
};

extern const LeadingMask<int32_t, xsimd::default_arch> leadingMask32;
extern const LeadingMask<int64_t, xsimd::default_arch> leadingMask64;

template <typename T, typename A>
xsimd::batch_bool<T, xsimd::default_arch> leadingMask(int i, const A&);

template <>
inline xsimd::batch_bool<int32_t, xsimd::default_arch> leadingMask(
int i,
const xsimd::default_arch&) {
return leadingMask32[i];
}

template <>
inline xsimd::batch_bool<float, xsimd::default_arch> leadingMask(
int i,
const xsimd::default_arch&) {
return reinterpret_cast<
xsimd::batch_bool<float, xsimd::default_arch>::register_type>(
leadingMask32[i].data);
}

template <>
inline xsimd::batch_bool<int64_t, xsimd::default_arch> leadingMask(
int i,
const xsimd::default_arch&) {
return leadingMask64[i];
}

template <>
inline xsimd::batch_bool<double, xsimd::default_arch> leadingMask(
int i,
const xsimd::default_arch&) {
return reinterpret_cast<
xsimd::batch_bool<double, xsimd::default_arch>::register_type>(
leadingMask64[i].data);
}

} // namespace detail

template <typename T, typename A>
xsimd::batch_bool<T, A> leadingMask(int n, const A& arch) {
constexpr int N = xsimd::batch_bool<T, A>::size;
return detail::leadingMask<T, A>(std::min(n, N), arch);
}

namespace detail {
Expand Down
3 changes: 3 additions & 0 deletions velox/common/base/SimdUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ namespace detail {
alignas(kPadding) int32_t byteSetBits[256][8];
alignas(kPadding) int32_t permute4x64Indices[16][8];

const LeadingMask<int32_t, xsimd::default_arch> leadingMask32;
const LeadingMask<int64_t, xsimd::default_arch> leadingMask64;

} // namespace detail

namespace {
Expand Down

0 comments on commit 3d2dfa2

Please sign in to comment.