Info
-
Did you know about intrisincts to support SIMD (Single Instruction, Multiple Data) instructions?
Example
#include <immintrin.h>
int main() {
const std::vector a = {1, 2, 3, 4};
const std::vector b = {5, 6, 7, 8};
const auto va = _mm_loadu_si128((__m128i*)a.data());
const auto vb = _mm_loadu_si128((__m128i*)b.data());
const auto result = _mm_add_epi32(va, vb);
std::vector<int> v(a.size());
_mm_storeu_si128((__m128i*)v.data(), result);
assert((std::vector{1 + 5, 2 + 6, 3 + 7, 4 + 8} == v));
}
Puzzle
Can you implement a function which computes the dot product of the two arrays using SIMD instructions?
[[nodiscard]] constexpr auto dot_product(const auto& lhs, const auto& rhs); // TODO
int main() {
using namespace boost::ut;
"simd.dot_product empty"_test = [] {
const std::vector<float> a{};
const std::vector<float> b{};
expect(0_i == dot_product(a, b));
};
"simd.dot_product one"_test = [] {
const std::vector a = {1.f};
const std::vector b = {3.f};
expect(_i(1*3) == dot_product(a, b));
};
"simd.dot_product many"_test = [] {
const std::vector a = {1.f, 2.f, 3.f, 4.f};
const std::vector b = {5.f, 6.f, 7.f, 8.f};
expect(_i(1*5+2*6+3*7+4*8) == dot_product(a, b));
};
}
Solutions
namespace impl{
template<typename T>
struct simd{
static constexpr auto dot_product(const T& lhs, const T& rhs) -> void
{
static_assert(false);
}
};
template<>
struct simd<std::vector<float>>{
[[nodiscard]] static constexpr auto dot_product(const std::vector<float>& lhs, const std::vector<float>& rhs) -> float
{
std::array<float, 4> out = {0.0f, 0.0f, 0.0f, 0.0f};\
auto vc = _mm_loadu_ps(out.data());
for (auto i = 0uz; i < std::size(lhs); i += 4)
{
const auto va = _mm_loadu_ps(lhs.data() + i);
const auto vb = _mm_loadu_ps(rhs.data() + i);
vc = _mm_fmadd_ps(va, vb, vc);
}
_mm_store_ps(out.data(), vc);
return out.at(0) + out.at(1) + out.at(2) + out.at(3);
}
};
}
[[nodiscard]] constexpr auto dot_product(const auto& lhs, const auto& rhs)
{
return impl::simd<std::remove_cvref_t<decltype(lhs)>>::dot_product(lhs, rhs);
}
static inline float _mm_reduce_add_ps(__m128 x128) {
const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
return _mm_cvtss_f32(x32);
}
[[nodiscard]] constexpr auto dot_product(const auto& lhs, const auto& rhs) {
auto lhs_vector = _mm_load_ps(lhs.data());
auto rhs_vector = _mm_load_ps(rhs.data());
auto output = _mm_fmadd_ps(lhs_vector, rhs_vector, _mm_setzero_ps());
return _mm_reduce_add_ps(output);
}
[[nodiscard]] constexpr auto dot_product(const auto& lhs, const auto& rhs) {
const __m128 vlhs = _mm_load_ps(lhs.data());
const __m128 vrhs = _mm_load_ps(rhs.data());
const __m128 mul = _mm_mul_ps( vlhs, vrhs );
const __m128 r2 = _mm_add_ps( mul, _mm_movehl_ps( mul, mul ) );
const __m128 r1 = _mm_add_ss( r2, _mm_movehdup_ps( r2 ) );
return _mm_cvtss_f32( r1 );
}
[[nodiscard]] constexpr auto dot_product(const auto& lhs, const auto& rhs){
const auto va = _mm256_load_ps(lhs.data());
const auto vb = _mm256_load_ps(rhs.data());
const auto result = _mm256_dp_ps(va,vb,0xFF);
return (float)result[0];
}