Skip to content

Commit

Permalink
[SYCL][ESIMD] Move a few math operations to SPIR-V intrinsics and sup…
Browse files Browse the repository at this point in the history
…port new functions (intel#13383)

This PR has three types of changes:

1) Move some math operations to SPIR-V intrinsics. They are:
```
__esimd_abs to __spirv_ocl_fabs and __spirv_ocl_s_abs
__esimd_fmin to __spirv_ocl_fmin
__esimd_fmadd to __spirv_ocl_fma
```

2) Support three new functions using SPIR-V intrinsics
```
popcount
clz
ctz
```
There are some more functions I will move to SPIR-V intrinsincs once
driver issues are fixed.

3) Remove code that breaks up the fmuladd intrinsic generated by the FE
as the drivers we support can handle it now.

Signed-off-by: Sarnie, Nick <[email protected]>
  • Loading branch information
sarnex authored Jun 3, 2024
1 parent 9f1cee5 commit bcca7a8
Show file tree
Hide file tree
Showing 8 changed files with 292 additions and 41 deletions.
18 changes: 0 additions & 18 deletions llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1237,21 +1237,6 @@ static Instruction *addCastInstIfNeeded(Instruction *OldI, Instruction *NewI,
return NewI;
}

// Translates the following intrinsics:
// %res = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
// %res = call double @llvm.fmuladd.f64(double %a, double %b, double %c)
// To
// %mul = fmul <type> %a, <type> %b
// %res = fadd <type> %mul, <type> %c
// TODO: Remove when newer GPU driver is used in CI.
void translateFmuladd(CallInst *CI) {
assert(CI->getIntrinsicID() == Intrinsic::fmuladd);
IRBuilder<> Bld(CI);
auto *Mul = Bld.CreateFMul(CI->getOperand(0), CI->getOperand(1));
auto *Res = Bld.CreateFAdd(Mul, CI->getOperand(2));
CI->replaceAllUsesWith(Res);
}

// Translates an LLVM intrinsic to a form, digestable by the BE.
bool translateLLVMIntrinsic(CallInst *CI) {
Function *F = CI->getCalledFunction();
Expand All @@ -1263,9 +1248,6 @@ bool translateLLVMIntrinsic(CallInst *CI) {
// no translation - it will be simply removed.
// TODO: make use of 'assume' info in the BE
break;
case Intrinsic::fmuladd:
translateFmuladd(CI);
break;
default:
return false; // "intrinsic wasn't translated, keep the original call"
}
Expand Down
14 changes: 6 additions & 8 deletions llvm/test/SYCLLowerIR/ESIMD/lower_llvm_intrin.ll
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
; RUN: opt -passes=LowerESIMD -S < %s | FileCheck %s

; This test checks that LowerESIMD pass correctly lowers some llvm intrinsics
; which can't be handled by the VC BE.
; This test checks that LowerESIMD pass does not lower some llvm intrinsics
; which can now be handled by the VC BE.
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
target triple = "spir64-unknown-unknown"

Expand All @@ -10,17 +10,15 @@ declare double @llvm.fmuladd.f64(double %x, double %y, double %z)

define spir_func float @test_fmuladd_f32(float %x, float %y, float %z) {
%1 = call float @llvm.fmuladd.f32(float %x, float %y, float %z)
; CHECK: %[[A:[0-9a-zA-Z\._]+]] = fmul float %x, %y
; CHECK: %[[B:[0-9a-zA-Z\._]+]] = fadd float %[[A]], %z
; CHECK: %[[A:[0-9a-zA-Z\._]+]] = call float @llvm.fmuladd.f32(float %x, float %y, float %z)
ret float %1
; CHECK: ret float %[[B]]
; CHECK: ret float %[[A]]
}

define spir_func double @test_fmuladd_f64(double %x, double %y, double %z) {
%1 = call double @llvm.fmuladd.f64(double %x, double %y, double %z)
; CHECK: %[[A:[0-9a-zA-Z\._]+]] = fmul double %x, %y
; CHECK: %[[B:[0-9a-zA-Z\._]+]] = fadd double %[[A]], %z
; CHECK: %[[A:[0-9a-zA-Z\._]+]] = call double @llvm.fmuladd.f64(double %x, double %y, double %z)
ret double %1
; CHECK: ret double %[[B]]
; CHECK: ret double %[[A]]
}

23 changes: 14 additions & 9 deletions sycl/include/sycl/ext/intel/esimd/detail/math_intrin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,19 @@ template <typename T, int N>
__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N)
__spirv_ocl_native_powr(__ESIMD_raw_vec_t(T, N), __ESIMD_raw_vec_t(T, N));

template <typename T, int N>
__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N)
__spirv_ocl_fabs(__ESIMD_raw_vec_t(T, N)) __ESIMD_INTRIN_END;

template <typename T, int N>
__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N)
__spirv_ocl_s_abs(__ESIMD_raw_vec_t(T, N)) __ESIMD_INTRIN_END;

template <typename T, int N>
__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N)
__spirv_ocl_fmin(__ESIMD_raw_vec_t(T, N),
__ESIMD_raw_vec_t(T, N)) __ESIMD_INTRIN_END;

// saturation intrinsics
template <typename T0, typename T1, int SZ>
__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ)
Expand Down Expand Up @@ -101,10 +114,6 @@ template <typename T0, typename T1, int SZ>
__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ)
__esimd_sstrunc_sat(__ESIMD_raw_vec_t(T1, SZ) src) __ESIMD_INTRIN_END;

template <typename T, int SZ>
__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ)
__esimd_abs(__ESIMD_raw_vec_t(T, SZ) src0) __ESIMD_INTRIN_END;

/// 3 kinds of max
template <typename T, int SZ>
__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ)
Expand All @@ -119,11 +128,7 @@ __ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ)
__esimd_smax(__ESIMD_raw_vec_t(T, SZ) src0,
__ESIMD_raw_vec_t(T, SZ) src1) __ESIMD_INTRIN_END;

/// 3 kinds of min
template <typename T, int SZ>
__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ)
__esimd_fmin(__ESIMD_raw_vec_t(T, SZ) src0,
__ESIMD_raw_vec_t(T, SZ) src1) __ESIMD_INTRIN_END;
/// 3 kinds of min, the missing fmin uses spir-v instrinsics above
template <typename T, int SZ>
__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ)
__esimd_umin(__ESIMD_raw_vec_t(T, SZ) src0,
Expand Down
10 changes: 7 additions & 3 deletions sycl/include/sycl/ext/intel/esimd/math.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,11 @@ namespace detail {
template <typename TRes, typename TArg, int SZ>
ESIMD_NODEBUG ESIMD_INLINE simd<TRes, SZ>
__esimd_abs_common_internal(simd<TArg, SZ> src0) {
simd<TArg, SZ> Result = simd<TArg, SZ>(__esimd_abs<TArg, SZ>(src0.data()));
simd<TArg, SZ> Result;
if constexpr (detail::is_generic_floating_point_v<TArg>)
Result = simd<TArg, SZ>(__spirv_ocl_fabs<TArg, SZ>(src0.data()));
else
Result = simd<TArg, SZ>(__spirv_ocl_s_abs<TArg, SZ>(src0.data()));
return convert<TRes>(Result);
}

Expand Down Expand Up @@ -266,7 +270,7 @@ __ESIMD_API simd<T, SZ>(min)(simd<T, SZ> src0, simd<T, SZ> src1, Sat sat = {}) {
constexpr bool is_sat = std::is_same_v<Sat, saturation_on_tag>;

if constexpr (std::is_floating_point<T>::value) {
auto Result = __esimd_fmin<T, SZ>(src0.data(), src1.data());
auto Result = __spirv_ocl_fmin<T, SZ>(src0.data(), src1.data());
if constexpr (is_sat)
Result = __esimd_sat<T, T, SZ>(Result);
return simd<T, SZ>(Result);
Expand Down Expand Up @@ -1475,7 +1479,7 @@ template <typename T0, typename T1, int SZ> struct esimd_apply_reduced_min {
template <typename... T>
simd<T0, SZ> operator()(simd<T1, SZ> v1, simd<T1, SZ> v2) {
if constexpr (std::is_floating_point<T1>::value) {
return __esimd_fmin<T1, SZ>(v1.data(), v2.data());
return __spirv_ocl_fmin<T1, SZ>(v1.data(), v2.data());
} else if constexpr (std::is_unsigned<T1>::value) {
return __esimd_umin<T1, SZ>(v1.data(), v2.data());
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,19 @@ __ESIMD_INTRIN __ESIMD_raw_vec_t(sycl::half, N)

template <typename T, int N>
__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N)
__esimd_fmadd(__ESIMD_raw_vec_t(T, N) a, __ESIMD_raw_vec_t(T, N) b,
__ESIMD_raw_vec_t(T, N) c) __ESIMD_INTRIN_END;
__spirv_ocl_fma(__ESIMD_raw_vec_t(T, N) a, __ESIMD_raw_vec_t(T, N) b,
__ESIMD_raw_vec_t(T, N) c) __ESIMD_INTRIN_END;
template <typename T, int N>
__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N)
__spirv_ocl_popcount(__ESIMD_raw_vec_t(T, N) src0) __ESIMD_INTRIN_END;

template <typename T, int N>
__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N)
__spirv_ocl_ctz(__ESIMD_raw_vec_t(T, N) src0) __ESIMD_INTRIN_END;

template <typename T, int N>
__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N)
__spirv_ocl_clz(__ESIMD_raw_vec_t(T, N) src0) __ESIMD_INTRIN_END;

#undef __ESIMD_raw_vec_t
#undef __ESIMD_cpp_vec_t
Expand Down
36 changes: 35 additions & 1 deletion sycl/include/sycl/ext/intel/experimental/esimd/math.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,40 @@ __ESIMD_API std::enable_if_t<
return __ESIMD_NS::ror<T0, T1, T2>(src0, src1);
}

/// Count the number of 1-bits.
/// @tparam T element type.
/// @tparam N vector length.
/// @return the popcounted vector.
template <typename T, int N>
__ESIMD_API std::enable_if_t<std::is_integral_v<T> && sizeof(T) < 8,
__ESIMD_NS::simd<T, N>>
popcount(__ESIMD_NS::simd<T, N> vec) {
return __spirv_ocl_popcount<T, N>(vec.data());
}

/// Count the number of leading zeros.
/// If the input is 0, the number of total bits is returned.
/// @tparam T element type.
/// @tparam N vector length.
/// @return vector with number of leading zeros of the input vector.
template <typename T, int N>
__ESIMD_API std::enable_if_t<std::is_integral_v<T> && sizeof(T) < 8,
__ESIMD_NS::simd<T, N>>
clz(__ESIMD_NS::simd<T, N> vec) {
return __spirv_ocl_clz<T, N>(vec.data());
}

/// Count the number of trailing zeros.
/// @tparam T element type.
/// @tparam N vector length.
/// @return vector with number of trailing zeros of the input vector.
template <typename T, int N>
__ESIMD_API std::enable_if_t<std::is_integral_v<T> && sizeof(T) < 8,
__ESIMD_NS::simd<T, N>>
ctz(__ESIMD_NS::simd<T, N> vec) {
return __spirv_ocl_ctz<T, N>(vec.data());
}

/// @} sycl_esimd_bitmanip

/// @addtogroup sycl_esimd_math
Expand Down Expand Up @@ -1671,7 +1705,7 @@ ESIMD_INLINE __ESIMD_NS::simd<T, N> fma(__ESIMD_NS::simd<T, N> a,
static_assert(__ESIMD_DNS::is_generic_floating_point_v<T>,
"fma only supports floating point types");
using CppT = __ESIMD_DNS::element_type_traits<T>::EnclosingCppT;
auto Ret = __esimd_fmadd<__ESIMD_DNS::__raw_t<CppT>, N>(
auto Ret = __spirv_ocl_fma<__ESIMD_DNS::__raw_t<CppT>, N>(
__ESIMD_DNS::convert_vector<CppT, T, N>(a.data()),
__ESIMD_DNS::convert_vector<CppT, T, N>(b.data()),
__ESIMD_DNS::convert_vector<CppT, T, N>(c.data()));
Expand Down
116 changes: 116 additions & 0 deletions sycl/test-e2e/ESIMD/clz_ctz.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
//==---------------- clz_ctz.cpp - DPC++ ESIMD on-device test -------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
// RUN: %{run} %t.out

#include "esimd_test_utils.hpp"

#include <iostream>
#include <sycl/detail/core.hpp>
#include <sycl/ext/intel/esimd.hpp>

using namespace sycl;

template <typename T, bool CLZ> bool test(queue &q) {
std::cout << "Running " << (CLZ ? "CLZ " : "CTZ ")
<< esimd_test::type_name<T>() << std::endl;
constexpr unsigned VL = 16;
constexpr unsigned Size = 1024 * 128;

T *A = new T[Size];
T *B = new T[Size];

for (unsigned i = 0; i < Size; ++i) {
A[i] = i;
B[i] = 0;
}

try {
buffer<T, 1> bufa(A, range<1>(Size));
buffer<T, 1> bufb(B, range<1>(Size));

// We need that many workgroups
range<1> GlobalRange{Size / VL};

// We need that many threads in each group
range<1> LocalRange{1};

queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());

auto e = q.submit([&](handler &cgh) {
auto PA = bufa.template get_access<access::mode::read>(cgh);
auto PB = bufb.template get_access<access::mode::write>(cgh);
cgh.parallel_for(GlobalRange * LocalRange,
[=](id<1> i) SYCL_ESIMD_KERNEL {
using namespace sycl::ext::intel::esimd;
unsigned int offset = i * VL * sizeof(T);
simd<T, VL> va;
va.copy_from(PA, offset);
simd<T, VL> vb;
if constexpr (CLZ)
vb = __ESIMD_ENS::clz(va);
else
vb = __ESIMD_ENS::ctz(va);
vb.copy_to(PB, offset);
});
});
e.wait();
} catch (sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';

delete[] A;
delete[] B;
return false;
}

int err_cnt = 0;

for (unsigned i = 0; i < Size; ++i) {
int Expected =
CLZ ? (i == 0 ? sizeof(T) * 8 : __builtin_clz(i)) : __builtin_ctz(i);
int Computed = B[i];
if (Expected != Computed && ++err_cnt < 10)
std::cout << "Failure at " << std::to_string(i)
<< ": Expected: " << std::to_string(Expected)
<< " Computed: " << std::to_string(Computed) << std::endl;
}
if (err_cnt > 0) {
std::cout << " pass rate: "
<< ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% ("
<< (Size - err_cnt) << "/" << Size << ")\n";
}

delete[] A;
delete[] B;

std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
return err_cnt == 0;
}

int main() {
bool Passed = true;
queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
esimd_test::printTestLabel(q);
Passed &= test<uint32_t, true>(q);
Passed &= test<int32_t, true>(q);
Passed &= test<uint32_t, false>(q);
Passed &= test<int32_t, false>(q);
// TODO: Enable once GPU driver issue is fixed
#if 0
Passed &= test<uint8_t, true>(q);
Passed &= test<int8_t, true>(q);
Passed &= test<uint8_t, false>(q);
Passed &= test<int8_t, false>(q);

Passed &= test<uint16_t, true>(q);
Passed &= test<int16_t, true>(q);
Passed &= test<uint16_t, false>(q);
Passed &= test<int16_t, false>(q);
#endif
return !Passed;
}
Loading

0 comments on commit bcca7a8

Please sign in to comment.