[SYCL][ESIMD] Move a few math operations to SPIR-V intrinsics and sup…

…port new functions (intel#13383) This PR has three types of changes: 1) Move some math operations to SPIR-V intrinsics. They are: ``` __esimd_abs to __spirv_ocl_fabs and __spirv_ocl_s_abs __esimd_fmin to __spirv_ocl_fmin __esimd_fmadd to __spirv_ocl_fma ``` 2) Support three new functions using SPIR-V intrinsics ``` popcount clz ctz ``` There are some more functions I will move to SPIR-V intrinsincs once driver issues are fixed. 3) Remove code that breaks up the fmuladd intrinsic generated by the FE as the drivers we support can handle it now. Signed-off-by: Sarnie, Nick <[email protected]>
AlexeySachkov · Jun 3, 2024 · bcca7a8 · bcca7a8
1 parent 9f1cee5
commit bcca7a8
Show file tree

Hide file tree

Showing 8 changed files with 292 additions and 41 deletions.
diff --git a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp
@@ -1237,21 +1237,6 @@ static Instruction *addCastInstIfNeeded(Instruction *OldI, Instruction *NewI,
   return NewI;
 }
 
-// Translates the following intrinsics:
-//   %res = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
-//   %res = call double @llvm.fmuladd.f64(double %a, double %b, double %c)
-// To
-//   %mul = fmul <type> %a, <type> %b
-//   %res = fadd <type> %mul, <type> %c
-// TODO: Remove when newer GPU driver is used in CI.
-void translateFmuladd(CallInst *CI) {
-  assert(CI->getIntrinsicID() == Intrinsic::fmuladd);
-  IRBuilder<> Bld(CI);
-  auto *Mul = Bld.CreateFMul(CI->getOperand(0), CI->getOperand(1));
-  auto *Res = Bld.CreateFAdd(Mul, CI->getOperand(2));
-  CI->replaceAllUsesWith(Res);
-}
-
 // Translates an LLVM intrinsic to a form, digestable by the BE.
 bool translateLLVMIntrinsic(CallInst *CI) {
   Function *F = CI->getCalledFunction();
@@ -1263,9 +1248,6 @@ bool translateLLVMIntrinsic(CallInst *CI) {
     // no translation - it will be simply removed.
     // TODO: make use of 'assume' info in the BE
     break;
-  case Intrinsic::fmuladd:
-    translateFmuladd(CI);
-    break;
   default:
     return false; // "intrinsic wasn't translated, keep the original call"
   }

diff --git a/llvm/test/SYCLLowerIR/ESIMD/lower_llvm_intrin.ll b/llvm/test/SYCLLowerIR/ESIMD/lower_llvm_intrin.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -passes=LowerESIMD -S < %s | FileCheck %s
 
-; This test checks that LowerESIMD pass correctly lowers some llvm intrinsics
-; which can't be handled by the VC BE.
+; This test checks that LowerESIMD pass does not lower some llvm intrinsics
+; which can now be handled by the VC BE.
 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
 target triple = "spir64-unknown-unknown"
 
@@ -10,17 +10,15 @@ declare double @llvm.fmuladd.f64(double %x, double %y, double %z)
 
 define spir_func float @test_fmuladd_f32(float %x, float %y, float %z) {
   %1 = call float @llvm.fmuladd.f32(float %x, float %y, float %z)
-; CHECK: %[[A:[0-9a-zA-Z\._]+]] = fmul float %x, %y
-; CHECK: %[[B:[0-9a-zA-Z\._]+]] = fadd float %[[A]], %z
+; CHECK: %[[A:[0-9a-zA-Z\._]+]] = call float @llvm.fmuladd.f32(float %x, float %y, float %z)
   ret float %1
-; CHECK: ret float %[[B]]
+; CHECK: ret float %[[A]]
 }
 
 define spir_func double @test_fmuladd_f64(double %x, double %y, double %z) {
   %1 = call double @llvm.fmuladd.f64(double %x, double %y, double %z)
-; CHECK: %[[A:[0-9a-zA-Z\._]+]] = fmul double %x, %y
-; CHECK: %[[B:[0-9a-zA-Z\._]+]] = fadd double %[[A]], %z
+; CHECK: %[[A:[0-9a-zA-Z\._]+]] = call double @llvm.fmuladd.f64(double %x, double %y, double %z)
   ret double %1
-; CHECK: ret double %[[B]]
+; CHECK: ret double %[[A]]
 }
 
diff --git a/sycl/include/sycl/ext/intel/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/esimd/detail/math_intrin.hpp
@@ -72,6 +72,19 @@ template <typename T, int N>
 __ESIMD_INTRIN __ESIMD_raw_vec_t(T, N)
     __spirv_ocl_native_powr(__ESIMD_raw_vec_t(T, N), __ESIMD_raw_vec_t(T, N));
 
+template <typename T, int N>
+__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N)
+    __spirv_ocl_fabs(__ESIMD_raw_vec_t(T, N)) __ESIMD_INTRIN_END;
+
+template <typename T, int N>
+__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N)
+    __spirv_ocl_s_abs(__ESIMD_raw_vec_t(T, N)) __ESIMD_INTRIN_END;
+
+template <typename T, int N>
+__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N)
+    __spirv_ocl_fmin(__ESIMD_raw_vec_t(T, N),
+                     __ESIMD_raw_vec_t(T, N)) __ESIMD_INTRIN_END;
+
 // saturation intrinsics
 template <typename T0, typename T1, int SZ>
 __ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ)
@@ -101,10 +114,6 @@ template <typename T0, typename T1, int SZ>
 __ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ)
     __esimd_sstrunc_sat(__ESIMD_raw_vec_t(T1, SZ) src) __ESIMD_INTRIN_END;
 
-template <typename T, int SZ>
-__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ)
-    __esimd_abs(__ESIMD_raw_vec_t(T, SZ) src0) __ESIMD_INTRIN_END;
-
 /// 3 kinds of max
 template <typename T, int SZ>
 __ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ)
@@ -119,11 +128,7 @@ __ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ)
     __esimd_smax(__ESIMD_raw_vec_t(T, SZ) src0,
                  __ESIMD_raw_vec_t(T, SZ) src1) __ESIMD_INTRIN_END;
 
-/// 3 kinds of min
-template <typename T, int SZ>
-__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ)
-    __esimd_fmin(__ESIMD_raw_vec_t(T, SZ) src0,
-                 __ESIMD_raw_vec_t(T, SZ) src1) __ESIMD_INTRIN_END;
+/// 3 kinds of min, the missing fmin uses spir-v instrinsics above
 template <typename T, int SZ>
 __ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ)
     __esimd_umin(__ESIMD_raw_vec_t(T, SZ) src0,

diff --git a/sycl/include/sycl/ext/intel/esimd/math.hpp b/sycl/include/sycl/ext/intel/esimd/math.hpp
@@ -97,7 +97,11 @@ namespace detail {
 template <typename TRes, typename TArg, int SZ>
 ESIMD_NODEBUG ESIMD_INLINE simd<TRes, SZ>
 __esimd_abs_common_internal(simd<TArg, SZ> src0) {
-  simd<TArg, SZ> Result = simd<TArg, SZ>(__esimd_abs<TArg, SZ>(src0.data()));
+  simd<TArg, SZ> Result;
+  if constexpr (detail::is_generic_floating_point_v<TArg>)
+    Result = simd<TArg, SZ>(__spirv_ocl_fabs<TArg, SZ>(src0.data()));
+  else
+    Result = simd<TArg, SZ>(__spirv_ocl_s_abs<TArg, SZ>(src0.data()));
   return convert<TRes>(Result);
 }
 
@@ -266,7 +270,7 @@ __ESIMD_API simd<T, SZ>(min)(simd<T, SZ> src0, simd<T, SZ> src1, Sat sat = {}) {
   constexpr bool is_sat = std::is_same_v<Sat, saturation_on_tag>;
 
   if constexpr (std::is_floating_point<T>::value) {
-    auto Result = __esimd_fmin<T, SZ>(src0.data(), src1.data());
+    auto Result = __spirv_ocl_fmin<T, SZ>(src0.data(), src1.data());
     if constexpr (is_sat)
       Result = __esimd_sat<T, T, SZ>(Result);
     return simd<T, SZ>(Result);
@@ -1475,7 +1479,7 @@ template <typename T0, typename T1, int SZ> struct esimd_apply_reduced_min {
   template <typename... T>
   simd<T0, SZ> operator()(simd<T1, SZ> v1, simd<T1, SZ> v2) {
     if constexpr (std::is_floating_point<T1>::value) {
-      return __esimd_fmin<T1, SZ>(v1.data(), v2.data());
+      return __spirv_ocl_fmin<T1, SZ>(v1.data(), v2.data());
     } else if constexpr (std::is_unsigned<T1>::value) {
       return __esimd_umin<T1, SZ>(v1.data(), v2.data());
     } else {

diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp
@@ -112,8 +112,19 @@ __ESIMD_INTRIN __ESIMD_raw_vec_t(sycl::half, N)
 
 template <typename T, int N>
 __ESIMD_INTRIN __ESIMD_raw_vec_t(T, N)
-    __esimd_fmadd(__ESIMD_raw_vec_t(T, N) a, __ESIMD_raw_vec_t(T, N) b,
-                  __ESIMD_raw_vec_t(T, N) c) __ESIMD_INTRIN_END;
+    __spirv_ocl_fma(__ESIMD_raw_vec_t(T, N) a, __ESIMD_raw_vec_t(T, N) b,
+                    __ESIMD_raw_vec_t(T, N) c) __ESIMD_INTRIN_END;
+template <typename T, int N>
+__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N)
+    __spirv_ocl_popcount(__ESIMD_raw_vec_t(T, N) src0) __ESIMD_INTRIN_END;
+
+template <typename T, int N>
+__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N)
+    __spirv_ocl_ctz(__ESIMD_raw_vec_t(T, N) src0) __ESIMD_INTRIN_END;
+
+template <typename T, int N>
+__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N)
+    __spirv_ocl_clz(__ESIMD_raw_vec_t(T, N) src0) __ESIMD_INTRIN_END;
 
 #undef __ESIMD_raw_vec_t
 #undef __ESIMD_cpp_vec_t

diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp
@@ -330,6 +330,40 @@ __ESIMD_API std::enable_if_t<
   return __ESIMD_NS::ror<T0, T1, T2>(src0, src1);
 }
 
+/// Count the number of 1-bits.
+/// @tparam T element type.
+/// @tparam N vector length.
+/// @return the popcounted vector.
+template <typename T, int N>
+__ESIMD_API std::enable_if_t<std::is_integral_v<T> && sizeof(T) < 8,
+                             __ESIMD_NS::simd<T, N>>
+popcount(__ESIMD_NS::simd<T, N> vec) {
+  return __spirv_ocl_popcount<T, N>(vec.data());
+}
+
+/// Count the number of leading zeros.
+/// If the input is 0, the number of total bits is returned.
+/// @tparam T element type.
+/// @tparam N vector length.
+/// @return vector with number of leading zeros of the input vector.
+template <typename T, int N>
+__ESIMD_API std::enable_if_t<std::is_integral_v<T> && sizeof(T) < 8,
+                             __ESIMD_NS::simd<T, N>>
+clz(__ESIMD_NS::simd<T, N> vec) {
+  return __spirv_ocl_clz<T, N>(vec.data());
+}
+
+/// Count the number of trailing zeros.
+/// @tparam T element type.
+/// @tparam N vector length.
+/// @return vector with number of trailing zeros of the input vector.
+template <typename T, int N>
+__ESIMD_API std::enable_if_t<std::is_integral_v<T> && sizeof(T) < 8,
+                             __ESIMD_NS::simd<T, N>>
+ctz(__ESIMD_NS::simd<T, N> vec) {
+  return __spirv_ocl_ctz<T, N>(vec.data());
+}
+
 /// @} sycl_esimd_bitmanip
 
 /// @addtogroup sycl_esimd_math
@@ -1671,7 +1705,7 @@ ESIMD_INLINE __ESIMD_NS::simd<T, N> fma(__ESIMD_NS::simd<T, N> a,
   static_assert(__ESIMD_DNS::is_generic_floating_point_v<T>,
                 "fma only supports floating point types");
   using CppT = __ESIMD_DNS::element_type_traits<T>::EnclosingCppT;
-  auto Ret = __esimd_fmadd<__ESIMD_DNS::__raw_t<CppT>, N>(
+  auto Ret = __spirv_ocl_fma<__ESIMD_DNS::__raw_t<CppT>, N>(
       __ESIMD_DNS::convert_vector<CppT, T, N>(a.data()),
       __ESIMD_DNS::convert_vector<CppT, T, N>(b.data()),
       __ESIMD_DNS::convert_vector<CppT, T, N>(c.data()));

diff --git a/sycl/test-e2e/ESIMD/clz_ctz.cpp b/sycl/test-e2e/ESIMD/clz_ctz.cpp
@@ -0,0 +1,116 @@
+//==---------------- clz_ctz.cpp - DPC++ ESIMD on-device test -------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{run} %t.out
+
+#include "esimd_test_utils.hpp"
+
+#include <iostream>
+#include <sycl/detail/core.hpp>
+#include <sycl/ext/intel/esimd.hpp>
+
+using namespace sycl;
+
+template <typename T, bool CLZ> bool test(queue &q) {
+  std::cout << "Running " << (CLZ ? "CLZ " : "CTZ ")
+            << esimd_test::type_name<T>() << std::endl;
+  constexpr unsigned VL = 16;
+  constexpr unsigned Size = 1024 * 128;
+
+  T *A = new T[Size];
+  T *B = new T[Size];
+
+  for (unsigned i = 0; i < Size; ++i) {
+    A[i] = i;
+    B[i] = 0;
+  }
+
+  try {
+    buffer<T, 1> bufa(A, range<1>(Size));
+    buffer<T, 1> bufb(B, range<1>(Size));
+
+    // We need that many workgroups
+    range<1> GlobalRange{Size / VL};
+
+    // We need that many threads in each group
+    range<1> LocalRange{1};
+
+    queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
+
+    auto e = q.submit([&](handler &cgh) {
+      auto PA = bufa.template get_access<access::mode::read>(cgh);
+      auto PB = bufb.template get_access<access::mode::write>(cgh);
+      cgh.parallel_for(GlobalRange * LocalRange,
+                       [=](id<1> i) SYCL_ESIMD_KERNEL {
+                         using namespace sycl::ext::intel::esimd;
+                         unsigned int offset = i * VL * sizeof(T);
+                         simd<T, VL> va;
+                         va.copy_from(PA, offset);
+                         simd<T, VL> vb;
+                         if constexpr (CLZ)
+                           vb = __ESIMD_ENS::clz(va);
+                         else
+                           vb = __ESIMD_ENS::ctz(va);
+                         vb.copy_to(PB, offset);
+                       });
+    });
+    e.wait();
+  } catch (sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+
+    delete[] A;
+    delete[] B;
+    return false;
+  }
+
+  int err_cnt = 0;
+
+  for (unsigned i = 0; i < Size; ++i) {
+    int Expected =
+        CLZ ? (i == 0 ? sizeof(T) * 8 : __builtin_clz(i)) : __builtin_ctz(i);
+    int Computed = B[i];
+    if (Expected != Computed && ++err_cnt < 10)
+      std::cout << "Failure at " << std::to_string(i)
+                << ": Expected: " << std::to_string(Expected)
+                << " Computed: " << std::to_string(Computed) << std::endl;
+  }
+  if (err_cnt > 0) {
+    std::cout << "  pass rate: "
+              << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% ("
+              << (Size - err_cnt) << "/" << Size << ")\n";
+  }
+
+  delete[] A;
+  delete[] B;
+
+  std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
+  return err_cnt == 0;
+}
+
+int main() {
+  bool Passed = true;
+  queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
+  esimd_test::printTestLabel(q);
+  Passed &= test<uint32_t, true>(q);
+  Passed &= test<int32_t, true>(q);
+  Passed &= test<uint32_t, false>(q);
+  Passed &= test<int32_t, false>(q);
+// TODO: Enable once GPU driver issue is fixed
+#if 0
+   Passed &= test<uint8_t, true>(q);
+  Passed &= test<int8_t, true>(q);
+  Passed &= test<uint8_t, false>(q);
+  Passed &= test<int8_t, false>(q);
+
+  Passed &= test<uint16_t, true>(q);
+  Passed &= test<int16_t, true>(q);
+  Passed &= test<uint16_t, false>(q);
+  Passed &= test<int16_t, false>(q);
+#endif
+  return !Passed;
+}