From 6da4d2e77fd3dd7ba3c9099bb42bcddfda5b4dd5 Mon Sep 17 00:00:00 2001 From: PietroGhg <38155419+PietroGhg@users.noreply.github.com> Date: Thu, 7 Sep 2023 10:21:24 +0200 Subject: [PATCH] [SYCL][NATIVECPU][LIBCLC] Use libclc for SYCL Native CPU (#10970) This PR allows linking to libclc when compiling for SYCL Native CPU. Currently only the `x86_64-unknown-linux-gnu` target triple is supported, additional target triples (and possibly a more versatile way of setting them) will come with follow up PRs. Some useful information for reviewing: * We start using an `AddrSpaceMap` (set in `TargetInfo.cpp`) because the mangled names emitted by the device compiler need to match with the names provided by `libclc`. The AddressSpaceMap is taken from the `PTX` Target. * Changes in `Driver` are needed to find and link to `libclc`. * `libclc/ptx-nvidiacl/libspirv/atomic/loadstore_helpers.ll` has been split into 4 modules, one for each memory ordering constraint. Copies of these modules have been added in `generic` (because some functions in `generic/libspirv/atomic` needed them), and the module split allows to specialize the file for targets that may not support some orderings. Currently only a couple of function for `acquire` and `seq_cst` have been implemented for `generic`, but the others will be implemented in a follow up PR. * We've added a target in `libclc` for `x86_64-unknown-linux`. This has been done because some math builtins in `generic` have been defined as ``` typedef char vec __attribute__((ext_vector_type(8))); __attribute__((overloadable)) vec __clc_native_popcount(vec x) __asm("llvm.ctpop" ".v16i" "8"); vec call(vec x) { return __clc_native_popcount(x); } ``` While this approach conveniently allows to call directly LLVM intrinsics, it does seem to play well with the ABI for `x86_64-unknown-linux`, since it leads to this IR: ``` define dso_local double @call(double noundef %x.coerce) #0 { entry: %0 = bitcast double %x.coerce to <8 x i8> %1 = bitcast <8 x i8> %0 to double %call = call double @llvm.ctpop.v8i8(double noundef %1) #8 %2 = bitcast double %call to <8 x i8> %3 = bitcast <8 x i8> %2 to double ret double %3 } ``` Which is invalid because `lvm.ctpop.v8i8` expect a vector of `i8` and not a `double`, leading to failing asserts in the compiler that prevented from building `libclc`. As a temporary work around we have added empty files that override the files in `generic` when building for `x86_64-unknown-linux`, allowing to complete the build, even though the corresponding builtins will be missing from the library. We are working on a proper solution for this. --------- Co-authored-by: Uwe Dolinsky --- buildbot/configure.py | 9 +- clang/lib/Basic/TargetInfo.cpp | 32 ++ clang/lib/Driver/Driver.cpp | 50 +++ clang/lib/Driver/ToolChains/SYCL.cpp | 9 +- clang/test/Driver/sycl-native-cpu-fsycl.cpp | 46 +-- libclc/CMakeLists.txt | 6 + libclc/generic/libspirv/SOURCES | 4 + .../atomic/loadstore_helpers_acquire.ll | 58 ++++ .../atomic/loadstore_helpers_release.ll | 58 ++++ .../atomic/loadstore_helpers_seq_cst.ll | 105 +++++++ .../atomic/loadstore_helpers_unordered.ll | 106 +++++++ libclc/ptx-nvidiacl/libspirv/SOURCES | 4 +- .../libspirv/atomic/loadstore_helpers.ll | 296 ------------------ .../atomic/loadstore_helpers_acquire.ll | 56 ++++ .../atomic/loadstore_helpers_release.ll | 56 ++++ .../atomic/loadstore_helpers_seq_cst.ll | 103 ++++++ libclc/x86_64-unknown-linux/libspirv/SOURCES | 18 ++ .../libspirv/integer/popcount.cl | 0 .../libspirv/math/ceil.cl | 0 .../libspirv/math/clc_sqrt.cl | 0 .../libspirv/math/fabs.cl | 0 .../libspirv/math/floor.cl | 0 .../x86_64-unknown-linux/libspirv/math/fma.cl | 0 .../libspirv/math/native_cos.cl | 0 .../libspirv/math/native_exp.cl | 0 .../libspirv/math/native_exp2.cl | 0 .../libspirv/math/native_log.cl | 0 .../libspirv/math/native_log10.cl | 0 .../libspirv/math/native_log2.cl | 0 .../libspirv/math/native_sin.cl | 0 .../libspirv/math/native_sqrt.cl | 0 .../libspirv/math/rint.cl | 0 .../libspirv/math/round.cl | 0 .../libspirv/math/sqrt.cl | 0 .../libspirv/math/trunc.cl | 0 llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp | 13 +- sycl/doc/design/SYCLNativeCPU.md | 4 +- sycl/include/sycl/access/access.hpp | 2 +- sycl/include/sycl/detail/native_cpu.hpp | 23 +- sycl/plugins/unified_runtime/CMakeLists.txt | 1 + .../ur/adapters/native_cpu/device.cpp | 15 + .../native_cpu/native_cpu_builtins.cpp | 31 +- .../native_cpu/native_cpu_subhandler.cpp | 52 +-- sycl/test/native_cpu/atomic-base.cpp | 42 +++ 44 files changed, 810 insertions(+), 389 deletions(-) create mode 100644 libclc/generic/libspirv/atomic/loadstore_helpers_acquire.ll create mode 100644 libclc/generic/libspirv/atomic/loadstore_helpers_release.ll create mode 100644 libclc/generic/libspirv/atomic/loadstore_helpers_seq_cst.ll create mode 100644 libclc/generic/libspirv/atomic/loadstore_helpers_unordered.ll delete mode 100644 libclc/ptx-nvidiacl/libspirv/atomic/loadstore_helpers.ll create mode 100644 libclc/ptx-nvidiacl/libspirv/atomic/loadstore_helpers_acquire.ll create mode 100644 libclc/ptx-nvidiacl/libspirv/atomic/loadstore_helpers_release.ll create mode 100644 libclc/ptx-nvidiacl/libspirv/atomic/loadstore_helpers_seq_cst.ll create mode 100644 libclc/x86_64-unknown-linux/libspirv/SOURCES create mode 100644 libclc/x86_64-unknown-linux/libspirv/integer/popcount.cl create mode 100644 libclc/x86_64-unknown-linux/libspirv/math/ceil.cl create mode 100644 libclc/x86_64-unknown-linux/libspirv/math/clc_sqrt.cl create mode 100644 libclc/x86_64-unknown-linux/libspirv/math/fabs.cl create mode 100644 libclc/x86_64-unknown-linux/libspirv/math/floor.cl create mode 100644 libclc/x86_64-unknown-linux/libspirv/math/fma.cl create mode 100644 libclc/x86_64-unknown-linux/libspirv/math/native_cos.cl create mode 100644 libclc/x86_64-unknown-linux/libspirv/math/native_exp.cl create mode 100644 libclc/x86_64-unknown-linux/libspirv/math/native_exp2.cl create mode 100644 libclc/x86_64-unknown-linux/libspirv/math/native_log.cl create mode 100644 libclc/x86_64-unknown-linux/libspirv/math/native_log10.cl create mode 100644 libclc/x86_64-unknown-linux/libspirv/math/native_log2.cl create mode 100644 libclc/x86_64-unknown-linux/libspirv/math/native_sin.cl create mode 100644 libclc/x86_64-unknown-linux/libspirv/math/native_sqrt.cl create mode 100644 libclc/x86_64-unknown-linux/libspirv/math/rint.cl create mode 100644 libclc/x86_64-unknown-linux/libspirv/math/round.cl create mode 100644 libclc/x86_64-unknown-linux/libspirv/math/sqrt.cl create mode 100644 libclc/x86_64-unknown-linux/libspirv/math/trunc.cl create mode 100644 sycl/test/native_cpu/atomic-base.cpp diff --git a/buildbot/configure.py b/buildbot/configure.py index e9f83b8395974..023a130c8766d 100644 --- a/buildbot/configure.py +++ b/buildbot/configure.py @@ -65,7 +65,7 @@ def do_configure(args): if args.enable_esimd_emulator: sycl_enabled_plugins.append("esimd_emulator") - if args.cuda or args.hip: + if args.cuda or args.hip or args.native_cpu: llvm_enable_projects += ';libclc' if args.cuda: @@ -87,6 +87,12 @@ def do_configure(args): sycl_build_pi_hip_platform = args.hip_platform sycl_enabled_plugins.append("hip") + if args.native_cpu: + #Todo: we should set whatever targets we support for native cpu + libclc_targets_to_build += ';x86_64-unknown-linux-gnu' + sycl_enabled_plugins.append("native_cpu") + + # all llvm compiler targets don't require 3rd party dependencies, so can be # built/tested even if specific runtimes are not available if args.enable_all_llvm_targets: @@ -234,6 +240,7 @@ def main(): parser.add_argument("-t", "--build-type", metavar="BUILD_TYPE", default="Release", help="build type: Debug, Release") parser.add_argument("--cuda", action='store_true', help="switch from OpenCL to CUDA") + parser.add_argument("--native_cpu", action='store_true', help="Enable SYCL Native CPU") parser.add_argument("--hip", action='store_true', help="switch from OpenCL to HIP") parser.add_argument("--hip-platform", type=str, choices=['AMD', 'NVIDIA'], default='AMD', help="choose hardware platform for HIP backend") parser.add_argument("--host-target", default='X86', diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp index 6cd5d618a4aca..245af0fd580b6 100644 --- a/clang/lib/Basic/TargetInfo.cpp +++ b/clang/lib/Basic/TargetInfo.cpp @@ -516,6 +516,38 @@ void TargetInfo::adjust(DiagnosticsEngine &Diags, LangOptions &Opts) { if (Opts.FakeAddressSpaceMap) AddrSpaceMap = &FakeAddrSpaceMap; + + if (Opts.SYCLIsDevice && Opts.SYCLIsNativeCPU) { + // For SYCL Native CPU we use the NVPTXAddrSpaceMap because + // we need builtins to be mangled with AS information + + static const unsigned SYCLNativeCPUASMap[] = { + 0, // Default + 1, // opencl_global + 3, // opencl_local + 4, // opencl_constant + 0, // opencl_private + 0, // opencl_generic + 1, // opencl_global_device + 1, // opencl_global_host + 1, // cuda_device + 4, // cuda_constant + 3, // cuda_shared + 1, // sycl_global + 1, // sycl_global_device + 1, // sycl_global_host + 3, // sycl_local + 0, // sycl_private + 0, // ptr32_sptr + 0, // ptr32_uptr + 0, // ptr64 + 0, // hlsl_groupshared + 20, // wasm_funcref + }; + + AddrSpaceMap = &SYCLNativeCPUASMap; + UseAddrSpaceMapMangling = true; + } } bool TargetInfo::initFeatureMap( diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index dd529867c9639..9954e74b62332 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -5274,6 +5274,53 @@ class OffloadingActionBuilder final { return needLibs; } + bool addSYCLNativeCPULibs(const ToolChain *TC, + ActionList &DeviceLinkObjects) { + std::string LibSpirvFile; + if (Args.hasArg(options::OPT_fsycl_libspirv_path_EQ)) { + auto ProvidedPath = + Args.getLastArgValue(options::OPT_fsycl_libspirv_path_EQ).str(); + if (llvm::sys::fs::exists(ProvidedPath)) + LibSpirvFile = ProvidedPath; + } else { + SmallVector LibraryPaths; + + // Expected path w/out install. + SmallString<256> WithoutInstallPath(C.getDriver().ResourceDir); + llvm::sys::path::append(WithoutInstallPath, Twine("../../clc")); + LibraryPaths.emplace_back(WithoutInstallPath.c_str()); + + // Expected path w/ install. + SmallString<256> WithInstallPath(C.getDriver().ResourceDir); + llvm::sys::path::append(WithInstallPath, Twine("../../../share/clc")); + LibraryPaths.emplace_back(WithInstallPath.c_str()); + + // Select libclc variant based on target triple + std::string LibSpirvTargetName = "builtins.link.libspirv-"; + LibSpirvTargetName.append(TC->getTripleString() + ".bc"); + + for (StringRef LibraryPath : LibraryPaths) { + SmallString<128> LibSpirvTargetFile(LibraryPath); + llvm::sys::path::append(LibSpirvTargetFile, LibSpirvTargetName); + if (llvm::sys::fs::exists(LibSpirvTargetFile) || + Args.hasArg(options::OPT__HASH_HASH_HASH)) { + LibSpirvFile = std::string(LibSpirvTargetFile.str()); + break; + } + } + } + + if (!LibSpirvFile.empty()) { + Arg *LibClcInputArg = MakeInputArg(Args, C.getDriver().getOpts(), + Args.MakeArgString(LibSpirvFile)); + auto *SYCLLibClcInputAction = + C.MakeAction(*LibClcInputArg, types::TY_LLVM_BC); + DeviceLinkObjects.push_back(SYCLLibClcInputAction); + return true; + } + return false; + } + bool addSYCLDeviceLibs(const ToolChain *TC, ActionList &DeviceLinkObjects, bool isSpirvAOT, bool isMSVCEnv) { struct DeviceLibOptInfo { @@ -5684,6 +5731,9 @@ class OffloadingActionBuilder final { TC, DeviceLibs, UseAOTLink, C.getDefaultToolChain().getTriple().isWindowsMSVCEnvironment()); } + if (isSYCLNativeCPU) { + SYCLDeviceLibLinked |= addSYCLNativeCPULibs(TC, DeviceLibs); + } JobAction *LinkSYCLLibs = C.MakeAction(DeviceLibs, types::TY_LLVM_BC); for (Action *FullLinkObject : FullLinkObjects) { diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp index 192e91db616d5..6d7e6b61203b7 100644 --- a/clang/lib/Driver/ToolChains/SYCL.cpp +++ b/clang/lib/Driver/ToolChains/SYCL.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "SYCL.h" #include "CommonArgs.h" +#include "clang/Driver/Action.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/DriverDiagnostic.h" @@ -170,6 +171,8 @@ const char *SYCL::Linker::constructLLVMLinkCommand( // instead of the original object. if (JA.isDeviceOffloading(Action::OFK_SYCL)) { bool IsRDC = !shouldDoPerObjectFileLinking(C); + const bool IsSYCLNativeCPU = isSYCLNativeCPU( + this->getToolChain(), *C.getSingleOffloadToolChain()); auto isNoRDCDeviceCodeLink = [&](const InputInfo &II) { if (IsRDC) return false; @@ -190,12 +193,14 @@ const char *SYCL::Linker::constructLLVMLinkCommand( std::string FileName = this->getToolChain().getInputFilename(II); StringRef InputFilename = llvm::sys::path::filename(FileName); - if (this->getToolChain().getTriple().isNVPTX()) { + const bool IsNVPTX = this->getToolChain().getTriple().isNVPTX(); + if (IsNVPTX || IsSYCLNativeCPU) { // Linking SYCL Device libs requires libclc as well as libdevice if ((InputFilename.find("libspirv") != InputFilename.npos || InputFilename.find("libdevice") != InputFilename.npos)) return true; - LibPostfix = ".cubin"; + if (IsNVPTX) + LibPostfix = ".cubin"; } StringRef LibSyclPrefix("libsycl-"); if (!InputFilename.startswith(LibSyclPrefix) || diff --git a/clang/test/Driver/sycl-native-cpu-fsycl.cpp b/clang/test/Driver/sycl-native-cpu-fsycl.cpp index 27b4598dbba1c..c1de2ac26af23 100644 --- a/clang/test/Driver/sycl-native-cpu-fsycl.cpp +++ b/clang/test/Driver/sycl-native-cpu-fsycl.cpp @@ -1,7 +1,7 @@ -//RUN: %clang -fsycl -fsycl-targets=native_cpu -ccc-print-phases %s 2>&1 | FileCheck %s --check-prefix=CHECK_ACTIONS -//RUN: %clang -fsycl -fsycl-targets=native_cpu -ccc-print-bindings %s 2>&1 | FileCheck %s --check-prefix=CHECK_BINDINGS -//RUN: %clang -fsycl -fsycl-targets=native_cpu -### %s 2>&1 | FileCheck %s --check-prefix=CHECK_INVO -//RUN: %clang -fsycl -fsycl-targets=native_cpu -target aarch64-unknown-linux-gnu -ccc-print-phases %s 2>&1 | FileCheck %s --check-prefix=CHECK_ACTIONS-AARCH64 +//RUN: %clang -fsycl -fsycl-targets=native_cpu -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -ccc-print-phases %s 2>&1 | FileCheck %s --check-prefix=CHECK_ACTIONS +//RUN: %clang -fsycl -fsycl-targets=native_cpu -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -ccc-print-bindings %s 2>&1 | FileCheck %s --check-prefix=CHECK_BINDINGS +//RUN: %clang -fsycl -fsycl-targets=native_cpu -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | FileCheck %s --check-prefix=CHECK_INVO +//RUN: %clang -fsycl -fsycl-targets=native_cpu -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -target aarch64-unknown-linux-gnu -ccc-print-phases %s 2>&1 | FileCheck %s --check-prefix=CHECK_ACTIONS-AARCH64 //CHECK_ACTIONS: +- 0: input, "{{.*}}sycl-native-cpu-fsycl.cpp", c++, (host-sycl) @@ -15,20 +15,23 @@ //CHECK_ACTIONS: +- 8: backend, {7}, assembler, (host-sycl) //CHECK_ACTIONS: +- 9: assembler, {8}, object, (host-sycl) //CHECK_ACTIONS:+- 10: linker, {9}, image, (host-sycl) +//CHECK_ACTIONS: +- 11: linker, {5}, ir, (device-sycl) +//CHECK_ACTIONS: |- 12: input, "{{.*}}libspirv{{.*}}", ir, (device-sycl) +//CHECK_ACTIONS: +- 13: linker, {11, 12}, ir, (device-sycl) //this is where we compile the device code to a shared lib, and we link the host shared lib and the device shared lib -//CHECK_ACTIONS:| +- 11: linker, {5}, ir, (device-sycl) -//CHECK_ACTIONS:| +- 12: backend, {11}, assembler, (device-sycl) -//CHECK_ACTIONS:|- 13: assembler, {12}, object, (device-sycl) +//CHECK_ACTIONS:| +- 14: backend, {13}, assembler, (device-sycl) +//CHECK_ACTIONS:|- 15: assembler, {14}, object, (device-sycl) //call sycl-post-link and clang-offload-wrapper -//CHECK_ACTIONS:| +- 14: sycl-post-link, {11}, tempfiletable, (device-sycl) -//CHECK_ACTIONS:|- 15: clang-offload-wrapper, {14}, object, (device-sycl) -//CHECK_ACTIONS:16: offload, "host-sycl ({{.*}})" {10}, "device-sycl ({{.*}})" {13}, "device-sycl ({{.*}})" {15}, image +//CHECK_ACTIONS:| +- 16: sycl-post-link, {13}, tempfiletable, (device-sycl) +//CHECK_ACTIONS:|- 17: clang-offload-wrapper, {16}, object, (device-sycl) +//CHECK_ACTIONS:18: offload, "host-sycl ({{.*}})" {10}, "device-sycl ({{.*}})" {15}, "device-sycl ({{.*}})" {17}, image //CHECK_BINDINGS:# "{{.*}}" - "clang", inputs: ["{{.*}}sycl-native-cpu-fsycl.cpp"], output: "[[KERNELIR:.*]].bc" //CHECK_BINDINGS:# "{{.*}}" - "SYCL::Linker", inputs: ["[[KERNELIR]].bc"], output: "[[KERNELLINK:.*]].bc" -//CHECK_BINDINGS:# "{{.*}}" - "clang", inputs: ["[[KERNELLINK]].bc"], output: "[[KERNELOBJ:.*]].o" -//CHECK_BINDINGS:# "{{.*}}" - "SYCL post link", inputs: ["[[KERNELLINK]].bc"], output: "[[TABLEFILE:.*]].table" +//CHECK_BINDINGS:# "{{.*}}" - "SYCL::Linker", inputs: ["[[KERNELLINK]].bc", "{{.*}}.bc"], output: "[[KERNELLINKWLIB:.*]].bc" +//CHECK_BINDINGS:# "{{.*}}" - "clang", inputs: ["[[KERNELLINKWLIB]].bc"], output: "[[KERNELOBJ:.*]].o" +//CHECK_BINDINGS:# "{{.*}}" - "SYCL post link", inputs: ["[[KERNELLINKWLIB]].bc"], output: "[[TABLEFILE:.*]].table" //CHECK_BINDINGS:# "{{.*}}" - "offload wrapper", inputs: ["[[TABLEFILE]].table"], output: "[[WRAPPEROBJ:.*]].o" //CHECK_BINDINGS:# "{{.*}}" - "Append Footer to source", inputs: ["{{.*}}sycl-native-cpu-fsycl.cpp"], output: "[[SRCWFOOTER:.*]].cpp" //CHECK_BINDINGS:# "{{.*}}" - "clang", inputs: ["[[SRCWFOOTER]].cpp", "[[KERNELIR]].bc"], output: "[[HOSTOBJ:.*]].o" @@ -38,21 +41,6 @@ //CHECK_INVO:{{.*}}clang{{.*}}"-x" "ir" //CHECK_INVO:{{.*}}clang{{.*}}"-fsycl-is-host"{{.*}} -// checkes that the device and host triple is correct in the generated actions when it is set explicitly -//CHECK_ACTIONS-AARCH64: +- 0: input, "{{.*}}sycl-native-cpu-fsycl.cpp", c++, (host-sycl) -//CHECK_ACTIONS-AARCH64: +- 1: append-footer, {0}, c++, (host-sycl) -//CHECK_ACTIONS-AARCH64: +- 2: preprocessor, {1}, c++-cpp-output, (host-sycl) -//CHECK_ACTIONS-AARCH64: | +- 3: input, "{{.*}}sycl-native-cpu-fsycl.cpp", c++, (device-sycl) -//CHECK_ACTIONS-AARCH64: | +- 4: preprocessor, {3}, c++-cpp-output, (device-sycl) -//CHECK_ACTIONS-AARCH64: |- 5: compiler, {4}, ir, (device-sycl) +// checks that the device and host triple is correct in the generated actions when it is set explicitly //CHECK_ACTIONS-AARCH64: +- 6: offload, "host-sycl (aarch64-unknown-linux-gnu)" {2}, "device-sycl (aarch64-unknown-linux-gnu)" {5}, c++-cpp-output -//CHECK_ACTIONS-AARCH64: +- 7: compiler, {6}, ir, (host-sycl) -//CHECK_ACTIONS-AARCH64: +- 8: backend, {7}, assembler, (host-sycl) -//CHECK_ACTIONS-AARCH64: +- 9: assembler, {8}, object, (host-sycl) -//CHECK_ACTIONS-AARCH64:+- 10: linker, {9}, image, (host-sycl) -//CHECK_ACTIONS-AARCH64:| +- 11: linker, {5}, ir, (device-sycl) -//CHECK_ACTIONS-AARCH64:| +- 12: backend, {11}, assembler, (device-sycl) -//CHECK_ACTIONS-AARCH64:|- 13: assembler, {12}, object, (device-sycl) -//CHECK_ACTIONS-AARCH64:| +- 14: sycl-post-link, {11}, tempfiletable, (device-sycl) -//CHECK_ACTIONS-AARCH64:|- 15: clang-offload-wrapper, {14}, object, (device-sycl) -//CHECK_ACTIONS-AARCH64:16: offload, "host-sycl (aarch64-unknown-linux-gnu)" {10}, "device-sycl (aarch64-unknown-linux-gnu)" {13}, "device-sycl (aarch64-unknown-linux-gnu)" {15}, image +//CHECK_ACTIONS-AARCH64:{{[0-9]*}}: offload, "host-sycl (aarch64-unknown-linux-gnu)" {{{[0-9]*}}}, "device-sycl (aarch64-unknown-linux-gnu)" {{{[0-9]*}}}, "device-sycl (aarch64-unknown-linux-gnu)" {{{[0-9]*}}}, image diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 6edd22bc719df..220c8ace1224c 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -181,6 +181,8 @@ set( nvptx--nvidiacl_devices none ) set( nvptx64--nvidiacl_devices none ) set( spirv-mesa3d-_devices none ) set( spirv64-mesa3d-_devices none ) +# TODO: Does this need to be set for each possible triple? +set( x86_64-unknown-linux-gnu_devices none ) # Setup aliases set( cedar_aliases palm sumo sumo2 redwood juniper ) @@ -342,6 +344,10 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) # AMDGCN needs libclc to be compiled to high bc version since all atomic # clang builtins need to be accessible set( flags "SHELL:-mcpu=gfx940") + elseif( ${ARCH} STREQUAL x86_64) + # TODO: This is used by native cpu, we should define an option to set this flags + set( flags "SHELL:-Xclang -target-feature -Xclang +avx" + "SHELL:-Xclang -target-feature -Xclang +avx512f") else() set ( flags ) endif() diff --git a/libclc/generic/libspirv/SOURCES b/libclc/generic/libspirv/SOURCES index 892148aed14e7..1e5327b8c6169 100644 --- a/libclc/generic/libspirv/SOURCES +++ b/libclc/generic/libspirv/SOURCES @@ -1,3 +1,7 @@ +atomic/loadstore_helpers_unordered.ll +atomic/loadstore_helpers_release.ll +atomic/loadstore_helpers_acquire.ll +atomic/loadstore_helpers_seq_cst.ll float16.cl subnormal_config.cl subnormal_helper_func.ll diff --git a/libclc/generic/libspirv/atomic/loadstore_helpers_acquire.ll b/libclc/generic/libspirv/atomic/loadstore_helpers_acquire.ll new file mode 100644 index 0000000000000..46418d5d35c2a --- /dev/null +++ b/libclc/generic/libspirv/atomic/loadstore_helpers_acquire.ll @@ -0,0 +1,58 @@ +#if __clang_major__ >= 7 +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" +#else +target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +#endif +; This file contains helper functions for the acquire memory ordering constraint. +; Other targets can specialize this file to account for unsupported features in their backend. + +declare void @llvm.trap() + +define i32 @__clc__atomic_load_global_4_acquire(i32 addrspace(1)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i32 @__clc__atomic_load_local_4_acquire(i32 addrspace(3)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i64 @__clc__atomic_load_global_8_acquire(i64 addrspace(1)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i64 @__clc__atomic_load_local_8_acquire(i64 addrspace(3)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i32 @__clc__atomic_uload_global_4_acquire(i32 addrspace(1)* nocapture %ptr) nounwind alwaysinline { +entry: + %0 = load atomic volatile i32, i32 addrspace(1)* %ptr acquire, align 4 + ret i32 %0 +} + +define i32 @__clc__atomic_uload_local_4_acquire(i32 addrspace(3)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i64 @__clc__atomic_uload_global_8_acquire(i64 addrspace(1)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i64 @__clc__atomic_uload_local_8_acquire(i64 addrspace(3)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + diff --git a/libclc/generic/libspirv/atomic/loadstore_helpers_release.ll b/libclc/generic/libspirv/atomic/loadstore_helpers_release.ll new file mode 100644 index 0000000000000..c10d96eb19f6c --- /dev/null +++ b/libclc/generic/libspirv/atomic/loadstore_helpers_release.ll @@ -0,0 +1,58 @@ +#if __clang_major__ >= 7 +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" +#else +target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +#endif +; This file contains helper functions for the release memory ordering constraint. +; Other targets can specialize this file to account for unsupported features in their backend. + +declare void @llvm.trap() + +define void @__clc__atomic_store_global_4_release(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_store_local_4_release(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_store_global_8_release(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_store_local_8_release(i64 addrspace(3)* nocapture %ptr, i64 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_ustore_global_4_release(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_ustore_local_4_release(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_ustore_global_8_release(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_ustore_local_8_release(i64 addrspace(3)* nocapture %ptr, i64 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + diff --git a/libclc/generic/libspirv/atomic/loadstore_helpers_seq_cst.ll b/libclc/generic/libspirv/atomic/loadstore_helpers_seq_cst.ll new file mode 100644 index 0000000000000..c996862619c0e --- /dev/null +++ b/libclc/generic/libspirv/atomic/loadstore_helpers_seq_cst.ll @@ -0,0 +1,105 @@ +#if __clang_major__ >= 7 +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" +#else +target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +#endif +; This file contains helper functions for the seq_cst memory ordering constraint. +; Other targets can specialize this file to account for unsupported features in their backend. + +declare void @llvm.trap() + +define i32 @__clc__atomic_load_global_4_seq_cst(i32 addrspace(1)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i32 @__clc__atomic_load_local_4_seq_cst(i32 addrspace(3)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i64 @__clc__atomic_load_global_8_seq_cst(i64 addrspace(1)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i64 @__clc__atomic_load_local_8_seq_cst(i64 addrspace(3)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i32 @__clc__atomic_uload_global_4_seq_cst(i32 addrspace(1)* nocapture %ptr) nounwind alwaysinline { +entry: + %0 = load atomic volatile i32, i32 addrspace(1)* %ptr seq_cst, align 4 + ret i32 %0 +} + +define i32 @__clc__atomic_uload_local_4_seq_cst(i32 addrspace(3)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i64 @__clc__atomic_uload_global_8_seq_cst(i64 addrspace(1)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i64 @__clc__atomic_uload_local_8_seq_cst(i64 addrspace(3)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_store_global_4_seq_cst(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_store_local_4_seq_cst(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_store_global_8_seq_cst(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_store_local_8_seq_cst(i64 addrspace(3)* nocapture %ptr, i64 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_ustore_global_4_seq_cst(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_ustore_local_4_seq_cst(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_ustore_global_8_seq_cst(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_ustore_local_8_seq_cst(i64 addrspace(3)* nocapture %ptr, i64 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} diff --git a/libclc/generic/libspirv/atomic/loadstore_helpers_unordered.ll b/libclc/generic/libspirv/atomic/loadstore_helpers_unordered.ll new file mode 100644 index 0000000000000..f31df6390163f --- /dev/null +++ b/libclc/generic/libspirv/atomic/loadstore_helpers_unordered.ll @@ -0,0 +1,106 @@ +#if __clang_major__ >= 7 +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" +#else +target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +#endif +; This file contains helper functions for the unordered memory ordering constraint. +; Other targets can specialize this file to account for unsupported features in their backend. + +declare void @llvm.trap() + +define i32 @__clc__atomic_load_global_4_unordered(i32 addrspace(1)* nocapture %ptr) nounwind alwaysinline { +entry: + %0 = load atomic volatile i32, i32 addrspace(1)* %ptr unordered, align 4 + ret i32 %0 +} + +define i32 @__clc__atomic_load_local_4_unordered(i32 addrspace(3)* nocapture %ptr) nounwind alwaysinline { +entry: + %0 = load atomic volatile i32, i32 addrspace(3)* %ptr unordered, align 4 + ret i32 %0 +} + +define i64 @__clc__atomic_load_global_8_unordered(i64 addrspace(1)* nocapture %ptr) nounwind alwaysinline { +entry: + %0 = load atomic volatile i64, i64 addrspace(1)* %ptr unordered, align 8 + ret i64 %0 +} + +define i64 @__clc__atomic_load_local_8_unordered(i64 addrspace(3)* nocapture %ptr) nounwind alwaysinline { +entry: + %0 = load atomic volatile i64, i64 addrspace(3)* %ptr unordered, align 8 + ret i64 %0 +} + +define i32 @__clc__atomic_uload_global_4_unordered(i32 addrspace(1)* nocapture %ptr) nounwind alwaysinline { +entry: + %0 = load atomic volatile i32, i32 addrspace(1)* %ptr unordered, align 4 + ret i32 %0 +} + +define i32 @__clc__atomic_uload_local_4_unordered(i32 addrspace(3)* nocapture %ptr) nounwind alwaysinline { +entry: + %0 = load atomic volatile i32, i32 addrspace(3)* %ptr unordered, align 4 + ret i32 %0 +} + +define i64 @__clc__atomic_uload_global_8_unordered(i64 addrspace(1)* nocapture %ptr) nounwind alwaysinline { +entry: + %0 = load atomic volatile i64, i64 addrspace(1)* %ptr unordered, align 8 + ret i64 %0 +} + +define i64 @__clc__atomic_uload_local_8_unordered(i64 addrspace(3)* nocapture %ptr) nounwind alwaysinline { +entry: + %0 = load atomic volatile i64, i64 addrspace(3)* %ptr unordered, align 8 + ret i64 %0 +} + +define void @__clc__atomic_store_global_4_unordered(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + store atomic volatile i32 %value, i32 addrspace(1)* %ptr unordered, align 4 + ret void +} + +define void @__clc__atomic_store_local_4_unordered(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + store atomic volatile i32 %value, i32 addrspace(3)* %ptr unordered, align 4 + ret void +} + +define void @__clc__atomic_store_global_8_unordered(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { +entry: + store atomic volatile i64 %value, i64 addrspace(1)* %ptr unordered, align 8 + ret void +} + +define void @__clc__atomic_store_local_8_unordered(i64 addrspace(3)* nocapture %ptr, i64 %value) nounwind alwaysinline { +entry: + store atomic volatile i64 %value, i64 addrspace(3)* %ptr unordered, align 8 + ret void +} + +define void @__clc__atomic_ustore_global_4_unordered(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + store atomic volatile i32 %value, i32 addrspace(1)* %ptr unordered, align 4 + ret void +} + +define void @__clc__atomic_ustore_local_4_unordered(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + store atomic volatile i32 %value, i32 addrspace(3)* %ptr unordered, align 4 + ret void +} + +define void @__clc__atomic_ustore_global_8_unordered(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { +entry: + store atomic volatile i64 %value, i64 addrspace(1)* %ptr unordered, align 8 + ret void +} + +define void @__clc__atomic_ustore_local_8_unordered(i64 addrspace(3)* nocapture %ptr, i64 %value) nounwind alwaysinline { +entry: + store atomic volatile i64 %value, i64 addrspace(3)* %ptr unordered, align 8 + ret void +} + diff --git a/libclc/ptx-nvidiacl/libspirv/SOURCES b/libclc/ptx-nvidiacl/libspirv/SOURCES index 4177aae12b416..f9ab071172569 100644 --- a/libclc/ptx-nvidiacl/libspirv/SOURCES +++ b/libclc/ptx-nvidiacl/libspirv/SOURCES @@ -1,5 +1,7 @@ reflect.ll -atomic/loadstore_helpers.ll +atomic/loadstore_helpers_release.ll +atomic/loadstore_helpers_acquire.ll +atomic/loadstore_helpers_seq_cst.ll cl_khr_int64_extended_atomics/minmax_helpers.ll integer/mul24.cl integer/mul_hi.cl diff --git a/libclc/ptx-nvidiacl/libspirv/atomic/loadstore_helpers.ll b/libclc/ptx-nvidiacl/libspirv/atomic/loadstore_helpers.ll deleted file mode 100644 index 56b28ae8c1050..0000000000000 --- a/libclc/ptx-nvidiacl/libspirv/atomic/loadstore_helpers.ll +++ /dev/null @@ -1,296 +0,0 @@ -#if __clang_major__ >= 7 -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" -#else -target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" -#endif - -declare void @llvm.trap() - -define i32 @__clc__atomic_load_global_4_unordered(i32 addrspace(1)* nocapture %ptr) nounwind alwaysinline { -entry: - %0 = load atomic volatile i32, i32 addrspace(1)* %ptr unordered, align 4 - ret i32 %0 -} - -define i32 @__clc__atomic_load_local_4_unordered(i32 addrspace(3)* nocapture %ptr) nounwind alwaysinline { -entry: - %0 = load atomic volatile i32, i32 addrspace(3)* %ptr unordered, align 4 - ret i32 %0 -} - -define i64 @__clc__atomic_load_global_8_unordered(i64 addrspace(1)* nocapture %ptr) nounwind alwaysinline { -entry: - %0 = load atomic volatile i64, i64 addrspace(1)* %ptr unordered, align 8 - ret i64 %0 -} - -define i64 @__clc__atomic_load_local_8_unordered(i64 addrspace(3)* nocapture %ptr) nounwind alwaysinline { -entry: - %0 = load atomic volatile i64, i64 addrspace(3)* %ptr unordered, align 8 - ret i64 %0 -} - -define i32 @__clc__atomic_uload_global_4_unordered(i32 addrspace(1)* nocapture %ptr) nounwind alwaysinline { -entry: - %0 = load atomic volatile i32, i32 addrspace(1)* %ptr unordered, align 4 - ret i32 %0 -} - -define i32 @__clc__atomic_uload_local_4_unordered(i32 addrspace(3)* nocapture %ptr) nounwind alwaysinline { -entry: - %0 = load atomic volatile i32, i32 addrspace(3)* %ptr unordered, align 4 - ret i32 %0 -} - -define i64 @__clc__atomic_uload_global_8_unordered(i64 addrspace(1)* nocapture %ptr) nounwind alwaysinline { -entry: - %0 = load atomic volatile i64, i64 addrspace(1)* %ptr unordered, align 8 - ret i64 %0 -} - -define i64 @__clc__atomic_uload_local_8_unordered(i64 addrspace(3)* nocapture %ptr) nounwind alwaysinline { -entry: - %0 = load atomic volatile i64, i64 addrspace(3)* %ptr unordered, align 8 - ret i64 %0 -} - -define i32 @__clc__atomic_load_global_4_acquire(i32 addrspace(1)* nocapture %ptr) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define i32 @__clc__atomic_load_local_4_acquire(i32 addrspace(3)* nocapture %ptr) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define i64 @__clc__atomic_load_global_8_acquire(i64 addrspace(1)* nocapture %ptr) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define i64 @__clc__atomic_load_local_8_acquire(i64 addrspace(3)* nocapture %ptr) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define i32 @__clc__atomic_uload_global_4_acquire(i32 addrspace(1)* nocapture %ptr) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define i32 @__clc__atomic_uload_local_4_acquire(i32 addrspace(3)* nocapture %ptr) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define i64 @__clc__atomic_uload_global_8_acquire(i64 addrspace(1)* nocapture %ptr) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define i64 @__clc__atomic_uload_local_8_acquire(i64 addrspace(3)* nocapture %ptr) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - - -define i32 @__clc__atomic_load_global_4_seq_cst(i32 addrspace(1)* nocapture %ptr) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define i32 @__clc__atomic_load_local_4_seq_cst(i32 addrspace(3)* nocapture %ptr) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define i64 @__clc__atomic_load_global_8_seq_cst(i64 addrspace(1)* nocapture %ptr) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define i64 @__clc__atomic_load_local_8_seq_cst(i64 addrspace(3)* nocapture %ptr) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define i32 @__clc__atomic_uload_global_4_seq_cst(i32 addrspace(1)* nocapture %ptr) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define i32 @__clc__atomic_uload_local_4_seq_cst(i32 addrspace(3)* nocapture %ptr) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define i64 @__clc__atomic_uload_global_8_seq_cst(i64 addrspace(1)* nocapture %ptr) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define i64 @__clc__atomic_uload_local_8_seq_cst(i64 addrspace(3)* nocapture %ptr) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define void @__clc__atomic_store_global_4_unordered(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { -entry: - store atomic volatile i32 %value, i32 addrspace(1)* %ptr unordered, align 4 - ret void -} - -define void @__clc__atomic_store_local_4_unordered(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { -entry: - store atomic volatile i32 %value, i32 addrspace(3)* %ptr unordered, align 4 - ret void -} - -define void @__clc__atomic_store_global_8_unordered(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { -entry: - store atomic volatile i64 %value, i64 addrspace(1)* %ptr unordered, align 8 - ret void -} - -define void @__clc__atomic_store_local_8_unordered(i64 addrspace(3)* nocapture %ptr, i64 %value) nounwind alwaysinline { -entry: - store atomic volatile i64 %value, i64 addrspace(3)* %ptr unordered, align 8 - ret void -} - -define void @__clc__atomic_ustore_global_4_unordered(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { -entry: - store atomic volatile i32 %value, i32 addrspace(1)* %ptr unordered, align 4 - ret void -} - -define void @__clc__atomic_ustore_local_4_unordered(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { -entry: - store atomic volatile i32 %value, i32 addrspace(3)* %ptr unordered, align 4 - ret void -} - -define void @__clc__atomic_ustore_global_8_unordered(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { -entry: - store atomic volatile i64 %value, i64 addrspace(1)* %ptr unordered, align 8 - ret void -} - -define void @__clc__atomic_ustore_local_8_unordered(i64 addrspace(3)* nocapture %ptr, i64 %value) nounwind alwaysinline { -entry: - store atomic volatile i64 %value, i64 addrspace(3)* %ptr unordered, align 8 - ret void -} - -define void @__clc__atomic_store_global_4_release(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define void @__clc__atomic_store_local_4_release(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define void @__clc__atomic_store_global_8_release(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define void @__clc__atomic_store_local_8_release(i64 addrspace(3)* nocapture %ptr, i64 %value) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define void @__clc__atomic_ustore_global_4_release(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define void @__clc__atomic_ustore_local_4_release(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define void @__clc__atomic_ustore_global_8_release(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define void @__clc__atomic_ustore_local_8_release(i64 addrspace(3)* nocapture %ptr, i64 %value) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define void @__clc__atomic_store_global_4_seq_cst(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define void @__clc__atomic_store_local_4_seq_cst(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define void @__clc__atomic_store_global_8_seq_cst(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define void @__clc__atomic_store_local_8_seq_cst(i64 addrspace(3)* nocapture %ptr, i64 %value) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define void @__clc__atomic_ustore_global_4_seq_cst(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define void @__clc__atomic_ustore_local_4_seq_cst(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define void @__clc__atomic_ustore_global_8_seq_cst(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} - -define void @__clc__atomic_ustore_local_8_seq_cst(i64 addrspace(3)* nocapture %ptr, i64 %value) nounwind alwaysinline { -entry: - tail call void @llvm.trap() - unreachable -} diff --git a/libclc/ptx-nvidiacl/libspirv/atomic/loadstore_helpers_acquire.ll b/libclc/ptx-nvidiacl/libspirv/atomic/loadstore_helpers_acquire.ll new file mode 100644 index 0000000000000..626fd959a3272 --- /dev/null +++ b/libclc/ptx-nvidiacl/libspirv/atomic/loadstore_helpers_acquire.ll @@ -0,0 +1,56 @@ +#if __clang_major__ >= 7 +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" +#else +target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +#endif + +declare void @llvm.trap() + +define i32 @__clc__atomic_load_global_4_acquire(i32 addrspace(1)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i32 @__clc__atomic_load_local_4_acquire(i32 addrspace(3)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i64 @__clc__atomic_load_global_8_acquire(i64 addrspace(1)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i64 @__clc__atomic_load_local_8_acquire(i64 addrspace(3)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i32 @__clc__atomic_uload_global_4_acquire(i32 addrspace(1)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i32 @__clc__atomic_uload_local_4_acquire(i32 addrspace(3)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i64 @__clc__atomic_uload_global_8_acquire(i64 addrspace(1)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i64 @__clc__atomic_uload_local_8_acquire(i64 addrspace(3)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + diff --git a/libclc/ptx-nvidiacl/libspirv/atomic/loadstore_helpers_release.ll b/libclc/ptx-nvidiacl/libspirv/atomic/loadstore_helpers_release.ll new file mode 100644 index 0000000000000..3e986e6bb1ceb --- /dev/null +++ b/libclc/ptx-nvidiacl/libspirv/atomic/loadstore_helpers_release.ll @@ -0,0 +1,56 @@ +#if __clang_major__ >= 7 +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" +#else +target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +#endif + +declare void @llvm.trap() + +define void @__clc__atomic_store_global_4_release(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_store_local_4_release(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_store_global_8_release(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_store_local_8_release(i64 addrspace(3)* nocapture %ptr, i64 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_ustore_global_4_release(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_ustore_local_4_release(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_ustore_global_8_release(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_ustore_local_8_release(i64 addrspace(3)* nocapture %ptr, i64 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + diff --git a/libclc/ptx-nvidiacl/libspirv/atomic/loadstore_helpers_seq_cst.ll b/libclc/ptx-nvidiacl/libspirv/atomic/loadstore_helpers_seq_cst.ll new file mode 100644 index 0000000000000..ed4c8aaf4db42 --- /dev/null +++ b/libclc/ptx-nvidiacl/libspirv/atomic/loadstore_helpers_seq_cst.ll @@ -0,0 +1,103 @@ +#if __clang_major__ >= 7 +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" +#else +target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +#endif + +declare void @llvm.trap() + +define i32 @__clc__atomic_load_global_4_seq_cst(i32 addrspace(1)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i32 @__clc__atomic_load_local_4_seq_cst(i32 addrspace(3)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i64 @__clc__atomic_load_global_8_seq_cst(i64 addrspace(1)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i64 @__clc__atomic_load_local_8_seq_cst(i64 addrspace(3)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i32 @__clc__atomic_uload_global_4_seq_cst(i32 addrspace(1)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i32 @__clc__atomic_uload_local_4_seq_cst(i32 addrspace(3)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i64 @__clc__atomic_uload_global_8_seq_cst(i64 addrspace(1)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define i64 @__clc__atomic_uload_local_8_seq_cst(i64 addrspace(3)* nocapture %ptr) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_store_global_4_seq_cst(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_store_local_4_seq_cst(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_store_global_8_seq_cst(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_store_local_8_seq_cst(i64 addrspace(3)* nocapture %ptr, i64 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_ustore_global_4_seq_cst(i32 addrspace(1)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_ustore_local_4_seq_cst(i32 addrspace(3)* nocapture %ptr, i32 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_ustore_global_8_seq_cst(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} + +define void @__clc__atomic_ustore_local_8_seq_cst(i64 addrspace(3)* nocapture %ptr, i64 %value) nounwind alwaysinline { +entry: + tail call void @llvm.trap() + unreachable +} diff --git a/libclc/x86_64-unknown-linux/libspirv/SOURCES b/libclc/x86_64-unknown-linux/libspirv/SOURCES new file mode 100644 index 0000000000000..ba0b2c7df78e8 --- /dev/null +++ b/libclc/x86_64-unknown-linux/libspirv/SOURCES @@ -0,0 +1,18 @@ +integer/popcount.cl +math/ceil.cl +math/sqrt.cl +math/clc_sqrt.cl +math/fabs.cl +math/floor.cl +math/fma.cl +math/native_cos.cl +math/native_exp.cl +math/native_exp2.cl +math/native_log.cl +math/native_log10.cl +math/native_log2.cl +math/native_sin.cl +math/native_sqrt.cl +math/rint.cl +math/round.cl +math/trunc.cl diff --git a/libclc/x86_64-unknown-linux/libspirv/integer/popcount.cl b/libclc/x86_64-unknown-linux/libspirv/integer/popcount.cl new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libclc/x86_64-unknown-linux/libspirv/math/ceil.cl b/libclc/x86_64-unknown-linux/libspirv/math/ceil.cl new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libclc/x86_64-unknown-linux/libspirv/math/clc_sqrt.cl b/libclc/x86_64-unknown-linux/libspirv/math/clc_sqrt.cl new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libclc/x86_64-unknown-linux/libspirv/math/fabs.cl b/libclc/x86_64-unknown-linux/libspirv/math/fabs.cl new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libclc/x86_64-unknown-linux/libspirv/math/floor.cl b/libclc/x86_64-unknown-linux/libspirv/math/floor.cl new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libclc/x86_64-unknown-linux/libspirv/math/fma.cl b/libclc/x86_64-unknown-linux/libspirv/math/fma.cl new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_cos.cl b/libclc/x86_64-unknown-linux/libspirv/math/native_cos.cl new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_exp.cl b/libclc/x86_64-unknown-linux/libspirv/math/native_exp.cl new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_exp2.cl b/libclc/x86_64-unknown-linux/libspirv/math/native_exp2.cl new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_log.cl b/libclc/x86_64-unknown-linux/libspirv/math/native_log.cl new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_log10.cl b/libclc/x86_64-unknown-linux/libspirv/math/native_log10.cl new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_log2.cl b/libclc/x86_64-unknown-linux/libspirv/math/native_log2.cl new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_sin.cl b/libclc/x86_64-unknown-linux/libspirv/math/native_sin.cl new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_sqrt.cl b/libclc/x86_64-unknown-linux/libspirv/math/native_sqrt.cl new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libclc/x86_64-unknown-linux/libspirv/math/rint.cl b/libclc/x86_64-unknown-linux/libspirv/math/rint.cl new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libclc/x86_64-unknown-linux/libspirv/math/round.cl b/libclc/x86_64-unknown-linux/libspirv/math/round.cl new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libclc/x86_64-unknown-linux/libspirv/math/sqrt.cl b/libclc/x86_64-unknown-linux/libspirv/math/sqrt.cl new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libclc/x86_64-unknown-linux/libspirv/math/trunc.cl b/libclc/x86_64-unknown-linux/libspirv/math/trunc.cl new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp index 50651c3e331c9..8693823e71ae2 100644 --- a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp +++ b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp @@ -13,6 +13,7 @@ #include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h" #include "llvm/IR/Constant.h" +#include "llvm/IR/PassManager.h" #include "llvm/SYCLLowerIR/SYCLUtils.h" #include "llvm/ADT/ArrayRef.h" @@ -41,6 +42,7 @@ #include #include #include +#include #include using namespace llvm; @@ -137,12 +139,13 @@ void emitSubkernelForKernel(Function *F, Type *NativeCPUArgDescType, // Load the correct NativeCPUDesc and load the pointer from it auto *Addr = Builder.CreateGEP(NativeCPUArgDescType, BaseNativeCPUArg, {Builder.getInt64(UsedI)}); - auto *Load = Builder.CreateLoad(PointerType::getUnqual(Ctx), Addr); if (Arg->getType()->isPointerTy()) { // If the arg is a pointer, just use it + auto *Load = Builder.CreateLoad(Arg->getType(), Addr); KernelArgs.push_back(Load); } else { // Otherwise, load the scalar value and use that + auto *Load = Builder.CreateLoad(PointerType::getUnqual(Ctx), Addr); auto *Scalar = Builder.CreateLoad(Arg->getType(), Load); KernelArgs.push_back(Scalar); } @@ -244,6 +247,8 @@ Value *getStateArg(const Function *F) { return F->getArg(FT->getNumParams() - 1); } +static constexpr unsigned int NativeCPUGlobalAS = 1; + } // namespace PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M, @@ -261,10 +266,8 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M, Type *StateType = StructType::getTypeByName(M.getContext(), "struct.__nativecpu_state"); if (!StateType) - report_fatal_error("Couldn't find the Native CPU state in the " - "module, make sure that -D __SYCL_NATIVE_CPU__ is set", - false); - Type *StatePtrType = PointerType::getUnqual(StateType); + return PreservedAnalyses::all(); + Type *StatePtrType = PointerType::get(StateType, 1); SmallVector NewKernels; for (auto &OldF : OldKernels) { auto *NewF = cloneFunctionAndAddParam(OldF, StatePtrType); diff --git a/sycl/doc/design/SYCLNativeCPU.md b/sycl/doc/design/SYCLNativeCPU.md index 4b5390d6c29f6..551efe0e30fbf 100644 --- a/sycl/doc/design/SYCLNativeCPU.md +++ b/sycl/doc/design/SYCLNativeCPU.md @@ -27,11 +27,11 @@ clang++ -o #link clang++ -L -lsycl -o ``` -In order to execute kernels compiled for `native-cpu`, we provide a PI Plugin. The plugin needs to be enabled when configuring DPC++ (e.g. `python buildbot/configure.py --enable-plugin native_cpu`) and needs to be selected at runtime by setting the environment variable `ONEAPI_DEVICE_SELECTOR=native_cpu:cpu`. +In order to execute kernels compiled for `native-cpu`, we provide a PI Plugin. The plugin needs to be enabled when configuring DPC++ (e.g. `python buildbot/configure.py --native_cpu`) and needs to be selected at runtime by setting the environment variable `ONEAPI_DEVICE_SELECTOR=native_cpu:cpu`. # Supported features and current limitations -The SYCL Native CPU flow is still WIP, not optimized and several core SYCL features are currently unsupported. Currently `barrier` and all the math builtins are not supported, and attempting to use those will most likely fail with an `undefined reference` error at link time. Examples of supported applications can be found in the [runtime tests](sycl/test/native_cpu). +The SYCL Native CPU flow is still WIP, not optimized and several core SYCL features are currently unsupported. Currently `barrier` and several math builtins are not supported, and attempting to use those will most likely fail with an `undefined reference` error at link time. Examples of supported applications can be found in the [runtime tests](sycl/test/native_cpu). To execute the `e2e` tests on the Native CPU, configure the test suite with: diff --git a/sycl/include/sycl/access/access.hpp b/sycl/include/sycl/access/access.hpp index bf9edc4346e2f..ca8bda18eecb7 100644 --- a/sycl/include/sycl/access/access.hpp +++ b/sycl/include/sycl/access/access.hpp @@ -327,7 +327,7 @@ template inline ToT cast_AS(FromT from) { constexpr access::address_space ToAS = deduce_AS::value; constexpr access::address_space FromAS = deduce_AS::value; if constexpr (FromAS == access::address_space::generic_space) { -#if defined(__NVPTX__) || defined(__AMDGCN__) +#if defined(__NVPTX__) || defined(__AMDGCN__) || defined(__SYCL_NATIVE_CPU__) // TODO: NVPTX and AMDGCN backends do not currently support the // __spirv_GenericCastToPtrExplicit_* builtins, so to work around this // we do C-style casting. This may produce warnings when targetting diff --git a/sycl/include/sycl/detail/native_cpu.hpp b/sycl/include/sycl/detail/native_cpu.hpp index 2cfc04b1a060f..8631d18c10ab4 100644 --- a/sycl/include/sycl/detail/native_cpu.hpp +++ b/sycl/include/sycl/detail/native_cpu.hpp @@ -68,46 +68,47 @@ struct __nativecpu_state { } }; #ifdef __SYCL_DEVICE_ONLY__ +#define __SYCL_NCPU_GLOBAL_AS __attribute((address_space(1))) #define __SYCL_HC_ATTRS \ extern "C" __attribute__((weak)) __attribute((always_inline)) \ [[intel::device_indirectly_callable]] #define __NCPU_ATTRS extern "C" __SYCL_HC_ATTRS -__NCPU_ATTRS size_t __dpcpp_nativecpu_global_id(unsigned dim, - __attribute((address_space(0))) - __nativecpu_state *s) { +__NCPU_ATTRS size_t __dpcpp_nativecpu_global_id( + unsigned dim, __SYCL_NCPU_GLOBAL_AS __nativecpu_state *s) { return s->MGlobal_id[dim]; } __NCPU_ATTRS size_t __dpcpp_nativecpu_global_range( - unsigned dim, __attribute((address_space(0))) __nativecpu_state *s) { + unsigned dim, __SYCL_NCPU_GLOBAL_AS __nativecpu_state *s) { return s->MGlobal_range[dim]; } __NCPU_ATTRS size_t __dpcpp_nativecpu_get_wg_size( - unsigned dim, __attribute((address_space(0))) __nativecpu_state *s) { + unsigned dim, __SYCL_NCPU_GLOBAL_AS __nativecpu_state *s) { return s->MWorkGroup_size[dim]; } -__NCPU_ATTRS size_t __dpcpp_nativecpu_get_wg_id(unsigned dim, - __attribute((address_space(0))) - __nativecpu_state *s) { +__NCPU_ATTRS size_t __dpcpp_nativecpu_get_wg_id( + unsigned dim, __SYCL_NCPU_GLOBAL_AS __nativecpu_state *s) { return s->MWorkGroup_id[dim]; } __NCPU_ATTRS size_t __dpcpp_nativecpu_get_local_id( - unsigned dim, __attribute((address_space(0))) __nativecpu_state *s) { + unsigned dim, __SYCL_NCPU_GLOBAL_AS __nativecpu_state *s) { return s->MLocal_id[dim]; } __NCPU_ATTRS size_t __dpcpp_nativecpu_get_num_groups( - unsigned dim, __attribute((address_space(0))) __nativecpu_state *s) { + unsigned dim, __SYCL_NCPU_GLOBAL_AS __nativecpu_state *s) { return s->MNumGroups[dim]; } __NCPU_ATTRS size_t __dpcpp_nativecpu_get_global_offset( - unsigned dim, __attribute((address_space(0))) __nativecpu_state *s) { + unsigned dim, __SYCL_NCPU_GLOBAL_AS __nativecpu_state *s) { return s->MGlobalOffset[dim]; } +#undef __SYCL_NCPU_GLOBAL_AS +#undef __SYCL_HC_ATTRS #undef __NCPU_ATTRS #endif diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 71f1d47eca7cb..671f1f3fafdb3 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -285,6 +285,7 @@ if("native_cpu" IN_LIST SYCL_ENABLE_PLUGINS) UnifiedRuntime-Headers Threads::Threads sycl + OpenCL-Headers ) set_target_properties("ur_adapter_native_cpu" PROPERTIES diff --git a/sycl/plugins/unified_runtime/ur/adapters/native_cpu/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/native_cpu/device.cpp index fe773fbbdd170..155a85d9217da 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/native_cpu/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/native_cpu/device.cpp @@ -281,6 +281,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: return UR_RESULT_ERROR_INVALID_VALUE; + case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { + ur_memory_order_capability_flags_t Capabilities = + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; + return ReturnValue(Capabilities); + } + case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { + uint64_t Capabilities = UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE; + return ReturnValue(Capabilities); + } CASE_UR_UNSUPPORTED(UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH); diff --git a/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp b/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp index c74d9931566b5..eea1d609469ba 100644 --- a/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp +++ b/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp @@ -18,8 +18,8 @@ int main() { sycl::range<1> r(1); deviceQueue.submit([&](sycl::handler &h) { h.parallel_for(r, [=](sycl::id<1> id) { acc[id[0]] = 42; }); - // CHECK: @_ZTS5Test1.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, ptr %2) - // CHECK: call{{.*}}__dpcpp_nativecpu_global_id(i32 0, ptr %2) + // CHECK: @_ZTS5Test1.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, ptr addrspace(1) %2) + // CHECK: call{{.*}}__dpcpp_nativecpu_global_id(i32 0, ptr addrspace(1) %2) }); sycl::nd_range<2> r2({1, 1}, { 1, @@ -27,21 +27,21 @@ int main() { }); deviceQueue.submit([&](sycl::handler &h) { h.parallel_for(r2, [=](sycl::id<2> id) { acc[id[1]] = 42; }); - // CHECK: @_ZTS5Test2.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, ptr %2) - // CHECK: call{{.*}}__dpcpp_nativecpu_global_id(i32 1, ptr %2) - // CHECK: call{{.*}}__dpcpp_nativecpu_global_id(i32 0, ptr %2) + // CHECK: @_ZTS5Test2.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, ptr addrspace(1) %2) + // CHECK: call{{.*}}__dpcpp_nativecpu_global_id(i32 1, ptr addrspace(1) %2) + // CHECK: call{{.*}}__dpcpp_nativecpu_global_id(i32 0, ptr addrspace(1) %2) }); sycl::nd_range<3> r3({1, 1, 1}, {1, 1, 1}); deviceQueue.submit([&](sycl::handler &h) { h.parallel_for( r3, [=](sycl::item<3> item) { acc[item[2]] = item.get_range(0); }); - // CHECK: @_ZTS5Test3.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, ptr %2) - // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_global_range(i32 2, ptr %2) - // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_global_range(i32 1, ptr %2) - // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_global_range(i32 0, ptr %2) - // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_global_id(i32 2, ptr %2) - // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_global_id(i32 1, ptr %2) - // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_global_id(i32 0, ptr %2) + // CHECK: @_ZTS5Test3.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, ptr addrspace(1) %2) + // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_global_range(i32 2, ptr addrspace(1) %2) + // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_global_range(i32 1, ptr addrspace(1) %2) + // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_global_range(i32 0, ptr addrspace(1) %2) + // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_global_id(i32 2, ptr addrspace(1) %2) + // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_global_id(i32 1, ptr addrspace(1) %2) + // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_global_id(i32 0, ptr addrspace(1) %2) }); const size_t dim = 2; @@ -69,9 +69,10 @@ int main() { auto rangeZ = id.get_local_range(2); Accessor[groupX * rangeX + localX][groupY * rangeY + localY] [groupZ * rangeZ + localZ] = {rangeX, rangeY, rangeZ}; - // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_get_local_id(i32 0, ptr %{{[0-9]*}}) - // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_get_wg_size(i32 0, ptr %{{[0-9]*}}) - // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_get_wg_id(i32 0, ptr %{{[0-9]*}}) + + // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_get_local_id(i32 0, ptr addrspace(1) %{{[0-9]*}}) + // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_get_wg_size(i32 0, ptr addrspace(1) %{{[0-9]*}}) + // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_get_wg_id(i32 0, ptr addrspace(1) %{{[0-9]*}}) }); }); } diff --git a/sycl/test/check_device_code/native_cpu/native_cpu_subhandler.cpp b/sycl/test/check_device_code/native_cpu/native_cpu_subhandler.cpp index fab20cff1dd64..5425ad6a1d931 100644 --- a/sycl/test/check_device_code/native_cpu/native_cpu_subhandler.cpp +++ b/sycl/test/check_device_code/native_cpu/native_cpu_subhandler.cpp @@ -32,29 +32,29 @@ __attribute__((sycl_kernel)) void launch(const Func &kernelFunc) { void test() { queue q; gen_test(q); - //CHECK: define weak void @_ZTS6init_aIiE(ptr %{{.*}}, ptr %[[STATE:.*]]) #{{.*}} { - //CHECK: %{{.*}} = getelementptr %{{.*}}, ptr %{{.*}}, i64 {{.*}} - //CHECK-NEXT: %[[ARG1:.*]] = load ptr, ptr %{{.*}} - //CHECK-NEXT: %{{.*}} = getelementptr %{{.*}}, ptr %{{.*}}, i64 {{.*}} - //CHECK-NEXT: %[[ARG2:.*]] = load ptr, ptr %{{.*}} - //CHECK-NEXT: %{{.*}} = getelementptr %{{.*}}, ptr %{{.*}}, i64 {{.*}} - //CHECK-NEXT: %{{.*}} = load ptr, ptr %{{.*}} - //CHECK-NEXT: %[[ARG3:.*]] = load i32, ptr %{{.*}} - //CHECK-NEXT: call void @_ZTS6init_aIiE.NativeCPUKernel(ptr %[[ARG1]], ptr %[[ARG2]], i32 %[[ARG3]], ptr %[[STATE]]) - //CHECK-NEXT: ret void - //CHECK-NEXT:} + //CHECK: define weak void @_ZTS6init_aIiE(ptr %{{.*}}, ptr addrspace(1) {{.*}}) #{{.*}} { + //CHECK: %{{.*}} = getelementptr %{{.*}}, ptr %{{.*}}, i64 {{.*}} + //CHECK: %{{.*}} = load ptr addrspace(1), ptr %{{.*}} + //CHECK: %{{.*}} = getelementptr %{{.*}}, ptr %{{.*}}, i64 {{.*}} + //CHECK: %{{.*}} = load ptr, ptr %{{.*}} + //CHECK: %{{.*}} = getelementptr %{{.*}}, ptr %{{.*}}, i64 {{.*}} + //CHECK: %{{.*}} = load ptr, ptr %{{.*}} + //CHECK: %{{.*}} = load i32, ptr %{{.*}} + //CHECK: call void @_ZTS6init_aIiE.NativeCPUKernel(ptr {{.*}}, ptr {{.*}}, i32 {{.*}}, ptr {{.*}}) + //CHECK: ret void + //CHECK:} gen_test(q); - //CHECK: define weak void @_ZTS6init_aIfE(ptr %{{.*}}, ptr %[[STATE1:.*]]) #{{.*}} { - //CHECK: %{{.*}} = getelementptr %{{.*}}, ptr %{{.*}}, i64 {{.*}} - //CHECK-NEXT: %[[ARGF1:.*]] = load ptr, ptr %{{.*}} - //CHECK-NEXT: %{{.*}} = getelementptr %{{.*}}, ptr %{{.*}}, i64 {{.*}} - //CHECK-NEXT: %[[ARGF2:.*]] = load ptr, ptr %{{.*}}, align 8 - //CHECK-NEXT: %{{.*}} = getelementptr %{{.*}}, ptr %{{.*}}, i64 {{.*}} - //CHECK-NEXT: %{{.*}} = load ptr, ptr %{{.*}} - //CHECK-NEXT: %[[ARGF3:.*]] = load float, ptr %{{.*}} - //CHECK-NEXT: call void @_ZTS6init_aIfE.NativeCPUKernel(ptr %[[ARGF1]], ptr %[[ARGF2]], float %[[ARGF3]], ptr %[[STATE1]]) - //CHECK-NEXT: ret void - //CHECK-NEXT:} + //CHECK: define weak void @_ZTS6init_aIfE(ptr %{{.*}}, ptr addrspace(1) {{.*}}) #{{.*}} { + //CHECK: %{{.*}} = getelementptr %{{.*}}, ptr %{{.*}}, i64 {{.*}} + //CHECK: %{{.*}} = load ptr addrspace(1), ptr %{{.*}} + //CHECK: %{{.*}} = getelementptr %{{.*}}, ptr %{{.*}}, i64 {{.*}} + //CHECK: %{{.*}} = load ptr, ptr %{{.*}}, align 8 + //CHECK: %{{.*}} = getelementptr %{{.*}}, ptr %{{.*}}, i64 {{.*}} + //CHECK: %{{.*}} = load ptr, ptr %{{.*}} + //CHECK: %{{.*}} = load float, ptr %{{.*}} + //CHECK: call void @_ZTS6init_aIfE.NativeCPUKernel(ptr {{.*}}, ptr {{.*}}, float {{.*}}, ptr {{.*}}) + //CHECK: ret void + //CHECK:} // Check that subhandler is emitted correctly for kernels with no // args:deviceQueue.submit([&](sycl::handler &h) { @@ -64,14 +64,14 @@ void test() { acc[id[0]]; // all kernel arguments are removed }); }); - //CHECK:define weak void @_ZTS5Test1(ptr %{{.*}}, ptr %[[STATE2:.*]]) #{{.*}} { - //CHECK: call void @_ZTS5Test1.NativeCPUKernel(ptr %[[STATE2]]) + //CHECK:define weak void @_ZTS5Test1(ptr %{{.*}}, ptr addrspace(1) %[[STATE2:.*]]) #{{.*}} { + //CHECK: call void @_ZTS5Test1.NativeCPUKernel(ptr addrspace(1) %[[STATE2]]) //CHECK-NEXT: ret void //CHECK-NEXT:} launch([]() {}); - //CHECK:define weak void @_ZTSZ4testvE10TestKernel(ptr %{{.*}}, ptr %[[STATE3:.*]]) #{{.*}} { - //CHECK: call void @_ZTSZ4testvE10TestKernel.NativeCPUKernel(ptr %[[STATE3]]) + //CHECK:define weak void @_ZTSZ4testvE10TestKernel(ptr %{{.*}}, ptr addrspace(1) %[[STATE3:.*]]) #{{.*}} { + //CHECK: call void @_ZTSZ4testvE10TestKernel.NativeCPUKernel(ptr addrspace(1) %[[STATE3]]) //CHECK-NEXT: ret void //CHECK-NEXT:} } diff --git a/sycl/test/native_cpu/atomic-base.cpp b/sycl/test/native_cpu/atomic-base.cpp new file mode 100644 index 0000000000000..51a12e4499ace --- /dev/null +++ b/sycl/test/native_cpu/atomic-base.cpp @@ -0,0 +1,42 @@ +// Simple test that checks that we can run a simple applications that uses +// builtins +// REQUIRES: native_cpu_be +// RUN: %clangxx -fsycl -fsycl-targets=native_cpu %s -o %t +// RUN: env ONEAPI_DEVICE_SELECTOR="native_cpu:cpu" %t +#include +#include + +using namespace sycl; + +int add_pre_inc_test(queue q, size_t N) { + constexpr auto scope = memory_scope::device; + constexpr auto space = access::address_space::global_space; + int sum = 0; + { + buffer sum_buf(&sum, 1); + + q.submit([&](handler &cgh) { + auto sum = sum_buf.template get_access(cgh); + cgh.parallel_for(range<1>(N), [=](item<1> it) { + int gid = it.get_id(0); + auto atm = + sycl::atomic_ref(sum[0]); + ++atm; + }); + }); + } + + // All work-items increment by 1, so final value should be equal to N + return sum; +} + +int main() { + const int N = 10; + sycl::queue q; + int res = add_pre_inc_test(q, N); + if (res != N) { + std::cout << "Error, result is " << res << " but should be " << N << "\n"; + return 1; + } + return 0; +}