From 51fe70fb3005b76d19bccac4d52acb942e12ec44 Mon Sep 17 00:00:00 2001 From: Kristina Bessonova Date: Wed, 30 May 2018 09:57:10 +0300 Subject: [PATCH] Fix translation of device execution built-ins * Change SPIR-V Writer/Reader to consume/produce LLVM IR with blocks and device side enqueue built-ins as it is produced by clang * Improve function pointers removal related to OpenCL 2.0 blocks * Improve tests * Fix translation of literal structs --- include/LLVMSPIRVLib.h | 4 + lib/SPIRV/CMakeLists.txt | 23 +- lib/SPIRV/OCL20ToSPIRV.cpp | 120 ++++++ lib/SPIRV/OCLUtil.cpp | 13 + lib/SPIRV/OCLUtil.h | 12 +- lib/SPIRV/SPIRVInternal.h | 4 + lib/SPIRV/SPIRVLowerOCLBlocks.cpp | 303 +++++++++++++++ lib/SPIRV/SPIRVReader.cpp | 257 ++++--------- lib/SPIRV/SPIRVUtil.cpp | 26 ++ lib/SPIRV/SPIRVWriter.cpp | 15 +- test/EnqueueEmptyKernel.ll | 80 ++-- test/global_block.ll | 88 +++-- test/literal-struct.ll | 60 +++ test/transcoding/device_execution.ll | 112 ------ .../device_execution_multiple_blocks.ll | 153 -------- .../device_execution_overloading.ll | 203 ---------- .../device_execution_simple_local_memory.ll | 137 ------- test/transcoding/device_execution_vaargs.ll | 231 ------------ test/transcoding/enqueue_kernel.ll | 355 ++++++++++++++++++ test/transcoding/kernel_query.ll | 212 +++++++++++ 20 files changed, 1300 insertions(+), 1108 deletions(-) create mode 100644 lib/SPIRV/SPIRVLowerOCLBlocks.cpp create mode 100644 test/literal-struct.ll delete mode 100644 test/transcoding/device_execution.ll delete mode 100644 test/transcoding/device_execution_multiple_blocks.ll delete mode 100644 test/transcoding/device_execution_overloading.ll delete mode 100644 test/transcoding/device_execution_simple_local_memory.ll delete mode 100644 test/transcoding/device_execution_vaargs.ll create mode 100644 test/transcoding/enqueue_kernel.ll create mode 100644 test/transcoding/kernel_query.ll diff --git a/include/LLVMSPIRVLib.h b/include/LLVMSPIRVLib.h index eb67de3b77..b39c4bdce6 100644 --- a/include/LLVMSPIRVLib.h +++ b/include/LLVMSPIRVLib.h @@ -56,6 +56,7 @@ void initializeOCLTypeToSPIRVPass(PassRegistry &); void initializeSPIRVLowerBoolPass(PassRegistry &); void initializeSPIRVLowerConstExprPass(PassRegistry &); void initializeSPIRVLowerSPIRBlocksPass(PassRegistry &); +void initializeSPIRVLowerOCLBlocksPass(PassRegistry &); void initializeSPIRVLowerMemmovePass(PassRegistry &); void initializeSPIRVRegularizeLLVMPass(PassRegistry &); void initializeSPIRVToOCL20Pass(PassRegistry &); @@ -136,6 +137,9 @@ ModulePass *createSPIRVLowerConstExpr(); /// Create a pass for lowering SPIR 2.0 blocks to functions calls. ModulePass *createSPIRVLowerSPIRBlocks(); +/// Create a pass for removing function pointers related to OCL 2.0 blocks +ModulePass *createSPIRVLowerOCLBlocks(); + /// Create a pass for lowering llvm.memmove to llvm.memcpys with a temporary /// variable. ModulePass *createSPIRVLowerMemmove(); diff --git a/lib/SPIRV/CMakeLists.txt b/lib/SPIRV/CMakeLists.txt index e71c80464e..9fca450647 100644 --- a/lib/SPIRV/CMakeLists.txt +++ b/lib/SPIRV/CMakeLists.txt @@ -4,16 +4,6 @@ if(SPIRV_USE_LLVM_API) endif(SPIRV_USE_LLVM_API) add_llvm_library(LLVMSPIRVLib - libSPIRV/SPIRVBasicBlock.cpp - libSPIRV/SPIRVDebug.cpp - libSPIRV/SPIRVDecorate.cpp - libSPIRV/SPIRVEntry.cpp - libSPIRV/SPIRVFunction.cpp - libSPIRV/SPIRVInstruction.cpp - libSPIRV/SPIRVModule.cpp - libSPIRV/SPIRVStream.cpp - libSPIRV/SPIRVType.cpp - libSPIRV/SPIRVValue.cpp Mangler/FunctionDescriptor.cpp Mangler/Mangler.cpp Mangler/ManglingUtils.cpp @@ -25,8 +15,9 @@ add_llvm_library(LLVMSPIRVLib OCLUtil.cpp SPIRVLowerBool.cpp SPIRVLowerConstExpr.cpp - SPIRVLowerSPIRBlocks.cpp SPIRVLowerMemmove.cpp + SPIRVLowerOCLBlocks.cpp + SPIRVLowerSPIRBlocks.cpp SPIRVReader.cpp SPIRVRegularizeLLVM.cpp SPIRVToOCL20.cpp @@ -34,6 +25,16 @@ add_llvm_library(LLVMSPIRVLib SPIRVWriter.cpp SPIRVWriterPass.cpp TransOCLMD.cpp + libSPIRV/SPIRVBasicBlock.cpp + libSPIRV/SPIRVDebug.cpp + libSPIRV/SPIRVDecorate.cpp + libSPIRV/SPIRVEntry.cpp + libSPIRV/SPIRVFunction.cpp + libSPIRV/SPIRVInstruction.cpp + libSPIRV/SPIRVModule.cpp + libSPIRV/SPIRVStream.cpp + libSPIRV/SPIRVType.cpp + libSPIRV/SPIRVValue.cpp LINK_COMPONENTS Analysis BitWriter diff --git a/lib/SPIRV/OCL20ToSPIRV.cpp b/lib/SPIRV/OCL20ToSPIRV.cpp index a8b8e8c7b0..006ced61ac 100644 --- a/lib/SPIRV/OCL20ToSPIRV.cpp +++ b/lib/SPIRV/OCL20ToSPIRV.cpp @@ -42,6 +42,7 @@ #include "SPIRVInternal.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instruction.h" @@ -259,6 +260,12 @@ class OCL20ToSPIRV : public ModulePass, public InstVisitor { const std::string &DemangledName, unsigned int Offset); + /// Transform enqueue_kernel and kernel query built-in functions to + /// spirv-friendly format filling arguments, required for device-side enqueue + /// instructions, but missed in the original call + void visitCallEnqueueKernel(CallInst *CI, const std::string &DemangledName); + void visitCallKernelQuery(CallInst *CI, const std::string &DemangledName); + /// For cl_intel_subgroups block read built-ins: void visitSubgroupBlockReadINTEL(CallInst *CI, StringRef MangledName, const std::string &DemangledName); @@ -351,6 +358,7 @@ bool OCL20ToSPIRV::runOnModule(Module &Module) { if (auto GV = dyn_cast(I)) GV->eraseFromParent(); + eraseUselessFunctions(M); // remove unused functions declarations LLVM_DEBUG(dbgs() << "After OCL20ToSPIRV:\n" << *M); std::string Err; @@ -512,6 +520,14 @@ void OCL20ToSPIRV::visitCallInst(CallInst &CI) { OCLImageChannelOrderOffset); return; } + if (isEnqueueKernelBI(MangledName)) { + visitCallEnqueueKernel(&CI, DemangledName); + return; + } + if (isKernelQueryBI(MangledName)) { + visitCallKernelQuery(&CI, DemangledName); + return; + } if (DemangledName.find(kOCLBuiltinName::SubgroupBlockReadINTELPrefix) == 0) { visitSubgroupBlockReadINTEL(&CI, MangledName, DemangledName); return; @@ -1449,6 +1465,110 @@ void OCL20ToSPIRV::visitCallGetImageChannel(CallInst *CI, StringRef MangledName, }, &Attrs); } +void OCL20ToSPIRV::visitCallEnqueueKernel(CallInst *CI, + const std::string &DemangledName) { + const DataLayout &DL = M->getDataLayout(); + bool HasEvents = DemangledName.find("events") != std::string::npos; + + // SPIRV OpEnqueueKernel instruction has 10+ arguments. + SmallVector Args; + + // Copy all arguments before block invoke function pointer + // which match with what Clang 6.0 produced + const unsigned BlockFIdx = HasEvents ? 6 : 3; + Args.assign(CI->arg_begin(), CI->arg_begin() + BlockFIdx); + + // If no event arguments in original call, add dummy ones + if (!HasEvents) { + Args.push_back(getInt32(M, 0)); // dummy num events + Args.push_back(getOCLNullClkEventPtr(M)); // dummy wait events + Args.push_back(getOCLNullClkEventPtr(M)); // dummy ret event + } + + // Invoke: Pointer to invoke function + Value *BlockFunc = CI->getArgOperand(BlockFIdx); + Args.push_back(cast(GetUnderlyingObject(BlockFunc, DL))); + + // Param: Pointer to block literal + Value *BlockLiteral = CI->getArgOperand(BlockFIdx + 1); + Args.push_back(BlockLiteral); + + // Param Size: Size of block literal structure + // Param Aligment: Aligment of block literal structure + // TODO: these numbers should be obtained from block literal structure + Type *ParamType = GetUnderlyingObject(BlockLiteral, DL)->getType(); + if (PointerType *PT = dyn_cast(ParamType)) + ParamType = PT->getElementType(); + Args.push_back(getInt32(M, DL.getTypeStoreSize(ParamType))); + Args.push_back(getInt32(M, DL.getPrefTypeAlignment(ParamType))); + + // Local sizes arguments: Sizes of block invoke arguments + // Clang 6.0 and higher generates local size operands as an array, + // so we need to unpack them + if (DemangledName.find("_varargs") != std::string::npos) { + const unsigned LocalSizeArrayIdx = HasEvents ? 9 : 6; + auto *LocalSizeArray = + cast(CI->getArgOperand(LocalSizeArrayIdx)); + auto *LocalSizeArrayTy = + cast(LocalSizeArray->getSourceElementType()); + const uint64_t LocalSizeNum = LocalSizeArrayTy->getNumElements(); + for (unsigned I = 0; I < LocalSizeNum; ++I) + Args.push_back(GetElementPtrInst::Create( + LocalSizeArray->getSourceElementType(), // Pointee type + LocalSizeArray->getPointerOperand(), // Alloca + {getInt32(M, 0), getInt32(M, I)}, // Indices + "", CI)); + } + + StringRef NewName = "__spirv_EnqueueKernel__"; + FunctionType *FT = + FunctionType::get(CI->getType(), getTypes(Args), false /*isVarArg*/); + Function *NewF = + Function::Create(FT, GlobalValue::ExternalLinkage, NewName, M); + NewF->setCallingConv(CallingConv::SPIR_FUNC); + CallInst *NewCall = CallInst::Create(NewF, Args, "", CI); + NewCall->setCallingConv(NewF->getCallingConv()); + CI->replaceAllUsesWith(NewCall); + CI->eraseFromParent(); +} + +void OCL20ToSPIRV::visitCallKernelQuery(CallInst *CI, + const std::string &DemangledName) { + const DataLayout &DL = M->getDataLayout(); + bool HasNDRange = + DemangledName.find("_for_ndrange_impl") != std::string::npos; + // BIs with "_for_ndrange_impl" suffix has NDRange argument first, and + // Invoke argument following. For other BIs Invoke function is the first arg + const unsigned BlockFIdx = HasNDRange ? 1 : 0; + Value *BlockFVal = CI->getArgOperand(BlockFIdx)->stripPointerCasts(); + + auto *BlockF = cast(GetUnderlyingObject(BlockFVal, DL)); + + AttributeList Attrs = CI->getCalledFunction()->getAttributes(); + mutateCallInst(M, CI, + [=](CallInst *CI, std::vector &Args) { + Value *Param = *Args.rbegin(); + Type *ParamType = GetUnderlyingObject(Param, DL)->getType(); + if (PointerType *PT = dyn_cast(ParamType)) { + ParamType = PT->getElementType(); + } + // Last arg corresponds to SPIRV Param operand. + // Insert Invoke in front of Param. + // Add Param Size and Param Align at the end. + Args[BlockFIdx] = BlockF; + Args.push_back(getInt32(M, DL.getTypeStoreSize(ParamType))); + Args.push_back( + getInt32(M, DL.getPrefTypeAlignment(ParamType))); + + Op Opcode = OCLSPIRVBuiltinMap::map(DemangledName); + // Adding "__" postfix, so in case we have multiple such + // functions and their names will have numerical postfix, + // then the numerical postfix will be droped and we will get + // correct function name. + return getSPIRVFuncName(Opcode, kSPIRVName::Postfix); + }, + /*BuiltinFuncMangleInfo*/ nullptr, &Attrs); +} // The intel_sub_group_block_read built-ins are overloaded to support both // buffers and images, but need to be mapped to distinct SPIR-V instructions. diff --git a/lib/SPIRV/OCLUtil.cpp b/lib/SPIRV/OCLUtil.cpp index 83e2182763..4be0ce766d 100644 --- a/lib/SPIRV/OCLUtil.cpp +++ b/lib/SPIRV/OCLUtil.cpp @@ -605,6 +605,19 @@ bool isSpecialTypeInitializer(Instruction *Inst) { return isSamplerInitializer(Inst) || isPipeStorageInitializer(Inst); } +bool isEnqueueKernelBI(const StringRef MangledName) { + return MangledName == "__enqueue_kernel_basic" || + MangledName == "__enqueue_kernel_basic_events" || + MangledName == "__enqueue_kernel_varargs" || + MangledName == "__enqueue_kernel_events_varargs"; +} + +bool isKernelQueryBI(const StringRef MangledName) { + return MangledName == "__get_kernel_work_group_size_impl" || + MangledName == "__get_kernel_sub_group_count_for_ndrange_impl" || + MangledName == "__get_kernel_max_sub_group_size_for_ndrange_impl" || + MangledName == "__get_kernel_preferred_work_group_size_multiple_impl"; +} } // namespace OCLUtil void llvm::mangleOpenClBuiltin(const std::string &UniqName, diff --git a/lib/SPIRV/OCLUtil.h b/lib/SPIRV/OCLUtil.h index 4ac4b5aa01..e67705e280 100644 --- a/lib/SPIRV/OCLUtil.h +++ b/lib/SPIRV/OCLUtil.h @@ -365,6 +365,9 @@ bool isPipeStorageInitializer(Instruction *Inst); /// Check (isSamplerInitializer || isPipeStorageInitializer) bool isSpecialTypeInitializer(Instruction *Inst); +bool isEnqueueKernelBI(const StringRef MangledName); +bool isKernelQueryBI(const StringRef MangledName); + } // namespace OCLUtil /////////////////////////////////////////////////////////////////////////////// @@ -542,11 +545,12 @@ template <> inline void SPIRVMap::init() { // CL 2.0 kernel enqueue builtins _SPIRV_OP(enqueue_marker, EnqueueMarker) _SPIRV_OP(enqueue_kernel, EnqueueKernel) - _SPIRV_OP(get_kernel_ndrange_subgroup_count, GetKernelNDrangeSubGroupCount) - _SPIRV_OP(get_kernel_ndrange_max_subgroup_count, + _SPIRV_OP(get_kernel_sub_group_count_for_ndrange_impl, + GetKernelNDrangeSubGroupCount) + _SPIRV_OP(get_kernel_max_sub_group_size_for_ndrange_impl, GetKernelNDrangeMaxSubGroupSize) - _SPIRV_OP(get_kernel_work_group_size, GetKernelWorkGroupSize) - _SPIRV_OP(get_kernel_preferred_work_group_size_multiple, + _SPIRV_OP(get_kernel_work_group_size_impl, GetKernelWorkGroupSize) + _SPIRV_OP(get_kernel_preferred_work_group_size_multiple_impl, GetKernelPreferredWorkGroupSizeMultiple) _SPIRV_OP(retain_event, RetainEvent) _SPIRV_OP(release_event, ReleaseEvent) diff --git a/lib/SPIRV/SPIRVInternal.h b/lib/SPIRV/SPIRVInternal.h index 99962b0008..b0a8d6c198 100644 --- a/lib/SPIRV/SPIRVInternal.h +++ b/lib/SPIRV/SPIRVInternal.h @@ -533,6 +533,10 @@ Function *getOrCreateFunction(Module *M, Type *RetTy, ArrayRef ArgTypes, AttributeList *Attrs = nullptr, bool TakeName = true); +PointerType *getOCLClkEventType(Module *M); +PointerType *getOCLClkEventPtrType(Module *M); +Constant *getOCLNullClkEventPtr(Module *M); + /// Get function call arguments. /// \param Start Starting index. /// \param End Ending index. diff --git a/lib/SPIRV/SPIRVLowerOCLBlocks.cpp b/lib/SPIRV/SPIRVLowerOCLBlocks.cpp new file mode 100644 index 0000000000..50e183826b --- /dev/null +++ b/lib/SPIRV/SPIRVLowerOCLBlocks.cpp @@ -0,0 +1,303 @@ +//===- SPIRVLowerOCLBlocks.cpp - OCL Utilities ----------------------------===// +// +// The LLVM/SPIRV Translator +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// Copyright (c) 2018 Intel Corporation. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimers in the documentation +// and/or other materials provided with the distribution. +// Neither the names of Intel Corporation, nor the names of its +// contributors may be used to endorse or promote products derived from this +// Software without specific prior written permission. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH +// THE SOFTWARE. +// +//===----------------------------------------------------------------------===// +// +// SPIR-V specification doesn't allow function pointers, so SPIR-V translator +// is designed to fail if a value with function type (except calls) is occured. +// Currently there is only two cases, when function pointers are generating in +// LLVM IR in OpenCL - block calls and device side enqueue built-in calls. +// +// In both cases values with function type used as intermediate representation +// for block literal structure. +// +// This pass is designed to find such cases and simplify them to avoid any +// function pointer types occurrences in LLVM IR in 4 steps. +// +// 1. Find all function pointer allocas, like +// %block = alloca void () * +// +// Then find a single store to that alloca: +// %blockLit = alloca <{ i32, i32, ...}>, align 4 +// %0 = bitcast <{ i32, i32, ... }>* %blockLit to void ()* +// > store void ()* %0, void ()** %block, align 4 +// +// And replace the alloca users by new instructions which used stored value +// %blockLit itself instead of function pointer alloca %block. +// +// 2. Find consecutive casts from block literal type to i8 addrspace(4)* +// used function pointers as an intermediate type: +// %0 = bitcast <{ i32, i32 }> %block to void() * +// %1 = addrspacecast void() * %0 to i8 addrspace(4)* +// And simplify them: +// %2 = addrspacecast <{ i32, i32 }> %block to i8 addrspace(4)* +// +// 3. Find all unused instructions with function pointer type occured after +// pp.1-2 and remove them. +// +// 4. Find unused globals with function pointer type, like +// @block = constant void ()* +// bitcast ({ i32, i32 }* @__block_literal_global to void ()* +// +// And remove them. +// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "spv-lower-ocl-blocks" + +#include "OCLUtil.h" +#include "SPIRVInternal.h" + +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/PassSupport.h" +#include "llvm/Support/Casting.h" + +using namespace llvm; + +namespace { + +static void +removeUnusedFunctionPtrInst(Instruction *I, + SmallSetVector &FuncPtrInsts) { + for (unsigned OpIdx = 0, Ops = I->getNumOperands(); OpIdx != Ops; ++OpIdx) { + Instruction *OpI = dyn_cast(I->getOperand(OpIdx)); + I->setOperand(OpIdx, nullptr); + if (OpI && OpI != I && OpI->user_empty()) + FuncPtrInsts.insert(OpI); + } + I->eraseFromParent(); +} + +static bool isFuncPtrAlloca(const AllocaInst *AI) { + auto *ET = dyn_cast(AI->getAllocatedType()); + return ET && ET->getElementType()->isFunctionTy(); +} + +static bool hasFuncPtrType(const Value *V) { + auto *PT = dyn_cast(V->getType()); + return PT && PT->getElementType()->isFunctionTy(); +} + +static bool isFuncPtrInst(const Instruction *I) { + if (auto *AI = dyn_cast(I)) + return isFuncPtrAlloca(AI); + + for (auto &Op : I->operands()) { + if (auto *AI = dyn_cast(Op)) + return isFuncPtrAlloca(AI); + + auto *OpI = dyn_cast(&Op); + if (OpI && OpI != I && hasFuncPtrType(OpI)) + return true; + } + return false; +} + +static StoreInst *findSingleStore(AllocaInst *AI) { + StoreInst *Store = nullptr; + for (auto *U : AI->users()) { + if (!isa(U)) + continue; // not a store + if (Store) + return nullptr; // there are more than one stores + Store = dyn_cast(U); + } + return Store; +} + +static void fixFunctionPtrAllocaUsers(AllocaInst *AI) { + // Find and remove a single store to alloca + auto *SingleStore = findSingleStore(AI); + assert(SingleStore && "More than one store to the function pointer alloca"); + auto *StoredVal = SingleStore->getValueOperand(); + SingleStore->eraseFromParent(); + + // Find loads from the alloca and replace thier users + for (auto *U : AI->users()) { + auto *LI = dyn_cast(U); + if (!LI) + continue; + + for (auto *U : LI->users()) { + auto *UInst = cast(U); + auto *Cast = CastInst::CreatePointerBitCastOrAddrSpaceCast( + StoredVal, UInst->getType(), "", UInst); + UInst->replaceAllUsesWith(Cast); + } + } +} + +static int getBlockLiteralIdx(const Function &F) { + StringRef FName = F.getName(); + if (isEnqueueKernelBI(FName)) + return FName.contains("events") ? 7 : 4; + if (isKernelQueryBI(FName)) + return FName.contains("for_ndrange") ? 2 : 1; + if (FName.startswith("__") && FName.contains("_block_invoke")) + return F.hasStructRetAttr() ? 1 : 0; + + return -1; // No block literal argument +} + +static bool hasBlockLiteralArg(const Function &F) { + return getBlockLiteralIdx(F) != -1; +} + +static bool simplifyFunctionPtrCasts(Function &F) { + bool Changed = false; + int BlockLiteralIdx = getBlockLiteralIdx(F); + for (auto *U : F.users()) { + auto *Call = dyn_cast(U); + if (!Call) + continue; + if (Call->getFunction()->getName() == F.getName().str() + "_kernel") + continue; // Skip block invoke function calls inside block invoke kernels + + const DataLayout &DL = F.getParent()->getDataLayout(); + auto *BlockLiteral = Call->getOperand(BlockLiteralIdx); + auto *BlockLiteralVal = GetUnderlyingObject(BlockLiteral, DL); + if (isa(BlockLiteralVal)) + continue; // nothing to do with globals + + auto *BlockLiteralAlloca = cast(BlockLiteralVal); + assert(!BlockLiteralAlloca->getAllocatedType()->isFunctionTy() && + "Function type shouldn't be there"); + + auto *NewBlockLiteral = CastInst::CreatePointerBitCastOrAddrSpaceCast( + BlockLiteralAlloca, BlockLiteral->getType(), "", Call); + BlockLiteral->replaceAllUsesWith(NewBlockLiteral); + Changed |= true; + } + return Changed; +} + +static void +findFunctionPtrAllocas(Module &M, + SmallVectorImpl &FuncPtrAllocas) { + for (auto &F : M) { + if (F.isDeclaration()) + continue; + for (auto &I : instructions(F)) { + auto *AI = dyn_cast(&I); + if (!AI || !isFuncPtrAlloca(AI)) + continue; + FuncPtrAllocas.push_back(AI); + } + } +} + +static void +findUnusedFunctionPtrInsts(Module &M, + SmallSetVector &FuncPtrInsts) { + for (auto &F : M) { + if (F.isDeclaration()) + continue; + for (auto &I : instructions(F)) + if (I.user_empty() && isFuncPtrInst(&I)) + FuncPtrInsts.insert(&I); + } +} + +static void +findUnusedFunctionPtrGlbs(Module &M, + SmallVectorImpl &FuncPtrGlbs) { + for (auto &GV : M.globals()) { + if (!GV.user_empty()) + continue; + auto *GVType = dyn_cast(GV.getType()->getElementType()); + if (GVType && GVType->getElementType()->isFunctionTy()) + FuncPtrGlbs.push_back(&GV); + } +} + +class SPIRVLowerOCLBlocks : public ModulePass { + +public: + SPIRVLowerOCLBlocks() : ModulePass(ID) {} + + bool runOnModule(Module &M) { + bool Changed = false; + + // 1. Find function pointer allocas and fix their users + SmallVector FuncPtrAllocas; + findFunctionPtrAllocas(M, FuncPtrAllocas); + + Changed |= !FuncPtrAllocas.empty(); + for (auto *AI : FuncPtrAllocas) + fixFunctionPtrAllocaUsers(AI); + + // 2. Simplify consecutive casts which use function pointer types + for (auto &F : M) + if (hasBlockLiteralArg(F)) + Changed |= simplifyFunctionPtrCasts(F); + + // 3. Cleanup unused instructions with function pointer type + // which are occured after pp. 1-2 + SmallSetVector FuncPtrInsts; + findUnusedFunctionPtrInsts(M, FuncPtrInsts); + + Changed |= !FuncPtrInsts.empty(); + while (!FuncPtrInsts.empty()) { + Instruction *I = FuncPtrInsts.pop_back_val(); + removeUnusedFunctionPtrInst(I, FuncPtrInsts); + } + + // 4. Find and remove unused global variables with function pointer type + SmallVector FuncPtrGlbs; + findUnusedFunctionPtrGlbs(M, FuncPtrGlbs); + + Changed |= !FuncPtrGlbs.empty(); + for (auto *GV : FuncPtrGlbs) + GV->eraseFromParent(); + + return Changed; + } + + static char ID; +}; // class SPIRVLowerOCLBlocks + +char SPIRVLowerOCLBlocks::ID = 0; + +} // namespace + +INITIALIZE_PASS( + SPIRVLowerOCLBlocks, "spv-lower-ocl-blocks", + "Remove function pointers occured in case of using OpenCL blocks", false, + false) + +llvm::ModulePass *llvm::createSPIRVLowerOCLBlocks() { + return new SPIRVLowerOCLBlocks(); +} diff --git a/lib/SPIRV/SPIRVReader.cpp b/lib/SPIRV/SPIRVReader.cpp index 6799b2d466..ad59f645c3 100644 --- a/lib/SPIRV/SPIRVReader.cpp +++ b/lib/SPIRV/SPIRVReader.cpp @@ -295,17 +295,16 @@ class SPIRVToLLVM { Value *transValueWithoutDecoration(SPIRVValue *, Function *F, BasicBlock *, bool CreatePlaceHolder = true); Value *transDeviceEvent(SPIRVValue *BV, Function *F, BasicBlock *BB); - Value *transEnqueuedBlock(SPIRVValue *BF, SPIRVValue *BC, SPIRVValue *BCSize, - SPIRVValue *BCAligment, Function *F, - BasicBlock *BB); bool transDecoration(SPIRVValue *, Value *); bool transAlign(SPIRVValue *, Value *); Instruction *transOCLBuiltinFromExtInst(SPIRVExtInst *BC, BasicBlock *BB); std::vector transValue(const std::vector &, Function *F, BasicBlock *); Function *transFunction(SPIRVFunction *F); + Value *transBlockInvoke(SPIRVValue *Invoke, BasicBlock *BB); Instruction *transEnqueueKernelBI(SPIRVInstruction *BI, BasicBlock *BB); - Instruction *transWGSizeBI(SPIRVInstruction *BI, BasicBlock *BB); + Instruction *transWGSizeQueryBI(SPIRVInstruction *BI, BasicBlock *BB); + Instruction *transSGSizeQueryBI(SPIRVInstruction *BI, BasicBlock *BB); bool transFPContractMetadata(); bool transKernelMetadata(); bool transNonTemporalMetadata(Instruction *I); @@ -1893,7 +1892,12 @@ Value *SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *BV, Function *F, BV, transEnqueueKernelBI(static_cast(BV), BB)); case OpGetKernelWorkGroupSize: case OpGetKernelPreferredWorkGroupSizeMultiple: - return mapValue(BV, transWGSizeBI(static_cast(BV), BB)); + return mapValue( + BV, transWGSizeQueryBI(static_cast(BV), BB)); + case OpGetKernelNDrangeMaxSubGroupSize: + case OpGetKernelNDrangeSubGroupCount: + return mapValue( + BV, transSGSizeQueryBI(static_cast(BV), BB)); default: { auto OC = BV->getOpCode(); if (isSPIRVCmpInstTransToLLVMInst(static_cast(BV))) { @@ -2063,165 +2067,25 @@ SPIRVToLLVM::transOCLBuiltinPostproc(SPIRVInstruction *BI, CallInst *CI, return CI; } -static void adaptBlockInvoke(Function *Invoke, Type *BlockStructTy) { - // As first argument block invoke takes a pointer to captured data. - // We pass to block invoke whole block structure, not only captured data - // as it expected. So we need to update original function to unpack expected - // captured data and use it instead of an original argument - // - // %block = bitcast i8 addrspace(4)* to <{ ..., [X x i8] }> addrspace(4)* - // %block.1 = addrspacecast %block to <{ ..., [X x i8] }>* - // %captured = getelementptr <{ ..., [X x i8] }>, i32 0, i32 5 - // %captured.1 = bitcast %captured to i8* - - BasicBlock *BB = &(Invoke->getEntryBlock()); - BB->splitBasicBlock(BB->begin(), "invoke"); - auto FirstArg = &*(Invoke->arg_begin()); - IRBuilder<> Builder(BB, BB->begin()); - - auto FirstArgTy = dyn_cast(FirstArg->getType()); - assert(FirstArgTy && "Expects that first argument of invoke is a pointer"); - unsigned FirstArgAS = FirstArgTy->getAddressSpace(); - - auto Int8PtrTy = - Type::getInt8PtrTy(Invoke->getParent()->getContext(), FirstArgAS); - auto BlockStructPtrTy = PointerType::get(BlockStructTy, FirstArgAS); - - auto Int32Ty = Type::getInt32Ty(Invoke->getParent()->getContext()); - Value *CapturedGEPIndices[2] = {ConstantInt::get(Int32Ty, 0), - ConstantInt::get(Int32Ty, 5)}; - auto BlockToStructCast = - Builder.CreateBitCast(FirstArg, BlockStructPtrTy, "block"); - auto CapturedGEP = Builder.CreateGEP(BlockToStructCast, CapturedGEPIndices); - auto CapturedToInt8Cast = Builder.CreateBitCast(CapturedGEP, Int8PtrTy); - - FirstArg->replaceUsesOutsideBlock(CapturedToInt8Cast, BB); -} - -static Type *getOrCreateBlockDescTy(Module *M) { - // Get or create block descriptor type which contains block size - // in the last element: %struct.__block_descriptor = type { i64, i64 } - auto BlockDescTy = M->getTypeByName("struct.__block_descriptor"); - if (BlockDescTy) - return BlockDescTy; - - auto Int64Ty = Type::getInt64Ty(M->getContext()); - Type *BlockDescElements[2] = {/*Reserved*/ Int64Ty, /*Block size*/ Int64Ty}; - return StructType::create(M->getContext(), BlockDescElements, - "struct.__block_descriptor"); -} - -Value *SPIRVToLLVM::transEnqueuedBlock(SPIRVValue *SInvoke, - SPIRVValue *SCaptured, - SPIRVValue *SCaptSize, - SPIRVValue *SCaptAlignment, - Function *LBI, BasicBlock *LBB) { - // Search if that block have been already translated - auto Loc = BlockMap.find(SInvoke); - if (Loc != BlockMap.end()) - return Loc->second; - - IRBuilder<> Builder(LBB); - const DataLayout &DL = M->getDataLayout(); - - // Translate block and its arguments from SPIRV values to LLVM - auto LInvoke = transFunction(static_cast(SInvoke)); - auto LCaptured = transValue(SCaptured, LBI, LBB, false); - auto LCaptSize = - dyn_cast(transValue(SCaptSize, LBI, LBB, false)); - auto LCaptAlignment = - dyn_cast(transValue(SCaptAlignment, LBI, LBB, false)); - - // Create basic types - auto Int8Ty = Type::getInt8Ty(*Context); - auto Int32Ty = Type::getInt32Ty(*Context); - auto Int8PtrTy = Type::getInt8PtrTy(*Context, SPIRAS_Private); - auto Int8PtrTyGen = Type::getInt8PtrTy(*Context, SPIRAS_Generic); - auto BlockDescTy = getOrCreateBlockDescTy(M); - auto BlockDescPtrTy = BlockDescTy->getPointerTo(SPIRAS_Private); - - // Create a block as structure: - // <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }> - SmallVector BlockEls = { - /*isa*/ Int8PtrTy, /*flags*/ Int32Ty, /*reserved*/ Int32Ty, - /*invoke*/ Int8PtrTy, /*block_descriptor*/ BlockDescPtrTy}; - - // Add captured if any - // <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [X x i8] }> - // Note: captured data stored in structure as array of char - if (LCaptSize->getZExtValue() > 0) - BlockEls.push_back(ArrayType::get(Int8Ty, LCaptSize->getZExtValue())); - - auto BlockTy = StructType::get(*Context, BlockEls, /*isPacked*/ true); - - // Allocate block on the stack, then store data to it - auto BlockAlloca = Builder.CreateAlloca(BlockTy, nullptr, "block"); - BlockAlloca->setAlignment(DL.getPrefTypeAlignment(BlockTy)); - - auto GetIndices = [Int32Ty](int A, int B) -> SmallVector { - return {ConstantInt::get(Int32Ty, A), ConstantInt::get(Int32Ty, B)}; - }; - - // 1. isa, flags and reserved fields isn't used in current implementation - // Fill them the same way as clang does - auto IsaGEP = Builder.CreateGEP(BlockAlloca, GetIndices(0, 0)); - Builder.CreateStore(ConstantPointerNull::get(Int8PtrTy), IsaGEP); - auto FlagsGEP = Builder.CreateGEP(BlockAlloca, GetIndices(0, 1)); - Builder.CreateStore(ConstantInt::get(Int32Ty, 1342177280), FlagsGEP); - auto ReservedGEP = Builder.CreateGEP(BlockAlloca, GetIndices(0, 2)); - Builder.CreateStore(ConstantInt::get(Int32Ty, 0), ReservedGEP); - - // 2. Store pointer to block invoke to the structure - auto InvokeCast = Builder.CreateBitCast(LInvoke, Int8PtrTy, "invoke"); - auto InvokeGEP = Builder.CreateGEP(BlockAlloca, GetIndices(0, 3)); - Builder.CreateStore(InvokeCast, InvokeGEP); - - // 3. Create and store a pointer to the block descriptor global value - uint64_t SizeOfBlock = DL.getTypeAllocSize(BlockTy); - - auto Int64Ty = Type::getInt64Ty(*Context); - Constant *BlockDescEls[2] = {ConstantInt::get(Int64Ty, 0), - ConstantInt::get(Int64Ty, SizeOfBlock)}; - auto BlockDesc = - ConstantStruct::get(dyn_cast(BlockDescTy), BlockDescEls); - - auto BlockDescGV = - new GlobalVariable(*M, BlockDescTy, true, GlobalValue::InternalLinkage, - BlockDesc, "__block_descriptor_spirv"); - auto BlockDescGEP = - Builder.CreateGEP(BlockAlloca, GetIndices(0, 4), "block.descriptor"); - Builder.CreateStore(BlockDescGV, BlockDescGEP); - - // 4. Copy captured data to the structure - if (LCaptSize->getZExtValue() > 0) { - auto CapturedGEP = - Builder.CreateGEP(BlockAlloca, GetIndices(0, 5), "block.captured"); - auto CapturedGEPCast = Builder.CreateBitCast(CapturedGEP, Int8PtrTy); - - // We can't make any guesses about type of captured data, so - // let's copy it through memcpy - Builder.CreateMemCpy(CapturedGEPCast, LCaptAlignment->getZExtValue(), - LCaptured, LCaptAlignment->getZExtValue(), LCaptSize, - SCaptured->isVolatile()); - - // Fix invoke function to correctly process its first argument - adaptBlockInvoke(LInvoke, BlockTy); - } - auto BlockCast = Builder.CreateBitCast(BlockAlloca, Int8PtrTy); - auto BlockCastGen = Builder.CreateAddrSpaceCast(BlockCast, Int8PtrTyGen); - BlockMap[SInvoke] = BlockCastGen; - return BlockCastGen; +Value *SPIRVToLLVM::transBlockInvoke(SPIRVValue *Invoke, BasicBlock *BB) { + auto *TranslatedInvoke = transFunction(static_cast(Invoke)); + auto *Int8PtrTyGen = Type::getInt8PtrTy(*Context, SPIRAS_Generic); + return CastInst::CreatePointerBitCastOrAddrSpaceCast(TranslatedInvoke, + Int8PtrTyGen, "", BB); } Instruction *SPIRVToLLVM::transEnqueueKernelBI(SPIRVInstruction *BI, BasicBlock *BB) { - Type *IntTy = Type::getInt32Ty(*Context); + Type *Int32Ty = Type::getInt32Ty(*Context); + Type *Int64Ty = Type::getInt64Ty(*Context); + Type *IntTy = + M->getDataLayout().getPointerSizeInBits(0) == 32 ? Int32Ty : Int64Ty; // Find or create enqueue kernel BI declaration auto Ops = BI->getOperands(); bool HasVaargs = Ops.size() > 10; - std::string FName = HasVaargs ? "__enqueue_kernel_events_vaargs" + std::string FName = HasVaargs ? "__enqueue_kernel_events_varargs" : "__enqueue_kernel_basic_events"; Function *F = M->getFunction(FName); if (!F) { @@ -2231,17 +2095,22 @@ Instruction *SPIRVToLLVM::transEnqueueKernelBI(SPIRVInstruction *BI, SmallVector Tys = { transType(Ops[0]->getType()), // queue - IntTy, // flags + Int32Ty, // flags transType(Ops[2]->getType()), // ndrange - IntTy, + Int32Ty, EventTy, - EventTy, // events - Type::getInt8PtrTy(*Context, SPIRAS_Generic) // block + EventTy, // events + Type::getInt8PtrTy(*Context, SPIRAS_Generic), // block_invoke + Type::getInt8PtrTy(*Context, SPIRAS_Generic) // block_literal }; - if (HasVaargs) - Tys.push_back(IntTy); // Number of variadics if any + if (HasVaargs) { + // Number of block invoke arguments (local arguments) + Tys.push_back(Int32Ty); + // Array of sizes of block invoke arguments + Tys.push_back(PointerType::get(IntTy, SPIRAS_Private)); + } - FunctionType *FT = FunctionType::get(IntTy, Tys, HasVaargs); + FunctionType *FT = FunctionType::get(Int32Ty, Tys, false); F = Function::Create(FT, GlobalValue::ExternalLinkage, FName, M); if (isFuncNoUnwind()) F->addFnAttr(Attribute::NoUnwind); @@ -2255,14 +2124,18 @@ Instruction *SPIRVToLLVM::transEnqueueKernelBI(SPIRVInstruction *BI, transValue(Ops[3], F, BB, false), // events number transDeviceEvent(Ops[4], F, BB), // event_wait_list transDeviceEvent(Ops[5], F, BB), // event_ret - transEnqueuedBlock(Ops[6], Ops[7], Ops[8], Ops[9], F, BB) // block + transBlockInvoke(Ops[6], BB), // block_invoke + transValue(Ops[7], F, BB, false) // block_literal }; if (HasVaargs) { - Args.push_back( - ConstantInt::get(IntTy, Ops.size() - 10)); // Number of vaargs - for (unsigned I = 10; I < Ops.size(); ++I) - Args.push_back(transValue(Ops[I], F, BB, false)); + // Number of local arguments + Args.push_back(ConstantInt::get(Int32Ty, Ops.size() - 10)); + // GEP to array of sizes of local arguments + if (Ops[10]->getOpCode() == OpPtrAccessChain) + Args.push_back(transValue(Ops[10], F, BB, false)); + else + llvm_unreachable("Not implemented"); } auto Call = CallInst::Create(F, Args, "", BB); setName(Call, BI); @@ -2270,23 +2143,57 @@ Instruction *SPIRVToLLVM::transEnqueueKernelBI(SPIRVInstruction *BI, return Call; } -Instruction *SPIRVToLLVM::transWGSizeBI(SPIRVInstruction *BI, BasicBlock *BB) { - std::string FName = (BI->getOpCode() == OpGetKernelWorkGroupSize) - ? "__get_kernel_work_group_size_impl" - : "__get_kernel_preferred_work_group_multiple_impl"; +Instruction *SPIRVToLLVM::transWGSizeQueryBI(SPIRVInstruction *BI, + BasicBlock *BB) { + std::string FName = + (BI->getOpCode() == OpGetKernelWorkGroupSize) + ? "__get_kernel_work_group_size_impl" + : "__get_kernel_preferred_work_group_size_multiple_impl"; Function *F = M->getFunction(FName); if (!F) { auto Int8PtrTyGen = Type::getInt8PtrTy(*Context, SPIRAS_Generic); - FunctionType *FT = - FunctionType::get(Type::getInt32Ty(*Context), Int8PtrTyGen, false); + FunctionType *FT = FunctionType::get(Type::getInt32Ty(*Context), + {Int8PtrTyGen, Int8PtrTyGen}, false); F = Function::Create(FT, GlobalValue::ExternalLinkage, FName, M); if (isFuncNoUnwind()) F->addFnAttr(Attribute::NoUnwind); } auto Ops = BI->getOperands(); - auto Block = transEnqueuedBlock(Ops[0], Ops[1], Ops[2], Ops[3], F, BB); - auto Call = CallInst::Create(F, Block, "", BB); + SmallVector Args = {transBlockInvoke(Ops[0], BB), + transValue(Ops[1], F, BB, false)}; + auto Call = CallInst::Create(F, Args, "", BB); + setName(Call, BI); + setAttrByCalledFunc(Call); + return Call; +} + +Instruction *SPIRVToLLVM::transSGSizeQueryBI(SPIRVInstruction *BI, + BasicBlock *BB) { + std::string FName = (BI->getOpCode() == OpGetKernelNDrangeMaxSubGroupSize) + ? "__get_kernel_max_sub_group_size_for_ndrange_impl" + : "__get_kernel_sub_group_count_for_ndrange_impl"; + + auto Ops = BI->getOperands(); + Function *F = M->getFunction(FName); + if (!F) { + auto Int8PtrTyGen = Type::getInt8PtrTy(*Context, SPIRAS_Generic); + SmallVector Tys = { + transType(Ops[0]->getType()), // ndrange + Int8PtrTyGen, // block_invoke + Int8PtrTyGen // block_literal + }; + auto *FT = FunctionType::get(Type::getInt32Ty(*Context), Tys, false); + F = Function::Create(FT, GlobalValue::ExternalLinkage, FName, M); + if (isFuncNoUnwind()) + F->addFnAttr(Attribute::NoUnwind); + } + SmallVector Args = { + transValue(Ops[0], F, BB, false), // ndrange + transBlockInvoke(Ops[1], BB), // block_invoke + transValue(Ops[2], F, BB, false) // block_literal + }; + auto Call = CallInst::Create(F, Args, "", BB); setName(Call, BI); setAttrByCalledFunc(Call); return Call; diff --git a/lib/SPIRV/SPIRVUtil.cpp b/lib/SPIRV/SPIRVUtil.cpp index a05630387c..b80ca97e26 100644 --- a/lib/SPIRV/SPIRVUtil.cpp +++ b/lib/SPIRV/SPIRVUtil.cpp @@ -461,6 +461,13 @@ bool getSPIRVBuiltin(const std::string &OrigName, spv::BuiltIn &B) { return getByName(R.str(), B); } +// Some OpenCL built-ins generated by Clang are not mangled +// TODO: add other built-ins which are not mangled by Clang +bool isNonMangledOCLBuiltin(const StringRef &Name) { + if (isEnqueueKernelBI(Name) || isKernelQueryBI(Name)) + return true; + return false; +} bool oclIsBuiltin(const StringRef &Name, std::string *DemangledName, bool IsCpp) { if (Name == "printf") { @@ -468,6 +475,11 @@ bool oclIsBuiltin(const StringRef &Name, std::string *DemangledName, *DemangledName = Name; return true; } + if (isNonMangledOCLBuiltin(Name)) { + if (DemangledName) + *DemangledName = Name.drop_front(2); + return true; + } if (!Name.startswith("_Z")) return false; if (!DemangledName) @@ -1388,4 +1400,18 @@ Type *getSPIRVImageTypeFromOCL(Module *M, Type *ImageTy) { Acc = getAccessQualifier(ImageTypeName); return getOrCreateOpaquePtrType(M, mapOCLTypeNameToSPIRV(ImageTypeName, Acc)); } + +llvm::PointerType *getOCLClkEventType(Module *M) { + return getOrCreateOpaquePtrType(M, SPIR_TYPE_NAME_CLK_EVENT_T, + SPIRAS_Private); +} + +llvm::PointerType *getOCLClkEventPtrType(Module *M) { + return PointerType::get(getOCLClkEventType(M), SPIRAS_Generic); +} + +llvm::Constant *getOCLNullClkEventPtr(Module *M) { + return Constant::getNullValue(getOCLClkEventPtrType(M)); +} + } // namespace SPIRV diff --git a/lib/SPIRV/SPIRVWriter.cpp b/lib/SPIRV/SPIRVWriter.cpp index 27f5b8f014..e0c8ec9064 100644 --- a/lib/SPIRV/SPIRVWriter.cpp +++ b/lib/SPIRV/SPIRVWriter.cpp @@ -692,7 +692,8 @@ SPIRVValue *LLVMToSPIRV::transConstant(Value *V) { if (auto CAZero = dyn_cast(V)) { Type *AggType = CAZero->getType(); if (const StructType *ST = dyn_cast(AggType)) - if (ST->getName() == getSPIRVTypeName(kSPIRVTypeName::ConstantSampler)) + if (ST->hasName() && + ST->getName() == getSPIRVTypeName(kSPIRVTypeName::ConstantSampler)) return BM->addSamplerConstant(transType(AggType), 0, 0, 0); return BM->addNullConstant(transType(AggType)); @@ -735,9 +736,11 @@ SPIRVValue *LLVMToSPIRV::transConstant(Value *V) { return BM->addCompositeConstant(transType(V->getType()), BV); } - if (auto ConstV = dyn_cast(V)) { - if (ConstV->getType()->getName() == - getSPIRVTypeName(kSPIRVTypeName::ConstantSampler)) { + if (const auto *ConstV = dyn_cast(V)) { + StringRef StructName; + if (ConstV->getType()->hasName()) + StructName = ConstV->getType()->getName(); + if (StructName == getSPIRVTypeName(kSPIRVTypeName::ConstantSampler)) { assert(ConstV->getNumOperands() == 3); SPIRVWord AddrMode = ConstV->getOperand(0)->getUniqueInteger().getZExtValue(), @@ -752,8 +755,7 @@ SPIRVValue *LLVMToSPIRV::transConstant(Value *V) { return BM->addSamplerConstant(SamplerTy, AddrMode, Normalized, FilterMode); } - if (ConstV->getType()->getName() == - getSPIRVTypeName(kSPIRVTypeName::ConstantPipeStorage)) { + if (StructName == getSPIRVTypeName(kSPIRVTypeName::ConstantPipeStorage)) { assert(ConstV->getNumOperands() == 3); SPIRVWord PacketSize = ConstV->getOperand(0)->getUniqueInteger().getZExtValue(), @@ -1800,6 +1802,7 @@ void addPassesForSPIRV(legacy::PassManager &PassMgr) { PassMgr.add(createOCL21ToSPIRV()); PassMgr.add(createSPIRVLowerSPIRBlocks()); PassMgr.add(createOCLTypeToSPIRV()); + PassMgr.add(createSPIRVLowerOCLBlocks()); PassMgr.add(createOCL20ToSPIRV()); PassMgr.add(createSPIRVRegularizeLLVM()); PassMgr.add(createSPIRVLowerConstExpr()); diff --git a/test/EnqueueEmptyKernel.ll b/test/EnqueueEmptyKernel.ll index 656fa94e89..744afb2550 100644 --- a/test/EnqueueEmptyKernel.ll +++ b/test/EnqueueEmptyKernel.ll @@ -1,6 +1,6 @@ ;; This test checks that Invoke parameter of OpEnueueKernel instruction meet the ;; following specification requirements in case of enqueueing empty block: -;;"Invoke must be an OpFunction whose OpTypeFunction operand has: +;; "Invoke must be an OpFunction whose OpTypeFunction operand has: ;; - Result Type must be OpTypeVoid. ;; - The first parameter must have a type of OpTypePointer to an 8-bit OpTypeInt. ;; - An optional list of parameters, each of which must have a type of OpTypePointer to the Workgroup Storage Class. @@ -11,6 +11,7 @@ ;; ndrange_1D(1), ;; 0, NULL, NULL, ;; ^(){}); +;; } ; RUN: llvm-as < %s > %t.bc ; RUN: llvm-spirv %t.bc -o - -spirv-text | FileCheck %s --check-prefix=CHECK-SPIRV @@ -19,63 +20,66 @@ target triple = "spir64-unknown-unknown" %struct.ndrange_t = type { i32, [3 x i64], [3 x i64], [3 x i64] } %opencl.queue_t = type opaque -%opencl.block = type opaque %opencl.clk_event_t = type opaque +@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4 + +; CHECK-SPIRV: Name [[Block:[0-9]+]] "__block_literal_global" ; CHECK-SPIRV: TypeInt [[Int8:[0-9]+]] 8 ; CHECK-SPIRV: TypeVoid [[Void:[0-9]+]] -; CHECK-SPIRV: TypePointer [[Int8Ptr:[0-9]+]] {{[0-9]+}} [[Int8]] -; CHECK-SPIRV: ConstantNull [[Int8Ptr]] [[NullInt8Ptr:[0-9]+]] +; CHECK-SPIRV: TypePointer [[Int8Ptr:[0-9]+]] 5 [[Int8]] +; CHECK-SPIRV: TypePointer [[Int8PtrGen:[0-9]+]] 8 [[Int8]] +; CHECK-SPIRV: Variable {{[0-9]+}} [[Block:[0-9]+]] -; Function Attrs: nounwind -define spir_kernel void @test_enqueue_empty() #0 { +; Function Attrs: convergent nounwind +define spir_kernel void @test_enqueue_empty() #0 !kernel_arg_addr_space !2 !kernel_arg_access_qual !2 !kernel_arg_type !2 !kernel_arg_base_type !2 !kernel_arg_type_qual !2 { entry: - %agg.tmp = alloca %struct.ndrange_t, align 8 - %call = call spir_func %opencl.queue_t* @_Z17get_default_queuev() - call spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret %agg.tmp, i64 1) - %0 = call %opencl.block* @spir_block_bind(i8* bitcast (void (i8*)* @__test_enqueue_empty_block_invoke to i8*), i32 0, i32 0, i8* null) - %call1 = call spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvvE(%opencl.queue_t* %call, i32 1, %struct.ndrange_t* byval %agg.tmp, i32 0, %opencl.clk_event_t** null, %opencl.clk_event_t** null, %opencl.block* %0) + %tmp = alloca %struct.ndrange_t, align 8 + %call = call spir_func %opencl.queue_t* @_Z17get_default_queuev() #4 + call spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret %tmp, i64 1) #4 + %0 = call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* %call, i32 1, %struct.ndrange_t* %tmp, i32 0, %opencl.clk_event_t* addrspace(4)* null, %opencl.clk_event_t* addrspace(4)* null, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__test_enqueue_empty_block_invoke_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*)) ret void -; CHECK-SPIRV: EnqueueKernel {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} [[Invoke:[0-9]+]] [[NullInt8Ptr]] {{[0-9]+}} {{[0-9]+}} - +; CHECK-SPIRV: Bitcast [[Int8Ptr]] [[Int8PtrBlock:[0-9]+]] [[Block]] +; CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGen]] [[Int8PtrGenBlock:[0-9]+]] [[Int8PtrBlock]] +; CHECK-SPIRV: EnqueueKernel {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} [[Invoke:[0-9]+]] [[Int8PtrGenBlock]] {{[0-9]+}} {{[0-9]+}} } -declare spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvvE(%opencl.queue_t*, i32, %struct.ndrange_t* byval, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*) #1 - +; Function Attrs: convergent declare spir_func %opencl.queue_t* @_Z17get_default_queuev() #1 +; Function Attrs: convergent declare spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret, i64) #1 - -; CHECK-SPIRV: Function [[Void]] [[Invoke]] {{[0-9]+}} {{[0-9]+}} -; CHECK-SPIRV-NEXT: FunctionParameter [[Int8Ptr]] {{[0-9]+}} +; Function Attrs: convergent nounwind +define internal spir_func void @__test_enqueue_empty_block_invoke(i8 addrspace(4)* %.block_descriptor) #2 { +entry: + %.block_descriptor.addr = alloca i8 addrspace(4)*, align 8 + store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 8 + %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)* + ret void +} ; Function Attrs: nounwind -define internal spir_func void @__test_enqueue_empty_block_invoke(i8* %.block_descriptor) #0 { +define internal spir_kernel void @__test_enqueue_empty_block_invoke_kernel(i8 addrspace(4)*) #3 { entry: - %block = bitcast i8* %.block_descriptor to <{}>* + call void @__test_enqueue_empty_block_invoke(i8 addrspace(4)* %0) ret void } -declare %opencl.block* @spir_block_bind(i8*, i32, i32, i8*) +declare i32 @__enqueue_kernel_basic_events(%opencl.queue_t*, i32, %struct.ndrange_t*, i32, %opencl.clk_event_t* addrspace(4)*, %opencl.clk_event_t* addrspace(4)*, i8 addrspace(4)*, i8 addrspace(4)*) + +; CHECK-SPIRV: Function [[Void]] [[Invoke]] {{[0-9]+}} {{[0-9]+}} +; CHECK-SPIRV-NEXT: FunctionParameter [[Int8PtrGen]] {{[0-9]+}} -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } +attributes #4 = { convergent } -!opencl.kernels = !{!0} !opencl.enable.FP_CONTRACT = !{} -!opencl.spir.version = !{!6} -!opencl.ocl.version = !{!7} -!opencl.used.extensions = !{!8} -!opencl.used.optional.core.features = !{!8} -!opencl.compiler.options = !{!8} +!opencl.ocl.version = !{!0} +!opencl.spir.version = !{!0} -!0 = !{void ()* @test_enqueue_empty, !1, !2, !3, !4, !5} -!1 = !{!"kernel_arg_addr_space"} -!2 = !{!"kernel_arg_access_qual"} -!3 = !{!"kernel_arg_type"} -!4 = !{!"kernel_arg_base_type"} -!5 = !{!"kernel_arg_type_qual"} -!6 = !{i32 1, i32 2} -!7 = !{i32 2, i32 0} -!8 = !{} +!0 = !{i32 2, i32 0} +!2 = !{} diff --git a/test/global_block.ll b/test/global_block.ll index cf43b18b47..d1ede654a4 100644 --- a/test/global_block.ll +++ b/test/global_block.ll @@ -26,62 +26,74 @@ target triple = "spir-unknown-unknown" ;; Check that block invoke function has no block descriptor argument in SPIR-V ; CHECK-SPIRV-NOT: TypeFunction [[block_invoke_type]] [[int]] {{[0-9]+}} [[int]] -%opencl.block = type opaque - -@block_kernel.b1 = internal addrspace(2) constant %opencl.block* bitcast (i32 (i8*, i32)* @_block_invoke to %opencl.block*), align 4 ;; This variable is not needed in SPIRV -; CHECK-SPIRV-NOT Variable +; CHECK-SPIRV-NOT: Name {{[0-9]+}} block_kernel.b1 ; CHECK-LLVM-NOT: @block_kernel.b1 +@block_kernel.b1 = internal addrspace(2) constant i32 (i32) addrspace(4)* addrspacecast (i32 (i32) addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i32 (i32) addrspace(1)*) to i32 (i32) addrspace(4)*), align 8 + +@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4 -; Function Attrs: nounwind -define spir_kernel void @block_kernel(i32 addrspace(1)* %res) #0 { +; Function Attrs: convergent nounwind +define spir_kernel void @block_kernel(i32 addrspace(1)* %res) #0 !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 { entry: - %0 = load %opencl.block*, %opencl.block* addrspace(2)* @block_kernel.b1, align 4 - %1 = call i8* @spir_get_block_invoke(%opencl.block* %0) - %2 = call i8* @spir_get_block_context(%opencl.block* %0) - %3 = bitcast i8* %1 to i32 (i8*, i32)* - %call = call spir_func i32 %3(i8* %2, i32 5) + %res.addr = alloca i32 addrspace(1)*, align 8 + store i32 addrspace(1)* %res, i32 addrspace(1)** %res.addr, align 8, !tbaa !10 + ; CHECK-SPIRV: FunctionCall [[int]] {{[0-9]+}} [[block_invoke]] [[five]] ; CHECK-LLVM: %call = call spir_func i32 @_block_invoke(i32 5) - store i32 %call, i32 addrspace(1)* %res, align 4 + %call = call spir_func i32 @_block_invoke(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), i32 5) #2 + + %0 = load i32 addrspace(1)*, i32 addrspace(1)** %res.addr, align 8, !tbaa !10 + store i32 %call, i32 addrspace(1)* %0, align 4, !tbaa !14 ret void } -; Function Attrs: nounwind -define internal spir_func i32 @_block_invoke(i8* %.block_descriptor, i32 %i) #0 { ; CHECK-SPIRV: 5 Function [[int]] [[block_invoke]] 0 [[block_invoke_type]] ; CHECK-SPIRV-NEXT: 3 FunctionParameter [[int]] {{[0-9]+}} ; CHECK-LLVM: define internal spir_func i32 @_block_invoke(i32 %i) +; Function Attrs: convergent nounwind +define internal spir_func i32 @_block_invoke(i8 addrspace(4)* %.block_descriptor, i32 %i) #1 { entry: - %block = bitcast i8* %.block_descriptor to <{}>* -;; Instruction above is useless and should be removed. + %.block_descriptor.addr = alloca i8 addrspace(4)*, align 8 + %i.addr = alloca i32, align 4 + store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 8 + +;; Instruction below is useless and should be removed. ; CHECK-SPIRV-NOT: Bitcast ; CHECK-LLVM-NOT: bitcast - %add = add nsw i32 %i, 1 + %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)* + store i32 %i, i32* %i.addr, align 4, !tbaa !14 + %0 = load i32, i32* %i.addr, align 4, !tbaa !14 + %add = add nsw i32 %0, 1 ret i32 %add } -declare i8* @spir_get_block_invoke(%opencl.block*) - -declare i8* @spir_get_block_context(%opencl.block*) +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent } -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!opencl.kernels = !{!0} +!llvm.module.flags = !{!0} !opencl.enable.FP_CONTRACT = !{} -!opencl.spir.version = !{!6} -!opencl.ocl.version = !{!7} -!opencl.used.extensions = !{!8} -!opencl.used.optional.core.features = !{!8} -!opencl.compiler.options = !{!8} - -!0 = !{void (i32 addrspace(1)*)* @block_kernel, !1, !2, !3, !4, !5} -!1 = !{!"kernel_arg_addr_space", i32 1} -!2 = !{!"kernel_arg_access_qual", !"none"} -!3 = !{!"kernel_arg_type", !"int*"} -!4 = !{!"kernel_arg_base_type", !"int*"} -!5 = !{!"kernel_arg_type_qual", !""} -!6 = !{i32 1, i32 2} -!7 = !{i32 2, i32 0} -!8 = !{} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.used.extensions = !{!2} +!opencl.used.optional.core.features = !{!2} +!opencl.compiler.options = !{!2} +!llvm.ident = !{!3} +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 2, i32 0} +!2 = !{} +!3 = !{!"clang version 7.0.0"} +!4 = !{i32 1} +!5 = !{!"none"} +!6 = !{!"int*"} +!7 = !{!""} +!8 = !{i1 false} +!9 = !{i32 0} +!10 = !{!11, !11, i64 0} +!11 = !{!"any pointer", !12, i64 0} +!12 = !{!"omnipotent char", !13, i64 0} +!13 = !{!"Simple C/C++ TBAA"} +!14 = !{!15, !15, i64 0} +!15 = !{!"int", !12, i64 0} diff --git a/test/literal-struct.ll b/test/literal-struct.ll new file mode 100644 index 0000000000..c52170a5e7 --- /dev/null +++ b/test/literal-struct.ll @@ -0,0 +1,60 @@ +; This test checks that the translator doesn't crash if the module has literal +; structs, i.e. structs whose type has no name. Typicaly clang generate such +; structs if the kernel contains OpenCL 2.0 blocks. The IR was produced with +; the following command: +; clang -cc1 -triple spir -cl-std=cl2.0 -O0 -finclude-default-header literal-struct.cl -emit-llvm -o test/literal-struct.ll + +; literal-struct.cl: +; void foo() +; { +; void (^myBlock)(void) = ^{}; +; myBlock(); +; } + +; RUN: llvm-as < %s | llvm-spirv -spirv-text -o %t +; RUN: FileCheck < %t %s + +; CHECK-DAG: TypeInt [[Int:[0-9]+]] 32 0 +; CHECK-DAG: TypeStruct [[StructType:[0-9]+]] [[Int]] [[Int]] {{$}} + +target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir" + +@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4 +; CHECK: ConstantComposite [[StructType]] + +; This is artificial case is added to cover ConstantNull instrucitions with TypeStruct. +@__block_literal_global.1 = internal addrspace(1) constant { i32, i32 } zeroinitializer, align 4 +; CHECK: ConstantNull [[StructType]] + +; Function Attrs: convergent noinline nounwind optnone +define spir_func void @foo() #0 { +entry: + %myBlock = alloca void () addrspace(4)*, align 4 + store void () addrspace(4)* addrspacecast (void () addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to void () addrspace(1)*) to void () addrspace(4)*), void () addrspace(4)** %myBlock, align 4 + call spir_func void @__foo_block_invoke(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*)) #1 + ret void +} + +; Function Attrs: convergent noinline nounwind optnone +define internal spir_func void @__foo_block_invoke(i8 addrspace(4)* %.block_descriptor) #0 { +entry: + %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4 + %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4 + store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4 + %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)* + store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4 + ret void +} + +attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 2, i32 0} +!2 = !{!"clang version 8.0.0 "} diff --git a/test/transcoding/device_execution.ll b/test/transcoding/device_execution.ll deleted file mode 100644 index 538a1b21fe..0000000000 --- a/test/transcoding/device_execution.ll +++ /dev/null @@ -1,112 +0,0 @@ -; ModuleID = 'repro.cl' -; -; bash$ cat repro.cl -; void __kernel device_kernel(__global float * inout) { -; *inout = cos(*inout); -; } -; -; void __kernel host_kernel(__global float * inout) { -; enqueue_kernel(get_default_queue(), -; CLK_ENQUEUE_FLAGS_WAIT_KERNEL, -; ndrange_1D(1), -; 0, NULL, NULL, -; ^{ device_kernel(inout); }); -; } -; bash$ export PATH_TO_INCLUDE= $PATH_TO_GEN/lib/clang/3.6.1/include -; bash$ $PATH_TO_GEN/bin/clang -cc1 -x cl -cl-std=CL2.0 -triple spir64-unknonw-unknown -emit-llvm -include opencl-20.h repro.cl -o device_execution.ll - -;; Regression test for -;; 1. enqueue_kernel built-in is mangled accordingly to SPIR2.0/C++ ABI (no substitution is done) -;; 2. the 4rth argument of enqueue_kernel is mangled as "const clk_event_t*" - -; RUN: llvm-as %s -o %t.bc -; RUN: llvm-spirv %t.bc -o %t.spv -; RUN: llvm-spirv -r %t.spv -o %t.bc -; RUN: llvm-dis < %t.bc | FileCheck %s - -; CHECK-NOT: @_Z14enqueue_kernel9ocl_queue{{22kernel_enqueue_flags_t|i}}9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvvE -; CHECK-NOT: @spir_block_bind -; CHECK: call i32 @__enqueue_kernel_basic_events -declare spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvvE(%opencl.queue_t*, i32, %struct.ndrange_t* byval, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*) #1 - -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" -target triple = "spir64-unknonw-unknown" - -%struct.ndrange_t = type { i32, [3 x i64], [3 x i64], [3 x i64] } -%opencl.queue_t = type opaque -%opencl.block = type opaque -%opencl.clk_event_t = type opaque - -; Function Attrs: nounwind -define spir_kernel void @device_kernel(float addrspace(1)* nocapture %inout) #0 { -entry: - %0 = load float, float addrspace(1)* %inout, align 4, !tbaa !11 - %call = tail call spir_func float @_Z3cosf(float %0) #2 - store float %call, float addrspace(1)* %inout, align 4, !tbaa !11 - ret void -} - -declare spir_func float @_Z3cosf(float) #1 - -; Function Attrs: nounwind -define spir_kernel void @host_kernel(float addrspace(1)* %inout) #0 { -entry: - %captured = alloca <{ float addrspace(1)* }>, align 8 - %agg.tmp = alloca %struct.ndrange_t, align 8 - %call = tail call spir_func %opencl.queue_t* @get_default_queue() #2 - call spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret %agg.tmp, i64 1) #2 - %block.captured = getelementptr inbounds <{ float addrspace(1)* }>, <{ float addrspace(1)* }>* %captured, i64 0, i32 0 - store float addrspace(1)* %inout, float addrspace(1)** %block.captured, align 8, !tbaa !15 - %0 = bitcast <{ float addrspace(1)* }>* %captured to i8* - %1 = call %opencl.block* @spir_block_bind(i8* bitcast (void (i8*)* @__host_kernel_block_invoke to i8*), i32 8, i32 8, i8* %0) #2 - %call1 = call spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvvE(%opencl.queue_t* %call, i32 241, %struct.ndrange_t* byval %agg.tmp, i32 0, %opencl.clk_event_t** null, %opencl.clk_event_t** null, %opencl.block* %1) #2 - ret void -} - -declare spir_func %opencl.queue_t* @get_default_queue() #1 - -declare spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret, i64) #1 - -; Function Attrs: nounwind -define internal spir_func void @__host_kernel_block_invoke(i8* nocapture readonly %.block_descriptor) #0 { -entry: - %block.capture.addr = bitcast i8* %.block_descriptor to float addrspace(1)** - %0 = load float addrspace(1)*, float addrspace(1)** %block.capture.addr, align 8, !tbaa !15 - %1 = load float, float addrspace(1)* %0, align 4, !tbaa !11 - %call.i = tail call spir_func float @_Z3cosf(float %1) #2 - store float %call.i, float addrspace(1)* %0, align 4, !tbaa !11 - ret void -} - -declare %opencl.block* @spir_block_bind(i8*, i32, i32, i8*) - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind } - -!opencl.kernels = !{!0, !6} -!opencl.enable.FP_CONTRACT = !{} -!opencl.spir.version = !{!7} -!opencl.ocl.version = !{!8} -!opencl.used.extensions = !{!9} -!opencl.used.optional.core.features = !{!9} -!opencl.compiler.options = !{!9} -!llvm.ident = !{!10} - -!0 = !{void (float addrspace(1)*)* @device_kernel, !1, !2, !3, !4, !5} -!1 = !{!"kernel_arg_addr_space", i32 1} -!2 = !{!"kernel_arg_access_qual", !"none"} -!3 = !{!"kernel_arg_type", !"float*"} -!4 = !{!"kernel_arg_base_type", !"float*"} -!5 = !{!"kernel_arg_type_qual", !""} -!6 = !{void (float addrspace(1)*)* @host_kernel, !1, !2, !3, !4, !5} -!7 = !{i32 1, i32 2} -!8 = !{i32 2, i32 0} -!9 = !{} -!10 = !{!"clang version 3.6.1 (https://github.com/KhronosGroup/SPIR.git 49a8b4a760d227b12116a79b2f7b2e34ef2e6879) (ssh://nnopencl-git-01.inn.intel.com/home/git/repo/opencl_qa-llvm d9b98710f905089caec167209da23af2e4f72bf0)"} -!11 = !{!12, !12, i64 0} -!12 = !{!"float", !13, i64 0} -!13 = !{!"omnipotent char", !14, i64 0} -!14 = !{!"Simple C/C++ TBAA"} -!15 = !{!16, !16, i64 0} -!16 = !{!"any pointer", !13, i64 0} diff --git a/test/transcoding/device_execution_multiple_blocks.ll b/test/transcoding/device_execution_multiple_blocks.ll deleted file mode 100644 index 17421013a6..0000000000 --- a/test/transcoding/device_execution_multiple_blocks.ll +++ /dev/null @@ -1,153 +0,0 @@ -;; Test what the reader correctly mangles get_kernel_work_group_size, -;; get_kernel_preferred_work_group_size_multiple, and enqueue_kernel built-ins and -;; produces spir_block_bind for the both blocks one of which is w\o captured context. -;; Notice what for the moment spir_block_bind is called as many times as how much -;; built-ins what using it. This is not against SPIR 2.0 specification so it is done this -;; way to simplify the reader implementation. -;; -;; See below how this LLVM IR has been obtained: -;; bash$ -;; bash$ cat device_execution_multiple_blocks.cl -;; void block_fn(int arg, __global int* res) -;; { -;; *res = arg; -;; } -;; -;; __global int glbRes = 0; -;; void (^kernelBlockNoCtx)(void) = ^{ block_fn(1, &glbRes); }; -;; -;; kernel void enqueue_block_get_kernel_preferred_work_group_size_multiple(__global int* res) -;; { -;; -;; -;; void (^kernelBlock)(void) = ^{ block_fn(2, res); }; -;; uint globalSize = get_kernel_work_group_size(kernelBlock); -;; uint multiple = get_kernel_preferred_work_group_size_multiple(kernelBlock); -;; uint localSize = globalSize / multiple; -;; -;; queue_t q1 = get_default_queue(); -;; ndrange_t ndrange = ndrange_1D(localSize, globalSize); -;; enqueue_kernel(q1, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock); -;; // Enqueue kernel w\o captured context -;; enqueue_kernel(q1, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlockNoCtx); -;; } -;; bash$ -;; bash$ export PATH_TO_GEN=path_to_spir20_generator_install_dir -;; bash$ $PATH_TO_GEN/bin/clang -cc1 -x cl -O2 -cl-std=CL2.0 -triple spir64-unknonw-unknown\ -;; -emit-spirv -include $PATH_TO_GEN/lib/clang/3.6.1/include/opencl-20.h\ -;; device_execution_multiple_blocks.cl -o device_execution_multiple_blocks.ll - -; RUN: llvm-as %s -o %t.bc -; RUN: llvm-spirv %t.bc -o %t.spv -; RUN: llvm-spirv -r %t.spv -o %t.bc -; RUN: llvm-dis < %t.bc | FileCheck %s - -; ModuleID = 'device_execution_multiple_blocks.cl' -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" -target triple = "spir64-unknonw-unknown" - -%struct.ndrange_t = type { i32, [3 x i64], [3 x i64], [3 x i64] } -%opencl.block = type opaque -%opencl.queue_t = type opaque - -@glbRes = addrspace(1) global i32 0, align 4 - -; Function Attrs: nounwind -define spir_func void @block_fn(i32 %arg, i32 addrspace(1)* nocapture %res) #0 { -entry: - store i32 %arg, i32 addrspace(1)* %res, align 4 - ret void -} - -; Function Attrs: nounwind -define internal spir_func void @kernelBlockNoCtx_block_invoke(i8* nocapture readnone %.block_descriptor) #0 { -entry: - store i32 1, i32 addrspace(1)* @glbRes, align 4 - ret void -} - -; Function Attrs: nounwind -define spir_kernel void @enqueue_block_get_kernel_preferred_work_group_size_multiple(i32 addrspace(1)* %res) #0 { -; CHECK: [[CTX:%.*]] = bitcast %0* %captured to i8* -; CHECK: [[BLOCK:%.*]] = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>, align 8 -; CHECK: store i8* {{.*}} @__enqueue_block_get_kernel_preferred_work_group_size_multiple_block_invoke -; CHECK: [[CAPTUREDGEP:%.*]] = getelementptr <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>* [[BLOCK]], i32 0, i32 5 -; CHECK: [[CAPTUREDCAST:%.*]] = bitcast [8 x i8]* [[CAPTUREDGEP]] to i8* -; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[CAPTUREDCAST]], i8* align 8 [[CTX]], i32 8, i1 false) -; CHECK: [[BLOCKBCST:%.*]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>* [[BLOCK]] to i8* -; CHECK: [[BLOCKADDRCST:%.*]] = addrspacecast i8* [[BLOCKBCST]] to i8 addrspace(4)* -; CHECK: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* [[BLOCKADDRCST]]) -; CHECK: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8 addrspace(4)* [[BLOCKADDRCST]]) -; CHECK: call i32 @__enqueue_kernel_basic_events({{.*}} i8 addrspace(4)* [[BLOCKADDRCST]]) - -; CHECK: [[BLOCK2:%.*]] = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>, align 8 -; CHECK: store i8* {{.*}} @kernelBlockNoCtx_block_invoke -; CHECK: [[BLOCKBCST2:%.*]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>* [[BLOCK2]] to i8* -; CHECK: [[BLOCKADDRCST2:%.*]] = addrspacecast i8* [[BLOCKBCST2]] to i8 addrspace(4)* -; CHECK-NOT: call void @llvm.memcpy -; CHECK: call i32 @__enqueue_kernel_basic_events({{.*}} i8 addrspace(4)* [[BLOCKADDRCST2]]) - -entry: - %captured = alloca <{ i32 addrspace(1)* }>, align 8 - %ndrange = alloca %struct.ndrange_t, align 8 - %block.captured = getelementptr inbounds <{ i32 addrspace(1)* }>, <{ i32 addrspace(1)* }>* %captured, i64 0, i32 0 - store i32 addrspace(1)* %res, i32 addrspace(1)** %block.captured, align 8 - %0 = bitcast <{ i32 addrspace(1)* }>* %captured to i8* - %1 = call %opencl.block* @spir_block_bind(i8* bitcast (void (i8*)* @__enqueue_block_get_kernel_preferred_work_group_size_multiple_block_invoke to i8*), i32 8, i32 8, i8* %0) #2 - %call = call spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvvE(%opencl.block* %1) #2 - %call1 = call spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvvE(%opencl.block* %1) #2 - %div = udiv i32 %call, %call1 - %call2 = call spir_func %opencl.queue_t* @get_default_queue() #2 - %conv = zext i32 %div to i64 - %conv3 = zext i32 %call to i64 - call spir_func void @_Z10ndrange_1Dmm(%struct.ndrange_t* sret %ndrange, i64 %conv, i64 %conv3) #2 - %call4 = call spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tU13block_pointerFvvE(%opencl.queue_t* %call2, i32 241, %struct.ndrange_t* byval %ndrange, %opencl.block* %1) #2 - %2 = call %opencl.block* @spir_block_bind(i8* bitcast (void (i8*)* @kernelBlockNoCtx_block_invoke to i8*), i32 0, i32 0, i8* null) #2 - %call5 = call spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tU13block_pointerFvvE(%opencl.queue_t* %call2, i32 241, %struct.ndrange_t* byval %ndrange, %opencl.block* %2) #2 - ret void -} - -; Function Attrs: nounwind -define internal spir_func void @__enqueue_block_get_kernel_preferred_work_group_size_multiple_block_invoke(i8* nocapture readonly %.block_descriptor) #0 { -entry: - %block.capture.addr = bitcast i8* %.block_descriptor to i32 addrspace(1)** - %0 = load i32 addrspace(1)*, i32 addrspace(1)** %block.capture.addr, align 8 - store i32 2, i32 addrspace(1)* %0, align 4 - ret void -} - -declare %opencl.block* @spir_block_bind(i8*, i32, i32, i8*) - -declare spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvvE(%opencl.block*) #1 - -declare spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvvE(%opencl.block*) #1 - -declare spir_func %opencl.queue_t* @get_default_queue() #1 - -declare spir_func void @_Z10ndrange_1Dmm(%struct.ndrange_t* sret, i64, i64) #1 - -declare spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tU13block_pointerFvvE(%opencl.queue_t*, i32, %struct.ndrange_t* byval, %opencl.block*) #1 - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind } - -!opencl.kernels = !{!0} -!opencl.enable.FP_CONTRACT = !{} -!opencl.spir.version = !{!6} -!opencl.ocl.version = !{!7} -!opencl.used.extensions = !{!8} -!opencl.used.optional.core.features = !{!8} -!opencl.compiler.options = !{!8} -!llvm.ident = !{!9} - -!0 = !{void (i32 addrspace(1)*)* @enqueue_block_get_kernel_preferred_work_group_size_multiple, !1, !2, !3, !4, !5} -!1 = !{!"kernel_arg_addr_space", i32 1} -!2 = !{!"kernel_arg_access_qual", !"none"} -!3 = !{!"kernel_arg_type", !"int*"} -!4 = !{!"kernel_arg_base_type", !"int*"} -!5 = !{!"kernel_arg_type_qual", !""} -!6 = !{i32 1, i32 2} -!7 = !{i32 2, i32 0} -!8 = !{} -!9 = !{!"clang version 3.6.1 (https://github.com/KhronosGroup/SPIR.git 49a8b4a760d227b12116a79b2f7b2e34ef2e6879) (ssh://nnopencl-git-01.inn.intel.com/home/git/repo/opencl_qa-llvm d9b98710f905089caec167209da23af2e4f72bf0)"} diff --git a/test/transcoding/device_execution_overloading.ll b/test/transcoding/device_execution_overloading.ll deleted file mode 100644 index e5ee309217..0000000000 --- a/test/transcoding/device_execution_overloading.ll +++ /dev/null @@ -1,203 +0,0 @@ -;; bash$ cat device_execution_overloading.cl -;; void device_kernel_with_local_args(__local float* ptr0, __local float* ptr1) { -;; *ptr0 = 0; -;; *ptr1 = 1; -;; } -;; -;; void device_kernel(__global float* ptr) { -;; *ptr = 3; -;; } -;; -;; __kernel void host_kernel(uint size, __global float* ptr) { -;; void(^block_with_local)(__local void*, __local void*) = ^(__local void* ptr0, __local void* ptr1){ -;; device_kernel_with_local_args(ptr0, ptr1); -;; }; -;; -;; void(^block)(void) = ^{ -;; device_kernel(ptr); -;; }; -;; -;n; uint wgSize = get_kernel_work_group_size(block_with_local); -;; uint prefMul = get_kernel_preferred_work_group_size_multiple(block_with_local); -;; enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange_1D(1), -;; 0, NULL, NULL, block_with_local, size, wgSize * prefMul); -;; -;; wgSize = get_kernel_work_group_size(block); -;; prefMul = get_kernel_preferred_work_group_size_multiple(block); -;; enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange_1D(1), -;; 0, NULL, NULL, block); -;; } -;; bash$ -;;$PATH_TO_GEN/bin/clang -cc1 -x cl -O0 -cl-std=CL2.0 -triple spir64-unknonw-unknown -include $PATH_TO_GEN/lib/clang/3.6.1/include/opencl-20.h -emit-llvm device_execution_overloading.cl -o device_execution_overloading.ll - -;; Test overloading of device exectuion built-ins is OK after translation from SPIR-V - -; RUN: llvm-as %s -o %t.bc -; RUN: llvm-spirv %t.bc -o %t.spv -; RUN: llvm-spirv -r %t.spv -o %t.bc -; RUN: llvm-dis < %t.bc | FileCheck %s - -; ModuleID = 'device_execution_overloading.cl' -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" -target triple = "spir64-unknonw-unknown" - -%opencl.block = type opaque -%struct.ndrange_t = type { i32, [3 x i64], [3 x i64], [3 x i64] } -%opencl.queue_t = type opaque -%opencl.clk_event_t = type opaque - -; Function Attrs: nounwind -define spir_func void @device_kernel_with_local_args(float addrspace(3)* %ptr0, float addrspace(3)* %ptr1) #0 { -entry: - %ptr0.addr = alloca float addrspace(3)*, align 8 - %ptr1.addr = alloca float addrspace(3)*, align 8 - store float addrspace(3)* %ptr0, float addrspace(3)** %ptr0.addr, align 8 - store float addrspace(3)* %ptr1, float addrspace(3)** %ptr1.addr, align 8 - %0 = load float addrspace(3)*, float addrspace(3)** %ptr0.addr, align 8 - store float 0.000000e+00, float addrspace(3)* %0, align 4 - %1 = load float addrspace(3)*, float addrspace(3)** %ptr1.addr, align 8 - store float 1.000000e+00, float addrspace(3)* %1, align 4 - ret void -} - -; Function Attrs: nounwind -define spir_func void @device_kernel(float addrspace(1)* %ptr) #0 { -entry: - %ptr.addr = alloca float addrspace(1)*, align 8 - store float addrspace(1)* %ptr, float addrspace(1)** %ptr.addr, align 8 - %0 = load float addrspace(1)*, float addrspace(1)** %ptr.addr, align 8 - store float 3.000000e+00, float addrspace(1)* %0, align 4 - ret void -} - -; CHECK: @__get_kernel_work_group_size_impl -; CHECK: @__get_kernel_preferred_work_group_multiple_impl -; CHECK: @__enqueue_kernel_events_vaargs - -; CHECK: @__get_kernel_work_group_size_impl -; CHECK: @__get_kernel_preferred_work_group_multiple_impl -; CHECK: @__enqueue_kernel_basic_events - -; Function Attrs: nounwind -define spir_kernel void @host_kernel(i32 %size, float addrspace(1)* %ptr) #0 { -entry: - %size.addr = alloca i32, align 4 - %ptr.addr = alloca float addrspace(1)*, align 8 - %block_with_local = alloca %opencl.block*, align 8 - %block = alloca %opencl.block*, align 8 - %captured = alloca <{ float addrspace(1)* }>, align 8 - %wgSize = alloca i32, align 4 - %prefMul = alloca i32, align 4 - %agg.tmp = alloca %struct.ndrange_t, align 8 - %agg.tmp8 = alloca %struct.ndrange_t, align 8 - store i32 %size, i32* %size.addr, align 4 - store float addrspace(1)* %ptr, float addrspace(1)** %ptr.addr, align 8 - %0 = call %opencl.block* @spir_block_bind(i8* bitcast (void (i8*, i8 addrspace(3)*, i8 addrspace(3)*)* @__host_kernel_block_invoke to i8*), i32 0, i32 0, i8* null) - store %opencl.block* %0, %opencl.block** %block_with_local, align 8 - %block.captured = getelementptr inbounds <{ float addrspace(1)* }>, <{ float addrspace(1)* }>* %captured, i32 0, i32 0 - %1 = load float addrspace(1)*, float addrspace(1)** %ptr.addr, align 8 - store float addrspace(1)* %1, float addrspace(1)** %block.captured, align 8 - %2 = bitcast <{ float addrspace(1)* }>* %captured to i8* - %3 = call %opencl.block* @spir_block_bind(i8* bitcast (void (i8*)* @__host_kernel_block_invoke_2 to i8*), i32 8, i32 8, i8* %2) - store %opencl.block* %3, %opencl.block** %block, align 8 - %4 = load %opencl.block*, %opencl.block** %block_with_local, align 8 - %call = call spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvPU3AS3vzE(%opencl.block* %4) - store i32 %call, i32* %wgSize, align 4 - %5 = load %opencl.block*, %opencl.block** %block_with_local, align 8 - %call2 = call spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvPU3AS3vzE(%opencl.block* %5) - store i32 %call2, i32* %prefMul, align 4 - %call3 = call spir_func %opencl.queue_t* @_Z17get_default_queuev() - call spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret %agg.tmp, i64 1) - %6 = load %opencl.block*, %opencl.block** %block_with_local, align 8 - %7 = load i32, i32* %size.addr, align 4 - %8 = load i32, i32* %wgSize, align 4 - %9 = load i32, i32* %prefMul, align 4 - %mul = mul i32 %8, %9 - %call4 = call spir_func i32 (%opencl.queue_t*, i32, %struct.ndrange_t*, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*, i32, ...) @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvPU3AS3vzEjz(%opencl.queue_t* %call3, i32 241, %struct.ndrange_t* byval %agg.tmp, i32 0, %opencl.clk_event_t** null, %opencl.clk_event_t** null, %opencl.block* %6, i32 %7, i32 %mul) - %10 = load %opencl.block*, %opencl.block** %block, align 8 - %call5 = call spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvvE(%opencl.block* %10) - store i32 %call5, i32* %wgSize, align 4 - %11 = load %opencl.block*, %opencl.block** %block, align 8 - %call6 = call spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvvE(%opencl.block* %11) - store i32 %call6, i32* %prefMul, align 4 - %call7 = call spir_func %opencl.queue_t* @_Z17get_default_queuev() - call spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret %agg.tmp8, i64 1) - %12 = load %opencl.block*, %opencl.block** %block, align 8 - %call9 = call spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvvE(%opencl.queue_t* %call7, i32 241, %struct.ndrange_t* byval %agg.tmp8, i32 0, %opencl.clk_event_t** null, %opencl.clk_event_t** null, %opencl.block* %12) - ret void -} - -; Function Attrs: nounwind -define internal spir_func void @__host_kernel_block_invoke(i8* %.block_descriptor, i8 addrspace(3)* %ptr0, i8 addrspace(3)* %ptr1) #0 { -entry: - %.block_descriptor.addr = alloca i8*, align 8 - %ptr0.addr = alloca i8 addrspace(3)*, align 8 - %ptr1.addr = alloca i8 addrspace(3)*, align 8 - %block.addr = alloca <{}>*, align 8 - store i8* %.block_descriptor, i8** %.block_descriptor.addr, align 8 - %0 = load i8*, i8** %.block_descriptor.addr - store i8 addrspace(3)* %ptr0, i8 addrspace(3)** %ptr0.addr, align 8 - store i8 addrspace(3)* %ptr1, i8 addrspace(3)** %ptr1.addr, align 8 - %block = bitcast i8* %.block_descriptor to <{}>* - store <{}>* %block, <{}>** %block.addr, align 8 - %1 = load i8 addrspace(3)*, i8 addrspace(3)** %ptr0.addr, align 8 - %2 = bitcast i8 addrspace(3)* %1 to float addrspace(3)* - %3 = load i8 addrspace(3)*, i8 addrspace(3)** %ptr1.addr, align 8 - %4 = bitcast i8 addrspace(3)* %3 to float addrspace(3)* - call spir_func void @device_kernel_with_local_args(float addrspace(3)* %2, float addrspace(3)* %4) - ret void -} - -declare %opencl.block* @spir_block_bind(i8*, i32, i32, i8*) - -; Function Attrs: nounwind -define internal spir_func void @__host_kernel_block_invoke_2(i8* %.block_descriptor) #0 { -entry: - %.block_descriptor.addr = alloca i8*, align 8 - %block.addr = alloca <{ float addrspace(1)* }>*, align 8 - store i8* %.block_descriptor, i8** %.block_descriptor.addr, align 8 - %0 = load i8*, i8** %.block_descriptor.addr - %block = bitcast i8* %.block_descriptor to <{ float addrspace(1)* }>* - store <{ float addrspace(1)* }>* %block, <{ float addrspace(1)* }>** %block.addr, align 8 - %block.capture.addr = getelementptr inbounds <{ float addrspace(1)* }>, <{ float addrspace(1)* }>* %block, i32 0, i32 0 - %1 = load float addrspace(1)*, float addrspace(1)** %block.capture.addr, align 8 - call spir_func void @device_kernel(float addrspace(1)* %1) - ret void -} - -declare spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvPU3AS3vzE(%opencl.block*) #1 - -declare spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvPU3AS3vzE(%opencl.block*) #1 - -declare spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvPU3AS3vzEjz(%opencl.queue_t*, i32, %struct.ndrange_t* byval, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*, i32, ...) #1 - -declare spir_func %opencl.queue_t* @_Z17get_default_queuev() #1 - -declare spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret, i64) #1 - -declare spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvvE(%opencl.block*) #1 - -declare spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvvE(%opencl.block*) #1 - -declare spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvvE(%opencl.queue_t*, i32, %struct.ndrange_t* byval, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*) #1 - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!opencl.kernels = !{!0} -!opencl.enable.FP_CONTRACT = !{} -!opencl.spir.version = !{!6} -!opencl.ocl.version = !{!7} -!opencl.used.extensions = !{!8} -!opencl.used.optional.core.features = !{!8} -!opencl.compiler.options = !{!8} - -!0 = !{void (i32, float addrspace(1)*)* @host_kernel, !1, !2, !3, !4, !5} -!1 = !{!"kernel_arg_addr_space", i32 0, i32 1} -!2 = !{!"kernel_arg_access_qual", !"none", !"none"} -!3 = !{!"kernel_arg_type", !"uint", !"float*"} -!4 = !{!"kernel_arg_base_type", !"uint", !"float*"} -!5 = !{!"kernel_arg_type_qual", !"", !""} -!6 = !{i32 1, i32 2} -!7 = !{i32 2, i32 0} -!8 = !{} diff --git a/test/transcoding/device_execution_simple_local_memory.ll b/test/transcoding/device_execution_simple_local_memory.ll deleted file mode 100644 index 51601eb74e..0000000000 --- a/test/transcoding/device_execution_simple_local_memory.ll +++ /dev/null @@ -1,137 +0,0 @@ -;; bash$ cat repro.cl -;; void device_kernel(__local float* ptr0, __local float* ptr1) { -;; *ptr0 = 0; -;; *ptr1 = 1; -;; } -;; -;; __kernel void host_kernel(uint size) { -;; void(^block)(__local void*, __local void*) = ^(__local void* ptr0, __local void* ptr1){ -;; device_kernel(ptr0, ptr1); -;; }; -;; -;; uint wgSize = get_kernel_work_group_size(block); -;; uint prefMul = get_kernel_preferred_work_group_size_multiple(block); -;; enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange_1D(1), -;; 0, NULL, NULL, block, size, wgSize * prefMul); -;; } -;; bash$ -;; bash$ export PATH_TO_INCLUDE= $PATH_TO_GEN/lib/clang/3.6.1/include -;; bash$ $PATH_TO_GEN/bin/clang -cc1 -x cl -cl-std=CL2.0 -triple spir64-unknonw-unknown -emit-llvm -include opencl-20.h repro.cl -o device_execution.ll - -;; Check that device enqueue BIs wasn't mangled - - -; RUN: llvm-as %s -o %t.bc -; RUN: llvm-spirv %t.bc -o %t.spv -; RUN: llvm-spirv -r %t.spv -o %t.bc -; RUN: llvm-dis < %t.bc | FileCheck %s - -; ModuleID = 'repro.cl' -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" -target triple = "spir64-unknonw-unknown" - -%opencl.block = type opaque -%struct.ndrange_t = type { i32, [3 x i64], [3 x i64], [3 x i64] } -%opencl.queue_t = type opaque -%opencl.clk_event_t = type opaque - -; Function Attrs: nounwind -define spir_func void @device_kernel(float addrspace(3)* %ptr0, float addrspace(3)* %ptr1) #0 { -entry: - %ptr0.addr = alloca float addrspace(3)*, align 8 - %ptr1.addr = alloca float addrspace(3)*, align 8 - store float addrspace(3)* %ptr0, float addrspace(3)** %ptr0.addr, align 8 - store float addrspace(3)* %ptr1, float addrspace(3)** %ptr1.addr, align 8 - %0 = load float addrspace(3)*, float addrspace(3)** %ptr0.addr, align 8 - store float 0.000000e+00, float addrspace(3)* %0, align 4 - %1 = load float addrspace(3)*, float addrspace(3)** %ptr1.addr, align 8 - store float 1.000000e+00, float addrspace(3)* %1, align 4 - ret void -} - -; Function Attrs: nounwind -define spir_kernel void @host_kernel(i32 %size) #0 { -entry: - %size.addr = alloca i32, align 4 - %block = alloca %opencl.block*, align 8 - %wgSize = alloca i32, align 4 - %prefMul = alloca i32, align 4 - %agg.tmp = alloca %struct.ndrange_t, align 8 - store i32 %size, i32* %size.addr, align 4 - %0 = call %opencl.block* @spir_block_bind(i8* bitcast (void (i8*, i8 addrspace(3)*, i8 addrspace(3)*)* @__host_kernel_block_invoke to i8*), i32 0, i32 0, i8* null) - store %opencl.block* %0, %opencl.block** %block, align 8 - %1 = load %opencl.block*, %opencl.block** %block, align 8 -; CHECK: call {{.*}} @__get_kernel_work_group_size_impl - %call = call spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvPU3AS3vzE(%opencl.block* %1) - store i32 %call, i32* %wgSize, align 4 - %2 = load %opencl.block*, %opencl.block** %block, align 8 -; CHECK: call {{.*}} @__get_kernel_preferred_work_group_multiple_impl - %call1 = call spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvPU3AS3vzE(%opencl.block* %2) - store i32 %call1, i32* %prefMul, align 4 - %call2 = call spir_func %opencl.queue_t* @_Z17get_default_queuev() - call spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret %agg.tmp, i64 1) - %3 = load %opencl.block*, %opencl.block** %block, align 8 - %4 = load i32, i32* %size.addr, align 4 - %5 = load i32, i32* %wgSize, align 4 - %6 = load i32, i32* %prefMul, align 4 - %mul = mul i32 %5, %6 -; CHECK: call {{.*}} @__enqueue_kernel_events_vaargs({{.*}}, i32 {{.*}}, i32 {{.*}}) - %call3 = call spir_func i32 (%opencl.queue_t*, i32, %struct.ndrange_t*, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*, i32, ...) @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvPU3AS3vzEjz(%opencl.queue_t* %call2, i32 241, %struct.ndrange_t* byval %agg.tmp, i32 0, %opencl.clk_event_t** null, %opencl.clk_event_t** null, %opencl.block* %3, i32 %4, i32 %mul) - ret void -} - -; Function Attrs: nounwind -; CHECK-LABEL: define {{.*}} @__host_kernel_block_invoke -define internal spir_func void @__host_kernel_block_invoke(i8* %.block_descriptor, i8 addrspace(3)* %ptr0, i8 addrspace(3)* %ptr1) #0 { -entry: - %.block_descriptor.addr = alloca i8*, align 8 - %ptr0.addr = alloca i8 addrspace(3)*, align 8 - %ptr1.addr = alloca i8 addrspace(3)*, align 8 - %block.addr = alloca <{}>*, align 8 - store i8* %.block_descriptor, i8** %.block_descriptor.addr, align 8 - %0 = load i8*, i8** %.block_descriptor.addr - store i8 addrspace(3)* %ptr0, i8 addrspace(3)** %ptr0.addr, align 8 - store i8 addrspace(3)* %ptr1, i8 addrspace(3)** %ptr1.addr, align 8 - %block = bitcast i8* %.block_descriptor to <{}>* - store <{}>* %block, <{}>** %block.addr, align 8 - %1 = load i8 addrspace(3)*, i8 addrspace(3)** %ptr0.addr, align 8 - %2 = bitcast i8 addrspace(3)* %1 to float addrspace(3)* - %3 = load i8 addrspace(3)*, i8 addrspace(3)** %ptr1.addr, align 8 - %4 = bitcast i8 addrspace(3)* %3 to float addrspace(3)* - call spir_func void @device_kernel(float addrspace(3)* %2, float addrspace(3)* %4) - ret void -} - -declare %opencl.block* @spir_block_bind(i8*, i32, i32, i8*) - -declare spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvPU3AS3vzE(%opencl.block*) #1 - -declare spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvPU3AS3vzE(%opencl.block*) #1 - -; CHECK: declare {{.*}} @__enqueue_kernel_events_vaargs{{.*}}({{.*}}, i32, ...) -declare spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvPU3AS3vzEjz(%opencl.queue_t*, i32, %struct.ndrange_t* byval, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*, i32, ...) #1 - -declare spir_func %opencl.queue_t* @_Z17get_default_queuev() #1 - -declare spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret, i64) #1 - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!opencl.kernels = !{!0} -!opencl.enable.FP_CONTRACT = !{} -!opencl.spir.version = !{!6} -!opencl.ocl.version = !{!7} -!opencl.used.extensions = !{!8} -!opencl.used.optional.core.features = !{!8} -!opencl.compiler.options = !{!8} - -!0 = !{void (i32)* @host_kernel, !1, !2, !3, !4, !5} -!1 = !{!"kernel_arg_addr_space", i32 0} -!2 = !{!"kernel_arg_access_qual", !"none"} -!3 = !{!"kernel_arg_type", !"uint"} -!4 = !{!"kernel_arg_base_type", !"uint"} -!5 = !{!"kernel_arg_type_qual", !""} -!6 = !{i32 1, i32 2} -!7 = !{i32 2, i32 0} -!8 = !{} diff --git a/test/transcoding/device_execution_vaargs.ll b/test/transcoding/device_execution_vaargs.ll deleted file mode 100644 index 5f616b4f89..0000000000 --- a/test/transcoding/device_execution_vaargs.ll +++ /dev/null @@ -1,231 +0,0 @@ -;; bash$ cat device_execution_overloading.cl -;; void device_kernel_with_local_args(__local float* ptr0, __local float* ptr1) { -;; *ptr0 = 0; -;; *ptr1 = 1; -;; } -;; -;; void device_kernel(__global float* ptr) { -;; *ptr = 3; -;; } -;; -;; __kernel void host_kernel(uint size, __global float* ptr) { -;; void(^block_with_local)(__local void*, __local void*) = ^(__local void* ptr0, __local void* ptr1){ -;; device_kernel_with_local_args(ptr0, ptr1); -;; }; -;; -;; void(^block)(void) = ^{ -;; device_kernel(ptr); -;; }; -;; -;n; uint wgSize = get_kernel_work_group_size(block_with_local); -;; uint prefMul = get_kernel_preferred_work_group_size_multiple(block_with_local); -;; enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange_1D(1), -;; 0, NULL, NULL, block_with_local, size, wgSize * prefMul); -;; -;; wgSize = get_kernel_work_group_size(block); -;; prefMul = get_kernel_preferred_work_group_size_multiple(block); -;; enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange_1D(1), -;; 0, NULL, NULL, block); -;; } -;; bash$ -;;$PATH_TO_GEN/bin/clang -cc1 -x cl -O0 -cl-std=CL2.0 -triple spir64-unknonw-unknown -include $PATH_TO_GEN/lib/clang/3.6.1/include/opencl-20.h -emit-llvm device_execution_overloading.cl -o device_execution_overloading.ll - -;; Test enqueue_kernel with and wthout vaargs - -; RUN: llvm-as %s -o %t.bc -; RUN: llvm-spirv %t.bc -o %t.spv -; RUN: llvm-spirv -r %t.spv -o %t.bc -; RUN: llvm-dis < %t.bc | FileCheck %s - -; CHECK: %struct.__block_descriptor = type { i64, i64 } - -; CHECK: [[BLOCK:%.*]] = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>, align 8 -; CHECK: [[INVOKEGEP:%.*]] = getelementptr <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>* [[BLOCK]], i32 0, i32 3 -; CHECK: store i8* bitcast (void (i8*, i8 addrspace(3)*, i8 addrspace(3)*)* @__host_kernel_block_invoke to i8*), i8** [[INVOKEGEP]] -; CHECK: [[BLOCKDESCGEP:%.*]] = getelementptr <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>* [[BLOCK]], i32 0, i32 4 -; CHECK: store %struct.__block_descriptor* @__block_descriptor_spirv, %struct.__block_descriptor** [[BLOCKDESCGEP]] -; CHECK: [[BLOCKBCAST:%.*]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>* [[BLOCK]] to i8* -; CHECK: [[BLOCKADDRCAST:%.*]] = addrspacecast i8* [[BLOCKBCAST]] to i8 addrspace(4)* -; CHECK: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* [[BLOCKADDRCAST]]) -; CHECK: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8 addrspace(4)* [[BLOCKADDRCAST]]) -; CHECK: call i32 {{.*}} @__enqueue_kernel_events_vaargs - -; CHECK: [[BLOCK2:%.*]] = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>, align 8 -; CHECK: [[INVOKEGEP2:%.*]] = getelementptr <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>* [[BLOCK2]], i32 0, i32 3 -; CHECK: store i8* bitcast (void (i8*)* @__host_kernel_block_invoke_2 to i8*), i8** [[INVOKEGEP2]] -; CHECK: [[BLOCKDESCGEP2:%.*]] = getelementptr <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>* [[BLOCK2]], i32 0, i32 4 -; CHECK: store %struct.__block_descriptor* @__block_descriptor_spirv.1, %struct.__block_descriptor** [[BLOCKDESCGEP2]] -; CHECK: [[CAPTUREDGEP:%.*]] = getelementptr <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>* [[BLOCK2]], i32 0, i32 5 -; CHECK: [[CAPTUREDCAST:%.*]] = bitcast [8 x i8]* [[CAPTUREDGEP]] to i8* -; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[CAPTUREDCAST]], i8* align 8 %0, i32 8, i1 false) -; CHECK: [[BLOCKBCAST2:%.*]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>* [[BLOCK2]] to i8* -; CHECK: [[BLOCKADDRCAST2:%.*]] = addrspacecast i8* [[BLOCKBCAST2]] to i8 addrspace(4)* -; CHECK: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* [[BLOCKADDRCAST2]]) -; CHECK: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8 addrspace(4)* [[BLOCKADDRCAST2]]) #0 -; CHECK: call i32 @__enqueue_kernel_basic_events - -; CHECK: define internal spir_func void @__host_kernel_block_invoke( -; CHECK-NOT: bitcast i8* %.block_descriptor to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [0 x i8] }>* - -; CHECK: define internal spir_func void @__host_kernel_block_invoke_2( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[INV_BLOCK:%.*]] = bitcast i8* %.block_descriptor to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>* -; CHECK-NEXT: [[INV_CAPTUREDGEP:%.*]] = getelementptr <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>* [[INV_BLOCK]], i32 0, i32 5 -; CHECK-NEXT: bitcast [8 x i8]* [[INV_CAPTUREDGEP]] to i8* -; CHECK-NEXT: br label %invoke - -; ModuleID = 'device_execution_overloading.cl' -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" -target triple = "spir64-unknonw-unknown" - -%opencl.block = type opaque -%struct.ndrange_t = type { i32, [3 x i64], [3 x i64], [3 x i64] } -%opencl.queue_t = type opaque -%opencl.clk_event_t = type opaque - -; Function Attrs: nounwind -define spir_func void @device_kernel_with_local_args(float addrspace(3)* %ptr0, float addrspace(3)* %ptr1) #0 { -entry: - %ptr0.addr = alloca float addrspace(3)*, align 8 - %ptr1.addr = alloca float addrspace(3)*, align 8 - store float addrspace(3)* %ptr0, float addrspace(3)** %ptr0.addr, align 8 - store float addrspace(3)* %ptr1, float addrspace(3)** %ptr1.addr, align 8 - %0 = load float addrspace(3)*, float addrspace(3)** %ptr0.addr, align 8 - store float 0.000000e+00, float addrspace(3)* %0, align 4 - %1 = load float addrspace(3)*, float addrspace(3)** %ptr1.addr, align 8 - store float 1.000000e+00, float addrspace(3)* %1, align 4 - ret void -} - -; Function Attrs: nounwind -define spir_func void @device_kernel(float addrspace(1)* %ptr) #0 { -entry: - %ptr.addr = alloca float addrspace(1)*, align 8 - store float addrspace(1)* %ptr, float addrspace(1)** %ptr.addr, align 8 - %0 = load float addrspace(1)*, float addrspace(1)** %ptr.addr, align 8 - store float 3.000000e+00, float addrspace(1)* %0, align 4 - ret void -} -; Function Attrs: nounwind -define spir_kernel void @host_kernel(i32 %size, float addrspace(1)* %ptr) #0 { -entry: - %size.addr = alloca i32, align 4 - %ptr.addr = alloca float addrspace(1)*, align 8 - %block_with_local = alloca %opencl.block*, align 8 - %block = alloca %opencl.block*, align 8 - %captured = alloca <{ float addrspace(1)* }>, align 8 - %wgSize = alloca i32, align 4 - %prefMul = alloca i32, align 4 - %agg.tmp = alloca %struct.ndrange_t, align 8 - %agg.tmp8 = alloca %struct.ndrange_t, align 8 - store i32 %size, i32* %size.addr, align 4 - store float addrspace(1)* %ptr, float addrspace(1)** %ptr.addr, align 8 - %0 = call %opencl.block* @spir_block_bind(i8* bitcast (void (i8*, i8 addrspace(3)*, i8 addrspace(3)*)* @__host_kernel_block_invoke to i8*), i32 0, i32 0, i8* null) - store %opencl.block* %0, %opencl.block** %block_with_local, align 8 - %block.captured = getelementptr inbounds <{ float addrspace(1)* }>, <{ float addrspace(1)* }>* %captured, i32 0, i32 0 - %1 = load float addrspace(1)*, float addrspace(1)** %ptr.addr, align 8 - store float addrspace(1)* %1, float addrspace(1)** %block.captured, align 8 - %2 = bitcast <{ float addrspace(1)* }>* %captured to i8* - %3 = call %opencl.block* @spir_block_bind(i8* bitcast (void (i8*)* @__host_kernel_block_invoke_2 to i8*), i32 8, i32 8, i8* %2) - store %opencl.block* %3, %opencl.block** %block, align 8 - %4 = load %opencl.block*, %opencl.block** %block_with_local, align 8 - %call = call spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvPU3AS3vzE(%opencl.block* %4) - store i32 %call, i32* %wgSize, align 4 - %5 = load %opencl.block*, %opencl.block** %block_with_local, align 8 - %call2 = call spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvPU3AS3vzE(%opencl.block* %5) - store i32 %call2, i32* %prefMul, align 4 - %call3 = call spir_func %opencl.queue_t* @_Z17get_default_queuev() - call spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret %agg.tmp, i64 1) - %6 = load %opencl.block*, %opencl.block** %block_with_local, align 8 - %7 = load i32, i32* %size.addr, align 4 - %8 = load i32, i32* %wgSize, align 4 - %9 = load i32, i32* %prefMul, align 4 - %mul = mul i32 %8, %9 - %call4 = call spir_func i32 (%opencl.queue_t*, i32, %struct.ndrange_t*, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*, i32, ...) @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvPU3AS3vzEjz(%opencl.queue_t* %call3, i32 241, %struct.ndrange_t* byval %agg.tmp, i32 0, %opencl.clk_event_t** null, %opencl.clk_event_t** null, %opencl.block* %6, i32 %7, i32 %mul) - %10 = load %opencl.block*, %opencl.block** %block, align 8 - %call5 = call spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvvE(%opencl.block* %10) - store i32 %call5, i32* %wgSize, align 4 - %11 = load %opencl.block*, %opencl.block** %block, align 8 - %call6 = call spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvvE(%opencl.block* %11) - store i32 %call6, i32* %prefMul, align 4 - %call7 = call spir_func %opencl.queue_t* @_Z17get_default_queuev() - call spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret %agg.tmp8, i64 1) - %12 = load %opencl.block*, %opencl.block** %block, align 8 - %call9 = call spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvvE(%opencl.queue_t* %call7, i32 241, %struct.ndrange_t* byval %agg.tmp8, i32 0, %opencl.clk_event_t** null, %opencl.clk_event_t** null, %opencl.block* %12) - ret void -} - -; Function Attrs: nounwind -define internal spir_func void @__host_kernel_block_invoke(i8* %.block_descriptor, i8 addrspace(3)* %ptr0, i8 addrspace(3)* %ptr1) #0 { -entry: - %.block_descriptor.addr = alloca i8*, align 8 - %ptr0.addr = alloca i8 addrspace(3)*, align 8 - %ptr1.addr = alloca i8 addrspace(3)*, align 8 - %block.addr = alloca <{}>*, align 8 - store i8* %.block_descriptor, i8** %.block_descriptor.addr, align 8 - %0 = load i8*, i8** %.block_descriptor.addr - store i8 addrspace(3)* %ptr0, i8 addrspace(3)** %ptr0.addr, align 8 - store i8 addrspace(3)* %ptr1, i8 addrspace(3)** %ptr1.addr, align 8 - %block = bitcast i8* %.block_descriptor to <{}>* - store <{}>* %block, <{}>** %block.addr, align 8 - %1 = load i8 addrspace(3)*, i8 addrspace(3)** %ptr0.addr, align 8 - %2 = bitcast i8 addrspace(3)* %1 to float addrspace(3)* - %3 = load i8 addrspace(3)*, i8 addrspace(3)** %ptr1.addr, align 8 - %4 = bitcast i8 addrspace(3)* %3 to float addrspace(3)* - call spir_func void @device_kernel_with_local_args(float addrspace(3)* %2, float addrspace(3)* %4) - ret void -} - -declare %opencl.block* @spir_block_bind(i8*, i32, i32, i8*) - -; Function Attrs: nounwind -define internal spir_func void @__host_kernel_block_invoke_2(i8* %.block_descriptor) #0 { -entry: - %.block_descriptor.addr = alloca i8*, align 8 - %block.addr = alloca <{ float addrspace(1)* }>*, align 8 - store i8* %.block_descriptor, i8** %.block_descriptor.addr, align 8 - %0 = load i8*, i8** %.block_descriptor.addr - %block = bitcast i8* %.block_descriptor to <{ float addrspace(1)* }>* - store <{ float addrspace(1)* }>* %block, <{ float addrspace(1)* }>** %block.addr, align 8 - %block.capture.addr = getelementptr inbounds <{ float addrspace(1)* }>, <{ float addrspace(1)* }>* %block, i32 0, i32 0 - %1 = load float addrspace(1)*, float addrspace(1)** %block.capture.addr, align 8 - call spir_func void @device_kernel(float addrspace(1)* %1) - ret void -} - -declare spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvPU3AS3vzE(%opencl.block*) #1 - -declare spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvPU3AS3vzE(%opencl.block*) #1 - -declare spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvPU3AS3vzEjz(%opencl.queue_t*, i32, %struct.ndrange_t* byval, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*, i32, ...) #1 - -declare spir_func %opencl.queue_t* @_Z17get_default_queuev() #1 - -declare spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret, i64) #1 - -declare spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvvE(%opencl.block*) #1 - -declare spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvvE(%opencl.block*) #1 - -declare spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvvE(%opencl.queue_t*, i32, %struct.ndrange_t* byval, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*) #1 - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!opencl.kernels = !{!0} -!opencl.enable.FP_CONTRACT = !{} -!opencl.spir.version = !{!6} -!opencl.ocl.version = !{!7} -!opencl.used.extensions = !{!8} -!opencl.used.optional.core.features = !{!8} -!opencl.compiler.options = !{!8} - -!0 = !{void (i32, float addrspace(1)*)* @host_kernel, !1, !2, !3, !4, !5} -!1 = !{!"kernel_arg_addr_space", i32 0, i32 1} -!2 = !{!"kernel_arg_access_qual", !"none", !"none"} -!3 = !{!"kernel_arg_type", !"uint", !"float*"} -!4 = !{!"kernel_arg_base_type", !"uint", !"float*"} -!5 = !{!"kernel_arg_type_qual", !"", !""} -!6 = !{i32 1, i32 2} -!7 = !{i32 2, i32 0} -!8 = !{} diff --git a/test/transcoding/enqueue_kernel.ll b/test/transcoding/enqueue_kernel.ll new file mode 100644 index 0000000000..0d29c719a8 --- /dev/null +++ b/test/transcoding/enqueue_kernel.ll @@ -0,0 +1,355 @@ +; Source +; typedef struct {int a;} ndrange_t; +; +; kernel void device_side_enqueue(global int *a, global int *b, int i, char c0) { +; queue_t default_queue; +; unsigned flags = 0; +; ndrange_t ndrange; +; clk_event_t clk_event; +; clk_event_t event_wait_list; +; clk_event_t event_wait_list2[] = {clk_event}; +; +; // Emits block literal on stack and block kernel. +; enqueue_kernel(default_queue, flags, ndrange, +; ^(void) { +; a[i] = c0; +; }); +; +; // Emits block literal on stack and block kernel. +; enqueue_kernel(default_queue, flags, ndrange, 2, &event_wait_list, &clk_event, +; ^(void) { +; a[i] = b[i]; +; }); +; +; char c; +; // Emits global block literal and block kernel. +; enqueue_kernel(default_queue, flags, ndrange, 2, event_wait_list2, &clk_event, +; ^(local void *p) { +; return; +; }, +; c); +; +; // Emits global block literal and block kernel. +; enqueue_kernel(default_queue, flags, ndrange, +; ^(local void *p1, local void *p2, local void *p3) { +; return; +; }, +; 1, 2, 4); +; } +; +; Compilation command: +; clang -cc1 -triple spir-unknown-unknown -O0 -cl-std=CL2.0 -emit-llvm enqueue_kernel.cl + +; RUN: llvm-as %s -o %t.bc +; RUN: llvm-spirv %t.bc -spirv-text -o %t.spv.txt +; RUN: FileCheck < %t.spv.txt %s --check-prefix=CHECK-SPIRV +; RUN: llvm-spirv %t.bc -o %t.spv +; RUN: llvm-spirv -r %t.spv -o %t.rev.bc +; RUN: llvm-dis %t.rev.bc +; RUN: FileCheck < %t.rev.ll %s --check-prefix=CHECK-LLVM + +; ModuleID = 'enqueue_kernel.cl' +source_filename = "enqueue_kernel.cl" +target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir-unknown-unknown" + +%opencl.queue_t = type opaque +%struct.ndrange_t = type { i32 } +%opencl.clk_event_t = type opaque + +; CHECK-SPIRV: EntryPoint {{[0-9]+}} [[BlockKer1:[0-9]+]] "__device_side_enqueue_block_invoke_kernel" +; CHECK-SPIRV: EntryPoint {{[0-9]+}} [[BlockKer2:[0-9]+]] "__device_side_enqueue_block_invoke_2_kernel" +; CHECK-SPIRV: EntryPoint {{[0-9]+}} [[BlockKer3:[0-9]+]] "__device_side_enqueue_block_invoke_3_kernel" +; CHECK-SPIRV: EntryPoint {{[0-9]+}} [[BlockKer4:[0-9]+]] "__device_side_enqueue_block_invoke_4_kernel" +; CHECK-SPIRV: Name [[BlockGlb1:[0-9]+]] "__block_literal_global" +; CHECK-SPIRV: Name [[BlockGlb2:[0-9]+]] "__block_literal_global.1" + +; CHECK-SPIRV: TypeInt [[Int32Ty:[0-9]+]] 32 +; CHECK-SPIRV: TypeInt [[Int8Ty:[0-9]+]] 8 +; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt8:[0-9]+]] 8 +; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt0:[0-9]+]] 0 +; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt17:[0-9]+]] 17 +; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt2:[0-9]+]] 2 +; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt20:[0-9]+]] 20 +; CHECK-SPIRV: TypeVoid [[VoidTy:[0-9]+]] + +; CHECK-SPIRV: TypePointer {{[0-9]+}} 7 {{[0-9]+}} +; CHECK-SPIRV: TypePointer [[Int32LocPtrTy:[0-9]+]] 7 [[Int32Ty]] +; CHECK-SPIRV: TypeDeviceEvent [[EventTy:[0-9]+]] +; CHECK-SPIRV: TypePointer [[Int8PtrGenTy:[0-9]+]] 8 [[Int8Ty]] +; CHECK-SPIRV: TypePointer [[EventPtrTy:[0-9]+]] 8 [[EventTy]] +; CHECK-SPIRV: TypeFunction [[BlockTy1:[0-9]+]] [[VoidTy]] [[Int8PtrGenTy]] +; CHECK-SPIRV: TypeFunction [[BlockTy2:[0-9]+]] [[VoidTy]] [[Int8PtrGenTy]] +; CHECK-SPIRV: TypeFunction [[BlockTy3:[0-9]+]] [[VoidTy]] [[Int8PtrGenTy]] +; CHECK-SPIRV: ConstantNull [[EventPtrTy]] [[EventNull:[0-9]+]] + +; CHECK-LLVM: [[BlockTy1:%[0-9]+]] = type { i32, i32 } +; CHECK-LLVM: [[BlockTy2:%[0-9]+]] = type <{ i32, i32, i32 addrspace(1)*, i32, i8 }> +; CHECK-LLVM: [[BlockTy3:%[0-9]+]] = type <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> +; CHECK-LLVM: [[BlockTy4:%[0-9]+]] = type <{ i32, i32 }> + +; CHECK-LLVM: @__block_literal_global = internal addrspace(1) constant [[BlockTy1]] { i32 8, i32 4 }, align 4 +; CHECK-LLVM: @__block_literal_global.1 = internal addrspace(1) constant [[BlockTy1]] { i32 8, i32 4 }, align 4 + +@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4 +@__block_literal_global.1 = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4 + +; Function Attrs: convergent noinline nounwind optnone +define spir_kernel void @device_side_enqueue(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %i, i8 signext %c0) #0 !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 { +entry: + %default_queue = alloca %opencl.queue_t*, align 4 + %flags = alloca i32, align 4 + %ndrange = alloca %struct.ndrange_t, align 4 + %clk_event = alloca %opencl.clk_event_t*, align 4 + %event_wait_list = alloca %opencl.clk_event_t*, align 4 + %event_wait_list2 = alloca [1 x %opencl.clk_event_t*], align 4 + %block = alloca <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, align 4 + %block3 = alloca <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, align 4 + %c = alloca i8, align 1 + store i32 0, i32* %flags, align 4 + %arrayinit.begin = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0 + %0 = load %opencl.clk_event_t*, %opencl.clk_event_t** %clk_event, align 4 + store %opencl.clk_event_t* %0, %opencl.clk_event_t** %arrayinit.begin, align 4 + %1 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4 + %2 = load i32, i32* %flags, align 4 + %block.size = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 0 + store i32 17, i32* %block.size, align 4 + %block.align = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 1 + store i32 4, i32* %block.align, align 4 + %block.captured = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 2 + store i32 addrspace(1)* %a, i32 addrspace(1)** %block.captured, align 4 + %block.captured1 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 3 + store i32 %i, i32* %block.captured1, align 4 + %block.captured2 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 4 + store i8 %c0, i8* %block.captured2, align 4 + %3 = bitcast <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block to void ()* + %4 = addrspacecast void ()* %3 to i8 addrspace(4)* + +; CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGenTy]] [[BlockLit1:[0-9]+]] +; CHECK-SPIRV: EnqueueKernel [[Int32Ty]] {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} +; [[ConstInt0]] [[EventNull]] [[EventNull]] +; [[BlockKer1]] [[BlockLit1]] [[ConstInt17]] [[ConstInt8]] + +; CHECK-LLVM: [[Block2:%[0-9]+]] = addrspacecast [[BlockTy2]]* %block to i8 addrspace(4)* +; CHECK-LLVM: [[BlockInv2:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_kernel to i8 addrspace(4)* +; CHECK-LLVM: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 0, %opencl.clk_event_t* addrspace(4)* null, %opencl.clk_event_t* addrspace(4)* null, i8 addrspace(4)* [[BlockInv2]], i8 addrspace(4)* [[Block2]]) + + %5 = call i32 @__enqueue_kernel_basic(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* byval %ndrange, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* %4) + %6 = addrspacecast %opencl.clk_event_t** %event_wait_list to %opencl.clk_event_t* addrspace(4)* + %7 = addrspacecast %opencl.clk_event_t** %clk_event to %opencl.clk_event_t* addrspace(4)* + %block.size5 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 0 + store i32 20, i32* %block.size5, align 4 + %block.align6 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 1 + store i32 4, i32* %block.align6, align 4 + %block.captured7 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 2 + store i32 addrspace(1)* %a, i32 addrspace(1)** %block.captured7, align 4 + %block.captured8 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 3 + store i32 %i, i32* %block.captured8, align 4 + %block.captured9 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 4 + store i32 addrspace(1)* %b, i32 addrspace(1)** %block.captured9, align 4 + %8 = bitcast <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3 to void ()* + %9 = addrspacecast void ()* %8 to i8 addrspace(4)* + +; CHECK-SPIRV: PtrCastToGeneric [[EventPtrTy]] [[Event1:[0-9]+]] +; CHECK-SPIRV: PtrCastToGeneric [[EventPtrTy]] [[Event2:[0-9]+]] + +; CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGenTy]] [[BlockLit2:[0-9]+]] +; CHECK-SPIRV: EnqueueKernel [[Int32Ty]] {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} +; [[ConstInt2]] [[Event1]] [[Event2]] +; [[BlockKer2]] [[BlockLit2]] [[ConstInt20]] [[ConstInt8]] + +; CHECK-LLVM: [[Block3:%[0-9]+]] = addrspacecast [[BlockTy3]]* %block3 to i8 addrspace(4)* +; CHECK-LLVM: [[BlockInv3:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2_kernel to i8 addrspace(4)* +; CHECK-LLVM: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t* addrspace(4)* {{.*}}, %opencl.clk_event_t* addrspace(4)* {{.*}}, i8 addrspace(4)* [[BlockInv3]], i8 addrspace(4)* [[Block3]]) + + %10 = call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* %ndrange, i32 2, %opencl.clk_event_t* addrspace(4)* %6, %opencl.clk_event_t* addrspace(4)* %7, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* %9) + %11 = alloca [1 x i32] + %12 = getelementptr [1 x i32], [1 x i32]* %11, i32 0, i32 0 + %13 = load i8, i8* %c, align 1 + %14 = zext i8 %13 to i32 + store i32 %14, i32* %12, align 4 + +; CHECK-SPIRV: PtrAccessChain [[Int32LocPtrTy]] [[LocalBuf31:[0-9]+]] +; CHECK-SPIRV: Bitcast {{[0-9]+}} [[BlockLit3Tmp:[0-9]+]] [[BlockGlb1:[0-9]+]] +; CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGenTy]] [[BlockLit3:[0-9]+]] [[BlockLit3Tmp]] +; CHECK-SPIRV: EnqueueKernel [[Int32Ty]] {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} +; [[ConstInt2]] [[Event1]] [[Event2]] +; [[BlockKer3]] [[BlockLit3]] [[ConstInt8]] [[ConstInt8]] +; [[LocalBuf31]] + +; CHECK-LLVM: [[Block0Tmp:%[0-9]+]] = bitcast [[BlockTy1]] addrspace(1)* @__block_literal_global to i8 addrspace(1)* +; CHECK-LLVM: [[Block0:%[0-9]+]] = addrspacecast i8 addrspace(1)* [[Block0Tmp]] to i8 addrspace(4)* +; CHECK-LLVM: [[BlockInv0:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_3_kernel to i8 addrspace(4)* +; CHECK-LLVM: call i32 @__enqueue_kernel_events_varargs(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t* addrspace(4)* {{.*}}, %opencl.clk_event_t* addrspace(4)* {{.*}}, i8 addrspace(4)* [[BlockInv0]], i8 addrspace(4)* [[Block0]], i32 1, i32* {{.*}}) + + %15 = call i32 @__enqueue_kernel_events_varargs(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* %ndrange, i32 2, %opencl.clk_event_t* addrspace(4)* %6, %opencl.clk_event_t* addrspace(4)* %7, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_3_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %12) + %16 = alloca [3 x i32] + %17 = getelementptr [3 x i32], [3 x i32]* %16, i32 0, i32 0 + store i32 1, i32* %17, align 4 + %18 = getelementptr [3 x i32], [3 x i32]* %16, i32 0, i32 1 + store i32 2, i32* %18, align 4 + %19 = getelementptr [3 x i32], [3 x i32]* %16, i32 0, i32 2 + store i32 4, i32* %19, align 4 + +; CHECK-SPIRV: PtrAccessChain [[Int32LocPtrTy]] [[LocalBuf41:[0-9]+]] +; CHECK-SPIRV: PtrAccessChain [[Int32LocPtrTy]] [[LocalBuf42:[0-9]+]] +; CHECK-SPIRV: PtrAccessChain [[Int32LocPtrTy]] [[LocalBuf43:[0-9]+]] +; CHECK-SPIRV: Bitcast {{[0-9]+}} [[BlockLit4Tmp:[0-9]+]] [[BlockGlb2:[0-9]+]] +; CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGenTy]] [[BlockLit4:[0-9]+]] [[BlockLit4Tmp]] +; CHECK-SPIRV: EnqueueKernel [[Int32Ty]] {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} +; [[ConstInt0]] [[EventNull]] [[EventNull]] +; [[BlockKer4]] [[BlockLit4]] [[ConstInt8]] [[ConstInt8]] +; [[LocalBuf41]] [[LocalBuf42]] [[LocalBuf43]] + +; CHECK-LLVM: [[Block1Tmp:%[0-9]+]] = bitcast [[BlockTy1]] addrspace(1)* @__block_literal_global.1 to i8 addrspace(1)* +; CHECK-LLVM: [[Block1:%[0-9]+]] = addrspacecast i8 addrspace(1)* [[Block1Tmp]] to i8 addrspace(4)* +; CHECK-LLVM: [[BlockInv1:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_4_kernel to i8 addrspace(4)* +; CHECK-LLVM: call i32 @__enqueue_kernel_events_varargs(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 0, %opencl.clk_event_t* addrspace(4)* null, %opencl.clk_event_t* addrspace(4)* null, i8 addrspace(4)* [[BlockInv1]], i8 addrspace(4)* [[Block1]], i32 3, i32* {{.*}}) + + %20 = call i32 @__enqueue_kernel_varargs(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* %ndrange, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_4_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global.1 to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i32* %17) + ret void +} + +; Function Attrs: convergent noinline nounwind optnone +define internal spir_func void @__device_side_enqueue_block_invoke(i8 addrspace(4)* %.block_descriptor) #2 { +entry: + %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4 + %block.addr = alloca <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)*, align 4 + store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4 + %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)* + store <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)** %block.addr, align 4 + %block.capture.addr = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 4 + %0 = load i8, i8 addrspace(4)* %block.capture.addr, align 4 + %conv = sext i8 %0 to i32 + %block.capture.addr1 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 2 + %1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr1, align 4 + %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 3 + %2 = load i32, i32 addrspace(4)* %block.capture.addr2, align 4 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 %2 + store i32 %conv, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind +define internal spir_kernel void @__device_side_enqueue_block_invoke_kernel(i8 addrspace(4)*) #3 { +entry: + call void @__device_side_enqueue_block_invoke(i8 addrspace(4)* %0) + ret void +} + +declare i32 @__enqueue_kernel_basic(%opencl.queue_t*, i32, %struct.ndrange_t*, i8 addrspace(4)*, i8 addrspace(4)*) + +; Function Attrs: convergent noinline nounwind optnone +define internal spir_func void @__device_side_enqueue_block_invoke_2(i8 addrspace(4)* %.block_descriptor) #2 { +entry: + %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4 + %block.addr = alloca <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)*, align 4 + store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4 + %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* + store <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)** %block.addr, align 4 + %block.capture.addr = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 4 + %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr, align 4 + %block.capture.addr1 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 3 + %1 = load i32, i32 addrspace(4)* %block.capture.addr1, align 4 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 %1 + %2 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 2 + %3 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr2, align 4 + %block.capture.addr3 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 3 + %4 = load i32, i32 addrspace(4)* %block.capture.addr3, align 4 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %3, i32 %4 + store i32 %2, i32 addrspace(1)* %arrayidx4, align 4 + ret void +} + +; Function Attrs: nounwind +define internal spir_kernel void @__device_side_enqueue_block_invoke_2_kernel(i8 addrspace(4)*) #3 { +entry: + call void @__device_side_enqueue_block_invoke_2(i8 addrspace(4)* %0) + ret void +} + +declare i32 @__enqueue_kernel_basic_events(%opencl.queue_t*, i32, %struct.ndrange_t*, i32, %opencl.clk_event_t* addrspace(4)*, %opencl.clk_event_t* addrspace(4)*, i8 addrspace(4)*, i8 addrspace(4)*) + +; Function Attrs: convergent noinline nounwind optnone +define internal spir_func void @__device_side_enqueue_block_invoke_3(i8 addrspace(4)* %.block_descriptor, i8 addrspace(3)* %p) #2 { +entry: + %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4 + %p.addr = alloca i8 addrspace(3)*, align 4 + %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4 + store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4 + %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)* + store i8 addrspace(3)* %p, i8 addrspace(3)** %p.addr, align 4 + store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4 + ret void +} + +; Function Attrs: nounwind +define internal spir_kernel void @__device_side_enqueue_block_invoke_3_kernel(i8 addrspace(4)*, i8 addrspace(3)*) #3 { +entry: + call void @__device_side_enqueue_block_invoke_3(i8 addrspace(4)* %0, i8 addrspace(3)* %1) + ret void +} + +declare i32 @__enqueue_kernel_events_varargs(%opencl.queue_t*, i32, %struct.ndrange_t*, i32, %opencl.clk_event_t* addrspace(4)*, %opencl.clk_event_t* addrspace(4)*, i8 addrspace(4)*, i8 addrspace(4)*, i32, i32*) + +; Function Attrs: convergent noinline nounwind optnone +define internal spir_func void @__device_side_enqueue_block_invoke_4(i8 addrspace(4)* %.block_descriptor, i8 addrspace(3)* %p1, i8 addrspace(3)* %p2, i8 addrspace(3)* %p3) #2 { +entry: + %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4 + %p1.addr = alloca i8 addrspace(3)*, align 4 + %p2.addr = alloca i8 addrspace(3)*, align 4 + %p3.addr = alloca i8 addrspace(3)*, align 4 + %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4 + store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4 + %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)* + store i8 addrspace(3)* %p1, i8 addrspace(3)** %p1.addr, align 4 + store i8 addrspace(3)* %p2, i8 addrspace(3)** %p2.addr, align 4 + store i8 addrspace(3)* %p3, i8 addrspace(3)** %p3.addr, align 4 + store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4 + ret void +} + +; Function Attrs: nounwind +define internal spir_kernel void @__device_side_enqueue_block_invoke_4_kernel(i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*) #3 { +entry: + call void @__device_side_enqueue_block_invoke_4(i8 addrspace(4)* %0, i8 addrspace(3)* %1, i8 addrspace(3)* %2, i8 addrspace(3)* %3) + ret void +} + +declare i32 @__enqueue_kernel_varargs(%opencl.queue_t*, i32, %struct.ndrange_t*, i8 addrspace(4)*, i8 addrspace(4)*, i32, i32*) + +; CHECK-SPIRV-DAG: Function [[VoidTy]] [[BlockKer1]] 0 [[BlockTy1]] +; CHECK-SPIRV-DAG: Function [[VoidTy]] [[BlockKer2]] 0 [[BlockTy1]] +; CHECK-SPIRV-DAG: Function [[VoidTy]] [[BlockKer3]] 0 [[BlockTy2]] +; CHECK-SPIRV-DAG: Function [[VoidTy]] [[BlockKer4]] 0 [[BlockTy3]] + +; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_kernel(i8 addrspace(4)*) +; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_2_kernel(i8 addrspace(4)*) +; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_3_kernel(i8 addrspace(4)*, i8 addrspace(3)*) +; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_4_kernel(i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*) + +attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } +attributes #2 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!opencl.enable.FP_CONTRACT = !{} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.used.extensions = !{!2} +!opencl.used.optional.core.features = !{!2} +!opencl.compiler.options = !{!2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 2, i32 0} +!2 = !{} +!3 = !{!"clang version 7.0.0"} +!4 = !{i32 1, i32 1, i32 0, i32 0} +!5 = !{!"none", !"none", !"none", !"none"} +!6 = !{!"int*", !"int*", !"int", !"char"} +!7 = !{!"", !"", !"", !""} +!8 = !{i1 false, i1 false, i1 false, i1 false} +!9 = !{i32 0, i32 0, i32 0, i32 0} diff --git a/test/transcoding/kernel_query.ll b/test/transcoding/kernel_query.ll new file mode 100644 index 0000000000..f8e6cd3ea6 --- /dev/null +++ b/test/transcoding/kernel_query.ll @@ -0,0 +1,212 @@ +; Source +; typedef struct {int a;} ndrange_t; +; +; kernel void device_side_enqueue() { +; ndrange_t ndrange; +; +; get_kernel_work_group_size(^(){}); +; get_kernel_preferred_work_group_size_multiple(^(){}); +; +; #pragma OPENCL EXTENSION cl_khr_subgroups : enable +; get_kernel_max_sub_group_size_for_ndrange(ndrange, ^(){}); +; get_kernel_sub_group_count_for_ndrange(ndrange, ^(){}); +; } +; +; Compilation command: +; clang -cc1 -triple spir-unknown-unknown -O0 -cl-std=CL2.0 -emit-llvm kernel_query.cl + +; RUN: llvm-as %s -o %t.bc +; RUN: llvm-spirv %t.bc -spirv-text -o %t.spv.txt +; RUN: FileCheck < %t.spv.txt %s --check-prefix=CHECK-SPIRV +; RUN: llvm-spirv %t.bc -o %t.spv +; RUN: llvm-spirv -r %t.spv -o %t.rev.bc +; RUN: llvm-dis %t.rev.bc +; RUN: FileCheck < %t.rev.ll %s --check-prefix=CHECK-LLVM + +source_filename = "kernel_query.cl" +target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir-unknown-unknown" + +%struct.ndrange_t = type { i32 } + +; CHECK-SPIRV: EntryPoint {{.*}} [[BlockKer1:[0-9]+]] "__device_side_enqueue_block_invoke_kernel" +; CHECK-SPIRV: EntryPoint {{.*}} [[BlockKer2:[0-9]+]] "__device_side_enqueue_block_invoke_2_kernel" +; CHECK-SPIRV: EntryPoint {{.*}} [[BlockKer3:[0-9]+]] "__device_side_enqueue_block_invoke_3_kernel" +; CHECK-SPIRV: EntryPoint {{.*}} [[BlockKer4:[0-9]+]] "__device_side_enqueue_block_invoke_4_kernel" +; CHECK-SPIRV: Name [[BlockGlb1:[0-9]+]] "__block_literal_global" +; CHECK-SPIRV: Name [[BlockGlb2:[0-9]+]] "__block_literal_global.1" +; CHECK-SPIRV: Name [[BlockGlb3:[0-9]+]] "__block_literal_global.2" +; CHECK-SPIRV: Name [[BlockGlb4:[0-9]+]] "__block_literal_global.3" + +; CHECK-LLVM: [[BlockTy:%[0-9]+]] = type { i32, i32 } +%1 = type <{ i32, i32 }> + +; CHECK-LLVM: @__block_literal_global = internal addrspace(1) constant [[BlockTy]] { i32 8, i32 4 }, align 4 +; CHECK-LLVM: @__block_literal_global.1 = internal addrspace(1) constant [[BlockTy]] { i32 8, i32 4 }, align 4 +; CHECK-LLVM: @__block_literal_global.2 = internal addrspace(1) constant [[BlockTy]] { i32 8, i32 4 }, align 4 +; CHECK-LLVM: @__block_literal_global.3 = internal addrspace(1) constant [[BlockTy]] { i32 8, i32 4 }, align 4 + +@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4 +@__block_literal_global.1 = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4 +@__block_literal_global.2 = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4 +@__block_literal_global.3 = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4 + +; CHECK-SPIRV: TypeInt [[Int32Ty:[0-9]+]] 32 +; CHECK-SPIRV: TypeInt [[Int8Ty:[0-9]+]] 8 +; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt8:[0-9]+]] 8 +; CHECK-SPIRV: TypeVoid [[VoidTy:[0-9]+]] +; CHECK-SPIRV: TypeStruct [[NDRangeTy:[0-9]+]] [[Int32Ty]] {{$}} +; CHECK-SPIRV: TypePointer [[NDRangePtrTy:[0-9]+]] 7 [[NDRangeTy]] +; CHECK-SPIRV: TypePointer [[Int8PtrGenTy:[0-9]+]] 8 [[Int8Ty]] +; CHECK-SPIRV: TypeFunction [[BlockKerTy:[0-9]+]] [[VoidTy]] [[Int8PtrGenTy]] + +; Function Attrs: convergent noinline nounwind optnone +define spir_kernel void @device_side_enqueue() #0 !kernel_arg_addr_space !2 !kernel_arg_access_qual !2 !kernel_arg_type !2 !kernel_arg_base_type !2 !kernel_arg_type_qual !2 { +entry: + +; CHECK-SPIRV: Variable [[NDRangePtrTy]] [[NDRange:[0-9]+]] + + %ndrange = alloca %struct.ndrange_t, align 4 + +; CHECK-SPIRV: Bitcast {{[0-9]+}} [[BlockLit1Tmp:[0-9]+]] [[BlockGlb1]] +; CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGenTy]] [[BlockLit1:[0-9]+]] [[BlockLit1Tmp]] +; CHECK-SPIRV: GetKernelWorkGroupSize [[Int32Ty]] {{[0-9]+}} [[BlockKer1]] [[BlockLit1]] [[ConstInt8]] [[ConstInt8]] + +; CHECK-LLVM: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* {{.*}}, i8 addrspace(4)* {{.*}}) + + %0 = call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*)) + +; CHECK-SPIRV: Bitcast {{[0-9]+}} [[BlockLit2Tmp:[0-9]+]] [[BlockGlb2]] +; CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGenTy]] [[BlockLit2:[0-9]+]] [[BlockLit2Tmp]] +; CHECK-SPIRV: GetKernelPreferredWorkGroupSizeMultiple [[Int32Ty]] {{[0-9]+}} [[BlockKer2]] [[BlockLit2]] [[ConstInt8]] [[ConstInt8]] + +; CHECK-LLVM: call i32 @__get_kernel_preferred_work_group_size_multiple_impl(i8 addrspace(4)* {{.*}}, i8 addrspace(4)* {{.*}}) #1 + + %1 = call i32 @__get_kernel_preferred_work_group_size_multiple_impl(i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global.1 to i8 addrspace(1)*) to i8 addrspace(4)*)) + +; CHECK-SPIRV: Bitcast {{[0-9]+}} [[BlockLit3Tmp:[0-9]+]] [[BlockGlb3]] +; CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGenTy]] [[BlockLit3:[0-9]+]] [[BlockLit3Tmp]] +; CHECK-SPIRV: GetKernelNDrangeMaxSubGroupSize [[Int32Ty]] {{[0-9]+}} [[NDRange]] [[BlockKer3]] [[BlockLit3]] [[ConstInt8]] [[ConstInt8]] + +; CHECK-LLVM: call i32 @__get_kernel_max_sub_group_size_for_ndrange_impl(%struct.ndrange_t* {{.*}}, i8 addrspace(4)* {{.*}}, i8 addrspace(4)* {{.*}}) + + %2 = call i32 @__get_kernel_max_sub_group_size_for_ndrange_impl(%struct.ndrange_t* %ndrange, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_3_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global.2 to i8 addrspace(1)*) to i8 addrspace(4)*)) + +; CHECK-SPIRV: Bitcast {{[0-9]+}} [[BlockLit4Tmp:[0-9]+]] [[BlockGlb4]] +; CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGenTy]] [[BlockLit4:[0-9]+]] [[BlockLit4Tmp]] +; CHECK-SPIRV: GetKernelNDrangeSubGroupCount [[Int32Ty]] {{[0-9]+}} [[NDRange]] [[BlockKer4]] [[BlockLit4]] [[ConstInt8]] [[ConstInt8]] + +; CHECK-LLVM: call i32 @__get_kernel_sub_group_count_for_ndrange_impl(%struct.ndrange_t* {{.*}}, i8 addrspace(4)* {{.*}}, i8 addrspace(4)* {{.*}}) + + %3 = call i32 @__get_kernel_sub_group_count_for_ndrange_impl(%struct.ndrange_t* %ndrange, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_4_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global.3 to i8 addrspace(1)*) to i8 addrspace(4)*)) + ret void +} + +; Function Attrs: convergent noinline nounwind optnone +define internal spir_func void @__device_side_enqueue_block_invoke(i8 addrspace(4)* %.block_descriptor) #1 { +entry: + %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4 + %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4 + store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4 + %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)* + store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4 + ret void +} + +; Function Attrs: nounwind +define internal spir_kernel void @__device_side_enqueue_block_invoke_kernel(i8 addrspace(4)*) #2 { +entry: + call void @__device_side_enqueue_block_invoke(i8 addrspace(4)* %0) + ret void +} + +declare i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)*, i8 addrspace(4)*) + +; Function Attrs: convergent noinline nounwind optnone +define internal spir_func void @__device_side_enqueue_block_invoke_2(i8 addrspace(4)* %.block_descriptor) #1 { +entry: + %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4 + %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4 + store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4 + %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)* + store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4 + ret void +} + +; Function Attrs: nounwind +define internal spir_kernel void @__device_side_enqueue_block_invoke_2_kernel(i8 addrspace(4)*) #2 { +entry: + call void @__device_side_enqueue_block_invoke_2(i8 addrspace(4)* %0) + ret void +} + +declare i32 @__get_kernel_preferred_work_group_size_multiple_impl(i8 addrspace(4)*, i8 addrspace(4)*) + +; Function Attrs: convergent noinline nounwind optnone +define internal spir_func void @__device_side_enqueue_block_invoke_3(i8 addrspace(4)* %.block_descriptor) #1 { +entry: + %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4 + %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4 + store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4 + %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)* + store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4 + ret void +} + +; Function Attrs: nounwind +define internal spir_kernel void @__device_side_enqueue_block_invoke_3_kernel(i8 addrspace(4)*) #2 { +entry: + call void @__device_side_enqueue_block_invoke_3(i8 addrspace(4)* %0) + ret void +} + +declare i32 @__get_kernel_max_sub_group_size_for_ndrange_impl(%struct.ndrange_t*, i8 addrspace(4)*, i8 addrspace(4)*) + +; Function Attrs: convergent noinline nounwind optnone +define internal spir_func void @__device_side_enqueue_block_invoke_4(i8 addrspace(4)* %.block_descriptor) #1 { +entry: + %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4 + %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4 + store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4 + %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)* + store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4 + ret void +} + +; Function Attrs: nounwind +define internal spir_kernel void @__device_side_enqueue_block_invoke_4_kernel(i8 addrspace(4)*) #2 { +entry: + call void @__device_side_enqueue_block_invoke_4(i8 addrspace(4)* %0) + ret void +} + +declare i32 @__get_kernel_sub_group_count_for_ndrange_impl(%struct.ndrange_t*, i8 addrspace(4)*, i8 addrspace(4)*) + +; CHECK-SPIRV-DAG: Function [[VoidTy]] [[BlockKer1]] 0 [[BlockKerTy]] +; CHECK-SPIRV-DAG: Function [[VoidTy]] [[BlockKer2]] 0 [[BlockKerTy]] +; CHECK-SPIRV-DAG: Function [[VoidTy]] [[BlockKer3]] 0 [[BlockKerTy]] +; CHECK-SPIRV-DAG: Function [[VoidTy]] [[BlockKer4]] 0 [[BlockKerTy]] + +; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_kernel(i8 addrspace(4)*) +; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_2_kernel(i8 addrspace(4)*) +; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_3_kernel(i8 addrspace(4)*) +; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_4_kernel(i8 addrspace(4)*) + +attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } +attributes #3 = { argmemonly nounwind } + +!llvm.module.flags = !{!0} +!opencl.enable.FP_CONTRACT = !{} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.used.extensions = !{!2} +!opencl.used.optional.core.features = !{!2} +!opencl.compiler.options = !{!2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 2, i32 0} +!2 = !{} +!3 = !{!"clang version 7.0.0"}