diff --git a/include/LLVMSPIRVLib.h b/include/LLVMSPIRVLib.h
index eb67de3b77..b39c4bdce6 100644
--- a/include/LLVMSPIRVLib.h
+++ b/include/LLVMSPIRVLib.h
@@ -56,6 +56,7 @@ void initializeOCLTypeToSPIRVPass(PassRegistry &);
 void initializeSPIRVLowerBoolPass(PassRegistry &);
 void initializeSPIRVLowerConstExprPass(PassRegistry &);
 void initializeSPIRVLowerSPIRBlocksPass(PassRegistry &);
+void initializeSPIRVLowerOCLBlocksPass(PassRegistry &);
 void initializeSPIRVLowerMemmovePass(PassRegistry &);
 void initializeSPIRVRegularizeLLVMPass(PassRegistry &);
 void initializeSPIRVToOCL20Pass(PassRegistry &);
@@ -136,6 +137,9 @@ ModulePass *createSPIRVLowerConstExpr();
 /// Create a pass for lowering SPIR 2.0 blocks to functions calls.
 ModulePass *createSPIRVLowerSPIRBlocks();
 
+/// Create a pass for removing function pointers related to OCL 2.0 blocks
+ModulePass *createSPIRVLowerOCLBlocks();
+
 /// Create a pass for lowering llvm.memmove to llvm.memcpys with a temporary
 /// variable.
 ModulePass *createSPIRVLowerMemmove();
diff --git a/lib/SPIRV/CMakeLists.txt b/lib/SPIRV/CMakeLists.txt
index e71c80464e..9fca450647 100644
--- a/lib/SPIRV/CMakeLists.txt
+++ b/lib/SPIRV/CMakeLists.txt
@@ -4,16 +4,6 @@ if(SPIRV_USE_LLVM_API)
 endif(SPIRV_USE_LLVM_API)
 
 add_llvm_library(LLVMSPIRVLib
-  libSPIRV/SPIRVBasicBlock.cpp
-  libSPIRV/SPIRVDebug.cpp
-  libSPIRV/SPIRVDecorate.cpp
-  libSPIRV/SPIRVEntry.cpp
-  libSPIRV/SPIRVFunction.cpp
-  libSPIRV/SPIRVInstruction.cpp
-  libSPIRV/SPIRVModule.cpp
-  libSPIRV/SPIRVStream.cpp
-  libSPIRV/SPIRVType.cpp
-  libSPIRV/SPIRVValue.cpp
   Mangler/FunctionDescriptor.cpp
   Mangler/Mangler.cpp
   Mangler/ManglingUtils.cpp
@@ -25,8 +15,9 @@ add_llvm_library(LLVMSPIRVLib
   OCLUtil.cpp
   SPIRVLowerBool.cpp
   SPIRVLowerConstExpr.cpp
-  SPIRVLowerSPIRBlocks.cpp
   SPIRVLowerMemmove.cpp
+  SPIRVLowerOCLBlocks.cpp
+  SPIRVLowerSPIRBlocks.cpp
   SPIRVReader.cpp
   SPIRVRegularizeLLVM.cpp
   SPIRVToOCL20.cpp
@@ -34,6 +25,16 @@ add_llvm_library(LLVMSPIRVLib
   SPIRVWriter.cpp
   SPIRVWriterPass.cpp
   TransOCLMD.cpp
+  libSPIRV/SPIRVBasicBlock.cpp
+  libSPIRV/SPIRVDebug.cpp
+  libSPIRV/SPIRVDecorate.cpp
+  libSPIRV/SPIRVEntry.cpp
+  libSPIRV/SPIRVFunction.cpp
+  libSPIRV/SPIRVInstruction.cpp
+  libSPIRV/SPIRVModule.cpp
+  libSPIRV/SPIRVStream.cpp
+  libSPIRV/SPIRVType.cpp
+  libSPIRV/SPIRVValue.cpp
   LINK_COMPONENTS
     Analysis
     BitWriter
diff --git a/lib/SPIRV/OCL20ToSPIRV.cpp b/lib/SPIRV/OCL20ToSPIRV.cpp
index a8b8e8c7b0..006ced61ac 100644
--- a/lib/SPIRV/OCL20ToSPIRV.cpp
+++ b/lib/SPIRV/OCL20ToSPIRV.cpp
@@ -42,6 +42,7 @@
 #include "SPIRVInternal.h"
 
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/Instruction.h"
@@ -259,6 +260,12 @@ class OCL20ToSPIRV : public ModulePass, public InstVisitor<OCL20ToSPIRV> {
                                 const std::string &DemangledName,
                                 unsigned int Offset);
 
+  /// Transform enqueue_kernel and kernel query built-in functions to
+  /// spirv-friendly format filling arguments, required for device-side enqueue
+  /// instructions, but missed in the original call
+  void visitCallEnqueueKernel(CallInst *CI, const std::string &DemangledName);
+  void visitCallKernelQuery(CallInst *CI, const std::string &DemangledName);
+
   /// For cl_intel_subgroups block read built-ins:
   void visitSubgroupBlockReadINTEL(CallInst *CI, StringRef MangledName,
                                    const std::string &DemangledName);
@@ -351,6 +358,7 @@ bool OCL20ToSPIRV::runOnModule(Module &Module) {
     if (auto GV = dyn_cast<GlobalValue>(I))
       GV->eraseFromParent();
 
+  eraseUselessFunctions(M); // remove unused functions declarations
   LLVM_DEBUG(dbgs() << "After OCL20ToSPIRV:\n" << *M);
 
   std::string Err;
@@ -512,6 +520,14 @@ void OCL20ToSPIRV::visitCallInst(CallInst &CI) {
                              OCLImageChannelOrderOffset);
     return;
   }
+  if (isEnqueueKernelBI(MangledName)) {
+    visitCallEnqueueKernel(&CI, DemangledName);
+    return;
+  }
+  if (isKernelQueryBI(MangledName)) {
+    visitCallKernelQuery(&CI, DemangledName);
+    return;
+  }
   if (DemangledName.find(kOCLBuiltinName::SubgroupBlockReadINTELPrefix) == 0) {
     visitSubgroupBlockReadINTEL(&CI, MangledName, DemangledName);
     return;
@@ -1449,6 +1465,110 @@ void OCL20ToSPIRV::visitCallGetImageChannel(CallInst *CI, StringRef MangledName,
                       },
                       &Attrs);
 }
+void OCL20ToSPIRV::visitCallEnqueueKernel(CallInst *CI,
+                                          const std::string &DemangledName) {
+  const DataLayout &DL = M->getDataLayout();
+  bool HasEvents = DemangledName.find("events") != std::string::npos;
+
+  // SPIRV OpEnqueueKernel instruction has 10+ arguments.
+  SmallVector<Value *, 16> Args;
+
+  // Copy all arguments before block invoke function pointer
+  // which match with what Clang 6.0 produced
+  const unsigned BlockFIdx = HasEvents ? 6 : 3;
+  Args.assign(CI->arg_begin(), CI->arg_begin() + BlockFIdx);
+
+  // If no event arguments in original call, add dummy ones
+  if (!HasEvents) {
+    Args.push_back(getInt32(M, 0));           // dummy num events
+    Args.push_back(getOCLNullClkEventPtr(M)); // dummy wait events
+    Args.push_back(getOCLNullClkEventPtr(M)); // dummy ret event
+  }
+
+  // Invoke: Pointer to invoke function
+  Value *BlockFunc = CI->getArgOperand(BlockFIdx);
+  Args.push_back(cast<Function>(GetUnderlyingObject(BlockFunc, DL)));
+
+  // Param: Pointer to block literal
+  Value *BlockLiteral = CI->getArgOperand(BlockFIdx + 1);
+  Args.push_back(BlockLiteral);
+
+  // Param Size: Size of block literal structure
+  // Param Aligment: Aligment of block literal structure
+  // TODO: these numbers should be obtained from block literal structure
+  Type *ParamType = GetUnderlyingObject(BlockLiteral, DL)->getType();
+  if (PointerType *PT = dyn_cast<PointerType>(ParamType))
+    ParamType = PT->getElementType();
+  Args.push_back(getInt32(M, DL.getTypeStoreSize(ParamType)));
+  Args.push_back(getInt32(M, DL.getPrefTypeAlignment(ParamType)));
+
+  // Local sizes arguments: Sizes of block invoke arguments
+  // Clang 6.0 and higher generates local size operands as an array,
+  // so we need to unpack them
+  if (DemangledName.find("_varargs") != std::string::npos) {
+    const unsigned LocalSizeArrayIdx = HasEvents ? 9 : 6;
+    auto *LocalSizeArray =
+        cast<GetElementPtrInst>(CI->getArgOperand(LocalSizeArrayIdx));
+    auto *LocalSizeArrayTy =
+        cast<ArrayType>(LocalSizeArray->getSourceElementType());
+    const uint64_t LocalSizeNum = LocalSizeArrayTy->getNumElements();
+    for (unsigned I = 0; I < LocalSizeNum; ++I)
+      Args.push_back(GetElementPtrInst::Create(
+          LocalSizeArray->getSourceElementType(), // Pointee type
+          LocalSizeArray->getPointerOperand(),    // Alloca
+          {getInt32(M, 0), getInt32(M, I)},       // Indices
+          "", CI));
+  }
+
+  StringRef NewName = "__spirv_EnqueueKernel__";
+  FunctionType *FT =
+      FunctionType::get(CI->getType(), getTypes(Args), false /*isVarArg*/);
+  Function *NewF =
+      Function::Create(FT, GlobalValue::ExternalLinkage, NewName, M);
+  NewF->setCallingConv(CallingConv::SPIR_FUNC);
+  CallInst *NewCall = CallInst::Create(NewF, Args, "", CI);
+  NewCall->setCallingConv(NewF->getCallingConv());
+  CI->replaceAllUsesWith(NewCall);
+  CI->eraseFromParent();
+}
+
+void OCL20ToSPIRV::visitCallKernelQuery(CallInst *CI,
+                                        const std::string &DemangledName) {
+  const DataLayout &DL = M->getDataLayout();
+  bool HasNDRange =
+      DemangledName.find("_for_ndrange_impl") != std::string::npos;
+  // BIs with "_for_ndrange_impl" suffix has NDRange argument first, and
+  // Invoke argument following. For other BIs Invoke function is the first arg
+  const unsigned BlockFIdx = HasNDRange ? 1 : 0;
+  Value *BlockFVal = CI->getArgOperand(BlockFIdx)->stripPointerCasts();
+
+  auto *BlockF = cast<Function>(GetUnderlyingObject(BlockFVal, DL));
+
+  AttributeList Attrs = CI->getCalledFunction()->getAttributes();
+  mutateCallInst(M, CI,
+                 [=](CallInst *CI, std::vector<Value *> &Args) {
+                   Value *Param = *Args.rbegin();
+                   Type *ParamType = GetUnderlyingObject(Param, DL)->getType();
+                   if (PointerType *PT = dyn_cast<PointerType>(ParamType)) {
+                     ParamType = PT->getElementType();
+                   }
+                   // Last arg corresponds to SPIRV Param operand.
+                   // Insert Invoke in front of Param.
+                   // Add Param Size and Param Align at the end.
+                   Args[BlockFIdx] = BlockF;
+                   Args.push_back(getInt32(M, DL.getTypeStoreSize(ParamType)));
+                   Args.push_back(
+                       getInt32(M, DL.getPrefTypeAlignment(ParamType)));
+
+                   Op Opcode = OCLSPIRVBuiltinMap::map(DemangledName);
+                   // Adding "__" postfix, so in case we have multiple such
+                   // functions and their names will have numerical postfix,
+                   // then the numerical postfix will be droped and we will get
+                   // correct function name.
+                   return getSPIRVFuncName(Opcode, kSPIRVName::Postfix);
+                 },
+                 /*BuiltinFuncMangleInfo*/ nullptr, &Attrs);
+}
 
 // The intel_sub_group_block_read built-ins are overloaded to support both
 // buffers and images, but need to be mapped to distinct SPIR-V instructions.
diff --git a/lib/SPIRV/OCLUtil.cpp b/lib/SPIRV/OCLUtil.cpp
index 83e2182763..4be0ce766d 100644
--- a/lib/SPIRV/OCLUtil.cpp
+++ b/lib/SPIRV/OCLUtil.cpp
@@ -605,6 +605,19 @@ bool isSpecialTypeInitializer(Instruction *Inst) {
   return isSamplerInitializer(Inst) || isPipeStorageInitializer(Inst);
 }
 
+bool isEnqueueKernelBI(const StringRef MangledName) {
+  return MangledName == "__enqueue_kernel_basic" ||
+         MangledName == "__enqueue_kernel_basic_events" ||
+         MangledName == "__enqueue_kernel_varargs" ||
+         MangledName == "__enqueue_kernel_events_varargs";
+}
+
+bool isKernelQueryBI(const StringRef MangledName) {
+  return MangledName == "__get_kernel_work_group_size_impl" ||
+         MangledName == "__get_kernel_sub_group_count_for_ndrange_impl" ||
+         MangledName == "__get_kernel_max_sub_group_size_for_ndrange_impl" ||
+         MangledName == "__get_kernel_preferred_work_group_size_multiple_impl";
+}
 } // namespace OCLUtil
 
 void llvm::mangleOpenClBuiltin(const std::string &UniqName,
diff --git a/lib/SPIRV/OCLUtil.h b/lib/SPIRV/OCLUtil.h
index 4ac4b5aa01..e67705e280 100644
--- a/lib/SPIRV/OCLUtil.h
+++ b/lib/SPIRV/OCLUtil.h
@@ -365,6 +365,9 @@ bool isPipeStorageInitializer(Instruction *Inst);
 /// Check (isSamplerInitializer || isPipeStorageInitializer)
 bool isSpecialTypeInitializer(Instruction *Inst);
 
+bool isEnqueueKernelBI(const StringRef MangledName);
+bool isKernelQueryBI(const StringRef MangledName);
+
 } // namespace OCLUtil
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -542,11 +545,12 @@ template <> inline void SPIRVMap<std::string, Op, SPIRVInstruction>::init() {
   // CL 2.0 kernel enqueue builtins
   _SPIRV_OP(enqueue_marker, EnqueueMarker)
   _SPIRV_OP(enqueue_kernel, EnqueueKernel)
-  _SPIRV_OP(get_kernel_ndrange_subgroup_count, GetKernelNDrangeSubGroupCount)
-  _SPIRV_OP(get_kernel_ndrange_max_subgroup_count,
+  _SPIRV_OP(get_kernel_sub_group_count_for_ndrange_impl,
+            GetKernelNDrangeSubGroupCount)
+  _SPIRV_OP(get_kernel_max_sub_group_size_for_ndrange_impl,
             GetKernelNDrangeMaxSubGroupSize)
-  _SPIRV_OP(get_kernel_work_group_size, GetKernelWorkGroupSize)
-  _SPIRV_OP(get_kernel_preferred_work_group_size_multiple,
+  _SPIRV_OP(get_kernel_work_group_size_impl, GetKernelWorkGroupSize)
+  _SPIRV_OP(get_kernel_preferred_work_group_size_multiple_impl,
             GetKernelPreferredWorkGroupSizeMultiple)
   _SPIRV_OP(retain_event, RetainEvent)
   _SPIRV_OP(release_event, ReleaseEvent)
diff --git a/lib/SPIRV/SPIRVInternal.h b/lib/SPIRV/SPIRVInternal.h
index 99962b0008..b0a8d6c198 100644
--- a/lib/SPIRV/SPIRVInternal.h
+++ b/lib/SPIRV/SPIRVInternal.h
@@ -533,6 +533,10 @@ Function *getOrCreateFunction(Module *M, Type *RetTy, ArrayRef<Type *> ArgTypes,
                               AttributeList *Attrs = nullptr,
                               bool TakeName = true);
 
+PointerType *getOCLClkEventType(Module *M);
+PointerType *getOCLClkEventPtrType(Module *M);
+Constant *getOCLNullClkEventPtr(Module *M);
+
 /// Get function call arguments.
 /// \param Start Starting index.
 /// \param End Ending index.
diff --git a/lib/SPIRV/SPIRVLowerOCLBlocks.cpp b/lib/SPIRV/SPIRVLowerOCLBlocks.cpp
new file mode 100644
index 0000000000..50e183826b
--- /dev/null
+++ b/lib/SPIRV/SPIRVLowerOCLBlocks.cpp
@@ -0,0 +1,303 @@
+//===- SPIRVLowerOCLBlocks.cpp - OCL Utilities ----------------------------===//
+//
+//                     The LLVM/SPIRV Translator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// Copyright (c) 2018 Intel Corporation. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimers in the documentation
+// and/or other materials provided with the distribution.
+// Neither the names of Intel Corporation, nor the names of its
+// contributors may be used to endorse or promote products derived from this
+// Software without specific prior written permission.
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+// THE SOFTWARE.
+//
+//===----------------------------------------------------------------------===//
+//
+// SPIR-V specification doesn't allow function pointers, so SPIR-V translator
+// is designed to fail if a value with function type (except calls) is occured.
+// Currently there is only two cases, when function pointers are generating in
+// LLVM IR in OpenCL - block calls and device side enqueue built-in calls.
+//
+// In both cases values with function type used as intermediate representation
+// for block literal structure.
+//
+// This pass is designed to find such cases and simplify them to avoid any
+// function pointer types occurrences in LLVM IR in 4 steps.
+//
+// 1. Find all function pointer allocas, like
+//      %block = alloca void () *
+//
+//    Then find a single store to that alloca:
+//      %blockLit = alloca <{ i32, i32, ...}>, align 4
+//      %0 = bitcast <{ i32, i32, ... }>* %blockLit to void ()*
+//    > store void ()* %0, void ()** %block, align 4
+//
+//    And replace the alloca users by new instructions which used stored value
+//    %blockLit itself instead of function pointer alloca %block.
+//
+// 2. Find consecutive casts from block literal type to i8 addrspace(4)*
+//    used function pointers as an intermediate type:
+//      %0 = bitcast <{ i32, i32 }> %block to void() *
+//      %1 = addrspacecast void() * %0 to i8 addrspace(4)*
+//    And simplify them:
+//      %2 = addrspacecast <{ i32, i32 }> %block to i8 addrspace(4)*
+//
+// 3. Find all unused instructions with function pointer type occured after
+//    pp.1-2 and remove them.
+//
+// 4. Find unused globals with function pointer type, like
+//    @block = constant void ()*
+//             bitcast ({ i32, i32 }* @__block_literal_global to void ()*
+//
+//    And remove them.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "spv-lower-ocl-blocks"
+
+#include "OCLUtil.h"
+#include "SPIRVInternal.h"
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/Casting.h"
+
+using namespace llvm;
+
+namespace {
+
+static void
+removeUnusedFunctionPtrInst(Instruction *I,
+                            SmallSetVector<Instruction *, 16> &FuncPtrInsts) {
+  for (unsigned OpIdx = 0, Ops = I->getNumOperands(); OpIdx != Ops; ++OpIdx) {
+    Instruction *OpI = dyn_cast<Instruction>(I->getOperand(OpIdx));
+    I->setOperand(OpIdx, nullptr);
+    if (OpI && OpI != I && OpI->user_empty())
+      FuncPtrInsts.insert(OpI);
+  }
+  I->eraseFromParent();
+}
+
+static bool isFuncPtrAlloca(const AllocaInst *AI) {
+  auto *ET = dyn_cast<PointerType>(AI->getAllocatedType());
+  return ET && ET->getElementType()->isFunctionTy();
+}
+
+static bool hasFuncPtrType(const Value *V) {
+  auto *PT = dyn_cast<PointerType>(V->getType());
+  return PT && PT->getElementType()->isFunctionTy();
+}
+
+static bool isFuncPtrInst(const Instruction *I) {
+  if (auto *AI = dyn_cast<AllocaInst>(I))
+    return isFuncPtrAlloca(AI);
+
+  for (auto &Op : I->operands()) {
+    if (auto *AI = dyn_cast<AllocaInst>(Op))
+      return isFuncPtrAlloca(AI);
+
+    auto *OpI = dyn_cast<Instruction>(&Op);
+    if (OpI && OpI != I && hasFuncPtrType(OpI))
+      return true;
+  }
+  return false;
+}
+
+static StoreInst *findSingleStore(AllocaInst *AI) {
+  StoreInst *Store = nullptr;
+  for (auto *U : AI->users()) {
+    if (!isa<StoreInst>(U))
+      continue; // not a store
+    if (Store)
+      return nullptr; // there are more than one stores
+    Store = dyn_cast<StoreInst>(U);
+  }
+  return Store;
+}
+
+static void fixFunctionPtrAllocaUsers(AllocaInst *AI) {
+  // Find and remove a single store to alloca
+  auto *SingleStore = findSingleStore(AI);
+  assert(SingleStore && "More than one store to the function pointer alloca");
+  auto *StoredVal = SingleStore->getValueOperand();
+  SingleStore->eraseFromParent();
+
+  // Find loads from the alloca and replace thier users
+  for (auto *U : AI->users()) {
+    auto *LI = dyn_cast<LoadInst>(U);
+    if (!LI)
+      continue;
+
+    for (auto *U : LI->users()) {
+      auto *UInst = cast<Instruction>(U);
+      auto *Cast = CastInst::CreatePointerBitCastOrAddrSpaceCast(
+          StoredVal, UInst->getType(), "", UInst);
+      UInst->replaceAllUsesWith(Cast);
+    }
+  }
+}
+
+static int getBlockLiteralIdx(const Function &F) {
+  StringRef FName = F.getName();
+  if (isEnqueueKernelBI(FName))
+    return FName.contains("events") ? 7 : 4;
+  if (isKernelQueryBI(FName))
+    return FName.contains("for_ndrange") ? 2 : 1;
+  if (FName.startswith("__") && FName.contains("_block_invoke"))
+    return F.hasStructRetAttr() ? 1 : 0;
+
+  return -1; // No block literal argument
+}
+
+static bool hasBlockLiteralArg(const Function &F) {
+  return getBlockLiteralIdx(F) != -1;
+}
+
+static bool simplifyFunctionPtrCasts(Function &F) {
+  bool Changed = false;
+  int BlockLiteralIdx = getBlockLiteralIdx(F);
+  for (auto *U : F.users()) {
+    auto *Call = dyn_cast<CallInst>(U);
+    if (!Call)
+      continue;
+    if (Call->getFunction()->getName() == F.getName().str() + "_kernel")
+      continue; // Skip block invoke function calls inside block invoke kernels
+
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    auto *BlockLiteral = Call->getOperand(BlockLiteralIdx);
+    auto *BlockLiteralVal = GetUnderlyingObject(BlockLiteral, DL);
+    if (isa<GlobalVariable>(BlockLiteralVal))
+      continue; // nothing to do with globals
+
+    auto *BlockLiteralAlloca = cast<AllocaInst>(BlockLiteralVal);
+    assert(!BlockLiteralAlloca->getAllocatedType()->isFunctionTy() &&
+           "Function type shouldn't be there");
+
+    auto *NewBlockLiteral = CastInst::CreatePointerBitCastOrAddrSpaceCast(
+        BlockLiteralAlloca, BlockLiteral->getType(), "", Call);
+    BlockLiteral->replaceAllUsesWith(NewBlockLiteral);
+    Changed |= true;
+  }
+  return Changed;
+}
+
+static void
+findFunctionPtrAllocas(Module &M,
+                       SmallVectorImpl<AllocaInst *> &FuncPtrAllocas) {
+  for (auto &F : M) {
+    if (F.isDeclaration())
+      continue;
+    for (auto &I : instructions(F)) {
+      auto *AI = dyn_cast<AllocaInst>(&I);
+      if (!AI || !isFuncPtrAlloca(AI))
+        continue;
+      FuncPtrAllocas.push_back(AI);
+    }
+  }
+}
+
+static void
+findUnusedFunctionPtrInsts(Module &M,
+                           SmallSetVector<Instruction *, 16> &FuncPtrInsts) {
+  for (auto &F : M) {
+    if (F.isDeclaration())
+      continue;
+    for (auto &I : instructions(F))
+      if (I.user_empty() && isFuncPtrInst(&I))
+        FuncPtrInsts.insert(&I);
+  }
+}
+
+static void
+findUnusedFunctionPtrGlbs(Module &M,
+                          SmallVectorImpl<GlobalVariable *> &FuncPtrGlbs) {
+  for (auto &GV : M.globals()) {
+    if (!GV.user_empty())
+      continue;
+    auto *GVType = dyn_cast<PointerType>(GV.getType()->getElementType());
+    if (GVType && GVType->getElementType()->isFunctionTy())
+      FuncPtrGlbs.push_back(&GV);
+  }
+}
+
+class SPIRVLowerOCLBlocks : public ModulePass {
+
+public:
+  SPIRVLowerOCLBlocks() : ModulePass(ID) {}
+
+  bool runOnModule(Module &M) {
+    bool Changed = false;
+
+    // 1. Find function pointer allocas and fix their users
+    SmallVector<AllocaInst *, 16> FuncPtrAllocas;
+    findFunctionPtrAllocas(M, FuncPtrAllocas);
+
+    Changed |= !FuncPtrAllocas.empty();
+    for (auto *AI : FuncPtrAllocas)
+      fixFunctionPtrAllocaUsers(AI);
+
+    // 2. Simplify consecutive casts which use function pointer types
+    for (auto &F : M)
+      if (hasBlockLiteralArg(F))
+        Changed |= simplifyFunctionPtrCasts(F);
+
+    // 3. Cleanup unused instructions with function pointer type
+    // which are occured after pp. 1-2
+    SmallSetVector<Instruction *, 16> FuncPtrInsts;
+    findUnusedFunctionPtrInsts(M, FuncPtrInsts);
+
+    Changed |= !FuncPtrInsts.empty();
+    while (!FuncPtrInsts.empty()) {
+      Instruction *I = FuncPtrInsts.pop_back_val();
+      removeUnusedFunctionPtrInst(I, FuncPtrInsts);
+    }
+
+    // 4. Find and remove unused global variables with function pointer type
+    SmallVector<GlobalVariable *, 16> FuncPtrGlbs;
+    findUnusedFunctionPtrGlbs(M, FuncPtrGlbs);
+
+    Changed |= !FuncPtrGlbs.empty();
+    for (auto *GV : FuncPtrGlbs)
+      GV->eraseFromParent();
+
+    return Changed;
+  }
+
+  static char ID;
+}; // class SPIRVLowerOCLBlocks
+
+char SPIRVLowerOCLBlocks::ID = 0;
+
+} // namespace
+
+INITIALIZE_PASS(
+    SPIRVLowerOCLBlocks, "spv-lower-ocl-blocks",
+    "Remove function pointers occured in case of using OpenCL blocks", false,
+    false)
+
+llvm::ModulePass *llvm::createSPIRVLowerOCLBlocks() {
+  return new SPIRVLowerOCLBlocks();
+}
diff --git a/lib/SPIRV/SPIRVReader.cpp b/lib/SPIRV/SPIRVReader.cpp
index 6799b2d466..ad59f645c3 100644
--- a/lib/SPIRV/SPIRVReader.cpp
+++ b/lib/SPIRV/SPIRVReader.cpp
@@ -295,17 +295,16 @@ class SPIRVToLLVM {
   Value *transValueWithoutDecoration(SPIRVValue *, Function *F, BasicBlock *,
                                      bool CreatePlaceHolder = true);
   Value *transDeviceEvent(SPIRVValue *BV, Function *F, BasicBlock *BB);
-  Value *transEnqueuedBlock(SPIRVValue *BF, SPIRVValue *BC, SPIRVValue *BCSize,
-                            SPIRVValue *BCAligment, Function *F,
-                            BasicBlock *BB);
   bool transDecoration(SPIRVValue *, Value *);
   bool transAlign(SPIRVValue *, Value *);
   Instruction *transOCLBuiltinFromExtInst(SPIRVExtInst *BC, BasicBlock *BB);
   std::vector<Value *> transValue(const std::vector<SPIRVValue *> &,
                                   Function *F, BasicBlock *);
   Function *transFunction(SPIRVFunction *F);
+  Value *transBlockInvoke(SPIRVValue *Invoke, BasicBlock *BB);
   Instruction *transEnqueueKernelBI(SPIRVInstruction *BI, BasicBlock *BB);
-  Instruction *transWGSizeBI(SPIRVInstruction *BI, BasicBlock *BB);
+  Instruction *transWGSizeQueryBI(SPIRVInstruction *BI, BasicBlock *BB);
+  Instruction *transSGSizeQueryBI(SPIRVInstruction *BI, BasicBlock *BB);
   bool transFPContractMetadata();
   bool transKernelMetadata();
   bool transNonTemporalMetadata(Instruction *I);
@@ -1893,7 +1892,12 @@ Value *SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *BV, Function *F,
         BV, transEnqueueKernelBI(static_cast<SPIRVInstruction *>(BV), BB));
   case OpGetKernelWorkGroupSize:
   case OpGetKernelPreferredWorkGroupSizeMultiple:
-    return mapValue(BV, transWGSizeBI(static_cast<SPIRVInstruction *>(BV), BB));
+    return mapValue(
+        BV, transWGSizeQueryBI(static_cast<SPIRVInstruction *>(BV), BB));
+  case OpGetKernelNDrangeMaxSubGroupSize:
+  case OpGetKernelNDrangeSubGroupCount:
+    return mapValue(
+        BV, transSGSizeQueryBI(static_cast<SPIRVInstruction *>(BV), BB));
   default: {
     auto OC = BV->getOpCode();
     if (isSPIRVCmpInstTransToLLVMInst(static_cast<SPIRVInstruction *>(BV))) {
@@ -2063,165 +2067,25 @@ SPIRVToLLVM::transOCLBuiltinPostproc(SPIRVInstruction *BI, CallInst *CI,
   return CI;
 }
 
-static void adaptBlockInvoke(Function *Invoke, Type *BlockStructTy) {
-  // As first argument block invoke takes a pointer to captured data.
-  // We pass to block invoke whole block structure, not only captured data
-  // as it expected. So we need to update original function to unpack expected
-  // captured data and use it instead of an original argument
-  //
-  // %block = bitcast i8 addrspace(4)* to <{ ..., [X x i8] }> addrspace(4)*
-  // %block.1 = addrspacecast %block to <{ ..., [X x i8] }>*
-  // %captured = getelementptr <{ ..., [X x i8] }>, i32 0, i32 5
-  // %captured.1 = bitcast %captured to i8*
-
-  BasicBlock *BB = &(Invoke->getEntryBlock());
-  BB->splitBasicBlock(BB->begin(), "invoke");
-  auto FirstArg = &*(Invoke->arg_begin());
-  IRBuilder<> Builder(BB, BB->begin());
-
-  auto FirstArgTy = dyn_cast<PointerType>(FirstArg->getType());
-  assert(FirstArgTy && "Expects that first argument of invoke is a pointer");
-  unsigned FirstArgAS = FirstArgTy->getAddressSpace();
-
-  auto Int8PtrTy =
-      Type::getInt8PtrTy(Invoke->getParent()->getContext(), FirstArgAS);
-  auto BlockStructPtrTy = PointerType::get(BlockStructTy, FirstArgAS);
-
-  auto Int32Ty = Type::getInt32Ty(Invoke->getParent()->getContext());
-  Value *CapturedGEPIndices[2] = {ConstantInt::get(Int32Ty, 0),
-                                  ConstantInt::get(Int32Ty, 5)};
-  auto BlockToStructCast =
-      Builder.CreateBitCast(FirstArg, BlockStructPtrTy, "block");
-  auto CapturedGEP = Builder.CreateGEP(BlockToStructCast, CapturedGEPIndices);
-  auto CapturedToInt8Cast = Builder.CreateBitCast(CapturedGEP, Int8PtrTy);
-
-  FirstArg->replaceUsesOutsideBlock(CapturedToInt8Cast, BB);
-}
-
-static Type *getOrCreateBlockDescTy(Module *M) {
-  // Get or create block descriptor type which contains block size
-  // in the last element:  %struct.__block_descriptor = type { i64, i64 }
-  auto BlockDescTy = M->getTypeByName("struct.__block_descriptor");
-  if (BlockDescTy)
-    return BlockDescTy;
-
-  auto Int64Ty = Type::getInt64Ty(M->getContext());
-  Type *BlockDescElements[2] = {/*Reserved*/ Int64Ty, /*Block size*/ Int64Ty};
-  return StructType::create(M->getContext(), BlockDescElements,
-                            "struct.__block_descriptor");
-}
-
-Value *SPIRVToLLVM::transEnqueuedBlock(SPIRVValue *SInvoke,
-                                       SPIRVValue *SCaptured,
-                                       SPIRVValue *SCaptSize,
-                                       SPIRVValue *SCaptAlignment,
-                                       Function *LBI, BasicBlock *LBB) {
-  // Search if that block have been already translated
-  auto Loc = BlockMap.find(SInvoke);
-  if (Loc != BlockMap.end())
-    return Loc->second;
-
-  IRBuilder<> Builder(LBB);
-  const DataLayout &DL = M->getDataLayout();
-
-  // Translate block and its arguments from SPIRV values to LLVM
-  auto LInvoke = transFunction(static_cast<SPIRVFunction *>(SInvoke));
-  auto LCaptured = transValue(SCaptured, LBI, LBB, false);
-  auto LCaptSize =
-      dyn_cast<ConstantInt>(transValue(SCaptSize, LBI, LBB, false));
-  auto LCaptAlignment =
-      dyn_cast<ConstantInt>(transValue(SCaptAlignment, LBI, LBB, false));
-
-  // Create basic types
-  auto Int8Ty = Type::getInt8Ty(*Context);
-  auto Int32Ty = Type::getInt32Ty(*Context);
-  auto Int8PtrTy = Type::getInt8PtrTy(*Context, SPIRAS_Private);
-  auto Int8PtrTyGen = Type::getInt8PtrTy(*Context, SPIRAS_Generic);
-  auto BlockDescTy = getOrCreateBlockDescTy(M);
-  auto BlockDescPtrTy = BlockDescTy->getPointerTo(SPIRAS_Private);
-
-  // Create a block as structure:
-  // <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>
-  SmallVector<Type *, 8> BlockEls = {
-      /*isa*/ Int8PtrTy, /*flags*/ Int32Ty, /*reserved*/ Int32Ty,
-      /*invoke*/ Int8PtrTy, /*block_descriptor*/ BlockDescPtrTy};
-
-  // Add captured if any
-  // <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [X x i8] }>
-  // Note: captured data stored in structure as array of char
-  if (LCaptSize->getZExtValue() > 0)
-    BlockEls.push_back(ArrayType::get(Int8Ty, LCaptSize->getZExtValue()));
-
-  auto BlockTy = StructType::get(*Context, BlockEls, /*isPacked*/ true);
-
-  // Allocate block on the stack, then store data to it
-  auto BlockAlloca = Builder.CreateAlloca(BlockTy, nullptr, "block");
-  BlockAlloca->setAlignment(DL.getPrefTypeAlignment(BlockTy));
-
-  auto GetIndices = [Int32Ty](int A, int B) -> SmallVector<Value *, 2> {
-    return {ConstantInt::get(Int32Ty, A), ConstantInt::get(Int32Ty, B)};
-  };
-
-  // 1. isa, flags and reserved fields isn't used in current implementation
-  // Fill them the same way as clang does
-  auto IsaGEP = Builder.CreateGEP(BlockAlloca, GetIndices(0, 0));
-  Builder.CreateStore(ConstantPointerNull::get(Int8PtrTy), IsaGEP);
-  auto FlagsGEP = Builder.CreateGEP(BlockAlloca, GetIndices(0, 1));
-  Builder.CreateStore(ConstantInt::get(Int32Ty, 1342177280), FlagsGEP);
-  auto ReservedGEP = Builder.CreateGEP(BlockAlloca, GetIndices(0, 2));
-  Builder.CreateStore(ConstantInt::get(Int32Ty, 0), ReservedGEP);
-
-  // 2. Store pointer to block invoke to the structure
-  auto InvokeCast = Builder.CreateBitCast(LInvoke, Int8PtrTy, "invoke");
-  auto InvokeGEP = Builder.CreateGEP(BlockAlloca, GetIndices(0, 3));
-  Builder.CreateStore(InvokeCast, InvokeGEP);
-
-  // 3. Create and store a pointer to the block descriptor global value
-  uint64_t SizeOfBlock = DL.getTypeAllocSize(BlockTy);
-
-  auto Int64Ty = Type::getInt64Ty(*Context);
-  Constant *BlockDescEls[2] = {ConstantInt::get(Int64Ty, 0),
-                               ConstantInt::get(Int64Ty, SizeOfBlock)};
-  auto BlockDesc =
-      ConstantStruct::get(dyn_cast<StructType>(BlockDescTy), BlockDescEls);
-
-  auto BlockDescGV =
-      new GlobalVariable(*M, BlockDescTy, true, GlobalValue::InternalLinkage,
-                         BlockDesc, "__block_descriptor_spirv");
-  auto BlockDescGEP =
-      Builder.CreateGEP(BlockAlloca, GetIndices(0, 4), "block.descriptor");
-  Builder.CreateStore(BlockDescGV, BlockDescGEP);
-
-  // 4. Copy captured data to the structure
-  if (LCaptSize->getZExtValue() > 0) {
-    auto CapturedGEP =
-        Builder.CreateGEP(BlockAlloca, GetIndices(0, 5), "block.captured");
-    auto CapturedGEPCast = Builder.CreateBitCast(CapturedGEP, Int8PtrTy);
-
-    // We can't make any guesses about type of captured data, so
-    // let's copy it through memcpy
-    Builder.CreateMemCpy(CapturedGEPCast, LCaptAlignment->getZExtValue(),
-                         LCaptured, LCaptAlignment->getZExtValue(), LCaptSize,
-                         SCaptured->isVolatile());
-
-    // Fix invoke function to correctly process its first argument
-    adaptBlockInvoke(LInvoke, BlockTy);
-  }
-  auto BlockCast = Builder.CreateBitCast(BlockAlloca, Int8PtrTy);
-  auto BlockCastGen = Builder.CreateAddrSpaceCast(BlockCast, Int8PtrTyGen);
-  BlockMap[SInvoke] = BlockCastGen;
-  return BlockCastGen;
+Value *SPIRVToLLVM::transBlockInvoke(SPIRVValue *Invoke, BasicBlock *BB) {
+  auto *TranslatedInvoke = transFunction(static_cast<SPIRVFunction *>(Invoke));
+  auto *Int8PtrTyGen = Type::getInt8PtrTy(*Context, SPIRAS_Generic);
+  return CastInst::CreatePointerBitCastOrAddrSpaceCast(TranslatedInvoke,
+                                                       Int8PtrTyGen, "", BB);
 }
 
 Instruction *SPIRVToLLVM::transEnqueueKernelBI(SPIRVInstruction *BI,
                                                BasicBlock *BB) {
-  Type *IntTy = Type::getInt32Ty(*Context);
+  Type *Int32Ty = Type::getInt32Ty(*Context);
+  Type *Int64Ty = Type::getInt64Ty(*Context);
+  Type *IntTy =
+      M->getDataLayout().getPointerSizeInBits(0) == 32 ? Int32Ty : Int64Ty;
 
   // Find or create enqueue kernel BI declaration
   auto Ops = BI->getOperands();
   bool HasVaargs = Ops.size() > 10;
 
-  std::string FName = HasVaargs ? "__enqueue_kernel_events_vaargs"
+  std::string FName = HasVaargs ? "__enqueue_kernel_events_varargs"
                                 : "__enqueue_kernel_basic_events";
   Function *F = M->getFunction(FName);
   if (!F) {
@@ -2231,17 +2095,22 @@ Instruction *SPIRVToLLVM::transEnqueueKernelBI(SPIRVInstruction *BI,
 
     SmallVector<Type *, 8> Tys = {
         transType(Ops[0]->getType()), // queue
-        IntTy,                        // flags
+        Int32Ty,                      // flags
         transType(Ops[2]->getType()), // ndrange
-        IntTy,
+        Int32Ty,
         EventTy,
-        EventTy,                                     // events
-        Type::getInt8PtrTy(*Context, SPIRAS_Generic) // block
+        EventTy,                                      // events
+        Type::getInt8PtrTy(*Context, SPIRAS_Generic), // block_invoke
+        Type::getInt8PtrTy(*Context, SPIRAS_Generic)  // block_literal
     };
-    if (HasVaargs)
-      Tys.push_back(IntTy); // Number of variadics if any
+    if (HasVaargs) {
+      // Number of block invoke arguments (local arguments)
+      Tys.push_back(Int32Ty);
+      // Array of sizes of block invoke arguments
+      Tys.push_back(PointerType::get(IntTy, SPIRAS_Private));
+    }
 
-    FunctionType *FT = FunctionType::get(IntTy, Tys, HasVaargs);
+    FunctionType *FT = FunctionType::get(Int32Ty, Tys, false);
     F = Function::Create(FT, GlobalValue::ExternalLinkage, FName, M);
     if (isFuncNoUnwind())
       F->addFnAttr(Attribute::NoUnwind);
@@ -2255,14 +2124,18 @@ Instruction *SPIRVToLLVM::transEnqueueKernelBI(SPIRVInstruction *BI,
       transValue(Ops[3], F, BB, false), // events number
       transDeviceEvent(Ops[4], F, BB),  // event_wait_list
       transDeviceEvent(Ops[5], F, BB),  // event_ret
-      transEnqueuedBlock(Ops[6], Ops[7], Ops[8], Ops[9], F, BB) // block
+      transBlockInvoke(Ops[6], BB),     // block_invoke
+      transValue(Ops[7], F, BB, false)  // block_literal
   };
 
   if (HasVaargs) {
-    Args.push_back(
-        ConstantInt::get(IntTy, Ops.size() - 10)); // Number of vaargs
-    for (unsigned I = 10; I < Ops.size(); ++I)
-      Args.push_back(transValue(Ops[I], F, BB, false));
+    // Number of local arguments
+    Args.push_back(ConstantInt::get(Int32Ty, Ops.size() - 10));
+    // GEP to array of sizes of local arguments
+    if (Ops[10]->getOpCode() == OpPtrAccessChain)
+      Args.push_back(transValue(Ops[10], F, BB, false));
+    else
+      llvm_unreachable("Not implemented");
   }
   auto Call = CallInst::Create(F, Args, "", BB);
   setName(Call, BI);
@@ -2270,23 +2143,57 @@ Instruction *SPIRVToLLVM::transEnqueueKernelBI(SPIRVInstruction *BI,
   return Call;
 }
 
-Instruction *SPIRVToLLVM::transWGSizeBI(SPIRVInstruction *BI, BasicBlock *BB) {
-  std::string FName = (BI->getOpCode() == OpGetKernelWorkGroupSize)
-                          ? "__get_kernel_work_group_size_impl"
-                          : "__get_kernel_preferred_work_group_multiple_impl";
+Instruction *SPIRVToLLVM::transWGSizeQueryBI(SPIRVInstruction *BI,
+                                             BasicBlock *BB) {
+  std::string FName =
+      (BI->getOpCode() == OpGetKernelWorkGroupSize)
+          ? "__get_kernel_work_group_size_impl"
+          : "__get_kernel_preferred_work_group_size_multiple_impl";
 
   Function *F = M->getFunction(FName);
   if (!F) {
     auto Int8PtrTyGen = Type::getInt8PtrTy(*Context, SPIRAS_Generic);
-    FunctionType *FT =
-        FunctionType::get(Type::getInt32Ty(*Context), Int8PtrTyGen, false);
+    FunctionType *FT = FunctionType::get(Type::getInt32Ty(*Context),
+                                         {Int8PtrTyGen, Int8PtrTyGen}, false);
     F = Function::Create(FT, GlobalValue::ExternalLinkage, FName, M);
     if (isFuncNoUnwind())
       F->addFnAttr(Attribute::NoUnwind);
   }
   auto Ops = BI->getOperands();
-  auto Block = transEnqueuedBlock(Ops[0], Ops[1], Ops[2], Ops[3], F, BB);
-  auto Call = CallInst::Create(F, Block, "", BB);
+  SmallVector<Value *, 2> Args = {transBlockInvoke(Ops[0], BB),
+                                  transValue(Ops[1], F, BB, false)};
+  auto Call = CallInst::Create(F, Args, "", BB);
+  setName(Call, BI);
+  setAttrByCalledFunc(Call);
+  return Call;
+}
+
+Instruction *SPIRVToLLVM::transSGSizeQueryBI(SPIRVInstruction *BI,
+                                             BasicBlock *BB) {
+  std::string FName = (BI->getOpCode() == OpGetKernelNDrangeMaxSubGroupSize)
+                          ? "__get_kernel_max_sub_group_size_for_ndrange_impl"
+                          : "__get_kernel_sub_group_count_for_ndrange_impl";
+
+  auto Ops = BI->getOperands();
+  Function *F = M->getFunction(FName);
+  if (!F) {
+    auto Int8PtrTyGen = Type::getInt8PtrTy(*Context, SPIRAS_Generic);
+    SmallVector<Type *, 3> Tys = {
+        transType(Ops[0]->getType()), // ndrange
+        Int8PtrTyGen,                 // block_invoke
+        Int8PtrTyGen                  // block_literal
+    };
+    auto *FT = FunctionType::get(Type::getInt32Ty(*Context), Tys, false);
+    F = Function::Create(FT, GlobalValue::ExternalLinkage, FName, M);
+    if (isFuncNoUnwind())
+      F->addFnAttr(Attribute::NoUnwind);
+  }
+  SmallVector<Value *, 2> Args = {
+      transValue(Ops[0], F, BB, false), // ndrange
+      transBlockInvoke(Ops[1], BB),     // block_invoke
+      transValue(Ops[2], F, BB, false)  // block_literal
+  };
+  auto Call = CallInst::Create(F, Args, "", BB);
   setName(Call, BI);
   setAttrByCalledFunc(Call);
   return Call;
diff --git a/lib/SPIRV/SPIRVUtil.cpp b/lib/SPIRV/SPIRVUtil.cpp
index a05630387c..b80ca97e26 100644
--- a/lib/SPIRV/SPIRVUtil.cpp
+++ b/lib/SPIRV/SPIRVUtil.cpp
@@ -461,6 +461,13 @@ bool getSPIRVBuiltin(const std::string &OrigName, spv::BuiltIn &B) {
   return getByName(R.str(), B);
 }
 
+// Some OpenCL built-ins generated by Clang are not mangled
+// TODO: add other built-ins which are not mangled by Clang
+bool isNonMangledOCLBuiltin(const StringRef &Name) {
+  if (isEnqueueKernelBI(Name) || isKernelQueryBI(Name))
+    return true;
+  return false;
+}
 bool oclIsBuiltin(const StringRef &Name, std::string *DemangledName,
                   bool IsCpp) {
   if (Name == "printf") {
@@ -468,6 +475,11 @@ bool oclIsBuiltin(const StringRef &Name, std::string *DemangledName,
       *DemangledName = Name;
     return true;
   }
+  if (isNonMangledOCLBuiltin(Name)) {
+    if (DemangledName)
+      *DemangledName = Name.drop_front(2);
+    return true;
+  }
   if (!Name.startswith("_Z"))
     return false;
   if (!DemangledName)
@@ -1388,4 +1400,18 @@ Type *getSPIRVImageTypeFromOCL(Module *M, Type *ImageTy) {
     Acc = getAccessQualifier(ImageTypeName);
   return getOrCreateOpaquePtrType(M, mapOCLTypeNameToSPIRV(ImageTypeName, Acc));
 }
+
+llvm::PointerType *getOCLClkEventType(Module *M) {
+  return getOrCreateOpaquePtrType(M, SPIR_TYPE_NAME_CLK_EVENT_T,
+                                  SPIRAS_Private);
+}
+
+llvm::PointerType *getOCLClkEventPtrType(Module *M) {
+  return PointerType::get(getOCLClkEventType(M), SPIRAS_Generic);
+}
+
+llvm::Constant *getOCLNullClkEventPtr(Module *M) {
+  return Constant::getNullValue(getOCLClkEventPtrType(M));
+}
+
 } // namespace SPIRV
diff --git a/lib/SPIRV/SPIRVWriter.cpp b/lib/SPIRV/SPIRVWriter.cpp
index 27f5b8f014..e0c8ec9064 100644
--- a/lib/SPIRV/SPIRVWriter.cpp
+++ b/lib/SPIRV/SPIRVWriter.cpp
@@ -692,7 +692,8 @@ SPIRVValue *LLVMToSPIRV::transConstant(Value *V) {
   if (auto CAZero = dyn_cast<ConstantAggregateZero>(V)) {
     Type *AggType = CAZero->getType();
     if (const StructType *ST = dyn_cast<StructType>(AggType))
-      if (ST->getName() == getSPIRVTypeName(kSPIRVTypeName::ConstantSampler))
+      if (ST->hasName() &&
+          ST->getName() == getSPIRVTypeName(kSPIRVTypeName::ConstantSampler))
         return BM->addSamplerConstant(transType(AggType), 0, 0, 0);
 
     return BM->addNullConstant(transType(AggType));
@@ -735,9 +736,11 @@ SPIRVValue *LLVMToSPIRV::transConstant(Value *V) {
     return BM->addCompositeConstant(transType(V->getType()), BV);
   }
 
-  if (auto ConstV = dyn_cast<ConstantStruct>(V)) {
-    if (ConstV->getType()->getName() ==
-        getSPIRVTypeName(kSPIRVTypeName::ConstantSampler)) {
+  if (const auto *ConstV = dyn_cast<ConstantStruct>(V)) {
+    StringRef StructName;
+    if (ConstV->getType()->hasName())
+      StructName = ConstV->getType()->getName();
+    if (StructName == getSPIRVTypeName(kSPIRVTypeName::ConstantSampler)) {
       assert(ConstV->getNumOperands() == 3);
       SPIRVWord AddrMode =
                     ConstV->getOperand(0)->getUniqueInteger().getZExtValue(),
@@ -752,8 +755,7 @@ SPIRVValue *LLVMToSPIRV::transConstant(Value *V) {
       return BM->addSamplerConstant(SamplerTy, AddrMode, Normalized,
                                     FilterMode);
     }
-    if (ConstV->getType()->getName() ==
-        getSPIRVTypeName(kSPIRVTypeName::ConstantPipeStorage)) {
+    if (StructName == getSPIRVTypeName(kSPIRVTypeName::ConstantPipeStorage)) {
       assert(ConstV->getNumOperands() == 3);
       SPIRVWord PacketSize =
                     ConstV->getOperand(0)->getUniqueInteger().getZExtValue(),
@@ -1800,6 +1802,7 @@ void addPassesForSPIRV(legacy::PassManager &PassMgr) {
   PassMgr.add(createOCL21ToSPIRV());
   PassMgr.add(createSPIRVLowerSPIRBlocks());
   PassMgr.add(createOCLTypeToSPIRV());
+  PassMgr.add(createSPIRVLowerOCLBlocks());
   PassMgr.add(createOCL20ToSPIRV());
   PassMgr.add(createSPIRVRegularizeLLVM());
   PassMgr.add(createSPIRVLowerConstExpr());
diff --git a/test/EnqueueEmptyKernel.ll b/test/EnqueueEmptyKernel.ll
index 656fa94e89..744afb2550 100644
--- a/test/EnqueueEmptyKernel.ll
+++ b/test/EnqueueEmptyKernel.ll
@@ -1,6 +1,6 @@
 ;; This test checks that Invoke parameter of OpEnueueKernel instruction meet the
 ;; following specification requirements in case of enqueueing empty block:
-;;"Invoke must be an OpFunction whose OpTypeFunction operand has:
+;; "Invoke must be an OpFunction whose OpTypeFunction operand has:
 ;; - Result Type must be OpTypeVoid.
 ;; - The first parameter must have a type of OpTypePointer to an 8-bit OpTypeInt.
 ;; - An optional list of parameters, each of which must have a type of OpTypePointer to the Workgroup Storage Class.
@@ -11,6 +11,7 @@
 ;;                  ndrange_1D(1),
 ;;                  0, NULL, NULL,
 ;;                  ^(){});
+;; }
 ; RUN: llvm-as < %s > %t.bc
 ; RUN: llvm-spirv %t.bc -o - -spirv-text | FileCheck %s --check-prefix=CHECK-SPIRV
 
@@ -19,63 +20,66 @@ target triple = "spir64-unknown-unknown"
 
 %struct.ndrange_t = type { i32, [3 x i64], [3 x i64], [3 x i64] }
 %opencl.queue_t = type opaque
-%opencl.block = type opaque
 %opencl.clk_event_t = type opaque
 
+@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
+
+; CHECK-SPIRV: Name [[Block:[0-9]+]] "__block_literal_global"
 ; CHECK-SPIRV: TypeInt [[Int8:[0-9]+]] 8
 ; CHECK-SPIRV: TypeVoid [[Void:[0-9]+]]
-; CHECK-SPIRV: TypePointer [[Int8Ptr:[0-9]+]] {{[0-9]+}} [[Int8]]
-; CHECK-SPIRV: ConstantNull [[Int8Ptr]] [[NullInt8Ptr:[0-9]+]]
+; CHECK-SPIRV: TypePointer [[Int8Ptr:[0-9]+]] 5 [[Int8]]
+; CHECK-SPIRV: TypePointer [[Int8PtrGen:[0-9]+]] 8 [[Int8]]
+; CHECK-SPIRV: Variable {{[0-9]+}} [[Block:[0-9]+]]
 
-; Function Attrs: nounwind
-define spir_kernel void @test_enqueue_empty() #0 {
+; Function Attrs: convergent nounwind
+define spir_kernel void @test_enqueue_empty() #0 !kernel_arg_addr_space !2 !kernel_arg_access_qual !2 !kernel_arg_type !2 !kernel_arg_base_type !2 !kernel_arg_type_qual !2 {
 entry:
-  %agg.tmp = alloca %struct.ndrange_t, align 8
-  %call = call spir_func %opencl.queue_t* @_Z17get_default_queuev()
-  call spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret %agg.tmp, i64 1)
-  %0 = call %opencl.block* @spir_block_bind(i8* bitcast (void (i8*)* @__test_enqueue_empty_block_invoke to i8*), i32 0, i32 0, i8* null)
-  %call1 = call spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvvE(%opencl.queue_t* %call, i32 1, %struct.ndrange_t* byval %agg.tmp, i32 0, %opencl.clk_event_t** null, %opencl.clk_event_t** null, %opencl.block* %0)
+  %tmp = alloca %struct.ndrange_t, align 8
+  %call = call spir_func %opencl.queue_t* @_Z17get_default_queuev() #4
+  call spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret %tmp, i64 1) #4
+  %0 = call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* %call, i32 1, %struct.ndrange_t* %tmp, i32 0, %opencl.clk_event_t* addrspace(4)* null, %opencl.clk_event_t* addrspace(4)* null, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__test_enqueue_empty_block_invoke_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*))
   ret void
-; CHECK-SPIRV: EnqueueKernel {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} [[Invoke:[0-9]+]] [[NullInt8Ptr]] {{[0-9]+}} {{[0-9]+}}
-
+; CHECK-SPIRV: Bitcast [[Int8Ptr]] [[Int8PtrBlock:[0-9]+]] [[Block]]
+; CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGen]] [[Int8PtrGenBlock:[0-9]+]] [[Int8PtrBlock]]
+; CHECK-SPIRV: EnqueueKernel {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} [[Invoke:[0-9]+]] [[Int8PtrGenBlock]] {{[0-9]+}} {{[0-9]+}}
 }
 
-declare spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvvE(%opencl.queue_t*, i32, %struct.ndrange_t* byval, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*) #1
-
+; Function Attrs: convergent
 declare spir_func %opencl.queue_t* @_Z17get_default_queuev() #1
 
+; Function Attrs: convergent
 declare spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret, i64) #1
 
-
-; CHECK-SPIRV: Function [[Void]] [[Invoke]] {{[0-9]+}} {{[0-9]+}}
-; CHECK-SPIRV-NEXT: FunctionParameter [[Int8Ptr]]  {{[0-9]+}}
+; Function Attrs: convergent nounwind
+define internal spir_func void @__test_enqueue_empty_block_invoke(i8 addrspace(4)* %.block_descriptor) #2 {
+entry:
+  %.block_descriptor.addr = alloca i8 addrspace(4)*, align 8
+  store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 8
+  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
+  ret void
+}
 
 ; Function Attrs: nounwind
-define internal spir_func void @__test_enqueue_empty_block_invoke(i8* %.block_descriptor) #0 {
+define internal spir_kernel void @__test_enqueue_empty_block_invoke_kernel(i8 addrspace(4)*) #3 {
 entry:
-  %block = bitcast i8* %.block_descriptor to <{}>*
+  call void @__test_enqueue_empty_block_invoke(i8 addrspace(4)* %0)
   ret void
 }
 
-declare %opencl.block* @spir_block_bind(i8*, i32, i32, i8*)
+declare i32 @__enqueue_kernel_basic_events(%opencl.queue_t*, i32, %struct.ndrange_t*, i32, %opencl.clk_event_t* addrspace(4)*, %opencl.clk_event_t* addrspace(4)*, i8 addrspace(4)*, i8 addrspace(4)*)
+
+; CHECK-SPIRV: Function [[Void]] [[Invoke]] {{[0-9]+}} {{[0-9]+}}
+; CHECK-SPIRV-NEXT: FunctionParameter [[Int8PtrGen]]  {{[0-9]+}}
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+attributes #4 = { convergent }
 
-!opencl.kernels = !{!0}
 !opencl.enable.FP_CONTRACT = !{}
-!opencl.spir.version = !{!6}
-!opencl.ocl.version = !{!7}
-!opencl.used.extensions = !{!8}
-!opencl.used.optional.core.features = !{!8}
-!opencl.compiler.options = !{!8}
+!opencl.ocl.version = !{!0}
+!opencl.spir.version = !{!0}
 
-!0 = !{void ()* @test_enqueue_empty, !1, !2, !3, !4, !5}
-!1 = !{!"kernel_arg_addr_space"}
-!2 = !{!"kernel_arg_access_qual"}
-!3 = !{!"kernel_arg_type"}
-!4 = !{!"kernel_arg_base_type"}
-!5 = !{!"kernel_arg_type_qual"}
-!6 = !{i32 1, i32 2}
-!7 = !{i32 2, i32 0}
-!8 = !{}
+!0 = !{i32 2, i32 0}
+!2 = !{}
diff --git a/test/global_block.ll b/test/global_block.ll
index cf43b18b47..d1ede654a4 100644
--- a/test/global_block.ll
+++ b/test/global_block.ll
@@ -26,62 +26,74 @@ target triple = "spir-unknown-unknown"
 ;; Check that block invoke function has no block descriptor argument in SPIR-V
 ; CHECK-SPIRV-NOT: TypeFunction [[block_invoke_type]] [[int]] {{[0-9]+}} [[int]]
 
-%opencl.block = type opaque
-
-@block_kernel.b1 = internal addrspace(2) constant %opencl.block* bitcast (i32 (i8*, i32)* @_block_invoke to %opencl.block*), align 4
 ;; This variable is not needed in SPIRV
-; CHECK-SPIRV-NOT Variable
+; CHECK-SPIRV-NOT: Name {{[0-9]+}} block_kernel.b1
 ; CHECK-LLVM-NOT: @block_kernel.b1
+@block_kernel.b1 = internal addrspace(2) constant i32 (i32) addrspace(4)* addrspacecast (i32 (i32) addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i32 (i32) addrspace(1)*) to i32 (i32) addrspace(4)*), align 8
+
+@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
 
-; Function Attrs: nounwind
-define spir_kernel void @block_kernel(i32 addrspace(1)* %res) #0 {
+; Function Attrs: convergent nounwind
+define spir_kernel void @block_kernel(i32 addrspace(1)* %res) #0 !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
 entry:
-  %0 = load %opencl.block*, %opencl.block* addrspace(2)* @block_kernel.b1, align 4
-  %1 = call i8* @spir_get_block_invoke(%opencl.block* %0)
-  %2 = call i8* @spir_get_block_context(%opencl.block* %0)
-  %3 = bitcast i8* %1 to i32 (i8*, i32)*
-  %call = call spir_func i32 %3(i8* %2, i32 5)
+  %res.addr = alloca i32 addrspace(1)*, align 8
+  store i32 addrspace(1)* %res, i32 addrspace(1)** %res.addr, align 8, !tbaa !10
+
 ; CHECK-SPIRV: FunctionCall [[int]] {{[0-9]+}} [[block_invoke]] [[five]]
 ; CHECK-LLVM: %call = call spir_func i32 @_block_invoke(i32 5)
-  store i32 %call, i32 addrspace(1)* %res, align 4
+  %call = call spir_func i32 @_block_invoke(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), i32 5) #2
+
+  %0 = load i32 addrspace(1)*, i32 addrspace(1)** %res.addr, align 8, !tbaa !10
+  store i32 %call, i32 addrspace(1)* %0, align 4, !tbaa !14
   ret void
 }
 
-; Function Attrs: nounwind
-define internal spir_func i32 @_block_invoke(i8* %.block_descriptor, i32 %i) #0 {
 ; CHECK-SPIRV: 5 Function [[int]] [[block_invoke]] 0 [[block_invoke_type]]
 ; CHECK-SPIRV-NEXT: 3 FunctionParameter [[int]] {{[0-9]+}}
 ; CHECK-LLVM: define internal spir_func i32 @_block_invoke(i32 %i)
+; Function Attrs: convergent nounwind
+define internal spir_func i32 @_block_invoke(i8 addrspace(4)* %.block_descriptor, i32 %i) #1 {
 entry:
-  %block = bitcast i8* %.block_descriptor to <{}>*
-;; Instruction above is useless and should be removed.
+  %.block_descriptor.addr = alloca i8 addrspace(4)*, align 8
+  %i.addr = alloca i32, align 4
+  store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 8
+
+;; Instruction below is useless and should be removed.
 ; CHECK-SPIRV-NOT: Bitcast
 ; CHECK-LLVM-NOT: bitcast
-  %add = add nsw i32 %i, 1
+  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
+  store i32 %i, i32* %i.addr, align 4, !tbaa !14
+  %0 = load i32, i32* %i.addr, align 4, !tbaa !14
+  %add = add nsw i32 %0, 1
   ret i32 %add
 }
 
-declare i8* @spir_get_block_invoke(%opencl.block*)
-
-declare i8* @spir_get_block_context(%opencl.block*)
+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent }
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!opencl.kernels = !{!0}
+!llvm.module.flags = !{!0}
 !opencl.enable.FP_CONTRACT = !{}
-!opencl.spir.version = !{!6}
-!opencl.ocl.version = !{!7}
-!opencl.used.extensions = !{!8}
-!opencl.used.optional.core.features = !{!8}
-!opencl.compiler.options = !{!8}
-
-!0 = !{void (i32 addrspace(1)*)* @block_kernel, !1, !2, !3, !4, !5}
-!1 = !{!"kernel_arg_addr_space", i32 1}
-!2 = !{!"kernel_arg_access_qual", !"none"}
-!3 = !{!"kernel_arg_type", !"int*"}
-!4 = !{!"kernel_arg_base_type", !"int*"}
-!5 = !{!"kernel_arg_type_qual", !""}
-!6 = !{i32 1, i32 2}
-!7 = !{i32 2, i32 0}
-!8 = !{}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.used.extensions = !{!2}
+!opencl.used.optional.core.features = !{!2}
+!opencl.compiler.options = !{!2}
+!llvm.ident = !{!3}
 
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 2, i32 0}
+!2 = !{}
+!3 = !{!"clang version 7.0.0"}
+!4 = !{i32 1}
+!5 = !{!"none"}
+!6 = !{!"int*"}
+!7 = !{!""}
+!8 = !{i1 false}
+!9 = !{i32 0}
+!10 = !{!11, !11, i64 0}
+!11 = !{!"any pointer", !12, i64 0}
+!12 = !{!"omnipotent char", !13, i64 0}
+!13 = !{!"Simple C/C++ TBAA"}
+!14 = !{!15, !15, i64 0}
+!15 = !{!"int", !12, i64 0}
diff --git a/test/literal-struct.ll b/test/literal-struct.ll
new file mode 100644
index 0000000000..c52170a5e7
--- /dev/null
+++ b/test/literal-struct.ll
@@ -0,0 +1,60 @@
+; This test checks that the translator doesn't crash if the module has literal
+; structs, i.e. structs whose type has no name. Typicaly clang generate such
+; structs if the kernel contains OpenCL 2.0 blocks. The IR was produced with
+; the following command:
+; clang -cc1 -triple spir -cl-std=cl2.0 -O0 -finclude-default-header literal-struct.cl -emit-llvm -o test/literal-struct.ll
+
+; literal-struct.cl:
+; void foo()
+; {
+;   void (^myBlock)(void) = ^{};
+;   myBlock();
+; }
+
+; RUN: llvm-as < %s | llvm-spirv -spirv-text -o %t
+; RUN: FileCheck < %t %s
+
+; CHECK-DAG: TypeInt [[Int:[0-9]+]] 32 0
+; CHECK-DAG: TypeStruct [[StructType:[0-9]+]] [[Int]] [[Int]] {{$}}
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir"
+
+@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
+; CHECK: ConstantComposite [[StructType]]
+
+; This is artificial case is added to cover ConstantNull instrucitions with TypeStruct.
+@__block_literal_global.1 = internal addrspace(1) constant { i32, i32 } zeroinitializer, align 4
+; CHECK: ConstantNull [[StructType]]
+
+; Function Attrs: convergent noinline nounwind optnone
+define spir_func void @foo() #0 {
+entry:
+  %myBlock = alloca void () addrspace(4)*, align 4
+  store void () addrspace(4)* addrspacecast (void () addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to void () addrspace(1)*) to void () addrspace(4)*), void () addrspace(4)** %myBlock, align 4
+  call spir_func void @__foo_block_invoke(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*)) #1
+  ret void
+}
+
+; Function Attrs: convergent noinline nounwind optnone
+define internal spir_func void @__foo_block_invoke(i8 addrspace(4)* %.block_descriptor) #0 {
+entry:
+  %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
+  %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4
+  store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
+  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
+  store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4
+  ret void
+}
+
+attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 2, i32 0}
+!2 = !{!"clang version 8.0.0 "}
diff --git a/test/transcoding/device_execution.ll b/test/transcoding/device_execution.ll
deleted file mode 100644
index 538a1b21fe..0000000000
--- a/test/transcoding/device_execution.ll
+++ /dev/null
@@ -1,112 +0,0 @@
-; ModuleID = 'repro.cl'
-;
-; bash$ cat repro.cl
-; void __kernel device_kernel(__global float * inout) {
-;   *inout = cos(*inout);
-; }
-;
-; void __kernel host_kernel(__global float * inout) {
-;   enqueue_kernel(get_default_queue(),
-;                  CLK_ENQUEUE_FLAGS_WAIT_KERNEL,
-;                  ndrange_1D(1),
-;                  0, NULL, NULL,
-;                  ^{ device_kernel(inout); });
-; }
-; bash$ export PATH_TO_INCLUDE= $PATH_TO_GEN/lib/clang/3.6.1/include
-; bash$ $PATH_TO_GEN/bin/clang -cc1 -x cl -cl-std=CL2.0 -triple spir64-unknonw-unknown -emit-llvm  -include opencl-20.h  repro.cl -o device_execution.ll
-
-;; Regression test for
-;; 1. enqueue_kernel built-in is mangled accordingly to SPIR2.0/C++ ABI (no substitution is done)
-;; 2. the 4rth argument of enqueue_kernel is mangled as "const clk_event_t*"
-
-; RUN: llvm-as %s -o %t.bc
-; RUN: llvm-spirv %t.bc -o %t.spv
-; RUN: llvm-spirv -r %t.spv -o %t.bc
-; RUN: llvm-dis < %t.bc | FileCheck %s
-
-; CHECK-NOT: @_Z14enqueue_kernel9ocl_queue{{22kernel_enqueue_flags_t|i}}9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvvE
-; CHECK-NOT: @spir_block_bind
-; CHECK: call i32 @__enqueue_kernel_basic_events
-declare spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvvE(%opencl.queue_t*, i32, %struct.ndrange_t* byval, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*) #1
-
-target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
-target triple = "spir64-unknonw-unknown"
-
-%struct.ndrange_t = type { i32, [3 x i64], [3 x i64], [3 x i64] }
-%opencl.queue_t = type opaque
-%opencl.block = type opaque
-%opencl.clk_event_t = type opaque
-
-; Function Attrs: nounwind
-define spir_kernel void @device_kernel(float addrspace(1)* nocapture %inout) #0 {
-entry:
-  %0 = load float, float addrspace(1)* %inout, align 4, !tbaa !11
-  %call = tail call spir_func float @_Z3cosf(float %0) #2
-  store float %call, float addrspace(1)* %inout, align 4, !tbaa !11
-  ret void
-}
-
-declare spir_func float @_Z3cosf(float) #1
-
-; Function Attrs: nounwind
-define spir_kernel void @host_kernel(float addrspace(1)* %inout) #0 {
-entry:
-  %captured = alloca <{ float addrspace(1)* }>, align 8
-  %agg.tmp = alloca %struct.ndrange_t, align 8
-  %call = tail call spir_func %opencl.queue_t* @get_default_queue() #2
-  call spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret %agg.tmp, i64 1) #2
-  %block.captured = getelementptr inbounds <{ float addrspace(1)* }>, <{ float addrspace(1)* }>* %captured, i64 0, i32 0
-  store float addrspace(1)* %inout, float addrspace(1)** %block.captured, align 8, !tbaa !15
-  %0 = bitcast <{ float addrspace(1)* }>* %captured to i8*
-  %1 = call %opencl.block* @spir_block_bind(i8* bitcast (void (i8*)* @__host_kernel_block_invoke to i8*), i32 8, i32 8, i8* %0) #2
-  %call1 = call spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvvE(%opencl.queue_t* %call, i32 241, %struct.ndrange_t* byval %agg.tmp, i32 0, %opencl.clk_event_t** null, %opencl.clk_event_t** null, %opencl.block* %1) #2
-  ret void
-}
-
-declare spir_func %opencl.queue_t* @get_default_queue() #1
-
-declare spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret, i64) #1
-
-; Function Attrs: nounwind
-define internal spir_func void @__host_kernel_block_invoke(i8* nocapture readonly %.block_descriptor) #0 {
-entry:
-  %block.capture.addr = bitcast i8* %.block_descriptor to float addrspace(1)**
-  %0 = load float addrspace(1)*, float addrspace(1)** %block.capture.addr, align 8, !tbaa !15
-  %1 = load float, float addrspace(1)* %0, align 4, !tbaa !11
-  %call.i = tail call spir_func float @_Z3cosf(float %1) #2
-  store float %call.i, float addrspace(1)* %0, align 4, !tbaa !11
-  ret void
-}
-
-declare %opencl.block* @spir_block_bind(i8*, i32, i32, i8*)
-
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-
-!opencl.kernels = !{!0, !6}
-!opencl.enable.FP_CONTRACT = !{}
-!opencl.spir.version = !{!7}
-!opencl.ocl.version = !{!8}
-!opencl.used.extensions = !{!9}
-!opencl.used.optional.core.features = !{!9}
-!opencl.compiler.options = !{!9}
-!llvm.ident = !{!10}
-
-!0 = !{void (float addrspace(1)*)* @device_kernel, !1, !2, !3, !4, !5}
-!1 = !{!"kernel_arg_addr_space", i32 1}
-!2 = !{!"kernel_arg_access_qual", !"none"}
-!3 = !{!"kernel_arg_type", !"float*"}
-!4 = !{!"kernel_arg_base_type", !"float*"}
-!5 = !{!"kernel_arg_type_qual", !""}
-!6 = !{void (float addrspace(1)*)* @host_kernel, !1, !2, !3, !4, !5}
-!7 = !{i32 1, i32 2}
-!8 = !{i32 2, i32 0}
-!9 = !{}
-!10 = !{!"clang version 3.6.1 (https://github.com/KhronosGroup/SPIR.git 49a8b4a760d227b12116a79b2f7b2e34ef2e6879) (ssh://nnopencl-git-01.inn.intel.com/home/git/repo/opencl_qa-llvm d9b98710f905089caec167209da23af2e4f72bf0)"}
-!11 = !{!12, !12, i64 0}
-!12 = !{!"float", !13, i64 0}
-!13 = !{!"omnipotent char", !14, i64 0}
-!14 = !{!"Simple C/C++ TBAA"}
-!15 = !{!16, !16, i64 0}
-!16 = !{!"any pointer", !13, i64 0}
diff --git a/test/transcoding/device_execution_multiple_blocks.ll b/test/transcoding/device_execution_multiple_blocks.ll
deleted file mode 100644
index 17421013a6..0000000000
--- a/test/transcoding/device_execution_multiple_blocks.ll
+++ /dev/null
@@ -1,153 +0,0 @@
-;; Test what the reader correctly mangles get_kernel_work_group_size,
-;; get_kernel_preferred_work_group_size_multiple, and enqueue_kernel built-ins and
-;; produces spir_block_bind for the both blocks one of which is w\o captured context.
-;; Notice what for the moment spir_block_bind is called as many times as how much
-;; built-ins what using it. This is not against SPIR 2.0 specification so it is done this
-;; way to simplify the reader implementation.
-;;
-;; See below how this LLVM IR has been obtained:
-;; bash$
-;; bash$ cat device_execution_multiple_blocks.cl
-;; void block_fn(int arg, __global int* res)
-;; {
-;;   *res = arg;
-;; }
-;;
-;; __global int glbRes = 0;
-;; void (^kernelBlockNoCtx)(void) = ^{ block_fn(1, &glbRes); };
-;;
-;; kernel void enqueue_block_get_kernel_preferred_work_group_size_multiple(__global int* res)
-;; {
-;;
-;;
-;;     void (^kernelBlock)(void) = ^{ block_fn(2, res); };
-;;     uint globalSize = get_kernel_work_group_size(kernelBlock);
-;;     uint multiple   = get_kernel_preferred_work_group_size_multiple(kernelBlock);
-;;     uint localSize  = globalSize / multiple;
-;;
-;;     queue_t q1 = get_default_queue();
-;;     ndrange_t ndrange = ndrange_1D(localSize, globalSize);
-;;     enqueue_kernel(q1, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);
-;;     // Enqueue kernel w\o captured context
-;;     enqueue_kernel(q1, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlockNoCtx);
-;; }
-;; bash$
-;; bash$ export PATH_TO_GEN=path_to_spir20_generator_install_dir
-;; bash$ $PATH_TO_GEN/bin/clang -cc1 -x cl -O2 -cl-std=CL2.0 -triple spir64-unknonw-unknown\
-;; -emit-spirv -include $PATH_TO_GEN/lib/clang/3.6.1/include/opencl-20.h\
-;; device_execution_multiple_blocks.cl -o device_execution_multiple_blocks.ll
-
-; RUN: llvm-as %s -o %t.bc
-; RUN: llvm-spirv %t.bc -o %t.spv
-; RUN: llvm-spirv -r %t.spv -o %t.bc
-; RUN: llvm-dis < %t.bc | FileCheck %s
-
-; ModuleID = 'device_execution_multiple_blocks.cl'
-target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
-target triple = "spir64-unknonw-unknown"
-
-%struct.ndrange_t = type { i32, [3 x i64], [3 x i64], [3 x i64] }
-%opencl.block = type opaque
-%opencl.queue_t = type opaque
-
-@glbRes = addrspace(1) global i32 0, align 4
-
-; Function Attrs: nounwind
-define spir_func void @block_fn(i32 %arg, i32 addrspace(1)* nocapture %res) #0 {
-entry:
-  store i32 %arg, i32 addrspace(1)* %res, align 4
-  ret void
-}
-
-; Function Attrs: nounwind
-define internal spir_func void @kernelBlockNoCtx_block_invoke(i8* nocapture readnone %.block_descriptor) #0 {
-entry:
-  store i32 1, i32 addrspace(1)* @glbRes, align 4
-  ret void
-}
-
-; Function Attrs: nounwind
-define spir_kernel void @enqueue_block_get_kernel_preferred_work_group_size_multiple(i32 addrspace(1)* %res) #0 {
-; CHECK: [[CTX:%.*]] = bitcast %0* %captured to i8*
-; CHECK: [[BLOCK:%.*]] = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>, align 8
-; CHECK: store i8* {{.*}} @__enqueue_block_get_kernel_preferred_work_group_size_multiple_block_invoke
-; CHECK: [[CAPTUREDGEP:%.*]] = getelementptr <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>* [[BLOCK]], i32 0, i32 5
-; CHECK: [[CAPTUREDCAST:%.*]] = bitcast [8 x i8]* [[CAPTUREDGEP]] to i8*
-; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[CAPTUREDCAST]], i8* align 8 [[CTX]], i32 8, i1 false)
-; CHECK: [[BLOCKBCST:%.*]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>* [[BLOCK]] to i8*
-; CHECK: [[BLOCKADDRCST:%.*]] = addrspacecast i8* [[BLOCKBCST]] to i8 addrspace(4)*
-; CHECK: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* [[BLOCKADDRCST]])
-; CHECK: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8 addrspace(4)* [[BLOCKADDRCST]])
-; CHECK: call i32 @__enqueue_kernel_basic_events({{.*}} i8 addrspace(4)* [[BLOCKADDRCST]])
-
-; CHECK: [[BLOCK2:%.*]] = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>, align 8
-; CHECK: store i8* {{.*}} @kernelBlockNoCtx_block_invoke
-; CHECK: [[BLOCKBCST2:%.*]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>* [[BLOCK2]] to i8*
-; CHECK: [[BLOCKADDRCST2:%.*]] = addrspacecast i8* [[BLOCKBCST2]] to i8 addrspace(4)*
-; CHECK-NOT: call void @llvm.memcpy
-; CHECK: call i32 @__enqueue_kernel_basic_events({{.*}} i8 addrspace(4)* [[BLOCKADDRCST2]])
-
-entry:
-  %captured = alloca <{ i32 addrspace(1)* }>, align 8
-  %ndrange = alloca %struct.ndrange_t, align 8
-  %block.captured = getelementptr inbounds <{ i32 addrspace(1)* }>, <{ i32 addrspace(1)* }>* %captured, i64 0, i32 0
-  store i32 addrspace(1)* %res, i32 addrspace(1)** %block.captured, align 8
-  %0 = bitcast <{ i32 addrspace(1)* }>* %captured to i8*
-  %1 = call %opencl.block* @spir_block_bind(i8* bitcast (void (i8*)* @__enqueue_block_get_kernel_preferred_work_group_size_multiple_block_invoke to i8*), i32 8, i32 8, i8* %0) #2
-  %call = call spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvvE(%opencl.block* %1) #2
-  %call1 = call spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvvE(%opencl.block* %1) #2
-  %div = udiv i32 %call, %call1
-  %call2 = call spir_func %opencl.queue_t* @get_default_queue() #2
-  %conv = zext i32 %div to i64
-  %conv3 = zext i32 %call to i64
-  call spir_func void @_Z10ndrange_1Dmm(%struct.ndrange_t* sret %ndrange, i64 %conv, i64 %conv3) #2
-  %call4 = call spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tU13block_pointerFvvE(%opencl.queue_t* %call2, i32 241, %struct.ndrange_t* byval %ndrange, %opencl.block* %1) #2
-  %2 = call %opencl.block* @spir_block_bind(i8* bitcast (void (i8*)* @kernelBlockNoCtx_block_invoke to i8*), i32 0, i32 0, i8* null) #2
-  %call5 = call spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tU13block_pointerFvvE(%opencl.queue_t* %call2, i32 241, %struct.ndrange_t* byval %ndrange, %opencl.block* %2) #2
-  ret void
-}
-
-; Function Attrs: nounwind
-define internal spir_func void @__enqueue_block_get_kernel_preferred_work_group_size_multiple_block_invoke(i8* nocapture readonly %.block_descriptor) #0 {
-entry:
-  %block.capture.addr = bitcast i8* %.block_descriptor to i32 addrspace(1)**
-  %0 = load i32 addrspace(1)*, i32 addrspace(1)** %block.capture.addr, align 8
-  store i32 2, i32 addrspace(1)* %0, align 4
-  ret void
-}
-
-declare %opencl.block* @spir_block_bind(i8*, i32, i32, i8*)
-
-declare spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvvE(%opencl.block*) #1
-
-declare spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvvE(%opencl.block*) #1
-
-declare spir_func %opencl.queue_t* @get_default_queue() #1
-
-declare spir_func void @_Z10ndrange_1Dmm(%struct.ndrange_t* sret, i64, i64) #1
-
-declare spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tU13block_pointerFvvE(%opencl.queue_t*, i32, %struct.ndrange_t* byval, %opencl.block*) #1
-
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-
-!opencl.kernels = !{!0}
-!opencl.enable.FP_CONTRACT = !{}
-!opencl.spir.version = !{!6}
-!opencl.ocl.version = !{!7}
-!opencl.used.extensions = !{!8}
-!opencl.used.optional.core.features = !{!8}
-!opencl.compiler.options = !{!8}
-!llvm.ident = !{!9}
-
-!0 = !{void (i32 addrspace(1)*)* @enqueue_block_get_kernel_preferred_work_group_size_multiple, !1, !2, !3, !4, !5}
-!1 = !{!"kernel_arg_addr_space", i32 1}
-!2 = !{!"kernel_arg_access_qual", !"none"}
-!3 = !{!"kernel_arg_type", !"int*"}
-!4 = !{!"kernel_arg_base_type", !"int*"}
-!5 = !{!"kernel_arg_type_qual", !""}
-!6 = !{i32 1, i32 2}
-!7 = !{i32 2, i32 0}
-!8 = !{}
-!9 = !{!"clang version 3.6.1 (https://github.com/KhronosGroup/SPIR.git 49a8b4a760d227b12116a79b2f7b2e34ef2e6879) (ssh://nnopencl-git-01.inn.intel.com/home/git/repo/opencl_qa-llvm d9b98710f905089caec167209da23af2e4f72bf0)"}
diff --git a/test/transcoding/device_execution_overloading.ll b/test/transcoding/device_execution_overloading.ll
deleted file mode 100644
index e5ee309217..0000000000
--- a/test/transcoding/device_execution_overloading.ll
+++ /dev/null
@@ -1,203 +0,0 @@
-;; bash$ cat device_execution_overloading.cl
-;; void device_kernel_with_local_args(__local float* ptr0, __local float* ptr1) {
-;;   *ptr0 = 0;
-;;   *ptr1 = 1;
-;; }
-;;
-;; void device_kernel(__global float* ptr) {
-;;   *ptr = 3;
-;; }
-;;
-;; __kernel void host_kernel(uint size, __global float* ptr) {
-;;   void(^block_with_local)(__local void*, __local void*) = ^(__local void* ptr0, __local void* ptr1){
-;;     device_kernel_with_local_args(ptr0, ptr1);
-;;   };
-;;
-;;   void(^block)(void) = ^{
-;;     device_kernel(ptr);
-;;   };
-;;
-;n;   uint wgSize = get_kernel_work_group_size(block_with_local);
-;;   uint prefMul =  get_kernel_preferred_work_group_size_multiple(block_with_local);
-;;   enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange_1D(1),
-;;                  0, NULL, NULL, block_with_local, size, wgSize * prefMul);
-;;
-;;   wgSize = get_kernel_work_group_size(block);
-;;   prefMul =  get_kernel_preferred_work_group_size_multiple(block);
-;;   enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange_1D(1),
-;;                  0, NULL, NULL, block);
-;; }
-;; bash$
-;;$PATH_TO_GEN/bin/clang -cc1 -x cl -O0 -cl-std=CL2.0 -triple spir64-unknonw-unknown -include $PATH_TO_GEN/lib/clang/3.6.1/include/opencl-20.h -emit-llvm device_execution_overloading.cl -o device_execution_overloading.ll
-
-;; Test overloading of device exectuion built-ins is OK after translation from SPIR-V
-
-; RUN: llvm-as %s -o %t.bc
-; RUN: llvm-spirv %t.bc -o %t.spv
-; RUN: llvm-spirv -r %t.spv -o %t.bc
-; RUN: llvm-dis < %t.bc | FileCheck %s
-
-; ModuleID = 'device_execution_overloading.cl'
-target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
-target triple = "spir64-unknonw-unknown"
-
-%opencl.block = type opaque
-%struct.ndrange_t = type { i32, [3 x i64], [3 x i64], [3 x i64] }
-%opencl.queue_t = type opaque
-%opencl.clk_event_t = type opaque
-
-; Function Attrs: nounwind
-define spir_func void @device_kernel_with_local_args(float addrspace(3)* %ptr0, float addrspace(3)* %ptr1) #0 {
-entry:
-  %ptr0.addr = alloca float addrspace(3)*, align 8
-  %ptr1.addr = alloca float addrspace(3)*, align 8
-  store float addrspace(3)* %ptr0, float addrspace(3)** %ptr0.addr, align 8
-  store float addrspace(3)* %ptr1, float addrspace(3)** %ptr1.addr, align 8
-  %0 = load float addrspace(3)*, float addrspace(3)** %ptr0.addr, align 8
-  store float 0.000000e+00, float addrspace(3)* %0, align 4
-  %1 = load float addrspace(3)*, float addrspace(3)** %ptr1.addr, align 8
-  store float 1.000000e+00, float addrspace(3)* %1, align 4
-  ret void
-}
-
-; Function Attrs: nounwind
-define spir_func void @device_kernel(float addrspace(1)* %ptr) #0 {
-entry:
-  %ptr.addr = alloca float addrspace(1)*, align 8
-  store float addrspace(1)* %ptr, float addrspace(1)** %ptr.addr, align 8
-  %0 = load float addrspace(1)*, float addrspace(1)** %ptr.addr, align 8
-  store float 3.000000e+00, float addrspace(1)* %0, align 4
-  ret void
-}
-
-; CHECK: @__get_kernel_work_group_size_impl
-; CHECK: @__get_kernel_preferred_work_group_multiple_impl
-; CHECK: @__enqueue_kernel_events_vaargs
-
-; CHECK: @__get_kernel_work_group_size_impl
-; CHECK: @__get_kernel_preferred_work_group_multiple_impl
-; CHECK: @__enqueue_kernel_basic_events
-
-; Function Attrs: nounwind
-define spir_kernel void @host_kernel(i32 %size, float addrspace(1)* %ptr) #0 {
-entry:
-  %size.addr = alloca i32, align 4
-  %ptr.addr = alloca float addrspace(1)*, align 8
-  %block_with_local = alloca %opencl.block*, align 8
-  %block = alloca %opencl.block*, align 8
-  %captured = alloca <{ float addrspace(1)* }>, align 8
-  %wgSize = alloca i32, align 4
-  %prefMul = alloca i32, align 4
-  %agg.tmp = alloca %struct.ndrange_t, align 8
-  %agg.tmp8 = alloca %struct.ndrange_t, align 8
-  store i32 %size, i32* %size.addr, align 4
-  store float addrspace(1)* %ptr, float addrspace(1)** %ptr.addr, align 8
-  %0 = call %opencl.block* @spir_block_bind(i8* bitcast (void (i8*, i8 addrspace(3)*, i8 addrspace(3)*)* @__host_kernel_block_invoke to i8*), i32 0, i32 0, i8* null)
-  store %opencl.block* %0, %opencl.block** %block_with_local, align 8
-  %block.captured = getelementptr inbounds <{ float addrspace(1)* }>, <{ float addrspace(1)* }>* %captured, i32 0, i32 0
-  %1 = load float addrspace(1)*, float addrspace(1)** %ptr.addr, align 8
-  store float addrspace(1)* %1, float addrspace(1)** %block.captured, align 8
-  %2 = bitcast <{ float addrspace(1)* }>* %captured to i8*
-  %3 = call %opencl.block* @spir_block_bind(i8* bitcast (void (i8*)* @__host_kernel_block_invoke_2 to i8*), i32 8, i32 8, i8* %2)
-  store %opencl.block* %3, %opencl.block** %block, align 8
-  %4 = load %opencl.block*, %opencl.block** %block_with_local, align 8
-  %call = call spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvPU3AS3vzE(%opencl.block* %4)
-  store i32 %call, i32* %wgSize, align 4
-  %5 = load %opencl.block*, %opencl.block** %block_with_local, align 8
-  %call2 = call spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvPU3AS3vzE(%opencl.block* %5)
-  store i32 %call2, i32* %prefMul, align 4
-  %call3 = call spir_func %opencl.queue_t* @_Z17get_default_queuev()
-  call spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret %agg.tmp, i64 1)
-  %6 = load %opencl.block*, %opencl.block** %block_with_local, align 8
-  %7 = load i32, i32* %size.addr, align 4
-  %8 = load i32, i32* %wgSize, align 4
-  %9 = load i32, i32* %prefMul, align 4
-  %mul = mul i32 %8, %9
-  %call4 = call spir_func i32 (%opencl.queue_t*, i32, %struct.ndrange_t*, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*, i32, ...) @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvPU3AS3vzEjz(%opencl.queue_t* %call3, i32 241, %struct.ndrange_t* byval %agg.tmp, i32 0, %opencl.clk_event_t** null, %opencl.clk_event_t** null, %opencl.block* %6, i32 %7, i32 %mul)
-  %10 = load %opencl.block*, %opencl.block** %block, align 8
-  %call5 = call spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvvE(%opencl.block* %10)
-  store i32 %call5, i32* %wgSize, align 4
-  %11 = load %opencl.block*, %opencl.block** %block, align 8
-  %call6 = call spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvvE(%opencl.block* %11)
-  store i32 %call6, i32* %prefMul, align 4
-  %call7 = call spir_func %opencl.queue_t* @_Z17get_default_queuev()
-  call spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret %agg.tmp8, i64 1)
-  %12 = load %opencl.block*, %opencl.block** %block, align 8
-  %call9 = call spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvvE(%opencl.queue_t* %call7, i32 241, %struct.ndrange_t* byval %agg.tmp8, i32 0, %opencl.clk_event_t** null, %opencl.clk_event_t** null, %opencl.block* %12)
-  ret void
-}
-
-; Function Attrs: nounwind
-define internal spir_func void @__host_kernel_block_invoke(i8* %.block_descriptor, i8 addrspace(3)* %ptr0, i8 addrspace(3)* %ptr1) #0 {
-entry:
-  %.block_descriptor.addr = alloca i8*, align 8
-  %ptr0.addr = alloca i8 addrspace(3)*, align 8
-  %ptr1.addr = alloca i8 addrspace(3)*, align 8
-  %block.addr = alloca <{}>*, align 8
-  store i8* %.block_descriptor, i8** %.block_descriptor.addr, align 8
-  %0 = load i8*, i8** %.block_descriptor.addr
-  store i8 addrspace(3)* %ptr0, i8 addrspace(3)** %ptr0.addr, align 8
-  store i8 addrspace(3)* %ptr1, i8 addrspace(3)** %ptr1.addr, align 8
-  %block = bitcast i8* %.block_descriptor to <{}>*
-  store <{}>* %block, <{}>** %block.addr, align 8
-  %1 = load i8 addrspace(3)*, i8 addrspace(3)** %ptr0.addr, align 8
-  %2 = bitcast i8 addrspace(3)* %1 to float addrspace(3)*
-  %3 = load i8 addrspace(3)*, i8 addrspace(3)** %ptr1.addr, align 8
-  %4 = bitcast i8 addrspace(3)* %3 to float addrspace(3)*
-  call spir_func void @device_kernel_with_local_args(float addrspace(3)* %2, float addrspace(3)* %4)
-  ret void
-}
-
-declare %opencl.block* @spir_block_bind(i8*, i32, i32, i8*)
-
-; Function Attrs: nounwind
-define internal spir_func void @__host_kernel_block_invoke_2(i8* %.block_descriptor) #0 {
-entry:
-  %.block_descriptor.addr = alloca i8*, align 8
-  %block.addr = alloca <{ float addrspace(1)* }>*, align 8
-  store i8* %.block_descriptor, i8** %.block_descriptor.addr, align 8
-  %0 = load i8*, i8** %.block_descriptor.addr
-  %block = bitcast i8* %.block_descriptor to <{ float addrspace(1)* }>*
-  store <{ float addrspace(1)* }>* %block, <{ float addrspace(1)* }>** %block.addr, align 8
-  %block.capture.addr = getelementptr inbounds <{ float addrspace(1)* }>, <{ float addrspace(1)* }>* %block, i32 0, i32 0
-  %1 = load float addrspace(1)*, float addrspace(1)** %block.capture.addr, align 8
-  call spir_func void @device_kernel(float addrspace(1)* %1)
-  ret void
-}
-
-declare spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvPU3AS3vzE(%opencl.block*) #1
-
-declare spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvPU3AS3vzE(%opencl.block*) #1
-
-declare spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvPU3AS3vzEjz(%opencl.queue_t*, i32, %struct.ndrange_t* byval, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*, i32, ...) #1
-
-declare spir_func %opencl.queue_t* @_Z17get_default_queuev() #1
-
-declare spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret, i64) #1
-
-declare spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvvE(%opencl.block*) #1
-
-declare spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvvE(%opencl.block*) #1
-
-declare spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvvE(%opencl.queue_t*, i32, %struct.ndrange_t* byval, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*) #1
-
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!opencl.kernels = !{!0}
-!opencl.enable.FP_CONTRACT = !{}
-!opencl.spir.version = !{!6}
-!opencl.ocl.version = !{!7}
-!opencl.used.extensions = !{!8}
-!opencl.used.optional.core.features = !{!8}
-!opencl.compiler.options = !{!8}
-
-!0 = !{void (i32, float addrspace(1)*)* @host_kernel, !1, !2, !3, !4, !5}
-!1 = !{!"kernel_arg_addr_space", i32 0, i32 1}
-!2 = !{!"kernel_arg_access_qual", !"none", !"none"}
-!3 = !{!"kernel_arg_type", !"uint", !"float*"}
-!4 = !{!"kernel_arg_base_type", !"uint", !"float*"}
-!5 = !{!"kernel_arg_type_qual", !"", !""}
-!6 = !{i32 1, i32 2}
-!7 = !{i32 2, i32 0}
-!8 = !{}
diff --git a/test/transcoding/device_execution_simple_local_memory.ll b/test/transcoding/device_execution_simple_local_memory.ll
deleted file mode 100644
index 51601eb74e..0000000000
--- a/test/transcoding/device_execution_simple_local_memory.ll
+++ /dev/null
@@ -1,137 +0,0 @@
-;; bash$ cat repro.cl
-;; void device_kernel(__local float* ptr0, __local float* ptr1) {
-;;   *ptr0 = 0;
-;;   *ptr1 = 1;
-;; }
-;;
-;; __kernel void host_kernel(uint size) {
-;;   void(^block)(__local void*, __local void*) = ^(__local void* ptr0, __local void* ptr1){
-;;     device_kernel(ptr0, ptr1);
-;;   };
-;;
-;;   uint wgSize = get_kernel_work_group_size(block);
-;;   uint prefMul =  get_kernel_preferred_work_group_size_multiple(block);
-;;   enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange_1D(1),
-;;                  0, NULL, NULL, block, size, wgSize * prefMul);
-;; }
-;; bash$
-;; bash$ export PATH_TO_INCLUDE= $PATH_TO_GEN/lib/clang/3.6.1/include
-;; bash$ $PATH_TO_GEN/bin/clang -cc1 -x cl -cl-std=CL2.0 -triple spir64-unknonw-unknown -emit-llvm  -include opencl-20.h  repro.cl -o device_execution.ll
-
-;; Check that device enqueue BIs wasn't mangled
-
-
-; RUN: llvm-as %s -o %t.bc
-; RUN: llvm-spirv %t.bc -o %t.spv
-; RUN: llvm-spirv -r %t.spv -o %t.bc
-; RUN: llvm-dis < %t.bc | FileCheck %s
-
-; ModuleID = 'repro.cl'
-target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
-target triple = "spir64-unknonw-unknown"
-
-%opencl.block = type opaque
-%struct.ndrange_t = type { i32, [3 x i64], [3 x i64], [3 x i64] }
-%opencl.queue_t = type opaque
-%opencl.clk_event_t = type opaque
-
-; Function Attrs: nounwind
-define spir_func void @device_kernel(float addrspace(3)* %ptr0, float addrspace(3)* %ptr1) #0 {
-entry:
-  %ptr0.addr = alloca float addrspace(3)*, align 8
-  %ptr1.addr = alloca float addrspace(3)*, align 8
-  store float addrspace(3)* %ptr0, float addrspace(3)** %ptr0.addr, align 8
-  store float addrspace(3)* %ptr1, float addrspace(3)** %ptr1.addr, align 8
-  %0 = load float addrspace(3)*, float addrspace(3)** %ptr0.addr, align 8
-  store float 0.000000e+00, float addrspace(3)* %0, align 4
-  %1 = load float addrspace(3)*, float addrspace(3)** %ptr1.addr, align 8
-  store float 1.000000e+00, float addrspace(3)* %1, align 4
-  ret void
-}
-
-; Function Attrs: nounwind
-define spir_kernel void @host_kernel(i32 %size) #0 {
-entry:
-  %size.addr = alloca i32, align 4
-  %block = alloca %opencl.block*, align 8
-  %wgSize = alloca i32, align 4
-  %prefMul = alloca i32, align 4
-  %agg.tmp = alloca %struct.ndrange_t, align 8
-  store i32 %size, i32* %size.addr, align 4
-  %0 = call %opencl.block* @spir_block_bind(i8* bitcast (void (i8*, i8 addrspace(3)*, i8 addrspace(3)*)* @__host_kernel_block_invoke to i8*), i32 0, i32 0, i8* null)
-  store %opencl.block* %0, %opencl.block** %block, align 8
-  %1 = load %opencl.block*, %opencl.block** %block, align 8
-; CHECK: call {{.*}} @__get_kernel_work_group_size_impl
-  %call = call spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvPU3AS3vzE(%opencl.block* %1)
-  store i32 %call, i32* %wgSize, align 4
-  %2 = load %opencl.block*, %opencl.block** %block, align 8
-; CHECK: call {{.*}} @__get_kernel_preferred_work_group_multiple_impl
-  %call1 = call spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvPU3AS3vzE(%opencl.block* %2)
-  store i32 %call1, i32* %prefMul, align 4
-  %call2 = call spir_func %opencl.queue_t* @_Z17get_default_queuev()
-  call spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret %agg.tmp, i64 1)
-  %3 = load %opencl.block*, %opencl.block** %block, align 8
-  %4 = load i32, i32* %size.addr, align 4
-  %5 = load i32, i32* %wgSize, align 4
-  %6 = load i32, i32* %prefMul, align 4
-  %mul = mul i32 %5, %6
-; CHECK: call {{.*}} @__enqueue_kernel_events_vaargs({{.*}}, i32 {{.*}}, i32 {{.*}})
-  %call3 = call spir_func i32 (%opencl.queue_t*, i32, %struct.ndrange_t*, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*, i32, ...) @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvPU3AS3vzEjz(%opencl.queue_t* %call2, i32 241, %struct.ndrange_t* byval %agg.tmp, i32 0, %opencl.clk_event_t** null, %opencl.clk_event_t** null, %opencl.block* %3, i32 %4, i32 %mul)
-  ret void
-}
-
-; Function Attrs: nounwind
-; CHECK-LABEL: define {{.*}} @__host_kernel_block_invoke
-define internal spir_func void @__host_kernel_block_invoke(i8* %.block_descriptor, i8 addrspace(3)* %ptr0, i8 addrspace(3)* %ptr1) #0 {
-entry:
-  %.block_descriptor.addr = alloca i8*, align 8
-  %ptr0.addr = alloca i8 addrspace(3)*, align 8
-  %ptr1.addr = alloca i8 addrspace(3)*, align 8
-  %block.addr = alloca <{}>*, align 8
-  store i8* %.block_descriptor, i8** %.block_descriptor.addr, align 8
-  %0 = load i8*, i8** %.block_descriptor.addr
-  store i8 addrspace(3)* %ptr0, i8 addrspace(3)** %ptr0.addr, align 8
-  store i8 addrspace(3)* %ptr1, i8 addrspace(3)** %ptr1.addr, align 8
-  %block = bitcast i8* %.block_descriptor to <{}>*
-  store <{}>* %block, <{}>** %block.addr, align 8
-  %1 = load i8 addrspace(3)*, i8 addrspace(3)** %ptr0.addr, align 8
-  %2 = bitcast i8 addrspace(3)* %1 to float addrspace(3)*
-  %3 = load i8 addrspace(3)*, i8 addrspace(3)** %ptr1.addr, align 8
-  %4 = bitcast i8 addrspace(3)* %3 to float addrspace(3)*
-  call spir_func void @device_kernel(float addrspace(3)* %2, float addrspace(3)* %4)
-  ret void
-}
-
-declare %opencl.block* @spir_block_bind(i8*, i32, i32, i8*)
-
-declare spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvPU3AS3vzE(%opencl.block*) #1
-
-declare spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvPU3AS3vzE(%opencl.block*) #1
-
-; CHECK: declare {{.*}} @__enqueue_kernel_events_vaargs{{.*}}({{.*}}, i32, ...)
-declare spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvPU3AS3vzEjz(%opencl.queue_t*, i32, %struct.ndrange_t* byval, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*, i32, ...) #1
-
-declare spir_func %opencl.queue_t* @_Z17get_default_queuev() #1
-
-declare spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret, i64) #1
-
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!opencl.kernels = !{!0}
-!opencl.enable.FP_CONTRACT = !{}
-!opencl.spir.version = !{!6}
-!opencl.ocl.version = !{!7}
-!opencl.used.extensions = !{!8}
-!opencl.used.optional.core.features = !{!8}
-!opencl.compiler.options = !{!8}
-
-!0 = !{void (i32)* @host_kernel, !1, !2, !3, !4, !5}
-!1 = !{!"kernel_arg_addr_space", i32 0}
-!2 = !{!"kernel_arg_access_qual", !"none"}
-!3 = !{!"kernel_arg_type", !"uint"}
-!4 = !{!"kernel_arg_base_type", !"uint"}
-!5 = !{!"kernel_arg_type_qual", !""}
-!6 = !{i32 1, i32 2}
-!7 = !{i32 2, i32 0}
-!8 = !{}
diff --git a/test/transcoding/device_execution_vaargs.ll b/test/transcoding/device_execution_vaargs.ll
deleted file mode 100644
index 5f616b4f89..0000000000
--- a/test/transcoding/device_execution_vaargs.ll
+++ /dev/null
@@ -1,231 +0,0 @@
-;; bash$ cat device_execution_overloading.cl
-;; void device_kernel_with_local_args(__local float* ptr0, __local float* ptr1) {
-;;   *ptr0 = 0;
-;;   *ptr1 = 1;
-;; }
-;;
-;; void device_kernel(__global float* ptr) {
-;;   *ptr = 3;
-;; }
-;;
-;; __kernel void host_kernel(uint size, __global float* ptr) {
-;;   void(^block_with_local)(__local void*, __local void*) = ^(__local void* ptr0, __local void* ptr1){
-;;     device_kernel_with_local_args(ptr0, ptr1);
-;;   };
-;;
-;;   void(^block)(void) = ^{
-;;     device_kernel(ptr);
-;;   };
-;;
-;n;   uint wgSize = get_kernel_work_group_size(block_with_local);
-;;   uint prefMul =  get_kernel_preferred_work_group_size_multiple(block_with_local);
-;;   enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange_1D(1),
-;;                  0, NULL, NULL, block_with_local, size, wgSize * prefMul);
-;;
-;;   wgSize = get_kernel_work_group_size(block);
-;;   prefMul =  get_kernel_preferred_work_group_size_multiple(block);
-;;   enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange_1D(1),
-;;                  0, NULL, NULL, block);
-;; }
-;; bash$
-;;$PATH_TO_GEN/bin/clang -cc1 -x cl -O0 -cl-std=CL2.0 -triple spir64-unknonw-unknown -include $PATH_TO_GEN/lib/clang/3.6.1/include/opencl-20.h -emit-llvm device_execution_overloading.cl -o device_execution_overloading.ll
-
-;; Test enqueue_kernel with and wthout vaargs
-
-; RUN: llvm-as %s -o %t.bc
-; RUN: llvm-spirv %t.bc -o %t.spv
-; RUN: llvm-spirv -r %t.spv -o %t.bc
-; RUN: llvm-dis < %t.bc | FileCheck %s
-
-; CHECK: %struct.__block_descriptor = type { i64, i64 }
-
-; CHECK: [[BLOCK:%.*]] = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>, align 8
-; CHECK: [[INVOKEGEP:%.*]] = getelementptr <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>* [[BLOCK]], i32 0, i32 3
-; CHECK: store i8* bitcast (void (i8*, i8 addrspace(3)*, i8 addrspace(3)*)* @__host_kernel_block_invoke to i8*), i8** [[INVOKEGEP]]
-; CHECK: [[BLOCKDESCGEP:%.*]] = getelementptr <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>* [[BLOCK]], i32 0, i32 4
-; CHECK: store %struct.__block_descriptor* @__block_descriptor_spirv, %struct.__block_descriptor** [[BLOCKDESCGEP]]
-; CHECK: [[BLOCKBCAST:%.*]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor* }>* [[BLOCK]] to i8*
-; CHECK: [[BLOCKADDRCAST:%.*]] = addrspacecast i8* [[BLOCKBCAST]] to i8 addrspace(4)*
-; CHECK: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* [[BLOCKADDRCAST]])
-; CHECK: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8 addrspace(4)* [[BLOCKADDRCAST]])
-; CHECK: call i32 {{.*}} @__enqueue_kernel_events_vaargs
-
-; CHECK: [[BLOCK2:%.*]] = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>, align 8
-; CHECK: [[INVOKEGEP2:%.*]] = getelementptr <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>* [[BLOCK2]], i32 0, i32 3
-; CHECK: store i8* bitcast (void (i8*)* @__host_kernel_block_invoke_2 to i8*), i8** [[INVOKEGEP2]]
-; CHECK: [[BLOCKDESCGEP2:%.*]] = getelementptr <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>* [[BLOCK2]], i32 0, i32 4
-; CHECK: store %struct.__block_descriptor* @__block_descriptor_spirv.1, %struct.__block_descriptor** [[BLOCKDESCGEP2]]
-; CHECK: [[CAPTUREDGEP:%.*]] = getelementptr <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>* [[BLOCK2]], i32 0, i32 5
-; CHECK: [[CAPTUREDCAST:%.*]] = bitcast [8 x i8]* [[CAPTUREDGEP]] to i8*
-; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[CAPTUREDCAST]], i8* align 8 %0, i32 8, i1 false)
-; CHECK: [[BLOCKBCAST2:%.*]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>* [[BLOCK2]] to i8*
-; CHECK: [[BLOCKADDRCAST2:%.*]] = addrspacecast i8* [[BLOCKBCAST2]] to i8 addrspace(4)*
-; CHECK: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* [[BLOCKADDRCAST2]])
-; CHECK: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8 addrspace(4)* [[BLOCKADDRCAST2]]) #0
-; CHECK: call i32 @__enqueue_kernel_basic_events
-
-; CHECK: define internal spir_func void @__host_kernel_block_invoke(
-; CHECK-NOT: bitcast i8* %.block_descriptor to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [0 x i8] }>*
-
-; CHECK: define internal spir_func void @__host_kernel_block_invoke_2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INV_BLOCK:%.*]] = bitcast i8* %.block_descriptor to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>*
-; CHECK-NEXT: [[INV_CAPTUREDGEP:%.*]] = getelementptr <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [8 x i8] }>* [[INV_BLOCK]], i32 0, i32 5
-; CHECK-NEXT: bitcast [8 x i8]* [[INV_CAPTUREDGEP]] to i8*
-; CHECK-NEXT: br label %invoke
-
-; ModuleID = 'device_execution_overloading.cl'
-target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
-target triple = "spir64-unknonw-unknown"
-
-%opencl.block = type opaque
-%struct.ndrange_t = type { i32, [3 x i64], [3 x i64], [3 x i64] }
-%opencl.queue_t = type opaque
-%opencl.clk_event_t = type opaque
-
-; Function Attrs: nounwind
-define spir_func void @device_kernel_with_local_args(float addrspace(3)* %ptr0, float addrspace(3)* %ptr1) #0 {
-entry:
-  %ptr0.addr = alloca float addrspace(3)*, align 8
-  %ptr1.addr = alloca float addrspace(3)*, align 8
-  store float addrspace(3)* %ptr0, float addrspace(3)** %ptr0.addr, align 8
-  store float addrspace(3)* %ptr1, float addrspace(3)** %ptr1.addr, align 8
-  %0 = load float addrspace(3)*, float addrspace(3)** %ptr0.addr, align 8
-  store float 0.000000e+00, float addrspace(3)* %0, align 4
-  %1 = load float addrspace(3)*, float addrspace(3)** %ptr1.addr, align 8
-  store float 1.000000e+00, float addrspace(3)* %1, align 4
-  ret void
-}
-
-; Function Attrs: nounwind
-define spir_func void @device_kernel(float addrspace(1)* %ptr) #0 {
-entry:
-  %ptr.addr = alloca float addrspace(1)*, align 8
-  store float addrspace(1)* %ptr, float addrspace(1)** %ptr.addr, align 8
-  %0 = load float addrspace(1)*, float addrspace(1)** %ptr.addr, align 8
-  store float 3.000000e+00, float addrspace(1)* %0, align 4
-  ret void
-}
-; Function Attrs: nounwind
-define spir_kernel void @host_kernel(i32 %size, float addrspace(1)* %ptr) #0 {
-entry:
-  %size.addr = alloca i32, align 4
-  %ptr.addr = alloca float addrspace(1)*, align 8
-  %block_with_local = alloca %opencl.block*, align 8
-  %block = alloca %opencl.block*, align 8
-  %captured = alloca <{ float addrspace(1)* }>, align 8
-  %wgSize = alloca i32, align 4
-  %prefMul = alloca i32, align 4
-  %agg.tmp = alloca %struct.ndrange_t, align 8
-  %agg.tmp8 = alloca %struct.ndrange_t, align 8
-  store i32 %size, i32* %size.addr, align 4
-  store float addrspace(1)* %ptr, float addrspace(1)** %ptr.addr, align 8
-  %0 = call %opencl.block* @spir_block_bind(i8* bitcast (void (i8*, i8 addrspace(3)*, i8 addrspace(3)*)* @__host_kernel_block_invoke to i8*), i32 0, i32 0, i8* null)
-  store %opencl.block* %0, %opencl.block** %block_with_local, align 8
-  %block.captured = getelementptr inbounds <{ float addrspace(1)* }>, <{ float addrspace(1)* }>* %captured, i32 0, i32 0
-  %1 = load float addrspace(1)*, float addrspace(1)** %ptr.addr, align 8
-  store float addrspace(1)* %1, float addrspace(1)** %block.captured, align 8
-  %2 = bitcast <{ float addrspace(1)* }>* %captured to i8*
-  %3 = call %opencl.block* @spir_block_bind(i8* bitcast (void (i8*)* @__host_kernel_block_invoke_2 to i8*), i32 8, i32 8, i8* %2)
-  store %opencl.block* %3, %opencl.block** %block, align 8
-  %4 = load %opencl.block*, %opencl.block** %block_with_local, align 8
-  %call = call spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvPU3AS3vzE(%opencl.block* %4)
-  store i32 %call, i32* %wgSize, align 4
-  %5 = load %opencl.block*, %opencl.block** %block_with_local, align 8
-  %call2 = call spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvPU3AS3vzE(%opencl.block* %5)
-  store i32 %call2, i32* %prefMul, align 4
-  %call3 = call spir_func %opencl.queue_t* @_Z17get_default_queuev()
-  call spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret %agg.tmp, i64 1)
-  %6 = load %opencl.block*, %opencl.block** %block_with_local, align 8
-  %7 = load i32, i32* %size.addr, align 4
-  %8 = load i32, i32* %wgSize, align 4
-  %9 = load i32, i32* %prefMul, align 4
-  %mul = mul i32 %8, %9
-  %call4 = call spir_func i32 (%opencl.queue_t*, i32, %struct.ndrange_t*, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*, i32, ...) @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvPU3AS3vzEjz(%opencl.queue_t* %call3, i32 241, %struct.ndrange_t* byval %agg.tmp, i32 0, %opencl.clk_event_t** null, %opencl.clk_event_t** null, %opencl.block* %6, i32 %7, i32 %mul)
-  %10 = load %opencl.block*, %opencl.block** %block, align 8
-  %call5 = call spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvvE(%opencl.block* %10)
-  store i32 %call5, i32* %wgSize, align 4
-  %11 = load %opencl.block*, %opencl.block** %block, align 8
-  %call6 = call spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvvE(%opencl.block* %11)
-  store i32 %call6, i32* %prefMul, align 4
-  %call7 = call spir_func %opencl.queue_t* @_Z17get_default_queuev()
-  call spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret %agg.tmp8, i64 1)
-  %12 = load %opencl.block*, %opencl.block** %block, align 8
-  %call9 = call spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvvE(%opencl.queue_t* %call7, i32 241, %struct.ndrange_t* byval %agg.tmp8, i32 0, %opencl.clk_event_t** null, %opencl.clk_event_t** null, %opencl.block* %12)
-  ret void
-}
-
-; Function Attrs: nounwind
-define internal spir_func void @__host_kernel_block_invoke(i8* %.block_descriptor, i8 addrspace(3)* %ptr0, i8 addrspace(3)* %ptr1) #0 {
-entry:
-  %.block_descriptor.addr = alloca i8*, align 8
-  %ptr0.addr = alloca i8 addrspace(3)*, align 8
-  %ptr1.addr = alloca i8 addrspace(3)*, align 8
-  %block.addr = alloca <{}>*, align 8
-  store i8* %.block_descriptor, i8** %.block_descriptor.addr, align 8
-  %0 = load i8*, i8** %.block_descriptor.addr
-  store i8 addrspace(3)* %ptr0, i8 addrspace(3)** %ptr0.addr, align 8
-  store i8 addrspace(3)* %ptr1, i8 addrspace(3)** %ptr1.addr, align 8
-  %block = bitcast i8* %.block_descriptor to <{}>*
-  store <{}>* %block, <{}>** %block.addr, align 8
-  %1 = load i8 addrspace(3)*, i8 addrspace(3)** %ptr0.addr, align 8
-  %2 = bitcast i8 addrspace(3)* %1 to float addrspace(3)*
-  %3 = load i8 addrspace(3)*, i8 addrspace(3)** %ptr1.addr, align 8
-  %4 = bitcast i8 addrspace(3)* %3 to float addrspace(3)*
-  call spir_func void @device_kernel_with_local_args(float addrspace(3)* %2, float addrspace(3)* %4)
-  ret void
-}
-
-declare %opencl.block* @spir_block_bind(i8*, i32, i32, i8*)
-
-; Function Attrs: nounwind
-define internal spir_func void @__host_kernel_block_invoke_2(i8* %.block_descriptor) #0 {
-entry:
-  %.block_descriptor.addr = alloca i8*, align 8
-  %block.addr = alloca <{ float addrspace(1)* }>*, align 8
-  store i8* %.block_descriptor, i8** %.block_descriptor.addr, align 8
-  %0 = load i8*, i8** %.block_descriptor.addr
-  %block = bitcast i8* %.block_descriptor to <{ float addrspace(1)* }>*
-  store <{ float addrspace(1)* }>* %block, <{ float addrspace(1)* }>** %block.addr, align 8
-  %block.capture.addr = getelementptr inbounds <{ float addrspace(1)* }>, <{ float addrspace(1)* }>* %block, i32 0, i32 0
-  %1 = load float addrspace(1)*, float addrspace(1)** %block.capture.addr, align 8
-  call spir_func void @device_kernel(float addrspace(1)* %1)
-  ret void
-}
-
-declare spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvPU3AS3vzE(%opencl.block*) #1
-
-declare spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvPU3AS3vzE(%opencl.block*) #1
-
-declare spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvPU3AS3vzEjz(%opencl.queue_t*, i32, %struct.ndrange_t* byval, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*, i32, ...) #1
-
-declare spir_func %opencl.queue_t* @_Z17get_default_queuev() #1
-
-declare spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret, i64) #1
-
-declare spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvvE(%opencl.block*) #1
-
-declare spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvvE(%opencl.block*) #1
-
-declare spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvvE(%opencl.queue_t*, i32, %struct.ndrange_t* byval, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*) #1
-
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!opencl.kernels = !{!0}
-!opencl.enable.FP_CONTRACT = !{}
-!opencl.spir.version = !{!6}
-!opencl.ocl.version = !{!7}
-!opencl.used.extensions = !{!8}
-!opencl.used.optional.core.features = !{!8}
-!opencl.compiler.options = !{!8}
-
-!0 = !{void (i32, float addrspace(1)*)* @host_kernel, !1, !2, !3, !4, !5}
-!1 = !{!"kernel_arg_addr_space", i32 0, i32 1}
-!2 = !{!"kernel_arg_access_qual", !"none", !"none"}
-!3 = !{!"kernel_arg_type", !"uint", !"float*"}
-!4 = !{!"kernel_arg_base_type", !"uint", !"float*"}
-!5 = !{!"kernel_arg_type_qual", !"", !""}
-!6 = !{i32 1, i32 2}
-!7 = !{i32 2, i32 0}
-!8 = !{}
diff --git a/test/transcoding/enqueue_kernel.ll b/test/transcoding/enqueue_kernel.ll
new file mode 100644
index 0000000000..0d29c719a8
--- /dev/null
+++ b/test/transcoding/enqueue_kernel.ll
@@ -0,0 +1,355 @@
+; Source
+; typedef struct {int a;} ndrange_t;
+;
+; kernel void device_side_enqueue(global int *a, global int *b, int i, char c0) {
+;   queue_t default_queue;
+;   unsigned flags = 0;
+;   ndrange_t ndrange;
+;   clk_event_t clk_event;
+;   clk_event_t event_wait_list;
+;   clk_event_t event_wait_list2[] = {clk_event};
+;
+;   // Emits block literal on stack and block kernel.
+;   enqueue_kernel(default_queue, flags, ndrange,
+;                  ^(void) {
+;                    a[i] = c0;
+;                  });
+;
+;   // Emits block literal on stack and block kernel.
+;   enqueue_kernel(default_queue, flags, ndrange, 2, &event_wait_list, &clk_event,
+;                  ^(void) {
+;                    a[i] = b[i];
+;                  });
+;
+;   char c;
+;   // Emits global block literal and block kernel.
+;   enqueue_kernel(default_queue, flags, ndrange, 2, event_wait_list2, &clk_event,
+;                  ^(local void *p) {
+;                    return;
+;                  },
+;                  c);
+;
+;   // Emits global block literal and block kernel.
+;   enqueue_kernel(default_queue, flags, ndrange,
+;                  ^(local void *p1, local void *p2, local void *p3) {
+;                    return;
+;                  },
+;                  1, 2, 4);
+; }
+;
+; Compilation command:
+; clang -cc1 -triple spir-unknown-unknown -O0 -cl-std=CL2.0 -emit-llvm enqueue_kernel.cl
+
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc -spirv-text -o %t.spv.txt
+; RUN: FileCheck < %t.spv.txt %s --check-prefix=CHECK-SPIRV
+; RUN: llvm-spirv %t.bc -o %t.spv
+; RUN: llvm-spirv -r %t.spv -o %t.rev.bc
+; RUN: llvm-dis %t.rev.bc
+; RUN: FileCheck < %t.rev.ll %s --check-prefix=CHECK-LLVM
+
+; ModuleID = 'enqueue_kernel.cl'
+source_filename = "enqueue_kernel.cl"
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+%opencl.queue_t = type opaque
+%struct.ndrange_t = type { i32 }
+%opencl.clk_event_t = type opaque
+
+; CHECK-SPIRV: EntryPoint {{[0-9]+}} [[BlockKer1:[0-9]+]] "__device_side_enqueue_block_invoke_kernel"
+; CHECK-SPIRV: EntryPoint {{[0-9]+}} [[BlockKer2:[0-9]+]] "__device_side_enqueue_block_invoke_2_kernel"
+; CHECK-SPIRV: EntryPoint {{[0-9]+}} [[BlockKer3:[0-9]+]] "__device_side_enqueue_block_invoke_3_kernel"
+; CHECK-SPIRV: EntryPoint {{[0-9]+}} [[BlockKer4:[0-9]+]] "__device_side_enqueue_block_invoke_4_kernel"
+; CHECK-SPIRV: Name [[BlockGlb1:[0-9]+]] "__block_literal_global"
+; CHECK-SPIRV: Name [[BlockGlb2:[0-9]+]] "__block_literal_global.1"
+
+; CHECK-SPIRV: TypeInt [[Int32Ty:[0-9]+]] 32
+; CHECK-SPIRV: TypeInt [[Int8Ty:[0-9]+]] 8
+; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt8:[0-9]+]] 8
+; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt0:[0-9]+]] 0
+; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt17:[0-9]+]] 17
+; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt2:[0-9]+]] 2
+; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt20:[0-9]+]] 20
+; CHECK-SPIRV: TypeVoid [[VoidTy:[0-9]+]]
+
+; CHECK-SPIRV: TypePointer {{[0-9]+}} 7 {{[0-9]+}}
+; CHECK-SPIRV: TypePointer [[Int32LocPtrTy:[0-9]+]] 7 [[Int32Ty]]
+; CHECK-SPIRV: TypeDeviceEvent [[EventTy:[0-9]+]]
+; CHECK-SPIRV: TypePointer [[Int8PtrGenTy:[0-9]+]] 8 [[Int8Ty]]
+; CHECK-SPIRV: TypePointer [[EventPtrTy:[0-9]+]] 8 [[EventTy]]
+; CHECK-SPIRV: TypeFunction [[BlockTy1:[0-9]+]] [[VoidTy]] [[Int8PtrGenTy]]
+; CHECK-SPIRV: TypeFunction [[BlockTy2:[0-9]+]] [[VoidTy]] [[Int8PtrGenTy]]
+; CHECK-SPIRV: TypeFunction [[BlockTy3:[0-9]+]] [[VoidTy]] [[Int8PtrGenTy]]
+; CHECK-SPIRV: ConstantNull [[EventPtrTy]] [[EventNull:[0-9]+]]
+
+; CHECK-LLVM: [[BlockTy1:%[0-9]+]] = type { i32, i32 }
+; CHECK-LLVM: [[BlockTy2:%[0-9]+]] = type <{ i32, i32, i32 addrspace(1)*, i32, i8 }>
+; CHECK-LLVM: [[BlockTy3:%[0-9]+]] = type <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>
+; CHECK-LLVM: [[BlockTy4:%[0-9]+]] = type <{ i32, i32 }>
+
+; CHECK-LLVM: @__block_literal_global = internal addrspace(1) constant [[BlockTy1]] { i32 8, i32 4 }, align 4
+; CHECK-LLVM: @__block_literal_global.1 = internal addrspace(1) constant [[BlockTy1]] { i32 8, i32 4 }, align 4
+
+@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
+@__block_literal_global.1 = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
+
+; Function Attrs: convergent noinline nounwind optnone
+define spir_kernel void @device_side_enqueue(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %i, i8 signext %c0) #0 !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
+entry:
+  %default_queue = alloca %opencl.queue_t*, align 4
+  %flags = alloca i32, align 4
+  %ndrange = alloca %struct.ndrange_t, align 4
+  %clk_event = alloca %opencl.clk_event_t*, align 4
+  %event_wait_list = alloca %opencl.clk_event_t*, align 4
+  %event_wait_list2 = alloca [1 x %opencl.clk_event_t*], align 4
+  %block = alloca <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, align 4
+  %block3 = alloca <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, align 4
+  %c = alloca i8, align 1
+  store i32 0, i32* %flags, align 4
+  %arrayinit.begin = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0
+  %0 = load %opencl.clk_event_t*, %opencl.clk_event_t** %clk_event, align 4
+  store %opencl.clk_event_t* %0, %opencl.clk_event_t** %arrayinit.begin, align 4
+  %1 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4
+  %2 = load i32, i32* %flags, align 4
+  %block.size = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 0
+  store i32 17, i32* %block.size, align 4
+  %block.align = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 1
+  store i32 4, i32* %block.align, align 4
+  %block.captured = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 2
+  store i32 addrspace(1)* %a, i32 addrspace(1)** %block.captured, align 4
+  %block.captured1 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 3
+  store i32 %i, i32* %block.captured1, align 4
+  %block.captured2 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 4
+  store i8 %c0, i8* %block.captured2, align 4
+  %3 = bitcast <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block to void ()*
+  %4 = addrspacecast void ()* %3 to i8 addrspace(4)*
+
+; CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGenTy]] [[BlockLit1:[0-9]+]]
+; CHECK-SPIRV: EnqueueKernel [[Int32Ty]] {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}}
+;                            [[ConstInt0]] [[EventNull]] [[EventNull]]
+;                            [[BlockKer1]] [[BlockLit1]] [[ConstInt17]] [[ConstInt8]]
+
+; CHECK-LLVM: [[Block2:%[0-9]+]] = addrspacecast [[BlockTy2]]* %block to i8 addrspace(4)*
+; CHECK-LLVM: [[BlockInv2:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_kernel to i8 addrspace(4)*
+; CHECK-LLVM: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 0, %opencl.clk_event_t* addrspace(4)* null, %opencl.clk_event_t* addrspace(4)* null, i8 addrspace(4)* [[BlockInv2]], i8 addrspace(4)* [[Block2]])
+
+  %5 = call i32 @__enqueue_kernel_basic(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* byval %ndrange, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* %4)
+  %6 = addrspacecast %opencl.clk_event_t** %event_wait_list to %opencl.clk_event_t* addrspace(4)*
+  %7 = addrspacecast %opencl.clk_event_t** %clk_event to %opencl.clk_event_t* addrspace(4)*
+  %block.size5 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 0
+  store i32 20, i32* %block.size5, align 4
+  %block.align6 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 1
+  store i32 4, i32* %block.align6, align 4
+  %block.captured7 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 2
+  store i32 addrspace(1)* %a, i32 addrspace(1)** %block.captured7, align 4
+  %block.captured8 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 3
+  store i32 %i, i32* %block.captured8, align 4
+  %block.captured9 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 4
+  store i32 addrspace(1)* %b, i32 addrspace(1)** %block.captured9, align 4
+  %8 = bitcast <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3 to void ()*
+  %9 = addrspacecast void ()* %8 to i8 addrspace(4)*
+
+; CHECK-SPIRV: PtrCastToGeneric [[EventPtrTy]] [[Event1:[0-9]+]]
+; CHECK-SPIRV: PtrCastToGeneric [[EventPtrTy]] [[Event2:[0-9]+]]
+
+; CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGenTy]] [[BlockLit2:[0-9]+]]
+; CHECK-SPIRV: EnqueueKernel [[Int32Ty]] {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}}
+;                            [[ConstInt2]] [[Event1]] [[Event2]]
+;                            [[BlockKer2]] [[BlockLit2]] [[ConstInt20]] [[ConstInt8]]
+
+; CHECK-LLVM: [[Block3:%[0-9]+]] = addrspacecast [[BlockTy3]]* %block3 to i8 addrspace(4)*
+; CHECK-LLVM: [[BlockInv3:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2_kernel to i8 addrspace(4)*
+; CHECK-LLVM: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t* addrspace(4)* {{.*}}, %opencl.clk_event_t* addrspace(4)* {{.*}}, i8 addrspace(4)* [[BlockInv3]], i8 addrspace(4)* [[Block3]])
+
+  %10 = call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* %ndrange, i32 2, %opencl.clk_event_t* addrspace(4)* %6, %opencl.clk_event_t* addrspace(4)* %7, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* %9)
+  %11 = alloca [1 x i32]
+  %12 = getelementptr [1 x i32], [1 x i32]* %11, i32 0, i32 0
+  %13 = load i8, i8* %c, align 1
+  %14 = zext i8 %13 to i32
+  store i32 %14, i32* %12, align 4
+
+; CHECK-SPIRV: PtrAccessChain [[Int32LocPtrTy]] [[LocalBuf31:[0-9]+]]
+; CHECK-SPIRV: Bitcast {{[0-9]+}} [[BlockLit3Tmp:[0-9]+]] [[BlockGlb1:[0-9]+]]
+; CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGenTy]] [[BlockLit3:[0-9]+]] [[BlockLit3Tmp]]
+; CHECK-SPIRV: EnqueueKernel [[Int32Ty]] {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}}
+;                            [[ConstInt2]] [[Event1]] [[Event2]]
+;                            [[BlockKer3]] [[BlockLit3]] [[ConstInt8]] [[ConstInt8]]
+;                            [[LocalBuf31]]
+
+; CHECK-LLVM: [[Block0Tmp:%[0-9]+]] = bitcast [[BlockTy1]] addrspace(1)* @__block_literal_global to i8 addrspace(1)*
+; CHECK-LLVM: [[Block0:%[0-9]+]] = addrspacecast i8 addrspace(1)* [[Block0Tmp]] to i8 addrspace(4)*
+; CHECK-LLVM: [[BlockInv0:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_3_kernel to i8 addrspace(4)*
+; CHECK-LLVM: call i32 @__enqueue_kernel_events_varargs(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t* addrspace(4)* {{.*}}, %opencl.clk_event_t* addrspace(4)* {{.*}}, i8 addrspace(4)* [[BlockInv0]], i8 addrspace(4)* [[Block0]], i32 1, i32* {{.*}})
+
+  %15 = call i32 @__enqueue_kernel_events_varargs(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* %ndrange, i32 2, %opencl.clk_event_t* addrspace(4)* %6, %opencl.clk_event_t* addrspace(4)* %7, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_3_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %12)
+  %16 = alloca [3 x i32]
+  %17 = getelementptr [3 x i32], [3 x i32]* %16, i32 0, i32 0
+  store i32 1, i32* %17, align 4
+  %18 = getelementptr [3 x i32], [3 x i32]* %16, i32 0, i32 1
+  store i32 2, i32* %18, align 4
+  %19 = getelementptr [3 x i32], [3 x i32]* %16, i32 0, i32 2
+  store i32 4, i32* %19, align 4
+
+; CHECK-SPIRV: PtrAccessChain [[Int32LocPtrTy]] [[LocalBuf41:[0-9]+]]
+; CHECK-SPIRV: PtrAccessChain [[Int32LocPtrTy]] [[LocalBuf42:[0-9]+]]
+; CHECK-SPIRV: PtrAccessChain [[Int32LocPtrTy]] [[LocalBuf43:[0-9]+]]
+; CHECK-SPIRV: Bitcast {{[0-9]+}} [[BlockLit4Tmp:[0-9]+]] [[BlockGlb2:[0-9]+]]
+; CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGenTy]] [[BlockLit4:[0-9]+]] [[BlockLit4Tmp]]
+; CHECK-SPIRV: EnqueueKernel [[Int32Ty]] {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}}
+;                            [[ConstInt0]] [[EventNull]] [[EventNull]]
+;                            [[BlockKer4]] [[BlockLit4]] [[ConstInt8]] [[ConstInt8]]
+;                            [[LocalBuf41]] [[LocalBuf42]] [[LocalBuf43]]
+
+; CHECK-LLVM: [[Block1Tmp:%[0-9]+]] = bitcast [[BlockTy1]] addrspace(1)* @__block_literal_global.1 to i8 addrspace(1)*
+; CHECK-LLVM: [[Block1:%[0-9]+]] = addrspacecast i8 addrspace(1)* [[Block1Tmp]] to i8 addrspace(4)*
+; CHECK-LLVM: [[BlockInv1:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_4_kernel to i8 addrspace(4)*
+; CHECK-LLVM: call i32 @__enqueue_kernel_events_varargs(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 0, %opencl.clk_event_t* addrspace(4)* null, %opencl.clk_event_t* addrspace(4)* null, i8 addrspace(4)* [[BlockInv1]], i8 addrspace(4)* [[Block1]], i32 3, i32* {{.*}})
+
+  %20 = call i32 @__enqueue_kernel_varargs(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* %ndrange, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_4_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global.1 to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i32* %17)
+  ret void
+}
+
+; Function Attrs: convergent noinline nounwind optnone
+define internal spir_func void @__device_side_enqueue_block_invoke(i8 addrspace(4)* %.block_descriptor) #2 {
+entry:
+  %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
+  %block.addr = alloca <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)*, align 4
+  store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
+  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)*
+  store <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)** %block.addr, align 4
+  %block.capture.addr = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 4
+  %0 = load i8, i8 addrspace(4)* %block.capture.addr, align 4
+  %conv = sext i8 %0 to i32
+  %block.capture.addr1 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 2
+  %1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr1, align 4
+  %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 3
+  %2 = load i32, i32 addrspace(4)* %block.capture.addr2, align 4
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 %2
+  store i32 %conv, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define internal spir_kernel void @__device_side_enqueue_block_invoke_kernel(i8 addrspace(4)*) #3 {
+entry:
+  call void @__device_side_enqueue_block_invoke(i8 addrspace(4)* %0)
+  ret void
+}
+
+declare i32 @__enqueue_kernel_basic(%opencl.queue_t*, i32, %struct.ndrange_t*, i8 addrspace(4)*, i8 addrspace(4)*)
+
+; Function Attrs: convergent noinline nounwind optnone
+define internal spir_func void @__device_side_enqueue_block_invoke_2(i8 addrspace(4)* %.block_descriptor) #2 {
+entry:
+  %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
+  %block.addr = alloca <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)*, align 4
+  store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
+  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)*
+  store <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)** %block.addr, align 4
+  %block.capture.addr = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 4
+  %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr, align 4
+  %block.capture.addr1 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 3
+  %1 = load i32, i32 addrspace(4)* %block.capture.addr1, align 4
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 %1
+  %2 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 2
+  %3 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr2, align 4
+  %block.capture.addr3 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 3
+  %4 = load i32, i32 addrspace(4)* %block.capture.addr3, align 4
+  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %3, i32 %4
+  store i32 %2, i32 addrspace(1)* %arrayidx4, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define internal spir_kernel void @__device_side_enqueue_block_invoke_2_kernel(i8 addrspace(4)*) #3 {
+entry:
+  call void @__device_side_enqueue_block_invoke_2(i8 addrspace(4)* %0)
+  ret void
+}
+
+declare i32 @__enqueue_kernel_basic_events(%opencl.queue_t*, i32, %struct.ndrange_t*, i32, %opencl.clk_event_t* addrspace(4)*, %opencl.clk_event_t* addrspace(4)*, i8 addrspace(4)*, i8 addrspace(4)*)
+
+; Function Attrs: convergent noinline nounwind optnone
+define internal spir_func void @__device_side_enqueue_block_invoke_3(i8 addrspace(4)* %.block_descriptor, i8 addrspace(3)* %p) #2 {
+entry:
+  %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
+  %p.addr = alloca i8 addrspace(3)*, align 4
+  %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4
+  store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
+  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
+  store i8 addrspace(3)* %p, i8 addrspace(3)** %p.addr, align 4
+  store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define internal spir_kernel void @__device_side_enqueue_block_invoke_3_kernel(i8 addrspace(4)*, i8 addrspace(3)*) #3 {
+entry:
+  call void @__device_side_enqueue_block_invoke_3(i8 addrspace(4)* %0, i8 addrspace(3)* %1)
+  ret void
+}
+
+declare i32 @__enqueue_kernel_events_varargs(%opencl.queue_t*, i32, %struct.ndrange_t*, i32, %opencl.clk_event_t* addrspace(4)*, %opencl.clk_event_t* addrspace(4)*, i8 addrspace(4)*, i8 addrspace(4)*, i32, i32*)
+
+; Function Attrs: convergent noinline nounwind optnone
+define internal spir_func void @__device_side_enqueue_block_invoke_4(i8 addrspace(4)* %.block_descriptor, i8 addrspace(3)* %p1, i8 addrspace(3)* %p2, i8 addrspace(3)* %p3) #2 {
+entry:
+  %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
+  %p1.addr = alloca i8 addrspace(3)*, align 4
+  %p2.addr = alloca i8 addrspace(3)*, align 4
+  %p3.addr = alloca i8 addrspace(3)*, align 4
+  %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4
+  store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
+  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
+  store i8 addrspace(3)* %p1, i8 addrspace(3)** %p1.addr, align 4
+  store i8 addrspace(3)* %p2, i8 addrspace(3)** %p2.addr, align 4
+  store i8 addrspace(3)* %p3, i8 addrspace(3)** %p3.addr, align 4
+  store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define internal spir_kernel void @__device_side_enqueue_block_invoke_4_kernel(i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*) #3 {
+entry:
+  call void @__device_side_enqueue_block_invoke_4(i8 addrspace(4)* %0, i8 addrspace(3)* %1, i8 addrspace(3)* %2, i8 addrspace(3)* %3)
+  ret void
+}
+
+declare i32 @__enqueue_kernel_varargs(%opencl.queue_t*, i32, %struct.ndrange_t*, i8 addrspace(4)*, i8 addrspace(4)*, i32, i32*)
+
+; CHECK-SPIRV-DAG: Function [[VoidTy]] [[BlockKer1]] 0 [[BlockTy1]]
+; CHECK-SPIRV-DAG: Function [[VoidTy]] [[BlockKer2]] 0 [[BlockTy1]]
+; CHECK-SPIRV-DAG: Function [[VoidTy]] [[BlockKer3]] 0 [[BlockTy2]]
+; CHECK-SPIRV-DAG: Function [[VoidTy]] [[BlockKer4]] 0 [[BlockTy3]]
+
+; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_kernel(i8 addrspace(4)*)
+; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_2_kernel(i8 addrspace(4)*)
+; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_3_kernel(i8 addrspace(4)*, i8 addrspace(3)*)
+; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_4_kernel(i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)
+
+attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!opencl.enable.FP_CONTRACT = !{}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.used.extensions = !{!2}
+!opencl.used.optional.core.features = !{!2}
+!opencl.compiler.options = !{!2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 2, i32 0}
+!2 = !{}
+!3 = !{!"clang version 7.0.0"}
+!4 = !{i32 1, i32 1, i32 0, i32 0}
+!5 = !{!"none", !"none", !"none", !"none"}
+!6 = !{!"int*", !"int*", !"int", !"char"}
+!7 = !{!"", !"", !"", !""}
+!8 = !{i1 false, i1 false, i1 false, i1 false}
+!9 = !{i32 0, i32 0, i32 0, i32 0}
diff --git a/test/transcoding/kernel_query.ll b/test/transcoding/kernel_query.ll
new file mode 100644
index 0000000000..f8e6cd3ea6
--- /dev/null
+++ b/test/transcoding/kernel_query.ll
@@ -0,0 +1,212 @@
+; Source
+; typedef struct {int a;} ndrange_t;
+;
+; kernel void device_side_enqueue() {
+;   ndrange_t ndrange;
+;
+;   get_kernel_work_group_size(^(){});
+;   get_kernel_preferred_work_group_size_multiple(^(){});
+;
+; #pragma OPENCL EXTENSION cl_khr_subgroups : enable
+;   get_kernel_max_sub_group_size_for_ndrange(ndrange, ^(){});
+;   get_kernel_sub_group_count_for_ndrange(ndrange, ^(){});
+; }
+;
+; Compilation command:
+; clang -cc1 -triple spir-unknown-unknown -O0 -cl-std=CL2.0 -emit-llvm kernel_query.cl
+
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc -spirv-text -o %t.spv.txt
+; RUN: FileCheck < %t.spv.txt %s --check-prefix=CHECK-SPIRV
+; RUN: llvm-spirv %t.bc -o %t.spv
+; RUN: llvm-spirv -r %t.spv -o %t.rev.bc
+; RUN: llvm-dis %t.rev.bc
+; RUN: FileCheck < %t.rev.ll %s --check-prefix=CHECK-LLVM
+
+source_filename = "kernel_query.cl"
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+%struct.ndrange_t = type { i32 }
+
+; CHECK-SPIRV: EntryPoint {{.*}} [[BlockKer1:[0-9]+]] "__device_side_enqueue_block_invoke_kernel"
+; CHECK-SPIRV: EntryPoint {{.*}} [[BlockKer2:[0-9]+]] "__device_side_enqueue_block_invoke_2_kernel"
+; CHECK-SPIRV: EntryPoint {{.*}} [[BlockKer3:[0-9]+]] "__device_side_enqueue_block_invoke_3_kernel"
+; CHECK-SPIRV: EntryPoint {{.*}} [[BlockKer4:[0-9]+]] "__device_side_enqueue_block_invoke_4_kernel"
+; CHECK-SPIRV: Name [[BlockGlb1:[0-9]+]] "__block_literal_global"
+; CHECK-SPIRV: Name [[BlockGlb2:[0-9]+]] "__block_literal_global.1"
+; CHECK-SPIRV: Name [[BlockGlb3:[0-9]+]] "__block_literal_global.2"
+; CHECK-SPIRV: Name [[BlockGlb4:[0-9]+]] "__block_literal_global.3"
+
+; CHECK-LLVM: [[BlockTy:%[0-9]+]] = type { i32, i32 }
+%1 = type <{ i32, i32 }>
+
+; CHECK-LLVM: @__block_literal_global = internal addrspace(1) constant [[BlockTy]] { i32 8, i32 4 }, align 4
+; CHECK-LLVM: @__block_literal_global.1 = internal addrspace(1) constant [[BlockTy]] { i32 8, i32 4 }, align 4
+; CHECK-LLVM: @__block_literal_global.2 = internal addrspace(1) constant [[BlockTy]] { i32 8, i32 4 }, align 4
+; CHECK-LLVM: @__block_literal_global.3 = internal addrspace(1) constant [[BlockTy]] { i32 8, i32 4 }, align 4
+
+@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
+@__block_literal_global.1 = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
+@__block_literal_global.2 = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
+@__block_literal_global.3 = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
+
+; CHECK-SPIRV: TypeInt [[Int32Ty:[0-9]+]] 32
+; CHECK-SPIRV: TypeInt [[Int8Ty:[0-9]+]] 8
+; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt8:[0-9]+]] 8
+; CHECK-SPIRV: TypeVoid [[VoidTy:[0-9]+]]
+; CHECK-SPIRV: TypeStruct [[NDRangeTy:[0-9]+]] [[Int32Ty]] {{$}}
+; CHECK-SPIRV: TypePointer [[NDRangePtrTy:[0-9]+]] 7 [[NDRangeTy]]
+; CHECK-SPIRV: TypePointer [[Int8PtrGenTy:[0-9]+]] 8 [[Int8Ty]]
+; CHECK-SPIRV: TypeFunction [[BlockKerTy:[0-9]+]] [[VoidTy]] [[Int8PtrGenTy]]
+
+; Function Attrs: convergent noinline nounwind optnone
+define spir_kernel void @device_side_enqueue() #0 !kernel_arg_addr_space !2 !kernel_arg_access_qual !2 !kernel_arg_type !2 !kernel_arg_base_type !2 !kernel_arg_type_qual !2 {
+entry:
+
+; CHECK-SPIRV: Variable [[NDRangePtrTy]] [[NDRange:[0-9]+]]
+
+  %ndrange = alloca %struct.ndrange_t, align 4
+
+; CHECK-SPIRV: Bitcast {{[0-9]+}} [[BlockLit1Tmp:[0-9]+]] [[BlockGlb1]]
+; CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGenTy]] [[BlockLit1:[0-9]+]] [[BlockLit1Tmp]]
+; CHECK-SPIRV: GetKernelWorkGroupSize [[Int32Ty]] {{[0-9]+}} [[BlockKer1]] [[BlockLit1]] [[ConstInt8]] [[ConstInt8]]
+
+; CHECK-LLVM: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* {{.*}}, i8 addrspace(4)* {{.*}})
+
+  %0 = call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*))
+
+; CHECK-SPIRV: Bitcast {{[0-9]+}} [[BlockLit2Tmp:[0-9]+]] [[BlockGlb2]]
+; CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGenTy]] [[BlockLit2:[0-9]+]] [[BlockLit2Tmp]]
+; CHECK-SPIRV: GetKernelPreferredWorkGroupSizeMultiple [[Int32Ty]] {{[0-9]+}} [[BlockKer2]] [[BlockLit2]] [[ConstInt8]] [[ConstInt8]]
+
+; CHECK-LLVM: call i32 @__get_kernel_preferred_work_group_size_multiple_impl(i8 addrspace(4)* {{.*}}, i8 addrspace(4)* {{.*}}) #1
+
+  %1 = call i32 @__get_kernel_preferred_work_group_size_multiple_impl(i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global.1 to i8 addrspace(1)*) to i8 addrspace(4)*))
+
+; CHECK-SPIRV: Bitcast {{[0-9]+}} [[BlockLit3Tmp:[0-9]+]] [[BlockGlb3]]
+; CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGenTy]] [[BlockLit3:[0-9]+]] [[BlockLit3Tmp]]
+; CHECK-SPIRV: GetKernelNDrangeMaxSubGroupSize [[Int32Ty]] {{[0-9]+}} [[NDRange]] [[BlockKer3]] [[BlockLit3]] [[ConstInt8]] [[ConstInt8]]
+
+; CHECK-LLVM: call i32 @__get_kernel_max_sub_group_size_for_ndrange_impl(%struct.ndrange_t* {{.*}}, i8 addrspace(4)* {{.*}}, i8 addrspace(4)* {{.*}})
+
+  %2 = call i32 @__get_kernel_max_sub_group_size_for_ndrange_impl(%struct.ndrange_t* %ndrange, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_3_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global.2 to i8 addrspace(1)*) to i8 addrspace(4)*))
+
+; CHECK-SPIRV: Bitcast {{[0-9]+}} [[BlockLit4Tmp:[0-9]+]] [[BlockGlb4]]
+; CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGenTy]] [[BlockLit4:[0-9]+]] [[BlockLit4Tmp]]
+; CHECK-SPIRV: GetKernelNDrangeSubGroupCount [[Int32Ty]] {{[0-9]+}} [[NDRange]] [[BlockKer4]] [[BlockLit4]] [[ConstInt8]] [[ConstInt8]]
+
+; CHECK-LLVM: call i32 @__get_kernel_sub_group_count_for_ndrange_impl(%struct.ndrange_t* {{.*}}, i8 addrspace(4)* {{.*}}, i8 addrspace(4)* {{.*}})
+
+  %3 = call i32 @__get_kernel_sub_group_count_for_ndrange_impl(%struct.ndrange_t* %ndrange, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_4_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global.3 to i8 addrspace(1)*) to i8 addrspace(4)*))
+  ret void
+}
+
+; Function Attrs: convergent noinline nounwind optnone
+define internal spir_func void @__device_side_enqueue_block_invoke(i8 addrspace(4)* %.block_descriptor) #1 {
+entry:
+  %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
+  %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4
+  store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
+  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
+  store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define internal spir_kernel void @__device_side_enqueue_block_invoke_kernel(i8 addrspace(4)*) #2 {
+entry:
+  call void @__device_side_enqueue_block_invoke(i8 addrspace(4)* %0)
+  ret void
+}
+
+declare i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)*, i8 addrspace(4)*)
+
+; Function Attrs: convergent noinline nounwind optnone
+define internal spir_func void @__device_side_enqueue_block_invoke_2(i8 addrspace(4)* %.block_descriptor) #1 {
+entry:
+  %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
+  %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4
+  store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
+  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
+  store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define internal spir_kernel void @__device_side_enqueue_block_invoke_2_kernel(i8 addrspace(4)*) #2 {
+entry:
+  call void @__device_side_enqueue_block_invoke_2(i8 addrspace(4)* %0)
+  ret void
+}
+
+declare i32 @__get_kernel_preferred_work_group_size_multiple_impl(i8 addrspace(4)*, i8 addrspace(4)*)
+
+; Function Attrs: convergent noinline nounwind optnone
+define internal spir_func void @__device_side_enqueue_block_invoke_3(i8 addrspace(4)* %.block_descriptor) #1 {
+entry:
+  %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
+  %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4
+  store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
+  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
+  store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define internal spir_kernel void @__device_side_enqueue_block_invoke_3_kernel(i8 addrspace(4)*) #2 {
+entry:
+  call void @__device_side_enqueue_block_invoke_3(i8 addrspace(4)* %0)
+  ret void
+}
+
+declare i32 @__get_kernel_max_sub_group_size_for_ndrange_impl(%struct.ndrange_t*, i8 addrspace(4)*, i8 addrspace(4)*)
+
+; Function Attrs: convergent noinline nounwind optnone
+define internal spir_func void @__device_side_enqueue_block_invoke_4(i8 addrspace(4)* %.block_descriptor) #1 {
+entry:
+  %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
+  %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4
+  store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
+  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
+  store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define internal spir_kernel void @__device_side_enqueue_block_invoke_4_kernel(i8 addrspace(4)*) #2 {
+entry:
+  call void @__device_side_enqueue_block_invoke_4(i8 addrspace(4)* %0)
+  ret void
+}
+
+declare i32 @__get_kernel_sub_group_count_for_ndrange_impl(%struct.ndrange_t*, i8 addrspace(4)*, i8 addrspace(4)*)
+
+; CHECK-SPIRV-DAG: Function [[VoidTy]] [[BlockKer1]] 0 [[BlockKerTy]]
+; CHECK-SPIRV-DAG: Function [[VoidTy]] [[BlockKer2]] 0 [[BlockKerTy]]
+; CHECK-SPIRV-DAG: Function [[VoidTy]] [[BlockKer3]] 0 [[BlockKerTy]]
+; CHECK-SPIRV-DAG: Function [[VoidTy]] [[BlockKer4]] 0 [[BlockKerTy]]
+
+; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_kernel(i8 addrspace(4)*)
+; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_2_kernel(i8 addrspace(4)*)
+; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_3_kernel(i8 addrspace(4)*)
+; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_4_kernel(i8 addrspace(4)*)
+
+attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+attributes #3 = { argmemonly nounwind }
+
+!llvm.module.flags = !{!0}
+!opencl.enable.FP_CONTRACT = !{}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!opencl.used.extensions = !{!2}
+!opencl.used.optional.core.features = !{!2}
+!opencl.compiler.options = !{!2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 2, i32 0}
+!2 = !{}
+!3 = !{!"clang version 7.0.0"}