diff --git a/buildbot/configure.py b/buildbot/configure.py
index 2134a566229be..fbc1160715d98 100644
--- a/buildbot/configure.py
+++ b/buildbot/configure.py
@@ -18,6 +18,8 @@ def do_configure(args):
     else:
       icd_loader_lib = os.path.join(args.obj_dir, "OpenCL-ICD-Loader", "build", "OpenCL.lib")
 
+    install_dir = os.path.join(args.obj_dir, "install")
+
     cmake_cmd = ["cmake",
                  "-G", "Ninja",
                  "-DCMAKE_BUILD_TYPE={}".format(args.build_type),
@@ -30,6 +32,7 @@ def do_configure(args):
                  "-DLLVM_BUILD_TOOLS=OFF",
                  "-DSYCL_ENABLE_WERROR=ON",
                  "-DLLVM_ENABLE_ASSERTIONS=ON",
+                 "-DCMAKE_INSTALL_PREFIX={}".format(install_dir),
                  llvm_dir]
 
     print(cmake_cmd)
diff --git a/llvm-spirv/CMakeLists.txt b/llvm-spirv/CMakeLists.txt
index 915fd670840a2..9a5cf99631691 100644
--- a/llvm-spirv/CMakeLists.txt
+++ b/llvm-spirv/CMakeLists.txt
@@ -2,6 +2,10 @@ cmake_minimum_required(VERSION 3.3)
 
 set(LLVM_SPIRV_VERSION 0.2.1.0)
 
+option(LLVM_SPIRV_INCLUDE_TESTS
+  "Generate build targets for the llvm-spirv lit tests."
+  ${LLVM_INCLUDE_TESTS})
+
 # check if we build inside llvm or not
 if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
   set(BUILD_EXTERNAL YES)
@@ -12,17 +16,17 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
       CXX
   )
 
-  set(CMAKE_CXX_STANDARD 11)
+  set(CMAKE_CXX_STANDARD 14)
   set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
-  if(LLVM_INCLUDE_TESTS)
+  if(LLVM_SPIRV_INCLUDE_TESTS)
     set(LLVM_TEST_COMPONENTS
       llvm-as
       llvm-dis
     )
-  endif(LLVM_INCLUDE_TESTS)
+  endif(LLVM_SPIRV_INCLUDE_TESTS)
 
-  find_package(LLVM 9.0.0 REQUIRED
+  find_package(LLVM 10.0.0 REQUIRED
     COMPONENTS
       Analysis
       BitReader
@@ -56,9 +60,9 @@ set(LLVM_SPIRV_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/include)
 
 add_subdirectory(lib/SPIRV)
 add_subdirectory(tools/llvm-spirv)
-if(LLVM_INCLUDE_TESTS)
+if(LLVM_SPIRV_INCLUDE_TESTS)
   add_subdirectory(test)
-endif(LLVM_INCLUDE_TESTS)
+endif(LLVM_SPIRV_INCLUDE_TESTS)
 
 install(
   FILES
diff --git a/llvm-spirv/README.md b/llvm-spirv/README.md
index 17fb769ec38c2..6fa5526be4698 100644
--- a/llvm-spirv/README.md
+++ b/llvm-spirv/README.md
@@ -26,7 +26,7 @@ The translator can be built with the latest(nightly) package of LLVM. For Ubuntu
 ```
 sudo add-apt-repository "deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial main"
 sudo apt-get update
-sudo apt-get install llvm-9-dev llvm-9-tools clang-9 libclang-9-dev
+sudo apt-get install llvm-10-dev llvm-10-tools clang-10 libclang-10-dev
 ```
 The installed version of LLVM will be used by default for out-of-tree build of the translator.
 ```
@@ -73,9 +73,12 @@ Execute the following command inside the build directory to run translator tests
 make test
 ```
 This requires that the `-DLLVM_INCLUDE_TESTS=ON` and
-`-DLLVM_EXTERNAL_LIT="/usr/lib/llvm-9/build/utils/lit/lit.py"` arguments were
+`-DLLVM_EXTERNAL_LIT="/usr/lib/llvm-10/build/utils/lit/lit.py"` arguments were
 passed to CMake during the build step.
 
+The translator test suite can be disabled by passing
+`-DLLVM_SPIRV_INCLUDE_TESTS=OFF` to cmake.
+
 ## Run Instructions for `llvm-spirv`
 
 
@@ -100,6 +103,11 @@ To translate between LLVM IR and SPIR-V:
     * `-spirv-text` - read/write SPIR-V in an internal textual format for debugging purpose. The textual format is not defined by SPIR-V spec.
     * `-help` - to see full list of options
 
+Translation from LLVM IR to SPIR-V and then back to LLVM IR is not guaranteed to
+produce the original LLVM IR.  In particular, LLVM intrinsic call instructions
+may get replaced by function calls to OpenCL builtins and metadata may be
+dropped.
+
 ### Handling SPIR-V versions generated by the translator
 
 There is one option to control the behavior of the translator with respect to
diff --git a/llvm-spirv/include/LLVMSPIRVOpts.h b/llvm-spirv/include/LLVMSPIRVOpts.h
index 1f5aa39855d91..17c452c98aae4 100644
--- a/llvm-spirv/include/LLVMSPIRVOpts.h
+++ b/llvm-spirv/include/LLVMSPIRVOpts.h
@@ -71,8 +71,9 @@ class TranslatorOpts {
 
   TranslatorOpts() = default;
 
-  TranslatorOpts(VersionNumber Max, const ExtensionsStatusMap &Map = {})
-      : MaxVersion(Max), ExtStatusMap(Map) {}
+  TranslatorOpts(VersionNumber Max, const ExtensionsStatusMap &Map = {},
+                 bool ArgNameMD = false)
+      : MaxVersion(Max), ExtStatusMap(Map), GenKernelArgNameMD(ArgNameMD) {}
 
   bool isAllowedToUseVersion(VersionNumber RequestedVersion) const {
     return RequestedVersion <= MaxVersion;
@@ -88,15 +89,22 @@ class TranslatorOpts {
 
   VersionNumber getMaxVersion() const { return MaxVersion; }
 
+  bool isGenArgNameMDEnabled() const { return GenKernelArgNameMD; }
+
   void enableAllExtensions() {
 #define EXT(X) ExtStatusMap[ExtensionID::X] = true;
 #include "LLVMSPIRVExtensions.inc"
 #undef EXT
   }
 
+  void enableGenArgNameMD() { GenKernelArgNameMD = true; }
+
 private:
+  // Common translation options
   VersionNumber MaxVersion = VersionNumber::MaximumVersion;
   ExtensionsStatusMap ExtStatusMap;
+  // SPIR-V to LLVM translation options
+  bool GenKernelArgNameMD;
 };
 
 } // namespace SPIRV
diff --git a/llvm-spirv/lib/SPIRV/LLVMToSPIRVDbgTran.cpp b/llvm-spirv/lib/SPIRV/LLVMToSPIRVDbgTran.cpp
index 226582471cb9b..a8e8eed69bce3 100644
--- a/llvm-spirv/lib/SPIRV/LLVMToSPIRVDbgTran.cpp
+++ b/llvm-spirv/lib/SPIRV/LLVMToSPIRVDbgTran.cpp
@@ -205,6 +205,15 @@ void LLVMToSPIRVDbgTran::transLocationInfo() {
           LineNo = DL.getLine();
           Col = DL.getCol();
           V = SPIRVWriter->getTranslatedValue(&I);
+          // According to the spec, OpLine for an OpBranch/OpBranchConditional
+          // must precede the merge instruction and not the branch instruction
+          auto *VPrev = static_cast<SPIRVInstruction *>(V)->getPrevious();
+          if (VPrev->getOpCode() == OpLoopMerge ||
+              VPrev->getOpCode() == OpLoopControlINTEL) {
+            assert(V->getOpCode() == OpBranch ||
+                   V->getOpCode() == OpBranchConditional);
+            V = VPrev;
+          }
           BM->addLine(V, File ? File->getId() : getDebugInfoNone()->getId(),
                       LineNo, Col);
         }
diff --git a/llvm-spirv/lib/SPIRV/OCLUtil.cpp b/llvm-spirv/lib/SPIRV/OCLUtil.cpp
index ff32960f78bc1..a3a7b109bfa5b 100644
--- a/llvm-spirv/lib/SPIRV/OCLUtil.cpp
+++ b/llvm-spirv/lib/SPIRV/OCLUtil.cpp
@@ -777,7 +777,7 @@ bool isSamplerTy(Type *Ty) {
   return STy && STy->hasName() && STy->getName() == kSPR2TypeName::Sampler;
 }
 
-bool isPipeBI(const StringRef MangledName) {
+bool isPipeOrAddressSpaceCastBI(const StringRef MangledName) {
   return MangledName == "write_pipe_2" || MangledName == "read_pipe_2" ||
          MangledName == "write_pipe_2_bl" || MangledName == "read_pipe_2_bl" ||
          MangledName == "write_pipe_4" || MangledName == "read_pipe_4" ||
@@ -796,7 +796,9 @@ bool isPipeBI(const StringRef MangledName) {
          MangledName == "sub_group_reserve_write_pipe" ||
          MangledName == "sub_group_reserve_read_pipe" ||
          MangledName == "sub_group_commit_write_pipe" ||
-         MangledName == "sub_group_commit_read_pipe";
+         MangledName == "sub_group_commit_read_pipe" ||
+         MangledName == "to_global" || MangledName == "to_local" ||
+         MangledName == "to_private";
 }
 
 bool isEnqueueKernelBI(const StringRef MangledName) {
diff --git a/llvm-spirv/lib/SPIRV/OCLUtil.h b/llvm-spirv/lib/SPIRV/OCLUtil.h
index b246e0f4b3236..09b8db646d12c 100644
--- a/llvm-spirv/lib/SPIRV/OCLUtil.h
+++ b/llvm-spirv/lib/SPIRV/OCLUtil.h
@@ -416,7 +416,7 @@ bool isPipeStorageInitializer(Instruction *Inst);
 /// Check (isSamplerInitializer || isPipeStorageInitializer)
 bool isSpecialTypeInitializer(Instruction *Inst);
 
-bool isPipeBI(const StringRef MangledName);
+bool isPipeOrAddressSpaceCastBI(const StringRef MangledName);
 bool isEnqueueKernelBI(const StringRef MangledName);
 bool isKernelQueryBI(const StringRef MangledName);
 
@@ -751,7 +751,7 @@ template <> inline void SPIRVMap<std::string, Op, SPIRVInstruction>::init() {
 }
 
 template <> inline void SPIRVMap<std::string, Op, OCL12Builtin>::init() {
-#define _SPIRV_OP(x, y) add("atomic_" #x, Op##y);
+#define _SPIRV_OP(x, y) add(#x, Op##y);
   _SPIRV_OP(add, AtomicIAdd)
   _SPIRV_OP(sub, AtomicISub)
   _SPIRV_OP(xchg, AtomicExchange)
diff --git a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp
index 6315f5bac6acc..cb27067e3067b 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp
@@ -51,6 +51,7 @@
 #include "SPIRVValue.h"
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -90,11 +91,6 @@ cl::opt<bool> SPIRVEnableStepExpansion(
     "spirv-expand-step", cl::init(true),
     cl::desc("Enable expansion of OpenCL step and smoothstep function"));
 
-cl::opt<bool> SPIRVGenKernelArgNameMD(
-    "spirv-gen-kernel-arg-name-md", cl::init(false),
-    cl::desc("Enable generating OpenCL kernel argument name "
-             "metadata"));
-
 // Prefix for placeholder global variable name.
 const char *KPlaceholderPrefix = "placeholder.";
 
@@ -391,6 +387,9 @@ Type *SPIRVToLLVM::transType(SPIRVType *T, bool IsClassMember) {
   case OpTypeVector:
     return mapType(T, VectorType::get(transType(T->getVectorComponentType()),
                                       T->getVectorComponentCount()));
+  case OpTypeMatrix:
+    return mapType(T, ArrayType::get(transType(T->getMatrixColumnType()),
+                                     T->getMatrixColumnCount()));
   case OpTypeOpaque:
     return mapType(T, StructType::create(*Context, T->getName()));
   case OpTypeFunction: {
@@ -516,6 +515,9 @@ std::string SPIRVToLLVM::transTypeToOCLTypeName(SPIRVType *T, bool IsSigned) {
   case OpTypeVector:
     return transTypeToOCLTypeName(T->getVectorComponentType()) +
            T->getVectorComponentCount();
+  case OpTypeMatrix:
+    return transTypeToOCLTypeName(T->getMatrixColumnType()) +
+           T->getMatrixColumnCount();
   case OpTypeOpaque:
     return T->getName();
   case OpTypeFunction:
@@ -588,9 +590,12 @@ SPIRVToLLVM::getMetadataFromNameAndParameter(std::string Name,
 }
 
 template <typename LoopInstType>
-void SPIRVToLLVM::setLLVMLoopMetadata(LoopInstType *LM, BranchInst *BI) {
+void SPIRVToLLVM::setLLVMLoopMetadata(const LoopInstType *LM, Instruction *BI) {
   if (!LM)
     return;
+
+  assert(BI && isa<BranchInst>(BI));
+
   auto Temp = MDNode::getTemporary(*Context, None);
   auto Self = MDNode::get(*Context, Temp.get());
   Self->replaceOperandWith(0, Self);
@@ -693,6 +698,40 @@ void SPIRVToLLVM::setLLVMLoopMetadata(LoopInstType *LM, BranchInst *BI) {
   BI->setMetadata("llvm.loop", Node);
 }
 
+void SPIRVToLLVM::transLLVMLoopMetadata(const Function *F) {
+  assert(F);
+
+  if (!FuncLoopMetadataMap.empty()) {
+    // In SPIRV loop metadata is linked to a header basic block of a loop
+    // whilst in LLVM IR it is linked to a latch basic block (the one
+    // whose back edge goes to a header basic block) of the loop.
+
+    using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
+    SmallVector<Edge, 32> Edges;
+    FindFunctionBackedges(*F, Edges);
+
+    for (const auto &BkEdge : Edges) {
+      // Check that loop header BB contains loop metadata.
+      const auto LMDItr = FuncLoopMetadataMap.find(BkEdge.second);
+      if (LMDItr == FuncLoopMetadataMap.end())
+        continue;
+
+      auto *BI = const_cast<Instruction *>(BkEdge.first->getTerminator());
+      const auto *LMD = LMDItr->second;
+      if (LMD->getOpCode() == OpLoopMerge) {
+        const auto *LM = static_cast<const SPIRVLoopMerge *>(LMD);
+        setLLVMLoopMetadata<SPIRVLoopMerge>(LM, BI);
+      } else if (LMD->getOpCode() == OpLoopControlINTEL) {
+        const auto *LCI = static_cast<const SPIRVLoopControlINTEL *>(LMD);
+        setLLVMLoopMetadata<SPIRVLoopControlINTEL>(LCI, BI);
+      }
+    }
+
+    // Loop metadata map should be re-filled during each function translation.
+    FuncLoopMetadataMap.clear();
+  }
+}
+
 void SPIRVToLLVM::insertImageNameAccessQualifier(SPIRV::SPIRVTypeImage *ST,
                                                  std::string &Name) {
   SPIRVAccessQualifierKind Acc = ST->hasAccessQualifier()
@@ -1233,6 +1272,7 @@ Value *SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *BV, Function *F,
     switch (BV->getType()->getOpCode()) {
     case OpTypeVector:
       return mapValue(BV, ConstantVector::get(CV));
+    case OpTypeMatrix:
     case OpTypeArray:
       return mapValue(
           BV, ConstantArray::get(dyn_cast<ArrayType>(transType(BCC->getType())),
@@ -1364,34 +1404,20 @@ Value *SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *BV, Function *F,
   // Translation of instructions
   switch (BV->getOpCode()) {
   case OpBranch: {
-    auto BR = static_cast<SPIRVBranch *>(BV);
-    auto BI = BranchInst::Create(
-        dyn_cast<BasicBlock>(transValue(BR->getTargetLabel(), F, BB)), BB);
-    auto Prev = BR->getPrevious();
-    if (Prev && Prev->getOpCode() == OpLoopMerge) {
-      auto LM = static_cast<SPIRVLoopMerge *>(Prev);
-      setLLVMLoopMetadata<SPIRVLoopMerge>(LM, BI);
-    } else if (Prev && Prev->getOpCode() == OpLoopControlINTEL) {
-      auto LCI = static_cast<SPIRVLoopControlINTEL *>(Prev);
-      setLLVMLoopMetadata<SPIRVLoopControlINTEL>(LCI, BI);
-    }
+    auto *BR = static_cast<SPIRVBranch *>(BV);
+    auto *BI = BranchInst::Create(
+        cast<BasicBlock>(transValue(BR->getTargetLabel(), F, BB)), BB);
+    // Loop metadata will be translated in the end of function translation.
     return mapValue(BV, BI);
   }
 
   case OpBranchConditional: {
-    auto BR = static_cast<SPIRVBranchConditional *>(BV);
-    auto BC = BranchInst::Create(
-        dyn_cast<BasicBlock>(transValue(BR->getTrueLabel(), F, BB)),
-        dyn_cast<BasicBlock>(transValue(BR->getFalseLabel(), F, BB)),
+    auto *BR = static_cast<SPIRVBranchConditional *>(BV);
+    auto *BC = BranchInst::Create(
+        cast<BasicBlock>(transValue(BR->getTrueLabel(), F, BB)),
+        cast<BasicBlock>(transValue(BR->getFalseLabel(), F, BB)),
         transValue(BR->getCondition(), F, BB), BB);
-    auto Prev = BR->getPrevious();
-    if (Prev && Prev->getOpCode() == OpLoopMerge) {
-      auto LM = static_cast<SPIRVLoopMerge *>(Prev);
-      setLLVMLoopMetadata<SPIRVLoopMerge>(LM, BC);
-    } else if (Prev && Prev->getOpCode() == OpLoopControlINTEL) {
-      auto LCI = static_cast<SPIRVLoopControlINTEL *>(Prev);
-      setLLVMLoopMetadata<SPIRVLoopControlINTEL>(LCI, BC);
-    }
+    // Loop metadata will be translated in the end of function translation.
     return mapValue(BV, BC);
   }
 
@@ -1518,8 +1544,11 @@ Value *SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *BV, Function *F,
   case OpVmeImageINTEL:
   case OpLine:
   case OpSelectionMerge:   // OpenCL Compiler does not use this instruction
-  case OpLoopMerge:        // Should be translated at OpBranch or
-  case OpLoopControlINTEL: // OpBranchConditional cases
+    return nullptr;
+
+  case OpLoopMerge:        // Will be translated after all other function's
+  case OpLoopControlINTEL: // instructions are translated.
+    FuncLoopMetadataMap[BB] = BV;
     return nullptr;
 
   case OpSwitch: {
@@ -1557,6 +1586,67 @@ Value *SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *BV, Function *F,
     return mapValue(BV, Scale);
   }
 
+  case OpMatrixTimesScalar: {
+    auto MTS = static_cast<SPIRVMatrixTimesScalar *>(BV);
+    IRBuilder<> Builder(BB);
+    auto Scalar = transValue(MTS->getScalar(), F, BB);
+    auto Matrix = transValue(MTS->getMatrix(), F, BB);
+    uint64_t ColNum = Matrix->getType()->getArrayNumElements();
+    auto ColType = cast<ArrayType>(Matrix->getType())->getElementType();
+    auto VecSize = ColType->getVectorNumElements();
+    auto NewVec = Builder.CreateVectorSplat(VecSize, Scalar, Scalar->getName());
+    NewVec->takeName(Scalar);
+
+    Value *V = UndefValue::get(Matrix->getType());
+    for (uint64_t Idx = 0; Idx != ColNum; Idx++) {
+      auto Col = Builder.CreateExtractValue(Matrix, Idx);
+      auto I = Builder.CreateFMul(Col, NewVec);
+      V = Builder.CreateInsertValue(V, I, Idx);
+    }
+
+    return mapValue(BV, V);
+  }
+
+  case OpMatrixTimesVector: {
+    auto *MTV = static_cast<SPIRVMatrixTimesVector *>(BV);
+    IRBuilder<> Builder(BB);
+    Value *Mat = transValue(MTV->getMatrix(), F, BB);
+    Value *Vec = transValue(MTV->getVector(), F, BB);
+
+    // Result is similar to Matrix * Matrix
+    // Mat is of M columns and N rows.
+    // Mat consists of vectors: V_1, V_2, ..., V_M
+    // where each vector is of size N.
+    //
+    // Vec is of size M.
+    // The product is a vector of size N.
+    //
+    //                |------- N ----------|
+    // Result = sum ( {Vec_1, Vec_1, ..., Vec_1} * V_1,
+    //                {Vec_2, Vec_2, ..., Vec_2} * V_2,
+    //                ...
+    //                {Vec_M, Vec_M, ..., Vec_M} * V_N );
+    //
+    // where sum is defined as vector sum.
+
+    unsigned M = Mat->getType()->getArrayNumElements();
+    VectorType *VTy =
+        cast<VectorType>(cast<ArrayType>(Mat->getType())->getElementType());
+    unsigned N = VTy->getVectorNumElements();
+    auto ETy = VTy->getElementType();
+    Value *V = Builder.CreateVectorSplat(N, ConstantFP::get(ETy, 0.0));
+
+    for (unsigned Idx = 0; Idx != M; ++Idx) {
+      Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(Idx));
+      Value *Lhs = Builder.CreateVectorSplat(N, S);
+      Value *Vx = Builder.CreateExtractValue(Mat, Idx);
+      Value *Mul = Builder.CreateFMul(Lhs, Vx);
+      V = Builder.CreateFAdd(V, Mul);
+    }
+
+    return mapValue(BV, V);
+  }
+
   case OpCopyObject: {
     SPIRVCopyObject *CO = static_cast<SPIRVCopyObject *>(BV);
     AllocaInst *AI =
@@ -1680,9 +1770,9 @@ Value *SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *BV, Function *F,
 
   case OpBitReverse: {
     auto *BR = static_cast<SPIRVUnary *>(BV);
-    IntegerType *Int32Ty = IntegerType::get(*Context, 32);
+    auto Ty = transType(BV->getType());
     Function *intr =
-        Intrinsic::getDeclaration(M, llvm::Intrinsic::bitreverse, Int32Ty);
+        Intrinsic::getDeclaration(M, llvm::Intrinsic::bitreverse, Ty);
     auto *Call = CallInst::Create(intr, transValue(BR->getOperand(0), F, BB),
                                   BR->getName(), BB);
     return mapValue(BV, Call);
@@ -1739,36 +1829,44 @@ Value *SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *BV, Function *F,
   }
 
   case OpFMod: {
-    // translate OpFMod(a, b) to copysign(frem(a, b), b)
+    // translate OpFMod(a, b) to:
+    //   r = frem(a, b)
+    //   c = copysign(r, b)
+    //   needs_fixing = islessgreater(r, c)
+    //   result = needs_fixing ? r + b : c
+    IRBuilder<> Builder(BB);
     SPIRVFMod *FMod = static_cast<SPIRVFMod *>(BV);
     auto Dividend = transValue(FMod->getDividend(), F, BB);
     auto Divisor = transValue(FMod->getDivisor(), F, BB);
-    auto FRem = BinaryOperator::CreateFRem(Dividend, Divisor, "frem.res", BB);
-
-    std::string UnmangledName = OCLExtOpMap::map(OpenCLLIB::Copysign);
-    std::string MangledName = "copysign";
-
-    std::vector<Type *> ArgTypes;
-    ArgTypes.push_back(FRem->getType());
-    ArgTypes.push_back(Divisor->getType());
-    mangleOpenClBuiltin(UnmangledName, ArgTypes, MangledName);
-
-    auto FT = FunctionType::get(transType(BV->getType()), ArgTypes, false);
-    auto Func =
-        Function::Create(FT, GlobalValue::ExternalLinkage, MangledName, M);
-    Func->setCallingConv(CallingConv::SPIR_FUNC);
-    if (isFuncNoUnwind())
-      Func->addFnAttr(Attribute::NoUnwind);
-
-    std::vector<Value *> Args;
-    Args.push_back(FRem);
-    Args.push_back(Divisor);
-
-    auto Call = CallInst::Create(Func, Args, "copysign", BB);
-    setCallingConv(Call);
-    addFnAttr(Call, Attribute::NoUnwind);
-    return mapValue(BV, Call);
+    auto FRem = Builder.CreateFRem(Dividend, Divisor, "frem.res");
+    auto CopySign = Builder.CreateBinaryIntrinsic(
+        llvm::Intrinsic::copysign, FRem, Divisor, nullptr, "copysign.res");
+    auto FAdd = Builder.CreateFAdd(FRem, Divisor, "fadd.res");
+    auto Cmp = Builder.CreateFCmpONE(FRem, CopySign, "cmp.res");
+    auto Select = Builder.CreateSelect(Cmp, FAdd, CopySign);
+    return mapValue(BV, Select);
+  }
+
+  case OpSMod: {
+    // translate OpSMod(a, b) to:
+    //   r = srem(a, b)
+    //   needs_fixing = ((a < 0) != (b < 0) && r != 0)
+    //   result = needs_fixing ? r + b : r
+    IRBuilder<> Builder(BB);
+    SPIRVSMod *SMod = static_cast<SPIRVSMod *>(BV);
+    auto Dividend = transValue(SMod->getDividend(), F, BB);
+    auto Divisor = transValue(SMod->getDivisor(), F, BB);
+    auto SRem = Builder.CreateSRem(Dividend, Divisor, "srem.res");
+    auto Xor = Builder.CreateXor(Dividend, Divisor, "xor.res");
+    auto Zero = ConstantInt::getNullValue(Dividend->getType());
+    auto CmpSign = Builder.CreateICmpSLT(Xor, Zero, "cmpsign.res");
+    auto CmpSRem = Builder.CreateICmpNE(SRem, Zero, "cmpsrem.res");
+    auto Add = Builder.CreateNSWAdd(SRem, Divisor, "add.res");
+    auto Cmp = Builder.CreateAnd(CmpSign, CmpSRem, "cmp.res");
+    auto Select = Builder.CreateSelect(Cmp, Add, SRem);
+    return mapValue(BV, Select);
   }
+
   case OpFNegate: {
     SPIRVUnary *BC = static_cast<SPIRVUnary *>(BV);
     return mapValue(
@@ -1953,6 +2051,9 @@ Function *SPIRVToLLVM::transFunction(SPIRVFunction *BF) {
       transValue(BInst, F, BB, false);
     }
   }
+
+  transLLVMLoopMetadata(F);
+
   return F;
 }
 
@@ -2085,26 +2186,49 @@ Instruction *SPIRVToLLVM::transEnqueueKernelBI(SPIRVInstruction *BI,
   // Find or create enqueue kernel BI declaration
   auto Ops = BI->getOperands();
   bool HasVaargs = Ops.size() > 10;
+  bool HasEvents = true;
+  SPIRVValue *EventRet = Ops[5];
+  if (EventRet->getOpCode() == OpConstantNull) {
+    SPIRVValue *NumEvents = Ops[3];
+    if (NumEvents->getOpCode() == OpConstant) {
+      SPIRVConstant *NE = static_cast<SPIRVConstant *>(NumEvents);
+      HasEvents = NE->getZExtIntValue() != 0;
+    } else if (NumEvents->getOpCode() == OpConstantNull)
+      HasEvents = false;
+  }
+
+  std::string FName = "";
+  if (!HasVaargs && !HasEvents)
+    FName = "__enqueue_kernel_basic";
+  else if (!HasVaargs && HasEvents)
+    FName = "__enqueue_kernel_basic_events";
+  else if (HasVaargs && !HasEvents)
+    FName = "__enqueue_kernel_varargs";
+  else
+    FName = "__enqueue_kernel_events_varargs";
 
-  std::string FName = HasVaargs ? "__enqueue_kernel_events_varargs"
-                                : "__enqueue_kernel_basic_events";
   Function *F = M->getFunction(FName);
   if (!F) {
-    Type *EventTy = PointerType::get(
-        getOrCreateOpaquePtrType(M, SPIR_TYPE_NAME_CLK_EVENT_T,
-                                 getOCLOpaqueTypeAddrSpace(OpTypeDeviceEvent)),
-        SPIRAS_Generic);
-
     SmallVector<Type *, 8> Tys = {
         transType(Ops[0]->getType()), // queue
         Int32Ty,                      // flags
         transType(Ops[2]->getType()), // ndrange
-        Int32Ty,
-        EventTy,
-        EventTy,                                      // events
-        Type::getInt8PtrTy(*Context, SPIRAS_Generic), // block_invoke
-        Type::getInt8PtrTy(*Context, SPIRAS_Generic)  // block_literal
     };
+    if (HasEvents) {
+      Type *EventTy =
+          PointerType::get(getOrCreateOpaquePtrType(
+                               M, SPIR_TYPE_NAME_CLK_EVENT_T,
+                               getOCLOpaqueTypeAddrSpace(OpTypeDeviceEvent)),
+                           SPIRAS_Generic);
+
+      Tys.push_back(Int32Ty);
+      Tys.push_back(EventTy);
+      Tys.push_back(EventTy);
+    }
+
+    Tys.push_back(Type::getInt8PtrTy(*Context, SPIRAS_Generic));
+    Tys.push_back(Type::getInt8PtrTy(*Context, SPIRAS_Generic));
+
     if (HasVaargs) {
       // Number of block invoke arguments (local arguments)
       Tys.push_back(Int32Ty);
@@ -2123,13 +2247,17 @@ Instruction *SPIRVToLLVM::transEnqueueKernelBI(SPIRVInstruction *BI,
       transValue(Ops[0], F, BB, false), // queue
       transValue(Ops[1], F, BB, false), // flags
       transValue(Ops[2], F, BB, false), // ndrange
-      transValue(Ops[3], F, BB, false), // events number
-      transDeviceEvent(Ops[4], F, BB),  // event_wait_list
-      transDeviceEvent(Ops[5], F, BB),  // event_ret
-      transBlockInvoke(Ops[6], BB),     // block_invoke
-      transValue(Ops[7], F, BB, false)  // block_literal
   };
 
+  if (HasEvents) {
+    Args.push_back(transValue(Ops[3], F, BB, false)); // events number
+    Args.push_back(transDeviceEvent(Ops[4], F, BB));  // event_wait_list
+    Args.push_back(transDeviceEvent(Ops[5], F, BB));  // event_ret
+  }
+
+  Args.push_back(transBlockInvoke(Ops[6], BB));     // block_invoke
+  Args.push_back(transValue(Ops[7], F, BB, false)); // block_literal
+
   if (HasVaargs) {
     // Number of local arguments
     Args.push_back(ConstantInt::get(Int32Ty, Ops.size() - 10));
@@ -2380,6 +2508,8 @@ bool SPIRVToLLVM::translate() {
     if (BV->getStorageClass() != StorageClassFunction)
       transValue(BV, nullptr, nullptr);
   }
+  transGlobalAnnotations();
+
   // Compile unit might be needed during translation of debug intrinsics.
   for (SPIRVExtInst *EI : BM->getDebugInstVec()) {
     // Translate Compile Unit first.
@@ -2519,57 +2649,100 @@ void generateIntelFPGAAnnotationForStructMember(
 }
 
 void SPIRVToLLVM::transIntelFPGADecorations(SPIRVValue *BV, Value *V) {
-  if (BV->isVariable()) {
-    if (auto AL = dyn_cast<AllocaInst>(V)) {
-      IRBuilder<> Builder(AL->getParent());
+  if (!BV->isVariable())
+    return;
+
+  if (auto AL = dyn_cast<AllocaInst>(V)) {
+    IRBuilder<> Builder(AL->getParent());
 
-      SPIRVType *ST = BV->getType()->getPointerElementType();
+    SPIRVType *ST = BV->getType()->getPointerElementType();
 
-      Type *Int8PtrTyPrivate = Type::getInt8PtrTy(*Context, SPIRAS_Private);
-      IntegerType *Int32Ty = IntegerType::get(*Context, 32);
+    Type *Int8PtrTyPrivate = Type::getInt8PtrTy(*Context, SPIRAS_Private);
+    IntegerType *Int32Ty = IntegerType::get(*Context, 32);
 
-      Value *UndefInt8Ptr = UndefValue::get(Int8PtrTyPrivate);
-      Value *UndefInt32 = UndefValue::get(Int32Ty);
+    Value *UndefInt8Ptr = UndefValue::get(Int8PtrTyPrivate);
+    Value *UndefInt32 = UndefValue::get(Int32Ty);
 
-      if (ST->isTypeStruct()) {
-        SPIRVTypeStruct *STS = static_cast<SPIRVTypeStruct *>(ST);
+    if (ST->isTypeStruct()) {
+      SPIRVTypeStruct *STS = static_cast<SPIRVTypeStruct *>(ST);
 
-        for (SPIRVWord I = 0; I < STS->getMemberCount(); ++I) {
-          SmallString<256> AnnotStr;
-          generateIntelFPGAAnnotationForStructMember(ST, I, AnnotStr);
-          if (!AnnotStr.empty()) {
-            auto *GS = Builder.CreateGlobalStringPtr(AnnotStr);
+      for (SPIRVWord I = 0; I < STS->getMemberCount(); ++I) {
+        SmallString<256> AnnotStr;
+        generateIntelFPGAAnnotationForStructMember(ST, I, AnnotStr);
+        if (!AnnotStr.empty()) {
+          auto *GS = Builder.CreateGlobalStringPtr(AnnotStr);
 
-            auto AnnotationFn = llvm::Intrinsic::getDeclaration(
-                M, Intrinsic::ptr_annotation, Int8PtrTyPrivate);
+          auto AnnotationFn = llvm::Intrinsic::getDeclaration(
+              M, Intrinsic::ptr_annotation, Int8PtrTyPrivate);
 
-            auto GEP = Builder.CreateConstInBoundsGEP2_32(
-                AL->getAllocatedType(), AL, 0, I);
+          auto GEP = Builder.CreateConstInBoundsGEP2_32(AL->getAllocatedType(),
+                                                        AL, 0, I);
 
-            llvm::Value *Args[] = {
-                Builder.CreateBitCast(GEP, Int8PtrTyPrivate, GEP->getName()),
-                Builder.CreateBitCast(GS, Int8PtrTyPrivate), UndefInt8Ptr,
-                UndefInt32};
-            Builder.CreateCall(AnnotationFn, Args);
-          }
+          llvm::Value *Args[] = {
+              Builder.CreateBitCast(GEP, Int8PtrTyPrivate, GEP->getName()),
+              Builder.CreateBitCast(GS, Int8PtrTyPrivate), UndefInt8Ptr,
+              UndefInt32};
+          Builder.CreateCall(AnnotationFn, Args);
         }
       }
+    }
 
-      SmallString<256> AnnotStr;
-      generateIntelFPGAAnnotation(BV, AnnotStr);
-      if (!AnnotStr.empty()) {
-        auto *GS = Builder.CreateGlobalStringPtr(AnnotStr);
+    SmallString<256> AnnotStr;
+    generateIntelFPGAAnnotation(BV, AnnotStr);
+    if (!AnnotStr.empty()) {
+      auto *GS = Builder.CreateGlobalStringPtr(AnnotStr);
 
-        auto AnnotationFn =
-            llvm::Intrinsic::getDeclaration(M, Intrinsic::var_annotation);
+      auto AnnotationFn =
+          llvm::Intrinsic::getDeclaration(M, Intrinsic::var_annotation);
 
-        llvm::Value *Args[] = {
-            Builder.CreateBitCast(V, Int8PtrTyPrivate, V->getName()),
-            Builder.CreateBitCast(GS, Int8PtrTyPrivate), UndefInt8Ptr,
-            UndefInt32};
-        Builder.CreateCall(AnnotationFn, Args);
-      }
+      llvm::Value *Args[] = {
+          Builder.CreateBitCast(V, Int8PtrTyPrivate, V->getName()),
+          Builder.CreateBitCast(GS, Int8PtrTyPrivate), UndefInt8Ptr,
+          UndefInt32};
+      Builder.CreateCall(AnnotationFn, Args);
     }
+  } else if (auto *GV = dyn_cast<GlobalVariable>(V)) {
+    SmallString<256> AnnotStr;
+    generateIntelFPGAAnnotation(BV, AnnotStr);
+
+    if (AnnotStr.empty())
+      return;
+
+    Constant *StrConstant =
+        ConstantDataArray::getString(*Context, StringRef(AnnotStr));
+
+    auto *GS = new GlobalVariable(*GV->getParent(), StrConstant->getType(),
+                                  /*IsConstant*/ true,
+                                  GlobalValue::PrivateLinkage, StrConstant, "");
+
+    GS->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+    GS->setSection("llvm.metadata");
+
+    Type *ResType = PointerType::getInt8PtrTy(
+        GV->getContext(), GV->getType()->getPointerAddressSpace());
+    Constant *C = ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, ResType);
+
+    Type *Int8PtrTyPrivate = Type::getInt8PtrTy(*Context, SPIRAS_Private);
+    IntegerType *Int32Ty = Type::getInt32Ty(*Context);
+
+    llvm::Constant *Fields[4] = {
+        C, ConstantExpr::getBitCast(GS, Int8PtrTyPrivate),
+        UndefValue::get(Int8PtrTyPrivate), UndefValue::get(Int32Ty)};
+
+    GlobalAnnotations.push_back(ConstantStruct::getAnon(Fields));
+  }
+}
+
+void SPIRVToLLVM::transGlobalAnnotations() {
+  if (!GlobalAnnotations.empty()) {
+    Constant *Array =
+        ConstantArray::get(ArrayType::get(GlobalAnnotations[0]->getType(),
+                                          GlobalAnnotations.size()),
+                           GlobalAnnotations);
+    auto *GV = new GlobalVariable(*M, Array->getType(), /*IsConstant*/ false,
+                                  GlobalValue::AppendingLinkage, Array,
+                                  "llvm.global.annotations");
+    GV->setSection("llvm.metadata");
   }
 }
 
@@ -2734,7 +2907,7 @@ bool SPIRVToLLVM::transKernelMetadata() {
                                    return transOCLKernelArgTypeName(Arg);
                                  });
     // Generate metadata for kernel_arg_name
-    if (SPIRVGenKernelArgNameMD) {
+    if (BM->isGenArgNameMDEnabled()) {
       addOCLKernelArgumentMetadata(Context, SPIR_MD_KERNEL_ARG_NAME, BF, F,
                                    [=](SPIRVFunctionParameter *Arg) {
                                      return MDString::get(*Context,
diff --git a/llvm-spirv/lib/SPIRV/SPIRVReader.h b/llvm-spirv/lib/SPIRV/SPIRVReader.h
index 7e2d1c8eadaa9..34e471e4a9a2a 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVReader.h
+++ b/llvm-spirv/lib/SPIRV/SPIRVReader.h
@@ -187,6 +187,9 @@ class SPIRVToLLVM {
   // which are supposed to be replaced by the real values later.
   typedef std::map<SPIRVValue *, LoadInst *> SPIRVToLLVMPlaceholderMap;
 
+  typedef std::map<const BasicBlock *, const SPIRVValue *>
+      SPIRVToLLVMLoopMetadataMap;
+
 private:
   Module *M;
   BuiltinVarMap BuiltinGVMap;
@@ -198,6 +201,12 @@ class SPIRVToLLVM {
   SPIRVBlockToLLVMStructMap BlockMap;
   SPIRVToLLVMPlaceholderMap PlaceholderMap;
   std::unique_ptr<SPIRVToLLVMDbgTran> DbgTran;
+  std::vector<Constant *> GlobalAnnotations;
+
+  // Loops metadata is translated in the end of a function translation.
+  // This storage contains pairs of translated loop header basic block and loop
+  // metadata SPIR-V instruction in SPIR-V representation of this basic block.
+  SPIRVToLLVMLoopMetadataMap FuncLoopMetadataMap;
 
   Type *mapType(SPIRVType *BT, Type *T);
 
@@ -246,7 +255,8 @@ class SPIRVToLLVM {
   Value *oclTransConstantPipeStorage(SPIRV::SPIRVConstantPipeStorage *BCPS);
   void setName(llvm::Value *V, SPIRVValue *BV);
   template <typename LoopInstType>
-  void setLLVMLoopMetadata(LoopInstType *LM, BranchInst *BI);
+  void setLLVMLoopMetadata(const LoopInstType *LM, Instruction *BI);
+  void transLLVMLoopMetadata(const Function *F);
   inline llvm::Metadata *getMetadataFromName(std::string Name);
   inline std::vector<llvm::Metadata *>
   getMetadataFromNameAndParameter(std::string Name, SPIRVWord Parameter);
@@ -257,6 +267,7 @@ class SPIRVToLLVM {
   Instruction *transOCLAllAny(SPIRVInstruction *BI, BasicBlock *BB);
   Instruction *transOCLRelational(SPIRVInstruction *BI, BasicBlock *BB);
 
+  void transGlobalAnnotations();
   void transIntelFPGADecorations(SPIRVValue *BV, Value *V);
 }; // class SPIRVToLLVM
 
diff --git a/llvm-spirv/lib/SPIRV/SPIRVToOCL.cpp b/llvm-spirv/lib/SPIRV/SPIRVToOCL.cpp
index e3baa7b8de7ae..8bb2a5162c13b 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVToOCL.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVToOCL.cpp
@@ -56,8 +56,6 @@ static cl::opt<std::string>
                        cl::desc("Specify version of OCL builtins to translate "
                                 "to (CL1.2, CL2.0, CL2.1)"));
 
-char SPIRVToOCL::ID = 0;
-
 void SPIRVToOCL::visitCallInst(CallInst &CI) {
   LLVM_DEBUG(dbgs() << "[visistCallInst] " << CI << '\n');
   auto F = CI.getCalledFunction();
diff --git a/llvm-spirv/lib/SPIRV/SPIRVToOCL.h b/llvm-spirv/lib/SPIRV/SPIRVToOCL.h
index adaf7df6513d0..a582de92c8cbb 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVToOCL.h
+++ b/llvm-spirv/lib/SPIRV/SPIRVToOCL.h
@@ -49,7 +49,7 @@
 namespace SPIRV {
 class SPIRVToOCL : public ModulePass, public InstVisitor<SPIRVToOCL> {
 protected:
-  SPIRVToOCL() : ModulePass(ID), M(nullptr), Ctx(nullptr) {}
+  SPIRVToOCL(char &ID) : ModulePass(ID), M(nullptr), Ctx(nullptr) {}
 
 public:
   virtual bool runOnModule(Module &M) = 0;
@@ -133,8 +133,6 @@ class SPIRVToOCL : public ModulePass, public InstVisitor<SPIRVToOCL> {
   /// using separate maps for OpenCL 1.2 and OpenCL 2.0
   virtual Instruction *mutateAtomicName(CallInst *CI, Op OC) = 0;
 
-  static char ID;
-
 protected:
   Module *M;
   LLVMContext *Ctx;
diff --git a/llvm-spirv/lib/SPIRV/SPIRVToOCL12.cpp b/llvm-spirv/lib/SPIRV/SPIRVToOCL12.cpp
index ddfaaea92848a..716599660ed6d 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVToOCL12.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVToOCL12.cpp
@@ -45,7 +45,7 @@ namespace SPIRV {
 
 class SPIRVToOCL12 : public SPIRVToOCL {
 public:
-  SPIRVToOCL12() {
+  SPIRVToOCL12() : SPIRVToOCL(ID) {
     initializeSPIRVToOCL12Pass(*PassRegistry::getPassRegistry());
   }
   bool runOnModule(Module &M) override;
@@ -74,18 +74,18 @@ class SPIRVToOCL12 : public SPIRVToOCL {
   Instruction *visitCallSPIRVAtomicUMinUMax(CallInst *CI, Op OC);
 
   /// Transform __spirv_OpAtomicLoad to atomic_add(*ptr, 0)
-  Instruction *visitCallSPIRVAtomicLoad(CallInst *CI, Op OC);
+  Instruction *visitCallSPIRVAtomicLoad(CallInst *CI);
 
   /// Transform __spirv_OpAtomicStore to atomic_xchg(*ptr, value)
-  Instruction *visitCallSPIRVAtomicStore(CallInst *CI, Op OC);
+  Instruction *visitCallSPIRVAtomicStore(CallInst *CI);
 
   /// Transform __spirv_OpAtomicFlagClear to atomic_xchg(*ptr, 0)
   /// with ignoring the result
-  Instruction *visitCallSPIRVAtomicFlagClear(CallInst *CI, Op OC);
+  Instruction *visitCallSPIRVAtomicFlagClear(CallInst *CI);
 
   /// Transform __spirv_OpAtomicFlagTestAndTest to
   /// (bool)atomic_xchg(*ptr, 1)
-  Instruction *visitCallSPIRVAtomicFlagTestAndSet(CallInst *CI, Op OC);
+  Instruction *visitCallSPIRVAtomicFlagTestAndSet(CallInst *CI);
 
   /// Transform __spirv_OpAtomicCompareExchange and
   /// __spirv_OpAtomicCompareExchangeWeak into atomic_cmpxchg. There is no
@@ -97,15 +97,23 @@ class SPIRVToOCL12 : public SPIRVToOCL {
 
   /// Transform atomic builtin name into correct ocl-dependent name
   Instruction *mutateAtomicName(CallInst *CI, Op OC) override;
+
+  /// Transform SPIR-V atomic instruction opcode into OpenCL 1.2 builtin name.
+  /// Depending on the type, the return name starts with "atomic_" for 32-bit
+  /// types or with "atom_" for 64-bit types, as specified by
+  /// cl_khr_int64_base_atomics and cl_khr_int64_extended_atomics extensions.
+  std::string mapAtomicName(Op OC, Type *Ty);
+
+  static char ID;
 };
 
+char SPIRVToOCL12::ID = 0;
+
 bool SPIRVToOCL12::runOnModule(Module &Module) {
   M = &Module;
   Ctx = &M->getContext();
   visit(*M);
 
-  translateMangledAtomicTypeName();
-
   eraseUselessFunctions(&Module);
 
   LLVM_DEBUG(dbgs() << "After SPIRVToOCL12:\n" << *M);
@@ -186,7 +194,7 @@ Instruction *SPIRVToOCL12::visitCallSPIRVAtomicIncDec(CallInst *CI, Op OC) {
       M, CI,
       [=](CallInst *, std::vector<Value *> &Args) {
         Args.resize(1);
-        return OCLSPIRVBuiltinMap::rmap(OC);
+        return mapAtomicName(OC, CI->getType());
       },
       &Attrs);
 }
@@ -205,7 +213,7 @@ CallInst *SPIRVToOCL12::mutateCommonAtomicArguments(CallInst *CI, Op OC) {
         auto StartIdx = Ptr + 1;
         auto StopIdx = StartIdx + ArgsToRemove;
         Args.erase(Args.begin() + StartIdx, Args.begin() + StopIdx);
-        return OCL12SPIRVBuiltinMap::rmap(OC);
+        return mapAtomicName(OC, CI->getType());
       },
       &Attrs);
 }
@@ -217,13 +225,13 @@ Instruction *SPIRVToOCL12::visitCallSPIRVAtomicUMinUMax(CallInst *CI, Op OC) {
       [=](CallInst *, std::vector<Value *> &Args) {
         std::swap(Args[1], Args[3]);
         Args.resize(2);
-        return OCL12SPIRVBuiltinMap::rmap(OC == OpAtomicUMin ? OpAtomicSMin
-                                                             : OpAtomicSMax);
+        return mapAtomicName(OC == OpAtomicUMin ? OpAtomicSMin : OpAtomicSMax,
+                             CI->getType());
       },
       &Attrs);
 }
 
-Instruction *SPIRVToOCL12::visitCallSPIRVAtomicLoad(CallInst *CI, Op OC) {
+Instruction *SPIRVToOCL12::visitCallSPIRVAtomicLoad(CallInst *CI) {
   AttributeList Attrs = CI->getCalledFunction()->getAttributes();
   return mutateCallInstOCL(
       M, CI,
@@ -233,12 +241,12 @@ Instruction *SPIRVToOCL12::visitCallSPIRVAtomicLoad(CallInst *CI, Op OC) {
         // Emit this builtin via call of atomic_add(*p, 0).
         Type *ptrElemTy = Args[0]->getType()->getPointerElementType();
         Args.push_back(Constant::getNullValue(ptrElemTy));
-        return OCL12SPIRVBuiltinMap::rmap(OpAtomicIAdd);
+        return mapAtomicName(OpAtomicIAdd, ptrElemTy);
       },
       &Attrs);
 }
 
-Instruction *SPIRVToOCL12::visitCallSPIRVAtomicStore(CallInst *CI, Op OC) {
+Instruction *SPIRVToOCL12::visitCallSPIRVAtomicStore(CallInst *CI) {
   AttributeList Attrs = CI->getCalledFunction()->getAttributes();
   return mutateCallInstOCL(
       M, CI,
@@ -248,12 +256,12 @@ Instruction *SPIRVToOCL12::visitCallSPIRVAtomicStore(CallInst *CI, Op OC) {
         // The type of the value pointed to by Pointer (1st argument)
         // must be the same as Result Type.
         RetTy = Args[0]->getType()->getPointerElementType();
-        return OCL12SPIRVBuiltinMap::rmap(OpAtomicExchange);
+        return mapAtomicName(OpAtomicExchange, RetTy);
       },
       [=](CallInst *CI) -> Instruction * { return CI; }, &Attrs);
 }
 
-Instruction *SPIRVToOCL12::visitCallSPIRVAtomicFlagClear(CallInst *CI, Op OC) {
+Instruction *SPIRVToOCL12::visitCallSPIRVAtomicFlagClear(CallInst *CI) {
   AttributeList Attrs = CI->getCalledFunction()->getAttributes();
   return mutateCallInstOCL(
       M, CI,
@@ -261,13 +269,12 @@ Instruction *SPIRVToOCL12::visitCallSPIRVAtomicFlagClear(CallInst *CI, Op OC) {
         Args.resize(1);
         Args.push_back(getInt32(M, 0));
         RetTy = Type::getInt32Ty(M->getContext());
-        return OCL12SPIRVBuiltinMap::rmap(OpAtomicExchange);
+        return mapAtomicName(OpAtomicExchange, RetTy);
       },
       [=](CallInst *CI) -> Instruction * { return CI; }, &Attrs);
 }
 
-Instruction *SPIRVToOCL12::visitCallSPIRVAtomicFlagTestAndSet(CallInst *CI,
-                                                              Op OC) {
+Instruction *SPIRVToOCL12::visitCallSPIRVAtomicFlagTestAndSet(CallInst *CI) {
   AttributeList Attrs = CI->getCalledFunction()->getAttributes();
   return mutateCallInstOCL(
       M, CI,
@@ -275,7 +282,7 @@ Instruction *SPIRVToOCL12::visitCallSPIRVAtomicFlagTestAndSet(CallInst *CI,
         Args.resize(1);
         Args.push_back(getInt32(M, 1));
         RetTy = Type::getInt32Ty(M->getContext());
-        return OCL12SPIRVBuiltinMap::rmap(OpAtomicExchange);
+        return mapAtomicName(OpAtomicExchange, RetTy);
       },
       [=](CallInst *CI) -> Instruction * {
         return BitCastInst::Create(Instruction::Trunc, CI,
@@ -295,7 +302,9 @@ Instruction *SPIRVToOCL12::visitCallSPIRVAtomicCmpExchg(CallInst *CI, Op OC) {
         // has Value and Comparator in different order than ocl functions
         // both of them are translated into atomic_cmpxchg
         std::swap(Args[1], Args[2]);
-        return OCL12SPIRVBuiltinMap::rmap(OpAtomicCompareExchange);
+        // Type of return value, pointee of the pointer operand, other operands,
+        // all match, and should be integer scalar types.
+        return mapAtomicName(OpAtomicCompareExchange, CI->getType());
       },
       &Attrs);
 }
@@ -304,16 +313,16 @@ Instruction *SPIRVToOCL12::visitCallSPIRVAtomicBuiltin(CallInst *CI, Op OC) {
   Instruction *NewCI = nullptr;
   switch (OC) {
   case OpAtomicLoad:
-    NewCI = visitCallSPIRVAtomicLoad(CI, OC);
+    NewCI = visitCallSPIRVAtomicLoad(CI);
     break;
   case OpAtomicStore:
-    NewCI = visitCallSPIRVAtomicStore(CI, OC);
+    NewCI = visitCallSPIRVAtomicStore(CI);
     break;
   case OpAtomicFlagClear:
-    NewCI = visitCallSPIRVAtomicFlagClear(CI, OC);
+    NewCI = visitCallSPIRVAtomicFlagClear(CI);
     break;
   case OpAtomicFlagTestAndSet:
-    NewCI = visitCallSPIRVAtomicFlagTestAndSet(CI, OC);
+    NewCI = visitCallSPIRVAtomicFlagTestAndSet(CI);
     break;
   case OpAtomicUMin:
   case OpAtomicUMax:
@@ -340,6 +349,12 @@ Instruction *SPIRVToOCL12::mutateAtomicName(CallInst *CI, Op OC) {
       &Attrs);
 }
 
+std::string SPIRVToOCL12::mapAtomicName(Op OC, Type *Ty) {
+  std::string Prefix = Ty->isIntegerTy(64) ? kOCLBuiltinName::AtomPrefix
+                                           : kOCLBuiltinName::AtomicPrefix;
+  return Prefix += OCL12SPIRVBuiltinMap::rmap(OC);
+}
+
 } // namespace SPIRV
 
 INITIALIZE_PASS(SPIRVToOCL12, "spvtoocl12",
diff --git a/llvm-spirv/lib/SPIRV/SPIRVToOCL20.cpp b/llvm-spirv/lib/SPIRV/SPIRVToOCL20.cpp
index 93f4319f70f90..833313d469e28 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVToOCL20.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVToOCL20.cpp
@@ -44,7 +44,7 @@ namespace SPIRV {
 
 class SPIRVToOCL20 : public SPIRVToOCL {
 public:
-  SPIRVToOCL20() {
+  SPIRVToOCL20() : SPIRVToOCL(ID) {
     initializeSPIRVToOCL20Pass(*PassRegistry::getPassRegistry());
   }
   bool runOnModule(Module &M) override;
@@ -81,8 +81,12 @@ class SPIRVToOCL20 : public SPIRVToOCL {
   /// Transform __spirv_OpAtomicCompareExchange/Weak into
   /// compare_exchange_strong/weak_explicit
   Instruction *visitCallSPIRVAtomicCmpExchg(CallInst *CI, Op OC) override;
+
+  static char ID;
 };
 
+char SPIRVToOCL20::ID = 0;
+
 bool SPIRVToOCL20::runOnModule(Module &Module) {
   M = &Module;
   Ctx = &M->getContext();
@@ -283,7 +287,8 @@ Instruction *SPIRVToOCL20::visitCallSPIRVAtomicCmpExchg(CallInst *CI, Op OC) {
                                                       ->getParent()
                                                       ->getEntryBlock()
                                                       .getFirstInsertionPt()));
-        PExpected->setAlignment(MaybeAlign(CI->getType()->getScalarSizeInBits() / 8));
+        PExpected->setAlignment(
+            MaybeAlign(CI->getType()->getScalarSizeInBits() / 8));
         new StoreInst(Args[1], PExpected, PInsertBefore);
         unsigned AddrSpc = SPIRAS_Generic;
         Type *PtrTyAS =
diff --git a/llvm-spirv/lib/SPIRV/SPIRVUtil.cpp b/llvm-spirv/lib/SPIRV/SPIRVUtil.cpp
index cdb6c9bff93b4..a5423217f272b 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVUtil.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVUtil.cpp
@@ -461,13 +461,14 @@ bool getSPIRVBuiltin(const std::string &OrigName, spv::BuiltIn &B) {
   return getByName(R.str(), B);
 }
 
-// Enqueue kernel, kernel query and pipe built-ins are not mangled
+// Enqueue kernel, kernel query, pipe and address space cast built-ins
+// are not mangled.
 bool isNonMangledOCLBuiltin(const StringRef &Name) {
   if (!Name.startswith("__"))
     return false;
 
   return isEnqueueKernelBI(Name) || isKernelQueryBI(Name) ||
-         isPipeBI(Name.drop_front(2));
+         isPipeOrAddressSpaceCastBI(Name.drop_front(2));
 }
 
 bool oclIsBuiltin(const StringRef &Name, std::string *DemangledName,
@@ -1391,10 +1392,12 @@ bool eraseUselessFunctions(Module *M) {
 }
 
 // The mangling algorithm follows OpenCL pipe built-ins clang 3.8 CodeGen rules.
-static SPIR::MangleError manglePipeBuiltin(const SPIR::FunctionDescriptor &Fd,
-                                           std::string &MangledName) {
-  assert(OCLUtil::isPipeBI(Fd.Name) &&
-         "Method is expected to be called only for pipe builtins!");
+static SPIR::MangleError
+manglePipeOrAddressSpaceCastBuiltin(const SPIR::FunctionDescriptor &Fd,
+                                    std::string &MangledName) {
+  assert(OCLUtil::isPipeOrAddressSpaceCastBI(Fd.Name) &&
+         "Method is expected to be called only for pipe and address space cast "
+         "builtins!");
   if (Fd.isNull()) {
     MangledName.assign(SPIR::FunctionDescriptor::nullString());
     return SPIR::MANGLE_NULL_FUNC_DESCRIPTOR;
@@ -1443,8 +1446,8 @@ std::string mangleBuiltin(const std::string &UniqName,
   SPIR::NameMangler Mangler(SPIR::SPIR20);
   Mangler.mangle(FD, MangledName);
 #else
-  if (OCLUtil::isPipeBI(BtnInfo->getUnmangledName())) {
-    manglePipeBuiltin(FD, MangledName);
+  if (OCLUtil::isPipeOrAddressSpaceCastBI(BtnInfo->getUnmangledName())) {
+    manglePipeOrAddressSpaceCastBuiltin(FD, MangledName);
   } else {
     SPIR::NameMangler Mangler(SPIR::SPIR20);
     Mangler.mangle(FD, MangledName);
diff --git a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
index d905d0b5fc34d..ba85085e22981 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
+++ b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp
@@ -56,6 +56,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
@@ -663,10 +664,20 @@ SPIRVInstruction *LLVMToSPIRV::transBinaryInst(BinaryOperator *B,
 }
 
 SPIRVInstruction *LLVMToSPIRV::transCmpInst(CmpInst *Cmp, SPIRVBasicBlock *BB) {
-  auto Op0 = transValue(Cmp->getOperand(0), BB);
-  SPIRVInstruction *BI = BM->addCmpInst(
-      transBoolOpCode(Op0, CmpMap::map(Cmp->getPredicate())),
-      transType(Cmp->getType()), Op0, transValue(Cmp->getOperand(1), BB), BB);
+  auto *Op0 = Cmp->getOperand(0);
+  SPIRVValue *TOp0 = transValue(Op0, BB);
+  SPIRVValue *TOp1 = transValue(Cmp->getOperand(1), BB);
+  // TODO: once the translator supports SPIR-V 1.4, update the condition below:
+  // if (/* */->isPointerTy() && /* it is not allowed to use SPIR-V 1.4 */)
+  if (Op0->getType()->isPointerTy()) {
+    unsigned AS = cast<PointerType>(Op0->getType())->getAddressSpace();
+    SPIRVType *Ty = transType(getSizetType(AS));
+    TOp0 = BM->addUnaryInst(OpConvertPtrToU, Ty, TOp0, BB);
+    TOp1 = BM->addUnaryInst(OpConvertPtrToU, Ty, TOp1, BB);
+  }
+  SPIRVInstruction *BI =
+      BM->addCmpInst(transBoolOpCode(TOp0, CmpMap::map(Cmp->getPredicate())),
+                     transType(Cmp->getType()), TOp0, TOp1, BB);
   return BI;
 }
 
@@ -714,6 +725,11 @@ SPIRVValue *LLVMToSPIRV::transValueWithoutDecoration(Value *V,
     return transFunctionDecl(F);
 
   if (auto GV = dyn_cast<GlobalVariable>(V)) {
+    if (GV->getName() == "llvm.global.annotations") {
+      transGlobalAnnotation(GV);
+      return nullptr;
+    }
+
     llvm::PointerType *Ty = GV->getType();
     // Though variables with common linkage type are initialized by 0,
     // they can be represented in SPIR-V as uninitialized variables with
@@ -1207,6 +1223,11 @@ void addIntelFPGADecorations(
     return;
 
   for (const auto &I : Decorations) {
+    // Such decoration already exists on a type, skip it
+    if (E->hasDecorate(I.first, /*Index=*/0, /*Result=*/nullptr)) {
+      continue;
+    }
+
     switch (I.first) {
     case DecorationUserSemantic:
       E->addDecorate(new SPIRVDecorateUserSemanticAttr(E, I.second));
@@ -1319,6 +1340,15 @@ SPIRVValue *LLVMToSPIRV::transIntrinsicInst(IntrinsicInst *II,
     SPIRVValue *Op = transValue(II->getArgOperand(0), BB);
     return BM->addUnaryInst(OpBitReverse, Ty, Op, BB);
   }
+  case Intrinsic::ctlz:
+  case Intrinsic::cttz: {
+    SPIRVWord ExtOp = II->getIntrinsicID() == Intrinsic::ctlz ? OpenCLLIB::Clz
+                                                              : OpenCLLIB::Ctz;
+    SPIRVType *Ty = transType(II->getType());
+    std::vector<SPIRVValue *> Ops(1, transValue(II->getArgOperand(0), BB));
+    return BM->addExtInst(Ty, BM->getExtInstSetId(SPIRVEIS_OpenCL), ExtOp, Ops,
+                          BB);
+  }
   case Intrinsic::fmuladd: {
     // For llvm.fmuladd.* fusion is not guaranteed. If a fused multiply-add
     // is required the corresponding llvm.fma.* intrinsic function should be
@@ -1395,9 +1425,8 @@ SPIRVValue *LLVMToSPIRV::transIntrinsicInst(IntrinsicInst *II,
     if (!GEP)
       return nullptr;
     Constant *C = cast<Constant>(GEP->getOperand(0));
-    // TODO: Refactor to use getConstantStringInfo()
-    StringRef AnnotationString =
-        cast<ConstantDataArray>(C->getOperand(0))->getAsCString();
+    StringRef AnnotationString;
+    getConstantStringInfo(C, AnnotationString);
 
     if (AnnotationString == kOCLBuiltinName::FPGARegIntel) {
       if (BM->isAllowedToUseExtension(ExtensionID::SPV_INTEL_fpga_reg))
@@ -1419,9 +1448,8 @@ SPIRVValue *LLVMToSPIRV::transIntrinsicInst(IntrinsicInst *II,
 
     GetElementPtrInst *GEP = cast<GetElementPtrInst>(II->getArgOperand(1));
     Constant *C = cast<Constant>(GEP->getOperand(0));
-    // TODO: Refactor to use getConstantStringInfo()
-    StringRef AnnotationString =
-        cast<ConstantDataArray>(C->getOperand(0))->getAsString();
+    StringRef AnnotationString;
+    getConstantStringInfo(C, AnnotationString);
 
     std::vector<std::pair<Decoration, std::string>> Decorations;
     if (BB->getModule()->isAllowedToUseExtension(
@@ -1433,8 +1461,7 @@ SPIRVValue *LLVMToSPIRV::transIntrinsicInst(IntrinsicInst *II,
     // If we didn't find any IntelFPGA-specific decorations, let's add the whole
     // annotation string as UserSemantic Decoration
     if (Decorations.empty()) {
-      SV->addDecorate(new SPIRVDecorateUserSemanticAttr(
-          SV, AnnotationString.substr(0, AnnotationString.size() - 1)));
+      SV->addDecorate(new SPIRVDecorateUserSemanticAttr(SV, AnnotationString));
     } else {
       addIntelFPGADecorations(SV, Decorations);
     }
@@ -1443,9 +1470,8 @@ SPIRVValue *LLVMToSPIRV::transIntrinsicInst(IntrinsicInst *II,
   case Intrinsic::ptr_annotation: {
     GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(II->getArgOperand(1));
     Constant *C = dyn_cast<Constant>(GEP->getOperand(0));
-    // TODO: Refactor to use getConstantStringInfo()
-    StringRef AnnotationString =
-        dyn_cast<ConstantDataArray>(C->getOperand(0))->getAsCString();
+    StringRef AnnotationString;
+    getConstantStringInfo(C, AnnotationString);
 
     // Strip all bitcast and addrspace casts from the pointer argument:
     //   llvm annotation intrinsic only takes i8*, so the original pointer
@@ -1601,6 +1627,40 @@ SPIRVWord LLVMToSPIRV::transFunctionControlMask(Function *F) {
   return FCM;
 }
 
+void LLVMToSPIRV::transGlobalAnnotation(GlobalVariable *V) {
+  SPIRVDBG(dbgs() << "[transGlobalAnnotation] " << *V << '\n');
+
+  // @llvm.global.annotations is an array that contains structs with 4 fields.
+  // Get the array of structs with metadata
+  ConstantArray *CA = cast<ConstantArray>(V->getOperand(0));
+  for (Value *Op : CA->operands()) {
+    ConstantStruct *CS = cast<ConstantStruct>(Op);
+    // The first field of the struct contains a pointer to annotated variable
+    Value *AnnotatedVar = CS->getOperand(0)->stripPointerCasts();
+    SPIRVValue *SV = transValue(AnnotatedVar, nullptr);
+
+    // The second field contains a pointer to a global annotation string
+    GlobalVariable *GV =
+        cast<GlobalVariable>(CS->getOperand(1)->stripPointerCasts());
+
+    StringRef AnnotationString;
+    getConstantStringInfo(GV, AnnotationString);
+
+    std::vector<std::pair<Decoration, std::string>> Decorations;
+    if (BM->isAllowedToUseExtension(
+            ExtensionID::SPV_INTEL_fpga_memory_attributes))
+      Decorations = tryParseIntelFPGAAnnotationString(AnnotationString);
+
+    // If we didn't find any IntelFPGA-specific decorations, let's
+    // add the whole annotation string as UserSemantic Decoration
+    if (Decorations.empty()) {
+      SV->addDecorate(new SPIRVDecorateUserSemanticAttr(SV, AnnotationString));
+    } else {
+      addIntelFPGADecorations(SV, Decorations);
+    }
+  }
+}
+
 bool LLVMToSPIRV::transGlobalVariables() {
   for (auto I = M->global_begin(), E = M->global_end(); I != E; ++I) {
     if (!transValue(&(*I), nullptr))
@@ -1760,9 +1820,9 @@ bool LLVMToSPIRV::translate() {
   return true;
 }
 
-llvm::IntegerType *LLVMToSPIRV::getSizetType() {
+llvm::IntegerType *LLVMToSPIRV::getSizetType(unsigned AS) {
   return IntegerType::getIntNTy(M->getContext(),
-                                M->getDataLayout().getPointerSizeInBits());
+                                M->getDataLayout().getPointerSizeInBits(AS));
 }
 
 void LLVMToSPIRV::oclGetMutatedArgumentTypesByBuiltin(
diff --git a/llvm-spirv/lib/SPIRV/SPIRVWriter.h b/llvm-spirv/lib/SPIRV/SPIRVWriter.h
index d322605540019..d480f158d1f9d 100644
--- a/llvm-spirv/lib/SPIRV/SPIRVWriter.h
+++ b/llvm-spirv/lib/SPIRV/SPIRVWriter.h
@@ -110,6 +110,7 @@ class LLVMToSPIRV : public ModulePass {
   SPIRVValue *transConstant(Value *V);
   SPIRVValue *transValue(Value *V, SPIRVBasicBlock *BB,
                          bool CreateForward = true);
+  void transGlobalAnnotation(GlobalVariable *V);
   SPIRVValue *transValueWithoutDecoration(Value *V, SPIRVBasicBlock *BB,
                                           bool CreateForward = true);
 
@@ -131,7 +132,7 @@ class LLVMToSPIRV : public ModulePass {
   SPIRVValue *mapValue(Value *V, SPIRVValue *BV);
   SPIRVType *getSPIRVType(Type *T) { return TypeMap[T]; }
   SPIRVErrorLog &getErrorLog() { return BM->getErrorLog(); }
-  llvm::IntegerType *getSizetType();
+  llvm::IntegerType *getSizetType(unsigned AS = 0);
   std::vector<SPIRVValue *> transValue(const std::vector<Value *> &Values,
                                        SPIRVBasicBlock *BB);
   std::vector<SPIRVWord> transValue(const std::vector<Value *> &Values,
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.h
index fd141e865927d..ccdf2a20fcd4f 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.h
@@ -767,7 +767,6 @@ template <spv::Op OC> bool isa(SPIRVEntry *E) {
 #define _SPIRV_OP(x) typedef SPIRVEntryUnimplemented<Op##x> SPIRV##x;
 _SPIRV_OP(Nop)
 _SPIRV_OP(SourceContinued)
-_SPIRV_OP(TypeMatrix)
 _SPIRV_OP(TypeRuntimeArray)
 _SPIRV_OP(SpecConstantTrue)
 _SPIRV_OP(SpecConstantFalse)
@@ -787,10 +786,7 @@ _SPIRV_OP(ImageDrefGather)
 _SPIRV_OP(QuantizeToF16)
 _SPIRV_OP(Transpose)
 _SPIRV_OP(ArrayLength)
-_SPIRV_OP(SMod)
-_SPIRV_OP(MatrixTimesScalar)
 _SPIRV_OP(VectorTimesMatrix)
-_SPIRV_OP(MatrixTimesVector)
 _SPIRV_OP(MatrixTimesMatrix)
 _SPIRV_OP(OuterProduct)
 _SPIRV_OP(IAddCarry)
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
index f8e428844b2e0..82325048d9467 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h
@@ -1057,8 +1057,8 @@ class SPIRVLoopMerge : public SPIRVInstruction {
 
   SPIRVId getMergeBlock() { return MergeBlock; }
   SPIRVId getContinueTarget() { return ContinueTarget; }
-  SPIRVWord getLoopControl() { return LoopControl; }
-  std::vector<SPIRVWord> getLoopControlParameters() {
+  SPIRVWord getLoopControl() const { return LoopControl; }
+  std::vector<SPIRVWord> getLoopControlParameters() const {
     return LoopControlParameters;
   }
 
@@ -1105,7 +1105,7 @@ class SPIRVSwitch : public SPIRVInstruction {
     setHasNoId();
     setHasNoType();
   }
-  std::vector<SPIRVValue *> getPairs() { return getValues(Pairs); }
+  std::vector<SPIRVValue *> getPairs() const { return getValues(Pairs); }
   SPIRVValue *getSelect() const { return getValue(Select); }
   SPIRVBasicBlock *getDefault() const {
     return static_cast<SPIRVBasicBlock *>(getValue(Default));
@@ -1153,22 +1153,21 @@ class SPIRVSwitch : public SPIRVInstruction {
   std::vector<SPIRVWord> Pairs;
 };
 
-class SPIRVFMod : public SPIRVInstruction {
+class SPIRVFSMod : public SPIRVInstruction {
 public:
-  static const Op OC = OpFMod;
   static const SPIRVWord FixedWordCount = 4;
-  // Complete constructor
-  SPIRVFMod(SPIRVType *TheType, SPIRVId TheId, SPIRVId TheDividend,
-            SPIRVId TheDivisor, SPIRVBasicBlock *BB)
+  SPIRVFSMod(Op OC, SPIRVType *TheType, SPIRVId TheId, SPIRVId TheDividend,
+             SPIRVId TheDivisor, SPIRVBasicBlock *BB)
       : SPIRVInstruction(5, OC, TheType, TheId, BB), Dividend(TheDividend),
         Divisor(TheDivisor) {
     validate();
     assert(BB && "Invalid BB");
   }
   // Incomplete constructor
-  SPIRVFMod()
+  SPIRVFSMod(Op OC)
       : SPIRVInstruction(OC), Dividend(SPIRVID_INVALID),
         Divisor(SPIRVID_INVALID) {}
+
   SPIRVValue *getDividend() const { return getValue(Dividend); }
   SPIRVValue *getDivisor() const { return getValue(Divisor); }
 
@@ -1195,6 +1194,28 @@ class SPIRVFMod : public SPIRVInstruction {
   SPIRVId Divisor;
 };
 
+class SPIRVFMod : public SPIRVFSMod {
+public:
+  static const Op OC = OpFMod;
+  // Complete constructor
+  SPIRVFMod(SPIRVType *TheType, SPIRVId TheId, SPIRVId TheDividend,
+            SPIRVId TheDivisor, SPIRVBasicBlock *BB)
+      : SPIRVFSMod(OC, TheType, TheId, TheDividend, TheDivisor, BB) {}
+  // Incomplete constructor
+  SPIRVFMod() : SPIRVFSMod(OC) {}
+};
+
+class SPIRVSMod : public SPIRVFSMod {
+public:
+  static const Op OC = OpSMod;
+  // Complete constructor
+  SPIRVSMod(SPIRVType *TheType, SPIRVId TheId, SPIRVId TheDividend,
+            SPIRVId TheDivisor, SPIRVBasicBlock *BB)
+      : SPIRVFSMod(OC, TheType, TheId, TheDividend, TheDivisor, BB) {}
+  // Incomplete constructor
+  SPIRVSMod() : SPIRVFSMod(OC) {}
+};
+
 class SPIRVVectorTimesScalar : public SPIRVInstruction {
 public:
   static const Op OC = OpVectorTimesScalar;
@@ -1248,6 +1269,120 @@ class SPIRVVectorTimesScalar : public SPIRVInstruction {
   SPIRVId Scalar;
 };
 
+class SPIRVMatrixTimesScalar : public SPIRVInstruction {
+public:
+  static const Op OC = OpMatrixTimesScalar;
+  static const SPIRVWord FixedWordCount = 4;
+  // Complete constructor
+  SPIRVMatrixTimesScalar(SPIRVType *TheType, SPIRVId TheId, SPIRVId TheMatrix,
+                         SPIRVId TheScalar, SPIRVBasicBlock *BB)
+      : SPIRVInstruction(5, OC, TheType, TheId, BB), Matrix(TheMatrix),
+        Scalar(TheScalar) {
+    validate();
+    assert(BB && "Invalid BB");
+  }
+  // Incomplete constructor
+  SPIRVMatrixTimesScalar()
+      : SPIRVInstruction(OC), Matrix(SPIRVID_INVALID), Scalar(SPIRVID_INVALID) {
+  }
+  SPIRVValue *getMatrix() const { return getValue(Matrix); }
+  SPIRVValue *getScalar() const { return getValue(Scalar); }
+
+  std::vector<SPIRVValue *> getOperands() override {
+    std::vector<SPIRVId> Operands;
+    Operands.push_back(Matrix);
+    Operands.push_back(Scalar);
+    return getValues(Operands);
+  }
+
+  void setWordCount(SPIRVWord FixedWordCount) override {
+    SPIRVEntry::setWordCount(FixedWordCount);
+  }
+
+  _SPIRV_DEF_ENCDEC4(Type, Id, Matrix, Scalar)
+
+  void validate() const override {
+    SPIRVInstruction::validate();
+    if (getValue(Matrix)->isForward() || getValue(Scalar)->isForward())
+      return;
+
+    SPIRVType *Ty = getType()->getScalarType();
+    SPIRVType *MTy = getValueType(Matrix)->getScalarType();
+    SPIRVType *STy = getValueType(Scalar);
+
+    (void)Ty;
+    (void)MTy;
+    (void)STy;
+    assert(Ty->isTypeFloat() && "Invalid result type for OpMatrixTimesScalar");
+    assert(MTy->isTypeFloat() && "Invalid Matrix type for OpMatrixTimesScalar");
+    assert(STy->isTypeFloat() && "Invalid Scalar type for OpMatrixTimesScalar");
+    assert(Ty == MTy && Ty == STy && "Mismatch float type");
+  }
+
+private:
+  SPIRVId Matrix;
+  SPIRVId Scalar;
+};
+
+class SPIRVMatrixTimesVector : public SPIRVInstruction {
+public:
+  static const Op OC = OpMatrixTimesVector;
+  static const SPIRVWord FixedWordCount = 4;
+
+  // Complete constructor
+  SPIRVMatrixTimesVector(SPIRVType *TheType, SPIRVId TheId, SPIRVId TheMatrix,
+                         SPIRVId TheVector, SPIRVBasicBlock *BB)
+      : SPIRVInstruction(5, OC, TheType, TheId, BB), Matrix(TheMatrix),
+        Vector(TheVector) {
+    validate();
+    assert(BB && "Invalid BB");
+  }
+
+  // Incomplete constructor
+  SPIRVMatrixTimesVector()
+      : SPIRVInstruction(OC), Matrix(SPIRVID_INVALID), Vector(SPIRVID_INVALID) {
+  }
+
+  SPIRVValue *getMatrix() const { return getValue(Matrix); }
+
+  SPIRVValue *getVector() const { return getValue(Vector); }
+
+  std::vector<SPIRVValue *> getOperands() override {
+    std::vector<SPIRVId> Operands;
+    Operands.push_back(Matrix);
+    Operands.push_back(Vector);
+    return getValues(Operands);
+  }
+
+  void setWordCount(SPIRVWord FixedWordCount) override {
+    SPIRVEntry::setWordCount(FixedWordCount);
+  }
+
+  _SPIRV_DEF_ENCDEC4(Type, Id, Matrix, Vector)
+
+  void validate() const override {
+    SPIRVInstruction::validate();
+    if (getValue(Matrix)->isForward() || getValue(Vector)->isForward())
+      return;
+    SPIRVType *Ty = getType()->getScalarType();
+    SPIRVType *MTy = getValueType(Matrix)->getScalarType();
+    SPIRVType *VTy = getValueType(Vector)->getScalarType();
+
+    (void)Ty;
+    (void)MTy;
+    (void)VTy;
+    assert(Ty->isTypeFloat() && "Invalid result type for OpMatrixTimesVector");
+    assert(MTy->isTypeFloat() && "Invalid Matrix type for OpMatrixTimesVector");
+    assert(VTy->isTypeFloat() && "Invalid Vector type for OpMatrixTimesVector");
+
+    assert(Ty == MTy && Ty == VTy && "Mismatch float type");
+  }
+
+private:
+  SPIRVId Matrix;
+  SPIRVId Vector;
+};
+
 class SPIRVUnary : public SPIRVInstTemplateBase {
 protected:
   void validate() const override {
@@ -1361,9 +1496,9 @@ class SPIRVLoopControlINTEL : public SPIRVInstruction {
     setHasNoType();
   }
 
-  SPIRVWord getLoopControl() { return LoopControl; }
+  SPIRVWord getLoopControl() const { return LoopControl; }
 
-  std::vector<SPIRVWord> getLoopControlParameters() {
+  std::vector<SPIRVWord> getLoopControlParameters() const {
     return LoopControlParameters;
   }
 
@@ -2319,6 +2454,10 @@ class SPIRVSubgroupShuffleINTELInstBase : public SPIRVInstTemplateBase {
   SPIRVCapVec getRequiredCapability() const override {
     return getVec(CapabilitySubgroupShuffleINTEL);
   }
+
+  SPIRVExtSet getRequiredExtensions() const override {
+    return getSet(ExtensionID::SPV_INTEL_subgroups);
+  }
 };
 
 #define _SPIRV_OP(x, ...)                                                      \
@@ -2337,6 +2476,10 @@ class SPIRVSubgroupBufferBlockIOINTELInstBase : public SPIRVInstTemplateBase {
   SPIRVCapVec getRequiredCapability() const override {
     return getVec(CapabilitySubgroupBufferBlockIOINTEL);
   }
+
+  SPIRVExtSet getRequiredExtensions() const override {
+    return getSet(ExtensionID::SPV_INTEL_subgroups);
+  }
 };
 
 #define _SPIRV_OP(x, ...)                                                      \
@@ -2353,6 +2496,10 @@ class SPIRVSubgroupImageBlockIOINTELInstBase : public SPIRVInstTemplateBase {
   SPIRVCapVec getRequiredCapability() const override {
     return getVec(CapabilitySubgroupImageBlockIOINTEL);
   }
+
+  SPIRVExtSet getRequiredExtensions() const override {
+    return getSet(ExtensionID::SPV_INTEL_subgroups);
+  }
 };
 
 #define _SPIRV_OP(x, ...)                                                      \
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp
index aa5b48c66a8c3..2cb364ea7dd0a 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp
@@ -373,6 +373,14 @@ class SPIRVModuleImpl : public SPIRVModule {
                                              SPIRVId TheVector,
                                              SPIRVId TheScalar,
                                              SPIRVBasicBlock *BB) override;
+  SPIRVInstruction *addMatrixTimesScalarInst(SPIRVType *TheType,
+                                             SPIRVId TheMatrix,
+                                             SPIRVId TheScalar,
+                                             SPIRVBasicBlock *BB) override;
+  SPIRVInstruction *addMatrixTimesVectorInst(SPIRVType *TheType,
+                                             SPIRVId TheMatrix,
+                                             SPIRVId TheVector,
+                                             SPIRVBasicBlock *BB) override;
   SPIRVInstruction *addUnaryInst(Op, SPIRVType *, SPIRVValue *,
                                  SPIRVBasicBlock *) override;
   SPIRVInstruction *addVariable(SPIRVType *, bool, SPIRVLinkageTypeKind,
@@ -1063,6 +1071,22 @@ SPIRVModuleImpl::addVectorTimesScalarInst(SPIRVType *TheType, SPIRVId TheVector,
       new SPIRVVectorTimesScalar(TheType, getId(), TheVector, TheScalar, BB));
 }
 
+SPIRVInstruction *
+SPIRVModuleImpl::addMatrixTimesScalarInst(SPIRVType *TheType, SPIRVId TheMatrix,
+                                          SPIRVId TheScalar,
+                                          SPIRVBasicBlock *BB) {
+  return BB->addInstruction(
+      new SPIRVMatrixTimesScalar(TheType, getId(), TheMatrix, TheScalar, BB));
+}
+
+SPIRVInstruction *
+SPIRVModuleImpl::addMatrixTimesVectorInst(SPIRVType *TheType, SPIRVId TheMatrix,
+                                          SPIRVId TheVector,
+                                          SPIRVBasicBlock *BB) {
+  return BB->addInstruction(
+      new SPIRVMatrixTimesVector(TheType, getId(), TheMatrix, TheVector, BB));
+}
+
 SPIRVInstruction *
 SPIRVModuleImpl::addGroupInst(Op OpCode, SPIRVType *Type, Scope Scope,
                               const std::vector<SPIRVValue *> &Ops,
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.h
index d72676379c223..225f1f77d551b 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.h
@@ -374,6 +374,14 @@ class SPIRVModule {
                                                      SPIRVId TheVector,
                                                      SPIRVId TheScalar,
                                                      SPIRVBasicBlock *BB) = 0;
+  virtual SPIRVInstruction *addMatrixTimesScalarInst(SPIRVType *TheType,
+                                                     SPIRVId TheMatrix,
+                                                     SPIRVId TheScalar,
+                                                     SPIRVBasicBlock *BB) = 0;
+  virtual SPIRVInstruction *addMatrixTimesVectorInst(SPIRVType *TheType,
+                                                     SPIRVId TheMatrix,
+                                                     SPIRVId TheVector,
+                                                     SPIRVBasicBlock *BB) = 0;
   virtual SPIRVInstruction *addUnaryInst(Op, SPIRVType *, SPIRVValue *,
                                          SPIRVBasicBlock *) = 0;
   virtual SPIRVInstruction *addVariable(SPIRVType *, bool, SPIRVLinkageTypeKind,
@@ -427,6 +435,10 @@ class SPIRVModule {
     return true;
   }
 
+  virtual bool isGenArgNameMDEnabled() const final {
+    return TranslationOpts.isGenArgNameMDEnabled();
+  }
+
   // I/O functions
   friend spv_ostream &operator<<(spv_ostream &O, SPIRVModule &M);
   friend std::istream &operator>>(std::istream &I, SPIRVModule &M);
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVStream.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVStream.h
index c0cc69b768df0..d7c26614aff11 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVStream.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVStream.h
@@ -103,12 +103,38 @@ const SPIRVDecoder &decodeBinary(const SPIRVDecoder &I, T &V) {
   return I;
 }
 
+#ifdef _SPIRV_SUPPORT_TEXT_FMT
+/// Skip comment and whitespace. Comment starts with ';', ends with '\n'.
+inline std::istream &skipcomment(std::istream &IS) {
+  if (IS.eof() || IS.bad())
+    return IS;
+
+  char C = IS.peek();
+
+  while (std::char_traits<char>::not_eof(C) && std::isspace(C)) {
+    IS.get();
+    C = IS.peek();
+  }
+
+  while (std::char_traits<char>::not_eof(C) && C == ';') {
+    IS.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
+    C = IS.peek();
+    while (std::char_traits<char>::not_eof(C) && std::isspace(C)) {
+      IS.get();
+      C = IS.peek();
+    }
+  }
+
+  return IS;
+}
+#endif
+
 template <typename T>
 const SPIRVDecoder &operator>>(const SPIRVDecoder &I, T &V) {
 #ifdef _SPIRV_SUPPORT_TEXT_FMT
   if (SPIRVUseTextFormat) {
     uint32_t W;
-    I.IS >> W;
+    I.IS >> skipcomment >> W;
     V = static_cast<T>(W);
     SPIRVDBG(spvdbgs() << "Read word: W = " << W << " V = " << V << '\n');
     return I;
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVType.cpp b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVType.cpp
index 8187a7e749048..72a94cce7d026 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVType.cpp
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVType.cpp
@@ -114,6 +114,36 @@ SPIRVType *SPIRVType::getVectorComponentType() const {
   return static_cast<const SPIRVTypeVector *>(this)->getComponentType();
 }
 
+SPIRVWord SPIRVType::getMatrixColumnCount() const {
+  assert(OpCode == OpTypeMatrix && "Not matrix type");
+  return static_cast<const SPIRVTypeMatrix *const>(this)->getColumnCount();
+}
+
+SPIRVType *SPIRVType::getMatrixColumnType() const {
+  assert(OpCode == OpTypeMatrix && "Not matrix type");
+  return static_cast<const SPIRVTypeMatrix *const>(this)->getColumnType();
+}
+
+SPIRVType *SPIRVType::getScalarType() const {
+  switch (OpCode) {
+  case OpTypePointer:
+    return getPointerElementType()->getScalarType();
+  case OpTypeArray:
+    return getArrayElementType();
+  case OpTypeVector:
+    return getVectorComponentType();
+  case OpTypeMatrix:
+    return getMatrixColumnType()->getVectorComponentType();
+  case OpTypeInt:
+  case OpTypeFloat:
+  case OpTypeBool:
+    return const_cast<SPIRVType *>(this);
+  default:
+    break;
+  }
+  return nullptr;
+}
+
 bool SPIRVType::isTypeVoid() const { return OpCode == OpTypeVoid; }
 bool SPIRVType::isTypeArray() const { return OpCode == OpTypeArray; }
 
diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVType.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVType.h
index a3ce1d0ab8af4..9cb0041a012c9 100644
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVType.h
+++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVType.h
@@ -75,6 +75,9 @@ class SPIRVType : public SPIRVEntry {
   SPIRVWord getStructMemberCount() const;
   SPIRVWord getVectorComponentCount() const;
   SPIRVType *getVectorComponentType() const;
+  SPIRVWord getMatrixColumnCount() const;
+  SPIRVType *getMatrixColumnType() const;
+  SPIRVType *getScalarType() const;
 
   bool isTypeVoid() const;
   bool isTypeArray() const;
@@ -310,6 +313,48 @@ class SPIRVTypeVector : public SPIRVType {
   SPIRVWord CompCount; // Component Count
 };
 
+class SPIRVTypeMatrix : public SPIRVType {
+public:
+  // Complete constructor
+  SPIRVTypeMatrix(SPIRVModule *M, SPIRVId TheId, SPIRVType *TheColType,
+                  SPIRVWord TheColCount)
+      : SPIRVType(M, 4, OpTypeMatrix, TheId), ColType(TheColType),
+        ColCount(TheColCount) {
+    validate();
+  }
+  // Incomplete constructor
+  SPIRVTypeMatrix() : SPIRVType(OpTypeMatrix), ColType(nullptr), ColCount(0) {}
+
+  SPIRVType *getColumnType() const { return ColType; }
+  SPIRVWord getColumnCount() const { return ColCount; }
+
+  bool isValidIndex(SPIRVWord Index) const { return Index < ColCount; }
+
+  SPIRVCapVec getRequiredCapability() const override {
+    SPIRVCapVec V(getColumnType()->getRequiredCapability());
+    if (ColCount >= 8)
+      V.push_back(CapabilityVector16);
+    return V;
+  }
+
+  virtual std::vector<SPIRVEntry *> getNonLiteralOperands() const override {
+    return std::vector<SPIRVEntry *>(1, ColType);
+  }
+
+  void validate() const override {
+    SPIRVEntry::validate();
+    ColType->validate();
+    assert(ColCount >= 2);
+  }
+
+protected:
+  _SPIRV_DEF_ENCDEC3(Id, ColType, ColCount)
+
+private:
+  SPIRVType *ColType; // Column Type
+  SPIRVWord ColCount; // Column Count
+};
+
 class SPIRVConstant;
 class SPIRVTypeArray : public SPIRVType {
 public:
diff --git a/llvm-spirv/test/ComparePointers.cl b/llvm-spirv/test/ComparePointers.cl
new file mode 100644
index 0000000000000..fa484fe0ee2f6
--- /dev/null
+++ b/llvm-spirv/test/ComparePointers.cl
@@ -0,0 +1,28 @@
+kernel void test(int global *in, int global *in2) {
+  if (!in) 
+    return;
+  if (in == 1)
+    return; 
+  if (in > in2)   
+    return;
+  if (in < in2)
+    return;
+}
+// RUN: %clang_cc1 -triple spir64 -x cl -cl-std=CL2.0 -O0 -emit-llvm-bc %s -o %t.bc
+// RUN: llvm-spirv %t.bc -spirv-text -o %t.spt
+// RUN: FileCheck < %t.spt %s --check-prefix=CHECK-SPIRV
+// RUN: llvm-spirv %t.bc -o %t.spv
+// RUN: spirv-val %t.spv
+
+// CHECK-SPIRV:ConvertPtrToU 
+// CHECK-SPIRV:ConvertPtrToU
+// CHECK-SPIRV:INotEqual
+// CHECK-SPIRV:ConvertPtrToU
+// CHECK-SPIRV:ConvertPtrToU
+// CHECK-SPIRV:IEqual
+// CHECK-SPIRV:ConvertPtrToU
+// CHECK-SPIRV:ConvertPtrToU
+// CHECK-SPIRV:UGreaterThan
+// CHECK-SPIRV:ConvertPtrToU
+// CHECK-SPIRV:ConvertPtrToU
+// CHECK-SPIRV:ULessThan
diff --git a/llvm-spirv/test/DebugInfo/DebugControlFlow.cl b/llvm-spirv/test/DebugInfo/DebugControlFlow.cl
index bb17b98a2147a..fcdaa31f74307 100644
--- a/llvm-spirv/test/DebugInfo/DebugControlFlow.cl
+++ b/llvm-spirv/test/DebugInfo/DebugControlFlow.cl
@@ -10,11 +10,6 @@
 // between LoopMerge and Branch/BranchConditional instructions.
 // Otherwise, debug info interferes with SPIRVToLLVM translation
 // of structured flow control
-//
-// Currently, Line DebugInfo instructions are still present
-// between LoopMerge and Branch/BranchConditional instructions.
-// This does not affect SPIRVToLLVM translation, however
-// should be fixed separately
 
 kernel
 void sample() {
@@ -29,12 +24,19 @@ void sample() {
   } while (j++ < 10);
 }
 
+// Check that all Line items are retained
+// CHECK-SPIRV: Line [[File:[0-9]+]] 18 0
+// Control flow
 // CHECK-SPIRV: {{[0-9]+}} LoopMerge [[MergeBlock:[0-9]+]] [[ContinueTarget:[0-9]+]] 1
-// CHECK-SPIRV-NOT: ExtInst
-// CHECK-SPIRV: BranchConditional
+// CHECK-SPIRV-NEXT: BranchConditional
+
+// Check that all Line items are retained
+// CHECK-SPIRV: Line [[File]] 23 0
+// CHECK-SPIRV: Line [[File]] 24 0
+// Control flow
 // CHECK-SPIRV: {{[0-9]+}} LoopMerge [[MergeBlock:[0-9]+]] [[ContinueTarget:[0-9]+]] 1
-// CHECK-SPIRV-NOT: ExtInst
-// CHECK-SPIRV: Branch
-// CHECK-LLVM: br i1 %{{.*}}, label %{{.*}}, label %{{.*}}, !dbg !{{[0-9]+}}, !llvm.loop ![[MD:[0-9]+]]
+// CHECK-SPIRV-NEXT: Branch
+
+// CHECK-LLVM: br label %{{.*}}, !dbg !{{[0-9]+}}, !llvm.loop ![[MD:[0-9]+]]
 // CHECK-LLVM: ![[MD]] = distinct !{![[MD]], ![[MD_unroll:[0-9]+]]}
 // CHECK-LLVM: ![[MD_unroll]] = !{!"llvm.loop.unroll.enable"}
diff --git a/llvm-spirv/test/DebugInfo/DebugUnstructuredControlFlow.cl b/llvm-spirv/test/DebugInfo/DebugUnstructuredControlFlow.cl
index 7863924b54691..bb938ef02cd30 100644
--- a/llvm-spirv/test/DebugInfo/DebugUnstructuredControlFlow.cl
+++ b/llvm-spirv/test/DebugInfo/DebugUnstructuredControlFlow.cl
@@ -7,8 +7,7 @@
 
 // Test that no debug info instruction is inserted between LoopControlINTEL and
 // Branch instructions. Otherwise, debug info interferes with SPIRVToLLVM
-// translation of structured flow control. Yet, Line DebugInfo instruction is
-// still presenting between LoopControlINTEL and Branch instructions.
+// translation of structured flow control
 
 kernel
 void sample() {
@@ -16,10 +15,12 @@ void sample() {
   for(;;);
 }
 
+// Check that all Line items are retained
+// CHECK-SPIRV: Line [[File:[0-9]+]] 15 0
+// Loop control
 // CHECK-SPIRV: 2 LoopControlINTEL 1
-// CHECK-SPIRV-NOT: ExtInst
-// CHECK-SPIRV: {{[0-9]+}} Line {{[0-9]+}} {{[0-9]+}} {{[0-9]+}}
-// CHECK-SPIRV: Branch
+// CHECK-SPIRV-NEXT: Branch
+
 // CHECK-LLVM: br label %{{.*}}, !dbg !{{[0-9]+}}, !llvm.loop ![[MD:[0-9]+]]
 // CHECK-LLVM: ![[MD]] = distinct !{![[MD]], ![[MD_unroll:[0-9]+]]}
 // CHECK-LLVM: ![[MD_unroll]] = !{!"llvm.loop.unroll.enable"}
diff --git a/llvm-spirv/test/DebugInfo/LocalAddressSpace.ll b/llvm-spirv/test/DebugInfo/LocalAddressSpace.ll
index 025f7e8ea4e28..e0d470f12f572 100644
--- a/llvm-spirv/test/DebugInfo/LocalAddressSpace.ll
+++ b/llvm-spirv/test/DebugInfo/LocalAddressSpace.ll
@@ -22,7 +22,7 @@
 ; CHECK: DW_TAG_variable
 ; CHECK-NEXT: DW_AT_name {{.*}} = "a")
 ; CHECK-NEXT: DW_AT_type {{.*}} "int")
-; CHECK-NEXT: DW_AT_decl_file {{.*}} ("/work/tmp{{[/\\]}}tmp.cl")
+; CHECK-NEXT: DW_AT_decl_file {{.*}} ("/work{{[/\\]}}tmp{{[/\\]}}tmp.cl")
 ; CHECK-NEXT: DW_AT_decl_line {{.*}} (2)
 ; CHECK-NEXT: DW_AT_location [DW_FORM_exprloc]      (DW_OP_addr 0x0)
 
diff --git a/llvm-spirv/test/InfiniteLoopMetadataPlacement.ll b/llvm-spirv/test/InfiniteLoopMetadataPlacement.ll
new file mode 100644
index 0000000000000..b2e552af46efb
--- /dev/null
+++ b/llvm-spirv/test/InfiniteLoopMetadataPlacement.ll
@@ -0,0 +1,105 @@
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv --spirv-ext=+SPV_INTEL_unstructured_loop_controls %t.bc -o %t.spv
+; RUN: llvm-spirv %t.spv --to-text -o - | FileCheck %s --check-prefix=CHECK-SPV
+
+; RUN: llvm-spirv -r %t.spv -o %t.rev.bc
+; RUN: llvm-dis %t.rev.bc -o - | FileCheck %s --check-prefix=CHECK-REV-LLVM
+
+; ModuleID = 'llvm_loop_test.cpp'
+source_filename = "llvm_loop_test.cpp"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-linux-sycldevice"
+
+$_ZTS12WhileOneTest = comdat any
+
+; CHECK-SPV: {{[0-9]+}} Name [[WH_COND:[0-9]+]] "while.cond"
+
+; Function Attrs: inlinehint nounwind
+define weak_odr dso_local spir_kernel void @_ZTS12WhileOneTest() #0 comdat !kernel_arg_addr_space !4 !kernel_arg_access_qual !4 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !4 {
+entry:
+  %i = alloca i32, align 4
+  %s = alloca i32, align 4
+  %0 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #2
+  store i32 0, i32* %i, align 4, !tbaa !7
+  %1 = bitcast i32* %s to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #2
+  store i32 0, i32* %s, align 4, !tbaa !7
+  br label %while.cond
+
+; CHECK-SPV-NOT: {{[0-9]+}} LoopControlINTEL
+; CHECK-SPV-NOT: {{[0-9]+}} LoopMerge
+
+while.cond:                                       ; preds = %if.end, %entry
+; CHECK-SPV:      {{[0-9]+}} Label [[WH_COND]]
+; CHECK-SPV-NEXT: {{[0-9]+}} LoopControlINTEL 4
+; CHECK-SPV-NEXT: {{[0-9]+}} Branch
+  br label %while.body
+
+; CHECK-SPV-NOT: {{[0-9]+}} LoopControlINTEL
+; CHECK-SPV-NOT: {{[0-9]+}} LoopMerge
+
+while.body:                                       ; preds = %while.cond
+  %2 = load i32, i32* %i, align 4, !tbaa !7
+  %cmp = icmp sge i32 %2, 16
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %while.body
+  br label %while.end
+
+if.else:                                          ; preds = %while.body
+  %3 = load i32, i32* %i, align 4, !tbaa !7
+  %4 = load i32, i32* %s, align 4, !tbaa !7
+  %add = add nsw i32 %4, %3
+  store i32 %add, i32* %s, align 4, !tbaa !7
+  br label %if.end
+
+; CHECK-REV-LLVM-NOT: br {{.*}}, !llvm.loop
+
+if.end:                                           ; preds = %if.else
+; CHECK-REV-LLVM: if.end:
+  %5 = load i32, i32* %i, align 4, !tbaa !7
+  %inc = add nsw i32 %5, 1
+  store i32 %inc, i32* %i, align 4, !tbaa !7
+  br label %while.cond, !llvm.loop !9
+; CHECK-REV-LLVM: br label %while.cond, !llvm.loop ![[MD_IVDEP:[0-9]+]]
+
+; CHECK-REV-LLVM-NOT: br {{.*}}, !llvm.loop
+
+while.end:                                        ; preds = %if.then
+  %6 = bitcast i32* %s to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %6) #2
+  %7 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %7) #2
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1
+
+attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!opencl.spir.version = !{!1}
+!spirv.Source = !{!2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{i32 4, i32 100000}
+!3 = !{!"clang version 9.0.0"}
+!4 = !{}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !5, i64 0}
+!9 = distinct !{!9, !10}
+!10 = !{!"llvm.loop.ivdep.enable"}
+
+; CHECK-REV-LLVM: ![[MD_IVDEP]] = distinct !{![[MD_IVDEP]], ![[MD_ivdep_enable:[0-9]+]]}
+; CHECK-REV-LLVM: ![[MD_ivdep_enable]] = !{!"llvm.loop.ivdep.enable"}
diff --git a/llvm-spirv/test/IntelFPGAMemoryAttributesForStaticVar.ll b/llvm-spirv/test/IntelFPGAMemoryAttributesForStaticVar.ll
new file mode 100644
index 0000000000000..ec28935ed293c
--- /dev/null
+++ b/llvm-spirv/test/IntelFPGAMemoryAttributesForStaticVar.ll
@@ -0,0 +1,156 @@
+; Source
+; void foo(int a) {
+;   static int a_one [[intelfpga::numbanks(2)]];
+;   a_one = a_one + a;
+; }
+
+; void bar(char b) {
+;   static char b_one [[intelfpga::memory("MLAB")]];
+;   b_one = b_one + b;
+; }
+
+; void baz(int c) {
+;   static int c_one[[clang::annotate("foobarbaz")]];
+;   c_one = c_one + c;
+; }
+
+; template <typename name, typename Func>
+; __attribute__((sycl_kernel)) void kernel_single_task(Func kernelFunc) {
+;   kernelFunc();
+; }
+
+; int main() {
+;   kernel_single_task<class kernel_function>([]() {
+;     foo(128);
+;     bar(42);
+;     baz(16);
+;   });
+;   return 0;
+; }
+
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc --spirv-ext=+SPV_INTEL_fpga_memory_attributes -o %t.spv
+; RUN: llvm-spirv %t.spv -to-text -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+
+; RUN: llvm-spirv -r %t.spv -o %t.rev.bc
+; RUN: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM
+
+; CHECK-SPIRV: Capability FPGAMemoryAttributesINTEL
+; CHECK-SPIRV: Extension "SPV_INTEL_fpga_memory_attributes"
+; CHECK-SPIRV: Decorate {{[0-9]+}} UserSemantic "foobarbaz"
+; CHECK-SPIRV: Decorate {{[0-9]+}} MemoryINTEL "DEFAULT"
+; CHECK-SPIRV: Decorate {{[0-9]+}} MemoryINTEL "MLAB"
+; CHECK-SPIRV: Decorate {{[0-9]+}} NumbanksINTEL 2
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-linux-sycldevice"
+
+%"class._ZTSZ4mainE3$_0.anon" = type { i8 }
+
+; CHECK-LLVM: [[STR:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{numbanks:2}
+; CHECK-LLVM: [[STR2:@[0-9_.]+]] = {{.*}}{memory:MLAB}
+; CHECK-LLVM: [[STR3:@[0-9_.]+]] = {{.*}}foobarbaz
+; CHECK-LLVM: @llvm.global.annotations
+; CHECK-SAME: _ZZ3fooiE5a_one{{.*}}[[STR]]{{.*}}_ZZ3bariE5b_one{{.*}}[[STR2]]{{.*}}_ZZ3baziE5c_one{{.*}}[[STR3]]
+@_ZZ3fooiE5a_one = internal addrspace(1) global i32 0, align 4
+@.str = private unnamed_addr constant [29 x i8] c"{memory:DEFAULT}{numbanks:2}\00", section "llvm.metadata"
+@.str.1 = private unnamed_addr constant [9 x i8] c"test.cpp\00", section "llvm.metadata"
+@_ZZ3barcE5b_one = internal addrspace(1) global i8 0, align 1
+@.str.2 = private unnamed_addr constant [14 x i8] c"{memory:MLAB}\00", section "llvm.metadata"
+@_ZZ3baziE5c_one = internal addrspace(1) global i32 0, align 4
+@.str.3 = private unnamed_addr constant [10 x i8] c"foobarbaz\00", section "llvm.metadata"
+@llvm.global.annotations = appending global [3 x { i8 addrspace(1)*, i8*, i8*, i32 }] [{ i8 addrspace(1)*, i8*, i8*, i32 } { i8 addrspace(1)* bitcast (i32 addrspace(1)* @_ZZ3fooiE5a_one to i8 addrspace(1)*), i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str.1, i32 0, i32 0), i32 2 }, { i8 addrspace(1)*, i8*, i8*, i32 } { i8 addrspace(1)* @_ZZ3barcE5b_one, i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str.1, i32 0, i32 0), i32 7 }, { i8 addrspace(1)*, i8*, i8*, i32 } { i8 addrspace(1)* bitcast (i32 addrspace(1)* @_ZZ3baziE5c_one to i8 addrspace(1)*), i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str.1, i32 0, i32 0), i32 12 }], section "llvm.metadata"
+
+; Function Attrs: nounwind
+define spir_kernel void @_ZTSZ4mainE15kernel_function() #0 !kernel_arg_addr_space !4 !kernel_arg_access_qual !4 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !4 {
+entry:
+  %0 = alloca %"class._ZTSZ4mainE3$_0.anon", align 1
+  %1 = bitcast %"class._ZTSZ4mainE3$_0.anon"* %0 to i8*
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %1) #4
+  %2 = addrspacecast %"class._ZTSZ4mainE3$_0.anon"* %0 to %"class._ZTSZ4mainE3$_0.anon" addrspace(4)*
+  call spir_func void @"_ZZ4mainENK3$_0clEv"(%"class._ZTSZ4mainE3$_0.anon" addrspace(4)* %2)
+  %3 = bitcast %"class._ZTSZ4mainE3$_0.anon"* %0 to i8*
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %3) #4
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1
+
+; Function Attrs: inlinehint nounwind
+define internal spir_func void @"_ZZ4mainENK3$_0clEv"(%"class._ZTSZ4mainE3$_0.anon" addrspace(4)* %this) #2 align 2 {
+entry:
+  %this.addr = alloca %"class._ZTSZ4mainE3$_0.anon" addrspace(4)*, align 8
+  store %"class._ZTSZ4mainE3$_0.anon" addrspace(4)* %this, %"class._ZTSZ4mainE3$_0.anon" addrspace(4)** %this.addr, align 8, !tbaa !5
+  %this1 = load %"class._ZTSZ4mainE3$_0.anon" addrspace(4)*, %"class._ZTSZ4mainE3$_0.anon" addrspace(4)** %this.addr, align 8
+  call spir_func void @_Z3fooi(i32 128)
+  call spir_func void @_Z3barc(i8 signext 42)
+  call spir_func void @_Z3bazi(i32 16)
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1
+
+; Function Attrs: nounwind
+define spir_func void @_Z3fooi(i32 %a) #3 {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4, !tbaa !9
+  %0 = load i32, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @_ZZ3fooiE5a_one to i32 addrspace(4)*), align 4, !tbaa !9
+  %1 = load i32, i32* %a.addr, align 4, !tbaa !9
+  %add = add nsw i32 %0, %1
+  store i32 %add, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @_ZZ3fooiE5a_one to i32 addrspace(4)*), align 4, !tbaa !9
+  ret void
+}
+
+; Function Attrs: nounwind
+define spir_func void @_Z3barc(i8 signext %b) #3 {
+entry:
+  %b.addr = alloca i8, align 1
+  store i8 %b, i8* %b.addr, align 1, !tbaa !11
+  %0 = load i8, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* @_ZZ3barcE5b_one to i8 addrspace(4)*), align 1, !tbaa !11
+  %conv = sext i8 %0 to i32
+  %1 = load i8, i8* %b.addr, align 1, !tbaa !11
+  %conv1 = sext i8 %1 to i32
+  %add = add nsw i32 %conv, %conv1
+  %conv2 = trunc i32 %add to i8
+  store i8 %conv2, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* @_ZZ3barcE5b_one to i8 addrspace(4)*), align 1, !tbaa !11
+  ret void
+}
+
+; Function Attrs: nounwind
+define spir_func void @_Z3bazi(i32 %c) #3 {
+entry:
+  %c.addr = alloca i32, align 4
+  store i32 %c, i32* %c.addr, align 4, !tbaa !9
+  %0 = load i32, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @_ZZ3baziE5c_one to i32 addrspace(4)*), align 4, !tbaa !9
+  %1 = load i32, i32* %c.addr, align 4, !tbaa !9
+  %add = add nsw i32 %0, %1
+  store i32 %add, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @_ZZ3baziE5c_one to i32 addrspace(4)*), align 4, !tbaa !9
+  ret void
+}
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!opencl.spir.version = !{!1}
+!spirv.Source = !{!2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{i32 4, i32 100000}
+!3 = !{!"clang version 9.0.0"}
+!4 = !{}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"any pointer", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C++ TBAA"}
+!9 = !{!10, !10, i64 0}
+!10 = !{!"int", !7, i64 0}
+!11 = !{!7, !7, i64 0}
diff --git a/llvm-spirv/test/OpBitReverse_i32.ll b/llvm-spirv/test/OpBitReverse_i32.ll
new file mode 100644
index 0000000000000..7b29c327e98b0
--- /dev/null
+++ b/llvm-spirv/test/OpBitReverse_i32.ll
@@ -0,0 +1,40 @@
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv -spirv-text %t.bc -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: llvm-spirv %t.bc -o %t.spv
+; RUN: spirv-val %t.spv
+; RUN: llvm-spirv -r %t.spv -o - | llvm-dis -o - | FileCheck %s --check-prefix=CHECK-LLVM
+
+; CHECK-SPIRV: 4 TypeInt [[int:[0-9]+]] 32
+; CHECK-SPIRV: 4 BitReverse [[int]]
+
+; CHECK-LLVM: call i32 @llvm.bitreverse.i32
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+; Function Attrs: convergent nounwind writeonly
+define spir_kernel void @testBitRev(i32 %a, i32 %b, i32 %c, i32 addrspace(1)* nocapture %res) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+entry:
+  %call = tail call i32 @llvm.bitreverse.i32(i32 %b)
+  store i32 %call, i32 addrspace(1)* %res, align 4, !tbaa !7
+  ret void
+}
+
+declare i32 @llvm.bitreverse.i32(i32)
+
+attributes #0 = { convergent nounwind writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 2, i32 0}
+!3 = !{i32 0, i32 0, i32 0, i32 1}
+!4 = !{!"none", !"none", !"none", !"none"}
+!5 = !{!"int", !"int", !"int", !"int*"}
+!6 = !{!"", !"", !"", !""}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !9, i64 0}
+!9 = !{!"omnipotent char", !10, i64 0}
+!10 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm-spirv/test/OpBitReverse_v2i16.ll b/llvm-spirv/test/OpBitReverse_v2i16.ll
new file mode 100644
index 0000000000000..76f4bb39e7e1b
--- /dev/null
+++ b/llvm-spirv/test/OpBitReverse_v2i16.ll
@@ -0,0 +1,41 @@
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv -spirv-text %t.bc -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: llvm-spirv %t.bc -o %t.spv
+; RUN: spirv-val %t.spv
+; RUN: llvm-spirv -r %t.spv -o - | llvm-dis -o - | FileCheck %s --check-prefix=CHECK-LLVM
+
+; CHECK-SPIRV: 4 TypeInt [[short:[0-9]+]] 16
+; CHECK-SPIRV: 4 TypeVector [[short2:[0-9]+]] [[short]] 2
+; CHECK-SPIRV: 4 BitReverse [[short2]]
+
+; CHECK-LLVM: call <2 x i16> @llvm.bitreverse.v2i16
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+; Function Attrs: convergent nounwind writeonly
+define spir_kernel void @testBitRev(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> addrspace(1)* nocapture %res) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+entry:
+  %call = tail call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %b)
+  store <2 x i16> %call, <2 x i16> addrspace(1)* %res, align 4, !tbaa !7
+  ret void
+}
+
+declare <2 x i16> @llvm.bitreverse.v2i16(<2 x i16>)
+
+attributes #0 = { convergent nounwind writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 2, i32 0}
+!3 = !{i32 0, i32 0, i32 0, i32 1}
+!4 = !{!"none", !"none", !"none", !"none"}
+!5 = !{!"short2", !"short2", !"short2", !"short2*"}
+!6 = !{!"", !"", !"", !""}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"short2", !9, i64 0}
+!9 = !{!"omnipotent char", !10, i64 0}
+!10 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm-spirv/test/OpConvertPtrToU_narrowing.spvasm b/llvm-spirv/test/OpConvertPtrToU_narrowing.spvasm
new file mode 100644
index 0000000000000..7b53ed3b22f99
--- /dev/null
+++ b/llvm-spirv/test/OpConvertPtrToU_narrowing.spvasm
@@ -0,0 +1,27 @@
+; REQUIRES: spirv-as
+; RUN: spirv-as --target-env spv1.0 -o %t.spv %s
+; RUN: llvm-spirv -r -o - %t.spv | llvm-dis | FileCheck %s
+               OpCapability Addresses
+               OpCapability Kernel
+               OpCapability Int16
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "testNarrowingPtrToU"
+               OpName %a "a"
+               OpName %res "res"
+               OpName %entry "entry"
+       %uint = OpTypeInt 32 0
+     %ushort = OpTypeInt 16 0
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
+%_ptr_CrossWorkgroup_ushort = OpTypePointer CrossWorkgroup %ushort
+         %17 = OpTypeFunction %void %_ptr_CrossWorkgroup_uint %_ptr_CrossWorkgroup_ushort
+          %1 = OpFunction %void None %17
+          %a = OpFunctionParameter %_ptr_CrossWorkgroup_uint
+        %res = OpFunctionParameter %_ptr_CrossWorkgroup_ushort
+      %entry = OpLabel
+         %18 = OpConvertPtrToU %ushort %a
+               OpStore %res %18 Aligned 2
+               OpReturn
+               OpFunctionEnd
+
+; CHECK: ptrtoint i32 addrspace(1)* %a to i16
diff --git a/llvm-spirv/test/OpConvertPtrToU_widening.spvasm b/llvm-spirv/test/OpConvertPtrToU_widening.spvasm
new file mode 100644
index 0000000000000..29151b58453f6
--- /dev/null
+++ b/llvm-spirv/test/OpConvertPtrToU_widening.spvasm
@@ -0,0 +1,27 @@
+; REQUIRES: spirv-as
+; RUN: spirv-as --target-env spv1.0 -o %t.spv %s
+; RUN: llvm-spirv -r -o - %t.spv | llvm-dis | FileCheck %s
+               OpCapability Addresses
+               OpCapability Kernel
+               OpCapability Int64
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "testWideningPtrToU"
+               OpName %a "a"
+               OpName %res "res"
+               OpName %entry "entry"
+       %uint = OpTypeInt 32 0
+      %ulong = OpTypeInt 64 0
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
+%_ptr_CrossWorkgroup_ulong = OpTypePointer CrossWorkgroup %ulong
+         %17 = OpTypeFunction %void %_ptr_CrossWorkgroup_uint %_ptr_CrossWorkgroup_ulong
+          %1 = OpFunction %void None %17
+          %a = OpFunctionParameter %_ptr_CrossWorkgroup_uint
+        %res = OpFunctionParameter %_ptr_CrossWorkgroup_ulong
+      %entry = OpLabel
+         %18 = OpConvertPtrToU %ulong %a
+               OpStore %res %18 Aligned 8
+               OpReturn
+               OpFunctionEnd
+
+; CHECK: ptrtoint i32 addrspace(1)* %a to i64
diff --git a/llvm-spirv/test/OpConvertUToPtr_narrowing.spvasm b/llvm-spirv/test/OpConvertUToPtr_narrowing.spvasm
new file mode 100644
index 0000000000000..10ac6dc719403
--- /dev/null
+++ b/llvm-spirv/test/OpConvertUToPtr_narrowing.spvasm
@@ -0,0 +1,25 @@
+; REQUIRES: spirv-as
+; RUN: spirv-as --target-env spv1.0 -o %t.spv %s
+; RUN: llvm-spirv -r -o - %t.spv | llvm-dis | FileCheck %s
+               OpCapability Addresses
+               OpCapability Kernel
+               OpCapability Int64
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "testNarrowingUToPtr"
+               OpName %a "a"
+               OpName %entry "entry"
+       %uint = OpTypeInt 32 0
+     %uint_0 = OpConstant %uint 0
+      %ulong = OpTypeInt 64 0
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
+          %9 = OpTypeFunction %void %ulong
+          %1 = OpFunction %void None %9
+          %a = OpFunctionParameter %ulong
+      %entry = OpLabel
+         %10 = OpConvertUToPtr %_ptr_CrossWorkgroup_uint %a
+               OpStore %10 %uint_0 Aligned 4
+               OpReturn
+               OpFunctionEnd
+
+; CHECK: inttoptr i64 %a to i32 addrspace(1)*
diff --git a/llvm-spirv/test/OpConvertUToPtr_widening.spvasm b/llvm-spirv/test/OpConvertUToPtr_widening.spvasm
new file mode 100644
index 0000000000000..93da43843e535
--- /dev/null
+++ b/llvm-spirv/test/OpConvertUToPtr_widening.spvasm
@@ -0,0 +1,25 @@
+; REQUIRES: spirv-as
+; RUN: spirv-as --target-env spv1.0 -o %t.spv %s
+; RUN: llvm-spirv -r -o - %t.spv | llvm-dis | FileCheck %s
+               OpCapability Addresses
+               OpCapability Kernel
+               OpCapability Int16
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "testWideningUToPtr"
+               OpName %a "a"
+               OpName %entry "entry"
+       %uint = OpTypeInt 32 0
+     %uint_0 = OpConstant %uint 0
+     %ushort = OpTypeInt 16 0
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
+          %9 = OpTypeFunction %void %ushort
+          %1 = OpFunction %void None %9
+          %a = OpFunctionParameter %ushort
+      %entry = OpLabel
+         %10 = OpConvertUToPtr %_ptr_CrossWorkgroup_uint %a
+               OpStore %10 %uint_0 Aligned 4
+               OpReturn
+               OpFunctionEnd
+
+; CHECK: inttoptr i16 %a to i32 addrspace(1)*
diff --git a/llvm-spirv/test/OpFMod.spt b/llvm-spirv/test/OpFMod.spt
deleted file mode 100644
index 04f0f3f29145a..0000000000000
--- a/llvm-spirv/test/OpFMod.spt
+++ /dev/null
@@ -1,66 +0,0 @@
-119734787 65536 458752 27 0
-2 Capability Addresses
-2 Capability Linkage
-2 Capability Kernel
-2 Capability Float64
-2 Capability Int64
-5 ExtInstImport 1 "OpenCL.std"
-3 MemoryModel 2 2
-6 EntryPoint 6 2 "fmath_spv"
-3 Source 3 102000
-3 Name 3 "res"
-3 Name 4 "lhs"
-3 Name 5 "rhs"
-4 Name 6 "entry"
-9 Name 8 "__spirv_GlobalInvocationId"
-4 Decorate 7 FuncParamAttr 5
-2 DecorationGroup 7
-4 Decorate 8 BuiltIn 28
-3 Decorate 8 Constant
-11 Decorate 8 LinkageAttributes "__spirv_GlobalInvocationId" Import
-5 GroupDecorate 7 3 4 5
-4 TypeInt 9 64 0
-5 Constant 9 16 32 0
-4 TypeVector 10 9 3
-4 TypePointer 11 0 10
-2 TypeVoid 12
-3 TypeFloat 13 64
-4 TypePointer 14 5 13
-6 TypeFunction 15 12 14 14 14
-4 Variable 11 8 0
-
-5 Function 12 2 0 15
-3 FunctionParameter 14 3
-3 FunctionParameter 14 4
-3 FunctionParameter 14 5
-
-2 Label 6
-6 Load 10 17 8 2 0
-5 CompositeExtract 9 18 17 0
-5 ShiftLeftLogical 9 19 18 16
-5 ShiftRightArithmetic 9 20 19 16
-5 InBoundsPtrAccessChain 14 21 4 20
-6 Load 13 22 21 2 8
-5 InBoundsPtrAccessChain 14 23 5 20
-6 Load 13 24 23 2 8
-5 FMod 13 25 22 24
-5 InBoundsPtrAccessChain 14 26 3 20
-5 Store 26 25 2 8
-1 Return
-
-1 FunctionEnd
-
-; RUN: llvm-spirv %s -to-binary -o %t.spv
-; RUN: spirv-val %t.spv
-; RUN: llvm-spirv %t.spv -to-text -o %t.spt
-; RUN: FileCheck < %t.spt %s --check-prefix=CHECK-SPIRV
-; RUN: llvm-spirv -r %t.spv -o %t.bc
-; RUN: llvm-dis < %t.bc | FileCheck %s --check-prefix=CHECK-LLVM
-
-; CHECK-SPIRV: 5 FMod {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} {{[0-9]*}}
-
-; CHECK-LLVM: %frem.res = frem double %[[dividend:[0-9]+]], %[[divisor:[0-9]+]]
-; CHECK-LLVM: %copysign = call spir_func double @_Z8copysigndd(double %frem.res, double %[[divisor]]) #0
-; CHECK-LLVM: %[[ptr:[0-9]+]] = getelementptr inbounds double, double addrspace(1)* %res, i64 %{{[0-9]*}}
-; CHECK-LLVM: store double %copysign, double addrspace(1)* %[[ptr]], align 8
-  
diff --git a/llvm-spirv/test/OpFMod_f32.spvasm b/llvm-spirv/test/OpFMod_f32.spvasm
new file mode 100644
index 0000000000000..ffc0021926d5d
--- /dev/null
+++ b/llvm-spirv/test/OpFMod_f32.spvasm
@@ -0,0 +1,25 @@
+; REQUIRES: spirv-as
+; RUN: spirv-as --target-env spv1.0 -o %t.spv %s
+; RUN: llvm-spirv -r -o - %t.spv | llvm-dis | FileCheck %s
+               OpCapability Addresses
+               OpCapability Kernel
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "testFMod_f32"
+               OpName %a "a"
+               OpName %b "b"
+       %void = OpTypeVoid
+      %float = OpTypeFloat 32
+          %5 = OpTypeFunction %void %float %float
+          %1 = OpFunction %void None %5
+          %a = OpFunctionParameter %float
+          %b = OpFunctionParameter %float
+          %6 = OpLabel
+          %7 = OpFMod %float %a %b
+               OpReturn
+               OpFunctionEnd
+
+; CHECK-DAG: %frem.res = frem float %a, %b
+; CHECK-DAG: %copysign.res = call float @llvm.copysign.f32(float %frem.res, float %b)
+; CHECK-DAG: %fadd.res = fadd float {{%frem\.res, %b|%b, %frem\.res}}
+; CHECK-DAG: %cmp.res = fcmp one float {{%frem\.res, %copysign\.res|%copysign\.res, %frem\.res}}
+; CHECK: select i1 %cmp.res, float %fadd.res, float %copysign.res
diff --git a/llvm-spirv/test/OpFMod_v2f16.spvasm b/llvm-spirv/test/OpFMod_v2f16.spvasm
new file mode 100644
index 0000000000000..5675a7904fc76
--- /dev/null
+++ b/llvm-spirv/test/OpFMod_v2f16.spvasm
@@ -0,0 +1,27 @@
+; REQUIRES: spirv-as
+; RUN: spirv-as --target-env spv1.0 -o %t.spv %s
+; RUN: llvm-spirv -r -o - %t.spv | llvm-dis | FileCheck %s
+               OpCapability Addresses
+               OpCapability Kernel
+               OpCapability Float16
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "testFMod_v2f16"
+               OpName %a "a"
+               OpName %b "b"
+       %void = OpTypeVoid
+       %half = OpTypeFloat 16
+      %half2 = OpTypeVector %half 2
+          %5 = OpTypeFunction %void %half2 %half2
+          %1 = OpFunction %void None %5
+          %a = OpFunctionParameter %half2
+          %b = OpFunctionParameter %half2
+          %6 = OpLabel
+          %7 = OpFMod %half2 %a %b
+               OpReturn
+               OpFunctionEnd
+
+; CHECK-DAG: %frem.res = frem <2 x half> %a, %b
+; CHECK-DAG: %copysign.res = call <2 x half> @llvm.copysign.v2f16(<2 x half> %frem.res, <2 x half> %b)
+; CHECK-DAG: %fadd.res = fadd <2 x half> {{%frem\.res, %b|%b, %frem\.res}}
+; CHECK-DAG: %cmp.res = fcmp one <2 x half> {{%frem\.res, %copysign\.res|%copysign\.res, %frem\.res}}
+; CHECK: select <2 x i1> %cmp.res, <2 x half> %fadd.res, <2 x half> %copysign.res
diff --git a/llvm-spirv/test/OpLoopMergeDontUnrollHint1.spt b/llvm-spirv/test/OpLoopMergeDontUnrollHint1.spt
index 4a8217afb3d3e..59b3da40fb0f0 100644
--- a/llvm-spirv/test/OpLoopMergeDontUnrollHint1.spt
+++ b/llvm-spirv/test/OpLoopMergeDontUnrollHint1.spt
@@ -83,6 +83,6 @@
 ; RUN: llvm-spirv -r %t.spv -o %t.bc
 ; RUN: llvm-dis < %t.bc | FileCheck %s --check-prefix=CHECK-LLVM
 
-; CHECK-LLVM: br i1 %{{[0-9]+}}, label %{{[0-9]+}}, label %{{[0-9]+}}, !llvm.loop ![[MD:[0-9]+]]
+; CHECK-LLVM: br label %{{[0-9]+}}, !llvm.loop ![[MD:[0-9]+]]
 ; CHECK-LLVM: ![[MD]] = distinct !{![[MD]], ![[MD_unroll:[0-9]+]]}
 ; CHECK-LLVM: ![[MD_unroll]] = !{!"llvm.loop.unroll.disable"}
diff --git a/llvm-spirv/test/OpLoopMergeNone.spt b/llvm-spirv/test/OpLoopMergeNone.spt
index a17e4524bf5a9..734143f1abbef 100644
--- a/llvm-spirv/test/OpLoopMergeNone.spt
+++ b/llvm-spirv/test/OpLoopMergeNone.spt
@@ -84,5 +84,5 @@
 ; RUN: llvm-spirv -r %t.spv -o %t.bc
 ; RUN: llvm-dis < %t.bc | FileCheck %s --check-prefix=CHECK-LLVM
 
-; CHECK-LLVM: br i1 %{{[0-9]+}}, label %{{[0-9]+}}, label %{{[0-9]+}}, !llvm.loop ![[MD:[0-9]+]]
+; CHECK-LLVM: br label %{{[0-9]+}}, !llvm.loop ![[MD:[0-9]+]]
 ; CHECK-LLVM: ![[MD]] = distinct !{![[MD]]}
diff --git a/llvm-spirv/test/OpLoopMergePartialUnroll.spt b/llvm-spirv/test/OpLoopMergePartialUnroll.spt
index 5f3fab6b6025c..9118fc63324ab 100644
--- a/llvm-spirv/test/OpLoopMergePartialUnroll.spt
+++ b/llvm-spirv/test/OpLoopMergePartialUnroll.spt
@@ -83,6 +83,6 @@
 ; RUN: llvm-spirv -r %t.spv -o %t.bc
 ; RUN: llvm-dis < %t.bc | FileCheck %s --check-prefix=CHECK-LLVM
 
-; CHECK-LLVM: br i1 %{{[0-9]+}}, label %{{[0-9]+}}, label %{{[0-9]+}}, !llvm.loop ![[MD:[0-9]+]]
+; CHECK-LLVM: br label %{{[0-9]+}}, !llvm.loop ![[MD:[0-9]+]]
 ; CHECK-LLVM: ![[MD]] = distinct !{![[MD]], ![[MD_unroll:[0-9]+]]}
 ; CHECK-LLVM: ![[MD_unroll]] = !{!"llvm.loop.unroll.count", i32 4}
diff --git a/llvm-spirv/test/OpLoopMergeUnroll.spt b/llvm-spirv/test/OpLoopMergeUnroll.spt
index ad498b956b989..d976168d19fbb 100644
--- a/llvm-spirv/test/OpLoopMergeUnroll.spt
+++ b/llvm-spirv/test/OpLoopMergeUnroll.spt
@@ -84,6 +84,6 @@
 ; RUN: llvm-spirv -r %t.spv -o %t.bc
 ; RUN: llvm-dis < %t.bc | FileCheck %s --check-prefix=CHECK-LLVM
 
-; CHECK-LLVM: br i1 %{{[0-9]+}}, label %{{[0-9]+}}, label %{{[0-9]+}}, !llvm.loop ![[MD:[0-9]+]]
+; CHECK-LLVM: br label %{{[0-9]+}}, !llvm.loop ![[MD:[0-9]+]]
 ; CHECK-LLVM: ![[MD]] = distinct !{![[MD]], ![[MD_unroll:[0-9]+]]}
 ; CHECK-LLVM: ![[MD_unroll]] = !{!"llvm.loop.unroll.enable"}
diff --git a/llvm-spirv/test/OpSMod_i32.spvasm b/llvm-spirv/test/OpSMod_i32.spvasm
new file mode 100644
index 0000000000000..2705f83183b69
--- /dev/null
+++ b/llvm-spirv/test/OpSMod_i32.spvasm
@@ -0,0 +1,27 @@
+; REQUIRES: spirv-as
+; RUN: spirv-as --target-env spv1.0 -o %t.spv %s
+; RUN: llvm-spirv -r -o - %t.spv | llvm-dis | FileCheck %s
+               OpCapability Addresses
+               OpCapability Kernel
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "testSMod_i32"
+               OpName %a "a"
+               OpName %b "b"
+       %void = OpTypeVoid
+       %uint = OpTypeInt 32 0
+          %5 = OpTypeFunction %void %uint %uint
+          %1 = OpFunction %void None %5
+          %a = OpFunctionParameter %uint
+          %b = OpFunctionParameter %uint
+          %6 = OpLabel
+          %7 = OpSMod %uint %a %b
+               OpReturn
+               OpFunctionEnd
+
+; CHECK-DAG: %srem.res = srem i32 %a, %b
+; CHECK-DAG: %xor.res = xor i32 {{%a, %b|%b, %a}}
+; CHECK-DAG: %cmpsign.res = icmp slt i32 %xor.res, 0
+; CHECK-DAG: %cmpsrem.res = icmp ne i32 {{%srem\.res, 0|0, %srem\.res}}
+; CHECK-DAG: %add.res = add nsw i32 {{%srem\.res, %b|%b, %srem\.res}}
+; CHECK-DAG: %cmp.res = and i1 {{%cmpsign\.res, %cmpsrem\.res|%cmpsrem\.res, %cmpsign\.res}}
+; CHECK: select i1 %cmp.res, i32 %add.res, i32 %srem.res
diff --git a/llvm-spirv/test/OpSMod_v2i16.spvasm b/llvm-spirv/test/OpSMod_v2i16.spvasm
new file mode 100644
index 0000000000000..b64e15d73622b
--- /dev/null
+++ b/llvm-spirv/test/OpSMod_v2i16.spvasm
@@ -0,0 +1,29 @@
+; REQUIRES: spirv-as
+; RUN: spirv-as --target-env spv1.0 -o %t.spv %s
+; RUN: llvm-spirv -r -o - %t.spv | llvm-dis | FileCheck %s
+               OpCapability Addresses
+               OpCapability Kernel
+               OpCapability Int16
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "testSMod_v2i16"
+               OpName %a "a"
+               OpName %b "b"
+       %void = OpTypeVoid
+     %ushort = OpTypeInt 16 0
+    %ushort2 = OpTypeVector %ushort 2
+          %5 = OpTypeFunction %void %ushort2 %ushort2
+          %1 = OpFunction %void None %5
+          %a = OpFunctionParameter %ushort2
+          %b = OpFunctionParameter %ushort2
+          %6 = OpLabel
+          %7 = OpSMod %ushort2 %a %b
+               OpReturn
+               OpFunctionEnd
+
+; CHECK-DAG: %srem.res = srem <2 x i16> %a, %b
+; CHECK-DAG: %xor.res = xor <2 x i16> {{%a, %b|%b, %a}}
+; CHECK-DAG: %cmpsign.res = icmp slt <2 x i16> %xor.res, zeroinitializer
+; CHECK-DAG: %cmpsrem.res = icmp ne <2 x i16> {{%srem\.res, zeroinitializer|zeroinitializer, %srem\.res}}
+; CHECK-DAG: %add.res = add nsw <2 x i16> {{%srem\.res, %b|%b, %srem\.res}}
+; CHECK-DAG: %cmp.res = and <2 x i1> {{%cmpsign\.res, %cmpsrem\.res|%cmpsrem\.res, %cmpsign\.res}}
+; CHECK: select <2 x i1> %cmp.res, <2 x i16> %add.res, <2 x i16> %srem.res
diff --git a/llvm-spirv/test/copy_object.spt b/llvm-spirv/test/copy_object.spt
index 58a5126dcf58c..3587a4c966173 100644
--- a/llvm-spirv/test/copy_object.spt
+++ b/llvm-spirv/test/copy_object.spt
@@ -38,9 +38,6 @@
 
 1 FunctionEnd
 
-; FIXME: LIT comments/commands are moved at the end because llvm-spirv stops
-; reading the file after first ';' symbol
-
 ; RUN: llvm-spirv %s -to-binary -o %t.spv
 ; RUN: spirv-val %t.spv
 ; RUN: llvm-spirv -r %t.spv -o %t.bc
diff --git a/llvm-spirv/test/count-zero-bits.ll b/llvm-spirv/test/count-zero-bits.ll
new file mode 100644
index 0000000000000..7ed0e58b49f46
--- /dev/null
+++ b/llvm-spirv/test/count-zero-bits.ll
@@ -0,0 +1,62 @@
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc -spirv-text -o - | FileCheck %s
+; RUN: llvm-spirv %t.bc -o %t.spv
+; RUN: spirv-val %t.spv
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-unknown"
+
+; CHECK: ExtInstImport [[extinst_id:[0-9]+]] "OpenCL.std"
+
+; CHECK: Function
+; CHECK: 6 ExtInst {{[0-9]+}} {{[0-9]+}} [[extinst_id]] clz
+; CHECK: FunctionEnd
+
+; Function Attrs: nounwind readnone
+define spir_func i32 @TestClz(i32 %x) local_unnamed_addr #0 {
+entry:
+  %0 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
+  ret i32 %0
+}
+
+; CHECK: Function
+; CHECK: 6 ExtInst {{[0-9]+}} {{[0-9]+}} [[extinst_id]] ctz
+; CHECK: FunctionEnd
+
+; Function Attrs: nounwind readnone
+define spir_func i32 @TestCtz(i32 %x) local_unnamed_addr #0 {
+entry:
+  %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
+  ret i32 %0
+}
+
+; CHECK: Function
+; CHECK: 6 ExtInst {{[0-9]+}} {{[0-9]+}} [[extinst_id]] ctz
+; CHECK: FunctionEnd
+
+; Function Attrs: nounwind readnone
+define spir_func <4 x i32> @TestCtzVec(<4 x i32> %x) local_unnamed_addr #0 {
+entry:
+  %0 = tail call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %x, i1 true)
+  ret <4 x i32> %0
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare i32 @llvm.ctlz.i32(i32, i1 immarg) #1
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare i32 @llvm.cttz.i32(i32, i1 immarg) #1
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1 immarg) #1
+
+attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone speculatable willreturn }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 0}
+!2 = !{i32 1, i32 2}
diff --git a/llvm-spirv/test/linkage-name.spt b/llvm-spirv/test/linkage-name.spt
index 2cae495b49197..241c600b545b0 100644
--- a/llvm-spirv/test/linkage-name.spt
+++ b/llvm-spirv/test/linkage-name.spt
@@ -23,9 +23,6 @@
 
 1 FunctionEnd
 
-; FIXME: LIT comments/commands are moved at the end because llvm-spirv stops
-; reading the file after first ';' symbol
-
 ; RUN: llvm-spirv %s -to-binary -o %t.spv
 ; RUN: spirv-val %t.spv
 ; RUN: llvm-spirv -r %t.spv -o %t.bc
diff --git a/llvm-spirv/test/matrix_times_scalar.spt b/llvm-spirv/test/matrix_times_scalar.spt
new file mode 100644
index 0000000000000..fbf4ba9055359
--- /dev/null
+++ b/llvm-spirv/test/matrix_times_scalar.spt
@@ -0,0 +1,57 @@
+; RUN: llvm-spirv %s -to-binary -o %t.spv
+; RUN: spirv-val %t.spv
+; RUN: llvm-spirv -r %t.spv -o %t.bc
+; RUN: llvm-dis < %t.bc | FileCheck %s --check-prefix=CHECK-LLVM
+
+; CHECK-LLVM: %1 = load [4 x <4 x float>], [4 x <4 x float>]* %lhs
+; CHECK-LLVM: %2 = load float, float* %rhs
+; CHECK-LLVM: %.splatinsert = insertelement <4 x float> undef, float %2, i32 0
+; CHECK-LLVM: %3 = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-LLVM: %4 = extractvalue [4 x <4 x float>] %1, 0
+; CHECK-LLVM: %5 = fmul <4 x float> %4, %3
+; CHECK-LLVM: %6 = insertvalue [4 x <4 x float>] undef, <4 x float> %5, 0
+; CHECK-LLVM: %7 = extractvalue [4 x <4 x float>] %1, 1
+; CHECK-LLVM: %8 = fmul <4 x float> %7, %3
+; CHECK-LLVM: %9 = insertvalue [4 x <4 x float>] %6, <4 x float> %8, 1
+; CHECK-LLVM: %10 = extractvalue [4 x <4 x float>] %1, 2
+; CHECK-LLVM: %11 = fmul <4 x float> %10, %3
+; CHECK-LLVM: %12 = insertvalue [4 x <4 x float>] %9, <4 x float> %11, 2
+; CHECK-LLVM: %13 = extractvalue [4 x <4 x float>] %1, 3
+; CHECK-LLVM: %14 = fmul <4 x float> %13, %3
+; CHECK-LLVM: %15 = insertvalue [4 x <4 x float>] %12, <4 x float> %14, 3
+; CHECK-LLVM: store [4 x <4 x float>] %15, [4 x <4 x float>]* %res
+
+119734787 65536 458752 21 0
+2 Capability Addresses
+2 Capability Linkage
+2 Capability Kernel
+2 Capability Float64
+2 Capability Matrix
+3 MemoryModel 2 2
+8 EntryPoint 6 20 "matrix_times_scalar"
+3 Source 3 102000
+3 Name 12 "res"
+3 Name 13 "lhs"
+3 Name 14 "rhs"
+
+2 TypeVoid 5
+3 TypeFloat 6 32
+4 TypeVector 7 6 4
+4 TypeMatrix 8 7 4
+4 TypePointer 9 7 8             ; 9  : Pointer to Matrix
+4 TypePointer 10 7 6            ; 10 : Pointer to Scalar
+6 TypeFunction 11 5 9 9 10
+
+5 Function 5 20 0 11            ; lhs x rhs -> res
+3 FunctionParameter 9 12        ; res : Pointer to Matrix
+3 FunctionParameter 9 13        ; lhs : Pointer to Matrix
+3 FunctionParameter 10 14       ; rhs : Pointer to Scalar
+
+2 Label 15
+4 Load 8 16 13
+4 Load 6 17 14
+5 MatrixTimesScalar 8 18 16 17
+3 Store 12 18
+1 Return
+
+1 FunctionEnd
diff --git a/llvm-spirv/test/matrix_times_vector.spt b/llvm-spirv/test/matrix_times_vector.spt
new file mode 100644
index 0000000000000..0726381d440b5
--- /dev/null
+++ b/llvm-spirv/test/matrix_times_vector.spt
@@ -0,0 +1,68 @@
+119734787 65536 458752 21 0
+2 Capability Addresses
+2 Capability Linkage
+2 Capability Kernel
+2 Capability Float64
+2 Capability Matrix
+3 MemoryModel 2 2
+8 EntryPoint 6 20 "matrix_times_vector"
+3 Source 3 102000
+3 Name 12 "res"
+3 Name 13 "lhs"
+3 Name 14 "rhs"
+
+2 TypeVoid 5
+3 TypeFloat 6 32
+4 TypeVector 7 6 4
+4 TypeMatrix 8 7 4
+4 TypePointer 9 7 8
+4 TypePointer 10 7 7
+6 TypeFunction 11 5 10 9 10
+
+5 Function 5 20 0 11
+3 FunctionParameter 10 12
+3 FunctionParameter 9 13
+3 FunctionParameter 10 14
+
+2 Label 15
+4 Load 8 16 13
+4 Load 7 17 14
+5 MatrixTimesVector 7 18 16 17
+3 Store 12 18
+1 Return
+
+1 FunctionEnd
+
+; RUN: llvm-spirv %s -to-binary -o %t.spv
+; RUN: spirv-val %t.spv
+; RUN: llvm-spirv -r %t.spv -o %t.bc
+; RUN: llvm-dis < %t.bc | FileCheck %s --check-prefix=CHECK-LLVM
+
+; CHECK-LLVM: %1 = load [4 x <4 x float>], [4 x <4 x float>]* %lhs
+; CHECK-LLVM: %2 = load <4 x float>, <4 x float>* %rhs
+; CHECK-LLVM: %3 = extractelement <4 x float> %2, i32 0
+; CHECK-LLVM: %.splatinsert = insertelement <4 x float> undef, float %3, i32 0
+; CHECK-LLVM: %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-LLVM: %4 = extractvalue [4 x <4 x float>] %1, 0
+; CHECK-LLVM: %5 = fmul <4 x float> %.splat, %4
+; CHECK-LLVM: %6 = fadd <4 x float> zeroinitializer, %5
+; CHECK-LLVM: %7 = extractelement <4 x float> %2, i32 1
+; CHECK-LLVM: %.splatinsert1 = insertelement <4 x float> undef, float %7, i32 0
+; CHECK-LLVM: %.splat2 = shufflevector <4 x float> %.splatinsert1, <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-LLVM: %8 = extractvalue [4 x <4 x float>] %1, 1
+; CHECK-LLVM: %9 = fmul <4 x float> %.splat2, %8
+; CHECK-LLVM: %10 = fadd <4 x float> %6, %9
+; CHECK-LLVM: %11 = extractelement <4 x float> %2, i32 2
+; CHECK-LLVM: %.splatinsert3 = insertelement <4 x float> undef, float %11, i32 0
+; CHECK-LLVM: %.splat4 = shufflevector <4 x float> %.splatinsert3, <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-LLVM: %12 = extractvalue [4 x <4 x float>] %1, 2
+; CHECK-LLVM: %13 = fmul <4 x float> %.splat4, %12
+; CHECK-LLVM: %14 = fadd <4 x float> %10, %13
+; CHECK-LLVM: %15 = extractelement <4 x float> %2, i32 3
+; CHECK-LLVM: %.splatinsert5 = insertelement <4 x float> undef, float %15, i32 0
+; CHECK-LLVM: %.splat6 = shufflevector <4 x float> %.splatinsert5, <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-LLVM: %16 = extractvalue [4 x <4 x float>] %1, 3
+; CHECK-LLVM: %17 = fmul <4 x float> %.splat6, %16
+; CHECK-LLVM: %18 = fadd <4 x float> %14, %17
+; CHECK-LLVM: store <4 x float> %18, <4 x float>* %res
+; CHECK-LLVM: ret void
diff --git a/llvm-spirv/test/opundef.spt b/llvm-spirv/test/opundef.spt
index 3a8356b00fd18..2a8e433392d13 100644
--- a/llvm-spirv/test/opundef.spt
+++ b/llvm-spirv/test/opundef.spt
@@ -38,9 +38,6 @@
 
 1 FunctionEnd 
 
-; FIXME: LIT comments/commands are moved at the end because llvm-spirv stops
-; reading the file after first ';' symbol
-
 ; RUN: llvm-spirv %s -to-binary -o %t.spv
 ; RUN: spirv-val %t.spv
 ; RUN: llvm-spirv -r %t.spv -o %t.bc
diff --git a/llvm-spirv/test/redundant_word.spt b/llvm-spirv/test/redundant_word.spt
index a6f500b76db75..e851292f02bb0 100644
--- a/llvm-spirv/test/redundant_word.spt
+++ b/llvm-spirv/test/redundant_word.spt
@@ -28,9 +28,6 @@
 1 FunctionEnd 
 
 
-; FIXME: LIT comments/commands are moved at the end because llvm-spirv stops
-; reading the file after first ';' symbol
-
 ; RUN: llvm-spirv %s -to-binary -o %t.spv
 ; RUN: spirv-val %t.spv
 ; RUN: llvm-spirv -r %t.spv -o %t.bc
diff --git a/llvm-spirv/test/transcoding/ConvertPtr.cl b/llvm-spirv/test/transcoding/ConvertPtr.cl
index 14f48731894e3..8f4bf2562bfc4 100644
--- a/llvm-spirv/test/transcoding/ConvertPtr.cl
+++ b/llvm-spirv/test/transcoding/ConvertPtr.cl
@@ -5,12 +5,23 @@
 // RUN: llvm-spirv -r %t.spv -o %t.rev.bc
 // RUN: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM
 
-kernel void test(global int *a, global unsigned long *res) {
+kernel void testConvertPtrToU(global int *a, global unsigned long *res) {
   res[0] = (unsigned long)&a[0];
 }
 
-// CHECK-SPIRV: ConvertPtrToU
+// CHECK-SPIRV: 4 ConvertPtrToU
 
-// CHECK-LLVM-LABEL: @test
+// CHECK-LLVM-LABEL: @testConvertPtrToU
 // CHECK-LLVM: %0 = ptrtoint i32 addrspace(1)* %a to i32
 // CHECK-LLVM: zext i32 %0 to i64
+
+kernel void testConvertUToPtr(unsigned long a) {
+  global unsigned int *res = (global unsigned int *)a;
+  res[0] = 0;
+}
+
+// CHECK-SPIRV: 4 ConvertUToPtr
+
+// CHECK-LLVM-LABEL: @testConvertUToPtr
+// CHECK-LLVM: %[[Conv:[a-z]+]] = trunc i64 %a to i32
+// CHECK-LLVM: inttoptr i32 %[[Conv]] to i32 addrspace(1)*
diff --git a/llvm-spirv/test/transcoding/FPGALoopAttr.ll b/llvm-spirv/test/transcoding/FPGALoopAttr.ll
index 0f0b1913a4b50..4337fb2f58050 100644
--- a/llvm-spirv/test/transcoding/FPGALoopAttr.ll
+++ b/llvm-spirv/test/transcoding/FPGALoopAttr.ll
@@ -161,11 +161,11 @@ attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide
 !10 = !{!"llvm.loop.max_concurrency.count", i32 2}
 !11 = distinct !{!11, !8, !10}
 
-; CHECK-LLVM: br i1 %cmp, label %for.body, label %for.end, !llvm.loop ![[MD_A:[0-9]+]]
-; CHECK-LLVM: br i1 %cmp{{[0-9]+}}, label %for.body{{[0-9]+}}, label %for.end{{[0-9]+}}, !llvm.loop ![[MD_B:[0-9]+]]
-; CHECK-LLVM: br i1 %cmp{{[0-9]+}}, label %for.body{{[0-9]+}}, label %for.end{{[0-9]+}}, !llvm.loop ![[MD_C:[0-9]+]]
-; CHECK-LLVM: br i1 %cmp{{[0-9]+}}, label %for.body{{[0-9]+}}, label %for.end{{[0-9]+}}, !llvm.loop ![[MD_D:[0-9]+]]
-; CHECK-LLVM: br i1 %cmp{{[0-9]+}}, label %for.body{{[0-9]+}}, label %for.end{{[0-9]+}}, !llvm.loop ![[MD_E:[0-9]+]]
+; CHECK-LLVM: br label %for.cond{{[0-9]*}}, !llvm.loop ![[MD_A:[0-9]+]]
+; CHECK-LLVM: br label %for.cond{{[0-9]+}}, !llvm.loop ![[MD_B:[0-9]+]]
+; CHECK-LLVM: br label %for.cond{{[0-9]+}}, !llvm.loop ![[MD_C:[0-9]+]]
+; CHECK-LLVM: br label %for.cond{{[0-9]+}}, !llvm.loop ![[MD_D:[0-9]+]]
+; CHECK-LLVM: br label %for.cond{{[0-9]+}}, !llvm.loop ![[MD_E:[0-9]+]]
 
 ; CHECK-LLVM: ![[MD_A]] = distinct !{![[MD_A]], ![[MD_ivdep_enable:[0-9]+]]}
 ; CHECK-LLVM: ![[MD_ivdep_enable]] = !{!"llvm.loop.ivdep.enable"}
diff --git a/llvm-spirv/test/transcoding/GenericCastToPtr.cl b/llvm-spirv/test/transcoding/GenericCastToPtr.cl
new file mode 100644
index 0000000000000..2088c67bf4c89
--- /dev/null
+++ b/llvm-spirv/test/transcoding/GenericCastToPtr.cl
@@ -0,0 +1,69 @@
+// RUN: %clang_cc1 -triple spir-unknown-unknown -O1 -cl-std=CL2.0 -emit-llvm-bc -finclude-default-header %s -o %t.bc
+// RUN: llvm-spirv %t.bc -spirv-text -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+// RUN: llvm-spirv %t.bc -o %t.spv
+// RUN: spirv-val %t.spv
+// RUN: llvm-spirv -r %t.spv -o %t.rev.bc
+// RUN: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM
+
+// CHECK-SPIRV: 4 GenericCastToPtr
+
+// CHECK-LLVM-LABEL: @testGenericCastToPtrGlobal
+// CHECK-LLVM: %0 = addrspacecast <2 x i16> addrspace(4)* %a to <2 x i16> addrspace(1)*
+
+global short2 *testGenericCastToPtrGlobal(generic short2 *a) {
+  return (global short2 *)a;
+}
+
+// CHECK-SPIRV: 4 GenericCastToPtr
+
+// CHECK-LLVM-LABEL: @testGenericCastToPtrLocal
+// CHECK-LLVM: %0 = addrspacecast <2 x i16> addrspace(4)* %a to <2 x i16> addrspace(3)*
+
+local short2 *testGenericCastToPtrLocal(generic short2 *a) {
+  return (local short2 *)a;
+}
+
+// CHECK-SPIRV: 4 GenericCastToPtr
+
+// CHECK-LLVM-LABEL: @testGenericCastToPtrPrivate
+// CHECK-LLVM: %0 = addrspacecast <2 x i16> addrspace(4)* %a to <2 x i16>*
+
+private short2 *testGenericCastToPtrPrivate(generic short2 *a) {
+  return (private short2 *)a;
+}
+
+// CHECK-SPIRV: 5 GenericCastToPtrExplicit
+
+// CHECK-LLVM-LABEL: @testGenericCastToPtrExplicitGlobal
+// CHECK-LLVM: %[[VoidPtrCast:[0-9]+]] = bitcast <2 x i16> addrspace(4)* %a to i8 addrspace(4)*
+// CHECK-LLVM-NEXT: %[[AddrSpaceCast:[0-9]+]] = bitcast i8 addrspace(4)* %[[VoidPtrCast]] to i8 addrspace(4)*
+// CHECK-LLVM-NEXT: %.tmp = call spir_func i8 addrspace(1)* @__to_global(i8 addrspace(4)* %[[AddrSpaceCast]])
+// CHECK-LLVM: bitcast i8 addrspace(1)* %{{[0-9]+}} to <2 x i16> addrspace(1)*
+
+global short2 *testGenericCastToPtrExplicitGlobal(generic short2 *a) {
+  return to_global(a);
+}
+
+// CHECK-SPIRV: 5 GenericCastToPtrExplicit
+
+// CHECK-LLVM-LABEL: @testGenericCastToPtrExplicitLocal
+// CHECK-LLVM: %[[VoidPtrCast:[0-9]+]] = bitcast <2 x i16> addrspace(4)* %a to i8 addrspace(4)*
+// CHECK-LLVM-NEXT: %[[AddrSpaceCast:[0-9]+]] = bitcast i8 addrspace(4)* %[[VoidPtrCast]] to i8 addrspace(4)*
+// CHECK-LLVM-NEXT: %.tmp = call spir_func i8 addrspace(3)* @__to_local(i8 addrspace(4)* %[[AddrSpaceCast]])
+// CHECK-LLVM: bitcast i8 addrspace(3)* %{{[0-9]+}} to <2 x i16> addrspace(3)*
+
+local short2 *testGenericCastToPtrExplicitLocal(generic short2 *a) {
+  return to_local(a);
+}
+
+// CHECK-SPIRV: 5 GenericCastToPtrExplicit
+
+// CHECK-LLVM-LABEL: @testGenericCastToPtrExplicitPrivate
+// CHECK-LLVM: %[[VoidPtrCast:[0-9]+]] = bitcast <2 x i16> addrspace(4)* %a to i8 addrspace(4)*
+// CHECK-LLVM-NEXT: %[[AddrSpaceCast:[0-9]+]] = bitcast i8 addrspace(4)* %[[VoidPtrCast]] to i8 addrspace(4)*
+// CHECK-LLVM-NEXT: %.tmp = call spir_func i8* @__to_private(i8 addrspace(4)* %[[AddrSpaceCast]])
+// CHECK-LLVM: bitcast i8* %{{[0-9]+}} to <2 x i16>*
+
+private short2 *testGenericCastToPtrExplicitPrivate(generic short2 *a) {
+  return to_private(a);
+}
diff --git a/llvm-spirv/test/transcoding/OpVectorExtractDynamic.ll b/llvm-spirv/test/transcoding/OpVectorExtractDynamic.ll
new file mode 100644
index 0000000000000..b39ebad02975b
--- /dev/null
+++ b/llvm-spirv/test/transcoding/OpVectorExtractDynamic.ll
@@ -0,0 +1,45 @@
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv %t.bc -spirv-text -o %t.txt
+; RUN: FileCheck < %t.txt %s --check-prefix=CHECK-SPIRV
+; RUN: llvm-spirv %t.bc -o %t.spv
+; RUN: llvm-spirv -r %t.spv -o %t.rev.bc
+; RUN: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM
+
+; CHECK-LLVM: extractelement <2 x float> %vec, i32 %index
+
+; CHECK-SPIRV: 3 Name [[vec:[0-9]+]] "vec"
+; CHECK-SPIRV: 4 Name [[index:[0-9]+]] "index"
+; CHECK-SPIRV: 3 Name [[res:[0-9]+]] "res"
+
+; CHECK-SPIRV: 3 TypeFloat [[float:[0-9]+]] 32
+; CHECK-SPIRV: 4 TypeVector [[float2:[0-9]+]] [[float]] 2
+
+; CHECK-SPIRV: 5 VectorExtractDynamic [[float]] [[res]] [[vec]] [[index]]
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @test(float addrspace(1)* nocapture %out, <2 x float> %vec, i32 %index) #0 !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 {
+entry:
+  %res = extractelement <2 x float> %vec, i32 %index
+  store float %res, float addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+
+!opencl.enable.FP_CONTRACT = !{}
+!opencl.spir.version = !{!6}
+!opencl.ocl.version = !{!7}
+!opencl.used.extensions = !{!8}
+!opencl.used.optional.core.features = !{!8}
+
+!1 = !{i32 1, i32 0}
+!2 = !{!"none", !"none", !"none"}
+!3 = !{!"float*", !"float2", !"int"}
+!4 = !{!"float*", !"float2", !"int"}
+!5 = !{!"", !"", !""}
+!6 = !{i32 1, i32 2}
+!7 = !{i32 2, i32 0}
+!8 = !{}
diff --git a/llvm-spirv/test/transcoding/atomics_int64.spt b/llvm-spirv/test/transcoding/atomics_int64.spt
new file mode 100644
index 0000000000000..f6b310aeaa131
--- /dev/null
+++ b/llvm-spirv/test/transcoding/atomics_int64.spt
@@ -0,0 +1,83 @@
+119734787 65536 393230 34 0
+2 Capability Addresses
+2 Capability Linkage
+2 Capability Kernel
+2 Capability Int64
+2 Capability Int64Atomics
+2 Capability GenericPointer
+5 ExtInstImport 1 "OpenCL.std"
+3 MemoryModel 1 2
+3 Source 3 200000
+5 Decorate 8 LinkageAttributes "foo" Export
+4 TypeInt 3 64 0
+4 TypeInt 5 32 0
+4 Constant 5 17 1
+4 Constant 5 18 16
+2 TypeVoid 2
+4 TypePointer 4 8 3
+4 TypePointer 6 8 5
+9 TypeFunction 7 2 4 4 6 3 3 3
+2 TypeBool 32
+
+
+5 Function 2 8 0 7
+3 FunctionParameter 4 9
+3 FunctionParameter 4 10
+3 FunctionParameter 6 11
+3 FunctionParameter 3 12
+3 FunctionParameter 3 13
+3 FunctionParameter 3 14
+
+2 Label 15
+6 AtomicLoad 3 16 9 17 18
+5 AtomicStore 9 17 18 12
+7 AtomicExchange 3 19 9 17 18 12
+9 AtomicCompareExchange 3 20 9 17 18 18 12 14
+6 AtomicIIncrement 3 21 9 17 18
+6 AtomicIDecrement 3 22 9 17 18
+7 AtomicIAdd 3 23 9 17 18 12
+7 AtomicISub 3 24 9 17 18 12
+7 AtomicSMin 3 25 9 17 18 12
+7 AtomicUMin 3 26 10 17 18 13
+7 AtomicSMax 3 27 9 17 18 12
+7 AtomicUMax 3 28 10 17 18 13
+7 AtomicAnd 3 29 9 17 18 12
+7 AtomicOr 3 30 9 17 18 12
+7 AtomicXor 3 31 9 17 18 12
+6 AtomicFlagTestAndSet 32 33 11 17 18
+4 AtomicFlagClear 11 17 18
+1 Return
+
+1 FunctionEnd
+
+; RUN: llvm-spirv --to-binary %s -o %t.spv
+; RUN: spirv-val %t.spv
+; RUN: llvm-spirv --spirv-ocl-builtins-version=CL1.2 -r %t.spv -o - | llvm-dis -o %t.ll
+; RUN: FileCheck %s < %t.ll
+
+; OpAtomicLoad is emulated via atom_add(*p, 0)
+; CHECK: call spir_func i64 @_Z8atom_add
+
+; OpAtomicStore is emulated via atom_xchg(*p, val)
+; CHECK: call spir_func i64 @_Z9atom_xchg
+
+; CHECK: call spir_func i64 @_Z9atom_xchg
+; CHECK: call spir_func i64 @_Z12atom_cmpxchg
+; CHECK: call spir_func i64 @_Z8atom_inc
+; CHECK: call spir_func i64 @_Z8atom_dec
+; CHECK: call spir_func i64 @_Z8atom_add
+; CHECK: call spir_func i64 @_Z8atom_sub
+; CHECK: call spir_func i64 @_Z8atom_min
+; CHECK: call spir_func i64 @_Z8atom_min
+; CHECK: call spir_func i64 @_Z8atom_max
+; CHECK: call spir_func i64 @_Z8atom_max
+; CHECK: call spir_func i64 @_Z8atom_and
+; CHECK: call spir_func i64 @_Z7atom_or
+; CHECK: call spir_func i64 @_Z8atom_xor
+
+; OpAtomicFlagTestAndSet is emulated via atomic_xchg(*p, 1)
+; CHECK: call spir_func i32 @_Z11atomic_xchg
+
+; OpAtomicFlagClear is emulated via atomic_xchg(*p, 0)
+; CHECK: call spir_func i32 @_Z11atomic_xchg
+
diff --git a/llvm-spirv/test/transcoding/cl_intel_sub_groups.ll b/llvm-spirv/test/transcoding/cl_intel_sub_groups.ll
index e27faad6ddb77..ae470bdb458ee 100644
--- a/llvm-spirv/test/transcoding/cl_intel_sub_groups.ll
+++ b/llvm-spirv/test/transcoding/cl_intel_sub_groups.ll
@@ -20,14 +20,15 @@
 ;}
 
 ; RUN: llvm-as %s -o %t.bc
-; RUN: llvm-spirv %t.bc -o - -spirv-text | FileCheck %s --check-prefix=CHECK-SPIRV
-; RUN: llvm-spirv %t.bc -o %t.spv
+; RUN: llvm-spirv %t.bc -o - -spirv-text --spirv-ext=+SPV_INTEL_subgroups | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: llvm-spirv %t.bc -o %t.spv --spirv-ext=+SPV_INTEL_subgroups
 ; RUN: llvm-spirv -r %t.spv -o %t.rev.bc
 ; RUN: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM
 
 ; CHECK-SPIRV: Capability SubgroupShuffleINTEL
 ; CHECK-SPIRV: Capability SubgroupBufferBlockIOINTEL
 ; CHECK-SPIRV: Capability SubgroupImageBlockIOINTEL
+; CHECK-SPIRV: Extension "SPV_INTEL_subgroups"
 ; CHECK-SPIRV: Extension "cl_intel_subgroups"
 ; CHECK-SPIRV: Extension "cl_intel_subgroups_short"
 
diff --git a/llvm-spirv/test/transcoding/enqueue_kernel.cl b/llvm-spirv/test/transcoding/enqueue_kernel.cl
index aaa539e5f1089..7394366302cc2 100644
--- a/llvm-spirv/test/transcoding/enqueue_kernel.cl
+++ b/llvm-spirv/test/transcoding/enqueue_kernel.cl
@@ -10,6 +10,7 @@
 // CHECK-SPIRV: EntryPoint {{[0-9]+}} [[BlockKer2:[0-9]+]] "__device_side_enqueue_block_invoke_2_kernel"
 // CHECK-SPIRV: EntryPoint {{[0-9]+}} [[BlockKer3:[0-9]+]] "__device_side_enqueue_block_invoke_3_kernel"
 // CHECK-SPIRV: EntryPoint {{[0-9]+}} [[BlockKer4:[0-9]+]] "__device_side_enqueue_block_invoke_4_kernel"
+// CHECK-SPIRV: EntryPoint {{[0-9]+}} [[BlockKer5:[0-9]+]] "__device_side_enqueue_block_invoke_5_kernel"
 // CHECK-SPIRV: Name [[BlockGlb1:[0-9]+]] "__block_literal_global"
 // CHECK-SPIRV: Name [[BlockGlb2:[0-9]+]] "__block_literal_global.1"
 
@@ -41,6 +42,7 @@
 // CHECK-LLVM: @__block_literal_global.1 = internal addrspace(1) constant [[BlockTy1]] { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* null to i8 addrspace(4)*) }, align 4
 
 typedef struct {int a;} ndrange_t;
+#define NULL ((void*)0)
 
 kernel void device_side_enqueue(global int *a, global int *b, int i, char c0) {
   queue_t default_queue;
@@ -60,7 +62,7 @@ kernel void device_side_enqueue(global int *a, global int *b, int i, char c0) {
   // CHECK-LLVM: [[Block2:%[0-9]+]] = bitcast [[BlockTy2]]* %block to %struct.__opencl_block_literal_generic*
   // CHECK-LLVM: [[Block2Ptr:%[0-9]+]] = addrspacecast %struct.__opencl_block_literal_generic* [[Block2]] to i8 addrspace(4)*
   // CHECK-LLVM: [[BlockInv2:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_kernel to i8 addrspace(4)*
-  // CHECK-LLVM: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 0, %opencl.clk_event_t* addrspace(4)* null, %opencl.clk_event_t* addrspace(4)* null, i8 addrspace(4)* [[BlockInv2]], i8 addrspace(4)* [[Block2Ptr]])
+  // CHECK-LLVM: call i32 @__enqueue_kernel_basic(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i8 addrspace(4)* [[BlockInv2]], i8 addrspace(4)* [[Block2Ptr]])
   enqueue_kernel(default_queue, flags, ndrange,
                  ^(void) {
                    a[i] = c0;
@@ -121,20 +123,40 @@ kernel void device_side_enqueue(global int *a, global int *b, int i, char c0) {
   // CHECK-LLVM: [[Block1Tmp:%[0-9]+]] = bitcast [[BlockTy1]] addrspace(1)* @__block_literal_global.1 to i8 addrspace(1)*
   // CHECK-LLVM: [[Block1:%[0-9]+]] = addrspacecast i8 addrspace(1)* [[Block1Tmp]] to i8 addrspace(4)*
   // CHECK-LLVM: [[BlockInv1:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_4_kernel to i8 addrspace(4)*
-  // CHECK-LLVM: call i32 @__enqueue_kernel_events_varargs(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 0, %opencl.clk_event_t* addrspace(4)* null, %opencl.clk_event_t* addrspace(4)* null, i8 addrspace(4)* [[BlockInv1]], i8 addrspace(4)* [[Block1]], i32 3, i32* {{.*}})
+  // CHECK-LLVM: call i32 @__enqueue_kernel_varargs(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i8 addrspace(4)* [[BlockInv1]], i8 addrspace(4)* [[Block1]], i32 3, i32* {{.*}})
   enqueue_kernel(default_queue, flags, ndrange,
                  ^(local void *p1, local void *p2, local void *p3) {
                    return;
                  },
                  1, 2, 4);
+
+  // Emits block literal on stack and block kernel.
+
+  // CHECK-SPIRV: PtrCastToGeneric [[EventPtrTy]] [[Event1:[0-9]+]]
+
+  // CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGenTy]] [[BlockLit2:[0-9]+]]
+  // CHECK-SPIRV: EnqueueKernel [[Int32Ty]] {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}}
+  //                            [[ConstInt0]] [[EventNull]] [[Event1]]
+  //                            [[BlockKer5]] [[BlockLit5]] [[ConstInt20]] [[ConstInt8]]
+
+  // CHECK-LLVM: [[Block5:%[0-9]+]] = bitcast [[BlockTy3]]* %block14 to %struct.__opencl_block_literal_generic*
+  // CHECK-LLVM: [[Block5Ptr:%[0-9]+]] = addrspacecast %struct.__opencl_block_literal_generic* [[Block5]] to i8 addrspace(4)
+  // CHECK-LLVM: [[BlockInv5:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_5_kernel to i8 addrspace(4)*
+  // CHECK-LLVM: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 0, %opencl.clk_event_t* addrspace(4)* null, %opencl.clk_event_t* addrspace(4)* {{.*}}, i8 addrspace(4)* [[BlockInv5]], i8 addrspace(4)* [[Block5Ptr]])
+  enqueue_kernel(default_queue, flags, ndrange, 0, NULL, &clk_event,
+                 ^(void) {
+                   a[i] = b[i];
+                 });
 }
 
 // CHECK-SPIRV-DAG: Function [[VoidTy]] [[BlockKer1]] 0 [[BlockTy1]]
 // CHECK-SPIRV-DAG: Function [[VoidTy]] [[BlockKer2]] 0 [[BlockTy1]]
 // CHECK-SPIRV-DAG: Function [[VoidTy]] [[BlockKer3]] 0 [[BlockTy2]]
 // CHECK-SPIRV-DAG: Function [[VoidTy]] [[BlockKer4]] 0 [[BlockTy3]]
+// CHECK-SPIRV-DAG: Function [[VoidTy]] [[BlockKer5]] 0 [[BlockTy1]]
 
 // CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_kernel(i8 addrspace(4)*{{.*}})
 // CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_2_kernel(i8 addrspace(4)*{{.*}})
 // CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_3_kernel(i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}})
 // CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_4_kernel(i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}, i8 addrspace(3)*{{.*}}, i8 addrspace(3)*{{.*}})
+// CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_5_kernel(i8 addrspace(4)*{{.*}})
diff --git a/llvm-spirv/test/vector_times_scalar.spt b/llvm-spirv/test/vector_times_scalar.spt
index a0fcbe1f020ff..4ad2134da6dcb 100644
--- a/llvm-spirv/test/vector_times_scalar.spt
+++ b/llvm-spirv/test/vector_times_scalar.spt
@@ -49,9 +49,6 @@
 
 1 FunctionEnd 
 
-; FIXME: LIT comments/commands are moved at the end because llvm-spirv stops
-; reading the file after first ';' symbol
-
 ; RUN: llvm-spirv %s -to-binary -o %t.spv
 ; RUN: spirv-val %t.spv
 ; RUN: llvm-spirv -r %t.spv -o %t.bc
diff --git a/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp b/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp
index 335255b9977e6..f6a97501c7fe6 100644
--- a/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp
+++ b/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp
@@ -111,6 +111,11 @@ static cl::list<std::string>
            cl::value_desc("+SPV_extenstion1_name,-SPV_extension2_name"),
            cl::ValueRequired);
 
+static cl::opt<bool> SPIRVGenKernelArgNameMD(
+    "spirv-gen-kernel-arg-name-md", cl::init(false),
+    cl::desc("Enable generating OpenCL kernel argument name "
+             "metadata"));
+
 using SPIRV::ExtensionID;
 
 #ifdef _SPIRV_SUPPORT_TEXT_FMT
@@ -236,6 +241,7 @@ static int convertSPIRV() {
     }
     return 0;
   };
+
   if (OutputFile == "-")
     return Action(std::cout);
 
@@ -360,7 +366,8 @@ int main(int Ac, char **Av) {
   if (0 != Ret)
     return Ret;
 
-  SPIRV::TranslatorOpts Opts(MaxSPIRVVersion, ExtensionsStatus);
+  SPIRV::TranslatorOpts Opts(MaxSPIRVVersion, ExtensionsStatus,
+                             SPIRVGenKernelArgNameMD);
 
 #ifdef _SPIRV_SUPPORT_TEXT_FMT
   if (ToText && (ToBinary || IsReverse || IsRegularization)) {
diff --git a/sycl/include/CL/__spirv/spirv_vars.hpp b/sycl/include/CL/__spirv/spirv_vars.hpp
index a7415b3ce59d6..51ca2e8fcc37e 100644
--- a/sycl/include/CL/__spirv/spirv_vars.hpp
+++ b/sycl/include/CL/__spirv/spirv_vars.hpp
@@ -55,12 +55,12 @@ extern "C" const __attribute__((ocl_constant)) uint32_t __spirv_BuiltInSubgroupL
   };                                                                           \
                                                                                \
   template <class DstT> struct InitSizesST##POSTFIX<2, DstT> {                 \
-    static DstT initSize() { return {get##POSTFIX<0>(), get##POSTFIX<1>()}; }  \
+    static DstT initSize() { return {get##POSTFIX<1>(), get##POSTFIX<0>()}; }  \
   };                                                                           \
                                                                                \
   template <class DstT> struct InitSizesST##POSTFIX<3, DstT> {                 \
     static DstT initSize() {                                                   \
-      return {get##POSTFIX<0>(), get##POSTFIX<1>(), get##POSTFIX<2>()};        \
+      return {get##POSTFIX<2>(), get##POSTFIX<1>(), get##POSTFIX<0>()};        \
     }                                                                          \
   };                                                                           \
                                                                                \
diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp
index b52392ad1c119..8155af774f8c5 100644
--- a/sycl/include/CL/sycl/detail/cg.hpp
+++ b/sycl/include/CL/sycl/detail/cg.hpp
@@ -185,19 +185,12 @@ class HostKernel : public HostKernelBase {
   template <class ArgT = KernelArgType>
   typename std::enable_if<std::is_same<ArgT, sycl::id<Dims>>::value>::type
   runOnHost(const NDRDescT &NDRDesc) {
-    size_t XYZ[3] = {0};
-    sycl::id<Dims> ID;
-    for (; XYZ[2] < NDRDesc.GlobalSize[2]; ++XYZ[2]) {
-      XYZ[1] = 0;
-      for (; XYZ[1] < NDRDesc.GlobalSize[1]; ++XYZ[1]) {
-        XYZ[0] = 0;
-        for (; XYZ[0] < NDRDesc.GlobalSize[0]; ++XYZ[0]) {
-          for (int I = 0; I < Dims; ++I)
-            ID[I] = XYZ[I];
-          MKernel(ID);
-        }
-      }
-    }
+    sycl::range<Dims> Range(InitializedVal<Dims, range>::template get<0>());
+    for (int I = 0; I < Dims; ++I)
+      Range[I] = NDRDesc.GlobalSize[I];
+
+    detail::NDLoop<Dims>::iterate(
+        Range, [&](const sycl::id<Dims> &ID) { MKernel(ID); });
   }
 
   template <class ArgT = KernelArgType>
@@ -210,20 +203,11 @@ class HostKernel : public HostKernelBase {
     for (int I = 0; I < Dims; ++I)
       Range[I] = NDRDesc.GlobalSize[I];
 
-    for (; XYZ[2] < NDRDesc.GlobalSize[2]; ++XYZ[2]) {
-      XYZ[1] = 0;
-      for (; XYZ[1] < NDRDesc.GlobalSize[1]; ++XYZ[1]) {
-        XYZ[0] = 0;
-        for (; XYZ[0] < NDRDesc.GlobalSize[0]; ++XYZ[0]) {
-          for (int I = 0; I < Dims; ++I)
-            ID[I] = XYZ[I];
-
-          sycl::item<Dims, /*Offset=*/false> Item =
-              IDBuilder::createItem<Dims, false>(Range, ID);
-          MKernel(Item);
-        }
-      }
-    }
+    detail::NDLoop<Dims>::iterate(Range, [&](const sycl::id<Dims> ID) {
+      sycl::item<Dims, /*Offset=*/false> Item =
+          IDBuilder::createItem<Dims, false>(Range, ID);
+      MKernel(Item);
+    });
   }
 
   template <class ArgT = KernelArgType>
@@ -236,22 +220,13 @@ class HostKernel : public HostKernelBase {
       Range[I] = NDRDesc.GlobalSize[I];
       Offset[I] = NDRDesc.GlobalOffset[I];
     }
-    size_t XYZ[3] = {0};
-    sycl::id<Dims> ID;
-    for (; XYZ[2] < NDRDesc.GlobalSize[2]; ++XYZ[2]) {
-      XYZ[1] = 0;
-      for (; XYZ[1] < NDRDesc.GlobalSize[1]; ++XYZ[1]) {
-        XYZ[0] = 0;
-        for (; XYZ[0] < NDRDesc.GlobalSize[0]; ++XYZ[0]) {
-          for (int I = 0; I < Dims; ++I)
-            ID[I] = XYZ[I] + Offset[I];
-
-          sycl::item<Dims, /*Offset=*/true> Item =
-              IDBuilder::createItem<Dims, true>(Range, ID, Offset);
-          MKernel(Item);
-        }
-      }
-    }
+
+    detail::NDLoop<Dims>::iterate(Range, [&](const sycl::id<Dims> &ID) {
+      sycl::id<Dims> OffsetID = ID + Offset;
+      sycl::item<Dims, /*Offset=*/true> Item =
+          IDBuilder::createItem<Dims, true>(Range, OffsetID, Offset);
+      MKernel(Item);
+    });
   }
 
   template <class ArgT = KernelArgType>
diff --git a/sycl/include/CL/sycl/detail/common.hpp b/sycl/include/CL/sycl/detail/common.hpp
index 54e6f59cdf339..7db45cebb8756 100644
--- a/sycl/include/CL/sycl/detail/common.hpp
+++ b/sycl/include/CL/sycl/detail/common.hpp
@@ -158,9 +158,9 @@ struct NDLoopIterateImpl {
                     const LoopBoundTy<NDIMS> &Stride,
                     const LoopBoundTy<NDIMS> &UpperBound, FuncTy f,
                     LoopIndexTy<NDIMS> &Index) {
-
-    for (Index[DIM] = LowerBound[DIM]; Index[DIM] < UpperBound[DIM];
-         Index[DIM] += Stride[DIM]) {
+    constexpr size_t AdjIdx = NDIMS - 1 - DIM;
+    for (Index[AdjIdx] = LowerBound[AdjIdx]; Index[AdjIdx] < UpperBound[AdjIdx];
+         Index[AdjIdx] += Stride[AdjIdx]) {
 
       NDLoopIterateImpl<NDIMS, DIM - 1, LoopBoundTy, FuncTy, LoopIndexTy>{
           LowerBound, Stride, UpperBound, f, Index};
@@ -177,8 +177,9 @@ struct NDLoopIterateImpl<NDIMS, 0, LoopBoundTy, FuncTy, LoopIndexTy> {
                     const LoopBoundTy<NDIMS> &UpperBound, FuncTy f,
                     LoopIndexTy<NDIMS> &Index) {
 
-    for (Index[0] = LowerBound[0]; Index[0] < UpperBound[0];
-         Index[0] += Stride[0]) {
+    constexpr size_t AdjIdx = NDIMS - 1;
+    for (Index[AdjIdx] = LowerBound[AdjIdx]; Index[AdjIdx] < UpperBound[AdjIdx];
+         Index[AdjIdx] += Stride[AdjIdx]) {
 
       f(Index);
     }
@@ -190,6 +191,7 @@ struct NDLoopIterateImpl<NDIMS, 0, LoopBoundTy, FuncTy, LoopIndexTy> {
 /// over a multi-dimensional space - it allows to avoid generating unnecessary
 /// outer loops like 'for (int z=0; z<1; z++)' in case of 1D and 2D iteration
 /// spaces or writing specializations of the algorithms for 1D, 2D and 3D cases.
+/// Loop is unrolled in a reverse directions, i.e. ID = 0 is the inner-most one.
 template <int NDIMS> struct NDLoop {
   /// Generates ND loop nest with {0,..0} .. \c UpperBound bounds with unit
   /// stride. Applies \c f at each iteration, passing current index of
diff --git a/sycl/include/CL/sycl/id.hpp b/sycl/include/CL/sycl/id.hpp
index fa9cffc43654a..ec400e9cd046e 100644
--- a/sycl/include/CL/sycl/id.hpp
+++ b/sycl/include/CL/sycl/id.hpp
@@ -21,7 +21,7 @@ template <int dimensions = 1> class id : public detail::array<dimensions> {
 private:
   using base = detail::array<dimensions>;
   static_assert(dimensions >= 1 && dimensions <= 3,
-                "id can only be 1, 2, or 3 dimentional.");
+                "id can only be 1, 2, or 3 dimensional.");
   template <int N, int val, typename T>
   using ParamTy = detail::enable_if_t<(N == val), T>;
 
diff --git a/sycl/include/CL/sycl/range.hpp b/sycl/include/CL/sycl/range.hpp
index bf00d1b0e05a2..8f59c6c94212d 100644
--- a/sycl/include/CL/sycl/range.hpp
+++ b/sycl/include/CL/sycl/range.hpp
@@ -17,7 +17,7 @@ namespace sycl {
 template <int dimensions> class id;
 template <int dimensions = 1> class range : public detail::array<dimensions> {
   static_assert(dimensions >= 1 && dimensions <= 3,
-                "range can only be 1, 2, or 3 dimentional.");
+                "range can only be 1, 2, or 3 dimensional.");
   using base = detail::array<dimensions>;
 
 public:
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index 41a924e4202fe..91d1c38d9fb6b 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -578,6 +578,23 @@ static void adjustNDRangePerKernel(NDRDescT &NDR, RT::PiKernel Kernel,
   NDR.set(NDR.Dims, nd_range<3>(NDR.NumWorkGroups * WGSize, WGSize));
 }
 
+// We have the following mapping between dimensions with SPIRV builtins:
+// 1D: id[0] -> x
+// 2D: id[0] -> y, id[1] -> x
+// 3D: id[0] -> z, id[1] -> y, id[2] -> x
+// So in order to ensure the correctness we update all the kernel
+// parameters accordingly.
+// Initially we keep the order of NDRDescT as it provided by the user, this
+// simplifies overall handling and do the reverse only when
+// the kernel is enqueued.
+static void ReverseRangeDimensionsForKernel(NDRDescT &NDR) {
+  if (NDR.Dims > 1) {
+    std::swap(NDR.GlobalSize[0], NDR.GlobalSize[NDR.Dims - 1]);
+    std::swap(NDR.LocalSize[0], NDR.LocalSize[NDR.Dims - 1]);
+    std::swap(NDR.GlobalOffset[0], NDR.GlobalOffset[NDR.Dims - 1]);
+  }
+}
+
 // The function initialize accessors and calls lambda.
 // The function is used as argument to piEnqueueNativeKernel which requires
 // that the passed function takes one void* argument.
@@ -803,10 +820,15 @@ cl_int ExecCGCommand::enqueueImp() {
         getSyclObjImpl(Context)->getUSMDispatch();
     USMDispatch->setKernelIndirectAccess(Kernel, MQueue->getHandleRef());
 
+    // Remember this information before the range dimensions are reversed
+    const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);
+
+    ReverseRangeDimensionsForKernel(NDRDesc);
+
     PI_CALL(RT::piEnqueueKernelLaunch(
         MQueue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
         &NDRDesc.GlobalSize[0],
-        NDRDesc.LocalSize[0] ? &NDRDesc.LocalSize[0] : nullptr,
+        HasLocalSize ? &NDRDesc.LocalSize[0] : nullptr,
         RawEvents.size(),
         RawEvents.empty() ? nullptr : &RawEvents[0], &Event));
 
diff --git a/sycl/test/linear_id/linear-host-dev.cpp b/sycl/test/linear_id/linear-host-dev.cpp
new file mode 100644
index 0000000000000..89437da135569
--- /dev/null
+++ b/sycl/test/linear_id/linear-host-dev.cpp
@@ -0,0 +1,50 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: env SYCL_DEVICE_TYPE=HOST %t.out | FileCheck %s
+//==--------------- linear-host-dev.cpp - SYCL linear id test -------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+
+// Check that linear id is monotincally increased on host device.
+// Only there we can reliable check that. Since the kernel has a restriction
+// regarding usage of global variables, use stream to log the linear id
+// and ensure that they're monotonically increased.
+//
+// Note: This test heavily relies on the current implementation of
+// host device(single-threaded ordered executio). So if the implementation
+// is somehow changed so it's no longer possible to run this test reliable
+// it can be removed.
+
+namespace s = cl::sycl;
+
+int main(int argc, char *argv[]) {
+  s::queue q;
+
+  const size_t outer = 3;
+  const size_t inner = 2;
+  const s::range<2> rng = {outer, inner};
+
+  q.submit([&](s::handler &h) {
+    s::stream out(1024, 80, h);
+
+    h.parallel_for<class linear_id>(s::range<2>(rng), [=](s::item<2> item) {
+      // CHECK: 0
+      // CHECK-NEXT: 1
+      // CHECK-NEXT: 2
+      // CHECK-NEXT: 3
+      // CHECK-NEXT: 4
+      // CHECK-NEXT: 5
+      out << item.get_linear_id() << "\n";
+    });
+  });
+
+  return 0;
+}
diff --git a/sycl/test/linear_id/linear-sub_group.cpp b/sycl/test/linear_id/linear-sub_group.cpp
new file mode 100644
index 0000000000000..ba50b0e592777
--- /dev/null
+++ b/sycl/test/linear_id/linear-sub_group.cpp
@@ -0,0 +1,54 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//==--------------- linear-sub_group.cpp - SYCL linear id test -------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../sub_group/helper.hpp"
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+
+using namespace cl::sycl;
+
+int main(int argc, char *argv[]) {
+  queue q;
+  if (!core_sg_supported(q.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  // Fill output array with sub-group IDs
+  const uint32_t outer = 2;
+  const uint32_t inner = 8;
+  std::vector<int> output(outer * inner, 0);
+  {
+    buffer<int, 2> output_buf(output.data(), range<2>(outer, inner));
+    q.submit([&](handler &cgh) {
+      auto output = output_buf.get_access<access::mode::read_write>(cgh);
+      cgh.parallel_for<class linear_id>(
+          nd_range<2>(range<2>(outer, inner), range<2>(outer, inner)),
+          [=](nd_item<2> it) {
+            id<2> idx = it.get_global_id();
+            intel::sub_group sg = it.get_sub_group();
+            output[idx] = sg.get_group_id()[0] * sg.get_local_range()[0] +
+                          sg.get_local_id()[0];
+          });
+    });
+  }
+
+  // Compare with expected result
+  for (int idx = 0; idx < outer * inner; ++idx) {
+    assert(output[idx] == idx);
+  }
+  std::cout << "Test passed.\n";
+  return 0;
+}
diff --git a/sycl/test/linear_id/opencl-interop.cpp b/sycl/test/linear_id/opencl-interop.cpp
new file mode 100644
index 0000000000000..ea9d6620f730c
--- /dev/null
+++ b/sycl/test/linear_id/opencl-interop.cpp
@@ -0,0 +1,74 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//==---------------- opencl-interop.cpp - SYCL linear id test --------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <numeric>
+
+using namespace cl::sycl;
+
+int main(int argc, char *argv[]) {
+  queue q;
+  if (q.is_host()) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  // Compute expected answer.
+  const uint32_t dimA = 2;
+  const uint32_t dimB = 8;
+  std::vector<int> input(dimA * dimB), output(dimA), expected(dimA);
+  std::iota(input.begin(), input.end(), 0);
+  for (int j = 0; j < dimA; ++j) {
+    int sum = 0;
+    for (int i = 0; i < dimB; ++i) {
+      sum += input[j * dimB + i];
+    }
+    expected[j] = sum;
+  }
+
+  // Compute sum using one work-group per element of dimA
+  program prog(q.get_context(), {q.get_device()});
+  prog.build_with_source("__kernel void sum(__global const int* input, "
+                         "__global int* output, const int dimA, const int dimB)"
+                         "{"
+                         "  int j = get_global_id(1);"
+                         "  int i = get_global_id(0);"
+                         "  int sum = work_group_reduce_add(input[j*dimB+i]);"
+                         "  if (get_local_id(0) == 0)"
+                         "  {"
+                         "    output[j] = sum;"
+                         "  }"
+                         "}",
+                         "-cl-std=CL2.0");
+  kernel sum = prog.get_kernel("sum");
+  {
+    buffer<int, 2> input_buf(input.data(), range<2>(dimA, dimB)),
+        output_buf(output.data(), range<2>(dimA, dimB));
+    q.submit([&](handler &cgh) {
+      auto input = input_buf.get_access<access::mode::read>(cgh);
+      auto output = output_buf.get_access<access::mode::discard_write>(cgh);
+      cgh.set_args(input, output, dimA, dimB);
+      cgh.parallel_for(nd_range<2>(range<2>(dimA, dimB), range<2>(1, dimB)),
+                       sum);
+    });
+  }
+
+  // Compare with expected result
+  for (int j = 0; j < dimA; ++j) {
+    assert(output[j] == expected[j]);
+  }
+  std::cout << "Test passed.\n";
+  return 0;
+}